diff --git a/.gitmodules b/.gitmodules
index 51d8eac03..6ebcea592 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -6,3 +6,6 @@
 	path = text_to_image/torchtitan
 	url = https://github.com/pytorch/torchtitan.git
 	branch = mlperf-training-flux.1
+[submodule "recommendation_v4/cutlass"]
+	path = recommendation_v4/generative_recommenders/ops/cpp/cutlass
+	url = https://github.com/NVIDIA/cutlass.git
diff --git a/recommendation_v4/.gitignore b/recommendation_v4/.gitignore
new file mode 100644
index 000000000..5edddc5b3
--- /dev/null
+++ b/recommendation_v4/.gitignore
@@ -0,0 +1,159 @@
+# Don't check in parsed data files and other temporary files
+tmp/
+exps/
+ckpts/
+results/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
diff --git a/recommendation_v4/Dockerfile b/recommendation_v4/Dockerfile
new file mode 100644
index 000000000..450a5ab55
--- /dev/null
+++ b/recommendation_v4/Dockerfile
@@ -0,0 +1,86 @@
+# MI350X path — implements docs/training_recipe.md §"MI350X".
+
+FROM rocm/primus:v26.3
+
+ENV PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1
+
+WORKDIR /workspace/recommendation_v4
+
+# torch / torchvision / torchaudio — training_recipe.md:38-40.
+RUN pip install --upgrade --no-deps \
+        --index-url https://download.pytorch.org/whl/rocm7.2 \
+        torch==2.12.0+rocm7.2 \
+        torchvision==0.27.0+rocm7.2 \
+        torchaudio==2.11.0+rocm7.2
+
+# torchrec — training_recipe.md:43.
+RUN pip install --force-reinstall --no-deps \
+        "git+https://github.com/pytorch/torchrec.git@v2026.06.01.00"
+
+# fbgemm_gpu — training_recipe.md:42. Build from FBGEMM commit 10b77573 for
+# gfx950 against the replaced torch. ~30-60 min.
+RUN apt-get update && apt-get install -y --no-install-recommends git build-essential && \
+    rm -rf /var/lib/apt/lists/* && \
+    git clone --recursive https://github.com/pytorch/FBGEMM.git /tmp/FBGEMM && \
+    cd /tmp/FBGEMM && \
+    git checkout 10b775730212923f65f7b78f79b6a01d80cf3c29 && \
+    git submodule update --init --recursive && \
+    cd fbgemm_gpu && \
+    # Filter `fairscale` and the torch family from fbgemm's requirements.txt:
+    # fairscale pulls a CPU torch that would clobber the +rocm7.2 wheel installed
+    # above. fairscale is a distributed-training lib used by fbgemm tests, not
+    # by the build itself.
+    grep -v -E '^(fairscale|torch|torchvision|torchaudio)([<>=!]|$)' requirements.txt > /tmp/req.txt && \
+    pip install -r /tmp/req.txt && \
+    python setup.py -j 32 bdist_wheel \
+        --build-target=default \
+        --build-variant=rocm \
+        -DHIP_ROOT_DIR=/opt/rocm \
+        -DAMDGPU_TARGETS=gfx950 && \
+    pip install --force-reinstall --no-deps dist/fbgemm_gpu_nightly_rocm*.whl && \
+    cd / && rm -rf /tmp/FBGEMM
+
+# polars-u64-idx — training_recipe.md:44 (mandatory; yambda-5b > 4.29 B rows).
+# Remaining packages — training_recipe.md:156-159 ("Additional Python deps") plus
+# `datasets` + `huggingface_hub`, which the recipe does not list but
+# preprocess_public_data.py:278 imports to download yambda from HuggingFace.
+RUN pip install \
+        polars-u64-idx==1.33.1 \
+        gin-config \
+        absl-py \
+        datasets \
+        huggingface_hub \
+        pyre-extensions \
+        iopath \
+        typing-inspect \
+        psutil \
+        tqdm \
+        pyyaml \
+        lightning-utilities && \
+    # torchmetrics and tensordict declare `torch` as a dep; without --no-deps
+    # pip pulls torch==2.12.0+cu130 from PyPI which clobbers the +rocm7.2 wheel
+    # we installed above (libtorch_hip.so disappears, fbgemm_gpu fails to load).
+    pip install --no-deps \
+        torchmetrics==1.0.3 \
+        tensordict
+
+# mlperf_logging — required by train/mlperf_logging_utils.py for MLPerf
+# compliance logs. Pinned to the Training 6.0 tag for reproducibility; --no-deps
+# so pip does not resolve requirements.txt's torch/fbgemm_gpu/torchrec pins and
+# clobber the +rocm7.2 wheels above.
+RUN pip install --no-deps "git+https://github.com/mlcommons/logging.git@6.0.0-rc6"
+
+# Smoke-test the 6 imports the launch script checks at
+# scripts/launch_smoke_8gpu.sh:26.
+RUN python -c "import torch, fbgemm_gpu, torchrec, polars, xxhash, gin; \
+print('torch', torch.__version__, '| hip', getattr(torch.version, 'hip', None))"
+
+COPY . /workspace/recommendation_v4
+
+ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
+    HSTU_HAMMER_KERNEL=TRITON \
+    DLRM_DATA_PATH=/data/mlperf_dlrm_v4
+
+CMD ["bash"]
diff --git a/recommendation_v4/LICENSE b/recommendation_v4/LICENSE
new file mode 100644
index 000000000..d64569567
--- /dev/null
+++ b/recommendation_v4/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/recommendation_v4/README.MD b/recommendation_v4/README.MD
new file mode 100644
index 000000000..e078bcf0a
--- /dev/null
+++ b/recommendation_v4/README.MD
@@ -0,0 +1,380 @@
+# Recommendation v4 — HSTU sequential recommendation (Yambda-5b)
+
+MLPerf Training reference benchmark. This is a fork of
+[meta-recsys/generative-recommenders](https://github.com/meta-recsys/generative-recommenders)
+extended to train an HSTU (Hierarchical Sequential Transduction Units) ranking
+model on the [Yambda-5b](https://huggingface.co/datasets/yandex/yambda)
+music-recommendation dataset, sized as an MLPerf-style training benchmark inside
+the `mlcommons/training` tree.
+
+## 1. Summary
+
+This benchmark trains a model that predicts what a person will listen to next.
+Given the history of songs a user has played, liked, or skipped, the model
+learns to rank which song the user is most likely to genuinely listen to (rather
+than skip) next. This is the same kind of "what should we recommend next?"
+problem that powers music and video streaming feeds. The model is trained on a
+large public dataset of anonymized music-listening events and is scored on how
+well it predicts future listens it has never seen.
+
+## 2. Benchmark overview (technical)
+
+The model is a **sequential recommender**: instead of treating each interaction
+independently (as classic click-through-rate models like DLRM-DCNv2 do), it
+consumes a user's chronologically ordered interaction history as a sequence and
+applies a Transformer-style attention stack (HSTU) over it. Each training
+example is one "anchor" listen event together with that user's prior history
+(user interaction history, or UIH) and a set of contextual/cross features. The
+supervised target is a binary `listen_plus` label (a real listen: played for at
+least 50% of the track) versus a skip.
+
+Training is **streaming / temporal-order**: the timeline is sliced into
+fixed-duration windows and the model trains on window `T` then evaluates on the
+strictly-future window `T+1`, so every reported metric is genuine
+next-period generalization with no future leakage. The quality metric is
+**AUC** on the held-out future window, and the convergence target is
+**AUC >= 0.80275** (matching the DLRM-DCNv2-style target).
+
+The reference runs on 8 GPUs (validated on AMD Instinct MI350X / MI355X and
+NVIDIA B200; see [docs/training_recipe.md](docs/training_recipe.md)) and scales
+to multi-node via SLURM.
+
+## 3. Directions — steps to run
+
+The benchmark follows the standard MLPerf reference script flow:
+
+```bash
+# 0. build/enter the container (canonical frozen environment)
+docker build -t recommendation_v4 .
+docker run --rm -it --device=/dev/kfd --device=/dev/dri \
+    -v /path/to/dlrm_data:/data/mlperf_dlrm_v4 recommendation_v4
+
+# 1. download + preprocess the dataset
+DLRM_DATA_PATH=/data/mlperf_dlrm_v4 ./download_dataset.sh
+
+# 2. verify the preprocessed dataset
+DLRM_DATA_PATH=/data/mlperf_dlrm_v4 ./verify_dataset.sh
+
+# 3. run the benchmark to the quality target and report wall-clock time
+DLRM_DATA_PATH=/data/mlperf_dlrm_v4 ./run_and_time.sh
+```
+
+- [`download_dataset.sh`](download_dataset.sh) wraps the preprocessing pipeline
+  in `generative_recommenders.dlrm_v3.preprocess_public_data` (HuggingFace
+  download + temporal split + session segmentation + item-popularity counts).
+- [`verify_dataset.sh`](verify_dataset.sh) checks the preprocessed files against
+  [`md5sums_yambda_5b_processed.txt`](md5sums_yambda_5b_processed.txt) (falls
+  back to a layout check until the canonical checksums are pinned).
+- [`run_and_time.sh`](run_and_time.sh) runs the full-reference streaming
+  train+eval sweep on a single 8-GPU host with `AUC_THRESHOLD=0.80275` and MLPerf
+  compliance logging, printing the elapsed time of the timed region.
+
+### 3.1 Multi-node (SLURM)
+
+For N >= 1 nodes use [`scripts/launch_slurm.sh`](scripts/launch_slurm.sh), which
+provisions the container on each node and launches the same trainer. A bare
+submit runs a small functional smoke run; set the run-shape knobs for the full
+sweep:
+
+```bash
+# smoke (fast functional check)
+sbatch --nodes=1 scripts/launch_slurm.sh
+
+# full reference sweep
+START_TS=0 NUM_TRAIN_TS=299 \
+NUM_TRAIN_BATCHES=0 NUM_EVAL_BATCHES=0 \
+EVAL_EVERY_N_WINDOWS=0 EVAL_EVERY_DATA_PCT=0.005 \
+AUC_THRESHOLD=0.80275 \
+sbatch --nodes=2 scripts/launch_slurm.sh
+```
+
+Multi-node uses real RDMA (RoCEv2); the fabric/NCCL setup is documented in
+[docs/multi_node_config.md](docs/multi_node_config.md). Keep all run outputs
+(log, checkpoints, mllog, TensorBoard) under a writable scratch path you own —
+the dataset mount is read-only.
+
+## 4. Model
+
+The model is **HSTU** (Hierarchical Sequential Transduction Units), the
+generative-recommender architecture from Meta's ICML'24 paper *Actions Speak
+Louder than Words: Trillion-Parameter Sequential Transducers for Generative
+Recommendations* ([arXiv:2402.17152](https://arxiv.org/abs/2402.17152)).
+
+HSTU replaces the feature-interaction stack of a classic DLRM with a stack of
+pointwise-attention "transducer" layers operating over the user's interaction
+sequence. In this benchmark (the `dlrm_v3` path):
+
+- **Embeddings**: sparse tables for `item_id`, `artist_id`, `album_id`, `uid`,
+  and 7 cross-feature hashes (e.g. `user_x_artist`, `item_x_hour`), sharded
+  across GPUs with TorchRec `DistributedModelParallel`.
+- **Sequence model**: an HSTU attention stack (`HSTU_NUM_LAYERS`, default 3)
+  over the interleaved UIH, computed with a fused jagged-attention Triton kernel
+  in bf16.
+- **Supervision**: a single `listen_plus` binary task. The candidate event's
+  `action_weight` carries the supervision bit, and BCE loss is masked to
+  listen_plus candidates.
+
+See `generative_recommenders/dlrm_v3/configs.py`
+(`get_hstu_configs`, `get_embedding_table_config`) for the exact architecture
+and table specs, and the upstream README for the original modeling code.
+
+## 5. Dataset
+
+[Yambda-5b](https://huggingface.co/datasets/yandex/yambda) is a public
+anonymized music-recommendation dataset from Yandex. The `5b` variant is used
+for the reference. Statistics after preprocessing:
+
+| | |
+|---|---|
+| Total interaction events | **4.76 B** |
+| Unique users | **1.00 M** |
+| Max events per user | 27,738 |
+| Median events per user | 2,695 |
+| Mean events per user | 4,763 |
+| Train events (300d) | 4.76 B |
+| Test events (1d) | 22.4 M |
+| Item catalog size | 9.39 M |
+
+### 5.1 Per-event-type distribution (across the full 4.76 B corpus)
+
+| Pool | Definition | Count | Share |
+|---|---|---|---|
+| **listen_plus (lp)** | `is_listen AND played_ratio >= 50%` | 2.92 B | **61.3%** |
+| **skip** | `is_listen AND played_ratio < 50%` | 1.71 B | **35.9%** |
+| **like** | explicit thumbs-up action | 89 M | **1.9%** |
+| other | dislike / unlike / undislike | 47 M | 1.0% |
+
+The `like` pool is roughly **30x rarer** than `lp` — important context for the
+gather strategy in §6.
+
+### 5.2 Preprocessing & download
+
+`./download_dataset.sh` (which calls
+`python3 -m generative_recommenders.dlrm_v3.preprocess_public_data --dataset
+yambda-5b --data-path <root>`) downloads the 5b variant from HuggingFace, then:
+
+1. **Encodes** the raw `event_type` string into a uint8 lookup (listen=0,
+   like=1, dislike=2, unlike=3, undislike=4).
+2. **Splits** events temporally — 300 train days, 30-min gap, 1 test day — by
+   Global Temporal Split (GTS).
+3. **Segments** per-user event timelines into sessions on a 30-min inactivity
+   gap.
+4. **Computes** per-item popularity for downstream metric weighting.
+5. **Writes** the layout `DLRMv3YambdaDataset` expects:
+
+```
+<data-path>/
+├── raw/5b/multi_event.parquet                    50 GB (downloaded)
+├── shared_metadata/
+│   ├── artist_item_mapping.parquet               60 MB
+│   ├── album_item_mapping.parquet                76 MB
+│   └── embeddings.parquet                         18 GB (unused by HSTU training)
+└── processed_5b/
+    ├── train_sessions.parquet                     47 GB  ← main training input
+    ├── test_events.parquet                       152 MB
+    ├── session_index.parquet                     600 MB
+    ├── item_popularity.npy                        75 MB
+    └── split_meta.json                            anchor + boundary stats
+```
+
+For smaller variants (`yambda-50m` / `yambda-500m`) substitute the dataset name
+(`DATASET=yambda-50m ./download_dataset.sh`). Preprocessing takes ~2 min for 50m
+and ~53 min for 5b end-to-end.
+
+Integrity is verified with `./verify_dataset.sh` against
+[`md5sums_yambda_5b_processed.txt`](md5sums_yambda_5b_processed.txt).
+
+## 6. How data is fed to HSTU
+
+For every training anchor (a LISTEN event with >= `min_history` prior events),
+the dataset builds a `(uih_kjt, candidate_kjt)` pair:
+
+```
+UIH (User Interaction History):
+  ┌─ Sequence features (chronologically interleaved across 3 pools)
+  │    item_id, artist_id, album_id            ← per-position
+  │    action_weight                            ← per-position (LP_BIT/LIKE_BIT/SKIP_BIT)
+  │    action_timestamp, dummy_watch_time      ← per-position
+  └─ Contextual features (length 1 each)
+       uid + 7 cross-feature hashes (user_x_artist, item_x_hour, …)
+                                                = 8 contextual entries
+
+CANDIDATE (the LISTEN event at the anchor):
+  item_id, artist_id, album_id, item_query_time,
+  item_action_weight (LP_BIT if listen_plus, else 0),
+  item_dummy_watchtime
+```
+
+The candidate's `action_weight` is **the supervision label**: HSTU's
+`_get_supervision_labels_and_weights` masks BCE training to
+`(supervision_bitmask & task_weight) > 0`, with `task_weight = 1` (LP bit) for
+the single `listen_plus` task — so only listen_plus candidates supervise.
+
+### 6.1 Per-pool gather (the cap = L // 3 strategy)
+
+The UIH is built by `DLRMv3YambdaDataset._gather_interleaved_history`. For each
+anchor it:
+
+1. Scans the most recent `scan_window` (default 20,000) events of any type
+   before the anchor, **clipped to user_start**.
+2. From those, takes **the last `L // 3` events** from each of the three pools
+   (lp, like, skip) independently.
+3. Concatenates and **re-sorts chronologically** to produce an interleaved
+   sequence.
+4. Tags each event's pool identity into `action_weight` via OR'd bitmask
+   (LP=1, LIKE=2, SKIP=4).
+
+With `history_length = 4086` and `max_seq_len = 4096`: per-pool cap = `4086 //
+3 = 1362`, and `3 × 1362 + 8 contextual + 1 candidate = 4095 <= 4096` (no
+truncation). Because the `like` pool is rare (1.9%) it under-fills (~105 events
+per anchor on average); the Triton jagged-attention backend skips unfilled
+slots, so the under-fill costs sequence budget but not GPU compute.
+
+## 7. Optimizer
+
+Two optimizers, configured in
+[`yambda_5b.gin`](generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin):
+
+| component | optimizer | gin binding | key settings |
+|---|---|---|---|
+| Dense params (HSTU blocks, MLPs) | **Adam** | `dense_optimizer_factory_and_class.*` | lr `DENSE_LR`, betas (0.95, 0.999), eps 1e-8, weight_decay 0 |
+| Sparse embedding tables | **RowWiseAdagrad** (fused FBGEMM TBE) | `sparse_optimizer_factory_and_class.*` | lr `SPARSE_LR`, eps 1e-8, weight_decay 0 |
+
+Gradient clipping (`GRAD_CLIP_NORM`, default `max_norm=1.0`) is applied to the
+dense parameters on the streaming path; the fused sparse optimizer is
+unaffected. Training is bf16 mixed precision (`make_model.bf16_training=True`).
+
+## 8. Hyperparameters
+
+All tunable hyperparameters live in
+[`yambda_5b.gin`](generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin)
+(the config-file source of truth) and are **overridable via environment
+variables** (the env value takes precedence over the gin default, per MLPerf
+CONTRIBUTING rule 4d). The gin macros (`@env_int`, `@env_float`, `@env_str`)
+enforce the correct type for each parameter.
+
+| hyperparameter | env var | gin binding | type | default | tuning rule |
+|---|---|---|---|---|---|
+| Per-rank batch size | `BATCH_SIZE` | `batch_size` | int | 1024 | positive integer (global batch = `BATCH_SIZE × world_size`) |
+| Dense learning rate | `DENSE_LR` | `dense_optimizer_factory_and_class.learning_rate` | float | 1e-7 | positive float |
+| Sparse learning rate | `SPARSE_LR` | `sparse_optimizer_factory_and_class.learning_rate` | float | 1e-7 | positive float |
+| Grad clip max-norm | `GRAD_CLIP_NORM` | `streaming_train_eval_loop.grad_clip_norm` | float | 1.0 | float >= 0 (0 disables) |
+| RNG seed | `SEED` | `seed_everything.seed` | int | 1 | any integer (-1 = random per run) |
+| HSTU attention layers | `HSTU_NUM_LAYERS` | `get_hstu_configs.hstu_attn_num_layers` | int | 3 | positive integer |
+| UIH history length | `HISTORY_LENGTH` | `get_dataset.history_length` | int | 4086 | positive integer (per-pool cap = L//3) |
+| Max sequence length | `MAX_SEQ_LEN` | `get_hstu_configs.max_seq_len` | int | 4096 | positive integer (>= `history_length + 9`) |
+| History strategy | `HISTORY_STRATEGY` | `get_dataset.history_strategy` | str | `interleaved` | one of `interleaved` \| `last_n` |
+| Min history (anchor floor) | `MIN_HISTORY` | `get_dataset.min_history` | int | 4086 | integer >= 0 |
+| Train user split | `TRAIN_SPLIT_PERCENTAGE` | `*.train_split_percentage` | float | 1.0 | float in (0, 1] |
+| Streaming shuffle fraction | `STREAMING_SHUFFLE_FRACTION` | `get_dataset.streaming_shuffle_fraction` | float | 0.0 | float in [0, 1] |
+| Streaming shuffle seed | `STREAMING_SHUFFLE_SEED` | `get_dataset.streaming_shuffle_seed` | int | 0 | any integer |
+| Split salt | `SPLIT_SALT` | `get_dataset.split_salt` | int | 0 | any integer |
+| Start window | `START_TS` | `streaming_train_eval_loop.start_ts` | int | 150 | integer >= 0 |
+| Number of train windows | `NUM_TRAIN_TS` | `streaming_train_eval_loop.num_train_ts` | int | 149 | positive integer (clamped to available) |
+| Sparse A2A fwd precision | `SPARSE_A2A_FWD` | `make_optimizer_and_shard.sparse_a2a_forward_precision` | str | `fp32` | one of `fp32` \| `bf16` \| `fp16` |
+| Sparse A2A bwd precision | `SPARSE_A2A_BWD` | `make_optimizer_and_shard.sparse_a2a_backward_precision` | str | `fp32` | one of `fp32` \| `bf16` \| `fp16` |
+
+Non-tunable / fixed reference values (optimizer betas (0.95, 0.999), eps 1e-8,
+weight_decay 0, bf16 training, streaming window = 86400 s) are pinned in the gin
+file. Submitters tuning hyperparameters must follow the allowed values above and
+the
+[MLPerf training rules](https://github.com/mlcommons/training_policies/blob/master/training_rules.adoc#hyperparameters).
+
+## 9. Quality target & evaluation
+
+- **Metric**: AUC on the held-out future evaluation window (`window_auc` for the
+  `listen_plus` task), computed by `MetricsLogger` in
+  `generative_recommenders/dlrm_v3/utils.py`.
+- **Target**: **eval AUC >= 0.80275**. Set via `AUC_THRESHOLD=0.80275`
+  (`MetricsLogger.auc_threshold`); the run logs `RUN_STOP` with `SUCCESS` and
+  stops once the target is reached. The gin default of `1.0` is unreachable
+  (trains all windows with no early stop) and is overridden by the reference
+  scripts.
+- **Evaluation frequency**: the full-reference run uses
+  `EVAL_EVERY_DATA_PCT=0.005` — evaluate every 0.5% of the training stream
+  (~200 evenly-data-spaced eval points), independent of node count. The
+  alternative per-window cadence (`EVAL_EVERY_N_WINDOWS`) is mutually exclusive.
+- **Evaluation set**: a fixed held-out future window (`eval_holdout_ts`, default
+  `start_ts + num_train_ts`); with `TRAIN_SPLIT_PERCENTAGE < 1.0` the held-out
+  users' anchors over that window form the eval set. The temporal one-window
+  lead guarantees no future leakage (see §11).
+
+Evaluation is always one window ahead of training, so reported AUC is genuine
+next-period generalization.
+
+## 10. Reference Convergence Points (RCP)
+
+*Placeholder — to be generated.*
+
+RCPs have **not yet been generated** for this benchmark. Per the MLPerf
+[CONTRIBUTING guidance](https://github.com/mlcommons/training_policies/blob/master/CONTRIBUTING.md),
+RCPs must be generated for at least 3 reasonable batch sizes using at least 2N
+seeds (N = number of submission runs), in FP32 or BF16, with the exact precision
+recorded in the RCP JSON. The convergence curves (steps/samples to reach
+AUC >= 0.80275) will be added under [`rcp/`](rcp/) once the convergence runs are
+complete. This section is intentionally left blank for now.
+
+## 11. Streaming (temporal-order) training
+
+`scripts/launch_slurm.sh` and `run_and_time.sh` default to
+`--mode streaming-train-eval`, which trains Yambda in strict wall-clock order
+instead of shuffling the whole corpus. The timeline is sliced into
+fixed-duration **windows** (default 1 day,
+`get_dataset.streaming_window_seconds = 86400`), and the loop walks them forward:
+
+```
+window T:    train          window T+1:  eval (then train)   window T+2: eval (then train) ...
+   └─ train window T ─┐
+                      └─ eval window T+1 ─┐
+                                          └─ train window T+1 ─┐
+                                                               └─ eval window T+2 ...
+```
+
+i.e. for each step it **trains window T, then evaluates window T+1** before
+advancing — always predicting the immediate future from the past.
+
+### 11.1 Temporal guarantee
+
+The streaming path enforces **no future leakage** at two levels:
+
+1. **Across windows** — a window is the set of anchors whose target/candidate
+   timestamp falls in `[t_min + T·W, t_min + (T+1)·W)`. Training only ever sees
+   windows `<= T`; the evaluation window `T+1` is strictly in the future of every
+   training anchor it is scored against.
+2. **Within an anchor** — history is gathered **causally**: the UIH scan is
+   `scan_start:flat_pos` (events strictly before the anchor), so no event at or
+   after the anchor's timestamp can enter its features.
+
+This is a *temporal* split on the training stream — distinct from the
+preprocessing GTS split (§5) that carves off the final test day. Windows are
+indexed off the per-anchor target timestamp via a lazily-built, mmap'd
+`anchor_ts` cache keyed by `(history_length, min_history)`.
+
+### 11.2 Streaming knobs
+
+All configurable via
+[`yambda_5b.gin`](generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin) with
+env overrides:
+
+| env | gin | default | meaning |
+|---|---|---|---|
+| `START_TS` | `streaming_train_eval_loop.start_ts` | 150 | first window (early windows are near-empty warm-up) |
+| `NUM_TRAIN_TS` | `streaming_train_eval_loop.num_train_ts` | 149 | number of train windows (clamped to available) |
+| `PERSISTENT_LOADER` | `streaming_train_eval_loop.persistent_loader` | 1 | reuse one worker pool across windows |
+| `DOUBLE_BUFFER` | `streaming_train_eval_loop.double_buffer` | 1 | prepare the next window in a background thread |
+| `EVAL_EVERY_N_WINDOWS` | `streaming_train_eval_loop.eval_every_n_windows` | 1 | eval cadence by window count (0 to use data-pct) |
+| `EVAL_EVERY_DATA_PCT` | `streaming_train_eval_loop.eval_every_data_pct` | 0.0 | eval cadence by fraction of train data (full ref: 0.005) |
+| `MIN_HISTORY` | `get_dataset.min_history` | 4086 | anchor-eligibility floor (0 = ~all users incl. cold-start) |
+
+### 11.3 Checkpointing & resume
+
+The streaming loop is resume-aware: set `CKPT_PATH` to enable DMP checkpoint
+save/load (auto-resolves to the highest-numbered subdir), with retention via
+`KEEP_LAST_N` and cadences `IN_WINDOW_CKPT_FREQ` / `CKPT_STEP_FREQ` /
+`CKPT_TIME_INTERVAL_S`. The MLPerf run state (run-started flag, global sample
+count) is persisted across resume so compliance logging is continuous. See
+`generative_recommenders/dlrm_v3/checkpoint.py`.
+
+## 12. License
+
+Apache 2.0 (inherited from upstream).
diff --git a/recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-final.gin b/recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-final.gin
new file mode 100644
index 000000000..8fb8b258c
--- /dev/null
+++ b/recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-final.gin
@@ -0,0 +1,49 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Frozen config, validated on 04/12/2024.
+# Based on HSTU results (w/ identical configurations as a SotA Transformer baseline) in
+# Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations (https://arxiv.org/abs/2402.17152).
+#
+# Run this as:
+# mkdir -p logs/amzn-books-l50/
+# CUDA_VISIBLE_DEVICES=1 python3 main.py --gin_config_file=configs/amzn-books/hstu-sampled-softmax-n512-final.gin --master_port=12346 2>&1 | tee logs/amzn-books-l50/hstu-sampled-softmax-n512-final.log
+
+train_fn.dataset_name = "amzn-books"
+train_fn.max_sequence_length = 50
+train_fn.local_batch_size = 128
+train_fn.eval_batch_size = 128
+
+train_fn.main_module = "HSTU"
+train_fn.dropout_rate = 0.5
+train_fn.user_embedding_norm = "l2_norm"
+train_fn.item_embedding_dim = 64
+
+hstu_encoder.num_blocks = 4
+hstu_encoder.num_heads = 4
+hstu_encoder.dv = 16
+hstu_encoder.dqk = 16
+hstu_encoder.linear_dropout_rate = 0.5
+
+train_fn.eval_interval = 4000
+train_fn.num_epochs = 201
+train_fn.learning_rate = 1e-3
+train_fn.weight_decay = 0
+train_fn.num_warmup_steps = 0
+
+train_fn.interaction_module_type = "DotProduct"
+train_fn.top_k_method = "MIPSBruteForceTopK"
+
+train_fn.loss_module = "SampledSoftmaxLoss"
+train_fn.num_negatives = 512
+
+train_fn.sampling_strategy = "local"
+train_fn.temperature = 0.05
+train_fn.item_l2_norm = True
+train_fn.l2_norm_eps = 1e-6
+
+train_fn.enable_tf32 = True
+train_fn.full_eval_every_n = 5
+train_fn.partial_eval_num_iters = 64
+
+create_data_loader.prefetch_factor = 1024
+create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-large-final.gin b/recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-large-final.gin
new file mode 100644
index 000000000..097d4cbc7
--- /dev/null
+++ b/recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-large-final.gin
@@ -0,0 +1,49 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Frozen config, validated on 04/12/2024.
+# Based on HSTU-large results in
+# Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations (https://arxiv.org/abs/2402.17152).
+#
+# Run this as:
+# mkdir -p logs/amzn-books-l50/
+# CUDA_VISIBLE_DEVICES=1 python3 main.py --gin_config_file=configs/amzn-books/hstu-sampled-softmax-n512-large-final.gin --master_port=12346 2>&1 | tee logs/amzn-books-l50/hstu-sampled-softmax-n512-large-final2.log
+
+train_fn.dataset_name = "amzn-books"
+train_fn.max_sequence_length = 50
+train_fn.local_batch_size = 128
+train_fn.eval_batch_size = 128
+
+train_fn.main_module = "HSTU"
+train_fn.dropout_rate = 0.5
+train_fn.user_embedding_norm = "l2_norm"
+train_fn.item_embedding_dim = 64
+
+hstu_encoder.num_blocks = 16
+hstu_encoder.num_heads = 8
+hstu_encoder.dv = 8
+hstu_encoder.dqk = 8
+hstu_encoder.linear_dropout_rate = 0.5
+
+train_fn.eval_interval = 4000
+train_fn.num_epochs = 201
+train_fn.learning_rate = 1e-3
+train_fn.weight_decay = 0
+train_fn.num_warmup_steps = 0
+
+train_fn.interaction_module_type = "DotProduct"
+train_fn.top_k_method = "MIPSBruteForceTopK"
+
+train_fn.loss_module = "SampledSoftmaxLoss"
+train_fn.num_negatives = 512
+
+train_fn.sampling_strategy = "local"
+train_fn.temperature = 0.05
+train_fn.item_l2_norm = True
+train_fn.l2_norm_eps = 1e-6
+
+train_fn.enable_tf32 = True
+train_fn.full_eval_every_n = 5
+train_fn.partial_eval_num_iters = 64
+
+create_data_loader.prefetch_factor = 1024
+create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/amzn-books/sasrec-sampled-softmax-n512-final.gin b/recommendation_v4/configs/amzn-books/sasrec-sampled-softmax-n512-final.gin
new file mode 100644
index 000000000..bc899c9fb
--- /dev/null
+++ b/recommendation_v4/configs/amzn-books/sasrec-sampled-softmax-n512-final.gin
@@ -0,0 +1,50 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Frozen config, validated on 04/12/2024.
+# Based on baseline settings in Revisiting Neural Retrieval on Accelerators (https://arxiv.org/abs/2306.04039, KDD'23).
+#
+# Run this as:
+# mkdir -p logs/amzn-books-l50/
+# CUDA_VISIBLE_DEVICES=1 python3 main.py --gin_config_file=configs/amzn-books/sasrec-sampled-softmax-n512-final.gin  --master_port=12346 2>&1 | tee logs/amzn-books-l50/sasrec-sampled-softmax-n512-final.log
+
+train_fn.dataset_name = "amzn-books"
+train_fn.max_sequence_length = 50
+train_fn.local_batch_size = 128
+train_fn.eval_batch_size = 128
+
+train_fn.main_module = "SASRec"
+train_fn.dropout_rate = 0.5
+train_fn.user_embedding_norm = "l2_norm"
+train_fn.item_embedding_dim = 64
+
+sasrec_encoder.num_blocks = 4
+sasrec_encoder.num_heads = 4
+sasrec_encoder.ffn_dropout_rate = 0.5
+sasrec_encoder.ffn_hidden_dim = 64
+sasrec_encoder.ffn_activation_fn = "relu"
+
+train_fn.eval_interval = 4000
+train_fn.num_epochs = 201
+train_fn.learning_rate = 1e-3
+train_fn.weight_decay = 0
+train_fn.num_warmup_steps = 0
+
+train_fn.save_ckpt_every_n = 10
+
+train_fn.interaction_module_type = "DotProduct"
+train_fn.top_k_method = "MIPSBruteForceTopK"
+
+train_fn.loss_module = "SampledSoftmaxLoss"
+train_fn.num_negatives = 512
+
+train_fn.sampling_strategy = "local"
+train_fn.temperature = 0.05
+train_fn.item_l2_norm = True
+train_fn.l2_norm_eps = 1e-6
+
+train_fn.enable_tf32 = True
+train_fn.full_eval_every_n = 5
+train_fn.partial_eval_num_iters = 64
+
+create_data_loader.prefetch_factor = 1024
+create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-final.gin b/recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-final.gin
new file mode 100644
index 000000000..841b1c80a
--- /dev/null
+++ b/recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-final.gin
@@ -0,0 +1,45 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Frozen config, validated on 04/11/2024.
+# Based on HSTU results (w/ identical configurations as a SotA Transformer baseline) in
+# Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations (https://arxiv.org/abs/2402.17152).
+#
+# Run this as:
+# mkdir -p logs/ml-1m-l200/
+# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-1m/hstu-sampled-softmax-n128-final.gin --master_port=12345 2>&1 | tee logs/ml-1m-l200/hstu-sampled-softmax-n128-final.log
+
+train_fn.dataset_name = "ml-1m"
+train_fn.max_sequence_length = 200
+train_fn.local_batch_size = 128
+
+train_fn.main_module = "HSTU"
+train_fn.dropout_rate = 0.2
+train_fn.user_embedding_norm = "l2_norm"
+train_fn.num_epochs = 101
+train_fn.item_embedding_dim = 50
+
+hstu_encoder.num_blocks = 2
+hstu_encoder.num_heads = 1
+hstu_encoder.dqk = 50
+hstu_encoder.dv = 50
+hstu_encoder.linear_dropout_rate = 0.2
+
+train_fn.learning_rate = 1e-3
+train_fn.weight_decay = 0
+train_fn.num_warmup_steps = 0
+
+train_fn.interaction_module_type = "DotProduct"
+train_fn.top_k_method = "MIPSBruteForceTopK"
+
+train_fn.loss_module = "SampledSoftmaxLoss"
+train_fn.num_negatives = 128
+
+train_fn.sampling_strategy = "local"
+train_fn.temperature = 0.05
+train_fn.item_l2_norm = True
+train_fn.l2_norm_eps = 1e-6
+
+train_fn.enable_tf32 = True
+
+create_data_loader.prefetch_factor = 128
+create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-large-final.gin b/recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-large-final.gin
new file mode 100644
index 000000000..7ffc7ef64
--- /dev/null
+++ b/recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-large-final.gin
@@ -0,0 +1,45 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Frozen config, validated on 04/11/2024.
+# Based on HSTU-large results in
+# Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations (https://arxiv.org/abs/2402.17152).
+#
+# Run this as:
+# mkdir -p logs/ml-1m-l200/
+# CUDA_VISIBLE_DEVICES=1 python3 main.py --gin_config_file=configs/ml-1m/hstu-sampled-softmax-n128-large-final.gin --master_port=12346 2>&1 | tee logs/ml-1m-l200/hstu-sampled-softmax-n128-large-final.log
+
+train_fn.dataset_name = "ml-1m"
+train_fn.max_sequence_length = 200
+train_fn.local_batch_size = 128
+
+train_fn.main_module = "HSTU"
+train_fn.dropout_rate = 0.2
+train_fn.user_embedding_norm = "l2_norm"
+train_fn.num_epochs = 101
+train_fn.item_embedding_dim = 50
+
+hstu_encoder.num_blocks = 8
+hstu_encoder.num_heads = 2
+hstu_encoder.dqk = 25
+hstu_encoder.dv = 25
+hstu_encoder.linear_dropout_rate = 0.2
+
+train_fn.learning_rate = 1e-3
+train_fn.weight_decay = 0
+train_fn.num_warmup_steps = 0
+
+train_fn.interaction_module_type = "DotProduct"
+train_fn.top_k_method = "MIPSBruteForceTopK"
+
+train_fn.loss_module = "SampledSoftmaxLoss"
+train_fn.num_negatives = 128
+
+train_fn.sampling_strategy = "local"
+train_fn.temperature = 0.05
+train_fn.item_l2_norm = True
+train_fn.l2_norm_eps = 1e-6
+
+train_fn.enable_tf32 = True
+
+create_data_loader.prefetch_factor = 128
+create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/ml-1m/sasrec-sampled-softmax-n128-final.gin b/recommendation_v4/configs/ml-1m/sasrec-sampled-softmax-n128-final.gin
new file mode 100644
index 000000000..ead7bb21c
--- /dev/null
+++ b/recommendation_v4/configs/ml-1m/sasrec-sampled-softmax-n128-final.gin
@@ -0,0 +1,44 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Frozen config, validated on 04/11/2024.
+# Based on baseline settings in Revisiting Neural Retrieval on Accelerators (https://arxiv.org/abs/2306.04039, KDD'23).
+#
+# Run this as:
+# mkdir -p logs/ml-1m-l200/
+# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-1m/sasrec-sampled-softmax-n128-final.gin --master_port=12345 2>&1 | tee logs/ml-1m-l200/sasrec-sampled-softmax-n128-final.log
+
+train_fn.dataset_name = "ml-1m"
+train_fn.max_sequence_length = 200
+train_fn.local_batch_size = 128
+
+train_fn.main_module = "SASRec"
+train_fn.dropout_rate = 0.2
+train_fn.user_embedding_norm = "l2_norm"
+train_fn.num_epochs = 101
+train_fn.item_embedding_dim = 50
+
+sasrec_encoder.num_blocks = 2
+sasrec_encoder.num_heads = 1
+sasrec_encoder.ffn_dropout_rate = 0.2
+sasrec_encoder.ffn_hidden_dim = 50
+sasrec_encoder.ffn_activation_fn = "relu"
+
+train_fn.learning_rate = 1e-3
+train_fn.weight_decay = 0
+train_fn.num_warmup_steps = 0
+
+train_fn.top_k_method = "MIPSBruteForceTopK"
+train_fn.interaction_module_type = "DotProduct"
+
+train_fn.loss_module = "SampledSoftmaxLoss"
+train_fn.num_negatives = 128
+
+train_fn.sampling_strategy = "local"
+train_fn.temperature = 0.05
+train_fn.item_l2_norm = True
+train_fn.l2_norm_eps = 1e-6
+
+train_fn.enable_tf32 = True
+
+create_data_loader.prefetch_factor = 128
+create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-final.gin b/recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-final.gin
new file mode 100644
index 000000000..5823ad5b6
--- /dev/null
+++ b/recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-final.gin
@@ -0,0 +1,45 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Frozen config, validated on 04/12/2024.
+# Based on HSTU results (w/ identical configurations as a SotA Transformer baseline) in
+# Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations (https://arxiv.org/abs/2402.17152).
+#
+# Run this as:
+# mkdir -p logs/ml-20m-l200/
+# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-20m/hstu-sampled-softmax-n128-final.gin --master_port=12345 2>&1 | tee logs/ml-20m-l200/hstu-sampled-softmax-n128-final.log
+
+train_fn.dataset_name = "ml-20m"
+train_fn.max_sequence_length = 200
+train_fn.local_batch_size = 128
+
+train_fn.main_module = "HSTU"
+train_fn.dropout_rate = 0.2
+train_fn.user_embedding_norm = "l2_norm"
+train_fn.num_epochs = 101
+train_fn.item_embedding_dim = 256
+
+hstu_encoder.num_blocks = 4
+hstu_encoder.num_heads = 4
+hstu_encoder.dv = 64
+hstu_encoder.dqk = 64
+hstu_encoder.linear_dropout_rate = 0.2
+
+train_fn.learning_rate = 1e-3
+train_fn.weight_decay = 0
+train_fn.num_warmup_steps = 0
+
+train_fn.interaction_module_type = "DotProduct"
+train_fn.top_k_method = "MIPSBruteForceTopK"
+
+train_fn.loss_module = "SampledSoftmaxLoss"
+train_fn.num_negatives = 128
+
+train_fn.sampling_strategy = "local"
+train_fn.temperature = 0.05
+train_fn.item_l2_norm = True
+train_fn.l2_norm_eps = 1e-6
+
+train_fn.enable_tf32 = True
+
+create_data_loader.prefetch_factor = 128
+create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-large-final.gin b/recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-large-final.gin
new file mode 100644
index 000000000..0199afa24
--- /dev/null
+++ b/recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-large-final.gin
@@ -0,0 +1,45 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Frozen config, validated on 04/12/2024.
+# Based on HSTU-large results in
+# Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations (https://arxiv.org/abs/2402.17152).
+#
+# Run this as:
+# mkdir -p logs/ml-20m-l200/
+# CUDA_VISIBLE_DEVICES=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python3 main.py --gin_config_file=configs/ml-20m/hstu-sampled-softmax-n128-large-final.gin --master_port=12345 2>&1 | tee logs/ml-20m-l200/hstu-sampled-softmax-n128-large-final.log
+
+train_fn.dataset_name = "ml-20m"
+train_fn.max_sequence_length = 200
+train_fn.local_batch_size = 128
+
+train_fn.main_module = "HSTU"
+train_fn.dropout_rate = 0.2
+train_fn.user_embedding_norm = "l2_norm"
+train_fn.num_epochs = 101
+train_fn.item_embedding_dim = 256
+
+hstu_encoder.num_blocks = 16
+hstu_encoder.num_heads = 8
+hstu_encoder.dv = 32
+hstu_encoder.dqk = 32
+hstu_encoder.linear_dropout_rate = 0.2
+
+train_fn.learning_rate = 1e-3
+train_fn.weight_decay = 0
+train_fn.num_warmup_steps = 0
+
+train_fn.interaction_module_type = "DotProduct"
+train_fn.top_k_method = "MIPSBruteForceTopK"
+
+train_fn.loss_module = "SampledSoftmaxLoss"
+train_fn.num_negatives = 128
+
+train_fn.sampling_strategy = "local"
+train_fn.temperature = 0.05
+train_fn.item_l2_norm = True
+train_fn.l2_norm_eps = 1e-6
+
+train_fn.enable_tf32 = True
+
+create_data_loader.prefetch_factor = 128
+create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/ml-20m/sasrec-sampled-softmax-n128-final.gin b/recommendation_v4/configs/ml-20m/sasrec-sampled-softmax-n128-final.gin
new file mode 100644
index 000000000..3c666f802
--- /dev/null
+++ b/recommendation_v4/configs/ml-20m/sasrec-sampled-softmax-n128-final.gin
@@ -0,0 +1,44 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Frozen config, validated on 04/12/2024.
+# Based on baseline settings in Revisiting Neural Retrieval on Accelerators (https://arxiv.org/abs/2306.04039, KDD'23).
+#
+# Run this as:
+# mkdir -p logs/ml-20m-l200/
+# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-20m/sasrec-sampled-softmax-n128-final.gin --master_port=12345 2>&1 | tee logs/ml-20m-l200/sasrec-sampled-softmax-n128-final.log
+
+train_fn.dataset_name = "ml-20m"
+train_fn.max_sequence_length = 200
+train_fn.local_batch_size = 128
+
+train_fn.main_module = "SASRec"
+train_fn.dropout_rate = 0.2
+train_fn.user_embedding_norm = "l2_norm"
+train_fn.num_epochs = 101
+train_fn.item_embedding_dim = 256
+
+sasrec_encoder.num_blocks = 4
+sasrec_encoder.num_heads = 4
+sasrec_encoder.ffn_dropout_rate = 0.2
+sasrec_encoder.ffn_hidden_dim = 256
+sasrec_encoder.ffn_activation_fn = "relu"
+
+train_fn.learning_rate = 1e-3
+train_fn.weight_decay = 0
+train_fn.num_warmup_steps = 0
+
+train_fn.top_k_method = "MIPSBruteForceTopK"
+train_fn.interaction_module_type = "DotProduct"
+
+train_fn.loss_module = "SampledSoftmaxLoss"
+train_fn.num_negatives = 128
+
+train_fn.sampling_strategy = "local"
+train_fn.temperature = 0.05
+train_fn.item_l2_norm = True
+train_fn.l2_norm_eps = 1e-6
+
+train_fn.enable_tf32 = True
+
+create_data_loader.prefetch_factor = 128
+create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-final.gin b/recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-final.gin
new file mode 100644
index 000000000..ac7a85350
--- /dev/null
+++ b/recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-final.gin
@@ -0,0 +1,42 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Run this as:
+# mkdir -p logs/ml-3b-l500/
+# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-final.gin --master_port=12345 2>&1 | tee logs/ml-3b-l500/hstu-sampled-softmax-n96-seqlen500-final.log
+
+train_fn.dataset_name = "ml-3b"
+train_fn.max_sequence_length = 500
+train_fn.local_batch_size = 96
+train_fn.eval_batch_size = 96
+
+train_fn.main_module = "HSTU"
+train_fn.dropout_rate = 0.2
+train_fn.user_embedding_norm = "l2_norm"
+train_fn.num_epochs = 100
+train_fn.item_embedding_dim = 256
+
+hstu_encoder.num_blocks = 4
+hstu_encoder.num_heads = 4
+hstu_encoder.dv = 64
+hstu_encoder.dqk = 64
+hstu_encoder.linear_dropout_rate = 0.2
+
+train_fn.learning_rate = 1e-3
+train_fn.weight_decay = 0
+train_fn.num_warmup_steps = 0
+
+train_fn.interaction_module_type = "DotProduct"
+train_fn.top_k_method = "MIPSBruteForceTopK"
+
+train_fn.loss_module = "SampledSoftmaxLoss"
+train_fn.num_negatives = 128
+
+train_fn.sampling_strategy = "local"
+train_fn.temperature = 0.05
+train_fn.item_l2_norm = True
+train_fn.l2_norm_eps = 1e-6
+
+train_fn.enable_tf32 = True
+
+create_data_loader.prefetch_factor = 128
+create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-large-final.gin b/recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-large-final.gin
new file mode 100644
index 000000000..a30ad3657
--- /dev/null
+++ b/recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-large-final.gin
@@ -0,0 +1,42 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Run this as:
+# mkdir -p logs/ml-3b-l500/
+# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-large-final.gin --master_port=12345 2>&1 | tee logs/ml-3b-l500/hstu-sampled-softmax-n96-seqlen500-large-final.log
+
+train_fn.dataset_name = "ml-3b"
+train_fn.max_sequence_length = 500
+train_fn.local_batch_size = 96
+train_fn.eval_batch_size = 96
+
+train_fn.main_module = "HSTU"
+train_fn.dropout_rate = 0.2
+train_fn.user_embedding_norm = "l2_norm"
+train_fn.num_epochs = 100
+train_fn.item_embedding_dim = 256
+
+hstu_encoder.num_blocks = 16
+hstu_encoder.num_heads = 8
+hstu_encoder.dv = 32
+hstu_encoder.dqk = 32
+hstu_encoder.linear_dropout_rate = 0.2
+
+train_fn.learning_rate = 1e-3
+train_fn.weight_decay = 0
+train_fn.num_warmup_steps = 0
+
+train_fn.interaction_module_type = "DotProduct"
+train_fn.top_k_method = "MIPSBruteForceTopK"
+
+train_fn.loss_module = "SampledSoftmaxLoss"
+train_fn.num_negatives = 128
+
+train_fn.sampling_strategy = "local"
+train_fn.temperature = 0.05
+train_fn.item_l2_norm = True
+train_fn.l2_norm_eps = 1e-6
+
+train_fn.enable_tf32 = True
+
+create_data_loader.prefetch_factor = 128
+create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/ml-3b/sasrec-sampled-softmax-n96-seqlen500-final.gin b/recommendation_v4/configs/ml-3b/sasrec-sampled-softmax-n96-seqlen500-final.gin
new file mode 100644
index 000000000..034c478b4
--- /dev/null
+++ b/recommendation_v4/configs/ml-3b/sasrec-sampled-softmax-n96-seqlen500-final.gin
@@ -0,0 +1,42 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Run this as:
+# mkdir -p logs/ml-3b-l500/
+# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-3b/sasrec-sampled-softmax-n96-seqlen500-final.gin --master_port=12345 2>&1 | tee logs/ml-3b-l500/sasrec-sampled-softmax-n96-seqlen500-final.log
+
+train_fn.dataset_name = "ml-3b"
+train_fn.max_sequence_length = 500
+train_fn.local_batch_size = 96
+train_fn.eval_batch_size = 96
+
+train_fn.main_module = "SASRec"
+train_fn.dropout_rate = 0.2
+train_fn.user_embedding_norm = "l2_norm"
+train_fn.num_epochs = 100
+train_fn.item_embedding_dim = 256
+
+sasrec_encoder.num_blocks = 4
+sasrec_encoder.num_heads = 4
+sasrec_encoder.ffn_dropout_rate = 0.2
+sasrec_encoder.ffn_hidden_dim = 256
+sasrec_encoder.ffn_activation_fn = "relu"
+
+train_fn.learning_rate = 1e-3
+train_fn.weight_decay = 0
+train_fn.num_warmup_steps = 0
+
+train_fn.top_k_method = "MIPSBruteForceTopK"
+train_fn.interaction_module_type = "DotProduct"
+
+train_fn.loss_module = "SampledSoftmaxLoss"
+train_fn.num_negatives = 128
+
+train_fn.sampling_strategy = "local"
+train_fn.temperature = 0.05
+train_fn.item_l2_norm = True
+train_fn.l2_norm_eps = 1e-6
+
+train_fn.enable_tf32 = True
+
+create_data_loader.prefetch_factor = 128
+create_data_loader.num_workers = 8
diff --git a/recommendation_v4/docs/multi_node_config.md b/recommendation_v4/docs/multi_node_config.md
new file mode 100644
index 000000000..52fdbbb69
--- /dev/null
+++ b/recommendation_v4/docs/multi_node_config.md
@@ -0,0 +1,230 @@
+# Multi-Node Training Enablement (yambda-5b, MI350X / Broadcom bnxt_re RoCE)
+
+How N-node (N×8-GPU) distributed training was brought up for the yambda-5b HSTU
+ranker on the `meta64` cv350 cluster, the hard problems solved, and **exactly
+which settings are cluster/fabric-specific** so this can be reused or re-tuned
+when the underlying network changes.
+
+Companion to [`perf_opt.md`](./perf_opt.md) and [`training_recipe.md`](./training_recipe.md).
+The single entry point is [`scripts/launch_slurm.sh`](../scripts/launch_slurm.sh);
+the Python side is `generative_recommenders/dlrm_v3/train/{train_ranker,utils}.py`.
+
+---
+
+## TL;DR
+
+- Multi-node works over **real RDMA** (RoCEv2 on 8× Broadcom bnxt_re HCAs).
+  2-node = `world_size=16`, clean `rc=0`, ~7.7–8.0k `global_sps` (≈1.28× of
+  1-node 6.2k; weak scaling, per-rank batch fixed).
+- The one non-obvious blocker was a **userspace RDMA provider ABI mismatch**
+  inside the container, fixed with an `LD_PRELOAD`/`LD_LIBRARY_PATH` **overlay**
+  of the host's matched `rdma-core` (no container lib surgery).
+- Everything is one script with three auto-detected phases
+  (`orchestrate` → `provision` → `worker`) plus small Python changes for global
+  ranks. All cluster-specific knobs are env-overridable and tagged
+  `[CLUSTER-SPECIFIC]` in the script.
+
+---
+
+## Architecture: one script, three phases
+
+`launch_slurm.sh` self-dispatches by context (`LAUNCH_SLURM_PHASE`, else
+auto-detected via `/.dockerenv`):
+
+| Phase | Runs on | Does |
+|---|---|---|
+| `orchestrate` | SLURM batch host | Resolve rendezvous (`MASTER_ADDR/PORT`), ensure container on every node (calls `provision`), then `docker exec` the `worker` phase on every node (one srun task per node). |
+| `provision` | each compute node (host) | Ensure the `yambda_primus` container is up (baked image if present, else base image + pip), stage the host RDMA overlay on NFS. |
+| `worker` | inside the container | Derive topology, set NCCL/RDMA env, apply the RDMA overlay, spawn this node's 8 GPU ranks via `train_ranker`. `NNODES==1` => legacy single-node path unchanged. |
+
+Why one script: multi-node enablement is then a single committable file. The
+worker phase is also what the streaming-e2e supervisor invokes directly
+(single-node, already inside the container), so the production path is unchanged.
+
+```
+sbatch --nodes=N launch_slurm.sh
+        │  (batch host: orchestrate)
+        ├─ srun: provision  ──> docker container up + RDMA overlay staged   (×N nodes)
+        └─ srun: docker exec launch_slurm.sh (worker)                       (×N nodes)
+                   │ in container: topology + NCCL/RDMA env + LD overlay
+                   └─ python train_ranker  ──> 8 local ranks  ──> RCCL rendezvous over RDMA
+```
+
+---
+
+## The hard problems (lessons learned)
+
+### 1. RDMA provider ABI mismatch — the core blocker
+
+**Symptom:** multi-node RCCL died at init with
+`ibv_create_qp ... Bad address`.
+
+**Root cause:** the container image (`rocm/primus:v26.3`) ships an **older**
+userspace `rdma-core` (v34, `libbnxt_re-rdmav34.so`) than the **host kernel**
+bnxt_re driver's uapi (host `rdma-core` v61 / `libbnxt_re-rdmav59.so`). The v34
+provider enumerates the HCAs and creates *shallow* QPs fine, but **faults when
+creating a deep send queue** — RCCL uses `max_send_wr=256`. Verified with a
+parameterized verbs probe: v34 `create_qp` is OK at depth ≤16 and faults at ≥64;
+the host v59 provider works at **every** depth. So it is purely the **userspace
+provider**, not the kernel or the fabric (a 2-node RoCEv2 RDMA-write test passes
+on the stock stack, and bare-metal RCCL benchmarks run fine with the host libs).
+
+**Fix (no container surgery):** the `provision` phase stages the host's matched
+`rdma-core` on shared NFS (`$OVERLAY`):
+
+```
+$OVERLAY/lib/libibverbs.so.1          # host libibverbs v61
+$OVERLAY/lib/libibverbs.so -> .so.1   # UNVERSIONED symlink (critical, see below)
+$OVERLAY/lib/libnl-3.so.200, libnl-route-3.so.200
+$OVERLAY/lib/libibverbs/<providers>.so   # incl. libbnxt_re-rdmav59.so
+```
+
+The `worker` phase makes RCCL load it at runtime:
+
+```bash
+export LD_LIBRARY_PATH="$OVERLAY/lib:$OVERLAY/lib/libibverbs:$LD_LIBRARY_PATH"
+export LD_PRELOAD="$OVERLAY/lib/libibverbs.so.1:$LD_PRELOAD"
+```
+
+We do **not** modify the container's system libs — only this process tree's
+`LD_*`. Single-node and other users keep the stock stack.
+
+### 2. The UNVERSIONED `libibverbs.so` symlink is mandatory
+
+An earlier overlay attempt set `LD_LIBRARY_PATH` but still failed with
+`Bad address`. Reason: at `import torch` the ROCm stack pulls in the
+**unversioned** soname `libibverbs.so` (not `libibverbs.so.1`). If the overlay
+only has `libibverbs.so.1`, that unversioned lookup misses the overlay, falls
+through to the **container's** old lib, which then occupies the `libibverbs.so.1`
+slot — so RCCL's later `dlopen("libibverbs.so.1")` binds the v34 stack and
+`create_qp(256)` faults again. The overlay **must** expose
+`libibverbs.so -> libibverbs.so.1`. With it (verified via `/proc/<pid>/maps`),
+the process maps **only** the host lib. `LD_PRELOAD` is belt-and-braces so the
+host lib claims the soname slot first.
+
+### 3. Two network planes — pin TCP bootstrap, RDMA for data
+
+The container is `--network=host`, so RCCL sees **all** host interfaces and, left
+to auto-detect, picks the wrong one. These nodes expose:
+- `benic1p1..benic8p1` — per-GPU point-to-point RoCE links on `192.168.{1..8}.x/31`.
+  These are **not node-routable** for plain TCP; the very first bring-up **hung**
+  in `init_process_group` because RCCL tried the TCP bootstrap over a
+  non-routable `192.168.x` backend addr.
+- `fenic0` — the routable front-end (`10.190.x`).
+
+So we split the planes explicitly:
+- `NCCL_SOCKET_IFNAME=fenic0` → TCP bootstrap/rendezvous over the routable NIC.
+- `NCCL_IB_HCA=bnxt_re0..7` → RDMA **data** over the 8 RoCE HCAs (the RoCEv2
+  fabric *is* reachable rail-to-rail at the RDMA layer even though plain IP is not).
+
+### 4. Minimal proven bnxt_re NCCL config
+
+The minimal set proven on these nodes (matches cmcknigh's bare-metal RCCL
+benchmarks): `NCCL_IB_GID_INDEX=3` (RoCEv2 IPv4 GID), `NCCL_IB_TC=104` (RoCE
+lossless / PFC traffic class). **Do not** add the heavy
+`QPS_PER_CONNECTION / ECE / DMABUF` block — that belongs to a different
+(ionic AINIC) fabric and is counterproductive on bnxt_re. GPU-Direct RDMA
+(`NCCL_NET_GDR_LEVEL`) is left **off**: it needs DMABUF/peermem, unavailable
+in-container here, so RCCL stages through host memory (still real RDMA).
+
+### 5. Rendezvous must be resolved on the host
+
+The container image has **no SLURM client** (`scontrol` absent). So the
+`orchestrate` phase resolves `MASTER_ADDR` (first host of the allocation) and a
+deterministic `MASTER_PORT` (`20000 + job_id % 20000`, same on all nodes) **on
+the host** and forwards them into the container via `docker exec -e`.
+
+### 6. Global rank derivation (Python)
+
+`mp.start_processes` hands out a node-local `local_rank` (0..7). Every downstream
+consumer (data sharding, checkpoint I/O, metrics) needs the **global** rank:
+
+```python
+rank = node_rank * gpus_per_node + local_rank   # train_ranker._main_func
+device = torch.device(f"cuda:{local_rank}")      # CUDA device stays node-local
+```
+
+Also: `make_optimizer_and_shard(local_world_size=gpus_per_node)` so the TorchRec
+planner respects the intra-node GPU count, and `MetricsLogger(world_size=...)`
+gets the live world size (the gin default of 8 would mis-normalize multi-node).
+`NNODES==1` makes `rank == local_rank` — identical to the old single-node path.
+
+### 7. `$0` is the staged `slurm_script`, not the repo path
+
+For an sbatch batch script, `$0` =
+`/var/spool/slurmd/job<ID>/slurm_script` (node-local), so deriving the script /
+repo path from `$0` gives a path that **doesn't exist on other nodes** (`bash
+$SELF` → "No such file", and the worker's `cd $REPO` → exit 127). The
+`orchestrate` phase instead resolves the real shared-NFS path from SLURM:
+
+```bash
+SCRIPT_PATH=$(scontrol show job "$SLURM_JOB_ID" | grep -oP 'Command=\K\S+')
+# fallbacks: $SLURM_SUBMIT_DIR/scripts/launch_slurm.sh, then $SELF
+REPO=$(cd "$(dirname "$SCRIPT_PATH")/.." && pwd)
+```
+
+### 8. `srun ... bash -c "…"` host-vs-remote expansion
+
+Inside the double-quoted srun command string, **plain `$VAR` expands now on the
+batch host** (values computed in orchestrate: `$MASTER_ADDR`, `$SCRIPT_PATH`, …)
+while **`\$VAR` is deferred to each compute node** (`\$SLURM_NODEID`,
+`\$(hostname)`) where the per-node SLURM env lives. Mixing these up sends every
+rank the wrong node id.
+
+### 9. `memlock` ulimit for QP registration
+
+`docker run --ulimit memlock=-1:-1` is **required** — RDMA QP memory
+registration needs unlimited locked memory. A container started with the default
+8 MB memlock fails QP creation regardless of the overlay.
+
+### 10. Provisioning & the image-bake caveat
+
+Fresh nodes otherwise re-download a **6.1 GB** ROCm torch wheel + pip + build
+torchrec-from-git every time. The script supports a pre-baked image
+(`docker commit` → NFS tar → `docker load` offline). **Caveat:** the committed
+image is **~127 GB** (ROCm base is huge), so the full-image NFS tar is impractical
+(loading it can be slower than re-downloading 6 GB). For true download-avoidance
+prefer a **local pip wheelhouse** (`pip install --no-index --find-links` from
+~8 GB of NFS wheels) or a **local registry** (ships only the ~35 GB delta layer).
+The bake hook is left in (`BAKE_IMAGE=1`) but defaults off; provisioning falls
+back to base-image + pip.
+
+### Debunked theory (do not re-introduce)
+
+An earlier claim that the container's rdma-core was "too old → 0 devices /
+Bad address" and needed an **in-place lib copy** was a red herring: the "0
+devices" came from a *broken in-place copy* of the host EL9 libs (mixing v34
+tooling that links `IBVERBS_PRIVATE_34` with host v61 libs breaks symbol-version
+lookup). The stock container enumerates all 8 HCAs fine. The real issue is only
+the deep-QP create path; the fix is the **LD overlay**, never in-place surgery.
+
+---
+
+## Cluster-specific settings — change these when the fabric/hardware changes
+
+All are env-overridable and tagged `[CLUSTER-SPECIFIC]` in `launch_slurm.sh`
+(`grep '\[CLUSTER-SPECIFIC\]' scripts/launch_slurm.sh`).
+
+| Setting | Default (meta64) | What it is | How to find the right value |
+|---|---|---|---|
+| `#SBATCH --partition` | `meta64` | scheduler partition | `sinfo` |
+| bind mounts + default paths | `/home/chcai`, `/apps/chcai` | repo + scratch, **must be shared/NFS on all nodes** | `df -h`, cluster docs |
+| `IMAGE` | `rocm/primus:v26.3` | base container (GPU arch + ROCm version) | vendor image registry |
+| docker `--device` | `/dev/kfd /dev/dri` (AMD) | GPU passthrough | NVIDIA: `--gpus all` / nvidia runtime |
+| `--ulimit memlock` | `-1` | locked mem for RDMA QP | keep `-1` for any RDMA fabric |
+| `TORCH_IDX` / torch,vision,audio | `rocm7.2`, `2.12.0+rocm7.2` … | ROCm-version'd wheels | `download.pytorch.org/whl/<rocmX>` |
+| `FBGEMM_WHL` | gfx950 wheel on NFS | GPU-arch fbgemm | build/stage per arch |
+| `NCCL_SOCKET_IFNAME` | `fenic0` | **routable** host NIC for TCP bootstrap | `ip -br addr` (pick the routable one; NOT the per-GPU RDMA NICs) |
+| `NCCL_IB_HCA` | `bnxt_re0..7` | RDMA HCA device names | `ibv_devices` (vendor: `mlx5_*`, `ionic_*`, …) |
+| `NCCL_IB_GID_INDEX` | `3` | RoCEv2 IPv4 GID index | `show_gids` (v1/v2 & IPv4/IPv6 differ per port) |
+| `NCCL_IB_TC` | `104` | RoCE lossless / PFC traffic class | fabric/switch admin |
+| `RDMA_OVERLAY` (+ provider .so) | `/apps/chcai/rdma_host_el9_new` | host rdma-core overlay | only needed if container rdma-core < host kernel uapi; else set `RDMA_OVERLAY=` to disable. Stage the host's matching `/usr/lib64/libibverbs/<provider>.so` |
+
+**Different NIC vendor (e.g. Mellanox `mlx5`)** typically means: change
+`NCCL_IB_HCA` names, re-check `NCCL_IB_GID_INDEX`/`NCCL_IB_TC`, and the RDMA
+overlay is often **unnecessary** (Mellanox userspace in the image usually matches
+the host) — set `RDMA_OVERLAY=` to skip it.
+
+**Emergency fallback:** `NCCL_NET_TRANSPORT=socket` disables IB and runs
+allreduce over TCP (`fenic0`). Functional but ~100–200× slower; use only to
+isolate a fabric problem.
diff --git a/recommendation_v4/docs/perf_opt.md b/recommendation_v4/docs/perf_opt.md
new file mode 100644
index 000000000..7799848ad
--- /dev/null
+++ b/recommendation_v4/docs/perf_opt.md
@@ -0,0 +1,73 @@
+# Performance Optimizations — MI350X HSTU / OneTrans (yambda-5b, bs=1024, TRITON)
+
+Performance work for the 8× MI350X HSTU ranker on `yambda-5b` at `batch_size=1024`
+with the **TRITON** HSTU kernel and bf16 training. Companion to
+[`training_recipe.md`](./training_recipe.md) (environment + reproduction).
+
+Throughput numbers are global samples/sec across 8 GPUs (`global_sps`), measured
+at steady state (instantaneous, computed from consecutive logged steps).
+
+---
+
+## LN-dropout: multi-row, separated-RNG path on MI350
+
+### What
+
+`_ln_mul_dropout_*` has two kernel variants:
+
+- **legacy** — single program per row, RNG fused inline (`_ln_mul_dropout_fwd`).
+- **separated-RNG** — multiple rows per program, dropout mask precomputed once
+  and reused by the backward (`_ln_mul_dropout_fwd_rng` /
+  `_ln_mul_dropout_bwd_dx_du_rng`).
+
+The separated path was previously gated to Blackwell only (`is_sm100_plus()`).
+MI350X (`gfx950`) benefits from the same structure, so the gate now also enables
+it on MI350.
+
+### Where
+
+| file | change |
+|---|---|
+| `ops/utils.py` | `is_amd_mi350()` (gfx950 detect) + `use_separated_rng_ln_mul_dropout()` gate |
+| `ops/triton/triton_hstu_linear.py` | dispatch LN-dropout fwd to the separated-RNG path when the gate is true |
+
+```python
+# ops/utils.py
+def use_separated_rng_ln_mul_dropout() -> bool:
+    return is_sm100_plus() or is_amd_mi350()
+```
+
+### Perf
+
+**+5.6% end-to-end → 14,222 global sps** (separated-RNG vs legacy fused, identical
+config, full boost clocks — see the caveat below).
+
+---
+
+## Caveat — GPU clock lock can mask all perf changes
+
+A node-level GPU clock lock will silently invalidate any benchmark on this
+machine, so check it before trusting numbers.
+
+During this work all 8 GPUs were stuck in **`perf_determinism`** performance
+level at **sclk 1093 MHz** (DPM level 1) while the real max is **2200 MHz**
+(level 2) — despite 100% utilization, ~370 W of power headroom (629 / 1000 W),
+and low temps (~50 °C). This was **not** thermal/power throttling; it was
+leftover node state from a prior job.
+
+Effect: a **uniform ~1.87× slowdown of every Triton compute kernel**
+(`2200 / 1093 ≈ 2.0×`), including kernels unrelated to any code change. It made
+the LN-dropout fix above look like a regression until the clock state was found.
+
+### Detect + fix
+
+```bash
+rocm-smi --showperflevel          # expect "auto", not perf_determinism/manual/low
+rocm-smi -d 0 --showclocks        # expect sclk ~2000+ MHz under load
+rocm-smi --setperflevel auto      # restore boost
+```
+
+`scripts/launch_slurm.sh` (worker phase) now logs the perf level + a live `sclk` sample on
+every launch, auto-restores `auto` if it finds a `perf_determinism`/`manual`/`low`
+lock, and warns (to reset from the host) if it lacks permission inside the
+container. **Always sanity-check `sclk ≈ 2000+ MHz` before trusting a benchmark.**
diff --git a/recommendation_v4/docs/training_recipe.md b/recommendation_v4/docs/training_recipe.md
new file mode 100644
index 000000000..cf31a9fff
--- /dev/null
+++ b/recommendation_v4/docs/training_recipe.md
@@ -0,0 +1,203 @@
+# Training Recipe
+
+Reproducible environment + configuration for training HSTU / DLRM-v3 on the
+`yambda-5b` dataset.
+
+---
+
+## MI350X
+
+Single-node, 8× AMD **Instinct MI350X** (`gfx950`, ~288 GiB HBM3e each), HSTU
+ranker on `yambda-5b` with the **TRITON** HSTU kernel and **bf16**
+mixed-precision training.
+
+### Hardware / host
+
+| item | value |
+|---|---|
+| GPUs | 8× AMD Instinct MI350X (`gfx950`, ROCm 7.2.1) |
+| Host CPU | AMD EPYC 9655 96-Core (192 cores × 2 threads) |
+
+### Container image
+
+```
+rocm/primus:v26.3
+```
+
+### Dependency versions
+
+Aligned with the B200 path: same torch major.minor, same torchrec commit,
+same fbgemm SHA. The image's native torch / torchvision / torchaudio /
+torchrec / fbgemm_gpu are all replaced; only the image's triton stays.
+
+| package | version | install |
+|---|---|---|
+| **torch** | `2.12.0+rocm7.2` | `pip install --upgrade --no-deps --index-url https://download.pytorch.org/whl/rocm7.2 torch==2.12.0+rocm7.2` |
+| **torchvision** | `0.27.0+rocm7.2` | `pip install --upgrade --no-deps --index-url https://download.pytorch.org/whl/rocm7.2 torchvision` — ABI must match torch 2.12 |
+| **torchaudio** | `2.11.0+rocm7.2` | `pip install --upgrade --no-deps --index-url https://download.pytorch.org/whl/rocm7.2 torchaudio` — ABI must match torch 2.12 |
+| **triton** | `3.6.0` | image native, unchanged |
+| **fbgemm_gpu** | `fbgemm_gpu_nightly_rocm-2026.6.2` (built from FBGEMM commit `10b77573`, same SHA as the B200 path) for `gfx950` | rebuild from source against the replaced torch. Build command: `python setup.py -j 32 bdist_wheel --build-target=default --build-variant=rocm -DHIP_ROOT_DIR=/opt/rocm -DAMDGPU_TARGETS=gfx950` |
+| **torchrec** | `1.7.0a0+bf55480` (git tag `v2026.06.01.00`) | `pip install --force-reinstall --no-deps "git+https://github.com/pytorch/torchrec.git@v2026.06.01.00"` |
+| **polars-u64-idx** | `1.33.1` | 64-bit row index — `yambda-5b` has > 4.29 B rows. Installed from a pre-staged local tarball by `scripts/launch_smoke_8gpu.sh` |
+
+### Training configuration
+
+From `generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin`:
+
+| parameter | value | gin binding |
+|---|---|---|
+| num_workers (dataloader) | 4 | `make_train_test_dataloaders.num_workers` |
+| prefetch_factor | 8 | `make_train_test_dataloaders.prefetch_factor` |
+| num_blocks | 1 | `make_train_test_dataloaders.num_blocks` |
+| train_split_percentage | 0.90 | `make_train_test_dataloaders.train_split_percentage` |
+| history_length (per-sample UIH budget) | 2039 | `get_dataset.history_length` |
+| max_seq_len (attention budget) | 2048 | `get_hstu_configs.max_seq_len` |
+| bf16 training | True | `make_model.bf16_training` |
+| HBM cap (per GPU) | 260 GiB | `make_optimizer_and_shard.hbm_cap_gb` (env `HBM_CAP_GB`) |
+| **triton autotune pinning** | **False (pinned)** | `apply_env_bootstrap.TRITON_FULL_AUTOTUNE` |
+| dense optimizer | Adam, lr 1e-3, betas (0.95, 0.999), eps 1e-8 | `dense_optimizer_factory_and_class.*` |
+| sparse optimizer | RowWiseAdagrad, lr 1e-3, betas (0.95, 0.999), eps 1e-8 | `sparse_optimizer_factory_and_class.*` |
+| world_size | 8 | `MetricsLogger.world_size` |
+
+Effective global batch = `batch_size × world_size = 32 × 8 = 256` samples/step.
+
+### Environment variables
+
+| var | value | purpose |
+|---|---|---|
+| `HSTU_HAMMER_KERNEL` | `TRITON` | fast HSTU kernel (vs `PYTORCH` fallback) |
+| `DLRM_DATA_PATH` | dataset root | overrides gin default `/apps/chcai/dlrm_data` |
+| `HBM_CAP_GB` | (optional) | embedding planner HBM budget per GPU |
+| `RUN_NAME` | run id | results dir → `results/<RUN_NAME>/` |
+| `PYTORCH_CUDA_ALLOC_CONF` | `expandable_segments:True` | allocator headroom |
+| `HIP_VISIBLE_DEVICES` / `CUDA_VISIBLE_DEVICES` | `0,1,2,3,4,5,6,7` | rank visibility |
+
+`TRITON_FULL_AUTOTUNE` is set automatically by the gin-driven bootstrap
+(`generative_recommenders.dlrm_v3.train._env_bootstrap.apply_env_bootstrap`),
+which runs in `train_ranker._main_func` BEFORE the triton kernel modules
+import — so the gin file is the source of truth.
+
+### Measured performance
+
+| variant | steady-state ms/step | global sps | epoch ETA (3.23B anchors) |
+|---|---|---|---|
+| nightly + fp32 + PYTORCH attn (baseline) | ~190 | ~1340 | ~28 d |
+| nightly + bf16 + TRITON attn | ~93 | ~2787 | ~13.4 d |
+| primus + bf16 + TRITON attn | ~67.5 | ~3793 | ~9.9 d |
+| primus + fbgemm HEAD + bf16 + TRITON, autotune drift | ~53 fast / ~70 slow | 3700–4860 | 7.7–10.2 d |
+| **primus + fbgemm HEAD + bf16 + TRITON + pinning (default)** | **~52** | **~4970** | **~7.6 d** |
+
+The "pinning" line is the deterministic per-cold-start equilibrium —
+three layer-norm / jagged triton kernels have two stable autotune winners
+and the pin forces the fast one every run.
+
+### Known pitfalls
+
+- The image ships `fbgemm_gpu==2026.5.14`. The wheel built from FBGEMM HEAD
+  (`2026.6.1`) is required for the 70 → 52 ms step. Build inside the
+  container so the wheel links against the image's native torch.
+- Stock `polars` silently overflows on `yambda-5b` (> 4.29 B rows); always
+  use `polars-u64-idx`.
+- When changing shape (batch size, history length), GPU, or triton/torch
+  version, flip `apply_env_bootstrap.TRITON_FULL_AUTOTUNE = True` and run
+  with `TRITON_PRINT_AUTOTUNING=1` to re-capture winners, then update the
+  pinned configs at the `pinned_or_full(...)` call sites in
+  `generative_recommenders/ops/triton/`.
+- Do not run with bf16 on the `PYTORCH` HSTU attention backend at our
+  sequence length — `pt_hstu_attention`'s QK einsum backward overflows in
+  bf16 at N > 1k and produces NaN at step 1. bf16 is only safe with TRITON.
+
+---
+
+## B200
+
+Single-node, 8× NVIDIA **B200** (Blackwell, `sm_100`, ~183 GiB HBM each), HSTU
+ranker on `yambda-5b` with the **TRITON** HSTU kernel and **bf16** mixed-precision
+training.
+
+### Hardware / host
+
+| item | value |
+|---|---|
+| GPUs | 8× NVIDIA B200 (`sm_100`, compute capability 10.0) |
+| Host driver | 580.159.03 (reports CUDA 13.0) |
+| Forward-compat userspace driver | `libcuda.so.595.58.03` (CUDA 13.2.1; engaged automatically by the NGC image) |
+
+### Container image
+
+```
+nvcr.io/nvidia/pytorch:26.04-py3
+```
+
+Digest: `sha256:192d749b4d773610ec9e01c0443a9df545d196c412b7b8fd33bfa3da362a49e7`
+
+The image's native PyTorch is kept as-is and must not be reinstalled (so CUPTI
+stays matched to the driver and `sm_100` support is preserved).
+
+`nvcr.io/nvidia/pytorch:26.01-py3` (torch `2.10.0a0` / CUDA 13.1, digest
+`sha256:38ed2ecb2c16d10677006d73fb0a150855d6ec81db8fc66e800b5ae92741007e`) is
+also validated and performance-equivalent — rebuild `fbgemm_gpu` against
+whichever image's torch you run.
+
+### Dependency versions
+
+| package | version | notes |
+|---|---|---|
+| **torch** | `2.12.0a0+0291f960b6.nv26.04.48445190` (CUDA 13.2) | native to the image; not reinstalled |
+| **triton** | `3.6.0` | native to the image; provides `triton.language.make_tensor_descriptor` (required by the TRITON HSTU path) |
+| **fbgemm_gpu** | FBGEMM commit `10b775730212923f65f7b78f79b6a01d80cf3c29` (2026-06-01 `main`, CUDA 13.2, `sm_100`) | built from source against the native torch; public wheels are ABI-incompatible with the NGC torch. The built wheel is named `fbgemm_gpu_nightly-2026.6.1` — that version is the build date, not the source date, so always identify the build by the commit above. Build command: `TORCH_CUDA_ARCH_LIST=10.0 python setup.py bdist_wheel --build-target default --build-variant cuda --package_channel nightly --nvml_lib_path /usr/lib/x86_64-linux-gnu/libnvidia-ml.so` (~55 min — the `sm_100` TBE-forward kernels dominate via `ptxas`) |
+| **torchrec** | `1.7.0.dev20260601+cu130` (nightly, tested) | installed `--no-deps` from `https://download.pytorch.org/whl/nightly/cu130`. Perf-neutral vs stable `1.4.0`; use `1.4.0` (latest stable) if you prefer a non-pre-release |
+| **polars-u64-idx** | `1.33.1` | 64-bit row index — `yambda-5b` has > 4.29 B rows (overflows stock polars' 32-bit index) |
+| CUPTI (for `torch.profiler`) | 13.2 (native) | matches the driver; the `+cu128` stack's CUPTI 12.8 fails on B200 (`CUPTI_ERROR_INVALID_DEVICE`) |
+
+Additional Python deps:
+`xxhash`, `gin-config`, `absl-py`, `pandas`, `tensorboard`, `pyarrow`, `pyyaml`,
+`tqdm`, `psutil`, `torchmetrics==1.0.3`, `tensordict`, `pyre-extensions`,
+`iopath`, `typing-inspect`.
+
+### Training configuration
+
+From `generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin`:
+
+| parameter | value | gin binding |
+|---|---|---|
+| batch_size (train) | 32 | `make_train_test_dataloaders.batch_size` |
+| eval_batch_size | 32 | `make_train_test_dataloaders.eval_batch_size` |
+| num_workers (dataloader) | 4 | `make_train_test_dataloaders.num_workers` |
+| prefetch_factor | 8 | `make_train_test_dataloaders.prefetch_factor` |
+| num_blocks | 1 | `make_train_test_dataloaders.num_blocks` |
+| train_split_percentage | 0.90 | `make_train_test_dataloaders.train_split_percentage` |
+| history_length (per-sample UIH budget) | 2039 | `get_dataset.history_length` |
+| max_seq_len (attention budget) | 2048 | `get_hstu_configs.max_seq_len` |
+| bf16 training | True | `make_model.bf16_training` |
+| HBM cap (per GPU) | 150 GiB | `make_optimizer_and_shard.hbm_cap_gb` (env `HBM_CAP_GB`) |
+| **triton autotune pinning** | **True (full autotune)** | `apply_env_bootstrap.TRITON_FULL_AUTOTUNE` — the pinned configs are MI350X-specific, so B200 runs full autotune to find its own `sm_100` winners |
+| dense optimizer | Adam, lr 1e-3, betas (0.95, 0.999), eps 1e-8 | `dense_optimizer_factory_and_class.*` |
+| sparse optimizer | RowWiseAdagrad, lr 1e-3, betas (0.95, 0.999), eps 1e-8 | `sparse_optimizer_factory_and_class.*` |
+| world_size | 8 | `MetricsLogger.world_size` |
+
+Effective global batch = `batch_size × world_size = 32 × 8 = 256` samples/step.
+
+### Environment variables
+
+| var | value | purpose |
+|---|---|---|
+| `HSTU_HAMMER_KERNEL` | `TRITON` | fast HSTU kernel (vs `PYTORCH` fallback) |
+| `TORCH_CUDA_ARCH_LIST` | `10.0` | target `sm_100` for JIT / Triton compilation |
+| `DLRM_DATA_PATH` | dataset root | overrides gin default `/apps/chcai/dlrm_data` |
+| `HBM_CAP_GB` | `150` | embedding planner HBM budget per GPU |
+| `RUN_NAME` | run id | results dir → `results/<RUN_NAME>/` |
+| `PYTORCH_CUDA_ALLOC_CONF` | `expandable_segments:True` | allocator headroom |
+| `TRITON_CACHE_DIR` | cache path | persist compiled Triton kernels across runs |
+| `WORLD_SIZE` / `LOCAL_WORLD_SIZE` | `8` | mp.spawn rank count |
+
+### Known pitfalls
+
+- Never reinstall torch in this image — a cu12x wheel breaks CUPTI and may drop
+  `sm_100`.
+- The `+cu128` stack (`torch==2.7.1+cu128` + `fbgemm-gpu==1.2.0+cu128` +
+  `torchrec==1.2.0+cu128`) runs on B200 but cannot profile GPU activity (CUPTI
+  12.8 vs the 13.2 driver).
+- Stock `polars` silently overflows on `yambda-5b` (> 4.29 B rows); always use
+  `polars-u64-idx`.
+- `EmbeddingBoundsCheck ... Setting idx to zero` warnings are benign data clamps.
diff --git a/recommendation_v4/download_dataset.sh b/recommendation_v4/download_dataset.sh
new file mode 100755
index 000000000..d02f382dc
--- /dev/null
+++ b/recommendation_v4/download_dataset.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+# MLPerf Training reference script: download + preprocess the dataset.
+#
+# Downloads the Yambda dataset from HuggingFace (yandex/yambda) and runs the
+# preprocessing pipeline (event-type encoding, temporal GTS split, session
+# segmentation, item-popularity counts) into the on-disk layout that
+# DLRMv3YambdaDataset consumes. This is a thin wrapper over
+#   generative_recommenders.dlrm_v3.preprocess_public_data
+# so the full reference data pipeline lives in one place.
+#
+# Usage:
+#   DLRM_DATA_PATH=/path/to/dlrm_data ./download_dataset.sh
+#   DATASET=yambda-50m DLRM_DATA_PATH=/path/to/dlrm_data ./download_dataset.sh
+#
+# Env:
+#   DATASET         dataset variant (default: yambda-5b). One of
+#                   kuairand-1k | kuairand-27k | yambda-50m | yambda-500m | yambda-5b
+#   DLRM_DATA_PATH  destination data root (required).
+set -euo pipefail
+
+DATASET="${DATASET:-yambda-5b}"
+: "${DLRM_DATA_PATH:?Set DLRM_DATA_PATH to the destination data root}"
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "${REPO_ROOT}"
+
+echo "[download_dataset] dataset=${DATASET} data-path=${DLRM_DATA_PATH}"
+mkdir -p "${DLRM_DATA_PATH}"
+
+python3 -m generative_recommenders.dlrm_v3.preprocess_public_data \
+    --dataset "${DATASET}" \
+    --data-path "${DLRM_DATA_PATH}"
+
+echo "[download_dataset] done. Preprocessed layout under ${DLRM_DATA_PATH}:"
+echo "  raw/5b/multi_event.parquet"
+echo "  shared_metadata/{artist,album}_item_mapping.parquet, embeddings.parquet"
+echo "  processed_5b/{train_sessions,test_events,session_index}.parquet"
+echo "  processed_5b/item_popularity.npy, processed_5b/split_meta.json"
+echo "[download_dataset] verify integrity with: ./verify_dataset.sh"
diff --git a/recommendation_v4/generative_recommenders/README.md b/recommendation_v4/generative_recommenders/README.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/recommendation_v4/generative_recommenders/common.py b/recommendation_v4/generative_recommenders/common.py
new file mode 100644
index 000000000..2ff8edf80
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/common.py
@@ -0,0 +1,513 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import abc
+import copy
+import os
+from enum import Enum, unique
+from typing import Any, Callable, List, Optional, Tuple
+
+import torch
+
+# @manual=//triton:triton
+import triton
+from generative_recommenders.ops.utils import is_sm100_plus, is_sm90_plus
+from torch.fx._symbolic_trace import is_fx_tracing
+from torch.utils._python_dispatch import _get_current_dispatch_mode_stack
+
+# @manual=//triton:triton
+from triton.runtime.autotuner import Autotuner
+
+try:
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+except OSError:
+    pass
+
+try:
+    # @manual=//triton:triton
+    import triton.language.extra.tlx  # type: ignore
+
+    HAS_TLX = True
+except ImportError:
+    HAS_TLX = False
+
+try:
+    from generative_recommenders.fb.triton_cc.utils import triton_cc
+    from hammer.ops.triton.utils import triton_autotune
+    from hammer.utils import is_dev_mode, set_dev_mode, set_verbose_level
+except ImportError:
+    # pyre-ignore
+    def triton_cc(annotations):
+        # pyre-ignore
+        def decorator(fn):
+            return fn
+
+        return decorator
+
+    # pyre-ignore
+    def triton_autotune(
+        configs: List[triton.Config],
+        key: List[str],
+        # pyre-ignore
+        prune_configs_by=None,
+        # pyre-ignore
+        reset_to_zero=None,
+        # pyre-ignore
+        restore_value=None,
+        warmup: int = 25,
+        rep: int = 100,
+    ):
+        # pyre-ignore
+        def decorator(fn):
+            return Autotuner(
+                fn,
+                fn.arg_names,
+                configs,
+                key,
+                reset_to_zero,
+                restore_value,
+                pre_hook=None,
+                post_hook=None,
+                prune_configs_by=prune_configs_by,
+                warmup=warmup,
+                rep=rep,
+            )
+
+        return decorator
+
+    DEV_MODE: bool = False
+    VERBOSE_LEVEL: int = 0
+
+    def set_dev_mode(val: bool) -> None:
+        global DEV_MODE
+        DEV_MODE = val
+
+    def is_dev_mode() -> bool:
+        global DEV_MODE  # noqa: F824
+        return DEV_MODE
+
+    def set_verbose_level(level: int) -> None:
+        global VERBOSE_LEVEL
+        VERBOSE_LEVEL = level
+
+    def get_verbose_level() -> int:
+        global VERBOSE_LEVEL  # noqa: F824
+        return VERBOSE_LEVEL
+
+
+@unique
+class HammerKernel(Enum):
+    TRITON = "TRITON"
+    TLX = "TLX"
+    PYTORCH = "PYTORCH"
+    CUDA = "CUDA"
+    TRITON_CC = "TRITON_CC"
+    TRITON_INFERENCE = "TRITON_INFERENCE"
+    CUTEDSL = "CUTEDSL"
+
+
+class HammerModule(torch.nn.Module, abc.ABC):
+    _is_inference: bool = False
+    _use_triton_cc: bool = True
+    _training_dtype: torch.dtype = torch.float32
+    _hammer_kernel: Optional[HammerKernel] = None
+
+    def __init__(
+        self,
+        is_inference: bool,
+        training_dytpe: torch.dtype = torch.float32,
+        use_triton_cc: bool = _use_triton_cc,
+        hammer_kernel: Optional[HammerKernel] = None,
+    ) -> None:
+        super().__init__()
+        self._is_inference = is_inference
+        self._training_dtype = training_dytpe
+        self._hammer_kernel = hammer_kernel
+        self._use_triton_cc = use_triton_cc
+
+    def hammer_kernel(self) -> HammerKernel:
+        kernel = self._hammer_kernel
+        if kernel is not None:
+            return kernel
+        if self._is_inference and self._use_triton_cc:
+            return HammerKernel.TRITON_CC
+        else:
+            return HammerKernel.TRITON
+
+    # pyre-ignore[2]
+    def recursive_setattr(self, name: str, value: Any) -> None:
+        for _, module in self.named_modules():
+            if hasattr(module, name):
+                setattr(module, name, value)
+
+    def set_use_triton_cc(self, use_triton_cc: bool) -> None:
+        self._use_triton_cc = use_triton_cc
+        self.recursive_setattr("_use_triton_cc", use_triton_cc)
+
+    def set_is_inference(self, is_inference: bool) -> None:
+        self._is_inference = is_inference
+        self.recursive_setattr("_is_inference", is_inference)
+
+    def set_training_dtype(self, training_dtype: torch.dtype) -> None:
+        self._training_dtype = training_dtype
+        self.recursive_setattr("_training_dtype", training_dtype)
+
+    def set_hammer_kernel(self, hammer_kernel: HammerKernel) -> None:
+        self._hammer_kernel = hammer_kernel
+        self.recursive_setattr("_hammer_kernel", hammer_kernel)
+
+    @property
+    def is_inference(self) -> bool:
+        return self._is_inference
+
+    @property
+    def is_eval(self) -> bool:
+        return (not self._is_inference) and (not self.training)
+
+    @property
+    def is_train(self) -> bool:
+        return (not self._is_inference) and self.training
+
+
+def generate_sparse_seq_len(
+    size: int,
+    max_seq_len: int,
+    sparsity: float,
+    device: torch.device,
+) -> torch.Tensor:
+    if sparsity == 0.0:
+        return torch.zeros(size=(size,), device=device, dtype=torch.int)
+    elif sparsity == 1.0:
+        return torch.ones(size=(size,), device=device, dtype=torch.int) * max_seq_len
+    elif sparsity >= 0.5:
+        min_seq_len: int = int((2 * sparsity - 1.0) * max_seq_len)
+        return torch.randint(
+            low=min_seq_len,
+            high=max_seq_len,
+            size=(size,),
+            device=device,
+            dtype=torch.int,
+        )
+    else:
+        min_seq_len: int = 0
+        max_seq_len: int = int(2 * sparsity * max_seq_len)
+        return torch.randint(
+            low=min_seq_len,
+            high=max_seq_len,
+            size=(size,),
+            device=device,
+            dtype=torch.int,
+        )
+
+
+def apply_sampling(
+    lengths: torch.Tensor,
+    alpha: float,
+    max_seq_len: int,
+) -> torch.Tensor:
+    threshold = int(max_seq_len ** (alpha / 2))
+    no_sample_prob = (max_seq_len**alpha) / torch.pow(lengths, 2)
+    users_to_sample = torch.logical_and(
+        lengths > threshold,
+        torch.rand_like(no_sample_prob) < 1 - no_sample_prob,
+    )
+    lengths = torch.where(users_to_sample, threshold, lengths)
+    return lengths
+
+
+nv_gpu_unavailable: Tuple[bool, str] = (
+    not torch.cuda.is_available() or torch.cuda.device_count() == 0,
+    "CUDA is not available or no GPUs detected",
+)
+nv_gpu_available: bool = not nv_gpu_unavailable[0]
+
+
+amd_gpu_unavailable: Tuple[bool, str] = (
+    not torch.version.hip,
+    "AMD HIP not available or no GPUs detected",
+)
+amd_gpu_available: bool = not amd_gpu_unavailable[0]
+
+gpu_unavailable: Tuple[bool, str] = (
+    not nv_gpu_available and not amd_gpu_available,
+    "CUDA/HIP is not available or no GPUs detected",
+)
+
+gpu_available: bool = not gpu_unavailable[0]
+
+blackwell_tlx_unavailable: Tuple[bool, str] = (
+    not is_sm100_plus() or not HAS_TLX,
+    "Skip TLX and blackwell only tests",
+)
+
+tma_unavailable: Tuple[bool, str] = (
+    not is_sm90_plus(),  # noqa
+    "Skip TMA only tests",
+)
+
+
+def switch_to_contiguous_if_needed(x: torch.Tensor) -> torch.Tensor:
+    if torch.jit.is_scripting():
+        if x.stride(-1) == 1:
+            return x
+        return x.contiguous()
+    if torch.compiler.is_compiling():
+        # Tell Dynamo this data-dependent value is in the range (0, 10**9)
+        torch._check(x.size(0) > 0)
+        torch._check(x.size(0) < 10**9)
+    # FX cannot trace Python control flow over symbolic stride checks
+    # (`x.stride(-1) == 1`). For AOT-T lowering, conservatively emit the
+    # contiguous op instead of branching on a symbolic value.
+    if is_fx_tracing():
+        return x.contiguous()
+    if x.stride(-1) == 1:
+        return x
+    return x.contiguous()
+
+
+def cdiv(x: int, y: int) -> int:
+    return (x + y - 1) // y
+
+
+def backend_allow_tf32() -> bool:
+    return True
+
+
+BACKEND_ALLOW_TF32: bool = backend_allow_tf32()
+
+
+def next_power_of_2(n: int) -> int:
+    """Return the smallest power of 2 greater than or equal to n"""
+    n -= 1
+    n |= n >> 1
+    n |= n >> 2
+    n |= n >> 4
+    n |= n >> 8
+    n |= n >> 16
+    n |= n >> 32
+    n += 1
+    return n
+
+
+def _prev_power_of_2_bitwise(x: int) -> int:
+    """Return the largest power of 2 less than or equal to x."""
+    x |= x >> 1
+    x |= x >> 2
+    x |= x >> 4
+    x |= x >> 8
+    x |= x >> 16
+    x |= x >> 32
+    return (x >> 1) + 1
+
+
+@torch.fx.wrap
+def _prev_power_of_2_legacy(x: int) -> int:
+    if torch.compiler.is_compiling():
+        # Re-write to make Dynamo happy
+        x_tensor = torch.scalar_tensor(x, dtype=torch.int64)  # type: ignore[arg-type]
+        x_tensor_orig = x_tensor.clone()
+        out_val = next_power_of_2(int(x_tensor.item()))  # type: ignore[arg-type]
+        out = torch.scalar_tensor(out_val, dtype=torch.int64)
+        return int(torch.where(torch.lt(x_tensor_orig, out), out // 2, out).item())  # type: ignore[return-value]
+    else:
+        out = next_power_of_2(x)
+        return out // 2 if out > x else out
+
+
+prev_power_of_2: Callable[[int], int] = (
+    _prev_power_of_2_legacy
+    if os.environ.get("PREV_POWER_OF_2_IMPL", "legacy") == "legacy"
+    else _prev_power_of_2_bitwise
+)
+
+
+STATIC_MAX_SEQ_LENS: List[int] = []
+USE_RUNTIME_MAX_SEQ_LEN: bool = False
+
+
+def set_static_max_seq_lens(max_seq_lens: List[int]) -> None:
+    global STATIC_MAX_SEQ_LENS
+    STATIC_MAX_SEQ_LENS = copy.deepcopy(max_seq_lens)
+    STATIC_MAX_SEQ_LENS.sort()
+
+
+def set_use_runtime_max_seq_len(use_runtime_max_seq_len: bool) -> None:
+    global USE_RUNTIME_MAX_SEQ_LEN
+    USE_RUNTIME_MAX_SEQ_LEN = use_runtime_max_seq_len
+
+
+def autotune_max_seq_len(runtime_max_seq_len: int) -> int:
+    global USE_RUNTIME_MAX_SEQ_LEN  # noqa: F824
+
+    if USE_RUNTIME_MAX_SEQ_LEN:
+        return prev_power_of_2(runtime_max_seq_len)
+    else:
+        if STATIC_MAX_SEQ_LENS == []:
+            return 1
+        for max_len in STATIC_MAX_SEQ_LENS:
+            if max_len >= runtime_max_seq_len:
+                return max_len
+        return STATIC_MAX_SEQ_LENS[-1]
+
+
+def fine_grained_autotune_max_seq_len(runtime_max_seq_len: int) -> int:
+    global USE_RUNTIME_MAX_SEQ_LEN  # noqa: F824
+
+    if USE_RUNTIME_MAX_SEQ_LEN:
+        return _fine_grained_bucket_size(runtime_max_seq_len)
+    else:
+        if STATIC_MAX_SEQ_LENS == []:
+            return 1
+        for max_len in STATIC_MAX_SEQ_LENS:
+            if max_len >= runtime_max_seq_len:
+                return max_len
+        return STATIC_MAX_SEQ_LENS[-1]
+
+
+def _generate_fine_grained_buckets() -> List[int]:
+    buckets = [
+        1024,
+        2048,
+        4096,
+        8192,
+        12288,
+        16384,
+        24576,
+        32768,
+        40960,
+        49152,
+        65536,
+        81920,
+        98304,
+    ]
+    return buckets
+
+
+@torch.fx.wrap
+def _fine_grained_bucket_size(x: int) -> int:
+    if torch.compiler.is_compiling():
+        x_tensor = torch.scalar_tensor(x, dtype=torch.int64)
+        buckets = torch.tensor(_generate_fine_grained_buckets(), dtype=torch.int64)
+
+        mask = buckets >= x_tensor
+        valid_buckets = torch.where(
+            mask, buckets, torch.tensor(2**31 - 1, dtype=torch.int64)
+        )
+
+        result = torch.where(mask.any(), valid_buckets.min(), buckets[-1])
+
+        return int(result.item())
+    else:
+        buckets = _generate_fine_grained_buckets()
+
+        for bucket in buckets:
+            if x <= bucket:
+                return bucket
+
+        return buckets[-1]
+
+
+@torch.fx.wrap
+def fx_unwrap_optional_tensor(optional: Optional[torch.Tensor]) -> torch.Tensor:
+    assert optional is not None, "Expected optional to be non-None Tensor"
+    return optional
+
+
+@torch.fx.wrap
+def fx_arange(len: int, device: torch.device) -> torch.Tensor:
+    return torch.arange(len, device=device)
+
+
+@torch.fx.wrap
+def fx_infer_max_len(
+    lengths: torch.Tensor,
+) -> int:
+    # Do not call ".item()" to avoid unbacked symint problems for lowering
+    max_len = int(lengths.max())
+    if not torch.jit.is_scripting() and torch.compiler.is_compiling():
+        # Tell Dynamo this data-dependent value is in the range [0, 10**9)
+        torch._check_is_size(max_len)
+        torch._check(max_len < 10**9)
+        torch._check(max_len > 0)
+    return max_len
+
+
+@torch.fx.wrap
+def fx_mark_length_features(tensor: torch.Tensor) -> torch.Tensor:
+    return tensor
+
+
+@torch.fx.wrap
+def fx_torch_ones(
+    shape: List[int],
+    device: torch.device,
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    return torch.ones(shape, device=device, dtype=dtype)
+
+
+@torch.fx.wrap
+def fx_torch_zeros(shape: List[int], device: torch.device) -> torch.Tensor:
+    return torch.zeros(shape, device=device)
+
+
+def _is_in_dispatch_modes(mode_names: List[str]) -> bool:
+    modes = _get_current_dispatch_mode_stack()
+    return any(mode.__class__.__name__ in mode_names for mode in modes)
+
+
+def should_trigger_eager_impl() -> bool:
+    if torch.jit.is_scripting():
+        return True
+    if torch.compiler.is_compiling():
+        return False
+    return _is_in_dispatch_modes(["SplitDispatchMode", "FakeTensorMode"])
+
+
+@torch.fx.wrap
+def jagged_to_padded_dense(
+    values: torch.Tensor,
+    offsets: List[torch.Tensor],
+    max_lengths: List[int],
+    padding_value: float,
+) -> torch.Tensor:
+    return torch.ops.fbgemm.jagged_to_padded_dense(
+        values=values,
+        offsets=offsets,
+        max_lengths=max_lengths,
+        padding_value=padding_value,
+    )
+
+
+@torch.fx.wrap
+def dense_to_jagged(
+    dense: torch.Tensor,
+    x_offsets: List[torch.Tensor],
+) -> torch.Tensor:
+    return torch.ops.fbgemm.dense_to_jagged(
+        dense=dense,
+        x_offsets=x_offsets,
+    )[0]
+
+
+def init_mlp_weights_optional_bias(m: torch.nn.Module) -> None:
+    if isinstance(m, torch.nn.Linear):
+        torch.nn.init.xavier_uniform_(m.weight)
+        if m.bias is not None:
+            m.bias.data.fill_(0.0)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py b/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py
new file mode 100644
index 000000000..46cc10e2e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py
@@ -0,0 +1,680 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+"""
+Checkpoint utilities for saving and loading DLRMv3 model checkpoints.
+
+This module provides functions for saving and loading distributed model checkpoints,
+including both sparse (embedding) and dense (non-embedding) components.
+"""
+
+import gc
+import logging
+import os
+import random
+import shutil
+import time
+from datetime import datetime
+from typing import Any, Dict, Optional, Set, Tuple
+
+import gin
+import numpy as np
+import torch
+from generative_recommenders.dlrm_v3.utils import (
+    BinnedCumulativeAUC,
+    LifetimeAUCMetricComputation,
+    MetricsLogger,
+)
+from torch.distributed.checkpoint.stateful import Stateful
+from torch.optim.optimizer import Optimizer
+from torchrec.distributed.types import ShardedTensor
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+# Sentinel meaning "the saved window completed in full" — when the loop reads
+# this back it advances start_ts past the saved train_ts. Anything >=0 means the
+# saved checkpoint stopped mid-window after K batches; resume continues that
+# window at batch K.
+WINDOW_COMPLETE: int = -1
+
+# Filename (per-rank) holding the lifetime-AUC trailing buffers, mirroring the
+# rng_rank{rank}.pt pattern. The buffers are per-rank-local, so a single
+# rank-0 copy in non_sparse.ckpt would (wrongly) restore 1/world_size of the
+# true history to every rank — hence a dedicated per-rank artifact.
+METRICBUF_FILE_FMT: str = "metricbuf_rank{rank}.pt"
+
+
+def _metric_blob_state_dict(m: torch.nn.Module) -> Dict[str, Any]:
+    """State dict for the shared (rank-0) non_sparse.ckpt metric blob.
+
+    Both lifetime-AUC backends carry per-rank-local state that is persisted
+    authoritatively per-rank in ``metricbuf_rank{rank}.pt``; we must keep it out
+    of the shared blob so a rank's load doesn't inherit rank-0's counts:
+
+    - ``LifetimeAUCMetricComputation``: drop the explicitly-serialized trailing
+      buffer keys (the rest of the blob keys are the parent's persistent state).
+    - ``BinnedCumulativeAUC``: zero the histogram buffers (they are persistent so
+      the keys must remain for a strict load, but the values are neutralized).
+
+    All other metrics serialize normally. In both cases the per-rank file is
+    loaded afterward and is authoritative.
+    """
+    sd = m.state_dict()
+    if isinstance(m, LifetimeAUCMetricComputation):
+        prefix = LifetimeAUCMetricComputation._LIFETIME_KEY_PREFIX
+        sd = {k: v for k, v in sd.items() if not k.startswith(prefix)}
+    elif isinstance(m, BinnedCumulativeAUC):
+        sd = {
+            k: (torch.zeros_like(v) if torch.is_tensor(v) else v)
+            for k, v in sd.items()
+        }
+    return sd
+
+
+def _collect_perrank_metric_state(
+    metric_logger: "MetricsLogger",
+) -> Dict[str, Dict[str, Any]]:
+    """Map "<collection>|<mode>|<idx>" -> state_dict for every metric whose
+    cumulative state is per-rank-local and must be restored per-rank:
+
+    - lifetime-AUC instances (`LifetimeAUCMetricComputation` trailing buffer, or
+      `BinnedCumulativeAUC` histograms) in class_metrics train/eval. Covers the
+      train lifetime AUC and, in legacy single-set eval, the eval lifetime AUC,
+      under either configured backend.
+    - the ENTIRE cumulative eval set (`eval_cum`, both class + regression) used
+      by the streaming dual-set eval: the lifetime-AUC backend state plus the
+      persistent cumulative scalar sums of NE/Accuracy/GAUC/MSE/MAE.
+
+    Selected by structure/isinstance (not a hard index) since metric positions
+    depend on the configured tasks/mode.
+    """
+    out: Dict[str, Dict[str, Any]] = {}
+    for mode in ("train", "eval"):
+        for idx, m in enumerate(metric_logger.class_metrics.get(mode, [])):
+            if isinstance(m, (LifetimeAUCMetricComputation, BinnedCumulativeAUC)):
+                out[f"class_metrics|{mode}|{idx}"] = m.state_dict()
+    for coll in ("class_metrics", "regression_metrics"):
+        for idx, m in enumerate(getattr(metric_logger, coll).get("eval_cum", [])):
+            out[f"{coll}|eval_cum|{idx}"] = m.state_dict()
+    return out
+
+
+def _restore_perrank_metric_state(
+    metric_logger: "MetricsLogger", state: Dict[str, Dict[str, Any]]
+) -> None:
+    for key, sd in state.items():
+        coll, mode, idx_str = key.split("|")
+        getattr(metric_logger, coll)[mode][int(idx_str)].load_state_dict(sd)
+
+
+def _perrank_sample_counts(metric_logger: "MetricsLogger") -> Dict[str, int]:
+    out: Dict[str, int] = {}
+
+    def _count(m: torch.nn.Module) -> Optional[int]:
+        if isinstance(m, LifetimeAUCMetricComputation):
+            return m.lifetime_sample_count()
+        if isinstance(m, BinnedCumulativeAUC):
+            return m.cumulative_sample_count()
+        return None
+
+    for mode in ("train", "eval", "eval_cum"):
+        for idx, m in enumerate(metric_logger.class_metrics.get(mode, [])):
+            n = _count(m)
+            if n is not None:
+                out[f"class|{mode}|{idx}"] = n
+    return out
+
+
+class SparseState(Stateful):
+    """
+    Stateful wrapper for sparse (embedding) tensors in a model.
+
+    This class implements the Stateful interface for distributed checkpointing,
+    allowing sparse tensors to be saved and loaded separately from dense tensors.
+
+    Args:
+        model: The PyTorch model containing sparse tensors.
+        sparse_tensor_keys: Set of keys identifying sparse tensors in the model's state dict.
+    """
+
+    def __init__(self, model: torch.nn.Module, sparse_tensor_keys: Set[str]) -> None:
+        self.model = model
+        self.sparse_tensor_keys = sparse_tensor_keys
+
+    def state_dict(self) -> Dict[str, torch.Tensor]:
+        out_dict: Dict[str, torch.Tensor] = {}
+        is_sharded_tensor: Optional[bool] = None
+        for k, v in self.model.state_dict().items():
+            if k in self.sparse_tensor_keys:
+                if is_sharded_tensor is None:
+                    is_sharded_tensor = isinstance(v, ShardedTensor)
+                assert is_sharded_tensor == isinstance(v, ShardedTensor)
+                out_dict[k] = v
+        return out_dict
+
+    def load_state_dict(self, state_dict: Dict[str, torch.Tensor]) -> None:
+        incompatible_keys = self.model.load_state_dict(state_dict, strict=False)
+        assert not incompatible_keys.unexpected_keys
+
+
+def is_sparse_key(k: str, v: torch.Tensor) -> bool:
+    return isinstance(v, ShardedTensor) or "embedding_collection" in k
+
+
+def load_dense_state_dict(model: torch.nn.Module, state_dict: Dict[str, Any]) -> None:
+    own_state = model.state_dict()
+    own_state_dense_keys = {k for k, v in own_state.items() if not is_sparse_key(k, v)}
+    state_dict_dense_keys = {
+        k for k, v in state_dict.items() if not is_sparse_key(k, v)
+    }
+    assert own_state_dense_keys == state_dict_dense_keys, (
+        f"expects {own_state_dense_keys} but gets {state_dict_dense_keys}"
+    )
+    for name in state_dict_dense_keys:
+        param = state_dict[name]
+        if isinstance(param, torch.nn.Parameter):
+            # backwards compatibility for serialized parameters
+            param = param.data
+        own_state[name].copy_(param)
+
+
+def _rng_state(device: torch.device) -> Dict[str, Any]:
+    """Snapshot every RNG source bit-equal training depends on.
+
+    HSTU has stochastic dropout (input_dropout=0.2, linear_dropout_rate=0.1)
+    consuming the per-device CUDA RNG cycle each step. Without round-tripping
+    these, a resumed run draws different dropout masks and the resumed AUC
+    trajectory diverges from the uninterrupted run within a few steps.
+    """
+    return {
+        "cpu": torch.get_rng_state(),
+        "cuda": torch.cuda.get_rng_state(device),
+        "numpy": np.random.get_state(),
+        "random": random.getstate(),
+    }
+
+
+def _restore_rng_state(state: Dict[str, Any], device: torch.device) -> None:
+    torch.set_rng_state(state["cpu"])
+    torch.cuda.set_rng_state(state["cuda"], device)
+    np.random.set_state(state["numpy"])
+    random.setstate(state["random"])
+
+
+def _list_numeric_subdirs(base_path: str) -> list[str]:
+    """Return subdir names of `base_path` that look like an int, sorted ascending.
+
+    Filters out `*.tmp` (orphaned in-progress saves), `*.sparse/` and any other
+    non-numeric entries.
+    """
+    if not os.path.isdir(base_path):
+        return []
+    out: list[str] = []
+    for name in os.listdir(base_path):
+        if name.isdigit():
+            out.append(name)
+    return sorted(out, key=int)
+
+
+def _resolve_latest_subdir(path: str) -> str:
+    """Map a base ckpt dir → its highest-numbered numeric subdir.
+
+    Used so users can set `load_dmp_checkpoint.path = "<base>"` (or
+    `CKPT_PATH=<base>`) and automatically pick up the most recent save without
+    needing to know which step number to point at.     If `path` already names a leaf save (numeric basename) it's returned
+    unchanged. If the base dir has no numeric subdirs yet — the cold-start case
+    where ``CKPT_PATH`` is configured but nothing has been saved (e.g. the
+    interrupt phase of the resume test starts from a freshly-cleaned dir) — we
+    return ``""`` so ``load_*_checkpoint`` no-ops instead of asserting on a
+    missing ``sparse/.metadata``.
+    """
+    if not path:
+        return path
+    base = path.rstrip("/")
+    leaf = os.path.basename(base)
+    if leaf.isdigit():
+        return base  # already a leaf, caller knows what it wants
+    subs = _list_numeric_subdirs(base)
+    if not subs:
+        logger.info("No checkpoint subdirs under %s — cold start (no load).", base)
+        return ""  # nothing to load → load_*_checkpoint short-circuits
+    resolved = os.path.join(base, subs[-1])
+    logger.info("Auto-latest checkpoint: %s → %s", base, resolved)
+    return resolved
+
+
+def _prune_old_checkpoints(base_path: str, keep_last_n: int, just_saved_subdir: str) -> None:
+    """Delete numeric subdirs older than the keep_last_n most recent.
+
+    Defensive: never prune `just_saved_subdir` even if it would be evicted by
+    the keep_last_n window (shouldn't happen since we just wrote it, but
+    catches off-by-one bugs). Skipped entirely when keep_last_n<=0.
+    """
+    if keep_last_n <= 0:
+        return
+    subs = _list_numeric_subdirs(base_path)
+    if len(subs) <= keep_last_n:
+        return
+    to_prune = subs[:-keep_last_n]
+    for name in to_prune:
+        full = os.path.join(base_path, name)
+        if os.path.realpath(full) == os.path.realpath(just_saved_subdir):
+            continue
+        try:
+            shutil.rmtree(full)
+            logger.info("Pruned old checkpoint: %s", full)
+        except OSError as e:
+            logger.warning("Failed to prune %s: %s", full, e)
+
+
+def _cleanup_stale_tmps(base_path: str) -> None:
+    """Remove `*.tmp`/`*.old` subdirs left by a crashed prior save attempt.
+
+    `*.tmp` = an interrupted write; `*.old` = an interrupted atomic-overwrite
+    swap (see the promotion step in save_dmp_checkpoint). Both are non-numeric
+    so `_resolve_latest_subdir` already ignores them; this just reclaims disk.
+    """
+    if not os.path.isdir(base_path):
+        return
+    for name in os.listdir(base_path):
+        if name.endswith(".tmp") or name.endswith(".old"):
+            full = os.path.join(base_path, name)
+            try:
+                shutil.rmtree(full)
+                logger.warning("Removed stale checkpoint dir: %s", full)
+            except OSError as e:
+                logger.warning("Failed to remove stale dir %s: %s", full, e)
+
+
+@gin.configurable
+def save_dmp_checkpoint(
+    model: torch.nn.Module,
+    optimizer: Optimizer,
+    metric_logger: MetricsLogger,
+    rank: int,
+    batch_idx: int,
+    path: str = "",
+    keep_last_n: int = 1,
+    train_ts: Optional[int] = None,
+    batch_idx_in_window: int = WINDOW_COMPLETE,
+    device: Optional[torch.device] = None,
+    split_contract: Optional[Dict[str, Any]] = None,
+) -> None:
+    """
+    Save a distributed model checkpoint including sparse and dense components.
+
+    Writes into a per-rank-coordinated atomic layout:
+        <path>/<batch_idx>.tmp/   ← directory written into during save
+        <path>/<batch_idx>/       ← atomically renamed from .tmp on success
+
+    A crash mid-save leaves the `.tmp/` orphan, which `_cleanup_stale_tmps`
+    sweeps on the next save attempt and which `_resolve_latest_subdir` ignores
+    (non-numeric basename). The previous successful `<N-K>/` remains valid.
+
+    Args:
+        model: The model to checkpoint.
+        optimizer: The optimizer whose state should be saved.
+        metric_logger: The metrics logger containing training/eval metrics.
+        rank: The current process rank in distributed training.
+        batch_idx: Subdir name (for streaming we set this == train_ts so the
+            on-disk layout monotonically increases).
+        path: Base path for saving the checkpoint. If empty, no checkpoint is saved.
+        keep_last_n: Number of most-recent numeric subdirs to retain after a
+            successful save. Set 1 (default) for disk-bounded long runs;
+            <=0 disables pruning.
+        train_ts: For streaming-train-eval, the current train timestamp.
+            Stored in non_sparse.ckpt so resume knows which window to enter.
+        batch_idx_in_window: For streaming-train-eval, batches completed within
+            train_ts. WINDOW_COMPLETE (-1) means the window finished; resume
+            advances to train_ts+1. >=0 means crash happened mid-window; resume
+            re-enters train_ts at batch_idx_in_window.
+        device: CUDA device for the per-rank RNG snapshot. Required for
+            bit-equal trajectories across resume (HSTU dropout consumes the
+            per-device RNG cycle).
+    """
+    if path == "":
+        return
+    # Exclude checkpoint wall-time from the train step-time window so step_ms
+    # reports canonical compute latency; the duration is surfaced separately
+    # (window_ckpt_time_ms + the per-save log below). pause/resume are no-ops if
+    # metric_logger is None. Not wrapped in try/finally: a save that raises
+    # crashes the process (supervisor restarts fresh), so a dangling pause on
+    # the soon-dead logger is irrelevant.
+    _t_ckpt_start = time.perf_counter()
+    if metric_logger is not None:
+        metric_logger.pause_perf("ckpt")
+    base_path = path
+    # Atomic-save layout: write to .tmp, rename to final, prune older.
+    tmp_subdir = f"{base_path}/{batch_idx}.tmp"
+    final_subdir = f"{base_path}/{batch_idx}"
+
+    if rank == 0:
+        _cleanup_stale_tmps(base_path)
+        # Always (re)write into a fresh .tmp. An existing `final_subdir` with the
+        # same batch_idx (e.g. a later in-window save for the same train_ts, or a
+        # deterministic re-run at the same step) is overwritten atomically at the
+        # promotion step below — NOT skipped here. Skipping would desync ranks:
+        # the collective barrier/checkpoint.save calls below run on *every* rank,
+        # so a rank-0-only early return deadlocks ranks 1..N on the next barrier.
+        shutil.rmtree(tmp_subdir, ignore_errors=True)
+        os.makedirs(tmp_subdir, exist_ok=True)
+        os.makedirs(f"{tmp_subdir}/sparse/", exist_ok=True)
+    torch.distributed.barrier()
+    sparse_path = f"{tmp_subdir}/sparse/"
+    non_sparse_ckpt = f"{tmp_subdir}/non_sparse.ckpt"
+
+    sparse_tensor_keys = {
+        k for k, v in model.state_dict().items() if isinstance(v, ShardedTensor)
+    }
+    if rank == 0:
+        dense_state_dict = {
+            k: v
+            for k, v in model.state_dict().items()
+            if not isinstance(v, ShardedTensor)
+        }
+        class_metric_state_dict = {
+            "train": [
+                _metric_blob_state_dict(m)
+                for m in metric_logger.class_metrics["train"]
+            ],
+            "eval": [
+                _metric_blob_state_dict(m)
+                for m in metric_logger.class_metrics["eval"]
+            ],
+        }
+        regression_metric_state_dict = {
+            "train": [
+                m.state_dict() for m in metric_logger.regression_metrics["train"]
+            ],
+            "eval": [m.state_dict() for m in metric_logger.regression_metrics["eval"]],
+        }
+        torch.save(
+            {
+                "dense_dict": dense_state_dict,
+                "optimizer_dict": optimizer.state_dict(),
+                "class_metrics": class_metric_state_dict,
+                "reg_metrics": regression_metric_state_dict,
+                "global_step": metric_logger.global_step,
+                # MLPerf progress counter (global trained samples). Defaulted on
+                # load so pre-existing checkpoints restore as 0 and resume the
+                # count from there.
+                "cumulative_train_samples": metric_logger.cumulative_train_samples,
+                # MLPerf run-marker state: lets a resume relaunch continue the
+                # SAME run's event stream without re-emitting INIT_START/RUN_START.
+                "mlperf_run_started": metric_logger.mlperf_run_started,
+                "sparse_tensor_keys": sparse_tensor_keys,
+                # Streaming resume fields. Defaulted on load so old checkpoints
+                # (pre-streaming-resume) still load as a normal restart.
+                "train_ts": train_ts,
+                "batch_idx_in_window": batch_idx_in_window,
+                # Immutable train:eval split + resume-determinism contract
+                # (train_split_percentage, split_salt, eval holdout window,
+                # batch_size, world_size). Validated on resume so a relaunch
+                # cannot silently change the split (which would desync the skip
+                # offset and/or train on held-out eval users). None for
+                # non-holdout / legacy runs.
+                "split_contract": split_contract,
+            },
+            non_sparse_ckpt,
+        )
+
+    # Per-rank RNG snapshot. Written even on a single rank because dropout's
+    # randomness comes from the CUDA generator which differs across devices.
+    if device is not None:
+        rng_path = f"{tmp_subdir}/rng_rank{rank}.pt"
+        torch.save(_rng_state(device), rng_path)
+
+    # Per-rank cumulative metric state (lifetime-AUC buffers + cumulative-eval
+    # histograms/scalar sums). Written by EVERY rank (outside the rank-0 block)
+    # because this state is per-rank-local; restoring rank-0's copy to all ranks
+    # would lose (world_size-1)/world_size of the history.
+    if metric_logger is not None:
+        perrank_state = _collect_perrank_metric_state(metric_logger)
+        if perrank_state:
+            torch.save(
+                perrank_state,
+                f"{tmp_subdir}/{METRICBUF_FILE_FMT.format(rank=rank)}",
+            )
+            logger.info(
+                "checkpoint save: cumulative metric state rank=%d samples=%s",
+                rank,
+                _perrank_sample_counts(metric_logger),
+            )
+
+    torch.distributed.barrier()
+    sparse_dict = {"sparse_dict": SparseState(model, sparse_tensor_keys)}
+    torch.distributed.checkpoint.save(
+        sparse_dict,
+        storage_writer=torch.distributed.checkpoint.FileSystemWriter(sparse_path),
+    )
+    torch.distributed.barrier()
+    # Promote .tmp → final, then prune. Done on rank 0 only since the directory
+    # operations are global filesystem state.
+    if rank == 0:
+        if os.path.exists(final_subdir):
+            # POSIX rename() refuses to replace a non-empty directory, so we
+            # can't os.replace(tmp, final) directly. Swap the old snapshot aside
+            # (instant rename), move the new one into place, then delete the old.
+            # The `.old` name is non-numeric → ignored by _resolve_latest_subdir
+            # and swept by _cleanup_stale_tmps on the next save if we crash mid-swap.
+            old_aside = f"{final_subdir}.old"
+            shutil.rmtree(old_aside, ignore_errors=True)
+            os.replace(final_subdir, old_aside)
+            os.replace(tmp_subdir, final_subdir)
+            shutil.rmtree(old_aside, ignore_errors=True)
+        else:
+            os.replace(tmp_subdir, final_subdir)
+        _prune_old_checkpoints(base_path, keep_last_n, final_subdir)
+        logger.info(
+            "checkpoint successfully saved → %s (wall-time %.2fs)",
+            final_subdir,
+            time.perf_counter() - _t_ckpt_start,
+        )
+    torch.distributed.barrier()
+    if metric_logger is not None:
+        metric_logger.resume_perf("ckpt")
+
+
+@gin.configurable
+def load_sparse_checkpoint(
+    model: torch.nn.Module,
+    path: str = "",
+) -> None:
+    if path == "":
+        return
+    sparse_path = f"{path}/sparse/"
+
+    sparse_tensor_keys = {
+        k for k, v in model.state_dict().items() if is_sparse_key(k, v)
+    }
+    sparse_dict = {"sparse_dict": SparseState(model, sparse_tensor_keys)}
+    gc.collect()
+    torch.distributed.checkpoint.load(
+        sparse_dict,
+        storage_reader=torch.distributed.checkpoint.FileSystemReader(sparse_path),
+    )
+    gc.collect()
+    print("sparse checkpoint successfully loaded")
+
+
+@gin.configurable
+def load_nonsparse_checkpoint(
+    model: torch.nn.Module,
+    device: torch.device,
+    optimizer: Optional[Optimizer] = None,
+    metric_logger: Optional[MetricsLogger] = None,
+    path: str = "",
+    rank: int = 0,
+) -> Tuple[Optional[int], int, Optional[Dict[str, Any]]]:
+    """
+    Load non-sparse (dense) components from a checkpoint.
+
+    Loads dense model parameters, and optionally optimizer state and metrics.
+    Also restores per-rank RNG state if a matching `rng_rank{rank}.pt` is found
+    next to `non_sparse.ckpt`.
+
+    Returns:
+        (train_ts, batch_idx_in_window, split_contract) — the streaming resume
+        hint and the saved train:eval split contract (None for legacy / non-
+        holdout checkpoints). `(None, WINDOW_COMPLETE, None)` if not a streaming
+        checkpoint or no path supplied.
+    """
+    if path == "":
+        return None, WINDOW_COMPLETE, None
+    non_sparse_ckpt = f"{path}/non_sparse.ckpt"
+
+    # weights_only=False: these are our own trusted checkpoints, and they hold
+    # non-tensor objects (optimizer/metric state dicts, numpy-backed RNG state)
+    # that PyTorch>=2.6's weights_only=True default refuses to unpickle.
+    non_sparse_state_dict = torch.load(
+        non_sparse_ckpt, map_location=device, weights_only=False
+    )
+    load_dense_state_dict(model, non_sparse_state_dict["dense_dict"])
+    print("dense checkpoint successfully loaded")
+    if optimizer is not None:
+        optimizer.load_state_dict(non_sparse_state_dict["optimizer_dict"])
+        print("optimizer checkpoint successfully loaded")
+    if metric_logger is not None:
+        metric_logger.global_step = non_sparse_state_dict["global_step"]
+        # Defaulted for legacy checkpoints written before the counter existed.
+        metric_logger.cumulative_train_samples = non_sparse_state_dict.get(
+            "cumulative_train_samples", 0
+        )
+        # Defaulted False for legacy/cold checkpoints: a resume that loads a
+        # checkpoint where the run was already open continues without re-emitting
+        # the run markers.
+        metric_logger.mlperf_run_started = non_sparse_state_dict.get(
+            "mlperf_run_started", False
+        )
+        class_metric_state_dict = non_sparse_state_dict["class_metrics"]
+        regression_metric_state_dict = non_sparse_state_dict["reg_metrics"]
+        # Length-safe positional restore: if a checkpoint was written with a
+        # different metric set (e.g. tasks added/removed since), restore the
+        # overlap instead of crashing with an IndexError at run end.
+        def _restore_metric_list(
+            live: list, saved: Optional[list], label: str
+        ) -> None:
+            saved = saved or []
+            if len(live) != len(saved):
+                logger.warning(
+                    "metric count mismatch for %s: live=%d saved=%d; "
+                    "restoring overlapping %d",
+                    label,
+                    len(live),
+                    len(saved),
+                    min(len(live), len(saved)),
+                )
+            for i in range(min(len(live), len(saved))):
+                live[i].load_state_dict(saved[i])
+
+        _restore_metric_list(
+            metric_logger.class_metrics["train"],
+            class_metric_state_dict.get("train"),
+            "class/train",
+        )
+        _restore_metric_list(
+            metric_logger.class_metrics["eval"],
+            class_metric_state_dict.get("eval"),
+            "class/eval",
+        )
+        _restore_metric_list(
+            metric_logger.regression_metrics["train"],
+            regression_metric_state_dict.get("train"),
+            "reg/train",
+        )
+        _restore_metric_list(
+            metric_logger.regression_metrics["eval"],
+            regression_metric_state_dict.get("eval"),
+            "reg/eval",
+        )
+
+        # Per-rank cumulative metric state restore. This runs AFTER the generic
+        # load above so it is authoritative: the shared blob carries no lifetime
+        # buffers (stripped at save) nor any eval_cum state, and each rank
+        # restores its OWN cumulative state here. Missing file = legacy/pre-fix
+        # checkpoint; cumulative metrics self-heal (lifetime AUC refills; the
+        # binned-AUC histograms / scalar sums restart from zero).
+        mb_path = f"{path}/{METRICBUF_FILE_FMT.format(rank=rank)}"
+        if os.path.exists(mb_path):
+            perrank_state = torch.load(
+                mb_path, map_location=device, weights_only=False
+            )
+            _restore_perrank_metric_state(metric_logger, perrank_state)
+            logger.info(
+                "checkpoint load: cumulative metric state rank=%d samples=%s",
+                rank,
+                _perrank_sample_counts(metric_logger),
+            )
+        else:
+            logger.info(
+                "checkpoint load: no per-rank cumulative metric state at %s "
+                "(legacy/pre-fix checkpoint); cumulative metrics will refill",
+                mb_path,
+            )
+
+    # Per-rank RNG restore. Missing file = bit-equal trajectory not requested at
+    # save time; we silently continue (the test harness checks for both).
+    rng_path = f"{path}/rng_rank{rank}.pt"
+    if os.path.exists(rng_path):
+        # weights_only=False: RNG state is numpy/Python tuples, not tensors.
+        rng_state = torch.load(rng_path, map_location="cpu", weights_only=False)
+        _restore_rng_state(rng_state, device)
+        logger.info("RNG state restored from %s", rng_path)
+
+    train_ts = non_sparse_state_dict.get("train_ts")
+    batch_idx_in_window = non_sparse_state_dict.get(
+        "batch_idx_in_window", WINDOW_COMPLETE
+    )
+    split_contract = non_sparse_state_dict.get("split_contract")
+    return train_ts, batch_idx_in_window, split_contract
+
+
+@gin.configurable
+def load_dmp_checkpoint(
+    model: torch.nn.Module,
+    optimizer: Optimizer,
+    metric_logger: MetricsLogger,
+    device: torch.device,
+    path: str = "",
+    rank: int = 0,
+) -> Tuple[Optional[int], int, Optional[Dict[str, Any]], bool]:
+    """
+    Load a complete distributed model checkpoint (both sparse and dense components).
+
+    `path` is auto-resolved: if it points at a directory containing numeric
+    subdirs (e.g. CKPT_PATH=<base>/), the highest-numbered subdir is used. If it
+    already names a leaf save (e.g. <base>/300), it's used as-is. Empty string =
+    no load.
+
+    Returns:
+        (train_ts, batch_idx_in_window, split_contract, cold_start) — streaming
+        resume hint plus the saved split contract, and `cold_start` which is True
+        iff there was nothing to load (no checkpoint resolved). `cold_start`
+        distinguishes a genuine fresh run (no weights loaded) from a resume that
+        merely lacks a split contract (e.g. a legacy/non-streaming checkpoint),
+        which the caller's split-contract guard must still reject.
+    """
+    resolved = _resolve_latest_subdir(path)
+    cold_start = resolved == ""
+    load_sparse_checkpoint(model=model, path=resolved)
+    train_ts, batch_idx_in_window, split_contract = load_nonsparse_checkpoint(
+        model=model,
+        optimizer=optimizer,
+        metric_logger=metric_logger,
+        path=resolved,
+        device=device,
+        rank=rank,
+    )
+    return train_ts, batch_idx_in_window, split_contract, cold_start
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/configs.py b/recommendation_v4/generative_recommenders/dlrm_v3/configs.py
new file mode 100644
index 000000000..387fb4900
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/configs.py
@@ -0,0 +1,830 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+"""
+Configuration module for DLRMv3 model.
+
+This module provides configuration functions for the HSTU model architecture and embedding table configurations.
+"""
+
+import hashlib
+import math
+import os
+from typing import Callable, Dict, Optional, Tuple
+
+import gin
+import torch
+
+from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig
+from generative_recommenders.modules.multitask_module import (
+    MultitaskTaskType,
+    TaskConfig,
+)
+from torchrec.modules.embedding_configs import DataType, EmbeddingConfig
+
+HSTU_EMBEDDING_DIM = 512  # final DLRMv3 model
+HASH_SIZE = 10_000_000
+HASH_SIZE_1B = 1_000_000_000
+
+# (name, keys, num_embeddings, salt) — single source of truth for both
+# get_embedding_table_config("yambda-5b") and the dataset's cross-hash inputs.
+# Sizes mirror Primus-DLRM/configs/bench_onetrans_large_5b_cross_feat_shampoo.yaml.
+YAMBDA_5B_CROSS_SPECS = [
+    ("user_x_artist",        ("uid", "artist_id"),                  100_000_000, 0),
+    ("user_x_album",         ("uid", "album_id"),                    40_000_000, 0),
+    ("user_x_hour",          ("uid", "hour_of_day"),                 24_000_000, 0),
+    ("item_x_hour",          ("item_id", "hour_of_day"),             40_000_000, 0),
+    ("artist_x_hour",        ("artist_id", "hour_of_day"),           32_000_000, 0),
+    ("user_x_is_organic",    ("uid", "is_organic"),                   2_000_000, 0),
+    ("user_x_artist_x_hour", ("uid", "artist_id", "hour_of_day"),    40_000_000, 0),
+]
+
+
+@gin.configurable
+def get_hstu_configs(
+    dataset: str = "debug",
+    max_seq_len: Optional[int] = None,
+    max_num_candidates: Optional[int] = None,
+    hstu_embedding_table_dim: Optional[int] = None,
+    hstu_transducer_embedding_dim: Optional[int] = None,
+    hstu_num_heads: Optional[int] = None,
+    hstu_attn_num_layers: Optional[int] = None,
+    hstu_attn_linear_dim: Optional[int] = None,
+    hstu_attn_qk_dim: Optional[int] = None,
+    hstu_input_dropout_ratio: Optional[float] = None,
+    hstu_linear_dropout_rate: Optional[float] = None,
+) -> DlrmHSTUConfig:
+    """
+    Create and return HSTU model configuration.
+
+    Builds a complete DlrmHSTUConfig with default hyperparameters for the HSTU
+    architecture including attention settings, embedding dimensions, dropout rates,
+    and feature name mappings.
+
+    Args:
+        dataset: Dataset identifier (currently unused, reserved for dataset-specific configs).
+
+    Returns:
+        DlrmHSTUConfig: Complete configuration object for the HSTU model.
+    """
+    hstu_config = DlrmHSTUConfig(
+        hstu_num_heads=4,
+        hstu_attn_linear_dim=128,
+        hstu_attn_qk_dim=128,
+        hstu_attn_num_layers=5,
+        hstu_embedding_table_dim=HSTU_EMBEDDING_DIM,
+        hstu_preprocessor_hidden_dim=256,
+        hstu_transducer_embedding_dim=512,
+        hstu_group_norm=False,
+        hstu_input_dropout_ratio=0.2,
+        hstu_linear_dropout_rate=0.1,
+        causal_multitask_weights=0.2,
+    )
+    if "movielens" in dataset:
+        assert dataset in [
+            "movielens-1m",
+            "movielens-20m",
+            "movielens-13b",
+            "movielens-18b",
+        ]
+        hstu_config.user_embedding_feature_names = (
+            [
+                "movie_id",
+                "user_id",
+                "sex",
+                "age_group",
+                "occupation",
+                "zip_code",
+            ]
+            if dataset == "movielens-1m"
+            else [
+                "movie_id",
+                "user_id",
+            ]
+        )
+        hstu_config.item_embedding_feature_names = [
+            "item_movie_id",
+        ]
+        hstu_config.uih_post_id_feature_name = "movie_id"
+        hstu_config.uih_action_time_feature_name = "action_timestamp"
+        hstu_config.candidates_querytime_feature_name = "item_query_time"
+        hstu_config.candidates_weight_feature_name = "item_action_weights"
+        hstu_config.uih_weight_feature_name = "item_weights"
+        hstu_config.candidates_watchtime_feature_name = "item_movie_rating"
+        hstu_config.action_weights = [1, 2, 4, 8, 16]
+        hstu_config.contextual_feature_to_max_length = (
+            {
+                "user_id": 1,
+                "sex": 1,
+                "age_group": 1,
+                "occupation": 1,
+                "zip_code": 1,
+            }
+            if dataset == "movielens-1m"
+            else {
+                "user_id": 1,
+            }
+        )
+        hstu_config.contextual_feature_to_min_uih_length = (
+            {
+                "user_id": 20,
+                "sex": 20,
+                "age_group": 20,
+                "occupation": 20,
+                "zip_code": 20,
+            }
+            if dataset == "movielens-1m"
+            else {
+                "user_id": 20,
+            }
+        )
+        hstu_config.merge_uih_candidate_feature_mapping = [
+            ("movie_id", "item_movie_id"),
+            ("movie_rating", "item_movie_rating"),
+            ("action_timestamp", "item_query_time"),
+            ("item_weights", "item_action_weights"),
+            ("dummy_watch_time", "item_dummy_watchtime"),
+        ]
+        hstu_config.hstu_uih_feature_names = (
+            [
+                "user_id",
+                "sex",
+                "age_group",
+                "occupation",
+                "zip_code",
+                "movie_id",
+                "movie_rating",
+                "action_timestamp",
+                "item_weights",
+                "dummy_watch_time",
+            ]
+            if dataset == "movielens-1m"
+            else [
+                "user_id",
+                "movie_id",
+                "movie_rating",
+                "action_timestamp",
+                "item_weights",
+                "dummy_watch_time",
+            ]
+        )
+        hstu_config.hstu_candidate_feature_names = [
+            "item_movie_id",
+            "item_movie_rating",
+            "item_query_time",
+            "item_action_weights",
+            "item_dummy_watchtime",
+        ]
+        hstu_config.max_num_candidates = 10
+        hstu_config.max_num_candidates_inference = (
+            5 if dataset not in ["movielens-13b", "movielens-18b"] else 2048
+        )
+        hstu_config.multitask_configs = [
+            TaskConfig(
+                task_name="rating",
+                task_weight=1,
+                task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+            )
+        ]
+    elif "streaming" in dataset:
+        hstu_config.user_embedding_feature_names = [
+            "item_id",
+            "user_id",
+            "item_category_id",
+        ]
+        hstu_config.item_embedding_feature_names = [
+            "item_candidate_id",
+            "item_candidate_category_id",
+        ]
+        hstu_config.uih_post_id_feature_name = "item_id"
+        hstu_config.uih_action_time_feature_name = "action_timestamp"
+        hstu_config.candidates_querytime_feature_name = "item_query_time"
+        hstu_config.candidates_weight_feature_name = "item_action_weights"
+        hstu_config.uih_weight_feature_name = "item_weights"
+        hstu_config.candidates_watchtime_feature_name = "item_rating"
+        hstu_config.action_weights = [1, 2, 4, 8, 16]
+        hstu_config.action_embedding_init_std = 5.0
+        hstu_config.contextual_feature_to_max_length = {"user_id": 1}
+        hstu_config.contextual_feature_to_min_uih_length = {"user_id": 20}
+        hstu_config.merge_uih_candidate_feature_mapping = [
+            ("item_id", "item_candidate_id"),
+            ("item_rating", "item_candidate_rating"),
+            ("action_timestamp", "item_query_time"),
+            ("item_weights", "item_action_weights"),
+            ("dummy_watch_time", "item_dummy_watchtime"),
+            ("item_category_id", "item_candidate_category_id"),
+        ]
+        hstu_config.hstu_uih_feature_names = [
+            "user_id",
+            "item_id",
+            "item_rating",
+            "action_timestamp",
+            "item_weights",
+            "dummy_watch_time",
+            "item_category_id",
+        ]
+        hstu_config.hstu_candidate_feature_names = [
+            "item_candidate_id",
+            "item_candidate_rating",
+            "item_query_time",
+            "item_action_weights",
+            "item_dummy_watchtime",
+            "item_candidate_category_id",
+        ]
+        hstu_config.max_num_candidates = 32
+        hstu_config.max_num_candidates_inference = 2048
+        hstu_config.multitask_configs = [
+            TaskConfig(
+                task_name="rating",
+                task_weight=1,
+                task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+            )
+        ]
+    elif "kuairand" in dataset:
+        hstu_config.user_embedding_feature_names = [
+            "video_id",
+            "user_id",
+            "user_active_degree",
+            "follow_user_num_range",
+            "fans_user_num_range",
+            "friend_user_num_range",
+            "register_days_range",
+        ]
+        hstu_config.item_embedding_feature_names = [
+            "item_video_id",
+        ]
+        hstu_config.uih_post_id_feature_name = "video_id"
+        hstu_config.uih_action_time_feature_name = "action_timestamp"
+        hstu_config.candidates_querytime_feature_name = "item_query_time"
+        hstu_config.uih_weight_feature_name = "action_weight"
+        hstu_config.candidates_weight_feature_name = "item_action_weight"
+        hstu_config.candidates_watchtime_feature_name = "item_target_watchtime"
+        # There are more contextual features in the dataset, see https://kuairand.com/ for details
+        hstu_config.contextual_feature_to_max_length = {
+            "user_id": 1,
+            "user_active_degree": 1,
+            "follow_user_num_range": 1,
+            "fans_user_num_range": 1,
+            "friend_user_num_range": 1,
+            "register_days_range": 1,
+        }
+        hstu_config.merge_uih_candidate_feature_mapping = [
+            ("video_id", "item_video_id"),
+            ("action_timestamp", "item_query_time"),
+            ("action_weight", "item_action_weight"),
+            ("watch_time", "item_target_watchtime"),
+        ]
+        hstu_config.hstu_uih_feature_names = [
+            "user_id",
+            "user_active_degree",
+            "follow_user_num_range",
+            "fans_user_num_range",
+            "friend_user_num_range",
+            "register_days_range",
+            "video_id",
+            "action_timestamp",
+            "action_weight",
+            "watch_time",
+        ]
+        hstu_config.hstu_candidate_feature_names = [
+            "item_video_id",
+            "item_action_weight",
+            "item_target_watchtime",
+            "item_query_time",
+        ]
+        hstu_config.multitask_configs = [
+            TaskConfig(
+                task_name="is_click",
+                task_weight=1,
+                task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+            ),
+            TaskConfig(
+                task_name="is_like",
+                task_weight=2,
+                task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+            ),
+            TaskConfig(
+                task_name="is_follow",
+                task_weight=4,
+                task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+            ),
+            TaskConfig(
+                task_name="is_comment",
+                task_weight=8,
+                task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+            ),
+            TaskConfig(
+                task_name="is_forward",
+                task_weight=16,
+                task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+            ),
+            TaskConfig(
+                task_name="is_hate",
+                task_weight=32,
+                task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+            ),
+            TaskConfig(
+                task_name="long_view",
+                task_weight=64,
+                task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+            ),
+            TaskConfig(
+                task_name="is_profile_enter",
+                task_weight=128,
+                task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+            ),
+        ]
+        hstu_config.action_weights = [1, 2, 4, 8, 16, 32, 64, 128]
+    elif "yambda" in dataset:
+        assert dataset in ["yambda-5b"]
+        cross_names = [name for (name, _k, _n, _s) in YAMBDA_5B_CROSS_SPECS]
+        # Per-table dim defaults to HSTU_EMBEDDING_DIM (512); override via the
+        # `get_hstu_configs.hstu_embedding_table_dim = N` gin binding if needed.
+        # Note: the embedding tables in get_embedding_table_config also use
+        # HSTU_EMBEDDING_DIM and must stay aligned with this value.
+        hstu_config.hstu_embedding_table_dim = HSTU_EMBEDDING_DIM
+        hstu_config.hstu_transducer_embedding_dim = 512
+        hstu_config.max_seq_len = 8192
+        hstu_config.max_num_candidates = 1
+        hstu_config.max_num_candidates_inference = 1
+        # Per dlrm_hstu convention (see streaming-100b/movielens):
+        #  - user_embedding_feature_names = UIH-side post-id features + contextual features.
+        #    After main_forward merges UIH + candidate, only these entries hold the merged
+        #    sequence (used by user-side transducer).
+        #  - item_embedding_feature_names = candidate-side names only. _item_forward
+        #    concats these along dim=-1 to feed the item MLP (per-candidate, not per-position).
+        hstu_config.user_embedding_feature_names = (
+            ["uid"]
+            + cross_names
+            + ["item_id", "artist_id", "album_id"]
+        )
+        hstu_config.item_embedding_feature_names = [
+            "item_candidate_id",
+            "item_candidate_artist_id",
+            "item_candidate_album_id",
+        ]
+        hstu_config.uih_post_id_feature_name = "item_id"
+        hstu_config.uih_action_time_feature_name = "action_timestamp"
+        hstu_config.uih_weight_feature_name = "action_weight"
+        hstu_config.candidates_querytime_feature_name = "item_query_time"
+        hstu_config.candidates_weight_feature_name = "item_action_weight"
+        hstu_config.candidates_watchtime_feature_name = "item_dummy_watchtime"
+        hstu_config.action_weights = [1, 2, 4]  # lp, like, skip bits
+        hstu_config.contextual_feature_to_max_length = {
+            "uid": 1,
+            **{name: 1 for name in cross_names},
+        }
+        hstu_config.contextual_feature_to_min_uih_length = {
+            "uid": 0,
+            **{name: 0 for name in cross_names},
+        }
+        # uih names map to candidate names (no name collisions allowed):
+        # item_id/artist_id/album_id appear with prefix "item_" on candidate side.
+        hstu_config.merge_uih_candidate_feature_mapping = [
+            ("item_id", "item_candidate_id"),
+            ("artist_id", "item_candidate_artist_id"),
+            ("album_id", "item_candidate_album_id"),
+            ("action_weight", "item_action_weight"),
+            ("action_timestamp", "item_query_time"),
+            ("dummy_watch_time", "item_dummy_watchtime"),
+        ]
+        hstu_config.hstu_uih_feature_names = (
+            ["uid"]
+            + cross_names
+            + [
+                "item_id",
+                "artist_id",
+                "album_id",
+                "action_weight",
+                "action_timestamp",
+                "dummy_watch_time",
+            ]
+        )
+        hstu_config.hstu_candidate_feature_names = [
+            "item_candidate_id",
+            "item_candidate_artist_id",
+            "item_candidate_album_id",
+            "item_query_time",
+            "item_action_weight",
+            "item_dummy_watchtime",
+        ]
+        hstu_config.multitask_configs = [
+            TaskConfig(
+                task_name="listen_plus",
+                task_weight=1,  # matches action_weights[0] (lp bit)
+                task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+            )
+        ]
+    else:
+        hstu_config.user_embedding_feature_names = [
+            "uih_post_id",
+            "uih_owner_id",
+            "viewer_id",
+            "dummy_contexual",
+        ]
+        hstu_config.item_embedding_feature_names = [
+            "item_post_id",
+            "item_owner_id",
+        ]
+        hstu_config.uih_post_id_feature_name = "uih_post_id"
+        hstu_config.uih_action_time_feature_name = "uih_action_time"
+        hstu_config.candidates_querytime_feature_name = "item_query_time"
+        hstu_config.candidates_weight_feature_name = "item_action_weight"
+        hstu_config.candidates_watchtime_feature_name = "item_target_watchtime"
+        hstu_config.contextual_feature_to_max_length = {
+            "viewer_id": 1,
+            "dummy_contexual": 1,
+        }
+        hstu_config.contextual_feature_to_min_uih_length = {
+            "viewer_id": 128,
+            "dummy_contexual": 128,
+        }
+        hstu_config.merge_uih_candidate_feature_mapping = [
+            ("uih_post_id", "item_post_id"),
+            ("uih_owner_id", "item_owner_id"),
+            ("uih_action_time", "item_query_time"),
+            ("uih_weight", "item_action_weight"),
+            ("uih_watchtime", "item_target_watchtime"),
+            ("uih_video_length", "item_video_length"),
+            ("uih_surface_type", "item_surface_type"),
+        ]
+        hstu_config.hstu_uih_feature_names = [
+            "uih_post_id",
+            "uih_action_time",
+            "uih_weight",
+            "uih_owner_id",
+            "uih_watchtime",
+            "uih_surface_type",
+            "uih_video_length",
+            "viewer_id",
+            "dummy_contexual",
+        ]
+        hstu_config.hstu_candidate_feature_names = [
+            "item_post_id",
+            "item_owner_id",
+            "item_surface_type",
+            "item_video_length",
+            "item_action_weight",
+            "item_target_watchtime",
+            "item_query_time",
+        ]
+        hstu_config.multitask_configs = [
+            TaskConfig(
+                task_name="vvp100",
+                task_weight=1,
+                task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+            )
+        ]
+
+    # Apply gin overrides last so a value set in the gin file wins over the
+    # per-dataset defaults above. Anything left as None inherits the default
+    # the dataset branch (or DlrmHSTUConfig) chose. Example in a gin file:
+    #   get_hstu_configs.max_seq_len = 4096
+    #   get_hstu_configs.hstu_embedding_table_dim = 256
+    _gin_overrides = {
+        "max_seq_len": max_seq_len,
+        "max_num_candidates": max_num_candidates,
+        "max_num_candidates_inference": max_num_candidates,
+        "hstu_embedding_table_dim": hstu_embedding_table_dim,
+        "hstu_transducer_embedding_dim": hstu_transducer_embedding_dim,
+        "hstu_num_heads": hstu_num_heads,
+        "hstu_attn_num_layers": hstu_attn_num_layers,
+        "hstu_attn_linear_dim": hstu_attn_linear_dim,
+        "hstu_attn_qk_dim": hstu_attn_qk_dim,
+        "hstu_input_dropout_ratio": hstu_input_dropout_ratio,
+        "hstu_linear_dropout_rate": hstu_linear_dropout_rate,
+    }
+    for _name, _val in _gin_overrides.items():
+        if _val is not None:
+            setattr(hstu_config, _name, _val)
+
+    return hstu_config
+
+
+def _stable_table_seed(init_seed: int, table_name: str) -> int:
+    """Deterministic 63-bit seed from (init_seed, table_name).
+
+    Uses sha256 (not Python's salted built-in ``hash()``) so the per-table seed
+    is identical across processes/ranks/runs for a given ``$SEED`` + table name.
+    """
+    digest = hashlib.sha256(f"{init_seed}:{table_name}".encode("utf-8")).digest()
+    return int.from_bytes(digest[:8], "big") & 0x7FFF_FFFF_FFFF_FFFF
+
+
+def _uniform_init_bounds(cfg: EmbeddingConfig) -> Tuple[float, float]:
+    """Mirror TorchREC's default per-table init bounds.
+
+    TorchREC falls back to ``uniform_(-1/sqrt(N), +1/sqrt(N))`` when a table does
+    not set ``weight_init_min/max``; honor any explicit bounds the config carries.
+    """
+    bound = math.sqrt(1.0 / cfg.num_embeddings)
+    lo = -bound if cfg.weight_init_min is None else cfg.weight_init_min
+    hi = bound if cfg.weight_init_max is None else cfg.weight_init_max
+    return lo, hi
+
+
+def _make_seeded_uniform_init(
+    table_seed: int, lo: float, hi: float
+) -> Callable[[torch.Tensor], torch.Tensor]:
+    """Build a seeded in-place uniform initializer for one table's weight.
+
+    TorchREC/FBGEMM calls ``init_fn`` with the (per-rank) local shard tensor on
+    its compute device, so we seed a generator on that same device. For a fixed
+    sharding plan (world size + plan unchanged) this makes embedding init
+    byte-reproducible run-to-run.
+    """
+
+    def _init(weight: torch.Tensor) -> torch.Tensor:
+        # TorchREC builds the unsharded EmbeddingCollection on the META device
+        # first (DMP materializes real storage on the compute device later).
+        # Meta tensors have no storage and torch.Generator(device="meta") is
+        # invalid ("META device type not an accelerator"), so skip them: the
+        # seeded init for the sharded/fused TBE path is provided by the RNG
+        # re-seed right before DMP in make_optimizer_and_shard. On a real
+        # device (eager/non-meta path) we still apply the per-table seeded fill.
+        if weight.device.type == "meta":
+            return weight
+        gen = torch.Generator(device=weight.device)
+        gen.manual_seed(table_seed)
+        with torch.no_grad():
+            weight.uniform_(lo, hi, generator=gen)
+        return weight
+
+    return _init
+
+
+@gin.configurable
+def get_embedding_table_config(
+    dataset: str = "debug",
+    embedding_dim: Optional[int] = None,
+    init_seed: Optional[int] = None,
+) -> Dict[str, EmbeddingConfig]:
+    """
+    Create and return embedding table configurations.
+
+    Defines the embedding table configurations for item IDs, category IDs, and user IDs
+    with their respective dimensions and data types.
+
+    Args:
+        dataset: Dataset identifier (currently unused, reserved for dataset-specific configs).
+        embedding_dim: Per-table embedding width override. When set via gin
+            (e.g. `get_embedding_table_config.embedding_dim = 256`), wins over
+            `HSTU_EMBEDDING_DIM`. Keep in sync with the matching gin override on
+            `get_hstu_configs.hstu_embedding_table_dim` — the model and the
+            tables must agree on dim or sharding will reject the plan.
+        init_seed: Base seed for the per-table seeded `init_fn` (Tier 1
+            reproducible embedding init). When None, falls back to `$SEED`
+            (default 1), matching `seed_everything`. Each table draws from a
+            generator seeded by `sha256(init_seed, table_name)` so init is
+            reproducible run-to-run for a fixed sharding plan.
+
+    Returns:
+        Dict mapping table names to their EmbeddingConfig objects.
+    """
+    tables = _build_embedding_table_config(dataset=dataset, embedding_dim=embedding_dim)
+
+    if init_seed is None:
+        init_seed = int(os.environ.get("SEED", "1"))
+    for name, cfg in tables.items():
+        lo, hi = _uniform_init_bounds(cfg)
+        cfg.init_fn = _make_seeded_uniform_init(
+            _stable_table_seed(init_seed, name), lo, hi
+        )
+    return tables
+
+
+def _build_embedding_table_config(
+    dataset: str = "debug",
+    embedding_dim: Optional[int] = None,
+) -> Dict[str, EmbeddingConfig]:
+    DIM = embedding_dim if embedding_dim is not None else HSTU_EMBEDDING_DIM
+    if "movielens" in dataset:
+        assert dataset in [
+            "movielens-1m",
+            "movielens-20m",
+            "movielens-13b",
+            "movielens-18b",
+        ]
+        return (
+            {
+                "movie_id": EmbeddingConfig(
+                    num_embeddings=HASH_SIZE,
+                    embedding_dim=DIM,
+                    name="movie_id",
+                    data_type=DataType.FP16,
+                    feature_names=["movie_id", "item_movie_id"],
+                ),
+                "user_id": EmbeddingConfig(
+                    num_embeddings=HASH_SIZE,
+                    embedding_dim=DIM,
+                    name="user_id",
+                    data_type=DataType.FP16,
+                    feature_names=["user_id"],
+                ),
+                "sex": EmbeddingConfig(
+                    num_embeddings=HASH_SIZE,
+                    embedding_dim=DIM,
+                    name="sex",
+                    data_type=DataType.FP16,
+                    feature_names=["sex"],
+                ),
+                "age_group": EmbeddingConfig(
+                    num_embeddings=HASH_SIZE,
+                    embedding_dim=DIM,
+                    name="age_group",
+                    data_type=DataType.FP16,
+                    feature_names=["age_group"],
+                ),
+                "occupation": EmbeddingConfig(
+                    num_embeddings=HASH_SIZE,
+                    embedding_dim=DIM,
+                    name="occupation",
+                    data_type=DataType.FP16,
+                    feature_names=["occupation"],
+                ),
+                "zip_code": EmbeddingConfig(
+                    num_embeddings=HASH_SIZE,
+                    embedding_dim=DIM,
+                    name="zip_code",
+                    data_type=DataType.FP16,
+                    feature_names=["zip_code"],
+                ),
+            }
+            if dataset == "movielens-1m"
+            else {
+                "movie_id": EmbeddingConfig(
+                    num_embeddings=HASH_SIZE_1B,
+                    embedding_dim=DIM,
+                    name="movie_id",
+                    data_type=DataType.FP16,
+                    feature_names=["movie_id", "item_movie_id"],
+                ),
+                "user_id": EmbeddingConfig(
+                    num_embeddings=3_000_000,
+                    embedding_dim=DIM,
+                    name="user_id",
+                    data_type=DataType.FP16,
+                    feature_names=["user_id"],
+                ),
+            }
+        )
+    elif "streaming" in dataset:
+        return {
+            "item_id": EmbeddingConfig(
+                num_embeddings=HASH_SIZE_1B,
+                embedding_dim=DIM,
+                name="item_id",
+                data_type=DataType.FP16,
+                feature_names=["item_id", "item_candidate_id"],
+            ),
+            "item_category_id": EmbeddingConfig(
+                num_embeddings=128,
+                embedding_dim=DIM,
+                name="item_category_id",
+                data_type=DataType.FP16,
+                weight_init_max=1.0,
+                weight_init_min=-1.0,
+                feature_names=["item_category_id", "item_candidate_category_id"],
+            ),
+            "user_id": EmbeddingConfig(
+                num_embeddings=10_000_000,
+                embedding_dim=DIM,
+                name="user_id",
+                data_type=DataType.FP16,
+                feature_names=["user_id"],
+            ),
+        }
+    elif "kuairand" in dataset:
+        return {
+            "video_id": EmbeddingConfig(
+                num_embeddings=HASH_SIZE,
+                embedding_dim=DIM,
+                name="video_id",
+                data_type=DataType.FP16,
+                feature_names=["video_id", "item_video_id"],
+            ),
+            "user_id": EmbeddingConfig(
+                num_embeddings=HASH_SIZE,
+                embedding_dim=DIM,
+                name="user_id",
+                data_type=DataType.FP16,
+                feature_names=["user_id"],
+            ),
+            "user_active_degree": EmbeddingConfig(
+                num_embeddings=8,
+                embedding_dim=DIM,
+                name="user_active_degree",
+                data_type=DataType.FP16,
+                feature_names=["user_active_degree"],
+            ),
+            "follow_user_num_range": EmbeddingConfig(
+                num_embeddings=9,
+                embedding_dim=DIM,
+                name="follow_user_num_range",
+                data_type=DataType.FP16,
+                feature_names=["follow_user_num_range"],
+            ),
+            "fans_user_num_range": EmbeddingConfig(
+                num_embeddings=9,
+                embedding_dim=DIM,
+                name="fans_user_num_range",
+                data_type=DataType.FP16,
+                feature_names=["fans_user_num_range"],
+            ),
+            "friend_user_num_range": EmbeddingConfig(
+                num_embeddings=8,
+                embedding_dim=DIM,
+                name="friend_user_num_range",
+                data_type=DataType.FP16,
+                feature_names=["friend_user_num_range"],
+            ),
+            "register_days_range": EmbeddingConfig(
+                num_embeddings=8,
+                embedding_dim=DIM,
+                name="register_days_range",
+                data_type=DataType.FP16,
+                feature_names=["register_days_range"],
+            ),
+        }
+    elif "yambda" in dataset:
+        assert dataset in ["yambda-5b"]
+        tables: Dict[str, EmbeddingConfig] = {
+            "item_id": EmbeddingConfig(
+                num_embeddings=9_390_624,
+                embedding_dim=DIM,
+                name="item_id",
+                data_type=DataType.FP32,
+                feature_names=["item_id", "item_candidate_id"],
+            ),
+            "artist_id": EmbeddingConfig(
+                num_embeddings=1_293_395,
+                embedding_dim=DIM,
+                name="artist_id",
+                data_type=DataType.FP32,
+                feature_names=["artist_id", "item_candidate_artist_id"],
+            ),
+            "album_id": EmbeddingConfig(
+                num_embeddings=3_367_692,
+                embedding_dim=DIM,
+                name="album_id",
+                data_type=DataType.FP32,
+                feature_names=["album_id", "item_candidate_album_id"],
+            ),
+            "uid": EmbeddingConfig(
+                num_embeddings=1_000_001,
+                embedding_dim=DIM,
+                name="uid",
+                data_type=DataType.FP32,
+                feature_names=["uid"],
+            ),
+        }
+        for name, _keys, num_embeddings, _salt in YAMBDA_5B_CROSS_SPECS:
+            tables[name] = EmbeddingConfig(
+                num_embeddings=num_embeddings,
+                embedding_dim=DIM,
+                name=name,
+                data_type=DataType.FP32,
+                feature_names=[name],
+            )
+        return tables
+    else:
+        return {
+            "post_id": EmbeddingConfig(
+                num_embeddings=HASH_SIZE,
+                embedding_dim=DIM,
+                name="post_id",
+                data_type=DataType.FP16,
+                feature_names=[
+                    "uih_post_id",
+                    "item_post_id",
+                    "uih_owner_id",
+                    "item_owner_id",
+                ],
+            ),
+            "viewer_id": EmbeddingConfig(
+                num_embeddings=HASH_SIZE,
+                embedding_dim=DIM,
+                name="viewer_id",
+                data_type=DataType.FP16,
+                feature_names=["viewer_id"],
+            ),
+            "dummy_contexual": EmbeddingConfig(
+                num_embeddings=HASH_SIZE,
+                embedding_dim=DIM,
+                name="dummy_contexual",
+                data_type=DataType.FP16,
+                feature_names=["dummy_contexual"],
+            ),
+        }
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/dataset.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/dataset.py
new file mode 100644
index 000000000..204c06df1
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/dataset.py
@@ -0,0 +1,461 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+"""
+Dataset implementations for DLRMv3.
+
+This module provides dataset classes for loading and processing recommendation
+data, including sample containers, collation functions, and random data generation.
+"""
+
+import logging
+import time
+from dataclasses import dataclass
+from typing import Dict, List, Tuple
+
+import torch
+from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig
+from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
+from torchrec.streamable import Pipelineable
+
+
+logging.basicConfig(level=logging.INFO)
+logger: logging.Logger = logging.getLogger("dlrmv3_dataset")
+
+
+@dataclass
+class Samples(Pipelineable):
+    """
+    Container for batched samples with user interaction history and candidate features.
+
+    Attributes:
+        uih_features_kjt: User interaction history features as KeyedJaggedTensor.
+        candidates_features_kjt: Candidate item features as KeyedJaggedTensor.
+    """
+
+    uih_features_kjt: KeyedJaggedTensor
+    candidates_features_kjt: KeyedJaggedTensor
+    # UIH + candidate features concatenated into the single KJT that the model's
+    # sharded EmbeddingCollection consumes. Pre-built here (dataloader/CPU) rather
+    # than inside DlrmHSTU.forward so the embedding lookup's input is a plain
+    # attribute of the batch — which lets TorchRec's TrainPipelineSparseDist hoist
+    # its input_dist into the prefetch stage (otherwise the runtime cat +
+    # from_lengths_sync counts as an "input modification" and the embedding
+    # collection is left un-pipelined).
+    merged_sparse_features: KeyedJaggedTensor
+
+    def to(self, device: torch.device, non_blocking: bool = False) -> "Samples":
+        """
+        Move all tensors to the specified device (in place) and return self.
+
+        Returning ``self`` (rather than ``None``) and accepting ``non_blocking``
+        makes ``Samples`` conform to TorchRec's ``Pipelineable`` protocol so it
+        can be driven by ``TrainPipelineSparseDist``. Existing call sites that
+        use ``sample.to(device)`` for its side effect continue to work unchanged.
+        """
+        for attr in vars(self):
+            setattr(
+                self,
+                attr,
+                getattr(self, attr).to(device=device, non_blocking=non_blocking),
+            )
+        return self
+
+    def record_stream(self, stream: torch.Stream) -> None:
+        """Record the contained KJTs on ``stream`` (Pipelineable protocol).
+
+        Required by ``TrainPipelineSparseDist`` so the prefetched batch's H2D
+        copy on the side stream is not freed before compute consumes it.
+        """
+        self.uih_features_kjt.record_stream(stream)
+        self.candidates_features_kjt.record_stream(stream)
+        self.merged_sparse_features.record_stream(stream)
+
+    def pin_memory(self) -> "Samples":
+        """Pin the contained KJTs' host memory (Pipelineable protocol)."""
+        self.uih_features_kjt = self.uih_features_kjt.pin_memory()
+        self.candidates_features_kjt = self.candidates_features_kjt.pin_memory()
+        self.merged_sparse_features = self.merged_sparse_features.pin_memory()
+        return self
+
+    def batch_size(self) -> int:
+        """
+        Get the batch size of the samples.
+
+        Returns:
+            Number of samples in the batch.
+        """
+        return self.uih_features_kjt.stride()
+
+
+def merge_uih_candidate_kjts(
+    uih_features: KeyedJaggedTensor,
+    candidates_features: KeyedJaggedTensor,
+) -> KeyedJaggedTensor:
+    """Concatenate the UIH and candidate KJTs into the single KJT consumed by the
+    model's ``EmbeddingCollection``.
+
+    Must mirror ``DlrmHSTU.preprocess`` exactly (key order = uih + candidates,
+    values/lengths concatenated in that order). Built on the dataloader side so
+    the model can read it straight off the batch and TorchRec can pipeline the
+    embedding ``input_dist``.
+    """
+    return KeyedJaggedTensor.from_lengths_sync(
+        keys=uih_features.keys() + candidates_features.keys(),
+        values=torch.cat(
+            [uih_features.values(), candidates_features.values()],
+            dim=0,
+        ),
+        lengths=torch.cat(
+            [uih_features.lengths(), candidates_features.lengths()],
+            dim=0,
+        ),
+    )
+
+
+def collate_fn(
+    samples: List[Tuple[KeyedJaggedTensor, KeyedJaggedTensor]],
+) -> Samples:
+    """
+    Collate multiple samples into a batched Samples object.
+
+    Args:
+        samples: List of (uih_features, candidates_features) tuples.
+
+    Returns:
+        Batched Samples object with concatenated features.
+    """
+    (
+        uih_features_kjt_list,
+        candidates_features_kjt_list,
+    ) = list(zip(*samples))
+
+    uih_features_kjt = kjt_batch_func(uih_features_kjt_list)
+    candidates_features_kjt = kjt_batch_func(candidates_features_kjt_list)
+    return Samples(
+        uih_features_kjt=uih_features_kjt,
+        candidates_features_kjt=candidates_features_kjt,
+        merged_sparse_features=merge_uih_candidate_kjts(
+            uih_features_kjt, candidates_features_kjt
+        ),
+    )
+
+
+class Dataset:
+    """
+    Base dataset class for DLRMv3.
+
+    Provides the interface for loading, accessing, and managing samples
+    for recommendation model training and inference.
+
+    Args:
+        hstu_config: HSTU model configuration.
+        **args: Additional arguments (unused in base class).
+    """
+
+    def __init__(self, hstu_config: DlrmHSTUConfig, **args):
+        self.arrival = None
+        self.image_list = []
+        self.label_list = []
+        self.image_list_inmemory = {}
+        self.last_loaded = -1.0
+
+    def preprocess(self, use_cache=True):
+        """
+        Preprocess the dataset.
+
+        Args:
+            use_cache: Whether to use cached preprocessed data.
+
+        Raises:
+            NotImplementedError: Subclasses must implement this method.
+        """
+        raise NotImplementedError("Dataset:preprocess")
+
+    def get_item_count(self):
+        """
+        Get the total number of items in the dataset.
+
+        Returns:
+            Number of items.
+        """
+        return len(self.image_list)
+
+    def load_query_samples(self, sample_list):
+        """
+        Load specified samples into memory.
+
+        Args:
+            sample_list: List of sample indices to load.
+
+        Raises:
+            NotImplementedError: Subclasses must implement this method.
+        """
+        raise NotImplementedError("Dataset:load_query_samples")
+
+    def unload_query_samples(self, sample_list):
+        """
+        Unload specified samples from memory.
+
+        Args:
+            sample_list: List of sample indices to unload.
+
+        Raises:
+            NotImplementedError: Subclasses must implement this method.
+        """
+        raise NotImplementedError("Dataset:unload_query_samples")
+
+    def get_sample(self, id: int):
+        """
+        Get a single sample by ID.
+
+        Args:
+            id: Sample identifier.
+
+        Raises:
+            NotImplementedError: Subclasses must implement this method.
+        """
+        raise NotImplementedError("Dataset:get_sample")
+
+    def get_samples(self, id_list: List[int]) -> Samples:
+        """
+        Get multiple samples and collate them into a batch.
+
+        Args:
+            id_list: List of sample identifiers.
+
+        Returns:
+            Collated Samples object containing the batch.
+        """
+        list_samples = [self.get_sample(ix) for ix in id_list]
+        return collate_fn(list_samples)
+
+
+@torch.jit.script
+def kjt_batch_func(
+    kjt_list: List[KeyedJaggedTensor],
+) -> KeyedJaggedTensor:
+    """
+    Batch multiple KeyedJaggedTensors into a single tensor.
+
+    Uses FBGEMM operations for efficient batching and reordering of
+    jagged tensor data.
+
+    Args:
+        kjt_list: List of KeyedJaggedTensors to batch.
+
+    Returns:
+        Batched KeyedJaggedTensor with reordered indices and lengths.
+    """
+    bs_list = [kjt.stride() for kjt in kjt_list]
+    bs = sum(bs_list)
+    batched_length = torch.cat([kjt.lengths() for kjt in kjt_list], dim=0)
+    batched_indices = torch.cat([kjt.values() for kjt in kjt_list], dim=0)
+    bs_offset = torch.ops.fbgemm.asynchronous_complete_cumsum(
+        torch.tensor(bs_list)
+    ).int()
+    batched_offset = torch.ops.fbgemm.asynchronous_complete_cumsum(batched_length)
+    reorder_length = torch.ops.fbgemm.reorder_batched_ad_lengths(
+        batched_length, bs_offset, bs
+    )
+    reorder_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(reorder_length)
+    reorder_indices = torch.ops.fbgemm.reorder_batched_ad_indices(
+        batched_offset, batched_indices, reorder_offsets, bs_offset, bs
+    )
+    out = KeyedJaggedTensor(
+        keys=kjt_list[0].keys(),
+        lengths=reorder_length.long(),
+        values=reorder_indices.long(),
+    )
+    return out
+
+
+def get_random_data(
+    contexual_features: List[str],
+    hstu_uih_keys: List[str],
+    hstu_candidates_keys: List[str],
+    uih_max_seq_len: int,
+    max_num_candidates: int,
+    value_bound: int = 1000,
+):
+    """
+    Generate random sample data for testing and debugging.
+
+    Creates synthetic user interaction history and candidate features
+    with random values.
+
+    Args:
+        contexual_features: List of contextual feature names.
+        hstu_uih_keys: List of UIH feature keys.
+        hstu_candidates_keys: List of candidate feature keys.
+        uih_max_seq_len: Maximum sequence length for UIH.
+        max_num_candidates: Maximum number of candidates.
+        value_bound: Upper bound for random values.
+
+    Returns:
+        Tuple of (uih_features_kjt, candidates_features_kjt).
+    """
+    uih_non_seq_feature_keys = contexual_features
+    uih_seq_feature_keys = [
+        k for k in hstu_uih_keys if k not in uih_non_seq_feature_keys
+    ]
+    uih_seq_len = torch.randint(
+        int(uih_max_seq_len * 0.8),
+        uih_max_seq_len + 1,
+        (1,),
+    ).item()
+    uih_lengths = torch.tensor(
+        [1 for _ in uih_non_seq_feature_keys]
+        + [uih_seq_len for _ in uih_seq_feature_keys]
+    )
+    # logging.info(f"uih_lengths: {uih_lengths}")
+    uih_values = torch.randint(
+        1,
+        value_bound,
+        # pyre-ignore[6]
+        (uih_seq_len * len(uih_seq_feature_keys) + len(uih_non_seq_feature_keys),),
+    )
+    uih_features_kjt = KeyedJaggedTensor(
+        keys=uih_non_seq_feature_keys + uih_seq_feature_keys,
+        lengths=uih_lengths.long(),
+        values=uih_values.long(),
+    )
+    num_candidates = torch.randint(
+        1,
+        max_num_candidates + 1,
+        (1,),
+    ).item()
+    candidates_lengths = num_candidates * torch.ones(len(hstu_candidates_keys))
+    candidates_values = torch.randint(
+        1,
+        value_bound,
+        (num_candidates * len(hstu_candidates_keys),),  # pyre-ignore[6]
+    )
+    candidates_features_kjt = KeyedJaggedTensor(
+        keys=hstu_candidates_keys,
+        lengths=candidates_lengths.long(),
+        values=candidates_values.long(),
+    )
+    return uih_features_kjt, candidates_features_kjt
+
+
+class DLRMv3RandomDataset(Dataset):
+    """
+    Dataset that generates random synthetic data for DLRMv3.
+
+    Useful for testing and benchmarking without real data dependencies.
+
+    Args:
+        hstu_config: HSTU model configuration.
+        num_aggregated_samples: Total number of samples to generate.
+        is_inference: Whether the dataset is used for inference mode.
+        *args: Additional positional arguments.
+        **kwargs: Additional keyword arguments.
+    """
+
+    def __init__(
+        self,
+        hstu_config: DlrmHSTUConfig,
+        num_aggregated_samples: int = 10000,
+        is_inference: bool = False,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(
+            hstu_config=hstu_config,
+        )
+        self.hstu_config: DlrmHSTUConfig = hstu_config
+        self._max_num_candidates: int = hstu_config.max_num_candidates
+        self._max_num_candidates_inference: int = (
+            hstu_config.max_num_candidates_inference
+        )
+        self._max_seq_len: int = hstu_config.max_seq_len
+        self._uih_keys: List[str] = hstu_config.hstu_uih_feature_names
+        self._candidates_keys: List[str] = hstu_config.hstu_candidate_feature_names
+        self._contextual_feature_to_max_length: Dict[str, int] = (
+            hstu_config.contextual_feature_to_max_length
+        )
+        self._max_uih_len: int = (
+            self._max_seq_len
+            - self._max_num_candidates
+            - (
+                len(self._contextual_feature_to_max_length)
+                if self._contextual_feature_to_max_length
+                else 0
+            )
+        )
+        self._is_inference = is_inference
+
+        self.contexual_features = []
+        if hstu_config.contextual_feature_to_max_length is not None:
+            self.contexual_features = [
+                p[0] for p in hstu_config.contextual_feature_to_max_length
+            ]
+
+        self.num_aggregated_samples = num_aggregated_samples
+        self.items_in_memory = {}
+
+    def get_sample(self, id: int) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]:
+        """
+        Get a sample by ID from in-memory storage.
+
+        Args:
+            id: Sample identifier.
+
+        Returns:
+            Tuple of (uih_features_kjt, candidates_features_kjt).
+        """
+        return self.items_in_memory[id]
+
+    def get_item_count(self):
+        """
+        Get the total number of samples in the dataset.
+
+        Returns:
+            Number of aggregated samples.
+        """
+        return self.num_aggregated_samples
+
+    def unload_query_samples(self, sample_list):
+        """
+        Clear all samples from memory.
+
+        Args:
+            sample_list: Ignored; clears all samples.
+        """
+        self.items_in_memory = {}
+
+    def load_query_samples(self, sample_list):
+        """
+        Generate and load random samples into memory.
+
+        Args:
+            sample_list: List of sample IDs to generate.
+        """
+        max_num_candidates = (
+            self._max_num_candidates_inference
+            if self._is_inference
+            else self._max_num_candidates
+        )
+        self.items_in_memory = {}
+        for sample in sample_list:
+            self.items_in_memory[sample] = get_random_data(
+                contexual_features=self.contexual_features,
+                hstu_uih_keys=self.hstu_config.hstu_uih_feature_names,
+                hstu_candidates_keys=self.hstu_config.hstu_candidate_feature_names,
+                uih_max_seq_len=self._max_uih_len,
+                max_num_candidates=max_num_candidates,
+            )
+        self.last_loaded = time.time()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/kuairand.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/kuairand.py
new file mode 100644
index 000000000..f6cd9e672
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/kuairand.py
@@ -0,0 +1,163 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+import json
+import time
+from functools import partial
+from typing import Any, Dict, List
+
+import pandas as pd
+import torch
+from generative_recommenders.dlrm_v3.datasets.dataset import DLRMv3RandomDataset
+from generative_recommenders.dlrm_v3.datasets.utils import (
+    maybe_truncate_seq,
+    separate_uih_candidates,
+)
+from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig
+from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
+
+
+def process_and_hash_x(x: Any, hash_size: int) -> Any:
+    if isinstance(x, str):
+        x = json.loads(x)
+    if isinstance(x, list):
+        return [x_i % hash_size for x_i in x]
+    else:
+        return x % hash_size
+
+
+class DLRMv3KuaiRandDataset(DLRMv3RandomDataset):
+    def __init__(
+        self,
+        hstu_config: DlrmHSTUConfig,
+        embedding_config: Dict[str, Any],
+        seq_logs_file: str,
+        is_inference: bool,
+        **kwargs,
+    ) -> None:
+        super().__init__(hstu_config=hstu_config, is_inference=is_inference)
+        self.seq_logs_frame: pd.DataFrame = pd.read_csv(seq_logs_file, delimiter=",")
+        # apply hashing from embedding table config
+        for key, table in embedding_config.items():
+            assert key in self.seq_logs_frame.columns, (
+                "Rename key in embedding table configs!"
+            )
+            hash_size = table.num_embeddings
+            self.seq_logs_frame[key] = self.seq_logs_frame[key].apply(
+                partial(process_and_hash_x, hash_size=hash_size)
+            )
+
+    def get_item_count(self):
+        return len(self.seq_logs_frame)
+
+    def unload_query_samples(self, sample_list):
+        self.items_in_memory = {}
+
+    def load_query_samples(self, sample_list):
+        max_num_candidates = (
+            self._max_num_candidates_inference
+            if self._is_inference
+            else self._max_num_candidates
+        )
+        self.items_in_memory = {}
+        for idx in sample_list:
+            data = self.seq_logs_frame.iloc[idx]
+            if len(data.video_id) <= max_num_candidates:
+                continue
+            sample = self.load_item(data, max_num_candidates)
+            self.items_in_memory[idx] = sample
+
+        self.last_loaded = time.time()
+
+    def load_item(self, data, max_num_candidates):
+        with torch.profiler.record_function("load_item"):
+            video_history_uih, video_history_candidates = separate_uih_candidates(
+                data.video_id,
+                candidates_max_seq_len=max_num_candidates,
+            )
+            action_weights_uih, action_weights_candidates = separate_uih_candidates(
+                data.action_weights,
+                candidates_max_seq_len=max_num_candidates,
+            )
+            timestamps_uih, _ = separate_uih_candidates(
+                data.time_ms,
+                candidates_max_seq_len=max_num_candidates,
+            )
+            watch_time_uih, watch_time_candidates = separate_uih_candidates(
+                data.play_time_ms,
+                candidates_max_seq_len=max_num_candidates,
+            )
+
+            video_history_uih = maybe_truncate_seq(video_history_uih, self._max_uih_len)
+            action_weights_uih = maybe_truncate_seq(
+                action_weights_uih, self._max_uih_len
+            )
+            timestamps_uih = maybe_truncate_seq(timestamps_uih, self._max_uih_len)
+            watch_time_uih = maybe_truncate_seq(watch_time_uih, self._max_uih_len)
+
+            uih_seq_len = len(video_history_uih)
+            assert uih_seq_len == len(timestamps_uih), (
+                "history len differs from timestamp len."
+            )
+            assert uih_seq_len == len(action_weights_uih), (
+                "history len differs from weights len."
+            )
+            assert uih_seq_len == len(watch_time_uih), (
+                "history len differs from watch time len."
+            )
+
+            uih_kjt_values: List[torch.Tensor] = []
+            uih_kjt_lengths: List[torch.Tensor] = []
+            for name, length in self._contextual_feature_to_max_length.items():
+                uih_kjt_values.append(data[name])
+                uih_kjt_lengths.append(length)
+
+            uih_kjt_values.extend(
+                video_history_uih + timestamps_uih + action_weights_uih + watch_time_uih
+            )
+
+            uih_kjt_lengths.extend(
+                [
+                    uih_seq_len
+                    for _ in range(
+                        len(self._uih_keys)
+                        - len(self._contextual_feature_to_max_length)
+                    )
+                ]
+            )
+
+            dummy_query_time = max(timestamps_uih)
+            uih_features_kjt = KeyedJaggedTensor(
+                keys=self._uih_keys,
+                lengths=torch.tensor(uih_kjt_lengths).long(),
+                values=torch.tensor(uih_kjt_values).long(),
+            )
+
+            candidates_kjt_lengths = max_num_candidates * torch.ones(
+                len(self._candidates_keys)
+            )
+            candidates_kjt_values = (
+                video_history_candidates
+                + action_weights_candidates
+                + watch_time_candidates
+                + [dummy_query_time] * max_num_candidates
+            )
+            candidates_features_kjt = KeyedJaggedTensor(
+                keys=self._candidates_keys,
+                lengths=torch.tensor(candidates_kjt_lengths).long(),
+                values=torch.tensor(candidates_kjt_values).long(),
+            )
+
+        return uih_features_kjt, candidates_features_kjt
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/movie_lens.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/movie_lens.py
new file mode 100644
index 000000000..d74fb575b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/movie_lens.py
@@ -0,0 +1,177 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+import logging
+import time
+from typing import List, Optional
+
+import pandas as pd
+import torch
+from generative_recommenders.dlrm_v3.datasets.dataset import DLRMv3RandomDataset
+from generative_recommenders.dlrm_v3.datasets.utils import (
+    maybe_truncate_seq,
+    separate_uih_candidates,
+)
+from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig
+from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
+
+logger = logging.getLogger(__name__)
+
+
+class DLRMv3MovieLensDataset(DLRMv3RandomDataset):
+    def __init__(
+        self,
+        hstu_config: DlrmHSTUConfig,
+        ratings_file: str,
+        is_inference: bool,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(hstu_config=hstu_config, is_inference=is_inference)
+        self.ratings_frame: Optional[pd.DataFrame] = None
+        if ratings_file != "":
+            self.ratings_frame = pd.read_csv(
+                ratings_file,
+                delimiter=",",
+            )
+        assert hstu_config.action_weights is not None
+        self.action_weights: List[int] = hstu_config.action_weights
+
+    def get_item_count(self):
+        assert self.ratings_frame is not None
+        return len(self.ratings_frame)
+
+    def unload_query_samples(self, sample_list):
+        self.items_in_memory = {}
+
+    def iloc(self, idx):
+        assert self.ratings_frame is not None
+        return self.ratings_frame.iloc[idx]
+
+    def load_query_samples(self, sample_list):
+        max_num_candidates = (
+            self._max_num_candidates_inference
+            if self._is_inference
+            else self._max_num_candidates
+        )
+        self.items_in_memory = {}
+        for idx in sample_list:
+            data = self.iloc(idx)
+            if len(data.sequence_item_ids) <= max_num_candidates:
+                continue
+            sample = self.load_item(data, max_num_candidates)
+            self.items_in_memory[idx] = sample
+
+        self.last_loaded = time.time()
+
+    def get_timestamp_uih(self, data, max_num_candidates, size):
+        movie_timestamps_uih, _ = separate_uih_candidates(
+            data.sequence_timestamps,
+            candidates_max_seq_len=max_num_candidates,
+        )
+        return movie_timestamps_uih
+
+    def load_item(self, data, max_num_candidates):
+        movie_history_uih, movie_history_candidates = separate_uih_candidates(
+            data.sequence_item_ids,
+            candidates_max_seq_len=max_num_candidates,
+        )
+        movie_history_ratings_uih, movie_history_ratings_candidates = (
+            separate_uih_candidates(
+                data.sequence_ratings,
+                candidates_max_seq_len=max_num_candidates,
+            )
+        )
+        movie_timestamps_uih = self.get_timestamp_uih(
+            data=data,
+            max_num_candidates=max_num_candidates,
+            size=len(movie_history_uih),
+        )
+
+        assert len(movie_history_uih) == len(movie_timestamps_uih), (
+            "history len differs from timestamp len."
+        )
+        assert len(movie_history_uih) == len(movie_history_ratings_uih), (
+            "history len differs from ratings len."
+        )
+
+        movie_history_uih = maybe_truncate_seq(movie_history_uih, self._max_uih_len)
+        movie_history_ratings_uih = maybe_truncate_seq(
+            movie_history_ratings_uih, self._max_uih_len
+        )
+        movie_timestamps_uih = maybe_truncate_seq(
+            movie_timestamps_uih, self._max_uih_len
+        )
+
+        uih_kjt_values: List[torch.Tensor] = []
+        uih_kjt_lengths: List[torch.Tensor] = []
+        for name, length in self._contextual_feature_to_max_length.items():
+            uih_kjt_values.append(data[name])
+            uih_kjt_lengths.append(length)
+
+        uih_seq_len = len(movie_history_uih)
+        movie_dummy_watch_times_uih = [0 for _ in range(uih_seq_len)]
+        action_weights_uih = [
+            self.action_weights[int(rating) - 1] for rating in movie_history_ratings_uih
+        ]
+        uih_kjt_values.extend(
+            movie_history_uih
+            + movie_history_ratings_uih
+            + movie_timestamps_uih
+            + action_weights_uih
+            + movie_dummy_watch_times_uih
+        )
+        uih_kjt_lengths.extend(
+            [
+                uih_seq_len
+                for _ in range(
+                    len(self._uih_keys) - len(self._contextual_feature_to_max_length)
+                )
+            ]
+        )
+
+        dummy_query_time = (
+            0 if movie_timestamps_uih == [] else max(movie_timestamps_uih)
+        )
+        uih_kjt_values.append(dummy_query_time)
+        uih_kjt_lengths.append(1)
+        uih_features_kjt = KeyedJaggedTensor(
+            keys=self._uih_keys + ["dummy_query_time"],
+            lengths=torch.tensor(uih_kjt_lengths).long(),
+            values=torch.tensor(uih_kjt_values).long(),
+        )
+
+        candidates_kjt_lengths = max_num_candidates * torch.ones(
+            len(self._candidates_keys)
+        )
+        action_weights_candidates = [
+            int(rating >= 3.5) for rating in movie_history_ratings_candidates
+        ]
+        candidates_kjt_values = (
+            movie_history_candidates
+            + movie_history_ratings_candidates
+            + [dummy_query_time] * max_num_candidates  # item_query_time
+            + action_weights_candidates
+            + [1] * max_num_candidates  # item_dummy_watchtime
+        )
+        candidates_features_kjt = KeyedJaggedTensor(
+            keys=self._candidates_keys,
+            lengths=candidates_kjt_lengths.detach().clone().long(),
+            values=torch.tensor(candidates_kjt_values).long(),
+        )
+        return (
+            uih_features_kjt,
+            candidates_features_kjt,
+        )
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/synthetic_movie_lens.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/synthetic_movie_lens.py
new file mode 100644
index 000000000..6cf8a5f56
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/synthetic_movie_lens.py
@@ -0,0 +1,83 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+import csv
+import linecache
+import logging
+import sys
+from typing import List
+
+import numpy as np
+import pandas as pd
+from generative_recommenders.dlrm_v3.datasets.movie_lens import DLRMv3MovieLensDataset
+from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig
+
+csv.field_size_limit(sys.maxsize)
+logger = logging.getLogger(__name__)
+
+
+class DLRMv3SyntheticMovieLensDataset(DLRMv3MovieLensDataset):
+    def __init__(
+        self,
+        hstu_config: DlrmHSTUConfig,
+        ratings_file_prefix: str,
+        is_inference: bool,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(
+            hstu_config=hstu_config, is_inference=is_inference, ratings_file=""
+        )
+        self.ratings_file_prefix = ratings_file_prefix
+        with open(f"{self.ratings_file_prefix}_users.csv", "r") as file:
+            reader = csv.reader(file)
+            self.users_cumsum: List[int] = np.cumsum(
+                [int(row[1]) for row in reader]
+            ).tolist()
+
+    def get_item_count(self):
+        return self.users_cumsum[-1]
+
+    def _process_line(self, line: str) -> pd.Series:
+        reader = csv.reader([line])
+        parsed_line = next(reader)
+        user_id = int(parsed_line[0])
+        sequence_item_ids = parsed_line[1]
+        sequence_ratings = parsed_line[2]
+        return pd.Series(
+            data={
+                "user_id": user_id,
+                "sequence_item_ids": sequence_item_ids,
+                "sequence_ratings": sequence_ratings,
+            }
+        )
+
+    def iloc(self, idx) -> pd.Series:
+        assert idx < self.users_cumsum[-1]
+        file_idx: int = 0
+        while self.users_cumsum[file_idx] <= idx:
+            file_idx += 1
+        if file_idx == 0:
+            local_idx = idx
+        else:
+            local_idx = idx - self.users_cumsum[file_idx - 1]
+        line = linecache.getline(
+            f"{self.ratings_file_prefix}_{file_idx}.csv", local_idx + 1
+        )
+        data = self._process_line(line)
+        return data
+
+    def get_timestamp_uih(self, data, max_num_candidates, size):
+        return [1] * size
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/synthetic_streaming.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/synthetic_streaming.py
new file mode 100644
index 000000000..6e38fe334
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/synthetic_streaming.py
@@ -0,0 +1,403 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+"""
+Synthetic streaming dataset for DLRMv3 inference benchmarking.
+
+This module provides a streaming dataset implementation that loads user interaction
+data from pre-generated CSV files with temporal (timestamp) organization, suitable
+for simulating real-time recommendation scenarios.
+"""
+
+import csv
+import logging
+import sys
+import time
+from typing import Any, Dict, List, Set, Tuple
+
+import pandas as pd
+import torch
+from generative_recommenders.dlrm_v3.datasets.dataset import (
+    collate_fn,
+    DLRMv3RandomDataset,
+    Samples,
+)
+from generative_recommenders.dlrm_v3.datasets.utils import (
+    json_loads,
+    maybe_truncate_seq,
+)
+from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig
+from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
+
+csv.field_size_limit(sys.maxsize)
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class DLRMv3SyntheticStreamingDataset(DLRMv3RandomDataset):
+    """
+    Streaming dataset that loads pre-generated synthetic recommendation data.
+
+    Supports timestamp-based data organization for simulating streaming scenarios
+    where user interaction histories evolve over time.
+
+    Args:
+        hstu_config: HSTU model configuration.
+        ratings_file_prefix: Path prefix for rating data files.
+        is_inference: Whether dataset is used for inference.
+        train_ts: Number of timestamps used for training.
+        total_ts: Total number of timestamps in the data.
+        num_files: Number of data files (for parallelization).
+        num_users: Total number of users in the dataset.
+        num_items: Total number of items in the catalog.
+        num_categories: Number of item categories.
+        *args: Additional positional arguments.
+        **kwargs: Additional keyword arguments.
+    """
+
+    def __init__(
+        self,
+        hstu_config: DlrmHSTUConfig,
+        ratings_file_prefix: str,
+        is_inference: bool,
+        train_ts: int,
+        total_ts: int,
+        num_files: int,
+        num_users: int,
+        num_items: int,
+        num_categories: int,
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(hstu_config=hstu_config, is_inference=is_inference)
+        self.ratings_file_prefix = ratings_file_prefix
+        self.file_to_offsets: Dict[int, List[int]] = {}
+        with open(f"{self.ratings_file_prefix}offset.csv", "r") as file:
+            reader = csv.reader(file)
+            for size in range(num_files):
+                row = next(reader)
+                assert len(row) == 1
+                offset = json_loads(row[0])
+                assert len(offset) == num_users // num_files
+                self.file_to_offsets[size] = offset
+        self.ts_requests_offsets: List[int] = []
+        with open(f"{self.ratings_file_prefix}requests_per_ts_offset.csv", "r") as file:
+            reader = csv.reader(file)
+            row = next(reader)
+            assert len(row) == 1
+            self.ts_requests_offsets = json_loads(row[0])
+            assert len(self.ts_requests_offsets) == total_ts
+        self.requests: List[int] = []
+        self.ts_to_users_cumsum: Dict[int, List[int]] = {}
+        with open(
+            f"{self.ratings_file_prefix}users_cumsum_per_ts.csv", "r"
+        ) as cumsum_file:
+            reader = csv.reader(cumsum_file)
+            ts = 0
+            for row in reader:
+                assert len(row) == 1
+                cumsum = json_loads(row[0])
+                self.ts_to_users_cumsum[ts] = cumsum
+                ts += 1
+        self.train_ts = train_ts
+        self.total_ts = total_ts
+        self.num_files = num_files
+        self.ts: int = -1
+        self.is_inference: bool = False
+        self.is_eval: bool = False
+        self.users_per_file: int = num_users // num_files
+        self.cached_files: Set[str] = set()
+        self.items_per_category: int = num_items // num_categories
+        assert hstu_config.action_weights is not None
+        self.action_weights: List[int] = hstu_config.action_weights
+        self.items_in_memory: Dict[
+            int, Dict[int, Tuple[KeyedJaggedTensor, KeyedJaggedTensor]]
+        ] = {}
+
+    def get_item_count(self) -> int:
+        return len(self.requests)
+
+    def load_query_samples(self, sample_list: List[int]) -> None:
+        max_num_candidates = (
+            self._max_num_candidates_inference
+            if self._is_inference
+            else self._max_num_candidates
+        )
+        for idx in sample_list:
+            data = self.iloc(idx)
+            sample = self.load_item(data, max_num_candidates)
+            if self.ts not in self.items_in_memory:
+                self.items_in_memory[self.ts] = {}
+            self.items_in_memory[self.ts][idx] = sample
+
+        self.last_loaded = time.time()
+
+    def unload_query_samples(self, sample_list: List[int]) -> None:
+        self.items_in_memory = {}
+
+    def get_sample(self, id: int) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]:
+        return self.items_in_memory[self.ts][id]
+
+    def get_sample_with_ts(
+        self, id: int, ts: int
+    ) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]:
+        """
+        Get a sample for a specific timestamp.
+
+        Args:
+            id: Sample identifier.
+            ts: Timestamp index.
+
+        Returns:
+            Tuple of (uih_features_kjt, candidates_features_kjt).
+        """
+        return self.items_in_memory[ts][id]
+
+    def get_samples_with_ts(self, id_list: List[int], ts: int) -> Samples:
+        """
+        Get and collate multiple samples for a specific timestamp.
+
+        Args:
+            id_list: List of sample identifiers.
+            ts: Timestamp index.
+
+        Returns:
+            Collated Samples object.
+        """
+        list_samples = [self.get_sample_with_ts(ix, ts) for ix in id_list]
+        return collate_fn(list_samples)
+
+    def _process_line(self, line: str, user_id: int) -> pd.Series:
+        """
+        Parse a CSV line into a pandas Series with user interaction data.
+
+        Args:
+            line: CSV line containing user data.
+            user_id: User identifier.
+
+        Returns:
+            pd.Series with parsed user interaction history and candidates.
+        """
+        reader = csv.reader([line])
+        parsed_line = next(reader)
+        # total ts + one more eval ts + one base ts so that uih won't be zero
+        # for each ts, ordered as candidate_ids, candidate_ratings, uih_ids, uih_ratings
+        assert len(parsed_line) == 4 * (self.total_ts + 2)
+        uih_item_ids_list = []
+        uih_ratings_list = []
+        candidate_item_ids = ""
+        candidate_ratings = ""
+        if (not self.is_eval) and (not self.is_inference):
+            assert self.ts < self.train_ts
+            for i in range(self.ts + 1):
+                if parsed_line[4 * i]:
+                    uih_item_ids_list.append(parsed_line[2 + 4 * i])
+                    uih_ratings_list.append(parsed_line[3 + 4 * i])
+            candidate_item_ids = parsed_line[4 * (self.ts + 1)]
+            candidate_ratings = parsed_line[1 + 4 * (self.ts + 1)]
+        elif self.is_eval:
+            for i in range(self.ts + 1):
+                if parsed_line[4 * i]:
+                    uih_item_ids_list.append(parsed_line[2 + 4 * i])
+                    uih_ratings_list.append(parsed_line[3 + 4 * i])
+            candidate_item_ids = parsed_line[4 * (self.ts + 1)]
+            candidate_ratings = parsed_line[1 + 4 * (self.ts + 1)]
+        else:
+            assert self.is_inference is True
+            assert self.ts >= self.train_ts
+            for i in range(self.train_ts + 1):
+                if parsed_line[4 * i]:
+                    uih_item_ids_list.append(parsed_line[2 + 4 * i])
+                    uih_ratings_list.append(parsed_line[3 + 4 * i])
+            for i in range(self.train_ts + 2, self.ts + 2):
+                if parsed_line[4 * i]:
+                    uih_item_ids_list.append(parsed_line[2 + 4 * i])
+                    uih_ratings_list.append(parsed_line[3 + 4 * i])
+            candidate_item_ids = parsed_line[4 * (self.ts + 2)]
+            candidate_ratings = parsed_line[1 + 4 * (self.ts + 2)]
+        uih_item_ids = ",".join(uih_item_ids_list)
+        uih_ratings = ",".join(uih_ratings_list)
+        assert candidate_item_ids != "" and candidate_ratings != ""
+        return pd.Series(
+            data={
+                "user_id": user_id,
+                "uih_item_ids": uih_item_ids,
+                "uih_ratings": uih_ratings,
+                "candidate_item_ids": candidate_item_ids,
+                "candidate_ratings": candidate_ratings,
+            }
+        )
+
+    def iloc(self, idx: int) -> pd.Series:
+        """
+        Get user data by request index using file offsets for efficient access.
+
+        Args:
+            idx: Request index within the current timestamp.
+
+        Returns:
+            pd.Series with parsed user interaction data.
+        """
+        cumsum: List[int] = self.ts_to_users_cumsum[self.ts]
+        assert cumsum != []
+        assert idx < cumsum[-1]
+        file_idx: int = 0
+        while cumsum[file_idx] <= idx:
+            file_idx += 1
+        user_idx = self.requests[idx]
+        filename = f"{self.ratings_file_prefix}{file_idx}.csv"
+        with open(filename, "r") as file:
+            idx = user_idx % self.users_per_file
+            file.seek(self.file_to_offsets[file_idx][idx])
+            line = file.readline()
+        data = self._process_line(line=line, user_id=user_idx)
+        return data
+
+    def get_timestamp_uih(
+        self, data: pd.Series, max_num_candidates: int, size: int
+    ) -> List[int]:
+        return [1] * size
+
+    def set_ts(self, ts: int, train_only: bool = False) -> None:
+        """
+        Set the current timestamp and load associated request data.
+
+        Args:
+            ts: Timestamp index to set.
+            train_only: Accepted for API parity with the yambda dataset (which
+                supports a user-level train:eval holdout). This synthetic
+                dataset has no holdout, so the flag is ignored.
+        """
+        logger.warning(f"Streaming dataset ts set to {ts}")
+        if ts == self.ts:
+            return
+        self.ts = ts
+        with open(
+            f"{self.ratings_file_prefix}requests_per_ts.csv", "r"
+        ) as request_file:
+            request_file.seek(self.ts_requests_offsets[self.ts])
+            line = request_file.readline()
+            reader = csv.reader([line])
+            row = next(reader)
+            assert len(row) == 1
+            requests = json_loads(row[0])
+            self.requests = requests
+            logger.warning(f"DLRMv3SyntheticStreamingDataset: ts={ts} requests loaded")
+        assert self.ts_to_users_cumsum[self.ts][-1] == len(self.requests)
+        logger.warning(
+            f"DLRMv3SyntheticStreamingDataset: ts={ts} users_cumsum={self.ts_to_users_cumsum[self.ts]}"
+        )
+
+    def load_item(
+        self, data: pd.Series, max_num_candidates: int
+    ) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]:
+        """
+        Load and process a single user's data into KeyedJaggedTensors.
+
+        Converts parsed user data into feature tensors suitable for model input,
+        including truncation to maximum sequence lengths.
+
+        Args:
+            data: pd.Series with user interaction history and candidates.
+            max_num_candidates: Maximum number of candidates to include.
+
+        Returns:
+            Tuple of (uih_features_kjt, candidates_features_kjt).
+        """
+        ids_uih = json_loads(data.uih_item_ids)
+        ids_candidates = json_loads(data.candidate_item_ids)
+        ratings_uih = json_loads(data.uih_ratings)
+        ratings_candidates = json_loads(data.candidate_ratings)
+        timestamps_uih = self.get_timestamp_uih(
+            data=data,
+            max_num_candidates=max_num_candidates,
+            size=len(ids_uih),
+        )
+        assert len(ids_uih) == len(timestamps_uih), (
+            "history len differs from timestamp len."
+        )
+        assert len(ids_uih) == len(ratings_uih), (
+            f"history len {len(ids_uih)} differs from ratings len {len(ratings_uih)}."
+        )
+        assert len(ids_candidates) == len(ratings_candidates), (
+            f"candidates len {len(ids_candidates)} differs from ratings len {len(ratings_candidates)}."
+        )
+
+        ids_uih = maybe_truncate_seq(ids_uih, self._max_uih_len)
+        ratings_uih = maybe_truncate_seq(ratings_uih, self._max_uih_len)
+        timestamps_uih = maybe_truncate_seq(timestamps_uih, self._max_uih_len)
+        ids_candidates = maybe_truncate_seq(ids_candidates, max_num_candidates)
+        num_candidates = len(ids_candidates)
+        ratings_candidates = maybe_truncate_seq(ratings_candidates, max_num_candidates)
+        action_weights_uih = [
+            self.action_weights[int(rating) - 1] for rating in ratings_uih
+        ]
+        action_weights_candidates = [
+            int(rating >= 3.5) for rating in ratings_candidates
+        ]
+
+        uih_kjt_values: List[int] = []
+        uih_kjt_lengths: List[int] = []
+        for name, length in self._contextual_feature_to_max_length.items():
+            uih_kjt_values.append(data[name])
+            uih_kjt_lengths.append(length)
+
+        uih_seq_len = len(ids_uih)
+        dummy_watch_times_uih = [0 for _ in range(uih_seq_len)]
+        item_category_ids = [id // self.items_per_category for id in ids_uih]
+        extend_uih_kjt_values: List[int] = (
+            ids_uih
+            + ratings_uih
+            + timestamps_uih
+            + action_weights_uih
+            + dummy_watch_times_uih
+            + item_category_ids
+        )
+        uih_kjt_values.extend(extend_uih_kjt_values)
+        uih_kjt_lengths.extend(
+            [
+                uih_seq_len
+                for _ in range(
+                    len(self._uih_keys) - len(self._contextual_feature_to_max_length)
+                )
+            ]
+        )
+
+        dummy_query_time = 0 if timestamps_uih == [] else max(timestamps_uih)
+        uih_kjt_values.append(dummy_query_time)
+        uih_kjt_lengths.append(1)
+        uih_features_kjt: KeyedJaggedTensor = KeyedJaggedTensor(
+            keys=self._uih_keys + ["dummy_query_time"],
+            lengths=torch.tensor(uih_kjt_lengths).long(),
+            values=torch.tensor(uih_kjt_values).long(),
+        )
+
+        candidates_kjt_lengths = num_candidates * torch.ones(len(self._candidates_keys))
+        item_candidate_category_ids = [
+            id // self.items_per_category for id in ids_candidates
+        ]
+        candidates_kjt_values = (
+            ids_candidates
+            + ratings_candidates
+            + [dummy_query_time] * num_candidates  # item_query_time
+            + action_weights_candidates
+            + [1] * num_candidates  # item_dummy_watchtime
+            + item_candidate_category_ids
+        )
+        candidates_features_kjt: KeyedJaggedTensor = KeyedJaggedTensor(
+            keys=self._candidates_keys,
+            lengths=candidates_kjt_lengths.detach().clone().long(),
+            values=torch.tensor(candidates_kjt_values).long(),
+        )
+        return uih_features_kjt, candidates_features_kjt
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/utils.py
new file mode 100644
index 000000000..aeca75d41
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/utils.py
@@ -0,0 +1,146 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+"""
+Utility functions for dataset processing.
+
+This module provides helper functions for parsing and processing data
+in the DLRMv3 dataset pipeline.
+"""
+
+import json
+import struct
+from typing import Dict, List, Sequence, Tuple
+
+import numpy as np
+import xxhash
+
+
+def json_loads(
+    x: str | int | List[int],
+) -> List[int]:
+    """
+    Parse a JSON-like string into a list of integers.
+
+    Handles multiple input formats including JSON arrays, comma-separated
+    strings, and single values.
+
+    Args:
+        x: Input that can be a JSON array string, a single integer,
+           or already a list of integers.
+
+    Returns:
+        List of integers parsed from the input.
+    """
+    if isinstance(x, str):
+        if x[0] != "[" and x[-1] != "]":
+            x = "[" + x + "]"
+        y = json.loads(x)
+    else:
+        y = x
+    y_list = [y] if type(y) == int else list(y)
+    return y_list
+
+
+def separate_uih_candidates(
+    x: str | int | List[int],
+    candidates_max_seq_len: int,
+) -> Tuple[List[int], List[int]]:
+    """
+    Separate a sequence into user interaction history (UIH) and candidates.
+
+    Splits the input sequence such that the last `candidates_max_seq_len`
+    elements become candidates and the rest become UIH.
+
+    Args:
+        x: Input sequence as JSON string, single int, or list of ints.
+        candidates_max_seq_len: Number of items at the end to use as candidates.
+
+    Returns:
+        Tuple of (uih, candidates) where both are lists of integers.
+    """
+    if isinstance(x, str):
+        if x[0] != "[" and x[-1] != "]":
+            x = "[" + x + "]"
+        y = json.loads(x)
+    else:
+        y = x
+    y_list = [y] if type(y) == int else list(y)
+    candidates, uih = (
+        y_list[-candidates_max_seq_len:],
+        y_list[:-candidates_max_seq_len],
+    )
+    return uih, candidates
+
+
+def maybe_truncate_seq(
+    y: List[int],
+    max_seq_len: int,
+) -> List[int]:
+    """
+    Truncate a sequence if it exceeds the maximum length.
+
+    Args:
+        y: Input sequence to potentially truncate.
+        max_seq_len: Maximum allowed sequence length.
+
+    Returns:
+        The input sequence, truncated to max_seq_len if necessary.
+    """
+    y_len = len(y)
+    if y_len > max_seq_len:
+        y = y[:max_seq_len]
+    return y
+
+
+def xxhash_cross(
+    anchor: Dict[str, int],
+    keys: Sequence[str],
+    table_size: int,
+    salt: int = 0,
+) -> int:
+    """xxhash64(seed=salt) over little-endian int64 concat(anchor[k] for k in keys), mod table_size.
+
+    Bit-identical to primus_dlrm.data.hashing.cross_hash_nway — embedding rows
+    are interchangeable with Primus-trained ones.
+    """
+    n = len(keys)
+    assert n >= 2, f"xxhash_cross needs >=2 keys, got {n}"
+    digest = xxhash.xxh64(seed=salt)
+    digest.update(struct.Struct(f"<{n}q").pack(*(int(anchor[k]) for k in keys)))
+    return digest.intdigest() % table_size
+
+
+def xxhash_cross_batch(
+    arr_by_key: Dict[str, np.ndarray],
+    keys: Sequence[str],
+    table_size: int,
+    salt: int = 0,
+) -> np.ndarray:
+    """Vectorised xxhash_cross over equal-length int64 arrays (one per key)."""
+    n = len(keys)
+    assert n >= 2
+    cols = [np.asarray(arr_by_key[k], dtype=np.int64).ravel() for k in keys]
+    length = cols[0].shape[0]
+    for c in cols:
+        assert c.shape[0] == length
+    pack = struct.Struct(f"<{n}q").pack
+    digest_cls = xxhash.xxh64
+    out = np.empty(length, dtype=np.int64)
+    for i in range(length):
+        d = digest_cls(seed=salt)
+        d.update(pack(*(int(c[i]) for c in cols)))
+        out[i] = d.intdigest() % table_size
+    return out
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py
new file mode 100644
index 000000000..00b22cff9
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py
@@ -0,0 +1,1066 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+
+# pyre-unsafe
+"""
+Yambda dataset for the DLRMv3 HSTU `modules/` path.
+
+Reads the parquets produced by `dlrm_v3/preprocess_public_data.py
+--dataset yambda-<size>`. Each sample is one anchor LISTEN event with:
+  * label = (played_ratio >= LISTEN_PLUS_THRESHOLD) — the listen_plus bit
+  * a chronologically interleaved 3-pool history (listen+/like/skip), with
+    pool identity tagged per-position in `action_weight` (bits 1/2/4)
+  * 7 pre-hashed cross-feature ids exposed as length-1 contextual entries
+"""
+
+import logging
+import mmap as _mmap_mod
+import os
+import time
+from pathlib import Path
+from typing import Dict, List, Optional, Sequence, Tuple, Union
+
+import numpy as np
+import polars as pl
+import torch
+from generative_recommenders.dlrm_v3.datasets.dataset import DLRMv3RandomDataset
+from generative_recommenders.dlrm_v3.datasets.utils import xxhash_cross
+from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig
+from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
+
+logger = logging.getLogger(__name__)
+
+
+def _load_npy_readonly(path: Union[str, Path]) -> np.ndarray:
+    # MAP_SHARED + PROT_READ so the kernel does not charge the mapping against
+    # vm.overcommit_memory=2 limits. numpy's mmap_mode='r' uses MAP_PRIVATE and
+    # reserves per-process commit; at 8 ranks × ~190 GB store, that OOMs.
+    path = Path(path)
+    with open(path, "rb") as f:
+        version = np.lib.format.read_magic(f)
+        if version[0] == 1:
+            shape, _, dtype = np.lib.format.read_array_header_1_0(f)
+        else:
+            shape, _, dtype = np.lib.format.read_array_header_2_0(f)
+        offset = f.tell()
+    fd = os.open(str(path), os.O_RDONLY)
+    try:
+        buf = _mmap_mod.mmap(fd, 0, access=_mmap_mod.ACCESS_READ)
+    finally:
+        os.close(fd)
+    arr = np.ndarray(shape, dtype=dtype, buffer=buf, offset=offset)
+    arr.flags.writeable = False
+    return arr
+
+
+def _uid_unit_hash(uids: np.ndarray, salt: int) -> np.ndarray:
+    """Deterministic uniform-in-[0,1) hash of user ids (splitmix64 finalizer).
+
+    Pure function of (uid, salt): the same uid always maps to the same value,
+    so the train/eval user split is identical across processes, ranks, and
+    crash/resume — the property the no-leakage holdout relies on. Vectorized
+    uint64 arithmetic wraps mod 2**64 (defined for unsigned), so we silence the
+    benign overflow warnings.
+    """
+    GOLDEN = np.uint64(0x9E3779B97F4A7C15)
+    M1 = np.uint64(0xBF58476D1CE4E5B9)
+    M2 = np.uint64(0x94D049BB133111EB)
+    s30, s27, s31 = np.uint64(30), np.uint64(27), np.uint64(31)
+    with np.errstate(over="ignore"):
+        z = uids.astype(np.uint64) + GOLDEN + np.uint64(salt & 0xFFFFFFFFFFFFFFFF)
+        z = (z ^ (z >> s30)) * M1
+        z = (z ^ (z >> s27)) * M2
+        z = z ^ (z >> s31)
+    # Top 53 bits -> uniform [0, 1) double (same trick numpy uses for randoms).
+    return (z >> np.uint64(11)).astype(np.float64) * (1.0 / 9007199254740992.0)
+
+# Yambda event-type encoding written by preprocess_public_data.py.
+LISTEN_TYPE = 0
+LIKE_TYPE = 1
+LISTEN_PLUS_THRESHOLD = 50
+
+# Action-weight bits (must match hstu_config.action_weights = [1, 2, 4]).
+LP_BIT = 1
+LIKE_BIT = 2
+SKIP_BIT = 4
+
+
+class _FlatEventStore:
+    """Per-user flat event index built from the preprocessed sessions parquet.
+
+    Reads `train_sessions.parquet` and explodes per-session arrays into flat
+    numpy columns + per-user `(start, end)` index arrays. Cache-compatible
+    layout, but writes nothing (rebuilds from parquet each construction).
+    """
+
+    # On-disk column layout.
+    _MMAP_COLS = (
+        "flat_uid", "flat_item_ids", "flat_timestamps",
+        "flat_event_types", "flat_played_ratio",
+        "flat_is_listen_plus", "flat_is_like", "flat_is_skip",
+        "flat_is_organic",
+        "user_start", "user_end", "unique_uids",
+    )
+
+    def __init__(self, sessions_df: pl.DataFrame) -> None:
+        logger.info("Building flat event store from sessions...")
+        sorted_sessions = sessions_df.sort(["uid", "session_id"])
+        exploded = sorted_sessions.explode(
+            ["item_ids", "timestamps", "event_types", "is_organic", "played_ratio_pct"]
+        )
+
+        self.flat_uid: np.ndarray = exploded["uid"].to_numpy().astype(np.int64)
+        self.flat_item_ids: np.ndarray = exploded["item_ids"].to_numpy().astype(np.int64)
+        self.flat_timestamps: np.ndarray = exploded["timestamps"].to_numpy().astype(np.int64)
+        self.flat_event_types: np.ndarray = exploded["event_types"].to_numpy().astype(np.int64)
+        self.flat_played_ratio: np.ndarray = exploded["played_ratio_pct"].to_numpy().astype(np.float32)
+        self.flat_is_organic: np.ndarray = exploded["is_organic"].to_numpy().astype(np.int8)
+        np.nan_to_num(self.flat_played_ratio, copy=False, nan=0.0)
+
+        is_listen = self.flat_event_types == LISTEN_TYPE
+        self.flat_is_listen_plus: np.ndarray = is_listen & (
+            self.flat_played_ratio >= LISTEN_PLUS_THRESHOLD
+        )
+        self.flat_is_like: np.ndarray = self.flat_event_types == LIKE_TYPE
+        self.flat_is_skip: np.ndarray = is_listen & (
+            self.flat_played_ratio < LISTEN_PLUS_THRESHOLD
+        )
+
+        uid_changes = np.where(np.diff(self.flat_uid) != 0)[0] + 1
+        starts = np.concatenate([[0], uid_changes])
+        ends = np.concatenate([uid_changes, [len(self.flat_uid)]])
+        uid_vals = self.flat_uid[starts]
+        max_uid = int(uid_vals.max()) + 1
+        self.user_start: np.ndarray = np.full(max_uid, -1, dtype=np.int64)
+        self.user_end: np.ndarray = np.full(max_uid, -1, dtype=np.int64)
+        self.user_start[uid_vals] = starts
+        self.user_end[uid_vals] = ends
+        self.unique_uids: np.ndarray = uid_vals
+        self.num_users: int = len(uid_vals)
+        self.total_events: int = len(self.flat_item_ids)
+        logger.info(
+            f"FlatEventStore: {self.total_events:,} events, {self.num_users:,} users"
+        )
+
+    @classmethod
+    def load_mmap(cls, cache_dir: Union[str, Path]) -> "_FlatEventStore":
+        """Load flat columns by MAP_SHARED+PROT_READ from a prebuilt cache.
+        All ranks on a node share the same physical pages."""
+        import json as _json
+        cache_dir = Path(cache_dir)
+        with open(cache_dir / "store_meta.json") as f:
+            meta = _json.load(f)
+        store = object.__new__(cls)
+        for name in cls._MMAP_COLS:
+            setattr(store, name, _load_npy_readonly(cache_dir / f"{name}.npy"))
+        store.num_users = int(meta["num_users"])
+        store.total_events = int(meta["total_events"])
+        logger.info(
+            f"FlatEventStore mmap from {cache_dir}: "
+            f"{store.total_events:,} events, {store.num_users:,} users"
+        )
+        return store
+
+    def save_mmap(self, cache_dir: Union[str, Path]) -> None:
+        """Persist flat columns to disk as .npy, then write a sentinel.
+        Subsequent runs (any rank, any node sharing the FS) load via mmap."""
+        import json as _json
+        cache_dir = Path(cache_dir)
+        cache_dir.mkdir(parents=True, exist_ok=True)
+        for name in self._MMAP_COLS:
+            np.save(cache_dir / f"{name}.npy", getattr(self, name))
+        with open(cache_dir / "store_meta.json", "w") as f:
+            _json.dump(
+                {"num_users": self.num_users, "total_events": self.total_events}, f
+            )
+        # Sentinel — readers check this before mmap'ing to avoid partial files.
+        (cache_dir / "_READY").touch()
+        logger.info(f"FlatEventStore saved to {cache_dir}")
+
+
+class DLRMv3YambdaDataset(DLRMv3RandomDataset):
+    """Yambda-5b dataset for the DLRMv3 HSTU modules/ path.
+
+    Args:
+        hstu_config: DlrmHSTUConfig (must come from `get_hstu_configs("yambda-5b")`).
+        processed_dir: directory with `train_sessions.parquet` + `item_popularity.npy`.
+        metadata_dir: directory with `{artist,album}_item_mapping.parquet`.
+        history_length: UIH cap. Under "interleaved" it is the per-pool cap
+            (total ≤ 3 * history_length // 3); under "last_n" it is the literal
+            total number of pooled events kept.
+        scan_window: how far back to scan when filling each pool.
+        history_strategy: "interleaved" (equal per-pool L//3 cap, re-interleaved)
+            or "last_n" (last history_length pooled events, no per-pool split).
+        cross_specs: list of (name, keys, num_embeddings, salt). Source of truth
+            in `dlrm_v3/configs.py:YAMBDA_5B_CROSS_SPECS`.
+        is_inference: passed through to base class.
+    """
+
+    def __init__(
+        self,
+        hstu_config: DlrmHSTUConfig,
+        processed_dir: str,
+        metadata_dir: str,
+        history_length: int = 2048,
+        scan_window: int = 20000,
+        min_history: Optional[int] = None,
+        history_strategy: str = "interleaved",
+        cross_specs: Optional[Sequence[Tuple[str, Sequence[str], int, int]]] = None,
+        cache_dir: Optional[str] = None,
+        is_inference: bool = False,
+        streaming_window_seconds: int = 86400,
+        streaming_sort_within_window: bool = False,
+        streaming_shuffle_fraction: float = 0.0,
+        streaming_shuffle_seed: int = 0,
+        train_split_percentage: float = 1.0,
+        split_salt: int = 0,
+        *args,
+        **kwargs,
+    ) -> None:
+        super().__init__(hstu_config=hstu_config, is_inference=is_inference)
+        self._processed_dir: str = processed_dir
+        self._metadata_dir: str = metadata_dir
+        self._history_length: int = history_length
+        self._scan_window: int = scan_window
+        # UIH construction strategy:
+        #   "interleaved" (default) — equal history_length//3 cap per behavior
+        #     pool (listen+/like/skip), re-interleaved chronologically. Likes are
+        #     ~1.9% of the corpus so the like pool over-fills relative to its
+        #     natural frequency while the sequence under-fills overall.
+        #   "last_n" — take the last history_length events of ANY pool type with
+        #     no per-pool split. Fills the sequence to ~history_length (higher
+        #     effective length) and lets the like share fall to its natural rate.
+        # Both exclude dislike/unlike/undislike (no action-weight bit).
+        if history_strategy not in ("interleaved", "last_n"):
+            raise ValueError(
+                f"history_strategy must be 'interleaved' or 'last_n', got "
+                f"{history_strategy!r}"
+            )
+        self._history_strategy: str = history_strategy
+        # Minimum prior-event count for a LISTEN event to qualify as an anchor.
+        # Decoupled from history_length (which is only the gather/truncation cap):
+        # jagged attention handles short UIH, so we no longer require a full
+        # history_length of context to include a sample. Default None preserves
+        # the legacy "need a full history_length of prior events" behavior (which
+        # dropped ~60% of users); set small (e.g. 1) to include ~all users.
+        self._min_history: int = (
+            history_length if min_history is None else int(min_history)
+        )
+        # Streaming/temporal-order state. Everything here is LAZY: nothing is
+        # built or read until the first set_ts()/num_windows() call (only the
+        # streaming-train-eval loop does that), so the default train-eval path
+        # is byte-for-byte unaffected.
+        self._streaming_window_seconds: int = streaming_window_seconds
+        self._streaming_sort_within_window: bool = streaming_sort_within_window
+        # In-window shuffle dial in [0, 1] to break user-major batching (default
+        # 0.0 = off, user-major order preserved for page-local mmap scans). Maps to
+        # a within-segment shuffle with K = round(fraction * per-window train-anchor
+        # count): 1.0 = full per-element shuffle (max user diversity per batch),
+        # intermediate = interpolation. Computed from the global anchor count BEFORE
+        # round-robin striding, so a given fraction yields the same diversity
+        # regardless of world_size / #nodes / batch_size (config-invariant). The
+        # permutation is a pure function of (seed, ts) so the per-rank round-robin
+        # slice + mid-window resume skip stay deterministic across restarts.
+        self._streaming_shuffle_fraction: float = streaming_shuffle_fraction
+        self._streaming_shuffle_seed: int = streaming_shuffle_seed
+        # User-level train:eval split. `train_split_percentage >= 1.0` means no
+        # holdout (legacy behavior: every anchor is trainable). Otherwise the
+        # top `1 - train_split_percentage` fraction of users (by a deterministic
+        # hash of `uid + split_salt`) are held out: NEVER trained, used only to
+        # build the fixed eval set. The split is a pure function of (uid, salt),
+        # so it is identical across crash/resume (no leakage on failover).
+        self._train_split_percentage: float = train_split_percentage
+        self._split_salt: int = split_salt
+        # Cache only the (small) fixed eval-holdout index list; the per-window
+        # train filter is computed on the fly to avoid a full-length mask.
+        self._eval_holdout_cache: Optional[np.ndarray] = None
+        self._eval_holdout_cache_key: Optional[Tuple[int, int]] = None
+        self._active: Optional[np.ndarray] = None
+        self.is_eval: bool = False
+        self._anchor_ts: Optional[np.ndarray] = None
+        self._t_min: Optional[int] = None
+        self._t_max: Optional[int] = None
+        self._cache_dir: Optional[str] = cache_dir
+        self._cross_specs: List[Tuple[str, Tuple[str, ...], int, int]] = [
+            (name, tuple(keys), n, s) for (name, keys, n, s) in (cross_specs or [])
+        ]
+        assert hstu_config.action_weights is not None
+        self._action_weights: List[int] = hstu_config.action_weights
+
+        self._load_metadata(metadata_dir)
+        # Build-once-mmap-many: first rank to arrive acquires the build lock
+        # and explodes the parquet (one ~190 GB in-memory pass), then writes
+        # flat .npy columns + _READY sentinel. All ranks (including the
+        # builder, after dropping its in-memory copy) reload via MAP_SHARED+
+        # PROT_READ — kernel shares physical pages across ranks so the steady-
+        # state per-rank RSS for the dataset is ~0.
+        if cache_dir is None:
+            cache_dir = os.path.join(processed_dir, f"hstu_cache_L{history_length}")
+        self._cache_dir = cache_dir
+        self._ensure_cache_built(cache_dir, processed_dir, history_length)
+        self.store: _FlatEventStore = _FlatEventStore.load_mmap(cache_dir)
+        # Anchor positions depend on min_history (the eligibility floor), not
+        # just history_length (the gather cap), so they live in a
+        # min_history-versioned file that shares the flat store. Built
+        # independently of the _READY sentinel so changing the floor rebuilds
+        # only this (cheap) array, not the whole 150 GB cache.
+        self._positions_name: str = self._positions_filename(
+            history_length, self._min_history
+        )
+        self._ensure_positions_built(
+            cache_dir, self._positions_name, self._min_history
+        )
+        self._positions: np.ndarray = _load_npy_readonly(
+            os.path.join(cache_dir, self._positions_name)
+        )
+        logger.info(
+            f"Yambda dataset ready: {self.store.total_events:,} events, "
+            f"{len(self._positions):,} training positions"
+        )
+
+    @staticmethod
+    def _positions_filename(history_length: int, min_history: int) -> str:
+        """Anchor-positions filename. Uses the legacy name when the floor equals
+        the gather cap (the historical "full history required" behavior) so
+        existing caches are reused as-is; otherwise a min_history-tagged name."""
+        if min_history == history_length:
+            return f"positions_L{history_length}.npy"
+        return f"positions_L{history_length}_m{min_history}.npy"
+
+    @staticmethod
+    def _ensure_positions_built(
+        cache_dir: str, positions_name: str, min_history: int
+    ) -> None:
+        """Build the anchor-positions array for ``min_history`` if absent.
+
+        Anchors are LISTEN events whose user-local offset is >= ``min_history``
+        (i.e. the user already has that many prior events). This is decoupled
+        from the _READY-gated flat-store build so a new floor only rebuilds this
+        (cheap, ~one int64 scan) array rather than the whole 150 GB cache.
+        Multi-rank safe via an exclusive lock + atomic rename; all ranks then
+        mmap the result read-only.
+        """
+        import fcntl
+
+        positions_path = os.path.join(cache_dir, positions_name)
+        if os.path.exists(positions_path):
+            return
+        lock_path = os.path.join(cache_dir, "_positions_lock")
+        with open(lock_path, "w") as lf:
+            logger.info(f"Acquiring positions build lock for {positions_path}...")
+            fcntl.flock(lf, fcntl.LOCK_EX)
+            try:
+                if os.path.exists(positions_path):
+                    return
+                flat_uid = _load_npy_readonly(
+                    os.path.join(cache_dir, "flat_uid.npy")
+                )
+                event_types = _load_npy_readonly(
+                    os.path.join(cache_dir, "flat_event_types.npy")
+                )
+                user_start = _load_npy_readonly(
+                    os.path.join(cache_dir, "user_start.npy")
+                )
+                idx = np.arange(len(flat_uid), dtype=np.int64)
+                keep = (idx - user_start[flat_uid] >= min_history) & (
+                    event_types == LISTEN_TYPE
+                )
+                positions = np.where(keep)[0].astype(np.int64)
+                tmp = positions_path + ".tmp.npy"
+                np.save(tmp, positions)
+                os.replace(tmp, positions_path)
+                logger.info(
+                    f"Wrote {positions_name}: {len(positions):,} anchors "
+                    f"(min_history={min_history})"
+                )
+            finally:
+                fcntl.flock(lf, fcntl.LOCK_UN)
+
+    @staticmethod
+    def _ensure_cache_built(
+        cache_dir: str, processed_dir: str, history_length: int
+    ) -> None:
+        """File-locked one-shot build with column-at-a-time explode.
+
+        A naive `pl.read_parquet(...).explode([5 list cols])` peaks at ~1.6 TB
+        on the 5b dataset (polars holds input list-columns + dense output +
+        parallel-worker scratch all together). Instead we:
+          1) Read parquet + sort once (sorted list-column DF, ~80 GB).
+          2) For each output column: select that single list, explode, write
+             .npy, drop. Bounds incremental peak to one column (~38 GB).
+          3) Derive bool flags and indices from the on-disk mmaps.
+
+        Peak RAM: ~150 GB. Steady state across all ranks afterward: ~0
+        incremental thanks to MAP_SHARED in load_mmap.
+        """
+        import fcntl
+        import gc
+        import json as _json
+
+        ready = os.path.join(cache_dir, "_READY")
+        if os.path.exists(ready):
+            return
+        os.makedirs(cache_dir, exist_ok=True)
+        lock_path = os.path.join(cache_dir, "_lock")
+        with open(lock_path, "w") as lf:
+            logger.info(f"Acquiring build lock for {cache_dir}...")
+            fcntl.flock(lf, fcntl.LOCK_EX)
+            try:
+                if os.path.exists(ready):
+                    return
+                parquet_path = os.path.join(processed_dir, "train_sessions.parquet")
+                logger.info(
+                    f"Building flat-event cache from {parquet_path} "
+                    f"(column-at-a-time, ~150 GB peak RAM)"
+                )
+
+                # Step 1: read + sort. List columns stay nested at this stage.
+                sessions = pl.read_parquet(parquet_path).sort(["uid", "session_id"])
+                logger.info(f"Sessions sorted: {sessions.shape}")
+
+                # Per-session lengths + uids — used to derive flat_uid via
+                # np.repeat (cheap) without exploding the whole DF at once.
+                lengths = (
+                    sessions.select(pl.col("item_ids").list.len())
+                    .to_numpy()
+                    .reshape(-1)
+                    .astype(np.int64)
+                )
+                session_uids = sessions["uid"].to_numpy().astype(np.int64)
+                N = int(lengths.sum())
+                num_users = int(np.unique(session_uids).shape[0])
+                logger.info(f"Total events: {N:,}, users: {num_users:,}")
+
+                # Step 2: column-at-a-time explode → save → drop.
+                # uid is per-session scalar; expand via np.repeat.
+                flat_uid = np.repeat(session_uids, lengths).astype(np.int64)
+                np.save(os.path.join(cache_dir, "flat_uid.npy"), flat_uid)
+                del flat_uid, session_uids, lengths
+                gc.collect()
+                logger.info("Wrote flat_uid.npy")
+
+                # Derived columns flat_is_listen_plus/like/skip depend on
+                # event_types + played_ratio. Save those two first, then
+                # derive the bools from the mmaps.
+                _list_cols = [
+                    ("item_ids", "flat_item_ids", np.int64),
+                    ("timestamps", "flat_timestamps", np.int64),
+                    ("event_types", "flat_event_types", np.int64),
+                    ("is_organic", "flat_is_organic", np.int8),
+                    ("played_ratio_pct", "flat_played_ratio", np.float32),
+                ]
+                for src_col, dst_name, dtype in _list_cols:
+                    exploded = sessions.select(pl.col(src_col).explode())
+                    arr = exploded[src_col].to_numpy().astype(dtype, copy=False)
+                    if dtype == np.float32:
+                        np.nan_to_num(arr, copy=False, nan=0.0)
+                    np.save(os.path.join(cache_dir, f"{dst_name}.npy"), arr)
+                    del exploded, arr
+                    gc.collect()
+                    logger.info(f"Wrote {dst_name}.npy")
+
+                # Drop the sessions DF now that all source columns are on disk.
+                del sessions
+                gc.collect()
+
+                # Step 3: derive bool flags from the just-written mmaps.
+                event_types = _load_npy_readonly(
+                    os.path.join(cache_dir, "flat_event_types.npy")
+                )
+                played_ratio = _load_npy_readonly(
+                    os.path.join(cache_dir, "flat_played_ratio.npy")
+                )
+                is_listen = event_types == LISTEN_TYPE
+                np.save(
+                    os.path.join(cache_dir, "flat_is_listen_plus.npy"),
+                    is_listen & (played_ratio >= LISTEN_PLUS_THRESHOLD),
+                )
+                np.save(
+                    os.path.join(cache_dir, "flat_is_like.npy"),
+                    event_types == LIKE_TYPE,
+                )
+                np.save(
+                    os.path.join(cache_dir, "flat_is_skip.npy"),
+                    is_listen & (played_ratio < LISTEN_PLUS_THRESHOLD),
+                )
+                del is_listen, played_ratio
+                gc.collect()
+                logger.info("Wrote flat_is_listen_plus/like/skip.npy")
+
+                # user_start / user_end / unique_uids from flat_uid mmap.
+                flat_uid = _load_npy_readonly(
+                    os.path.join(cache_dir, "flat_uid.npy")
+                )
+                uid_changes = np.where(np.diff(flat_uid) != 0)[0] + 1
+                starts = np.concatenate([[0], uid_changes])
+                ends = np.concatenate([uid_changes, [len(flat_uid)]])
+                uid_vals = flat_uid[starts]
+                max_uid = int(uid_vals.max()) + 1
+                user_start = np.full(max_uid, -1, dtype=np.int64)
+                user_end = np.full(max_uid, -1, dtype=np.int64)
+                user_start[uid_vals] = starts
+                user_end[uid_vals] = ends
+                np.save(os.path.join(cache_dir, "user_start.npy"), user_start)
+                np.save(os.path.join(cache_dir, "user_end.npy"), user_end)
+                np.save(os.path.join(cache_dir, "unique_uids.npy"), uid_vals)
+                logger.info("Wrote user_start/end/unique_uids.npy")
+
+                # Positions: LISTEN events with ≥history_length prior history.
+                # Done now (before dropping user_start) so all sibling ranks
+                # just mmap the result instead of each running a 75 GB build.
+                user_start_per_event = user_start[flat_uid]
+                idx = np.arange(len(flat_uid), dtype=np.int64)
+                keep = (idx - user_start_per_event >= history_length) & (
+                    event_types == LISTEN_TYPE
+                )
+                positions = np.where(keep)[0].astype(np.int64)
+                np.save(
+                    os.path.join(cache_dir, f"positions_L{history_length}.npy"),
+                    positions,
+                )
+                logger.info(
+                    f"Wrote positions_L{history_length}.npy: {len(positions):,}"
+                )
+                del (
+                    flat_uid, event_types, user_start, user_end, uid_vals,
+                    starts, ends, uid_changes, idx, user_start_per_event,
+                    keep, positions,
+                )
+                gc.collect()
+
+                # Meta + sentinel — written last; readers gate on _READY.
+                with open(os.path.join(cache_dir, "store_meta.json"), "w") as f:
+                    _json.dump(
+                        {"num_users": num_users, "total_events": N}, f
+                    )
+                open(os.path.join(cache_dir, "_READY"), "w").close()
+                logger.info(f"Cache build complete: {cache_dir}")
+            finally:
+                fcntl.flock(lf, fcntl.LOCK_UN)
+
+    def _load_metadata(self, metadata_dir: str) -> None:
+        item_pop_path = os.path.join(metadata_dir, "item_popularity.npy")
+        if os.path.exists(item_pop_path):
+            item_popularity = np.load(item_pop_path)
+        else:
+            # Fallback: derive vocab size from the artist+album maps.
+            item_popularity = None
+
+        artist_map = pl.read_parquet(os.path.join(metadata_dir, "artist_item_mapping.parquet"))
+        album_map = pl.read_parquet(os.path.join(metadata_dir, "album_item_mapping.parquet"))
+        n_items = int(
+            max(
+                int(artist_map["item_id"].max()) + 1,
+                int(album_map["item_id"].max()) + 1,
+                len(item_popularity) if item_popularity is not None else 0,
+            )
+        )
+        self.item_to_artist: np.ndarray = np.zeros(n_items, dtype=np.int64)
+        valid = artist_map.filter(pl.col("item_id") < n_items)
+        self.item_to_artist[valid["item_id"].to_numpy()] = valid["artist_id"].to_numpy()
+        self.item_to_album: np.ndarray = np.zeros(n_items, dtype=np.int64)
+        valid = album_map.filter(pl.col("item_id") < n_items)
+        self.item_to_album[valid["item_id"].to_numpy()] = valid["album_id"].to_numpy()
+        self.num_items: int = n_items
+
+    def get_item_count(self) -> int:
+        # Streaming mode restricts the active set to the current time window;
+        # otherwise the full (user-major) anchor list is used (train-eval).
+        if self._active is not None:
+            return int(len(self._active))
+        return int(len(self._positions))
+
+    def iloc(self, idx: int) -> int:
+        if self._active is not None:
+            return int(self._positions[self._active[idx]])
+        return int(self._positions[idx])
+
+    def _ensure_streaming_index(self) -> None:
+        """Lazily build + mmap the per-anchor target-timestamp array used for
+        time-windowed streaming.
+
+        Built only on the first ``set_ts()``/``num_windows()`` call, so the
+        default train-eval path never reads timestamps or writes a new file.
+        Multi-rank safe via an exclusive file lock + atomic rename; all ranks
+        then mmap the result read-only (shared physical pages, ~0 anon).
+        """
+        if self._anchor_ts is not None:
+            return
+        import fcntl
+
+        assert self._cache_dir is not None
+        # Target-ts array is per-anchor, so it must track the same min_history
+        # versioning as the positions file it indexes into.
+        anchor_path = os.path.join(
+            self._cache_dir,
+            self._positions_name.replace("positions_", "anchor_ts_", 1),
+        )
+        if not os.path.exists(anchor_path):
+            lock_path = os.path.join(self._cache_dir, "_anchor_ts_lock")
+            with open(lock_path, "w") as lf:
+                logger.info(f"Acquiring anchor-ts build lock for {anchor_path}...")
+                fcntl.flock(lf, fcntl.LOCK_EX)
+                if not os.path.exists(anchor_path):
+                    logger.info(
+                        f"Building {anchor_path}: target ts for "
+                        f"{len(self._positions):,} anchors"
+                    )
+                    anchor_ts = self.store.flat_timestamps[self._positions]
+                    tmp = anchor_path + ".tmp.npy"
+                    np.save(tmp, anchor_ts)
+                    os.replace(tmp, anchor_path)
+                    del anchor_ts
+        self._anchor_ts = _load_npy_readonly(anchor_path)
+        self._t_min = int(self._anchor_ts.min())
+        self._t_max = int(self._anchor_ts.max())
+
+    def num_windows(self) -> int:
+        """Number of fixed-duration windows spanning [t_min, t_max]."""
+        self._ensure_streaming_index()
+        assert self._t_min is not None and self._t_max is not None
+        span = self._t_max - self._t_min + 1
+        w = self._streaming_window_seconds
+        return int((span + w - 1) // w)
+
+    def window_indices(
+        self, ts: int, sort_by_time: Optional[bool] = None
+    ) -> np.ndarray:
+        """Global anchor indices (into ``_positions``) whose target timestamp is
+        in window ``ts``: ``[t_min + ts*W, t_min + (ts+1)*W)``.
+
+        Returned in ascending global-index order (user-major), which keeps the
+        per-sample history scans page-local in the mmap'd event arrays. Used by
+        the per-window path (via ``set_ts``) and the persistent path (shipped to
+        workers through the sampler). ``sort_by_time`` defaults to
+        ``streaming_sort_within_window``.
+
+        Note: an O(log N) variant using a cached argsort of the timestamps was
+        evaluated but rejected — it doubles resident mmap (sorted-ts + order
+        permutation, ~52 GB) and that extra residency evicts the event-array
+        page cache, stalling dataloader workers (NCCL watchdog timeouts). The
+        O(N) mask here keeps only one ~26 GB array resident and is robust.
+        """
+        self._ensure_streaming_index()
+        assert self._anchor_ts is not None and self._t_min is not None
+        w = self._streaming_window_seconds
+        lo = self._t_min + ts * w
+        hi = lo + w
+        idx = np.where((self._anchor_ts >= lo) & (self._anchor_ts < hi))[0]
+        do_sort = (
+            self._streaming_sort_within_window if sort_by_time is None else sort_by_time
+        )
+        if do_sort and idx.size > 0:
+            idx = idx[np.argsort(self._anchor_ts[idx], kind="stable")]
+        logger.warning(f"window_indices({ts}): [{lo}, {hi}) -> {idx.size:,} anchors")
+        return idx.astype(np.int64)
+
+    def _eval_anchor_mask(self, anchor_idx: np.ndarray) -> np.ndarray:
+        """Bool mask (aligned to ``anchor_idx``) marking held-out eval users.
+
+        Computed on the fly for just this slice of anchors (a window is ~tens of
+        millions, not the full ~3B ``_positions``), so we never materialize a
+        full-length mask. ``uid``-hash >= ``train_split_percentage`` -> eval.
+        """
+        uids = self.store.flat_uid[self._positions[anchor_idx]]
+        return _uid_unit_hash(uids, self._split_salt) >= self._train_split_percentage
+
+    def _shuffle_window(self, idx: np.ndarray, ts: int) -> np.ndarray:
+        """Optionally break user-major ordering within a train window.
+
+        ``streaming_shuffle_fraction`` (0..1) is the single diversity dial. It
+        maps to a within-segment shuffle with ``K = round(fraction * N)`` where
+        ``N`` is this window's train-anchor count:
+
+          - 0.0  -> off: return ``idx`` unchanged (user-major, page-local scans).
+          - 1.0  -> full per-element shuffle (max user diversity per batch).
+          - else -> permute WITHIN each contiguous size-K segment (segment order
+            preserved). A per-rank batch then draws across a bounded user-major
+            region, so diversity scales with the fraction while the concurrently
+            touched mmap working set stays within ~one K-segment (page locality).
+
+        Because ``N`` is a property of the dataset/window (not the compute layout)
+        and the permutation is applied BEFORE the per-rank round-robin striding, a
+        given fraction yields the same diversity across world_size / #nodes /
+        batch_size (config-invariant).
+
+        The permutation is a pure function of ``(seed, ts)`` via
+        ``np.random.default_rng(seed + ts)``, so every (re)run of this window
+        yields the IDENTICAL order. This keeps the per-rank round-robin slice and
+        the mid-window resume ``skip_samples`` offset consistent across restarts,
+        exactly like the unshuffled path.
+        """
+        frac = self._streaming_shuffle_fraction
+        if idx.size <= 1 or not frac or frac <= 0.0:
+            return idx
+        rng = np.random.default_rng(self._streaming_shuffle_seed + ts)
+        if frac >= 1.0:
+            return idx[rng.permutation(idx.size)]
+        # Within-segment shuffle (K = round(fraction * N)): a single vectorized
+        # lexsort over per-element random keys, stable within each size-K segment
+        # so elements never cross a segment boundary (bounds the working set). O(N
+        # log K), run once per window in the background prep thread.
+        n = idx.size
+        k = max(1, int(round(frac * n)))
+        seg = np.arange(n, dtype=np.int64) // k
+        keys = rng.random(n)
+        order = np.lexsort((keys, seg))
+        return idx[order]
+
+    def train_window_indices(self, ts: int) -> np.ndarray:
+        """Global anchor indices for TRAIN in window ``ts``: ``window_indices``
+        with held-out eval users removed. Identical across resume because
+        ``window_indices``, the uid hash, and the (seed,ts)-keyed in-window
+        shuffle are all pure functions, so the per-rank round-robin slice (and
+        the mid-window skip offset) stay consistent."""
+        idx = self.window_indices(ts)
+        if self._train_split_percentage >= 1.0:
+            return self._shuffle_window(idx, ts)
+        kept = idx[~self._eval_anchor_mask(idx)]
+        logger.warning(
+            f"train_window_indices({ts}): {idx.size:,} -> {kept.size:,} anchors "
+            f"(holdout tsp={self._train_split_percentage}, salt={self._split_salt})"
+        )
+        return self._shuffle_window(kept, ts)
+
+    def eval_holdout_indices(self, start_ts: int, num_windows: int = 1) -> np.ndarray:
+        """Fixed eval set: held-out users' anchors over windows
+        ``[start_ts, start_ts + num_windows)``. Computed once and cached, so the
+        SAME anchors are evaluated at every eval step (stable, comparable curve).
+        With no holdout (tsp>=1.0) this falls back to the full window(s)."""
+        key = (int(start_ts), int(num_windows))
+        if self._eval_holdout_cache is not None and self._eval_holdout_cache_key == key:
+            return self._eval_holdout_cache
+        parts: List[np.ndarray] = []
+        for ts in range(start_ts, start_ts + max(1, num_windows)):
+            idx = self.window_indices(ts)
+            if self._train_split_percentage < 1.0:
+                idx = idx[self._eval_anchor_mask(idx)]
+            parts.append(idx)
+        holdout = (
+            np.concatenate(parts).astype(np.int64)
+            if parts
+            else np.empty(0, dtype=np.int64)
+        )
+        logger.warning(
+            f"eval_holdout_indices(start_ts={start_ts}, num_windows={num_windows}): "
+            f"{holdout.size:,} held-out anchors (tsp={self._train_split_percentage})"
+        )
+        self._eval_holdout_cache = holdout
+        self._eval_holdout_cache_key = key
+        return holdout
+
+    def total_train_anchors(self, start_ts: int, num_ts: int) -> int:
+        """Total TRAIN anchors across windows ``[start_ts, start_ts + num_ts)``.
+
+        A single O(N) pass over the cached ``_anchor_ts`` array (NOT per-window
+        ``train_window_indices`` scans). Used to convert a "fraction of training
+        data" eval cadence into a global train-step interval. With a user holdout
+        (``train_split_percentage`` < 1.0) the held-out eval users are excluded
+        via the SAME uid hash as ``train_window_indices``, so the count matches
+        what is actually trained.
+
+        NOTE: this is an UPPER BOUND on the realized train STEP count — the
+        per-window samplers truncate each window to a multiple of ``world_size``
+        and drop the last partial per-rank batch (``drop_last=True``). The small
+        overcount is acceptable for a cadence knob (it only shifts the eval grid
+        by a fraction of a window).
+        """
+        self._ensure_streaming_index()
+        assert self._anchor_ts is not None and self._t_min is not None
+        if num_ts <= 0:
+            return 0
+        w = self._streaming_window_seconds
+        lo = self._t_min + start_ts * w
+        hi = self._t_min + (start_ts + num_ts) * w
+        in_range = (self._anchor_ts >= lo) & (self._anchor_ts < hi)
+        if self._train_split_percentage >= 1.0:
+            total = int(np.count_nonzero(in_range))
+        else:
+            sel = np.where(in_range)[0]
+            total = int(np.count_nonzero(~self._eval_anchor_mask(sel)))
+        logger.warning(
+            f"total_train_anchors(start_ts={start_ts}, num_ts={num_ts}): "
+            f"{total:,} train anchors (tsp={self._train_split_percentage})"
+        )
+        return total
+
+    def set_ts(self, ts: int, train_only: bool = False) -> None:
+        """Restrict the active sample set to anchors in window ``ts`` (used by
+        the per-window-DataLoader path, where ``iloc``/``get_item_count`` index
+        through ``_active``).
+
+        ``train_only=True`` removes held-out eval users so the non-persistent
+        TRAIN loader never sees them (closes the leakage path). Forward-only
+        temporal slicing for streaming train/eval. History for any anchor is
+        still gathered causally (``scan_start:flat_pos``) and may span earlier
+        windows, so there is no feature leakage from future events.
+        """
+        self._active = (
+            self.train_window_indices(ts) if train_only else self.window_indices(ts)
+        )
+
+    def set_active_indices(self, indices: np.ndarray) -> None:
+        """Restrict the active sample set to an explicit array of global anchor
+        indices (into ``_positions``). Used by the non-persistent eval path to
+        iterate the fixed user-holdout set (which spans a window range, not a
+        single ``ts``)."""
+        self._active = np.asarray(indices, dtype=np.int64)
+
+    def load_query_samples(self, sample_list) -> None:
+        max_num_candidates = (
+            self._max_num_candidates_inference
+            if self._is_inference
+            else self._max_num_candidates
+        )
+        self.items_in_memory = {}
+        for idx in sample_list:
+            flat_pos = self.iloc(idx)
+            self.items_in_memory[idx] = self._build_sample(flat_pos, max_num_candidates)
+        self.last_loaded = time.time()
+
+    def get_sample(self, idx: int) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]:
+        if idx in self.items_in_memory:
+            return self.items_in_memory[idx]
+        max_num_candidates = (
+            self._max_num_candidates_inference
+            if self._is_inference
+            else self._max_num_candidates
+        )
+        flat_pos = self.iloc(idx)
+        return self._build_sample(flat_pos, max_num_candidates)
+
+    @staticmethod
+    def _empty_history() -> Tuple[
+        np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray
+    ]:
+        empty = np.empty(0, dtype=np.int64)
+        return empty, empty, empty, empty, empty
+
+    def _read_scan_window(
+        self, flat_pos: int, user_start: int
+    ) -> Optional[
+        Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]
+    ]:
+        """Read the causal scan window [scan_start, flat_pos) for an anchor.
+        Returns (item_ids, timestamps, is_lp, is_like, is_skip) views, or None
+        if the window is empty."""
+        scan_start = max(int(user_start), int(flat_pos) - self._scan_window)
+        scan_end = int(flat_pos)
+        if scan_end <= scan_start:
+            return None
+        return (
+            self.store.flat_item_ids[scan_start:scan_end],
+            self.store.flat_timestamps[scan_start:scan_end],
+            self.store.flat_is_listen_plus[scan_start:scan_end],
+            self.store.flat_is_like[scan_start:scan_end],
+            self.store.flat_is_skip[scan_start:scan_end],
+        )
+
+    def _materialize_history(
+        self,
+        keep_local: np.ndarray,
+        item_ids: np.ndarray,
+        timestamps: np.ndarray,
+        is_lp: np.ndarray,
+        is_like: np.ndarray,
+        is_skip: np.ndarray,
+    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+        """Gather item/artist/album/ts + pool-bitmask `weight` for the kept
+        (chronologically-ordered) local indices."""
+        items = item_ids[keep_local]
+        ts = timestamps[keep_local]
+        artists = self.item_to_artist[np.clip(items, 0, self.item_to_artist.shape[0] - 1)]
+        albums = self.item_to_album[np.clip(items, 0, self.item_to_album.shape[0] - 1)]
+        # Pool bitmask per kept event (LP/LIKE/SKIP are mutually exclusive in
+        # the source data, but OR is safe and forward-compatible).
+        weight = np.zeros(keep_local.shape[0], dtype=np.int64)
+        weight[is_lp[keep_local]] |= LP_BIT
+        weight[is_like[keep_local]] |= LIKE_BIT
+        weight[is_skip[keep_local]] |= SKIP_BIT
+        return items, artists, albums, ts, weight
+
+    def _gather_history(
+        self, flat_pos: int, user_start: int
+    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+        """Dispatch UIH construction to the configured strategy."""
+        if self._history_strategy == "last_n":
+            return self._gather_last_n_history(flat_pos, user_start)
+        return self._gather_interleaved_history(flat_pos, user_start)
+
+    def _gather_interleaved_history(
+        self, flat_pos: int, user_start: int
+    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+        """Build a single chronologically-ordered history sequence from the 3
+        behavior pools. Each event's `action_weight` carries the pool bitmask
+        (LP_BIT/LIKE_BIT/SKIP_BIT). Per-pool cap = history_length // 3."""
+        L = self._history_length
+        per_pool = max(1, L // 3)
+        scan = self._read_scan_window(flat_pos, user_start)
+        if scan is None:
+            return self._empty_history()
+        item_ids, timestamps, is_lp, is_like, is_skip = scan
+
+        # Local indices into the scan window — preserves chronological order
+        # within each pool and lets us interleave by re-sorting.
+        idx_all = np.arange(item_ids.shape[0], dtype=np.int64)
+        lp_idx = idx_all[is_lp][-per_pool:]
+        like_idx = idx_all[is_like][-per_pool:]
+        skip_idx = idx_all[is_skip][-per_pool:]
+
+        keep_local = np.concatenate([lp_idx, like_idx, skip_idx])
+        if keep_local.size == 0:
+            return self._empty_history()
+
+        order = np.argsort(keep_local, kind="stable")
+        keep_local = keep_local[order]
+
+        return self._materialize_history(
+            keep_local, item_ids, timestamps, is_lp, is_like, is_skip
+        )
+
+    def _gather_last_n_history(
+        self, flat_pos: int, user_start: int
+    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+        """Build the UIH from the last `history_length` events of ANY pool type
+        (listen+/like/skip) with no per-pool split. Vs the interleaved strategy
+        this fills the sequence to ~history_length (higher effective length) and
+        lets the like share fall to its natural corpus rate (~1.9%). Events
+        outside the 3 pools (dislike/unlike/undislike) are excluded as before."""
+        L = self._history_length
+        scan = self._read_scan_window(flat_pos, user_start)
+        if scan is None:
+            return self._empty_history()
+        item_ids, timestamps, is_lp, is_like, is_skip = scan
+
+        member = is_lp | is_like | is_skip
+        # Last L pooled events, in chronological order (already position-sorted
+        # within the scan window, so no re-sort is needed).
+        keep_local = np.arange(item_ids.shape[0], dtype=np.int64)[member][-L:]
+        if keep_local.size == 0:
+            return self._empty_history()
+
+        return self._materialize_history(
+            keep_local, item_ids, timestamps, is_lp, is_like, is_skip
+        )
+
+    def _build_sample(
+        self, flat_pos: int, max_num_candidates: int
+    ) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]:
+        uid = int(self.store.flat_uid[flat_pos])
+        user_start = int(self.store.user_start[uid])
+
+        items, artists, albums, ts, weight = self._gather_history(
+            flat_pos, user_start
+        )
+
+        target_item = int(self.store.flat_item_ids[flat_pos])
+        target_artist = int(
+            self.item_to_artist[target_item]
+            if target_item < self.item_to_artist.shape[0]
+            else 0
+        )
+        target_album = int(
+            self.item_to_album[target_item]
+            if target_item < self.item_to_album.shape[0]
+            else 0
+        )
+        target_ts = int(self.store.flat_timestamps[flat_pos])
+
+        played_ratio = float(self.store.flat_played_ratio[flat_pos])
+        is_lp = (
+            int(self.store.flat_event_types[flat_pos]) == LISTEN_TYPE
+            and played_ratio >= LISTEN_PLUS_THRESHOLD
+        )
+        # Label encoded into the candidate's action_weight via the LP bit, so
+        # _get_supervision_labels_and_weights sees the right supervision.
+        candidate_action_weight = LP_BIT if is_lp else 0
+
+        cross_id_anchor: Dict[str, int] = {
+            "uid": uid,
+            "item_id": target_item,
+            "artist_id": target_artist,
+            "album_id": target_album,
+            "hour_of_day": int((target_ts // 3600) % 24),
+            "is_organic": int(self.store.flat_is_organic[flat_pos]),
+        }
+        cross_ids: Dict[str, int] = {
+            name: xxhash_cross(cross_id_anchor, list(keys), n, salt)
+            for (name, keys, n, salt) in self._cross_specs
+        }
+
+        # ---- Truncate UIH to fit max_seq_len budget ----
+        uih_seq_len_budget = (
+            self._max_seq_len
+            - max_num_candidates
+            - len(self._contextual_feature_to_max_length or {})
+        )
+        if items.shape[0] > uih_seq_len_budget:
+            items = items[-uih_seq_len_budget:]
+            artists = artists[-uih_seq_len_budget:]
+            albums = albums[-uih_seq_len_budget:]
+            ts = ts[-uih_seq_len_budget:]
+            weight = weight[-uih_seq_len_budget:]
+        uih_seq_len = int(items.shape[0])
+        dummy_watch_time = np.zeros(uih_seq_len, dtype=np.int64)
+
+        # ---- Build UIH KJT ----
+        # Contextual features (length-1 each) iterated in the same order as
+        # `_contextual_feature_to_max_length` (matches movielens reference).
+        uih_kjt_values: List[int] = []
+        uih_kjt_lengths: List[int] = []
+        for name, length in (self._contextual_feature_to_max_length or {}).items():
+            assert length == 1, f"yambda contextuals are length-1, got {name}={length}"
+            if name == "uid":
+                uih_kjt_values.append(uid)
+            else:
+                uih_kjt_values.append(int(cross_ids[name]))
+            uih_kjt_lengths.append(1)
+
+        # Sequential features — order must match the trailing entries of
+        # hstu_uih_feature_names in configs.py:
+        #   item_id, artist_id, album_id, action_weight, action_timestamp, dummy_watch_time
+        uih_kjt_values.extend(items.tolist())
+        uih_kjt_values.extend(artists.tolist())
+        uih_kjt_values.extend(albums.tolist())
+        uih_kjt_values.extend(weight.tolist())
+        uih_kjt_values.extend(ts.tolist())
+        uih_kjt_values.extend(dummy_watch_time.tolist())
+        n_sequential = len(self._uih_keys) - len(self._contextual_feature_to_max_length or {})
+        uih_kjt_lengths.extend([uih_seq_len] * n_sequential)
+
+        dummy_query_time = int(ts[-1]) if uih_seq_len > 0 else target_ts
+        uih_kjt_values.append(dummy_query_time)
+        uih_kjt_lengths.append(1)
+
+        uih_features_kjt = KeyedJaggedTensor(
+            keys=self._uih_keys + ["dummy_query_time"],
+            lengths=torch.tensor(uih_kjt_lengths, dtype=torch.long),
+            values=torch.tensor(uih_kjt_values, dtype=torch.long),
+        )
+
+        # ---- Build candidates KJT ----
+        # Order must match configs.py:hstu_candidate_feature_names exactly:
+        #   item_candidate_id, item_candidate_artist_id, item_candidate_album_id,
+        #   item_query_time, item_action_weight, item_dummy_watchtime
+        candidates_kjt_lengths = max_num_candidates * torch.ones(
+            len(self._candidates_keys), dtype=torch.long
+        )
+        candidates_kjt_values: List[int] = (
+            [target_item] * max_num_candidates
+            + [target_artist] * max_num_candidates
+            + [target_album] * max_num_candidates
+            + [dummy_query_time] * max_num_candidates
+            + [candidate_action_weight] * max_num_candidates
+            + [1] * max_num_candidates  # item_dummy_watchtime
+        )
+        candidates_features_kjt = KeyedJaggedTensor(
+            keys=self._candidates_keys,
+            lengths=candidates_kjt_lengths,
+            values=torch.tensor(candidates_kjt_values, dtype=torch.long),
+        )
+        return uih_features_kjt, candidates_features_kjt
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/README.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/README.md
new file mode 100644
index 000000000..ef1c9686d
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/README.md
@@ -0,0 +1,88 @@
+# MLPerf Inference reference implementation for DLRMv3
+
+## Install dependencies
+
+The reference implementation has been tested on a single host, with x86_64 CPUs
+and 8 NVIDIA H100/B200 GPUs. Dependencies can be installed below,
+
+```
+cd generative_recommenders/
+pip install -e .
+```
+
+## Build loadgen
+
+```
+cd generative_recommenders/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/
+CFLAGS="-std=c++14 -O3" python -m pip install .
+```
+
+## Dataset download
+
+DLRMv3 uses a synthetic dataset specifically designed to match the model and
+system characteristics of large-scale sequential recommendation (large item set
+and long average sequence length for each request). To generate the dataset used
+for both training and inference, run
+
+```
+cd generative_recommenders/dlrm_v3/
+python streaming_synthetic_data.py
+```
+
+The generated dataset has 2TB size, and contains 5 million users interacting
+with a billion items over 100 timestamps.
+
+Only 1% of the dataset is used in the inference benchmark. The sampled DLRMv3
+dataset and trained checkpoint are available at
+https://inference.mlcommons-storage.org/.
+
+Script to download the sampled dataset used in inference benchmark:
+
+```
+bash <(curl -s https://raw.githubusercontent.com/mlcommons/r2-downloader/refs/heads/main/mlc-r2-downloader.sh) https://inference.mlcommons-storage.org/metadata/dlrm-v3-dataset.uri
+```
+
+Script to download the 1TB trained checkpoint:
+
+```
+bash <(curl -s https://raw.githubusercontent.com/mlcommons/r2-downloader/refs/heads/main/mlc-r2-downloader.sh) https://inference.mlcommons-storage.org/metadata/dlrm-v3-checkpoint.uri
+```
+
+## Inference benchmark
+
+```
+cd generative_recommenders/generative_recommenders/dlrm_v3/inference/
+WORLD_SIZE=8 python main.py --dataset sampled-streaming-100b
+```
+
+The config file is listed in `dlrm_v3/inference/gin/streaming_100b.gin`.
+`WORLD_SIZE` is the number of GPUs used in the inference benchmark.
+
+To load checkpoint from training, modify `run.model_path` inside the inference
+gin config file. (We will relase the checkpoint soon.)
+
+To achieve the best performance, tune `run.target_qps` and `run.batch_size` in
+the config file.
+
+## Accuracy test
+
+Set `run.compute_eval` will run the accuracy test and dump prediction outputs in
+`mlperf_log_accuracy.json`. To check the accuracy, run
+
+```
+python accuracy.py --path path/to/mlperf_log_accuracy.json
+```
+
+We use normalized entropy (NE), accuracy, and AUC as the metrics to evaluate the model quality. For accepted submissions, all three metrics (NE, Accuracy, AUC) must be within 99% of the reference implementation values. The accuracy for the reference implementation evaluated on 34,996 requests across 10 inference timestamps are listed below:
+
+```
+NE: 86.687%
+Accuracy: 69.651%
+AUC: 78.663%
+```
+
+## Run unit tests
+
+```
+python tests/inference_test.py
+```
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/accuracy.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/accuracy.py
new file mode 100644
index 000000000..19242f7bd
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/accuracy.py
@@ -0,0 +1,86 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+"""
+Tool to calculate accuracy for loadgen accuracy output found in mlperf_log_accuracy.json
+"""
+
+import argparse
+import json
+import logging
+
+import numpy as np
+import torch
+from generative_recommenders.dlrm_v3.configs import get_hstu_configs
+from generative_recommenders.dlrm_v3.utils import MetricsLogger
+
+logger: logging.Logger = logging.getLogger("main")
+
+
+def get_args() -> argparse.Namespace:
+    """Parse commandline."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--path",
+        required=True,
+        help="path to mlperf_log_accuracy.json",
+    )
+    args = parser.parse_args()
+    return args
+
+
+def main() -> None:
+    """
+    Main function to calculate accuracy metrics from loadgen output.
+
+    Reads the mlperf_log_accuracy.json file, parses the results, and computes
+    accuracy metrics using the MetricsLogger. Each result entry contains
+    predictions, labels, and weights packed as float32 numpy arrays.
+    """
+    args = get_args()
+    logger.warning("Parsing loadgen accuracy log...")
+    with open(args.path, "r") as f:
+        results = json.load(f)
+    hstu_config = get_hstu_configs(dataset="sampled-streaming-100b")
+    metrics = MetricsLogger(
+        multitask_configs=hstu_config.multitask_configs,
+        batch_size=1,
+        window_size=3000,
+        device=torch.device("cpu"),
+        rank=0,
+    )
+    logger.warning(f"results have {len(results)} entries")
+    for result in results:
+        data = np.frombuffer(bytes.fromhex(result["data"]), np.float32)
+        num_candidates = data[-1].astype(int)
+        assert len(data) == 1 + num_candidates * 3
+        mt_target_preds = torch.from_numpy(data[0:num_candidates])
+        mt_target_labels = torch.from_numpy(data[num_candidates : num_candidates * 2])
+        mt_target_weights = torch.from_numpy(
+            data[num_candidates * 2 : num_candidates * 3]
+        )
+        num_candidates = torch.tensor([num_candidates])
+        metrics.update(
+            predictions=mt_target_preds.view(1, -1),
+            labels=mt_target_labels.view(1, -1),
+            weights=mt_target_weights.view(1, -1),
+            num_candidates=num_candidates,
+        )
+    for k, v in metrics.compute().items():
+        logger.warning(f"{k}: {v}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/cpp/hstu_runner.cpp b/recommendation_v4/generative_recommenders/dlrm_v3/inference/cpp/hstu_runner.cpp
new file mode 100644
index 000000000..d4d0d4082
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/cpp/hstu_runner.cpp
@@ -0,0 +1,215 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+//
+// End-to-end runner for the HSTU torch.jit / torch.package artifacts produced
+// by generative_recommenders/dlrm_v3/inference/packager.py and exercised by
+// :end_to_end_test.
+//
+// CLI:
+//   hstu_runner [--aott_library <lib.so> ...] <sparse.pt> <dense.pt>
+//       <inputs.pt> <output.pt>
+//
+// Where:
+//   sparse.pt   ScriptModule whose forward(uih, candidates) returns
+//               Tuple[Dict[str,Tensor], Dict[str,Tensor],
+//                     Dict[str,Tensor], Tensor, Tensor]
+//   dense.pt    ScriptModule (cuda:0, bf16) whose forward(...) returns
+//               Tuple[Tensor, Optional[Tensor], Optional[Tensor]]
+//   inputs.pt   ScriptModule whose forward() returns
+//               Tuple[KeyedJaggedTensor, KeyedJaggedTensor]
+//   output.pt   torch::pickle_save destination for the predictions tensor;
+//               readable from Python as ``torch.load(output.pt)``.
+
+#include <dlfcn.h>
+
+#include <fstream>
+#include <functional>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <torch/csrc/jit/serialization/import.h>
+#include <torch/script.h>
+
+namespace {
+
+struct RunnerArgs {
+  std::vector<std::string> aottLibraryPaths;
+  std::string sparsePath;
+  std::string densePath;
+  std::string inputsPath;
+  std::string outputPath;
+};
+
+RunnerArgs parseArgs(int argc, char** argv) {
+  RunnerArgs args;
+  std::vector<std::string> positional;
+  for (int i = 1; i < argc; ++i) {
+    const std::string arg{argv[i]};
+    if (arg == "--aott_library") {
+      if (++i >= argc) {
+        throw std::runtime_error("--aott_library requires a path");
+      }
+      args.aottLibraryPaths.emplace_back(argv[i]);
+    } else {
+      positional.push_back(arg);
+    }
+  }
+
+  if (positional.size() != 4) {
+    throw std::runtime_error(
+        "Usage: hstu_runner [--aott_library <lib.so> ...] <sparse.pt> "
+        "<dense.pt> <inputs.pt> <output.pt>");
+  }
+  args.sparsePath = positional[0];
+  args.densePath = positional[1];
+  args.inputsPath = positional[2];
+  args.outputPath = positional[3];
+  return args;
+}
+
+void loadAottLibraries(
+    const std::vector<std::string>& libraryPaths,
+    const std::function<void(const std::string&)>& log) {
+  for (const auto& path : libraryPaths) {
+    log("[runner] loading AOT-T library " + path);
+    void* handle = dlopen(path.c_str(), RTLD_GLOBAL | RTLD_NOW);
+    if (handle == nullptr) {
+      throw std::runtime_error(
+          "failed to dlopen AOT-T library " + path + ": " + dlerror());
+    }
+  }
+}
+
+torch::jit::Module loadModule(const std::string& path) {
+  // @patternlint-disable-next-line no-torch-low-level-api
+  auto m = torch::jit::load(path);
+  m.eval();
+  return m;
+}
+
+// Walk a Dict<str, Tensor> and replace every value with .to(device) (and
+// optionally .to(bfloat16)). C++ analog of move_sparse_output_to_device.
+void moveDictToDevice(
+    c10::impl::GenericDict& d,
+    const torch::Device& device,
+    bool toBfloat16) {
+  for (auto& kv : d) {
+    auto t = kv.value().toTensor().to(device);
+    if (toBfloat16) {
+      t = t.to(torch::kBFloat16);
+    }
+    d.insert_or_assign(kv.key(), t);
+  }
+}
+
+void writePickle(const torch::Tensor& t, const std::string& path) {
+  // torch::pickle_save returns a byte buffer in the same wire format as
+  // ``torch.save(tensor, ...)``, so the Python side can read it with
+  // ``torch.load(path)``.
+  const auto data = torch::jit::pickle_save(c10::IValue(t));
+  std::ofstream out(path, std::ios::binary);
+  if (!out) {
+    throw std::runtime_error("failed to open output: " + path);
+  }
+  out.write(data.data(), static_cast<std::streamsize>(data.size()));
+}
+
+} // namespace
+
+int main(int argc, char** argv) {
+  RunnerArgs args;
+  try {
+    args = parseArgs(argc, argv);
+  } catch (const std::exception& e) {
+    std::cerr << e.what() << '\n';
+    return 1;
+  }
+
+  // Log to a file next to the output so we can inspect even if
+  // buck2 swallows stderr.
+  const std::string logPath = args.outputPath + ".log";
+  std::ofstream logFile(logPath);
+  auto log = [&](const std::string& msg) {
+    logFile << msg << std::endl;
+    logFile.flush();
+    std::cerr << msg << std::endl;
+  };
+
+  try {
+    log("[runner] step 0: loading AOT-T libraries");
+    loadAottLibraries(args.aottLibraryPaths, log);
+    log("[runner] step 0 done: loaded " +
+        std::to_string(args.aottLibraryPaths.size()) + " AOT-T libraries");
+
+    log("[runner] step 1: loading sparse module from " + args.sparsePath);
+    auto sparse = loadModule(args.sparsePath);
+
+    log("[runner] step 2: loading dense module from " + args.densePath);
+    auto dense = loadModule(args.densePath);
+
+    log("[runner] step 3: loading inputs module from " + args.inputsPath);
+    auto inputs = loadModule(args.inputsPath);
+
+    log("[runner] step 4: running inputs.forward()");
+    auto inputsTuple = inputs.forward({}).toTuple();
+    auto uihLengths = inputsTuple->elements()[0];
+    auto uihValues = inputsTuple->elements()[1];
+    auto candidatesLengths = inputsTuple->elements()[2];
+    auto candidatesValues = inputsTuple->elements()[3];
+    log("[runner] step 4 done: got 4 input tensors");
+
+    log("[runner] step 5: running sparse.forward()");
+    std::vector<c10::IValue> sparseInputs{
+        uihLengths, uihValues, candidatesLengths, candidatesValues};
+    auto sparseOut = sparse.forward(sparseInputs).toTuple();
+    log("[runner] step 5 done: sparse forward returned " +
+        std::to_string(sparseOut->elements().size()) + " elements");
+
+    log("[runner] step 6: unpacking sparse output dicts");
+    auto seqEmbValues = sparseOut->elements()[0].toGenericDict();
+    auto seqEmbLengths = sparseOut->elements()[1].toGenericDict();
+    auto payloadFeatures = sparseOut->elements()[2].toGenericDict();
+    auto uihSeqLengths = sparseOut->elements()[3].toTensor();
+    auto numCandidates = sparseOut->elements()[4].toTensor();
+    log("[runner] step 6 done: unpacked dicts");
+
+    log("[runner] step 7: moving dicts to cuda:0");
+    const auto device = torch::Device(torch::kCUDA, 0);
+    moveDictToDevice(seqEmbValues, device, /*toBfloat16=*/true);
+    log("[runner] step 7a: seqEmbValues moved");
+    moveDictToDevice(seqEmbLengths, device, /*toBfloat16=*/false);
+    log("[runner] step 7b: seqEmbLengths moved");
+    moveDictToDevice(payloadFeatures, device, /*toBfloat16=*/false);
+    log("[runner] step 7c: payloadFeatures moved");
+    uihSeqLengths = uihSeqLengths.to(device);
+    numCandidates = numCandidates.to(device);
+    log("[runner] step 7 done: all on cuda:0");
+
+    log("[runner] step 8: running dense.forward()");
+    std::vector<c10::IValue> denseInputs{
+        seqEmbValues,
+        seqEmbLengths,
+        payloadFeatures,
+        uihSeqLengths,
+        numCandidates,
+    };
+    auto denseOut = dense.forward(denseInputs);
+    log("[runner] step 8 done: dense forward returned");
+
+    auto preds = denseOut.toTensor().detach().cpu();
+    log("[runner] step 9: preds on cpu");
+
+    std::cout << "preds shape: " << preds.sizes() << '\n';
+    std::cout << "preds sum:   "
+              << preds.to(torch::kFloat32).sum().item<float>() << '\n';
+
+    writePickle(preds, args.outputPath);
+    std::cout << "wrote " << args.outputPath << '\n';
+    log("[runner] step 10: done, wrote output");
+    return 0;
+  } catch (const std::exception& e) {
+    log(std::string("hstu_runner FAILED: ") + e.what());
+    return 1;
+  }
+}
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/data_producer.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/data_producer.py
new file mode 100644
index 000000000..6a8db77c8
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/data_producer.py
@@ -0,0 +1,227 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+"""
+Data producer module for DLRMv3 inference.
+
+This module provides classes for producing and managing query data during inference,
+supporting both single-threaded and multi-threaded data production modes.
+"""
+
+import logging
+import threading
+import time
+from queue import Queue
+from typing import List, Optional, Tuple, Union
+
+import torch
+from generative_recommenders.dlrm_v3.datasets.dataset import Dataset, Samples
+
+logging.basicConfig(level=logging.INFO)
+logger: logging.Logger = logging.getLogger("data_producer")
+
+
+class QueryItem:
+    """
+    Container for a query item to be processed by the inference thread pool.
+
+    Attributes:
+        query_ids: List of unique identifiers for the queries in this batch.
+        samples: The sample data containing features for the queries.
+        start: Time when the query was first received.
+        dt_queue: Time spent in the queue before processing.
+        dt_batching: Time spent on batching the data.
+    """
+
+    def __init__(
+        self,
+        query_ids: List[int],
+        samples: Samples,
+        start: float,
+        dt_queue: float,
+        dt_batching: float,
+    ) -> None:
+        self.query_ids = query_ids
+        self.samples = samples
+        self.start: float = start
+        self.dt_queue: float = dt_queue
+        self.dt_batching: float = dt_batching
+
+
+class SingleThreadDataProducer:
+    """
+    Single-threaded data producer for synchronous query processing.
+
+    This producer processes queries on the main thread without any parallelism,
+    suitable for debugging or low-throughput scenarios.
+
+    Args:
+        ds: The dataset to fetch samples from.
+        run_one_item: Callback function to process a single QueryItem.
+    """
+
+    def __init__(self, ds: Dataset, run_one_item) -> None:  # pyre-ignore [2]
+        self.ds = ds
+        self.run_one_item = run_one_item  # pyre-ignore [4]
+
+    def enqueue(
+        self, query_ids: List[int], content_ids: List[int], t0: float, dt_queue: float
+    ) -> None:
+        """
+        Enqueue queries for immediate synchronous processing.
+
+        Args:
+            query_ids: List of unique query identifiers.
+            content_ids: List of content/sample identifiers to fetch.
+            t0: Timestamp when the query batch was created.
+            dt_queue: Time spent waiting in the queue.
+        """
+        with torch.profiler.record_function("data batching"):
+            t0_batching: float = time.time()
+            samples: Union[Samples, List[Samples]] = self.ds.get_samples(content_ids)
+            dt_batching: float = time.time() - t0_batching
+            if isinstance(samples, Samples):
+                query = QueryItem(
+                    query_ids=query_ids,
+                    samples=samples,
+                    start=t0,
+                    dt_queue=dt_queue,
+                    dt_batching=dt_batching,
+                )
+                self.run_one_item(query)
+            else:
+                start_idx = 0
+                for sample in samples:
+                    batch_size: int = sample.batch_size()
+                    query = QueryItem(
+                        query_ids=query_ids[start_idx : start_idx + batch_size],
+                        samples=sample,
+                        start=t0,
+                        dt_queue=dt_queue,
+                        dt_batching=dt_batching,
+                    )
+                    start_idx += batch_size
+                    self.run_one_item(query)
+
+    def finish(self) -> None:
+        """Finalize the producer. No-op for single-threaded mode."""
+        pass
+
+
+class MultiThreadDataProducer:
+    """
+    Multi-threaded data producer for parallel query processing.
+
+    Uses a thread pool to fetch and batch data in parallel with model inference,
+    improving throughput for high-load scenarios.
+
+    Args:
+        ds: The dataset to fetch samples from.
+        threads: Number of worker threads to use.
+        run_one_item: Callback function to process a single QueryItem.
+    """
+
+    def __init__(
+        self,
+        ds: Dataset,
+        threads: int,
+        run_one_item,  # pyre-ignore [2]
+    ) -> None:
+        queue_size_multiplier = 4
+        self.ds = ds
+        self.threads = threads
+        self.run_one_item = run_one_item  # pyre-ignore [4]
+        self.tasks: Queue[Optional[Tuple[List[int], List[int], float, float]]] = Queue(
+            maxsize=threads * queue_size_multiplier
+        )
+        self.workers: List[threading.Thread] = []
+        for _ in range(self.threads):
+            worker = threading.Thread(target=self.handle_tasks, args=(self.tasks,))
+            worker.daemon = True
+            self.workers.append(worker)
+            worker.start()
+
+    def handle_tasks(
+        self, tasks_queue: Queue[Optional[Tuple[List[int], List[int], float, float]]]
+    ) -> None:
+        """
+        Worker thread main loop to process tasks from the queue.
+
+        Each worker maintains its own CUDA stream for parallel execution.
+
+        Args:
+            tasks_queue: Queue containing task tuples or None for termination.
+        """
+        stream = torch.cuda.Stream()
+        while True:
+            query_and_content_ids = tasks_queue.get()
+            if query_and_content_ids is None:
+                tasks_queue.task_done()
+                break
+            query_ids, content_ids, t0, dt_queue = query_and_content_ids
+            t0_batching: float = time.time()
+            samples: Union[Samples, List[Samples]] = self.ds.get_samples(content_ids)
+            dt_batching: float = time.time() - t0_batching
+            if isinstance(samples, Samples):
+                qitem = QueryItem(
+                    query_ids=query_ids,
+                    samples=samples,
+                    start=t0,
+                    dt_queue=dt_queue,
+                    dt_batching=dt_batching,
+                )
+                with torch.inference_mode(), torch.cuda.stream(stream):
+                    self.run_one_item(qitem)
+            else:
+                start_idx = 0
+                for sample in samples:
+                    batch_size: int = sample.batch_size()
+                    qitem = QueryItem(
+                        query_ids=query_ids[start_idx : start_idx + batch_size],
+                        samples=sample,
+                        start=t0,
+                        dt_queue=dt_queue,
+                        dt_batching=dt_batching,
+                    )
+                    start_idx += batch_size
+                    with torch.inference_mode(), torch.cuda.stream(stream):
+                        self.run_one_item(qitem)
+            tasks_queue.task_done()
+
+    def enqueue(
+        self, query_ids: List[int], content_ids: List[int], t0: float, dt_queue: float
+    ) -> None:
+        """
+        Enqueue queries for asynchronous processing by worker threads.
+
+        Args:
+            query_ids: List of unique query identifiers.
+            content_ids: List of content/sample identifiers to fetch.
+            t0: Timestamp when the query batch was created.
+            dt_queue: Time spent waiting in the queue.
+        """
+        with torch.profiler.record_function("data batching"):
+            self.tasks.put((query_ids, content_ids, t0, dt_queue))
+
+    def finish(self) -> None:
+        """
+        Signal all worker threads to terminate and wait for completion.
+
+        Sends None to each worker to trigger graceful shutdown.
+        """
+        for _ in self.workers:
+            self.tasks.put(None)
+        for worker in self.workers:
+            worker.join()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/dense_predict_module.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/dense_predict_module.py
new file mode 100644
index 000000000..add2781bc
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/dense_predict_module.py
@@ -0,0 +1,96 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+
+"""
+TorchScript-friendly wrapper for the HSTU dense path (GPU transformer).
+
+``HSTUDenseScriptModule`` accepts the *flattened* sparse-output dicts produced
+by :class:`HSTUSparseScriptModule`, reconstructs ``Dict[str,
+SequenceEmbedding]`` for the existing :meth:`DlrmHSTU.main_forward` and
+returns a 3-tuple of ``(preds, labels, weights)`` -- the only fields the
+predictor actually consumes.
+"""
+
+from typing import Dict
+
+import torch
+from generative_recommenders.dlrm_v3.inference.inference_modules import get_hstu_model
+from generative_recommenders.dlrm_v3.inference.ts_types import (
+    SeqEmbLengths,
+    SeqEmbValues,
+    unflatten_seq_embeddings,
+)
+from generative_recommenders.modules.dlrm_hstu import DlrmHSTU, DlrmHSTUConfig
+from torchrec.modules.embedding_configs import EmbeddingConfig
+
+
+class HSTUDenseScriptModule(torch.nn.Module):
+    """Script-friendly dense module.
+
+    The wrapper owns a dense-only :class:`DlrmHSTU` (no
+    ``_embedding_collection``) and delegates to ``main_forward`` after
+    reconstructing the ``SequenceEmbedding`` NamedTuple form.
+    """
+
+    def __init__(
+        self,
+        hstu_config: DlrmHSTUConfig,
+        table_config: Dict[str, EmbeddingConfig],
+    ) -> None:
+        super().__init__()
+        self._hstu_model: DlrmHSTU = get_hstu_model(
+            table_config=table_config,
+            hstu_config=hstu_config,
+            table_device="cpu",
+            is_dense=True,
+        )
+
+    def forward(
+        self,
+        seq_emb_values: SeqEmbValues,
+        seq_emb_lengths: SeqEmbLengths,
+        payload_features: Dict[str, torch.Tensor],
+        uih_seq_lengths: torch.Tensor,
+        num_candidates: torch.Tensor,
+    ) -> torch.Tensor:
+        # TorchScript supports ``int(tensor.item())`` on a 0-d tensor.
+        max_uih_len: int = int(uih_seq_lengths.max().item())
+        max_num_candidates: int = int(num_candidates.max().item())
+
+        seq_embeddings = unflatten_seq_embeddings(seq_emb_values, seq_emb_lengths)
+
+        (
+            _,
+            _,
+            _,
+            mt_target_preds,
+            _mt_target_labels,
+            _mt_target_weights,
+        ) = self._hstu_model.main_forward(
+            seq_embeddings=seq_embeddings,
+            payload_features=payload_features,
+            max_uih_len=max_uih_len,
+            uih_seq_lengths=uih_seq_lengths,
+            max_num_candidates=max_num_candidates,
+            num_candidates=num_candidates,
+        )
+        assert mt_target_preds is not None
+        # Return just the predictions tensor; labels/weights are unused by
+        # the predictor at inference time and would force ``Optional[Tensor]``
+        # in the return type, which torch.jit.trace rejects ("Only tensors,
+        # lists, tuples of tensors, or dictionary of tensors can be output
+        # from traced functions").
+        return mt_target_preds
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/end_to_end_test.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/end_to_end_test.py
new file mode 100644
index 000000000..f1b956d9c
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/end_to_end_test.py
@@ -0,0 +1,795 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+
+"""
+End-to-end smoke test for the HSTU TorchScript + C++ deployment pipeline.
+
+What this binary does, in order:
+
+1. Build a synthetic batch (uih_kjt, candidates_kjt) via :func:`get_random_data`.
+2. Build the eager :class:`HSTUSparseScriptModule` and
+   :class:`HSTUDenseScriptModule`.
+3. Run them eagerly to obtain the reference ``preds_eager``.
+4. ``torch.jit.script`` + save:
+       - ``sparse.pt`` (CPU)
+       - ``dense.pt``  (cuda:0, bf16)
+       - ``inputs.pt`` (an :class:`InputsBundle` ScriptModule whose
+         ``forward()`` returns ``Tuple[KeyedJaggedTensor, KeyedJaggedTensor]``)
+5. Run the C++ runner
+       ``hstu_runner [--aott_library <lib.so> ...] <sparse.pt> <dense.pt> <inputs.pt> <preds_cpp.pt>``.
+6. ``torch.load`` the runner's output and compare against ``preds_eager``
+   with :func:`torch.testing.assert_close` (loose tolerance because the
+   scripted path may use either the PyTorch fallback trace or AOT-T-loaded
+   Triton inference kernels).
+
+Usage (manual override of the runner path):
+
+    buck2 run @mode/opt //generative_recommenders/dlrm_v3/inference:end_to_end_test \\
+        -- --cpp_runner /path/to/hstu_runner
+
+By default the binary locates the runner via ``libfb.py.parutil`` -- it ships
+inside the par as a resource (see BUCK).
+"""
+
+import argparse
+import logging
+import os
+import shutil
+import sys
+import tempfile
+from typing import Any, Dict, List, Tuple
+
+import torch
+from generative_recommenders.dlrm_v3.configs import (
+    get_embedding_table_config,
+    get_hstu_configs,
+)
+from generative_recommenders.dlrm_v3.datasets.dataset import get_random_data
+from generative_recommenders.dlrm_v3.inference.dense_predict_module import (
+    HSTUDenseScriptModule,
+)
+from generative_recommenders.dlrm_v3.inference.sparse_predict_module import (
+    HSTUSparseScriptModule,
+)
+from generative_recommenders.dlrm_v3.inference.ts_types import (
+    SeqEmbLengths,
+    SeqEmbValues,
+    unflatten_seq_embeddings,
+)
+from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig
+from security.frameworks.python.exec.subprocess import TrustedSubprocessWithList
+from torchrec.modules.embedding_configs import EmbeddingConfig
+from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
+
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+_DEFAULT_DATASET = "kuairand-1k"
+
+
+class InputsBundle(torch.nn.Module):
+    """Scripted holder for the test inputs.
+
+    Returns the constituent tensors of the two KJTs as a 4-tuple
+    ``(uih_lengths, uih_values, candidates_lengths, candidates_values)`` so
+    the traced sparse module can rebuild the KJTs inside its forward (KJT
+    instances themselves are not traceable inputs).
+    """
+
+    def __init__(
+        self,
+        uih_kjt: KeyedJaggedTensor,
+        candidates_kjt: KeyedJaggedTensor,
+    ) -> None:
+        super().__init__()
+        self.register_buffer("uih_lengths", uih_kjt.lengths())
+        self.register_buffer("uih_values", uih_kjt.values())
+        self.register_buffer("candidates_lengths", candidates_kjt.lengths())
+        self.register_buffer("candidates_values", candidates_kjt.values())
+
+    def forward(
+        self,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        return (
+            self.uih_lengths,
+            self.uih_values,
+            self.candidates_lengths,
+            self.candidates_values,
+        )
+
+
+class _SparseTraceShim(torch.nn.Module):
+    """Adapter that takes raw tensors and rebuilds the KJTs inside forward.
+
+    ``torch.jit.trace`` does not accept ``KeyedJaggedTensor`` (or any
+    non-Tensor / non-collection-of-Tensor type) as a top-level forward
+    input, so we make the traced boundary tensor-only and bake the
+    ``List[str]`` of feature keys in as Python constants captured by the
+    closure / module attribute.
+    """
+
+    def __init__(
+        self,
+        sparse_module: HSTUSparseScriptModule,
+        uih_keys: List[str],
+        candidates_keys: List[str],
+    ) -> None:
+        super().__init__()
+        self._sparse_module: HSTUSparseScriptModule = sparse_module
+        self._uih_keys: List[str] = uih_keys
+        self._candidates_keys: List[str] = candidates_keys
+
+    def forward(
+        self,
+        uih_lengths: torch.Tensor,
+        uih_values: torch.Tensor,
+        candidates_lengths: torch.Tensor,
+        candidates_values: torch.Tensor,
+    ) -> Tuple[
+        SeqEmbValues,
+        SeqEmbLengths,
+        Dict[str, torch.Tensor],
+        torch.Tensor,
+        torch.Tensor,
+    ]:
+        uih_kjt = KeyedJaggedTensor(
+            keys=self._uih_keys,
+            lengths=uih_lengths,
+            values=uih_values,
+        )
+        candidates_kjt = KeyedJaggedTensor(
+            keys=self._candidates_keys,
+            lengths=candidates_lengths,
+            values=candidates_values,
+        )
+        return self._sparse_module(
+            uih_features=uih_kjt, candidates_features=candidates_kjt
+        )
+
+
+class _DenseAottTraceShim(torch.nn.Module):
+    """FX-traceable dense adapter for the representative AOT-T shape."""
+
+    def __init__(
+        self,
+        dense_module: HSTUDenseScriptModule,
+        max_uih_len: int,
+        max_num_candidates: int,
+        total_uih_len: int,
+        total_targets: int,
+    ) -> None:
+        super().__init__()
+        self._dense_module: HSTUDenseScriptModule = dense_module
+        self._max_uih_len: int = max_uih_len
+        self._max_num_candidates: int = max_num_candidates
+        self._total_uih_len: int = total_uih_len
+        self._total_targets: int = total_targets
+
+    def forward(
+        self,
+        seq_emb_values: SeqEmbValues,
+        seq_emb_lengths: SeqEmbLengths,
+        payload_features: Dict[str, torch.Tensor],
+        uih_seq_lengths: torch.Tensor,
+        num_candidates: torch.Tensor,
+    ) -> torch.Tensor:
+        seq_embeddings = unflatten_seq_embeddings(seq_emb_values, seq_emb_lengths)
+
+        (
+            _,
+            _,
+            _,
+            mt_target_preds,
+            _mt_target_labels,
+            _mt_target_weights,
+        ) = self._dense_module._hstu_model.main_forward(
+            seq_embeddings=seq_embeddings,
+            payload_features=payload_features,
+            max_uih_len=self._max_uih_len,
+            uih_seq_lengths=uih_seq_lengths,
+            max_num_candidates=self._max_num_candidates,
+            num_candidates=num_candidates,
+            total_uih_len=self._total_uih_len,
+            total_targets=self._total_targets,
+        )
+        assert mt_target_preds is not None
+        return mt_target_preds
+
+
+def _dense_aott_concrete_args(
+    dense_inputs: Tuple[
+        Dict[str, torch.Tensor],
+        Dict[str, torch.Tensor],
+        Dict[str, torch.Tensor],
+        torch.Tensor,
+        torch.Tensor,
+    ],
+) -> Dict[str, Any]:
+    from torch.fx._symbolic_trace import PH
+
+    seq_emb_values, seq_emb_lengths, payload_features, _, _ = dense_inputs
+    return {
+        "seq_emb_values": {key: PH for key in seq_emb_values},
+        "seq_emb_lengths": {key: PH for key in seq_emb_lengths},
+        "payload_features": {key: PH for key in payload_features},
+    }
+
+
+def _find_cpp_runner() -> str:
+    """Locate the bundled hstu_runner binary.
+
+    Tries ``importlib.resources`` (the canonical fbcode resource resolver,
+    works whether the binary is in a par or unpacked), and falls back to
+    looking next to ``sys.argv[0]``.
+    """
+    try:
+        from importlib.resources import files
+
+        path = files("generative_recommenders.dlrm_v3.inference.cpp").joinpath(
+            "hstu_runner"
+        )
+        if path.is_file():
+            return str(path)
+    except Exception as exc:
+        logger.debug("importlib.resources lookup failed: %s", exc)
+
+    candidate = os.path.join(
+        os.path.dirname(os.path.abspath(sys.argv[0])), "hstu_runner"
+    )
+    if os.path.exists(candidate):
+        return candidate
+
+    raise RuntimeError(
+        "Could not find hstu_runner binary. "
+        "Pass --cpp_runner=<path> or build the cpp_binary target first."
+    )
+
+
+def _eager_run(
+    sparse_module: HSTUSparseScriptModule,
+    dense_module: HSTUDenseScriptModule,
+    uih_kjt: KeyedJaggedTensor,
+    candidates_kjt: KeyedJaggedTensor,
+    device: torch.device,
+) -> torch.Tensor:
+    """Reference path: sparse → device-move + bf16 → dense, all in Python."""
+    with torch.no_grad():
+        seq_emb_values, seq_emb_lengths, payload, uih_lens, num_cands = sparse_module(
+            uih_features=uih_kjt, candidates_features=candidates_kjt
+        )
+        seq_emb_values = {
+            k: v.to(device).to(torch.bfloat16) for k, v in seq_emb_values.items()
+        }
+        seq_emb_lengths = {k: v.to(device) for k, v in seq_emb_lengths.items()}
+        payload = {k: v.to(device) for k, v in payload.items()}
+        uih_lens = uih_lens.to(device)
+        num_cands = num_cands.to(device)
+        preds = dense_module(
+            seq_emb_values, seq_emb_lengths, payload, uih_lens, num_cands
+        )
+    return preds.detach().to(torch.float32).cpu()
+
+
+def _find_aott_libraries() -> List[str]:
+    from generative_recommenders.ops.triton_aot.compile.compile_state import (
+        get_aott_compile_path,
+    )
+
+    compile_path = get_aott_compile_path()
+    libraries: List[str] = []
+    for root, _, files in os.walk(compile_path):
+        for filename in files:
+            if filename.endswith(".so"):
+                libraries.append(os.path.join(root, filename))
+    return sorted(libraries)
+
+
+def _copy_aott_libraries_to_workdir(
+    library_paths: List[str], workdir: str
+) -> List[str]:
+    copied: List[str] = []
+    for index, path in enumerate(library_paths):
+        dst = os.path.join(workdir, f"aott_{index}_{os.path.basename(path)}")
+        shutil.copy2(path, dst)
+        copied.append(dst)
+    return copied
+
+
+def _load_aott_libraries_for_python(library_paths: List[str]) -> None:
+    for library_path in library_paths:
+        logger.info("Python roundtrip: loading AOT-T library %s", library_path)
+        torch.ops.load_library(library_path)
+
+
+def _save_aott_dense_module(
+    dense_module: HSTUDenseScriptModule,
+    dense_inputs: Tuple[
+        Dict[str, torch.Tensor],
+        Dict[str, torch.Tensor],
+        Dict[str, torch.Tensor],
+        torch.Tensor,
+        torch.Tensor,
+    ],
+    dense_path: str,
+    workdir: str,
+    atol: float,
+    rtol: float,
+) -> List[str]:
+    """Lower the dense module with AOT-T and save a TorchScript artifact.
+
+    This follows the AOT-T example flow:
+
+    1. FX trace the module.
+    2. Unwrap outer `aot_triton_kernel_wrapper_*` nodes.
+    3. Run representative CUDA inputs under `TritonAOTCompile`.
+    4. `transform_kernels` to replace wrappers with `torch.ops.triton_aot.*`.
+    5. Script and save the transformed dense module.
+
+    The full HSTU dense wrapper has historically needed tracing rather than FX,
+    so failures here are reported with context and the default path remains the
+    D102 traced TorchScript fallback.
+    """
+    from generative_recommenders.ops.triton_aot.compile.triton_aot_compile import (
+        TritonAOTCompile,
+    )
+    from generative_recommenders.ops.triton_aot.preprocess import (
+        unwrap_aott_wrapper_nodes,
+    )
+    from generative_recommenders.ops.triton_aot.transform.transform_kernels import (
+        transform_kernels,
+    )
+    from tgif.fx.tgif_tracer import TGIFTracer
+
+    max_uih_len = int(dense_inputs[3].max().item())
+    max_num_candidates = int(dense_inputs[4].max().item())
+    total_uih_len = int(dense_inputs[3].sum().item())
+    total_targets = int(dense_inputs[4].sum().item())
+    trace_shim = _DenseAottTraceShim(
+        dense_module=dense_module,
+        max_uih_len=max_uih_len,
+        max_num_candidates=max_num_candidates,
+        total_uih_len=total_uih_len,
+        total_targets=total_targets,
+    ).eval()
+
+    logger.info(
+        "AOT-T dense: FX tracing representative shape "
+        "(max_uih_len=%d, max_num_candidates=%d, "
+        "total_uih_len=%d, total_targets=%d)...",
+        max_uih_len,
+        max_num_candidates,
+        total_uih_len,
+        total_targets,
+    )
+    try:
+        fx_dense = TGIFTracer().symbolic_trace(
+            trace_shim,
+            concrete_args=_dense_aott_concrete_args(dense_inputs),
+        )
+        lowered_dense = unwrap_aott_wrapper_nodes(fx_dense, TGIFTracer())
+    except Exception as exc:
+        raise RuntimeError(
+            "AOT-T dense lowering requires an FX-traceable dense entry point. "
+            "Use --dense_backend=torchscript to fall back to the D102 traced "
+            "TorchScript path."
+        ) from exc
+
+    logger.info("AOT-T dense: compiling Triton kernels from sample inputs...")
+    with torch.no_grad():
+        with TritonAOTCompile():
+            ref_output = lowered_dense(*dense_inputs)
+
+    original_code = lowered_dense.code
+    lowered_dense = transform_kernels(lowered_dense)
+    if lowered_dense.code == original_code:
+        logger.warning(
+            "AOT-T dense: transform_kernels did not change the FX graph. "
+            "This usually means no aot_triton_kernel_wrapper_* nodes were "
+            "present in the dense path."
+        )
+
+    libraries = _find_aott_libraries()
+    if not libraries:
+        raise RuntimeError(
+            "AOT-T dense lowering produced no .so files. Ensure the dense path "
+            "uses HammerKernel.TRITON_INFERENCE branches backed by triton_aot ops."
+        )
+
+    with torch.no_grad():
+        lowered_output = lowered_dense(*dense_inputs)
+    torch.testing.assert_close(ref_output, lowered_output, atol=atol, rtol=rtol)
+
+    logger.info("AOT-T dense: tracing transformed module...")
+    torch.jit.trace(
+        lowered_dense,
+        example_inputs=dense_inputs,
+        strict=False,
+        check_trace=False,
+    ).save(dense_path)
+    copied_libraries = _copy_aott_libraries_to_workdir(libraries, workdir)
+    logger.info("AOT-T dense: copied %d library file(s)", len(copied_libraries))
+    return copied_libraries
+
+
+def _build_synthetic_inputs(
+    hstu_config: DlrmHSTUConfig,
+    table_config: Dict[str, EmbeddingConfig],
+    uih_max_seq_len: int,
+) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]:
+    contextual: List[str] = list(hstu_config.contextual_feature_to_max_length.keys())
+    # The kuairand-1k dataset has tiny embedding tables for some contextual
+    # features (e.g. user_active_degree has num_embeddings=8). Clamp the
+    # random value range so every index stays in range for every table.
+    min_rows = min(t.num_embeddings for t in table_config.values())
+    value_bound = max(2, min_rows)
+    logger.info(
+        "synthetic value_bound=%d (min table rows=%d across %d tables)",
+        value_bound,
+        min_rows,
+        len(table_config),
+    )
+    return get_random_data(
+        contexual_features=contextual,
+        hstu_uih_keys=hstu_config.hstu_uih_feature_names,
+        hstu_candidates_keys=hstu_config.hstu_candidate_feature_names,
+        uih_max_seq_len=uih_max_seq_len,
+        max_num_candidates=hstu_config.max_num_candidates_inference,
+        value_bound=value_bound,
+    )
+
+
+def _parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--cpp_runner",
+        type=str,
+        default=None,
+        help="Path to the hstu_runner binary; default: bundled resource.",
+    )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default=_DEFAULT_DATASET,
+        help="Dataset key for HSTU/embedding configs.",
+    )
+    parser.add_argument(
+        "--device", type=str, default="cuda:0", help="Dense-module device."
+    )
+    parser.add_argument(
+        "--uih_max_seq_len",
+        type=int,
+        default=128,
+        help="Max UIH length for the synthetic batch.",
+    )
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--atol", type=float, default=1e-2)
+    parser.add_argument("--rtol", type=float, default=1e-2)
+    parser.add_argument(
+        "--dense_backend",
+        choices=("torchscript", "aott"),
+        default="torchscript",
+        help="Dense artifact backend. aott lowers TRITON_INFERENCE wrappers and passes compiled libraries to the C++ runner.",
+    )
+    parser.add_argument(
+        "--aott_library",
+        action="append",
+        default=[],
+        help="Additional prebuilt AOT-T shared library to dlopen before loading dense.pt. May be repeated.",
+    )
+    parser.add_argument(
+        "--keep_workdir",
+        action="store_true",
+        help="Do not delete the temp dir holding the saved artifacts.",
+    )
+    return parser.parse_args()
+
+
+def main() -> None:  # noqa: C901
+    logging.basicConfig(level=logging.INFO, format="[e2e] %(message)s", force=True)
+    logger.setLevel(logging.DEBUG)
+    args = _parse_args()
+
+    if not torch.cuda.is_available():
+        logger.error("CUDA is required; aborting.")
+        sys.exit(2)
+
+    runner_path = args.cpp_runner or _find_cpp_runner()
+    logger.info("Using C++ runner: %s", runner_path)
+
+    torch.manual_seed(args.seed)
+    device = torch.device(args.device)
+    torch.cuda.set_device(device)
+
+    hstu_config = get_hstu_configs(args.dataset)
+    table_config = get_embedding_table_config(args.dataset)
+
+    uih_kjt, candidates_kjt = _build_synthetic_inputs(
+        hstu_config, table_config, args.uih_max_seq_len
+    )
+
+    sparse_module = HSTUSparseScriptModule(
+        table_config=table_config,
+        hstu_config=hstu_config,
+        use_no_copy_embedding_collection=True,
+    ).eval()
+    dense_module = (
+        HSTUDenseScriptModule(hstu_config=hstu_config, table_config=table_config)
+        .to(torch.bfloat16)
+        .to(device)
+        .eval()
+    )
+
+    from generative_recommenders.common import HammerKernel
+
+    dense_kernel = (
+        HammerKernel.TRITON_INFERENCE
+        if args.dense_backend == "aott"
+        else HammerKernel.PYTORCH
+    )
+    sparse_module._sparse._hstu_model.set_hammer_kernel(HammerKernel.PYTORCH)
+    dense_module._hstu_model.set_hammer_kernel(dense_kernel)
+
+    # Diagnostic: walk every HammerModule submodule and print its effective
+    # kernel selection, so any submodule that didn't pick up the override
+    # surfaces immediately. Triton/Triton-CC selections will fail at trace
+    # time, so this print is critical for triaging the next iteration if
+    # tracing fails.
+    from generative_recommenders.common import HammerModule as _HM
+
+    for name, m in list(sparse_module.named_modules()) + list(
+        dense_module.named_modules()
+    ):
+        if isinstance(m, _HM):
+            logger.info(
+                "kernel-pin %-60s -> %s (is_inference=%s, use_triton_cc=%s)",
+                name or "<root>",
+                m.hammer_kernel().value,
+                m._is_inference,
+                m._use_triton_cc,
+            )
+
+    # === 1. Eager reference ===
+    logger.info("Running eager reference...")
+    preds_eager = _eager_run(
+        sparse_module, dense_module, uih_kjt, candidates_kjt, device
+    )
+    logger.info(
+        "preds_eager shape=%s sum=%.6f",
+        tuple(preds_eager.shape),
+        preds_eager.sum().item(),
+    )
+
+    # === 2. Trace/lower + save ===
+    # The default path keeps D102's trace-based TorchScript artifact. The
+    # AOT-T path follows ModelStore's compile/transform flow and saves a
+    # scripted FX module whose Triton kernels dispatch through torch.ops.
+    workdir = tempfile.mkdtemp(prefix="hstu_e2e_")
+    sparse_path = os.path.join(workdir, "sparse.pt")
+    dense_path = os.path.join(workdir, "dense.pt")
+    inputs_path = os.path.join(workdir, "inputs.pt")
+    cpp_out_path = os.path.join(workdir, "preds_cpp.pt")
+    eager_out_path = os.path.join(workdir, "preds_eager.pt")
+    aott_library_paths: List[str] = list(args.aott_library)
+    python_roundtrip_aott_library_paths: List[str] = list(args.aott_library)
+    logger.info("workdir: %s", workdir)
+
+    # Re-run sparse eagerly to capture an example output that can drive the
+    # dense trace.
+    with torch.no_grad():
+        sparse_out = sparse_module(
+            uih_features=uih_kjt, candidates_features=candidates_kjt
+        )
+        seq_emb_values = {
+            k: v.to(device).to(torch.bfloat16) for k, v in sparse_out[0].items()
+        }
+        seq_emb_lengths = {k: v.to(device) for k, v in sparse_out[1].items()}
+        payload = {k: v.to(device) for k, v in sparse_out[2].items()}
+        uih_lens = sparse_out[3].to(device)
+        num_cands = sparse_out[4].to(device)
+
+    logger.info("Tracing sparse module via raw-tensor shim (CPU)...")
+    sparse_shim = _SparseTraceShim(
+        sparse_module=sparse_module,
+        uih_keys=list(uih_kjt.keys()),
+        candidates_keys=list(candidates_kjt.keys()),
+    )
+    traced_sparse = torch.jit.trace(
+        sparse_shim,
+        example_inputs=(
+            uih_kjt.lengths(),
+            uih_kjt.values(),
+            candidates_kjt.lengths(),
+            candidates_kjt.values(),
+        ),
+        strict=False,
+        check_trace=False,
+    )
+    traced_sparse.save(sparse_path)
+
+    dense_inputs = (
+        seq_emb_values,
+        seq_emb_lengths,
+        payload,
+        uih_lens,
+        num_cands,
+    )
+    if args.dense_backend == "aott":
+        logger.info("Lowering dense module with AOT-T...")
+        generated_aott_library_paths = _save_aott_dense_module(
+            dense_module,
+            dense_inputs,
+            dense_path,
+            workdir,
+            args.atol,
+            args.rtol,
+        )
+        aott_library_paths.extend(generated_aott_library_paths)
+    else:
+        logger.info("Tracing dense module (cuda:0, bf16)...")
+        traced_dense = torch.jit.trace(
+            dense_module,
+            example_inputs=dense_inputs,
+            strict=False,
+            check_trace=False,
+        )
+        traced_dense.save(dense_path)
+
+    logger.info("Scripting + saving inputs bundle...")
+    torch.jit.script(InputsBundle(uih_kjt, candidates_kjt)).save(inputs_path)
+    torch.save(preds_eager, eager_out_path)
+
+    # === 2.5. Python-side roundtrip verification ===
+    # Load the saved traced artifacts back in Python and verify they produce
+    # the same results as the eager run. This proves the artifacts are correct
+    # independently of the C++ runner.
+    logger.info("Python roundtrip: loading traced artifacts back...")
+    if python_roundtrip_aott_library_paths:
+        _load_aott_libraries_for_python(python_roundtrip_aott_library_paths)
+    rt_inputs = torch.jit.load(inputs_path)
+    rt_sparse = torch.jit.load(sparse_path)
+    rt_dense = torch.jit.load(dense_path)
+
+    with torch.no_grad():
+        rt_uih_l, rt_uih_v, rt_cand_l, rt_cand_v = rt_inputs()
+        logger.info(
+            "  rt inputs: uih_l=%s uih_v=%s cand_l=%s cand_v=%s",
+            rt_uih_l.shape,
+            rt_uih_v.shape,
+            rt_cand_l.shape,
+            rt_cand_v.shape,
+        )
+
+        rt_sparse_out = rt_sparse(rt_uih_l, rt_uih_v, rt_cand_l, rt_cand_v)
+
+        for i, elem in enumerate(rt_sparse_out):
+            if isinstance(elem, dict):
+                for k, v in elem.items():
+                    has_nan = torch.isnan(v).any().item()
+                    has_inf = torch.isinf(v).any().item()
+                    logger.info(
+                        "  sparse_out[%d][%s] shape=%s dtype=%s nan=%s inf=%s",
+                        i,
+                        k,
+                        tuple(v.shape),
+                        v.dtype,
+                        has_nan,
+                        has_inf,
+                    )
+            elif isinstance(elem, torch.Tensor):
+                logger.info(
+                    "  sparse_out[%d] shape=%s dtype=%s nan=%s inf=%s",
+                    i,
+                    tuple(elem.shape),
+                    elem.dtype,
+                    torch.isnan(elem).any().item(),
+                    torch.isinf(elem).any().item(),
+                )
+
+        rt_sev = {
+            k: v.to(device).to(torch.bfloat16) for k, v in rt_sparse_out[0].items()
+        }
+        rt_sel = {k: v.to(device) for k, v in rt_sparse_out[1].items()}
+        rt_pay = {k: v.to(device) for k, v in rt_sparse_out[2].items()}
+        rt_uih = rt_sparse_out[3].to(device)
+        rt_nc = rt_sparse_out[4].to(device)
+
+        preds_rt = rt_dense(rt_sev, rt_sel, rt_pay, rt_uih, rt_nc)
+
+    preds_rt_cpu = preds_rt.detach().to(torch.float32).cpu()
+    logger.info(
+        "preds_roundtrip shape=%s sum=%.6f nan=%s inf=%s",
+        tuple(preds_rt_cpu.shape),
+        preds_rt_cpu.sum().item(),
+        torch.isnan(preds_rt_cpu).any().item(),
+        torch.isinf(preds_rt_cpu).any().item(),
+    )
+
+    try:
+        torch.testing.assert_close(
+            preds_eager, preds_rt_cpu, atol=args.atol, rtol=args.rtol
+        )
+    except AssertionError as e:
+        logger.error("PYTHON ROUNDTRIP PARITY FAILED: %s", e)
+        if not args.keep_workdir:
+            logger.info("(workdir kept for inspection: %s)", workdir)
+        sys.exit(1)
+    logger.info("PYTHON ROUNDTRIP PASSED (atol=%g rtol=%g)", args.atol, args.rtol)
+
+    # === 3. Invoke C++ runner ===
+    runner_args: List[str] = []
+    for library_path in aott_library_paths:
+        runner_args.extend(["--aott_library", library_path])
+    runner_args.extend([sparse_path, dense_path, inputs_path, cpp_out_path])
+
+    logger.info("Running C++: %s %s", runner_path, " ".join(runner_args))
+    # pyre-fixme[6]: TrustedSubprocessWithList requires Literal[str] but this
+    # runner is resolved from a built resource or explicit test argument.
+    result = TrustedSubprocessWithList.run(
+        executable=runner_path,
+        cmd_args=runner_args,
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    if result.stdout:
+        logger.info("--- runner stdout ---\n%s", result.stdout.rstrip())
+    if result.stderr:
+        logger.info("--- runner stderr ---\n%s", result.stderr.rstrip())
+    if result.returncode != 0:
+        if result.returncode == -11:
+            logger.warning(
+                "C++ runner SIGSEGV (exit -11). This is a known issue with "
+                "torch-cpp-cuda static initialization on some machines. "
+                "Python roundtrip verification passed above. "
+                "Artifacts in: %s",
+                workdir,
+            )
+            args.keep_workdir = True
+        else:
+            logger.error("C++ runner exited with code %d", result.returncode)
+        if not args.keep_workdir:
+            shutil.rmtree(workdir, ignore_errors=True)
+        sys.exit(result.returncode)
+
+    # === 4. Compare ===
+    if not os.path.exists(cpp_out_path):
+        logger.error("C++ runner did not produce %s", cpp_out_path)
+        sys.exit(1)
+    preds_cpp = torch.load(cpp_out_path, weights_only=False).to(torch.float32).cpu()
+    logger.info(
+        "preds_cpp   shape=%s sum=%.6f",
+        tuple(preds_cpp.shape),
+        preds_cpp.sum().item(),
+    )
+
+    try:
+        torch.testing.assert_close(
+            preds_eager, preds_cpp, atol=args.atol, rtol=args.rtol
+        )
+    except AssertionError as e:
+        logger.error("PARITY FAILED: %s", e)
+        if not args.keep_workdir:
+            logger.info("(workdir kept for inspection: %s)", workdir)
+        sys.exit(1)
+
+    logger.info("PASSED: eager and C++ agree (atol=%g rtol=%g)", args.atol, args.rtol)
+    if not args.keep_workdir:
+        shutil.rmtree(workdir, ignore_errors=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/debug.gin b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/debug.gin
new file mode 100644
index 000000000..e2025dee0
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/debug.gin
@@ -0,0 +1,13 @@
+run.model_path = ""
+run.scenario_name = "Server"
+run.batchsize = 16
+run.output_trace = False
+run.data_producer_threads = 4
+run.compute_eval = False
+run.find_peak_performance = False
+run.train_split_percentage = 0.75
+
+# below will override mlperf rules compliant settings - don't use for official submission
+run.target_qps = 2000
+run.num_queries = 10000
+run.numpy_rand_seed = 123
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/kuairand_1k.gin b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/kuairand_1k.gin
new file mode 100644
index 000000000..a770aa014
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/kuairand_1k.gin
@@ -0,0 +1,14 @@
+# run.model_path = "/home/linjianma/ckpts/kuairand_1k/2025_01_12_17_56_43/"
+run.scenario_name = "Server"
+run.batchsize = 16
+run.output_trace = False
+run.data_producer_threads = 4
+run.compute_eval = False
+run.find_peak_performance = False
+run.train_split_percentage = 0.75
+
+# below will override mlperf rules compliant settings - don't use for official submission
+run.target_qps = 2000
+run.num_queries = 10000
+run.numpy_rand_seed = 123
+run.dataset_path_prefix = "/home/linjianma"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/movielens_13b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/movielens_13b.gin
new file mode 100644
index 000000000..3121ac0e7
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/movielens_13b.gin
@@ -0,0 +1,16 @@
+run.model_path = ""
+run.scenario_name = "Server"
+run.batchsize = 5
+run.output_trace = False
+run.data_producer_threads = 8
+run.compute_eval = False
+run.find_peak_performance = False
+run.train_split_percentage = 0.75
+run.sparse_quant = False
+
+# below will override mlperf rules compliant settings - don't use for official submission
+run.target_qps = 5000
+run.num_queries = 30000
+run.numpy_rand_seed = 123
+run.dataset_path_prefix = "/home/linjianma"
+run.dataset_percentage = 0.0625
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_100b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_100b.gin
new file mode 100644
index 000000000..0655734c2
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_100b.gin
@@ -0,0 +1,15 @@
+# run.model_path = "/home/linjianma/ckpts/streaming_100b/89/"
+run.scenario_name = "Server"
+run.batchsize = 10
+run.output_trace = False
+run.data_producer_threads = 16
+run.compute_eval = False
+run.find_peak_performance = False
+run.sparse_quant = False
+run.numpy_rand_seed = 123
+run.dataset_path_prefix = "/home/linjianma"
+run.dataset_percentage = 0.001
+run.warmup_ratio = 0.3
+run.num_queries = 20000
+# Needs to be tuned for different implementations to balance latency and throughput
+run.target_qps = 1000
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_400m.gin b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_400m.gin
new file mode 100644
index 000000000..eed13e0ff
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_400m.gin
@@ -0,0 +1,15 @@
+run.model_path = ""
+run.scenario_name = "Server"
+run.batchsize = 5
+run.output_trace = False
+run.data_producer_threads = 8
+run.compute_eval = False
+run.find_peak_performance = False
+run.train_split_percentage = 0.75
+run.sparse_quant = False
+
+# below will override mlperf rules compliant settings - don't use for official submission
+run.target_qps = 5000
+run.numpy_rand_seed = 123
+run.dataset_path_prefix = "/home/linjianma"
+run.dataset_percentage = 0.00625
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/inference_modules.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/inference_modules.py
new file mode 100644
index 000000000..cb567df63
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/inference_modules.py
@@ -0,0 +1,253 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+"""
+Inference modules for DLRMv3.
+
+This module provides inference-specific components for the HSTU model,
+including sparse inference modules and utilities for moving tensors between devices.
+"""
+
+from typing import Dict, Optional, Tuple
+
+import torch
+import torchrec
+from generative_recommenders.modules.dlrm_hstu import (
+    DlrmHSTU,
+    DlrmHSTUConfig,
+    SequenceEmbedding,
+)
+from torchrec.modules.embedding_modules import (
+    EmbeddingBagCollection,
+    EmbeddingCollection,
+)
+from torchrec.sparse.jagged_tensor import JaggedTensor, KeyedJaggedTensor
+from torchrec.sparse.tensor_dict import maybe_td_to_kjt
+
+
+IS_INFERENCE: bool = True
+
+
+class _NoCopyEmbeddingCollection(torchrec.EmbeddingCollection):
+    """
+    EmbeddingCollection variant that skips the dtype-cast copy in
+    ``EmbeddingCollection.forward`` and clamps indices into the hash-size
+    range. This is the script-mode replacement for the
+    ``functools.partial`` monkey-patch in
+    :func:`generative_recommenders.dlrm_v3.inference.model_family.ec_patched_forward_wo_embedding_copy`.
+
+    The body mirrors that helper exactly so that the eager and scripted paths
+    produce the same embeddings.
+    """
+
+    def forward(
+        self,
+        features: KeyedJaggedTensor,
+    ) -> Dict[str, JaggedTensor]:
+        features = maybe_td_to_kjt(features, None)
+        feature_embeddings: Dict[str, JaggedTensor] = {}
+        jt_dict: Dict[str, JaggedTensor] = features.to_dict()
+        # Inline HASH_SIZE_1B - 1 as a literal so TorchScript can see it; the
+        # imported module-level constant is treated as an opaque "closed-over
+        # global" by jit.script and would fail with
+        # "python value of type 'int' cannot be used as a value".
+        max_index: int = 999_999_999  # HASH_SIZE_1B - 1
+        for i, emb_module in enumerate(self.embeddings.values()):
+            feature_names = self._feature_names[i]
+            embedding_names = self._embedding_names_by_table[i]
+            for j, embedding_name in enumerate(embedding_names):
+                feature_name = feature_names[j]
+                f = jt_dict[feature_name]
+                indices = torch.clamp(f.values(), min=0, max=max_index)
+                lookup = emb_module(input=indices)
+                feature_embeddings[embedding_name] = JaggedTensor(
+                    values=lookup,
+                    lengths=f.lengths(),
+                    weights=f.values() if self._need_indices else None,
+                )
+        return feature_embeddings
+
+
+def set_is_inference(is_inference: bool = False) -> None:
+    """
+    Set the global inference mode flag.
+
+    Args:
+        is_inference: If True, model operates in inference mode (no labels/weights).
+                     If False, model operates in training/eval mode with labels.
+    """
+    global IS_INFERENCE
+    IS_INFERENCE = is_inference
+
+
+def get_hstu_model(
+    table_config,
+    hstu_config: DlrmHSTUConfig,
+    table_device: str = "meta",
+    max_hash_size: Optional[int] = None,
+    is_dense: bool = False,
+) -> DlrmHSTU:
+    """
+    Create and initialize an HSTU model for inference.
+
+    Args:
+        table_config: Dictionary of embedding table configurations.
+        hstu_config: HSTU model configuration object.
+        table_device: Device to place embedding tables on ('meta', 'cpu', or 'cuda').
+        max_hash_size: Optional maximum hash size to cap embedding table sizes.
+        is_dense: If True, creates model for dense-only operations.
+
+    Returns:
+        Initialized DlrmHSTU model in eval mode.
+    """
+    if max_hash_size is not None:
+        for t in table_config.values():
+            t.num_embeddings = (
+                max_hash_size if t.num_embeddings > max_hash_size else t.num_embeddings
+            )
+    model = DlrmHSTU(
+        hstu_configs=hstu_config,
+        embedding_tables=table_config,
+        is_inference=IS_INFERENCE,
+        is_dense=is_dense,
+    )
+    model.eval()
+    model.recursive_setattr("_use_triton_cc", False)
+    for _, module in model.named_modules():
+        if isinstance(module, EmbeddingBagCollection) or isinstance(
+            module, EmbeddingCollection
+        ):
+            module.to_empty(device=table_device)
+            # to_empty leaves parameters uninitialized; fill with small random
+            # values so downstream bf16 ops don't produce NaN from
+            # uninitialized memory.
+            for p in module.parameters():
+                if not p.is_meta:
+                    torch.nn.init.uniform_(p, -0.01, 0.01)
+    return model
+
+
+class HSTUSparseInferenceModule(torch.nn.Module):
+    """
+    Module for sparse (embedding) inference operations.
+
+    Handles embedding lookups and preprocessing for the HSTU model,
+    running on CPU to handle large embedding tables.
+
+    Args:
+        table_config: Dictionary of embedding table configurations.
+        hstu_config: HSTU model configuration object.
+    """
+
+    def __init__(
+        self,
+        table_config,
+        hstu_config: DlrmHSTUConfig,
+    ) -> None:
+        super().__init__()
+        self._hstu_model: DlrmHSTU = get_hstu_model(
+            table_config,
+            hstu_config,
+            table_device="cpu",
+        )
+
+    def forward(
+        self,
+        uih_features: KeyedJaggedTensor,
+        candidates_features: KeyedJaggedTensor,
+    ) -> Tuple[
+        Dict[str, SequenceEmbedding],
+        Dict[str, torch.Tensor],
+        int,
+        torch.Tensor,
+        int,
+        torch.Tensor,
+    ]:
+        """
+        Run sparse preprocessing and embedding lookups.
+
+        Args:
+            uih_features: User interaction history features as KeyedJaggedTensor.
+            candidates_features: Candidate item features as KeyedJaggedTensor.
+
+        Returns:
+            Tuple containing:
+                - seq_embeddings: Dictionary of sequence embeddings per feature.
+                - payload_features: Dictionary of payload feature tensors.
+                - max_uih_len: Maximum user interaction history length.
+                - uih_seq_lengths: Tensor of UIH sequence lengths per batch item.
+                - max_num_candidates: Maximum number of candidates.
+                - num_candidates: Tensor of candidate counts per batch item.
+        """
+        (
+            seq_embeddings,
+            payload_features,
+            max_uih_len,
+            uih_seq_lengths,
+            max_num_candidates,
+            num_candidates,
+        ) = self._hstu_model.preprocess(
+            uih_features=uih_features,
+            candidates_features=candidates_features,
+        )
+        return (
+            seq_embeddings,
+            payload_features,
+            max_uih_len,
+            uih_seq_lengths,
+            max_num_candidates,
+            num_candidates,
+        )
+
+
+def move_sparse_output_to_device(
+    seq_embeddings: Dict[str, SequenceEmbedding],
+    payload_features: Dict[str, torch.Tensor],
+    uih_seq_lengths: torch.Tensor,
+    num_candidates: torch.Tensor,
+    device: torch.device,
+) -> Tuple[
+    Dict[str, SequenceEmbedding],
+    Dict[str, torch.Tensor],
+    torch.Tensor,
+    torch.Tensor,
+]:
+    """
+    Move sparse module outputs from CPU to the target device (typically GPU).
+
+    Converts embeddings to bfloat16 for efficient GPU computation.
+
+    Args:
+        seq_embeddings: Dictionary of sequence embeddings to move.
+        payload_features: Dictionary of payload features to move.
+        uih_seq_lengths: UIH sequence lengths tensor to move.
+        num_candidates: Number of candidates tensor to move.
+        device: Target device (e.g., torch.device('cuda:0')).
+
+    Returns:
+        Tuple of moved tensors on the target device.
+    """
+    num_candidates = num_candidates.to(device)
+    uih_seq_lengths = uih_seq_lengths.to(device)
+    seq_embeddings = {
+        k: SequenceEmbedding(
+            lengths=seq_embeddings[k].lengths.to(device),
+            embedding=seq_embeddings[k].embedding.to(device).to(torch.bfloat16),
+        )
+        for k in seq_embeddings.keys()
+    }
+    for k, v in payload_features.items():
+        payload_features[k] = v.to(device)
+    return seq_embeddings, payload_features, uih_seq_lengths, num_candidates
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/main.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/main.py
new file mode 100644
index 000000000..00e334119
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/main.py
@@ -0,0 +1,805 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+"""
+mlperf dlrm_v3 inference benchmarking tool.
+"""
+
+import argparse
+import array
+import logging
+import random
+import threading
+
+logging.basicConfig(level=logging.INFO)
+import os
+import sys
+import time
+from typing import Any, Dict, List, Optional, Union
+
+import gin
+
+# pyre-ignore [21]
+import mlperf_loadgen as lg  # @manual
+import numpy as np
+import torch
+from generative_recommenders.common import set_dev_mode, set_verbose_level
+from generative_recommenders.dlrm_v3.configs import (
+    get_embedding_table_config,
+    get_hstu_configs,
+)
+from generative_recommenders.dlrm_v3.datasets.dataset import Dataset, Samples
+from generative_recommenders.dlrm_v3.datasets.synthetic_streaming import (
+    DLRMv3SyntheticStreamingDataset,
+)
+from generative_recommenders.dlrm_v3.inference.data_producer import (
+    MultiThreadDataProducer,
+    QueryItem,
+    SingleThreadDataProducer,
+)
+from generative_recommenders.dlrm_v3.inference.inference_modules import set_is_inference
+from generative_recommenders.dlrm_v3.inference.model_family import HSTUModelFamily
+from generative_recommenders.dlrm_v3.utils import (
+    get_dataset,
+    profiler_or_nullcontext,
+    SUPPORTED_DATASETS,
+)
+
+
+logger: logging.Logger = logging.getLogger("main")
+
+torch.multiprocessing.set_start_method("spawn", force=True)
+
+USER_CONF = f"{os.path.dirname(__file__)}/user.conf"
+
+SUPPORTED_CONFIGS = {
+    "debug": "debug.gin",
+    "kuairand-1k": "kuairand_1k.gin",
+    "movielens-13b": "movielens_13b.gin",
+    "streaming-400m": "streaming_400m.gin",
+    "sampled-streaming-100b": "streaming_100b.gin",
+}
+
+
+SCENARIO_MAP = {  # pyre-ignore [5]
+    "Server": lg.TestScenario.Server,
+    "Offline": lg.TestScenario.Offline,
+}
+
+
+def get_args():  # pyre-ignore [3]
+    """Parse commandline."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--dataset", default="debug", choices=SUPPORTED_DATASETS, help="dataset"
+    )
+    args, unknown_args = parser.parse_known_args()
+    logger.warning(f"unknown_args: {unknown_args}")
+    return args
+
+
+class Runner:
+    """
+    Orchestrates inference benchmark execution.
+
+    Manages data production, model inference, and result collection for
+    MLPerf LoadGen-based benchmarking.
+
+    Args:
+        model: The HSTU model family instance for making predictions.
+        ds: Dataset to fetch samples from.
+        num_queries: Total number of queries to process.
+        data_producer_threads: Number of threads for data loading (default: 1).
+        batchsize: Batch size for inference (default: 128).
+        compute_eval: Whether to compute evaluation metrics (default: False).
+    """
+
+    def __init__(
+        self,
+        model: HSTUModelFamily,
+        ds: Dataset,
+        num_queries: int,
+        data_producer_threads: int = 1,
+        batchsize: int = 128,
+        compute_eval: bool = False,
+    ) -> None:
+        self.model = model
+        if data_producer_threads == 1:
+            self.data_producer: Union[
+                MultiThreadDataProducer, SingleThreadDataProducer
+            ] = SingleThreadDataProducer(ds, self.run_one_item)
+        else:
+            self.data_producer = MultiThreadDataProducer(
+                ds, data_producer_threads, self.run_one_item
+            )
+        self.batchsize = batchsize
+        self.compute_eval = compute_eval
+        self.reset_states(num_queries=num_queries)
+
+    def reset_states(self, num_queries: int) -> None:
+        """
+        Reset all internal state for a new benchmark run.
+
+        Args:
+            num_queries: Number of queries expected in this run.
+        """
+        self.result_timing: List[Dict[str, float]] = []
+        self.result_batches: List[int] = []
+        self.current_query_ids: List[int] = []
+        self.current_content_ids: List[int] = []
+        self.current_t0: List[float] = []
+        self.num_queries: int = num_queries
+        self.processed_queries: int = 0
+
+    def run_one_item(self, qitem: QueryItem) -> None:
+        """
+        Process a single query item through model inference.
+
+        Runs prediction, records timing metrics, and sends results back to LoadGen.
+
+        Args:
+            qitem: Query item containing batch of samples to process.
+        """
+        try:
+            t0_prediction: float = time.time()
+            prediction_output = self.model.predict(qitem.samples)
+            dt_prediction: float = time.time() - t0_prediction
+            assert prediction_output is not None
+            (
+                mt_target_preds,
+                mt_target_labels,
+                mt_target_weights,
+                dt_sparse,
+                dt_dense,
+            ) = prediction_output
+            if self.compute_eval:
+                assert mt_target_labels is not None
+                assert mt_target_weights is not None
+            self.result_timing.append(
+                {
+                    "total": time.time() - qitem.start,
+                    "prediction": dt_prediction,
+                    "queue": qitem.dt_queue,
+                    "batching": qitem.dt_batching,
+                    "sparse": dt_sparse,
+                    "dense": dt_dense,
+                }
+            )
+            self.result_batches.append(len(qitem.query_ids))
+        except Exception as ex:  # pylint: disable=broad-except
+            logger.error("thread: failed, %s", ex)
+        finally:
+            candidate_size = mt_target_preds.size(1) // len(qitem.query_ids)
+            if not self.compute_eval:
+                for i, query_id in enumerate(qitem.query_ids):
+                    query_mt_target_preds = (
+                        mt_target_preds[  # pyre-ignore [61]
+                            0,
+                            candidate_size * i : candidate_size * (i + 1),
+                        ]
+                        .view(-1)
+                        .float()
+                        .numpy()
+                    )
+                    response_array = array.array("B", query_mt_target_preds.tobytes())
+                    bi = response_array.buffer_info()
+                    # since we send buffer to loadgen, needs `response_array` in memory during send
+                    lg.QuerySamplesComplete(
+                        [lg.QuerySampleResponse(query_id, bi[0], bi[1])]
+                    )
+            else:
+                for i, query_id in enumerate(qitem.query_ids):
+                    query_mt_target_preds = (
+                        mt_target_preds[  # pyre-ignore [61]
+                            0, candidate_size * i : candidate_size * (i + 1)
+                        ]
+                        .view(-1)
+                        .float()
+                        .numpy()
+                    )
+                    query_mt_target_labels = (
+                        mt_target_labels[  # pyre-ignore [16,61]
+                            0, candidate_size * i : candidate_size * (i + 1)
+                        ]
+                        .view(-1)
+                        .float()
+                        .numpy()
+                    )
+                    query_mt_target_weights = (
+                        mt_target_weights[  # pyre-ignore [61]
+                            0, candidate_size * i : candidate_size * (i + 1)
+                        ]
+                        .view(-1)
+                        .float()
+                        .numpy()
+                    )
+                    np_array = np.concatenate(
+                        [
+                            query_mt_target_preds,
+                            query_mt_target_labels,
+                            query_mt_target_weights,
+                            np.array([candidate_size]).astype(np.float32),
+                        ]
+                    )
+                    response_array = array.array("B", np_array.tobytes())
+                    bi = response_array.buffer_info()
+                    # since we send buffer to loadgen, needs `response_array` in memory during send
+                    lg.QuerySamplesComplete(
+                        [lg.QuerySampleResponse(query_id, bi[0], bi[1])]
+                    )
+
+    def enqueue(self, query_samples, t0: float) -> None:  # pyre-ignore [2]
+        """
+        Enqueue query samples for batch processing.
+
+        Collects samples until batch size is reached, then dispatches to data producer.
+
+        Args:
+            query_samples: List of LoadGen query sample objects.
+            t0: Timestamp when this batch started.
+        """
+        self.current_query_ids.extend([q.id for q in query_samples])
+        self.current_content_ids.extend([q.index for q in query_samples])
+        self.current_t0.append(t0)
+        self.processed_queries += len(query_samples)
+        t0: float = min(self.current_t0)
+        dt_queue: float = max(self.current_t0) - min(self.current_t0)
+        if (
+            self.processed_queries >= self.num_queries
+            or len(self.current_query_ids) >= self.batchsize
+        ):
+            for i in range(len(self.current_query_ids) // self.batchsize):
+                self.data_producer.enqueue(
+                    query_ids=self.current_query_ids[
+                        i * self.batchsize : (i + 1) * self.batchsize
+                    ],
+                    content_ids=self.current_content_ids[
+                        i * self.batchsize : (i + 1) * self.batchsize
+                    ],
+                    t0=t0,
+                    dt_queue=dt_queue,
+                )
+            remaining_s: int = len(self.current_query_ids) % self.batchsize
+            if remaining_s > 0:
+                self.data_producer.enqueue(
+                    query_ids=self.current_query_ids[-remaining_s:],
+                    content_ids=self.current_content_ids[-remaining_s:],
+                    t0=t0,
+                    dt_queue=dt_queue,
+                )
+            self.current_query_ids = []
+            self.current_content_ids = []
+            self.current_t0 = []
+
+    def finish(self) -> None:
+        """Signal data producer to finish and wait for completion."""
+        self.data_producer.finish()
+
+
+def add_results(
+    final_results: Dict[str, Any],
+    result_timing: List[Dict[str, float]],
+    result_batches: List[int],
+) -> None:
+    """
+    Aggregate and log benchmark results.
+
+    Computes percentile statistics and QPS metrics from timing data.
+
+    Args:
+        final_results: Dictionary to populate with aggregated results.
+        result_timing: List of timing dictionaries for each batch.
+        result_batches: List of batch sizes processed.
+    """
+    percentiles: list[float] = [50.0, 80.0, 90.0, 95.0, 99.0, 99.9]
+    buckets_dict: Dict[str, List[float]] = {}
+    buckets_str_dict: Dict[str, str] = {}
+    total_timing: list[float] = [result["total"] for result in result_timing]
+    for key in ["total", "prediction", "queue", "batching", "sparse", "dense"]:
+        timing: list[float] = [result[key] for result in result_timing]
+        buckets: List[float] = np.percentile(timing, percentiles).tolist()
+        buckets_str: str = ",".join(
+            ["| {}:{:.4f}| ".format(p, b) for p, b in zip(percentiles, buckets)]
+        )
+        buckets_dict[key] = buckets
+        buckets_str_dict[key] = buckets_str
+    total_batches = sum(result_batches)
+
+    final_results["good"] = len(total_timing)
+    final_results["avg_time"] = np.mean(total_timing)
+    final_results["percentiles"] = {
+        str(k): v for k, v in zip(percentiles, buckets_dict["total"])
+    }
+    final_results["qps"] = total_batches / final_results["took"]
+    final_results["count"] = total_batches
+
+    for i, timing in enumerate(result_timing):
+        logger.warning(f"timing of {i}: {timing}")
+
+    logger.warning(
+        "{} qps={:.2f}, avg_query_time={:.4f}, time={:.3f}, queries={}, tiles={}".format(
+            final_results["scenario"],
+            final_results["qps"],
+            final_results["avg_time"],
+            final_results["took"],
+            len(result_timing),
+            buckets_str_dict["total"],
+        )
+    )
+    for key in ["prediction", "queue", "batching", "sparse", "dense"]:
+        logger.warning(f"{key}: {buckets_str_dict[key]}")
+
+
+def get_num_queries(
+    input_size: Optional[int],
+    one_pass_size: int,
+    scenario_name: str,
+    offline_target_qps: int,
+    target_duration: float,
+) -> int:
+    """
+    Determine the number of queries to run based on scenario and settings.
+
+    Args:
+        input_size: User-specified query count (None to use defaults).
+        one_pass_size: Size of one complete pass through the dataset.
+        scenario_name: MLPerf scenario name ('Server' or 'Offline').
+        offline_target_qps: Target QPS for offline scenario.
+        target_duration: Target duration in milliseconds.
+
+    Returns:
+        Number of queries to execute in the benchmark run.
+    """
+    if scenario_name == "Offline":
+        # consistent with https://github.com/mlcommons/inference/blob/8999c4d686f6e4a180da14597c97063fce7c9f33/loadgen/test_settings_internal.cc#L147
+        return int(1.1 * target_duration / 1000 * offline_target_qps)
+    else:
+        if input_size is None:
+            return one_pass_size
+        return input_size
+
+
+class StreamingQuerySampler:
+    """
+    Sampler for streaming dataset
+    The execution order is determined by `StreamingQuerySampler.run_order`, not by the QSL or input query ID.
+    This ensures that queries are executed according to their timestamp constraints.
+    """
+
+    def __init__(
+        self,
+        ds: DLRMv3SyntheticStreamingDataset,
+        dataset_percentage: float,
+        scenario_name: str,
+        offline_target_qps: int,
+        target_duration: float,
+        input_queries: Optional[int] = None,
+        compute_eval: bool = False,
+    ) -> None:
+        self.ds: DLRMv3SyntheticStreamingDataset = ds
+        self.ds.is_inference = True
+        self.inference_ts: int = self.ds.total_ts - self.ds.train_ts
+        self.start_ts: int = self.ds.train_ts
+        self.dataset_percentage: float = dataset_percentage
+        self.num_unique_requests: List[int] = self.get_num_unique_requests(
+            warmup_ratio=1.0
+        )
+        self.num_unique_requests_cumsum: List[int] = np.cumsum(
+            self.num_unique_requests
+        ).tolist()
+        self.total_requests: int = sum(self.num_unique_requests)
+        self.run_order: List[List[int]] = self.build_random_exec_order()
+        self.ts_idx: int = 0
+        self.ts_processed_cnt: int = 0
+        self.last_loaded: float = -1.0
+        num_queries: int = get_num_queries(
+            input_size=input_queries,
+            one_pass_size=self.total_requests,
+            scenario_name=scenario_name,
+            offline_target_qps=offline_target_qps,
+            target_duration=target_duration,
+        )
+        logger.warning(
+            f"StreamingQuerySampler constructred to handle {num_queries} queries"
+        )
+        self.num_repeats: int = (
+            max(1, num_queries // self.total_requests) if not compute_eval else 1
+        )
+        self.remaining_queries: int = (
+            num_queries % self.total_requests if not compute_eval else 0
+        )
+        self._lock = threading.Lock()
+
+    def get_num_unique_requests(self, warmup_ratio: float) -> List[int]:
+        """
+        Calculate number of unique requests per timestamp.
+
+        Args:
+            warmup_ratio: Fraction of users to include in warmup.
+
+        Returns:
+            List of request counts per timestamp.
+        """
+        num_unique_requests = [
+            int(
+                self.ds.ts_to_users_cumsum[t][-1]
+                * self.dataset_percentage
+                * warmup_ratio
+            )
+            for t in range(self.start_ts, self.start_ts + self.inference_ts)
+        ]
+        return num_unique_requests
+
+    def build_random_exec_order(self) -> List[List[int]]:
+        """
+        Build randomized execution order for each timestamp.
+
+        Returns:
+            List of shuffled index lists, one per timestamp.
+        """
+        order = []
+        for req_size in self.num_unique_requests:
+            within_ts_order = list(range(req_size))
+            random.shuffle(within_ts_order)
+            order.append(within_ts_order)
+        return order
+
+    def init_sut(self) -> None:
+        """Initialize System Under Test state for a new benchmark run."""
+        self.ts_idx = 0
+        self.ts_processed_cnt = 0
+        self.ds.set_ts(self.start_ts)
+
+    def load_query_samples(self, query_ids: List[Optional[int]]) -> None:
+        """
+        Load query samples into memory for the benchmark.
+
+        Args:
+            query_ids: List of query identifiers to load.
+        """
+        length = len(query_ids)
+        ts_idx: int = 0
+        while self.num_unique_requests_cumsum[ts_idx] < length:
+            ts_idx += 1
+        for i in range(0, ts_idx):
+            self.ds.set_ts(i + self.start_ts)
+            self.ds.load_query_samples(self.run_order[i])
+        self.ds.set_ts(ts_idx + self.start_ts)
+        delta_length = (
+            length
+            if ts_idx == 0
+            else length - self.num_unique_requests_cumsum[ts_idx - 1]
+        )
+        self.ds.load_query_samples(self.run_order[ts_idx][:delta_length])
+        self.init_sut()
+        self.last_loaded = time.time()
+
+    def unload_query_samples(self, sample_list: List[int]) -> None:
+        """
+        Unload query samples from memory.
+
+        Args:
+            sample_list: List of sample identifiers to unload.
+        """
+        self.ds.unload_query_samples(sample_list)
+
+    def get_samples(self, id_list: List[int]) -> List[Samples]:
+        """
+        Get samples for a batch of queries, handling timestamp boundaries.
+
+        Args:
+            id_list: List of query identifiers.
+
+        Returns:
+            List of Samples objects, potentially spanning multiple timestamps.
+        """
+        batch_size: int = len(id_list)
+        with self._lock:
+            curr_ts_idx: int = self.ts_idx
+            curr_ts_unique_requests: int = self.num_unique_requests[curr_ts_idx]
+            curr_ts_queries: int = curr_ts_unique_requests * self.num_repeats
+            if curr_ts_idx == self.inference_ts - 1:
+                curr_ts_queries += self.remaining_queries
+            begin_query_idx: int = self.ts_processed_cnt
+            end_query_idx: int = min(begin_query_idx + batch_size, curr_ts_queries)
+            begin_request_idx: int = begin_query_idx % curr_ts_unique_requests
+            end_request_idx: int = end_query_idx % curr_ts_unique_requests
+            if begin_query_idx + batch_size >= curr_ts_queries:
+                self.ts_idx += 1
+                self.ts_processed_cnt = begin_query_idx + batch_size - curr_ts_queries
+            else:
+                self.ts_processed_cnt = begin_query_idx + batch_size
+        # requests of current ts
+        outputs: List[Samples] = []
+        if end_request_idx > begin_request_idx:
+            output: Samples = self.ds.get_samples_with_ts(
+                self.run_order[curr_ts_idx][begin_request_idx:end_request_idx],
+                curr_ts_idx + self.start_ts,
+            )
+            outputs.append(output)
+        else:
+            if begin_request_idx < curr_ts_unique_requests:
+                output: Samples = self.ds.get_samples_with_ts(
+                    self.run_order[curr_ts_idx][begin_request_idx:],
+                    curr_ts_idx + self.start_ts,
+                )
+                outputs.append(output)
+            if end_request_idx > 0:
+                output = self.ds.get_samples_with_ts(
+                    self.run_order[curr_ts_idx][0:end_request_idx],
+                    curr_ts_idx + self.start_ts,
+                )
+                outputs.append(output)
+        # requests of next ts
+        if begin_query_idx + batch_size > curr_ts_queries:
+            output: Samples = self.ds.get_samples_with_ts(
+                self.run_order[curr_ts_idx + 1][
+                    : begin_query_idx + batch_size - curr_ts_queries
+                ],
+                curr_ts_idx + 1 + self.start_ts,
+            )
+            outputs.append(output)
+        return outputs
+
+    def get_item_count(self) -> int:
+        """
+        Get total number of items in the dataset.
+
+        Returns:
+            Total request count across all timestamps.
+        """
+        return self.total_requests
+
+
+@gin.configurable
+def run(
+    dataset: str = "sampled-streaming-100b",
+    model_path: str = "",
+    scenario_name: str = "Server",
+    batchsize: int = 16,
+    output_trace: bool = False,
+    data_producer_threads: int = 4,
+    compute_eval: bool = False,
+    find_peak_performance: bool = False,
+    dataset_path_prefix: str = "",
+    train_split_percentage: float = 0.75,
+    warmup_ratio: float = 0.1,
+    target_qps: Optional[int] = None,
+    num_queries: Optional[int] = None,
+    numpy_rand_seed: int = 123,
+    sparse_quant: bool = False,
+    dataset_percentage: float = 1.0,
+) -> None:
+    """
+    Execute the MLPerf DLRMv3 inference benchmark.
+
+    Sets up the model, dataset, and LoadGen infrastructure, then runs
+    warmup and official benchmark phases.
+
+    Args:
+        dataset: Dataset identifier to use.
+        model_path: Path to model checkpoint directory.
+        scenario_name: MLPerf scenario ('Server' or 'Offline').
+        batchsize: Batch size for inference.
+        output_trace: Whether to output profiling traces.
+        data_producer_threads: Number of data loading threads.
+        compute_eval: Whether to compute accuracy metrics.
+        find_peak_performance: Whether to run peak performance finding mode.
+        dataset_path_prefix: Prefix path for dataset files.
+        warmup_ratio: Fraction of data to use for warmup.
+        target_qps: Target queries per second.
+        num_queries: Number of queries to run (None for automatic).
+        numpy_rand_seed: Random seed for reproducibility.
+        sparse_quant: Whether to quantize sparse embeddings.
+        dataset_percentage: Fraction of dataset to use.
+    """
+    set_dev_mode(False)
+    if scenario_name not in SCENARIO_MAP:
+        raise NotImplementedError("valid scanarios:" + str(list(SCENARIO_MAP.keys())))
+    scenario = SCENARIO_MAP[scenario_name]
+    np.random.seed(numpy_rand_seed)
+    random.seed(numpy_rand_seed)
+
+    hstu_config = get_hstu_configs(dataset)
+    hstu_config.max_num_candidates = hstu_config.max_num_candidates_inference
+    table_config = get_embedding_table_config(dataset)
+    set_is_inference(is_inference=not compute_eval)
+
+    user_conf = os.path.abspath(USER_CONF)
+    if not os.path.exists(user_conf):
+        logger.error("{} not found".format(user_conf))
+        sys.exit(1)
+
+    settings = lg.TestSettings()
+    settings.FromConfig(user_conf, model_path, scenario_name)
+    settings.scenario = scenario
+    settings.mode = lg.TestMode.PerformanceOnly
+    if compute_eval:
+        settings.mode = lg.TestMode.AccuracyOnly
+    if find_peak_performance:
+        settings.mode = lg.TestMode.FindPeakPerformance
+    if target_qps:
+        settings.server_target_qps = float(target_qps)
+        settings.offline_expected_qps = float(target_qps)
+
+    model_family = HSTUModelFamily(
+        hstu_config=hstu_config,
+        table_config=table_config,
+        sparse_quant=sparse_quant,
+        output_trace=output_trace,
+        compute_eval=compute_eval,
+    )
+    is_streaming: bool = "streaming" in dataset
+    dataset, kwargs = get_dataset(dataset, dataset_path_prefix)
+
+    ds: Dataset = dataset(
+        hstu_config=hstu_config,
+        embedding_config=table_config,
+        is_inference=not compute_eval,
+        **kwargs,
+    )
+    if is_streaming:
+        ds = StreamingQuerySampler(  # pyre-ignore
+            ds=ds,  # pyre-ignore [6]
+            dataset_percentage=dataset_percentage,
+            input_queries=num_queries,
+            compute_eval=compute_eval,
+            scenario_name=scenario_name,
+            offline_target_qps=settings.offline_expected_qps,
+            target_duration=settings.min_duration_ms,
+        )
+    model_family.load(model_path)
+
+    # warmup
+    for autotune_bs in range(batchsize, 0, -1):
+        logger.warning(f"Autotune for batch size {autotune_bs}")
+        warmup_ids = list(range(autotune_bs))
+        ds.load_query_samples(warmup_ids)
+        for _ in range(4 * int(os.environ.get("WORLD_SIZE", 1))):
+            if is_streaming:
+                ds.init_sut()  # pyre-ignore [16]
+            sample: Union[Samples, List[Samples]] = ds.get_samples(warmup_ids)
+            if isinstance(sample, Samples):
+                model_family.predict(sample)
+            else:
+                for s in sample:
+                    model_family.predict(s)
+        ds.unload_query_samples(None)
+    for h in logger.handlers:
+        h.flush()
+    logger.info("Model forward warmup done")
+
+    count = int(
+        ds.get_item_count() * dataset_percentage
+        if not is_streaming
+        else ds.get_item_count()
+    )
+    train_size: int = round(train_split_percentage * count) if not is_streaming else 0
+    if compute_eval:
+        count = count - train_size
+
+    runner: Runner = Runner(
+        model_family,
+        ds,
+        data_producer_threads=data_producer_threads,
+        batchsize=batchsize,
+        compute_eval=compute_eval,
+        num_queries=count,
+    )
+
+    def issue_queries(query_samples) -> None:  # pyre-ignore [2]
+        if compute_eval:
+            for sample in query_samples:
+                sample.index = sample.index + train_size
+        runner.enqueue(query_samples, time.time())
+
+    def load_query_samples(query_ids: List[int]) -> None:
+        if compute_eval:
+            query_ids = [q + train_size for q in query_ids]
+        ds.load_query_samples(query_ids)
+
+    def flush_queries() -> None:
+        pass
+
+    if scenario == lg.TestScenario.Server:
+        # inference benchmark warmup
+        if is_streaming:
+            ds.init_sut()
+            warmup_count: int = sum(
+                ds.get_num_unique_requests(  # pyre-ignore [16]
+                    warmup_ratio=warmup_ratio
+                )
+            )
+        else:
+            warmup_count: int = int(count * warmup_ratio)
+        runner.reset_states(num_queries=warmup_count)
+        final_results = {
+            "runtime": model_family.name(),
+            "version": model_family.version(),
+            "time": int(time.time()),
+            "scenario": str(scenario),
+        }
+        settings.min_query_count = warmup_count
+        settings.max_query_count = warmup_count
+        sut = lg.ConstructSUT(issue_queries, flush_queries)
+        qsl = lg.ConstructQSL(
+            warmup_count,
+            warmup_count,
+            load_query_samples,
+            ds.unload_query_samples,
+        )
+        with profiler_or_nullcontext(enabled=output_trace, with_stack=False):
+            logger.info(f"starting warmup {scenario} with {warmup_count} queries")
+            lg.StartTest(sut, qsl, settings)
+            lg.DestroyQSL(qsl)
+            lg.DestroySUT(sut)
+
+    # official run
+    if is_streaming:
+        ds.init_sut()
+    final_results = {
+        "runtime": model_family.name(),
+        "version": model_family.version(),
+        "time": int(time.time()),
+        "scenario": str(scenario),
+    }
+    query_size: int = get_num_queries(
+        input_size=num_queries,
+        one_pass_size=count,
+        scenario_name=scenario_name,
+        offline_target_qps=settings.offline_expected_qps,
+        target_duration=settings.min_duration_ms,
+    )
+    settings.min_query_count = query_size
+    settings.max_query_count = query_size
+    runner.reset_states(num_queries=query_size if not compute_eval else count)
+    sut = lg.ConstructSUT(issue_queries, flush_queries)
+    qsl = lg.ConstructQSL(
+        count,
+        count,
+        load_query_samples,
+        ds.unload_query_samples,
+    )
+    with profiler_or_nullcontext(enabled=output_trace, with_stack=False):
+        logger.info(
+            f"starting {scenario} with {query_size} queries and {query_size // count} repeats"
+        )
+        lg.StartTest(sut, qsl, settings)
+        runner.finish()
+        final_results["took"] = time.time() - ds.last_loaded
+        lg.DestroyQSL(qsl)
+        lg.DestroySUT(sut)
+
+    add_results(
+        final_results,
+        runner.result_timing,
+        runner.result_batches,
+    )
+    # If multiple subprocesses are running the model send a signal to stop them
+    if int(os.environ.get("WORLD_SIZE", 1)) > 1:
+        model_family.predict(None)
+
+
+def main() -> None:
+    set_verbose_level(1)
+    args = get_args()
+    logger.info(args)
+    gin_path = f"{os.path.dirname(__file__)}/gin/{SUPPORTED_CONFIGS[args.dataset]}"
+    gin.parse_config_file(gin_path)
+    run(dataset=args.dataset)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/mlperf.conf b/recommendation_v4/generative_recommenders/dlrm_v3/inference/mlperf.conf
new file mode 100644
index 000000000..a2b4f6fff
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/mlperf.conf
@@ -0,0 +1,98 @@
+# The format of this config file is 'key = value'.
+# The key has the format 'model.scenario.key'. Value is mostly int64_t.
+# Model maybe '*' as wildcard. In that case the value applies to all models.
+# All times are in milli seconds
+
+# Set performance_sample_count for each model.
+# User can optionally set this to higher values in user.conf.
+resnet50.*.performance_sample_count_override = 1024
+ssd-mobilenet.*.performance_sample_count_override = 256
+retinanet.*.performance_sample_count_override = 64
+bert.*.performance_sample_count_override = 10833
+dlrm.*.performance_sample_count_override = 204800
+dlrm-v2.*.performance_sample_count_override = 204800
+rnnt.*.performance_sample_count_override = 2513
+gptj.*.performance_sample_count_override = 13368
+llama2-70b.*.performance_sample_count_override = 24576
+stable-diffusion-xl.*.performance_sample_count_override = 5000
+# set to 0 to let entire sample set to be performance sample
+3d-unet.*.performance_sample_count_override = 0
+
+# Set seeds. The seeds will be distributed two weeks before the submission.
+*.*.qsl_rng_seed = 3066443479025735752
+*.*.sample_index_rng_seed = 10688027786191513374
+*.*.schedule_rng_seed = 14962580496156340209
+# Set seeds for TEST_05. The seeds will be distributed two weeks before the submission.
+*.*.test05_qsl_rng_seed = 16799458546791641818
+*.*.test05_sample_index_rng_seed = 5453809927556429288
+*.*.test05_schedule_rng_seed = 5435552105434836064
+
+
+*.SingleStream.target_latency_percentile = 90
+*.SingleStream.min_duration = 600000
+
+*.MultiStream.target_latency_percentile = 99
+*.MultiStream.samples_per_query = 8
+*.MultiStream.min_duration = 600000
+*.MultiStream.min_query_count = 662
+retinanet.MultiStream.target_latency = 528
+
+# 3D-UNet uses equal issue mode because it has non-uniform inputs
+3d-unet.*.sample_concatenate_permutation = 1
+
+# LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenario
+gptj.*.sample_concatenate_permutation = 1
+llama2-70b.*.sample_concatenate_permutation = 1
+mixtral-8x7b.*.sample_concatenate_permutation = 1
+
+*.Server.target_latency = 10
+*.Server.target_latency_percentile = 99
+*.Server.target_duration = 0
+*.Server.min_duration = 600000
+resnet50.Server.target_latency = 15
+retinanet.Server.target_latency = 100
+bert.Server.target_latency = 130
+dlrm.Server.target_latency = 60
+dlrm-v2.Server.target_latency = 60
+rnnt.Server.target_latency = 1000
+gptj.Server.target_latency = 20000
+stable-diffusion-xl.Server.target_latency = 20000
+# Llama2-70b benchmarks measures token latencies
+llama2-70b.*.use_token_latencies = 1
+mixtral-8x7b.*.use_token_latencies = 1
+# gptj benchmark infers token latencies
+gptj.*.infer_token_latencies = 1
+gptj.*.token_latency_scaling_factor = 69
+# Only ttft and tpot are tracked for the llama2-70b & mixtral-8x7B benchmark therefore target_latency = 0
+llama2-70b.Server.target_latency = 0
+llama2-70b.Server.ttft_latency = 2000
+llama2-70b.Server.tpot_latency = 200
+
+mixtral-8x7b.Server.target_latency = 0
+mixtral-8x7b.Server.ttft_latency = 2000
+mixtral-8x7b.Server.tpot_latency = 200
+
+*.Offline.target_latency_percentile = 90
+*.Offline.min_duration = 600000
+
+# In Offline scenario, we always have one query. But LoadGen maps this to
+# min_sample_count internally in Offline scenario. If the dataset size is larger
+# than 24576 we limit the min_query_count to 24576 and otherwise we use
+# the dataset size as the limit
+
+resnet50.Offline.min_query_count = 24576
+retinanet.Offline.min_query_count = 24576
+dlrm-v2.Offline.min_query_count = 24576
+bert.Offline.min_query_count = 10833
+gptj.Offline.min_query_count = 13368
+rnnt.Offline.min_query_count = 2513
+3d-unet.Offline.min_query_count = 43
+stable-diffusion-xl.Offline.min_query_count = 5000
+llama2-70b.Offline.min_query_count = 24576
+mixtral-8x7b.Offline.min_query_count = 15000
+
+# These fields should be defined and overridden by user.conf.
+*.SingleStream.target_latency = 10
+*.MultiStream.target_latency = 80
+*.Server.target_qps = 1.0
+*.Offline.target_qps = 1.0
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/model_family.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/model_family.py
new file mode 100644
index 000000000..1c8bcd237
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/model_family.py
@@ -0,0 +1,705 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+"""
+model_family for dlrm_v3.
+"""
+
+import copy
+import functools
+import logging
+import os
+import time
+import uuid
+from threading import Event
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.multiprocessing as mp
+import torchrec
+from generative_recommenders.dlrm_v3.checkpoint import (
+    load_nonsparse_checkpoint,
+    load_sparse_checkpoint,
+)
+from generative_recommenders.dlrm_v3.configs import HASH_SIZE_1B
+from generative_recommenders.dlrm_v3.datasets.dataset import Samples
+from generative_recommenders.dlrm_v3.inference.inference_modules import (
+    get_hstu_model,
+    HSTUSparseInferenceModule,
+    move_sparse_output_to_device,
+    set_is_inference,
+)
+from generative_recommenders.dlrm_v3.utils import Profiler
+from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig, SequenceEmbedding
+from pyre_extensions import none_throws
+from torch import quantization as quant
+from torchrec.distributed.quant_embedding import QuantEmbeddingCollection
+from torchrec.modules.embedding_configs import EmbeddingConfig, QuantConfig
+from torchrec.sparse.jagged_tensor import JaggedTensor, KeyedJaggedTensor
+from torchrec.sparse.tensor_dict import maybe_td_to_kjt
+from torchrec.test_utils import get_free_port
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class HSTUModelFamily:
+    """
+    High-level interface for the HSTU model family.
+
+    Manages both sparse (embedding) and dense (transformer) components of the
+    HSTU model, supporting distributed inference across multiple GPUs.
+
+    Args:
+        hstu_config: Configuration object for the HSTU model.
+        table_config: Dictionary of embedding table configurations.
+        output_trace: Whether to enable profiling trace output.
+        sparse_quant: Whether to quantize sparse embeddings.
+        compute_eval: Whether to compute evaluation metrics (includes labels).
+    """
+
+    def __init__(
+        self,
+        hstu_config: DlrmHSTUConfig,
+        table_config: Dict[str, EmbeddingConfig],
+        output_trace: bool = False,
+        sparse_quant: bool = False,
+        compute_eval: bool = False,
+    ) -> None:
+        self.hstu_config = hstu_config
+        self.table_config = table_config
+        self.sparse: ModelFamilySparseDist = ModelFamilySparseDist(
+            hstu_config=hstu_config,
+            table_config=table_config,
+            quant=sparse_quant,
+        )
+
+        assert torch.cuda.is_available(), "CUDA is required for this benchmark."
+        ngpus = torch.cuda.device_count()
+        self.world_size = int(os.environ.get("WORLD_SIZE", str(ngpus)))
+        logger.warning(f"Using {self.world_size} GPU(s)...")
+        dense_model_family_clazz = (
+            ModelFamilyDenseDist
+            if self.world_size > 1
+            else ModelFamilyDenseSingleWorker
+        )
+        self.dense: Union[ModelFamilyDenseDist, ModelFamilyDenseSingleWorker] = (
+            dense_model_family_clazz(
+                hstu_config=hstu_config,
+                table_config=table_config,
+                output_trace=output_trace,
+                compute_eval=compute_eval,
+            )
+        )
+
+    def version(self) -> str:
+        """Return the PyTorch version string."""
+        return torch.__version__
+
+    def name(self) -> str:
+        """Return the model family name identifier."""
+        return "model-family-hstu"
+
+    def load(self, model_path: str) -> None:
+        """
+        Load model checkpoints from disk.
+
+        Args:
+            model_path: Base path to the model checkpoint directory.
+        """
+        self.sparse.load(model_path=model_path)
+        self.dense.load(model_path=model_path)
+
+    def predict(
+        self, samples: Optional[Samples]
+    ) -> Optional[
+        Tuple[
+            torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], float, float
+        ]
+    ]:
+        """
+        Run inference on a batch of samples.
+
+        Processes samples through sparse embeddings, then dense forward pass.
+
+        Args:
+            samples: Input samples containing features. If None, signals shutdown.
+
+        Returns:
+            Tuple of (predictions, labels, weights, sparse_time, dense_time) or None.
+        """
+        with torch.no_grad():
+            if samples is None:
+                self.dense.predict(None, None, 0, None, 0, None)
+                return None
+            (
+                seq_embeddings,
+                payload_features,
+                max_uih_len,
+                uih_seq_lengths,
+                max_num_candidates,
+                num_candidates,
+                dt_sparse,
+            ) = self.sparse.predict(samples)
+            out = self.dense.predict(
+                seq_embeddings,
+                payload_features,
+                max_uih_len,
+                uih_seq_lengths,
+                max_num_candidates,
+                num_candidates,
+            )
+            (  # pyre-ignore [23]
+                mt_target_preds,
+                mt_target_labels,
+                mt_target_weights,
+                dt_dense,
+            ) = out
+            return (
+                mt_target_preds,
+                mt_target_labels,
+                mt_target_weights,
+                dt_sparse,
+                dt_dense,
+            )
+
+
+def ec_patched_forward_wo_embedding_copy(
+    ec_module: torchrec.EmbeddingCollection,
+    features: KeyedJaggedTensor,  # can also take TensorDict as input
+) -> Dict[str, JaggedTensor]:
+    """
+    Run the EmbeddingBagCollection forward pass. This method takes in a `KeyedJaggedTensor`
+    and returns a `Dict[str, JaggedTensor]`, which is the result of the individual embeddings for each feature.
+
+    Args:
+        features (KeyedJaggedTensor): KJT of form [F X B X L].
+
+    Returns:
+        Dict[str, JaggedTensor]
+    """
+    features = maybe_td_to_kjt(features, None)
+    feature_embeddings: Dict[str, JaggedTensor] = {}
+    jt_dict: Dict[str, JaggedTensor] = features.to_dict()
+    for i, emb_module in enumerate(ec_module.embeddings.values()):
+        feature_names = ec_module._feature_names[i]
+        embedding_names = ec_module._embedding_names_by_table[i]
+        for j, embedding_name in enumerate(embedding_names):
+            feature_name = feature_names[j]
+            f = jt_dict[feature_name]
+            indices = torch.clamp(f.values(), min=0, max=HASH_SIZE_1B - 1)
+            lookup = emb_module(
+                input=indices
+            )  # remove the dtype cast at https://github.com/meta-pytorch/torchrec/blob/0a2cebd5472a7edc5072b3c912ad8aaa4179b9d9/torchrec/modules/embedding_modules.py#L486
+            feature_embeddings[embedding_name] = JaggedTensor(
+                values=lookup,
+                lengths=f.lengths(),
+                weights=f.values() if ec_module._need_indices else None,
+            )
+    return feature_embeddings
+
+
+class ModelFamilySparseDist:
+    """
+    Sparse Arch module manager.
+
+    Handles loading and inference of sparse embedding lookups, optionally
+    with quantization for memory efficiency.
+
+    Args:
+        hstu_config: HSTU model configuration.
+        table_config: Embedding table configurations.
+        quant: Whether to apply dynamic quantization to embeddings.
+    """
+
+    def __init__(
+        self,
+        hstu_config: DlrmHSTUConfig,
+        table_config: Dict[str, EmbeddingConfig],
+        quant: bool = False,
+    ) -> None:
+        super(ModelFamilySparseDist, self).__init__()
+        self.hstu_config = hstu_config
+        self.table_config = table_config
+        self.module: Optional[torch.nn.Module] = None
+        self.quant: bool = quant
+
+    def load(self, model_path: str) -> None:
+        """
+        Load sparse model checkpoint and optionally apply quantization.
+
+        Args:
+            model_path: Path to the model checkpoint directory.
+        """
+        logger.warning(f"Loading sparse module from {model_path}")
+
+        sparse_arch: HSTUSparseInferenceModule = HSTUSparseInferenceModule(
+            table_config=self.table_config,
+            hstu_config=self.hstu_config,
+        )
+        load_sparse_checkpoint(model=sparse_arch._hstu_model, path=model_path)
+        sparse_arch.eval()
+        if self.quant:
+            self.module = quant.quantize_dynamic(
+                sparse_arch,
+                qconfig_spec={
+                    torchrec.EmbeddingCollection: QuantConfig(
+                        activation=quant.PlaceholderObserver.with_args(
+                            dtype=torch.float
+                        ),
+                        weight=quant.PlaceholderObserver.with_args(dtype=torch.int8),
+                    ),
+                },
+                mapping={
+                    torchrec.EmbeddingCollection: QuantEmbeddingCollection,
+                },
+                inplace=False,
+            )
+        else:
+            sparse_arch._hstu_model._embedding_collection.forward = (  # pyre-ignore[8]
+                functools.partial(
+                    ec_patched_forward_wo_embedding_copy,
+                    sparse_arch._hstu_model._embedding_collection,
+                )
+            )
+            self.module = sparse_arch
+        logger.warning(f"sparse module is {self.module}")
+
+    def predict(
+        self, samples: Samples
+    ) -> Tuple[
+        Dict[str, SequenceEmbedding],
+        Dict[str, torch.Tensor],
+        int,
+        torch.Tensor,
+        int,
+        torch.Tensor,
+        float,
+    ]:
+        """
+        Run sparse forward pass (embedding lookups).
+
+        Args:
+            samples: Input samples with feature tensors.
+
+        Returns:
+            Tuple of (seq_embeddings, payload_features, max_uih_len, uih_seq_lengths,
+            max_num_candidates, num_candidates, elapsed_time).
+        """
+        with torch.profiler.record_function("sparse forward"):
+            module: torch.nn.Module = none_throws(self.module)
+            assert self.module is not None
+            uih_features = samples.uih_features_kjt
+            candidates_features = samples.candidates_features_kjt
+            t0: float = time.time()
+            (
+                seq_embeddings,
+                payload_features,
+                max_uih_len,
+                uih_seq_lengths,
+                max_num_candidates,
+                num_candidates,
+            ) = module(
+                uih_features=uih_features,
+                candidates_features=candidates_features,
+            )
+            dt_sparse: float = time.time() - t0
+            return (
+                seq_embeddings,
+                payload_features,
+                max_uih_len,
+                uih_seq_lengths,
+                max_num_candidates,
+                num_candidates,
+                dt_sparse,
+            )
+
+
+class ModelFamilyDenseDist:
+    """
+    Distributed dense module manager for multi-GPU inference.
+
+    Spawns worker processes for each GPU to run dense forward passes in parallel,
+    with samples distributed via inter-process queues.
+
+    Args:
+        hstu_config: HSTU model configuration.
+        table_config: Embedding table configurations.
+        output_trace: Whether to enable profiling traces.
+        compute_eval: Whether to compute evaluation metrics.
+    """
+
+    def __init__(
+        self,
+        hstu_config: DlrmHSTUConfig,
+        table_config: Dict[str, EmbeddingConfig],
+        output_trace: bool = False,
+        compute_eval: bool = False,
+    ) -> None:
+        super(ModelFamilyDenseDist, self).__init__()
+        self.hstu_config = hstu_config
+        self.table_config = table_config
+        self.output_trace = output_trace
+        self.compute_eval = compute_eval
+
+        ngpus = torch.cuda.device_count()
+        self.world_size = int(os.environ.get("WORLD_SIZE", str(ngpus)))
+        self.rank = 0
+        os.environ["MASTER_ADDR"] = "localhost"
+        os.environ["MASTER_PORT"] = str(get_free_port())
+        self.dist_backend = "nccl"
+
+        ctx = mp.get_context("spawn")
+        self.samples_q: List[mp.Queue] = [ctx.Queue() for _ in range(self.world_size)]
+        self.result_q: List[mp.Queue] = [ctx.Queue() for _ in range(self.world_size)]
+
+    def load(self, model_path: str) -> None:
+        """
+        Load dense model and spawn worker processes for distributed inference.
+
+        Args:
+            model_path: Path to the model checkpoint directory.
+        """
+        logger.warning(f"Loading dense module from {model_path}")
+
+        ctx = mp.get_context("spawn")
+        processes = []
+        for rank in range(self.world_size):
+            p = ctx.Process(
+                target=self.distributed_setup,
+                args=(
+                    rank,
+                    self.world_size,
+                    model_path,
+                ),
+            )
+            p.start()
+            processes.append(p)
+
+    def distributed_setup(self, rank: int, world_size: int, model_path: str) -> None:
+        """
+        Initialize and run a dense worker process.
+
+        Each worker loads the model, processes samples from its queue, and
+        returns results.
+
+        Args:
+            rank: Process rank (GPU index).
+            world_size: Total number of worker processes.
+            model_path: Path to model checkpoint.
+        """
+        nprocs_per_rank = 16
+        start_core: int = nprocs_per_rank * rank
+        cores: set[int] = set([start_core + i for i in range(nprocs_per_rank)])
+        os.sched_setaffinity(0, cores)
+        set_is_inference(is_inference=not self.compute_eval)
+        model = get_hstu_model(
+            table_config=self.table_config,
+            hstu_config=self.hstu_config,
+            table_device="cpu",
+            max_hash_size=100,
+            is_dense=True,
+        ).to(torch.bfloat16)
+        model.set_training_dtype(torch.bfloat16)
+        device = torch.device(f"cuda:{rank}")
+        torch.cuda.set_device(f"cuda:{rank}")
+        load_nonsparse_checkpoint(
+            model=model, device=device, optimizer=None, path=model_path
+        )
+        model = model.to(device)
+        model.eval()
+        profiler = Profiler(rank) if self.output_trace else None
+
+        with torch.no_grad():
+            while True:
+                item = self.samples_q[rank].get()
+                # If -1 is received terminate all subprocesses
+                if item == -1:
+                    break
+                if self.output_trace:
+                    assert profiler is not None
+                    profiler.step()
+                with torch.profiler.record_function("get_item_from_queue"):
+                    # Copy here to release data in the producer to avoid invalid cuda caching allocator release.
+                    item = copy.deepcopy(item)
+                    (
+                        id,
+                        seq_embeddings,
+                        payload_features,
+                        max_uih_len,
+                        uih_seq_lengths,
+                        max_num_candidates,
+                        num_candidates,
+                    ) = item
+                    assert seq_embeddings is not None
+                with torch.profiler.record_function("dense forward"):
+                    (
+                        _,
+                        _,
+                        _,
+                        mt_target_preds,
+                        mt_target_labels,
+                        mt_target_weights,
+                    ) = model.main_forward(
+                        seq_embeddings=seq_embeddings,
+                        payload_features=payload_features,
+                        max_uih_len=max_uih_len,
+                        uih_seq_lengths=uih_seq_lengths,
+                        max_num_candidates=max_num_candidates,
+                        num_candidates=num_candidates,
+                    )
+                    # mt_target_preds = torch.empty(1, 2048 * 20).to(device="cpu")
+                    # mt_target_labels = None
+                    # mt_target_weights = None
+                    assert mt_target_preds is not None
+                    mt_target_preds = mt_target_preds.detach().to(device="cpu")
+                    if mt_target_labels is not None:
+                        mt_target_labels = mt_target_labels.detach().to(device="cpu")
+                    if mt_target_weights is not None:
+                        mt_target_weights = mt_target_weights.detach().to(device="cpu")
+                    self.result_q[rank].put(
+                        (id, mt_target_preds, mt_target_labels, mt_target_weights)
+                    )
+
+    def capture_output(
+        self, id: uuid.UUID, rank: int
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+        """
+        Retrieve inference results from a worker process.
+
+        Args:
+            id: Unique identifier for the request.
+            rank: Worker rank to retrieve from.
+
+        Returns:
+            Tuple of (predictions, labels, weights).
+        """
+        while True:
+            recv_id, preds, labels, weights = self.result_q[rank].get()
+            assert recv_id == id
+            return preds, labels, weights
+
+    def get_rank(self) -> int:
+        """
+        Get the next worker rank for load balancing.
+
+        Returns:
+            Rank index, cycling through available workers.
+        """
+        rank = self.rank
+        self.rank = (self.rank + 1) % self.world_size
+        return rank
+
+    def predict(
+        self,
+        seq_embeddings: Optional[Dict[str, SequenceEmbedding]],
+        payload_features: Optional[Dict[str, torch.Tensor]],
+        max_uih_len: int,
+        uih_seq_lengths: Optional[torch.Tensor],
+        max_num_candidates: int,
+        num_candidates: Optional[torch.Tensor],
+    ) -> Optional[
+        Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], float]
+    ]:
+        """
+        Run distributed dense forward pass.
+
+        Dispatches work to a worker process and collects results.
+
+        Args:
+            seq_embeddings: Sequence embeddings from sparse module.
+            payload_features: Additional feature tensors.
+            max_uih_len: Maximum UIH sequence length.
+            uih_seq_lengths: Per-sample UIH lengths.
+            max_num_candidates: Maximum candidates per sample.
+            num_candidates: Per-sample candidate counts.
+
+        Returns:
+            Tuple of (predictions, labels, weights, elapsed_time) or None if shutdown.
+        """
+        id = uuid.uuid4()
+        # If none is received terminate all subprocesses
+        if seq_embeddings is None:
+            for rank in range(self.world_size):
+                self.samples_q[rank].put(-1)
+            return None
+        rank = self.get_rank()
+        device = torch.device(f"cuda:{rank}")
+        assert (
+            payload_features is not None
+            and num_candidates is not None
+            and uih_seq_lengths is not None
+        )
+        t0: float = time.time()
+        seq_embeddings, payload_features, uih_seq_lengths, num_candidates = (
+            move_sparse_output_to_device(
+                seq_embeddings=seq_embeddings,
+                payload_features=payload_features,
+                uih_seq_lengths=uih_seq_lengths,
+                num_candidates=num_candidates,
+                device=device,
+            )
+        )
+        self.samples_q[rank].put(
+            (
+                id,
+                seq_embeddings,
+                payload_features,
+                max_uih_len,
+                uih_seq_lengths,
+                max_num_candidates,
+                num_candidates,
+            )
+        )
+        (mt_target_preds, mt_target_labels, mt_target_weights) = self.capture_output(
+            id, rank
+        )
+        dt_dense = time.time() - t0
+        return (
+            mt_target_preds,
+            mt_target_labels,
+            mt_target_weights,
+            dt_dense,
+        )
+
+
+class ModelFamilyDenseSingleWorker:
+    """
+    Single-worker dense module manager for single-GPU inference.
+
+    Simpler alternative to ModelFamilyDenseDist for single-GPU setups.
+
+    Args:
+        hstu_config: HSTU model configuration.
+        table_config: Embedding table configurations.
+        output_trace: Whether to enable profiling traces.
+        compute_eval: Whether to compute evaluation metrics.
+    """
+
+    def __init__(
+        self,
+        hstu_config: DlrmHSTUConfig,
+        table_config: Dict[str, EmbeddingConfig],
+        output_trace: bool = False,
+        compute_eval: bool = False,
+    ) -> None:
+        self.model: Optional[torch.nn.Module] = None
+        self.hstu_config = hstu_config
+        self.table_config = table_config
+        self.output_trace = output_trace
+        self.device: torch.device = torch.device("cuda:0")
+        torch.cuda.set_device(self.device)
+        self.profiler: Optional[Profiler] = (
+            Profiler(rank=0) if self.output_trace else None
+        )
+
+    def load(self, model_path: str) -> None:
+        """
+        Load dense model for single-GPU inference.
+
+        Args:
+            model_path: Path to the model checkpoint directory.
+        """
+        logger.warning(f"Loading dense module from {model_path}")
+        self.model = (
+            get_hstu_model(
+                table_config=self.table_config,
+                hstu_config=self.hstu_config,
+                table_device="cpu",
+                is_dense=True,
+            )
+            .to(self.device)
+            .to(torch.bfloat16)
+        )
+        self.model.set_training_dtype(torch.bfloat16)
+        load_nonsparse_checkpoint(
+            model=self.model, device=self.device, optimizer=None, path=model_path
+        )
+        assert self.model is not None
+        self.model.eval()
+
+    def predict(
+        self,
+        seq_embeddings: Optional[Dict[str, SequenceEmbedding]],
+        payload_features: Optional[Dict[str, torch.Tensor]],
+        max_uih_len: int,
+        uih_seq_lengths: Optional[torch.Tensor],
+        max_num_candidates: int,
+        num_candidates: Optional[torch.Tensor],
+    ) -> Optional[
+        Tuple[
+            torch.Tensor,
+            Optional[torch.Tensor],
+            Optional[torch.Tensor],
+            float,
+        ]
+    ]:
+        """
+        Run dense forward pass on single GPU.
+
+        Args:
+            seq_embeddings: Sequence embeddings from sparse module.
+            payload_features: Additional feature tensors.
+            max_uih_len: Maximum UIH sequence length.
+            uih_seq_lengths: Per-sample UIH lengths.
+            max_num_candidates: Maximum candidates per sample.
+            num_candidates: Per-sample candidate counts.
+
+        Returns:
+            Tuple of (predictions, labels, weights, elapsed_time).
+        """
+        if self.output_trace:
+            assert self.profiler is not None
+            self.profiler.step()
+        assert (
+            payload_features is not None
+            and uih_seq_lengths is not None
+            and num_candidates is not None
+            and seq_embeddings is not None
+        )
+        t0: float = time.time()
+        with torch.profiler.record_function("dense forward"):
+            seq_embeddings, payload_features, uih_seq_lengths, num_candidates = (
+                move_sparse_output_to_device(
+                    seq_embeddings=seq_embeddings,
+                    payload_features=payload_features,
+                    uih_seq_lengths=uih_seq_lengths,
+                    num_candidates=num_candidates,
+                    device=self.device,
+                )
+            )
+            assert self.model is not None
+            (
+                _,
+                _,
+                _,
+                mt_target_preds,
+                mt_target_labels,
+                mt_target_weights,
+            ) = self.model.main_forward(  # pyre-ignore [29]
+                seq_embeddings=seq_embeddings,
+                payload_features=payload_features,
+                max_uih_len=max_uih_len,
+                uih_seq_lengths=uih_seq_lengths,
+                max_num_candidates=max_num_candidates,
+                num_candidates=num_candidates,
+            )
+            assert mt_target_preds is not None
+            mt_target_preds = mt_target_preds.detach().to(device="cpu")
+            if mt_target_labels is not None:
+                mt_target_labels = mt_target_labels.detach().to(device="cpu")
+            if mt_target_weights is not None:
+                mt_target_weights = mt_target_weights.detach().to(device="cpu")
+            dt_dense: float = time.time() - t0
+            return mt_target_preds, mt_target_labels, mt_target_weights, dt_dense
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/sparse_predict_module.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/sparse_predict_module.py
new file mode 100644
index 000000000..e3ec10415
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/sparse_predict_module.py
@@ -0,0 +1,106 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+
+"""
+TorchScript-friendly wrapper for the HSTU sparse path (CPU embedding lookup).
+
+``HSTUSparseScriptModule`` wraps :class:`HSTUSparseInferenceModule` and
+flattens the ``Dict[str, SequenceEmbedding]`` output into the parallel
+value/length dicts defined in :mod:`ts_types` so the boundary is composed
+entirely of TorchScript-supported types.
+"""
+
+from typing import Dict, Tuple
+
+import torch
+from generative_recommenders.dlrm_v3.inference.inference_modules import (
+    _NoCopyEmbeddingCollection,
+    HSTUSparseInferenceModule,
+)
+from generative_recommenders.dlrm_v3.inference.ts_types import (
+    flatten_seq_embeddings,
+    SeqEmbLengths,
+    SeqEmbValues,
+)
+from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig
+from torchrec.modules.embedding_configs import EmbeddingConfig
+from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
+
+
+class HSTUSparseScriptModule(torch.nn.Module):
+    """Script-friendly sparse module.
+
+    ``forward`` returns 5 tensors / dicts (no Python ``int`` scalars):
+
+    1. ``seq_emb_values``   ``Dict[str, Tensor]`` -- jagged embedding values.
+    2. ``seq_emb_lengths``  ``Dict[str, Tensor]`` -- per-feature lengths.
+    3. ``payload_features`` ``Dict[str, Tensor]`` -- side features.
+    4. ``uih_seq_lengths``  ``Tensor[B]``        -- UIH lengths.
+    5. ``num_candidates``   ``Tensor[B]``        -- candidate counts.
+
+    The dense module (or the C++ glue) recovers the ``int`` ``max_uih_len`` /
+    ``max_num_candidates`` values from these tensors via ``.max().item()``.
+    """
+
+    def __init__(
+        self,
+        table_config: Dict[str, EmbeddingConfig],
+        hstu_config: DlrmHSTUConfig,
+        use_no_copy_embedding_collection: bool = True,
+    ) -> None:
+        super().__init__()
+        self._sparse: HSTUSparseInferenceModule = HSTUSparseInferenceModule(
+            table_config=table_config,
+            hstu_config=hstu_config,
+        )
+        if use_no_copy_embedding_collection:
+            # Re-class the existing EmbeddingCollection so TorchScript picks up
+            # the no-copy ``forward`` override (matches the eager-only
+            # ``ec_patched_forward_wo_embedding_copy`` monkey-patch).
+            self._sparse._hstu_model._embedding_collection.__class__ = (
+                _NoCopyEmbeddingCollection
+            )
+
+    def forward(
+        self,
+        uih_features: KeyedJaggedTensor,
+        candidates_features: KeyedJaggedTensor,
+    ) -> Tuple[
+        SeqEmbValues,
+        SeqEmbLengths,
+        Dict[str, torch.Tensor],
+        torch.Tensor,
+        torch.Tensor,
+    ]:
+        (
+            seq_embeddings,
+            payload_features,
+            _max_uih_len,
+            uih_seq_lengths,
+            _max_num_candidates,
+            num_candidates,
+        ) = self._sparse(
+            uih_features=uih_features,
+            candidates_features=candidates_features,
+        )
+        seq_emb_values, seq_emb_lengths = flatten_seq_embeddings(seq_embeddings)
+        return (
+            seq_emb_values,
+            seq_emb_lengths,
+            payload_features,
+            uih_seq_lengths,
+            num_candidates,
+        )
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/inference_test.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/inference_test.py
new file mode 100644
index 000000000..948f10618
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/inference_test.py
@@ -0,0 +1,39 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+
+from generative_recommenders.common import gpu_unavailable
+from generative_recommenders.dlrm_v3.inference.main import main
+from hypothesis import given, settings, strategies as st, Verbosity
+
+
+class DLRMV3InferenceTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    @given(
+        world_size=st.sampled_from([1]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=1,
+        deadline=None,
+    )
+    def test_e2e(self, world_size: int) -> None:
+        os.environ["WORLD_SIZE"] = str(world_size)
+        main()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/test_scripted_parity.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/test_scripted_parity.py
new file mode 100644
index 000000000..34d0388ea
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/test_scripted_parity.py
@@ -0,0 +1,236 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+
+"""
+Numerical parity test: eager HSTU vs traced (sparse + dense) on a synthetic
+batch.
+
+The production deployment path (see ``end_to_end_test.py``) uses
+``torch.jit.trace``, not ``torch.jit.script``, for the HSTU sparse/dense
+wrappers. Tracing records the actual tensor ops executed during a forward
+pass and ignores source-level dispatch logic (HammerKernel enum,
+``is_fx_tracing()``, ``torch.autocast``, IntEnum branches) that scripting
+cannot compile. This unit test mirrors that path.
+
+Tolerances are deliberately loose because the traced path replaces the
+Triton fused kernels with PyTorch fallbacks and skips ``torch.autocast`` in
+the user-forward block; both can perturb low-order bits in bf16.
+"""
+
+import unittest
+from typing import Dict, List, Tuple
+
+import torch
+from generative_recommenders.common import gpu_unavailable, HammerKernel
+from generative_recommenders.dlrm_v3.configs import (
+    get_embedding_table_config,
+    get_hstu_configs,
+)
+from generative_recommenders.dlrm_v3.datasets.dataset import get_random_data
+from generative_recommenders.dlrm_v3.inference.dense_predict_module import (
+    HSTUDenseScriptModule,
+)
+from generative_recommenders.dlrm_v3.inference.sparse_predict_module import (
+    HSTUSparseScriptModule,
+)
+from generative_recommenders.dlrm_v3.inference.ts_types import (
+    SeqEmbLengths,
+    SeqEmbValues,
+)
+from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
+
+
+_DATASET = "kuairand-1k"
+
+
+def _move_dense_inputs(
+    seq_emb_values: Dict[str, torch.Tensor],
+    seq_emb_lengths: Dict[str, torch.Tensor],
+    payload_features: Dict[str, torch.Tensor],
+    uih_seq_lengths: torch.Tensor,
+    num_candidates: torch.Tensor,
+    device: torch.device,
+) -> Tuple[
+    Dict[str, torch.Tensor],
+    Dict[str, torch.Tensor],
+    Dict[str, torch.Tensor],
+    torch.Tensor,
+    torch.Tensor,
+]:
+    """C++-side ``move_sparse_output_to_device`` analog for the test."""
+    return (
+        {k: v.to(device).to(torch.bfloat16) for k, v in seq_emb_values.items()},
+        {k: v.to(device) for k, v in seq_emb_lengths.items()},
+        {k: v.to(device) for k, v in payload_features.items()},
+        uih_seq_lengths.to(device),
+        num_candidates.to(device),
+    )
+
+
+class _SparseTraceShim(torch.nn.Module):
+    """Adapter that takes raw tensors and rebuilds the KJTs inside forward.
+
+    ``torch.jit.trace`` does not accept ``KeyedJaggedTensor`` (or any
+    non-Tensor / non-collection-of-Tensor type) as a top-level forward
+    input, so we make the traced boundary tensor-only and bake the
+    ``List[str]`` of feature keys in as module attributes.
+    """
+
+    def __init__(
+        self,
+        sparse_module: HSTUSparseScriptModule,
+        uih_keys: List[str],
+        candidates_keys: List[str],
+    ) -> None:
+        super().__init__()
+        self._sparse_module: HSTUSparseScriptModule = sparse_module
+        self._uih_keys: List[str] = uih_keys
+        self._candidates_keys: List[str] = candidates_keys
+
+    def forward(
+        self,
+        uih_lengths: torch.Tensor,
+        uih_values: torch.Tensor,
+        candidates_lengths: torch.Tensor,
+        candidates_values: torch.Tensor,
+    ) -> Tuple[
+        SeqEmbValues,
+        SeqEmbLengths,
+        Dict[str, torch.Tensor],
+        torch.Tensor,
+        torch.Tensor,
+    ]:
+        uih_kjt = KeyedJaggedTensor(
+            keys=self._uih_keys,
+            lengths=uih_lengths,
+            values=uih_values,
+        )
+        candidates_kjt = KeyedJaggedTensor(
+            keys=self._candidates_keys,
+            lengths=candidates_lengths,
+            values=candidates_values,
+        )
+        return self._sparse_module(
+            uih_features=uih_kjt, candidates_features=candidates_kjt
+        )
+
+
+class HSTUScriptedParityTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    def test_scripted_matches_eager(self) -> None:
+        torch.manual_seed(0)
+        device = torch.device("cuda:0")
+        torch.cuda.set_device(device)
+
+        hstu_config = get_hstu_configs(_DATASET)
+        table_config = get_embedding_table_config(_DATASET)
+
+        # Some embedding tables in kuairand-1k are tiny (e.g.
+        # user_active_degree has num_embeddings=8). Clamp the random value
+        # range so every index stays in range for every table; otherwise the
+        # default value_bound=1000 triggers an out-of-range embedding lookup.
+        min_rows = min(t.num_embeddings for t in table_config.values())
+        value_bound = max(2, min_rows)
+
+        uih_kjt, candidates_kjt = get_random_data(
+            contexual_features=list(
+                hstu_config.contextual_feature_to_max_length.keys()
+            ),
+            hstu_uih_keys=hstu_config.hstu_uih_feature_names,
+            hstu_candidates_keys=hstu_config.hstu_candidate_feature_names,
+            uih_max_seq_len=128,
+            max_num_candidates=hstu_config.max_num_candidates_inference,
+            value_bound=value_bound,
+        )
+
+        sparse_module = HSTUSparseScriptModule(
+            table_config=table_config,
+            hstu_config=hstu_config,
+            use_no_copy_embedding_collection=True,
+        ).eval()
+        dense_module = (
+            HSTUDenseScriptModule(
+                hstu_config=hstu_config,
+                table_config=table_config,
+            )
+            .to(torch.bfloat16)
+            .to(device)
+            .eval()
+        )
+
+        # Pin the HammerKernel to PyTorch on both wrappers. The Triton
+        # kernels use Python-level dispatch (autotune, constexpr arguments)
+        # that interacts badly with torch.jit.trace's recording pass. The
+        # eager reference run uses the same setting so the comparison is
+        # apples-to-apples.
+        sparse_module._sparse._hstu_model.set_hammer_kernel(HammerKernel.PYTORCH)
+        dense_module._hstu_model.set_hammer_kernel(HammerKernel.PYTORCH)
+
+        # === Eager reference path ===
+        with torch.no_grad():
+            sparse_out_e = sparse_module(
+                uih_features=uih_kjt, candidates_features=candidates_kjt
+            )
+            dense_inputs_e = _move_dense_inputs(*sparse_out_e, device=device)
+            preds_eager = dense_module(*dense_inputs_e)
+
+        # === Traced path ===
+        # Sparse is traced via a raw-tensor shim because KJT is not a valid
+        # traced input. Dense is traced directly with the eager sparse
+        # output as the example.
+        sparse_shim = _SparseTraceShim(
+            sparse_module=sparse_module,
+            uih_keys=list(uih_kjt.keys()),
+            candidates_keys=list(candidates_kjt.keys()),
+        )
+        traced_sparse = torch.jit.trace(
+            sparse_shim,
+            example_inputs=(
+                uih_kjt.lengths(),
+                uih_kjt.values(),
+                candidates_kjt.lengths(),
+                candidates_kjt.values(),
+            ),
+            strict=False,
+            check_trace=False,
+        )
+        traced_dense = torch.jit.trace(
+            dense_module,
+            example_inputs=tuple(dense_inputs_e),
+            strict=False,
+            check_trace=False,
+        )
+
+        with torch.no_grad():
+            sparse_out_t = traced_sparse(
+                uih_kjt.lengths(),
+                uih_kjt.values(),
+                candidates_kjt.lengths(),
+                candidates_kjt.values(),
+            )
+            dense_inputs_t = _move_dense_inputs(*sparse_out_t, device=device)
+            preds_traced = traced_dense(*dense_inputs_t)
+
+        torch.testing.assert_close(
+            preds_eager.float(),
+            preds_traced.float(),
+            atol=1e-2,
+            rtol=1e-2,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/.clang-format b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/.clang-format
new file mode 100644
index 000000000..f08c9c2c8
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/.clang-format
@@ -0,0 +1,2 @@
+BasedOnStyle: Google
+Standard: Cpp11
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/CMakeLists.txt b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/CMakeLists.txt
new file mode 100644
index 000000000..4fec0e44f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/CMakeLists.txt
@@ -0,0 +1,113 @@
+cmake_minimum_required(VERSION 3.12)
+
+project(mlperf_loadgen)
+
+# Read the version file
+file(READ "${CMAKE_SOURCE_DIR}/VERSION.txt" VERSION_CONTENTS)
+
+# Extract the major, minor, and patch versions from the VERSION file (assuming "MAJOR.MINOR.PATCH" format)
+string(REGEX MATCH "^([0-9]+)\\.([0-9]+)\\.([0-9]+)" VERSION_MATCH ${VERSION_CONTENTS})
+
+# Set the variables for the major, minor, and patch versions
+set(mlperf_loadgen_VERSION_MAJOR "${CMAKE_MATCH_1}")
+set(mlperf_loadgen_VERSION_MINOR "${CMAKE_MATCH_2}")
+set(mlperf_loadgen_VERSION_PATCH "${CMAKE_MATCH_3}")
+
+# Check if the version format was parsed correctly
+if(NOT DEFINED mlperf_loadgen_VERSION_MAJOR OR NOT DEFINED mlperf_loadgen_VERSION_MINOR OR NOT DEFINED mlperf_loadgen_VERSION_PATCH)
+    message(FATAL_ERROR "Version format in VERSION.txt is incorrect. Expected format: MAJOR.MINOR.PATCH")
+endif()
+
+# Print out the version
+message("mlperf_loadgen v${mlperf_loadgen_VERSION_MAJOR}.${mlperf_loadgen_VERSION_MINOR}.${mlperf_loadgen_VERSION_PATCH}")
+
+# Set build options. NB: CXX_STANDARD is supported since CMake 3.1.
+if (NOT MSVC)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -W -Wall")
+endif()
+# Extra build options can be specified by setting the MLPERF_LOADGEN_CXX_FLAGS variable
+if (MLPERF_LOADGEN_CXX_FLAGS)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MLPERF_LOADGEN_CXX_FLAGS}")
+endif()
+message(STATUS "Using C++ compiler flags: ${CMAKE_CXX_FLAGS}")
+set(CMAKE_CXX_STANDARD "14")
+message(STATUS "Using C++ standard: ${CMAKE_CXX_STANDARD}")
+message(STATUS "Using static linker flags: ${CMAKE_STATIC_LINKER_FLAGS}")
+message(STATUS "Using shared linker flags: ${CMAKE_SHARED_LINKER_FLAGS}")
+
+# Output directory for libraries.
+set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR})
+message(STATUS "Using output path: ${LIBRARY_OUTPUT_PATH}")
+
+# Detect Python to use for generating source file with version info.
+# NB: PythonInterp has been deprecated since CMake 3.12
+# but it works with earlier versions of CMake.
+find_package(PythonInterp)
+message(STATUS "Using Python interpreter: ${PYTHON_EXECUTABLE}")
+
+# Specify the source and destination files
+set(CONF_FILE "mlperf.conf")
+set(HEADER_FILE "mlperf_conf.h")
+
+# Read the content of the configuration file
+file(READ ${CONF_FILE} CONF_CONTENTS)
+
+# Escape all double quotes and backslashes
+string(REPLACE "\\" "\\\\" CONF_CONTENTS "${CONF_CONTENTS}")
+string(REPLACE "\"" "\\\"" CONF_CONTENTS "${CONF_CONTENTS}")
+
+# Handle new lines
+string(REPLACE "\n" "\\n\"\n\"" CONF_CONTENTS "${CONF_CONTENTS}")
+
+# Wrap the content in a C++ string declaration
+set(FORMATTED_CONTENT "const char* mlperf_conf =\n\"${CONF_CONTENTS}\";\n")
+
+# Write the formatted content to the header file
+file(WRITE ${HEADER_FILE} "${FORMATTED_CONTENT}")
+
+message(STATUS "Output config:  ${CMAKE_BINARY_DIR}/mlperf_conf.h")
+
+# Generate source file with version info.
+execute_process(COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/version_generator.py ${CMAKE_BINARY_DIR}/version_generated.cc ${CMAKE_CURRENT_SOURCE_DIR})
+
+# Add source files.
+set(SOURCE
+  ${CMAKE_CURRENT_SOURCE_DIR}/bindings/c_api.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/bindings/c_api.cc
+  ${CMAKE_CURRENT_SOURCE_DIR}/early_stopping.cc  
+  ${CMAKE_CURRENT_SOURCE_DIR}/issue_query_controller.cc
+  ${CMAKE_CURRENT_SOURCE_DIR}/loadgen.cc
+  ${CMAKE_CURRENT_SOURCE_DIR}/logging.cc
+  ${CMAKE_CURRENT_SOURCE_DIR}/logging.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/test_settings_internal.cc
+  ${CMAKE_CURRENT_SOURCE_DIR}/test_settings_internal.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/utils.cc
+  ${CMAKE_CURRENT_SOURCE_DIR}/utils.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/results.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/results.cc
+  ${CMAKE_CURRENT_SOURCE_DIR}/version.cc
+  ${CMAKE_CURRENT_SOURCE_DIR}/version.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/mlperf_conf.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/VERSION.txt
+  ${CMAKE_BINARY_DIR}/version_generated.cc
+)
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
+add_library(mlperf_loadgen STATIC ${SOURCE})
+target_link_libraries(mlperf_loadgen)
+
+if(WIN32)
+set (LIBS "")
+else()
+set (LIBS pthread)
+endif()
+
+add_executable(benchmark benchmark/repro.cpp)
+target_link_libraries(benchmark PUBLIC mlperf_loadgen ${LIBS})
+
+# Install library and headers.
+install(TARGETS mlperf_loadgen
+	DESTINATION ${CMAKE_INSTALL_PREFIX}/lib)
+install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/
+	DESTINATION ${CMAKE_INSTALL_PREFIX}/include FILES_MATCHING PATTERN "*.h")
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/MANIFEST.in b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/MANIFEST.in
new file mode 100644
index 000000000..152b53111
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/MANIFEST.in
@@ -0,0 +1,2 @@
+include VERSION.txt
+include mlperf.conf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README.md
new file mode 100644
index 000000000..212c8a53c
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README.md
@@ -0,0 +1,223 @@
+# Overview {#mainpage}
+
+## Introduction
+
+* The LoadGen is a *reusable* module that *efficiently* and *fairly* measures
+  the performance of inference systems.
+* It generates traffic for scenarios as formulated by a diverse set of experts
+  in the [MLCommons working group](https://mlcommons.org/).
+* The scenarios emulate the workloads seen in mobile devices,
+  autonomous vehicles, robotics, and cloud-based setups.
+* Although the LoadGen is not model or dataset aware, its strength is in its
+  reusability with logic that is.
+
+## Integration Example and Flow
+The following is an diagram of how the LoadGen can be integrated into an
+inference system, resembling how some of the MLPerf reference models are
+implemented.
+<div style="display:flex; flex-flow:row wrap; justify-content: space-evenly;">
+<img src="https://raw.githubusercontent.com/mlcommons/inference/master/loadgen/loadgen_integration_diagram.svg" width="500px" style="padding: 20px">
+<ol style="padding: 20px">
+<li>Benchmark knows the model, dataset, and preprocessing.</li>
+<li>Benchmark hands dataset sample IDs to LoadGen.</li>
+<li>LoadGen starts generating queries of sample IDs.</li>
+<li>Benchmark creates requests to backend.</li>
+<li>Result is post processed and forwarded to LoadGen.</li>
+<li>LoadGen outputs logs for analysis.<br>
+</ol>
+</div>
+
+## Useful Links
+* [FAQ](README_FAQ.md)
+* [LoadGen Build Instructions](README_BUILD.md)
+* [LoadGen API](loadgen.h)
+* [Test Settings](test_settings.h) -
+  A good description of available scenarios, modes, and knobs.
+* [MLPerf Inference Code](https://github.com/mlcommons/inference) -
+  Includes source for the LoadGen and reference models that use the LoadGen.
+* [MLPerf Inference Rules](https://github.com/mlcommons/inference_policies) -
+  Any mismatch with this is a bug in the LoadGen.
+
+## Scope of the LoadGen's Responsibilities
+
+### In Scope
+* **Provide a reusable** C++ library with python bindings.
+* **Implement** the traffic patterns of the MLPerf Inference scenarios and
+  modes.
+* **Record** all traffic generated and received for later analysis and
+  verification.
+* **Summarize** the results and whether performance constraints were met.
+* **Target high-performance** systems with efficient multi-thread friendly
+  logging utilities.
+* **Generate trust** via a shared, well-tested, and community-hardened
+  code base.
+
+### Out of Scope
+The LoadGen is:
+* **NOT** aware of the ML model it is running against.
+* **NOT** aware of the data formats of the model's inputs and outputs.
+* **NOT** aware of how to score the accuracy of a model's outputs.
+* **NOT** aware of MLPerf rules regarding scenario-specific constraints.
+
+Limitting the scope of the LoadGen in this way keeps it reusable across
+different models and datasets without modification. Using composition and
+dependency injection, the user can define their own model, datasets, and
+metrics.
+
+Additionally, not hardcoding MLPerf-specific test constraints, like test
+duration and performance targets, allows users to use the LoadGen unmodified
+for custom testing and continuous integration purposes.
+
+## Submission Considerations
+
+### Upstream all local modifications
+* As a rule, no local modifications to the LoadGen's C++ library are allowed
+for submission.
+* Please upstream early and often to keep the playing field level.
+
+### Choose your TestSettings carefully!
+* Since the LoadGen is oblivious to the model, it can't enforce the MLPerf
+requirements for submission. *e.g.:* target percentiles and latencies.
+* For verification, the values in TestSettings are logged.
+* To help make sure your settings are spec compliant, use
+TestSettings::FromConfig in conjunction with the relevant config file provided
+with the reference models.
+
+## Responsibilities of a LoadGen User
+
+### Implement the Interfaces
+* Implement the SystemUnderTest and QuerySampleLibrary interfaces and pass
+  them to the StartTest function.
+* Call QuerySampleComplete for every sample received by
+  SystemUnderTest::IssueQuery.
+
+### Assess Accuracy
+* Process the *mlperf_log_accuracy.json* output by the LoadGen to determine
+  the accuracy of your system.
+* For the official models, Python scripts will be provided by the MLPerf model
+  owners for you to do this automatically.
+
+For templates of how to do the above in detail, refer to code for the demos,
+tests, and reference models.
+
+
+## LoadGen over the Network
+
+For reference, on a high level a submission looks like this:
+
+<div align="center" style="display:flex; flex-flow:row wrap; justify-content: space-evenly;">
+<img src="https://raw.githubusercontent.com/mlcommons/inference/master/loadgen/diagram_submission.png" width="300px" style="padding: 20px">
+</div>
+
+The LoadGen implementation is common to all submissions, while the QSL (“Query Sample Library”) and SUT (“System Under Test”) are implemented by submitters. QSL is responsible for loading the data and includes untimed preprocessing.
+
+A submission over the network introduces a new component “QDL” (query dispatch library) that is added to the system as presented in the following diagram:
+
+<div align="center" style="display:flex; flex-flow:row wrap; justify-content: space-evenly;">
+<img src="https://raw.githubusercontent.com/mlcommons/inference/master/loadgen/diagram_network_submission.png" width="300px" style="padding: 20px">
+</div>
+
+QDL is a proxy for a load-balancer, that dispatches queries to SUT over a physical network, receives the responses and passes them back to LoadGen.  It is implemented by the submitter. The interface of the QDL is the same as the API to SUT. 
+
+In scenarios using QDL, data may be compressed in QSL at the choice of the submitter in order to reduce network transmission time. Decompression is part of the timed processing in SUT. A set of approved standard compression schemes will be specified for each benchmark; additional compression schemes must be approved in advance by the Working Group.
+
+All communication between LoadGen/QSL and SUT is via QDL, and all communication between QDL and SUT must pass over a physical network.
+
+QDL implements the protocol to transmit queries over the network and receive responses. It also implements decompression of any response returned by the SUT, where compression of responses is allowed. Performing any part of the timed preprocessing or inference in QDL is specifically disallowed. Currently no batching is allowed in QDL, although this may be revisited in future.
+
+The MLperf over the Network will run in Server mode and Offline mode. All LoadGen modes are expected to work as is with insignificant changes. These include running the test in performance mode, accuracy mode, find peak performance mode and compliance mode. The same applies for power measurements.
+
+### QDL details
+The Query Dispatch Library is implemented by the submitter and interfaces with LoadGen using the same SUT API. All MLPerf Inference SUTs implement the `mlperf::SystemUnderTest` class which is defined in system_under_test.h. The QDL implements `mlperf::QueryDispatchLibrary` class which inherits the `mlperf::SystemUnderTest` class and has the same API and support all existing `mlperf::SystemUnderTest` methods. It has a separate header file query_dispatch_library.h. Using sut with `mlperf::SystemUnderTest` class in LoadGen StartTest is natively upcasting `mlperf::QueryDispatchLibrary` class.
+
+#### QDL Query issue and response over the network
+
+The QDL gets the queries from the LoadGen through 
+```CPP
+void IssueQuery(const std::vector<QuerySample>& samples)
+```
+
+The QDL dispatches the queries to the SUT over the physical media. The exact method and implementation for it are submitter specific and would not be specified at MLCommons. Submitter implementation includes all methods required to serialize the query, load balance, drive it to the Operating system and network interface card and send to the SUT.
+
+The QDL receives the query responses over the network from the SUT. The exact method and implementation for it are submitter specific and would not be specified at MLCommons. The submitter implementation includes all methods required to receive the network data from the Network Interface card, go through the Operating system, deserialize the query response, and provide it back to the LoadGen through query completion by:
+
+```CPP
+struct QuerySampleResponse {
+  ResponseId id;
+  uintptr_t data;
+  size_t size;
+};
+void QuerySamplesComplete(QuerySampleResponse* responses, 
+                          size_t response_count);
+
+```
+
+#### QDL Additional Methods
+
+In addition to that the QDL needs to implement the following methods that are provided by the SUT interface to the LoadGen:
+```CPP
+const std::string& Name();
+```
+The `Name` function returns a known string for over the Network SUTs to identify it as over the network benchmark.
+```CPP
+void FlushQueries();
+```
+
+It is not specified here how the QDL would query and configure the SUT to execute the above methods. The QDL responds to the LoadGen after receiving its own response from the SUT.
+
+### Example
+
+Refer to [LON demo](demos/lon) for a reference example illustrating usage of Loadgen over the network.
+
+## Find Peak Performance Mode
+
+The Find Peak Performance mode can be used to find the optimal queries per second (QPS) for the server scenario.
+
+### Setup
+
+You can setup loadgen to run this mode by setting the `mode` variable in the `test_settings` used to run the test. Using the Python API:
+
+```python
+settings = mlperf_loadgen.TestSettings()
+settings.server_target_qps = 100
+settings.scenario = mlperf_loadgen.TestScenario.Server
+settings.mode = mlperf_loadgen.TestMode.FindPeakPerformance
+...
+
+mlperf_loadgen.StartTest(sut, qsl, settings)
+```
+
+Using the C/C++ API:
+```CPP
+mlperf::TestSettings settings;
+setting.server_target_qps = 100;
+settings.scenario = mlperf::TestScenario::Server;
+settings.mode = mlperf::TestMode::FindPeakPerformance;
+mlperf::LogSettings log_settings;
+/*
+Construct QSL and SUT
+*/
+mlperf::StartTest(&sut, &qsl, settings, log_settings);
+```
+
+**Note:** Make sure you are setting the TestScenario to server and you are providing an initial target QPS.
+
+### Description
+
+The Find Peak Performance mode works by finding a lower and upper boundary for the optimal QPS. Then performing a binary search between the lower and upper bound to find the optimal QPS.
+
+#### Finding lower and upper boundary
+
+LoadGen begins by running performance mode at the specified target QPS. If the test passes, this value is used as the lower bound; otherwise, an error is raised. The algorithm then guesses the upper bound as twice the target QPS. 
+
+Then LoadGen will run performance mode using the upper bound guess. If the test is successful, both the lower bound and upper bound will be doubled. This repeats until the upper bound guess fails the test.
+
+```
+[initial_target_qps, 2*initial_target_qps] -> [2*initial_target_qps, 4*initial_target_qps] -> [4*initial_target_qps, 8*initial_target_qps]...
+```
+
+Finally, the final lower bound and upper bound are set to their current values. This process assures that the lower bound passes the performance mode, but the upper bound doesn’t.
+
+#### Binary Search
+
+Once the lower and upper bounds are set, binary search can be performed over the range `[lower, upper]`` to find the optimal QPS. If a given QPS fails in performance mode, the optimal value lies below it; if it passes, the optimal is higher.
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_BUILD.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_BUILD.md
new file mode 100644
index 000000000..499cc360a
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_BUILD.md
@@ -0,0 +1,47 @@
+# Building the LoadGen {#ReadmeBuild}
+
+## Prerequisites
+
+    sudo apt-get install libglib2.0-dev python-pip python3-pip
+    pip2 install absl-py numpy
+    pip3 install absl-py numpy
+
+## Quick Start
+### Installation - Python
+
+    pip install absl-py numpy
+    git clone --recurse-submodules https://github.com/mlcommons/inference.git mlperf_inference
+    cd mlperf_inference/loadgen
+    CFLAGS="-std=c++14 -O3" python -m pip install .
+
+This will fetch the loadgen source, build and install the loadgen as a python module, and run a simple end-to-end demo.
+
+Alternatively, we provide wheels for several python versions and operating system that can be installed using pip directly.
+
+    pip install mlperf-loadgen
+
+**NOTE:** Take into account that we only update the published wheels after an official release, they may not include the latest changes.
+
+### Testing your Installation
+The following command will run a simple end-to-end demo:
+
+    python mlperf_inference/loadgen/demos/py_demo_single_stream.py
+
+A summary of the test results can be found in the *"mlperf_log_summary.txt"* logfile.
+
+For a timeline visualization of what happened during the test, open the *"mlperf_log_trace.json"* file in Chrome:
+* Type “chrome://tracing” in the address bar, then drag-n-drop the json.
+* This may be useful for SUT performance tuning and understanding + debugging the loadgen.
+
+### Installation - C++
+To build the loadgen as a C++ library, rather than a python module:
+
+    git clone https://github.com/mlcommons/inference.git mlperf_inference
+    cd mlperf_inference
+    mkdir loadgen/build/ && cd loadgen/build/
+    cmake .. && cmake --build .
+    cp libmlperf_loadgen.a ..
+
+## Quick start: Loadgen Over the Network
+
+Refer to [LON demo](demos/lon/README.md) for a basic example.
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_FAQ.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_FAQ.md
new file mode 100644
index 000000000..ab4e0c75d
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_FAQ.md
@@ -0,0 +1,78 @@
+# LoadGen FAQ {#ReadmeFAQ}
+
+## Q: The LoadGen does not match the MLPerf specification. Who is right?
+**A:**
+The MLPerf spec is *always* right.
+Please file a LoadGen bug so it may be resolved.
+
+## Q: How can I file a bug?
+**A:**
+On GitHub: https://github.com/mlcommons/inference/issues/new
+
+## Q: Can I make local modifications to the LoadGen for submission?
+**A:**
+No. To keep the playing field level, please upstream any local
+modificiations you need to make. Ideally upstream such changes behind a runtime
+flag or via an abstract interface the client can implement. This will help
+with testability.
+
+## Q: Where can I find the results of a test?
+**A:**
+By default, the loadgen will output an *mlperf_log_summary.txt* file
+that summarizes the target metrics and constraints of the test, along with
+other stats about the run.
+
+*Note:* LogSettings also has a flag to forward the results to stdout and
+there's an outstanding TODO to make this more programmable.
+
+## Q: The reference implementation for \<*some_model*\> prints out results of its own. Are those for submission?
+**A:**
+They are not. The LoadGen results are the ground truth for submission
+results since they will work even for systems that forgo the python bindings.
+If you notice a bug in the LoadGen's results, please file a bug or submit a
+patch.
+
+## Q: I'm getting linker errors for LoadgenVersion definitions. Where is *version_generated.cc*?
+**A:**
+If you have a custom build setup, make sure you run the *version_generator.py*
+script, which will create the cc file you are looking for. The official build
+files that come with the LoadGen do this for you out of the box.
+
+## Q: What is this *version_generator.py* script?
+**A:**
+The LoadGen records git stats (if available) and the SHA1 of all its
+source files (always) at build time for verification purposes. This is easy
+to circumvent, but try your best to run *version_generator.py* correctly;
+ideally integrated with your build system if you have a custom build.
+The intention is more to help with debugging efforts and detect accidental
+version missmatches than to detect bad actors.
+
+## Q: How do I view the *mlperf_log_trace.json* file?
+**A:**
+This file uses the [Trace Event Format]
+(https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/edit)
+to record a timeline of all the threads involved.
+You can view the file by typing [chrome://tracing](chrome://tracing) into
+Chrome's address bar and dragging the json file there.
+This file zips well and you can drag the zip file directly into
+[chrome://tracing](chrome://tracing) too.
+Please include zipped traces (and the other logs) when filing bug reports.
+
+## Q: Why is the code littered with so many lambdas? My eyes hurt.
+**A:**
+Lambdas are a convenient and efficient way to ship arbitrary data + deferred
+logic over to the logging thread without much boilerplate.
+Much of the loadgen is built on top of the logging utilities.
+Thus the lambdas. (Sorry about the eyes.)
+
+## Q: What C++ version does the LoadGen target?
+**A:**
+It currently targets and requires C++14. It should compile with recent
+versions of clang, gcc, and msvc.
+
+## Q: What dependencies does the LoadGen code have?
+**A:**
+The C++ code has no external dependencies. The loadgen itself, logging
+utilities, and unit test utilities are built solely on the C++ Standard Library.
+The python bindings, however, do require
+[pybind11](https://github.com/pybind/pybind11).
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/VERSION.txt b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/VERSION.txt
new file mode 100644
index 000000000..ac14c3dfa
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/VERSION.txt
@@ -0,0 +1 @@
+5.1.1
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/.gitignore b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/.gitignore
new file mode 100644
index 000000000..e792c8e55
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/.gitignore
@@ -0,0 +1,2 @@
+loadgen_build
+build
\ No newline at end of file
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/README.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/README.md
new file mode 100644
index 000000000..24e872983
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/README.md
@@ -0,0 +1,10 @@
+Note: please install jemalloc first. See: http://jemalloc.net/
+Command: bash run.sh <target_qps> <0=Basic,1=Queue> <numCompleteThreads> <maxSizeInComplete> <server_coalesce_queries=0or1>
+
+Experiments:
+- On Intel(R) Xeon(R) CPU E5-1650 v4 @ 3.60GHz
+- Basic SUT : 500-600k i/s
+- Basic SUT + jemalloc: 800-900k i/s (`bash run.sh 800000 0`)
+- Queued SUT (2 complete threads) + jemalloc: 1.2-1.3M i/s (`bash run.sh 1200000 1 2 2048`)
+- Queued SUT (2 complete threads) + jemalloc + server_coalesce_queries: 1.4-1.5M is/ (`bash run.sh 1400000 1 2 512 1`)
+- Basic SUT + jemalloc + server_coalesce_queries + 4 IssueQueryThreads: 2.4-2.5M is/ (`bash run.sh 2400000 0 2 512 1 4`)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/repro.cpp b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/repro.cpp
new file mode 100644
index 000000000..44ff53efa
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/repro.cpp
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cassert>
+#include <condition_variable>
+#include <deque>
+#include <iostream>
+#include <map>
+#include <mutex>
+#include <thread>
+#include <vector>
+
+#include "loadgen.h"
+#include "query_sample_library.h"
+#include "system_under_test.h"
+#include "test_settings.h"
+
+class QSL : public mlperf::QuerySampleLibrary {
+ public:
+  ~QSL() override{};
+  const std::string& Name() override { return mName; }
+  size_t TotalSampleCount() override { return 1000000; }
+  size_t PerformanceSampleCount() override { return TotalSampleCount(); }
+  void LoadSamplesToRam(const std::vector<mlperf::QuerySampleIndex>&) override {
+  }
+  void UnloadSamplesFromRam(
+      const std::vector<mlperf::QuerySampleIndex>&) override {}
+
+ private:
+  std::string mName{"Dummy QSL"};
+};
+
+class BasicSUT : public mlperf::SystemUnderTest {
+ public:
+  BasicSUT() {
+    // Start with some large value so that we don't reallocate memory.
+    initResponse(10000);
+  }
+  ~BasicSUT() override {}
+  const std::string& Name() override { return mName; }
+  void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
+    size_t n = samples.size();
+    if (n > mResponses.size()) {
+      std::cerr << "Warning: reallocating response buffer in BasicSUT. Maybe "
+                   "you should initResponse with larger value!?"
+                << std::endl;
+      initResponse(samples.size());
+    }
+    for (size_t i = 0; i < n; i++) {
+      mResponses[i].id = samples[i].id;
+    }
+    mlperf::QuerySamplesComplete(mResponses.data(), n);
+  }
+  void FlushQueries() override {}
+
+ private:
+  void initResponse(int size) {
+    mResponses.resize(size,
+                      {0, reinterpret_cast<uintptr_t>(&mBuf), sizeof(int)});
+  }
+  int mBuf{0};
+  std::string mName{"BasicSUT"};
+  std::vector<mlperf::QuerySampleResponse> mResponses;
+};
+
+class QueueSUT : public mlperf::SystemUnderTest {
+ public:
+  QueueSUT(int numCompleteThreads, int maxSize) {
+    // Each thread handle at most maxSize at a time.
+    std::cout << "QueueSUT: maxSize = " << maxSize << std::endl;
+    initResponse(numCompleteThreads, maxSize);
+    // Launch complete threads
+    for (int i = 0; i < numCompleteThreads; i++) {
+      mThreads.emplace_back(&QueueSUT::CompleteThread, this, i);
+    }
+  }
+  ~QueueSUT() override {
+    {
+      std::unique_lock<std::mutex> lck(mMtx);
+      mDone = true;
+      mCondVar.notify_all();
+    }
+    for (auto& thread : mThreads) {
+      thread.join();
+    }
+  }
+  const std::string& Name() override { return mName; }
+  void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
+    std::unique_lock<std::mutex> lck(mMtx);
+    for (const auto& sample : samples) {
+      mIdQueue.push_back(sample.id);
+    }
+    // Let some worker thread to consume tasks
+    mCondVar.notify_one();
+  }
+  void FlushQueries() override {}
+
+ private:
+  void CompleteThread(int threadIdx) {
+    auto& responses = mResponses[threadIdx];
+    size_t maxSize{responses.size()};
+    size_t actualSize{0};
+    while (true) {
+      {
+        std::unique_lock<std::mutex> lck(mMtx);
+        mCondVar.wait(lck, [&]() { return !mIdQueue.empty() || mDone; });
+
+        if (mDone) {
+          break;
+        }
+
+        actualSize = std::min(maxSize, mIdQueue.size());
+        for (size_t i = 0; i < actualSize; i++) {
+          responses[i].id = mIdQueue.front();
+          mIdQueue.pop_front();
+        }
+        mCondVar.notify_one();
+      }
+      mlperf::QuerySamplesComplete(responses.data(), actualSize);
+    }
+  }
+  void initResponse(int numCompleteThreads, int size) {
+    mResponses.resize(numCompleteThreads);
+    for (auto& responses : mResponses) {
+      responses.resize(size,
+                       {0, reinterpret_cast<uintptr_t>(&mBuf), sizeof(int)});
+    }
+  }
+  int mBuf{0};
+  std::string mName{"QueueSUT"};
+  std::vector<std::vector<mlperf::QuerySampleResponse>> mResponses;
+  std::vector<std::thread> mThreads;
+  std::deque<mlperf::ResponseId> mIdQueue;
+  std::mutex mMtx;
+  std::condition_variable mCondVar;
+  bool mDone{false};
+};
+
+class MultiBasicSUT : public mlperf::SystemUnderTest {
+ public:
+  MultiBasicSUT(int numThreads)
+      : mNumThreads(numThreads), mResponses(numThreads) {
+    // Start with some large value so that we don't reallocate memory.
+    initResponse(10000);
+    for (int i = 0; i < mNumThreads; ++i) {
+      mThreads.emplace_back(&MultiBasicSUT::startIssueThread, this, i);
+    }
+  }
+  ~MultiBasicSUT() override {
+    for (auto& thread : mThreads) {
+      thread.join();
+    }
+  }
+  const std::string& Name() override { return mName; }
+  void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
+    int thread_idx = mThreadMap[std::this_thread::get_id()];
+    size_t n = samples.size();
+    auto& reponses = mResponses[thread_idx];
+    if (n > reponses.size()) {
+      std::cout
+          << "Warning: reallocating response buffer in MultiBasicSUT. Maybe "
+             "you should initResponse with larger value!?"
+          << std::endl;
+      initResponse(samples.size());
+    }
+    for (size_t i = 0; i < n; i++) {
+      reponses[i].id = samples[i].id;
+    }
+    mlperf::QuerySamplesComplete(reponses.data(), n);
+  }
+  void FlushQueries() override {}
+
+ private:
+  void initResponse(int size) {
+    for (auto& responses : mResponses) {
+      responses.resize(size,
+                       {0, reinterpret_cast<uintptr_t>(&mBuf), sizeof(int)});
+    }
+  }
+  void startIssueThread(int thread_idx) {
+    {
+      std::lock_guard<std::mutex> lock(mMtx);
+      mThreadMap[std::this_thread::get_id()] = thread_idx;
+    }
+    mlperf::RegisterIssueQueryThread();
+  }
+  int mBuf{0};
+  int mNumThreads{0};
+  std::string mName{"MultiBasicSUT"};
+  std::vector<std::vector<mlperf::QuerySampleResponse>> mResponses;
+  std::mutex mMtx;
+  std::vector<std::thread> mThreads;
+  std::map<std::thread::id, int> mThreadMap;
+};
+
+int main(int argc, char** argv) {
+  assert(argc >= 2 && "Need to pass in at least one argument: target_qps");
+  int target_qps = std::stoi(argv[1]);
+  std::cout << "target_qps = " << target_qps << std::endl;
+
+  bool useQueue{false};
+  int numCompleteThreads{4};
+  int maxSize{1};
+  bool server_coalesce_queries{false};
+  int num_issue_threads{0};
+  if (argc >= 3) {
+    useQueue = std::stoi(argv[2]) != 0;
+  }
+  if (argc >= 4) {
+    numCompleteThreads = std::stoi(argv[3]);
+  }
+  if (argc >= 5) {
+    maxSize = std::stoi(argv[4]);
+  }
+  if (argc >= 6) {
+    server_coalesce_queries = std::stoi(argv[5]) != 0;
+  }
+  if (argc >= 7) {
+    num_issue_threads = std::stoi(argv[6]);
+  }
+
+  QSL qsl;
+  std::unique_ptr<mlperf::SystemUnderTest> sut;
+
+  // Configure the test settings
+  mlperf::TestSettings testSettings;
+  testSettings.scenario = mlperf::TestScenario::Server;
+  testSettings.mode = mlperf::TestMode::PerformanceOnly;
+  testSettings.server_target_qps = target_qps;
+  testSettings.server_target_latency_ns = 10000000;  // 10ms
+  testSettings.server_target_latency_percentile = 0.99;
+  testSettings.min_duration_ms = 60000;
+  testSettings.min_query_count = 270000;
+  testSettings.server_coalesce_queries = server_coalesce_queries;
+  std::cout << "testSettings.server_coalesce_queries = "
+            << (server_coalesce_queries ? "True" : "False") << std::endl;
+  testSettings.server_num_issue_query_threads = num_issue_threads;
+  std::cout << "num_issue_threads = " << num_issue_threads << std::endl;
+
+  // Configure the logging settings
+  mlperf::LogSettings logSettings;
+  logSettings.log_output.outdir = "build";
+  logSettings.log_output.prefix = "mlperf_log_";
+  logSettings.log_output.suffix = "";
+  logSettings.log_output.prefix_with_datetime = false;
+  logSettings.log_output.copy_detail_to_stdout = false;
+  logSettings.log_output.copy_summary_to_stdout = true;
+  logSettings.log_mode = mlperf::LoggingMode::AsyncPoll;
+  logSettings.log_mode_async_poll_interval_ms = 1000;
+  logSettings.enable_trace = false;
+
+  // Choose SUT
+  if (num_issue_threads == 0) {
+    if (useQueue) {
+      std::cout << "Using QueueSUT with " << numCompleteThreads
+                << " complete threads" << std::endl;
+      sut.reset(new QueueSUT(numCompleteThreads, maxSize));
+    } else {
+      std::cout << "Using BasicSUT" << std::endl;
+      sut.reset(new BasicSUT());
+    }
+  } else {
+    if (useQueue) {
+      std::cout << "Using MultiQueueSUT with " << numCompleteThreads
+                << " complete threads" << std::endl;
+      std::cerr << "!!!! MultiQueueSUT is NOT implemented yet !!!!"
+                << std::endl;
+      return 1;
+      // sut.reset(new MultiQueueSUT(num_issue_threads, numCompleteThreads,
+      // maxSize));
+    } else {
+      std::cout << "Using MultiBasicSUT" << std::endl;
+      sut.reset(new MultiBasicSUT(num_issue_threads));
+    }
+  }
+
+  // Start test
+  std::cout << "Start test..." << std::endl;
+  mlperf::StartTest(sut.get(), &qsl, testSettings, logSettings);
+  std::cout << "Test done. Clean up SUT..." << std::endl;
+  sut.reset();
+  std::cout << "Done!" << std::endl;
+  return 0;
+}
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run.sh b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run.sh
new file mode 100644
index 000000000..62559c1a8
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/bash
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+echo "Building loadgen..."
+if [ ! -e loadgen_build ]; then mkdir loadgen_build; fi;
+cd loadgen_build && cmake ../.. && make -j && cd ..
+echo "Building test program..."
+if [ ! -e build ]; then mkdir build; fi;
+g++ --std=c++11 -O3 -I.. -o build/repro.exe repro.cpp -Lloadgen_build -lmlperf_loadgen -lpthread && \
+LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2 build/repro.exe $1 $2 $3 $4 $5 $6
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run_debug.sh b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run_debug.sh
new file mode 100644
index 000000000..ba63727c8
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run_debug.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/bash
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+echo "Building loadgen in Debug mode..."
+if [ ! -e loadgen_build ]; then mkdir loadgen_build; fi;
+cd loadgen_build && cmake -DCMAKE_BUILD_TYPE=Debug ../.. && make -j && cd ..
+echo "Building test program in Debug mode..."
+if [ ! -e build ]; then mkdir build; fi;
+g++ --std=c++11 -O0 -g -I.. -o build/repro.exe repro.cpp -Lloadgen_build -lmlperf_loadgen -lpthread && \
+gdb --args build/repro.exe $1 $2 $3 $4 $5 $6
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.cc
new file mode 100644
index 000000000..0248a1c16
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.cc
@@ -0,0 +1,176 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "c_api.h"
+
+#include <string>
+
+#include "../loadgen.h"
+#include "../query_sample.h"
+#include "../query_sample_library.h"
+#include "../system_under_test.h"
+#include "../test_settings.h"
+
+namespace mlperf {
+namespace c {
+namespace {
+
+// Forwards SystemUnderTest calls to relevant callbacks.
+class SystemUnderTestTrampoline : public SystemUnderTest {
+ public:
+  SystemUnderTestTrampoline(ClientData client_data, std::string name,
+                            IssueQueryCallback issue_cb,
+                            FlushQueriesCallback flush_queries_cb)
+      : client_data_(client_data),
+        name_(std::move(name)),
+        issue_cb_(issue_cb),
+        flush_queries_cb_(flush_queries_cb) {}
+  ~SystemUnderTestTrampoline() override = default;
+
+  const std::string& Name() override { return name_; }
+
+  void IssueQuery(const std::vector<QuerySample>& samples) override {
+    (*issue_cb_)(client_data_, samples.data(), samples.size());
+  }
+
+  void FlushQueries() override { (*flush_queries_cb_)(); }
+
+ private:
+  ClientData client_data_;
+  std::string name_;
+  IssueQueryCallback issue_cb_;
+  FlushQueriesCallback flush_queries_cb_;
+};
+
+}  // namespace
+
+void* ConstructSUT(ClientData client_data, const char* name, size_t name_length,
+                   IssueQueryCallback issue_cb,
+                   FlushQueriesCallback flush_queries_cb) {
+  SystemUnderTestTrampoline* sut = new SystemUnderTestTrampoline(
+      client_data, std::string(name, name_length), issue_cb, flush_queries_cb);
+  return reinterpret_cast<void*>(sut);
+}
+
+void DestroySUT(void* sut) {
+  SystemUnderTestTrampoline* sut_cast =
+      reinterpret_cast<SystemUnderTestTrampoline*>(sut);
+  delete sut_cast;
+}
+
+namespace {
+
+// Forwards QuerySampleLibrary calls to relevant callbacks.
+class QuerySampleLibraryTrampoline : public QuerySampleLibrary {
+ public:
+  QuerySampleLibraryTrampoline(
+      ClientData client_data, std::string name, size_t total_sample_count,
+      size_t performance_sample_count,
+      LoadSamplesToRamCallback load_samples_to_ram_cb,
+      UnloadSamplesFromRamCallback unload_samples_from_ram_cb)
+      : client_data_(client_data),
+        name_(std::move(name)),
+        total_sample_count_(total_sample_count),
+        performance_sample_count_(performance_sample_count),
+        load_samples_to_ram_cb_(load_samples_to_ram_cb),
+        unload_samples_from_ram_cb_(unload_samples_from_ram_cb) {}
+  ~QuerySampleLibraryTrampoline() override = default;
+
+  const std::string& Name() override { return name_; }
+  size_t TotalSampleCount() override { return total_sample_count_; }
+  size_t PerformanceSampleCount() override { return performance_sample_count_; }
+
+  void LoadSamplesToRam(const std::vector<QuerySampleIndex>& samples) override {
+    (*load_samples_to_ram_cb_)(client_data_, samples.data(), samples.size());
+  }
+  void UnloadSamplesFromRam(
+      const std::vector<QuerySampleIndex>& samples) override {
+    (*unload_samples_from_ram_cb_)(client_data_, samples.data(),
+                                   samples.size());
+  }
+
+ private:
+  ClientData client_data_;
+  std::string name_;
+  size_t total_sample_count_;
+  size_t performance_sample_count_;
+  LoadSamplesToRamCallback load_samples_to_ram_cb_;
+  UnloadSamplesFromRamCallback unload_samples_from_ram_cb_;
+};
+
+}  // namespace
+
+void* ConstructQSL(ClientData client_data, const char* name, size_t name_length,
+                   size_t total_sample_count, size_t performance_sample_count,
+                   LoadSamplesToRamCallback load_samples_to_ram_cb,
+                   UnloadSamplesFromRamCallback unload_samples_from_ram_cb) {
+  QuerySampleLibraryTrampoline* qsl = new QuerySampleLibraryTrampoline(
+      client_data, std::string(name, name_length), total_sample_count,
+      performance_sample_count, load_samples_to_ram_cb,
+      unload_samples_from_ram_cb);
+  return reinterpret_cast<void*>(qsl);
+}
+
+void DestroyQSL(void* qsl) {
+  QuerySampleLibraryTrampoline* qsl_cast =
+      reinterpret_cast<QuerySampleLibraryTrampoline*>(qsl);
+  delete qsl_cast;
+}
+
+// mlperf::c::StartTest just forwards to mlperf::StartTest after doing the
+// proper cast.
+void StartTest(void* sut, void* qsl, const TestSettings& settings,
+               const std::string& audit_config_filename = "audit.config") {
+  SystemUnderTestTrampoline* sut_cast =
+      reinterpret_cast<SystemUnderTestTrampoline*>(sut);
+  QuerySampleLibraryTrampoline* qsl_cast =
+      reinterpret_cast<QuerySampleLibraryTrampoline*>(qsl);
+  LogSettings default_log_settings;
+  mlperf::StartTest(sut_cast, qsl_cast, settings, default_log_settings,
+                    audit_config_filename);
+}
+
+void QuerySamplesComplete(QuerySampleResponse* responses,
+                          size_t response_count) {
+  mlperf::QuerySamplesComplete(responses, response_count);
+}
+
+void QuerySamplesCompleteResponseCb(QuerySampleResponse* responses,
+                                    size_t response_count,
+                                    ResponseCallback response_cb,
+                                    ClientData client_data) {
+  mlperf::QuerySamplesComplete(
+      responses, response_count,
+      [client_data, response_cb](QuerySampleResponse* response) {
+        response_cb(client_data, response);
+      });
+}
+
+void FirstTokenComplete(QuerySampleResponse* responses, size_t response_count) {
+  mlperf::FirstTokenComplete(responses, response_count);
+}
+
+void FirstTokenCompleteResponseCb(QuerySampleResponse* responses,
+                                  size_t response_count,
+                                  ResponseCallback response_cb,
+                                  ClientData client_data) {
+  mlperf::FirstTokenComplete(
+      responses, response_count,
+      [client_data, response_cb](QuerySampleResponse* response) {
+        response_cb(client_data, response);
+      });
+}
+
+void RegisterIssueQueryThread() { mlperf::RegisterIssueQueryThread(); }
+
+}  // namespace c
+}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.h
new file mode 100644
index 000000000..0ee44fb71
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.h
@@ -0,0 +1,95 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief A C API wrapping the C++ loadgen. Not tested. Needs work.
+/// \details The C API allows a C or Python client to easily create
+/// a SystemUnderTest without having to expose the SystemUnderTest class
+/// directly.
+/// ConstructSUT works with a bunch of function poitners instead that are
+/// called from an underlying trampoline class.
+
+#ifndef SYSTEM_UNDER_TEST_C_API_H_
+#define SYSTEM_UNDER_TEST_C_API_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "../query_sample.h"
+#include "../test_settings.h"
+
+namespace mlperf {
+
+namespace c {
+
+/// \brief Optional opaque client data that creators of SUTs and QSLs can have
+/// the loadgen pass back to their callback invocations.
+/// Helps avoids global variables.
+typedef uintptr_t ClientData;
+
+typedef void (*IssueQueryCallback)(ClientData, const QuerySample*, size_t);
+typedef void (*FlushQueriesCallback)();
+typedef void (*ResponseCallback)(ClientData, QuerySampleResponse*);
+
+/// \brief SUT calls this function to report query result back to loadgen
+void QuerySamplesComplete(QuerySampleResponse* responses,
+                          size_t response_count);
+
+void QuerySamplesCompleteResponseCb(QuerySampleResponse* responses,
+                                    size_t response_count,
+                                    ResponseCallback response_cb,
+                                    ClientData client_data);
+
+void FirstTokenComplete(QuerySampleResponse* responses, size_t response_count);
+
+void FirstTokenCompleteResponseCb(QuerySampleResponse* responses,
+                                  size_t response_count,
+                                  ResponseCallback response_cb,
+                                  ClientData client_data);
+
+/// \brief Create an opaque SUT pointer based on C callbacks.
+void* ConstructSUT(ClientData client_data, const char* name, size_t name_length,
+                   IssueQueryCallback issue_cb,
+                   FlushQueriesCallback flush_queries_cb);
+/// \brief Destroys the SUT created by ConstructSUT.
+void DestroySUT(void* sut);
+
+typedef void (*LoadSamplesToRamCallback)(ClientData, const QuerySampleIndex*,
+                                         size_t);
+typedef void (*UnloadSamplesFromRamCallback)(ClientData,
+                                             const QuerySampleIndex*, size_t);
+
+/// \brief Create an opaque QSL pointer based on C callbacks.
+void* ConstructQSL(ClientData client_data, const char* name, size_t name_length,
+                   size_t total_sample_count, size_t performance_sample_count,
+                   LoadSamplesToRamCallback load_samples_to_ram_cb,
+                   UnloadSamplesFromRamCallback unload_samples_from_ram_cb);
+/// \brief Destroys the QSL created by ConsructQSL.
+void DestroyQSL(void* qsl);
+
+/// \brief Run tests on a SUT created by ConstructSUT().
+/// \details This is the C entry point. See mlperf::StartTest for the C++ entry
+/// point.
+void StartTest(void* sut, void* qsl, const TestSettings& settings,
+               const std::string& audit_config_filename);
+
+///
+/// \brief Register a thread for query issuing in Server scenario.
+/// \details This is the C entry point. See mlperf::RegisterIssueQueryThread for
+/// the C++ entry point.
+///
+void RegisterIssueQueryThread();
+
+}  // namespace c
+}  // namespace mlperf
+
+#endif  // SYSTEM_UNDER_TEST_C_API_H_
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/python_api.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/python_api.cc
new file mode 100644
index 000000000..96396dab9
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/python_api.cc
@@ -0,0 +1,484 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Python bindings for the loadgen using pybind11.
+
+#ifndef PYTHON_BINDINGS_H
+#define PYTHON_BINDINGS_H
+
+#include <functional>
+
+#include "../loadgen.h"
+#include "../query_dispatch_library.h"
+#include "../query_sample.h"
+#include "../query_sample_library.h"
+#include "../system_under_test.h"
+#include "../test_settings.h"
+#include "pybind11/functional.h"
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+#include "pybind11/stl_bind.h"
+
+namespace mlperf {
+
+namespace {
+
+using IssueQueryCallback = std::function<void(std::vector<QuerySample>)>;
+using FastIssueQueriesCallback =
+    std::function<void(std::vector<ResponseId>, std::vector<QuerySampleIndex>)>;
+using FlushQueriesCallback = std::function<void()>;
+using NameCallback = std::function<std::string()>;
+
+// Forwards SystemUnderTest calls to relevant callbacks.
+class SystemUnderTestTrampoline : public SystemUnderTest {
+ public:
+  SystemUnderTestTrampoline(std::string name, IssueQueryCallback issue_cb,
+                            FlushQueriesCallback flush_queries_cb)
+      : name_(std::move(name)),
+        issue_cb_(issue_cb),
+        flush_queries_cb_(flush_queries_cb) {}
+  ~SystemUnderTestTrampoline() override = default;
+
+  const std::string& Name() override { return name_; }
+
+  void IssueQuery(const std::vector<QuerySample>& samples) override {
+    pybind11::gil_scoped_acquire gil_acquirer;
+    issue_cb_(samples);
+  }
+
+  void FlushQueries() override { flush_queries_cb_(); }
+
+ protected:
+  std::string name_;
+  IssueQueryCallback issue_cb_;
+  FlushQueriesCallback flush_queries_cb_;
+};
+
+class FastSystemUnderTestTrampoline : public SystemUnderTestTrampoline {
+ public:
+  FastSystemUnderTestTrampoline(std::string name,
+                                FastIssueQueriesCallback fast_issue_cb,
+                                FlushQueriesCallback flush_queries_cb)
+      : SystemUnderTestTrampoline(name, nullptr, flush_queries_cb),
+        fast_issue_cb_(fast_issue_cb) {}
+  ~FastSystemUnderTestTrampoline() override = default;
+
+  void IssueQuery(const std::vector<QuerySample>& samples) override {
+    pybind11::gil_scoped_acquire gil_acquirer;
+    std::vector<ResponseId> responseIds;
+    std::vector<QuerySampleIndex> querySampleIndices;
+    for (auto& s : samples) {
+      responseIds.push_back(s.id);
+      querySampleIndices.push_back(s.index);
+    }
+    fast_issue_cb_(responseIds, querySampleIndices);
+  }
+
+ private:
+  FastIssueQueriesCallback fast_issue_cb_;
+};
+
+using LoadSamplesToRamCallback =
+    std::function<void(std::vector<QuerySampleIndex>)>;
+using UnloadSamplesFromRamCallback =
+    std::function<void(std::vector<QuerySampleIndex>)>;
+
+// Forwards QuerySampleLibrary calls to relevant callbacks.
+class QuerySampleLibraryTrampoline : public QuerySampleLibrary {
+ public:
+  QuerySampleLibraryTrampoline(
+      std::string name, size_t total_sample_count,
+      size_t performance_sample_count,
+      LoadSamplesToRamCallback load_samples_to_ram_cb,
+      UnloadSamplesFromRamCallback unload_samples_from_ram_cb)
+      : name_(std::move(name)),
+        total_sample_count_(total_sample_count),
+        performance_sample_count_(performance_sample_count),
+        load_samples_to_ram_cb_(load_samples_to_ram_cb),
+        unload_samples_from_ram_cb_(unload_samples_from_ram_cb) {}
+  ~QuerySampleLibraryTrampoline() override = default;
+
+  const std::string& Name() override { return name_; }
+  size_t TotalSampleCount() { return total_sample_count_; }
+  size_t PerformanceSampleCount() { return performance_sample_count_; }
+
+  void LoadSamplesToRam(const std::vector<QuerySampleIndex>& samples) override {
+    pybind11::gil_scoped_acquire gil_acquirer;
+    load_samples_to_ram_cb_(samples);
+  }
+  void UnloadSamplesFromRam(
+      const std::vector<QuerySampleIndex>& samples) override {
+    pybind11::gil_scoped_acquire gil_acquirer;
+    unload_samples_from_ram_cb_(samples);
+  }
+
+ private:
+  std::string name_;
+  size_t total_sample_count_;
+  size_t performance_sample_count_;
+  LoadSamplesToRamCallback load_samples_to_ram_cb_;
+  UnloadSamplesFromRamCallback unload_samples_from_ram_cb_;
+};
+
+// A QDL that allows defining callbacks for
+// IssueQuery, FlushQueries, and Name methods.
+class QueryDispatchLibraryTrampoline : public QueryDispatchLibrary {
+ public:
+  QueryDispatchLibraryTrampoline(IssueQueryCallback issue_query_callback,
+                                 FlushQueriesCallback flush_queries_callback,
+                                 NameCallback name_callback)
+      : issue_query_callback_(issue_query_callback),
+        flush_queries_callback_(flush_queries_callback),
+        name_callback_(name_callback) {}
+
+  // Returns the name of the SUT. Name shall be returned over the network
+  // TODO: other bindings should also be fixed eventually to be used over the
+  // network
+  const std::string& Name() override {
+    static std::string name;  // HACK: avoid returning a reference to temporary.
+    pybind11::gil_scoped_acquire gil_acquirer;
+    name = name_callback_();  // name_callback_() shall returned name over the
+                              // network.
+    return name;
+  }
+
+  void IssueQuery(const std::vector<QuerySample>& samples) override {
+    pybind11::gil_scoped_acquire gil_acquirer;
+    issue_query_callback_(samples);
+  }
+
+  void FlushQueries() override { flush_queries_callback_(); }
+
+ protected:
+  IssueQueryCallback issue_query_callback_;
+  FlushQueriesCallback flush_queries_callback_;
+  NameCallback name_callback_;
+};
+
+}  // namespace
+
+/// \brief Python bindings.
+namespace py {
+
+uintptr_t ConstructSUT(IssueQueryCallback issue_cb,
+                       FlushQueriesCallback flush_queries_cb) {
+  SystemUnderTestTrampoline* sut =
+      new SystemUnderTestTrampoline("PySUT", issue_cb, flush_queries_cb);
+  return reinterpret_cast<uintptr_t>(sut);
+}
+
+void DestroySUT(uintptr_t sut) {
+  SystemUnderTestTrampoline* sut_cast =
+      reinterpret_cast<SystemUnderTestTrampoline*>(sut);
+  delete sut_cast;
+}
+
+uintptr_t ConstructFastSUT(FastIssueQueriesCallback fast_issue_cb,
+                           FlushQueriesCallback flush_queries_cb) {
+  FastSystemUnderTestTrampoline* sut = new FastSystemUnderTestTrampoline(
+      "PyFastSUT", fast_issue_cb, flush_queries_cb);
+  return reinterpret_cast<uintptr_t>(sut);
+}
+
+void DestroyFastSUT(uintptr_t sut) {
+  FastSystemUnderTestTrampoline* sut_cast =
+      reinterpret_cast<FastSystemUnderTestTrampoline*>(sut);
+  delete sut_cast;
+}
+
+uintptr_t ConstructQSL(
+    size_t total_sample_count, size_t performance_sample_count,
+    LoadSamplesToRamCallback load_samples_to_ram_cb,
+    UnloadSamplesFromRamCallback unload_samples_from_ram_cb) {
+  QuerySampleLibraryTrampoline* qsl = new QuerySampleLibraryTrampoline(
+      "PyQSL", total_sample_count, performance_sample_count,
+      load_samples_to_ram_cb, unload_samples_from_ram_cb);
+  return reinterpret_cast<uintptr_t>(qsl);
+}
+
+void DestroyQSL(uintptr_t qsl) {
+  QuerySampleLibraryTrampoline* qsl_cast =
+      reinterpret_cast<QuerySampleLibraryTrampoline*>(qsl);
+  delete qsl_cast;
+}
+
+uintptr_t ConstructQDL(IssueQueryCallback issue_cb,
+                       FlushQueriesCallback flush_queries_cb,
+                       NameCallback name_callback) {
+  QueryDispatchLibraryTrampoline* qdl = new QueryDispatchLibraryTrampoline(
+      issue_cb, flush_queries_cb, name_callback);
+  return reinterpret_cast<uintptr_t>(qdl);
+}
+
+void DestroyQDL(uintptr_t qdl) {
+  QueryDispatchLibraryTrampoline* qdl_cast =
+      reinterpret_cast<QueryDispatchLibraryTrampoline*>(qdl);
+  delete qdl_cast;
+}
+
+void StartTest(uintptr_t sut, uintptr_t qsl, mlperf::TestSettings test_settings,
+               const std::string& audit_config_filename) {
+  pybind11::gil_scoped_release gil_releaser;
+  SystemUnderTestTrampoline* sut_cast =
+      reinterpret_cast<SystemUnderTestTrampoline*>(sut);
+  QuerySampleLibraryTrampoline* qsl_cast =
+      reinterpret_cast<QuerySampleLibraryTrampoline*>(qsl);
+  LogSettings default_log_settings;
+  mlperf::StartTest(sut_cast, qsl_cast, test_settings, default_log_settings,
+                    audit_config_filename);
+}
+
+void StartTestWithLogSettings(uintptr_t sut, uintptr_t qsl,
+                              mlperf::TestSettings test_settings,
+                              mlperf::LogSettings log_settings,
+                              const std::string& audit_config_filename) {
+  pybind11::gil_scoped_release gil_releaser;
+  SystemUnderTestTrampoline* sut_cast =
+      reinterpret_cast<SystemUnderTestTrampoline*>(sut);
+  QuerySampleLibraryTrampoline* qsl_cast =
+      reinterpret_cast<QuerySampleLibraryTrampoline*>(qsl);
+  mlperf::StartTest(sut_cast, qsl_cast, test_settings, log_settings,
+                    audit_config_filename);
+}
+
+using ResponseCallback = std::function<void(QuerySampleResponse*)>;
+
+/// TODO: Get rid of copies.
+void QuerySamplesComplete(std::vector<QuerySampleResponse> responses,
+                          ResponseCallback response_cb = {}) {
+  pybind11::gil_scoped_release gil_releaser;
+  mlperf::QuerySamplesComplete(responses.data(), responses.size(), response_cb);
+}
+
+void FirstTokenComplete(std::vector<QuerySampleResponse> responses,
+                        ResponseCallback response_cb = {}) {
+  pybind11::gil_scoped_release gil_releaser;
+  mlperf::FirstTokenComplete(responses.data(), responses.size(), response_cb);
+}
+
+PYBIND11_MODULE(mlperf_loadgen, m) {
+  m.doc() = "MLPerf Inference load generator.";
+
+  pybind11::enum_<TestScenario>(m, "TestScenario")
+      .value("SingleStream", TestScenario::SingleStream)
+      .value("MultiStream", TestScenario::MultiStream)
+      .value("Server", TestScenario::Server)
+      .value("Offline", TestScenario::Offline);
+
+  pybind11::enum_<TestMode>(m, "TestMode")
+      .value("SubmissionRun", TestMode::SubmissionRun)
+      .value("AccuracyOnly", TestMode::AccuracyOnly)
+      .value("PerformanceOnly", TestMode::PerformanceOnly)
+      .value("FindPeakPerformance", TestMode::FindPeakPerformance);
+
+  pybind11::class_<TestSettings>(m, "TestSettings")
+      .def(pybind11::init<>())
+      .def_readwrite("scenario", &TestSettings::scenario)
+      .def_readwrite("mode", &TestSettings::mode)
+      .def_readwrite("single_stream_expected_latency_ns",
+                     &TestSettings::single_stream_expected_latency_ns)
+      .def_readwrite("single_stream_target_latency_percentile",
+                     &TestSettings::single_stream_target_latency_percentile)
+      .def_readwrite("multi_stream_expected_latency_ns",
+                     &TestSettings::multi_stream_expected_latency_ns)
+      .def_readwrite("multi_stream_target_latency_percentile",
+                     &TestSettings::multi_stream_target_latency_percentile)
+      .def_readwrite("multi_stream_samples_per_query",
+                     &TestSettings::multi_stream_samples_per_query)
+      .def_readwrite("server_target_qps", &TestSettings::server_target_qps)
+      .def_readwrite("server_target_latency_ns",
+                     &TestSettings::server_target_latency_ns)
+      .def_readwrite("server_target_latency_percentile",
+                     &TestSettings::server_target_latency_percentile)
+      .def_readwrite("server_coalesce_queries",
+                     &TestSettings::server_coalesce_queries)
+      .def_readwrite("server_find_peak_qps_decimals_of_precision",
+                     &TestSettings::server_find_peak_qps_decimals_of_precision)
+      .def_readwrite("server_find_peak_qps_boundary_step_size",
+                     &TestSettings::server_find_peak_qps_boundary_step_size)
+      .def_readwrite("server_max_async_queries",
+                     &TestSettings::server_max_async_queries)
+      .def_readwrite("server_num_issue_query_threads",
+                     &TestSettings::server_num_issue_query_threads)
+      .def_readwrite("offline_expected_qps",
+                     &TestSettings::offline_expected_qps)
+      .def_readwrite("min_duration_ms", &TestSettings::min_duration_ms)
+      .def_readwrite("max_duration_ms", &TestSettings::max_duration_ms)
+      .def_readwrite("min_query_count", &TestSettings::min_query_count)
+      .def_readwrite("max_query_count", &TestSettings::max_query_count)
+      .def_readwrite("qsl_rng_seed", &TestSettings::qsl_rng_seed)
+      .def_readwrite("sample_index_rng_seed",
+                     &TestSettings::sample_index_rng_seed)
+      .def_readwrite("schedule_rng_seed", &TestSettings::schedule_rng_seed)
+      .def_readwrite("accuracy_log_rng_seed",
+                     &TestSettings::accuracy_log_rng_seed)
+      .def_readwrite("accuracy_log_probability",
+                     &TestSettings::accuracy_log_probability)
+      .def_readwrite("print_timestamps", &TestSettings::print_timestamps)
+      .def_readwrite("performance_issue_unique",
+                     &TestSettings::performance_issue_unique)
+      .def_readwrite("performance_issue_same",
+                     &TestSettings::performance_issue_same)
+      .def_readwrite("performance_issue_same_index",
+                     &TestSettings::performance_issue_same_index)
+      .def_readwrite("performance_sample_count_override",
+                     &TestSettings::performance_sample_count_override)
+      .def_readwrite("test05", &TestSettings::test05)
+      .def_readwrite("test05_qsl_rng_seed", &TestSettings::test05_qsl_rng_seed)
+      .def_readwrite("test05_sample_index_rng_seed",
+                     &TestSettings::test05_sample_index_rng_seed)
+      .def_readwrite("test05_schedule_rng_seed",
+                     &TestSettings::test05_schedule_rng_seed)
+      .def_readwrite("use_token_latencies", &TestSettings::use_token_latencies)
+      .def_readwrite("ttft_latency", &TestSettings::server_ttft_latency)
+      .def_readwrite("tpot_latency", &TestSettings::server_tpot_latency)
+      .def_readwrite("infer_token_latencies",
+                     &TestSettings::infer_token_latencies)
+      .def_readwrite("token_latency_scaling_factor",
+                     &TestSettings::token_latency_scaling_factor)
+      .def("FromConfig", &TestSettings::FromConfig, pybind11::arg("path"),
+           pybind11::arg("model"), pybind11::arg("scenario"),
+           pybind11::arg("conf_type") = 1,
+           "This function configures settings from the given user "
+           "configuration file, model, and scenario. The conf_type flag "
+           "should be set to 1 for loading user.conf or else only the default "
+           "mlperf_conf file "
+           "will be loaded by the loadgen.");
+
+  pybind11::enum_<LoggingMode>(m, "LoggingMode")
+      .value("AsyncPoll", LoggingMode::AsyncPoll)
+      .value("EndOfTestOnly", LoggingMode::EndOfTestOnly)
+      .value("Synchronous", LoggingMode::Synchronous);
+
+  pybind11::class_<LogOutputSettings>(m, "LogOutputSettings")
+      .def(pybind11::init<>())
+      .def_readwrite("outdir", &LogOutputSettings::outdir)
+      .def_readwrite("prefix", &LogOutputSettings::prefix)
+      .def_readwrite("suffix", &LogOutputSettings::suffix)
+      .def_readwrite("prefix_with_datetime",
+                     &LogOutputSettings::prefix_with_datetime)
+      .def_readwrite("copy_detail_to_stdout",
+                     &LogOutputSettings::copy_detail_to_stdout)
+      .def_readwrite("copy_summary_to_stdout",
+                     &LogOutputSettings::copy_summary_to_stdout);
+
+  pybind11::class_<LogSettings>(m, "LogSettings")
+      .def(pybind11::init<>())
+      .def_readwrite("log_output", &LogSettings::log_output)
+      .def_readwrite("log_mode", &LogSettings::log_mode)
+      .def_readwrite("log_mode_async_poll_interval_ms",
+                     &LogSettings::log_mode_async_poll_interval_ms)
+      .def_readwrite("enable_trace", &LogSettings::enable_trace);
+
+  pybind11::class_<QuerySample>(m, "QuerySample")
+      .def(pybind11::init<>())
+      .def(pybind11::init<ResponseId, QuerySampleIndex>())
+      .def_readwrite("id", &QuerySample::id)
+      .def_readwrite("index", &QuerySample::index)
+      .def(pybind11::pickle(
+          [](const QuerySample& qs) {  // __getstate__
+            /*Return a tuple that fully encodes state of object*/
+            return pybind11::make_tuple(qs.id, qs.index);
+          },
+          [](pybind11::tuple t) {  // __setstate__
+            if (t.size() != 2)
+              throw std::runtime_error("Invalid state for QuerySample");
+            /* Create a new C++ instance*/
+            QuerySample q;
+            q.id = t[0].cast<uintptr_t>();
+            q.index = t[1].cast<size_t>();
+            return q;
+          }));
+
+  pybind11::class_<QuerySampleResponse>(m, "QuerySampleResponse")
+      .def(pybind11::init<>())
+      .def(pybind11::init<ResponseId, uintptr_t, size_t>())
+      .def(pybind11::init<ResponseId, uintptr_t, size_t, int64_t>())
+      .def_readwrite("id", &QuerySampleResponse::id)
+      .def_readwrite("data", &QuerySampleResponse::data)
+      .def_readwrite("size", &QuerySampleResponse::size)
+      .def_readwrite("n_tokens", &QuerySampleResponse::n_tokens)
+      .def(pybind11::pickle(
+          [](const QuerySampleResponse& qsr) {  // __getstate__
+            /* Return a tuple that fully encodes state of object*/
+            return pybind11::make_tuple(qsr.id, qsr.data, qsr.size);
+          },
+          [](pybind11::tuple t) {  // __setstate__
+            if ((t.size() != 3) || (t.size() != 4))
+              throw std::runtime_error("Invalid state for QuerySampleResponse");
+            /* Create a new C++ instance*/
+            QuerySampleResponse q;
+            q.id = t[0].cast<uintptr_t>();
+            q.data = t[1].cast<uintptr_t>();
+            q.size = t[2].cast<size_t>();
+            if (t.size() == 4) {
+              q.n_tokens = t[3].cast<int64_t>();
+            } else {
+              q.n_tokens = 0;
+            }
+            return q;
+          }));
+
+  // TODO: Use PYBIND11_MAKE_OPAQUE for the following vector types.
+  pybind11::bind_vector<std::vector<QuerySample>>(m, "VectorQuerySample");
+  pybind11::bind_vector<std::vector<QuerySampleResponse>>(
+      m, "VectorQuerySampleResponse");
+
+  m.def("ConstructSUT", &py::ConstructSUT, "Construct the system under test.");
+  m.def("DestroySUT", &py::DestroySUT,
+        "Destroy the object created by ConstructSUT.");
+
+  m.def("ConstructFastSUT", &py::ConstructFastSUT,
+        "Construct the system under test, fast issue query");
+  m.def("DestroyFastSUT", &py::DestroyFastSUT,
+        "Destroy the object created by ConstructFastSUT.");
+
+  m.def("ConstructQSL", &py::ConstructQSL,
+        "Construct the query sample library.");
+  m.def("DestroyQSL", &py::DestroyQSL,
+        "Destroy the object created by ConstructQSL.");
+
+  m.def("ConstructQDL", &py::ConstructQDL,
+        "Construct the query sample library, communicating with the SUT over "
+        "the network.");
+  m.def("DestroyQDL", &py::DestroyQDL,
+        "Destroy the object created by ConstructQDL.");
+
+  m.def("StartTest", &py::StartTest,
+        "Run tests on a SUT created by ConstructSUT() with the provided QSL. "
+        "Uses default log settings.",
+        pybind11::arg("sut"), pybind11::arg("qsl"),
+        pybind11::arg("test_settings"),
+        pybind11::arg("audit_config_filename") = "audit.config");
+  m.def("StartTestWithLogSettings", &py::StartTestWithLogSettings,
+        "Run tests on a SUT created by ConstructSUT() with the provided QSL. "
+        "Accepts custom log settings.",
+        pybind11::arg("sut"), pybind11::arg("qsl"),
+        pybind11::arg("test_settings"), pybind11::arg("log_settings"),
+        pybind11::arg("audit_config_filename") = "audit.config");
+  m.def("QuerySamplesComplete", &py::QuerySamplesComplete,
+        "Called by the SUT to indicate that samples from some combination of"
+        "IssueQuery calls have finished.",
+        pybind11::arg("responses"),
+        pybind11::arg("response_cb") = ResponseCallback{});
+  m.def("FirstTokenComplete", &py::FirstTokenComplete,
+        "Called by the SUT to indicate that tokens from some combination of"
+        "IssueQuery calls have finished.",
+        pybind11::arg("responses"),
+        pybind11::arg("response_cb") = ResponseCallback{});
+}
+
+}  // namespace py
+}  // namespace mlperf
+
+#endif  // PYTHON_BINDINGS_H
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/README.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/README.md
new file mode 100644
index 000000000..f46e22a65
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/README.md
@@ -0,0 +1,67 @@
+# Demo
+
+## Loadgen Over the Network
+
+### Overview
+
+
+This folder provides a demo implementation for LoadGen over the network.\
+Two sides are implemented:
+
+1. The SUT side which is implemented in [sut_over_network_demo.py](sut_over_network_demo.py). Each Node should run it for multiple Nodes operation.
+2. The LoadGen node running the LoadGen, QSL and QDL instances, implemented in [py_demo_server_lon.py](py_demo_server_lon.py)
+
+The demo SUT is implemented with a Flask server. the LON node implements a Flask client for network operation.
+
+The test runs in MLPerf Server mode. the SUT is not implementing a benchmark but contains dummy interface to preprocessing, postprocessing and  model calling functions.
+
+### Setup
+
+Install python packages:
+
+```sh
+pip install absl-py numpy wheel flask requests
+```
+
+Clone:
+
+```sh
+git clone --recurse-submodules https://github.com/mlcommons/inference.git mlperf_inference
+```
+
+Build:
+
+```sh
+cd mlperf_inference/loadgen
+CFLAGS="-std=c++14 -O3" python setup.py bdist_wheel
+cd ..; pip install --force-reinstall loadgen/dist/`ls -r loadgen/dist/ | head -n1` ; cd -
+```
+
+### Run the demo (single machine)
+
+Start the demo SUT server (run this at a separate terminal):
+
+```sh
+python demos/lon/sut_over_network_demo.py --port 8000
+```
+
+Start the test:
+
+```sh
+python demos/lon/py_demo_server_lon.py --sut_server http://localhost:8000
+```
+
+### Run the demo (over the network)
+
+To run over a network - simply run the demo SUT over on a different machine. For multiple Nodes run the demo SUT on each machine specifying the node number.\
+
+```sh
+python demos/lon/sut_over_network_demo.py --port 8000 --node N1
+```
+
+Then, when running the client, replace `localhost` with the correct IP.
+
+
+```sh
+python demos/lon/py_demo_server_lon.py --sut_server IP1:8000,IP2:8000,IP3:8000
+```
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/py_demo_server_lon.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/py_demo_server_lon.py
new file mode 100644
index 000000000..1248215db
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/py_demo_server_lon.py
@@ -0,0 +1,191 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""
+Python demo showing how to use the MLPerf Inference LoadGen over the Network bindings.
+This programs runs in the LON Node side.
+It runs the demo in MLPerf server mode over the network.
+It communicates over the network with a Network SUT node,
+which is running the Network SUT demo based on a flask server, implemented in SUT_over_network.py
+"""
+
+import threading
+import requests
+import array
+import time
+
+from absl import app
+from absl import flags
+import mlperf_loadgen
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_list(
+    "sut_server", "http://localhost:8000", "Address of the server(s) under test."
+)
+
+
+class QSL:
+    """Demo QuerySampleLibrary with dummy features."""
+
+    def __init__(self, total_sample_count, performance_sample_count):
+        self.eval_features = {
+            i: f"what_is_my_dummy_feature_{i}?" for i in range(total_sample_count)
+        }
+        self.qsl = mlperf_loadgen.ConstructQSL(
+            total_sample_count,
+            performance_sample_count,
+            self.load_samples_to_ram,
+            self.unload_samples_from_ram,
+        )
+
+    def get_features(self, sample_id):
+        """Returns the feature for a given sample id."""
+        return self.eval_features[sample_id]
+
+    def load_samples_to_ram(self, query_samples):
+        """Loads the features for the given query samples into RAM."""
+        # Current implementation is not using this functionality.
+        del query_samples
+        return
+
+    def unload_samples_from_ram(self, query_samples):
+        """Unloads the features for the given query samples from RAM."""
+        # Current implementation is not using this functionality.
+        del query_samples
+        return
+
+    def __del__(self):
+        mlperf_loadgen.DestroyQSL(self.qsl)
+
+
+class QDL:
+    """QDL acting as a proxy to the SUT.
+    This QDL communicates with the SUT via HTTP.
+    It uses two endpoints to communicate with the SUT:
+    - /predict/ : Send a query to the SUT and get a response.
+    - /getname/ : Get the name of the SUT. Send a getname to the SUT and get a response.
+    """
+
+    def __init__(self, qsl: QSL, sut_server_addr: list):
+        """
+        Constructor for the QDL.
+        Args:
+            qsl: The QSL to use.
+            sut_server_addr: A list of addresses of the SUT.
+        """
+        self.qsl = qsl
+
+        # Construct QDL from the python binding
+        self.qdl = mlperf_loadgen.ConstructQDL(
+            self.issue_query, self.flush_queries, self.client_get_name
+        )
+        self.sut_server_addr = sut_server_addr
+        self.num_nodes = len(sut_server_addr)
+
+        # For round robin between the SUTs:
+        self.next_sut_id = 0
+        self.lock = threading.Lock()
+
+    def issue_query(self, query_samples):
+        """Process the query to send to the SUT"""
+        threading.Thread(
+            target=self.process_query_async,
+            args=[query_samples]).start()
+
+    def flush_queries(self):
+        """Flush the queries. Dummy implementation."""
+        pass
+
+    def process_query_async(self, query_samples):
+        """
+        This function is called by the Loadgen in a separate thread.
+        It is responsible for
+            1. Creating a query for the SUT, by reading the features from the QSL.
+            2. Sending the query to the SUT.
+            3. Waiting for the response from the SUT.
+            4. Deserializing the response.
+            5. Calling mlperf_loadgen.QuerySamplesComplete(query_samples, response)
+        Args:
+            query_samples: A list of QuerySample objects.
+        """
+        responses = []
+        for s in query_samples:
+            # Overall process:
+            # QDL builds a real-world query and sends to SUT --> SUT processes --> SUT sends back to QDL
+            # Read features from the QSL
+            features = self.qsl.get_features(s.index)
+
+            time.sleep(0.001)  # Ensure a maximal rate of queries to the SUT
+
+            # Send the query to SUT in round robin
+            # Wait for a response
+            sut_result = self.client_predict(features, s.index)
+            response_array = array.array("B", sut_result.encode("utf-8"))
+            bi = response_array.buffer_info()
+            responses.append(
+                mlperf_loadgen.QuerySampleResponse(
+                    s.id, bi[0], bi[1]))
+        mlperf_loadgen.QuerySamplesComplete(responses)
+
+    def get_sut_id_round_robin(self):
+        """Get the SUT id in round robin."""
+        with self.lock:
+            res = self.next_sut_id
+            self.next_sut_id = (self.next_sut_id + 1) % self.num_nodes
+        return res
+
+    def client_predict(self, query, id):
+        """Serialize the query, send it to the SUT in round robin, and return the deserialized response."""
+        url = "{}/predict/".format(
+            self.sut_server_addr[self.get_sut_id_round_robin()])
+        response = requests.post(url, json={"query": query, id: id})
+        return response.json()["result"]
+
+    def client_get_name(self):
+        """Get the name of the SUT from ALL the SUTS."""
+        if len(self.sut_server_addr) == 1:
+            return requests.post(
+                f"{self.sut_server_addr[0]}/getname/").json()["name"]
+
+        sut_names = [
+            requests.post(f"{addr}/getname/").json()["name"]
+            for addr in self.sut_server_addr
+        ]
+        return "Multi-node SUT: " + ", ".join(sut_names)
+
+    def __del__(self):
+        mlperf_loadgen.DestroyQDL(self.qdl)
+
+
+def main(argv):
+    del argv
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.Server
+    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+    settings.server_target_qps = 100
+    settings.server_target_latency_ns = 100000000
+    settings.min_query_count = 100
+    settings.min_duration_ms = 10000
+
+    # QDL and QSL
+    qsl = QSL(1024, 128)
+    qdl = QDL(qsl, sut_server_addr=FLAGS.sut_server)
+
+    mlperf_loadgen.StartTest(qdl.qdl, qsl.qsl, settings)
+
+
+if __name__ == "__main__":
+    app.run(main)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/sut_over_network_demo.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/sut_over_network_demo.py
new file mode 100644
index 000000000..55e5e038d
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/sut_over_network_demo.py
@@ -0,0 +1,88 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+
+"""
+Python demo showing how to use the MLPerf Inference load generator bindings over the network.
+This part of the demo runs the "demo SUT" which is connected over the network to the LON node.
+A corresponding "demo LON node" with the demo test is implemented in py_demo_server_lon.py.
+
+The SUT is implemented using a Flask server, with dummy implementation of the inference processing.
+Two endpoints are exposed:
+- /predict/ : Receives a query (e.g., a text) runs inference, and returns a prediction.
+- /getname/ : Get the name of the SUT.
+
+The current implementation is a dummy implementation, which does not use
+a real DNN model, batching, or pre/postprocessing code,
+but rather just returns subset of the input query as a response,
+Yet, it illustrates the basic structure of a SUT server.
+"""
+
+import argparse
+from flask import Flask, request, jsonify
+
+
+app = Flask(__name__)
+
+
+node = ""
+
+
+def preprocess(query):
+    """[SUT Node] A dummy preprocess."""
+    # Here may come for example batching, tokenization, resizing,
+    # normalization, etc.
+    response = query
+    return response
+
+
+def dnn_model(query):
+    """[SUT Node] A dummy DNN model."""
+    # Here may come for example a call to a dnn model such as resnet, bert,
+    # etc.
+    response = query
+    return response
+
+
+def postprocess(query):
+    """[SUT Node] A dummy postprocess."""
+    # Here may come for example a postprocessing call, e.g., NMS,
+    # detokenization, etc.
+    response = query
+    return response
+
+
+@app.route("/predict/", methods=["POST"])
+def predict():
+    """Receives a query (e.g., a text) runs inference, and returns a prediction."""
+    query = request.get_json(force=True)["query"]
+    result = postprocess(dnn_model(preprocess(query)))
+    return jsonify(result=result)
+
+
+@app.route("/getname/", methods=["POST", "GET"])
+def getname():
+    """Returns the name of the SUT."""
+    return jsonify(name=f"Demo SUT (Network SUT) node" +
+                   (" " + node) if node else "")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument("--node", type=str, default="")
+    args = parser.parse_args()
+    node = args.node
+    app.run(debug=False, port=args.port)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_multi_stream.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_multi_stream.py
new file mode 100644
index 000000000..f6082cad6
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_multi_stream.py
@@ -0,0 +1,86 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Python demo showing how to use the MLPerf Inference load generator bindings.
+"""
+
+from __future__ import print_function
+
+import threading
+import time
+
+from absl import app
+import mlperf_loadgen
+
+from datetime import datetime
+
+# Global var
+NUM_AGENTS = 8
+LOOPBACK_LATENCY_S = 0.001
+
+
+def load_samples_to_ram(query_samples):
+    del query_samples
+    return
+
+
+def unload_samples_from_ram(query_samples):
+    del query_samples
+    return
+
+
+# Processes queries in NUM_AGENTS slices that complete at different times.
+def process_query_async(query_samples, i_slice):
+    time.sleep(LOOPBACK_LATENCY_S * (i_slice + 1))
+    responses = []
+    samples_to_complete = query_samples[i_slice: len(
+        query_samples): NUM_AGENTS]
+    for j, s in enumerate(samples_to_complete):
+        responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0))
+    mlperf_loadgen.QuerySamplesComplete(responses)
+
+
+def issue_query(query_samples):
+    for i in range(8):
+        threading.Thread(
+            target=process_query_async, args=(
+                query_samples, i)).start()
+
+
+def flush_queries():
+    pass
+
+
+def main(argv):
+    del argv
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.MultiStream
+    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+    settings.multi_stream_expected_latency_ns = 8000000
+    settings.multi_stream_samples_per_query = 8
+    settings.min_query_count = 100
+    settings.min_duration_ms = 10000
+
+    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
+    qsl = mlperf_loadgen.ConstructQSL(
+        1024, 128, load_samples_to_ram, unload_samples_from_ram
+    )
+    mlperf_loadgen.StartTest(sut, qsl, settings)
+    mlperf_loadgen.DestroyQSL(qsl)
+    mlperf_loadgen.DestroySUT(sut)
+
+
+if __name__ == "__main__":
+    app.run(main)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_offline.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_offline.py
new file mode 100644
index 000000000..909585edc
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_offline.py
@@ -0,0 +1,81 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Python demo showing how to use the MLPerf Inference load generator bindings.
+"""
+
+from __future__ import print_function
+
+import threading
+import time
+
+from absl import app
+import mlperf_loadgen
+
+
+def load_samples_to_ram(query_samples):
+    del query_samples
+    return
+
+
+def unload_samples_from_ram(query_samples):
+    del query_samples
+    return
+
+
+# Processes queries in 3 slices that complete at different times.
+def process_query_async(query_samples, i_slice):
+    time.sleep(3 * (i_slice + 1))
+    responses = []
+    samples_to_complete = query_samples[i_slice: len(query_samples): 3]
+    for s in samples_to_complete:
+        responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0))
+    mlperf_loadgen.QuerySamplesComplete(responses)
+
+
+def issue_query(query_samples):
+    threading.Thread(
+        target=process_query_async, args=(
+            query_samples, 0)).start()
+    threading.Thread(
+        target=process_query_async, args=(
+            query_samples, 1)).start()
+    threading.Thread(
+        target=process_query_async, args=(
+            query_samples, 2)).start()
+
+
+def flush_queries():
+    pass
+
+
+def main(argv):
+    del argv
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.Offline
+    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+    settings.offline_expected_qps = 1000
+
+    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
+    qsl = mlperf_loadgen.ConstructQSL(
+        1024, 128, load_samples_to_ram, unload_samples_from_ram
+    )
+    mlperf_loadgen.StartTest(sut, qsl, settings)
+    mlperf_loadgen.DestroyQSL(qsl)
+    mlperf_loadgen.DestroySUT(sut)
+
+
+if __name__ == "__main__":
+    app.run(main)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_server.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_server.py
new file mode 100644
index 000000000..8b6f2b826
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_server.py
@@ -0,0 +1,74 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Python demo showing how to use the MLPerf Inference load generator bindings.
+"""
+
+from __future__ import print_function
+
+import threading
+import time
+
+from absl import app
+import mlperf_loadgen
+
+
+def load_samples_to_ram(query_samples):
+    del query_samples
+    return
+
+
+def unload_samples_from_ram(query_samples):
+    del query_samples
+    return
+
+
+def process_query_async(query_samples):
+    time.sleep(0.001)
+    responses = []
+    for s in query_samples:
+        responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0))
+    mlperf_loadgen.QuerySamplesComplete(responses)
+
+
+def issue_query(query_samples):
+    threading.Thread(target=process_query_async, args=[query_samples]).start()
+
+
+def flush_queries():
+    pass
+
+
+def main(argv):
+    del argv
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.Server
+    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+    settings.server_target_qps = 100
+    settings.server_target_latency_ns = 100000000
+    settings.min_query_count = 100
+    settings.min_duration_ms = 10000
+
+    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
+    qsl = mlperf_loadgen.ConstructQSL(
+        1024, 128, load_samples_to_ram, unload_samples_from_ram
+    )
+    mlperf_loadgen.StartTest(sut, qsl, settings)
+    mlperf_loadgen.DestroyQSL(qsl)
+    mlperf_loadgen.DestroySUT(sut)
+
+
+if __name__ == "__main__":
+    app.run(main)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_single_stream.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_single_stream.py
new file mode 100644
index 000000000..8806271bd
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_single_stream.py
@@ -0,0 +1,84 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Python demo showing how to use the MLPerf Inference load generator bindings.
+"""
+
+from __future__ import print_function
+
+import array
+import threading
+import time
+
+from absl import app
+import mlperf_loadgen
+
+
+def load_samples_to_ram(query_samples):
+    del query_samples
+    return
+
+
+def unload_samples_from_ram(query_samples):
+    del query_samples
+    return
+
+
+def process_query_async(query_samples):
+    """Processes the list of queries."""
+    time.sleep(0.001)
+    responses = []
+    response_array = array.array(
+        "f", [0, 1, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128, 254, 255]
+    )
+    response_info = response_array.buffer_info()
+    response_data = response_info[0]
+    response_size = response_info[1] * response_array.itemsize
+    for s in query_samples:
+        responses.append(
+            mlperf_loadgen.QuerySampleResponse(
+                s.id, response_data, response_size)
+        )
+    mlperf_loadgen.QuerySamplesComplete(responses)
+
+
+def issue_query(query_samples):
+    threading.Thread(target=process_query_async, args=[query_samples]).start()
+
+
+def flush_queries():
+    pass
+
+
+def main(argv):
+    del argv
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.SingleStream
+    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+    settings.single_stream_expected_latency_ns = 1000000
+    settings.min_query_count = 100
+    settings.min_duration_ms = 10000
+
+    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
+    qsl = mlperf_loadgen.ConstructQSL(
+        1024, 128, load_samples_to_ram, unload_samples_from_ram
+    )
+    mlperf_loadgen.StartTest(sut, qsl, settings)
+    mlperf_loadgen.DestroyQSL(qsl)
+    mlperf_loadgen.DestroySUT(sut)
+
+
+if __name__ == "__main__":
+    app.run(main)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_multi_stream.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_multi_stream.py
new file mode 100644
index 000000000..e4b083853
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_multi_stream.py
@@ -0,0 +1,142 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Python demo showing how to use the MLPerf Inference load generator bindings.
+"""
+
+from __future__ import print_function
+
+import argparse
+import threading
+import time
+import numpy as np
+import array
+
+import mlperf_loadgen
+
+from datetime import datetime
+
+# Global var
+NUM_AGENTS = 8
+LOOPBACK_LATENCY_S = 0.001
+
+
+def f(x, y):
+    return 4 + 3 * x * y + x**3 + y**2
+
+
+def create_responses(n, m, mod=4):
+    r = []
+    for i in range(n):
+        r.append([f(i, j) for j in range(m + (i % mod))])
+    return r
+
+
+responses = create_responses(1024, 20)
+
+
+def load_samples_to_ram(query_samples):
+    del query_samples
+    return
+
+
+def unload_samples_from_ram(query_samples):
+    del query_samples
+    return
+
+
+# Processes queries in NUM_AGENTS slices that complete at different times.
+def process_query_async(query_samples, i_slice):
+    time.sleep(LOOPBACK_LATENCY_S * (i_slice + 1))
+    query_responses = []
+    samples_to_complete = query_samples[i_slice: len(
+        query_samples): NUM_AGENTS]
+    for j, s in enumerate(samples_to_complete):
+        response_array = np.array(responses[s.index], np.int32)
+        token = response_array[0]
+        time.sleep(0.0002)
+        response_token = array.array("B", token.tobytes())
+        response_token_info = response_token.buffer_info()
+        response_token_data = response_token_info[0]
+        response_token_size = response_token_info[1] * response_token.itemsize
+        mlperf_loadgen.FirstTokenComplete(
+            [
+                mlperf_loadgen.QuerySampleResponse(
+                    s.id, response_token_data, response_token_size
+                )
+            ]
+        )
+        time.sleep(0.02)
+        n_tokens = len(response_array)
+        response_array = array.array("B", response_array.tobytes())
+        response_info = response_array.buffer_info()
+        response_data = response_info[0]
+        response_size = response_info[1] * response_array.itemsize
+        query_responses.append(
+            mlperf_loadgen.QuerySampleResponse(
+                s.id, response_data, response_size, n_tokens
+            )
+        )
+    mlperf_loadgen.QuerySamplesComplete(query_responses)
+
+
+def issue_query(query_samples):
+    for i in range(8):
+        threading.Thread(
+            target=process_query_async, args=(
+                query_samples, i)).start()
+
+
+def flush_queries():
+    pass
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--mode", choices=["performance", "accuracy"], default="performance"
+    )
+    parser.add_argument("--expected-latency", type=int, default=8000000)
+    parser.add_argument("--samples-per-query", type=int, default=8)
+    parser.add_argument("--min-query-count", type=int, default=100)
+    parser.add_argument("--min-duration-ms", type=int, default=30000)
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.MultiStream
+    if args.mode == "performance":
+        settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+    else:
+        settings.mode = mlperf_loadgen.TestMode.AccuracyOnly
+    settings.multi_stream_expected_latency_ns = args.expected_latency
+    settings.multi_stream_samples_per_query = args.samples_per_query
+    settings.min_query_count = args.min_query_count
+    settings.min_duration_ms = args.min_duration_ms
+    settings.use_token_latencies = True
+
+    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
+    qsl = mlperf_loadgen.ConstructQSL(
+        1024, 128, load_samples_to_ram, unload_samples_from_ram
+    )
+    mlperf_loadgen.StartTest(sut, qsl, settings)
+    mlperf_loadgen.DestroyQSL(qsl)
+    mlperf_loadgen.DestroySUT(sut)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline.py
new file mode 100644
index 000000000..2e190cdd5
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline.py
@@ -0,0 +1,130 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Python demo showing how to use the MLPerf Inference load generator bindings.
+"""
+
+from __future__ import print_function
+
+import argparse
+import threading
+import time
+import numpy as np
+import array
+
+import mlperf_loadgen
+
+
+def f(x, y):
+    return 4 + 3 * x * y + x**3 + y**2
+
+
+def create_responses(n, m, mod=4):
+    r = []
+    for i in range(n):
+        r.append([f(i, j) for j in range(m + (i % mod))])
+    return r
+
+
+responses = create_responses(1024, 20)
+
+
+def load_samples_to_ram(query_samples):
+    del query_samples
+    return
+
+
+def unload_samples_from_ram(query_samples):
+    del query_samples
+    return
+
+
+# Processes queries in 3 slices that complete at different times.
+def process_query_async(query_samples, i_slice):
+    time.sleep(3 * (i_slice + 1))
+    query_responses = []
+    samples_to_complete = query_samples[i_slice: len(query_samples): 3]
+    for s in samples_to_complete:
+        response_array = np.array(responses[s.index], np.int32)
+        token = response_array[0]
+        time.sleep(0.0002)
+        response_token = array.array("B", token.tobytes())
+        response_token_info = response_token.buffer_info()
+        response_token_data = response_token_info[0]
+        response_token_size = response_token_info[1] * response_token.itemsize
+        # mlperf_loadgen.FirstTokenComplete([mlperf_loadgen.QuerySampleResponse(s.id, response_token_data, response_token_size)])
+        time.sleep(0.02)
+        n_tokens = len(response_array)
+        response_array = array.array("B", response_array.tobytes())
+        response_info = response_array.buffer_info()
+        response_data = response_info[0]
+        response_size = response_info[1] * response_array.itemsize
+        query_responses.append(
+            mlperf_loadgen.QuerySampleResponse(
+                s.id, response_data, response_size, n_tokens
+            )
+        )
+    mlperf_loadgen.QuerySamplesComplete(query_responses)
+
+
+def issue_query(query_samples):
+    threading.Thread(
+        target=process_query_async, args=(
+            query_samples, 0)).start()
+    threading.Thread(
+        target=process_query_async, args=(
+            query_samples, 1)).start()
+    threading.Thread(
+        target=process_query_async, args=(
+            query_samples, 2)).start()
+
+
+def flush_queries():
+    pass
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--mode", choices=["performance", "accuracy"], default="performance"
+    )
+    parser.add_argument("--expected-qps", type=int, default=1000)
+    parser.add_argument("--min-duration-ms", type=int, default=30000)
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.Offline
+    if args.mode == "performance":
+        settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+    else:
+        settings.mode = mlperf_loadgen.TestMode.AccuracyOnly
+    settings.offline_expected_qps = args.expected_qps
+    settings.min_duration_ms = args.min_duration_ms
+    settings.use_token_latencies = True
+
+    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
+    qsl = mlperf_loadgen.ConstructQSL(
+        1024, 128, load_samples_to_ram, unload_samples_from_ram
+    )
+    mlperf_loadgen.StartTest(sut, qsl, settings)
+    mlperf_loadgen.DestroyQSL(qsl)
+    mlperf_loadgen.DestroySUT(sut)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline_inferred.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline_inferred.py
new file mode 100644
index 000000000..9325b8410
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline_inferred.py
@@ -0,0 +1,130 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Python demo showing how to use the MLPerf Inference load generator bindings.
+"""
+
+from __future__ import print_function
+
+import argparse
+import threading
+import time
+import numpy as np
+import array
+
+import mlperf_loadgen
+
+
+def f(x, y):
+    return 4 + 3 * x * y + x**3 + y**2
+
+
+def create_responses(n, m, mod=4):
+    r = []
+    for i in range(n):
+        r.append([f(i, j) for j in range(m + (i % mod))])
+    return r
+
+
+responses = create_responses(1024, 20, mod=3)
+
+
+def load_samples_to_ram(query_samples):
+    del query_samples
+    return
+
+
+def unload_samples_from_ram(query_samples):
+    del query_samples
+    return
+
+
+# Processes queries in 3 slices that complete at different times.
+def process_query_async(query_samples, i_slice):
+    time.sleep(3 * (i_slice + 1))
+    query_responses = []
+    samples_to_complete = query_samples[i_slice: len(query_samples): 3]
+    for s in samples_to_complete:
+        response_array = np.array(responses[s.index], np.int32)
+        token = response_array[0]
+        time.sleep(0.0002)
+        response_token = array.array("B", token.tobytes())
+        response_token_info = response_token.buffer_info()
+        response_token_data = response_token_info[0]
+        response_token_size = response_token_info[1] * response_token.itemsize
+        # mlperf_loadgen.FirstTokenComplete([mlperf_loadgen.QuerySampleResponse(s.id, response_token_data, response_token_size)])
+        time.sleep(0.02)
+        n_tokens = len(response_array)
+        response_array = array.array("B", response_array.tobytes())
+        response_info = response_array.buffer_info()
+        response_data = response_info[0]
+        response_size = response_info[1] * response_array.itemsize
+        query_responses.append(
+            mlperf_loadgen.QuerySampleResponse(
+                s.id, response_data, response_size)
+        )
+    mlperf_loadgen.QuerySamplesComplete(query_responses)
+
+
+def issue_query(query_samples):
+    threading.Thread(
+        target=process_query_async, args=(
+            query_samples, 0)).start()
+    threading.Thread(
+        target=process_query_async, args=(
+            query_samples, 1)).start()
+    threading.Thread(
+        target=process_query_async, args=(
+            query_samples, 2)).start()
+
+
+def flush_queries():
+    pass
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--mode", choices=["performance", "accuracy"], default="performance"
+    )
+    parser.add_argument("--expected-qps", type=int, default=1000)
+    parser.add_argument("--min-duration-ms", type=int, default=30000)
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.Offline
+    if args.mode == "performance":
+        settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+    else:
+        settings.mode = mlperf_loadgen.TestMode.AccuracyOnly
+    settings.offline_expected_qps = args.expected_qps
+    settings.min_duration_ms = args.min_duration_ms
+    settings.infer_token_latencies = 1
+    settings.token_latency_scaling_factor = 21
+
+    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
+    qsl = mlperf_loadgen.ConstructQSL(
+        1024, 128, load_samples_to_ram, unload_samples_from_ram
+    )
+    mlperf_loadgen.StartTest(sut, qsl, settings)
+    mlperf_loadgen.DestroyQSL(qsl)
+    mlperf_loadgen.DestroySUT(sut)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server.py
new file mode 100644
index 000000000..b564543cd
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server.py
@@ -0,0 +1,132 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Python demo showing how to use the MLPerf Inference load generator bindings.
+"""
+
+from __future__ import print_function
+
+import argparse
+import array
+import threading
+import time
+import numpy as np
+
+from absl import app
+import mlperf_loadgen
+
+
+def f(x, y):
+    return 4 + 3 * x * y + x**3 + y**2
+
+
+def create_responses(n, m, mod=4):
+    r = []
+    for i in range(n):
+        r.append([f(i, j) for j in range(m + (i % mod))])
+    return r
+
+
+responses = create_responses(1024, 20)
+
+
+def load_samples_to_ram(query_samples):
+    del query_samples
+    return
+
+
+def unload_samples_from_ram(query_samples):
+    del query_samples
+    return
+
+
+def process_query_async(query_samples):
+    """Processes the list of queries."""
+    query_responses = []
+    for s in query_samples:
+        response_array = np.array(responses[s.index], np.int32)
+        token = response_array[0]
+        time.sleep(0.0002)
+        response_token = array.array("B", token.tobytes())
+        response_token_info = response_token.buffer_info()
+        response_token_data = response_token_info[0]
+        response_token_size = response_token_info[1] * response_token.itemsize
+        mlperf_loadgen.FirstTokenComplete(
+            [
+                mlperf_loadgen.QuerySampleResponse(
+                    s.id, response_token_data, response_token_size
+                )
+            ]
+        )
+        time.sleep(0.02)
+        n_tokens = len(response_array)
+        response_array = array.array("B", response_array.tobytes())
+        response_info = response_array.buffer_info()
+        response_data = response_info[0]
+        response_size = response_info[1] * response_array.itemsize
+        # print(f"Reported size python: {n_tokens}")
+        query_responses.append(
+            mlperf_loadgen.QuerySampleResponse(
+                s.id, response_data, response_size, n_tokens
+            )
+        )
+    mlperf_loadgen.QuerySamplesComplete(query_responses)
+
+
+def issue_query(query_samples):
+    threading.Thread(target=process_query_async, args=[query_samples]).start()
+
+
+def flush_queries():
+    pass
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--mode", choices=["performance", "accuracy"], default="performance"
+    )
+    parser.add_argument("--target-qps", type=int, default=100)
+    parser.add_argument("--target-latency-ns", type=int, default=100000000)
+    parser.add_argument("--min-query-count", type=int, default=100)
+    parser.add_argument("--min-duration-ms", type=int, default=30000)
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.Server
+    if args.mode == "performance":
+        settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+    else:
+        settings.mode = mlperf_loadgen.TestMode.AccuracyOnly
+    settings.server_target_qps = args.target_qps
+    settings.server_target_latency_ns = args.target_latency_ns
+    settings.min_query_count = args.min_query_count
+    settings.min_duration_ms = args.min_duration_ms
+    settings.use_token_latencies = True
+
+    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
+    qsl = mlperf_loadgen.ConstructQSL(
+        1024, 128, load_samples_to_ram, unload_samples_from_ram
+    )
+    mlperf_loadgen.StartTest(sut, qsl, settings)
+    mlperf_loadgen.DestroyQSL(qsl)
+    mlperf_loadgen.DestroySUT(sut)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server_inferred.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server_inferred.py
new file mode 100644
index 000000000..76461a75d
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server_inferred.py
@@ -0,0 +1,125 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Python demo showing how to use the MLPerf Inference load generator bindings.
+"""
+
+from __future__ import print_function
+
+import argparse
+import array
+import threading
+import time
+import numpy as np
+
+from absl import app
+import mlperf_loadgen
+
+
+def f(x, y):
+    return 4 + 3 * x * y + x**3 + y**2
+
+
+def create_responses(n, m, mod=4):
+    r = []
+    for i in range(n):
+        r.append([f(i, j) for j in range(m + (i % mod))])
+    return r
+
+
+responses = create_responses(1024, 20, mod=3)
+
+
+def load_samples_to_ram(query_samples):
+    del query_samples
+    return
+
+
+def unload_samples_from_ram(query_samples):
+    del query_samples
+    return
+
+
+def process_query_async(query_samples):
+    """Processes the list of queries."""
+    query_responses = []
+    for s in query_samples:
+        response_array = np.array(responses[s.index], np.int32)
+        token = response_array[0]
+        time.sleep(0.0002)
+        response_token = array.array("B", token.tobytes())
+        response_token_info = response_token.buffer_info()
+        response_token_data = response_token_info[0]
+        response_token_size = response_token_info[1] * response_token.itemsize
+        time.sleep(0.02)
+        n_tokens = len(response_array)
+        response_array = array.array("B", response_array.tobytes())
+        response_info = response_array.buffer_info()
+        response_data = response_info[0]
+        response_size = response_info[1] * response_array.itemsize
+        # print(f"Reported size python: {n_tokens}")
+        query_responses.append(
+            mlperf_loadgen.QuerySampleResponse(
+                s.id, response_data, response_size)
+        )
+    mlperf_loadgen.QuerySamplesComplete(query_responses)
+
+
+def issue_query(query_samples):
+    threading.Thread(target=process_query_async, args=[query_samples]).start()
+
+
+def flush_queries():
+    pass
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--mode", choices=["performance", "accuracy"], default="performance"
+    )
+    parser.add_argument("--target-qps", type=int, default=100)
+    parser.add_argument("--target-latency-ns", type=int, default=100000000)
+    parser.add_argument("--min-query-count", type=int, default=100)
+    parser.add_argument("--min-duration-ms", type=int, default=30000)
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.Server
+    if args.mode == "performance":
+        settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+    else:
+        settings.mode = mlperf_loadgen.TestMode.AccuracyOnly
+    settings.server_target_qps = args.target_qps
+    settings.server_target_latency_ns = args.target_latency_ns
+    settings.min_query_count = args.min_query_count
+    settings.min_duration_ms = args.min_duration_ms
+    settings.infer_token_latencies = 1
+    settings.token_latency_scaling_factor = 21
+
+    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
+    qsl = mlperf_loadgen.ConstructQSL(
+        1024, 128, load_samples_to_ram, unload_samples_from_ram
+    )
+    mlperf_loadgen.StartTest(sut, qsl, settings)
+    mlperf_loadgen.DestroyQSL(qsl)
+    mlperf_loadgen.DestroySUT(sut)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_single_stream.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_single_stream.py
new file mode 100644
index 000000000..ca8d84591
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_single_stream.py
@@ -0,0 +1,129 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Python demo showing how to use the MLPerf Inference load generator bindings.
+"""
+
+from __future__ import print_function
+
+import argparse
+import array
+import threading
+import time
+import numpy as np
+
+from absl import app
+import mlperf_loadgen
+
+
+def f(x, y):
+    return 4 + 3 * x * y + x**3 + y**2
+
+
+def create_responses(n, m, mod=4):
+    r = []
+    for i in range(n):
+        r.append([f(i, j) for j in range(m + (i % mod))])
+    return r
+
+
+responses = create_responses(1024, 20)
+
+
+def load_samples_to_ram(query_samples):
+    del query_samples
+    return
+
+
+def unload_samples_from_ram(query_samples):
+    del query_samples
+    return
+
+
+def process_query_async(query_samples):
+    """Processes the list of queries."""
+    query_responses = []
+    for s in query_samples:
+        response_array = np.array(responses[s.index], np.int32)
+        time.sleep(0.0002)
+        token = response_array[:1]
+        response_token = array.array("B", token.tobytes())
+        response_token_info = response_token.buffer_info()
+        response_token_data = response_token_info[0]
+        response_token_size = response_token_info[1] * response_token.itemsize
+        mlperf_loadgen.FirstTokenComplete(
+            [
+                mlperf_loadgen.QuerySampleResponse(
+                    s.id, response_token_data, response_token_size
+                )
+            ]
+        )
+        time.sleep(0.02)
+        n_tokens = len(response_array)
+        response_array = array.array("B", response_array.tobytes())
+        response_info = response_array.buffer_info()
+        response_data = response_info[0]
+        response_size = response_info[1] * response_array.itemsize
+        query_responses.append(
+            mlperf_loadgen.QuerySampleResponse(
+                s.id, response_data, response_size, n_tokens
+            )
+        )
+    mlperf_loadgen.QuerySamplesComplete(query_responses)
+
+
+def issue_query(query_samples):
+    threading.Thread(target=process_query_async, args=[query_samples]).start()
+
+
+def flush_queries():
+    pass
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--mode", choices=["performance", "accuracy"], default="performance"
+    )
+    parser.add_argument("--expected-latency", type=int, default=2050000)
+    parser.add_argument("--min-query-count", type=int, default=100)
+    parser.add_argument("--min-duration-ms", type=int, default=30000)
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.SingleStream
+    if args.mode == "performance":
+        settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+    else:
+        settings.mode = mlperf_loadgen.TestMode.AccuracyOnly
+    settings.single_stream_expected_latency_ns = args.expected_latency
+    settings.min_query_count = args.min_query_count
+    settings.min_duration_ms = args.min_duration_ms
+    settings.use_token_latencies = True
+
+    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
+    qsl = mlperf_loadgen.ConstructQSL(
+        1024, 128, load_samples_to_ram, unload_samples_from_ram
+    )
+    mlperf_loadgen.StartTest(sut, qsl, settings)
+    mlperf_loadgen.DestroyQSL(qsl)
+    mlperf_loadgen.DestroySUT(sut)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/diagram_network_submission.png b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/diagram_network_submission.png
new file mode 100644
index 000000000..35663b97f
Binary files /dev/null and b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/diagram_network_submission.png differ
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/diagram_submission.png b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/diagram_submission.png
new file mode 100644
index 000000000..3ed6429c0
Binary files /dev/null and b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/diagram_submission.png differ
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/BUILD.gn b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/BUILD.gn
new file mode 100644
index 000000000..865bc4d3b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/BUILD.gn
@@ -0,0 +1,33 @@
+generated_doxygen_out_dir =
+    get_path_info(".", "gen_dir") + "/.."
+
+loadgen_doxygen_sources = [
+  "doxygen.cfg",
+  "doxygen_footer.html",
+  "doxygen_header.html",
+  "doxygen_layout.xml",
+  "doxygen_stylesheet.css",
+  "loadgen-integration_diagram.dia",
+  "mlperf_icon.png",
+  "mlperf_logo_horizontal_color.svg",
+  "README.md"
+]
+
+source_set("loadgen_doxygen_sources") {
+  sources = loadgen_doxygen_sources
+}
+
+source_set("doxygen_html_generator_script") {
+  sources = [ "doxygen_html_generator.py" ]
+}
+
+action("generate_doxygen_html") {
+  script = "doxygen_html_generator.py"
+  args = [ rebase_path(generated_doxygen_out_dir, root_build_dir),
+           rebase_path("../..") ]
+  outputs = [ generated_doxygen_out_dir ]
+  deps = [ ":loadgen_doxygen_sources",
+           ":doxygen_html_generator_script",
+           "../..:mlperf_loadgen_sources_no_gen",
+           "../..:docs" ]
+}
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/README.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/README.md
new file mode 100644
index 000000000..d5cf5fe18
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/README.md
@@ -0,0 +1,34 @@
+# Generating the HTML docs {#ReadmeHtmlDocs}
+
+This document is generated from inline docstrings in the source and
+various markdown files checked into the git repository. If you've
+checked out the code, you can generate this documentation.
+
+*Prerequisite:* You must have [doxygen](http://www.doxygen.nl) installed
+on your system:
+
+## With gn / ninja
+
+If you are using the gn build flow, you may run:
+
+    ninja -C out/Release generate_doxygen_html
+
+* This will output the documentation to out/Release/gen/loadgen/docs/gen and
+avoid poluting the source directory.
+
+## Manually
+
+Alternatively, you can manually run:
+
+    python docs/src/doxygen_html_generator.py <target_dir> <loadgen_root>
+
+* If <loadgen_root> is omitted, it will default to ".".
+* If <target_dir> is also omitted, it will default to "./docs/gen".
+
+## Hosting
+
+A version of this doc is currently hosted online at
+https://mlperf.github.io/inference/loadgen/index.html
+
+To update the hosted version, submit a PR to the
+[mlperf.github.io](https://github.com/mlperf/mlperf.github.io) repository.
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen.cfg b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen.cfg
new file mode 100644
index 000000000..fc05853d1
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen.cfg
@@ -0,0 +1,2495 @@
+# Doxyfile 1.8.13
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a double hash (##) is considered a comment and is placed in
+# front of the TAG it is preceding.
+#
+# All text after a single hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (\" \").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all text
+# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
+# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
+# for the list of possible encodings.
+# The default value is: UTF-8.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
+# double-quotes, unless you are using Doxywizard) that should identify the
+# project for which the documentation is generated. This name is used in the
+# title of most generated pages and in a few other places.
+# The default value is: My Project.
+
+PROJECT_NAME           = "LoadGen Guide"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
+# could be handy for archiving the generated documentation or if some version
+# control system is used.
+
+PROJECT_NUMBER         =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer a
+# quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF          =
+
+# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
+# in the documentation. The maximum height of the logo should not exceed 55
+# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
+# the logo to the output directory.
+
+PROJECT_LOGO           = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/mlperf_logo_horizontal_color.svg
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
+# into which the generated documentation will be written. If a relative path is
+# entered, it will be relative to the location where doxygen was started. If
+# left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = $(MLPERF_DOXYGEN_OUT_PATH)
+
+# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
+# directories (in 2 levels) under the output directory of each output format and
+# will distribute the generated files over these directories. Enabling this
+# option can be useful when feeding doxygen a huge amount of source files, where
+# putting all generated files in the same directory would otherwise causes
+# performance problems for the file system.
+# The default value is: NO.
+
+CREATE_SUBDIRS         = NO
+
+# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
+# characters to appear in the names of generated files. If set to NO, non-ASCII
+# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
+# U+3044.
+# The default value is: NO.
+
+ALLOW_UNICODE_NAMES    = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
+# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
+# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
+# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
+# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
+# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
+# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
+# Ukrainian and Vietnamese.
+# The default value is: English.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
+# descriptions after the members that are listed in the file and class
+# documentation (similar to Javadoc). Set to NO to disable this.
+# The default value is: YES.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief
+# description of a member or function before the detailed description
+#
+# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+# The default value is: YES.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator that is
+# used to form the text in various listings. Each string in this list, if found
+# as the leading text of the brief description, will be stripped from the text
+# and the result, after processing the whole list, is used as the annotated
+# text. Otherwise, the brief description is used as-is. If left blank, the
+# following values are used ($name is automatically replaced with the name of
+# the entity):The $name class, The $name widget, The $name file, is, provides,
+# specifies, contains, represents, a, an and the.
+
+ABBREVIATE_BRIEF       = "The $name class" \
+                         "The $name widget" \
+                         "The $name file" \
+                         is \
+                         provides \
+                         specifies \
+                         contains \
+                         represents \
+                         a \
+                         an \
+                         the
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# doxygen will generate a detailed section even if there is only a brief
+# description.
+# The default value is: NO.
+
+ALWAYS_DETAILED_SEC    = YES
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+# The default value is: NO.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
+# before files name in the file list and in the header files. If set to NO the
+# shortest path that makes the file name unique will be used
+# The default value is: YES.
+
+FULL_PATH_NAMES        = YES
+
+# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
+# Stripping is only done if one of the specified strings matches the left-hand
+# part of the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the path to
+# strip.
+#
+# Note that you can specify absolute paths here, but also relative paths, which
+# will be relative from the directory where doxygen is started.
+# This tag requires that the tag FULL_PATH_NAMES is set to YES.
+
+STRIP_FROM_PATH        =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
+# path mentioned in the documentation of a class, which tells the reader which
+# header file to include in order to use a class. If left blank only the name of
+# the header file containing the class definition is used. Otherwise one should
+# specify the list of include paths that are normally passed to the compiler
+# using the -I flag.
+
+STRIP_FROM_INC_PATH    =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
+# less readable) file names. This can be useful is your file systems doesn't
+# support long names like on DOS, Mac, or CD-ROM.
+# The default value is: NO.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
+# first line (until the first dot) of a Javadoc-style comment as the brief
+# description. If set to NO, the Javadoc-style will behave just like regular Qt-
+# style comments (thus requiring an explicit @brief command for a brief
+# description.)
+# The default value is: NO.
+
+JAVADOC_AUTOBRIEF      = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
+# line (until the first dot) of a Qt-style comment as the brief description. If
+# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
+# requiring an explicit \brief command for a brief description.)
+# The default value is: NO.
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
+# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
+# a brief description. This used to be the default behavior. The new default is
+# to treat a multi-line C++ comment block as a detailed description. Set this
+# tag to YES if you prefer the old behavior instead.
+#
+# Note that setting this tag to YES also means that rational rose comments are
+# not recognized any more.
+# The default value is: NO.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
+# documentation from any documented member that it re-implements.
+# The default value is: YES.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
+# page for each member. If set to NO, the documentation of a member will be part
+# of the file/class/namespace that contains it.
+# The default value is: NO.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
+# uses this value to replace tabs by spaces in code fragments.
+# Minimum value: 1, maximum value: 16, default value: 4.
+
+TAB_SIZE               = 4
+
+# This tag can be used to specify a number of aliases that act as commands in
+# the documentation. An alias has the form:
+# name=value
+# For example adding
+# "sideeffect=@par Side Effects:\n"
+# will allow you to put the command \sideeffect (or @sideeffect) in the
+# documentation, which will result in a user-defined paragraph with heading
+# "Side Effects:". You can put \n's in the value part of an alias to insert
+# newlines.
+
+ALIASES                =
+
+# This tag can be used to specify a number of word-keyword mappings (TCL only).
+# A mapping has the form "name=value". For example adding "class=itcl::class"
+# will allow you to use the command class in the itcl::class meaning.
+
+TCL_SUBST              =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
+# only. Doxygen will then generate output that is more tailored for C. For
+# instance, some of the names that are used will be different. The list of all
+# members will be omitted, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_FOR_C  = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
+# Python sources only. Doxygen will then generate output that is more tailored
+# for that language. For instance, namespaces will be presented as packages,
+# qualified scopes will look different, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources. Doxygen will then generate output that is tailored for Fortran.
+# The default value is: NO.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for VHDL.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension, and
+# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
+# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
+# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
+# Fortran. In the later case the parser tries to guess whether the code is fixed
+# or free formatted code, this is the default for Fortran type files), VHDL. For
+# instance to make doxygen treat .inc files as Fortran files (default is PHP),
+# and .f files as C (default is Fortran), use: inc=Fortran f=C.
+#
+# Note: For files without extension you can use no_extension as a placeholder.
+#
+# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
+# the files are not read by doxygen.
+
+EXTENSION_MAPPING      =
+
+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
+# according to the Markdown format, which allows for more readable
+# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you can
+# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
+# case of backward compatibilities issues.
+# The default value is: YES.
+
+MARKDOWN_SUPPORT       = YES
+
+# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
+# to that level are automatically included in the table of contents, even if
+# they do not have an id attribute.
+# Note: This feature currently applies only to Markdown headings.
+# Minimum value: 0, maximum value: 99, default value: 0.
+# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
+
+TOC_INCLUDE_HEADINGS   = 1
+
+# When enabled doxygen tries to link words that correspond to documented
+# classes, or namespaces to their corresponding documentation. Such a link can
+# be prevented in individual cases by putting a % sign in front of the word or
+# globally by setting AUTOLINK_SUPPORT to NO.
+# The default value is: YES.
+
+AUTOLINK_SUPPORT       = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should set this
+# tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string);
+# versus func(std::string) {}). This also make the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+# The default value is: NO.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+# The default value is: NO.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
+# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
+# will parse them like normal C++ but will assume all classes use public instead
+# of private inheritance when no explicit protection keyword is present.
+# The default value is: NO.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate
+# getter and setter methods for a property. Setting this option to YES will make
+# doxygen to replace the get and set methods by a property in the documentation.
+# This will only work if the methods are indeed getting or setting a simple
+# type. If this is not the case, or you want to show the methods anyway, you
+# should set this option to NO.
+# The default value is: YES.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+# The default value is: NO.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# If one adds a struct or class to a group and this option is enabled, then also
+# any nested class or struct is added to the same group. By default this option
+# is disabled and one has to add nested compounds explicitly via \ingroup.
+# The default value is: NO.
+
+GROUP_NESTED_COMPOUNDS = NO
+
+# Set the SUBGROUPING tag to YES to allow class member groups of the same type
+# (for instance a group of public functions) to be put as a subgroup of that
+# type (e.g. under the Public Functions section). Set it to NO to prevent
+# subgrouping. Alternatively, this can be done per class using the
+# \nosubgrouping command.
+# The default value is: YES.
+
+SUBGROUPING            = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
+# are shown inside the group in which they are included (e.g. using \ingroup)
+# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
+# and RTF).
+#
+# Note that this feature does not work in combination with
+# SEPARATE_MEMBER_PAGES.
+# The default value is: NO.
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
+# with only public data fields or simple typedef fields will be shown inline in
+# the documentation of the scope in which they are defined (i.e. file,
+# namespace, or group documentation), provided this scope is documented. If set
+# to NO, structs, classes, and unions are shown on a separate page (for HTML and
+# Man pages) or section (for LaTeX and RTF).
+# The default value is: NO.
+
+INLINE_SIMPLE_STRUCTS  = NO
+
+# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
+# enum is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically be
+# useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+# The default value is: NO.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
+# cache is used to resolve symbols given their name and scope. Since this can be
+# an expensive process and often the same symbol appears multiple times in the
+# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
+# doxygen will become slower. If the cache is too large, memory is wasted. The
+# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
+# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
+# symbols. At the end of a run doxygen will report the cache usage and suggest
+# the optimal cache size from a speed point of view.
+# Minimum value: 0, maximum value: 9, default value: 0.
+
+LOOKUP_CACHE_SIZE      = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in
+# documentation are documented, even if no documentation was available. Private
+# class members and static file members will be hidden unless the
+# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
+# Note: This will also disable the warnings about undocumented members that are
+# normally produced when WARNINGS is set to YES.
+# The default value is: NO.
+
+EXTRACT_ALL            = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
+# be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIVATE        = YES
+
+# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
+# scope will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PACKAGE        = YES
+
+# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
+# included in the documentation.
+# The default value is: NO.
+
+EXTRACT_STATIC         = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
+# locally in source files will be included in the documentation. If set to NO,
+# only classes defined in header files are included. Does not have any effect
+# for Java sources.
+# The default value is: YES.
+
+EXTRACT_LOCAL_CLASSES  = YES
+
+# This flag is only useful for Objective-C code. If set to YES, local methods,
+# which are defined in the implementation section but not in the interface are
+# included in the documentation. If set to NO, only methods in the interface are
+# included.
+# The default value is: NO.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base name of
+# the file that contains the anonymous namespace. By default anonymous namespace
+# are hidden.
+# The default value is: NO.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
+# undocumented members inside documented classes or files. If set to NO these
+# members will be included in the various overviews, but no documentation
+# section is generated. This option has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy. If set
+# to NO, these classes will be included in the various overviews. This option
+# has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_CLASSES     = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
+# (class|struct|union) declarations. If set to NO, these declarations will be
+# included in the documentation.
+# The default value is: NO.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
+# documentation blocks found inside the body of a function. If set to NO, these
+# blocks will be appended to the function's detailed documentation block.
+# The default value is: NO.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation that is typed after a
+# \internal command is included. If the tag is set to NO then the documentation
+# will be excluded. Set it to YES to include the internal documentation.
+# The default value is: NO.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
+# names in lower-case letters. If set to YES, upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+# The default value is: system dependent.
+
+CASE_SENSE_NAMES       = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
+# their full class and namespace scopes in the documentation. If set to YES, the
+# scope will be hidden.
+# The default value is: NO.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
+# append additional text to a page's title, such as Class Reference. If set to
+# YES the compound reference will be hidden.
+# The default value is: NO.
+
+HIDE_COMPOUND_REFERENCE= NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
+# the files that are included by a file in the documentation of that file.
+# The default value is: YES.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
+# grouped member an include statement to the documentation, telling the reader
+# which file to include in order to use the member.
+# The default value is: NO.
+
+SHOW_GROUPED_MEMB_INC  = NO
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
+# files with double quotes in the documentation rather than with sharp brackets.
+# The default value is: NO.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
+# documentation for inline members.
+# The default value is: YES.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
+# (detailed) documentation of file and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order.
+# The default value is: YES.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
+# descriptions of file, namespace and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order. Note that
+# this will also influence the order of the classes in the class list.
+# The default value is: NO.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
+# (brief and detailed) documentation of class members so that constructors and
+# destructors are listed first. If set to NO the constructors will appear in the
+# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
+# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
+# member documentation.
+# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
+# detailed member documentation.
+# The default value is: NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
+# of group names into alphabetical order. If set to NO the group names will
+# appear in their defined order.
+# The default value is: NO.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
+# fully-qualified names, including namespaces. If set to NO, the class list will
+# be sorted only by class name, not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the alphabetical
+# list.
+# The default value is: NO.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
+# type resolution of all parameters of a function it will reject a match between
+# the prototype and the implementation of a member function even if there is
+# only one candidate or it is obvious which candidate to choose by doing a
+# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
+# accept a match between prototype and implementation in such cases.
+# The default value is: NO.
+
+STRICT_PROTO_MATCHING  = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
+# list. This list is created by putting \todo commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test
+# list. This list is created by putting \test commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug
+# list. This list is created by putting \bug commands in the documentation.
+# The default value is: YES.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)
+# the deprecated list. This list is created by putting \deprecated commands in
+# the documentation.
+# The default value is: YES.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional documentation
+# sections, marked by \if <section_label> ... \endif and \cond <section_label>
+# ... \endcond blocks.
+
+ENABLED_SECTIONS       =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
+# initial value of a variable or macro / define can have for it to appear in the
+# documentation. If the initializer consists of more lines than specified here
+# it will be hidden. Use a value of 0 to hide initializers completely. The
+# appearance of the value of individual variables and macros / defines can be
+# controlled using \showinitializer or \hideinitializer command in the
+# documentation regardless of this setting.
+# Minimum value: 0, maximum value: 10000, default value: 30.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
+# the bottom of the documentation of classes and structs. If set to YES, the
+# list will mention the files that were used to generate the documentation.
+# The default value is: YES.
+
+SHOW_USED_FILES        = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
+# will remove the Files entry from the Quick Index and from the Folder Tree View
+# (if specified).
+# The default value is: YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
+# page. This will remove the Namespaces entry from the Quick Index and from the
+# Folder Tree View (if specified).
+# The default value is: YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command command input-file, where command is the value of the
+# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
+# by doxygen. Whatever the program writes to standard output is used as the file
+# version. For an example see the documentation.
+
+FILE_VERSION_FILTER    =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option. You can
+# optionally specify a file name after the option, if omitted DoxygenLayout.xml
+# will be used as the name of the layout file.
+#
+# Note that if you run doxygen from a directory containing a file called
+# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
+# tag is left empty.
+
+LAYOUT_FILE            = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/doxygen_layout.xml
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
+# the reference definitions. This must be a list of .bib files. The .bib
+# extension is automatically appended if omitted. This requires the bibtex tool
+# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
+# For LaTeX the style of the bibliography can be controlled using
+# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
+# search path. See also \cite for info how to create references.
+
+CITE_BIB_FILES         =
+
+#---------------------------------------------------------------------------
+# Configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated to
+# standard output by doxygen. If QUIET is set to YES this implies that the
+# messages are off.
+# The default value is: NO.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
+# this implies that the warnings are on.
+#
+# Tip: Turn warnings on while writing the documentation.
+# The default value is: YES.
+
+WARNINGS               = YES
+
+# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
+# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
+# will automatically be disabled.
+# The default value is: YES.
+
+WARN_IF_UNDOCUMENTED   = NO
+
+# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some parameters
+# in a documented function, or documenting parameters that don't exist or using
+# markup commands wrongly.
+# The default value is: YES.
+
+WARN_IF_DOC_ERROR      = YES
+
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
+# are documented, but have no documentation for their parameters or return
+# value. If set to NO, doxygen will only warn about wrong or incomplete
+# parameter documentation, but not about the absence of documentation.
+# The default value is: NO.
+
+WARN_NO_PARAMDOC       = NO
+
+# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
+# a warning is encountered.
+# The default value is: NO.
+
+WARN_AS_ERROR          = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that doxygen
+# can produce. The string should contain the $file, $line, and $text tags, which
+# will be replaced by the file and line number from which the warning originated
+# and the warning text. Optionally the format may contain $version, which will
+# be replaced by the version of the file (if it could be obtained via
+# FILE_VERSION_FILTER)
+# The default value is: $file:$line: $text.
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning and error
+# messages should be written. If left blank the output is written to standard
+# error (stderr).
+
+WARN_LOGFILE           =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag is used to specify the files and/or directories that contain
+# documented source files. You may enter file names like myfile.cpp or
+# directories like /usr/src/myproject. Separate the files or directories with
+# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
+# Note: If this tag is empty the current directory is searched.
+
+INPUT                  = $(MLPERF_LOADGEN_SRC_PATH)
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
+# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
+# documentation (see: http://www.gnu.org/software/libiconv) for the list of
+# possible encodings.
+# The default value is: UTF-8.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# read by doxygen.
+#
+# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
+# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
+# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
+# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08,
+# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf.
+
+FILE_PATTERNS          = *.c \
+                         *.cc \
+                         *.cxx \
+                         *.cpp \
+                         *.c++ \
+                         *.java \
+                         *.ii \
+                         *.ixx \
+                         *.ipp \
+                         *.i++ \
+                         *.inl \
+                         *.idl \
+                         *.ddl \
+                         *.odl \
+                         *.h \
+                         *.hh \
+                         *.hxx \
+                         *.hpp \
+                         *.h++ \
+                         *.cs \
+                         *.d \
+                         *.php \
+                         *.php4 \
+                         *.php5 \
+                         *.phtml \
+                         *.inc \
+                         *.m \
+                         *.markdown \
+                         *.md \
+                         *.mm \
+                         *.dox \
+                         *.py \
+                         *.pyw \
+                         *.f90 \
+                         *.f95 \
+                         *.f03 \
+                         *.f08 \
+                         *.f \
+                         *.for \
+                         *.tcl \
+                         *.vhd \
+                         *.vhdl \
+                         *.ucf \
+                         *.qsf
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories should
+# be searched for input files as well.
+# The default value is: NO.
+
+RECURSIVE              = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+#
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE                = depot_tools
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+# The default value is: NO.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories.
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories use the pattern */test/*
+
+EXCLUDE_SYMBOLS        =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or directories
+# that contain example code fragments that are included (see the \include
+# command).
+
+EXAMPLE_PATH           =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank all
+# files are included.
+
+EXAMPLE_PATTERNS       = *
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH             = $(MLPERF_LOADGEN_SRC_PATH)/docs/src
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+# <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+INPUT_FILTER           =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+FILTER_PATTERNS        =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES    = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+#
+# Note: To get rid of all source code in the generated output, make sure that
+# also VERBATIM_HEADERS is set to NO.
+# The default value is: NO.
+
+SOURCE_BROWSER         = YES
+
+# Setting the INLINE_SOURCES tag to YES will include the body of functions,
+# classes and enums directly into the documentation.
+# The default value is: NO.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
+# special comment blocks from generated source code fragments. Normal C, C++ and
+# Fortran comments will always remain visible.
+# The default value is: YES.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
+# function all documented functions referencing it will be listed.
+# The default value is: NO.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES then for each documented function
+# all documented entities called/used by that function will be listed.
+# The default value is: NO.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
+# to YES then the hyperlinks from functions in REFERENCES_RELATION and
+# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
+# link to the documentation.
+# The default value is: YES.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
+# source code will show a tooltip with additional information such as prototype,
+# brief description and links to the definition and documentation. Since this
+# will make the HTML file larger and loading of large files a bit slower, you
+# can opt to disable this feature.
+# The default value is: YES.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+SOURCE_TOOLTIPS        = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code will
+# point to the HTML generated by the htags(1) tool instead of doxygen built-in
+# source browser. The htags tool is part of GNU's global source tagging system
+# (see http://www.gnu.org/software/global/global.html). You will need version
+# 4.8.6 or higher.
+#
+# To use it do the following:
+# - Install the latest version of global
+# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
+# - Make sure the INPUT points to the root of the source tree
+# - Run doxygen as normal
+#
+# Doxygen will invoke htags (and that will in turn invoke gtags), so these
+# tools must be available from the command line (i.e. in the search path).
+#
+# The result: instead of the source browser generated by doxygen, the links to
+# source code will now point to the output of htags.
+# The default value is: NO.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a
+# verbatim copy of the header file for each class for which an include is
+# specified. Set to NO to disable this.
+# See also: Section \class.
+# The default value is: YES.
+
+VERBATIM_HEADERS       = YES
+
+# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
+# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
+# cost of reduced performance. This can be particularly helpful with template
+# rich C++ code for which doxygen's built-in parser lacks the necessary type
+# information.
+# Note: The availability of this option depends on whether or not doxygen was
+# generated with the -Duse-libclang=ON option for CMake.
+# The default value is: NO.
+
+CLANG_ASSISTED_PARSING = YES
+
+# If clang assisted parsing is enabled you can provide the compiler with command
+# line options that you would normally use when invoking the compiler. Note that
+# the include paths will already be set by doxygen for the files and directories
+# specified with INPUT and INCLUDE_PATH.
+# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
+
+CLANG_OPTIONS          = -I ../third_party/pybind/include --std=c++14
+
+#---------------------------------------------------------------------------
+# Configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
+# compounds will be generated. Enable this if the project contains a lot o=
+# classes, structs, unions or interfaces.
+# The default value is: YES.
+
+ALPHABETICAL_INDEX     = YES
+
+# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
+# which the alphabetical index list will be split.
+# Minimum value: 1, maximum value: 20, default value: 5.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all classes will
+# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
+# can be used to specify a prefix (or a list of prefixes) that should be ignored
+# while generating the index headers.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+IGNORE_PREFIX          =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output
+# The default value is: YES.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
+# generated HTML page (for example: .htm, .php, .asp).
+# The default value is: .html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
+# each generated HTML page. If the tag is left blank doxygen will generate a
+# standard header.
+#
+# To get valid HTML the header file that includes any scripts and style sheets
+# that doxygen needs, which is dependent on the configuration options used (e.g.
+# the setting GENERATE_TREEVIEW). It is highly recommended to start with a
+# default header using
+# doxygen -w html new_header.html new_footer.html new_stylesheet.css
+# YourConfigFile
+# and then modify the file new_header.html. See also section "Doxygen usage"
+# for information on how to generate the default header that doxygen normally
+# uses.
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. For a description
+# of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_HEADER            = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/doxygen_header.html
+
+# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
+# generated HTML page. If the tag is left blank doxygen will generate a standard
+# footer. See HTML_HEADER for more information on how to generate a default
+# footer and what special commands can be used inside the footer. See also
+# section "Doxygen usage" for information on how to generate the default footer
+# that doxygen normally uses.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FOOTER            = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/doxygen_footer.html
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
+# sheet that is used by each HTML page. It can be used to fine-tune the look of
+# the HTML output. If left blank doxygen will generate a default style sheet.
+# See also section "Doxygen usage" for information on how to generate the style
+# sheet that doxygen normally uses.
+# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
+# it is more robust and this tag (HTML_STYLESHEET) will in the future become
+# obsolete.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_STYLESHEET        =
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# cascading style sheets that are included after the standard style sheets
+# created by doxygen. Using this option one can overrule certain style aspects.
+# This is preferred over using HTML_STYLESHEET since it does not replace the
+# standard style sheet and is therefore more robust against future updates.
+# Doxygen will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list). For an example see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_STYLESHEET  = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/doxygen_stylesheet.css
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
+# files will be copied as-is; there are no commands or markers available.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_FILES       = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/mlperf_icon.png \
+                         $(MLPERF_LOADGEN_SRC_PATH)/loadgen_integration_diagram.svg
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
+# will adjust the colors in the style sheet and background images according to
+# this color. Hue is specified as an angle on a colorwheel, see
+# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
+# purple, and 360 is red again.
+# Minimum value: 0, maximum value: 359, default value: 220.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_HUE    = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
+# in the HTML output. For a value of 0 the output will use grayscales only. A
+# value of 255 will produce the most vivid colors.
+# Minimum value: 0, maximum value: 255, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_SAT    = 127
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
+# luminance component of the colors in the HTML output. Values below 100
+# gradually make the output lighter, whereas values above 100 make the output
+# darker. The value divided by 100 is the actual gamma applied, so 80 represents
+# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not
+# change the gamma.
+# Minimum value: 40, maximum value: 240, default value: 80.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_GAMMA  = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting this
+# to YES can help to show when doxygen was last run and thus if the
+# documentation is up to date.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_TIMESTAMP         = NO
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_SECTIONS  = YES
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
+# shown in the various tree structured indices initially; the user can expand
+# and collapse entries dynamically later on. Doxygen will expand the tree to
+# such a level that at most the specified number of entries are visible (unless
+# a fully collapsed tree already exceeds this amount). So setting the number of
+# entries 1 will produce a full collapsed tree by default. 0 is a special value
+# representing an infinite number of entries and will result in a full expanded
+# tree by default.
+# Minimum value: 0, maximum value: 9999, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_INDEX_NUM_ENTRIES = 50
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files will be
+# generated that can be used as input for Apple's Xcode 3 integrated development
+# environment (see: http://developer.apple.com/tools/xcode/), introduced with
+# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
+# Makefile in the HTML output directory. Running make will produce the docset in
+# that directory and running make install will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
+# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_DOCSET        = NO
+
+# This tag determines the name of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# The default value is: Doxygen generated docs.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# This tag specifies a string that should uniquely identify the documentation
+# set bundle. This should be a reverse domain-name style string, e.g.
+# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+# The default value is: org.doxygen.Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+# The default value is: Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_NAME  = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
+# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
+# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
+# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
+# Windows.
+#
+# The HTML Help Workshop contains a compiler that can convert all HTML output
+# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
+# files are now used as the Windows 98 help format, and will replace the old
+# Windows help format (.hlp) on all Windows platforms in the future. Compressed
+# HTML files also contain an index, a table of contents, and you can search for
+# words in the documentation. The HTML workshop also contains a viewer for
+# compressed HTML files.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_HTMLHELP      = NO
+
+# The CHM_FILE tag can be used to specify the file name of the resulting .chm
+# file. You can add a path in front of the file if the result should not be
+# written to the html output directory.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_FILE               =
+
+# The HHC_LOCATION tag can be used to specify the location (absolute path
+# including file name) of the HTML help compiler (hhc.exe). If non-empty,
+# doxygen will try to run the HTML help compiler on the generated index.hhp.
+# The file has to be specified with full path.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+HHC_LOCATION           =
+
+# The GENERATE_CHI flag controls if a separate .chi index file is generated
+# (YES) or that it should be included in the master .chm file (NO).
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+GENERATE_CHI           = NO
+
+# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
+# and project file content.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_INDEX_ENCODING     =
+
+# The BINARY_TOC flag controls whether a binary table of contents is generated
+# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
+# enables the Previous and Next buttons.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members to
+# the table of contents of the HTML help documentation and to the tree view.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
+# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
+# (.qch) of the generated HTML documentation.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
+# the file name of the resulting .qch file. The path specified is relative to
+# the HTML output folder.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QCH_FILE               =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
+# Project output. For more information please see Qt Help Project / Namespace
+# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_NAMESPACE          = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
+# Help Project output. For more information please see Qt Help Project / Virtual
+# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
+# folders).
+# The default value is: doc.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
+# filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_NAME   =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_ATTRS  =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. Qt Help Project / Filter Attributes (see:
+# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_SECT_FILTER_ATTRS  =
+
+# The QHG_LOCATION tag can be used to specify the location of Qt's
+# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
+# generated .qhp file.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHG_LOCATION           =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
+# generated, together with the HTML files, they form an Eclipse help plugin. To
+# install this plugin and make it available under the help contents menu in
+# Eclipse, the contents of the directory containing the HTML and XML files needs
+# to be copied into the plugins directory of eclipse. The name of the directory
+# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
+# After copying Eclipse needs to be restarted before the help appears.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the Eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have this
+# name. Each documentation set should have its own identifier.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# If you want full control over the layout of the generated HTML pages it might
+# be necessary to disable the index and replace it with your own. The
+# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
+# of each HTML page. A value of NO enables the index and the value YES disables
+# it. Since the tabs in the index contain the same information as the navigation
+# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+DISABLE_INDEX          = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information. If the tag
+# value is set to YES, a side panel will be generated containing a tree-like
+# index structure (just like the one that is generated for HTML Help). For this
+# to work a browser that supports JavaScript, DHTML, CSS and frames is required
+# (i.e. any modern browser). Windows users are probably better off using the
+# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
+# further fine-tune the look of the index. As an example, the default style
+# sheet generated by doxygen has an example that shows how to put an image at
+# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
+# the same information as the tab index, you could consider setting
+# DISABLE_INDEX to YES when enabling this option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_TREEVIEW      = YES
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
+# doxygen will group on one line in the generated HTML documentation.
+#
+# Note that a value of 0 will completely suppress the enum values from appearing
+# in the overview section.
+# Minimum value: 0, maximum value: 20, default value: 4.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
+# to set the initial width (in pixels) of the frame in which the tree is shown.
+# Minimum value: 0, maximum value: 1500, default value: 250.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+TREEVIEW_WIDTH         = 250
+
+# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
+# external symbols imported via tag files in a separate window.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+EXT_LINKS_IN_WINDOW    = NO
+
+# Use this tag to change the font size of LaTeX formulas included as images in
+# the HTML documentation. When you change the font size after a successful
+# doxygen run you need to manually remove any form_*.png images from the HTML
+# output directory to force them to be regenerated.
+# Minimum value: 8, maximum value: 50, default value: 10.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_FONTSIZE       = 10
+
+# Use the FORMULA_TRANPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are not
+# supported properly for IE 6.0, but are supported on all modern browsers.
+#
+# Note that when changing this option you need to delete any form_*.png files in
+# the HTML output directory before the changes have effect.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_TRANSPARENT    = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# http://www.mathjax.org) which uses client side Javascript for the rendering
+# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want to formulas look prettier in the HTML output. When
+# enabled you may also need to install MathJax separately and configure the path
+# to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX            = NO
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. See the MathJax site (see:
+# http://docs.mathjax.org/en/latest/output.html) for more details.
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility), NativeMML (i.e. MathML) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT         = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from http://www.mathjax.org before deployment.
+# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS     =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE       =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
+# key> to jump into the search results window, the results can be navigated
+# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
+# the search. The filter options can be selected when the cursor is inside the
+# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
+# to select a filter and <Enter> or <escape> to activate or cancel the filter
+# option.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+SEARCHENGINE           = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a web server instead of a web client using Javascript. There
+# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
+# setting. When disabled, doxygen will generate a PHP script for searching and
+# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
+# and searching needs to be provided by external tools. See the section
+# "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SERVER_BASED_SEARCH    = NO
+
+# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
+# script for searching. Instead the search results are written to an XML file
+# which needs to be processed by an external indexer. Doxygen will invoke an
+# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
+# search results.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: http://xapian.org/).
+#
+# See the section "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH        = NO
+
+# The SEARCHENGINE_URL should point to a search engine hosted by a web server
+# which will return the search results when EXTERNAL_SEARCH is enabled.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: http://xapian.org/). See the section "External Indexing and
+# Searching" for details.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHENGINE_URL       =
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
+# search data is written to a file for indexing by an external tool. With the
+# SEARCHDATA_FILE tag the name of this file can be specified.
+# The default file is: searchdata.xml.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHDATA_FILE        = searchdata.xml
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
+# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
+# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
+# projects and redirect the results back to the right project.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH_ID     =
+
+# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
+# projects other than the one defined by this configuration file, but that are
+# all added to the same external search index. Each project needs to have a
+# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id of
+# to a relative location where the documentation can be found. The format is:
+# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTRA_SEARCH_MAPPINGS  =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
+# The default value is: YES.
+
+GENERATE_LATEX         = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked.
+#
+# Note that when enabling USE_PDFLATEX this option is only used for generating
+# bitmaps for formulas in the HTML output, but not in the Makefile that is
+# written to the output directory.
+# The default file is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
+# index for LaTeX.
+# The default file is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used by the
+# printer.
+# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
+# 14 inches) and executive (7.25 x 10.5 inches).
+# The default value is: a4.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PAPER_TYPE             = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
+# that should be included in the LaTeX output. The package can be specified just
+# by its name or with the correct syntax as to be used with the LaTeX
+# \usepackage command. To get the times font for instance you can specify :
+# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times}
+# To use the option intlimits with the amsmath package you can specify:
+# EXTRA_PACKAGES=[intlimits]{amsmath}
+# If left blank no extra packages will be included.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+EXTRA_PACKAGES         =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
+# generated LaTeX document. The header should contain everything until the first
+# chapter. If it is left blank doxygen will generate a standard header. See
+# section "Doxygen usage" for information on how to let doxygen write the
+# default header to a separate file.
+#
+# Note: Only use a user-defined header if you know what you are doing! The
+# following commands have a special meaning inside the header: $title,
+# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
+# $projectbrief, $projectlogo. Doxygen will replace $title with the empty
+# string, for the replacement values of the other commands the user is referred
+# to HTML_HEADER.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HEADER           =
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
+# generated LaTeX document. The footer should contain everything after the last
+# chapter. If it is left blank doxygen will generate a standard footer. See
+# LATEX_HEADER for more information on how to generate a default footer and what
+# special commands can be used inside the footer.
+#
+# Note: Only use a user-defined footer if you know what you are doing!
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_FOOTER           =
+
+# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# LaTeX style sheets that are included after the standard style sheets created
+# by doxygen. Using this option one can overrule certain style aspects. Doxygen
+# will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list).
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_STYLESHEET =
+
+# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the LATEX_OUTPUT output
+# directory. Note that the files will be copied as-is; there are no commands or
+# markers available.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_FILES      =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
+# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
+# contain links (just like the HTML output) instead of page references. This
+# makes the output suitable for online browsing using a PDF viewer.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
+# the PDF file directly from the LaTeX files. Set this option to YES, to get a
+# higher quality PDF documentation.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep running
+# if errors occur, instead of asking the user for help. This option is also used
+# when generating formulas in HTML.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BATCHMODE        = NO
+
+# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
+# index chapters (such as File Index, Compound Index, etc.) in the output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HIDE_INDICES     = NO
+
+# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
+# code with syntax highlighting in the LaTeX output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_SOURCE_CODE      = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. See
+# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# The default value is: plain.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BIB_STYLE        = plain
+
+# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_TIMESTAMP        = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The
+# RTF output is optimized for Word 97 and may not look too pretty with other RTF
+# readers/editors.
+# The default value is: NO.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: rtf.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
+# contain hyperlink fields. The RTF file will contain links (just like the HTML
+# output) instead of page references. This makes the output suitable for online
+# browsing using Word or some other Word compatible readers that support those
+# fields.
+#
+# Note: WordPad (write) and others do not support links.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_HYPERLINKS         = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's config
+# file, i.e. a series of assignments. You only have to provide replacements,
+# missing definitions are set to their default value.
+#
+# See also section "Doxygen usage" for information on how to generate the
+# default style sheet that doxygen normally uses.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_STYLESHEET_FILE    =
+
+# Set optional variables used in the generation of an RTF document. Syntax is
+# similar to doxygen's config file. A template extensions file can be generated
+# using doxygen -e rtf extensionFile.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_EXTENSIONS_FILE    =
+
+# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
+# with syntax highlighting in the RTF output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_SOURCE_CODE        = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
+# classes and files.
+# The default value is: NO.
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it. A directory man3 will be created inside the directory specified by
+# MAN_OUTPUT.
+# The default directory is: man.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to the generated
+# man pages. In case the manual section does not start with a number, the number
+# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
+# optional.
+# The default value is: .3.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_EXTENSION          = .3
+
+# The MAN_SUBDIR tag determines the name of the directory created within
+# MAN_OUTPUT in which the man pages are placed. If defaults to man followed by
+# MAN_EXTENSION with the initial . removed.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_SUBDIR             =
+
+# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
+# will generate one additional man file for each entity documented in the real
+# man page(s). These additional files only source the real man page, but without
+# them the man command would be unable to find the correct page.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES, doxygen will generate an XML file that
+# captures the structure of the code including all documentation.
+# The default value is: NO.
+
+GENERATE_XML           = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: xml.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_OUTPUT             = xml
+
+# If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program
+# listings (including syntax highlighting and cross-referencing information) to
+# the XML output. Note that enabling this will significantly increase the size
+# of the XML output.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the DOCBOOK output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files
+# that can be used to generate PDF.
+# The default value is: NO.
+
+GENERATE_DOCBOOK       = NO
+
+# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
+# front of it.
+# The default directory is: docbook.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_OUTPUT         = docbook
+
+# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
+# program listings (including syntax highlighting and cross-referencing
+# information) to the DOCBOOK output. Note that enabling this will significantly
+# increase the size of the DOCBOOK output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_PROGRAMLISTING = NO
+
+#---------------------------------------------------------------------------
+# Configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
+# AutoGen Definitions (see http://autogen.sf.net) file that captures the
+# structure of the code including all documentation. Note that this feature is
+# still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module
+# file that captures the structure of the code including all documentation.
+#
+# Note that this feature is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary
+# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
+# output from the Perl module output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely
+# formatted so it can be parsed by a human reader. This is useful if you want to
+# understand what is going on. On the other hand, if this tag is set to NO, the
+# size of the Perl module output will be much smaller and Perl will parse it
+# just the same.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file are
+# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
+# so different doxyrules.make files included by the same Makefile don't
+# overwrite each other's variables.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all
+# C-preprocessor directives found in the sources and include files.
+# The default value is: YES.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names
+# in the source code. If set to NO, only conditional compilation will be
+# performed. Macro expansion can be done in a controlled way by setting
+# EXPAND_ONLY_PREDEF to YES.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+MACRO_EXPANSION        = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
+# the macro expansion is limited to the macros specified with the PREDEFINED and
+# EXPAND_AS_DEFINED tags.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_ONLY_PREDEF     = NO
+
+# If the SEARCH_INCLUDES tag is set to YES, the include files in the
+# INCLUDE_PATH will be searched if a #include is found.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by the
+# preprocessor.
+# This tag requires that the tag SEARCH_INCLUDES is set to YES.
+
+INCLUDE_PATH           =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will be
+# used.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+INCLUDE_FILE_PATTERNS  =
+
+# The PREDEFINED tag can be used to specify one or more macro names that are
+# defined before the preprocessor is started (similar to the -D option of e.g.
+# gcc). The argument of the tag is a list of macros of the form: name or
+# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
+# is assumed. To prevent a macro definition from being undefined via #undef or
+# recursively expanded use the := operator instead of the = operator.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+PREDEFINED             =
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
+# tag can be used to specify a list of macro names that should be expanded. The
+# macro definition that is found in the sources will be used. Use the PREDEFINED
+# tag if you want to use a different macro definition that overrules the
+# definition found in the source code.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_AS_DEFINED      =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
+# remove all references to function-like macros that are alone on a line, have
+# an all uppercase name, and do not end with a semicolon. Such function macros
+# are typically used for boiler-plate code, and will confuse the parser if not
+# removed.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES tag can be used to specify one or more tag files. For each tag
+# file the location of the external documentation should be added. The format of
+# a tag file without this location is as follows:
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where loc1 and loc2 can be relative or absolute paths or URLs. See the
+# section "Linking to external documentation" for more information about the use
+# of tag files.
+# Note: Each tag file must have a unique name (where the name does NOT include
+# the path). If a tag file is not located in the directory in which doxygen is
+# run, you must also specify the path to the tagfile here.
+
+TAGFILES               =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
+# tag file that is based on the input files it reads. See section "Linking to
+# external documentation" for more information about the usage of tag files.
+
+GENERATE_TAGFILE       =
+
+# If the ALLEXTERNALS tag is set to YES, all external class will be listed in
+# the class index. If set to NO, only the inherited external classes will be
+# listed.
+# The default value is: NO.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will be
+# listed.
+# The default value is: YES.
+
+EXTERNAL_GROUPS        = YES
+
+# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in
+# the related pages index. If set to NO, only the current project's pages will
+# be listed.
+# The default value is: YES.
+
+EXTERNAL_PAGES         = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of 'which perl').
+# The default file (with absolute path) is: /usr/bin/perl.
+
+PERL_PATH              = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
+# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
+# NO turns the diagrams off. Note that this option also works with HAVE_DOT
+# disabled, but it is recommended to install and use dot, since it yields more
+# powerful graphs.
+# The default value is: YES.
+
+CLASS_DIAGRAMS         = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see:
+# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH            =
+
+# You can include diagrams made with dia in doxygen documentation. Doxygen will
+# then run dia to produce the diagram and insert it in the documentation. The
+# DIA_PATH tag allows you to specify the directory where the dia binary resides.
+# If left empty dia is assumed to be found in the default search path.
+
+DIA_PATH               =
+
+# If set to YES the inheritance and collaboration graphs will hide inheritance
+# and usage relations if the target is undocumented or is not a class.
+# The default value is: YES.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz (see:
+# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
+# Bell Labs. The other options in this section have no effect if this option is
+# set to NO
+# The default value is: YES.
+
+HAVE_DOT               = YES
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
+# to run in parallel. When set to 0 doxygen will base this on the number of
+# processors available in the system. You can set it explicitly to a value
+# larger than 0 to get control over the balance between CPU load and processing
+# speed.
+# Minimum value: 0, maximum value: 32, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_NUM_THREADS        = 0
+
+# When you want a differently looking font in the dot files that doxygen
+# generates you can specify the font name using DOT_FONTNAME. You need to make
+# sure dot is able to find the font, which can be done by putting it in a
+# standard location or by setting the DOTFONTPATH environment variable or by
+# setting DOT_FONTPATH to the directory containing the font.
+# The default value is: Helvetica.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTNAME           = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
+# dot graphs.
+# Minimum value: 4, maximum value: 24, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the default font as specified with
+# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
+# the path where dot can find it using this tag.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTPATH           =
+
+# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
+# each documented class showing the direct and indirect inheritance relations.
+# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
+# graph for each documented class showing the direct and indirect implementation
+# dependencies (inheritance, containment, and class references variables) of the
+# class with other documented classes.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
+# groups, showing the direct groups dependencies.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LOOK               = YES
+
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
+# class node. If there are many fields or methods and many nodes the graph may
+# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
+# number of items for each type to make the size more manageable. Set this to 0
+# for no limit. Note that the threshold may be exceeded by 50% before the limit
+# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
+# but if the number exceeds 15, the total amount of fields shown is limited to
+# 10.
+# Minimum value: 0, maximum value: 100, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LIMIT_NUM_FIELDS   = 10
+
+# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
+# collaboration graphs will show the relations between templates and their
+# instances.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+TEMPLATE_RELATIONS     = YES
+
+# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
+# YES then doxygen will generate a graph for each documented file showing the
+# direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDE_GRAPH          = YES
+
+# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
+# set to YES then doxygen will generate a graph for each documented file showing
+# the direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable call graphs for selected
+# functions only using the \callgraph command. Disabling a call graph can be
+# accomplished by means of the command \hidecallgraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALL_GRAPH             = YES
+
+# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable caller graphs for selected
+# functions only using the \callergraph command. Disabling a caller graph can be
+# accomplished by means of the command \hidecallergraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALLER_GRAPH           = YES
+
+# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will graphical
+# hierarchy of all classes instead of a textual one.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
+# dependencies a directory has on other directories in a graphical way. The
+# dependency relations are determined by the #include relations between the
+# files in the directories.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. For an explanation of the image formats see the section
+# output formats in the documentation of the dot tool (Graphviz (see:
+# http://www.graphviz.org/)).
+# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
+# to make the SVG files visible in IE 9+ (other browsers do not have this
+# requirement).
+# Possible values are: png, png:cairo, png:cairo:cairo, png:cairo:gd, png:gd,
+# png:gd:gd, jpg, jpg:cairo, jpg:cairo:gd, jpg:gd, jpg:gd:gd, gif, gif:cairo,
+# gif:cairo:gd, gif:gd, gif:gd:gd, svg, png:gd, png:gd:gd, png:cairo,
+# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
+# png:gdiplus:gdiplus.
+# The default value is: png.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_IMAGE_FORMAT       = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+#
+# Note that this requires a modern browser other than Internet Explorer. Tested
+# and working are Firefox, Chrome, Safari, and Opera.
+# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
+# the SVG files visible. Older versions of IE do not have SVG support.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INTERACTIVE_SVG        = YES
+
+# The DOT_PATH tag can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_PATH               =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the \dotfile
+# command).
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOTFILE_DIRS           =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the \mscfile
+# command).
+
+MSCFILE_DIRS           =
+
+# The DIAFILE_DIRS tag can be used to specify one or more directories that
+# contain dia files that are included in the documentation (see the \diafile
+# command).
+
+DIAFILE_DIRS           =
+
+# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
+# path where java can find the plantuml.jar file. If left blank, it is assumed
+# PlantUML is not used or called during a preprocessing step. Doxygen will
+# generate a warning when it encounters a \startuml command in this case and
+# will not generate output for the diagram.
+
+PLANTUML_JAR_PATH      =
+
+# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a
+# configuration file for plantuml.
+
+PLANTUML_CFG_FILE      =
+
+# When using plantuml, the specified paths are searched for files specified by
+# the !include statement in a plantuml block.
+
+PLANTUML_INCLUDE_PATH  =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
+# that will be shown in the graph. If the number of nodes in a graph becomes
+# larger than this value, doxygen will truncate the graph, which is visualized
+# by representing a node as a red box. Note that doxygen if the number of direct
+# children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
+# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+# Minimum value: 0, maximum value: 10000, default value: 50.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
+# generated by dot. A depth value of 3 means that only nodes reachable from the
+# root by following a path via at most 3 edges will be shown. Nodes that lay
+# further from the root node will be omitted. Note that setting this option to 1
+# or 2 may greatly reduce the computation time needed for large code bases. Also
+# note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+# Minimum value: 0, maximum value: 1000, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not seem
+# to support this out of the box.
+#
+# Warning: Depending on the platform used, enabling this option may lead to
+# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
+# read).
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_TRANSPARENT        = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10) support
+# this, this feature is disabled by default.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_MULTI_TARGETS      = NO
+
+# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
+# explaining the meaning of the various boxes and arrows in the dot generated
+# graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
+# files that are used to generate the various graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_CLEANUP            = YES
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_footer.html b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_footer.html
new file mode 100644
index 000000000..0d7b177af
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_footer.html
@@ -0,0 +1,26 @@
+<!-- HTML footer for doxygen 1.8.13-->
+<!-- start footer part -->
+<!--BEGIN GENERATE_TREEVIEW-->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    $navpath
+    <li class="footer">
+      <div style="flex-grow:1"></div>
+      <div>
+        $generatedby
+        <a href="http://www.doxygen.org/index.html">doxygen</a>
+        $doxygenversion
+      </div>
+    </li>
+  </ul>
+</div>
+<!--END GENERATE_TREEVIEW-->
+<!--BEGIN !GENERATE_TREEVIEW-->
+<hr class="footer"/><address class="footer"><small>
+$generatedby &#160;<a href="http://www.doxygen.org/index.html">
+doxygen
+</a> $doxygenversion
+</small></address>
+<!--END !GENERATE_TREEVIEW-->
+</body>
+</html>
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_header.html b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_header.html
new file mode 100644
index 000000000..91d214b95
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_header.html
@@ -0,0 +1,49 @@
+<!-- HTML header for doxygen 1.8.13-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen $doxygenversion"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<link rel="shortcut icon" href="mlperf_icon.png">
+<!--BEGIN PROJECT_NAME--><title>LoadGen: $title</title><!--END PROJECT_NAME-->
+<!--BEGIN !PROJECT_NAME--><title>$title</title><!--END !PROJECT_NAME-->
+<link href="$relpath^tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="$relpath^jquery.js"></script>
+<script type="text/javascript" src="$relpath^dynsections.js"></script>
+$treeview
+$search
+$mathjax
+<link href="$relpath^$stylesheet" rel="stylesheet" type="text/css" />
+$extrastylesheet
+</head>
+<body>
+<div id="top" style="display:flex; flex-flow:row wrap; justify-content:flex-start; align-items:center;"><!-- do not remove this div, it is closed by doxygen! -->
+
+<!--BEGIN TITLEAREA-->
+<div id="titlearea" style="display:flex; flex-flow:row wrap; justify-content:flex-start; align-items:center;">
+  <!--BEGIN PROJECT_LOGO-->
+  <a href="https://www.mlperf.org"><img alt="MLPerf" src="$relpath^$projectlogo"/ height="55px"></a>
+  <!--END PROJECT_LOGO-->
+  <!--BEGIN PROJECT_NAME-->
+  <div style="padding-left: 1em;">
+    <div id="projectname"><a href="index.html">$projectname</a>
+   <!--BEGIN PROJECT_NUMBER-->&#160;<span id="projectnumber">$projectnumber</span><!--END PROJECT_NUMBER-->
+   </div>
+   <!--BEGIN PROJECT_BRIEF--><div id="projectbrief">$projectbrief</div><!--END PROJECT_BRIEF-->
+  </div>
+  <!--END PROJECT_NAME-->
+  <!--BEGIN !PROJECT_NAME-->
+   <!--BEGIN PROJECT_BRIEF-->
+    <div id="projectbrief" style="padding-left: 1em;">$projectbrief</div>
+   <!--END PROJECT_BRIEF-->
+  <!--END !PROJECT_NAME-->
+  <!--BEGIN DISABLE_INDEX-->
+   <!--BEGIN SEARCHENGINE-->
+   <div>$searchbox</div>
+   <!--END SEARCHENGINE-->
+  <!--END DISABLE_INDEX-->
+</div>
+<!--END TITLEAREA-->
+<!-- end header part -->
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_html_generator.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_html_generator.py
new file mode 100644
index 000000000..4065d7bd0
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_html_generator.py
@@ -0,0 +1,37 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+# \file
+#  \brief A script that sets the environment variables expected by doxygen.cfg.
+#  \details This can be run manually without any arguments, but also allows a
+#  build system to customize the output directory.
+
+import os
+import sys
+
+
+def generate_doxygen_html(doxygen_out_dir, loadgen_root):
+    os.environ["MLPERF_LOADGEN_SRC_PATH"] = loadgen_root
+    os.environ["MLPERF_DOXYGEN_OUT_PATH"] = doxygen_out_dir
+    os.popen("doxygen " + loadgen_root + "/docs/src/doxygen.cfg")
+
+
+def main(argv):
+    doxygen_out_dir = "./docs/gen" if len(argv) < 2 else argv[1]
+    loadgen_root = "." if len(argv) < 3 else argv[2]
+    generate_doxygen_html(doxygen_out_dir, loadgen_root)
+
+
+main(sys.argv)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_layout.xml b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_layout.xml
new file mode 100644
index 000000000..1fc5a9cb4
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_layout.xml
@@ -0,0 +1,211 @@
+<doxygenlayout version="1.0">
+  <!-- Generated by doxygen 1.8.13 -->
+  <!-- Navigation index tabs for HTML output -->
+  <navindex>
+    <tab type="mainpage" visible="yes" title="Overview"/>
+    <tab type="usergroup" visible="yes" title="API and Usage">
+      <tab type="user" url="@ref LoadgenAPI" visible="yes" title="API"/>
+      <tab type="user" url="@ref LoadgenAPITestSettings" visible="yes" title="Settings"/>
+    </tab>
+    <tab type="usergroup" visible="yes" title="Development">
+      <tab type="user" url="@ref ReadmeBuild" visible="yes" title="Building"/>
+      <tab type="user" url="@ref ReadmeHtmlDocs" visible="yes" title="Generating this Document"/>
+      <tab type="user" url="@ref todo" visible="yes" title="TODOs"/>
+    </tab>
+    <tab type="usergroup" visible="yes" title="Test Coverage">
+      <tab type="user" url="@ref ReadmeTests" visible="yes" title="Build and Run"/>
+      <tab type="user" url="@ref LoadgenTestsBasic" visible="yes" title="Basic"/>
+      <tab type="user" url="@ref LoadgenTestsPerformance" visible="yes" title="Performance"/>
+    </tab>
+    <tab type="usergroup" visible="yes" title="The Code">
+      <tab type="modules" visible="no" title="" intro=""/>
+      <tab type="namespaces" visible="yes" title="">
+        <tab type="namespacelist" visible="yes" title="" intro=""/>
+        <tab type="namespacemembers" visible="yes" title="" intro=""/>
+      </tab>
+      <tab type="classes" visible="yes" title="">
+        <tab type="classlist" visible="yes" title="" intro=""/>
+        <tab type="classindex" visible="$ALPHABETICAL_INDEX" title=""/>
+        <tab type="hierarchy" visible="yes" title="" intro=""/>
+        <tab type="classmembers" visible="yes" title="" intro=""/>
+      </tab>
+      <tab type="files" visible="yes" title="">
+        <tab type="filelist" visible="yes" title="" intro=""/>
+        <tab type="globals" visible="yes" title="" intro=""/>
+      </tab>
+      <tab type="examples" visible="no" title="" intro=""/>
+    </tab>
+    <tab type="pages" visible="no" title="" intro=""/>
+    <tab type="user" url="@ref ReadmeFAQ" visible="yes" title="FAQ"/>
+  </navindex>
+
+  <!-- Layout definition for a class page -->
+  <class>
+    <briefdescription visible="yes"/>
+    <includes visible="$SHOW_INCLUDE_FILES"/>
+    <inheritancegraph visible="$CLASS_GRAPH"/>
+    <collaborationgraph visible="$COLLABORATION_GRAPH"/>
+    <memberdecl>
+      <nestedclasses visible="yes" title=""/>
+      <publictypes title=""/>
+      <services title=""/>
+      <interfaces title=""/>
+      <publicslots title=""/>
+      <signals title=""/>
+      <publicmethods title=""/>
+      <publicstaticmethods title=""/>
+      <publicattributes title=""/>
+      <publicstaticattributes title=""/>
+      <protectedtypes title=""/>
+      <protectedslots title=""/>
+      <protectedmethods title=""/>
+      <protectedstaticmethods title=""/>
+      <protectedattributes title=""/>
+      <protectedstaticattributes title=""/>
+      <packagetypes title=""/>
+      <packagemethods title=""/>
+      <packagestaticmethods title=""/>
+      <packageattributes title=""/>
+      <packagestaticattributes title=""/>
+      <properties title=""/>
+      <events title=""/>
+      <privatetypes title=""/>
+      <privateslots title=""/>
+      <privatemethods title=""/>
+      <privatestaticmethods title=""/>
+      <privateattributes title=""/>
+      <privatestaticattributes title=""/>
+      <friends title=""/>
+      <related title="" subtitle=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+    <memberdef>
+      <inlineclasses title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <services title=""/>
+      <interfaces title=""/>
+      <constructors title=""/>
+      <functions title=""/>
+      <related title=""/>
+      <variables title=""/>
+      <properties title=""/>
+      <events title=""/>
+    </memberdef>
+    <allmemberslink visible="yes"/>
+    <usedfiles visible="$SHOW_USED_FILES"/>
+    <authorsection visible="yes"/>
+  </class>
+
+  <!-- Layout definition for a namespace page -->
+  <namespace>
+    <briefdescription visible="yes"/>
+    <memberdecl>
+      <nestednamespaces visible="yes" title=""/>
+      <constantgroups visible="yes" title=""/>
+      <classes visible="yes" title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+    <memberdef>
+      <inlineclasses title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+    </memberdef>
+    <authorsection visible="yes"/>
+  </namespace>
+
+  <!-- Layout definition for a file page -->
+  <file>
+    <briefdescription visible="yes"/>
+    <includes visible="$SHOW_INCLUDE_FILES"/>
+    <includegraph visible="$INCLUDE_GRAPH"/>
+    <includedbygraph visible="$INCLUDED_BY_GRAPH"/>
+    <sourcelink visible="yes"/>
+    <memberdecl>
+      <classes visible="yes" title=""/>
+      <namespaces visible="yes" title=""/>
+      <constantgroups visible="yes" title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+    <memberdef>
+      <inlineclasses title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+    </memberdef>
+    <authorsection/>
+  </file>
+
+  <!-- Layout definition for a group page -->
+  <group>
+    <briefdescription visible="yes"/>
+    <groupgraph visible="$GROUP_GRAPHS"/>
+    <memberdecl>
+      <nestedgroups visible="yes" title=""/>
+      <dirs visible="yes" title=""/>
+      <files visible="yes" title=""/>
+      <namespaces visible="yes" title=""/>
+      <classes visible="yes" title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <enumvalues title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <signals title=""/>
+      <publicslots title=""/>
+      <protectedslots title=""/>
+      <privateslots title=""/>
+      <events title=""/>
+      <properties title=""/>
+      <friends title=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+    <memberdef>
+      <pagedocs/>
+      <inlineclasses title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <enumvalues title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <signals title=""/>
+      <publicslots title=""/>
+      <protectedslots title=""/>
+      <privateslots title=""/>
+      <events title=""/>
+      <properties title=""/>
+      <friends title=""/>
+    </memberdef>
+    <authorsection visible="yes"/>
+  </group>
+
+  <!-- Layout definition for a directory page -->
+  <directory>
+    <briefdescription visible="yes"/>
+    <directorygraph visible="yes"/>
+    <memberdecl>
+      <dirs visible="yes"/>
+      <files visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+  </directory>
+</doxygenlayout>
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_stylesheet.css b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_stylesheet.css
new file mode 100644
index 000000000..3bd61261c
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_stylesheet.css
@@ -0,0 +1,1629 @@
+/* The standard CSS for doxygen 1.8.13 */
+
+body, table, div, p, dl {
+	font: 400 14px/22px Roboto,sans-serif;
+}
+
+p.reference, p.definition {
+	font: 400 14px/22px Roboto,sans-serif;
+}
+
+/* @group Heading Levels */
+
+h1.groupheader {
+	font-size: 150%;
+}
+
+.title {
+	font: 400 14px/28px Roboto,sans-serif;
+	font-size: 175%;
+	font-weight: bold;
+	margin: 10px 2px;
+	color: #135384;
+}
+
+h2.groupheader {
+	border-bottom: 1px solid #879ECB;
+	color: #354C7B;
+	font-size: 150%;
+	font-weight: normal;
+	margin-top: 1.75em;
+	padding-top: 8px;
+	padding-bottom: 4px;
+	width: 100%;
+}
+
+h3.groupheader {
+	font-size: 100%;
+}
+
+h1, h2, h3, h4, h5, h6 {
+	-webkit-transition: text-shadow 0.5s linear;
+	-moz-transition: text-shadow 0.5s linear;
+	-ms-transition: text-shadow 0.5s linear;
+	-o-transition: text-shadow 0.5s linear;
+	transition: text-shadow 0.5s linear;
+	margin-right: 15px;
+	color: #135384;
+
+}
+
+h1.glow, h2.glow, h3.glow, h4.glow, h5.glow, h6.glow {
+	text-shadow: 0 0 15px cyan;
+}
+
+dt {
+	font-weight: bold;
+}
+
+div.multicol {
+	-moz-column-gap: 1em;
+	-webkit-column-gap: 1em;
+	-moz-column-count: 3;
+	-webkit-column-count: 3;
+}
+
+p.startli, p.startdd {
+	margin-top: 2px;
+}
+
+p.starttd {
+	margin-top: 0px;
+}
+
+p.endli {
+	margin-bottom: 0px;
+}
+
+p.enddd {
+	margin-bottom: 4px;
+}
+
+p.endtd {
+	margin-bottom: 2px;
+}
+
+/* @end */
+
+caption {
+	font-weight: bold;
+}
+
+span.legend {
+        font-size: 70%;
+        text-align: center;
+}
+
+h3.version {
+        font-size: 90%;
+        text-align: center;
+}
+
+div.qindex, div.navtab{
+	background-color: #EBEFF6;
+	border: 1px solid #A3B4D7;
+	text-align: center;
+}
+
+div.qindex, div.navpath {
+	width: 100%;
+	line-height: 140%;
+}
+
+div.navtab {
+	margin-right: 15px;
+}
+
+/* @group Link Styling */
+
+a {
+	color: #3D578C;
+	font-weight: normal;
+	text-decoration: none;
+}
+
+.contents a:visited {
+	color: #4665A2;
+}
+
+a:hover {
+	text-decoration: underline;
+}
+
+a.qindex {
+	font-weight: bold;
+}
+
+a.qindexHL {
+	font-weight: bold;
+	background-color: #9CAFD4;
+	color: #ffffff;
+	border: 1px double #869DCA;
+}
+
+.contents a.qindexHL:visited {
+        color: #ffffff;
+}
+
+a.el {
+	font-weight: bold;
+}
+
+a.elRef {
+}
+
+a.code, a.code:visited, a.line, a.line:visited {
+	color: #4665A2; 
+}
+
+a.codeRef, a.codeRef:visited, a.lineRef, a.lineRef:visited {
+	color: #4665A2; 
+}
+
+/* @end */
+
+dl.el {
+	margin-left: -1cm;
+}
+
+pre.fragment {
+        border: 1px solid #C4CFE5;
+        background-color: #FBFCFD;
+        padding: 4px 6px;
+        margin: 4px 8px 4px 2px;
+        overflow: auto;
+        word-wrap: break-word;
+        font-size:  9pt;
+        line-height: 125%;
+        font-family: monospace, fixed;
+        font-size: 105%;
+}
+
+div.fragment {
+        padding: 0px;
+        margin: 4px 8px 4px 2px;
+	background-color: #FBFCFD;
+	border: 1px solid #C4CFE5;
+}
+
+div.line {
+	font-family: monospace, fixed;
+        font-size: 13px;
+	min-height: 13px;
+	line-height: 1.0;
+	text-wrap: unrestricted;
+	white-space: -moz-pre-wrap; /* Moz */
+	white-space: -pre-wrap;     /* Opera 4-6 */
+	white-space: -o-pre-wrap;   /* Opera 7 */
+	white-space: pre-wrap;      /* CSS3  */
+	word-wrap: break-word;      /* IE 5.5+ */
+	text-indent: -53px;
+	padding-left: 53px;
+	padding-bottom: 0px;
+	margin: 0px;
+	-webkit-transition-property: background-color, box-shadow;
+	-webkit-transition-duration: 0.5s;
+	-moz-transition-property: background-color, box-shadow;
+	-moz-transition-duration: 0.5s;
+	-ms-transition-property: background-color, box-shadow;
+	-ms-transition-duration: 0.5s;
+	-o-transition-property: background-color, box-shadow;
+	-o-transition-duration: 0.5s;
+	transition-property: background-color, box-shadow;
+	transition-duration: 0.5s;
+}
+
+div.line:after {
+    content:"\000A";
+    white-space: pre;
+}
+
+div.line.glow {
+	background-color: cyan;
+	box-shadow: 0 0 10px cyan;
+}
+
+
+span.lineno {
+	padding-right: 4px;
+	text-align: right;
+	border-right: 2px solid #0F0;
+	background-color: #E8E8E8;
+        white-space: pre;
+}
+span.lineno a {
+	background-color: #D8D8D8;
+}
+
+span.lineno a:hover {
+	background-color: #C8C8C8;
+}
+
+.lineno {
+	-webkit-touch-callout: none;
+	-webkit-user-select: none;
+	-khtml-user-select: none;
+	-moz-user-select: none;
+	-ms-user-select: none;
+	user-select: none;
+}
+
+div.ah, span.ah {
+	background-color: black;
+	font-weight: bold;
+	color: #ffffff;
+	margin-bottom: 3px;
+	margin-top: 3px;
+	padding: 0.2em;
+	border: solid thin #333;
+	border-radius: 0.5em;
+	-webkit-border-radius: .5em;
+	-moz-border-radius: .5em;
+	box-shadow: 2px 2px 3px #999;
+	-webkit-box-shadow: 2px 2px 3px #999;
+	-moz-box-shadow: rgba(0, 0, 0, 0.15) 2px 2px 2px;
+	background-image: -webkit-gradient(linear, left top, left bottom, from(#eee), to(#000),color-stop(0.3, #444));
+	background-image: -moz-linear-gradient(center top, #eee 0%, #444 40%, #000 110%);
+}
+
+div.classindex ul {
+        list-style: none;
+        padding-left: 0;
+}
+
+div.classindex span.ai {
+        display: inline-block;
+}
+
+div.groupHeader {
+	margin-left: 16px;
+	margin-top: 12px;
+	font-weight: bold;
+}
+
+div.groupText {
+	margin-left: 16px;
+	font-style: italic;
+}
+
+body {
+	background-color: white;
+	color: black;
+        margin: 0;
+}
+
+div.contents {
+	margin-top: 10px;
+	margin-left: 12px;
+	margin-right: 8px;
+}
+
+td.indexkey {
+	background-color: #EBEFF6;
+	font-weight: bold;
+	border: 1px solid #C4CFE5;
+	margin: 2px 0px 2px 0;
+	padding: 2px 10px;
+        white-space: nowrap;
+        vertical-align: top;
+}
+
+td.indexvalue {
+	background-color: #EBEFF6;
+	border: 1px solid #C4CFE5;
+	padding: 2px 10px;
+	margin: 2px 0px;
+}
+
+tr.memlist {
+	background-color: #EEF1F7;
+}
+
+p.formulaDsp {
+	text-align: center;
+}
+
+img.formulaDsp {
+	
+}
+
+img.formulaInl {
+	vertical-align: middle;
+}
+
+div.center {
+	text-align: center;
+        margin-top: 0px;
+        margin-bottom: 0px;
+        padding: 0px;
+}
+
+div.center img {
+	border: 0px;
+}
+
+address.footer {
+	text-align: right;
+	padding-right: 12px;
+}
+
+img.footer {
+	border: 0px;
+	vertical-align: middle;
+}
+
+/* @group Code Colorization */
+
+span.keyword {
+	color: #008000
+}
+
+span.keywordtype {
+	color: #604020
+}
+
+span.keywordflow {
+	color: #e08000
+}
+
+span.comment {
+	color: #800000
+}
+
+span.preprocessor {
+	color: #806020
+}
+
+span.stringliteral {
+	color: #002080
+}
+
+span.charliteral {
+	color: #008080
+}
+
+span.vhdldigit { 
+	color: #ff00ff 
+}
+
+span.vhdlchar { 
+	color: #000000 
+}
+
+span.vhdlkeyword { 
+	color: #700070 
+}
+
+span.vhdllogic { 
+	color: #ff0000 
+}
+
+blockquote {
+        background-color: #F7F8FB;
+        border-left: 2px solid #9CAFD4;
+        margin: 0 24px 0 4px;
+        padding: 0 12px 0 16px;
+}
+
+/* @end */
+
+/*
+.search {
+	color: #003399;
+	font-weight: bold;
+}
+
+form.search {
+	margin-bottom: 0px;
+	margin-top: 0px;
+}
+
+input.search {
+	font-size: 75%;
+	color: #000080;
+	font-weight: normal;
+	background-color: #e8eef2;
+}
+*/
+
+td.tiny {
+	font-size: 75%;
+}
+
+.dirtab {
+	padding: 4px;
+	border-collapse: collapse;
+	border: 1px solid #A3B4D7;
+}
+
+th.dirtab {
+	background: #EBEFF6;
+	font-weight: bold;
+}
+
+hr {
+	height: 0px;
+	border: none;
+	border-top: 1px solid #4A6AAA;
+}
+
+hr.footer {
+	height: 1px;
+}
+
+/* @group Member Descriptions */
+
+table.memberdecls {
+	border-spacing: 0px;
+	padding: 0px;
+}
+
+.memberdecls td, .fieldtable tr {
+	-webkit-transition-property: background-color, box-shadow;
+	-webkit-transition-duration: 0.5s;
+	-moz-transition-property: background-color, box-shadow;
+	-moz-transition-duration: 0.5s;
+	-ms-transition-property: background-color, box-shadow;
+	-ms-transition-duration: 0.5s;
+	-o-transition-property: background-color, box-shadow;
+	-o-transition-duration: 0.5s;
+	transition-property: background-color, box-shadow;
+	transition-duration: 0.5s;
+}
+
+.memberdecls td.glow, .fieldtable tr.glow {
+	background-color: cyan;
+	box-shadow: 0 0 15px cyan;
+}
+
+.mdescLeft, .mdescRight,
+.memItemLeft, .memItemRight,
+.memTemplItemLeft, .memTemplItemRight, .memTemplParams {
+	background-color: #F9FAFC;
+	border: none;
+	margin: 4px;
+	padding: 1px 0 0 8px;
+}
+
+.mdescLeft, .mdescRight {
+	padding: 0px 8px 4px 8px;
+	color: #555;
+}
+
+.memSeparator {
+        border-bottom: 1px solid #DEE4F0;
+        line-height: 1px;
+        margin: 0px;
+        padding: 0px;
+}
+
+.memItemLeft, .memTemplItemLeft {
+        white-space: nowrap;
+}
+
+.memItemRight {
+	width: 100%;
+}
+
+.memTemplParams {
+	color: #4665A2;
+        white-space: nowrap;
+	font-size: 80%;
+}
+
+/* @end */
+
+/* @group Member Details */
+
+/* Styles for detailed member documentation */
+
+.memtitle {
+	padding: 8px;
+	border-top: 1px solid #A8B8D9;
+	border-left: 1px solid #A8B8D9;
+	border-right: 1px solid #A8B8D9;
+	border-top-right-radius: 4px;
+	border-top-left-radius: 4px;
+	margin-bottom: -1px;
+	background-image: url('nav_f.png');
+	background-repeat: repeat-x;
+	background-color: #E2E8F2;
+	line-height: 1.25;
+	font-weight: 300;
+	float:left;
+}
+
+.permalink
+{
+        font-size: 65%;
+        display: inline-block;
+        vertical-align: middle;
+}
+
+.memtemplate {
+	font-size: 80%;
+	color: #4665A2;
+	font-weight: normal;
+	margin-left: 9px;
+}
+
+.memnav {
+	background-color: #EBEFF6;
+	border: 1px solid #A3B4D7;
+	text-align: center;
+	margin: 2px;
+	margin-right: 15px;
+	padding: 2px;
+}
+
+.mempage {
+	width: 100%;
+}
+
+.memitem {
+	padding: 0;
+	margin-bottom: 10px;
+	margin-right: 5px;
+        -webkit-transition: box-shadow 0.5s linear;
+        -moz-transition: box-shadow 0.5s linear;
+        -ms-transition: box-shadow 0.5s linear;
+        -o-transition: box-shadow 0.5s linear;
+        transition: box-shadow 0.5s linear;
+        display: table !important;
+        width: 100%;
+}
+
+.memitem.glow {
+         box-shadow: 0 0 15px cyan;
+}
+
+.memname {
+        font-weight: 400;
+        margin-left: 6px;
+}
+
+.memname td {
+	vertical-align: bottom;
+}
+
+.memproto, dl.reflist dt {
+        border-top: 1px solid #A8B8D9;
+        border-left: 1px solid #A8B8D9;
+        border-right: 1px solid #A8B8D9;
+        padding: 6px 0px 6px 0px;
+        color: #253555;
+        font-weight: bold;
+        text-shadow: 0px 1px 1px rgba(255, 255, 255, 0.9);
+        background-color: #DFE5F1;
+        /* opera specific markup */
+        box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15);
+        border-top-right-radius: 4px;
+        /* firefox specific markup */
+        -moz-box-shadow: rgba(0, 0, 0, 0.15) 5px 5px 5px;
+        -moz-border-radius-topright: 4px;
+        /* webkit specific markup */
+        -webkit-box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15);
+        -webkit-border-top-right-radius: 4px;
+
+}
+
+.overload {
+        font-family: "courier new",courier,monospace;
+	font-size: 65%;
+}
+
+.memdoc, dl.reflist dd {
+        border-bottom: 1px solid #A8B8D9;      
+        border-left: 1px solid #A8B8D9;      
+        border-right: 1px solid #A8B8D9; 
+        padding: 6px 10px 2px 10px;
+        background-color: #FBFCFD;
+        border-top-width: 0;
+        background-image:url('nav_g.png');
+        background-repeat:repeat-x;
+        background-color: #FFFFFF;
+        /* opera specific markup */
+        border-bottom-left-radius: 4px;
+        border-bottom-right-radius: 4px;
+        box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15);
+        /* firefox specific markup */
+        -moz-border-radius-bottomleft: 4px;
+        -moz-border-radius-bottomright: 4px;
+        -moz-box-shadow: rgba(0, 0, 0, 0.15) 5px 5px 5px;
+        /* webkit specific markup */
+        -webkit-border-bottom-left-radius: 4px;
+        -webkit-border-bottom-right-radius: 4px;
+        -webkit-box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15);
+}
+
+dl.reflist dt {
+        padding: 5px;
+}
+
+dl.reflist dd {
+        margin: 0px 0px 10px 0px;
+        padding: 5px;
+}
+
+.paramkey {
+	text-align: right;
+}
+
+.paramtype {
+	white-space: nowrap;
+}
+
+.paramname {
+	color: #602020;
+	white-space: nowrap;
+}
+.paramname em {
+	font-style: normal;
+}
+.paramname code {
+        line-height: 14px;
+}
+
+.params, .retval, .exception, .tparams {
+        margin-left: 0px;
+        padding-left: 0px;
+}       
+
+.params .paramname, .retval .paramname {
+        font-weight: bold;
+        vertical-align: top;
+}
+        
+.params .paramtype {
+        font-style: italic;
+        vertical-align: top;
+}       
+        
+.params .paramdir {
+        font-family: "courier new",courier,monospace;
+        vertical-align: top;
+}
+
+table.mlabels {
+	border-spacing: 0px;
+}
+
+td.mlabels-left {
+	width: 100%;
+	padding: 0px;
+}
+
+td.mlabels-right {
+	vertical-align: bottom;
+	padding: 0px;
+	white-space: nowrap;
+}
+
+span.mlabels {
+        margin-left: 8px;
+}
+
+span.mlabel {
+        background-color: #728DC1;
+        border-top:1px solid #5373B4;
+        border-left:1px solid #5373B4;
+        border-right:1px solid #C4CFE5;
+        border-bottom:1px solid #C4CFE5;
+	text-shadow: none;
+	color: white;
+	margin-right: 4px;
+	padding: 2px 3px;
+	border-radius: 3px;
+	font-size: 7pt;
+	white-space: nowrap;
+	vertical-align: middle;
+}
+
+
+
+/* @end */
+
+/* these are for tree view inside a (index) page */
+
+div.directory {
+        margin: 10px 0px;
+        border-top: 1px solid #9CAFD4;
+        border-bottom: 1px solid #9CAFD4;
+        width: 100%;
+}
+
+.directory table {
+        border-collapse:collapse;
+}
+
+.directory td {
+        margin: 0px;
+        padding: 0px;
+	vertical-align: top;
+}
+
+.directory td.entry {
+        white-space: nowrap;
+        padding-right: 6px;
+	padding-top: 3px;
+}
+
+.directory td.entry a {
+        outline:none;
+}
+
+.directory td.entry a img {
+        border: none;
+}
+
+.directory td.desc {
+        width: 100%;
+        padding-left: 6px;
+	padding-right: 6px;
+	padding-top: 3px;
+	border-left: 1px solid rgba(0,0,0,0.05);
+}
+
+.directory tr.even {
+	padding-left: 6px;
+	background-color: #F7F8FB;
+}
+
+.directory img {
+	vertical-align: -30%;
+}
+
+.directory .levels {
+        white-space: nowrap;
+        width: 100%;
+        text-align: right;
+        font-size: 9pt;
+}
+
+.directory .levels span {
+        cursor: pointer;
+        padding-left: 2px;
+        padding-right: 2px;
+	color: #3D578C;
+}
+
+.arrow {
+    color: #9CAFD4;
+    -webkit-user-select: none;
+    -khtml-user-select: none;
+    -moz-user-select: none;
+    -ms-user-select: none;
+    user-select: none;
+    cursor: pointer;
+    font-size: 80%;
+    display: inline-block;
+    width: 16px;
+    height: 22px;
+}
+
+.icon {
+    font-family: Arial, Helvetica;
+    font-weight: bold;
+    font-size: 12px;
+    height: 14px;
+    width: 16px;
+    display: inline-block;
+    background-color: #728DC1;
+    color: white;
+    text-align: center;
+    border-radius: 4px;
+    margin-left: 2px;
+    margin-right: 2px;
+}
+
+.icona {
+    width: 24px;
+    height: 22px;
+    display: inline-block;
+}
+
+.iconfopen {
+    width: 24px;
+    height: 18px;
+    margin-bottom: 4px;
+    background-image:url('folderopen.png');
+    background-position: 0px -4px;
+    background-repeat: repeat-y;
+    vertical-align:top;
+    display: inline-block;
+}
+
+.iconfclosed {
+    width: 24px;
+    height: 18px;
+    margin-bottom: 4px;
+    background-image:url('folderclosed.png');
+    background-position: 0px -4px;
+    background-repeat: repeat-y;
+    vertical-align:top;
+    display: inline-block;
+}
+
+.icondoc {
+    width: 24px;
+    height: 18px;
+    margin-bottom: 4px;
+    background-image:url('doc.png');
+    background-position: 0px -4px;
+    background-repeat: repeat-y;
+    vertical-align:top;
+    display: inline-block;
+}
+
+table.directory {
+    font: 400 14px Roboto,sans-serif;
+}
+
+/* @end */
+
+div.dynheader {
+        margin-top: 8px;
+	-webkit-touch-callout: none;
+	-webkit-user-select: none;
+	-khtml-user-select: none;
+	-moz-user-select: none;
+	-ms-user-select: none;
+	user-select: none;
+}
+
+address {
+	font-style: normal;
+	color: #2A3D61;
+}
+
+table.doxtable caption {
+	caption-side: top;
+}
+
+table.doxtable {
+	border-collapse:collapse;
+        margin-top: 4px;
+        margin-bottom: 4px;
+}
+
+table.doxtable td, table.doxtable th {
+	border: 1px solid #2D4068;
+	padding: 3px 7px 2px;
+}
+
+table.doxtable th {
+	background-color: #374F7F;
+	color: #FFFFFF;
+	font-size: 110%;
+	padding-bottom: 4px;
+	padding-top: 5px;
+}
+
+table.fieldtable {
+        /*width: 100%;*/
+        margin-bottom: 10px;
+        border: 1px solid #A8B8D9;
+        border-spacing: 0px;
+        -moz-border-radius: 4px;
+        -webkit-border-radius: 4px;
+        border-radius: 4px;
+        -moz-box-shadow: rgba(0, 0, 0, 0.15) 2px 2px 2px;
+        -webkit-box-shadow: 2px 2px 2px rgba(0, 0, 0, 0.15);
+        box-shadow: 2px 2px 2px rgba(0, 0, 0, 0.15);
+}
+
+.fieldtable td, .fieldtable th {
+        padding: 3px 7px 2px;
+}
+
+.fieldtable td.fieldtype, .fieldtable td.fieldname {
+        white-space: nowrap;
+        border-right: 1px solid #A8B8D9;
+        border-bottom: 1px solid #A8B8D9;
+        vertical-align: top;
+}
+
+.fieldtable td.fieldname {
+        padding-top: 3px;
+}
+
+.fieldtable td.fielddoc {
+        border-bottom: 1px solid #A8B8D9;
+        /*width: 100%;*/
+}
+
+.fieldtable td.fielddoc p:first-child {
+        margin-top: 0px;
+}       
+        
+.fieldtable td.fielddoc p:last-child {
+        margin-bottom: 2px;
+}
+
+.fieldtable tr:last-child td {
+        border-bottom: none;
+}
+
+.fieldtable th {
+        background-image:url('nav_f.png');
+        background-repeat:repeat-x;
+        background-color: #E2E8F2;
+        font-size: 90%;
+        color: #253555;
+        padding-bottom: 4px;
+        padding-top: 5px;
+        text-align:left;
+        font-weight: 400;
+        -moz-border-radius-topleft: 4px;
+        -moz-border-radius-topright: 4px;
+        -webkit-border-top-left-radius: 4px;
+        -webkit-border-top-right-radius: 4px;
+        border-top-left-radius: 4px;
+        border-top-right-radius: 4px;
+        border-bottom: 1px solid #A8B8D9;
+}
+
+
+.tabsearch {
+	top: 0px;
+	left: 10px;
+	height: 36px;
+	background-image: url('tab_b.png');
+	z-index: 101;
+	overflow: hidden;
+	font-size: 13px;
+}
+
+.navpath ul {
+	display: flex;
+	flex-flow: row wrap;
+	justify-content: flex-start;
+	align-items: center;
+	font-size: 11px;
+	background-image:none;
+	background-repeat:repeat-x;
+	background-position: 0 -5px;
+	height:auto;
+	line-height:30px;
+	color:#8AA0CC;
+	border:solid 1px #C2CDE4;
+	overflow:hidden;
+	margin:0px;
+	padding:0px;
+}
+
+.navpath li
+{
+	list-style-type:none;
+	float:left;
+	padding-left:10px;
+	padding-right:15px;
+	background-image:url('bc_s.png');
+	background-repeat:no-repeat;
+	background-position:right;
+	color:#364D7C;
+}
+
+.navpath li.navelem a
+{
+	height:32px;
+	display:block;
+	text-decoration: none;
+	outline: none;
+	color: #283A5D;
+	font-family: 'Lucida Grande',Geneva,Helvetica,Arial,sans-serif;
+	text-shadow: 0px 1px 1px rgba(255, 255, 255, 0.9);
+	text-decoration: none;        
+}
+
+.navpath li.navelem a:hover
+{
+	color:#6884BD;
+}
+
+.navpath li.footer
+{
+	display: flex;
+	flex-flow: row wrap;
+	justify-content: flex-start;
+	align-items: center;
+        flex-grow: 1;
+        list-style-type:none;
+        float:none;
+        padding-left:10px;
+        padding-right:15px;
+        background-image:none;
+        background-repeat:no-repeat;
+        background-position:right;
+        color:#364D7C;
+        font-size: 8pt;
+}
+
+div.summary
+{
+	float: right;
+	font-size: 8pt;
+	padding-right: 5px;
+	width: 50%;
+	text-align: right;
+}       
+
+div.summary a
+{
+	white-space: nowrap;
+}
+
+table.classindex
+{
+        margin: 10px;
+        white-space: nowrap;
+        margin-left: 3%;
+        margin-right: 3%;
+        width: 94%;
+        border: 0;
+        border-spacing: 0; 
+        padding: 0;
+}
+
+div.ingroups
+{
+	font-size: 8pt;
+	width: 50%;
+	text-align: left;
+}
+
+div.ingroups a
+{
+	white-space: nowrap;
+}
+
+div.header
+{
+        background-image:url('nav_h.png');
+        background-repeat:repeat-x;
+	background-color: #F9FAFC;
+	margin:  0px;
+	border-bottom: 1px solid #C4CFE5;
+}
+
+div.headertitle
+{
+	padding: 5px 5px 5px 10px;
+       	color: #135384;
+}
+
+dl
+{
+        padding: 0 0 0 10px;
+}
+
+/* dl.note, dl.warning, dl.attention, dl.pre, dl.post, dl.invariant, dl.deprecated, dl.todo, dl.test, dl.bug */
+dl.section
+{
+	margin-left: 0px;
+	padding-left: 0px;
+}
+
+dl.note
+{
+        margin-left:-7px;
+        padding-left: 3px;
+        border-left:4px solid;
+        border-color: #D0C000;
+}
+
+dl.warning, dl.attention
+{
+        margin-left:-7px;
+        padding-left: 3px;
+        border-left:4px solid;
+        border-color: #FF0000;
+}
+
+dl.pre, dl.post, dl.invariant
+{
+        margin-left:-7px;
+        padding-left: 3px;
+        border-left:4px solid;
+        border-color: #00D000;
+}
+
+dl.deprecated
+{
+        margin-left:-7px;
+        padding-left: 3px;
+        border-left:4px solid;
+        border-color: #505050;
+}
+
+dl.todo
+{
+        margin-left:-7px;
+        padding-left: 3px;
+        border-left:4px solid;
+        border-color: #00C0E0;
+}
+
+dl.test
+{
+        margin-left:-7px;
+        padding-left: 3px;
+        border-left:4px solid;
+        border-color: #3030E0;
+}
+
+dl.bug
+{
+        margin-left:-7px;
+        padding-left: 3px;
+        border-left:4px solid;
+        border-color: #C08050;
+}
+
+dl.section dd {
+	margin-bottom: 6px;
+}
+
+
+#projectlogo
+{
+	text-align: center;
+	vertical-align: bottom;
+	border-collapse: separate;
+}
+ 
+#projectlogo img
+{ 
+	border: 0px none;
+}
+ 
+#projectalign
+{
+        vertical-align: middle;
+}
+
+#projectname
+{
+	font: 200% Tahoma, Arial,sans-serif;
+	margin: 0px;
+	padding: 2px 0px;
+}
+    
+#projectbrief
+{
+	font: 120% Tahoma, Arial,sans-serif;
+	margin: 0px;
+	padding: 0px;
+}
+
+#projectnumber
+{
+	font: 50% Tahoma, Arial,sans-serif;
+	margin: 0px;
+	padding: 0px;
+}
+
+#top {
+	border-bottom: 1px solid #5373B4;
+}
+
+#titlearea
+{
+	flex-grow: 1;
+	padding: 0px;
+	margin: 0px;
+	width: auto;
+	border-bottom: none;
+}
+
+#main-nav {
+}
+
+#main-menu {
+	display: flex;
+	flex-flow: row wrap;
+	justify-content: flex-start;
+	align-items: center;
+	background-image: none;
+	min-width: 770px;
+}
+
+.ui-resizable-e {
+	height: 100%;
+	background-repeat: repeat-y;
+}
+
+.image
+{
+        text-align: center;
+}
+
+.dotgraph
+{
+        text-align: center;
+}
+
+.mscgraph
+{
+        text-align: center;
+}
+
+.plantumlgraph
+{
+        text-align: center;
+}
+
+.diagraph
+{
+        text-align: center;
+}
+
+.caption
+{
+	font-weight: bold;
+}
+
+div.zoom
+{
+	border: 1px solid #90A5CE;
+}
+
+dl.citelist {
+        margin-bottom:50px;
+}
+
+dl.citelist dt {
+        color:#334975;
+        float:left;
+        font-weight:bold;
+        margin-right:10px;
+        padding:5px;
+}
+
+dl.citelist dd {
+        margin:2px 0;
+        padding:5px 0;
+}
+
+div.toc {
+        padding: 14px 25px;
+        background-color: #F4F6FA;
+        border: 1px solid #D8DFEE;
+        border-radius: 7px 7px 7px 7px;
+        float: right;
+        height: auto;
+        margin: 0 8px 10px 10px;
+        width: 200px;
+}
+
+div.toc li {
+        background: url("bdwn.png") no-repeat scroll 0 5px transparent;
+        font: 10px/1.2 Verdana,DejaVu Sans,Geneva,sans-serif;
+        margin-top: 5px;
+        padding-left: 10px;
+        padding-top: 2px;
+}
+
+div.toc h3 {
+        font: bold 12px/1.2 Arial,FreeSans,sans-serif;
+	color: #4665A2;
+        border-bottom: 0 none;
+        margin: 0;
+}
+
+div.toc ul {
+        list-style: none outside none;
+        border: medium none;
+        padding: 0px;
+}       
+
+div.toc li.level1 {
+        margin-left: 0px;
+}
+
+div.toc li.level2 {
+        margin-left: 15px;
+}
+
+div.toc li.level3 {
+        margin-left: 30px;
+}
+
+div.toc li.level4 {
+        margin-left: 45px;
+}
+
+.inherit_header {
+        font-weight: bold;
+        color: gray;
+        cursor: pointer;
+	-webkit-touch-callout: none;
+	-webkit-user-select: none;
+	-khtml-user-select: none;
+	-moz-user-select: none;
+	-ms-user-select: none;
+	user-select: none;
+}
+
+.inherit_header td {
+        padding: 6px 0px 2px 5px;
+}
+
+.inherit {
+        display: none;
+}
+
+tr.heading h2 {
+        margin-top: 12px;
+        margin-bottom: 4px;
+}
+
+/* tooltip related style info */
+
+.ttc {
+        position: absolute;
+        display: none;
+}
+
+#powerTip {
+	cursor: default;
+	white-space: nowrap;
+	background-color: white;
+	border: 1px solid gray;
+	border-radius: 4px 4px 4px 4px;
+	box-shadow: 1px 1px 7px gray;
+	display: none;
+	font-size: smaller;
+	max-width: 80%;
+	opacity: 0.9;
+	padding: 1ex 1em 1em;
+	position: absolute;
+	z-index: 2147483647;
+}
+
+#powerTip div.ttdoc {
+        color: grey;
+	font-style: italic;
+}
+
+#powerTip div.ttname a {
+        font-weight: bold;
+}
+
+#powerTip div.ttname {
+        font-weight: bold;
+}
+
+#powerTip div.ttdeci {
+        color: #006318;
+}
+
+#powerTip div {
+        margin: 0px;
+        padding: 0px;
+        font: 12px/16px Roboto,sans-serif;
+}
+
+#powerTip:before, #powerTip:after {
+	content: "";
+	position: absolute;
+	margin: 0px;
+}
+
+#powerTip.n:after,  #powerTip.n:before,
+#powerTip.s:after,  #powerTip.s:before,
+#powerTip.w:after,  #powerTip.w:before,
+#powerTip.e:after,  #powerTip.e:before,
+#powerTip.ne:after, #powerTip.ne:before,
+#powerTip.se:after, #powerTip.se:before,
+#powerTip.nw:after, #powerTip.nw:before,
+#powerTip.sw:after, #powerTip.sw:before {
+	border: solid transparent;
+	content: " ";
+	height: 0;
+	width: 0;
+	position: absolute;
+}
+
+#powerTip.n:after,  #powerTip.s:after,
+#powerTip.w:after,  #powerTip.e:after,
+#powerTip.nw:after, #powerTip.ne:after,
+#powerTip.sw:after, #powerTip.se:after {
+	border-color: rgba(255, 255, 255, 0);
+}
+
+#powerTip.n:before,  #powerTip.s:before,
+#powerTip.w:before,  #powerTip.e:before,
+#powerTip.nw:before, #powerTip.ne:before,
+#powerTip.sw:before, #powerTip.se:before {
+	border-color: rgba(128, 128, 128, 0);
+}
+
+#powerTip.n:after,  #powerTip.n:before,
+#powerTip.ne:after, #powerTip.ne:before,
+#powerTip.nw:after, #powerTip.nw:before {
+	top: 100%;
+}
+
+#powerTip.n:after, #powerTip.ne:after, #powerTip.nw:after {
+	border-top-color: #ffffff;
+	border-width: 10px;
+	margin: 0px -10px;
+}
+#powerTip.n:before {
+	border-top-color: #808080;
+	border-width: 11px;
+	margin: 0px -11px;
+}
+#powerTip.n:after, #powerTip.n:before {
+	left: 50%;
+}
+
+#powerTip.nw:after, #powerTip.nw:before {
+	right: 14px;
+}
+
+#powerTip.ne:after, #powerTip.ne:before {
+	left: 14px;
+}
+
+#powerTip.s:after,  #powerTip.s:before,
+#powerTip.se:after, #powerTip.se:before,
+#powerTip.sw:after, #powerTip.sw:before {
+	bottom: 100%;
+}
+
+#powerTip.s:after, #powerTip.se:after, #powerTip.sw:after {
+	border-bottom-color: #ffffff;
+	border-width: 10px;
+	margin: 0px -10px;
+}
+
+#powerTip.s:before, #powerTip.se:before, #powerTip.sw:before {
+	border-bottom-color: #808080;
+	border-width: 11px;
+	margin: 0px -11px;
+}
+
+#powerTip.s:after, #powerTip.s:before {
+	left: 50%;
+}
+
+#powerTip.sw:after, #powerTip.sw:before {
+	right: 14px;
+}
+
+#powerTip.se:after, #powerTip.se:before {
+	left: 14px;
+}
+
+#powerTip.e:after, #powerTip.e:before {
+	left: 100%;
+}
+#powerTip.e:after {
+	border-left-color: #ffffff;
+	border-width: 10px;
+	top: 50%;
+	margin-top: -10px;
+}
+#powerTip.e:before {
+	border-left-color: #808080;
+	border-width: 11px;
+	top: 50%;
+	margin-top: -11px;
+}
+
+#powerTip.w:after, #powerTip.w:before {
+	right: 100%;
+}
+#powerTip.w:after {
+	border-right-color: #ffffff;
+	border-width: 10px;
+	top: 50%;
+	margin-top: -10px;
+}
+#powerTip.w:before {
+	border-right-color: #808080;
+	border-width: 11px;
+	top: 50%;
+	margin-top: -11px;
+}
+
+@media print
+{
+  #top { display: none; }
+  #side-nav { display: none; }
+  #nav-path { display: none; }
+  body { overflow:visible; }
+  h1, h2, h3, h4, h5, h6 { page-break-after: avoid; }
+  .summary { display: none; }
+  .memitem { page-break-inside: avoid; }
+  #doc-content
+  {
+    margin-left:0 !important;
+    height:auto !important;
+    width:auto !important;
+    overflow:inherit;
+    display:inline;
+  }
+}
+
+/* @group Markdown */
+
+/*
+table.markdownTable {
+	border-collapse:collapse;
+        margin-top: 4px;
+        margin-bottom: 4px;
+}
+
+table.markdownTable td, table.markdownTable th {
+	border: 1px solid #2D4068;
+	padding: 3px 7px 2px;
+}
+
+table.markdownTableHead tr {
+}
+
+table.markdownTableBodyLeft td, table.markdownTable th {
+	border: 1px solid #2D4068;
+	padding: 3px 7px 2px;
+}
+
+th.markdownTableHeadLeft th.markdownTableHeadRight th.markdownTableHeadCenter th.markdownTableHeadNone {
+	background-color: #374F7F;
+	color: #FFFFFF;
+	font-size: 110%;
+	padding-bottom: 4px;
+	padding-top: 5px;
+}
+
+th.markdownTableHeadLeft {
+	text-align: left
+}
+
+th.markdownTableHeadRight {
+	text-align: right
+}
+
+th.markdownTableHeadCenter {
+	text-align: center
+}
+*/
+
+table.markdownTable {
+	border-collapse:collapse;
+        margin-top: 4px;
+        margin-bottom: 4px;
+}
+
+table.markdownTable td, table.markdownTable th {
+	border: 1px solid #2D4068;
+	padding: 3px 7px 2px;
+}
+
+table.markdownTable tr {
+}
+
+th.markdownTableHeadLeft, th.markdownTableHeadRight, th.markdownTableHeadCenter, th.markdownTableHeadNone {
+	background-color: #374F7F;
+	color: #FFFFFF;
+	font-size: 110%;
+	padding-bottom: 4px;
+	padding-top: 5px;
+}
+
+th.markdownTableHeadLeft, td.markdownTableBodyLeft {
+	text-align: left
+}
+
+th.markdownTableHeadRight, td.markdownTableBodyRight {
+	text-align: right
+}
+
+th.markdownTableHeadCenter, td.markdownTableBodyCenter {
+	text-align: center
+}
+
+
+/* @end */
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/loadgen_integration_diagram.dia b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/loadgen_integration_diagram.dia
new file mode 100644
index 000000000..569089f24
Binary files /dev/null and b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/loadgen_integration_diagram.dia differ
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/mlperf_icon.png b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/mlperf_icon.png
new file mode 100644
index 000000000..95321896d
Binary files /dev/null and b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/mlperf_icon.png differ
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/mlperf_logo_horizontal_color.svg b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/mlperf_logo_horizontal_color.svg
new file mode 100644
index 000000000..d932bdae6
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/mlperf_logo_horizontal_color.svg
@@ -0,0 +1,55 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- Generator: Adobe Illustrator 22.0.1, SVG Export Plug-In . SVG Version: 6.00 Build 0)  -->
+<svg version="1.1" id="mlperf_x5F_logo_x5F_horizontal_x5F_color"
+	 xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px" viewBox="0 0 494 166"
+	 style="enable-background:new 0 0 494 166;" xml:space="preserve">
+<style type="text/css">
+	.st0{fill:#333333;}
+	.st1{fill:#1A6197;}
+	.st2{fill:#B4BFC1;}
+</style>
+<g>
+	<g>
+		<path class="st0" d="M242.3,58.3c0.9,0.9,1.4,1.9,1.4,3.2v43c0,1.3-0.4,2.4-1.3,3.2c-0.8,0.9-2,1.3-3.3,1.3
+			c-1.4,0-2.5-0.4-3.4-1.3c-0.9-0.8-1.3-1.9-1.3-3.3V74.6l-12.2,18.7c-0.5,0.6-1.1,1.1-1.7,1.4c-0.6,0.4-1.3,0.5-2,0.5
+			c-0.6,0-1.3-0.1-1.9-0.5c-0.6-0.4-1.2-0.9-1.7-1.4l-12.2-19.4v30.6c0,1.3-0.4,2.4-1.2,3.3c-0.8,0.8-1.8,1.3-3.1,1.3
+			c-1.2,0-2.3-0.4-3-1.3c-0.8-0.9-1.2-1.9-1.2-3.2v-43c0-1.2,0.5-2.3,1.4-3.2c0.9-0.9,2-1.3,3.3-1.3c0.7,0,1.5,0.2,2.2,0.6
+			c0.7,0.4,1.3,0.9,1.7,1.4l16.2,24.9L235.4,59c0.4-0.6,0.9-1.1,1.6-1.4c0.7-0.4,1.3-0.6,2-0.6C240.3,57,241.3,57.4,242.3,58.3z"/>
+		<path class="st0" d="M286.7,101.5c0.9,0.8,1.3,1.9,1.3,3.2c0,1.2-0.4,2.3-1.3,3.1c-0.9,0.8-1.9,1.2-3.2,1.2h-24.1
+			c-1.3,0-2.4-0.4-3.2-1.3c-0.9-0.9-1.3-1.9-1.3-3.2V61.5c0-1.3,0.4-2.4,1.3-3.2c0.9-0.9,2-1.3,3.4-1.3c1.2,0,2.3,0.4,3.2,1.3
+			c0.9,0.9,1.3,1.9,1.3,3.2v38.8h19.4C284.7,100.3,285.8,100.7,286.7,101.5z"/>
+		<path class="st0" d="M325.2,59.3c2.3,1.5,4.1,3.5,5.5,6.1c1.4,2.6,2.1,5.3,2.1,8.4c0,3-0.7,5.7-2.1,8.3c-1.4,2.6-3.2,4.6-5.5,6.1
+			c-2.3,1.5-4.7,2.3-7.2,2.3H305v14c0,1.3-0.4,2.4-1.2,3.3s-1.8,1.3-3.1,1.3c-1.2,0-2.3-0.4-3-1.3c-0.8-0.9-1.2-1.9-1.2-3.2V61.5
+			c0-1.3,0.4-2.4,1.3-3.2c0.9-0.9,1.9-1.3,3.2-1.3H318C320.5,57,322.9,57.8,325.2,59.3z M320.8,80.8c0.9-0.8,1.7-1.8,2.2-3.1
+			c0.6-1.3,0.9-2.6,0.9-4c0-1.4-0.3-2.7-0.9-4c-0.6-1.3-1.3-2.3-2.2-3c-0.9-0.8-1.8-1.2-2.8-1.2H305V82H318
+			C318.9,82,319.8,81.6,320.8,80.8z"/>
+		<path class="st0" d="M373.9,90.4c-0.9,0.7-1.9,1.1-3.1,1.1h-24.5c0.6,2.9,2,5.2,4.1,6.9c2.1,1.7,4.5,2.5,7.2,2.5
+			c2,0,3.6-0.2,4.7-0.6c1.1-0.4,2-0.8,2.7-1.2c0.7-0.4,1.1-0.7,1.4-0.9c0.9-0.4,1.7-0.7,2.5-0.7c1,0,1.9,0.4,2.7,1.1
+			c0.7,0.7,1.1,1.6,1.1,2.6c0,1.3-0.7,2.5-2.1,3.6c-1.4,1.1-3.2,2.1-5.6,2.9c-2.3,0.8-4.7,1.2-7,1.2c-4.2,0-7.8-0.9-10.9-2.6
+			c-3.1-1.7-5.5-4.1-7.2-7.2c-1.7-3-2.5-6.5-2.5-10.3c0-4.3,0.9-8,2.7-11.2c1.8-3.2,4.1-5.7,7-7.4c2.9-1.7,6-2.6,9.3-2.6
+			c3.3,0,6.3,0.9,9.2,2.7c2.9,1.8,5.2,4.2,6.9,7.2c1.7,3,2.6,6.3,2.6,9.8C375.2,88.7,374.8,89.6,373.9,90.4z M346.4,84.1h19.4v-0.5
+			c-0.2-2.1-1.2-3.9-3-5.3c-1.8-1.5-3.9-2.2-6.2-2.2C350.8,76,347.4,78.7,346.4,84.1z"/>
+		<path class="st0" d="M407.5,69.8c1,0.8,1.5,1.8,1.5,3c0,1.5-0.4,2.7-1.2,3.4c-0.8,0.8-1.7,1.1-2.8,1.1c-0.7,0-1.6-0.2-2.5-0.5
+			c-0.1,0-0.5-0.1-1-0.3c-0.5-0.1-1.1-0.2-1.7-0.2c-1.3,0-2.5,0.4-3.7,1.2c-1.2,0.8-2.2,2-2.9,3.6c-0.8,1.6-1.1,3.5-1.1,5.7v17.6
+			c0,1.3-0.4,2.4-1.3,3.2c-0.8,0.9-1.9,1.3-3.2,1.3c-1.3,0-2.3-0.4-3.2-1.3c-0.8-0.9-1.3-1.9-1.3-3.2V73.8c0-1.3,0.4-2.4,1.3-3.2
+			c0.8-0.9,1.9-1.3,3.2-1.3c1.3,0,2.3,0.4,3.2,1.3c0.8,0.9,1.3,1.9,1.3,3.2v1c1.1-2,2.8-3.6,4.9-4.6c2.1-1.1,4.4-1.6,6.8-1.6
+			C405.2,68.6,406.5,69,407.5,69.8z"/>
+		<path class="st0" d="M432.3,63.3c-0.7,1-1,2-1,3.2V70h6.8c1.2,0,2.2,0.4,3,1.2c0.8,0.8,1.2,1.7,1.2,2.9c0,1.2-0.4,2.2-1.2,2.9
+			c-0.8,0.8-1.8,1.2-3,1.2h-6.8v26.3c0,1.3-0.4,2.4-1.3,3.2c-0.8,0.9-1.9,1.3-3.2,1.3s-2.4-0.4-3.2-1.3c-0.8-0.9-1.3-1.9-1.3-3.2
+			V78.2h-4c-1.2,0-2.2-0.4-3-1.2c-0.8-0.8-1.2-1.7-1.2-2.9c0-1.2,0.4-2.2,1.2-2.9c0.8-0.8,1.8-1.2,3-1.2h4v-3.4
+			c0-3.7,1.3-6.7,3.8-9.1c2.6-2.3,6.2-3.5,11-3.5c1.8,0,3.4,0.4,4.8,1.1c1.4,0.7,2,1.9,2,3.4c0,1.2-0.3,2.2-1,2.9
+			c-0.7,0.7-1.5,1.1-2.5,1.1c-0.2,0-0.5,0-0.8-0.1c-0.3,0-0.6-0.1-0.9-0.1c-1.2-0.3-2.2-0.4-3-0.4C434.1,61.9,433,62.4,432.3,63.3z"
+			/>
+	</g>
+	<g>
+		<path class="st1" d="M138.7,71.3l-24.2,39.3l0,0c-1.9,3.2-5.4,5.3-9.4,5.3c-6.1,0-11-4.9-11-11c0-4,2.1-7.5,5.3-9.4l0,0
+			L138.7,71.3"/>
+		<g>
+			<path class="st2" d="M136.1,67.1l-11.7,7.2c-7.3-6.6-17-10.6-27.6-10.6C74,63.8,55.5,82.2,55.5,105c0,2.7,0.3,5.3,0.7,7.8l0,0
+				c0,0.1,0,0.3,0,0.4c0,1.6-1.3,2.8-2.8,2.8c-1.4,0-2.6-1-2.8-2.4l0,0c-0.4-2.8-0.7-5.7-0.7-8.6c0-30.4,24.6-55,55-55
+				c14,0,26.7,5.2,36.4,13.8L136.1,67.1z M146.2,68.6l-3.3,5.3l-8.6,14c2.4,5.2,3.7,11,3.7,17.1c0,0,0,0,0,0c0,6.1,4.9,11,11,11
+				c6.1,0,11-4.9,11-11h0C160,91,154.8,78.3,146.2,68.6z"/>
+		</g>
+	</g>
+</g>
+</svg>
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/early_stopping.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/early_stopping.cc
new file mode 100644
index 000000000..41f74b803
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/early_stopping.cc
@@ -0,0 +1,117 @@
+#include "early_stopping.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <iostream>
+#include <map>
+#include <numeric>
+#include <string>
+
+namespace mlperf {
+
+namespace loadgen {
+
+double lbeta(int64_t x, int64_t y) {
+  return std::lgamma(x) + std::lgamma(y) - std::lgamma(x + y);
+}
+
+// The Gaussian Hypergeometric function specialized for a = 1.
+// Based on http://dlmf.nist.gov/15.2.E1.
+// Converges if c > 0 and (b <= 0 or x < 1).
+// TODO(ckstanton): http://dlmf.nist.gov/15.2.E1 says there are transformations
+// to replace x with with a value less than 0.5, for faster convergence.
+// Presently, this function can take up to 200,000 iterations to converge.
+double hypergeometric_2F1_A1(int64_t b, int64_t c, double x) {
+  // TODO(ckstanton): Is there a more principled way to pick kTolerance?
+  constexpr double kTolerance = 1.0 / (1LL << 33);
+  double term = 1.0;
+  double result = 1.0;
+  for (int64_t i = 0; std::abs(term) > kTolerance; ++i) {
+    term *= (b + i) * x / (c + i);
+    result += term;
+  }
+  return result;
+}
+
+// BetaRegularized[x, a, b] =
+// Beta[x, a, b]/Beta[a, b] =
+// x^a/a Hypergeometric2F1[a, 1-b, 1+a, x]/Beta[a, b] =
+// (http://dlmf.nist.gov/15.8.E1.)
+// x^a/a (1-x)^(b-1) Hypergeometric2F1[1, 1-b, 1+a, x/(x-1)]/Beta[a, b]
+double beta_regularized(double x, int64_t a, int64_t b) {
+  return std::exp(a * std::log(x) + (b - 1) * std::log(1 - x) - lbeta(a, b)) /
+         a * hypergeometric_2F1_A1(1 - b, 1 + a, x / (x - 1));
+}
+
+// Compute the odds of t or fewer overlatency queries in h + t total queries.
+// The binomial distribution is the discrete probability distribution for
+// independent boolean experiments. The CDF of the binomial distribution is:
+// BetaRegularized[q, n - k, 1 + k] where 1 - q is the probability of an event
+// per experiment, n is the total number of experiments, and k is the number of
+// events. An even in our case is an overlatency query, so q = p - d, n = h + t,
+// and k = t.
+// Sum[Binomial[h + t, x] (p - d)^(h + t - x) (1 - p + d)^x, {x, 0, t}] =
+// BetaRegularized[p - d, h, 1 + t]
+double odds(int64_t h, int64_t t, double p, double d) {
+  return beta_regularized(p - d, h, 1 + t);
+}
+
+// Binary search to find the minimum value h such that:
+// odds(h, t, p, d) <= 1 - c on the range [min_h, max_h] given t, p, d, and c.
+int64_t find_min_passing(int64_t min_h, int64_t max_h, int64_t t, double p,
+                         double d, double c) {
+  int64_t count = max_h - min_h;
+  while (count > 0) {
+    int64_t step = count / 2;
+    int64_t h = min_h + step;
+    double prob = odds(h, t, p, d);
+    if (prob < 1 - c) {
+      count = step;
+    } else {
+      min_h = h + 1;
+      count -= step + 1;
+    }
+  }
+  return min_h;
+}
+
+int64_t MinPassingQueriesFinder::operator()(int64_t t, double p, double d,
+                                            double c) {
+  // Given t, p, d, and c, return the minimum h such that odds(h, t, p, d) <= 1
+  // - c
+
+  auto &cache = caches_[std::make_tuple(p, d, c)];
+  auto it = cache.lower_bound(t);
+  if (it != cache.end() && it->first == t) {
+    return it->second;
+  }
+
+  int64_t x0 = -1;
+  int64_t y0 = 0;
+  int64_t x1 = 0;
+  int64_t y1 = std::ceil(std::log(1 - c) / std::log(p - d));
+
+  if (it != cache.begin()) {
+    --it;
+    x1 = it->first;
+    y1 = it->second;
+  }
+
+  if (it != cache.begin()) {
+    --it;
+    x0 = it->first;
+    y0 = it->second;
+  }
+
+  double min_slope = (p - d) / (1 - p + d);
+  double max_slope = (y1 - y0) * (x1 - x0);
+  int64_t min_h = (t - x1) * min_slope + y1;
+  int64_t max_h = (t - x1) * max_slope + y1 + 1;
+  int64_t h = find_min_passing(min_h, max_h, t, p, d, c);
+  cache[t] = h;
+  return h;
+}
+
+}  // namespace loadgen
+}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/early_stopping.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/early_stopping.h
new file mode 100644
index 000000000..49b7a901e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/early_stopping.h
@@ -0,0 +1,27 @@
+#ifndef MLPERF_LOADGEN_EARLYSTOPPING_H_
+#define MLPERF_LOADGEN_EARLYSTOPPING_H_
+
+#include <cstdint>
+#include <map>
+
+namespace mlperf {
+namespace loadgen {
+
+class MinPassingQueriesFinder {
+ public:
+  int64_t operator()(int64_t t, double p, double d, double c);
+
+ private:
+  // Memoize prior computations results and use them to bound the binary search
+  // range for subsequent computations.
+
+  // TODO: Is there something more efficient to use besides std::map for
+  // caches_?
+  std::map<std::tuple<double, double, double>, std::map<int64_t, int64_t>>
+      caches_;
+};
+
+}  // namespace loadgen
+}  // namespace mlperf
+
+#endif  // MLPERF_LOADGEN_EARLYSTOPPING_H_
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/generated/version_generated.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/generated/version_generated.cc
new file mode 100644
index 000000000..75fdc9519
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/generated/version_generated.cc
@@ -0,0 +1,98 @@
+// DO NOT EDIT: Autogenerated by version_generator.py.
+
+#include <string>
+
+namespace mlperf {
+
+const std::string& LoadgenVersion() {
+  static const std::string str = "4.1";
+  return str;
+}
+
+const std::string& LoadgenBuildDateLocal() {
+  static const std::string str = "2024-10-18T23:12:51.002440";
+  return str;
+}
+
+const std::string& LoadgenBuildDateUtc() {
+  static const std::string str = "2024-10-19T06:12:51.002446";
+  return str;
+}
+
+const std::string& LoadgenGitRevision() {
+  static const std::string str = "f5c8f17583";
+  return str;
+}
+
+const std::string& LoadgenGitCommitDate() {
+  static const std::string str = "2024-10-08T18:30:16+01:00";
+  return str;
+}
+
+const std::string& LoadgenGitStatus() {
+  static const std::string str = R"LGVG_RSLD()LGVG_RSLD";
+  return str;
+}
+
+const std::string& LoadgenGitLog() {
+  static const std::string str = R"LGVG_RSLD(f5c8f1758374aeaba26b2e84d31690111cfdf054 Fix bug: Loadgen ignoring token latency targets in user conf (#1874)
+976bb1ad9c7946be79507f3ff67955c27426af52 Set correct remote repo (#1871)
+41fa8aadd1ba0ecc97f6a519d8b42b04278e5f24 Add format files github action (#1682)
+518b454fd8647bfbd23a074e875e87353f33393e Tflite tpu (#1449)
+e0fdec1c7a75c98cfc194f13d62ac4388d419c8a Fix link in GettingStarted.ipynb (#1512)
+92bd8198d15411d7fb7d7c27f8904bc5a0bcfe7a Fix warning in the submission checker (#1808)
+224cfbf5c0e82cae6d48620025b7e1258ae3666a Fix typo in reference datatype (#1851)
+3ef1249b7f50a250c02c568342e0aea6638fc5a7 Fix docs (#1853)
+a0874c100c54cbc54fb743ac8bf9fb5fadc64135 Update build_wheels.yml (#1758)
+6eff09986e337ccf03f675c9f244d8ee93644e16 Extend the final report generation script to output a json file of results (#1825)
+54f3f93a73cc8ca5e3319ad87fb325e510574f56 Add binding for server_num_issue_query_threads parameter (#1862)
+c4d0b3ea98e6fe7252e50cb573f0d523da7979df Update docs: SCC24,  fix broken redirect  (#1843)
+7d2f0c41e5cd79c9178702867392e38f57953338 Update DLRM readme (#1811)
+cf5fddc5d0746bf3820eb0ab7294bbf709d788ab Enable systems to be marked as power only (#1850)
+81c2de69de4af90410cd1ba000fc5bd731bf6dee Documentation updates (#1821)
+73b02798219c794a735a7f2ddabbc3df9173352d Fix error with generate_final_report.py when the input CSV file is empty (#1827))LGVG_RSLD";
+  return str;
+}
+
+const std::string& LoadgenSha1OfFiles() {
+  static const std::string str = R"LGVG_RSLD(012aad77e5206c89d50718c46c119d1f3cb056b2 /.clang-format
+e173f4513f3c5dac1f0bea1473bb0a058e23f190 /=42
+d5274ff0b56e8d3cdb273174628a4461fca6f02a /CMakeLists.txt
+20a55bb946c2c0bbb564ced2af1e48efd096b3a8 /README.md
+5f6c6a784e9cd6995db47f9b9f70b1769909c9d8 /README_BUILD.md
+01f9ae9887f50bc030dc6107e740f40c43ca388f /README_FAQ.md
+32181da9e161c285f8fe46ddaa49e6cba2f9f918 /bindings/c_api.cc
+91f58bd79b83b278f3240174a9af747fc38aff74 /bindings/c_api.h
+ea4c89decad19eaf3217bfa2fb757d3b83a561d6 /bindings/python_api.cc
+53dba8ad4272190ceb6335c12fd25e53dc02a8cb /diagram_network_submission.png
+84c2f79309b237cef652aef6a187ba8e875a3952 /diagram_submission.png
+0cd7b546a389deac73f7955cd39255ed76557d62 /early_stopping.cc
+158fcae6a5f47e82150d6416fa1f7bcef37e77fe /early_stopping.h
+126e952d00f4ea9efd12405fb209aa3ed585e4b2 /issue_query_controller.cc
+923d9d5cdf598e3ec33d7a1110a31f7e11527ec7 /issue_query_controller.h
+6650091ba7a918f343b06eb7a5aa540eae87275f /loadgen.cc
+e00fdc6dbc85a8c9a8485dbcbfe2944f81251c4e /loadgen.h
+47f748307536f80cfc606947b440dd732afc2637 /loadgen_integration_diagram.svg
+197efc96d178e5d33a750d07fa7b2966417506ea /logging.cc
+ddb961df7bcc145bcd7cce8c21f7cf075350dcbe /logging.h
+ca17720f9c8246e821331946d893e830fc88f8bd /pyproject.toml
+13ad6d842200cb161d6927eb74a3fafd79c46c75 /query_dispatch_library.h
+e9187c8612bbdc972305b789feb6e15c26e96cfe /query_sample.h
+8323a2225be1dff31f08ecc86b76eb3de06568bc /query_sample_library.h
+a5ff7e77caa6e9e22ada90f0de0c865c987bf167 /requirements.txt
+34e2d2a44324cb07c884f92146ecbb8ef9d704e2 /results.cc
+d82500c326c2de83db411f1146882aa4692b419c /results.h
+13c49b028b22749b5f3c44f3d9bb489e8c0574e9 /setup.py
+18d4809589dae33317d88d9beeb5491a6e1ccdec /system_under_test.h
+c15c3e150030089a8d634bd2ad6d4b644002e613 /test_settings.h
+e21febd60f9b5bedd1fc81bb990f09c34b32043c /test_settings_internal.cc
+f1d5335b53ca610c30e0edc5d07999a27b5b4b9a /test_settings_internal.h
+3df8fdabf6eaea4697cf25d1dcb89cae88e36efd /utils.cc
+40775e32d619ea6356826ae5ea4174c7911f6894 /utils.h
+cbec2a5f98f9786c8c3d8b06b3d12df0b6550fa0 /version.cc
+9d574baa64424e9c708fcfedd3dbb0b518a65fcc /version.h
+eea9b9cb1a06cd1abe1bbdaee82f9af31527fedb /version_generator.py)LGVG_RSLD";
+  return str;
+}
+
+}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/issue_query_controller.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/issue_query_controller.cc
new file mode 100644
index 000000000..c1abea9d1
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/issue_query_controller.cc
@@ -0,0 +1,552 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Implements IssueQueryController and other helper classes for
+/// query issuing.
+
+#include "issue_query_controller.h"
+
+#include <sstream>
+
+namespace mlperf {
+
+void RegisterIssueQueryThread() {
+  loadgen::IssueQueryController::GetInstance().RegisterThread();
+}
+
+/// \brief Loadgen implementation details.
+namespace loadgen {
+
+QueryMetadata::QueryMetadata(
+    const std::vector<QuerySampleIndex>& query_sample_indices,
+    std::chrono::nanoseconds scheduled_delta,
+    ResponseDelegate* response_delegate, SequenceGen* sequence_gen)
+    : scheduled_delta(scheduled_delta),
+      response_delegate(response_delegate),
+      sequence_id(sequence_gen->NextQueryId()),
+      wait_count_(query_sample_indices.size()) {
+  samples_.reserve(query_sample_indices.size());
+  for (QuerySampleIndex qsi : query_sample_indices) {
+    samples_.push_back({this, sequence_gen->NextSampleId(), qsi,
+                        sequence_gen->NextAccLogRng()});
+  }
+  query_to_send.reserve(query_sample_indices.size());
+  for (auto& s : samples_) {
+    query_to_send.push_back({reinterpret_cast<ResponseId>(&s), s.sample_index});
+  }
+}
+
+QueryMetadata::QueryMetadata(QueryMetadata&& src)
+    : query_to_send(std::move(src.query_to_send)),
+      scheduled_delta(src.scheduled_delta),
+      response_delegate(src.response_delegate),
+      sequence_id(src.sequence_id),
+      wait_count_(src.samples_.size()),
+      samples_(std::move(src.samples_)) {
+  // The move constructor should only be called while generating a
+  // vector of QueryMetadata, before it's been used.
+  // Assert that wait_count_ is in its initial state.
+  assert(src.wait_count_.load() == samples_.size());
+  // Update the "parent" of each sample to be this query; the old query
+  // address will no longer be valid.
+  // TODO: Only set up the sample parenting once after all the queries have
+  //       been created, rather than re-parenting on move here.
+  for (size_t i = 0; i < samples_.size(); i++) {
+    SampleMetadata* s = &samples_[i];
+    s->query_metadata = this;
+    query_to_send[i].id = reinterpret_cast<ResponseId>(s);
+  }
+}
+
+void QueryMetadata::NotifyOneSampleCompleted(PerfClock::time_point timestamp) {
+  size_t old_count = wait_count_.fetch_sub(1, std::memory_order_relaxed);
+  if (old_count == 1) {
+    all_samples_done_time = timestamp;
+    all_samples_done_.set_value();
+    response_delegate->QueryComplete();
+  }
+}
+
+void QueryMetadata::WaitForAllSamplesCompleted() {
+  all_samples_done_.get_future().wait();
+}
+
+PerfClock::time_point QueryMetadata::WaitForAllSamplesCompletedWithTimestamp() {
+  all_samples_done_.get_future().wait();
+  return all_samples_done_time;
+}
+
+// When server_coalesce_queries is set to true in Server scenario, we
+// sometimes coalesce multiple queries into one query. This is done by moving
+// the other query's sample into current query, while maintaining their
+// original scheduled_time.
+void QueryMetadata::CoalesceQueries(QueryMetadata* queries, size_t first,
+                                    size_t last, size_t stride) {
+  // Copy sample data over to current query, boldly assuming that each query
+  // only has one sample.
+  query_to_send.reserve((last - first) / stride +
+                        2);  // Extra one for the current query.
+  for (size_t i = first; i <= last; i += stride) {
+    auto& q = queries[i];
+    auto& s = q.samples_[0];
+    query_to_send.push_back({reinterpret_cast<ResponseId>(&s), s.sample_index});
+    q.scheduled_time = scheduled_time + q.scheduled_delta - scheduled_delta;
+    q.issued_start_time = issued_start_time;
+  }
+}
+
+void QueryMetadata::Decoalesce() { query_to_send.resize(1); }
+
+/// \brief A base template that should never be used since each scenario has
+/// its own specialization.
+template <TestScenario scenario>
+struct QueryScheduler {
+  static_assert(scenario != scenario, "Unhandled TestScenario");
+};
+
+/// \brief Schedules queries for issuance in the single stream scenario.
+template <>
+struct QueryScheduler<TestScenario::SingleStream> {
+  QueryScheduler(const TestSettingsInternal& /*settings*/,
+                 const PerfClock::time_point) {}
+
+  PerfClock::time_point Wait(QueryMetadata* next_query) {
+    auto tracer = MakeScopedTracer([](AsyncTrace& trace) { trace("Waiting"); });
+    if (prev_query != nullptr) {
+      prev_query->WaitForAllSamplesCompleted();
+    }
+    prev_query = next_query;
+
+    auto now = PerfClock::now();
+    next_query->scheduled_time = now;
+    next_query->issued_start_time = now;
+    return now;
+  }
+
+  QueryMetadata* prev_query = nullptr;
+};
+
+/// \brief Schedules queries for issuance in the multi stream scenario.
+template <>
+struct QueryScheduler<TestScenario::MultiStream> {
+  QueryScheduler(const TestSettingsInternal& /*settings*/,
+                 const PerfClock::time_point) {}
+
+  PerfClock::time_point Wait(QueryMetadata* next_query) {
+    auto tracer = MakeScopedTracer([](AsyncTrace& trace) { trace("Waiting"); });
+    if (prev_query != nullptr) {
+      prev_query->WaitForAllSamplesCompleted();
+    }
+    prev_query = next_query;
+
+    auto now = PerfClock::now();
+    next_query->scheduled_time = now;
+    next_query->issued_start_time = now;
+    return now;
+  }
+
+  QueryMetadata* prev_query = nullptr;
+};
+
+/// \brief Schedules queries for issuance in the server scenario.
+template <>
+struct QueryScheduler<TestScenario::Server> {
+  QueryScheduler(const TestSettingsInternal& /*settings*/,
+                 const PerfClock::time_point start)
+      : start(start) {}
+
+  PerfClock::time_point Wait(QueryMetadata* next_query) {
+    auto tracer =
+        MakeScopedTracer([](AsyncTrace& trace) { trace("Scheduling"); });
+
+    auto scheduled_time = start + next_query->scheduled_delta;
+    next_query->scheduled_time = scheduled_time;
+
+    auto now = PerfClock::now();
+    if (now < scheduled_time) {
+      std::this_thread::sleep_until(scheduled_time);
+      now = PerfClock::now();
+    }
+    next_query->issued_start_time = now;
+    return now;
+  }
+
+  const PerfClock::time_point start;
+};
+
+/// \brief Schedules queries for issuance in the offline scenario.
+template <>
+struct QueryScheduler<TestScenario::Offline> {
+  QueryScheduler(const TestSettingsInternal& /*settings*/,
+                 const PerfClock::time_point start)
+      : start(start) {}
+
+  PerfClock::time_point Wait(QueryMetadata* next_query) {
+    next_query->scheduled_time = start;
+    auto now = PerfClock::now();
+    next_query->issued_start_time = now;
+    return now;
+  }
+
+  const PerfClock::time_point start;
+};
+
+IssueQueryController& IssueQueryController::GetInstance() {
+  // The singleton.
+  static IssueQueryController instance;
+  return instance;
+}
+
+void IssueQueryController::RegisterThread() {
+  // Push this thread to thread queue.
+  auto thread_id = std::this_thread::get_id();
+  size_t thread_idx{0};
+  {
+    std::lock_guard<std::mutex> lock(mtx);
+    thread_idx = thread_ids.size();
+    thread_ids.emplace_back(thread_id);
+  }
+
+  LogDetail([thread_id, thread_idx](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+    std::stringstream ss;
+    ss << "Registered IssueQueryThread[" << thread_idx
+       << "]. thread ID : " << std::hash<std::thread::id>()(thread_id);
+    MLPERF_LOG(detail, "generic_message", ss.str());
+#else
+    detail("Registered IssueQueryThread[" + std::to_string(thread_idx) +
+               "]. thread ID : ",
+           std::to_string(std::hash<std::thread::id>()(thread_id)));
+#endif
+  });
+
+  // Start test.
+  while (true) {
+    // Wait until the main thread signals a start or the end.
+    {
+      std::unique_lock<std::mutex> lock(mtx);
+      cond_var.wait(lock, [this]() { return issuing || end_test; });
+      // The test has ended.
+      if (end_test) {
+        break;
+      }
+    }
+
+    // Start issuing queries.
+    if (thread_idx <= num_threads) {
+      IssueQueriesInternal<TestScenario::Server, true>(num_threads, thread_idx);
+      {
+        std::lock_guard<std::mutex> lock(mtx);
+        thread_complete[thread_idx] = true;
+      }
+      cond_var.notify_all();
+    }
+
+    // Wait until all issue threads complete.
+    {
+      std::unique_lock<std::mutex> lock(mtx);
+      cond_var.wait(lock, [this]() { return !issuing; });
+    }
+  }
+}
+
+void IssueQueryController::SetNumThreads(size_t n) {
+  // Try waiting for IssueQueryThreads() to registered themselves.
+  std::unique_lock<std::mutex> lock(mtx);
+  const std::chrono::seconds timeout(10);
+  num_threads = n;
+  cond_var.wait_for(lock, timeout,
+                    [this]() { return thread_ids.size() >= num_threads; });
+  // If the number of registered threads do not match the settings, report an
+  // error.
+  if (num_threads != thread_ids.size()) {
+    LogDetail([this](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+      std::stringstream ss;
+      ss << "Mismatch between settings and number of registered "
+         << "IssueQueryThreads! settings.server_num_issue_query_threads = "
+         << num_threads << " but " << thread_ids.size()
+         << " threads registered.";
+      MLPERF_LOG_ERROR(detail, "error_runtime", ss.str());
+#else
+      detail.Error(
+          "Mismatch between settings and number of registered ",
+          "IssueQueryThreads! settings.server_num_issue_query_threads = ",
+          num_threads, " but ", thread_ids.size(), " threads registered.");
+#endif
+    });
+  }
+}
+
+template <TestScenario scenario>
+void IssueQueryController::StartIssueQueries(IssueQueryState* s) {
+  // Get the state.
+  state = s;
+  state->start_for_power = std::chrono::system_clock::now();
+  state->start_time = PerfClock::now();
+
+  if (scenario != TestScenario::Server || num_threads == 0) {
+    // Usually, we just use the same thread to issue queries.
+    IssueQueriesInternal<scenario, false>(1, 0);
+  } else {
+    // If server_num_issue_query_threads is non-zero, issue queries on the
+    // registered threads.
+    // Tell all threads to start issuing queries.
+    {
+      std::unique_lock<std::mutex> lock(mtx);
+      issuing = true;
+      thread_complete.assign(num_threads, false);
+    }
+    cond_var.notify_all();
+    // Wait until all issue threads complete.
+    {
+      std::unique_lock<std::mutex> lock(mtx);
+      cond_var.wait(lock, [this]() {
+        return std::all_of(thread_complete.begin(), thread_complete.end(),
+                           [](bool in) { return in; });
+      });
+      issuing = false;
+    }
+    cond_var.notify_all();
+  }
+}
+
+template void IssueQueryController::StartIssueQueries<
+    TestScenario::MultiStream>(IssueQueryState* s);
+template void IssueQueryController::StartIssueQueries<TestScenario::Offline>(
+    IssueQueryState* s);
+template void IssueQueryController::StartIssueQueries<TestScenario::Server>(
+    IssueQueryState* s);
+template void IssueQueryController::StartIssueQueries<
+    TestScenario::SingleStream>(IssueQueryState* s);
+
+void IssueQueryController::EndThreads() {
+  // Tell all the issue threads to end.
+  {
+    std::lock_guard<std::mutex> lock(mtx);
+    end_test = true;
+  }
+  cond_var.notify_all();
+}
+
+template <TestScenario scenario, bool multi_thread>
+void IssueQueryController::IssueQueriesInternal(size_t query_stride,
+                                                size_t thread_idx) {
+  // Get all the needed information.
+  auto sut = state->sut;
+  auto& queries = *state->queries;
+  auto& response_logger = *state->response_delegate;
+
+  // Some book-keeping about the number of queries issued.
+  size_t queries_issued = 0;
+  size_t queries_issued_per_iter = 0;
+  size_t queries_count = queries.size();
+
+  // Calculate the min/max queries per issue thread.
+  const auto& settings = *state->settings;
+  const size_t min_query_count = settings.min_query_count;
+  const size_t min_query_count_for_thread =
+      (thread_idx < (min_query_count % query_stride))
+          ? (min_query_count / query_stride + 1)
+          : (min_query_count / query_stride);
+  const size_t max_query_count = settings.max_query_count;
+  const size_t max_query_count_for_thread =
+      (thread_idx < (max_query_count % query_stride))
+          ? (max_query_count / query_stride + 1)
+          : (max_query_count / query_stride);
+
+  // Create query scheduler.
+  const auto start = state->start_time;
+  QueryScheduler<scenario> query_scheduler(settings, start);
+  auto last_now = start;
+
+  // We can never run out of generated queries in the server scenario,
+  // since the duration depends on the scheduled query time and not
+  // the actual issue time.
+  bool ran_out_of_generated_queries = scenario != TestScenario::Server;
+  // This is equal to the sum of numbers of samples issued.
+  size_t expected_latencies = 0;
+
+  for (size_t queries_idx = thread_idx; queries_idx < queries_count;
+       queries_idx += query_stride) {
+    queries_issued_per_iter = 0;
+    auto& query = queries[queries_idx];
+    auto tracer1 =
+        MakeScopedTracer([](AsyncTrace& trace) { trace("SampleLoop"); });
+    last_now = query_scheduler.Wait(&query);
+
+    // If in Server scenario and server_coalesce_queries is enabled, multiple
+    // queries are coalesed into one big query if the current time has already
+    // passed the scheduled time of multiple queries.
+    if (scenario == TestScenario::Server &&
+        settings.requested.server_coalesce_queries) {
+      auto current_query_idx = queries_idx;
+      for (; queries_idx + query_stride < queries_count;
+           queries_idx += query_stride) {
+        auto next_scheduled_time =
+            start + queries[queries_idx + query_stride].scheduled_delta;
+        // If current time hasn't reached the next query's scheduled time yet,
+        // don't include next query.
+        if (last_now < next_scheduled_time) {
+          break;
+        }
+        queries_issued_per_iter++;
+      }
+      if (queries_idx > current_query_idx) {
+        // Coalesced all the pass due queries.
+        query.CoalesceQueries(queries.data(), current_query_idx + query_stride,
+                              queries_idx, query_stride);
+      }
+    }
+
+    // Issue the query to the SUT.
+    {
+      auto tracer3 =
+          MakeScopedTracer([](AsyncTrace& trace) { trace("IssueQuery"); });
+      sut->IssueQuery(query.query_to_send);
+    }
+
+    // Increment the counter.
+    expected_latencies += query.query_to_send.size();
+    queries_issued_per_iter++;
+    queries_issued += queries_issued_per_iter;
+
+    if (scenario == TestScenario::Server &&
+        settings.requested.server_coalesce_queries) {
+      // Set the query back to its clean state.
+      query.Decoalesce();
+    }
+
+    if (state->mode == TestMode::AccuracyOnly) {
+      // TODO: Rate limit in accuracy mode so accuracy mode works even
+      //       if the expected/target performance is way off.
+      continue;
+    }
+
+    auto duration = (last_now - start);
+    if (scenario == TestScenario::Server) {
+      if (settings.max_async_queries != 0) {
+        // Checks if there are too many outstanding queries.
+        size_t queries_issued_total{0};
+        if (multi_thread) {
+          // To check actual number of async queries in multi-thread case,
+          // we would have to combine the number of queries_issued from all
+          // issue threads.
+          {
+            std::lock_guard<std::mutex> lock(state->mtx);
+            state->queries_issued += queries_issued_per_iter;
+            queries_issued_total = state->queries_issued;
+          }
+        } else {
+          queries_issued_total = queries_issued;
+        }
+        size_t queries_outstanding =
+            queries_issued_total -
+            response_logger.queries_completed.load(std::memory_order_relaxed);
+        if (queries_outstanding > settings.max_async_queries) {
+          LogDetail([thread_idx, queries_issued_total,
+                     queries_outstanding](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+            std::stringstream ss;
+            ss << "IssueQueryThread " << thread_idx
+               << " Ending early: Too many outstanding queries." << " issued "
+               << queries_issued_total << " outstanding "
+               << queries_outstanding;
+            MLPERF_LOG_ERROR(detail, "error_runtime", ss.str());
+#else
+            detail.Error("IssueQueryThread ", std::to_string(thread_idx),
+                         " Ending early: Too many outstanding queries.",
+                         "issued", std::to_string(queries_issued_total),
+                         "outstanding", std::to_string(queries_outstanding));
+#endif
+          });
+          break;
+        }
+      }
+    } else {
+      // Checks if we end normally.
+      if (queries_issued >= min_query_count_for_thread &&
+          duration >= settings.target_duration) {
+        LogDetail([thread_idx](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+          MLPERF_LOG(
+              detail, "generic_message",
+              "Ending naturally: Minimum query count and test duration met.");
+#else
+          detail(
+              " Ending naturally: Minimum query count and test duration met.");
+#endif
+        });
+        ran_out_of_generated_queries = false;
+        break;
+      }
+    }
+
+    // Checks if we have exceeded max_query_count for this thread.
+    if (settings.max_query_count != 0 &&
+        queries_issued >= max_query_count_for_thread) {
+      LogDetail([thread_idx, queries_issued](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+        std::stringstream ss;
+        ss << "IssueQueryThread " << thread_idx
+           << " Ending early: Max query count reached." << " query_count "
+           << queries_issued;
+        MLPERF_LOG_ERROR(detail, "error_runtime", ss.str());
+#else
+        detail.Error("IssueQueryThread ", std::to_string(thread_idx),
+                     " Ending early: Max query count reached.", "query_count",
+                     std::to_string(queries_issued));
+#endif
+      });
+      ran_out_of_generated_queries = false;
+      break;
+    }
+
+    // Checks if we have exceeded max_duration.
+    if (settings.max_duration.count() != 0 &&
+        duration > settings.max_duration) {
+      LogDetail([thread_idx, duration](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+        std::stringstream ss;
+        ss << "IssueQueryThread " << thread_idx
+           << " Ending early: Max test duration reached." << " duration_ns "
+           << duration.count();
+        MLPERF_LOG_ERROR(detail, "error_runtime", ss.str());
+#else
+        detail.Error("IssueQueryThread ", std::to_string(thread_idx),
+                     " Ending early: Max test duration reached.", "duration_ns",
+                     std::to_string(duration.count()));
+#endif
+      });
+      ran_out_of_generated_queries = false;
+      break;
+    }
+  }
+
+  // Combine the issuing statistics from multiple issue threads.
+  {
+    std::lock_guard<std::mutex> lock(state->mtx);
+    state->ran_out_of_generated_queries |= ran_out_of_generated_queries;
+    // In Server scenario and when max_async_queries != 0, we would have set
+    // state->queries_issued when we check max_async_queries in the loop.
+    if (!(scenario == TestScenario::Server && settings.max_async_queries != 0 &&
+          multi_thread)) {
+      state->queries_issued += queries_issued;
+    }
+    state->expected_latencies += expected_latencies;
+  }
+}
+
+}  // namespace loadgen
+
+}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/issue_query_controller.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/issue_query_controller.h
new file mode 100644
index 000000000..5668c574e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/issue_query_controller.h
@@ -0,0 +1,215 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Declare IssueQueryController and other helper classes for
+/// query issuing.
+
+#ifndef MLPERF_LOADGEN_ISSUE_QUERY_CONTROLLER_H_
+#define MLPERF_LOADGEN_ISSUE_QUERY_CONTROLLER_H_
+
+#include <stdint.h>
+
+#include <atomic>
+#include <chrono>
+#include <condition_variable>
+#include <future>
+#include <mutex>
+#include <queue>
+#include <random>
+#include <thread>
+#include <vector>
+
+#include "loadgen.h"
+#include "logging.h"
+#include "query_sample.h"
+#include "system_under_test.h"
+#include "test_settings_internal.h"
+#include "utils.h"
+
+namespace mlperf {
+
+namespace loadgen {
+
+struct SampleMetadata;
+class QueryMetadata;
+
+/// \brief Every query and sample within a call to StartTest gets a unique
+/// sequence id for easy cross reference, and a random number which is used to
+/// determine accuracy logging when it is enabled.
+struct SequenceGen {
+  uint64_t NextQueryId() { return query_id++; }
+  uint64_t NextSampleId() { return sample_id++; }
+  uint64_t CurrentSampleId() { return sample_id; }
+  double NextAccLogRng() { return accuracy_log_dist(accuracy_log_rng); }
+  void InitAccLogRng(uint64_t accuracy_log_rng_seed) {
+    accuracy_log_rng = std::mt19937(accuracy_log_rng_seed);
+  }
+
+ private:
+  uint64_t query_id = 0;
+  uint64_t sample_id = 0;
+  std::mt19937 accuracy_log_rng;
+  std::uniform_real_distribution<double> accuracy_log_dist =
+      std::uniform_real_distribution<double>(0, 1);
+};
+
+/// \brief An interface for a particular scenario + mode to implement for
+/// extended hanlding of sample completion.
+struct ResponseDelegate {
+  virtual ~ResponseDelegate() = default;
+  virtual void SampleComplete(SampleMetadata*, QuerySampleResponse*,
+                              PerfClock::time_point,
+                              const ResponseCallback&) = 0;
+  virtual void TokenComplete(SampleMetadata*, QuerySampleResponse*,
+                             PerfClock::time_point,
+                             const ResponseCallback&) = 0;
+  virtual void QueryComplete() = 0;
+  std::atomic<size_t> queries_completed{0};
+};
+
+/// \brief Used by the loadgen to coordinate response data and completion.
+struct SampleMetadata {
+  QueryMetadata* query_metadata;
+  uint64_t sequence_id;
+  QuerySampleIndex sample_index;
+  double accuracy_log_val;
+};
+
+/// \brief Maintains data and timing info for a query and all its samples.
+class QueryMetadata {
+ public:
+  QueryMetadata(const std::vector<QuerySampleIndex>& query_sample_indices,
+                std::chrono::nanoseconds scheduled_delta,
+                ResponseDelegate* response_delegate, SequenceGen* sequence_gen);
+  QueryMetadata(QueryMetadata&& src);
+
+  void NotifyOneSampleCompleted(PerfClock::time_point timestamp);
+
+  void WaitForAllSamplesCompleted();
+
+  PerfClock::time_point WaitForAllSamplesCompletedWithTimestamp();
+
+  /// \brief Coalesce multiple queries into one query.
+  /// When server_coalesce_queries is set to true in Server scenario, we
+  /// sometimes coalesce multiple queries into one query. This is done by moving
+  /// the other query's sample into current query, while maintaining their
+  /// original scheduled_time.
+  void CoalesceQueries(QueryMetadata* queries, size_t first, size_t last,
+                       size_t stride);
+
+  /// \brief Set a coalesced query back to its original state.
+  void Decoalesce();
+
+ public:
+  std::vector<QuerySample> query_to_send;
+  const std::chrono::nanoseconds scheduled_delta;
+  ResponseDelegate* const response_delegate;
+  const uint64_t sequence_id;
+
+  // Performance information.
+
+  size_t scheduled_intervals = 0;  // Number of intervals between queries, as
+                                   // actually scheduled during the run.
+                                   // For the MultiStream scenario only.
+  PerfClock::time_point scheduled_time;
+  PerfClock::time_point issued_start_time;
+  PerfClock::time_point all_samples_done_time;
+
+ private:
+  std::atomic<size_t> wait_count_;
+  std::promise<void> all_samples_done_;
+  std::vector<SampleMetadata> samples_;
+};
+
+/// \brief A state object for communications between the controller and its
+/// caller.
+struct IssueQueryState {
+  // Information from caller to controller.
+  SystemUnderTest* sut;
+  std::vector<QueryMetadata>* queries;
+  ResponseDelegate* response_delegate;
+  const TestSettingsInternal* settings;
+  TestMode mode;
+  // Information from controller to caller.
+  std::chrono::system_clock::time_point start_for_power;
+  PerfClock::time_point start_time;
+  bool ran_out_of_generated_queries;
+  size_t queries_issued;
+  size_t expected_latencies;
+  // The lock to modify this state (in multi-thread case).
+  std::mutex mtx;
+};
+
+/// \brief Controls the query issuing part.
+/// This controller handles both the cases if the user registers or does not
+/// register IssueQueryThreads. It is implemented as a singleton, and is NOT
+/// thread-safe (i.e. users should not call StartTest() on multiple threads).
+/// It is thread-safe with regard to IssueQueryThreads.
+class IssueQueryController {
+ public:
+  /// \brief Get the controller instance singleton.
+  static IssueQueryController& GetInstance();
+
+  /// \brief Don't allow copy. This is a singleton.
+  IssueQueryController(IssueQueryController const&) = delete;
+  void operator=(IssueQueryController const&) = delete;
+
+  /// \brief Register an IssueQueryThread.
+  /// It is blocking until the entire test ends.
+  void RegisterThread();
+
+  /// \brief Set number of IssueQueryThreads and wait for thread registration.
+  /// If for any reason the number of registered threads do not match the
+  /// specified number, it prints out an error.
+  void SetNumThreads(size_t n);
+
+  /// \brief Kick off the query issuing.
+  /// The query issuing will be done on the current thread if there is no
+  /// registered IssueQueryThreads or if it is not in Server scenario.
+  template <TestScenario scenario>
+  void StartIssueQueries(IssueQueryState* s);
+
+  /// \brief Notify the IssueQueryThreads to end.
+  void EndThreads();
+
+ private:
+  /// \brief Hide constructor. This is a singleton.
+  IssueQueryController() {}
+
+  /// \brief The internal helper which actually issues queries.
+  /// This should be called by the thread(s) which issues queries.
+  template <TestScenario scenario, bool multi_thread>
+  void IssueQueriesInternal(size_t query_stride, size_t thread_idx);
+
+  /// \brief The issue query state.
+  IssueQueryState* state;
+  /// \brief Locks for communications across IssueQueryThreads and the main
+  /// thread.
+  std::mutex mtx;
+  std::condition_variable cond_var;
+  /// \brief Thread ids of the registered IssueQueryThreads.
+  std::vector<std::thread::id> thread_ids;
+  size_t num_threads{0};
+  /// \brief Whether the threads should be actively issuing queries.
+  bool issuing{false};
+  /// \brief Flags for each IssueQueryThread to mark that it is done.
+  std::vector<bool> thread_complete;
+  /// \brief Whether the threads can end now.
+  bool end_test{false};
+};
+
+}  // namespace loadgen
+
+}  // namespace mlperf
+
+#endif  // MLPERF_LOADGEN_ISSUE_QUERY_CONTROLLER_H_
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen.cc
new file mode 100644
index 000000000..42b2140de
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen.cc
@@ -0,0 +1,1345 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "loadgen.h"
+
+#include <stdint.h>
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <chrono>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <future>
+#include <iomanip>
+#include <iostream>
+#include <queue>
+#include <random>
+#include <sstream>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "early_stopping.h"
+#include "issue_query_controller.h"
+#include "logging.h"
+#include "query_sample.h"
+#include "query_sample_library.h"
+#include "results.h"
+#include "system_under_test.h"
+#include "test_settings.h"
+#include "test_settings_internal.h"
+#include "utils.h"
+#include "version.h"
+
+namespace mlperf {
+
+/// \brief Loadgen implementation details.
+namespace loadgen {
+
+/// \brief A random set of samples in the QSL that should fit in RAM when
+/// loaded together.
+struct LoadableSampleSet {
+  std::vector<QuerySampleIndex> set;
+  const size_t sample_distribution_end;  // Excludes padding in MultiStream.
+};
+
+/// \brief Generates nanoseconds from a start time to multiple end times.
+/// TODO: This isn't very useful anymore. Remove it.
+struct DurationGeneratorNs {
+  const PerfClock::time_point start;
+  int64_t delta(PerfClock::time_point end) const {
+    return std::chrono::duration_cast<std::chrono::nanoseconds>(end - start)
+        .count();
+  }
+};
+
+/// \brief ResponseDelegate implementation templated by scenario and mode.
+template <TestScenario scenario, TestMode mode>
+struct ResponseDelegateDetailed : public ResponseDelegate {
+  double accuracy_log_offset = 0.0f;
+  double accuracy_log_prob = 0.0f;
+
+  void SampleComplete(SampleMetadata* sample, QuerySampleResponse* response,
+                      PerfClock::time_point complete_begin_time,
+                      const ResponseCallback& response_cb) override {
+    // Using a raw pointer here should help us hit the std::function
+    // small buffer optimization code path when we aren't copying data.
+    // For some reason, using std::unique_ptr<std::vector> wasn't moving
+    // into the lambda; even with C++14.
+    std::vector<uint8_t>* sample_data_copy = nullptr;
+    double accuracy_log_val =
+        sample->accuracy_log_val + accuracy_log_offset < 1.0
+            ? sample->accuracy_log_val + accuracy_log_offset
+            : sample->accuracy_log_val + accuracy_log_offset - 1.0;
+    if (mode == TestMode::AccuracyOnly ||
+        accuracy_log_val <= accuracy_log_prob) {
+      // if a response_cb callback is provided, data only needs to reside on the
+      // host *after* calling it note that the callback is blocking and will
+      // likely involve a memcpy from accelerator to host
+      if (response_cb) {
+        response_cb(response);
+      }
+      // TODO: Verify accuracy with the data copied here.
+      uint8_t* src_begin = reinterpret_cast<uint8_t*>(response->data);
+      uint8_t* src_end = src_begin + response->size;
+      sample_data_copy = new std::vector<uint8_t>(src_begin, src_end);
+    }
+    int64_t n_tokens = response->n_tokens;
+    Log([sample, complete_begin_time, sample_data_copy,
+         n_tokens](AsyncLog& log) {
+      QueryMetadata* query = sample->query_metadata;
+      DurationGeneratorNs sched{query->scheduled_time};
+      if (scenario == TestScenario::Server) {
+        // Trace the server scenario as a stacked graph via counter events.
+        DurationGeneratorNs issued{query->issued_start_time};
+        log.TraceCounterEvent("Latency", query->scheduled_time, "issue_delay",
+                              sched.delta(query->issued_start_time),
+                              "issue_to_done",
+                              issued.delta(complete_begin_time));
+      }
+
+      // While visualizing overlapping samples in offline mode is not
+      // practical, sample completion is still recorded for auditing purposes.
+      log.TraceSample("Sample", sample->sequence_id, query->scheduled_time,
+                      complete_begin_time, "sample_seq", sample->sequence_id,
+                      "query_seq", query->sequence_id, "sample_idx",
+                      sample->sample_index, "issue_start_ns",
+                      sched.delta(query->issued_start_time), "complete_ns",
+                      sched.delta(complete_begin_time));
+
+      if (sample_data_copy) {
+        log.LogAccuracy(sample->sequence_id, sample->sample_index,
+                        LogBinaryAsHexString{sample_data_copy}, n_tokens);
+        delete sample_data_copy;
+      }
+
+      // Record the latency at the end, since it will unblock the issuing
+      // thread and potentially destroy the metadata being used above.
+      QuerySampleLatency latency = sched.delta(complete_begin_time);
+      log.RecordSampleCompletion(sample->sequence_id, complete_begin_time,
+                                 latency, n_tokens);
+    });
+  }
+
+  void TokenComplete(SampleMetadata* sample, QuerySampleResponse* response,
+                     PerfClock::time_point complete_begin_time,
+                     const ResponseCallback& response_cb) override {
+    // Using a raw pointer here should help us hit the std::function
+    // small buffer optimization code path when we aren't copying data.
+    // For some reason, using std::unique_ptr<std::vector> wasn't moving
+    // into the lambda; even with C++14.
+    std::vector<uint8_t>* token_data_copy = nullptr;
+    double accuracy_log_val =
+        sample->accuracy_log_val + accuracy_log_offset < 1.0
+            ? sample->accuracy_log_val + accuracy_log_offset
+            : sample->accuracy_log_val + accuracy_log_offset - 1.0;
+    if (mode == TestMode::AccuracyOnly ||
+        accuracy_log_val <= accuracy_log_prob) {
+      uint8_t* src_begin = reinterpret_cast<uint8_t*>(response->data);
+      uint8_t* src_end = src_begin + response->size;
+      token_data_copy = new std::vector<uint8_t>(src_begin, src_end);
+    }
+    Log([sample, complete_begin_time, token_data_copy](AsyncLog& log) {
+      QueryMetadata* query = sample->query_metadata;
+      DurationGeneratorNs sched{query->scheduled_time};
+      if (scenario == TestScenario::Server) {
+        DurationGeneratorNs issued{query->issued_start_time};
+        log.TraceCounterEvent(
+            "Token_Latency", query->scheduled_time, "issue_delay",
+            sched.delta(query->issued_start_time), "issue_to_done",
+            issued.delta(complete_begin_time));
+      } else {
+        log.TraceSample("Token", sample->sequence_id, query->scheduled_time,
+                        complete_begin_time, "sample_seq", sample->sequence_id,
+                        "query_seq", query->sequence_id, "sample_idx",
+                        sample->sample_index, "issue_start_ns",
+                        sched.delta(query->issued_start_time), "complete_ns",
+                        sched.delta(complete_begin_time));
+      }
+      if (token_data_copy) {
+        log.CacheToken(sample->sequence_id,
+                       LogBinaryAsHexString{token_data_copy});
+      }
+      QuerySampleLatency latency = sched.delta(complete_begin_time);
+      log.RecordTokenCompletion(sample->sequence_id, complete_begin_time,
+                                latency);
+    });
+  }
+
+  void QueryComplete() override {
+    // We only need to track outstanding queries in the server scenario to
+    // detect when the SUT has fallen too far behind.
+    if (scenario == TestScenario::Server) {
+      queries_completed.fetch_add(1, std::memory_order_relaxed);
+    }
+  }
+};
+
+/// \brief Selects the query timestamps for all scenarios except Server.
+template <TestScenario scenario>
+auto ScheduleDistribution(double qps) {
+  return [period = std::chrono::duration_cast<std::chrono::nanoseconds>(
+              std::chrono::duration<double>(1.0 / qps))](auto& /*gen*/) {
+    return period;
+  };
+}
+
+/// \brief Selects the query timestamps for the Server scenario.
+template <>
+auto ScheduleDistribution<TestScenario::Server>(double qps) {
+  // Poisson arrival process corresponds to exponentially distributed
+  // interarrival times.
+  return [dist = std::exponential_distribution<>(qps)](auto& gen) mutable {
+    return std::chrono::duration_cast<std::chrono::nanoseconds>(
+        std::chrono::duration<double>(dist(gen)));
+  };
+}
+
+/// \brief Selects samples for the accuracy mode.
+template <TestMode mode>
+auto SampleDistribution(size_t sample_count, size_t stride, std::mt19937* rng) {
+  std::vector<size_t> indices;
+  for (size_t i = 0; i < sample_count; i += stride) {
+    indices.push_back(i);
+  }
+  std::shuffle(indices.begin(), indices.end(), *rng);
+  return [indices = std::move(indices), i = size_t(0)](auto& /*gen*/) mutable {
+    return indices.at(i++);
+  };
+}
+
+/// \brief Selects samples for the performance mode.
+template <>
+auto SampleDistribution<TestMode::PerformanceOnly>(size_t sample_count,
+                                                   size_t /*stride*/,
+                                                   std::mt19937* /*rng*/) {
+  return [dist = std::uniform_int_distribution<>(0, sample_count - 1)](
+             auto& gen) mutable { return dist(gen); };
+}
+
+/// \brief Sample across the dataset, and ensure coverage of each of the
+/// samples.
+// Useful for non-uniform dataset (e.g. Llama2, GPTJ, 3d-unet)
+auto SampleDistributionEqualIssue(size_t sample_count, size_t set_size,
+                                  std::mt19937* rng) {
+  std::vector<size_t> indices;
+  std::vector<size_t> shuffle_indices(set_size);
+  std::iota(shuffle_indices.begin(), shuffle_indices.end(), 0);
+  for (size_t j = 0; j < sample_count; j += set_size) {
+    std::shuffle(shuffle_indices.begin(), shuffle_indices.end(), *rng);
+    indices.insert(indices.end(), shuffle_indices.begin(),
+                   shuffle_indices.end());
+  }
+  return [indices = std::move(indices), i = size_t(0)](auto& /*gen*/) mutable {
+    return indices.at((i++) % indices.size());
+  };
+}
+
+/// \brief Generates queries for the requested settings, templated by
+/// scenario and mode.
+/// \todo Make GenerateQueries faster.
+/// QueryMetadata is expensive to move; either reserve queries in advance
+/// so the queries vector doesn't need to grow. And/or parent samples to their
+/// queries only after all queries have been generated.
+/// \todo For the server scenario only, scale the query timeline at the end so
+/// the QPS as scheduled is equal to the QPS as requested.
+template <TestScenario scenario, TestMode mode>
+std::vector<QueryMetadata> GenerateQueries(
+    const TestSettingsInternal& settings,
+    const LoadableSampleSet& loaded_sample_set, SequenceGen* sequence_gen,
+    ResponseDelegate* response_delegate) {
+  auto tracer =
+      MakeScopedTracer([](AsyncTrace& trace) { trace("GenerateQueries"); });
+
+  auto& loaded_samples = loaded_sample_set.set;
+
+  // Generate 2x more samples than we think we'll need given the expected
+  // QPS in case the SUT is faster than expected.
+  // We should exit before issuing all queries.
+  // Does not apply to the server scenario since the duration only
+  // depends on the ideal scheduled time, not the actual issue time.
+  const int duration_multiplier = scenario == TestScenario::Server ? 1 : 2;
+  std::chrono::microseconds gen_duration =
+      duration_multiplier * settings.target_duration;
+  size_t min_queries = settings.min_query_count;
+
+  size_t samples_per_query = settings.samples_per_query;
+  if (mode == TestMode::AccuracyOnly && scenario == TestScenario::Offline) {
+    samples_per_query = loaded_sample_set.sample_distribution_end;
+  }
+
+  // We should not exit early in accuracy mode.
+  if (mode == TestMode::AccuracyOnly || settings.performance_issue_unique) {
+    gen_duration = std::chrono::microseconds(0);
+    // Integer truncation here is intentional.
+    // For MultiStream, loaded samples is properly padded.
+    // For Offline, we create a 'remainder' query at the end of this function.
+    min_queries = loaded_samples.size() / samples_per_query;
+  }
+
+  std::vector<QueryMetadata> queries;
+
+  // Using the std::mt19937 pseudo-random number generator ensures a modicum of
+  // cross platform reproducibility for trace generation.
+  std::mt19937 sample_rng(settings.sample_index_rng_seed);
+  std::mt19937 schedule_rng(settings.schedule_rng_seed);
+
+  constexpr bool kIsMultiStream = scenario == TestScenario::MultiStream;
+  const size_t sample_stride = kIsMultiStream ? samples_per_query : 1;
+
+  auto sample_distribution = SampleDistribution<mode>(
+      loaded_sample_set.sample_distribution_end, sample_stride, &sample_rng);
+  // Use the unique sample distribution same as in AccuracyMode to
+  // to choose samples when either flag performance_issue_unique
+  // or performance_issue_same is set.
+  auto sample_distribution_unique = SampleDistribution<TestMode::AccuracyOnly>(
+      loaded_sample_set.sample_distribution_end, sample_stride, &sample_rng);
+
+  auto sample_distribution_equal_issue = SampleDistributionEqualIssue(
+      min_queries, loaded_samples.size(), &sample_rng);
+
+  auto schedule_distribution =
+      ScheduleDistribution<scenario>(settings.target_qps);
+
+  // When sample_concatenate_permutation is turned on, pad to a multiple of the
+  // complete dataset to ensure fairness.
+  auto enable_equal_issue = settings.sample_concatenate_permutation;
+  if (mode != TestMode::AccuracyOnly && enable_equal_issue) {
+    if (scenario == TestScenario::Offline &&
+        samples_per_query % loaded_samples.size() != 0) {
+      // In offline mode, we pad samples_per_query
+      size_t pad_size =
+          (loaded_samples.size() - samples_per_query % loaded_samples.size());
+      samples_per_query += pad_size;
+    } else if ((scenario != TestScenario::Offline) &&
+               (min_queries % loaded_samples.size() != 0)) {
+      // In Server, SingleStream, MultiStream mode, the min_queries should be
+      // padded
+      size_t pad_size =
+          (loaded_samples.size() - min_queries % loaded_samples.size());
+      min_queries += pad_size;
+    }
+  }
+
+  std::vector<QuerySampleIndex> samples(samples_per_query);
+  std::chrono::nanoseconds timestamp(0);
+  std::chrono::nanoseconds prev_timestamp(0);
+  // Choose a single sample to repeat when in performance_issue_same mode
+  QuerySampleIndex same_sample = settings.performance_issue_same_index;
+
+  while (prev_timestamp < gen_duration || queries.size() < min_queries) {
+    if (kIsMultiStream) {
+      QuerySampleIndex sample_i = settings.performance_issue_unique
+                                      ? sample_distribution_unique(sample_rng)
+                                  : settings.performance_issue_same
+                                      ? same_sample
+                                      : sample_distribution(sample_rng);
+      for (auto& s : samples) {
+        // Select contiguous samples in the MultiStream scenario.
+        // This will not overflow, since GenerateLoadableSets adds padding at
+        // the end of the loadable sets in the MultiStream scenario.
+        // The padding allows the starting samples to be the same for each
+        // query with respect to samples_per_query.
+        s = loaded_samples[sample_i++];
+      }
+    } else if (scenario == TestScenario::Offline) {
+      // For the Offline + Performance scenario, we also want to support
+      // contiguous samples. In this scenario the query can be much larger than
+      // what fits into memory. We simply repeat loaded_samples N times, plus a
+      // remainder to ensure we fill up samples. Note that this eliminates
+      // randomization.
+      size_t num_loaded_samples = loaded_samples.size();
+      size_t num_full_repeats = samples_per_query / num_loaded_samples;
+      uint64_t remainder = samples_per_query % (num_loaded_samples);
+      if (settings.performance_issue_same) {
+        std::fill(samples.begin(), samples.begin() + samples_per_query,
+                  loaded_samples[same_sample]);
+      } else {
+        for (size_t i = 0; i < num_full_repeats; ++i) {
+          std::copy(loaded_samples.begin(), loaded_samples.end(),
+                    samples.begin() + i * num_loaded_samples);
+
+          if (settings.sample_concatenate_permutation) {
+            std::shuffle(samples.begin() + i * num_loaded_samples,
+                         samples.begin() + (i + 1) * num_loaded_samples,
+                         sample_rng);
+          }
+        }
+
+        std::copy(loaded_samples.begin(), loaded_samples.begin() + remainder,
+                  samples.begin() + num_full_repeats * num_loaded_samples);
+
+        if (settings.sample_concatenate_permutation) {
+          assert(remainder == 0);
+        }
+      }
+    } else {
+      for (auto& s : samples) {
+        s = loaded_samples[settings.performance_issue_unique
+                               ? sample_distribution_unique(sample_rng)
+                           : settings.performance_issue_same ? same_sample
+                           : enable_equal_issue
+                               ? sample_distribution_equal_issue(sample_rng)
+                               : sample_distribution(sample_rng)];
+      }
+    }
+    queries.emplace_back(samples, timestamp, response_delegate, sequence_gen);
+    prev_timestamp = timestamp;
+    timestamp += schedule_distribution(schedule_rng);
+    // In equal_issue mode, the min_queries will be bumped up by a multiple of
+    // the dataset size if the test time has not met the threshold.
+    if (enable_equal_issue && (queries.size() >= min_queries) &&
+        (prev_timestamp < gen_duration) &&
+        (scenario != TestScenario::Offline)) {
+      min_queries += loaded_samples.size();
+    }
+  }
+
+  // See if we need to create a "remainder" query for offline+accuracy to
+  // ensure we issue all samples in loaded_samples. Offline doesn't pad
+  // loaded_samples like MultiStream does.
+  if (scenario == TestScenario::Offline && mode == TestMode::AccuracyOnly) {
+    size_t remaining_samples = loaded_samples.size() % samples_per_query;
+    if (remaining_samples != 0) {
+      samples.resize(remaining_samples);
+      for (auto& s : samples) {
+        s = loaded_samples[sample_distribution(sample_rng)];
+      }
+      queries.emplace_back(samples, timestamp, response_delegate, sequence_gen);
+    }
+  }
+
+  LogDetail([count = queries.size(), spq = samples_per_query,
+             duration = timestamp.count()](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG(detail, "generated_query_count", count);
+    MLPERF_LOG(detail, "generated_samples_per_query", spq);
+    MLPERF_LOG(detail, "generated_query_duration", duration);
+#else
+    detail("GeneratedQueries: ", "queries", count, "samples per query", spq,
+           "duration", duration);
+#endif
+  });
+
+  return queries;
+}
+
+/// \brief Issues a series of pre-generated queries.
+// TODO: Templates for scenario and mode are overused, given the loadgen
+//       no longer generates queries on the fly. Should we reduce the
+//       use of templates?
+template <TestScenario scenario, TestMode mode>
+PerformanceResult IssueQueries(SystemUnderTest* sut,
+                               const TestSettingsInternal& settings,
+                               const LoadableSampleSet& loaded_sample_set,
+                               SequenceGen* sequence_gen) {
+  // Create reponse handler.
+  ResponseDelegateDetailed<scenario, mode> response_logger;
+  std::uniform_real_distribution<double> accuracy_log_offset_dist =
+      std::uniform_real_distribution<double>(0.0, 1.0);
+  std::mt19937 accuracy_log_offset_rng(settings.accuracy_log_rng_seed);
+  response_logger.accuracy_log_offset =
+      accuracy_log_offset_dist(accuracy_log_offset_rng);
+  response_logger.accuracy_log_prob = settings.accuracy_log_probability;
+
+  // Generate queries.
+  auto sequence_id_start = sequence_gen->CurrentSampleId();
+  std::vector<QueryMetadata> queries = GenerateQueries<scenario, mode>(
+      settings, loaded_sample_set, sequence_gen, &response_logger);
+
+  // Calculated expected number of queries
+  uint64_t expected_queries =
+      settings.target_qps * settings.min_duration.count() / 1000;
+  uint64_t minimum_queries =
+      settings.min_query_count * settings.samples_per_query;
+
+  if (scenario != TestScenario::Offline) {
+    expected_queries *= settings.samples_per_query;
+  } else {
+    minimum_queries = settings.min_sample_count;
+  }
+
+  expected_queries =
+      expected_queries < minimum_queries ? minimum_queries : expected_queries;
+
+  if (settings.accuracy_log_sampling_target > 0) {
+    response_logger.accuracy_log_prob =
+        (double)settings.accuracy_log_sampling_target / expected_queries;
+  }
+  auto sequence_id_end = sequence_gen->CurrentSampleId();
+  size_t max_latencies_to_record = sequence_id_end - sequence_id_start;
+
+  // Initialize logger for latency recording.
+  GlobalLogger().RestartLatencyRecording(sequence_id_start,
+                                         max_latencies_to_record);
+
+  // Create and initialize an IssueQueryState.
+  IssueQueryState state{
+      sut, &queries, &response_logger, &settings, mode, {}, {}, false, 0,
+      0,   {}};
+  auto& controller = IssueQueryController::GetInstance();
+
+  // Set number of IssueQueryThreads and wait for the threads to register.
+  controller.SetNumThreads(settings.requested.server_num_issue_query_threads);
+
+  // Start issuing the queries.
+  controller.StartIssueQueries<scenario>(&state);
+
+  // Gather query issuing statistics.
+  const auto start_for_power = state.start_for_power;
+  const auto start = state.start_time;
+  const auto ran_out_of_generated_queries = state.ran_out_of_generated_queries;
+  const auto queries_issued = state.queries_issued;
+  const auto expected_latencies = state.expected_latencies;
+
+  // Let the SUT know it should not expect any more queries.
+  sut->FlushQueries();
+
+  if (mode == TestMode::PerformanceOnly && ran_out_of_generated_queries) {
+    LogDetail([](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+      MLPERF_LOG_ERROR(
+          detail, "error_runtime",
+          "Ending early: Ran out of generated queries to issue before the "
+          "minimum query count and test duration were reached. "
+          "Please update the relevant expected latency or target qps in the "
+          "TestSettings so they are more accurate.");
+#else
+      detail.Error(
+          "Ending early: Ran out of generated queries to issue before the "
+          "minimum query count and test duration were reached.");
+      detail(
+          "Please update the relevant expected latency or target qps in the "
+          "TestSettings so they are more accurate.");
+#endif
+    });
+  }
+
+  // Wait for tail queries to complete and collect all the latencies.
+  // We have to keep the synchronization primitives alive until the SUT
+  // is done with them.
+  auto& final_query = queries[queries_issued - 1];
+  std::vector<QuerySampleLatency> sample_latencies(
+      GlobalLogger().GetLatenciesBlocking(expected_latencies));
+
+  std::vector<QuerySampleLatency> first_token_latencies(
+      GlobalLogger().GetTokenLatencies(expected_latencies));
+
+  std::vector<QuerySampleLatency> time_per_output_token_arr(
+      GlobalLogger().GetTimePerOutputToken(expected_latencies));
+
+  std::vector<int64_t> tokens_per_sample(
+      GlobalLogger().GetTokensPerSample(expected_latencies));
+
+  // Log contention counters after every test as a sanity check.
+  GlobalLogger().LogContentionAndAllocations();
+
+  // This properly accounts for the fact that the max completion time may not
+  // belong to the final query. It also excludes any time spent postprocessing
+  // in the loadgen itself after final completion, which may be significant
+  // in the offline scenario.
+  PerfClock::time_point max_completion_time =
+      GlobalLogger().GetMaxCompletionTime();
+  auto sut_active_duration = max_completion_time - start;
+  LogDetail([start_for_power, sut_active_duration](AsyncDetail& detail) {
+    auto end_for_power =
+        start_for_power +
+        std::chrono::duration_cast<std::chrono::system_clock::duration>(
+            sut_active_duration);
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG_INTERVAL_START(detail, "power_begin",
+                              DateTimeStringForPower(start_for_power));
+    MLPERF_LOG_INTERVAL_END(detail, "power_end",
+                            DateTimeStringForPower(end_for_power));
+#else
+    detail("POWER_BEGIN: ", "mode", ToString(mode), "time",
+           DateTimeStringForPower(start_for_power));
+    detail("POWER_END: ", "mode", ToString(mode), "time",
+           DateTimeStringForPower(end_for_power));
+#endif
+  });
+
+  double max_latency =
+      QuerySampleLatencyToSeconds(GlobalLogger().GetMaxLatencySoFar());
+  double final_query_scheduled_time =
+      DurationToSeconds(final_query.scheduled_delta);
+  double final_query_issued_time =
+      DurationToSeconds(final_query.issued_start_time - start);
+  double final_query_all_samples_done_time =
+      DurationToSeconds(final_query.all_samples_done_time - start);
+
+  std::vector<QuerySampleLatency> query_latencies;
+  if (scenario == TestScenario::MultiStream) {
+    query_latencies.resize(queries_issued);
+    for (size_t i = 0; i < queries_issued; i++) {
+      query_latencies[i] = DurationGeneratorNs{queries[i].scheduled_time}.delta(
+          queries[i].all_samples_done_time);
+    }
+  }
+
+  return PerformanceResult{
+      std::move(sample_latencies),
+      std::move(query_latencies),
+      queries_issued,
+      max_latency,
+      final_query_scheduled_time,
+      final_query_issued_time,
+      final_query_all_samples_done_time,
+      TokenPerformanceResults{first_token_latencies, time_per_output_token_arr,
+                              tokens_per_sample}};
+}
+
+void LoadSamplesToRam(QuerySampleLibrary* qsl,
+                      const std::vector<QuerySampleIndex>& samples) {
+  LogDetail([&samples](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG(detail, "loaded_qsl_set", samples);
+#else
+    std::string set("\"[");
+    for (auto i : samples) {
+      set += std::to_string(i) + ",";
+    }
+    set.resize(set.size() - 1);
+    set += "]\"";
+    detail("Loading QSL : ", "set", set);
+#endif
+  });
+  qsl->LoadSamplesToRam(samples);
+}
+
+/// \brief Generates random sets of samples in the QSL that we can load into
+/// RAM at the same time.
+std::vector<LoadableSampleSet> GenerateLoadableSets(
+    QuerySampleLibrary* qsl, const TestSettingsInternal& settings) {
+  auto tracer = MakeScopedTracer(
+      [](AsyncTrace& trace) { trace("GenerateLoadableSets"); });
+
+  std::vector<LoadableSampleSet> result;
+  std::mt19937 qsl_rng(settings.qsl_rng_seed);
+
+  // Generate indices for all available samples in the QSL.
+  const size_t qsl_total_count = qsl->TotalSampleCount();
+  std::vector<QuerySampleIndex> samples(qsl_total_count);
+  for (size_t i = 0; i < qsl_total_count; i++) {
+    samples[i] = static_cast<QuerySampleIndex>(i);
+  }
+
+  // Randomize the order of the samples.
+  std::shuffle(samples.begin(), samples.end(), qsl_rng);
+
+  // Partition the samples into loadable sets.
+  const size_t set_size = settings.performance_sample_count;
+  const size_t set_padding = (settings.scenario == TestScenario::MultiStream)
+                                 ? settings.samples_per_query - 1
+                                 : 0;
+  std::vector<QuerySampleIndex> loadable_set;
+  loadable_set.reserve(set_size + set_padding);
+
+  for (auto s : samples) {
+    loadable_set.push_back(s);
+    if (loadable_set.size() == set_size) {
+      result.push_back({std::move(loadable_set), set_size});
+      loadable_set.clear();
+      loadable_set.reserve(set_size + set_padding);
+    }
+  }
+
+  if (!loadable_set.empty()) {
+    // Copy the size since it will become invalid after the move.
+    size_t loadable_set_size = loadable_set.size();
+    result.push_back({std::move(loadable_set), loadable_set_size});
+  }
+
+  // Add padding for the multi stream scenario. Padding allows the
+  // starting sample to be the same for all SUTs, independent of the value
+  // of samples_per_query, while enabling samples in a query to be contiguous.
+  for (auto& loadable_set : result) {
+    auto& set = loadable_set.set;
+    for (size_t i = 0; i < set_padding; i++) {
+      // It's not clear in the spec if the STL deallocates the old container
+      // before assigning, which would invalidate the source before the
+      // assignment happens. Even though we should have reserved enough
+      // elements above, copy the source first anyway since we are just moving
+      // integers around.
+      QuerySampleIndex p = set[i];
+      set.push_back(p);
+    }
+  }
+
+  return result;
+}
+
+/// \brief Opens and owns handles to all of the log files.
+struct LogOutputs {
+  LogOutputs(const LogOutputSettings& output_settings,
+             const std::string& test_date_time) {
+    std::string prefix = output_settings.outdir;
+    prefix += "/" + output_settings.prefix;
+    if (output_settings.prefix_with_datetime) {
+      prefix += test_date_time + "_";
+    }
+    const std::string& suffix = output_settings.suffix;
+
+    summary_out.open(prefix + "summary" + suffix + ".txt");
+    detail_out.open(prefix + "detail" + suffix + ".txt");
+    accuracy_out.open(prefix + "accuracy" + suffix + ".json");
+    trace_out.open(prefix + "trace" + suffix + ".json");
+  }
+
+  bool CheckOutputs() {
+    bool all_ofstreams_good = true;
+    if (!summary_out.good()) {
+      all_ofstreams_good = false;
+      std::cerr << "LoadGen: Failed to open summary file.";
+    }
+    if (!detail_out.good()) {
+      all_ofstreams_good = false;
+      std::cerr << "LoadGen: Failed to open detailed log file.";
+    }
+    if (!accuracy_out.good()) {
+      all_ofstreams_good = false;
+      std::cerr << "LoadGen: Failed to open accuracy log file.";
+    }
+    if (!trace_out.good()) {
+      all_ofstreams_good = false;
+      std::cerr << "LoadGen: Failed to open trace file.";
+    }
+    return all_ofstreams_good;
+  }
+
+  std::ofstream summary_out;
+  std::ofstream detail_out;
+  std::ofstream accuracy_out;
+  std::ofstream trace_out;
+};
+
+/// \brief Find boundaries of performance settings by widening bounds
+/// exponentially.
+/// \details To find an upper bound of performance, widen an
+/// upper bound exponentially until finding a bound that can't satisfy
+/// performance constraints. i.e. [1, 2) -> [2, 4) -> [4, 8) -> ...
+template <TestScenario scenario>
+std::pair<PerformanceSummary, PerformanceSummary> FindBoundaries(
+    SystemUnderTest* sut, QuerySampleLibrary* qsl, SequenceGen* sequence_gen,
+    PerformanceSummary l_perf_summary) {
+  // Get upper bound
+  TestSettingsInternal u_settings = l_perf_summary.settings;
+  find_peak_performance::WidenPerformanceField<scenario>(&u_settings);
+
+  LogDetail(
+      [l_field = find_peak_performance::ToStringPerformanceField<scenario>(
+           l_perf_summary.settings),
+       u_field = find_peak_performance::ToStringPerformanceField<scenario>(
+           u_settings)](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+        MLPERF_LOG(detail, "generic_message",
+                   "FindBoundaries: Checking fields [" + l_field + ", " +
+                       u_field + ")");
+#else
+        detail("FindBoundaries: Checking fields [" + l_field + ", " + u_field +
+               ")");
+#endif
+      });
+
+  std::vector<loadgen::LoadableSampleSet> loadable_sets(
+      loadgen::GenerateLoadableSets(qsl, u_settings));
+  const LoadableSampleSet& performance_set = loadable_sets.front();
+  LoadSamplesToRam(qsl, performance_set.set);
+
+  PerformanceResult u_pr(IssueQueries<scenario, TestMode::PerformanceOnly>(
+      sut, u_settings, performance_set, sequence_gen));
+  PerformanceSummary u_perf_summary{sut->Name(), u_settings, std::move(u_pr)};
+
+  qsl->UnloadSamplesFromRam(performance_set.set);
+
+  std::string tmp;
+  if (!u_perf_summary.PerfConstraintsMet(&tmp)) {
+    return std::make_pair(l_perf_summary, u_perf_summary);
+  } else {
+    return FindBoundaries<scenario>(sut, qsl, sequence_gen, u_perf_summary);
+  }
+}
+
+/// \brief Find peak performance by binary search.
+/// \details The found lower & upper bounds by the function 'FindBoundaries' are
+/// used as initial bounds of binary search
+template <TestScenario scenario>
+PerformanceSummary FindPeakPerformanceBinarySearch(
+    SystemUnderTest* sut, QuerySampleLibrary* qsl, SequenceGen* sequence_gen,
+    const LoadableSampleSet& performance_set, PerformanceSummary l_perf_summary,
+    PerformanceSummary u_perf_summary) {
+  if (find_peak_performance::IsFinished<scenario>(l_perf_summary.settings,
+                                                  u_perf_summary.settings)) {
+    return l_perf_summary;
+  }
+
+  const TestSettingsInternal m_settings =
+      find_peak_performance::MidOfBoundaries<scenario>(l_perf_summary.settings,
+                                                       u_perf_summary.settings);
+
+  LogDetail([l_field =
+                 find_peak_performance::ToStringPerformanceField<scenario>(
+                     l_perf_summary.settings),
+             u_field =
+                 find_peak_performance::ToStringPerformanceField<scenario>(
+                     u_perf_summary.settings),
+             m_field =
+                 find_peak_performance::ToStringPerformanceField<scenario>(
+                     m_settings)](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG(
+        detail, "generic_message",
+        "FindPeakPerformanceBinarySearch: Testing the mid value of bounds [" +
+            l_field + ", " + u_field + "): " + m_field);
+#else
+    detail(
+        "FindPeakPerformanceBinarySearch: Testing the mid value of bounds [" +
+        l_field + ", " + u_field + "): " + m_field);
+#endif
+  });
+
+  PerformanceResult m_pr(IssueQueries<scenario, TestMode::PerformanceOnly>(
+      sut, m_settings, performance_set, sequence_gen));
+  PerformanceSummary m_perf_summary{sut->Name(), m_settings, std::move(m_pr)};
+
+  std::string tmp;
+  if (m_perf_summary.PerfConstraintsMet(&tmp)) {
+    return FindPeakPerformanceBinarySearch<scenario>(
+        sut, qsl, sequence_gen, performance_set, m_perf_summary,
+        u_perf_summary);
+  } else {
+    return FindPeakPerformanceBinarySearch<scenario>(
+        sut, qsl, sequence_gen, performance_set, l_perf_summary,
+        m_perf_summary);
+  }
+}
+
+/// \brief Runs the performance mode, templated by scenario.
+template <TestScenario scenario>
+void RunPerformanceMode(SystemUnderTest* sut, QuerySampleLibrary* qsl,
+                        const TestSettingsInternal& settings,
+                        SequenceGen* sequence_gen) {
+  LogDetail([](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG(detail, "generic_message", "Starting performance mode");
+#else
+    detail("Starting performance mode:");
+#endif
+  });
+
+  // Use first loadable set as the performance set.
+  std::vector<loadgen::LoadableSampleSet> loadable_sets(
+      loadgen::GenerateLoadableSets(qsl, settings));
+  const LoadableSampleSet& performance_set = loadable_sets.front();
+  LoadSamplesToRam(qsl, performance_set.set);
+
+  // Start PerfClock/system_clock timers for measuring performance interval
+  // for comparison vs external timer.
+  auto pc_start_ts = PerfClock::now();
+  auto sc_start_ts = std::chrono::system_clock::now();
+  if (settings.print_timestamps) {
+    std::cout << "Loadgen :: Perf mode start. system_clock Timestamp = "
+              << std::chrono::system_clock::to_time_t(sc_start_ts) << "\n"
+              << std::flush;
+  }
+
+  PerformanceResult pr(IssueQueries<scenario, TestMode::PerformanceOnly>(
+      sut, settings, performance_set, sequence_gen));
+
+  // Measure PerfClock/system_clock timer durations for comparison vs
+  // external timer.
+  auto pc_stop_ts = PerfClock::now();
+  auto sc_stop_ts = std::chrono::system_clock::now();
+  auto pc_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
+                         pc_stop_ts - pc_start_ts)
+                         .count();
+  auto sc_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
+                         sc_stop_ts - sc_start_ts)
+                         .count();
+  float pc_sc_ratio = static_cast<float>(pc_duration) / sc_duration;
+  if (settings.print_timestamps) {
+    std::cout << "Loadgen :: Perf mode stop. systme_clock Timestamp = "
+              << std::chrono::system_clock::to_time_t(sc_stop_ts) << "\n"
+              << std::flush;
+    std::cout << "Loadgen :: PerfClock Perf duration = " << pc_duration
+              << "ms\n"
+              << std::flush;
+    std::cout << "Loadgen :: system_clock Perf duration = " << sc_duration
+              << "ms\n"
+              << std::flush;
+    std::cout << "Loadgen :: PerfClock/system_clock ratio = " << std::fixed
+              << std::setprecision(4) << pc_sc_ratio << "\n"
+              << std::flush;
+  }
+
+  if (pc_sc_ratio > 1.01 || pc_sc_ratio < 0.99) {
+    LogDetail([pc_sc_ratio](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+      std::stringstream ss;
+      ss << "PerfClock and system_clock differ by more than 1%! "
+         << " pc_sc_ratio: " << pc_sc_ratio;
+      MLPERF_LOG_ERROR(detail, "error_runtime", ss.str());
+#else
+      detail.Error("PerfClock and system_clock differ by more than 1\%! ",
+                   "pc_sc_ratio", pc_sc_ratio);
+#endif
+    });
+  } else if (pc_sc_ratio > 1.001 || pc_sc_ratio < 0.999) {
+    LogDetail([pc_sc_ratio](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+      std::stringstream ss;
+      ss << "PerfClock and system_clock differ by more than 0.1%! "
+         << " pc_sc_ratio: " << pc_sc_ratio;
+      MLPERF_LOG_WARNING(detail, "warning_generic_message", ss.str());
+#else
+      detail.Warning("PerfClock and system_clock differ by more than 0.1\%. ",
+                     "pc_sc_ratio", pc_sc_ratio);
+#endif
+    });
+  }
+
+  PerformanceSummary perf_summary{sut->Name(), settings, std::move(pr)};
+  LogSummary([perf_summary](AsyncSummary& summary) mutable {
+    perf_summary.LogSummary(summary);
+  });
+  // Create a copy to prevent thread hazard between LogSummary and LogDetail.
+  PerformanceSummary perf_summary_detail{perf_summary};
+  LogDetail([perf_summary_detail](AsyncDetail& detail) mutable {
+    perf_summary_detail.LogDetail(detail);
+  });
+
+  qsl->UnloadSamplesFromRam(performance_set.set);
+}
+
+/// \brief Runs the binary search mode, templated by scenario.
+/// \details 1. Check whether lower bound from user satisfies the performance
+/// constraints, 2. Find an upper bound using the function 'FindBoundaries'
+/// based on the lower bound, 3. Find peak performance settings using the
+/// function 'FindPeakPerformanceBinarySearch'. note: Since we can't find a
+/// lower bound programmatically because of the monotonicity issue of Server
+/// scenario, rely on user's settings. After resolving this issue, we can
+/// make the function 'FindBoundaries' find a lower bound as well from some
+/// random initial settings.
+template <TestScenario scenario>
+void FindPeakPerformanceMode(SystemUnderTest* sut, QuerySampleLibrary* qsl,
+                             const TestSettingsInternal& base_settings,
+                             SequenceGen* sequence_gen) {
+  LogDetail([](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG(detail, "generic_message", "Starting FindPeakPerformance mode");
+#else
+    detail("Starting FindPeakPerformance mode:");
+#endif
+  });
+
+  if (scenario != TestScenario::Server) {
+    LogDetail([unsupported_scenario = ToString(scenario)](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+      MLPERF_LOG_ERROR(detail, "error_invalid_config",
+                       find_peak_performance::kNotSupportedMsg);
+#else
+      detail.Error(find_peak_performance::kNotSupportedMsg);
+#endif
+    });
+    return;
+  }
+
+  LogDetail(
+      [base_field = find_peak_performance::ToStringPerformanceField<scenario>(
+           base_settings)](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+        MLPERF_LOG(
+            detail, "generic_message",
+            "FindPeakPerformance: Check validity of the base settings field: " +
+                base_field);
+#else
+        detail(
+            "FindPeakPerformance: Check validity of the base settings field: " +
+            base_field);
+#endif
+      });
+
+  // 1. Check whether the lower bound came from user satisfy performance
+  // constraints or not.
+  std::vector<loadgen::LoadableSampleSet> base_loadable_sets(
+      loadgen::GenerateLoadableSets(qsl, base_settings));
+  const LoadableSampleSet& base_performance_set = base_loadable_sets.front();
+  LoadSamplesToRam(qsl, base_performance_set.set);
+
+  PerformanceResult base_pr(IssueQueries<scenario, TestMode::PerformanceOnly>(
+      sut, base_settings, base_performance_set, sequence_gen));
+  PerformanceSummary base_perf_summary{sut->Name(), base_settings,
+                                       std::move(base_pr)};
+
+  // We can also use all_constraints_met to check performance constraints,
+  // but to reduce searching time, leave it up to whether the settings satisfy
+  // min duration & min queries or not to users.
+  std::string msg;
+  if (!base_perf_summary.PerfConstraintsMet(&msg)) {
+    LogDetail([msg](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+      std::stringstream ss;
+      ss << "FindPeakPerformance: Initial lower bound does not satisfy "
+         << "performance constraints, msg: " << msg;
+      MLPERF_LOG_ERROR(detail, "error_runtime", ss.str());
+#else
+      detail.Error(
+          "FindPeakPerformance: Initial lower bound does not satisfy "
+          "performance constraints, msg: " +
+          msg);
+#endif
+    });
+
+    PerformanceSummary perf_summary{sut->Name(), base_settings,
+                                    std::move(base_perf_summary.pr)};
+    LogSummary([perf_summary](AsyncSummary& summary) mutable {
+      perf_summary.LogSummary(summary);
+    });
+    // Create a copy to prevent thread hazard between LogSummary and LogDetail.
+    PerformanceSummary perf_summary_detail{perf_summary};
+    LogDetail([perf_summary_detail](AsyncDetail& detail) mutable {
+      perf_summary_detail.LogDetail(detail);
+    });
+
+    qsl->UnloadSamplesFromRam(base_performance_set.set);
+
+    return;
+  }
+
+  // Clear loaded samples.
+  qsl->UnloadSamplesFromRam(base_performance_set.set);
+
+  // 2. Find an upper bound based on the lower bound.
+  std::pair<PerformanceSummary, PerformanceSummary> boundaries =
+      FindBoundaries<scenario>(sut, qsl, sequence_gen, base_perf_summary);
+  PerformanceSummary l_perf_summary = boundaries.first;
+  PerformanceSummary u_perf_summary = boundaries.second;
+
+  LogDetail(
+      [l_field = find_peak_performance::ToStringPerformanceField<scenario>(
+           l_perf_summary.settings),
+       u_field = find_peak_performance::ToStringPerformanceField<scenario>(
+           u_perf_summary.settings)](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+        MLPERF_LOG(detail, "generic_message",
+                   "FindPeakPerformance: Found boundaries: [" + l_field + ", " +
+                       u_field + ")");
+#else
+        detail("FindPeakPerformance: Found boundaries: [" + l_field + ", " +
+               u_field + ")");
+#endif
+      });
+
+  // Reuse performance_set, u_perf_summary has the largest 'samples_per_query'.
+  std::vector<loadgen::LoadableSampleSet> loadable_sets(
+      loadgen::GenerateLoadableSets(qsl, u_perf_summary.settings));
+  const LoadableSampleSet& performance_set = loadable_sets.front();
+  LoadSamplesToRam(qsl, performance_set.set);
+
+  // 3. Find peak performance settings using the found boundaries
+  PerformanceSummary perf_summary = FindPeakPerformanceBinarySearch<scenario>(
+      sut, qsl, sequence_gen, performance_set, l_perf_summary, u_perf_summary);
+
+  // Print-out the peak performance test setting.
+  LogDetail([field = find_peak_performance::ToStringPerformanceField<scenario>(
+                 perf_summary.settings)](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG(detail, "generic_message",
+               "FindPeakPerformance: Found peak performance field: " + field);
+#else
+    detail("FindPeakPerformance: Found peak performance field: " + field);
+#endif
+  });
+
+  LogSummary([perf_summary](AsyncSummary& summary) mutable {
+    perf_summary.LogSummary(summary);
+  });
+  // Create a copy to prevent thread hazard between LogSummary and LogDetail.
+  PerformanceSummary perf_summary_detail{perf_summary};
+  LogDetail([perf_summary_detail](AsyncDetail& detail) mutable {
+    perf_summary_detail.LogDetail(detail);
+  });
+
+  qsl->UnloadSamplesFromRam(performance_set.set);
+}
+
+/// \brief Runs the accuracy mode, templated by scenario.
+template <TestScenario scenario>
+void RunAccuracyMode(SystemUnderTest* sut, QuerySampleLibrary* qsl,
+                     const TestSettingsInternal& settings,
+                     SequenceGen* sequence_gen) {
+  LogDetail([](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG(detail, "generic_message", "Starting accuracy mode");
+#else
+    detail("Starting accuracy mode:");
+#endif
+  });
+
+  std::vector<loadgen::LoadableSampleSet> loadable_sets(
+      loadgen::GenerateLoadableSets(qsl, settings));
+
+  for (auto& loadable_set : loadable_sets) {
+    {
+      auto tracer = MakeScopedTracer(
+          [count = loadable_set.set.size()](AsyncTrace& trace) {
+            trace("LoadSamples", "count", count);
+          });
+      LoadSamplesToRam(qsl, loadable_set.set);
+    }
+
+    PerformanceResult pr(IssueQueries<scenario, TestMode::AccuracyOnly>(
+        sut, settings, loadable_set, sequence_gen));
+
+    {
+      auto tracer = MakeScopedTracer(
+          [count = loadable_set.set.size()](AsyncTrace& trace) {
+            trace("UnloadSampes", "count", count);
+          });
+      qsl->UnloadSamplesFromRam(loadable_set.set);
+    }
+  }
+}
+
+/// \brief Routes runtime scenario requests to the corresponding instances
+/// of its templated mode functions.
+struct RunFunctions {
+  using Signature = void(SystemUnderTest* sut, QuerySampleLibrary* qsl,
+                         const TestSettingsInternal& settings,
+                         SequenceGen* sequence_gen);
+
+  template <TestScenario compile_time_scenario>
+  static RunFunctions GetCompileTime() {
+    return {(RunAccuracyMode<compile_time_scenario>),
+            (RunPerformanceMode<compile_time_scenario>),
+            (FindPeakPerformanceMode<compile_time_scenario>)};
+  }
+
+  static RunFunctions Get(TestScenario run_time_scenario) {
+    switch (run_time_scenario) {
+      case TestScenario::SingleStream:
+        return GetCompileTime<TestScenario::SingleStream>();
+      case TestScenario::MultiStream:
+        return GetCompileTime<TestScenario::MultiStream>();
+      case TestScenario::Server:
+        return GetCompileTime<TestScenario::Server>();
+      case TestScenario::Offline:
+        return GetCompileTime<TestScenario::Offline>();
+    }
+    // We should not reach this point.
+    assert(false);
+    return GetCompileTime<TestScenario::SingleStream>();
+  }
+
+  Signature& accuracy;
+  Signature& performance;
+  Signature& find_peak_performance;
+};
+
+}  // namespace loadgen
+
+void StartTest(SystemUnderTest* sut, QuerySampleLibrary* qsl,
+               const TestSettings& requested_settings,
+               const LogSettings& log_settings,
+               const std::string audit_config_filename) {
+  GlobalLogger().StartIOThread();
+
+  const std::string test_date_time = CurrentDateTimeISO8601();
+
+  loadgen::LogOutputs log_outputs(log_settings.log_output, test_date_time);
+  if (!log_outputs.CheckOutputs()) {
+    return;
+  }
+
+  GlobalLogger().StartLogging(&log_outputs.summary_out, &log_outputs.detail_out,
+                              &log_outputs.accuracy_out,
+                              log_settings.log_output.copy_detail_to_stdout,
+                              log_settings.log_output.copy_summary_to_stdout);
+
+  GlobalLogger().SetUseTokens(requested_settings.use_token_latencies);
+  bool needs_first_token =
+      (requested_settings.scenario != TestScenario::Offline);
+  GlobalLogger().SetNeedsFirstToken(needs_first_token);
+
+  if (log_settings.enable_trace) {
+    GlobalLogger().StartNewTrace(&log_outputs.trace_out, PerfClock::now());
+  }
+
+  // measure sut->Name() response time
+  PerfClock::time_point pre_get_sut_name_ts = PerfClock::now();
+  const std::string& sut_name = sut->Name();
+  PerfClock::time_point post_get_sut_name_ts = PerfClock::now();
+
+  auto get_sut_name_duration_ns =
+      std::chrono::duration_cast<std::chrono::nanoseconds>(
+          post_get_sut_name_ts - pre_get_sut_name_ts)
+          .count();
+
+  LogLoadgenVersion();
+  LogDetail([sut, qsl, test_date_time, &sut_name,
+             &get_sut_name_duration_ns](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG(detail, "test_datetime", test_date_time);
+    MLPERF_LOG(detail, "sut_name", sut_name);
+    MLPERF_LOG(detail, "get_sut_name_duration_ns", get_sut_name_duration_ns);
+    MLPERF_LOG(detail, "qsl_name", qsl->Name());
+    MLPERF_LOG(detail, "qsl_reported_total_count", qsl->TotalSampleCount());
+    MLPERF_LOG(detail, "qsl_reported_performance_count",
+               qsl->PerformanceSampleCount());
+#else
+    detail("Date + time of test: ", test_date_time);
+    detail("System Under Test (SUT) name: ", sut_name);
+    detail("Get SUT name time [ns]: ", get_sut_name_duration_ns);
+    detail("Query Sample Library (QSL) name: ", qsl->Name());
+    detail("QSL total size: ", qsl->TotalSampleCount());
+    detail("QSL performance size*: ", qsl->PerformanceSampleCount());
+    detail("*TestSettings (performance_sample_count_override) can override");
+    detail("*Refer to Effective Settings for actual value");
+#endif
+  });
+
+  TestSettings test_settings = requested_settings;
+  // Look for Audit Config file to override TestSettings during audit
+  if (FileExists(audit_config_filename)) {
+    LogDetail([](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+      MLPERF_LOG_WARNING(detail, "warning_generic_message",
+                         "Found Audit Config file (audit.config)."
+                         " Overriding TestSettings from audit.config file.");
+#else
+      detail(
+          "Found Audit Config file (audit.config)."
+          " Overriding TestSettings from audit.config file.");
+#endif
+    });
+    std::string audit_scenario = loadgen::ToString(test_settings.scenario);
+    // Remove Spaces from the string
+    RemoveValue(&audit_scenario, ' ');
+    const std::string generic_model = "*";
+    test_settings.FromConfig(audit_config_filename, generic_model,
+                             audit_scenario, 2);
+  }
+  if (test_settings.test05) {
+    // If the configuration indicates we are running test05,
+    // random seeds
+    LogDetail([](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+      MLPERF_LOG_WARNING(detail, "warning_generic_message",
+                         "Test05 flag detected"
+                         " Overriding random seeds");
+#else
+      detail(
+          "Test05 flag detected"
+          " Overriding random seeds");
+#endif
+    });
+    test_settings.mode = TestMode::PerformanceOnly;
+    test_settings.qsl_rng_seed = requested_settings.test05_qsl_rng_seed;
+    test_settings.sample_index_rng_seed =
+        requested_settings.test05_sample_index_rng_seed;
+    test_settings.schedule_rng_seed =
+        requested_settings.test05_schedule_rng_seed;
+  }
+
+  loadgen::TestSettingsInternal sanitized_settings(
+      test_settings, qsl->PerformanceSampleCount());
+  sanitized_settings.LogAllSettings();
+
+  auto run_funcs = loadgen::RunFunctions::Get(sanitized_settings.scenario);
+
+  loadgen::SequenceGen sequence_gen;
+  switch (sanitized_settings.mode) {
+    case TestMode::SubmissionRun:
+      run_funcs.accuracy(sut, qsl, sanitized_settings, &sequence_gen);
+      run_funcs.performance(sut, qsl, sanitized_settings, &sequence_gen);
+      break;
+    case TestMode::AccuracyOnly:
+      run_funcs.accuracy(sut, qsl, sanitized_settings, &sequence_gen);
+      break;
+    case TestMode::PerformanceOnly:
+      run_funcs.performance(sut, qsl, sanitized_settings, &sequence_gen);
+      break;
+    case TestMode::FindPeakPerformance:
+      run_funcs.find_peak_performance(sut, qsl, sanitized_settings,
+                                      &sequence_gen);
+      break;
+  }
+
+  loadgen::IssueQueryController::GetInstance().EndThreads();
+
+  // Stop tracing after logging so all logs are captured in the trace.
+  GlobalLogger().StopLogging();
+  GlobalLogger().StopTracing();
+  GlobalLogger().StopIOThread();
+}
+
+void AbortTest() {
+  loadgen::IssueQueryController::GetInstance().EndThreads();
+  GlobalLogger().StopLogging();
+  GlobalLogger().StopTracing();
+  GlobalLogger().StopIOThread();
+}
+
+void QuerySamplesComplete(QuerySampleResponse* responses, size_t response_count,
+                          const ResponseCallback& response_cb) {
+  PerfClock::time_point timestamp = PerfClock::now();
+
+  auto tracer = MakeScopedTracer(
+      [](AsyncTrace& trace) { trace("QuerySamplesComplete"); });
+
+  const QuerySampleResponse* end = responses + response_count;
+
+  // Notify first to unblock loadgen production ASAP.
+  for (QuerySampleResponse* response = responses; response < end; response++) {
+    loadgen::SampleMetadata* sample =
+        reinterpret_cast<loadgen::SampleMetadata*>(response->id);
+    loadgen::QueryMetadata* query = sample->query_metadata;
+    query->NotifyOneSampleCompleted(timestamp);
+  }
+
+  // Log samples.
+  for (QuerySampleResponse* response = responses; response < end; response++) {
+    loadgen::SampleMetadata* sample =
+        reinterpret_cast<loadgen::SampleMetadata*>(response->id);
+    loadgen::QueryMetadata* query = sample->query_metadata;
+    query->response_delegate->SampleComplete(sample, response, timestamp,
+                                             response_cb);
+  }
+  // PerfClock::time_point end_timestamp = PerfClock::now();
+  // mlperf::samples_overhead_acum += (end_timestamp - timestamp).count();
+}
+
+void FirstTokenComplete(QuerySampleResponse* responses, size_t response_count,
+                        const ResponseCallback& response_cb) {
+  PerfClock::time_point timestamp = PerfClock::now();
+
+  auto tracer =
+      MakeScopedTracer([](AsyncTrace& trace) { trace("FirstTokenComplete"); });
+
+  const QuerySampleResponse* end = responses + response_count;
+
+  // Log samples.
+  for (QuerySampleResponse* response = responses; response < end; response++) {
+    loadgen::SampleMetadata* sample =
+        reinterpret_cast<loadgen::SampleMetadata*>(response->id);
+    loadgen::QueryMetadata* query = sample->query_metadata;
+    query->response_delegate->TokenComplete(sample, response, timestamp,
+                                            response_cb);
+  }
+  // PerfClock::time_point end_timestamp = PerfClock::now();
+  // mlperf::tokens_overhead_acum += (end_timestamp - timestamp).count();
+}
+
+}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen.h
new file mode 100644
index 000000000..84e02656c
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen.h
@@ -0,0 +1,103 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Provides the entry points for a SUT to start a test and respond
+/// to issued queries.
+
+#ifndef MLPERF_LOADGEN_LOADGEN_H_
+#define MLPERF_LOADGEN_LOADGEN_H_
+
+#include <cstddef>
+#include <functional>
+#include <numeric>
+#include <string>
+
+/// \brief Contains the loadgen API.
+namespace mlperf {
+
+struct QuerySampleResponse;
+class QuerySampleLibrary;
+class SystemUnderTest;
+struct TestSettings;
+struct LogSettings;
+
+using ResponseCallback = std::function<void(QuerySampleResponse*)>;
+
+/// \addtogroup LoadgenAPI Loadgen API
+/// @{
+
+///
+/// \brief SUT calls this to notify loadgen of completed samples.
+/// \details
+/// * The samples may be from any combination of queries or partial queries as
+///   issued by \link mlperf::SystemUnderTest::IssueQuery
+///
+///   SystemUnderTest::IssueQuery \endlink.
+/// * The SUT is responsible for owning and allocating the reponse data. The
+///   loadgen will copy the response data if needed (e.g. for accuracy mode).
+///   + If no response callback is provided, the response data must remain valid
+///     for the entire duration of this call.
+///   + The response callback is untimed; it is called for each response in
+///     responses after the loadgen records the completion time and before the
+///     loadgen copies the response data. The response callback enables the
+///     loadgen to simulate response data being stored in accelerator DRAM.
+///     After the response callback is called, response data must reside on the
+///     host so that the loadgen can copy it. Submitters must seek prior
+///     approval to use this feature of loadgen (refer to
+///     https://github.com/mlcommons/inference_policies/blob/master/inference_rules.adoc#5-load-generator).
+/// * All calls to QuerySampleComplete are thread-safe and wait-free bounded.
+///   + Any number of threads can call QuerySampleComplete simultaneously.
+///   + Regardless of where any other thread stalls, the current thread will
+///     finish QuerySampleComplete in a bounded number of cycles.
+///   + Note: If a callback is provided, the SUT must ensure that the callback
+///     is also thread-safe and wait-free bounded for the above to hold.
+void QuerySamplesComplete(QuerySampleResponse* responses, size_t response_count,
+                          const ResponseCallback& response_cb = {});
+
+void FirstTokenComplete(QuerySampleResponse* responses, size_t response_count,
+                        const ResponseCallback& response_cb = {});
+
+///
+/// \brief Starts the test against SUT with the specified settings.
+/// \details This is the C++ entry point. See mlperf::c::StartTest for the
+/// C entry point.
+///
+void StartTest(SystemUnderTest* sut, QuerySampleLibrary* qsl,
+               const TestSettings& requested_settings,
+               const LogSettings& log_settings,
+               const std::string audit_config_filename = "audit.config");
+
+///
+/// \brief Aborts the running test.
+/// \details This function will stop issueing new samples to the SUT. StartTest
+/// will return after the current inference finishes. Since StartTest is a
+/// blocking function, this function can only be called in another thread.
+void AbortTest();
+
+///
+/// \brief Register a thread for query issuing in Server scenario.
+/// \details If a thread registers itself, the thread(s) is used to call SUT's
+/// IssueQuery(). This function is blocking until the entire test is done. The
+/// number of registered threads must match server_num_issue_query_threads in
+/// TestSettings. This function only has effect in Server scenario.
+/// This is the C++ entry point. See mlperf::c::RegisterIssueQueryThread for the
+/// C entry point.
+///
+void RegisterIssueQueryThread();
+// inline long long samples_overhead_acum;
+// inline long long tokens_overhead_acum;
+/// @}
+
+}  // namespace mlperf
+
+#endif  // MLPERF_LOADGEN_LOADGEN_H_
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen_integration_diagram.svg b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen_integration_diagram.svg
new file mode 100644
index 000000000..17dd1b481
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen_integration_diagram.svg
@@ -0,0 +1,85 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created by diasvg.py -->
+<svg width="29.100cm" height="12.100cm" viewBox="1.950 0.950 29.100 12.100"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:xlink="http://www.w3.org/1999/xlink">
+<!-- Layer: Background -->
+<g id="Background">
+<!-- Flowchart - Box -->
+<rect x="2.000" y="2.000" width="8.000" height="2.000" fill="#FFFFFF" stroke="#000000" stroke-width="0.100"  rx="0.000" />
+<text x="6.000" y="3.195"  fill="#000000" text-anchor="middle" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
+Model + Dataset</text>
+<!-- Flowchart - Box -->
+<rect x="2.000" y="6.000" width="8.000" height="2.000" fill="#FFFFFF" stroke="#000000" stroke-width="0.100"  rx="0.000" />
+<text x="6.000" y="7.195"  fill="#000000" text-anchor="middle" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
+Pre Processor</text>
+<!-- Flowchart - Box -->
+<rect x="24.000" y="6.000" width="7.000" height="2.000" fill="#FFFFFF" stroke="#000000" stroke-width="0.100"  rx="0.000" />
+<text x="27.500" y="7.195"  fill="#000000" text-anchor="middle" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
+Post Processor</text>
+<!-- Flowchart - Box -->
+<rect x="13.000" y="1.000" width="8.000" height="8.000" fill="#FFFFFF" stroke="#000000" stroke-width="0.100"  rx="0.000" />
+<text x="17.000" y="5.195"  fill="#000000" text-anchor="middle" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
+Benchmark</text>
+<!-- Flowchart - Box -->
+<rect x="24.000" y="1.000" width="7.000" height="4.000" fill="#FFFFFF" stroke="#000000" stroke-width="0.100"  rx="0.000" />
+<text x="27.500" y="3.195"  fill="#000000" text-anchor="middle" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
+Backend</text>
+<!-- Flowchart - Box -->
+<rect x="13.000" y="11.000" width="8.000" height="2.000" fill="#FFFFFF" stroke="#000000" stroke-width="0.100"  rx="0.000" />
+<text x="17.000" y="12.195"  fill="#000000" text-anchor="middle" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
+LoadGen</text>
+<!-- Standard - Line -->
+<line x1="10.000" y1="3.000" x2="12.513" y2="3.000" stroke="#000000" stroke-width="0.100" />
+<polygon fill="#000000" stroke="#000000" stroke-width="0.100"  points="12.888,3.000 12.388,3.250 12.513,3.000 12.388,2.750 "/>
+<!-- Standard - Line -->
+<line x1="10.000" y1="7.000" x2="12.513" y2="7.000" stroke="#000000" stroke-width="0.100" />
+<polygon fill="#000000" stroke="#000000" stroke-width="0.100"  points="12.888,7.000 12.388,7.250 12.513,7.000 12.388,6.750 "/>
+<!-- Standard - Line -->
+<line x1="15.000" y1="9.000" x2="15.000" y2="10.513" stroke="#000000" stroke-width="0.100" />
+<polygon fill="#000000" stroke="#000000" stroke-width="0.100"  points="15.000,10.888 14.750,10.388 15.000,10.513 15.250,10.388 "/>
+<!-- Standard - Line -->
+<line x1="17.000" y1="11.000" x2="17.000" y2="9.487" stroke="#000000" stroke-width="0.100" />
+<polygon fill="#000000" stroke="#000000" stroke-width="0.100"  points="17.000,9.112 17.250,9.612 17.000,9.487 16.750,9.612 "/>
+<!-- Standard - Line -->
+<line x1="19.000" y1="9.000" x2="19.000" y2="10.513" stroke="#000000" stroke-width="0.100" />
+<polygon fill="#000000" stroke="#000000" stroke-width="0.100"  points="19.000,10.888 18.750,10.388 19.000,10.513 19.250,10.388 "/>
+<!-- Standard - Line -->
+<line x1="21.000" y1="3.000" x2="23.513" y2="3.000" stroke="#000000" stroke-width="0.100" />
+<polygon fill="#000000" stroke="#000000" stroke-width="0.100"  points="23.888,3.000 23.388,3.250 23.513,3.000 23.388,2.750 "/>
+<!-- Standard - Line -->
+<line x1="24.000" y1="7.000" x2="21.487" y2="7.000" stroke="#000000" stroke-width="0.100" />
+<polygon fill="#000000" stroke="#000000" stroke-width="0.100"  points="21.112,7.000 21.612,6.750 21.487,7.000 21.612,7.250 "/>
+<!-- Standard - Text -->
+<text x="10.850" y="2.650"  fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
+1</text>
+<!-- Standard - Text -->
+<!-- Standard - Text -->
+<text x="14.100" y="9.950"  fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
+2</text>
+<!-- Standard - Text -->
+<text x="16.100" y="10.450"  fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
+3</text>
+<!-- Standard - Text -->
+<text x="18.100" y="9.950"  fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
+5</text>
+<!-- Standard - Text -->
+<text x="21.950" y="2.650"  fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
+4</text>
+<!-- Standard - Text -->
+<!-- Standard - Text -->
+<text x="24.500" y="12.221"  fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
+LoadGen Logs</text>
+<!-- Standard - Line -->
+<line x1="21.000" y1="12.000" x2="23.513" y2="12.000" stroke="#000000" stroke-width="0.100" />
+<polygon fill="#000000" stroke="#000000" stroke-width="0.100"  points="23.888,12.000 23.388,12.250 23.513,12.000 23.388,11.750 "/>
+<!-- Standard - Text -->
+<text x="21.900" y="11.450"  fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
+6</text>
+<!-- Standard - Text -->
+<!-- Standard - Text -->
+<!-- Standard - Line -->
+<line x1="27.500" y1="5.000" x2="27.500" y2="5.513" stroke="#000000" stroke-width="0.100" />
+<polygon fill="#000000" stroke="#000000" stroke-width="0.100"  points="27.500,5.888 27.250,5.388 27.500,5.513 27.750,5.388 "/>
+</g>
+</svg>
\ No newline at end of file
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/logging.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/logging.cc
new file mode 100644
index 000000000..807c1954a
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/logging.cc
@@ -0,0 +1,1301 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Implements a logging system with a central IO thread that handles
+/// all stringification and IO.
+/// \details Log-producing threads only submit lambdas to be executed on the
+/// IO thread.
+/// All producers and consumers use lock-free operations that guarantee
+/// forward progress independent of a) other stalled threads and b) where
+/// those threads are stalled.
+/// Each thread uses a double-buffering scheme to queue its logs. One buffer
+/// is always reserved for writes and the other is reserved for reads.
+/// A producing thread sends requests to the IOThread to swap the buffers
+/// and the IOThread does the actual read/write swap after it has finished
+/// reading the buffer it was working on.
+
+#include "logging.h"
+
+#include <cassert>
+#include <cmath>
+#include <future>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <sstream>
+
+#if defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(WIN64)
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <process.h>
+#include <windows.h>
+#define MLPERF_GET_PID() _getpid()
+#else
+#include <unistd.h>
+#define MLPERF_GET_PID() getpid()
+#endif
+
+// Use system-level TID for tracing. This enables correlation with other
+// performance tools that are not aware of C++ std::thread::id.
+#if defined(__linux__)
+#include <sys/syscall.h>
+#define MLPERF_GET_TID() syscall(SYS_gettid)
+#elif defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(WIN64)
+#define MLPERF_GET_TID() GetCurrentThreadId()
+#elif defined(__APPLE__)
+#define MLPERF_GET_TID() \
+  std::hash<std::thread::id>{}(std::this_thread::get_id())
+#else
+// TODO: std::this_thread::id is a class but MLPERF_GET_TID() assigned to
+// uint64_t
+#define MLPERF_GET_TID() std::this_thread::get_id()
+#endif
+
+#include "utils.h"
+
+namespace mlperf {
+namespace logging {
+
+namespace {
+
+uintptr_t SwapRequestSlotIsWritableValue(size_t id) {
+  // LSB of 1 indicates that this isn't a pointer.
+  // MSBs encode the id to detect collisions when a slot in
+  // |thread_swap_request_slots_| is reused for a different id and the request
+  // for the previous id is very slow.
+  return (id << 1) | 0x1;
+}
+
+bool SwapRequestSlotIsReadable(uintptr_t value) {
+  // Valid pointers will not have their lsb set.
+  return (value & 0x1) != 0x1;
+}
+
+constexpr size_t kMaxThreadsToLog = 1024;
+constexpr std::chrono::milliseconds kLogPollPeriod(10);
+
+/// \brief How many log entries to pre-allocate per thread to help avoid
+/// runtime allocation.
+constexpr size_t kTlsLogReservedEntryCount = 1024;
+
+constexpr auto kInvalidLatency = std::numeric_limits<QuerySampleLatency>::min();
+constexpr auto nTokenInvalid = std::numeric_limits<int64_t>::min();
+
+}  // namespace
+
+const std::string& ArgValueTransform(const bool& value) {
+  static const std::string v_true("true");
+  static const std::string v_false("false");
+  return value ? v_true : v_false;
+}
+
+char Bin2Hex(uint8_t four_bits) {
+  char number = '0' + four_bits;
+  char letter = ('A' - 10) + four_bits;
+  return four_bits < 10 ? number : letter;
+}
+
+const std::string ArgValueTransform(const LogBinaryAsHexString& value) {
+  if (value.data == nullptr) {
+    return "\"\"";
+  }
+  std::string hex;
+  hex.reserve(value.data->size() + 2);
+  hex.push_back('"');
+  for (auto b : *value.data) {
+    hex.push_back(Bin2Hex(b >> 4));
+    hex.push_back(Bin2Hex(b & 0x0F));
+  }
+  hex.push_back('"');
+  return hex;
+}
+
+#if USE_NEW_LOGGING_FORMAT
+const std::string ArgValueTransform(const std::string& value) {
+  return std::string("\"") + value + std::string("\"");
+}
+
+const std::string ArgValueTransform(const char* value) {
+  return std::string("\"") + std::string(value) + std::string("\"");
+}
+
+const std::string ArgValueTransform(const std::vector<size_t>& value) {
+  std::string s("[");
+  for (auto i : value) {
+    s += std::to_string(i) + ",";
+  }
+  s.resize(s.size() - 1);
+  s += "]";
+  return s;
+}
+
+const std::string ArgValueTransform(
+    const std::map<std::string, std::string>& value) {
+  std::string s("{");
+  for (const auto& i : value) {
+    s += "\"";
+    s += i.first;
+    s += "\":\"";
+    s += i.second;
+    s += "\",";
+  }
+  s.resize(s.size() - 1);
+  s += "}";
+  return s;
+}
+
+const std::string ArgValueTransform(const float value) {
+  if (value == std::numeric_limits<float>::infinity()) {
+    return "Infinity";
+  } else if (value == -std::numeric_limits<float>::infinity()) {
+    return "-Infinity";
+  } else if (std::isnan(value)) {
+    return "NaN";
+  }
+  return std::to_string(value);
+}
+
+const std::string ArgValueTransform(const double value) {
+  if (value == std::numeric_limits<double>::infinity()) {
+    return "Infinity";
+  } else if (value == -std::numeric_limits<double>::infinity()) {
+    return "-Infinity";
+  } else if (std::isnan(value)) {
+    return "NaN";
+  }
+  return std::to_string(value);
+}
+#endif
+
+ChromeTracer::ChromeTracer(std::ostream* out, PerfClock::time_point origin)
+    : out_(out), origin_(origin) {
+  WriteTraceEventHeader();
+}
+
+ChromeTracer::~ChromeTracer() {
+  WriteTraceEventFooter();
+  out_->flush();
+}
+
+void ChromeTracer::WriteTraceEventHeader() {
+  // Times and durations are converted from nanoseconds to microseconds, use
+  // 3 decimal digits to preserve precision.
+  *out_ << std::fixed << std::setprecision(3) << "{\"traceEvents\":[\n";
+}
+
+void ChromeTracer::WriteTraceEventFooter() {
+  *out_ << "{\"name\":\"LastTrace\"}\n"
+        << "],\n"
+        << "\"displayTimeUnit\":\"ns\",\n"
+        << "\"otherData\":{\n"
+        << "\"ts\":" << Micros(origin_.time_since_epoch()).count() << ",\n"
+        << "\"version\":\"MLPerf LoadGen v1.0\"\n"
+        << "}\n"
+        << "}\n";
+}
+
+void AsyncLog::SetCurrentPidTid(uint64_t pid, uint64_t tid) {
+  current_pid_ = pid;
+  current_tid_ = tid;
+}
+
+void AsyncLog::SetLogFiles(std::ostream* summary, std::ostream* detail,
+                           std::ostream* accuracy, bool copy_detail_to_stdout,
+                           bool copy_summary_to_stdout,
+                           PerfClock::time_point log_origin) {
+  std::unique_lock<std::mutex> lock(log_mutex_);
+  if (summary_out_ != &std::cerr) {
+    std::string warning_summary;
+    if (log_warning_count_ == 0) {
+      warning_summary = "\nNo warnings encountered during test.\n";
+    } else if (log_warning_count_ == 1) {
+      warning_summary = "\n1 warning encountered. See detailed log.\n";
+    } else if (log_warning_count_ != 0) {
+      warning_summary = "\n" + std::to_string(log_warning_count_) +
+                        " warnings encountered. See detailed log.\n";
+    }
+
+    std::string error_summary;
+    if (log_error_count_ == 0) {
+      error_summary = "\nNo errors encountered during test.\n";
+    } else if (log_error_count_ == 1) {
+      error_summary = "\n1 ERROR encountered. See detailed log.\n";
+    } else if (log_error_count_ != 0) {
+      error_summary = "\n" + std::to_string(log_error_count_) +
+                      " ERRORS encountered. See detailed log.\n";
+    }
+
+    *summary_out_ << warning_summary << error_summary;
+    if (copy_summary_to_stdout_) {
+      std::cout << warning_summary << error_summary;
+    }
+  }
+  if (summary_out_) {
+    summary_out_->flush();
+  }
+  if (detail_out_) {
+    detail_out_->flush();
+  }
+  if (accuracy_out_ != &std::cerr) {
+    WriteAccuracyFooterLocked();
+    accuracy_out_->flush();
+  }
+  summary_out_ = summary;
+  detail_out_ = detail;
+  accuracy_out_ = accuracy;
+  if (accuracy_out_ != &std::cerr) {
+    WriteAccuracyHeaderLocked();
+  }
+  copy_detail_to_stdout_ = copy_detail_to_stdout;
+  copy_summary_to_stdout_ = copy_summary_to_stdout;
+  log_origin_ = log_origin;
+  log_error_count_ = 0;
+  log_warning_count_ = 0;
+}
+
+void AsyncLog::StartNewTrace(std::ostream* trace_out,
+                             PerfClock::time_point origin) {
+  std::unique_lock<std::mutex> lock(trace_mutex_);
+  if (trace_out) {
+    tracer_ = std::make_unique<ChromeTracer>(trace_out, origin);
+  } else {
+    tracer_.reset();
+  }
+}
+
+void AsyncLog::StopTrace() {
+  std::unique_lock<std::mutex> lock(trace_mutex_);
+  tracer_.reset();
+}
+
+void AsyncLog::LogAccuracy(uint64_t seq_id, const QuerySampleIndex qsl_idx,
+                           const LogBinaryAsHexString& response,
+                           int64_t n_tokens = 0) {
+  std::unique_lock<std::mutex> lock(log_mutex_);
+  if (!accuracy_out_) {
+    return;
+  }
+  *accuracy_out_ << (accuracy_needs_comma_ ? ",\n{ " : "\n{ ");
+  if (!use_tokens_) {
+    LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "data",
+            response);
+  } else if (!needs_first_token_) {
+    LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "data",
+            response, "token_count", n_tokens);
+  } else {
+    const size_t i = seq_id - latencies_first_sample_sequence_id_;
+    LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "data",
+            response, "token_data", token_records_[i], "token_count", n_tokens);
+  }
+
+  *accuracy_out_ << " }";
+  accuracy_needs_comma_ = true;
+}
+
+void AsyncLog::CacheToken(uint64_t seq_id,
+                          const LogBinaryAsHexString& response) {
+  std::unique_lock<std::mutex> lock(token_record_mutex_);
+  const size_t i = seq_id - latencies_first_sample_sequence_id_;
+  if (token_records_.size() <= i) {
+    token_records_.resize(i + 1);
+  }
+  token_records_[i] = response;
+}
+
+void AsyncLog::Flush() {
+  {
+    std::unique_lock<std::mutex> lock(log_mutex_);
+    if (summary_out_) {
+      summary_out_->flush();
+    }
+    if (detail_out_) {
+      detail_out_->flush();
+    }
+    if (accuracy_out_) {
+      accuracy_out_->flush();
+    }
+  }
+
+  {
+    std::unique_lock<std::mutex> lock(trace_mutex_);
+    if (tracer_) {
+      tracer_->Flush();
+    }
+  }
+}
+
+void AsyncLog::WriteAccuracyHeaderLocked() {
+  *accuracy_out_ << "[";
+  accuracy_needs_comma_ = false;
+}
+
+void AsyncLog::WriteAccuracyFooterLocked() { *accuracy_out_ << "\n]\n"; }
+
+void AsyncLog::RestartLatencyRecording(uint64_t first_sample_sequence_id,
+                                       size_t latencies_to_reserve) {
+  std::unique_lock<std::mutex> lock(latencies_mutex_);
+  assert(latencies_.empty());
+  assert(latencies_recorded_ == latencies_expected_);
+  latencies_recorded_ = 0;
+  latencies_expected_ = 0;
+  max_latency_ = 0;
+  max_completion_timstamp_ = PerfClock::now();
+  latencies_first_sample_sequence_id_ = first_sample_sequence_id;
+  latencies_.reserve(latencies_to_reserve);
+  token_latencies_.reserve(latencies_to_reserve);
+  tokens_per_sample_.reserve(latencies_to_reserve);
+  time_per_output_token_.reserve(latencies_to_reserve);
+}
+
+void AsyncLog::RecordSampleCompletion(uint64_t sample_sequence_id,
+                                      PerfClock::time_point completion_time,
+                                      QuerySampleLatency latency,
+                                      int64_t n_tokens = 0) {
+  std::unique_lock<std::mutex> lock(latencies_mutex_);
+
+  max_latency_ = std::max(max_latency_, latency);
+
+  max_completion_timstamp_ =
+      std::max(max_completion_timstamp_, completion_time);
+
+  if (sample_sequence_id < latencies_first_sample_sequence_id_) {
+    // Call LogErrorSync here since this kind of error could result in a
+    // segfault in the near future.
+#if USE_NEW_LOGGING_FORMAT
+    std::stringstream ss;
+    ss << "Received completion for an old sample."
+       << " Min expected id: " << latencies_first_sample_sequence_id_
+       << " Actual id: " << sample_sequence_id;
+    MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", ss.str());
+#else
+    GlobalLogger().LogErrorSync(
+        "Received completion for an old sample.", "Min expected id",
+        latencies_first_sample_sequence_id_, "Actual id", sample_sequence_id);
+#endif
+    return;
+  }
+
+  const size_t i = sample_sequence_id - latencies_first_sample_sequence_id_;
+
+  if (latencies_.size() <= i) {
+    // TODO: Reserve in advance.
+    latencies_.resize(i + 1, kInvalidLatency);
+  } else if (latencies_[i] != kInvalidLatency) {
+    // Call LogErrorSync here since this kind of error could result in a
+    // segfault in the near future.
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
+                          "Attempted to complete a sample twice.");
+#else
+    GlobalLogger().LogErrorSync("Attempted to complete a sample twice.");
+#endif
+
+    // Return without recording the latency again to avoid potentially
+    // ending the test before the SUT is actually done, which could result
+    // in a segfault.
+    // If the SUT recorded the wrong sample, the test will hang and see
+    // the error above.
+    return;
+  }
+
+  if (use_tokens_) {
+    if (needs_first_token_ && (token_latencies_.size() <= i)) {
+      MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
+                            "Attempted to record a sample latency before it's "
+                            "first token latency");
+    } else if (needs_first_token_ && (token_latencies_[i] == kInvalidLatency)) {
+      MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
+                            "Attempted to record a sample latency before it's "
+                            "first token latency");
+    }
+
+    if (tokens_per_sample_.size() <= i) {
+      // TODO: Reserve in advance.
+      tokens_per_sample_.resize(i + 1, nTokenInvalid);
+    } else if (tokens_per_sample_[i] != nTokenInvalid) {
+      // Call LogErrorSync here since this kind of error could result in a
+      // segfault in the near future.
+#if USE_NEW_LOGGING_FORMAT
+      MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
+                            "Attempted to complete a sample twice.");
+#else
+      GlobalLogger().LogErrorSync("Attempted to complete a sample twice.");
+#endif
+
+      // Return without recording the latency again to avoid potentially
+      // ending the test before the SUT is actually done, which could result
+      // in a segfault.
+      // If the SUT recorded the wrong sample, the test will hang and see
+      // the error above.
+      return;
+    }
+    if (n_tokens == 0) {
+      MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
+                            "n_tokens argument missing or attempted to record "
+                            "0 as number of tokens");
+    } else if (n_tokens < 0) {
+      MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
+                            "Attempted to record a negative number of tokens");
+      n_tokens = 0;
+    } else if (n_tokens == 1) {
+      MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
+                            "Number of tokens need to be greater than 1");
+      n_tokens = 0;
+    }
+    if (time_per_output_token_.size() <= i) {
+      time_per_output_token_.resize(i + 1, kInvalidLatency);
+    } else if (time_per_output_token_[i] != kInvalidLatency) {
+      // Call LogErrorSync here since this kind of error could result in a
+      // segfault in the near future.
+#if USE_NEW_LOGGING_FORMAT
+      MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
+                            "Attempted to complete a sample twice.");
+#else
+      GlobalLogger().LogErrorSync("Attempted to complete a sample twice.");
+#endif
+
+      // Return without recording the latency again to avoid potentially
+      // ending the test before the SUT is actually done, which could result
+      // in a segfault.
+      // If the SUT recorded the wrong sample, the test will hang and see
+      // the error above.
+      return;
+    }
+    tokens_per_sample_[i] = n_tokens;
+    time_per_output_token_[i] =
+        (latency - token_latencies_[i]) / (n_tokens - 1);
+  }
+  latencies_[i] = latency;
+  latencies_recorded_++;
+  if (AllLatenciesRecorded()) {
+    all_latencies_recorded_.notify_all();
+  }
+}
+
+void AsyncLog::RecordTokenCompletion(uint64_t sample_sequence_id,
+                                     PerfClock::time_point completion_time,
+                                     QuerySampleLatency latency) {
+  std::unique_lock<std::mutex> lock(token_latencies_mutex_);
+  // std::unique_lock<std::mutex> lock(latencies_mutex_);
+  // max_latency_ = std::max(max_latency_, latency);
+
+  // max_completion_timstamp_ =
+  //     std::max(max_completion_timstamp_, completion_time);
+
+  if (sample_sequence_id < latencies_first_sample_sequence_id_) {
+    // Call LogErrorSync here since this kind of error could result in a
+    // segfault in the near future.
+#if USE_NEW_LOGGING_FORMAT
+    std::stringstream ss;
+    ss << "Received completion for an old sample."
+       << " Min expected id: " << latencies_first_sample_sequence_id_
+       << " Actual id: " << sample_sequence_id;
+    MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", ss.str());
+#else
+    GlobalLogger().LogErrorSync(
+        "Received completion for an old sample.", "Min expected id",
+        latencies_first_sample_sequence_id_, "Actual id", sample_sequence_id);
+#endif
+    return;
+  }
+
+  const size_t i = sample_sequence_id - latencies_first_sample_sequence_id_;
+
+  if (latencies_.size() > i) {
+    if (latencies_[i] != kInvalidLatency) {
+#if USE_NEW_LOGGING_FORMAT
+      MLPERF_LOG_ERROR_SYNC(
+          GlobalLogger(), "error_runtime",
+          "Attempted to record token latency after sample was completed");
+#else
+      GlobalLogger().LogErrorSync(
+          "Attempted to record token latency after sample was completed");
+#endif
+
+      // Return without recording the latency again to avoid potentially
+      // ending the test before the SUT is actually done, which could result
+      // in a segfault.
+      // If the SUT recorded the wrong sample, the test will hang and see
+      // the error above.
+      return;
+    }
+  }
+  if (token_latencies_.size() <= i) {
+    // TODO: Reserve in advance.
+    token_latencies_.resize(i + 1, kInvalidLatency);
+  } else if (token_latencies_[i] != kInvalidLatency) {
+    // Call LogErrorSync here since this kind of error could result in a
+    // segfault in the near future.
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
+                          "Attempted to complete a sample twice.");
+#else
+    GlobalLogger().LogErrorSync("Attempted to complete a sample twice.");
+#endif
+
+    // Return without recording the latency again to avoid potentially
+    // ending the test before the SUT is actually done, which could result
+    // in a segfault.
+    // If the SUT recorded the wrong sample, the test will hang and see
+    // the error above.
+    return;
+  }
+  token_latencies_[i] = latency;
+}
+
+std::vector<QuerySampleLatency> AsyncLog::GetLatenciesBlocking(
+    size_t expected_count) {
+  std::vector<QuerySampleLatency> latencies;
+  {
+    std::unique_lock<std::mutex> lock(latencies_mutex_);
+    latencies_expected_ = expected_count;
+    all_latencies_recorded_.wait(lock, [&] { return AllLatenciesRecorded(); });
+    latencies.swap(latencies_);
+  }
+
+  if (latencies.size() != expected_count) {
+    // Call LogErrorSync here since this kind of error could result in a
+    // segfault in the near future.
+#if USE_NEW_LOGGING_FORMAT
+    std::stringstream ss;
+    ss << "Received SequenceId that was too large."
+       << " expected_size: " << expected_count
+       << " actual_size: " << latencies.size();
+    MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", ss.str());
+#else
+    GlobalLogger().LogErrorSync("Received SequenceId that was too large.",
+                                "expected_size", expected_count, "actual_size",
+                                latencies.size());
+#endif
+  }
+
+  size_t invalid_latency_count = 0;
+  for (auto l : latencies) {
+    if (l == kInvalidLatency) {
+      invalid_latency_count++;
+    }
+  }
+  if (invalid_latency_count != 0) {
+    // Call LogErrorSync here since this kind of error could result in a
+    // segfault in the near future.
+#if USE_NEW_LOGGING_FORMAT
+    std::stringstream ss;
+    ss << "Encountered incomplete samples at the end of a series of queries."
+       << " count: " << invalid_latency_count;
+    MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", ss.str());
+#else
+    GlobalLogger().LogErrorSync(
+        "Encountered incomplete samples at the end of a series of queries.",
+        "count", invalid_latency_count);
+#endif
+  }
+
+  return latencies;
+}
+
+std::vector<QuerySampleLatency> AsyncLog::GetTokenLatencies(
+    size_t expected_count) {
+  std::vector<QuerySampleLatency> token_latencies;
+  token_latencies.swap(token_latencies_);
+  return token_latencies;
+}
+
+std::vector<QuerySampleLatency> AsyncLog::GetTimePerOutputToken(
+    size_t expected_count) {
+  std::vector<QuerySampleLatency> tpot_latencies;
+  tpot_latencies.swap(time_per_output_token_);
+  return tpot_latencies;
+}
+
+std::vector<int64_t> AsyncLog::GetTokensPerSample(size_t expected_count) {
+  std::vector<int64_t> tokens_per_sample;
+  tokens_per_sample.swap(tokens_per_sample_);
+  return tokens_per_sample;
+}
+
+PerfClock::time_point AsyncLog::GetMaxCompletionTime() {
+  return max_completion_timstamp_;
+}
+
+QuerySampleLatency AsyncLog::GetMaxLatencySoFar() {
+  std::unique_lock<std::mutex> lock(latencies_mutex_);
+  return max_latency_;
+}
+
+void AsyncLog::SetUseTokens(bool use_tokens) { use_tokens_ = use_tokens; }
+
+void AsyncLog::SetNeedsFirstToken(bool needs_first_token) {
+  needs_first_token_ = needs_first_token;
+}
+
+/// \brief Records a single thread using thread-local storage and submits
+/// entries to the central Logger.
+///
+/// \details This setup allows for each log entry to be added:
+///   * With forward-progress guarantees. (i.e.: no locking or blocking
+///       operations even if other threads have stalled.)
+///   * Without expensive syscalls or I/O operations, which are deferred to
+///       the central Logger.
+class TlsLogger {
+ public:
+  TlsLogger(std::function<void()> forced_detatch);
+  ~TlsLogger();
+  void ForcedDetatchFromThread() { forced_detatch_(); }
+
+  void Log(AsyncLogEntry&& entry);
+  void SwapBuffers();
+
+  std::vector<AsyncLogEntry>* StartReadingEntries();
+  void FinishReadingEntries();
+  bool ReadBufferHasBeenConsumed();
+  size_t MaxEntryVectorSize() { return max_entry_size_; }
+
+  uint64_t Pid() const { return pid_; }
+  uint64_t Tid() const { return tid_; }
+
+  void RequestSwapBuffersSlotRetried() {
+    swap_buffers_slot_retry_count_.fetch_add(1, std::memory_order_relaxed);
+  }
+
+  size_t ReportLogCasFailCount() {
+    size_t c = log_cas_fail_count_.load(std::memory_order_relaxed);
+    log_cas_fail_count_.fetch_sub(c, std::memory_order_relaxed);
+    return c;
+  }
+
+  size_t ReportSwapBuffersSlotRetryCount() {
+    size_t c = swap_buffers_slot_retry_count_.load(std::memory_order_relaxed);
+    swap_buffers_slot_retry_count_.fetch_sub(c, std::memory_order_relaxed);
+    return c;
+  }
+
+  void TraceCounters();
+
+ private:
+  using EntryVector = std::vector<AsyncLogEntry>;
+  enum class EntryState { Unlocked, ReadLock, WriteLock };
+
+  // Accessed by producer only.
+  size_t i_read_ = 0;
+
+  // Accessed by producer and consumer atomically.
+  EntryVector entries_[2];
+  std::atomic<EntryState> entry_states_[2]{{EntryState::ReadLock},
+                                           {EntryState::Unlocked}};
+  std::atomic<size_t> i_write_{1};
+
+  std::atomic<size_t> log_cas_fail_count_{0};
+  std::atomic<size_t> swap_buffers_slot_retry_count_{0};
+
+  // Accessed by consumer only.
+  size_t unread_swaps_ = 0;
+  size_t i_write_prev_ = 0;
+  uint64_t pid_;
+  uint64_t tid_;
+  size_t max_entry_size_ = kTlsLogReservedEntryCount;
+
+  std::function<void()> forced_detatch_;
+};
+
+Logger::Logger(std::chrono::duration<double> poll_period,
+               size_t max_threads_to_log)
+    : poll_period_(poll_period),
+      max_threads_to_log_(max_threads_to_log),
+      thread_swap_request_slots_(max_threads_to_log * 2) {
+  const size_t kSlotCount = max_threads_to_log * 2;
+  for (size_t i = 0; i < kSlotCount; i++) {
+    std::atomic_init(&thread_swap_request_slots_[i],
+                     SwapRequestSlotIsWritableValue(i));
+  }
+}
+
+Logger::~Logger() {
+  // TlsLoggers might outlive this Logger when loaded as a python module.
+  // Forcefully make all currently registered TlsLoggers orphans.
+  std::unique_lock<std::mutex> lock(tls_loggers_registerd_mutex_);
+  TlsLogger* tls_logger_prev = nullptr;
+  (void)tls_logger_prev;  // Avoid unused error in release builds.
+  while (!tls_loggers_registerd_.empty()) {
+    TlsLogger* tls_logger = *tls_loggers_registerd_.begin();
+    // Otherwise, this is an infinite loop.
+    assert(tls_logger != tls_logger_prev);
+    tls_loggers_registerd_mutex_.unlock();
+    tls_logger->ForcedDetatchFromThread();
+    tls_loggers_registerd_mutex_.lock();
+    tls_logger_prev = tls_logger;
+  }
+}
+
+void Logger::RequestSwapBuffers(TlsLogger* tls_logger) {
+  auto tls_logger_as_uint = reinterpret_cast<uintptr_t>(tls_logger);
+  assert(SwapRequestSlotIsReadable(tls_logger_as_uint));
+  size_t id, slot;
+  uintptr_t slot_is_writeable_value;
+  // The compare_exchange below should almost always succeed.
+  // The compare_exchange may fail if a recycled slot is still actively used
+  // by another thread, so we retry with subsequent slots here if needed.
+  // Since the slot count is 2x the expected number of threads to log,
+  // the CAS should only fail at most 50% of the time when all logging threads
+  // happen to be descheduled between the fetch_add and CAS below, which is
+  // very unlikely.
+  id = swap_request_id_.fetch_add(1, std::memory_order_relaxed);
+  slot = id % thread_swap_request_slots_.size();
+  slot_is_writeable_value = SwapRequestSlotIsWritableValue(id);
+  while (!thread_swap_request_slots_[slot].compare_exchange_strong(
+      slot_is_writeable_value, tls_logger_as_uint, std::memory_order_release)) {
+    id = swap_request_id_.fetch_add(1, std::memory_order_relaxed);
+    slot = id % thread_swap_request_slots_.size();
+    slot_is_writeable_value = SwapRequestSlotIsWritableValue(id);
+    tls_logger->RequestSwapBuffersSlotRetried();
+  }
+}
+
+void Logger::RegisterTlsLogger(TlsLogger* tls_logger) {
+  std::unique_lock<std::mutex> lock(tls_loggers_registerd_mutex_);
+  if (tls_loggers_registerd_.size() >= max_threads_to_log_) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG_ERROR_SYNC((*this), "error_runtime",
+                          "Warning: More TLS loggers registerd than can be "
+                          "active simultaneously.");
+#else
+    LogErrorSync(
+        "Warning: More TLS loggers registerd than can "
+        "be active simultaneously.\n");
+#endif
+  }
+  tls_loggers_registerd_.insert(tls_logger);
+}
+
+// This moves ownership of the tls_logger data to Logger so the
+// exiting thread can exit immediately, even if all the logs of the
+// exiting thread haven't been processed.
+void Logger::UnRegisterTlsLogger(std::unique_ptr<TlsLogger> tls_logger) {
+  OrphanContainer::iterator orphan;
+  {
+    std::unique_lock<std::mutex> lock(tls_logger_orphans_mutex_);
+    tls_logger_orphans_.emplace_front(std::move(tls_logger));
+    orphan = tls_logger_orphans_.begin();
+  }
+
+  // Only remove the TlsLogger from the registry after adding to orphans so
+  // CollectTlsLoggerStats doesn't have any gaps in coverage.
+  {
+    std::unique_lock<std::mutex> lock(tls_loggers_registerd_mutex_);
+    tls_loggers_registerd_.erase(orphan->get());
+  }
+
+  // This will flush the logs of |tls_logger| and mark it for destruction.
+  // Deferring destruction via orphans_to_destroy helps avoid use-after-frees
+  // when the IOThread calls FinishReadingEntries.
+  (*orphan)->Log([this, orphan](AsyncLog&) {
+    CollectTlsLoggerStats(orphan->get());
+    orphans_to_destroy_.push_back(orphan);
+  });
+}
+
+void Logger::CollectTlsLoggerStats(TlsLogger* tls_logger) {
+  tls_total_log_cas_fail_count_ += tls_logger->ReportLogCasFailCount();
+  tls_total_swap_buffers_slot_retry_count_ +=
+      tls_logger->ReportSwapBuffersSlotRetryCount();
+
+  size_t max_entry_vector_size = tls_logger->MaxEntryVectorSize();
+  if (max_entry_vector_size > kTlsLogReservedEntryCount) {
+#if USE_NEW_LOGGING_FORMAT
+    std::stringstream msg;
+    msg << "Logging allocation detected:" << " tid: " << tls_logger->Tid()
+        << " reserved_entries: " << kTlsLogReservedEntryCount
+        << " max_entries: " << max_entry_vector_size;
+    MLPERF_LOG_WARNING((*this), "warning_generic_message", msg.str());
+#else
+    async_logger_.FlagWarning();
+    async_logger_.LogDetail("Logging allocation detected: ", "tid",
+                            tls_logger->Tid(), "reserved_entries",
+                            kTlsLogReservedEntryCount, "max_entries",
+                            max_entry_vector_size);
+#endif
+  }
+}
+
+void Logger::StartIOThread() {
+  {
+    std::unique_lock<std::mutex> lock(io_thread_mutex_);
+    keep_io_thread_alive_ = true;
+  }
+  io_thread_ = std::thread(&Logger::IOThread, this);
+}
+
+void Logger::StopIOThread() {
+  {
+    std::unique_lock<std::mutex> lock(io_thread_mutex_);
+    keep_io_thread_alive_ = false;
+    io_thread_cv_.notify_all();
+  }
+  io_thread_.join();
+}
+
+void Logger::StartLogging(std::ostream* summary, std::ostream* detail,
+                          std::ostream* accuracy, bool copy_detail_to_stdout,
+                          bool copy_summary_to_stdout) {
+  async_logger_.SetLogFiles(summary, detail, accuracy, copy_detail_to_stdout,
+                            copy_summary_to_stdout, PerfClock::now());
+}
+
+void Logger::StopLogging() {
+  if (std::this_thread::get_id() == io_thread_.get_id()) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG_ERROR_SYNC((*this), "error_runtime",
+                          "StopLogging() not supported from IO thread.");
+#else
+    LogErrorSync("StopLogging() not supported from IO thread.");
+#endif
+    return;
+  }
+
+  // Flush logs from this thread.
+  std::promise<void> io_thread_flushed_this_thread;
+  Log([&](AsyncLog&) { io_thread_flushed_this_thread.set_value(); });
+  io_thread_flushed_this_thread.get_future().wait();
+  async_logger_.SetLogFiles(&std::cerr, &std::cerr, &std::cerr, false, false,
+                            PerfClock::now());
+}
+
+void Logger::StartNewTrace(std::ostream* trace_out,
+                           PerfClock::time_point origin) {
+  async_logger_.StartNewTrace(trace_out, origin);
+}
+
+void Logger::StopTracing() {
+  // Flush traces from this thread.
+  std::promise<void> io_thread_flushed_this_thread;
+  Log([&](AsyncLog&) { io_thread_flushed_this_thread.set_value(); });
+  io_thread_flushed_this_thread.get_future().wait();
+  async_logger_.StopTrace();
+}
+
+void Logger::LogContentionAndAllocations() {
+  LogDetail([&](AsyncDetail& detail) {
+    {
+      std::unique_lock<std::mutex> lock(tls_loggers_registerd_mutex_);
+      for (auto tls_logger : tls_loggers_registerd_) {
+        CollectTlsLoggerStats(tls_logger);
+      }
+    }
+
+    {
+      std::unique_lock<std::mutex> lock(tls_logger_orphans_mutex_);
+      for (auto& orphan : tls_logger_orphans_) {
+        CollectTlsLoggerStats(orphan.get());
+      }
+    }
+
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG(detail, "logger_swap_request_slots_retry_count",
+               swap_request_slots_retry_count_);
+    MLPERF_LOG(detail, "logger_swap_request_slots_retry_retry_count",
+               swap_request_slots_retry_retry_count_);
+    MLPERF_LOG(detail, "logger_swap_request_slots_retry_reencounter_count",
+               swap_request_slots_retry_reencounter_count_);
+    MLPERF_LOG(detail, "logger_start_reading_entries_retry_count",
+               start_reading_entries_retry_count_);
+    MLPERF_LOG(detail, "logger_tls_total_log_cas_fail_count",
+               tls_total_log_cas_fail_count_);
+    MLPERF_LOG(detail, "logger_tls_total_swap_buffers_slot_retry_count",
+               tls_total_swap_buffers_slot_retry_count_);
+#else
+    detail("Log Contention Counters:");
+    detail(std::to_string(swap_request_slots_retry_count_) +
+           " : swap_request_slots_retry_count");
+    detail(std::to_string(swap_request_slots_retry_retry_count_) +
+           " : swap_request_slots_retry_retry_count");
+    detail(std::to_string(swap_request_slots_retry_reencounter_count_) +
+           " : swap_request_slots_retry_reencounter_count");
+    detail(std::to_string(start_reading_entries_retry_count_) +
+           " : start_reading_entries_retry_count");
+    detail(std::to_string(tls_total_log_cas_fail_count_) +
+           " : tls_total_log_cas_fail_count");
+    detail(std::to_string(tls_total_swap_buffers_slot_retry_count_) +
+           " : tls_total_swap_buffers_slot_retry_count");
+#endif
+
+    swap_request_slots_retry_count_ = 0;
+    swap_request_slots_retry_retry_count_ = 0;
+    swap_request_slots_retry_reencounter_count_ = 0;
+    start_reading_entries_retry_count_ = 0;
+    tls_total_log_cas_fail_count_ = 0;
+    tls_total_swap_buffers_slot_retry_count_ = 0;
+  });
+}
+
+void Logger::RestartLatencyRecording(uint64_t first_sample_sequence_id,
+                                     size_t latencies_to_reserve) {
+  async_logger_.RestartLatencyRecording(first_sample_sequence_id,
+                                        latencies_to_reserve);
+}
+
+std::vector<QuerySampleLatency> Logger::GetLatenciesBlocking(
+    size_t expected_count) {
+  return async_logger_.GetLatenciesBlocking(expected_count);
+}
+std::vector<QuerySampleLatency> Logger::GetTokenLatencies(
+    size_t expected_count) {
+  return async_logger_.GetTokenLatencies(expected_count);
+}
+std::vector<QuerySampleLatency> Logger::GetTimePerOutputToken(
+    size_t expected_count) {
+  return async_logger_.GetTimePerOutputToken(expected_count);
+}
+std::vector<QuerySampleLatency> Logger::GetTokensPerSample(
+    size_t expected_count) {
+  return async_logger_.GetTokensPerSample(expected_count);
+}
+
+PerfClock::time_point Logger::GetMaxCompletionTime() {
+  return async_logger_.GetMaxCompletionTime();
+}
+
+QuerySampleLatency Logger::GetMaxLatencySoFar() {
+  return async_logger_.GetMaxLatencySoFar();
+}
+
+void Logger::SetUseTokens(bool use_tokens) {
+  async_logger_.SetUseTokens(use_tokens);
+}
+
+void Logger::SetNeedsFirstToken(bool needs_first_token) {
+  async_logger_.SetNeedsFirstToken(needs_first_token);
+}
+
+TlsLogger* Logger::GetTlsLoggerThatRequestedSwap(size_t slot, size_t next_id) {
+  uintptr_t slot_value = thread_swap_request_slots_[slot].load();
+  if (SwapRequestSlotIsReadable(slot_value)) {
+    // TODO: Convert this block to a simple write once we are confidient
+    // that we don't need to check for success.
+    bool success = thread_swap_request_slots_[slot].compare_exchange_strong(
+        slot_value, SwapRequestSlotIsWritableValue(next_id));
+    if (!success) {
+#if USE_NEW_LOGGING_FORMAT
+      MLPERF_LOG_WARNING((*this), "warning_generic_message", "CAS failed.");
+#else
+      LogErrorSync("CAS failed.", "line", __LINE__);
+#endif
+      assert(success);
+    }
+    return reinterpret_cast<TlsLogger*>(slot_value);
+  }
+  return nullptr;
+}
+
+void Logger::GatherRetrySwapRequests(std::vector<TlsLogger*>* threads_to_swap) {
+  if (swap_request_slots_to_retry_.empty()) {
+    return;
+  }
+
+  std::vector<SlotRetry> retry_slots;
+  retry_slots.swap(swap_request_slots_to_retry_);
+  for (auto& slot_retry : retry_slots) {
+    TlsLogger* tls_logger =
+        GetTlsLoggerThatRequestedSwap(slot_retry.slot, slot_retry.next_id);
+    if (tls_logger) {
+      threads_to_swap->push_back(tls_logger);
+    } else {
+      swap_request_slots_to_retry_.push_back(slot_retry);
+      swap_request_slots_retry_retry_count_++;
+    }
+  }
+}
+
+void Logger::GatherNewSwapRequests(std::vector<TlsLogger*>* threads_to_swap) {
+  auto swap_request_end = swap_request_id_.load(std::memory_order_acquire);
+  for (; swap_request_id_read_ < swap_request_end; swap_request_id_read_++) {
+    size_t slot = swap_request_id_read_ % thread_swap_request_slots_.size();
+    size_t next_id = swap_request_id_read_ + thread_swap_request_slots_.size();
+    TlsLogger* tls_logger = GetTlsLoggerThatRequestedSwap(slot, next_id);
+    if (tls_logger) {
+      threads_to_swap->push_back(tls_logger);
+    } else {
+      swap_request_slots_retry_count_++;
+      // A thread is in the middle of its call to RequestSwapBuffers.
+      // Retry later once it's done.
+      auto it = std::find_if(swap_request_slots_to_retry_.begin(),
+                             swap_request_slots_to_retry_.end(),
+                             [=](SlotRetry& s) { return s.slot == slot; });
+      if (it == swap_request_slots_to_retry_.end()) {
+        // This is the first time we are retrying the slot.
+        swap_request_slots_to_retry_.push_back({slot, next_id});
+      } else {
+        // Whoa. We've been retrying this slot since the last time it was
+        // encountered. Just update the next_id.
+        it->next_id = next_id;
+        swap_request_slots_retry_reencounter_count_++;
+      }
+    }
+  }
+}
+
+void Logger::IOThread() {
+  while (keep_io_thread_alive_) {
+    auto tracer1 =
+        MakeScopedTracer([](AsyncTrace& trace) { trace("IOThreadLoop"); });
+    {
+      auto tracer2 = MakeScopedTracer([](AsyncTrace& trace) { trace("Wait"); });
+      std::unique_lock<std::mutex> lock(io_thread_mutex_);
+      io_thread_cv_.wait_for(lock, poll_period_,
+                             [&] { return !keep_io_thread_alive_; });
+    }
+
+    {
+      auto tracer3 =
+          MakeScopedTracer([](AsyncTrace& trace) { trace("Gather"); });
+      std::vector<TlsLogger*> threads_to_swap;
+      threads_to_swap.swap(threads_to_swap_deferred_);
+      GatherRetrySwapRequests(&threads_to_swap);
+      GatherNewSwapRequests(&threads_to_swap);
+      for (TlsLogger* thread : threads_to_swap) {
+        if (thread->ReadBufferHasBeenConsumed()) {
+          thread->SwapBuffers();
+          // After swapping a thread, it's ready to be read.
+          threads_to_read_.push_back(thread);
+        } else {
+          // Don't swap buffers again until we've finish reading the
+          // previous swap.
+          threads_to_swap_deferred_.push_back(thread);
+        }
+      }
+    }
+
+    {
+      auto tracer4 =
+          MakeScopedTracer([](AsyncTrace& trace) { trace("Process"); });
+      // Read from the threads we are confident have activity.
+      for (std::vector<TlsLogger*>::iterator thread = threads_to_read_.begin();
+           thread != threads_to_read_.end(); thread++) {
+        auto tracer5 =
+            MakeScopedTracer([tid = (*thread)->Tid()](AsyncTrace& trace) {
+              trace("Thread", "tid", tid);
+            });
+        std::vector<AsyncLogEntry>* entries = (*thread)->StartReadingEntries();
+        if (!entries) {
+          start_reading_entries_retry_count_++;
+          continue;
+        }
+
+        async_logger_.SetCurrentPidTid((*thread)->Pid(), (*thread)->Tid());
+        for (auto& entry : *entries) {
+          // Execute the entry to perform the serialization and I/O.
+          entry(async_logger_);
+        }
+        (*thread)->FinishReadingEntries();
+        // Mark for removal by the call to RemoveValue below.
+        *thread = nullptr;
+      }
+
+      // Only remove threads where reading succeeded so we retry the failed
+      // threads the next time around.
+      RemoveValue(&threads_to_read_, nullptr);
+    }
+
+    // Explicitly flush every time we wake up. The goal being minimization
+    // of large implicit flushes which could affect tail latency measurements,
+    // especially at percentiles closer to 100%.
+    /// \todo Determine if explicitly flushing logs every wake up is better
+    /// than relying on implicit flushing.
+    {
+      auto tracer6 =
+          MakeScopedTracer([](AsyncTrace& trace) { trace("FlushAll"); });
+      async_logger_.Flush();
+    }
+
+    if (!orphans_to_destroy_.empty()) {
+      auto tracer7 = MakeScopedTracer(
+          [](AsyncTrace& trace) { trace("Abandoning Orphans"); });
+      std::unique_lock<std::mutex> lock(tls_logger_orphans_mutex_);
+      for (auto orphan : orphans_to_destroy_) {
+        tls_logger_orphans_.erase(orphan);
+      }
+      orphans_to_destroy_.clear();
+    }
+  }
+}
+
+TlsLogger::TlsLogger(std::function<void()> forced_detatch)
+    : pid_(MLPERF_GET_PID()),
+      tid_(MLPERF_GET_TID()),
+      forced_detatch_(std::move(forced_detatch)) {
+  for (auto& entry : entries_) {
+    entry.reserve(kTlsLogReservedEntryCount);
+  }
+}
+
+TlsLogger::~TlsLogger() {}
+
+// Log always makes forward progress since it can unconditionally obtain a
+// "lock" on at least one of the buffers for writing.
+// Notificiation is also lock free.
+void TlsLogger::Log(AsyncLogEntry&& entry) {
+  size_t cas_fail_count = 0;
+  auto unlocked = EntryState::Unlocked;
+  size_t i_write = i_write_.load(std::memory_order_relaxed);
+  while (!entry_states_[i_write].compare_exchange_strong(
+      unlocked, EntryState::WriteLock, std::memory_order_acquire,
+      std::memory_order_relaxed)) {
+    unlocked = EntryState::Unlocked;
+    i_write ^= 1;
+    // We may need to try 3 times, since there could be a race with a
+    // previous SwapBuffers request and we use memory_order_relaxed when
+    // loading i_write_ above.
+    cas_fail_count++;
+    if (cas_fail_count >= 3) {
+#if USE_NEW_LOGGING_FORMAT
+      MLPERF_LOG_WARNING(GlobalLogger(), "warning_generic_message",
+                         "CAS failed.");
+#else
+      GlobalLogger().LogErrorSync("CAS failed.", "times", cas_fail_count,
+                                  "line", __LINE__);
+#endif
+    }
+    log_cas_fail_count_.fetch_add(1, std::memory_order_relaxed);
+  }
+  entries_[i_write].emplace_back(std::forward<AsyncLogEntry>(entry));
+
+  // TODO: Convert this block to a simple write once we are confidient
+  // that we don't need to check for success.
+  auto write_lock = EntryState::WriteLock;
+  bool success = entry_states_[i_write].compare_exchange_strong(
+      write_lock, EntryState::Unlocked, std::memory_order_release);
+  if (!success) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG_WARNING(GlobalLogger(), "warning_generic_message",
+                       "CAS failed.");
+#else
+    GlobalLogger().LogErrorSync("CAS failed.", "line", __LINE__);
+#endif
+    assert(success);
+  }
+
+  bool write_buffer_swapped = i_write_prev_ != i_write;
+  if (write_buffer_swapped) {
+    GlobalLogger().RequestSwapBuffers(this);
+    i_write_prev_ = i_write;
+  }
+}
+
+void TlsLogger::SwapBuffers() {
+  // TODO: Convert this block to a simple write once we are confidient
+  // that we don't need to check for success.
+  auto read_lock = EntryState::ReadLock;
+  bool success = entry_states_[i_read_].compare_exchange_strong(
+      read_lock, EntryState::Unlocked, std::memory_order_release);
+  if (!success) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG_WARNING(GlobalLogger(), "warning_generic_message",
+                       "CAS failed.");
+#else
+    GlobalLogger().LogErrorSync("CAS failed.", "line", __LINE__);
+#endif
+    assert(success);
+  }
+
+  i_write_.store(i_read_, std::memory_order_relaxed);
+  i_read_ ^= 1;
+  unread_swaps_++;
+}
+
+// Returns nullptr if read lock fails.
+std::vector<AsyncLogEntry>* TlsLogger::StartReadingEntries() {
+  auto unlocked = EntryState::Unlocked;
+  if (entry_states_[i_read_].compare_exchange_strong(
+          unlocked, EntryState::ReadLock, std::memory_order_acquire,
+          std::memory_order_relaxed)) {
+    return &entries_[i_read_];
+  }
+  return nullptr;
+}
+
+void TlsLogger::FinishReadingEntries() {
+  // Detect first logging allocation and track max allocated size.
+  size_t new_size = entries_[i_read_].size();
+  if (new_size > max_entry_size_) {
+    if (max_entry_size_ == kTlsLogReservedEntryCount) {
+      Log([ts = PerfClock::now()](AsyncLog& log) {
+        log.TraceAsyncInstant("FirstAllocation", 0, ts);
+      });
+    }
+    max_entry_size_ = new_size;
+  }
+
+  entries_[i_read_].clear();
+  unread_swaps_--;
+}
+
+bool TlsLogger::ReadBufferHasBeenConsumed() { return unread_swaps_ == 0; }
+
+void TlsLogger::TraceCounters() {
+  auto tracer = MakeScopedTracer(
+      [lcfc = log_cas_fail_count_.load(std::memory_order_relaxed),
+       sbsrc = swap_buffers_slot_retry_count_.load(std::memory_order_relaxed)](
+          AsyncTrace& trace) {
+        trace("TlsLogger:ContentionCounters", "log_cas_fail_count", lcfc,
+              "swap_buffers_slot_retry_count", sbsrc);
+      });
+}
+
+Logger& GlobalLogger() {
+  static Logger g_logger(kLogPollPeriod, kMaxThreadsToLog);
+  return g_logger;
+}
+
+/// \brief Moves ownership of the TlsLogger to Logger on thread exit
+/// so no round-trip synchronization with the IO thread is required.
+struct TlsLoggerWrapper {
+  TlsLoggerWrapper(std::function<void()> forced_detatch)
+      : tls_logger(std::make_unique<TlsLogger>(std::move(forced_detatch))) {
+    GlobalLogger().RegisterTlsLogger(tls_logger.get());
+  }
+  ~TlsLoggerWrapper() {
+    tls_logger->TraceCounters();
+    GlobalLogger().UnRegisterTlsLogger(std::move(tls_logger));
+  }
+  std::unique_ptr<TlsLogger> tls_logger;
+};
+
+TlsLoggerWrapper* InitializeMyTlsLoggerWrapper() {
+  thread_local std::unique_ptr<TlsLoggerWrapper> tls_logger_wrapper;
+  // forced_detatch lets the global Logger forcefully detatch TlsLoggers
+  // from the thread in the Logger's destructor, which may run before
+  // thread-local variables are destroyed when the loadgen is used as a python
+  // module and dynamically unloaded.
+  // Note: We capture a pointer to the tls_logger_wrapper since variables of
+  // the thread-local storage class aren't actually captured. C++ spec says
+  // only variables of the automatic storage class are captured.
+  /// \todo There is a race where the same TlsLoggerWrapper might be
+  /// destroyed both naturally and via forced_detatch. Destruction of
+  /// the TlsLoggerWrapper should be locked.
+  auto forced_detatch = [tls_logger_wrapper = &tls_logger_wrapper]() {
+    tls_logger_wrapper->reset();
+  };
+  tls_logger_wrapper = std::make_unique<TlsLoggerWrapper>(forced_detatch);
+  return tls_logger_wrapper.get();
+}
+
+TlsLogger* InitializeMyTlsLogger() {
+  thread_local TlsLoggerWrapper* wrapper = InitializeMyTlsLoggerWrapper();
+  return wrapper->tls_logger.get();
+}
+
+void Log(AsyncLogEntry&& entry) {
+  thread_local TlsLogger* const tls_logger = InitializeMyTlsLogger();
+  tls_logger->Log(std::forward<AsyncLogEntry>(entry));
+}
+
+}  // namespace logging
+}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/logging.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/logging.h
new file mode 100644
index 000000000..8f1a398e9
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/logging.h
@@ -0,0 +1,816 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Internal logging implementation details.
+
+#ifndef MLPERF_LOADGEN_LOGGING_H_
+#define MLPERF_LOADGEN_LOGGING_H_
+
+#define USE_NEW_LOGGING_FORMAT 1
+#define MLPERF_LOG(logger, key, value) \
+  logger.Log((key), (value), __FILE__, __LINE__)
+#define MLPERF_LOG_ERROR(logger, key, value) \
+  logger.LogError((key), (value), __FILE__, __LINE__)
+#define MLPERF_LOG_ERROR_SYNC(logger, key, value) \
+  logger.LogErrorSync((key), (value), __FILE__, __LINE__)
+#define MLPERF_LOG_WARNING(logger, key, value) \
+  logger.LogWarning((key), (value), __FILE__, __LINE__)
+#define MLPERF_LOG_INTERVAL_START(logger, key, value) \
+  logger.LogIntervalStart((key), (value), __FILE__, __LINE__)
+#define MLPERF_LOG_INTERVAL_END(logger, key, value) \
+  logger.LogIntervalEnd((key), (value), __FILE__, __LINE__)
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <chrono>
+#include <condition_variable>
+#include <functional>
+#include <future>
+#include <iomanip>
+#include <iostream>
+#include <list>
+#include <map>
+#include <mutex>
+#include <set>
+#include <string>
+#include <thread>
+#include <unordered_set>
+#include <vector>
+
+#include "query_sample.h"
+
+namespace mlperf {
+
+/// \brief Wait-free logging utilities that defer stringification
+/// and syscalls to a worker thread.
+namespace logging {
+
+class AsyncLog;
+class Logger;
+class TlsLogger;
+struct TlsLoggerWrapper;
+
+/// \todo Verify lambas are not allocating when bounded to a std::function.
+using AsyncLogEntry = std::function<void(AsyncLog&)>;
+using PerfClock = std::chrono::high_resolution_clock;
+
+/// \brief Logs the raw bytes as a hexadecimal ascii string.
+struct LogBinaryAsHexString {
+  std::vector<uint8_t>* data;
+};
+
+/// \brief By default, print out the value directly.
+template <typename T>
+const T& ArgValueTransform(const T& value) {
+  return value;
+}
+
+/// \brief Print out True/False.
+const std::string& ArgValueTransform(const bool& value);
+/// \brief Print out binary day as hex string.
+const std::string ArgValueTransform(const LogBinaryAsHexString& value);
+#if USE_NEW_LOGGING_FORMAT
+/// \brief Print out a string in JSON format (with quotes).
+const std::string ArgValueTransform(const std::string& value);
+const std::string ArgValueTransform(const char* value);
+/// \brief Prints a list of int in JSON format.
+const std::string ArgValueTransform(const std::vector<size_t>& value);
+/// \brief Prints a dict in JSON format.
+const std::string ArgValueTransform(
+    const std::map<std::string, std::string>& value);
+#endif
+
+/// \brief Helper to print out values without quotes when value is a string.
+template <typename T>
+const T& ArgValueTransformWithoutQuote(const T& value) {
+  return ArgValueTransform<T>(value);
+}
+inline const std::string ArgValueTransformWithoutQuote(
+    const LogBinaryAsHexString& value) {
+  return ArgValueTransform(value);
+}
+/// \brief Helper to print out a string without the quotes.
+inline const std::string ArgValueTransformWithoutQuote(
+    const std::string& value) {
+  return value;
+}
+
+/// \brief Outputs a trace that can be uploaded to chrome://tracing for
+/// visualization.
+/// \details Trace event format definition:
+/// https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/edit?usp=sharing
+class ChromeTracer {
+ public:
+  ChromeTracer(std::ostream* trace_out, PerfClock::time_point origin);
+  ~ChromeTracer();
+
+  template <typename... Args>
+  void AddCompleteEvent(const std::string& name, uint64_t pid, uint64_t tid,
+                        PerfClock::time_point start, PerfClock::time_point end,
+                        const Args... args) {
+    *out_ << "{\"name\":\"" << name << "\"," << "\"ph\":\"X\","
+          << "\"pid\":" << pid << "," << "\"tid\":" << tid << ","
+          << "\"ts\":" << Micros(start - origin_).count() << ","
+          << "\"dur\":" << Micros(end - start).count() << "," << "\"args\":{";
+    AddArgs(args...);
+    *out_ << "}},\n";
+  }
+
+  template <typename... Args>
+  void AddAsyncBeginEvent(const std::string& name, uint64_t pid, uint64_t id,
+                          PerfClock::time_point time, const Args... args) {
+    *out_ << "{\"name\":\"" << name << "\"," << "\"cat\":\"default\","
+          << "\"ph\":\"b\"," << "\"pid\":" << pid << "," << "\"id\":" << id
+          << "," << "\"ts\":" << Micros(time - origin_).count() << ","
+          << "\"args\":{";
+    AddArgs(args...);
+    *out_ << "}},\n";
+  }
+
+  template <typename... Args>
+  void AddAsyncInstantEvent(const std::string& name, uint64_t pid, uint64_t id,
+                            PerfClock::time_point time, const Args... args) {
+    *out_ << "{\"name\":\"" << name << "\"," << "\"cat\":\"default\","
+          << "\"ph\":\"n\"," << "\"pid\":" << pid << "," << "\"id\":" << id
+          << "," << "\"ts\":" << Micros(time - origin_).count() << ","
+          << "\"args\":{";
+    AddArgs(args...);
+    *out_ << "}},\n";
+  }
+
+  template <typename... Args>
+  void AddAsyncEndEvent(const std::string& name, uint64_t pid, uint64_t id,
+                        PerfClock::time_point time) {
+    *out_ << "{\"name\":\"" << name << "\"," << "\"cat\":\"default\","
+          << "\"ph\":\"e\", " << "\"pid\":" << pid << "," << "\"id\":" << id
+          << "," << "\"ts\":" << Micros(time - origin_).count() << "},\n";
+  }
+
+  template <typename... Args>
+  void AddCounterEvent(const std::string& name, uint64_t pid,
+                       PerfClock::time_point time, const Args... args) {
+    *out_ << "{\"name\":\"" << name << "\"," << "\"ph\": \"C\","
+          << "\"pid\":" << pid << ","
+          << "\"ts\":" << Micros(time - origin_).count() << ","
+          << "\"args\":{ ";
+    AddArgs(args...);
+    *out_ << "}},\n";
+  }
+
+  void Flush() { out_->flush(); }
+
+ private:
+  using Micros = std::chrono::duration<double, std::micro>;
+
+  void WriteTraceEventHeader();
+  void WriteTraceEventFooter();
+
+  void AddArgs() {}
+
+  template <typename T>
+  void AddArgs(const std::string& arg_name, const T& arg_value) {
+    *out_ << "\"" << arg_name << "\":" << ArgValueTransform(arg_value);
+  }
+
+  template <typename T, typename... Args>
+  void AddArgs(const std::string& arg_name, const T& arg_value,
+               const Args... args) {
+    *out_ << "\"" << arg_name << "\":" << ArgValueTransform(arg_value) << ",";
+    AddArgs(args...);
+  }
+
+  std::ostream* out_;
+  PerfClock::time_point origin_;
+};
+
+/// \brief The proxy all logging lambdas ultimately use to write any log type.
+/// \details Passed as an argument to the log lambda on the
+/// recording thread to serialize the data captured by the lambda and
+/// forward it to the output stream.
+/// \todo Make summary_out_, detail_out_, accuracy_out_, and trace_out_
+/// instances of a new LogOutput interface that the client may override.
+class AsyncLog {
+ public:
+  void SetLogFiles(std::ostream* summary, std::ostream* detail,
+                   std::ostream* accuracy, bool copy_detail_to_stdout,
+                   bool copy_summary_to_stdout,
+                   PerfClock::time_point log_origin);
+  void StartNewTrace(std::ostream* trace_out, PerfClock::time_point origin);
+  void StopTrace();
+  void Flush();
+
+  void SetCurrentPidTid(uint64_t pid, uint64_t tid);
+
+  void LogAccuracy(uint64_t seq_id, const QuerySampleIndex qsl_idx,
+                   const LogBinaryAsHexString& response, int64_t n_tokens);
+  void CacheToken(uint64_t seq_id, const LogBinaryAsHexString& response);
+
+  template <typename... Args>
+  void LogSummary(const std::string& message, const Args... args);
+
+  void SetLogDetailTime(PerfClock::time_point time) { log_detail_time_ = time; }
+
+  void FlagError() {
+    std::unique_lock<std::mutex> lock(log_mutex_);
+    log_error_count_++;
+    error_flagged_ = true;
+  }
+
+  void FlagWarning() {
+    std::unique_lock<std::mutex> lock(log_mutex_);
+    log_warning_count_++;
+    warning_flagged_ = true;
+  }
+
+#if USE_NEW_LOGGING_FORMAT
+  template <typename T>
+  void LogDetail(const std::string& key, const T& value,
+                 const std::string file_name, const unsigned int line_no);
+#else
+  template <typename... Args>
+  void LogDetail(const std::string& message, const Args... args);
+#endif
+
+  template <typename... Args>
+  void Trace(const std::string& trace_name, PerfClock::time_point start,
+             PerfClock::time_point end, const Args... args) {
+    std::unique_lock<std::mutex> lock(trace_mutex_);
+    if (tracer_) {
+      tracer_->AddCompleteEvent(trace_name, current_pid_, current_tid_, start,
+                                end, args...);
+    }
+  }
+
+  template <typename... Args>
+  void TraceAsyncInstant(const std::string& trace_name, uint64_t id,
+                         PerfClock::time_point instant_time,
+                         const Args... args) {
+    std::unique_lock<std::mutex> lock(trace_mutex_);
+    if (tracer_) {
+      tracer_->AddAsyncInstantEvent(trace_name, current_pid_, id, instant_time,
+                                    args...);
+    }
+  }
+
+  void SetScopedTraceTimes(PerfClock::time_point start,
+                           PerfClock::time_point end) {
+    scoped_start_ = start;
+    scoped_end_ = end;
+  }
+
+  template <typename... Args>
+  void ScopedTrace(const std::string& trace_name, const Args... args) {
+    std::unique_lock<std::mutex> lock(trace_mutex_);
+    if (tracer_) {
+      tracer_->AddCompleteEvent(trace_name, current_pid_, current_tid_,
+                                scoped_start_, scoped_end_, args...);
+    }
+  }
+
+  template <typename... Args>
+  void TraceSample(const std::string& trace_name, uint64_t id,
+                   PerfClock::time_point start, PerfClock::time_point end,
+                   const Args... args) {
+    std::unique_lock<std::mutex> lock(trace_mutex_);
+    if (tracer_) {
+      tracer_->AddAsyncBeginEvent(trace_name, current_pid_, id, start, args...);
+      tracer_->AddAsyncEndEvent(trace_name, current_pid_, id, end);
+    }
+  }
+
+  template <typename... Args>
+  void TraceCounterEvent(const std::string& trace_name,
+                         PerfClock::time_point time, const Args... args) {
+    std::unique_lock<std::mutex> lock(trace_mutex_);
+    if (tracer_) {
+      tracer_->AddCounterEvent(trace_name, current_pid_, time, args...);
+    }
+  }
+
+  void RestartLatencyRecording(uint64_t first_sample_sequence_id,
+                               size_t latencies_to_reserve);
+  void RecordSampleCompletion(uint64_t sample_sequence_id,
+                              PerfClock::time_point completion_time,
+                              QuerySampleLatency latency, int64_t n_tokens);
+  void RecordTokenCompletion(uint64_t sample_sequence_id,
+                             PerfClock::time_point completion_time,
+                             QuerySampleLatency latency);
+  std::vector<QuerySampleLatency> GetLatenciesBlocking(size_t expected_count);
+  std::vector<QuerySampleLatency> GetTokenLatencies(size_t expected_count);
+  std::vector<QuerySampleLatency> GetTimePerOutputToken(size_t expected_count);
+  std::vector<int64_t> GetTokensPerSample(size_t expected_count);
+  PerfClock::time_point GetMaxCompletionTime();
+  QuerySampleLatency GetMaxLatencySoFar();
+  void SetUseTokens(bool use_tokens);
+  void SetNeedsFirstToken(bool needs_first_token);
+  size_t GetErrorCount() { return log_error_count_; };
+
+ private:
+  void WriteAccuracyHeaderLocked();
+  void WriteAccuracyFooterLocked();
+
+  void LogArgs(std::ostream*) {}
+
+  template <typename T>
+  void LogArgs(std::ostream* out, const T& value_only) {
+    *out << ArgValueTransformWithoutQuote(value_only);
+  }
+
+  template <typename T>
+  void LogArgs(std::ostream* out, const std::string& arg_name,
+               const T& arg_value) {
+    *out << "\"" << arg_name
+         << "\" : " << ArgValueTransformWithoutQuote(arg_value);
+  }
+
+  template <typename T, typename... Args>
+  void LogArgs(std::ostream* out, const std::string& arg_name,
+               const T& arg_value, const Args... args) {
+    *out << "\"" << arg_name
+         << "\" : " << ArgValueTransformWithoutQuote(arg_value) << ", ";
+    LogArgs(out, args...);
+  }
+
+  std::mutex log_mutex_;
+  std::ostream* summary_out_ = &std::cerr;
+  std::ostream* detail_out_ = &std::cerr;
+  std::ostream* accuracy_out_ = &std::cerr;
+  // TODO: Instead of these bools, use a class that forwards to two streams.
+  bool copy_detail_to_stdout_ = false;
+  bool copy_summary_to_stdout_ = false;
+  bool accuracy_needs_comma_ = false;
+  PerfClock::time_point log_origin_;
+  size_t log_error_count_ = 0;
+  bool error_flagged_ = false;
+  size_t log_warning_count_ = 0;
+  bool warning_flagged_ = false;
+  bool use_tokens_ = false;
+  bool needs_first_token_ = false;
+
+  std::mutex trace_mutex_;
+  std::unique_ptr<ChromeTracer> tracer_;
+
+  uint64_t current_pid_;
+  uint64_t current_tid_;
+  PerfClock::time_point log_detail_time_;
+  PerfClock::time_point scoped_start_;
+  PerfClock::time_point scoped_end_;
+
+  std::mutex latencies_mutex_;
+  std::mutex token_latencies_mutex_;
+  std::mutex token_record_mutex_;
+  std::condition_variable all_latencies_recorded_;
+  uint64_t latencies_first_sample_sequence_id_ = 0;
+  std::vector<QuerySampleLatency> latencies_;
+  std::vector<QuerySampleLatency> token_latencies_;
+  std::vector<QuerySampleLatency> time_per_output_token_;
+  std::vector<LogBinaryAsHexString> token_records_;
+  std::vector<int64_t> tokens_per_sample_;
+  QuerySampleLatency max_latency_ = 0;
+  PerfClock::time_point max_completion_timstamp_;
+  size_t latencies_recorded_ = 0;
+  size_t latencies_expected_ = 0;
+  // Must be called with latencies_mutex_ held.
+  bool AllLatenciesRecorded() {
+    return latencies_recorded_ == latencies_expected_;
+  }
+};
+
+/// \brief The central logger that logs all threads belonging to a run.
+class Logger {
+ public:
+  Logger(std::chrono::duration<double> poll_period, size_t max_threads_to_log);
+  ~Logger();
+
+  void StartIOThread();
+  void StopIOThread();
+
+  void StartLogging(std::ostream* summary, std::ostream* detail,
+                    std::ostream* accuracy, bool copy_detail_to_stdout,
+                    bool copy_summary_to_stdout);
+  void StopLogging();
+
+  void StartNewTrace(std::ostream* trace_out, PerfClock::time_point origin);
+  void StopTracing();
+
+  void LogContentionAndAllocations();
+
+  void RestartLatencyRecording(uint64_t first_sample_sequence_id,
+                               size_t latencies_to_reserve);
+  std::vector<QuerySampleLatency> GetLatenciesBlocking(size_t expected_count);
+  std::vector<QuerySampleLatency> GetTokenLatencies(size_t expected_count);
+  std::vector<QuerySampleLatency> GetTimePerOutputToken(size_t expected_count);
+  std::vector<int64_t> GetTokensPerSample(size_t expected_count);
+  PerfClock::time_point GetMaxCompletionTime();
+  QuerySampleLatency GetMaxLatencySoFar();
+  void SetUseTokens(bool use_tokens);
+  void SetNeedsFirstToken(bool needs_first_token);
+
+ private:
+  friend AsyncLog;
+  friend TlsLogger;
+  friend TlsLoggerWrapper;
+
+  void RegisterTlsLogger(TlsLogger* tls_logger);
+  void UnRegisterTlsLogger(std::unique_ptr<TlsLogger> tls_logger);
+  void RequestSwapBuffers(TlsLogger* tls_logger);
+  void CollectTlsLoggerStats(TlsLogger* tls_logger);
+
+  TlsLogger* GetTlsLoggerThatRequestedSwap(size_t slot, size_t next_id);
+  void GatherRetrySwapRequests(std::vector<TlsLogger*>* threads_to_swap);
+  void GatherNewSwapRequests(std::vector<TlsLogger*>* threads_to_swap);
+
+  /// \brief The main logging thread function that handles the serialization
+  /// and I/O to the stream or file.
+  ///
+  /// \todo Provide client hook to set logging thead affinity and priority.
+  void IOThread();
+
+// Slow synchronous error logging for internals that may prevent
+// async logging from working.
+#if USE_NEW_LOGGING_FORMAT
+  template <typename T>
+  void LogErrorSync(const std::string& key, const T& value,
+                    const std::string file_name, const unsigned int line_no) {
+    /// \todo Acquire mutex once for FlagError + LogDetail to avoid
+    ///       races. Better yet, switch to a non-stateful error API.
+    //       This is better than nothing though.
+    async_logger_.FlagError();
+    async_logger_.LogDetail(key, value, file_name, line_no);
+  }
+  template <typename T>
+  void LogWarning(const std::string& key, const T& value,
+                  const std::string file_name, const unsigned int line_no) {
+    async_logger_.FlagWarning();
+    async_logger_.LogDetail(key, value, file_name, line_no);
+  }
+#else
+  template <typename... Args>
+  void LogErrorSync(const std::string& message, Args&&... args) {
+    /// \todo Acquire mutex once for FlagError + LogDetail to avoid
+    ///       races. Better yet, switch to a non-stateful error API.
+    //       This is better than nothing though.
+    async_logger_.FlagError();
+    async_logger_.LogDetail(message, std::forward<Args>(args)...);
+  }
+#endif
+
+  // Accessed by IOThead only.
+  const std::chrono::duration<double> poll_period_;
+  AsyncLog async_logger_;
+
+  const size_t max_threads_to_log_;
+  std::thread io_thread_;
+
+  // Accessed by producers and IOThead during thread registration and
+  // destruction. Protected by io_thread_mutex_.
+  std::mutex io_thread_mutex_;
+  std::condition_variable io_thread_cv_;
+  bool keep_io_thread_alive_ = false;
+
+  std::mutex tls_loggers_registerd_mutex_;
+  std::unordered_set<TlsLogger*> tls_loggers_registerd_;
+
+  // Temporarily stores TlsLogger data for threads that have exited until
+  // all their log entries have been processed.
+  // Accessed by IOThread and producers as their threads exit.
+  std::mutex tls_logger_orphans_mutex_;
+  using OrphanContainer = std::list<std::unique_ptr<TlsLogger>>;
+  OrphanContainer tls_logger_orphans_;
+
+  // Accessed by producers and IOThead atomically.
+  std::atomic<size_t> swap_request_id_{0};
+  std::vector<std::atomic<uintptr_t>> thread_swap_request_slots_;
+
+  // Accessed by IOThead only.
+  size_t swap_request_id_read_{0};
+  struct SlotRetry {
+    size_t slot;
+    uintptr_t next_id;
+  };
+  std::vector<SlotRetry> swap_request_slots_to_retry_;
+  std::vector<TlsLogger*> threads_to_swap_deferred_;
+  std::vector<TlsLogger*> threads_to_read_;
+  std::vector<OrphanContainer::iterator> orphans_to_destroy_;
+
+  // Counts for retries related to the lock-free scheme.
+  // Abnormally high counts could be an indicator of contention.
+  // Access on IOThread only.
+  size_t swap_request_slots_retry_count_ = 0;
+  size_t swap_request_slots_retry_retry_count_ = 0;
+  size_t swap_request_slots_retry_reencounter_count_ = 0;
+  size_t start_reading_entries_retry_count_ = 0;
+  size_t tls_total_log_cas_fail_count_ = 0;
+  size_t tls_total_swap_buffers_slot_retry_count_ = 0;
+};
+
+Logger& GlobalLogger();
+
+/// \brief The generic way to add a log entry.
+/// \details Supports all types of logs, which is useful for complex
+/// lambdas that may wish to log in multiple places or log something other
+/// than a simple summary, detail, or trace entry.
+void Log(AsyncLogEntry&& entry);
+
+/// \brief The convenience proxy a LogSummary lambda uses to write to the
+/// summary log.
+class AsyncSummary {
+ public:
+  explicit AsyncSummary(AsyncLog& async_log) : async_log_(async_log) {}
+  AsyncLog& async_log() { return async_log_; }
+
+  template <typename... Args>
+  AsyncLog& operator()(Args&&... args) {
+    async_log_.LogSummary(std::forward<Args>(args)...);
+    return async_log_;
+  }
+
+ private:
+  AsyncLog& async_log_;
+};
+
+/// \brief A helper to simplify adding a summary log entry.
+template <typename LambdaT>
+void LogSummary(LambdaT&& lambda) {
+  Log([lambda = std::forward<LambdaT>(lambda)](AsyncLog& log) mutable {
+    AsyncSummary async_summary(log);
+    lambda(async_summary);
+  });
+}
+
+/// \brief The convenience proxy a LogDetail lambda uses to write to the detail
+/// log.
+class AsyncDetail {
+ public:
+  explicit AsyncDetail(AsyncLog& async_log) : async_log_(async_log) {}
+  AsyncLog& async_log() { return async_log_; }
+
+#if USE_NEW_LOGGING_FORMAT
+  template <typename T>
+  AsyncLog& Log(const std::string& key, const T& value,
+                const std::string file_name, const unsigned int line_no) {
+    async_log_.LogDetail(key, value, file_name, line_no);
+    return async_log_;
+  }
+
+  template <typename T>
+  AsyncLog& LogError(const std::string& key, const T& value,
+                     const std::string file_name, const unsigned int line_no) {
+    async_log_.FlagError();
+    async_log_.LogDetail(key, value, file_name, line_no);
+    return async_log_;
+  }
+
+  template <typename T>
+  AsyncLog& LogWarning(const std::string& key, const T& value,
+                       const std::string file_name,
+                       const unsigned int line_no) {
+    async_log_.FlagWarning();
+    async_log_.LogDetail(key, value, file_name, line_no);
+    return async_log_;
+  }
+
+  template <typename T>
+  AsyncLog& LogIntervalStart(const std::string& key, const T& value,
+                             const std::string file_name,
+                             const unsigned int line_no) {
+    async_log_.LogDetail(key, value, file_name, line_no);
+    return async_log_;
+  }
+
+  template <typename T>
+  AsyncLog& LogIntervalEnd(const std::string& key, const T& value,
+                           const std::string file_name,
+                           const unsigned int line_no) {
+    async_log_.LogDetail(key, value, file_name, line_no);
+    return async_log_;
+  }
+#else
+  template <typename... Args>
+  AsyncLog& operator()(Args&&... args) {
+    async_log_.LogDetail(std::forward<Args>(args)...);
+    return async_log_;
+  }
+
+  template <typename... Args>
+  AsyncLog& Error(Args&&... args) {
+    async_log_.FlagError();
+    async_log_.LogDetail(std::forward<Args>(args)...);
+    return async_log_;
+  }
+
+  template <typename... Args>
+  AsyncLog& Warning(Args&&... args) {
+    async_log_.FlagWarning();
+    async_log_.LogDetail(std::forward<Args>(args)...);
+    return async_log_;
+  }
+#endif
+
+ private:
+  AsyncLog& async_log_;
+};
+
+/// \brief A helper to simplify adding a detail log entry.
+template <typename LambdaT>
+void LogDetail(LambdaT&& lambda) {
+  Log([lambda = std::forward<LambdaT>(lambda),
+       timestamp = PerfClock::now()](AsyncLog& log) mutable {
+    log.SetLogDetailTime(timestamp);
+    AsyncDetail async_detail(log);
+    lambda(async_detail);
+  });
+}
+
+/// \brief The convenience proxy a ScopedTracer lambda uses to write to the
+/// detail log.
+class AsyncTrace {
+ public:
+  explicit AsyncTrace(AsyncLog& async_log) : async_log_(async_log) {}
+  AsyncLog& async_log() { return async_log_; }
+
+  template <typename... Args>
+  AsyncLog& operator()(Args&&... args) {
+    async_log_.ScopedTrace(std::forward<Args>(args)...);
+    return async_log_;
+  }
+
+ private:
+  AsyncLog& async_log_;
+};
+
+/// \brief ScopedTracer is an RAII object that traces the start and end
+/// of its lifetime.
+template <typename LambdaT>
+class ScopedTracer {
+ public:
+  ScopedTracer(LambdaT&& lambda)
+      : start_(PerfClock::now()), lambda_(std::forward<LambdaT>(lambda)) {}
+
+  ~ScopedTracer() {
+    Log([start = start_, lambda = std::move(lambda_),
+         end = PerfClock::now()](AsyncLog& log) {
+      log.SetScopedTraceTimes(start, end);
+      AsyncTrace async_trace(log);
+      lambda(async_trace);
+    });
+  }
+
+ private:
+  PerfClock::time_point start_;
+  LambdaT lambda_;
+};
+
+/// \brief Helper that creates a ScopeTracer with automatic type deduction.
+/// \details Helps with automatic template type deduction, which has been
+/// supported for functions for a long time.
+/// C++17 will support deduction for classes, which will neutralize the utility
+/// of a helper function like this.
+/// \todo Determine which traces to keep for submission purposes.
+template <typename LambdaT>
+auto MakeScopedTracer(LambdaT&& lambda) -> ScopedTracer<LambdaT> {
+  return ScopedTracer<LambdaT>(std::forward<LambdaT>(lambda));
+}
+
+template <typename... Args>
+void AsyncLog::LogSummary(const std::string& message, const Args... args) {
+  auto tracer = MakeScopedTracer([message](AsyncTrace& trace) {
+    std::string sanitized_message = message;
+    std::replace(sanitized_message.begin(), sanitized_message.end(), '"', '\'');
+    std::replace(sanitized_message.begin(), sanitized_message.end(), '\n', ';');
+    trace("LogSummary", "message", "\"" + sanitized_message + "\"");
+  });
+  std::unique_lock<std::mutex> lock(log_mutex_);
+  *summary_out_ << message;
+  LogArgs(summary_out_, args...);
+  *summary_out_ << "\n";
+
+  if (copy_summary_to_stdout_) {
+    std::cout << message;
+    LogArgs(&std::cout, args...);
+    std::cout << "\n";
+  }
+}
+
+#if USE_NEW_LOGGING_FORMAT
+template <typename T>
+void AsyncLog::LogDetail(const std::string& key, const T& value,
+                         const std::string file_name,
+                         const unsigned int line_no) {
+  auto tracer = MakeScopedTracer([key](AsyncTrace& trace) {
+    std::string sanitized_key = key;
+    std::replace(sanitized_key.begin(), sanitized_key.end(), '"', '\'');
+    std::replace(sanitized_key.begin(), sanitized_key.end(), '\n', ';');
+    trace("LogDetail", "key", "\"" + sanitized_key + "\"");
+  });
+  std::unique_lock<std::mutex> lock(log_mutex_);
+  std::vector<std::ostream*> detail_streams{detail_out_, &std::cout};
+  if (!copy_detail_to_stdout_) {
+    detail_streams.pop_back();
+  }
+  auto time_ns = (log_detail_time_ - log_origin_).count();
+  for (auto os : detail_streams) {
+    *os << ":::MLLOG {" << "\"key\": " << ArgValueTransform(key) << ", "
+        << "\"value\": " << ArgValueTransform(value) << ", "
+        << "\"time_ms\": " << ArgValueTransform(time_ns / 1000000ULL) << "."
+        << std::setfill('0') << std::setw(6)
+        << ArgValueTransform(time_ns % 1000000ULL) << ", "
+        << "\"namespace\": \"mlperf::logging\", "
+        << "\"event_type\": \"POINT_IN_TIME\", " << "\"metadata\": {"
+        << "\"is_error\": " << ArgValueTransform(error_flagged_) << ", "
+        << "\"is_warning\": " << ArgValueTransform(warning_flagged_) << ", "
+        << "\"file\": \"" << file_name << "\", "
+        << "\"line_no\": " << ArgValueTransform(line_no) << ", "
+        << "\"pid\": " << ArgValueTransform(current_pid_) << ", "
+        << "\"tid\": " << ArgValueTransform(current_tid_) << "}}\n";
+    if (error_flagged_) {
+      os->flush();
+    }
+  }
+  error_flagged_ = false;
+  warning_flagged_ = false;
+}
+#else
+template <typename... Args>
+void AsyncLog::LogDetail(const std::string& message, const Args... args) {
+  auto tracer = MakeScopedTracer([message](AsyncTrace& trace) {
+    std::string sanitized_message = message;
+    std::replace(sanitized_message.begin(), sanitized_message.end(), '"', '\'');
+    std::replace(sanitized_message.begin(), sanitized_message.end(), '\n', ';');
+    trace("LogDetail", "message", "\"" + sanitized_message + "\"");
+  });
+  std::unique_lock<std::mutex> lock(log_mutex_);
+  std::vector<std::ostream*> detail_streams{detail_out_, &std::cout};
+  if (!copy_detail_to_stdout_) {
+    detail_streams.pop_back();
+  }
+  for (auto os : detail_streams) {
+    *os << "\"pid\": " << current_pid_ << ", " << "\"tid\": " << current_tid_
+        << ", " << "\"ts\": " << (log_detail_time_ - log_origin_).count()
+        << "ns : ";
+    if (error_flagged_) {
+      *os << "ERROR : ";
+    } else if (warning_flagged_) {
+      *os << "WARNING : ";
+    }
+    *os << message;
+    LogArgs(os, args...);
+    *os << "\n";
+    if (error_flagged_) {
+      os->flush();
+    }
+  }
+  error_flagged_ = false;
+  warning_flagged_ = false;
+}
+#endif
+
+}  // namespace logging
+
+// Export some things out of the logging namespace to simplify call sites.
+
+const auto GlobalLogger = logging::GlobalLogger;
+const auto Log = logging::Log;
+
+using PerfClock = logging::PerfClock;
+
+using LogBinaryAsHexString = logging::LogBinaryAsHexString;
+
+using AsyncLog = logging::AsyncLog;
+
+using AsyncSummary = logging::AsyncSummary;
+template <typename LambdaT>
+void LogSummary(LambdaT&& lambda) {
+  logging::LogSummary(std::forward<LambdaT>(lambda));
+}
+
+using AsyncDetail = logging::AsyncDetail;
+template <typename LambdaT>
+void LogDetail(LambdaT&& lambda) {
+  logging::LogDetail(std::forward<LambdaT>(lambda));
+}
+
+using AsyncTrace = logging::AsyncTrace;
+
+template <typename LambdaT>
+using ScopedTracer = logging::ScopedTracer<LambdaT>;
+
+template <typename LambdaT>
+auto MakeScopedTracer(LambdaT&& lambda) -> ScopedTracer<LambdaT> {
+  return ScopedTracer<LambdaT>(std::forward<LambdaT>(lambda));
+}
+
+}  // namespace mlperf
+
+#endif  // MLPERF_LOADGEN_LOGGING_H_
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/mlperf.conf b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/mlperf.conf
new file mode 100644
index 000000000..1b825514b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/mlperf.conf
@@ -0,0 +1,164 @@
+# The format of this config file is 'key = value'.
+# The key has the format 'model.scenario.key'. Value is mostly int64_t.
+# Model maybe '*' as wildcard. In that case the value applies to all models.
+# All times are in milli seconds
+
+# Set performance_sample_count for each model.
+# User can optionally set this to higher values in user.conf.
+resnet50.*.performance_sample_count_override = 1024
+ssd-mobilenet.*.performance_sample_count_override = 256
+retinanet.*.performance_sample_count_override = 64
+bert.*.performance_sample_count_override = 10833
+dlrm.*.performance_sample_count_override = 204800
+dlrm-v2.*.performance_sample_count_override = 204800
+rnnt.*.performance_sample_count_override = 2513
+gptj.*.performance_sample_count_override = 13368
+mixtral-8x7b.*.performance_sample_count_override = 15000
+llama2-70b.*.performance_sample_count_override = 24576
+llama2-70b-interactive.*.performance_sample_count_override = 24576
+llama3_1-405b.*.performance_sample_count_override = 8313
+llama3_1-405b-interactive.*.performance_sample_count_override = 8313
+llama3_1-8b.*.performance_sample_count_override = 13368
+llama3_1-8b-edge.*.performance_sample_count_override = 5000
+llama3_1-8b-interactive.*.performance_sample_count_override = 13368
+stable-diffusion-xl.*.performance_sample_count_override = 5000
+rgat.*.performance_sample_count_override = 788379
+pointpainting.*.performance_sample_count_override = 1024
+deepseek-r1.*.performance_sample_count_override = 4388
+whisper.*.performance_sample_count_override = 1633
+# set to 0 to let entire sample set to be performance sample
+3d-unet.*.performance_sample_count_override = 0
+
+# Set seeds.
+*.*.qsl_rng_seed = 1780908523862526354
+*.*.sample_index_rng_seed = 14771362308971278857
+*.*.schedule_rng_seed = 18209322760996052031
+
+# Set seeds for TEST_05 (not needed from v5.0 onwards)
+*.*.test05_qsl_rng_seed = 7975553102935885558
+*.*.test05_sample_index_rng_seed = 11403566307062068064
+*.*.test05_schedule_rng_seed = 15816800565822761601
+
+*.SingleStream.target_latency_percentile = 90
+pointpainting.SingleStream.target_latency_percentile = 99.9
+*.SingleStream.min_duration = 600000
+
+*.MultiStream.target_latency_percentile = 99
+*.MultiStream.samples_per_query = 8
+*.MultiStream.min_duration = 600000
+*.MultiStream.min_query_count = 662
+retinanet.MultiStream.target_latency = 528
+
+# 3D-UNet uses equal issue mode because it has non-uniform inputs
+3d-unet.*.sample_concatenate_permutation = 1
+
+# R-GAT uses equal issue mode because it may have non-uniform inputs
+rgat.*.sample_concatenate_permutation = 1
+
+# LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenario
+gptj.*.sample_concatenate_permutation = 1
+llama2-70b.*.sample_concatenate_permutation = 1
+llama2-70b-interactive.*.sample_concatenate_permutation = 1
+mixtral-8x7b.*.sample_concatenate_permutation = 1
+llama3_1-405b.*.sample_concatenate_permutation = 1
+llama3_1-405b-interactive.*.sample_concatenate_permutation = 1
+llama3_1-8b.*.sample_concatenate_permutation = 1
+llama3_1-8b-edge.*.sample_concatenate_permutation = 1
+llama3_1-8b-interactive.*.sample_concatenate_permutation = 1
+deepseek-r1.*.sample_concatenate_permutation = 1
+whisper.*.sample_concatenate_permutation = 1
+
+*.Server.target_latency = 10
+*.Server.target_latency_percentile = 99
+*.Server.target_duration = 0
+*.Server.min_duration = 600000
+resnet50.Server.target_latency = 15
+retinanet.Server.target_latency = 100
+bert.Server.target_latency = 130
+dlrm.Server.target_latency = 60
+dlrm-v2.Server.target_latency = 60
+rnnt.Server.target_latency = 1000
+gptj.Server.target_latency = 20000
+stable-diffusion-xl.Server.target_latency = 20000
+# Benchmarks that measure token latencies
+llama2-70b.*.use_token_latencies = 1
+llama2-70b-interactive.*.use_token_latencies = 1
+mixtral-8x7b.*.use_token_latencies = 1
+llama3_1-405b.*.use_token_latencies = 1
+llama3_1-405b-interactive.*.use_token_latencies = 1
+llama3_1-8b.*.use_token_latencies = 1
+llama3_1-8b-edge.*.use_token_latencies = 1
+llama3_1-8b-interactive.*.use_token_latencies = 1
+deepseek-r1.*.use_token_latencies = 1
+whisper.*.use_token_latencies = 1
+
+# gptj benchmark infers token latencies
+gptj.*.infer_token_latencies = 1
+gptj.*.token_latency_scaling_factor = 69
+# Only ttft and tpot are tracked for the llama2-70b, mixtral-8x7B & llama3_1-405b benchmark therefore target_latency = 0
+llama2-70b.Server.target_latency = 0
+llama2-70b.Server.ttft_latency = 2000
+llama2-70b.Server.tpot_latency = 200
+
+# Target Latencies for interactive setting
+llama2-70b-interactive.Server.target_latency = 0
+llama2-70b-interactive.Server.ttft_latency = 450
+llama2-70b-interactive.Server.tpot_latency = 40
+
+mixtral-8x7b.Server.target_latency = 0
+mixtral-8x7b.Server.ttft_latency = 2000
+mixtral-8x7b.Server.tpot_latency = 200
+
+llama3_1-405b.Server.target_latency = 0
+llama3_1-405b.Server.ttft_latency = 6000
+llama3_1-405b.Server.tpot_latency = 175
+
+# Target Latencies for interactive setting
+llama3_1-405b-interactive.Server.target_latency = 0
+llama3_1-405b-interactive.Server.ttft_latency = 4500
+llama3_1-405b-interactive.Server.tpot_latency = 80
+
+
+llama3_1-8b.Server.target_latency = 0
+llama3_1-8b.Server.ttft_latency = 2000
+llama3_1-8b.Server.tpot_latency = 100
+
+# Target Latencies for interactive setting
+llama3_1-8b-interactive.Server.target_latency = 0
+llama3_1-8b-interactive.Server.ttft_latency = 500
+llama3_1-8b-interactive.Server.tpot_latency = 30
+
+deepseek-r1.Server.target_latency = 0
+deepseek-r1.Server.ttft_latency = 2000
+deepseek-r1.Server.tpot_latency = 80
+
+*.Offline.target_latency_percentile = 90
+*.Offline.min_duration = 600000
+
+# In Offline scenario, we always have one query. But LoadGen maps this to
+# min_sample_count internally in Offline scenario. If the dataset size is larger
+# than 24576 we limit the min_query_count to 24576 and otherwise we use
+# the dataset size as the limit
+
+resnet50.Offline.min_query_count = 24576
+retinanet.Offline.min_query_count = 24576
+dlrm-v2.Offline.min_query_count = 24576
+bert.Offline.min_query_count = 10833
+gptj.Offline.min_query_count = 13368
+rnnt.Offline.min_query_count = 2513
+3d-unet.Offline.min_query_count = 43
+stable-diffusion-xl.Offline.min_query_count = 5000
+llama2-70b.Offline.min_query_count = 24576
+llama3_1-405b.Offline.min_query_count = 8313
+llama3_1-8b.Offline.min_query_count = 13368
+llama3_1-8b-edge.Offline.min_query_count = 5000
+mixtral-8x7b.Offline.min_query_count = 15000
+rgat.Offline.min_query_count = 788379
+deepseek-r1.Offline.min_query_count = 4388
+whisper.Offline.min_query_count = 1633
+
+# These fields should be defined and overridden by user.conf.
+*.SingleStream.target_latency = 10
+*.MultiStream.target_latency = 80
+*.Server.target_qps = 1.0
+*.Offline.target_qps = 1.0
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/mlperf_conf.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/mlperf_conf.h
new file mode 100644
index 000000000..7859e0139
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/mlperf_conf.h
@@ -0,0 +1,167 @@
+const char* mlperf_conf =
+"# The format of this config file is 'key = value'.\n"
+"# The key has the format 'model.scenario.key'. Value is mostly int64_t.\n"
+"# Model maybe '*' as wildcard. In that case the value applies to all models.\n"
+"# All times are in milli seconds\n"
+"\n"
+"# Set performance_sample_count for each model.\n"
+"# User can optionally set this to higher values in user.conf.\n"
+"resnet50.*.performance_sample_count_override = 1024\n"
+"ssd-mobilenet.*.performance_sample_count_override = 256\n"
+"retinanet.*.performance_sample_count_override = 64\n"
+"bert.*.performance_sample_count_override = 10833\n"
+"dlrm.*.performance_sample_count_override = 204800\n"
+"dlrm-v2.*.performance_sample_count_override = 204800\n"
+"rnnt.*.performance_sample_count_override = 2513\n"
+"gptj.*.performance_sample_count_override = 13368\n"
+"mixtral-8x7b.*.performance_sample_count_override = 15000\n"
+"llama2-70b.*.performance_sample_count_override = 24576\n"
+"llama2-70b-interactive.*.performance_sample_count_override = 24576\n"
+"llama3_1-405b.*.performance_sample_count_override = 8313\n"
+"llama3_1-405b-interactive.*.performance_sample_count_override = 8313\n"
+"llama3_1-8b.*.performance_sample_count_override = 13368\n"
+"llama3_1-8b-edge.*.performance_sample_count_override = 5000\n"
+"llama3_1-8b-interactive.*.performance_sample_count_override = 13368\n"
+"stable-diffusion-xl.*.performance_sample_count_override = 5000\n"
+"rgat.*.performance_sample_count_override = 788379\n"
+"pointpainting.*.performance_sample_count_override = 1024\n"
+"deepseek-r1.*.performance_sample_count_override = 4388\n"
+"whisper.*.performance_sample_count_override = 1633\n"
+"# set to 0 to let entire sample set to be performance sample\n"
+"3d-unet.*.performance_sample_count_override = 0\n"
+"\n"
+"# Set seeds.\n"
+"*.*.qsl_rng_seed = 1780908523862526354\n"
+"*.*.sample_index_rng_seed = 14771362308971278857\n"
+"*.*.schedule_rng_seed = 18209322760996052031\n"
+"\n"
+"# Set seeds for TEST_05 (not needed from v5.0 onwards)\n"
+"*.*.test05_qsl_rng_seed = 7975553102935885558\n"
+"*.*.test05_sample_index_rng_seed = 11403566307062068064\n"
+"*.*.test05_schedule_rng_seed = 15816800565822761601\n"
+"\n"
+"*.SingleStream.target_latency_percentile = 90\n"
+"pointpainting.SingleStream.target_latency_percentile = 99.9\n"
+"*.SingleStream.min_duration = 600000\n"
+"\n"
+"*.MultiStream.target_latency_percentile = 99\n"
+"*.MultiStream.samples_per_query = 8\n"
+"*.MultiStream.min_duration = 600000\n"
+"*.MultiStream.min_query_count = 662\n"
+"retinanet.MultiStream.target_latency = 528\n"
+"\n"
+"# 3D-UNet uses equal issue mode because it has non-uniform inputs\n"
+"3d-unet.*.sample_concatenate_permutation = 1\n"
+"\n"
+"# R-GAT uses equal issue mode because it may have non-uniform inputs\n"
+"rgat.*.sample_concatenate_permutation = 1\n"
+"\n"
+"# LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenario\n"
+"gptj.*.sample_concatenate_permutation = 1\n"
+"llama2-70b.*.sample_concatenate_permutation = 1\n"
+"llama2-70b-interactive.*.sample_concatenate_permutation = 1\n"
+"mixtral-8x7b.*.sample_concatenate_permutation = 1\n"
+"llama3_1-405b.*.sample_concatenate_permutation = 1\n"
+"llama3_1-405b-interactive.*.sample_concatenate_permutation = 1\n"
+"llama3_1-8b.*.sample_concatenate_permutation = 1\n"
+"llama3_1-8b-edge.*.sample_concatenate_permutation = 1\n"
+"llama3_1-8b-interactive.*.sample_concatenate_permutation = 1\n"
+"deepseek-r1.*.sample_concatenate_permutation = 1\n"
+"whisper.*.sample_concatenate_permutation = 1\n"
+"\n"
+"*.Server.target_latency = 10\n"
+"*.Server.target_latency_percentile = 99\n"
+"*.Server.target_duration = 0\n"
+"*.Server.min_duration = 600000\n"
+"resnet50.Server.target_latency = 15\n"
+"retinanet.Server.target_latency = 100\n"
+"bert.Server.target_latency = 130\n"
+"dlrm.Server.target_latency = 60\n"
+"dlrm-v2.Server.target_latency = 60\n"
+"rnnt.Server.target_latency = 1000\n"
+"gptj.Server.target_latency = 20000\n"
+"stable-diffusion-xl.Server.target_latency = 20000\n"
+"# Benchmarks that measure token latencies\n"
+"llama2-70b.*.use_token_latencies = 1\n"
+"llama2-70b-interactive.*.use_token_latencies = 1\n"
+"mixtral-8x7b.*.use_token_latencies = 1\n"
+"llama3_1-405b.*.use_token_latencies = 1\n"
+"llama3_1-405b-interactive.*.use_token_latencies = 1\n"
+"llama3_1-8b.*.use_token_latencies = 1\n"
+"llama3_1-8b-edge.*.use_token_latencies = 1\n"
+"llama3_1-8b-interactive.*.use_token_latencies = 1\n"
+"deepseek-r1.*.use_token_latencies = 1\n"
+"whisper.*.use_token_latencies = 1\n"
+"\n"
+"# gptj benchmark infers token latencies\n"
+"gptj.*.infer_token_latencies = 1\n"
+"gptj.*.token_latency_scaling_factor = 69\n"
+"# Only ttft and tpot are tracked for the llama2-70b, mixtral-8x7B & llama3_1-405b benchmark therefore target_latency = 0\n"
+"llama2-70b.Server.target_latency = 0\n"
+"llama2-70b.Server.ttft_latency = 2000\n"
+"llama2-70b.Server.tpot_latency = 200\n"
+"\n"
+"# Target Latencies for interactive setting\n"
+"llama2-70b-interactive.Server.target_latency = 0\n"
+"llama2-70b-interactive.Server.ttft_latency = 450\n"
+"llama2-70b-interactive.Server.tpot_latency = 40\n"
+"\n"
+"mixtral-8x7b.Server.target_latency = 0\n"
+"mixtral-8x7b.Server.ttft_latency = 2000\n"
+"mixtral-8x7b.Server.tpot_latency = 200\n"
+"\n"
+"llama3_1-405b.Server.target_latency = 0\n"
+"llama3_1-405b.Server.ttft_latency = 6000\n"
+"llama3_1-405b.Server.tpot_latency = 175\n"
+"\n"
+"# Target Latencies for interactive setting\n"
+"llama3_1-405b-interactive.Server.target_latency = 0\n"
+"llama3_1-405b-interactive.Server.ttft_latency = 4500\n"
+"llama3_1-405b-interactive.Server.tpot_latency = 80\n"
+"\n"
+"\n"
+"llama3_1-8b.Server.target_latency = 0\n"
+"llama3_1-8b.Server.ttft_latency = 2000\n"
+"llama3_1-8b.Server.tpot_latency = 100\n"
+"\n"
+"# Target Latencies for interactive setting\n"
+"llama3_1-8b-interactive.Server.target_latency = 0\n"
+"llama3_1-8b-interactive.Server.ttft_latency = 500\n"
+"llama3_1-8b-interactive.Server.tpot_latency = 30\n"
+"\n"
+"deepseek-r1.Server.target_latency = 0\n"
+"deepseek-r1.Server.ttft_latency = 2000\n"
+"deepseek-r1.Server.tpot_latency = 80\n"
+"\n"
+"*.Offline.target_latency_percentile = 90\n"
+"*.Offline.min_duration = 600000\n"
+"\n"
+"# In Offline scenario, we always have one query. But LoadGen maps this to\n"
+"# min_sample_count internally in Offline scenario. If the dataset size is larger\n"
+"# than 24576 we limit the min_query_count to 24576 and otherwise we use\n"
+"# the dataset size as the limit\n"
+"\n"
+"resnet50.Offline.min_query_count = 24576\n"
+"retinanet.Offline.min_query_count = 24576\n"
+"dlrm-v2.Offline.min_query_count = 24576\n"
+"bert.Offline.min_query_count = 10833\n"
+"gptj.Offline.min_query_count = 13368\n"
+"rnnt.Offline.min_query_count = 2513\n"
+"3d-unet.Offline.min_query_count = 43\n"
+"stable-diffusion-xl.Offline.min_query_count = 5000\n"
+"llama2-70b.Offline.min_query_count = 24576\n"
+"llama3_1-405b.Offline.min_query_count = 8313\n"
+"llama3_1-8b.Offline.min_query_count = 13368\n"
+"llama3_1-8b-edge.Offline.min_query_count = 5000\n"
+"mixtral-8x7b.Offline.min_query_count = 15000\n"
+"rgat.Offline.min_query_count = 788379\n"
+"deepseek-r1.Offline.min_query_count = 4388\n"
+"whisper.Offline.min_query_count = 1633\n"
+"\n"
+"# These fields should be defined and overridden by user.conf.\n"
+"*.SingleStream.target_latency = 10\n"
+"*.MultiStream.target_latency = 80\n"
+"*.Server.target_qps = 1.0\n"
+"*.Offline.target_qps = 1.0\n"
+"\n"
+"";
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/pyproject.toml b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/pyproject.toml
new file mode 100755
index 000000000..6f0ae06f0
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/pyproject.toml
@@ -0,0 +1,7 @@
+[build-system]
+requires = ["setuptools>=42", "wheel", "pybind11==2.11.1"]
+build-backend = "setuptools.build_meta:__legacy__"
+
+[tool.cibuildwheel]
+environment = "CFLAGS='-std=c++14'"
+build = "cp3{7,8,9,10,11,12,13}-*"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_dispatch_library.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_dispatch_library.h
new file mode 100644
index 000000000..6c594efe0
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_dispatch_library.h
@@ -0,0 +1,42 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Defines the QueryDispatchLibrary interface.
+
+#ifndef MLPERF_LOADGEN_QUERY_DISPATCH_LIBRARY_H
+#define MLPERF_LOADGEN_QUERY_DISPATCH_LIBRARY_H
+
+#include <string>
+
+#include "system_under_test.h"
+
+namespace mlperf {
+
+/// \addtogroup LoadgenAPI
+/// @{
+
+/// \brief The interface a client implements for the LoadGen over the network to
+/// test. The API inherits the System_under_test.h API When working in LON mode
+/// the QueryDispatchLibrary class is used and natively Upcasted to the
+/// QueryDispatchLibrary class.
+
+class QueryDispatchLibrary : public SystemUnderTest {
+ public:
+  virtual ~QueryDispatchLibrary() = default;
+};
+
+/// @}
+
+}  // namespace mlperf
+
+#endif  // MLPERF_LOADGEN_QUERY_DISPATCH_LIBRARY_H
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_sample.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_sample.h
new file mode 100644
index 000000000..e740be99e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_sample.h
@@ -0,0 +1,91 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Defines the structs involved in issuing a query and responding to
+/// a query.
+/// \details These are broken out into their own files since they are exposed
+/// as part of the C API and we want to avoid C clients including C++ code.
+
+#ifndef MLPERF_LOADGEN_QUERY_SAMPLE_H_
+#define MLPERF_LOADGEN_QUERY_SAMPLE_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <iostream>
+
+namespace mlperf {
+
+/// \addtogroup LoadgenAPI
+/// @{
+
+/// \brief Represents a unique identifier for a sample of an issued query.
+/// \details As currently implemented, the id is a pointer to an internal
+/// loadgen struct whose value will never be zero/null.
+typedef uintptr_t ResponseId;
+constexpr ResponseId kResponseIdReserved = 0;
+
+/// \brief An index into the QuerySampleLibrary corresponding to a
+/// single sample.
+typedef size_t QuerySampleIndex;
+
+/// \brief Represents the smallest unit of input inference can run on.
+/// A query consists of one or more samples.
+struct QuerySample {
+  ResponseId id;
+  QuerySampleIndex index;
+};
+
+/// \brief Represents a single response to QuerySample
+struct QuerySampleResponse {
+  ResponseId id;
+  uintptr_t data;
+  size_t size;  ///< Size in bytes.
+  int64_t n_tokens;
+
+ public:
+  QuerySampleResponse(ResponseId id, uintptr_t data, size_t size,
+                      int64_t n_tokens)
+      : id(id),
+        data(data),
+        size(size),
+        n_tokens(n_tokens){
+            // std::cout << "Initialized with 4 arguments, n_tokens: " <<
+            // n_tokens <<"\n";
+        };
+  QuerySampleResponse(ResponseId id, uintptr_t data, size_t size)
+      : id(id),
+        data(data),
+        size(size),
+        n_tokens(0){
+            // std::cout << "Initialized with 3 arguments, n_tokens: " <<
+            // n_tokens <<"\n";
+        };
+  QuerySampleResponse()
+      : id(0),
+        data(0),
+        size(0),
+        n_tokens(0){
+            // std::cout << "Initialized with 0 arguments, n_tokens: " <<
+            // n_tokens <<"\n";
+        };
+};
+
+/// \brief A latency in nanoseconds, as recorded by the loadgen.
+typedef int64_t QuerySampleLatency;
+
+/// @}
+
+}  // namespace mlperf
+
+#endif  // MLPERF_LOADGEN_QUERY_SAMPLE_H_
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_sample_library.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_sample_library.h
new file mode 100644
index 000000000..7258068cb
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_sample_library.h
@@ -0,0 +1,75 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Defines the QuerySampleLibrary interface.
+
+#ifndef MLPERF_LOADGEN_QUERY_SAMPLE_LIBRARY_H
+#define MLPERF_LOADGEN_QUERY_SAMPLE_LIBRARY_H
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "query_sample.h"
+
+namespace mlperf {
+
+/// \addtogroup LoadgenAPI
+/// @{
+
+/// \brief The interface a client implements to coordinate with the loadgen
+/// which samples should be loaded.
+class QuerySampleLibrary {
+ public:
+  virtual ~QuerySampleLibrary() {}
+
+  /// \brief A human readable name for the model.
+  virtual const std::string& Name() = 0;
+
+  /// \brief Total number of samples in library.
+  virtual size_t TotalSampleCount() = 0;
+
+  /// \brief The number of samples that are guaranteed to fit in RAM.
+  virtual size_t PerformanceSampleCount() = 0;
+
+  /// \brief Loads the requested query samples into memory.
+  /// \details Paired with calls to UnloadSamplesFromRam.
+  /// In the MultiStream scenarios:
+  ///   * Samples will appear more than once.
+  ///   * SystemUnderTest::IssueQuery will only be called with a set of samples
+  ///     that are neighbors in the vector of samples here, which helps
+  ///     SUTs that need the queries to be contiguous.
+  /// In all other scenarios:
+  ///   * A previously loaded sample will not be loaded again.
+  virtual void LoadSamplesToRam(
+      const std::vector<QuerySampleIndex>& samples) = 0;
+
+  /// \brief Unloads the requested query samples from memory.
+  /// \details In the MultiStream scenarios:
+  ///   * Samples may be unloaded the same number of times they were loaded;
+  ///     however, if the implementation de-dups loaded samples rather than
+  ///     loading samples into contiguous memory, it may unload a sample the
+  ///     first time they see it unloaded without a refcounting scheme, ignoring
+  ///     subsequent unloads. A refcounting scheme would also work, but is not
+  ///     a requirement.
+  /// In all other scenarios:
+  ///   * A previously unloaded sample will not be unloaded again.
+  virtual void UnloadSamplesFromRam(
+      const std::vector<QuerySampleIndex>& samples) = 0;
+};
+
+/// @}
+
+}  // namespace mlperf
+
+#endif  // MLPERF_LOADGEN_QUERY_SAMPLE_LIBRARY_H
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/requirements.txt b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/requirements.txt
new file mode 100644
index 000000000..e47c59fd7
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/requirements.txt
@@ -0,0 +1 @@
+pybind11
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/results.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/results.cc
new file mode 100644
index 000000000..f7c61af43
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/results.cc
@@ -0,0 +1,856 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "results.h"
+
+#include "early_stopping.h"
+#include "utils.h"
+
+namespace mlperf {
+namespace loadgen {
+
+void PerformanceSummary::ProcessLatencies() {
+  if (pr.sample_latencies.empty()) {
+    return;
+  }
+
+  sample_count = pr.sample_latencies.size();
+
+  QuerySampleLatency accumulated_sample_latency = 0;
+  for (auto latency : pr.sample_latencies) {
+    accumulated_sample_latency += latency;
+  }
+  sample_latency_mean = accumulated_sample_latency / sample_count;
+
+  std::sort(pr.sample_latencies.begin(), pr.sample_latencies.end());
+
+  target_latency_percentile.sample_latency =
+      pr.sample_latencies[sample_count * target_latency_percentile.percentile];
+  sample_latency_min = pr.sample_latencies.front();
+  sample_latency_max = pr.sample_latencies.back();
+  for (auto& lp : latency_percentiles) {
+    assert(lp.percentile >= 0.0);
+    assert(lp.percentile < 1.0);
+    lp.sample_latency = pr.sample_latencies[sample_count * lp.percentile];
+  }
+
+  query_count = pr.queries_issued;
+
+  // Count the number of overlatency queries. Only for Server scenario. Since in
+  // this scenario the number of samples per query is 1, sample_latencies are
+  // used.
+  if (settings.scenario == TestScenario::Server) {
+    QuerySampleLatency max_latency = settings.target_latency.count() + 1;
+    overlatency_query_count =
+        pr.sample_latencies.end() -
+        std::lower_bound(pr.sample_latencies.begin(), pr.sample_latencies.end(),
+                         max_latency);
+  }
+
+  if (settings.use_token_latencies) {
+    ProcessTokenLatencies();
+  }
+
+  // MultiStream only after this point.
+  if (settings.scenario != TestScenario::MultiStream) {
+    return;
+  }
+
+  // Calculate per-query stats.
+  size_t query_count = pr.queries_issued;
+  assert(pr.query_latencies.size() == query_count);
+  std::sort(pr.query_latencies.begin(), pr.query_latencies.end());
+  QuerySampleLatency accumulated_query_latency = 0;
+  for (auto latency : pr.query_latencies) {
+    accumulated_query_latency += latency;
+  }
+  query_latency_mean = accumulated_query_latency / query_count;
+  query_latency_min = pr.query_latencies.front();
+  query_latency_max = pr.query_latencies.back();
+  target_latency_percentile.query_latency =
+      pr.query_latencies[query_count * target_latency_percentile.percentile];
+  for (auto& lp : latency_percentiles) {
+    lp.query_latency = pr.query_latencies[query_count * lp.percentile];
+  }
+}
+
+void PerformanceSummary::ProcessTokenLatencies() {
+  constexpr auto nTokenInvalid = std::numeric_limits<int64_t>::min();
+  token_count = 0;
+  for (auto n_tokens : pr.token_results.tokens_per_sample) {
+    if (n_tokens != nTokenInvalid) token_count += n_tokens;
+  }
+  if (pr.token_results.first_token_latencies.empty()) {
+    return;
+  }
+  QuerySampleLatency accumulated_first_token_latency = 0;
+  for (auto latency : pr.token_results.first_token_latencies) {
+    accumulated_first_token_latency += latency;
+  }
+  first_token_latency_mean = accumulated_first_token_latency / sample_count;
+  QuerySampleLatency accumulated_tpot = 0;
+  for (auto latency : pr.token_results.time_per_output_token_arr) {
+    accumulated_tpot += latency;
+  }
+  time_per_output_token_mean = accumulated_tpot / sample_count;
+  std::sort(pr.token_results.first_token_latencies.begin(),
+            pr.token_results.first_token_latencies.end());
+  std::sort(pr.token_results.time_per_output_token_arr.begin(),
+            pr.token_results.time_per_output_token_arr.end());
+
+  token_target_latency_percentile.sample_latency =
+      pr.token_results
+          .first_token_latencies[sample_count *
+                                 token_target_latency_percentile.percentile];
+  first_token_latency_min = pr.token_results.first_token_latencies.front();
+  first_token_latency_max = pr.token_results.first_token_latencies.back();
+  for (auto& lp : token_latency_percentiles) {
+    assert(lp.percentile >= 0.0);
+    assert(lp.percentile < 1.0);
+    lp.sample_latency =
+        pr.token_results.first_token_latencies[sample_count * lp.percentile];
+  }
+
+  target_tpot_percentile.sample_latency =
+      pr.token_results
+          .time_per_output_token_arr[sample_count *
+                                     target_tpot_percentile.percentile];
+  time_per_output_token_min =
+      pr.token_results.time_per_output_token_arr.front();
+  time_per_output_token_max = pr.token_results.time_per_output_token_arr.back();
+  for (auto& lp : tpot_percentiles) {
+    assert(lp.percentile >= 0.0);
+    assert(lp.percentile < 1.0);
+    lp.sample_latency =
+        pr.token_results
+            .time_per_output_token_arr[sample_count * lp.percentile];
+  }
+
+  if (settings.scenario == TestScenario::Server) {
+    // TODO: Maybe another target latency needs to be added?
+    QuerySampleLatency max_latency = settings.target_latency.count() + 1;
+    overlatency_first_token_count =
+        pr.token_results.first_token_latencies.end() -
+        std::lower_bound(pr.token_results.first_token_latencies.begin(),
+                         pr.token_results.first_token_latencies.end(),
+                         max_latency);
+  }
+}
+
+bool PerformanceSummary::EarlyStopping(
+    std::string* recommendation, int64_t queries_issued,
+    std::vector<QuerySampleLatency>* sample_latencies,
+    std::vector<QuerySampleLatency>* query_latencies,
+    std::chrono::nanoseconds target_latency) {
+  recommendation->clear();
+
+  MinPassingQueriesFinder find_min_passing;
+  double confidence = 0.99;
+  double tolerance = 0.0;
+
+  ProcessLatencies();
+  switch (settings.scenario) {
+    case TestScenario::SingleStream: {
+      // TODO: Grab multistream percentile from settings, instead of hardcoding.
+      double multi_stream_percentile = 0.99;
+      int64_t t = 1;
+      int64_t h_min = find_min_passing(1, target_latency_percentile.percentile,
+                                       tolerance, confidence);
+      int64_t h = h_min;
+      if (queries_issued < h_min + 1) {
+        *recommendation =
+            " * Only processed " + std::to_string(queries_issued) +
+            " queries.\n * Need to process at least " +
+            std::to_string(h_min + 1) + " queries for early stopping.";
+        return false;
+      } else {
+        for (int64_t i = 2; i < queries_issued + 1; ++i) {
+          h = find_min_passing(i, target_latency_percentile.percentile,
+                               tolerance, confidence);
+          if (queries_issued < h + i) {
+            t = i - 1;
+            break;
+          }
+        }
+      }
+      QuerySampleLatency percentile_estimate =
+          (*sample_latencies)[queries_issued - t];
+      *recommendation =
+          " * Processed at least " + std::to_string(h_min + 1) + " queries (" +
+          std::to_string(queries_issued) + ").\n" + " * Would discard " +
+          std::to_string(t - 1) + " highest latency queries.\n" +
+          " * Early stopping " +
+          DoubleToString(target_latency_percentile.percentile * 100, 1) +
+          "th percentile estimate: " + std::to_string(percentile_estimate);
+      early_stopping_latency_ss = percentile_estimate;
+
+      // Early stopping estimate for 99%ile (used for infering multi-stream from
+      // single-stream)
+      t = 1;
+      h_min =
+          find_min_passing(1, multi_stream_percentile, tolerance, confidence);
+      h = h_min;
+      if (queries_issued < h_min + 1) {
+        *recommendation +=
+            "\n * Not enough queries processed for " +
+            DoubleToString(multi_stream_percentile * 100, 1) +
+            "th percentile\n" +
+            " early stopping estimate (would need to process at\n least " +
+            std::to_string(h_min + 1) + " total queries).";
+      } else {
+        for (int64_t i = 2; i < queries_issued + 1; ++i) {
+          h = find_min_passing(i, multi_stream_percentile, tolerance,
+                               confidence);
+          if (queries_issued < h + i) {
+            t = i - 1;
+            break;
+          }
+        }
+        percentile_estimate = (*sample_latencies)[queries_issued - t];
+        *recommendation +=
+            "\n * Early stopping " +
+            DoubleToString(multi_stream_percentile * 100, 1) +
+            "th percentile estimate: " + std::to_string(percentile_estimate);
+        early_stopping_latency_ms = percentile_estimate;
+      }
+      break;
+    }
+    case TestScenario::Server: {
+      int64_t t =
+          std::count_if((*sample_latencies).begin(), (*sample_latencies).end(),
+                        [=](auto const& latency) {
+                          return latency > target_latency.count();
+                        });
+      int64_t h = find_min_passing(t, target_latency_percentile.percentile,
+                                   tolerance, confidence);
+      if (queries_issued >= h + t) {
+        *recommendation = " * Run successful.";
+      } else {
+        *recommendation = " * Run unsuccessful.\n * Processed " +
+                          std::to_string(queries_issued) + " queries.\n" +
+                          " * Would need to run at least " +
+                          std::to_string(h + t - queries_issued) +
+                          " more queries,\n with the run being successful if "
+                          "every additional\n query were under latency.";
+        return false;
+      }
+      break;
+    }
+    case TestScenario::MultiStream: {
+      int64_t t = 1;
+      int64_t h_min = find_min_passing(1, target_latency_percentile.percentile,
+                                       tolerance, confidence);
+      int64_t h = h_min;
+      if (queries_issued < h_min + 1) {
+        *recommendation =
+            " * Only processed " + std::to_string(queries_issued) +
+            " queries.\n * Need to process at least " +
+            std::to_string(h_min + 1) + " queries for early stopping.";
+        return false;
+      } else {
+        for (int64_t i = 2; i < queries_issued + 1; ++i) {
+          h = find_min_passing(i, target_latency_percentile.percentile,
+                               tolerance, confidence);
+          if (queries_issued < h + i) {
+            t = i - 1;
+            break;
+          }
+        }
+      }
+      QuerySampleLatency percentile_estimate =
+          (*query_latencies)[queries_issued - t];
+      *recommendation =
+          " * Processed at least " + std::to_string(h_min + 1) + " queries (" +
+          std::to_string(queries_issued) + ").\n" + " * Would discard " +
+          std::to_string(t - 1) + " highest latency queries.\n" +
+          " * Early stopping " +
+          DoubleToString(target_latency_percentile.percentile * 100, 1) +
+          "th percentile estimate: " + std::to_string(percentile_estimate);
+      early_stopping_latency_ms = percentile_estimate;
+      break;
+    }
+    case TestScenario::Offline:
+      break;
+  }
+  return true;
+}
+
+bool PerformanceSummary::MinDurationMet(std::string* recommendation) {
+  recommendation->clear();
+  const double min_duration = DurationToSeconds(settings.min_duration);
+  bool min_duration_met = false;
+  switch (settings.scenario) {
+    case TestScenario::Offline:
+      min_duration_met = pr.max_latency >= min_duration;
+      break;
+    case TestScenario::Server:
+      min_duration_met = pr.final_query_scheduled_time >= min_duration;
+      break;
+    case TestScenario::SingleStream:
+    case TestScenario::MultiStream:
+      min_duration_met = pr.final_query_issued_time >= min_duration;
+      break;
+  }
+  if (min_duration_met) {
+    return true;
+  }
+
+  switch (settings.scenario) {
+    case TestScenario::SingleStream:
+    case TestScenario::MultiStream:
+      *recommendation =
+          "Decrease the expected latency so the loadgen pre-generates more "
+          "queries.";
+      break;
+    case TestScenario::Server:
+      *recommendation =
+          "Increase the target QPS so the loadgen pre-generates more queries.";
+      break;
+    case TestScenario::Offline:
+      *recommendation =
+          "Increase expected QPS so the loadgen pre-generates a larger "
+          "(coalesced) query.";
+      break;
+  }
+  return false;
+}
+
+bool PerformanceSummary::MinQueriesMet() {
+  return pr.queries_issued >= settings.min_query_count;
+}
+
+bool PerformanceSummary::MinSamplesMet() {
+  return sample_count >= settings.min_sample_count;
+}
+
+bool PerformanceSummary::HasPerfConstraints() {
+  return settings.scenario == TestScenario::Server;
+}
+
+bool PerformanceSummary::PerfConstraintsMet(std::string* recommendation) {
+  recommendation->clear();
+  bool perf_constraints_met = true;
+  switch (settings.scenario) {
+    case TestScenario::SingleStream:
+    case TestScenario::MultiStream:
+      break;
+    case TestScenario::Server:
+      ProcessLatencies();
+      if (!settings.use_token_latencies) {
+        if (target_latency_percentile.sample_latency >
+            settings.target_latency.count()) {
+          *recommendation = "Reduce target QPS to improve latency.";
+          perf_constraints_met = false;
+        }
+      } else {
+        if (token_target_latency_percentile.sample_latency >
+            settings.server_ttft_latency) {
+          *recommendation =
+              "TTFT constrain not met: Reduce target QPS to improve latency.";
+          perf_constraints_met = false;
+        }
+
+        if (target_tpot_percentile.sample_latency >
+            settings.server_tpot_latency) {
+          if (recommendation->empty()) {
+            *recommendation =
+                "TPOT constrain not met: Reduce target QPS to improve latency.";
+          } else {
+            recommendation->append(
+                "\n * TPOT constrain not met: Reduce target QPS to improve "
+                "latency.");
+          }
+          perf_constraints_met = false;
+        }
+      }
+      break;
+    case TestScenario::Offline:
+      break;
+  }
+  return perf_constraints_met;
+}
+
+void PerformanceSummary::LogSummary(AsyncSummary& summary) {
+  ProcessLatencies();
+
+  summary(
+      "================================================\n"
+      "MLPerf Results Summary\n"
+      "================================================");
+  summary("SUT name : ", sut_name);
+  summary("Scenario : ", ToString(settings.scenario));
+  summary("Mode     : ", ToString(settings.mode));
+
+  switch (settings.scenario) {
+    case TestScenario::SingleStream: {
+      summary(DoubleToString(target_latency_percentile.percentile * 100, 1) +
+                  "th percentile latency (ns) : ",
+              target_latency_percentile.sample_latency);
+      break;
+    }
+    case TestScenario::MultiStream: {
+      summary(DoubleToString(target_latency_percentile.percentile * 100, 1) +
+                  "th percentile latency (ns) : ",
+              target_latency_percentile.query_latency);
+      break;
+    }
+    case TestScenario::Server: {
+      // Subtract 1 from sample count since the start of the final sample
+      // represents the open end of the time range: i.e. [begin, end).
+      // This makes sense since:
+      // a) QPS doesn't apply if there's only one sample; it's pure latency.
+      // b) If you have precisely 1k QPS, there will be a sample exactly on
+      //    the 1 second time point; but that would be the 1001th sample in
+      //    the stream. Given the first 1001 queries, the QPS is
+      //    1000 queries / 1 second.
+      // TODO: make a more permanent solution
+      double qps_as_completed =
+          (sample_count - 1) / pr.final_query_all_samples_done_time;
+      summary("Completed samples per second    : ",
+              DoubleToString(qps_as_completed));
+      break;
+    }
+    case TestScenario::Offline: {
+      double samples_per_second = sample_count / pr.max_latency;
+      summary("Samples per second: ", samples_per_second);
+      break;
+    }
+  }
+
+  if (settings.use_token_latencies) {
+    switch (settings.scenario) {
+      case TestScenario::SingleStream: {
+        summary(DoubleToString(token_target_latency_percentile.percentile * 100,
+                               1) +
+                    "th first token percentile latency (ns) : ",
+                token_target_latency_percentile.sample_latency);
+        break;
+      }
+      case TestScenario::MultiStream: {
+        summary(DoubleToString(token_target_latency_percentile.percentile * 100,
+                               1) +
+                    "th first token percentile latency (ns) : ",
+                token_target_latency_percentile.sample_latency);
+        break;
+      }
+      case TestScenario::Offline: {
+        double tokens_per_second = token_count / pr.max_latency;
+        summary("Tokens per second: ", tokens_per_second);
+        break;
+      }
+      case TestScenario::Server:
+        double tps_as_completed =
+            token_count / pr.final_query_all_samples_done_time;
+        summary("Completed tokens per second: ",
+                DoubleToString(tps_as_completed));
+        break;
+    }
+  }
+
+  if (settings.infer_token_latencies) {
+    switch (settings.scenario) {
+      case TestScenario::SingleStream: {
+        break;
+      }
+      case TestScenario::MultiStream: {
+        break;
+      }
+      case TestScenario::Offline: {
+        double tokens_per_second = settings.token_latency_scaling_factor *
+                                   sample_count / pr.max_latency;
+        summary("Tokens per second (inferred): ", tokens_per_second);
+        break;
+      }
+      case TestScenario::Server:
+        double tps_as_completed = settings.token_latency_scaling_factor *
+                                  (sample_count - 1) /
+                                  pr.final_query_all_samples_done_time;
+        summary("Completed tokens per second (inferred): ",
+                DoubleToString(tps_as_completed));
+        break;
+    }
+  }
+
+  std::string min_duration_recommendation;
+  std::string perf_constraints_recommendation;
+  std::string early_stopping_recommendation;
+  std::string early_stopping_ttft_recommendation;
+  std::string early_stopping_tpot_recommendation;
+
+  bool min_duration_met = MinDurationMet(&min_duration_recommendation);
+  bool min_queries_met = MinQueriesMet() && MinSamplesMet();
+  bool early_stopping_met = true;
+  if (!settings.use_token_latencies) {
+    early_stopping_met = EarlyStopping(
+        &early_stopping_recommendation, pr.queries_issued, &pr.sample_latencies,
+        &pr.query_latencies, settings.target_latency);
+  } else {
+    early_stopping_met =
+        EarlyStopping(&early_stopping_tpot_recommendation, pr.queries_issued,
+                      &pr.token_results.time_per_output_token_arr,
+                      &pr.query_latencies,
+                      std::chrono::nanoseconds(settings.server_tpot_latency)) &&
+        EarlyStopping(&early_stopping_ttft_recommendation, pr.queries_issued,
+                      &pr.token_results.first_token_latencies,
+                      &pr.query_latencies,
+                      std::chrono::nanoseconds(settings.server_ttft_latency));
+  }
+  bool perf_constraints_met =
+      PerfConstraintsMet(&perf_constraints_recommendation);
+  bool all_constraints_met = min_duration_met && min_queries_met &&
+                             perf_constraints_met && early_stopping_met;
+  summary("Result is : ", all_constraints_met ? "VALID" : "INVALID");
+  if (HasPerfConstraints()) {
+    summary("  Performance constraints satisfied : ",
+            perf_constraints_met ? "Yes" : "NO");
+  }
+  summary("  Min duration satisfied : ", min_duration_met ? "Yes" : "NO");
+  summary("  Min queries satisfied : ", min_queries_met ? "Yes" : "NO");
+  summary("  Early stopping satisfied: ", early_stopping_met ? "Yes" : "NO");
+
+  if (!all_constraints_met) {
+    summary("Recommendations:");
+    if (!perf_constraints_met) {
+      summary(" * " + perf_constraints_recommendation);
+    }
+    if (!min_duration_met) {
+      summary(" * " + min_duration_recommendation);
+    }
+    if (!min_queries_met) {
+      summary(
+          " * The test exited early, before enough queries were issued.\n"
+          "   See the detailed log for why this may have occurred.");
+    }
+  }
+  // Early stopping results
+  if (settings.scenario == TestScenario::SingleStream ||
+      settings.scenario == TestScenario::Server ||
+      settings.scenario == TestScenario::MultiStream) {
+    if (!settings.use_token_latencies) {
+      summary("Early Stopping Result:");
+      summary(early_stopping_recommendation);
+    } else {
+      summary("TTFT Early Stopping Result:");
+      summary(early_stopping_ttft_recommendation);
+      summary("TPOT Early Stopping Result:");
+      summary(early_stopping_tpot_recommendation);
+    }
+  }
+
+  summary(
+      "\n"
+      "================================================\n"
+      "Additional Stats\n"
+      "================================================");
+
+  if (settings.scenario == TestScenario::SingleStream) {
+    double qps_w_lg = (sample_count - 1) / pr.final_query_issued_time;
+    double qps_wo_lg = 1 / QuerySampleLatencyToSeconds(sample_latency_mean);
+    summary("QPS w/ loadgen overhead         : " + DoubleToString(qps_w_lg));
+    summary("QPS w/o loadgen overhead        : " + DoubleToString(qps_wo_lg));
+    summary("");
+  } else if (settings.scenario == TestScenario::Server) {
+    // Scheduled samples per second as an additional stat
+    double qps_as_scheduled =
+        (sample_count - 1) / pr.final_query_scheduled_time;
+    summary("Scheduled samples per second : ",
+            DoubleToString(qps_as_scheduled));
+  } else if (settings.scenario == TestScenario::MultiStream) {
+    summary("Per-query latency:  ");
+    summary("Min latency (ns)                : ", query_latency_min);
+    summary("Max latency (ns)                : ", query_latency_max);
+    summary("Mean latency (ns)               : ", query_latency_mean);
+    for (auto& lp : latency_percentiles) {
+      summary(
+          DoubleToString(lp.percentile * 100) + " percentile latency (ns)   : ",
+          lp.query_latency);
+    }
+  }
+
+  if (settings.scenario != TestScenario::MultiStream) {
+    summary("Min latency (ns)                : ", sample_latency_min);
+    summary("Max latency (ns)                : ", sample_latency_max);
+    summary("Mean latency (ns)               : ", sample_latency_mean);
+    for (auto& lp : latency_percentiles) {
+      summary(
+          DoubleToString(lp.percentile * 100) + " percentile latency (ns)   : ",
+          lp.sample_latency);
+    }
+  }
+  if (settings.use_token_latencies) {
+    summary("");
+    if (settings.scenario == TestScenario::SingleStream) {
+      double tps_w_lg = token_count / pr.final_query_issued_time;
+      double tps_wo_lg =
+          ((double)token_count) /
+          (QuerySampleLatencyToSeconds(sample_latency_mean) * sample_count);
+      summary("TPS w/ loadgen overhead         : " + DoubleToString(tps_w_lg));
+      summary("TPS w/o loadgen overhead        : " + DoubleToString(tps_wo_lg));
+
+    } else if (settings.scenario == TestScenario::Server) {
+      double tps_as_completed =
+          token_count / pr.final_query_all_samples_done_time;
+      summary("Completed tokens per second                 : ",
+              DoubleToString(tps_as_completed));
+    }
+
+    if (settings.scenario != TestScenario::Offline) {
+      summary("Min First Token latency (ns)                : ",
+              first_token_latency_min);
+      summary("Max First Token latency (ns)                : ",
+              first_token_latency_max);
+      summary("Mean First Token latency (ns)               : ",
+              first_token_latency_mean);
+      for (auto& lp : token_latency_percentiles) {
+        summary(DoubleToString(lp.percentile * 100) +
+                    " percentile first token latency (ns)   : ",
+                lp.sample_latency);
+      }
+      summary("");
+      summary("Min Time per Output Token (ns)                : ",
+              time_per_output_token_min);
+      summary("Max Time per Output Token (ns)                : ",
+              time_per_output_token_max);
+      summary("Mean Time per Output Token (ns)               : ",
+              time_per_output_token_mean);
+      for (auto& lp : tpot_percentiles) {
+        summary(DoubleToString(lp.percentile * 100) +
+                    " percentile time to output token (ns)   : ",
+                lp.sample_latency);
+      }
+    }
+  }
+
+  summary(
+      "\n"
+      "================================================\n"
+      "Test Parameters Used\n"
+      "================================================");
+  settings.LogSummary(summary);
+}
+
+void PerformanceSummary::LogDetail(AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+  ProcessLatencies();
+
+  // General validity checking
+  std::string min_duration_recommendation;
+  std::string perf_constraints_recommendation;
+  std::string early_stopping_recommendation;
+  std::string early_stopping_ttft_recommendation;
+  std::string early_stopping_tpot_recommendation;
+  bool min_duration_met = MinDurationMet(&min_duration_recommendation);
+  bool min_queries_met = MinQueriesMet() && MinSamplesMet();
+  bool perf_constraints_met =
+      PerfConstraintsMet(&perf_constraints_recommendation);
+  bool early_stopping_met = true;
+  if (!settings.use_token_latencies) {
+    early_stopping_met = EarlyStopping(
+        &early_stopping_recommendation, pr.queries_issued, &pr.sample_latencies,
+        &pr.query_latencies, settings.target_latency);
+  } else {
+    early_stopping_met =
+        EarlyStopping(&early_stopping_tpot_recommendation, pr.queries_issued,
+                      &pr.token_results.time_per_output_token_arr,
+                      &pr.query_latencies,
+                      std::chrono::nanoseconds(settings.server_tpot_latency)) &&
+        EarlyStopping(&early_stopping_ttft_recommendation, pr.queries_issued,
+                      &pr.token_results.first_token_latencies,
+                      &pr.query_latencies,
+                      std::chrono::nanoseconds(settings.server_ttft_latency));
+  }
+  bool all_constraints_met = min_duration_met && min_queries_met &&
+                             perf_constraints_met && early_stopping_met;
+
+  MLPERF_LOG(detail, "result_validity",
+             all_constraints_met ? "VALID" : "INVALID");
+  if (HasPerfConstraints()) {
+    MLPERF_LOG(detail, "result_perf_constraints_met", perf_constraints_met);
+  }
+  MLPERF_LOG(detail, "result_min_duration_met", min_duration_met);
+  MLPERF_LOG(detail, "result_min_queries_met", min_queries_met);
+  MLPERF_LOG(detail, "early_stopping_met", early_stopping_met);
+  if (!all_constraints_met) {
+    std::string recommendation;
+    if (!perf_constraints_met) {
+      recommendation += perf_constraints_recommendation + " ";
+    }
+    if (!min_duration_met) {
+      recommendation += min_duration_recommendation + " ";
+    }
+    if (!min_queries_met) {
+      recommendation +=
+          "The test exited early, before enough queries were issued.";
+    }
+    std::replace(recommendation.begin(), recommendation.end(), '\n', ' ');
+    MLPERF_LOG(detail, "result_invalid_reason", recommendation);
+  }
+  std::replace(early_stopping_recommendation.begin(),
+               early_stopping_recommendation.end(), '\n', ' ');
+  if (!settings.use_token_latencies) {
+    MLPERF_LOG(detail, "early_stopping_result", early_stopping_recommendation);
+  } else {
+    std::replace(early_stopping_ttft_recommendation.begin(),
+                 early_stopping_ttft_recommendation.end(), '\n', ' ');
+    std::replace(early_stopping_tpot_recommendation.begin(),
+                 early_stopping_tpot_recommendation.end(), '\n', ' ');
+    MLPERF_LOG(detail, "early_stopping_ttft_result",
+               early_stopping_ttft_recommendation);
+    MLPERF_LOG(detail, "early_stopping_tpot_result",
+               early_stopping_tpot_recommendation);
+  }
+  // Report number of queries
+  MLPERF_LOG(detail, "result_query_count", query_count);
+  if (settings.scenario == TestScenario::Server) {
+    MLPERF_LOG(detail, "result_overlatency_query_count",
+               overlatency_query_count);
+  }
+
+  auto reportPerQueryLatencies = [&]() {
+    MLPERF_LOG(detail, "result_min_query_latency_ns", query_latency_min);
+    MLPERF_LOG(detail, "result_max_query_latency_ns", query_latency_max);
+    MLPERF_LOG(detail, "result_mean_query_latency_ns", query_latency_mean);
+    for (auto& lp : latency_percentiles) {
+      std::string percentile = DoubleToString(lp.percentile * 100);
+      MLPERF_LOG(detail,
+                 "result_" + percentile + "_percentile_per_query_latency_ns",
+                 lp.query_latency);
+    }
+  };
+
+  // Per-scenario performance results.
+  switch (settings.scenario) {
+    case TestScenario::SingleStream: {
+      double qps_w_lg = (sample_count - 1) / pr.final_query_issued_time;
+      double qps_wo_lg = 1 / QuerySampleLatencyToSeconds(sample_latency_mean);
+      MLPERF_LOG(detail, "result_qps_with_loadgen_overhead", qps_w_lg);
+      MLPERF_LOG(detail, "result_qps_without_loadgen_overhead", qps_wo_lg);
+      MLPERF_LOG(detail, "early_stopping_latency_ss",
+                 early_stopping_latency_ss);
+      MLPERF_LOG(detail, "early_stopping_latency_ms",
+                 early_stopping_latency_ms);
+      break;
+    }
+    case TestScenario::MultiStream: {
+      reportPerQueryLatencies();
+      MLPERF_LOG(detail, "early_stopping_latency_ms",
+                 early_stopping_latency_ms);
+      break;
+    }
+    case TestScenario::Server: {
+      // Subtract 1 from sample count since the start of the final sample
+      // represents the open end of the time range: i.e. [begin, end).
+      // This makes sense since:
+      // a) QPS doesn't apply if there's only one sample; it's pure latency.
+      // b) If you have precisely 1k QPS, there will be a sample exactly on
+      //    the 1 second time point; but that would be the 1001th sample in
+      //    the stream. Given the first 1001 queries, the QPS is
+      //    1000 queries / 1 second.
+      double qps_as_scheduled =
+          (sample_count - 1) / pr.final_query_scheduled_time;
+      MLPERF_LOG(detail, "result_scheduled_samples_per_sec", qps_as_scheduled);
+      double qps_as_completed =
+          (sample_count - 1) / pr.final_query_all_samples_done_time;
+      MLPERF_LOG(detail, "result_completed_samples_per_sec", qps_as_completed);
+      break;
+    }
+    case TestScenario::Offline: {
+      double samples_per_second = sample_count / pr.max_latency;
+      MLPERF_LOG(detail, "result_samples_per_second", samples_per_second);
+      break;
+    }
+  }
+
+  // Detailed latencies
+  MLPERF_LOG(detail, "result_min_latency_ns", sample_latency_min);
+  MLPERF_LOG(detail, "result_max_latency_ns", sample_latency_max);
+  MLPERF_LOG(detail, "result_mean_latency_ns", sample_latency_mean);
+  for (auto& lp : latency_percentiles) {
+    MLPERF_LOG(detail,
+               "result_" + DoubleToString(lp.percentile * 100) +
+                   "_percentile_latency_ns",
+               lp.sample_latency);
+  }
+  // Detailed first token latencies
+  if (settings.use_token_latencies) {
+    if (settings.scenario != TestScenario::Offline) {
+      MLPERF_LOG(detail, "result_first_token_min_latency_ns",
+                 first_token_latency_min);
+      MLPERF_LOG(detail, "result_first_token_max_latency_ns",
+                 first_token_latency_max);
+      MLPERF_LOG(detail, "result_first_token_mean_latency_ns",
+                 first_token_latency_mean);
+      for (auto& lp : token_latency_percentiles) {
+        MLPERF_LOG(detail,
+                   "result_first_token_" + DoubleToString(lp.percentile * 100) +
+                       "_percentile_latency_ns",
+                   lp.sample_latency);
+      }
+      double tps_w_lg = ((double)token_count) / pr.final_query_issued_time;
+      double tps_wo_lg =
+          ((double)token_count) / (sample_latency_mean * sample_count);
+      MLPERF_LOG(detail, "result_token_throughput_with_loadgen_overhead",
+                 tps_w_lg);
+      MLPERF_LOG(detail, "result_token_throughput", tps_wo_lg);
+      for (auto& lp : tpot_percentiles) {
+        MLPERF_LOG(detail,
+                   "result_time_per_output_token_" +
+                       DoubleToString(lp.percentile * 100) + "_percentile_ns",
+                   lp.sample_latency);
+      }
+      MLPERF_LOG(detail, "result_time_to_output_token_min",
+                 time_per_output_token_min);
+      MLPERF_LOG(detail, "result_time_to_output_token_max",
+                 time_per_output_token_max);
+      MLPERF_LOG(detail, "result_time_to_output_token_mean",
+                 time_per_output_token_mean);
+      double tps_as_completed =
+          token_count / pr.final_query_all_samples_done_time;
+      MLPERF_LOG(detail, "result_completed_tokens_per_second",
+                 tps_as_completed);
+    } else {
+      double tokens_per_second = token_count / pr.max_latency;
+      MLPERF_LOG(detail, "result_tokens_per_second", tokens_per_second);
+    }
+  }
+
+  if (settings.infer_token_latencies) {
+    switch (settings.scenario) {
+      case TestScenario::Server: {
+        double completed_tokens_per_second =
+            (sample_count - 1) * settings.token_latency_scaling_factor /
+            pr.final_query_all_samples_done_time;
+        MLPERF_LOG(detail, "result_inferred_completed_tokens_per_second",
+                   completed_tokens_per_second);
+        break;
+      }
+      case TestScenario::Offline: {
+        double tokens_per_second = sample_count *
+                                   settings.token_latency_scaling_factor /
+                                   pr.max_latency;
+        MLPERF_LOG(detail, "result_inferred_tokens_per_second",
+                   tokens_per_second);
+        break;
+      }
+      case TestScenario::SingleStream: {
+        break;
+      }
+      case TestScenario::MultiStream: {
+        break;
+      }
+    }
+  }
+  MLPERF_LOG(detail, "num_errors", detail.async_log().GetErrorCount());
+#endif
+}
+}  // namespace loadgen
+}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/results.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/results.h
new file mode 100644
index 000000000..6befea2c0
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/results.h
@@ -0,0 +1,128 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Defines PerformanceResult and PerformanceSummary.
+
+#ifndef MLPERF_LOADGEN_RESULTS_H_
+#define MLPERF_LOADGEN_RESULTS_H_
+
+#include <string>
+#include <vector>
+
+#include "query_sample.h"
+#include "test_settings_internal.h"
+
+namespace mlperf {
+namespace loadgen {
+
+/// \brief Contains the performance results for benchmarks that have
+/// token based metrics
+struct TokenPerformanceResults {
+  std::vector<QuerySampleLatency> first_token_latencies;
+  std::vector<QuerySampleLatency> time_per_output_token_arr;
+  std::vector<int64_t> tokens_per_sample;
+};
+
+/// \brief Provides performance results that are independent of scenario
+/// and other context.
+struct PerformanceResult {
+  std::vector<QuerySampleLatency> sample_latencies;
+  std::vector<QuerySampleLatency> query_latencies;
+  size_t queries_issued;
+  double max_latency;
+  double final_query_scheduled_time;         // seconds from start.
+  double final_query_issued_time;            // seconds from start.
+  double final_query_all_samples_done_time;  // seconds from start.
+  TokenPerformanceResults token_results;
+};
+
+/// \brief Wraps PerformanceResult with relevant context to change how
+/// it's interpreted and reported.
+struct PerformanceSummary {
+  std::string sut_name;
+  TestSettingsInternal settings;
+  PerformanceResult pr;
+
+  // Set by ProcessLatencies.
+  size_t sample_count = 0;
+  size_t query_count = 0;
+  size_t overlatency_query_count = 0;
+  QuerySampleLatency sample_latency_min = 0;
+  QuerySampleLatency sample_latency_max = 0;
+  QuerySampleLatency sample_latency_mean = 0;
+  QuerySampleLatency query_latency_min = 0;
+  QuerySampleLatency query_latency_max = 0;
+  QuerySampleLatency query_latency_mean = 0;
+
+  /// \brief The latency at a given percentile.
+  struct PercentileEntry {
+    const double percentile;
+    QuerySampleLatency sample_latency = 0;
+    QuerySampleLatency query_latency = 0;  // MultiStream only.
+  };
+
+  // Latency target percentile
+  PercentileEntry target_latency_percentile{settings.target_latency_percentile};
+  PercentileEntry latency_percentiles[6] = {{.50}, {.90}, {.95},
+                                            {.97}, {.99}, {.999}};
+
+  // Early stopping percentile estimates for SingleStream and MultiStream
+  QuerySampleLatency early_stopping_latency_ss = 0;
+  QuerySampleLatency early_stopping_latency_ms = 0;
+
+  // Set by ProcessTokenLatencies
+  size_t token_count = 0;
+  size_t overlatency_first_token_count = 0;
+  QuerySampleLatency first_token_latency_min = 0;
+  QuerySampleLatency first_token_latency_max = 0;
+  QuerySampleLatency first_token_latency_mean = 0;
+  QuerySampleLatency time_per_output_token_min = 0;
+  QuerySampleLatency time_per_output_token_max = 0;
+  QuerySampleLatency time_per_output_token_mean = 0;
+
+  // Latency token target percentile
+  PercentileEntry token_target_latency_percentile{
+      settings.target_latency_percentile};
+  PercentileEntry token_latency_percentiles[6] = {{.50}, {.90}, {.95},
+                                                  {.97}, {.99}, {.999}};
+  PercentileEntry target_tpot_percentile{settings.target_latency_percentile};
+  PercentileEntry tpot_percentiles[6] = {{.50}, {.90}, {.95},
+                                         {.97}, {.99}, {.999}};
+
+#if defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(WIN64)
+  // MSVC complains if there is no explicit constructor.
+  // (target_latency_percentile above depends on construction with settings)
+  PerformanceSummary(const std::string& sut_name_arg,
+                     const TestSettingsInternal& settings_arg,
+                     const PerformanceResult& pr_arg)
+      : sut_name(sut_name_arg), settings(settings_arg), pr(pr_arg){};
+#endif
+  void ProcessLatencies();
+  void ProcessTokenLatencies();
+
+  bool MinDurationMet(std::string* recommendation);
+  bool EarlyStopping(std::string* recommendation, int64_t queries_issued,
+                     std::vector<QuerySampleLatency>* sample_latencies,
+                     std::vector<QuerySampleLatency>* query_latencies,
+                     std::chrono::nanoseconds target_latency);
+  bool MinQueriesMet();
+  bool MinSamplesMet();
+  bool HasPerfConstraints();
+  bool PerfConstraintsMet(std::string* recommendation);
+  void LogSummary(AsyncSummary& summary);
+  void LogDetail(AsyncDetail& detail);
+};
+}  // namespace loadgen
+}  // namespace mlperf
+
+#endif
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/setup.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/setup.py
new file mode 100644
index 000000000..6254eea17
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/setup.py
@@ -0,0 +1,136 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+# \file
+#  \brief MLPerf Inference LoadGen python module setup.
+#  \details Creates a module that python can import.
+#  All source files are compiled by python"s C++ toolchain  without depending
+#  on a loadgen lib.
+#
+#  This setup.py can be used stand-alone, without the use of an external
+#  build system. This will polute your source tree with output files
+#  and binaries. Use one of the gn build targets instead if you want
+#  to avoid poluting the source tree.
+
+from setuptools import Extension, setup
+from pathlib import Path
+from pybind11 import get_include
+from pybind11.setup_helpers import Pybind11Extension, build_ext
+from version_generator import generate_loadgen_version_definitions
+import subprocess
+
+generated_version_source_filename = "generated/version_generated.cc"
+generate_loadgen_version_definitions(generated_version_source_filename, ".")
+
+public_headers = [
+    "loadgen.h",
+    "query_sample.h",
+    "query_sample_library.h",
+    "system_under_test.h",
+    "test_settings.h",
+    "issue_query_controller.h",
+    "early_stopping.h",
+    "query_dispatch_library.h"
+]
+
+lib_headers = [
+    "logging.h",
+    "test_settings_internal.h",
+    "trace_generator.h",
+    "utils.h",
+    "version.h",
+    "results.h",
+    "bindings/c_api.h",
+    "version_generator.py",
+    "mlperf_conf.h"
+]
+
+lib_sources = [
+    "early_stopping.cc",
+    "issue_query_controller.cc",
+    "loadgen.cc",
+    "logging.cc",
+    "test_settings_internal.cc",
+    "utils.cc",
+    "version.cc",
+    "results.cc",
+]
+
+lib_bindings = [
+    "bindings/c_api.cc",
+    "bindings/python_api.cc",
+]
+
+this_directory = Path(__file__).parent
+mlperf_loadgen_headers = public_headers + lib_headers
+mlperf_loadgen_sources_no_gen = lib_sources + lib_bindings
+mlperf_loadgen_sources = mlperf_loadgen_sources_no_gen + [
+    generated_version_source_filename
+]
+mlperf_long_description = (
+    this_directory /
+    "README.md").read_text(
+        encoding="utf-8")
+
+with open("VERSION.txt", "r") as f:
+    version = f.read()
+version_split = version.split(".")
+
+if len(version_split) < 2:
+    print("Version is incomplete. Needs a format like 4.1.1 in VERSION file")
+
+
+try:
+    with open("mlperf.conf", 'r') as file:
+        conf_contents = file.read()
+
+    # Escape backslashes and double quotes
+    conf_contents = conf_contents.replace('\\', '\\\\').replace('"', '\\"')
+
+    # Convert newlines
+    conf_contents = conf_contents.replace('\n', '\\n"\n"')
+
+    formatted_content = f'const char* mlperf_conf =\n"{conf_contents}";\n'
+
+    with open("mlperf_conf.h", 'w') as header_file:
+        header_file.write(formatted_content)
+
+except IOError as e:
+    raise RuntimeError(f"Failed to generate header file: {e}")
+
+mlperf_loadgen_module = Pybind11Extension(
+    "mlperf_loadgen",
+    define_macros=[
+        ("MAJOR_VERSION",
+         version_split[0]),
+        ("MINOR_VERSION",
+         version_split[1])
+    ],
+    include_dirs=[".", get_include()],
+    sources=mlperf_loadgen_sources,
+    depends=mlperf_loadgen_headers,
+)
+
+setup(name="mlcommons_loadgen",
+      version=version,
+      description="MLPerf Inference LoadGen python bindings",
+      url="https://mlcommons.org/",
+      cmdclass={"build_ext": build_ext},
+      ext_modules=[mlperf_loadgen_module],
+      packages=['mlcommons_loadgen'],
+      package_dir={'mlcommons_loadgen': '.'},
+      include_package_data=True,
+      long_description=mlperf_long_description,
+      long_description_content_type='text/markdown')
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/system_under_test.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/system_under_test.h
new file mode 100644
index 000000000..843453962
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/system_under_test.h
@@ -0,0 +1,67 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Defines the SystemUnderTest interface.
+
+#ifndef MLPERF_LOADGEN_SYSTEM_UNDER_TEST_H
+#define MLPERF_LOADGEN_SYSTEM_UNDER_TEST_H
+
+#include <string>
+#include <vector>
+
+#include "query_sample.h"
+
+namespace mlperf {
+
+/// \addtogroup LoadgenAPI
+/// @{
+
+/// \brief The interface a client implements for the loadgen to test.
+/// \todo Add hook for an untimed warm up period for the SUT.
+/// \todo Add hook for an untimed warm up period for the loadgen logic.
+/// \todo Support power hooks for cool-down period before runing performance
+/// traffic.
+/// \todo Support power hooks for correlating test timeline with power
+/// measurment timeline.
+class SystemUnderTest {
+ public:
+  virtual ~SystemUnderTest() {}
+
+  /// \brief A human-readable string for logging purposes.
+  virtual const std::string& Name() = 0;
+
+  /// \brief Lets the loadgen issue N samples to the SUT.
+  /// \details The SUT may either a) return immediately and signal completion
+  /// at a later time on another thread or b) it may block and signal
+  /// completion on the current stack. The load generator will handle both
+  /// cases properly.
+  /// Note: The data for neighboring samples may or may not be contiguous
+  /// depending on the scenario.
+  virtual void IssueQuery(const std::vector<QuerySample>& samples) = 0;
+
+  /// \brief Called immediately after the last call to IssueQuery
+  /// in a series is made.
+  /// \details This doesn't necessarily signify the end of the
+  /// test since there may be multiple series involved during a test; for
+  /// example in accuracy mode.
+  /// Clients can use this to flush any deferred queries immediately, rather
+  /// than waiting for some timeout.
+  /// This is especially useful in the server scenario.
+  virtual void FlushQueries() = 0;
+};
+
+/// @}
+
+}  // namespace mlperf
+
+#endif  // MLPERF_LOADGEN_SYSTEM_UNDER_TEST_H
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings.h
new file mode 100644
index 000000000..584d073bb
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings.h
@@ -0,0 +1,329 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Provides ways for a client to change the behavior and
+/// constraints of the load generator.
+/// \details Note: The MLPerf specification takes precedent over any of the
+/// comments in this file if there are inconsistencies in regards to how the
+/// loadgen *should* work.
+/// The comments in this file are indicative of the loadgen implementation.
+
+#ifndef MLPERF_LOADGEN_TEST_SETTINGS_H
+#define MLPERF_LOADGEN_TEST_SETTINGS_H
+
+#include <cstdint>
+#include <string>
+
+namespace mlperf {
+
+/// \addtogroup LoadgenAPI
+/// @{
+
+/// \addtogroup LoadgenAPITestSettings Test Settings
+/// \brief This page contains a description of all the scenarios, modes,
+/// and log settings as implemented by the LoadGen.
+/// @{
+
+///
+/// \enum TestScenario
+/// * **SingleStream**
+///  + Issues queries containing a single sample.
+///  + The next query is only issued once the previous one has completed.
+///  + Internal LoadGen latency between queries is not included in the
+///    latency results.
+///  + **Final performance result is:** a percentile of the latency.
+/// * **MultiStream**
+///  + Issues queries containing N samples.
+///   - N is specified by \link
+///   mlperf::TestSettings::multi_stream_samples_per_query
+///   multi_stream_samples_per_query \endlink.
+///  + The next query is only issued once the previous one has completed.
+///  + The samples of each query are guaranteed to be contiguous with respect
+///    to the order they were loaded in the QuerySampleLibrary.
+///  + Latency is tracked and reported on a per-query and per-sample basis.
+///  + The latency of a query is the maximum latency of its samples, including
+///    any cross-thread communication within the loadgen.
+///  + Internal LoadGen latency between queries is not included in the
+///    latency results.
+///  + **Final performance result is:** a percentile of the query latency.
+/// * **Server**
+///  + Sends queries with a single sample.
+///  + Queries have a random poisson (non-uniform) arrival rate that, when
+///    averaged, hits the target QPS.
+///  + There is no limit on the number of outstanding queries, as long as
+///    the latency constraints are met.
+///  + **Final performance result is:** PASS if the a percentile of the latency
+///    is under a given threshold. FAIL otherwise.
+///   - Threshold is specified by \link
+///   mlperf::TestSettings::server_target_latency_ns server_target_latency_ns
+///   \endlink.
+/// * **Offline**
+///  + Sends all N samples to the SUT inside of a single query.
+///  + The samples of the query are guaranteed to be contiguous with respect
+///    to the order they were loaded in the QuerySampleLibrary.
+///  + **Final performance result is:** samples per second.
+///
+enum class TestScenario {
+  SingleStream,
+  MultiStream,
+  Server,
+  Offline,
+};
+
+///
+/// \enum TestMode
+/// * **SubmissionRun**
+///  + Runs accuracy mode followed by performance mode.
+///  + TODO: Implement further requirements as decided by MLPerf.
+/// * **AccuracyOnly**
+///  + Runs each sample from the QSL through the SUT a least once.
+///  + Outputs responses to an accuracy json that can be parsed by a model +
+///    sample library specific script.
+/// * **PerformanceOnly**
+///  + Runs the performance traffic for the given scenario, as described in
+///    the comments for TestScenario.
+/// * **FindPeakPerformance**
+///  + Determines the maximumum QPS for the Server scenario.
+///  + Not applicable for SingleStream, MultiStream or Offline scenarios.
+///
+enum class TestMode {
+  SubmissionRun,
+  AccuracyOnly,
+  PerformanceOnly,
+  FindPeakPerformance,
+};
+
+///
+/// \brief Top-level struct specifing the modes and parameters of the test.
+///
+struct TestSettings {
+  TestScenario scenario = TestScenario::SingleStream;
+  TestMode mode = TestMode::PerformanceOnly;
+
+  // ==================================
+  /// \name SingleStream-specific
+  /**@{*/
+  /// \brief A hint used by the loadgen to pre-generate enough samples to
+  ///        meet the minimum test duration.
+  double single_stream_expected_latency_ns = 1000000;
+  /// \brief The latency percentile reported as the final result.
+  double single_stream_target_latency_percentile = 0.90;
+  /**@}*/
+
+  // ==================================
+  /// \name MultiStream-specific
+  /**@{*/
+  /// \brief A hint used by the loadgen to pre-generate enough samples to
+  ///        meet the minimum test duration.
+  /// \brief MultiStream latency is for query (not sample) latency
+  double multi_stream_expected_latency_ns = 8000000;
+  /// \brief The latency percentile for MultiStream mode.
+  double multi_stream_target_latency_percentile = 0.99;
+  /// \brief The number of samples in each query.
+  /// \details How many samples are bundled in a query
+  uint64_t multi_stream_samples_per_query = 8;
+  /**@}*/
+
+  // ==================================
+  /// \name Server-specific
+  /**@{*/
+  /// \brief The average QPS of the poisson distribution.
+  /// \details note: This field is used as a FindPeakPerformance's lower bound.
+  /// When you run FindPeakPerformanceMode, you should make sure that this value
+  /// satisfies performance constraints.
+  double server_target_qps = 1;
+  /// \brief The latency constraint for the Server scenario.
+  uint64_t server_target_latency_ns = 100000000;
+  /// \brief The latency percentile for server mode. This value is combined with
+  /// server_target_latency_ns to determine if a run is valid.
+  /// \details 99% is the default value, which is correct for image models. GNMT
+  /// should be set to 0.97 (97%) in v0.5.(As always, check the policy page for
+  /// updated values for the benchmark you are running.)
+  double server_target_latency_percentile = 0.99;
+  /// \brief If this flag is set to true, LoadGen will combine samples from
+  /// multiple queries into a single query if their scheduled issue times have
+  /// passed.
+  bool server_coalesce_queries = false;
+  /// \brief The decimal places of QPS precision used to terminate
+  /// FindPeakPerformance mode.
+  int server_find_peak_qps_decimals_of_precision = 1;
+  /// \brief A step size (as a fraction of the QPS) used to widen the lower and
+  /// upper bounds to find the initial boundaries of binary search.
+  double server_find_peak_qps_boundary_step_size = 1;
+  /// \brief The maximum number of outstanding queries to allow before earlying
+  /// out from a performance run. Useful for performance tuning and speeding up
+  /// the FindPeakPerformance mode.
+  uint64_t server_max_async_queries = 0;  ///< 0: Infinity.
+  /// \brief The number of issue query threads that will be registered and used
+  /// to call SUT's IssueQuery(). If this is 0, the same thread calling
+  /// StartTest() will be used to call IssueQuery(). See also
+  /// mlperf::RegisterIssueQueryThread().
+  uint64_t server_num_issue_query_threads = 0;
+  /**@}*/
+
+  // ==================================
+  /// \name Offline-specific
+  /**@{*/
+  /// \brief Specifies the QPS the SUT expects to hit for the offline load.
+  /// The loadgen generates 10% more queries than it thinks it needs to meet
+  /// the minimum test duration.
+  double offline_expected_qps = 1;
+  /// \brief Affects the order in which the samples of the dataset are chosen.
+  /// If false it concatenates a single permutation of the dataset (or part
+  /// of it depending on QSL->PerformanceSampleCount()) several times up to the
+  /// number of samples requested.
+  /// If true it concatenates a multiple permutation of the dataset (or a
+  /// part of it depending on QSL->PerformanceSampleCount()) several times
+  /// up to the number of samples requested.
+  bool sample_concatenate_permutation = false;
+  /**@}*/
+
+  // ==================================
+  /// \name Test duration
+  /// The test runs until **both** min duration and min query count have been
+  /// met. However, it will exit before that point if **either** max duration or
+  /// max query count have been reached.
+  /**@{*/
+  uint64_t min_duration_ms = 10000;
+  uint64_t max_duration_ms = 0;  ///< 0: Infinity.
+  uint64_t min_query_count = 100;
+  uint64_t max_query_count = 0;  ///< 0: Infinity.
+  /**@}*/
+
+  // ==================================
+  /// \name Random number generation
+  /// There are 4 separate seeds, so each dimension can be changed
+  /// independently.
+  /**@{*/
+  /// \brief Affects which subset of samples from the QSL are chosen for
+  /// the performance sample set and accuracy sample sets.
+  uint64_t qsl_rng_seed = 0;
+  /// \brief Affects the order in which samples from the performance set will
+  /// be included in queries.
+  uint64_t sample_index_rng_seed = 0;
+  /// \brief Affects the poisson arrival process of the Server scenario.
+  /// \details Different seeds will appear to "jitter" the queries
+  /// differently in time, but should not affect the average issued QPS.
+  uint64_t schedule_rng_seed = 0;
+  /// \brief Affects which samples have their query returns logged to the
+  /// accuracy log in performance mode.
+  uint64_t accuracy_log_rng_seed = 0;
+
+  /// \brief Probability of the query response of a sample being logged to the
+  /// accuracy log in performance mode
+  double accuracy_log_probability = 0.0;
+
+  /// \brief Target number of samples that will have their results printed to
+  /// accuracy log in performance mode for compliance testing
+  uint64_t accuracy_log_sampling_target = 0;
+
+  /// \brief Variables for running test05 from native config. A boolean that
+  /// determines whether or not to run test05 and three random seed to run the
+  /// test
+  bool test05 = false;
+  uint64_t test05_qsl_rng_seed = 0;
+  uint64_t test05_sample_index_rng_seed = 0;
+  uint64_t test05_schedule_rng_seed = 0;
+
+  /// \brief Load mlperf parameter config from file.
+  int FromConfig(const std::string &path, const std::string &model,
+                 const std::string &scenario, int conf_type = 1);
+  /**@}*/
+
+  // ==================================
+  /// \name Performance Sample modifiers
+  /// \details These settings can be used to Audit Performance mode runs.
+  /// In order to detect sample caching by SUT, performance of runs when only
+  /// unique queries (with non-repeated samples) are issued can be compared with
+  /// that when the same query is repeatedly issued.
+  /**@{*/
+  /// \brief Prints measurement interval start and stop timestamps to std::cout
+  /// for the purpose of comparison against an external timer
+  bool print_timestamps = false;
+  /// \brief Allows issuing only unique queries in Performance mode of any
+  /// scenario \details This can be used to send non-repeat & hence unique
+  /// samples to SUT
+  bool performance_issue_unique = false;
+  /// \brief If true, the same query is chosen repeatedley for Inference.
+  /// In offline scenario, the query is filled with the same sample.
+  bool performance_issue_same = false;
+  /// \brief Offset to control which sample is repeated in
+  /// performance_issue_same mode.
+  /// Value should be within [0, performance_sample_count)
+  uint64_t performance_issue_same_index = 0;
+  /// \brief Overrides QSL->PerformanceSampleCount() when non-zero
+  uint64_t performance_sample_count_override = 0;
+  /// \brief Measure token latencies
+  bool use_token_latencies = false;
+  /// Token latency parameters
+  uint64_t server_ttft_latency = 100000000;
+  uint64_t server_tpot_latency = 100000000;
+  /// \brief Infer token latencies
+  bool infer_token_latencies = false;
+  uint64_t token_latency_scaling_factor;
+  /**@}*/
+};
+
+///
+/// \enum LoggingMode
+/// Specifies how and when logging should be sampled and stringified at
+/// runtime.
+/// * **AsyncPoll**
+///  + Logs are serialized and output on an IOThread that polls for new logs at
+///  a fixed interval. This is the only mode currently implemented.
+/// * **EndOfTestOnly**
+///  + TODO: Logs are serialzied and output only at the end of the test.
+/// * **Synchronous**
+///  + TODO: Logs are serialized and output inline.
+enum class LoggingMode {
+  AsyncPoll,
+  EndOfTestOnly,
+  Synchronous,
+};
+
+///
+/// \brief Specifies where log outputs should go.
+///
+/// By default, the loadgen outputs its log files to outdir and
+/// modifies the filenames of its logs with a prefix and suffix.
+/// Filenames will take the form:
+/// "<outdir>/<datetime><prefix>summary<suffix>.txt"
+///
+/// Affordances for outputing logs to stdout are also provided.
+///
+struct LogOutputSettings {
+  std::string outdir = ".";
+  std::string prefix = "mlperf_log_";
+  std::string suffix = "";
+  bool prefix_with_datetime = false;
+  bool copy_detail_to_stdout = false;
+  bool copy_summary_to_stdout = false;
+};
+
+///
+/// \brief Top-level log settings.
+///
+struct LogSettings {
+  LogOutputSettings log_output;
+  LoggingMode log_mode = LoggingMode::AsyncPoll;
+  uint64_t log_mode_async_poll_interval_ms = 1000;  ///< TODO: Implement this.
+  bool enable_trace = true;
+};
+
+/// @}
+
+/// @}
+
+}  // namespace mlperf
+
+#endif  // MLPERF_LOADGEN_TEST_SETTINGS_H
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings_internal.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings_internal.cc
new file mode 100644
index 000000000..3f2cd8847
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings_internal.cc
@@ -0,0 +1,800 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "test_settings_internal.h"
+
+#include <fstream>
+#include <map>
+#include <sstream>
+#include <string>
+
+#include "logging.h"
+#include "mlperf_conf.h"
+#include "utils.h"
+
+namespace mlperf {
+namespace loadgen {
+
+TestSettingsInternal::TestSettingsInternal(
+    const TestSettings &requested_settings, size_t qsl_performance_sample_count)
+    : requested(requested_settings),
+      scenario(requested.scenario),
+      mode(requested.mode),
+      samples_per_query(1),
+      target_qps(1),
+      max_async_queries(0),
+      target_duration(std::chrono::milliseconds(requested.min_duration_ms)),
+      min_duration(std::chrono::milliseconds(requested.min_duration_ms)),
+      max_duration(std::chrono::milliseconds(requested.max_duration_ms)),
+      min_query_count(requested.min_query_count),
+      max_query_count(requested.max_query_count),
+      min_sample_count(0),
+      qsl_rng_seed(requested.qsl_rng_seed),
+      sample_index_rng_seed(requested.sample_index_rng_seed),
+      schedule_rng_seed(requested.schedule_rng_seed),
+      accuracy_log_rng_seed(requested.accuracy_log_rng_seed),
+      accuracy_log_probability(requested.accuracy_log_probability),
+      accuracy_log_sampling_target(requested.accuracy_log_sampling_target),
+      print_timestamps(requested.print_timestamps),
+      performance_issue_unique(requested.performance_issue_unique),
+      performance_issue_same(requested.performance_issue_same),
+      performance_issue_same_index(requested.performance_issue_same_index),
+      performance_sample_count(0),
+      sample_concatenate_permutation(false),
+      use_token_latencies(requested.use_token_latencies),
+      server_ttft_latency(requested.server_ttft_latency),
+      server_tpot_latency(requested.server_tpot_latency),
+      infer_token_latencies(requested.infer_token_latencies),
+      token_latency_scaling_factor(requested.token_latency_scaling_factor) {
+  // Target QPS, target latency, and max_async_queries.
+  switch (requested.scenario) {
+    case TestScenario::SingleStream:
+      target_qps = static_cast<double>(std::nano::den) /
+                   requested.single_stream_expected_latency_ns;
+      max_async_queries = 1;
+      target_latency_percentile =
+          requested.single_stream_target_latency_percentile;
+      break;
+    case TestScenario::MultiStream:
+      target_qps = static_cast<double>(std::nano::den) /
+                   requested.multi_stream_expected_latency_ns;
+      max_async_queries = 1;
+      target_latency_percentile =
+          requested.multi_stream_target_latency_percentile;
+      break;
+    case TestScenario::Server:
+      if (requested.server_target_qps >= 0.0) {
+        target_qps = requested.server_target_qps;
+      } else {
+        LogDetail([server_target_qps = requested.server_target_qps,
+                   target_qps = target_qps](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+          std::stringstream ss;
+          ss << "Invalid value for server_target_qps requested."
+             << " requested: " << server_target_qps << " using: " << target_qps;
+          MLPERF_LOG_ERROR(detail, "error_invalid_test_settings", ss.str());
+#else
+          detail.Error("Invalid value for server_target_qps requested.",
+                       "requested", server_target_qps, "using", target_qps);
+#endif
+        });
+      }
+      target_latency =
+          std::chrono::nanoseconds(requested.server_target_latency_ns);
+      target_latency_percentile = requested.server_target_latency_percentile;
+      max_async_queries = requested.server_max_async_queries;
+      break;
+    case TestScenario::Offline:
+      // target_latency_percentile is not used in Offline, but set it to
+      // 0.99 anyway to avoid garbage value.
+      target_latency_percentile = 0.99;
+      if (requested.offline_expected_qps >= 0.0) {
+        target_qps = requested.offline_expected_qps;
+      } else {
+        LogDetail([offline_expected_qps = requested.offline_expected_qps,
+                   target_qps = target_qps](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+          std::stringstream ss;
+          ss << "Invalid value for offline_expected_qps requested."
+             << " requested: " << offline_expected_qps
+             << " using: " << target_qps;
+          MLPERF_LOG_ERROR(detail, "error_invalid_test_settings", ss.str());
+#else
+          detail.Error("Invalid value for offline_expected_qps requested.",
+                       "requested", offline_expected_qps, "using", target_qps);
+#endif
+        });
+      }
+      max_async_queries = 1;
+      break;
+  }
+
+  // Performance Sample Count: TestSettings override QSL ->
+  // PerformanceSampleCount
+  performance_sample_count = (requested.performance_sample_count_override == 0)
+                                 ? qsl_performance_sample_count
+                                 : requested.performance_sample_count_override;
+
+  // Sample by concatentating several permutations of the dataset
+  // sample_concatenate_permutation
+  sample_concatenate_permutation =
+      (requested.sample_concatenate_permutation == 0)
+          ? false
+          : requested.sample_concatenate_permutation;
+
+  // Samples per query.
+  if (requested.scenario == TestScenario::MultiStream) {
+    samples_per_query = requested.multi_stream_samples_per_query;
+  }
+
+  // In the offline scenario, coalesce all queries into a single query.
+  if (requested.scenario == TestScenario::Offline) {
+    // TODO: Should the spec require a max duration for large query counts?
+    // kSlack is used to make sure we generate enough samples for the SUT
+    // to take longer than than the minimum test duration required by the
+    // MLPerf spec.
+    constexpr double kSlack = 1.1;
+    uint64_t target_sample_count =
+        kSlack * DurationToSeconds(target_duration) * target_qps;
+    samples_per_query =
+        (requested.performance_issue_unique)
+            ? performance_sample_count
+            : std::max<uint64_t>(min_query_count, target_sample_count);
+    min_query_count = 1;
+    target_duration = std::chrono::milliseconds(0);
+  }
+
+  // FIXME: Only do this for 3D-UNet SingleStream, for v2.0
+  // TODO: consolidate after v2.0
+  // make min_queries to be multiple of performance_sample_count
+  // performance_sample_count == 0 makes it to be equal to loaded_samples.size()
+  if (sample_concatenate_permutation &&
+      requested.scenario == TestScenario::SingleStream) {
+    // set slack larger for 3D-UNet KiTS19 distribution, i.e. 50% latency << 90%
+    // latency
+    constexpr double kSlack = 2.0;
+    uint64_t expected_queries =
+        kSlack * DurationToSeconds(target_duration) * target_qps;
+    min_query_count =
+        min_query_count > expected_queries ? min_query_count : expected_queries;
+    min_query_count += qsl_performance_sample_count -
+                       (min_query_count % qsl_performance_sample_count);
+  }
+
+  min_sample_count = min_query_count * samples_per_query;
+
+  // Validate TestSettings
+  if (requested.performance_issue_same &&
+      (requested.performance_issue_same_index >= performance_sample_count)) {
+    LogDetail([performance_issue_same_index =
+                   requested.performance_issue_same_index,
+               performance_sample_count =
+                   performance_sample_count](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+      std::stringstream ss;
+      ss << "Sample Idx to be repeated in performance_issue_same mode"
+         << " cannot be greater than loaded performance_sample_count."
+         << " performance_issue_same_index: " << performance_issue_same_index
+         << " performance_sample_count: " << performance_sample_count;
+      MLPERF_LOG_ERROR(detail, "error_invalid_test_settings", ss.str());
+#else
+      detail.Error(
+          "Sample Idx to be repeated in performance_issue_same mode"
+          " cannot be greater than loaded performance_sample_count.",
+          "performance_issue_same_index", performance_issue_same_index,
+          "performance_sample_count", performance_sample_count);
+#endif
+    });
+  }
+
+  if (requested.performance_issue_unique && requested.performance_issue_same) {
+    LogDetail([performance_issue_unique = requested.performance_issue_unique,
+               performance_issue_same =
+                   requested.performance_issue_same](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+      std::stringstream ss;
+      ss << "Performance_issue_unique and performance_issue_same, both"
+         << " cannot be true at the same time."
+         << " performance_issue_unique: " << performance_issue_unique
+         << " performance_issue_same: " << performance_issue_same;
+      MLPERF_LOG_ERROR(detail, "error_invalid_test_settings", ss.str());
+#else
+      detail.Error(
+          "Performance_issue_unique and performance_issue_same, both"
+          " cannot be true at the same time.",
+          "performance_issue_unique", performance_issue_unique,
+          "performance_issue_same", performance_issue_same);
+#endif
+    });
+  }
+}
+
+std::string ToString(TestScenario scenario) {
+  switch (scenario) {
+#if USE_NEW_LOGGING_FORMAT
+    case TestScenario::SingleStream:
+      return "SingleStream";
+    case TestScenario::MultiStream:
+      return "MultiStream";
+#else
+    case TestScenario::SingleStream:
+      return "Single Stream";
+    case TestScenario::MultiStream:
+      return "Multi Stream";
+#endif
+    case TestScenario::Server:
+      return "Server";
+    case TestScenario::Offline:
+      return "Offline";
+  }
+  assert(false);
+  return "InvalidScenario";
+}
+
+std::string ToString(TestMode mode) {
+  switch (mode) {
+#if USE_NEW_LOGGING_FORMAT
+    case TestMode::SubmissionRun:
+      return "SubmissionRun";
+    case TestMode::AccuracyOnly:
+      return "AccuracyOnly";
+    case TestMode::PerformanceOnly:
+      return "PerformanceOnly";
+    case TestMode::FindPeakPerformance:
+      return "FindPeakPerformance";
+#else
+    case TestMode::SubmissionRun:
+      return "Submission";
+    case TestMode::AccuracyOnly:
+      return "Accuracy";
+    case TestMode::PerformanceOnly:
+      return "Performance";
+    case TestMode::FindPeakPerformance:
+      return "Find Peak Performance";
+#endif
+  }
+  assert(false);
+  return "InvalidMode";
+}
+
+void LogRequestedTestSettings(const TestSettings &s) {
+  LogDetail([s](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG(detail, "requested_scenario", ToString(s.scenario));
+    MLPERF_LOG(detail, "requested_test_mode", ToString(s.mode));
+
+    // Scenario-specific
+    switch (s.scenario) {
+      case TestScenario::SingleStream:
+        MLPERF_LOG(detail, "requested_single_stream_expected_latency_ns",
+                   s.single_stream_expected_latency_ns);
+        MLPERF_LOG(detail, "requested_single_stream_target_latency_percentile",
+                   s.single_stream_target_latency_percentile);
+        break;
+      case TestScenario::MultiStream:
+        MLPERF_LOG(detail, "requested_multi_stream_expected_latency_ns",
+                   s.multi_stream_expected_latency_ns);
+        MLPERF_LOG(detail, "requested_multi_stream_target_latency_percentile",
+                   s.multi_stream_target_latency_percentile);
+        MLPERF_LOG(detail, "requested_multi_stream_samples_per_query",
+                   s.multi_stream_samples_per_query);
+        break;
+      case TestScenario::Server:
+        MLPERF_LOG(detail, "requested_server_target_qps", s.server_target_qps);
+        MLPERF_LOG(detail, "requested_server_target_latency_ns",
+                   s.server_target_latency_ns);
+        MLPERF_LOG(detail, "requested_server_target_latency_percentile",
+                   s.server_target_latency_percentile);
+        MLPERF_LOG(detail, "requested_server_coalesce_queries",
+                   s.server_coalesce_queries);
+        MLPERF_LOG(detail,
+                   "requested_server_find_peak_qps_decimals_of_precision",
+                   s.server_find_peak_qps_decimals_of_precision);
+        MLPERF_LOG(detail, "requested_server_find_peak_qps_boundary_step_size",
+                   s.server_find_peak_qps_boundary_step_size);
+        MLPERF_LOG(detail, "requested_server_max_async_queries",
+                   s.server_max_async_queries);
+        MLPERF_LOG(detail, "requested_server_num_issue_query_threads",
+                   s.server_num_issue_query_threads);
+        break;
+      case TestScenario::Offline:
+        MLPERF_LOG(detail, "requested_offline_expected_qps",
+                   s.offline_expected_qps);
+        break;
+    }
+
+    // Overrides
+    MLPERF_LOG(detail, "requested_min_duration_ms", s.min_duration_ms);
+    MLPERF_LOG(detail, "requested_max_duration_ms", s.max_duration_ms);
+    MLPERF_LOG(detail, "requested_min_query_count", s.min_query_count);
+    MLPERF_LOG(detail, "requested_max_query_count", s.max_query_count);
+    MLPERF_LOG(detail, "requested_qsl_rng_seed", s.qsl_rng_seed);
+    MLPERF_LOG(detail, "requested_sample_index_rng_seed",
+               s.sample_index_rng_seed);
+    MLPERF_LOG(detail, "requested_schedule_rng_seed", s.schedule_rng_seed);
+    MLPERF_LOG(detail, "requested_accuracy_log_rng_seed",
+               s.accuracy_log_rng_seed);
+    MLPERF_LOG(detail, "requested_accuracy_log_probability",
+               s.accuracy_log_probability);
+    MLPERF_LOG(detail, "requested_accuracy_log_sampling_target",
+               s.accuracy_log_sampling_target);
+    MLPERF_LOG(detail, "requested_print_timestamps", s.print_timestamps);
+    MLPERF_LOG(detail, "requested_performance_issue_unique",
+               s.performance_issue_unique);
+    MLPERF_LOG(detail, "requested_performance_issue_same",
+               s.performance_issue_same);
+    MLPERF_LOG(detail, "requested_performance_issue_same_index",
+               s.performance_issue_same_index);
+    MLPERF_LOG(detail, "requested_performance_sample_count_override",
+               s.performance_sample_count_override);
+    MLPERF_LOG(detail, "requested_sample_concatenate_permutation",
+               s.sample_concatenate_permutation);
+    // Token latencies specific values
+    if (s.use_token_latencies) {
+      MLPERF_LOG(detail, "requested_use_token_latencies",
+                 s.use_token_latencies);
+      if (s.scenario != TestScenario::Offline) {
+        MLPERF_LOG(detail, "requested_server_ttft_latency",
+                   s.server_ttft_latency);
+        MLPERF_LOG(detail, "requested_server_tpot_latency",
+                   s.server_tpot_latency);
+      }
+    }
+#else
+    detail("");
+    detail("Requested Settings:");
+    detail("Scenario : " + ToString(s.scenario));
+    detail("Test mode : " + ToString(s.mode));
+
+    // Scenario-specific
+    switch (s.scenario) {
+      case TestScenario::SingleStream:
+        detail("single_stream_expected_latency_ns : ",
+               s.single_stream_expected_latency_ns);
+        detail("single_stream_target_latency_percentile : ",
+               s.single_stream_target_latency_percentile);
+        break;
+      case TestScenario::MultiStream:
+        detail("multi_stream_expected_latency_ns : ",
+               s.multi_stream_expected_latency_ns);
+        detail("multi_stream_target_latency_percentile : ",
+               s.multi_stream_target_latency_percentile);
+        detail("multi_stream_samples_per_query : ",
+               s.multi_stream_samples_per_query);
+        break;
+      case TestScenario::Server:
+        detail("server_target_qps : ", s.server_target_qps);
+        detail("server_target_latency_ns : ", s.server_target_latency_ns);
+        detail("server_target_latency_percentile : ",
+               s.server_target_latency_percentile);
+        detail("server_coalesce_queries : ", s.server_coalesce_queries);
+        detail("server_find_peak_qps_decimals_of_precision : ",
+               s.server_find_peak_qps_decimals_of_precision);
+        detail("server_find_peak_qps_boundary_step_size : ",
+               s.server_find_peak_qps_boundary_step_size);
+        detail("server_max_async_queries : ", s.server_max_async_queries);
+        detail("server_num_issue_query_threads : ",
+               s.server_num_issue_query_threads);
+        break;
+      case TestScenario::Offline:
+        detail("offline_expected_qps : ", s.offline_expected_qps);
+        break;
+    }
+
+    // Overrides
+    detail("min_duration_ms : ", s.min_duration_ms);
+    detail("max_duration_ms : ", s.max_duration_ms);
+    detail("min_query_count : ", s.min_query_count);
+    detail("max_query_count : ", s.max_query_count);
+    detail("qsl_rng_seed : ", s.qsl_rng_seed);
+    detail("sample_index_rng_seed : ", s.sample_index_rng_seed);
+    detail("schedule_rng_seed : ", s.schedule_rng_seed);
+    detail("accuracy_log_rng_seed : ", s.accuracy_log_rng_seed);
+    detail("accuracy_log_probability : ", s.accuracy_log_probability);
+    detail("accuracy_log_sampling_target : ", s.accuracy_log_sampling_target);
+    detail("print_timestamps : ", s.print_timestamps);
+    detail("performance_issue_unique : ", s.performance_issue_unique);
+    detail("performance_issue_same : ", s.performance_issue_same);
+    detail("performance_issue_same_index : ", s.performance_issue_same_index);
+    detail("performance_sample_count_override : ",
+           s.performance_sample_count_override);
+    detail("");
+#endif
+  });
+}
+
+void TestSettingsInternal::LogEffectiveSettings() const {
+  LogDetail([s = *this](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG(detail, "effective_scenario", ToString(s.scenario));
+    MLPERF_LOG(detail, "effective_test_mode", ToString(s.mode));
+
+    MLPERF_LOG(detail, "effective_samples_per_query", s.samples_per_query);
+    MLPERF_LOG(detail, "effective_target_qps", s.target_qps);
+    MLPERF_LOG(detail, "effective_target_latency_ns", s.target_latency.count());
+    MLPERF_LOG(detail, "effective_target_latency_percentile",
+               s.target_latency_percentile);
+    MLPERF_LOG(detail, "effective_max_async_queries", s.max_async_queries);
+    MLPERF_LOG(detail, "effective_target_duration_ms",
+               s.target_duration.count());
+    MLPERF_LOG(detail, "effective_min_duration_ms", s.min_duration.count());
+    MLPERF_LOG(detail, "effective_max_duration_ms", s.max_duration.count());
+    MLPERF_LOG(detail, "effective_min_query_count", s.min_query_count);
+    MLPERF_LOG(detail, "effective_max_query_count", s.max_query_count);
+    MLPERF_LOG(detail, "effective_min_sample_count", s.min_sample_count);
+    MLPERF_LOG(detail, "effective_qsl_rng_seed", s.qsl_rng_seed);
+    MLPERF_LOG(detail, "effective_sample_index_rng_seed",
+               s.sample_index_rng_seed);
+    MLPERF_LOG(detail, "effective_schedule_rng_seed", s.schedule_rng_seed);
+    MLPERF_LOG(detail, "effective_accuracy_log_rng_seed",
+               s.accuracy_log_rng_seed);
+    MLPERF_LOG(detail, "effective_accuracy_log_probability",
+               s.accuracy_log_probability);
+    MLPERF_LOG(detail, "effective_accuracy_log_sampling_target",
+               s.accuracy_log_sampling_target);
+    MLPERF_LOG(detail, "effective_print_timestamps", s.print_timestamps);
+    MLPERF_LOG(detail, "effective_performance_issue_unique",
+               s.performance_issue_unique);
+    MLPERF_LOG(detail, "effective_performance_issue_same",
+               s.performance_issue_same);
+    MLPERF_LOG(detail, "effective_performance_issue_same_index",
+               s.performance_issue_same_index);
+    MLPERF_LOG(detail, "effective_performance_sample_count",
+               s.performance_sample_count);
+    MLPERF_LOG(detail, "effective_sample_concatenate_permutation",
+               s.sample_concatenate_permutation);
+#else
+    detail("");
+    detail("Effective Settings:");
+
+    detail("Scenario : " + ToString(s.scenario));
+    detail("Test mode : " + ToString(s.mode));
+
+    detail("samples_per_query : ", s.samples_per_query);
+    detail("target_qps : ", s.target_qps);
+    detail("target_latency (ns): ", s.target_latency.count());
+    detail("target_latency_percentile : ", s.target_latency_percentile);
+    detail("max_async_queries : ", s.max_async_queries);
+    detail("target_duration (ms): ", s.target_duration.count());
+    detail("min_duration (ms): ", s.min_duration.count());
+    detail("max_duration (ms): ", s.max_duration.count());
+    detail("min_query_count : ", s.min_query_count);
+    detail("max_query_count : ", s.max_query_count);
+    detail("min_sample_count : ", s.min_sample_count);
+    detail("qsl_rng_seed : ", s.qsl_rng_seed);
+    detail("sample_index_rng_seed : ", s.sample_index_rng_seed);
+    detail("schedule_rng_seed : ", s.schedule_rng_seed);
+    detail("accuracy_log_rng_seed : ", s.accuracy_log_rng_seed);
+    detail("accuracy_log_probability : ", s.accuracy_log_probability);
+    detail("accuracy_log_sampling_target : ", s.accuracy_log_sampling_target);
+    detail("print_timestamps : ", s.print_timestamps);
+    detail("performance_issue_unique : ", s.performance_issue_unique);
+    detail("performance_issue_same : ", s.performance_issue_same);
+    detail("performance_issue_same_index : ", s.performance_issue_same_index);
+    detail("performance_sample_count : ", s.performance_sample_count);
+#endif
+  });
+}
+
+void TestSettingsInternal::LogAllSettings() const {
+  LogRequestedTestSettings(requested);
+  LogEffectiveSettings();
+}
+
+void TestSettingsInternal::LogSummary(AsyncSummary &summary) const {
+  summary("samples_per_query : ", samples_per_query);
+  summary("target_qps : ", target_qps);
+  if (!use_token_latencies) {
+    summary("target_latency (ns): ", target_latency.count());
+  } else {
+    summary("ttft_latency (ns): ", server_ttft_latency);
+    summary("tpot_latency (ns): ", server_tpot_latency);
+  }
+  summary("max_async_queries : ", max_async_queries);
+  summary("min_duration (ms): ", min_duration.count());
+  summary("max_duration (ms): ", max_duration.count());
+  summary("min_query_count : ", min_query_count);
+  summary("max_query_count : ", max_query_count);
+  summary("qsl_rng_seed : ", qsl_rng_seed);
+  summary("sample_index_rng_seed : ", sample_index_rng_seed);
+  summary("schedule_rng_seed : ", schedule_rng_seed);
+  summary("accuracy_log_rng_seed : ", accuracy_log_rng_seed);
+  summary("accuracy_log_probability : ", accuracy_log_probability);
+  summary("accuracy_log_sampling_target : ", accuracy_log_sampling_target);
+  summary("print_timestamps : ", print_timestamps);
+  summary("performance_issue_unique : ", performance_issue_unique);
+  summary("performance_issue_same : ", performance_issue_same);
+  summary("performance_issue_same_index : ", performance_issue_same_index);
+  summary("performance_sample_count : ", performance_sample_count);
+  if (sample_concatenate_permutation) {
+    summary(
+        "WARNING: sample_concatenate_permutation was set to true. \n"
+        "Generated samples per query might be different as the one in the "
+        "setting.\n"
+        "Check the generated_samples_per_query line in the detailed log for "
+        "the real\n"
+        "samples_per_query value");
+  }
+}
+
+}  // namespace loadgen
+
+int TestSettings::FromConfig(const std::string &path, const std::string &model,
+                             const std::string &scenario, int conf_type) {
+  std::map<std::string, std::string> kv;
+  static int configCount = 0;
+
+  if (conf_type == 1) {
+    if (configCount == 0) {
+      // Only allow userConf as the single configFile and loadgen loads the
+      // mlperfConf automatically for perf and accuracy runs
+      FromConfig("", model, scenario, 0);
+    }
+
+    else {
+      LogDetail([](AsyncDetail &detail) {
+        std::stringstream ss;
+        ss << "Multiple conf files are used. This is not valid for official "
+              "submission.";
+        MLPERF_LOG_ERROR(detail, "error_invalid_config", ss.str());
+      });
+    }
+    configCount++;
+  }
+
+  // lookup key/value pairs from config
+  auto lookupkv = [&](const std::string &model, const std::string &scenario,
+                      const std::string &key, uint64_t *val_l, double *val_d,
+                      double multiplier = 1.0) {
+    std::map<std::string, std::string>::iterator it;
+    std::string found;
+    // lookup exact key first
+    it = kv.find(model + "." + scenario + "." + key);
+    if (it != kv.end()) {
+      found = it->second;
+    } else {
+      // lookup key with model wildcard
+      it = kv.find("*." + scenario + "." + key);
+      if (it != kv.end()) {
+        found = it->second;
+      } else {
+        it = kv.find(model + ".*." + key);
+        if (it != kv.end()) {
+          found = it->second;
+        } else {
+          it = kv.find("*.*." + key);
+          if (it != kv.end()) {
+            found = it->second;
+          } else {
+            return false;
+          }
+        }
+      }
+    }
+    // if we get here, found will be set
+    if (val_l) {
+      *val_l = strtoull(found.c_str(), nullptr, 0) *
+               static_cast<uint64_t>(multiplier);
+    }
+    if (val_d) *val_d = strtod(found.c_str(), nullptr) * multiplier;
+    return true;
+  };
+
+  int line_nr = 0;
+  int errors = 0;
+  // Declare the input stream before the if-else block
+  std::unique_ptr<std::istream> fss;
+  std::string line;
+
+  if (conf_type != 0) {
+    // dirt simple config parser
+    fss = std::make_unique<std::ifstream>(path);
+    if (!static_cast<std::ifstream *>(fss.get())->is_open()) {
+      LogDetail([p = path](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+        std::stringstream ss;
+        ss << "can't open file " << p;
+        MLPERF_LOG_ERROR(detail, "error_invalid_config", ss.str());
+#else
+        detail.Error("can't open file ", p);
+#endif
+      });
+      return -ENOENT;
+    }
+  } else {
+    // Convert unsigned char array to std::string
+    std::string config_str(mlperf_conf);
+    fss = std::make_unique<std::istringstream>(config_str);
+  }
+  while (std::getline(*fss, line)) {
+    line_nr++;
+    std::istringstream iss(line);
+    std::string s, k;
+    int looking_for = 0;  // 0=key, 1=equal, 2=value
+    while (iss >> s) {
+      if (s == "#" && looking_for != 2) {
+        // done with this line
+        break;
+      }
+      if (looking_for == 2) {
+        // got key and value
+        const char *start = s.c_str();
+        char *stop;
+        (void)strtoul(start, &stop, 0);
+        if (start + s.size() == stop) {
+          kv[k] = s;
+          continue;
+        }
+        (void)strtod(start, &stop);
+        if (start + s.size() == stop) {
+          kv[k] = s;
+          continue;
+        }
+        errors++;
+        LogDetail([l = line_nr](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+          std::stringstream ss;
+          ss << "value needs to be integer or double, line=" << l;
+          MLPERF_LOG_ERROR(detail, "error_invalid_config", ss.str());
+#else
+          detail.Error("value needs to be integer or double, line=", l);
+#endif
+        });
+        break;
+      }
+      if (looking_for == 1 && s != "=") {
+        errors++;
+        LogDetail([l = line_nr](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+          std::stringstream ss;
+          ss << "expected 'key=value', line=" << l;
+          MLPERF_LOG_ERROR(detail, "error_invalid_config", ss.str());
+#else
+          detail.Error("expected 'key=value', line=", l);
+#endif
+        });
+        break;
+      }
+      if (looking_for == 0) k = s;
+      looking_for++;
+    }
+  }
+  if (errors != 0) return -EINVAL;
+
+  uint64_t val;
+
+  // keys that apply to all scenarios
+  if (lookupkv(model, scenario, "mode", &val, nullptr)) {
+    switch (val) {
+      case 0:
+        mode = TestMode::SubmissionRun;
+        break;
+      case 1:
+        mode = TestMode::AccuracyOnly;
+        break;
+      case 2:
+        mode = TestMode::PerformanceOnly;
+        break;
+      case 3:
+        mode = TestMode::FindPeakPerformance;
+        break;
+      default:
+        LogDetail([](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+          std::stringstream ss;
+          ss << "Invalid value passed to Mode key in config.";
+          MLPERF_LOG_ERROR(detail, "error_invalid_config", ss.str());
+#else
+          detail.Error("Invalid value passed to Mode key in config.");
+#endif
+        });
+        break;
+    }
+  }
+
+  if (conf_type == 0) {
+    lookupkv(model, scenario, "qsl_rng_seed", &qsl_rng_seed, nullptr);
+    lookupkv(model, scenario, "sample_index_rng_seed", &sample_index_rng_seed,
+             nullptr);
+    lookupkv(model, scenario, "schedule_rng_seed", &schedule_rng_seed, nullptr);
+    lookupkv(model, scenario, "accuracy_log_probability", nullptr,
+             &accuracy_log_probability, 0.01);
+    if (lookupkv(model, scenario, "test05", &val, nullptr))
+      test05 = (val == 1) ? true : false;
+    lookupkv(model, scenario, "test05_qsl_rng_seed", &test05_qsl_rng_seed,
+             nullptr);
+    lookupkv(model, scenario, "test05_sample_index_rng_seed",
+             &test05_sample_index_rng_seed, nullptr);
+    lookupkv(model, scenario, "test05_schedule_rng_seed",
+             &test05_schedule_rng_seed, nullptr);
+  }
+
+  // keys that can be overriden in user.conf but will make the results eligible
+  // only for open submissions
+
+  // keys to measure token metrics
+  if (lookupkv(model, scenario, "use_token_latencies", &val, nullptr)) {
+    use_token_latencies = (val == 1) ? true : false;
+  }
+  if (use_token_latencies) {
+    lookupkv(model, "Server", "ttft_latency", &server_ttft_latency, nullptr,
+             1000 * 1000);
+    lookupkv(model, "Server", "tpot_latency", &server_tpot_latency, nullptr,
+             1000 * 1000);
+  }
+
+  // keys to infer token metrics
+  if (lookupkv(model, scenario, "infer_token_latencies", &val, nullptr)) {
+    infer_token_latencies = (val == 1) ? true : false;
+  }
+  if (infer_token_latencies) {
+    lookupkv(model, scenario, "token_latency_scaling_factor",
+             &token_latency_scaling_factor, nullptr, 1);
+  }
+  // keys that apply to SingleStream
+  lookupkv(model, "SingleStream", "target_latency_percentile", nullptr,
+           &single_stream_target_latency_percentile, 0.01);
+
+  // keys that apply to MultiStream
+  lookupkv(model, "MultiStream", "target_latency_percentile", nullptr,
+           &multi_stream_target_latency_percentile, 0.01);
+  lookupkv(model, "MultiStream", "samples_per_query",
+           &multi_stream_samples_per_query, nullptr, 1);
+
+  // keys that apply to Server
+  lookupkv(model, "Server", "target_latency_percentile", nullptr,
+           &server_target_latency_percentile, 0.01);
+  lookupkv(model, "Server", "target_latency", &server_target_latency_ns,
+           nullptr, 1000 * 1000);
+
+  // keys that can be overriden in user.conf (the provided values still need to
+  // pass the submission checker rules)
+  if (lookupkv(model, scenario, "performance_issue_unique", &val, nullptr))
+    performance_issue_unique = (val == 0) ? false : true;
+  if (lookupkv(model, scenario, "performance_issue_same", &val, nullptr))
+    performance_issue_same = (val == 0) ? false : true;
+  lookupkv(model, scenario, "performance_issue_same_index",
+           &performance_issue_same_index, nullptr);
+
+  if (lookupkv(model, scenario, "sample_concatenate_permutation", &val,
+               nullptr))
+    sample_concatenate_permutation = (val == 1) ? true : false;
+  if (lookupkv(model, "Server", "coalesce_queries", &val, nullptr))
+    server_coalesce_queries = (val == 0) ? false : true;
+  if (lookupkv(model, "Server", "max_async_queries", &val, nullptr))
+    server_max_async_queries = int(val);
+
+  lookupkv(model, scenario, "min_duration", &min_duration_ms, nullptr);
+  lookupkv(model, scenario, "max_duration", &max_duration_ms, nullptr);
+  lookupkv(model, scenario, "min_query_count", &min_query_count, nullptr);
+  lookupkv(model, scenario, "max_query_count", &max_query_count, nullptr);
+  lookupkv(model, scenario, "performance_sample_count_override",
+           &performance_sample_count_override, nullptr);
+  lookupkv(model, "SingleStream", "target_latency", nullptr,
+           &single_stream_expected_latency_ns, 1000 * 1000);
+  lookupkv(model, "MultiStream", "target_latency", nullptr,
+           &multi_stream_expected_latency_ns, 1000 * 1000);
+  lookupkv(model, "Server", "target_qps", nullptr, &server_target_qps);
+  lookupkv(model, "Offline", "target_qps", 0, &offline_expected_qps);
+
+  if (lookupkv(model, scenario, "print_timestamps", &val, nullptr))
+    print_timestamps = (val == 0) ? false : true;
+
+  // keys that are used in audit.conf
+  lookupkv(model, scenario, "accuracy_log_rng_seed", &accuracy_log_rng_seed,
+           nullptr);
+  lookupkv(model, scenario, "accuracy_log_sampling_target",
+           &accuracy_log_sampling_target, nullptr);
+  return 0;
+}
+
+}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings_internal.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings_internal.h
new file mode 100644
index 000000000..ab2773bd1
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings_internal.h
@@ -0,0 +1,182 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief The internal representation of user-provided settings.
+
+#ifndef MLPERF_LOADGEN_TEST_SETTINGS_INTERNAL_H
+#define MLPERF_LOADGEN_TEST_SETTINGS_INTERNAL_H
+
+#include <chrono>
+#include <cmath>
+#include <string>
+
+#include "logging.h"
+#include "test_settings.h"
+
+namespace mlperf {
+
+namespace logging {
+class AsyncSummary;
+}
+
+namespace loadgen {
+
+using AsyncSummary = logging::AsyncSummary;
+
+std::string ToString(TestScenario scenario);
+std::string ToString(TestMode mode);
+
+/// \brief takes the user-friendly TestSettings and normalizes it
+/// for consumption by the loadgen.
+/// \details It does things like remove scenario-specific naming and introduce
+/// the concept of target_duration used to pre-generate queries.
+struct TestSettingsInternal {
+  explicit TestSettingsInternal(const TestSettings &requested_settings,
+                                size_t qsl_performance_sample_count);
+  void LogEffectiveSettings() const;
+  void LogAllSettings() const;
+  void LogSummary(AsyncSummary &summary) const;
+
+  const TestSettings requested;
+  const TestScenario scenario;  // Copied here for convenience.
+  const TestMode mode;          // Copied here for convenience.
+
+  uint64_t samples_per_query;
+  double target_qps;
+  std::chrono::nanoseconds target_latency{0};
+  double target_latency_percentile;  // Single, multistream, and server modes.
+  uint64_t max_async_queries;
+
+  // Target duration is used to generate queries of a minimum duration before
+  // the test run.
+  std::chrono::milliseconds target_duration{0};
+
+  // Min duration/query_count/sample_count are used to validate the test
+  // duration at the end of the run.
+  std::chrono::milliseconds min_duration{0};
+  std::chrono::milliseconds max_duration{0};
+  uint64_t min_query_count;
+  uint64_t max_query_count;
+  uint64_t min_sample_count;  // Offline only.
+
+  uint64_t qsl_rng_seed;
+  uint64_t sample_index_rng_seed;
+  uint64_t schedule_rng_seed;
+  uint64_t accuracy_log_rng_seed;
+  double accuracy_log_probability;
+  uint64_t accuracy_log_sampling_target;
+  bool print_timestamps;
+  bool performance_issue_unique;
+  bool performance_issue_same;
+  uint64_t performance_issue_same_index;
+  uint64_t performance_sample_count;
+
+  bool sample_concatenate_permutation;
+  bool use_token_latencies = false;
+  int64_t server_ttft_latency;
+  int64_t server_tpot_latency;
+
+  bool infer_token_latencies = false;
+  int64_t token_latency_scaling_factor;
+};
+
+/// \brief A namespace of collections of FindPeakPerformance helper functions,
+/// mainly about binary search.
+namespace find_peak_performance {
+
+constexpr char const *kNotSupportedMsg =
+    "Finding peak performance is only supported in Server scenarios.";
+
+template <TestScenario scenario>
+TestSettingsInternal MidOfBoundaries(
+    const TestSettingsInternal &lower_bound_settings,
+    const TestSettingsInternal &upper_bound_settings) {
+  TestSettingsInternal mid_settings = lower_bound_settings;
+  if (scenario == TestScenario::Server) {
+    assert(lower_bound_settings.target_qps < upper_bound_settings.target_qps);
+    mid_settings.target_qps =
+        lower_bound_settings.target_qps +
+        (upper_bound_settings.target_qps - lower_bound_settings.target_qps) / 2;
+  } else {
+    LogDetail([](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+      MLPERF_LOG_ERROR(detail, "error_invalid_test_settings", kNotSupportedMsg);
+#else
+      detail(kNotSupportedMsg);
+#endif
+    });
+  }
+  return mid_settings;
+}
+
+template <TestScenario scenario>
+bool IsFinished(const TestSettingsInternal &lower_bound_settings,
+                const TestSettingsInternal &upper_bound_settings) {
+  if (scenario == TestScenario::Server) {
+    uint8_t precision = lower_bound_settings.requested
+                            .server_find_peak_qps_decimals_of_precision;
+    double l =
+        std::floor(lower_bound_settings.target_qps * std::pow(10, precision));
+    double u =
+        std::floor(upper_bound_settings.target_qps * std::pow(10, precision));
+    return l + 1 >= u;
+  } else {
+    LogDetail([](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+      MLPERF_LOG_ERROR(detail, "error_invalid_test_settings", kNotSupportedMsg);
+#else
+      detail(kNotSupportedMsg);
+#endif
+    });
+    return true;
+  }
+}
+
+template <TestScenario scenario>
+std::string ToStringPerformanceField(const TestSettingsInternal &settings) {
+  if (scenario == TestScenario::Server) {
+    return std::to_string(settings.target_qps);
+  } else {
+    LogDetail([](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+      MLPERF_LOG_ERROR(detail, "error_invalid_test_settings", kNotSupportedMsg);
+#else
+      detail(kNotSupportedMsg);
+#endif
+    });
+    return ToString(settings.scenario);
+  }
+}
+
+template <TestScenario scenario>
+void WidenPerformanceField(TestSettingsInternal *settings) {
+  if (scenario == TestScenario::Server) {
+    settings->target_qps =
+        settings->target_qps *
+        (1 + settings->requested.server_find_peak_qps_boundary_step_size);
+  } else {
+    LogDetail([](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+      MLPERF_LOG_ERROR(detail, "error_invalid_test_settings", kNotSupportedMsg);
+#else
+      detail(kNotSupportedMsg);
+#endif
+    });
+  }
+}
+
+}  // namespace find_peak_performance
+}  // namespace loadgen
+}  // namespace mlperf
+
+#endif  // MLPERF_LOADGEN_TEST_SETTINGS_INTERNAL_H
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/BUILD.gn b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/BUILD.gn
new file mode 100644
index 000000000..d73bf831a
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/BUILD.gn
@@ -0,0 +1,25 @@
+static_library("mlperf_loadgen_tests_loadgen_test_main") {
+  sources = [ "loadgen_test.h", "loadgen_test_main.cc" ]
+  configs += [ "//build/config/compiler:exceptions" ]
+}
+
+executable("mlperf_loadgen_perftests") {
+  sources = [ "perftests_null_sut.cc" ]
+  deps = [ "..:mlperf_loadgen" ]
+}
+
+executable("mlperf_loadgen_tests_basic") {
+  sources = [ "basic.cc" ]
+  deps = [ "..:mlperf_loadgen",
+           ":mlperf_loadgen_tests_loadgen_test_main"  ]
+  configs += [ "//build/config/compiler:exceptions" ]
+}
+
+source_set("mlperf_loadgen_perftests_py") {
+  sources = [ "perftests_null_sut.py" ]
+  deps = [ "../..:loadgen_pymodule_wheel_lib" ]
+}
+
+source_set("docs") {
+  sources = [ "README.md" ]
+}
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/README.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/README.md
new file mode 100644
index 000000000..41056b457
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/README.md
@@ -0,0 +1,42 @@
+# Building and Running the Tests {#ReadmeTests}
+
+The unit and performance tests are only supported via gn/ninja at the moment.
+
+See the [top-level build readme](@ref ReadmeBuild) for details but, from a clean checkout, you must first run:
+
+    make bootstrap_gn_ninja
+    third_party/gn/gn gen out/Release --args="is_debug=false"
+
+This will build the gn and ninja build tools and create a release project.
+
+## Unit Tests
+
+To build:
+
+    third_party/ninja/ninja -C out/Release mlperf_loadgen_tests_basic
+
+To run all tests:
+
+    out/Release/mlperf_loadgen_tests_basic .
+
+To run specific tests:
+
+    out/Release/mlperf_loadgen_tests_basic <regex>
+    e.g.:
+    out/Release/mlperf_loadgen_tests_basic SingleStream
+
+## Performance Tests
+
+To build:
+
+    third_party/ninja/ninja -C out/Release mlperf_loadgen_perftests
+
+To run all tests:
+
+    out/Release/mlperf_loadgen_perftests .
+
+To run specific tests:
+
+    out/Release/mlperf_loadgen_perftests <regex>
+    e.g.:
+    out/Release/mlperf_loadgen_tests_basic ServerPool
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/basic.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/basic.cc
new file mode 100644
index 000000000..97c6a0bb1
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/basic.cc
@@ -0,0 +1,314 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Basic functionality unit tests.
+
+#include <algorithm>
+#include <deque>
+#include <future>
+#include <iostream>
+#include <queue>
+#include <unordered_set>
+#include <vector>
+
+#include "../loadgen.h"
+#include "../query_sample_library.h"
+#include "../system_under_test.h"
+#include "../test_settings.h"
+#include "loadgen_test.h"
+
+/// \brief Correctness unit tests.
+namespace unit_tests {
+
+/// \defgroup LoadgenTestsBasic Test Coverage: Basic
+
+/// \brief Implements the client interfaces of the loadgen and
+/// has some basic sanity checks that are enabled for all tests.
+/// \details It also forwards calls to overrideable *Ext methods and implements
+/// the TestProxy concept.
+struct SystemUnderTestBasic : public mlperf::QuerySampleLibrary,
+                              public mlperf::SystemUnderTest {
+  const std::string& Name() const override { return name_; }
+
+  size_t TotalSampleCount() override { return total_sample_count_; }
+  size_t PerformanceSampleCount() override { return performance_sample_count_; }
+
+  void LoadSamplesToRam(
+      const std::vector<mlperf::QuerySampleIndex>& samples) override {
+    for (auto s : samples) {
+      samples_load_count_.at(s)++;
+      loaded_samples_.push_back(s);
+    }
+    LoadSamplesToRamExt(samples);
+  }
+  virtual void LoadSamplesToRamExt(
+      const std::vector<mlperf::QuerySampleIndex>& samples) {}
+
+  void UnloadSamplesFromRam(
+      const std::vector<mlperf::QuerySampleIndex>& samples) override {
+    for (auto s : samples) {
+      FAIL_IF(loaded_samples_.front() != s) &&
+          FAIL_EXP(loaded_samples_.front()) && FAIL_EXP(s);
+      loaded_samples_.pop_front();
+      size_t prev_load_count = samples_load_count_.at(s)--;
+      FAIL_IF(prev_load_count == 0) && FAIL_EXP(prev_load_count);
+    }
+    UnloadSamplesFromRamExt(samples);
+  }
+  virtual void UnloadSamplesFromRamExt(
+      const std::vector<mlperf::QuerySampleIndex>& samples) {}
+
+  void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
+    std::vector<mlperf::QuerySampleResponse> responses;
+    query_sizes_.push_back(samples.size());
+    samples_between_flushes_.back() += samples.size();
+    responses.reserve(samples.size());
+    for (auto s : samples) {
+      FAIL_IF(samples_load_count_.at(s.index) == 0) &&
+          FAIL_MSG("Issued unloaded sample:") && FAIL_EXP(s.index);
+      samples_issue_count_.at(s.index)++;
+      issued_samples_.push_back(s.index);
+      responses.push_back({s.id, 0, 0});
+    }
+    mlperf::QuerySamplesComplete(responses.data(), responses.size());
+    IssueQueryExt(samples);
+  }
+  virtual void IssueQueryExt(const std::vector<mlperf::QuerySample>& samples) {}
+
+  void FlushQueries() override {
+    samples_between_flushes_.push_back(0);
+    FlushQueriesExt();
+  }
+  virtual void FlushQueriesExt() {}
+
+  virtual void RunTest() {
+    samples_load_count_.resize(total_sample_count_, 0);
+    samples_issue_count_.resize(total_sample_count_, 0);
+    samples_between_flushes_.resize(1, 0);
+    mlperf::StartTest(this, this, test_settings_, log_settings_);
+  }
+
+  virtual void EndTest() {}
+
+ protected:
+  mlperf::TestSettings test_settings_;
+  mlperf::LogSettings log_settings_;
+
+  std::string name_{"BasicSUT"};
+  size_t total_sample_count_;
+  size_t performance_sample_count_;
+  std::vector<mlperf::QuerySampleIndex> issued_samples_;
+  std::deque<mlperf::QuerySampleIndex> loaded_samples_;
+  std::vector<size_t> samples_load_count_;
+  std::vector<size_t> samples_issue_count_;
+
+  std::vector<size_t> query_sizes_;
+  std::vector<size_t> samples_between_flushes_;
+};
+
+/// \brief Provides common test set up logic.
+struct SystemUnderTestAccuracy : public SystemUnderTestBasic {
+  virtual void SetUpTest(size_t samples_per_query,
+                         size_t samples_per_query_remainder,
+                         size_t accuracy_remainder,
+                         mlperf::TestScenario scenario) {
+    performance_sample_count_ =
+        samples_per_query * 16 + samples_per_query_remainder;
+    total_sample_count_ = performance_sample_count_ * 32 + accuracy_remainder;
+
+    log_settings_.log_output.prefix_with_datetime = false;
+
+    test_settings_.scenario = scenario;
+    test_settings_.mode = mlperf::TestMode::AccuracyOnly;
+    test_settings_.multi_stream_samples_per_query = samples_per_query;
+
+    double qps = 1e3;
+    test_settings_.server_target_qps = qps;
+  }
+};
+
+/// \brief Verifies all samples from the QSL are included at least once
+/// in accuracy mode.
+/// \ingroup LoadgenTestsBasic
+struct TestAccuracyIncludesAllSamples : public SystemUnderTestAccuracy {
+  void EndTest() override {
+    std::sort(issued_samples_.begin(), issued_samples_.end());
+
+    FAIL_IF(issued_samples_.size() < total_sample_count_) &&
+        FAIL_EXP(issued_samples_.size()) && FAIL_EXP(total_sample_count_);
+    FAIL_IF(issued_samples_.front() != 0) && FAIL_EXP(issued_samples_.front());
+    FAIL_IF(issued_samples_.back() != total_sample_count_ - 1) &&
+        FAIL_EXP(issued_samples_.back()) && FAIL_EXP(total_sample_count_);
+
+    mlperf::QuerySampleIndex prev = -1;
+    size_t discontinuities = 0;
+    size_t dupes = 0;
+    for (auto s : issued_samples_) {
+      if (s == prev) {
+        dupes++;
+      } else if (s - prev > 1) {
+        discontinuities++;
+      }
+      prev = s;
+    }
+
+    FAIL_IF(discontinuities != 0) && FAIL_EXP(discontinuities);
+    FAIL_IF(dupes != 0) && FAIL_EXP(dupes);
+  }
+};
+
+REGISTER_TEST_ALL_SCENARIOS(AccuracyIncludesAllSamples,
+                            TestProxy<TestAccuracyIncludesAllSamples>(), 4, 0,
+                            0);
+
+/// \brief Verifies samples from the QSL aren't included too many times.
+/// \details This is a regression test for:
+/// https://github.com/mlperf/inference/pull/386
+/// The root cause was using different values for samples_per_query while
+/// generating queries for the GNMT dataset.
+/// \ingroup LoadgenTestsBasic
+struct TestAccuracyDupesAreLimitted : public SystemUnderTestAccuracy {
+  void SetUpTest(bool, mlperf::TestScenario scenario) {
+    SystemUnderTestAccuracy::SetUpTest(4, 0, 0, scenario);
+    total_sample_count_ = 3003;
+    performance_sample_count_ = 1001;
+  }
+
+  void EndTest() override {
+    std::sort(issued_samples_.begin(), issued_samples_.end());
+
+    FAIL_IF(issued_samples_.size() < total_sample_count_) &&
+        FAIL_EXP(issued_samples_.size()) && FAIL_EXP(total_sample_count_);
+    FAIL_IF(issued_samples_.front() != 0) && FAIL_EXP(issued_samples_.front());
+    FAIL_IF(issued_samples_.back() != total_sample_count_ - 1) &&
+        FAIL_EXP(issued_samples_.back()) && FAIL_EXP(total_sample_count_);
+
+    std::vector<size_t> issue_counts(total_sample_count_, 0);
+    for (auto s : issued_samples_) {
+      issue_counts.at(s)++;
+    }
+
+    const size_t max_count = 1;
+    for (size_t i = 0; i < issue_counts.size(); i++) {
+      FAIL_IF(issue_counts[i] > max_count) && FAIL_EXP(i) &&
+          FAIL_EXP(max_count) && FAIL_EXP(issue_counts[i]);
+    }
+  }
+};
+
+REGISTER_TEST_ALL_SCENARIOS(TestAccuracyDupesAreLimitted,
+                            TestProxy<TestAccuracyDupesAreLimitted>(), true);
+
+/// \brief Verifies offline + accuracy doesn't hang if the last set
+/// in the accuracy series is smaller than others.
+/// \ingroup LoadgenTestsBasic
+struct TestOfflineRemainderAccuracySet : public SystemUnderTestAccuracy {
+  void SetUpTest() {
+    SystemUnderTestAccuracy::SetUpTest(4, 0, 7, mlperf::TestScenario::Offline);
+  }
+
+  void EndTest() override {
+    auto& flush_samples = samples_between_flushes_;
+
+    FAIL_IF(flush_samples.size() < 3) && FAIL_EXP(flush_samples.size()) &&
+        BAD_TEST_MSG("Test should generate multiple query sets.") && ABORT_TEST;
+
+    // The last counter will be 0, since a test ends with a call to
+    // FlushQuery.
+    FAIL_IF(flush_samples.back() != 0) && FAIL_EXP(flush_samples.back()) &&
+        FAIL_MSG(
+            "Detected stray calls to IssueQuery after the last call to "
+            "FlushQuery.");
+    flush_samples.pop_back();
+
+    // Verify the test ran with a smaller last accuracy set.
+    size_t first_size = flush_samples.front();
+    size_t last_size = flush_samples.back();
+    FAIL_IF(first_size <= last_size) && FAIL_EXP(first_size) &&
+        FAIL_EXP(last_size) && BAD_TEST_MSG();
+
+    flush_samples.pop_back();  // Don't check the last set for equality.
+    for (size_t query_size : flush_samples) {
+      FAIL_IF(query_size != first_size) && FAIL_EXP(query_size) &&
+          FAIL_EXP(first_size);
+    }
+  }
+};
+
+REGISTER_TEST(Offline_RemainderAccuracySets,
+              TestProxy<TestOfflineRemainderAccuracySet>());
+
+/// \brief Verifies all queries only contain samples that are contiguous,
+/// even if the set size is not a multiple of samples_per_query.
+/// \ingroup LoadgenTestsBasic
+struct TestMultiStreamContiguousRemainderQuery
+    : public SystemUnderTestAccuracy {
+  void SetUpTest(mlperf::TestScenario scenario) {
+    SystemUnderTestAccuracy::SetUpTest(4, 1, 0, scenario);
+    first_qsl_offsets_.resize(total_sample_count_, kBadQslOffset);
+
+    auto spq = test_settings_.multi_stream_samples_per_query;
+    FAIL_IF(performance_sample_count_ % spq == 0) &&
+        FAIL_EXP(performance_sample_count_) && FAIL_EXP(spq) &&
+        BAD_TEST_MSG("There is no remainder.");
+  }
+
+  void LoadSamplesToRamExt(
+      const std::vector<mlperf::QuerySampleIndex>& samples) override {
+    FAIL_IF(loaded_samples_.size() != samples.size()) &&
+        FAIL_MSG("Contiguous sample order is likely ambiguous.");
+    for (size_t i = 0; i < samples.size(); i++) {
+      auto& offset = first_qsl_offsets_.at(samples.at(i));
+      // Samples may be loaded into multiple slots for padding purposes,
+      // so make sure to only index the first time a sample appears in a
+      // loaded set.
+      if (offset == kBadQslOffset) {
+        offset = i;
+      }
+    }
+  }
+
+  void UnloadSamplesFromRamExt(
+      const std::vector<mlperf::QuerySampleIndex>& samples) override {
+    FAIL_IF(!loaded_samples_.empty()) &&
+        FAIL_MSG("Contiguous sample order is likely ambiguous.");
+    for (size_t i = 0; i < samples.size(); i++) {
+      first_qsl_offsets_.at(samples.at(i)) = kBadQslOffset;
+    }
+  }
+
+  void IssueQueryExt(const std::vector<mlperf::QuerySample>& samples) override {
+    size_t expected_offset = first_qsl_offsets_[samples[0].index];
+    for (auto s : samples) {
+      FAIL_IF(loaded_samples_[expected_offset] != s.index) &&
+          FAIL_MSG("Samples are not contiguous.");
+      expected_offset++;
+    }
+  }
+
+  void FlushQueriesExt() override {}
+
+  void EndTest() override {}
+
+ private:
+  static const size_t kBadQslOffset;
+  std::vector<size_t> first_qsl_offsets_;
+};
+
+constexpr size_t TestMultiStreamContiguousRemainderQuery::kBadQslOffset =
+    std::numeric_limits<size_t>::max();
+
+REGISTER_TEST(MultiStream_RemainderQueryContiguous,
+              TestProxy<TestMultiStreamContiguousRemainderQuery>(),
+              mlperf::TestScenario::MultiStream);
+}  // namespace unit_tests
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/loadgen_test.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/loadgen_test.h
new file mode 100644
index 000000000..777029b99
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/loadgen_test.h
@@ -0,0 +1,198 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief A minimal test framework.
+
+#ifndef MLPERF_LOADGEN_TESTS_LOADGEN_TEST_H_
+#define MLPERF_LOADGEN_TESTS_LOADGEN_TEST_H_
+
+#include <algorithm>
+#include <exception>
+#include <functional>
+#include <iostream>
+#include <map>
+#include <vector>
+
+#define REGISTER_TEST(name, ...) \
+  static Test::StaticRegistrant test##name(#name, __VA_ARGS__);
+
+#define REGISTER_TEST_SCENARIO(name, scenario, test, ...) \
+  static Test::StaticRegistrant t##name##scenario(        \
+      #name "_" #scenario, test, __VA_ARGS__, mlperf::TestScenario::scenario)
+
+#define REGISTER_TEST_ALL_SCENARIOS(name, test, ...)             \
+  REGISTER_TEST_SCENARIO(name, SingleStream, test, __VA_ARGS__); \
+  REGISTER_TEST_SCENARIO(name, MultiStream, test, __VA_ARGS__);  \
+  REGISTER_TEST_SCENARIO(name, Server, test, __VA_ARGS__);       \
+  REGISTER_TEST_SCENARIO(name, Offline, test, __VA_ARGS__);
+
+#define FAIL_IF(exp)                                              \
+  [&]() {                                                         \
+    const bool v = exp;                                           \
+    if (v) {                                                      \
+      std::cerr << "\n   ERROR: (" << __FILE__ << "@" << __LINE__ \
+                << ") : " #exp;                                   \
+      Test::AddFailure();                                         \
+    }                                                             \
+    return v;                                                     \
+  }()
+
+#define FAIL_MSG(...)                                                      \
+  [&]() {                                                                  \
+    std::cerr << "\n    Info: (" << __FILE__ << "@" << __LINE__ << ") : "; \
+    Test::Log(__VA_ARGS__);                                                \
+    return true;                                                           \
+  }()
+
+#define FAIL_EXP(exp)                                                      \
+  [&]() {                                                                  \
+    std::cerr << "\n    Info: (" << __FILE__ << "@" << __LINE__ << ") : "; \
+    std::cerr << #exp << " is " << (exp);                                  \
+    return true;                                                           \
+  }()
+
+#define BAD_TEST_MSG(...)                                        \
+  [&]() {                                                        \
+    FAIL_MSG("The test isn't testing what it claims to test. "); \
+    Test::Log(__VA_ARGS__);                                      \
+    return true;                                                 \
+  }()
+
+#define ABORT_TEST                                     \
+  [&]() {                                              \
+    FAIL_MSG("ABORTING");                              \
+    throw std::logic_error("ABORT_TEST encountered."); \
+    return false;                                      \
+  }();
+
+/// \brief Testing utilities.
+namespace testing {
+
+/// \brief Wraps a test class as a functor for easy registration.
+/// Forwards registration args to a SetUpTest method.
+/// \details Calls SetUpTest, RunTest, and EndTest.
+template <typename TestT>
+struct TestProxy {
+  template <typename... Args>
+  void operator()(Args&&... args) {
+    TestT test;
+    test.SetUpTest(std::forward<Args>(args)...);
+    test.RunTest();
+    test.EndTest();
+  }
+};
+
+/// \brief A collection of methods for registering and running tests.
+class Test {
+  /// \brief Maps registered test names to a callback.
+  using TestMap = std::multimap<const char*, std::function<void()>>;
+
+  /// \brief The registered tests.
+  /// \details Wraps a static local to avoid undefined initialization order
+  /// and guarantee it is initialized before the first test registers itself.
+  static TestMap& tests() {
+    static TestMap tests_;
+    return tests_;
+  }
+
+  /// \brief The number of errors the current test has encountered.
+  static size_t& test_fails() {
+    static size_t test_fails_ = 0;
+    return test_fails_;
+  }
+
+ public:
+  /// \brief Registers a test before main() starts during static initialization.
+  struct StaticRegistrant {
+    template <typename... Args>
+    StaticRegistrant(Args&&... args) {
+      Test::Register(std::forward<Args>(args)...);
+    }
+  };
+
+  /// \brief Registers a test at runtime.
+  template <typename TestF, typename... Args>
+  static void Register(const char* name, TestF test, Args&&... args) {
+    std::function<void()> test_closure =
+        std::bind(test, std::forward<Args>(args)...);
+    tests().insert({std::move(name), std::move(test_closure)});
+  }
+
+  /// \brief Runs all currently registered tests that match the given filter.
+  static int Run(std::function<bool(const char*)> filter) {
+    // Determine which tests are enabled.
+    std::vector<TestMap::value_type*> enabled_tests;
+    for (auto& test : tests()) {
+      if (filter(test.first)) {
+        enabled_tests.push_back(&test);
+      }
+    }
+    const size_t enabled = enabled_tests.size();
+    std::cout << enabled << " of " << tests().size() << " tests enabled.\n";
+
+    // Run the tests.
+    std::vector<const char*> failures;
+    for (size_t i = 0; i < enabled; i++) {
+      const char* name = enabled_tests[i]->first;
+      std::cout << "[" << (i + 1) << "/" << enabled << "] : " << name << " : ";
+      std::cout.flush();
+      test_fails() = 0;
+      try {
+        enabled_tests[i]->second();  // Run the test.
+      } catch (std::exception& e) {
+        constexpr bool TestThrewException = true;
+        FAIL_IF(TestThrewException) && FAIL_EXP(e.what());
+      }
+      if (test_fails() > 0) {
+        failures.push_back(name);
+        std::cerr << "\n FAILED: " << name << "\n";
+      } else {
+        std::cout << "SUCCESS\n";
+      }
+    }
+
+    // Summarize.
+    if (enabled_tests.empty()) {
+      std::cerr << "Check your test filter.\n";
+    } else if (failures.empty()) {
+      std::cout << "All " << enabled << " tests passed! \\o/\n";
+    } else {
+      std::cout << failures.size() << " of " << enabled << " tests failed:\n";
+      for (auto failed_test_name : failures) {
+        std::cout << "  " << failed_test_name << "\n";
+      }
+    }
+    return failures.size();
+  }
+
+  /// \brief Used by test macros to flag test failure.
+  static void AddFailure() { test_fails()++; }
+
+  /// \brief Base case for the variadic version of Log.
+  static void Log() {}
+
+  /// \brief Used by test macros to log an arbitrary list of args.
+  template <typename T, typename... Args>
+  static void Log(T&& v, Args&&... args) {
+    std::cerr << v;
+    Log(std::forward<Args>(args)...);
+  }
+};
+
+}  // namespace testing
+
+// The testing namespace exists for documentation purposes.
+// Export the testing namespace for all files that define tests.
+using namespace testing;
+
+#endif  // MLPERF_LOADGEN_TESTS_LOADGEN_TEST_H_
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/loadgen_test_main.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/loadgen_test_main.cc
new file mode 100644
index 000000000..3dc5afa80
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/loadgen_test_main.cc
@@ -0,0 +1,33 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief A main entry point a test binary can use if it just wants to execute
+/// Test::Run on all statically registered tests.
+
+#include <regex>
+
+#include "loadgen_test.h"
+
+int main(int argc, char* argv[]) {
+  if (argc <= 1) {
+    std::cerr << "Usage: " << argv[0] << " <include_regex> <exclude_regex>\n";
+    return -1;
+  }
+  std::regex include_regex(argc >= 2 ? argv[1] : ".*");
+  std::regex exclude_regex(argc >= 3 ? std::regex(argv[2]) : std::regex());
+  auto test_filter = [&](const char* test_name) {
+    return (std::regex_search(test_name, include_regex) &&
+            !std::regex_search(test_name, exclude_regex));
+  };
+  return Test::Run(test_filter);
+}
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/perftests_null_sut.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/perftests_null_sut.cc
new file mode 100644
index 000000000..56d562c3e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/perftests_null_sut.cc
@@ -0,0 +1,230 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Performance tests using a null backend.
+
+#include <future>
+
+#include "../loadgen.h"
+#include "../query_sample_library.h"
+#include "../system_under_test.h"
+#include "../test_settings.h"
+
+/// \brief Performance unit tests.
+namespace perf_tests {
+
+/// \defgroup LoadgenTestsPerformance Test Coverage: Performance
+
+/// \brief A simple SUT implemenatation that immediately completes
+/// issued queries sychronously ASAP.
+class SystemUnderTestNull : public mlperf::SystemUnderTest {
+ public:
+  SystemUnderTestNull() = default;
+  ~SystemUnderTestNull() override = default;
+  const std::string& Name() override { return name_; }
+  void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
+    std::vector<mlperf::QuerySampleResponse> responses;
+    responses.reserve(samples.size());
+    for (auto s : samples) {
+      responses.push_back({s.id, 0, 0});
+    }
+    mlperf::QuerySamplesComplete(responses.data(), responses.size());
+  }
+
+  void FlushQueries() override {}
+
+ private:
+  std::string name_{"NullSUT"};
+};
+
+/// \brief A stub implementation of QuerySampleLibrary.
+class QuerySampleLibraryNull : public mlperf::QuerySampleLibrary {
+ public:
+  QuerySampleLibraryNull() = default;
+  ~QuerySampleLibraryNull() = default;
+  const std::string& Name() const override { return name_; }
+
+  size_t TotalSampleCount() override { return 1024 * 1024; }
+
+  size_t PerformanceSampleCount() override { return 1024; }
+
+  void LoadSamplesToRam(
+      const std::vector<mlperf::QuerySampleIndex>& samples) override {
+    return;
+  }
+
+  void UnloadSamplesFromRam(
+      const std::vector<mlperf::QuerySampleIndex>& samples) override {
+    return;
+  }
+
+ private:
+  std::string name_{"NullQSL"};
+};
+
+/// \brief Runs single stream traffic.
+/// \ingroup LoadgenTestsPerformance
+void TestSingleStream() {
+  SystemUnderTestNull null_sut;
+  QuerySampleLibraryNull null_qsl;
+
+  mlperf::LogSettings log_settings;
+  log_settings.log_output.prefix_with_datetime = true;
+
+  mlperf::TestSettings ts;
+
+  mlperf::StartTest(&null_sut, &null_qsl, ts, log_settings);
+}
+
+/// \brief A SUT implementation that completes queries asynchronously using
+/// std::async.
+class SystemUnderTestNullStdAsync : public mlperf::SystemUnderTest {
+ public:
+  SystemUnderTestNullStdAsync() { futures_.reserve(1000000); }
+  ~SystemUnderTestNullStdAsync() override = default;
+  const std::string& Name() const override { return name_; }
+  void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
+    futures_.emplace_back(std::async(std::launch::async, [samples] {
+      std::vector<mlperf::QuerySampleResponse> responses;
+      responses.reserve(samples.size());
+      for (auto s : samples) {
+        responses.push_back({s.id, 0, 0});
+      }
+      mlperf::QuerySamplesComplete(responses.data(), responses.size());
+    }));
+  }
+
+  void FlushQueries() override {}
+
+ private:
+  std::string name_{"NullStdAsync"};
+  std::vector<std::future<void>> futures_;
+};
+
+/// \brief Tests server traffic using SystemUnderTestNullStdAsync.
+/// \ingroup LoadgenTestsPerformance
+void TestServerStdAsync() {
+  SystemUnderTestNullStdAsync null_std_async_sut;
+  QuerySampleLibraryNull null_qsl;
+
+  mlperf::LogSettings log_settings;
+  log_settings.log_output.prefix_with_datetime = true;
+  log_settings.log_output.copy_summary_to_stdout = true;
+
+  mlperf::TestSettings ts;
+  ts.scenario = mlperf::TestScenario::Server;
+  ts.server_target_qps = 2000000;
+  ts.min_duration_ms = 100;
+
+  mlperf::StartTest(&null_std_async_sut, &null_qsl, ts, log_settings);
+}
+
+/// \brief A SUT implementation that completes queries asynchronously using
+/// an explicitly managed thread pool.
+class SystemUnderTestNullPool : public mlperf::SystemUnderTest {
+ public:
+  SystemUnderTestNullPool() {
+    samples_.reserve(kReserveSampleSize);
+    next_poll_time_ = std::chrono::high_resolution_clock::now() + poll_period_;
+    for (size_t i = 0; i < thread_count_; i++) {
+      threads_.emplace_back(&SystemUnderTestNullPool::WorkerThread, this);
+    }
+  }
+
+  ~SystemUnderTestNullPool() override {
+    {
+      std::unique_lock<std::mutex> lock(mutex_);
+      keep_workers_alive_ = false;
+    }
+    cv_.notify_all();
+    for (auto& thread : threads_) {
+      thread.join();
+    }
+  }
+
+  const std::string& Name() const override { return name_; }
+
+  void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
+    std::unique_lock<std::mutex> lock(mutex_);
+    samples_.insert(samples_.end(), samples.begin(), samples.end());
+  }
+
+  void FlushQueries() override {}
+
+ private:
+  void WorkerThread() {
+    std::vector<mlperf::QuerySample> my_samples;
+    my_samples.reserve(kReserveSampleSize);
+    std::unique_lock<std::mutex> lock(mutex_);
+    while (keep_workers_alive_) {
+      next_poll_time_ += poll_period_;
+      auto my_wakeup_time = next_poll_time_;
+      cv_.wait_until(lock, my_wakeup_time,
+                     [&]() { return !keep_workers_alive_; });
+      my_samples.swap(samples_);
+      lock.unlock();
+
+      std::vector<mlperf::QuerySampleResponse> responses;
+      responses.reserve(my_samples.size());
+      for (auto s : my_samples) {
+        responses.push_back({s.id, 0, 0});
+      }
+      mlperf::QuerySamplesComplete(responses.data(), responses.size());
+
+      lock.lock();
+      my_samples.clear();
+    }
+  }
+
+  static constexpr size_t kReserveSampleSize = 1024 * 1024;
+  const std::string name_{"NullPool"};
+  const size_t thread_count_ = 4;
+  const std::chrono::milliseconds poll_period_{1};
+  std::chrono::high_resolution_clock::time_point next_poll_time_;
+
+  std::mutex mutex_;
+  std::condition_variable cv_;
+  bool keep_workers_alive_ = true;
+  std::vector<std::thread> threads_;
+
+  std::vector<mlperf::QuerySample> samples_;
+};
+
+/// \brief Tests server traffic using SystemUnderTestNullPool.
+/// \ingroup LoadgenTestsPerformance
+void TestServerPool() {
+  SystemUnderTestNullPool null_pool;
+  QuerySampleLibraryNull null_qsl;
+
+  mlperf::LogSettings log_settings;
+  log_settings.log_output.prefix_with_datetime = true;
+  log_settings.log_output.copy_summary_to_stdout = true;
+
+  mlperf::TestSettings ts;
+  ts.scenario = mlperf::TestScenario::Server;
+  ts.server_target_qps = 2000000;
+  ts.min_duration_ms = 100;
+
+  mlperf::StartTest(&null_pool, &null_qsl, ts, log_settings);
+}
+
+/// @}
+
+}  // namespace perf_tests
+
+int main(int argc, char* argv[]) {
+  perf_tests::TestSingleStream();
+  perf_tests::TestServerStdAsync();
+  perf_tests::TestServerPool();
+  return 0;
+}
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/perftests_null_sut.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/perftests_null_sut.py
new file mode 100644
index 000000000..115372e18
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/perftests_null_sut.py
@@ -0,0 +1,61 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Python version of perftests_null_sut.cc.
+"""
+
+from __future__ import print_function
+from absl import app
+import mlperf_loadgen
+
+
+def load_samples_to_ram(query_samples):
+    del query_samples
+    return
+
+
+def unload_samples_from_ram(query_samples):
+    del query_samples
+    return
+
+
+def issue_query(query_samples):
+    responses = []
+    for s in query_samples:
+        responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0))
+    mlperf_loadgen.QuerySamplesComplete(responses)
+
+
+def flush_queries():
+    pass
+
+
+def main(argv):
+    del argv
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.SingleStream
+    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+
+    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
+    qsl = mlperf_loadgen.ConstructQSL(
+        1024 * 1024, 1024, load_samples_to_ram, unload_samples_from_ram
+    )
+    mlperf_loadgen.StartTest(sut, qsl, settings)
+    mlperf_loadgen.DestroyQSL(qsl)
+    mlperf_loadgen.DestroySUT(sut)
+
+
+if __name__ == "__main__":
+    app.run(main)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tools/mlperf-trace.ipynb b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tools/mlperf-trace.ipynb
new file mode 100644
index 000000000..ab834d17a
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tools/mlperf-trace.ipynb
@@ -0,0 +1,441 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Tool to extract usefull information from mlperf trace"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<Figure size 3600x2400 with 0 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "%matplotlib inline\n",
+    "# Ignore warnings\n",
+    "import warnings\n",
+    "warnings.filterwarnings('ignore')\n",
+    "\n",
+    "import json\n",
+    "import os\n",
+    "import seaborn as sns\n",
+    "from operator import itemgetter\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "figsize=(10, 5)\n",
+    "font=10\n",
+    "\n",
+    "plt.figure(dpi=600)\n",
+    "plt.rc('xtick', labelsize=font) \n",
+    "plt.rc('font', size=font)\n",
+    "sns.set(font_scale=1.4, style=\"whitegrid\");"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def trace_to_df(fname):\n",
+    "    with open(fname, \"r\") as f:\n",
+    "        j = json.load(f)\n",
+    "    if type(j) == dict:\n",
+    "        j = j['traceEvents']\n",
+    "        \n",
+    "    result = []\n",
+    "    for item in j:\n",
+    "        name = item['name']\n",
+    "        if name not in [\"Latency\", \"Sample\", \"QuerySamplesComplete\", \"IssueQuery\"]:\n",
+    "            continue\n",
+    "\n",
+    "        args = item.get('args')\n",
+    "        d = {\"ts\": item['ts'], \"name\": name, \"dur\": item.get(\"dur\")}\n",
+    "\n",
+    "        if name == \"Latency\":\n",
+    "            d[\"issue_delay\"] = args[\"issue_delay\"]\n",
+    "            d[\"issue_to_done\"] = args[\"issue_to_done\"] / 1e3\n",
+    "            result.append(d)\n",
+    "        elif name == \"Sample\":\n",
+    "            if args:\n",
+    "                d[\"issue_start_ns\"] = args[\"issue_start_ns\"]\n",
+    "                d[\"complete_ns\"] = args[\"complete_ns\"]\n",
+    "                d[\"issue_to_done\"] = (args[\"complete_ns\"] - args[\"issue_start_ns\"]) / 1e3\n",
+    "                result.append(d)\n",
+    "        elif name == \"QuerySamplesComplete\":\n",
+    "            result.append(d)\n",
+    "        elif name == \"IssueQuery\":\n",
+    "            result.append(d)\n",
+    "\n",
+    "    df = pd.DataFrame(result)\n",
+    "    df = df.sort_values(by=[\"ts\"])\n",
+    "    return df\n",
+    "\n",
+    "BINS = 10"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ts</th>\n",
+       "      <th>dur</th>\n",
+       "      <th>issue_delay</th>\n",
+       "      <th>issue_to_done</th>\n",
+       "      <th>issue_start_ns</th>\n",
+       "      <th>complete_ns</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>count</th>\n",
+       "      <td>2.000000e+04</td>\n",
+       "      <td>10000.000000</td>\n",
+       "      <td>5.000000e+03</td>\n",
+       "      <td>10000.000000</td>\n",
+       "      <td>5.000000e+03</td>\n",
+       "      <td>5.000000e+03</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>4.894584e+07</td>\n",
+       "      <td>17.731682</td>\n",
+       "      <td>7.001508e+04</td>\n",
+       "      <td>6112.554491</td>\n",
+       "      <td>7.001508e+04</td>\n",
+       "      <td>6.182570e+06</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>std</th>\n",
+       "      <td>2.839099e+07</td>\n",
+       "      <td>25.578639</td>\n",
+       "      <td>9.666462e+04</td>\n",
+       "      <td>2254.077235</td>\n",
+       "      <td>9.666462e+04</td>\n",
+       "      <td>2.263719e+06</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>min</th>\n",
+       "      <td>4.102560e+03</td>\n",
+       "      <td>1.152000</td>\n",
+       "      <td>8.810000e+02</td>\n",
+       "      <td>2754.967000</td>\n",
+       "      <td>8.810000e+02</td>\n",
+       "      <td>2.780383e+06</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25%</th>\n",
+       "      <td>2.463025e+07</td>\n",
+       "      <td>3.974750</td>\n",
+       "      <td>5.806250e+04</td>\n",
+       "      <td>4100.473000</td>\n",
+       "      <td>5.806250e+04</td>\n",
+       "      <td>4.166623e+06</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>50%</th>\n",
+       "      <td>4.881766e+07</td>\n",
+       "      <td>7.364000</td>\n",
+       "      <td>6.159800e+04</td>\n",
+       "      <td>6089.880000</td>\n",
+       "      <td>6.159800e+04</td>\n",
+       "      <td>6.155939e+06</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>75%</th>\n",
+       "      <td>7.373552e+07</td>\n",
+       "      <td>27.441000</td>\n",
+       "      <td>6.835175e+04</td>\n",
+       "      <td>7337.257000</td>\n",
+       "      <td>6.835175e+04</td>\n",
+       "      <td>7.408272e+06</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>max</th>\n",
+       "      <td>9.832065e+07</td>\n",
+       "      <td>508.552000</td>\n",
+       "      <td>6.522433e+06</td>\n",
+       "      <td>22234.101000</td>\n",
+       "      <td>6.522433e+06</td>\n",
+       "      <td>2.414005e+07</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                 ts           dur   issue_delay  issue_to_done  \\\n",
+       "count  2.000000e+04  10000.000000  5.000000e+03   10000.000000   \n",
+       "mean   4.894584e+07     17.731682  7.001508e+04    6112.554491   \n",
+       "std    2.839099e+07     25.578639  9.666462e+04    2254.077235   \n",
+       "min    4.102560e+03      1.152000  8.810000e+02    2754.967000   \n",
+       "25%    2.463025e+07      3.974750  5.806250e+04    4100.473000   \n",
+       "50%    4.881766e+07      7.364000  6.159800e+04    6089.880000   \n",
+       "75%    7.373552e+07     27.441000  6.835175e+04    7337.257000   \n",
+       "max    9.832065e+07    508.552000  6.522433e+06   22234.101000   \n",
+       "\n",
+       "       issue_start_ns   complete_ns  \n",
+       "count    5.000000e+03  5.000000e+03  \n",
+       "mean     7.001508e+04  6.182570e+06  \n",
+       "std      9.666462e+04  2.263719e+06  \n",
+       "min      8.810000e+02  2.780383e+06  \n",
+       "25%      5.806250e+04  4.166623e+06  \n",
+       "50%      6.159800e+04  6.155939e+06  \n",
+       "75%      6.835175e+04  7.408272e+06  \n",
+       "max      6.522433e+06  2.414005e+07  "
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = trace_to_df('/tmp/mlperf_log_trace.json')\n",
+    "df.describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAoIAAAFKCAYAAACJoz5RAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAstklEQVR4nO3deZxcVZ338Q8ECIQASVgNyjIIP8AoPkRGUfRBkSXgOiIMjAsoKIroIAgzKItsowODKCIoI4IzqICODgjigiwqKtCAEiA/MLI8EEFIAoJgB0ieP84tUhSVdHelu6vS9/N+vfp1U/eeunWqTi/fnHvOuSssWrQISZIk1c+K3a6AJEmSusMgKEmSVFMGQUmSpJoyCEqSJNWUQVCSJKmmDIKSJEk1tVK3KyBJdRIRmwB3A/tn5nndrc3z9ULdIuJdwDeAjTLz0W7UYbAiYhpwC/DKzJzZ5epIHTEISmNIROxH+SO6fWb+psvVGZSIWAc4Eng7sBHwJHAD8MXMvLybdRurIuKjwJM9GETHAScAZ/V6CATIzJkRcTlwPPAP3a6P1AkvDUvqmogISo/KIcCVwMeAfwPWAy6LiM91r3Zj2keB/drsvxdYDfivUa3NYm8BtgK+2qXX78TZwDsj4qXdrojUCXsEJXVFRKwMfBeYDLwhM69vOnYacAFwZET0ZebFo1y31TPzr6P5mp2qetFWysz+ZT1XZi4C/rbsterYB4AbM/OPXazDUP0UmE8J1p/pblWkoTMISmNcRKwPnATsSulpexS4CTg8M2+rymwLnAhsB6wBPARcC3woM5+KiB2Bq4A3ZubVTefehDZjyiJii+p8OwGrA3cAJ2Xmd5uq9i5gGnBMcwgEyMxnI+LDVZ0/C1xcnXfY69F0OX0n4B3A3sB61XPvBA7LzNNaPtNXAL8DPpqZZ7EEETEJOB14J7AI+F/gC23KXV297x1b9p8H7JiZm7S8z38FngA+AWwKvBm4OiIOq15rS2AicBflEvt/Np3zHmDj6t+Ne4zem5mbLOVz3IbSU7sDMI5y6f7ozPxFU5n9KJ/jjpTL/O8FJgA/oXwfPbykz6l6/qrAbsB/tDm2CPhsZh7Xsv8e4OrM3K96vBLwL8D7gJcAT1Wfwecz83+anjeY708iYi1KuHsXsCHwCHAN8KnMfAAgM5+u2u+dGAS1HPLSsDT2fRfYEzifckmwEUS2AIiIdSm9GpsB/065PHseJaStPtQXi4itgN8CL6/OdxgwF7g4It7TVPSt1fab7c6TmY9RgtNWEbHZCNaj4QxgW0poPiYz7wJ+DbQr+x5gAXDhUl5/har+76X0bn4GmEpph2X1XuBwSvA6FPhTtf9QYCZlzNqnKIH+nIg4qOm5/wzcD8yqzvPeat+S3sdWwC+A/wOcAhxXvY+fRcQb2jzldGAbSoA/i9LOXx7Ee5oOrALcOIiyS3Is5b1fA3y8+vcs4O8bBQb7fRERq1fnORT4OSV0f4USolsvA/dRvk8nL0Pdpa6wR1Aaw6oeqR0oPRinNh1qHnv3WmAKsGtmNv8RPrbDl/0iMAd4VWY+Ve07MyJ+AnwuIi6oLkFuDTyWmfcu5Vy/q7ZbA7NHqB4NT1B6355p2vdN4KyI2DozbweIiBWBfYDLMnPeUl7/bcAbgCMz89+r554F/GyI76OdjYHNM/NPLfu3yMwnmx6fUb3fwylj2cjMH0TEicAjmfnfg3itk4BVgelVOCYivkEJWKcBr2opPxfYufHZVp/XxyNirSrcL8mW1XZZLgu/Bbg8Mw9cSpnBfl98ihJo92oZmnBSFfKb/RFYgTK+8bplqL806uwRlMa2pyg9VztGxJQllGn8cX5LNW6vY9VrvBm4CFg9ItZpfAFXUC6vbVEVXwN4fIBTNo6vMYL1aDinJQRC6fHrp/SaNewIvJiBJ1TsDiyk9IoB5ZI3cOZQ3ssS/KBNCKQRAiNi5YiYUr3fq4DNqsucQ1KNP9wVuLQRAqvXeYTSazy9GnrQ7OstAfsXlMvJGw/wcmtX2/lDrWeTx4CXVZd+X2CI3xd7Are1G5/a8v6a67zOMtRd6gqDoDSGVRMIjqSMvXooIn4ZEUdFxEuail1DuXx8LDA3Ii6NiAOrS2ND9VJKz8hxwMMtX42xX+tV28cZOOA1jv95BOvR8IIex8ycD1wC7NvUC/QeYB5w2QB12Bh4MDNbw+6dg3sLS9W2dzQi3h4RN1L+AzCX8n5Prg4POQgC61LG+WWbY3dU201a9t/X8rgRkgZ72bS1t20ojqG8z4yI2yLitIho7rEcyvfFZpTL7EOpc2tAlHqeQVAa4zLzdGBzyqWux4CjgTuqiRdk5qLMfDfwasr4rnWArwG3RkTjj+KS/sCNa3nc+J3yBWDnJXw1/rjeDqwVERstpfqvqLaNy4UjUY+Gp2jvm5T1Dd9QTWh4F3BRZi5YSr2HarDvq+EFdY2IHYDvU9ZhPAjYg/I+G2NCR+v3/bNL2D9QwHuk2g5lnN3zPp/MvJYS4N4P3EyZNHJ9RBxRFenk+2IwGnV+ZKmlpB7kGEGpBjLzbkrIOz0iXkxZu+/TwNVNZa4HrgeOiYgZwOXAgZQxYo1enUktp2693NcIbM9k5kBj4S4F9qX8sT6x9WBErEmZfXpT03IiI1GPgVxB6ZF8L7A+sCaDW2fvXmDniFijpVew3WXL+cDftdk/0OXUZntSln7ZJTOfWwImIt7Ypuxge64epgTLaHOsMabvniHUcWkaPYybUkJcs/m0tHlErAK8qPUkVS/uN4FvRsRqlO/jz0bEfzC074vZlAlTg7Ep5TOdNcjyUs+wR1AawyJiQvXH8DmZeT8l2EyqykxuM/j9pmo7qdreS+npaZ0l+tGWc/+ZMibtwIjYsE191m16+D3gNuBfWi7fNcamnUXpaTmp6dBI1GOpqnGDF1CC1geBP2TmYCYEXE75HfuRptddETi4TdnZwJbN9aqWbHndYOtJ+VwW0fR7vZrF+oE2Zf/KIHreqjGNVwBvbZ65XY21ez9lzb+HhlDHpemjBNnWySdQPp/WNv8QLT2CEbF28+NqMsgsymSX1Yb4ffFdynjDd7cp1/rzMh2YVYVQablij6A0tm0B/DwiLqaErn7KJIatKDNJofxBPzgivk/5g7sasD8lWHwXylIu1TkOqdZ0m02Zodk6zg5K8PkV8PuIOKcqux7l0vPWVEtvVOuvvYuyNMcvI+JcShiYTOkp/D/Aic3rv41EPQbpm5RlRHahjC8bjEur1/+3an2+2yjrFLabtHMu8EngxxHx9aqeB1XPWXMIr/dJ4KcR8V/V6xwIPAhs0FL2RuCjEXEsZcziE5l56RLO+xnK+/5lRJxJCWsHUv6TsOcg6zagzFwQEVdQLs8e1XL4P4GzI+J7lKWOtqFMYmm9FHtHRFxLWefwkarcAcAPM/OJqsxgvy9OoQwD+HZE7EL53pwEzKCMRbwGnlsY/f+yfN0NRXqOPYLS2Pb/KL1Zr6f0rJ1CmRn5wcxsDI6/hnJJeC/K0hpHUcLDmzLzt03nOoSyLt5BlEu591FC5PNkZlJ6dS6hXPY9k9JjtxJlfGJr2W0o68ztTFmn7RRKCHx/Zj6v/EjVYyCZeQvw++rhYJZcITMXUpaQuQD4J8rn/6cl1PWOqo5rUZZkeRvlUvRNrWWX8npXV+eeQhkG8AHK2ohfalP8eBYHx29V5ZZ03jsoSxDdTJl49FnK98ebqzF5w+lc4FURsWnL/nOAz1N6Bf+Dcil2Z0rPZrPTKTO6j6R8T+1GWSppn0aBwX5fVHeWeUN1fDfK5/gxyhqMz82gpsxCnkKZRS0td1ZYtMhJTpJ6R0S8nLLkyL2UW88tbe25URMRNwALMnMol2s1BNWl85mU5WqO7HZ9BiMiLgEWZuY7ul0XqRP2CErqKZl5K2WSSADfryYFdFVEvJLSizQcdwXRElS9qEcDH6kWQ+9pETGNMtTCW8tpuWWPoCQtQfWHfjplfOCLgE1b7twhScs1ewQlacn2pNzPdzXgHw2BksYaewQlSZJqyuVjOtDX1zce2I4yA3BJq+hLkiT1gnGU4S03TJ8+vb/5gEGwM9tRZjVKkiQtL14P/LJ5h0GwM38C2GKLLVhlleGd0Dhz5kymTRvsXY00GmyT3mJ79Bbbo/fYJr2lF9pjwYIF3HnnnVDll2YGwc48C7DKKqswfvz4YT/5SJxTy8Y26S22R2+xPXqPbdJbeqg9XjCczVnDkiRJNWUQlCRJqimDoCRJUk0ZBCVJkmrKIChJklRTBkFJkqSaMghKkiTVlEFQkiSppgyCkiRJNeWdRXrY439dwJP9z3S7GsNiwviVWGP14b0dnyRJWjYGwR72ZP8zXHnDfd2uxrDYabuNDIKSJPUYLw1LkiTVlEFQkiSppgyCkiRJNWUQlCRJqimDoCRJUk0ZBCVJkmrKIChJklRTBkFJkqSaMghKkiTVlEFQkiSppgyCkiRJNWUQlCRJqimDoCRJUk0ZBCVJkmrKIChJklRTBkFJkqSaMghKkiTVlEFQkiSppgyCkiRJNWUQlCRJqimDoCRJUk0ZBCVJkmrKIChJklRTBkFJkqSaMghKkiTVlEFQkiSppgyCkiRJNWUQlCRJqimDoCRJUk0ZBCVJkmrKIChJklRTBkFJkqSaMghKkiTVlEFQkiSpplbqdgUaImIiMAvYENguM29sOvY+4ChgE2A2cHxmXtjy/JWB44H3A5OAG4BPZOYtLeU2AL4I7AYsAn4I/HNmPjIS70uSJKlX9VKP4HG0CaYRsSdwPvB9YAbwM+DbETGjpegXgIOBY4G3AwuAKyNiatO5VgKuAF4OvA84AHgtcElErDDM70eSJKmn9USPYERMAw4CPgl8teXwCcDFmfmv1eOrImIr4LPAj6rnb1g9/+OZeU617zfA3cA/A0dUz30XsA0wLTNvq8rNAX5FCZmXj8T7kyRJ6kW90iN4JvBl4M7mnRGxKbAl8J2W8t8CtouIdavHuwDjgOcuF2fm45TLvrs3PW934NZGCKzKXQfc21JOkiRpzOt6EIyI9wIvBU5sc3irant7y/5GkIumcg9l5tw25baIiBWbyrWeq1Fuy6HUW5IkaXnX1SAYEWsBpwBHZOYTbYpMrraPtuyfX22nNJVrLdMotzIwcRDlprTZL0mSNGZ1e4zgicBdmXlBl+vRkZkzZ47Iefv6+gBYbc11mTNnzoi8xmibO3cC99/9cLer0bFGm6g32B69xfboPbZJb+nl9uhaEIyIl1EmeOwcEZOq3Y2eu4kRsQaLe/4mAQ82Pb3RUziv2s6vyrSaDDwNPDGIcvPa7F+qadOmMX78+KE+ban6+vqYPn06AA/Ne5KpU58c1vN3y9prr8P6m2/U7Wp0pLlN1H22R2+xPXqPbdJbeqE9+vv7l9h51c1Lw5tTguhVlIA2H7i0OnYV8AvgjurxVi3P3braZrW9A1gvIlov724N3JmZC5vKtZ6rUW5WB+9BkiRpudXNIPhL4I0tX4dWxw4CDsjMuykBbe+W5+4D3JCZjWuNPwEWAns1ClQLVL+V5y8Jcznw8mr5mUa511AWqnbpGEmSVCtduzRc3cnj6uZ9EY1JwPQ13VnkGODCiJgN/JSyWPQuwB5N53ogIs4GPh8Rz1CWgzkcWAE4veklvgf8HvhuRPwr5f2fAvyaak1CSZKkuuj68jEDycyLgf2BPYEfA7sC+2Zma3A7FDiLMgHlEmA14M2ZOafpXM9Qbi03E/hv4BvAb4C3ZeaiEX4rkiRJPaXbs4afJzOvpvTite4/n3KbuaU992ngX6qvpZV7kBdeapYkSaqdnu8RlCRJ0sgwCEqSJNWUQVCSJKmmDIKSJEk1ZRCUJEmqKYOgJElSTRkEJUmSasogKEmSVFMGQUmSpJoyCEqSJNWUQVCSJKmmDIKSJEk1ZRCUJEmqKYOgJElSTRkEJUmSasogKEmSVFMGQUmSpJoyCEqSJNWUQVCSJKmmDIKSJEk1ZRCUJEmqKYOgJElSTRkEJUmSasogKEmSVFMGQUmSpJoyCEqSJNWUQVCSJKmmDIKSJEk1ZRCUJEmqKYOgJElSTRkEJUmSasogKEmSVFMGQUmSpJoyCEqSJNWUQVCSJKmmhhwEI2LXiFhhJCojSZKk0dNJj+CPgPsj4pSI2Ga4KyRJkqTR0UkQfAfwK+Bg4KaI+H1EHB4RU4e1ZpIkSRpRQw6CmXlJZu4FrA8cCDwMfA64NyJ+EhHviYgJw1xPSZIkDbOOJ4tk5uOZeW5m7gRsDBwFrAecDzwUEd+MiJ2GqZ6SJEkaZsM1a3gcsDIwHlgBeAp4M/DTiLg5IqYN0+tIkiRpmKzU6RMjYi1gL+A9wOuAZ4DLgH+ptguBtwFfAL4BbLeslZUkSdLwGXIQjIh3UMLf7sCqwA3AJ4BvZ+a8luI/iIh1gK+0Oc8/AJ8EtgQmAg8A3wdOyMzHmsrNAE4Ctq7KnJ6ZZ7Q53+GUCSwbALcBR2bmlS1l1gBOAfas6n4VcEhm3jOkD0GSJGkM6OTS8P8Arwa+CGydma/OzDPbhMCG3wMXtNk/BbgW+BCwW3W+DwAXNwpExPbAJcDNwAxKz+LpEXFQ84mqEHgycCawB3AXcFmb5W2+TemlPATYG5gKXOnkFkmSVEedXBreBbgyMxcNpnBmXg9c32b/f7bsujoi/gZ8NSKmZuYc4Bjgpsz8YFXmqojYCDg2Ir6WmQsjYjzwGUpP4akAEXENcCvwacrlayLi1ZSQuEdmXl7tuxWYDexHm15LSZKksayT5WN+NtgQ2IFHqu0qVcB7E3BhS5lvUS7/bls9fi2wFvCdpjo+C1wEzGi6C8ruwGPAFU3l7qOsibj78L4NSZKk3tfJLea+EBF3LeX4nRFxyhDONy4iVo2I6ZQewEuqMXubAasAt7c85bZqu2W13ara3tGm3ERgw6ZyszJzYZtyWyJJklQznVwa3oMX9tI1uxB4N/CpQZ5vLqVHD0pv3b7VvydX20dbys+vtlOayvVn5lNLKXd/Va71XI1yU9rsH9DMmTM7edqA+vr6AFhtzXWZM2fOiLzGaJs7dwL33/1wt6vRsUabqDfYHr3F9ug9tklv6eX26CQIvgS4ZynH763KDNaOwARgGmWs36URsXMH9Rp106ZNY/z48cN6zr6+PqZPnw7AQ/OeZOrUJ4f1/N2y9trrsP7mG3W7Gh1pbhN1n+3RW2yP3mOb9JZeaI/+/v4ldl51EgT/Amy6lON/R1lQelAy85bqn9dFRB9wI/BOFl8SntTylEZPYWOW8nxgfESsmpl/G6BcuyQyuamMJElSbXSyfMzPgQ9Xs3efJyI2AT5clenELZSFqF9Kmc27gMVjABu2rrazqm1jbGC7co9T1h5slIumySPN5WYhSZJUM50EwWMoPYkzI+KLEfGh6utLlDUDVwSO7rA+21fP/2Nm9lMC5V4tZfYBHgRuqh5fR5kNvHejQESMq553RdMM58spvYu7NpV7CbBDdUySJKlWhnxpODPviojXURZvPqTl8DWUO3XkQOeJiB8DV1Jm7f4NeCVlgsnvgR9UxY4Hro2IcyiLUr8OOBA4uDH7NzP7I+JE4OSIeJgSEA+gzDpuTDwhM38bEZcBX4+IwyiXuI8H7gPOG9qnIEmStPzr6F7DmXkbsGN1+7i/q3bPzsy5QzjN9ZRb1TXGG94DnA2clpkLqtf5dUS8nXLXkPcBc4BDM/PslvqcGhEAHwfWp4TLPTLzdy2vuQ9wKmXx6PGUW8y9OzPHxowMSZKkIegoCDZk5iMsXgR6qM89mkFcQq7uAjLgpdvqriKnDlDmccoYxg8PspqSJEljVkdBsBqDtyulN3Ay0DoBY1FmnrCMdZMkSdIIGnIQjIhXAd8DXswLA2DDIsAgKEmS1MM66RH8CrAa8A7gF5n56HBWSJIkSaOjkyD4CuDTmXnpcFdGkiRJo6eTdQTvZ8mXhCVJkrSc6CQIfg44MCLWHO7KSJIkafR0cml4CvBX4A8R8V3g/wHPtpRZlJmnLGvlJEmSNHI6CYKfa/r3QUsoswgwCEqSJPWwToLgpgMXkSRJUq/r5F7D945ERSRJkjS6Or7FXERsDuwIrAdckJn3RMQqwAbAg437BUuSJKk3dXJnkRWBs4EPUpaRWQT8GrgHWAW4FTge+I9hq6UkSZKGXSfLxxwFfAA4GtiepjUFM/MJyu3n/mFYaidJkqQR00kQ3B84NzNPBv7Q5vitwObLVCtJkiSNuE6C4IuB65dy/Clgjc6qI0mSpNHSSRB8ENh4KcenA84sliRJ6nGdBMHvAR+pZg03LAKIiBnA+4CLhqFukiRJGkGdBMHjgPuAm4ELKCHwqIj4DfBD4HfAvw1XBSVJkjQyhhwEM/MvwGuBk4H1gb8BOwATKSHxDZn51DDWUZIkSSOgowWlM/NvlCB48vBWR5IkSaOlk0vDkiRJGgM6ubPIuYMotigzP9hBfSRJkjRKOrk0/CaqWcJNxgEvqrYPA39dxnpJkiRphA05CGbmJu32R8TKwIeBfwZ2XqZaSZIkacQN2xjBzHw6M78M/AT48nCdV5IkSSNjJCaL/A54wwicV5IkScNoJILgzsCTI3BeSZIkDaNOZg0fs4RDkyg9gdsCn1uGOkmSJGkUdDJr+Lgl7J8PzAYOAs7ptEKSJEkaHZ3MGnYRakmSpDHAUCdJklRTnYwR3KiTF8rM+zp5niRJkkZGJ2ME7+GFdxYZjHEdPEeSJEkjpJMgeADwceAlwLeAO6v9AewD3Ad8CVg4HBWUJEnSyOgkCL4IGA+8NDPnNx+IiGOBXwEbZOa/DUP9JEmSNEI6mSxyEPC11hAIkJlzKUvHfGRZKyZJkqSR1UkQXBuYuJTjq1dlJEmS1MM6CYK/AT4REdNbD0TEq4BPAL9d1opJkiRpZHUyRvBjwNXA9RFxA3BXtX9zYDtgHnDIsNROkiRJI2bIPYKZeTvwcsrM4EnAntXXJOCLwMsz87bhq6IkSZJGQic9gmTmQ8Ch1ZckSZKWQx0FwYaI2BxYD5iZmY8N8bnvBv4JmA5MAWYDZwFfzcyFTeVmACcBWwMPAKdn5hltznc4cDCwAXAbcGRmXtlSZg3gFEoP5qrAVcAhmXnPUOouSZI0FnR0r+GI2Dci7gNmAddSwhwRsU5E3BkRew3iNIcB/cCngLcAP6Bcbv580+tsD1wC3AzMAL4BnB4RB7XU53DgZOBMYA/KuMXLImKbltf8NvA2yhjGvYGpwJURMWHQb16SJGmM6ORew+8C/hv4KXA6cGrjWGY+EhF3AO8DLhrgVG/NzIebHl8VEROBj0XEZzKzHzgGuCkzP9hUZiPg2Ij4WmYujIjxwGcoPYWnVnW8BrgV+DSwV7Xv1ZSQuEdmXl7tu5XSE7kf8JWhfhaSJEnLs056BD8N/CwzdwXOb3P8t0BrT9wLtITAhpspl2ynVAHvTcCFLWW+Rbn8u231+LXAWsB3ms79LCWIzoiIFarduwOPAVc0lbuPcieU3QeqryRJ0ljTSRDcCvj+Uo7/GVi3s+rwesryM38GNgNWAW5vKdOYkbxlU30A7mhTbiKwYVO5Wc3jD5vKbYkkSVLNdDJZ5K8s/c4imwGPDPWk1WLU+wOfzcxnI2JydejRlqKNW9tNqbaTgf7MfGop5e6vyrWeq1FuSpv9A5o5c2YnTxtQX18fAKutuS5z5swZkdcYbXPnTuD+u9t1Ai8fGm2i3mB79Bbbo/fYJr2ll9ujkyD4c2C/iPhi64GImAocCPzvUE4YERsA3wOup2mySK+bNm0a48ePH9Zz9vX1MX16uWnLQ/OeZOrUJ4f1/N2y9trrsP7mG3W7Gh1pbhN1n+3RW2yP3mOb9JZeaI/+/v4ldl51OkbwRcCNwEeBRcDuEfE5ygSNhcBnB3uyiFgL+BHwJPC2zHy6OtTo0ZvU8pRGT+G8pnLjI2LVQZRrPVej3Lw2+yVJksa0Tu4schfwOuBB4DhgBeCTwBHALcAO1SSMAVXh7RLKWoS7ZebcpsOzgQUsHgPYsHW1nVVtG2MD25V7nLL2YKNcNE0eaS43C0mSpJoZUhCMiHHV8i0PZeYuwDrAq4HtgfUzc6fMvHOQ51qJMrP3FcCMzLy3+Xi1fMzPqZZ/abIPJYTeVD2+jjIbeO/melbPuyIzF1W7L6f0CO7aVO4lwA7VMUmSpFoZ6hjBFSk9dUcCp2XmfOCGDl/7TOCtlJ7ECRHxmqZjt2fmX4DjgWsj4hzgAkpP5IHAwY3Zv5nZHxEnAidHxMOUgHgAZdLKvo0TZuZvI+Iy4OsRcRjQOP99wHkdvgdJkqTl1pB6BKvxe3Mo4wKXVaNn7t+BX7d8bVu93q+BtwPbAT+mBLxDM/PslnqdChwFfJwy3nBLysLRv2t5zX2AH1IWj76Y0rP45swcGzMyJEmShqCTWcPfoMwaPisz/9bpC2fmJoMsdzmDuHRbhcFTByjzOPDh6kuSJKnWOgmCdwLjgFkRcT7wR6B1DT8yc6BbzEmSJKmLOgmC/93076OXUGYRA99rWJIkSV00qCAYEV8Czs/MPuCN1e6JlJ7AZ0eobpIkSRpBg+0R/BjwG6AvM6+JiLUp9wPeOTOvGbHaSZIkacR0cmeRhtaFmSVJkrQcWZYgKEmSpOWYQVCSJKmmhjJr+O8i4u+rf69VbbeMiCfaFc7M65epZpIkSRpRQwmCn62+mp3RptwKlOVjxnVaKUmSJI28wQbB/Ue0FpIkSRp1gwqCmXn+SFdEkiRJo8vJIpIkSTVlEJQkSaopg6AkSVJNGQQlSZJqyiAoSZJUUwZBSZKkmjIISpIk1ZRBUJIkqaYMgpIkSTVlEJQkSaopg6AkSVJNGQQlSZJqyiAoSZJUUwZBSZKkmjIISpIk1ZRBUJIkqaYMgpIkSTVlEJQkSaopg6AkSVJNGQQlSZJqyiAoSZJUUwZBSZKkmjIISpIk1ZRBUJIkqaYMgpIkSTVlEJQkSaopg6AkSVJNGQQlSZJqyiAoSZJUUwZBSZKkmjIISpIk1dRK3XzxiHgpcDjwGmAaMCszp7UpNwM4CdgaeAA4PTPPaFPucOBgYAPgNuDIzLyypcwawCnAnsCqwFXAIZl5z/C9M0mSpN7X7R7BlwF7AH8Abm9XICK2By4BbgZmAN8ATo+Ig1rKHQ6cDJxZnfMu4LKI2KbllN8G3gYcAuwNTAWujIgJw/SeJEmSlgtd7REELs3M/wWIiPOAV7UpcwxwU2Z+sHp8VURsBBwbEV/LzIURMR74DKWn8NTqfNcAtwKfBvaq9r2aEhL3yMzLq323ArOB/YCvjMi7lCRJ6kFd7RHMzIVLO14FvDcBF7Yc+hbl8u+21ePXAmsB32k697PARcCMiFih2r078BhwRVO5+4BfVcckSZJqo9uXhgeyGbAKL7xsfFu13bLablVt72hTbiKwYVO5WW0C6G1N55IkSaqFbl8aHsjkavtoy/751XZKU7n+zHxqKeXur8q1nqtRbkqb/Us1c+bMoT5lUPr6+gBYbc11mTNnzoi8xmibO3cC99/9cLer0bFGm6g32B69xfboPbZJb+nl9uj1INjTpk2bxvjx44f1nH19fUyfPh2Ah+Y9ydSpTw7r+btl7bXXYf3NN+p2NTrS3CbqPtujt9gevcc26S290B79/f1L7Lzq9UvDjR69SS37Gz2F85rKjY+IVQdRrvVcjXLz2uyXJEkas3o9CM4GFrB4DGDD1tV2VrVtjA1sV+5xytqDjXLRNHmkudwsJEmSaqSng2Bm9gM/p1r+pck+wIPATdXj6yizgfduFIiIcdXzrsjMRdXuyyk9grs2lXsJsEN1TJIkqTa6fWeRCSxetmVjYM2I2LN6fENm3gscD1wbEecAFwCvAw4EDm7M/s3M/og4ETg5Ih6mBMQDKLOO9228Xmb+NiIuA74eEYcBf6nOfx9w3oi+WUmSpB7T7cki6wEXt+xrPN4fOC8zfx0Rb6fcNeR9wBzg0Mw8u/lJmXlqRAB8HFifsiTMHpn5u5bz7wOcSlk8ejzlFnPvzsyxMStDkiRpkLoaBKv7+7aO12tX7nIGcem2uqvIqQOUeRz4cPUlSZJUWz09RlCSJEkjxyAoSZJUUwZBSZKkmjIISpIk1ZRBUJIkqaYMgpIkSTVlEJQkSaopg6AkSVJNGQQlSZJqyiAoSZJUUwZBSZKkmjIISpIk1ZRBUJIkqaYMgpIkSTVlEJQkSaopg6AkSVJNGQQlSZJqyiAoSZJUUwZBSZKkmjIISpIk1ZRBUJIkqaYMgpIkSTVlEJQkSaopg6AkSVJNGQQlSZJqyiAoSZJUUwZBSZKkmjIISpIk1ZRBUJIkqaYMgpIkSTVlEJQkSaopg6AkSVJNGQQlSZJqyiAoSZJUUwZBSZKkmjIISpIk1ZRBUJIkqaZW6nYFVA8LFy7ioXlPdrsaHVltzXWfV/cJ41dijdVX6WKNJEkaHgZBjYr+p5/lut/P6XY1OjJnzhymTl0cBHfabiODoCRpTPDSsCRJUk0ZBCVJkmqqdpeGI2Jz4AxgB+Ap4DvAkZm5fA5gkyRJ6lCtgmBETAKuAu4F9gTWA04D1gX+sXs1kyRJGn21CoLAh4HJwCsz8xGAiHgGuCAiTsjM27paO0mSpFFUtzGCuwNXNkJg5XtAPzCjO1WSJEnqjrr1CG4FnNu8IzP7I2I2sOUQzjMOYMGCBcNYtcX6+/sBeObpBay04sIReY3R9uwzTy+372XVlVd4Xt0XLOjngYdGpu1H07gVV+DZhYu6XY0hGz9xCg889Ojz9q22ykqsPmHl7lRIz/3OUu+wTXpLt9ujKa+Maz1WtyA4GXi0zf75wJQhnOdFAHfeeecwVOmFZs6c+dy/NxtKrXrYo3++Z7l9L5tNWZMyr6j48wN/7F5lBMCDT8zrdhXUpPl3lnqDbdJbeqg9XgTMbt5RtyA4XG4AXg/8CXi2y3WRJElamnGUEHhD64G6BcH5wKQ2+ycDswZ7kunTp/cDvxymOkmSJI202e121m2yyB2UcYLPiYjxwGYMIQhKkiSNBXULgpcDO0XE2k373gmMr45JkiTVxgqLFi1/swY7VS0oPRO4BziBxQtKX5mZLigtSZJqpVY9gpn5KPAm4Angf4AvABcCH+hitSRJkrqiVj2CkiRJWqxWPYKSJElazCAoSZJUUwZBSZKkmqrbgtI9KyI2B84AdqDcz+w7wJGZ+WRXKzbGRMRLgcOB1wDTgFmZOa1NuRnAScDWwAPA6Zl5RptyhwMHAxsAt1Ha7MqRewdjR0S8G/gnYDrlFo+zgbOAr2bmwqZytsUoiYh/AD5Juff6RMrn/X3ghMx8rKmcbdIFETGRsubthsB2mXlj07H3AUcBm1B+lo7PzAtbnr8ycDzwfsrNFW4APpGZt4xC9Zd7EbEf8I02h87MzI81lVuufj7sEewB1bI2VwFrAHsChwH7AOd2sVpj1cuAPYA/ALe3KxAR2wOXADcDMyg/+KdHxEEt5Q4HTgbOrM55F3BZRGwzYrUfWw4D+oFPAW8BfgB8Cfh8o4BtMeqmANcCHwJ2A75IWVXh4kYB26SrjqNNB05E7AmcTwntM4CfAd+uAkmzL1CCx7HA24EFwJURMXUE6zwW7QZs3/R1auPA8vjz4azhHhARRwLHABtn5iPVvn2BC4BpmXlbN+s3lkTEio3epog4D3hVa49gRPwImJKZr27a9zXgrcCGmbmwuiPNQ8DXMvOIqsw44FZgZmbuNSpvaDkWEetm5sMt+04DPgJMysx+26L7IuJDwFcpn/cc26Q7ImIa8BtKj+1XaeoRjIg7gFubP9eI+Anl5+jvq8cbAvcCH8/Mr1T71gDuBs5ttJOWrKlHcN3G3+o2ZZa7nw97BHvD7pRFrZu/sb5H6S1p/R+dlkHzJcd2qh/QN1HWl2z2LUr3/bbV49cCa1Eu4TfO/SxwETAjIlYYrjqPVa0hsHIzsCowxbboGY3fS6vYJl11JvBl4M7mnRGxKeVS/ndayn8L2C4i1q0e7wKMo6ntMvNx4IeUv0FaRsvrz4dBsDdsRctlyszsp4zz2LIrNaqvzYBVeOFl40avbKM9GvesvqNNuYmUMTwautcD84A/Y1t0TUSMi4hVI2I65WrFJZl5D7ZJV0TEe4GXAie2Odz4rJfUJtFU7qHMnNum3BYRYR4YvJkR8WxE3B0Rx0ZE43L9cvnzYcP3hsnAo232z6eM2dHomVxtH23ZP7/aTmkq15+ZTw1QToMUEa8C9ge+UP3v2LbonrmUSWs3An8C9q322yajLCLWAk4BjsjMJ9oUGUqbtJZplFuZEkC0dH+ijK/cjzJO8PvA0cB/VseXy58PZw1L6rqI2IAyHOJ6miaLqGt2BCZQZtZ/Brg0Inbuao3q60Tgrsy8oNsVqbvM/DHw46ZdP42Ix4DjIuKELlVrmdkj2BvmU6byt5pMuUym0dP4H9mklv2N/+nNayo3PiJWHaCcBlD1ePwIeBJ4W2Y+XR2yLbokM2/JzOsy82vAO4E3VlvbZBRFxMuAg4CjI2JStcJEo+duYjXZYyht0lqmUe5poF1vowZ2UbXdluX058Mg2BvuYPGYAeC5QaebUdaM0uiZTVlSYauW/VtX20Z7NMZ2tCv3OGXtKA2g+kV4CbAesFvL+CXbojfcAiykjFGzTUbX5pQrd1dRwsN84NLq2FXAL1j6Zw2Q1fYOYL2IaL3suDVw50AT6TQoy+XPh0GwN1wO7BQRazfteycwvjqmUVJN0vk50Dp9fx/gQeCm6vF1wGPA3o0C1fT/vYArMtN1mQZQDbC+CHgFMCMz720+blv0jO0pfyv+aJuMul9SemObvw6tjh0EHJCZd1MCxt4tz90HuKFpdv5PKIG+eYmZiZRlTfw707l/BBYBfcvrz4djBHvDV4FDgP+txhmsB5wGXJiZbRc9VmciYgKLl0rYGFizWowVyi/Neykr718bEedQ1nJ8HXAgcHDjf83VGncnAidHxMOUH/ADKL24+6LBOJPyR+gIYEJEvKbp2O2Z+Rdsi1EVET8GrqTMXvwb8ErKgt+/pyz4DbbJqKmWFLu6eV9EYxIwfU13FjkGuDAiZgM/pSwWvQtloeLGuR6IiLOBz0fEM5Q1BQ8HVgBOH7l3MXZUPx8/B2ZSQvUM4KPA1zPzj1Wx5e7nwwWle0REbEG5q8LrWXyLuSO8xdzwiohNKAuotrN/Zp5Xldudsur7VsAcykzWL7U53+GUEL8+5Y/nEd5Ca3Ai4h5KGG/njZl5dVXOthgl1X9E3w5sWu26hzKJ57QqmDfK2SZdEhE7Ui4Lt95i7v288BZz32l57srACZRZr2ux+BZzN49G3Zd3EXE6Jfy9mNKRdhfVnUOqlQ4a5Zarnw+DoCRJUk05RlCSJKmmDIKSJEk1ZRCUJEmqKYOgJElSTRkEJUmSasogKEmSVFMGQUmSpJoyCEqSJNXU/we2dyrPLzUr8gAAAABJRU5ErkJggg==\n",
+      "text/plain": [
+       "<Figure size 720x360 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAApsAAAFKCAYAAABSGJRzAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAA3OklEQVR4nO3debgcVZn48W8EuRDZQtiMIy4sr2DUGQIjICiKigFFHFnEUQSF0ZHFBRQXREDEhYyiAu4KjiCL/lQQxAUQHRDBwAAB8oIIZDAjsoRFYS6Y5PfHqSZFc29yb9+u23f5fp4nT6er3q46dbq679unzjk1ZenSpUiSJElNeEqvCyBJkqSJy2RTkiRJjTHZlCRJUmNMNiVJktQYk01JkiQ1xmRTkiRJjTHZlMagiDg6IsbUvGQRsTQivtLrckgjERFTIuLaiPhEr8syFBFxTkSc3etySCOxcq8LIE0Ww0ge92+0IGNEREwFPgj8KjN/1ePiNCIidgQuAfbJzDN7XJwhiYinAe8H9gI2Bh4DrgW+BpyemWPqR1AH9gE2Ab7Q64IM0aeA30fEizLz2l4XRuqEyaY0et7a9vzfgG2At7ctvxz4LvDp0ShUD00FPl79/1c9LIcqEbEBcBGwOfA94EvAasC/AP8JzI6It2bmkt6VcsQ+AHw/M+/pdUGGIjOvjojfA4fz5O8QaVww2ZRGSWZ+t/48Il4J/HP78pq/N18q6QlOoySab8jMc2vLvxARJ1ASnv8GThjNQkXEKsCSzBzRZyIi/gn4R+DIbpRrFJ0FHBsRB2Xmg70ujDRcJpvSGBQRRwMfz8wptWW3A/MpLZ5zgOcDtwKHZubFEbE7cCywGXAjcGBmzm3b7mbAccBOwNOAm4BPZub3h1G2vSktks8FEjgiMy9si1mritkD2BC4E/g2cHxmLo6IZwO3VeEfj4hWC+dpwH8A1wF7ZOYPqu1Fdex/yMxNa/v5T+Clmfms2rKtgWOAlwCrAHOBj2XmJW1lfDrwCeC1wDTgj8AXM/PLtZgdKZfB3ww8B3g3sC5wGfDOzPzDUOutts3VgaOBNwIzgAeBG4CjMvPXVcwmwPHADsA6wL3Ab4GDM/N/a/W3f2ae2rb9pcAxmXn0MI91G2Bn4FttiWbLh4HXAx+KiJMy85GGyrEjpc7fAmxKafl/BvBPEXEF8I3MPLRtX9OB/wU+n5lHDFD2lt2BxcDFba8/mrbPW7V8P8p5+5zMvL1atiXlM7Q1sAZwF/Br4N8y85EqZgpwMOXqxaaU9/g8ymflnrZ9vIpSt1sBU4CbgS9n5jdqYb+gfOZ3Bs5ZzvFJY5IDhKTx5bmUy5vnAx8C1gbOjYg3A18EzgCOquLOiYiVWi+MiM2B3wEvAD4LHEZJYs6JiLcMcf8vAb4MnA18FFgVOC8itq/tZzVKsrAfpTvAwZQ/7kcDX63C7gb+vfr/DymXB99arZ8HLAJeWtvvS4ElwCZVwtKyA+UPfWvfLwN+Q0nQjgWOAPqAn1dJTCtufeAK4DXAKcB7qv2eEhEDtXp9kHIpeQ6lD902wOmD1tLyfRk4hHLc7wY+Q6mPF1VleyrwM2B74OQq5hRgA0pyOizDONbXVY/fGWg7VaviGZS63a7BcrR8hJKQf4HyPi4EfgTsHRHtDSV7A08drOw12wE3tpLCDo5hPUritzHlM3QwcCowk/LjreXLwOcon7f3UPq77gFcEhGr1rb3Vsp7vUG1vQ8CVwK7tu36RuARyudPGnds2ZTGl00pLXm/AYiImyh/rL4FbJ6Zt1XL76ckbi8Hflm99guUP9hb1f7YnhwRPwc+HRFDGfwxE9guM39b7edU4BZKa2sr4Xwf8Dxgy8ycXy37WkTcBhwXESdkZkbE9yl/lK8boIvBZTwx2dwB+CmwY7X8rIh4JvAs4JPVa6ZUx/xfwKtax1KNoL+G0lLYSpKOoyShL8jMu6tlX4mIrwMfqVru7q/tf1XgRZn5aLXNRZRLyzMzc94K6qzda4GvZ+b7B1m/BeXHwp5tLc7HDXM/9dcN5Vi3qNYtbxBKa90WlL6dTZSjZQ3KOf231oKI+A5lgM+rgQtqsW8BrsnMG1ZQhudRWro7tR0l2d45M39fW95qmScitgPeCbwtM79TW34h5YfQvpTPw5rAScDVwA71BLg6lx+XmX+PiP9h2XskjSu2bErjy82tRLPyu+rxV61Es235cwEiYh3glZQWyadFxLqtf8CFlMuUmw1h/79vJZoAmXkvpbXrJRExrVq8FyXhu6dtP62kd8ch7Oc3wAury/FQEsyLKS1jrSR0h1oslJbBqMozvbbfNSmtUS+OiKnVH/I9KK3DS9vK+HPKgJgXt5XnO61Es22fzx3CsbR7oCrLMwZZ3+qTt3M1MrxjwzzWNarHh5azyda6NZYTM9JytHynnmhWfkG5XP74QJmIeC6wLWUA04pMp7Sad+qB6vG1VQv0QPYC/gpc2Hac8ymX3F9exb2acm5+ur2ldZAffYsoXTikcceWTWl8WVB/kpkPlO6M/E9bXOuPYisB3ITSH+zo6t9A1qf0wVyeWwZYdnP1+CzKH8TNKInf3QPEtvazIr+h/BjePiKuq7b9a2B1YM8qZgfgL7XW01ay/M3lbHc60E+pl7fz5JkABivjgrbnrYRlGsP3AUrf1AURcQ0l2f/PzEyAzLwtIj5HmX7oLVUr73nAd6vkfjjWY+jHWk8k7x8ktpVk/qXBcrTc2h5Q9ff9LnBQRKyRmQ9RWjUXU7qXDMWUFYcM6lLg+5SWzPdHxKXAucAZtcR4M8p5etcg22gd58bV41BbxqcA433aKU1SJpvS+LJ4mMtbf1hbVzE+zxMvP9YN93LwYJ5CaYX81CDr/ziEbfye0kftpZR+qQ9RLoWvARxdtdTuQGlBre8XSl/WwS6V3l1tD0py8q1B4tovx66ofocsM8+JiN9QBtu8GjgU+GBE7JeZZ1Qxh0XEt4Ddqpj/AI6MiJdl5o0MknTU++hWWnUylGO9kTKA5oXU+sG2eWH12HoPmyhHy2D9Kr9DSdj/hZK0/yvwi8z88yDxdfcw8A+EwZK4JxxH1eK4Z0T8M6U7xKso/TE/HBHbZOZfKMd6L/CmQbbZacvqNJYNqpPGFZNNaXJoJQd/z8xfLjdy+TYdYFmrRfGO6vFWYI0h7GfQVprMfKwaefxSYC3g8qpV6wrKlFCvp/Rf+3rtZa2WsIeWt++IuJuSvK48wrroWJUYfRX4akSsTekecAylC0Ar5gZKAvapiHghJYF+H3AgyxKWtds2/ay258M51vMog3L2ZYBks0og38yy0dc0VI7lysx5EXE18Naqz/JmlLobipsoswq0WwQQEWu39RttP45WGa6kDOQ5KiJmU37AHUjpP3wrJQm9IjP/upyytM7XmZRL7IOqBkQ9k8F/KEpjmn02pUmganG5BDhwoL6C1SjbodgqIratvW46JQG5PDNbicdZwNYRscsA+1kjIvqqpw9Xj4Ndiv4NMIvyh/vX1XE8Qmn1PILSqlhPiuYCf6Bc3nxSn8LWMWbmYsql0N0j4kWDxTUhIlaq9UOlKs/9lBartauYNQcYbX0TpaVv7eo1D1Ja6V7aFvfutm0P+Vgz8wpK/8n9I6J9NDSURGoz4DOt+S6bKMcQnUbp+/hBShL7wyG+7jJgi2rGhLpW4vf4cVT9Zd/WVs5p7YN3KAN8YFnCfRblb+tR7Tuv3v/W+f5zSv/cD7WXZ4B9bEEZpHb5wIcljW22bEqTx79T/theV40AvpXSf+zFlD9mmwxhG/OAn0TElyh/5P+Ncmn7w7WYEyjT6Pw4Ik6jJIGrUVpw9qRMvXR7NU/jDcCbIuJmyqXH2zKzNbjpNyybxqmeVP6akmw+SG3kdGYuiYh3UPpA3lhdhr6TMl3QyyjJaWtwxocoA5V+W9XFDZSk9x+BN1D+sDdhDeBPEfGDquwPUqazeQ1lZDLAKyizBHyf0od2CmVqnzUoiUzLNyiJyjcoCfhLGXiQ13COdV9KF4hzI+IMynuwKuWS9csog3Y+37b9JsqxIt+jTEP1RuDUYUxl9GNKK+grKIOVWn5O6Zf7zSiT1y+m9C29G9ioFvc2Sn/RH1I+P6tRbi/bSqbJzF9HxMnAB6oW6Z9R+glvQhkkdVRV5gcj4j2UbgW/r+r7Xsr8uc+g1HnLqyg/Nn42xOOUxhRbNqVJohqAshVlQMO+LJvDcWXgY0PczGXVa/amTCXUD+zemoy82s8jlKTiM5TE40TK5dnNKRN61/vWvQO4ndIn8Xssm3sTyiTmfwf+j3LJsqU1Evyy9tsmVuXYhnJZ+t2UBO7twH1VeVpxf6Ek2d+g9FM8iXKJekPK/KNNeZhS7y+g1PmJlPfk8Gr/UJLQnwK7UBKqT1ASzt3bpkI6ljIYag/KHI0rAbPbdzicY83Mu6rYY4B/okyX9QVKovnxzHxCS19T5ViRauqkn1ZPhzIKvfW6ayktkXu1LX+MkvDeSqnvQ6tyntS2iUsp5+JelHr5COV8fkXtRxKZeTDl3F6H0iL8aUrf27OpTShfTYT/Wsr5+RFK/W1L6dJQtxfww8x8AGkcmrJ0qYPbJEkDq7pdXE5pnNg2M+/scZEAiIhzKD8sntX+o2MFr9uH0tf3WR2M7h91Ue5Y9HtgVmZe0+vySJ2wZVOSNKjM/BPlMv9UytyRa/e2RI/fjWg3ypRRQ040K2dSWjDf2+1yNeTDwPdNNDWe2bIpSRoXIuI5lD6ub6dcbt50rLS0ShqcA4QkSePFy4BvU25isJ+JpjQ+2LIpSZKkxtiy2YG5c+f2AVtT7tE72J1FJEmSxoKVgKcDV82aNat/tHdustmZrVk2/YokSdJ40H6b31FhstmZ/wXYbLPNWGWVVRrbybx585g5c2Zj2x8vrIfCeiish8J6KKyHwnoorIeivR4effRRbr75Zqjyl9FmstmZxQCrrLIKfX19K4odkaa3P15YD4X1UFgPhfVQWA+F9VBYD8Ug9dCTrn/OsylJkqTGmGxKkiSpMSabkiRJaozJpiRJkhpjsilJkqTGmGxKkiSpMSabkiRJaozJpiRJkhpjsilJkqTGeAehMWz1tdblrvse7nUxumJq38qs8bTmbu0pSZLGJpPNMWzx0ilcdNWCXhejK3baeiOTTUmSJiEvo0uSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGrNyr3YcEXsC/wrMAtYBbgW+DHw1M5dUMacCbxvg5Xtm5vfbtnc4cBCwIXADcERmXtQWswZwArAHsCpwCXBIZt7etQOTJEnS43rZsnkY0A98AHgt8CPgi8Bn2uL+CGzb9u/iekCVaB4PnAzsCtwCnB8RL2rb1veA3YBDgL2BGcBFETG1WwclSZKkZXrWsgm8LjPvrj2/JCJWBw6OiCMzs79a/khmXjHYRiKiDzgSODEz51TLLgWuBz4K7FUtezElEd01My+oll1PaVHdDzilmwcnSZKkHrZstiWaLddQLm+vM4xNbQesBZxZ2/Zi4GxgdkRMqRbvAjwAXFiLWwBcVq2TJElSl/WyZXMgOwD3AX+pLds4Iu4HngbMAz6dmWfV1m9ePd7Utq0bgNWBZwB3VnHzW/1B2+J27krpJUmS9ARjJtmMiK2A/YFjqpZJKC2dV1ESwrWAA4AzI2K1zDy1ipkG9GfmI22bXFQ9rkNJNqcB9w+w60UMryX1cfPmzevkZUO22prrsXDhwkb3MVruvXcqd942UGP20MydO7eLpRm/rIfCeiish8J6KKyHwnooxlI9jIlkMyI2BH4AXEltgFBmfqEt9McRcTFwDHDqqBVwEDNnzqSvr6+x7d94ywJmzJjR2PZH0/Tp67LBpht19Nq5c+cya9asLpdo/LEeCuuhsB4K66GwHgrroWivh/7+/sYbyJan5/NsRsRawE+Bh4HdMvOxFbzkHGCjiFiver4I6IuIVdviplWP99Xi1h5ge9NqMZIkSeqiniabVYJ4LrA+8JrMvLeDzbT6am7etnwL4CHgT7W4qA0YqsfN72C/kiRJWoGeJZsRsTJlxPgLgdmZeccQXjOFMpXRHbXR7JdTRpnvXYtbqYq7MDOXVosvoLRs7lyLeyawfbVOkiRJXdbLPpsnA68DPghMjYhtautupFzePo0yEfsfKIniAcCOwFtbgZnZHxHHAcdHxN3A1VXcxsCba3G/i4jzgW9GxGHAg8CxwALGQP9PSZKkiaiXyWarhfGzA6x7OXAdpcXySMpl9scoieRumXlePTgz50QEwKHABpTR67tm5rVt290HmEOZwL2PcrvKPTPz4W4ckCRJkp6oZ8lmZj57CGGvH8b25lASyeXFPAS8s/onSZKkhvV8NLokSZImLpNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNWblXu04IvYE/hWYBawD3Ap8GfhqZi6pxc0GPglsAfwJODEzvzTA9g4HDgI2BG4AjsjMi9pi1gBOAPYAVgUuAQ7JzNu7fXySJEnqbcvmYUA/8AHgtcCPgC8Cn2kFRMS2wLnANcBs4NvAiRHxrvqGqkTzeOBkYFfgFuD8iHhR2z6/B+wGHALsDcwALoqIqV0+NkmSJNHDlk3gdZl5d+35JRGxOnBwRByZmf3AUcDVmfmOWsxGwMcj4muZuSQi+oAjKS2ecwAi4lLgeuCjwF7VshdTEtFdM/OCatn1lBbV/YBTGj5eSZKkSadnLZttiWbLNZTL2+tUSeQrgLPaYs6gXCrfsnq+HbAWcGZt24uBs4HZETGlWrwL8ABwYS1uAXBZtU6SJEldNtYGCO0A3Af8BdgYWAW4sS3mhurxedXj5tXjTQPErQ48oxY3v94ftBb3PCRJktR1vbyM/gQRsRWwP3BMZi6OiGnVqvvbQhdVj+tUj9OA/sx8ZDlxd1Zx7dtqxa0zwPIVmjdvXicvG7LV1lyPhQsXNrqP0XLvvVO587aBGrOHZu7cuV0szfhlPRTWQ2E9FNZDYT0U1kMxluphTCSbEbEh8APgSmoDhMa6mTNn0tfX19j2b7xlATNmzGhs+6Np+vR12WDTjTp67dy5c5k1a1aXSzT+WA+F9VBYD4X1UFgPhfVQtNdDf39/4w1ky9Pzy+gRsRbwU+BhYLfMfKxa1WqZXLvtJa0Wz/tqcX0RseoQ4tq31Yq7b4DlkiRJGqGeJptVgngusD7wmsy8t7b6VuBRlvXJbNmiepxfPbb6ag4U9xBlbs5WXNQGDNXj5iNJkqSu61myGRErU0aMvxCYnZl31NdXUx9dTDV1Uc0+wJ+Bq6vnl1NGme9d2/ZK1esuzMyl1eILKC2bO9finglsX62TJElSl/Wyz+bJwOuADwJTI2Kb2robM/NB4Fjg1xHxdeB04CXAgcBBrVHlmdkfEccBx0fE3ZQk9ADKaPY3tzaYmb+LiPOBb0bEYUBr+wuAUxs9UkmSpEmql8lmq4XxswOseznwq8z8bUS8nnJ3oH2BhcD7MvMr9eDMnBMRAIcCG1CmM9o1M69t2+4+wBzKBO59lNtV7pmZD3fnkCRJklTXs2QzM589xLgLGMJl7uruQXNWEPMQ8M7qnyRJkhrW89HokiRJmrhMNiVJktQYk01JkiQ1xmRTkiRJjTHZlCRJUmNMNiVJktQYk01JkiQ1ZtjJZkTsPMD9xSVJkqQn6aRl86fAnRFxQkS8qNsFkiRJ0sTRSbK5O3AZcBBwdURcFxGHR8SMrpZMkiRJ496wk83MPDcz96Lcg/xA4G7g08AdEfHziHhLREztcjklSZI0DnU8QCgzH8rMb2XmTsCzgI8A6wOnAXdFxHciYqculVOSJEnjULdGo68EPBXoA6YAjwCvBH4REddExMwu7UeSJEnjyMqdvjAi1gL2At4CvAT4O3A+8KHqcQmwG/B54NvA1iMtrCRJksaXYSebEbE7JcHcBVgVuAp4D/C9zLyvLfxHEbEucMoIyylJkqRxqJOWzf8H/An4AnBaZs5fQfx1wOkd7EeSJEnjXCfJ5quBizJz6VCCM/NK4MoO9iNJkqRxbtjJZmb+somCSJIkaeLp5HaVn4+IW5az/uaIOGFkxZIkSdJE0MnUR7sCZy1n/VnA6zorjiRJkiaSTpLNZwK3L2f9HVWMJEmSJrlOks0HgecsZ/1zKZO6S5IkaZLrJNm8GHhnRGzUviIing28s4qRJEnSJNfJ1EdHAbOBeRHxbeCGavlMYD9gMfCxrpROkiRJ41onUx/dEhEvAU4GDmlbfSlwSGZmNwonSZKk8a2je6Nn5g3AjtWtKJ9bLb41M+/tWskkSZI07nWUbLZk5j3APV0qiyRJkiaYjpLNiFgJ2JnSqjkNmNIWsjQzPzHCskmSJGmcG3ayGRFbAT8A/oEnJ5ktSwGTTUmSpEmuk5bNU4DVgN2B32Tm/d0skCRJkiaOTpLNFwIfzczzul0YSZIkTSydTOp+J4NfPpckSZIe10my+WngwIhYs9uFkSRJ0sTSyWX0dYC/AX+IiO8D/0O5a1Dd0sw8YaSFkyRJ0vjWSbL56dr/3zVIzFLAZFOSJGmS6yTZfE63dh4RmwCHA9tQ7q0+PzNntsWcCrxtgJfvmZnfb4s9HDgI2JByz/YjMvOitpg1KInwHsCqwCWUW2ze3oVDkiRJUk0n90a/o4v7fz6wK/A7Sv/RwfqQ/hH417ZlN9efVInm8cBHgKuBA4HzI+LFmXltLfR7wJaU+7o/CBwLXBQRL8jMh0d2OJIkSarr+HaVEbEpsCOwPnB6Zt4eEatQWhX/nJmPDmEz52Xmj6vtnQpsNUjcI5l5xXLK0gccCZyYmXOqZZcC1wMfBfaqlr2YktzumpkXVMuuB24F9qPMISpJkqQuGfZo9Ih4SkR8DZgPfJXSMvjcavUqlATvkKFsKzOXDHf/g9gOWAs4s7btxcDZwOyIaE3VtAvwAHBhLW4BcFm1TpIkSV3UydRHHwHeDnwM2JbanJuZ+VfKrSz/pSulW2bjiLg/Ih6LiGsiYu+29ZtXjze1Lb8BWB14Ri1u/gBJ7g3A87paYkmSJHV0GX1/4FuZeXxETB9g/fXAa0dWrCe4BriKkhCuBRwAnBkRq2XmqVXMNKA/Mx9pe+2i6nEdymT004D7B9jHoipmWObNmzfclwzLamuux8KFCxvdx2i5996p3Hnb3R2/fu7cuV0szfhlPRTWQ2E9FNZDYT0U1kMxluqhk2TzH4Arl7P+EWCNzorzZJn5hbZFP46Ii4FjgFO7tZ9OzJw5k76+vsa2f+MtC5gxY0Zj2x9N06evywabbtTRa+fOncusWbO6XKLxx3oorIfCeiish8J6KKyHor0e+vv7G28gW55OLqP/GXjWctbPAro5Yn0g5wAbRcR61fNFQF9ErNoWN616vK8Wt/YA25tWi5EkSVKXdJJs/gD492o0estSgIiYDexLGZgzmlp9NTdvW74F8BDwp1pc1AYM1ePmN1c8SZKkyamTZPNoYAGlL+XplETzIxFxBfAT4FrgU90qYLsqUdwLuCMzW50AL6eMMt+7FrdSFXdhZi6tFl9AadncuRb3TGD7ap0kSZK6qJNJ3R+MiO2A9wN7Av9HSdZupSSiJ2Tm/w1lWxExlWVTDj0LWDMi9qieX1U9nkaZiP0PlETxAMr8nm+tlak/Io4Djo+IuymTuh8AbAy8uRb3u4g4H/hmRBzGskndF9Dj/p+SJEkTUUeTulfJ5PHVv5FYn9L/sq71fH/gXEqL5ZFV7GOURHK3zDyvrUxzIgLgUGADyuj1XdvuHgSwDzCHMoF7H+V2lXt69yBJkqTu6/gOQt1Q3Y+8vf9ku9cPY3tzKInk8mIeAt5Z/ZMkSVKDhp1sRsS3hhC2NDPf0UF5JEmSNIF00rL5CqrR5zUrAU+vHu8G/jbCckmSJGkC6GSA0LMHWh4RT6Vcmn4v8KoRlUqSJEkTQidTHw0oMx/LzJOAnwMndWu7kiRJGr+6lmzWXAu8tIHtSpIkaZxpItl8FeA0QpIkSepoNPpRg6xam9KiuSXw6RGUSZIkSRNEJ6PRjx5k+SLKXYTeBXy90wJJkiRp4uhkNHoTl94lSZI0AZk4SpIkqTGd9NncqJMdZeaCTl4nSZKk8auTPpu38+Q7CA3FSh28RpIkSeNYJ8nmAcChwDOBM4Cbq+UB7AMsAL4ILOlGASVJkjR+dZJsPh3oAzbJzEX1FRHxceAyYMPM/FQXyidJkqRxrJMBQu8CvtaeaAJk5r2UaY/+faQFkyRJ0vjXSbI5HVh9OeufVsVIkiRpkusk2bwCeE9EzGpfERFbAe8BfjfSgkmSJGn866TP5sHAr4ArI+Iq4JZq+abA1sB9wCFdKZ0mjCVLlnLXfQ939NrV1lyv49c2YWrfyqzxtFV6XQxJksaFTu4gdGNEvAD4EDAb2KNadQfwBeCzmfnn7hVRE0H/Y4u5/LqFHb124cKFzJgxdpLNnbbeyGRTkqQh6qRlk8y8C3hf9U+SJEkaUEfJZktEbAqsD8zLzAe6UyRJkiRNFB3dGz0i3hwRC4D5wK+BWdXydSPi5ojYq4tllCRJ0jg17GQzIt4IfBe4CfgAMKW1LjPvqZbv260CSpIkafzqpGXzo8AvM3Nn4LQB1v8OeNGISiVJkqQJoZNkc3Pgh8tZ/xdgvc6KI0mSpImkk2Tzbyz/DkIbA/d0VhxJkiRNJJ0kmxcD+0XEkyYajIgZwIHAz0ZaMEmSJI1/nfbZfDrwe+DdwFJgl4j4NHA9sAQ4pmsllCRJ0rg17GQzM28BXgL8GTiaMhr9/cAHgf8Gts/MBd0roiRJksarYU3qHhErAc8A7srMV0fENGATStL6x8y8u4EySpIkaZwa7h2EngLcChwBfC4zFwFXdb1UkiRJmhCGdRk9Mx8DFlL6aUqSJEnL1ckAoW9TRqOv2u3CSJIkaWIZ7mV0gJuBlYD5EXEa8EfgkfagzDx7hGWTJEnSONdJsvnd2v8/NkjMUmCFyWZEbAIcDmwDzATmZ+bMAeJmA58EtgD+BJyYmV8aIO5w4CBgQ+AG4IjMvKgtZg3gBGAPYFXgEuCQzLx9ReWVJEnS8AzpMnpEfDEiZlVPX179ex3wytrz+r9XDHH/zwd2Bf4A3DjIvrcFzgWuAWZTLuOfGBHvaos7HDgeOLna5i3A+RHRfp/27wG7AYcAewMzgIsiYuoQyyxJkqQhGmrL5sHAFcDczLw0IqZT7oH+qsy8dAT7Py8zfwwQEacCWw0QcxRwdWa+o3p+SURsBHw8Ir6WmUsiog84ktLiOafa3qWUSeY/CuxVLXsxJRHdNTMvqJZdTxlhvx9wygiORZIkSW06GSDUMmWkO8/MJctbXyWRrwDOalt1BuVS+ZbV8+2AtYAza9teTLmUPzsiWmXdBXgAuLAWtwC4rFonSZKkLhpJsjkaNgZW4cmX2G+oHp9XPW5ePd40QNzqlInoW3HzB0hyb6htS5IkSV3SyQCh0TStery/bfmi6nGdWlx/ZraPiq/H3VnFtW+rFbfOAMuXa968ecN9ybCstuZ6LFy4sNF9jJb+WGdExzKW6uHee6dy5229uVnW3Llze7LfscZ6KKyHwnoorIfCeijGUj0MJ9l8bkT8c/X/tarH50XEXwcKzswrR1SycWDmzJn09fU1tv0bb1nAjBkzGtv+aOrrW7XjY1m4cOGYqofp09dlg003GvX9zp07l1mzZq04cIKzHgrrobAeCuuhsB6K9nro7+9vvIFseYaTbB5T/at70vRDlL6cSylzcY5Uq2Vy7bblrRbP+2pxfRGxamb+3wriBsoSptViJEmS1CVDTTb3b7QUg7sVeJTS1/LC2vItqsf51WOrr+bmlCmS6nEPUebmbMW9KiKmZObStrj5SJIkqauGlGxm5mlNF2SQ/fZHxMWUqYs+X1u1D/Bn4Orq+eWUUeZ7UyWbEbFS9boLa4nlBZSplHamSl4j4pnA9sB7Gj0YSZKkSainA4SqidRbUw49C1gzIvaonl+VmXcAxwK/joivA6cDLwEOBA5qjSqvktLjgOMj4m5KEnoAZTT7m1v7y8zfRcT5wDcj4jDgwWr7C4BTGz1YSZKkSajXo9HXB85pW9Z6vj9wamb+NiJeT7k70L7AQuB9mfmV+osyc05EABwKbECZzmjXzLy2bfv7AHMoE7j3UW5XuWdmPty1o5IkSRLQ42Szuh/5CieHr+72c8EQ4uZQEsnlxTwEvLP6J0mSpAaN9UndJUmSNI6ZbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqzMq9LsCKRMR+wLcHWHVyZh5ci5sNfBLYAvgTcGJmfmmA7R0OHARsCNwAHJGZFzVQdE1QS5Ys5a77Hh71/a625npd3e/UvpVZ42mrdG17kiQNZMwnmzWvAR6oPf9z6z8RsS1wLvAd4DDgJcCJEfFYZn6lFnc4cDzwEeBq4EDg/Ih4cWZe2/whaCLof2wxl1+3cNT3u3DhQmbM6F6yudPWG5lsSpIaN56SzbmZec8g644Crs7Md1TPL4mIjYCPR8TXMnNJRPQBR1JaPOcARMSlwPXAR4G9Gi6/JEnSpDPu+2xWSeQrgLPaVp1BuVS+ZfV8O2At4MxWQGYuBs4GZkfElOZLK0mSNLmMp5bNeRGxHrAAOBX4ZGb+HdgYWAW4sS3+hurxecDvgc2r5zcNELc68Azgzu4XW5IkafIaD8nm/wIfB64EFgOzgY8BzwH2A6ZVcfe3vW5R9bhO9TgN6M/MR5YTN6xkc968ecMJH7bV1lyPhQtHv29gE/pjnREdy1iqh5Eey0h0c7/33juVO2+7u2vbG01z587tdRHGBOuhsB4K66GwHoqxVA9jPtnMzJ8BP6st+kVEPAAcHRGf6FGxAJg5cyZ9fX2Nbf/GWxYwY8aMxrY/mvr6Vu34WMrAmLFTDyM5lpHodj1Mn74uG2y6Ude2N1rmzp3LrFmzel2MnrMeCuuhsB4K66For4f+/v7GG8iWZ7z22Ty7etySZS2Ta7fFtFo876seFwF9EbHqCuIkSZLUJeM12ay7FXiUZX0yW7aoHudXj62+mgPFPUSZm1OSJEldNF6TzTcBSynTIfUDF/PkqYv2oczFeXX1/HLKPJ17twIiYqXqdRdm5tKmCy1JkjTZjPk+mxHxM0oyOQ9YQhkg9G7gm5n5xyrsWODXEfF14HTKpO4HAgdl5hKAzOyPiOOA4yPibkoSegBlNPubR/GQJEmSJo0xn2xSLn+/HfgHSnlvAY4ATmwFZOZvI+L1lLsD7QssBN5Xv3tQFTcnIgAOBTagTHu0q3cPkiRJasaYTzYz873Ae4cQdwFwwRDi5gBzRlwwSZIkrdB47bMpSZKkccBkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUmJV7XQBJvbFkyVLuuu/hXhdj2FZbc70nlXtq38qs8bRVelQiSdLymGxKk1T/Y4u5/LqFvS7GsC1cuJAZM56YbO609UYmm5I0RnkZXZIkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGO+NLmncW7JkKXfd9/CKA8eBqX0re593SROKyaakca//scVcft3CXhejK3baeiOTTUkTipfRJUmS1BhbNiVpDBlql4DV1lxvTHcdsDuApBaTTUkaQ4baJWDhwoXMmDF2k027A0hqmXTJZkRsCnwJ2B54BDgTOCIzx+63tiRJ0jg1qZLNiFgbuAS4A9gDWB/4HLAe8KbelUySJGlimlTJJvBOYBrwj5l5D0BE/B04PSI+kZk39LR0kjRBjNZ0VKPRd9X+p9LITLZkcxfgolaiWfkB8C1gNmCyKUldMFrTUY1G31X7n0ojM9mSzc0pieXjMrM/Im4FnjeM7awE8Oijj3axaE+2ZMliVn7Kkkb3MVoW//2xjo9l1adOGVP1MJJjGYlu10OvjmOkBqqH8XosAxnqsYy1z0W70XpPRqMe/v7Yo/T3r9ToPrqhv7+/10UYE6yHol4PtXylJyfylKVLl/Zivz0REY8BH8vMT7ct/y/gL5n5L0PZzty5c7cHftNAESVJkpqyw6xZs/5rtHc62Vo2u+UqYAfgf4HFPS6LJEnS8qwEPJ2Sv4y6yZZsLgLWHmD5NGD+UDcya9asfmDUfxlIkiR16NZe7Xiy3a7yJkq/zcdFRB+wMcNINiVJkjQ0ky3ZvADYKSKm15a9Aeir1kmSJKmLJtsAobWBecDtwCdYNqn7RZnppO6SJEldNqlaNjPzfuAVwF+B/wd8HjgLeHsPiyVJkjRhTaqWTUmSJI2uSdWyKUmSpNFlsilJkqTGmGxKkiSpMZNtUvcxLyI2Bb4EbA88ApwJHJGZD/e0YCsQEXsC/wrMAtahTB77ZeCrmbmkFjcb+CSwBfAn4MTM/NIA2zscOAjYELiBUgcXtcWsAZwA7AGsClwCHJKZt7fF9aROI2J1yvytzwC2zszf19btC3wEeDalro7NzLPaXv9U4FjgbZSbEVwFvCcz/7stbkPgC8BrgKXAT4D3ZuY9bXH/TJl9YRZwH/CNar+N3QUrIt4KvJfyfj8MXA3s0yrbZDgfImJ3ynu9OfA34DLgQ5l5S1vchDknImIT4HBgG2AmMD8zZw4QN2bf/6GWbST1EBErAYcBu1b7WRm4Hjim/fgmcj0MED8LuBJ4JDNXb1vXk8/AUD6fKzKMz8WqwIeAtwL/ANwDXJCZB7bFjZvzwZbNMaSamukSYA3KiXEYsA/wrR4Wa6gOA/qBDwCvBX4EfBH4TCsgIrYFzgWuAWYD3wZOjIh31TdUfYCOB06mfAnfApwfES9q2+f3gN2AQ4C9gRnARRExtbatteldnR7NAD/oImIP4DTgh5R6+CXwvepDXPd5yhfJx4HXA49Sjm9GbVsrAxcCLwD2BQ4AtgPOjYgptbjnVvu5j/L+HE95rz7ZheMcUER8lPKD4/9RjvMdlC/Evmr9hD8fImInyvHPB/6lKtvzgF9GxJq1uIl2Tjyf8l79AbhxoICx/P4PtWxDsKJ6WI2SwPw3sD/wJsof8F9ExGvbyjSR66G+z6dQvjfuHiRk1D8Dw/h8rshQPhdPofz93Lcqz6uBD1Jm0anHjavzwZbNseWdlFtn/mOt5efvwOkR8YnMvKGnpVu+12Vm/cvhkqpl7+CIODIz+4GjgKsz8x21mI2Aj0fE1zJzSXVHpyMpv5bmAETEpZRf+x8F9qqWvZjyAds1My+oll1P+cW5H3BKtY+e1GlEzATeBbwf+Grb6k8A52Tmh6vnl0TE5sAxwE+r1z+jev2hmfn1atkVwG2UlsIPVq99I/AiYGbrWCJiIaX1bDbLblbwAeB+YM/qvbgoItYCjoqIz2bmfd07eoiIoCTbb8jMn9RW/aj2/8lwPuwD3AG8LTOXVvu7A/gd8BKq95uJd06cl5k/rvZ9KrDVADFj+f1fYdm6VA+PAM/JzEWtBRHxc2Azyh/8n1TLJno91B0IrEVJdA6tr+jhZ2CFn88u1sP+wLbAFpn5p9ry02v1MO7OB1s2x5ZdKBPM15v5f0BpMRzuL6hR1ZZotlxDabZfp/pwvIIyr2ndGZRLAFtWz7ejfNGcWdv2YuBsYHbtV+kuwAOUX6+tuAWUL5NdatvvVZ2eDJwE3FxfGBHPobRsndkWfwawdUSsVz1/NbAStfrKzIcof3zaj+/6epKUmZdTEpz2uB9VX6j1fbbel27bH7ijLdF83CQ6H54KPNRKNCv3V49TYGKeEytKQsby+z+Msq3QiuohMxfXE81q2VJKS+eM2uIJXQ8tEbEupbXuPZQWy3aj/hkYxudzhYZYDwdSEts/LSdm3J0PJptjy+a0Na1XH4JbKSf7eLMD5fLEXyj3n1+FJ186aH0ZtI6vde/6mwaIW53S/7EVN3+AD+8NPLGuRr1Oo/RT3AQ4boDVreMbrB6iFndXZt47QNxm1aWWVtxAl2Mer4eIeBqwUXtc1WfnYZqph22A6yLiyIj4c0Q8FhFXRsTLqvWT5Xw4Fdg8Ig6JiLUj4tnAHMrxtPpWTZZzom4sv/9DLVsjqvdxO554zJOlHj4D/FdmXjjI+l58Bob6+RyxKP1RtwRuj4jTIuKvEfG3iPhR1YLYMu7OB5PNsWUay1o96hZRBt2MGxGxFaV16/PVL65p1ar720Jbv+pbxzcN6M/MR4YQ176tVly9rka1TqtLMCcAH8zMvw4QMpx6aI9pxT2V8oWyorjWttYeZJ/tcd20IfAqyjlwKPA64EHgwirhmhTnQ2ZeQumr+clqH7cBzwFeVWtNmSznRN1Yfv+HWramHEJJYP6jtmzC10PVH3Af4H3LCevFZ2A062E65TiOoHyHvpHS1/1FwAVR+qK2yjSuzgeTTXVdlFGAP6CMJvzMCsInmuOAWzLz9BVGTmxPoXzxvzEzz65aKnajJJwf6GnJRlFEbAd8B/gm5RLUnsASykCF1XpZNo09Vcv/Z4E5mfmbXpdntEQZlX8K8LnM/GOvy9NDrZzsr8DumfmzzDyT8r3xfOANPSvZCJlsji2LWPZrq24a5XL0mFe17P2Uchlit8x8rFrV+vWzdttLWr+W7qvF9UWZ+mFFce3basXV62rU6jQink/pvP6x6pLp2iz7pb16lCkohlMP7TGtuMdYNjJxKMd3/yD7bI/rpkXAvVmbjiTLFBpXUKb8mPDnQ+WLwCWZ+b7MvCQzv0/psP9PlGlNWmVigHJNtHOibiy//0MtW1dFxAuBH1MG0R3Rtnqi18OBwNOBU2rfnatCGSld+2HWi8/AaNbD/ZRpmi6rt1pmmTbvQcp3Z6tM4+p8MNkcW25iWV8M4PFOuRtTpk4Z06oT/1xgfeA1bf1qbqV0+N687WVbVI+t42v1QRko7iHKtCCtuKh1hK7H1etqNOt0U8oMD5dQPpiLgPOqdZcAv2H5xweQ1eNNwPoR0X5pYgvg5lofnCcdXy1uPkBm/g1Y0B4XEc8CptLMubW8Ud2rMjnOh9b+/7u+IDPvpMybt3GtTLSXi4l3TtSN5fd/qGXrmojYGPgZZR7at7YNKIOJXw/PAzagHEfru/MI4GnV/z9VK/dofwaG+vkcseoH+e2DrF5KlYCvoExj8nww2RxbLgB2iojptWVvoIyMu2Dgl4wNVV+Ss4EXArMz8476+qp/2sVUUzLU7AP8mfIlC3A5ZfTc3rVtr1S97sLal/AFlF9YO9finkmZkLZeV6NZp/8FvLztX6v/0buAAzLzNsqHcu+21+4DXFUb1f9zyuXWx+srylRSr+PJx/eCahqOVtw2lImH2+N2j4hV2vbZz7KBKt30E2B6RDw+QrHqkL8tMHeSnA9QRr/Oqi+o/pitS/VHZRKdE48by+//MMrWFVW3o59X2949MwcahT3R6+EknvzdeRrwf9X/T6riRv0zMIzPZ7f8BNi+3s0mysTzawFzq0Xj7nyYsnRp+w8o9Up16WAe5Y/QJygthJ+jTEnwpt6VbMUi4qvAv1HmOWvva3RjZj5YdQD/NWWE7umUeQaPBQ7KzK/UttWarPbDlBP4AEpH6Rdn5rW1uJ9QLkceRrnEcCylSf8F1S/EntdpROxIadV8/A5CUe62dBbl1/ovKBMTv4cyF9pPa689iXKp9TBK0nI4ZV62F2TmwipmZeD3lE7lH6a0rJ4A3AW8JJfN7fhcSgvbxZS7REQV96XM/FADx/0U4LfAepR53x6qjmNrylxuf5gM50NEHEyp75Mol0inU+bHWw94fqv1f6KdE1EmjG5NrXIQpYXk/dXzqzLzjrH8/g+1bCOtB8pMHb+tlr+F8h49LjOvmAz10N44Ub3maODwfPIdhEb9MzDUz2c36qFKBq+lvMefpySLx1Peyy1bXdPG2/lgy+YYkpn3UwYR/JVy15HPU07wt/ewWEPV+uX0WcqXZ/3flgCZ+VvKh3RryiWjA4D3tZ+kWSap/QhlFPNPKZdYdq1/gCr7UH4FngKcQ/l19cqs3V5rLNZpZp5DGaW9B6UedgbePMCX1vsod9I4jtI9YTXK8S2sbevvlNuxzQO+S7mjwxWU/rJLa3F/BF5JSXLOpyQ8/0FJBLuuupy1K+WLqfX+AOyYmX+oYibD+XAyZcLkHSh98U6k3D3k5fVuJhPwnFif8h6cA+wIPLP2/OXV/sfs+z/UsnWhHjagjDRenXJ+tH93TpZ6GI5R/wwM4/O5IkP5XPxP9f8p1fKTKA04r8xlYyDG3flgy6YkSZIaY8umJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqzP8HVvNfDeBqWn8AAAAASUVORK5CYII=\n",
+      "text/plain": [
+       "<Figure size 720x360 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "df1 = df[df[\"name\"].isin([\"IssueQuery\"])]\n",
+    "df1['delta'] = df1['ts'].diff()\n",
+    "ax = df1['dur'].plot.hist(bins=BINS, alpha=0.5, figsize=figsize)\n",
+    "ax.set_title('IssueQuery duration (usec)');\n",
+    "plt.show()\n",
+    "ax = df1['delta'].plot.hist(bins=BINS, alpha=0.5, figsize=figsize)\n",
+    "ax.set_title('Time between IssueQuery (usec)');\n",
+    "\n",
+    "# df1['delta'].describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# for SingleStream\n",
+    "if False:\n",
+    "    df1 = df[df[\"name\"].isin([\"QuerySamplesComplete\"])]\n",
+    "    ax = df1['dur'].plot.hist(bins=BINS, alpha=0.5, figsize=figsize)\n",
+    "    ax.set_title('Inference time (usec)');\n",
+    "    plt.show()\n",
+    "    ax = df1['dur'].plot(figsize=figsize)\n",
+    "    ax.set(ylim=(0, 100))\n",
+    "    ax.set_title('Individual inference time (usec)');"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAoMAAAFtCAYAAAB8yGDhAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAABA20lEQVR4nO3deZwcZbX/8U8WMkmAkIWwBAkgwgGM4DXyExQF5SKyCFxlEQQFRUAQRETZd81FiQsoKKAsV5TVK8tlEWVfJQ4IJJAjW4hhIIQkLJIwZJnfH+fppNL0bD29Tdf3/Xrl1emq6uqnz9TUnH7qeU4N6OjoQERERETyaWC9GyAiIiIi9aNkUERERCTHlAyKiIiI5JiSQREREZEcUzIoIiIikmNKBkVERERybHC9GyAi+WFmw4GfArsDawHnuvvRdW1UAzKz9YEXgIPc/bL6tmZFZvZd4AhgY3dfXO/2dMXMdgGuBjZw9zn1bo9Io1LPoIj0mJkdaGYdZrZVmbs4BjgEuBg4APhdxRrXD5nZ4WZ2YL3b0VNmtipwAnBOoyeCAO5+M/Ac0WYR6YR6BkWklrYDHnf3U+vdkAZxOPAacFnR8heBYcCiWjeoG18j2vU/9W5IL1wInGNmp7v7m/VujEgjUjIoIrW0BjCvUjszswHAUHdfWKl9NgJ37wDeqXc7SvgacIu7v13vhvTCdcB5wN7Ab+rcFpGGpGRQRPrEzC4DvgRsCJwP/CewELgcOM7dl5jZdsBdmdcU7oO5gbvPMLMW4Hhgf2A80Vt2DXCSuy8oet2FaV8nA0Zcdr7MzFYDTgP2JMYjzgIuBSa5+5L0+vWJsXgnAHPTe74PeAI43N2nFH22jYEzgO2BEcC/gJuz4xzNbG3gLGBXYBTwPHCeu/+qm7jNANYriseL7r5+qTGDZnZ6+nybAScBnyd6Di9Kz8cR8f8MEf/J7n5O0Xv2KM6dtHcDYPP0Htnl72lrZl0HcIa7n56erwKcDnwxtfdNYBpwqrvfm3ndlkTcPwEMAVqBU9z9rqL9r532twswFngZ+AtwjLu/BeDur5rZE8B/oWRQpCSNGRSRShgI3EYkWMcC9wDfJRI1gKeJMYKzgOnp/wcAc1Lv3p+A7wM3A0cSCcrhwPVpfdaniITkj8BRwHQzG0YkiAcCVwDfAu4kEoULS7R3n/R+FxJJ5frA/5rZSoUNzOyDwCPAjsAl6b2uJZKwwjZrAA8DnwMuAL4NTAUuMLOTu4nZ0SXicXQ3rwG4kvgifzzwEJHYfpdIgl4BjgOeAX5sZp/JtLW3cS728fT49x60sTO/Su/7p/S+PwLmAFtk2rktcB8wGjgzfZ4W4Pb0paKw3VrEz+crxLFwJHG5/f8BY4retxXYugefUSSX1DMoIpWwEnCtu5+Znv/azB4Fvg78yt1nA1eY2fHAa+5+ReGFZrYfkUx92t3vySz/O5HY7QDcnnmvTYCPuPs/MtuemFk+PS2+yMxeAH5gZue4u2f2sS6wkbvPT6934AYi8fu/tM35xDnyQ+7+Qua9Tsrs5wdEovKhzGzVX5vZxcCJZvZLd3+9VMDc/Xoz+0FxPHqg1d2/ntpyETAD+DHRc/bDtPxKoI24rHtnet2+9C7OxTZJj8/3oq3FdgUudvdjSq1MydqFwP3ADulyOWb2a+AxYBLLk9Kzid7Fj7v73zK7Ob1E0vc80Wu7NhEXEclQMigilXJx0fP7iN6u7uwN/BOYZmarZ5bfA3QAn2bFJOXBbCKY2cf9wGtF+/grkbBtB2STwT8WEsFMWwHeD2BmY4FtgfOziSAsG89XSFz2JHq5Oore93bgYOBjwJ87++BlWnapM12C/ztxqfu3meWvpwT3/ZnX9TbOxcYAS4E3+tD2N4CPmdk67v5SifVbEJf+fwyMMbPsur8AR6byRO8Ql31vLUoEgeU/o4zCz3p1lAyKvIeSQRGphEXu/nLRsvlEb0x3NiYSgM7qwK1R9Py5TvaxRS/2MTP7xN3np8Sj0N5CEjW1k/1BjFEbRfS+fa2H71sJM4uev0HE/5USy9fMPO9tnEsZkP4VJ1s99T1iLOlMM3uMGFrwu0yv7cbp8belXpyMAd4lxnB29fPJKvQUlttukaamZFBEKmFpH147EHiKGG9XSnFPTqmZwwOJy6H/3ck+ii9tLulku96MKSuMub6SGFNYyrRe7K+nSrW9s/hnP09v41zstbS/1Vje0wadJFhmNqh4mbtfa2b3EUXHP0uMw/y+mR3o7n9geUyPJ8b5lTIntaE3Ckn+a718nUguKBkUkXp7DpgI3FHi8l5v9rGqu/+1gm0CmNDFNnOAt4DBfXjfWvZU9TXOT6fHDVgxGSz8f2TR9uuV2knqwbwQuNDMRhITcM4A/sDyuL/VVUzN7F1iJnJXP5+sQpuLe09FBM0mFpH6u5q4nPnN4hVm1pLuetGTfWxpZjuX2MeqqaRKj7n7a8RYugNTSZXs/gakbZYQNez2MLMtiveRxh125216dim9Evoa5wfS40ezC1Mh59eIWd5Zhxe9x6BU/if72teJsjQj06JW4FngmFLtKcTU3ZcSYzV3MrOPldiuuId3IvBwH75siDQ19QyKSL1dQUzEOD+VFbmfuBxpxKSHvYC7u9nHOUTJlxvM7HIiqRhG9BztBXyImHXbG0emtrSa2YXEpebxRE3FjdI2xxOTUx5KM4inEcndh4kJDkO7eY+/A4eb2WnE5I5/u/tNvWxnT/Upzu4+08z+Qcw6vqho9W+A483sN8Rn+hTLx/8VrAq8ZGZ/BB4nevY+Qcxw/mV6j6Vm9nViLOFTZnYJUX5nHDGhZwAx0QWipM4OwN3p5/MUkex+gYj9DFhW/mdz4Nfdh0gkn5QMikhdpQTgC0SNva8S48kWEsnXBURB6O72sTDVoDuBSGwOIC7hPkMUhO715UF3fzLdg/ks4FAiufwXcFNmm1dTz9QpwB5Er9s84pLqd3vwNmcSZW6OISZEvJjdfyVVIs7E2MizzWzloruQnElMqNmTiP+twE7Aq5ltFhDlenYAdiPKEb1A1KU8N9POe1PcTyF6F0cQP78prDiT+uUU+7OIsjkjiXGPt7Pi2MAvEhNOru7B5xPJpQEdHeo1FxGR7qVLt88Tdwzp8g4rjSL1Zt6dvWuMiKxIYwZFRKRH0i3ezga+Z2YNf2XJzHYBPkAUqxaRTqhnUERERCTH1DMoIiIikmNKBkVERERyrOHHfDSi1tbWFmBL4GU6v5OBiIiISCMYBKwNTJk4cWJ78Uolg+XZkuU3thcRERHpDz5J1BhdgZLB8rwMsPHGGzNkyJA+7Wjq1KlMmNDTOyo1L8VBMQDFABQDUAxAMQDFACoXg3fffZd//vOfkPKXYkoGy7MEYMiQIbS09OouVyVVYh/NQHFQDEAxAMUAFANQDEAxgIrHoOTQNk0gEREREckxJYMiIiIiOaZkUERERCTHlAyKiIiI5JiSQREREZEcUzIoIiIikmNKBkVERERyTMmgiIiISI4pGRQRERHJMSWDIiIiIjmm29E1sLfefpcF7Yvr3YyKGN4ymFVX7tt9nEVERKTylAw2sAXti7ljysx6N6Mitt9yvJJBERGRBqTLxCIiIiI5pmRQREREJMeUDIqIiIjkmJJBERERkRxTMigiIiKSY0oGRURERHJMyaCIiIhIjikZFBEREckxJYMiIiIiOaZkUERERCTHlAyKiIiI5JiSQREREZEcUzIoIiIikmNKBkVERERyTMmgiIiISI4Nruebm9kHgGOBrYAJwHR3n1C0zWXAV0u8fC93v65o22OBI4C1gGnAce5+R9E2qwLnAHsCQ4G7gCPdfUYFPpKIiIhIv1LvnsEPArsAzwJPdbHd88DWRf/uzG6QEsFJwPlpn88AN5vZFkX7uhLYDTgS2AcYB9xhZsP7+mFERERE+pu69gwCN7n7DbCsB/CjnWy30N0f7mwnZtYCnAz83N0np2X3AE8CJwF7p2UfIxLFXdz9lrTsSeA54EDggr5/JBEREZH+o649g+6+tEK7+jiwGnBVZt9LgGuAncxsQFq8M/AGcFtmu5nAA2mdiIiISK7Uu2ewpzY0s9eBlYGpwNnufnVm/abp8emi100DVgHWAWal7aaXSEKnATtWutEiIiIija7eYwZ74jFikskexKSPWcBVZnZgZptRQLu7Lyx67fz0ODqz3esl3mN+ZhsRERGR3Gj4nkF3P7do0Q1mdidwBnBZ7Vu03NSpUyuyn9bW1pLLh40YS1tbW0Xeo97mzh3OrBfmdLlNZ3HIE8VAMQDFABQDUAxAMYDaxKDhk8FOXAtcYGZj3X0O0bPXYmZD3f2dzHaj0uO89DgfGF9if6My2/TYhAkTaGlp6e3LVtDa2srEiRNLrps9bwHjxi3o0/4bxZgxq7PmRqVCH7qKQ14oBooBKAagGIBiAIoBVC4G7e3tXXZg9YfLxD1RGCu4adHyzYC3gJcy21lmQkl2u+nVa56IiIhIY+p3yWBK5PYGXky9ggAPErOE98lsNyhtd5u7d6TFtwAjyUwWMbN1gW3SOhEREZFcqfcdSIazvKTLesAIM9szPZ+SHi8nCkU/SyRyBwPbAQcU9uPu7Wb2A2CSmc0BHk3bbQjsl9nub2Z2M/BbM/su8CZwJjCTOo8/FBEREamHeo8ZXIMY/5dVeH4QcCPR43dy2nYRkejt5u43ZV/k7pPNDOAoYE2iXMwu7v540f73BSYTBaZbiNvR7eXuzTE4T0RERKQX6poMpvsBF4/fK7Z7L/Y3mUj0utrmLeDQ9E9EREQk1/rdmEERERERqRwlgyIiIiI5pmRQREREJMeUDIqIiIjkmJJBERERkRxTMigiIiKSY0oGRURERHJMyaCIiIhIjikZFBEREckxJYMiIiIiOaZkUERERCTH6npvYpH+6K2332VB++KK73fYiLHMnreg4vvtzPCWway68pCavZ+IiDQmJYMivbSgfTF3TJlZ8f22tbUxblztksHttxyvZFBERHSZWERERCTPlAyKiIiI5JiSQREREZEcUzIoIiIikmNKBkVERERyTLOJpSaWLu3osmxKrcuq9MWixUvq3QQREZGKUTIoNdG+aAkPPtHW6fpal1Xpi49vPq7eTRAREakYXSYWERERyTElgyIiIiI5pmRQREREJMeUDIqIiIjkmJJBERERkRxTMigiIiKSY0oGRURERHKsrnUGzewDwLHAVsAEYLq7T8isHwR8F9gF2Ixo75PAGe5+R9G+ZgDrlXibse7+Wma7VYFzgD2BocBdwJHuPqNiH0xERESkn6h3z+AHiUTvWeCpEuuHAScC/wAOAr4EvAT8xcx2LbH9dcDWRf9eL9rmSmA34EhgH2AccIeZDe/bRxERERHpf+p9B5Kb3P0GADO7DPho0fqFwAbuPr+wwMxuBzYmegz/r2j72e7+cGdvZmYfI5LPXdz9lrTsSeA54EDggr58GBEREZH+pq49g+6+tJv1S7KJYFrWQfQUlnNPsJ2BN4DbMvubCTyQ1omIiIjkSr17BnvNzAYCHweeLrH6y2Z2MLAEuB84wd0fzazflBiXWJyETgN2rEZ7RURERBpZv0sGibF+BhxStPxG4G/ATGIiyQnAfWa2pbsXxiOO4r1jCAHmA6N725CpU6f29iUltba2llw+bMRY2traKvIe9dZuo7v9LP3ls/bks5SrljGYO3c4s16YU7P366nOfh/yRDFQDEAxAMUAahODfpUMmtm2wI+Bye5+X3adux+VeXqfmd0KTAeOB75SjfZMmDCBlpaWPu2jtbWViRMnllw3e94Cxo1b0Kf9N4qWlqGMG9f5lf22trYu1zeS7j5LuWodgzFjVmfNjcbX7P16oqvfh7xQDBQDUAxAMYDKxaC9vb3LDqx6zybuMTPbHLgBuB44rrvt3X0ucCeQjeJ8YGSJzUcB8/rcSBEREZF+pl8kg2a2IfBn4FHggDSJpBxPx+5sQNHyzYheRBEREZFcafhk0MzWAm4HXgH2cPd3e/i61YHtgSmZxbcQPYM7ZrZbF9gmrRMRERHJlXrfgWQ4y0u6rAeMMLM90/MpwKtEGZg1gGOAzcxs2esLNQXNbF9gV+BWoij1+sSl5Bbg7Mz2fzOzm4Hfmtl3gTeBM4lJJ5dV4zOKiIiINLJ6TyBZA7i2aFnh+UHA3cAW6fn1JV5fuNz7AlF38KfE+L83gHuAPd29+PLvvsBkosB0C3E7ur3cvTlmaoiIiIj0Ql2TwXQ/4OLxe8W6W1/oIfx0D9/zLeDQ9E9EREQk1xp+zKCIiIiIVI+SQREREZEcUzIoIiIikmNKBkVERERyTMmgiIiISI4pGRQRERHJMSWDIiIiIjmmZFBEREQkx5QMioiIiOSYkkERERGRHFMyKCIiIpJjSgZFREREckzJoIiIiEiO9ToZNLMdzWxANRojIiIiIrVVTs/grcAsMzvHzLaodINEREREpHbKSQb3AB4AjgAeNbMnzOxYMxtX0ZaJiIiISNX1Ohl09xvdfW9gTeAbwBzgbOBFM7vdzPY3s+EVbqeIiIiIVEHZE0jc/S13v8TdtwfWA04E1gAuB2ab2f+Y2fYVaqeIiIiIVEGlZhMPAlYCWoABwELgP4G/mNljZjahQu8jIiIiIhU0uNwXmtlqwN7A/sAngMXAzcDx6XEpsBvwM+BSYMu+NlZEREREKqvXyaCZ7UEkgDsDQ4EpwLeBK919XtHm15vZ6sAFfWyniIiIiFRBOT2D/wu8BJwLXO7u07vZ/gng92W8j4iIiIhUWTnJ4GeBO9y9oycbu/sjwCNlvI+IiIiIVFmvk0F3/2s1GiIiIiIitVfO7eh+ZmbPdLH+n2Z2Tt+aJSIiIiK1UE5pmV2Aq7tYfzXw+fKaIyIiIiK1VM6YwXWBGV2sfzFt0y0z+wBwLLAVMAGY7u7vqUloZjsBPwQ2Iyav/Nzdf1Fiu2OJ2+StBUwDjnP3O4q2WRU4B9iTmA19F3Cku3f1mURERESaUjk9g28CG3Sx/v1E0eme+CDR0/gs8FSpDcxsa+BG4DFgJ6Jm4c/N7LCi7Y4FJgHnp30+A9xsZlsU7fJKov7hkcA+wDjgDt1CT0RERPKonGTwTuBQMxtfvMLM1gcOTdv0xE3uvq677wk82sk2pwKPuvvX3f0ud/8B8FvgNDMbmN63BTiZ6DGc7O53ErUQnwdOyrTvY0SieLC7X+nuNwP/BYwHDuxhm0VERESaRjnJ4KnE5eWpZnaumR2S/p1H1BQcCJzSkx25+9Ku1qck7zO8d4ziH4hLwR9Jzz8OrAZcldn3EuAaYCczG5AW7wy8AdyW2W4m8EBaJyIiIpIrvU4G3f0Z4vZzjxKXWn+d/n0LaAU+6e5eofZtCAzhvZeQp6XHTdLjpunx6RLbrQKsk9lueokkdFpmXyIiIiK5Uda9id19GrBdutXc+9Pi59x9bsVaFkalx9eLls9Pj6Mz27W7e/FYxex2s9J2xfsqbDe6xPIuTZ06tbcvKam1tbXk8mEjxtLW1laR96i3dhvd7WfpL5+1J5+lXLWMwdy5w5n1wpyavV9Pdfb7kCeKgWIAigEoBlCbGJSVDBa4+2vAaxVqS78zYcIEWlpa+rSP1tZWJk6cWHLd7HkLGDduQZ/23yhaWoYybty4Tte3tbV1ub6RdPdZylXrGIwZszprbvSeob911dXvQ14oBooBKAagGEDlYtDe3t5lB1ZZyaCZDQJ2JHoFRwEDijbpcPezytl3kULP3sii5YUew3mZ7VrMbKi7v9PNdqX++o3KbCMiIiKSG71OBs3so8Afgffx3iSwoAOoRDL4HPAuMdbvtszyzdLj9PRYGCu4KVGCJrvdW0RtwsJ2O5jZgKJ7K2+W2ZeIiIhIbpTTM3gBMAzYA7jP3V+vZIOy3L3dzO4E9gZ+llm1L/AKy8vRPEjMEt6HlAym3su9gdsyid8txGzoHUnJpZmtC2wDfLtan0NERESkUZWTDG4OnOTuN/X1zVOh50JJl/WAEWa2Z3o+xd1fBM4E7jWzi4HfEzOZvwEcUZgVnJLGHwCTzGwOkSQeTMxG3q/wfu7+NzO7GfitmX2XKKB9JjATuKyvn0dERESkvyknGZxF55eHe2sN4NqiZYXnBwGXuftDZrY7cXeRrwBtwHfc/dfZF7n7ZDMDOApYkygXs4u7P160/32ByUQPZwtxO7q93L05ZmqIiIiI9EI5yeDZwPfM7CJ3f7Mvb57uB9xtYunutxCXeLvbbjKR6HW1zVvEXVIO7VkrRURERJpXOcngaOBt4Fkzuw74F7CkaJsOdz+nr40TERERkeoqt2ew4LBOtukAlAyKiIiINLhyksENKt4KEREREamLXieDaYaviIiIiDSBsm9HZ2YbAdsRM4J/7+4zzGwIsBbwiru/W5kmioiIiEi1lHMHkoHAr4GvEzOBO4CHgBnAEOBJonbfTyrWShERERGpioFlvOZE4GvAKcDWZErDuPu/iVvVfaEirRMRERGRqionGTwIuMTdJwHPllj/JLBRn1olIiIiIjVRTjL4PuCRLtYvBFYtrzkiIiIiUkvlJIOvEPcR7sxEQDOORURERPqBcpLBPwLfTLOJCzoAzGwn4v7B11SgbSIiIiJSZeUkg6cDM4HHgN8TieCJZvYw8H/A48B/V6qBIiIiIlI9vU4G3f1N4OPAJGBN4B1gG2AVIlH8lLsvrGAbRURERKRKyio67e7vEMngpMo2R0RERERqqZzLxCIiIiLSJMq5A8klPdisw92/XkZ7RERERKSGyrlM/BnS7OGMQcDa6XEO8HYf2yUiIiIiNdDrZNDd1y+13MxWAg4FjgZ26FOrRERERKQmKjZm0N0XufsvgduBX1ZqvyIiIiJSPdWYQPI48Kkq7FdEREREKqwayeAOwIIq7FdEREREKqyc2cSndrJqJNEj+BHg7D60SURERERqpJzZxKd3snw+8BxwGHBxuQ0SERERkdopZzaxClWLiIiINAkldiIiIiI5Vs6YwfHlvJG7zyzndSIiIiJSPeWMGZzBe+9A0hODyngNZnY3sG0nq09w97PN7HTgtBLrv+fuk4v29xXgRGB9Yozjme5+dTltExEREenvykkGDwaOAtYF/gD8My03YF9gJnAesLQSDQQOB0YULTsgLb8ls2whcau8rBezT8xsT+ByYrbz7cAewJVm9qa731qh9oqIiIj0G+Ukg2sDLcAH3H1+doWZnQY8AKzl7v9dgfbh7k8VLzOz84An3f2JzOKl7v5wN7s7C7jW3U9Iz+8ys02BMwAlgyIiIpI75UwgOQy4qDgRBHD3uURZmW/2tWGdMbONgC2BK3r5ug2ATYCrilb9AdjSzMZWpoUiIiIi/Uc5PYNjgFW6WL9y2qZa9icuQf+haPkwM3sVGA08C/zC3c/PrN80PRb3NE5LjwbMqXBbRURERBpaOT2DDwPfNrOJxSvM7KPAt4G/9bVhXfgycI+7z8osexY4jhizuBvwEPDLNLGkYFR6fL1of4UeztEVb6mIiIhIgyunZ/BbwN3AI2Y2BXgmLS9cvp0HHFmR1hUxs62ADYFJ2eXuXnzJ+BYzAzjOzM5x97er0Z6pU6dWZD+tra0llw8bMZa2traKvEe9tdvobj9Lf/msPfks5aplDObOHc6sFxqvM7yz34c8UQwUA1AMQDGA2sSgnDuQPGVmHwKOB3YC9kyrXgTOBX7s7q9Urokr2B94B7iuB9teAxwIbAZMYXkP4Egg275Cj+G83jZmwoQJtLS09PZlK2htbWXixPd0sgIwe94Cxo1b0Kf9N4qWlqGMGzeu0/VtbW1drm8k3X2WctU6BmPGrM6aG5VVNrRquvp9yAvFQDEAxQAUA6hcDNrb27vswCqnZxB3nw18J/2rCTMbDOwD3OTub5axi6fT46bA9MzyzdKj96F5IiIiIv1Sn25HZ2YbmdknzGy1SjWoCzsCq9PzWcRfImoPTgNw9xeIJHCfou32Baa4e+NdLxMRERGpsrJ6Bs1sP6Jw8zpp0Q7AnWa2OvAgcLK7X1OZJi6zPzCXEvUAzayVKCbtwBAi4ftyakf2OuupwNVm9hzwF2B34LPALhVuq4iIiEi/0OueQTP7ItE79zTwPWBAYZ27v5aWf6VSDUzvuQoxS/gad19UYpNngaOB64mxgpsAX3P3H2Y3cvdrgYOIcY5/Jnob99PdR0RERCSvyukZPAn4q7vvaGZjgMlF6/9GhYtOu/u/ifqFna0vvvTb1b4uJ3oRRURERHKvnDGDmwJ/6mL9q4Du5iEiIiLSD5STDL5N13cg2RB4rbzmiIiIiEgtlZMM3gkcaGZDileY2TjgG8R4PBERERFpcOUkgycBawN/Bw4HOoCdzexs4EnivsFnVKyFIiIiIlI1vU4G3f0Z4BPEXTxOJ2YTHwN8H/gHsI27z6xcE0VERESkWno1m9jMBhG1BWe7+2fNbBTwASKpfF6Fm0VERET6l96WlhkIPAccB/zU3ecT9/0VERERkX6oV5eJU8HnNmKcoIiIiIj0c+VMILmUmE08tNKNEREREZHaKucOJP8EBgHTzexy4HlgYfFGVbg3sYiIiIhUWDnJ4BWZ/5/SyTYdxD2CRURERKSB9SgZNLPzgMvdvRX4dFq8CtEjuKRKbRMRERGRKutpz+C3gIeBVne/x8zGEPcg3sHd76la60RERESkqsqZQFIwoGKtEBEREZG66EsyKCIiIiL9nJJBERERkRzrzWzi95vZ/0v/Xy09bmJm/y61sbs/0qeWiYiIiEjV9SYZPCP9y/pFie0GEKVlBpXbKBERERGpjZ4mgwdVtRUiIiIiUhc9Sgbd/fJqN0REREREak8TSERERERyTMmgiIiISI4pGRQRERHJMSWDIiIiIjmmZFBEREQkx5QMioiIiORYb4pO14WZHQhcWmLV+e7+rcx2OwE/BDYDXgJ+7u7vKYptZscCRwBrAdOA49z9jio0XURERKTh9aeewc8BW2f+TS6sMLOtgRuBx4CdiOTx52Z2WHYHKRGcBJwP7AI8A9xsZlvU4gOIiIiINJqG7xnMaHX31zpZdyrwqLt/PT2/y8zGA6eZ2UXuvtTMWoCTiR7DyQBmdg/wJHASsHeV2y8iIiLScPpTz2BJKcn7DHB10ao/EJeCP5KefxxYDbiqsIG7LwGuAXYyswHVb62IiIhIY+lPPYNTzWwsMBO4DPihuy8GNgSGAE8VbT8tPW4C/B3YND1/usR2qwDrALMq32wRERGRxtUfksGXgdOAR4AlxJjAU4ANgAOBUWm714teNz89jk6Po4B2d1/YxXZKBkVERCRXGj4ZdPc/A3/OLPqLmb0BnG5mZ9WpWQBMnTq1IvtpbW0tuXzYiLG0tbVV5D3qrd1Gd/tZ+stn7clnKVctYzB37nBmvTCnZu/XU539PuSJYqAYgGIAigHUJgYNnwx24hrgdGI8YOFy8MiibQo9hvPS43ygxcyGuvs7XWzXYxMmTKClpaW3L1tBa2srEydOLLlu9rwFjBu3oE/7bxQtLUMZN25cp+vb2tq6XN9Iuvss5ap1DMaMWZ01Nxpfs/fria5+H/JCMVAMQDEAxQAqF4P29vYuO7D6/QQS4DngXZaPCSzYLD1OT4+FsYKltnuLqE0oIiIikiv9NRn8EtBBlJtpB+7kvaVh9gVeAR5Nzx8E3gD2KWxgZoPS625z945qN1pERESk0TT8ZWIz+zOR7E0FlhITSA4Hfuvuz6fNzgTuNbOLgd8DnwC+ARzh7ksB3L3dzH4ATDKzOUSSeDAxG3m/Gn4kERERkYbR8MkgcXn3a8D7iPY+AxwH/Lywgbs/ZGa7E3cX+QrQBnzH3X+d3ZG7TzYzgKOANYnxhru4++PV/xgiIiIijafhk0F3Pxo4ugfb3QLc0oPtJpO5lZ2IiIhInvXXMYMiIiIiUgEN3zMoItWxdGkHs+c1VumiYSPGltWm4S2DWXXlIVVokYhI81MyKJJT7YuW8OATjVXoO2ot9j4Z3H7L8UoGRUTKpMvEIiIiIjmmZFBEREQkx5QMioiIiOSYkkERERGRHFMyKCIiIpJjSgZFREREckzJoIiIiEiOKRkUERERyTElgyIiIiI5pmRQREREJMeUDIqIiIjkmJJBERERkRxTMigiIiKSY0oGRURERHJMyaCIiIhIjikZFBEREckxJYMiIiIiOaZkUERERCTHlAyKiIiI5JiSQREREZEcUzIoIiIikmNKBkVERERyTMmgiIiISI4pGRQRERHJscH1bkB3zGwv4MvARGA08BzwK+BCd1+atrkM+GqJl+/l7tcV7e9Y4AhgLWAacJy731G1DyAiIiLSwPpDz+B3gXbge8CuwPXAecCPirZ7Hti66N+d2Q1SIjgJOB/YBXgGuNnMtqhe80VEREQaV8P3DAKfd/c5med3mdkqwLfM7GR3b0/LF7r7w53txMxagJOBn7v75LTsHuBJ4CRg7+o0X0RERKRxNXzPYFEiWPAYMJS4bNxTHwdWA67K7HsJcA2wk5kN6Es7RURERPqj/tAzWMongXnAq5llG5rZ68DKwFTgbHe/OrN+0/T4dNG+pgGrAOsAs6rSWhEREZEG1e+SQTP7KHAQcEbq2YPoKZxCJHarAQcDV5nZMHe/LG0zCmh394VFu5yfHkfTy2Rw6tSpvf8AJbS2tpZcPmzEWNra2iryHvXWbqO7/Sz95bP25LOUq5YxqObn6Ity2jR37nBmvVDqIkL/1Nk5IU8UA8UAFAOoTQz6VTJoZmsBfwQeITOBxN3PLdr0BjO7EzgDuKxa7ZkwYQItLS192kdraysTJ04suW72vAWMG7egT/tvFC0tQxk3blyn69va2rpc30i6+yzlqnUMqvU5+qLcGIwZszprbjS+Ci2qva7OCXmhGCgGoBhA5WLQ3t7eZQdWw48ZLDCz1YBbgQXAbu6+qJuXXAuMN7Ox6fl8oMXMhhZtNyo9zqtYY0VERET6iX6RDKYE7kZgDeBz7j63jN0UxgpuWrR8M+At4KXyWygiIiLSPzV8Mmhmg4kZv5sDO7n7iz14zQCiVMyLmdnIDwJvAPtkthuUtrvN3Tsq3XYRERGRRtcfxgyeD3we+D4w3My2yqx7irjMezlwJfAsMJKYQLIdcEBhQ3dvN7MfAJPMbA7waNpuQ2C/qn8KERERkQbUH5LBHdPjj0us+zTwBNHjdzJxGXkRkejt5u43ZTd298lmBnAUsCYx+3gXd3+8Ok0XERERaWwNnwy6+/o92Gz3XuxvMjC57AaJiIiINJGGHzMoIiIiItWjZFBEREQkx5QMioiIiOSYkkERERGRHFMyKCIiIpJjSgZFREREckzJoIiIiEiOKRkUERERyTElgyIiIiI5pmRQREREJMeUDIqIiIjkWMPfm1hEpDtLl3Ywe96CejejIlZZbfV6N0FEckbJoIj0e+2LlvDgE231bkZFfPj9w+vdBBHJGV0mFhEREckxJYMiIiIiOaZkUERERCTHlAyKiIiI5JiSQREREZEcUzIoIiIikmNKBkVERERyTMmgiIiISI4pGRQRERHJMSWDIiIiIjmmZFBEREQkx5QMioiIiOSYkkERERGRHBtc7wbUmpltBPwC2AZYCFwFHOfuC+raMBEREZE6yFUyaGYjgbuAF4E9gTWAnwJjgS/Vr2UiImHo0GHMntf/v5sObxnMqisPqXczRKQHcpUMAocCo4APu/trAGa2GPi9mZ3l7tPq2joRyb1FSzq4Y8rMejejz7bfcrySQZF+Im9jBncG7igkgskfgXZgp/o0SURERKR+8tYzuClwSXaBu7eb2XPAJr3YzyCAd999tyKNam9vL7l88aJ3GTxwaUXeo96WLF7U5WcZutKAfvNZu/ss5ap1DKr1Ofqi3Bg04mcp19Ili5vis7z7bjsvzS7vHNmyymhemv16ZRvUB8OGDGbl4SvV/H07+9uQJ4pBZWKQyVcGlVo/oKOjo89v0l+Y2SLgFHc/u2j5/cCr7v6FnuyntbV1G+C+KjRRREREpFo+OXHixPuLF+atZ7BSpgCfBF4GltS5LSIiIiJdGQSsTeQv75G3ZHA+MLLE8lHA9J7uZOLEie3AezJrERERkQb1XGcr8jaB5Gli3OAyZtYCbEgvkkERERGRZpG3ZPAWYHszG5NZ9l9AS1onIiIikit5m0AyEpgKzADOYnnR6TvcXUWnRUREJHdy1TPo7q8DnwH+Dfwv8DPgauBrdWyWiIiISN3kqmdQRERERFaUq55BEREREVmRkkERERGRHFMyKCIiIpJjSgZFREREckzJoIiIiEiOKRlsEGY2oOgxdz8bxeC9FIOgOCgGoBiAYgCKAVQ+BrkPaKNw945UFHukma3s7ksBzGxQfVtWO4oBmNkQM1vXzDY2s5GZGAyod9tqLfuZC3EoXt7sFAPFoFheY6DjoLoxUJ3BBmBmewN7AbsALwLPAH919/My2wzM/vCbjWIAZnYgsC+wA/Av4A3g/4AfuPuCtE1TxyDLzFYHNgG2BZyIyTPuPi+tH+DuTX0CUwwUAwAzGwdMBHYG/gnMAh5095fS+qY/L+g4qG4MlAzWmZltADwG3AHcBGwBbAZsTtwp5SR3v6Z+Law+xWBZDJ4AriHujmPAB4E9gAHAKe5+ftq26U96AGZ2K/EHcAEwHngZeJBIkK9y9/Y6Nq8mFAPFAMDM7gPeD8wF3kecF9uAG4BzM18Wm/bcoOOgujFQMlhnZnYpsDqwT+YXeh1gO2A/YGvgT0Qy0FavdlaTYgBmdgGwAfAFd1+Ylo0EPgR8FTgQuAc41N2frVMza8bMziJ6io8A/pEWHwHsDqwK3A9c4O5/r0sDa0AxUAwAzOwMYB/gQHd/2MyGAfsDOwH/AcwAfuTut9WvldWl46D6MVAyWEdpLNylwEjgi8BSoCMzTmwjIhHYB7jJ3Y+pU1OrRjFYNt7jXOIb3w7uviB72cfMxgK7AUcTl4gOKCTNzcjMhhKJ71/c/eSidesC3wb2JGJxuLs/22w9IoqBYgAxhhi4GZjq7t8pWjcW+BKRGA4krqDc3oQx0HFQgxgoGawzMzsJOAyY6O6vpmUrufui9P9BwPeAScBe7v7HujW2ShQDMLOvAj8Gdnb31rSsOAZfBi4DvunuF9arrbVgZtcBS9197/R8JWBJJkHeAbgCeA7YtTBmppkoBooBgJn9BtjQ3T+dnhfH4GPARcBQYDt3f7luja0SHQfVj4FmE9ff5cT4j/vN7FMA7r7IzAaa2RB3X+LuZwN/Bz5az4ZWkWIQYz6eA243s8/DshgMSknhEnf/H+DPwEfq2dAauRf4opkdYmaD3H2Ruy9NJ0Dc/S/A54jB1J+pZ0OrSDFQDADuBLY1s9PTFYPiGPwN+E9gFHF1pRnpOKhyDJQM1lH6xZ4FHAq8Bvwq/cKv4+5L3f3dwnbAC8AH6tjcqlAMgrvPJWYSPwhcZmaXmNmGKQks9A4OAF4F1qtjU2vlUuBq4vLH0WY2HpZ/SUjbzCAS6C3q0sLqUwwUA9z9D8CPiOEy55nZh9PywpfFgcSEgieATaw5S63k/jigyjFQMlhHhe5dd78XOAmYDnwFuMrMTjCzUWkixReJwcJX1a2xVaIYLOfuLwLfJ37pPwHcbWbnmtkmZvYfwEHE7OLf1K+V1WOZYuPu/hZwKnFimwSca2ZfMLO1fXkJjSHEwOnX69HealAMFIOsTGL3S+LqwR5EDI4xsw+kL4tLgRHAOkBbs4yV03FQ2xhozGCNmdmqwJZEnaABwAvufmla10LMGv08MUtsTaInaBFwu7t/ox5trjTFYNlM4c8S3fmrAE8CV7j7S2l84GeJGOxE9ATOBd4CbnT3o+vR5lpIcRkILEonP8zsIOC/gcHAQ0R9rZeBXYH13X2D+rS2OhQDxcBi4siawHBgji+vI7cH8YVxHWAe8eV5FnEuHenuG9elwVWS9+MAahcDJYM1ZmbXELNGVyH+wI8n/sj/DPiFuy80szWJy6FrpvW3AS96KjnS3ykGYGY3ABsTyXAb8GFgEPA/wPnuPt3MVgFGEyf+dYCHiT8MTVdPy0oXHb/H3X+S2eY44FPE8TAO+D3wB3d/uPYtrjzFQDGATovP3wqc4ctLbx1EnEM3B9YCrgSud/fH6tHmStNxUPsYKBmsITPbnZgs8WV3v9nM3kckBHsRl0afB45Il0yLX9sUU+UVAzCzXYlf2t3c/R4zWw0YC+wNHAu8DZzo7r8r8dqmiEGWdV10/G3gdHe/Im27CpFAd7j7v+vT4spTDBQD6Lb4/EAiIfx52naou79jmaoDzUDHQX1ioGSwhszsZ8Qlvz19xfsKDiUum55GFFo+AZgMDHT3JXVoatUoBmBmpxDf5nYpTJDJrFuX6P7fD/ipux9bhybWlPWs6PiNRNHxf1nMpGu2Y0IxUAywnhWfvw84zN29SWOg46AOMdAEkhrIDAKeSSQB66XlgwHc/R13vw84BLiEqLk3oZkOcMVgBf8CPk1841+Bu/+LmFl9KvBlM9upxm2rqTQ+chDQASybHenuL7n774lC2xcQE2q+A9Bsx4RioBjAsnPkYmIySHbiwOvp3HgCcW5YHZhkZsObMAY6DuoUAyWDNZC5rHcv8ct+RFq+2MwGpx8+7v48cDwxI+g7pfbVXykGK/gr8BRwnJmtBSsky7j728BPgFeAb1pzlooAlp3EnJgsNKowO9KW1856hugt/i1RTqHp6qgpBooBLDtHthJjpTdNy7IxmEMUnT8H+C/ggPq0tHp0HNQvBkoGa8jjzhJnAUeZ2S1pSvhid1+S+UHPA64FVre4B2VTyXsM0pi/WcAPiZlf15vZR4vHAaZLRFcTZQJG1L6lNaWi44oBKAag4vOg4wDqEAONGawyMxvs7ouLlu0PnA6sDfwUOMuXF1dehRg8vMDd96xxc2smxeBMYrZw08egMNi7aNnWxGf/f0TtwJ8Ar7j7m2a2OjGzeJG7717zBtdIuvyxNJ3wzgZWI74IXOzuL2W3I2ZMDnT3verT2spLvb4DUgy2IW5JOAK4jpzEAHQcZFkUE/4lcRnwBuCH7v5cZv0AoodwTXf/XF0aWSV5Pw7qeT5QMlhlFvfdvdbd/5lZthIxAHTf9K8D+BPwDjF77EPAVumSab9nUTtwbWCEuz+Rlg0gkqCvETdbX0pzx+BnxHHwYGbZAGAjoqD2YcAawN3EZfS1iJh90t1fqHmD68DMPg18i7g88hJwC/Brotbax4nLIgd5k9yb2sxGe9H9Q81sW+AootRQG00eg1JyeBwY8LxnZgSb2QeJIvO7ECW4/hf4FTCMiMvPgK+7+3W1b3Ft5PA4qOv5QMlgFZnZN4ALiZuMv2BFZUHMbCwxXXxbYE+i1t6TRNJwRz3aXGlmth1x+5ztiGEJc0k1sYjxMS1EvaztiaSoGWNwMPELbO7+XInjYDjRQ7odkRi/QVwq+l93n1KHJleVqeg4FneUuRfYvPjcYDGz/mvAzsQfgbVozhjoOIhbyz0ATPSoLZo9DoYQyeAONHHxeR0HjXE+UDJYRWY2B/g5cHYaEzeA6PIdD8zwVE08s/1q7v5G7VtaPWb2EnGj9b8RB/A2xIG9gPim+xuPGbSF7ZsxBnOAc4H/TsfBYOKm8psD09z9laLtm6puWDFT0XHM7AHiXtz7FA8fyGyzFrAhzRsDHQdxHMwBvpQ9DrLDiyxKy6xGFBVuuuLzOg4a43wwuBI7kfcys58Ss0Ev8eXTvo8Gvk5k9kMsagkdX/hhFpKg4p6j/srMDgPaiQLKhYTvGjM7mZgRdwrwSTM7uHA5uAlj8FPiNkG/zRwHpwH7E7/UQ83sWuBUd/e0fvF799QcLIqOf5bSRcdPAw4ws0LR8dlFr22KWdUWdxb4MHGZpz0tW5vo/RhJ9I5c6e4ziXNI9rXNEgMdB8uPg0+w/DjYmJgpvE4aE/Zzd3+WuNfsi5nXNksMdBw0yPlAPYNVkC7/zgYmufvJadmpRDJY6CV7H/Bl4lvhbh7TxZuKmR0AnAx8xN3fTgfusiLSaQLFJcT4h8+5+9P1a23lpUkgrwLnuvt30rIziLI61wH3EOMCv0l889/Z3f9ep+bWhKnoOGY2E/i9u5+Qnm9L9BxvDrwJLCTuMvAjd7/YmrOoro4Ds1nA7zLHwWeIqyXrE+PDWog7E51BTCJpuj/WOg4a53yg0jLVsQ3xLW4/MzsvjQv5NlFIeF93P4c4yA8nuv73rldDq+xZ4hf9MDMb5u4d6TJpoYjmQ0Q19QVEdf1mY8AjwDfM7EYz+xjwDaJH9Ah3vxL4BdFL2A4cXLeWVpmp6DgAZvZN4ovgSma2WVp8EfACMWZqDaK+5kzgRDPbuJlioOMgmNmXiXP/+y1mjQKcDzwO/D933wDYB7gKOIY4NpqGjoPQSOcDJYNV4O5/Iga93kp0gT9AFBm+pjAWzKOq/DXEfSi3slR0uclMAX4HfBfYM02UwFMRzfT/x4iBs582s6app2dmK7v7A8RxcDox1uMh4u4jNxR+od19kbv/jYjVZmY2rFkuf2S5io4X3E3MCNwfmJyGCKxE/I7c7+7vuvtVxHCStYk7TjQNHQfLPETcReKDwM/M7G6iJ/B4YgIdKRk6iojTV+rTzOrQcbDM3TTI+UDJYJW4+z3EN7pTiUKidxBdvsu+FaXZYs8TYzebJhksfMNJA6CPJu6leSnwSzPbxFJx6YwnieLKzfSt7ydm9ll3n050+R9CXOa4iRj/kz0OBgOzCi9sxstBBZ7zouPu/rS770oMDVgD+AIxZOAld+/I9Iy8QMy2X7lJvxzk9jhI46Gfd/dvAUcCM4irCNcTdUaXZpKh+cTVhSElzpv9Xp6PA2is84HGDFaQxRT5CcAniVmiN6fl44CR7v5Uujy6NC1fj0gSL3f3s+rV7kpKk0Z2JC77/jvzWQ8jLo8OIwqm3k3ccmdz4EdEr+nxdWhyxZnZocTYn+eJYQFT0vKRxO2FXig6DjYgblH3O3c/vT6tri3LWdHxzqTfi8Xu/pui5WOI5OAxdz+qHm2rNFMB/kJB6TW9qGSUme1DjIe7smj5KOI4cHc/pGYNrTGdD0I9zwdKBiskfZO7lKgF1AGMIS4Pf8lXrBo+KH3r2ZAYB7Gvu7+vHm2utPQt5m3gJHefXGL9eOKy6beIntAWooTAXe6+Xw2bWjUpBnOJX9ytidlfX/WiwtGZ42ATotfwi+6+Xq3bWwumouPAshP6Ku7+YtHyAakXYCWPW04NIeqOXkbUpmyKouOmAvyY2WPEbNnt3f3h7GSAEsdBC3EcXEpzHQc6H9B45wMlgxViZucRs57OIsYB/gfR43Wfu+9ftO0goqL8x4DD3P36mja2SszsQmAr4kT3Wlq2CZEgrwrcBTzh7q+b2W7AfKLA8j+9k9pK/U0mBtsSJSIuJi5xHFCiV2Qg0SO4CXCou99U4+ZWnanoOGa2BTFR6gBiqMhjwMleogB52v7bxBem6939ezVvcBWYCvBjZl8lErvZxFWRA9z9X6V6TNP2pxBfnq9z9+Nq2tgq0fmgcc8HSgYrwKI20hTgSE+3B0o9RMcAk4BPufuDtmJV8S2A8c2SAJjZB4DpxCXie9JA4G8RA6DXJWYMjyQGyx6W7S1tFma2EXGS/0IhwU9/BM8l/gic6O5vFF0iHgP8h7v/tU7NripT0XHM7ElgHvAoMUxie+AfwFe8qGCsma0LfI8oSv6VZhk/airAj5nNI+41+wJwBXAjUV/vPV+EzWw0kTSNAw5pouNA54MGPR9oAkllfJYoJfMMLOvmXUyMfZhB3FKITCL4PqKCfFMkgsmxxPE0OiWCo4kK8jcR3/Y3I7r/twJusKim3mwuAm4mZpEXXAdcTcwG+yJAJhEc5O5zmzgRzBYd/6W7X5PGu4wjLgGdAlxuZu8vvMYzRcfr0eZKs6gruRT4hketycOBnxDHwr5pm2WTx9IfwnOAo5ooAeisAP8DxNWCl8zs3OzkgCY8Dn5K9IL9xt2vJnp6diQmmg1L2yz7e5wmTZwNfLuJjgOdDxr4fKBksDL+TdxKZwZE0pd6fxYTycCyQa8W5VNuIsaJNYV0EruR+FxXm9mfiG++twI/cPdHgFfd/Vric38I+Ei92lsNaRzM08SJbtltotx9vrsfRCSEF5rZ18xsYPrC0Eyzp0t5m7h/5jyIE3pKgN9090OJOy+sDdxlZptmX9gMfwDT7/qOwP8Qk4lw96Xu/mvij9+X07JlY8bS8395zCLt99Ll36OJckovp2WnEn/4pxNDaS4lYvFo6l1fpkmOg3WJ0iknFYbPEJdGryBmkR4Iy78kFrj7QndfUMOmVpvOBw18PlAyWAEe9QJ3TZcACz/Awi/2bcAHzGyr9HwvorbUebVvaXWkA/oWYgzEgcAGRG/gLaQyKhlPEL0E69awiVXn7u3ufri7P5ldnvmWdw4x9uVE4IPNcHLrgbwXHV83/Vvsy+8zWzjn/gH4qJl9MLP9p8zsZmuu8hkqwB+fdRpRYqxw5ehNdz+MSIQnm9kh6UtiM/9N1vmggc8HzXzg1ZS7z0iPxX/kHwJeArYzszVYfmuhebVtYfW5+5tEkendiMvGj6Re0sLsqAHERJJBvDdJbEq+vLj0VOIP3WLgT2b2H3VtWG3ktug4gLtPI7703QHLkoDCl8T7ibqan0vrRhC9ZaOKxw31Zx4F+A8gxwX43f0bxK0mF6TnHZkk4GdED+nxwCbFvYNNppXoDc3z+eB8GvR8oGSwitIPu52YMfp54DhgkLufUd+WVU/6tjcTuMjdH02LC8fZCKLncEkaN5Mb6XLIs8SkonWJnpCmlOkdLxQdv5+cFR235UWDf5S+CKzwRdHdZwN/BnZNi3YnZlh+sbYtrR5bXjD3fuLuESeTvwL8QwDc/ZXs8kzyM5X4mS8E/s/ilpVNyaNu4NHAg+TsfFDg7pPcfWrqCW2o84FmE1dRpkfs88TU+QFEPbk/1bdltZe+BR5J/FE4xN1vrHOT6sbMjgGedPe/1Lst1ZKSoYGF3h8zO4T4pjucqJd1D01adLwgJUNLOhsSYGb/RfSUfAr4PXCTN0kpmYJMQli4LLYGMMbdn7YmL8BfkH4XlpY6DgoxMLPdiQlojwL7NdGY0c5uxPBN4CSW34Sgac8HRTF43N3/nJYvO/7T87qeD5QM1oDFnSceB55298/VuTl1YWYbE5cM72qWX/LeshI1pJpJ+qN3EnBxZrLASpmE8P1Ez/A3iS9GQ2m+ouOlYrCssHBmu4HEDeqnECU21nL3sbVubzV0dhwQiXH2j18zF+Dv0XFQ9JpvAke7u9WomVVlnd+IYW93fzn93L/K8qskzXo+KI7BvcSx/nJmu7qfD5QM1kg6Ga7s7q/Xuy31kn4xBntmtq00DzM7G/g+cZnnUuBcX15OqbjI8OeJcaNvAdO9eYqOdxWDUknhX4HPAHu5+x9r3d5q6CYGKxRYtuUF+LciCq9fX/sWV15vjoPMFaSViF7TV0rutJ+x0jdi+DFwr7/3RgyF88GbxK33muV80OObUaTt63Y+UDIoIn2WSojcS5SNmE9c7pkBTC4MCUjjw4Y065eBXsRgoC8vH7EJccvK0+vR5krrbQzS/ycQBfhvrk+rK6uc46DZWPc3YtjW3R8o/nLQTHoQg1I3o9iU6DU8tdbtHVzrNxSRpvQRYC1i9uyVwMHAHsAFZrYvMMmj7E67mQ0lak22AA800aXznsZgiUVdyg8Tl46aaYxcb2IwhEiUBhFlqJpFOTEYSnP9LpS8EYNF8e1DiMumD2TGkq5LJMcvdrbDfqi7GOwCPJhJBNcjekdPq0djNZtYRCphOjEQ/JpUKmQyMWHoSqLn5wYzOzuVTBhO1NXauYn++EHvYrAyMVh8jybrHepNDFZJy3fP8XFQiEGz/S709kYMN5KKbzeR3sbgemKoRF2OA10mFpGKMLMh7v5udtJIWv45opjsNsAbRN3NHYhB0k0xa7JAMVAMQDEAMLP13X1GifHC2xEzxz/h7g+b2deJ+xKv5U1Wf7c/xUDJoIhURVHpkBHAF4ib0m8DHOvuP61n+2pBMVAMQDHISkMkngEuAC4hyulc7E1cf7dYI8ZAyaCIVFWmhMhg4CpgC3ffqLvXNRPFQDEAxSAzc/oSwICHibqKa9e5aTXTqDHQBBIRqar0x28g8J9Ej8ie3byk6SgGigEoBhl/IsbIbU0T3XWnlxoqBuoZFJGaMLPxwP7uPqnebakXxUAxAMXAdCOGhouBkkERERGpKd2IobFioGRQREREJMdUZ1BEREQkx5QMioiIiOSYkkERERGRHFMyKCIiIpJjSgZFREREckzJoIiIiEiO/X/icxdMgxke1QAAAABJRU5ErkJggg==\n",
+      "text/plain": [
+       "<Figure size 720x360 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAnYAAAFKCAYAAACQBBKyAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAACHR0lEQVR4nO2dd5wURdrHf7MLLDkHEUkCFiKY1hzOHFAx5/CennreneE8MZzhxHzmnD3jmdBTCZJEohIUlpyKuOSwwO4Cm0O/f1T3THdPdZye6ZnZ5/v56DLd1RW6u6qefup5noooigKCIAiCIAgi88kJuwIEQRAEQRBEMJBgRxAEQRAEkSWQYEcQBEEQBJElkGBHEARBEASRJZBgRxAEQRAEkSWQYEcQBEEQBJElkGBHEBkAY6wXY0xhjN2oO/YYY8xXvCLGWCFj7BMX6U5Vyz3VTzlJqM9UxtjUBMo5gzE2jzFWobarrd+8shnG2CeMscKw62GGMdaCMbaNMXZz2HVxA2NsDmPs+bDrQTQsSLAjiIBhjN2oCg3HhV0XIgZjrBWAbwHUA7gDwA0AykKtVIgwxgaoHwe9wq6LB/4OoAbAf8OuiEv+DeB2xth+YVeEaDg0CrsCBEH45ikAz/q8lkEIOJnG2QlcexiAdgCe5JyPDKg+mcwAAMMATAVQaDp3K9Lsw58x1hjA3QDe4ZxXh1wdt4wAsAfA7QD+FW5ViIYCCXYEkaFwzmsB1Pq8tirg6qSEBCf0zurf0iDqAoilQc551mn9OOc1YddBwgUAOgH4JuyKuIVzXs8Y+x+APzLGhnHOM/FjisgwSLAjiBSg2o9dDaAPgLcAnAmgAsCnAB7gnNfp0rYF8CqASwAoAEYCeEWS52MAhnHOI+rvHwEcCqAn51wxpf0ZQB/OeW/1dyGAqZzzG3VpDgDwJoCzIJYovwAwXlJu3LXq8akAwDk/Vf3dBMDDAM4D0BdAUwCLATzDOR8hv1P2SMroBWAdgAcB7ALwTwAHAFgE4G+c8zm6605Rs5nCGAOAT7U2MMaOBvA4gBMBNAFQAOBfnPMpurIfg9BwDQLwAIDzIYRE7Z6erbY3X73kVwD/5Jwv0OXxCdy/BxEAf4PQnjGIZzIfwBOc81906a4F8A8AAwFUAvgZwP2c83U29/FGAB+b7gcA3MQ5/0St56mc8166axQA76n5Pw7gQIj7fBvnfAFj7FYA9wPoDuA3Na+1pnId77MNFwPYyjlfasozrq7q8ceg6x/qsTMQe4Z5ALYCGMc5v0OXJg/iPboeQA8AOyGEyYc55+WmMq6G0CIOglgiXgLgBZNGeCLE0n8+gDku2kkQCZFWqnaCyHJyIASlXQDuBTANwFAAf9YSqJP5SAj7ry8APAJgf4iJ34mvISbV4/UHGWOdAZwKYLjVhYyxZgAmATgHQrh7GmLyTcTwuzWA2wDMgBB4Hoa4Bz8wxgYnkK+MqyCEivcg7lkvAN+ry3eAaM/r6r+fgbi/7wEAY+wUAL8AaA/gCQihLQ/ATxZOI8MhlnQfAfCamse1EM+2EkLIfAxC8PmFMdbfdL3je6DyPsSz2Kbm+TSAEgB/0BIwxv4J4HMI4XYogBcBnARgBmOsk/xWAQCmS+7HDepxO06A+Mj4r9pGBuBHxthfIITLdyDemeMAfKK/0Md9lpU910U6KYyxAQDGAGim1v3vEDaXJ+rSRAD8APEujQFwJ4RQ9zcAI9TzWtpHAHwF8fH1OMRS62qIPqSnQP17IggiBZDGjiBSR2MA33LOn1B/v8sYmwfgZogJEQAuhJi4H+CcPw8AjLF3ILQkToyE0P5cBWCm7vjlAHIhBD8r/gzgIABXcc6/Uct9H0JD5JdiCO1hdNmXMfYmgHkQQsi4BPI20x1AP855sVoOh7gf5wD4kXM+kTHWBsBdACZyzqeq6SIQAt6vAM7SNJ2MsXch2v4MhEChZwXn/DJdm1pACGCfcM7/pDv+IQAO4FEA1+qud3wPVEHnFgBvc85v1137iiZcMMZ6AHgSwGO6vMAY+xrAUghB6yHZzeKcr2WM/WK+Hy7oD+BgzvkataxiiPv3BMT9L1WPNwLwIGOsL+d8tc/7HEXNrw+EsOWXsyAEycGc85264//U/fsaAOcCOI1zPk1X/lwIAfosCEG0D4QwNwrApRJNaxTO+WbGWDWETSNBJB3S2BFEavnA9PsXCM2OxnkQTg2aoAd10njLKWPO+V4AYwFcwRjT9+2rIISRBTaXnwdgO4D/6fKrAPAfp3Jt6lOnCXWMsSaMsfYQWrzpiC1XBsV3mlCnoi1VHihLrOMwCK3TlwA6MMY6MsY6qvWcCOBYxlhz0zXvmH6fBaHB+1K7Xs0jV63HaZJynd6Dy9W/w8wX6pbZL4X4OB9uKrcUYslbVm6iTNGEOpXf1L/fa0Kd6bjWJj/3WU97ABGIjwW/aPW72NQ/9FwJYCWApaZ7Og1CM6fd00sg5s8n9UIdYHg+eooBdEyg7gThGtLYEUTqqOGcbzUdK4YQCjR6AtimCml6Vros42sAl0Fo/aYyxvaHWJp70uG6ngDWSIy73ZYrhTF2C4Tm6GCIiVnDV/w9Gzbof3DOi1W7sXby5FEOUv9+aJOmAwC9bdUa03ktj4kW15vvqZv3oA+A7SbNkhmt3BUW59daHE+EDabfmrC00eK41iY/91lGxOG8HcMhtKIfAHiWMTYZwmv1G9URSasnA1BkkYfmgNNH/bvUIp2ZCIJ/5wlCCgl2BJE6UuERNwbAXggt3VQIDUQO7JdhvWI1QeUC0C9JXQcxiY4G8ByAHRBevDfBuDQZBHUWx50EAU1z80/EbKHMmCf5Cos8bgSw2aE8ILj3QCt3MOTe0eZ6BoHVfXa6/37us55dEO+dTFC3ex+jcM4rVDu/P0BoqM+BsGO9hzF2sqqhzgGwDML+TsYWmzra0RbCCYMgkg4JdgSRXqwHcBZjrJVJa3eQ1QV61MlrFIDLGGN3QAh4CznnVlodfbmHMcZyTFo7WbnFEBOVmZ4waomuUH9fpF+eYozd5NySlKFp3/Zyzt3YMdrlUZRAHrI8z2WMdeKcWwk8WrkbOOfLfJSRSg1SQveZc17HGFsF1QPZhN37aM6nHuKDZyqA+xljfwXwNsSy9hdqPfMBTLJYUtXQ2nMIHBw6GGPdIDyAl9ulI4igIBs7gkgvxkL0y79qB1R7oNstr4jna4h4XzdBeCe60daNBdAFMdsuzVP2FknaNQCOU8OZaGkvgHBg0KNpcfSehAdC2CelCwUQnoz3qDtTGHDwLNWYAOGt+pD+nnjMw4xm6/iYJD/tfn4HcY8fNRvsq+mcbLq0+HtOy9VBEMR9ngHgKMnxNQDaMMYO1eXXFab3jDHWQXLtPPVvW/XvcIh+8FdzQsZYnq7uP0BoXh9ljOWa0pmfhWZPOhMEkQJIY0cQ6cVoiAns32qMtqUQ8bvae8hjAoQW42X1txvB7gOIWFufMsbyIZYUrwcgC2T8HwgBcDxj7BsIe6PrEW97NgpCEzJK1SJ2gwgbwQEc7qE9SUMNIHszRPiRZYyxjwBsgggxcwqEUGrrhMA536OG+/gCwHzG2FcQjig9IDwsl0Is03qp11Q1PtvfVA9MzYP4eIjYcc+onq3/BPACgJ6MsREQAmZvABdBCCmP2RQzH0IwfJCJ2IkVAH6zi3/nlyDuM4SX802MsUNMsey+hljq/4Ex9jqA5hCC2UoAR+rS/Uv1Nh4DsdNGOwB/gRBwf1TTfA7xbr+lLtv+qtaNQZg1XAERw3ENY+wJiPv7K2Psewj7wCMhQt7oP8TOUtvqO1QLQXiBNHYEkUaoS0UXQggJ10HELtsK4I8e8qgB8D2AVhATdaGLa8oBnAHgJwgB718QGob7JWknQIQrOQgikPLxELsCbDKl+1S9fgBEzLTLIBwp0mo7L875dAjN5mwIwfNNAH8CsBtCYHCTx3AIwWQDxL15HcKOcDnUeHk+uBnAPRCa0Ochnkl7CA9NrdwXIQT/aoi4ei9DaKqmQsRos6vzdojgx+0gBPuvEAviHDgB3OcxEHaaV5ry3QXR5nKI+/RHiLh/o03Xj4QwDfijWvbdEMLtiZzz9Wpe9RAfI/dBvLcvQIQ1OQ5iyXaRrtzH1bwaQ4R7eQpCAJygpVG17ZcD+Ix2nSBSRURRyFGHIAiCSH8YYw9CaOP6pOm2ZwYYY5dCaAH7SDyhCSIpkMaOIAiCyBReg3BE+L+wK+KSBwG8SUIdkUpIY0cQBEEQBJElkMaOIAiCIAgiS2jwXrEFBQWNABwAYFN+fr4syCdBEARBEERa4CS3NHjBDiKI5WoAJxcUFGxySkwQBEEQBBEiB0DsL90X8WGmSLAD0FX9+4ttKoIgCIIgiPShK0iwk7IVAA466CA0aRIXND5QlixZgoEDBya1jHSlIbcdaNjtp7Y3zLYDDbv9DbntQMNuf7LbXl1djZUrVwKq/GKGBDt126MmTZogLy8v6YWloox0pSG3HWjY7ae2N1wacvsbctuBht3+FLW9TnaQvGIJgiAIgiCyBBLsCIIgCIIgsgQS7AiCIAiCILIEEuwIgiAIgiCyBBLsCIIgCIIgsgQS7AiCIAiCILIEEuwIgiAIgiCyBBLsCIIgCIIgsgQS7AiCIAgiC9hStA/XDxuHouKKsKtChAgJdgRBEASRBYyfvR6l+6rxy4JNYVeFCBES7AiCIAiCILIEEuwIgiAIgiCyBBLsCIIgCCILUBQl7CoQaQAJdgRBEASRVUTCrgARIiTYEQRBEARBZAkk2BEEQRAEQWQJJNgRBEEQBEFkCSTYEQRBEARBZAkk2BEEQRAEQWQJJNgRBEEQBEFkCSTYEQRBEEQWEaFoJw0aEuwIgiAIgiCyBBLsCIIgCIIgsgQS7AiCIAiCILIEEuwIgiAIgiCyBBLsCIIgCIIgsgQS7AiCIAgiC1CUsGtApAMk2BEEQRBEFkHhTho2JNgRBEEQBEFkCSTYEQRBEARBZAkk2BEEQRAEQWQJJNgRBEEQBEFkCY2cEjDGrgBwHYB8AO0BrAHwDoD3OOf1unSDATwNYACAzQBe5Zy/IcnvXgC3A9gPwFIAD3DOJ5nStALwAoDLATQFMAXAnZzzQlO6fgDeAHASgAoAX6v5lbtoO0EQBEEQRFbhRmM3FEAVgPsAXABgBIDXATynJWCMHQ9gFID5AAYD+BjAq4yxv+gzUoW6ZwC8BeB8AKsAjGGMHWYq8ysAFwK4E8BVAPYHMIkx1lyXV1sIga8VhAA4FMA1AD5y0SaCIAiCyCoUULwTwoXGDsAQznmR7vcUxlhLAHcwxh7hnFcBeBTAPM75zbo0PQAMY4y9zzmvZ4zlAXgEQpP3IgAwxqYBWAzgYQBXqseOhRD6zuecj1WPLYbQFN4I4G21jNsAtANwOOd8p5quFsAXjLEnOedL/dwQgiAIgshsKN5JQ8ZRY2cS6jTmQyyRtlcFttMBDDel+RJiufVI9fcJANpALJdqedcB+AbAYMaY9iaeB6AUwHhdug0AZqjnoEs3SRPqVL6D0C4OdmoXQRAEQRBEtuHXeeJkALsB7ADQB0ATAMtMaTSNWX/178Hq3+WSdC0BdNOlW6G339Ol66/7fbC5TFV7uMaUjiAIgiAIokHgZinWAGPsKAA3AXicc17HGGunnioxJS1W/7ZX/7YDUMU5r7BJt0lNZ85LS9de99ttOlcsWbLE6yW+KCgoSEk56UhDbjvQsNtPbW+4NOT2p7rtO7aXAAA2bdqIgoKSlJYtg559OHgS7Bhj+0Esd/4OnfNENjBw4EDk5eUltYyCggLk5+cntYx0pSG3HWjY7ae2N8y2Aw27/WG0fd6mxQDfhwMO6I78/D4pLdsMPfvktb2qqspWGeV6KZYx1gbAOADlAC7knNeopzSNW1vTJZomb7cuXR5jrKmLdOa8tHS7db/dpiMIgiCI7IecYgm4FOxUYWwUgM4AzuWc79KdXgOgGjEbOo0B6t8V6l/Ntk6Wbi9E7DstHdM5U+jTrdD9Xm7OS3Xk6GNKRxAEQRANhgg5xTZoHAU7xlgjCM/VQwEM5pyv159XHRYmQw1XouMaANsAzFN/z4Twdr1Kl3euet14zrn2rTEWQhN3ji5dd4ggxGN1+Y8FcAZjrIPu2CUA8kzpCIIgCIIgGgRubOzeAjAEwP0AmjPGjtOdW8Y53wPgCQDTGWMfAPgCwIkAbgVwu+bdyjmvYow9BeAZxlgRhMB3C4SG7VotQ875b4yxMQA+ZIwNBaDlvwHAJ7qy34MIYDySMfYkhDbxZQDDOedmD12CIAiCIIisx81SrKY5ex7ALNN/RwIA53wWgIsAHA1gAoTA9g/O+bv6jNTAxA8BuAvCXq8/RCDihaYyrwHwI0Qw4m8hNH9n6rcK45yXQMTP2wfgewCvQMTS+5OLNhEEQRAEQWQdjho7znkvNxmpu0Q4LoGqwt2LDmn2QuwscZtDupUAznVTP4IgCIIgiGzHb4BigiAIgiAIIs0gwY4gCIIgsgCKdkIAJNgRBEEQRFZB0U4aNiTYEQRBEARBZAkk2BEEQRAEQWQJJNgRBEEQBEFkCSTYEQRBEARBZAkk2BEEQRAEQWQJJNgRBEEQRBagKBTwhCDBjiAIgiCyC4p30qAhwY4gCIIgCCJLIMGOIAiCyDh2FJfT0iNBSCDBjiAIgsgo+PrduPmpifjpt/VhV4Ug0g4S7AiCIIiMYuP2fQCAZet2h1wTgkg/SLAjCIIgMgxagiUIK0iwIwiCIDKSCHl/GiF5lwAJdgRBEESGQT4T9kQo3kmDhgQ7giAMlFXU4LclW8OuBkE4QgIMQcRDgh1BEAZe/KIAT338O7bvLg+7KkSCjPplDdZv3RN2NQiCSCGNwq4AQRDpxdadZQCA6pq6kGtCJMoHI5YgJwKMfPGisKsSKLQSSxDWkMaOIAgTYtokw/TsoD4LpSDNxo7eUYKIhzR2BEEYiE2aNGv6pbK6Ftt30VJ28shCaZUgAoI0dgRBGNCmTBLr/PP8f+fijhenoLq2PuyqZDX08WGExF0CIMGOIAgT2v6bNGn6Z8maXQCycxk0HaBwJ/ZQ123YkGBHEIQBsl8KApI8UgG9owQRDwl2BEEYIJEkcaLCcbjVyFroHbWHNJoNGxLsCIIwQkuxBEEQGQsJdgRBGCDnicRJB4WJks1qG/r4sIVuS8OGBDtCypChI/HcZ3PCrgYRAgpJdglDdorJhV5RgrCGBDvCkl8Xbgm7CkQYaNoQmjb9kwbasjSoApFislpLS7iGBDuCIAxEtSEk1/kmplGim5gMSH6xh966hg0JdgRBGKBlxAAJ8R42CNmH3lGCiIMEO4IgTDQIkSCpkEYpydANJghLSLAjCMIA7RUbBJqdYphVyH7hh95QgoiHBDuCIAyQx2HipINMlQZVSBrZ3DaCSBQS7AiCMEKSXcKQA0pyIa0yQVhDgh1BEAbqKdxJVpAOWsNkoaTDUncaksWPnPAACXZE4IybVYiSvVVhV4PwCXnFBkA2S1XpBL2jcqjzNmhIsCMCZUvRPrz9v4V4lnatyGBIKAGAD0YuxvT5mxLO59IHRuPOF6cEUCOv0HMkiIZIo7ArQGQXNbX1AIA9ZdUh14TwC9kvCUZNXwsA+MMRB3i+Vi9S1dTWo3DrnoBqRQAgmZUgbCCNHREoZDSe+dAzTJx0WIlNhzokm4b+8UEQMkiwIwjCiEKG6dlANst12dw2gkgUEuwIgjAQ09iRaEekJ1FzgXCrQRBpCQl2DZAxM9bhq594UvJWSNuT8ZBXbHagZPVaLEl2UrL5kROuIeeJBsi73y8CAFxzNktaGaTtyWRodiAyA4q1KIeG34YNaewIgjBAXrFZQhbL51mtjCSIBCHBjiAIA7SjWHAk6x5WVtVixLQ1qK9PLwnn2c/m4JWv5qWsPPr2IIh4SLAjAoW+pDMfhSS7tOfTscvw4aglmLVkq2WaMLrijIVbMHnuxqSXQ+MMQVhDgh2RFOhLOoPRHGDoIaYt+8prAABV1XWWabLbeYIgCCtIsCMIwgAp7DIHkr0JgjDjyiuWMdYXwL0AjgMwEMAKzvlAU5pPAPxRcvkVnPP/mdLeC+B2APsBWArgAc75JFOaVgBeAHA5gKYApgC4k3NeaErXD8AbAE4CUAHgazW/cjdtI4KFtASZD4U7IdKf7NAq7yguBxSgc/vmgeRHoy8BuNfYHQLgfACrASyzSbcWwPGm/ybrE6hC3TMA3lLzXAVgDGPsMFNeXwG4EMCdAK4CsD+ASYyx5rq82kIIfK0gBMChAK4B8JHLdhFJgsIQZC4KGdkFB93CpJAtAYpvfmoibn56YuD5Zvp9IRLDbRy70ZzzkUBUM3eURboKzvlsq0wYY3kAHgHwKuf8RfXYNACLATwM4Er12LEQQt/5nPOx6rHFANYAuBHA22qWtwFoB+BwzvlONV0tgC8YY09yzpe6bB8REPTFmPnQXrHpjxvFeDYrz7O4aQSRMK40dpzz+oDKOwFAG4jlUi3vOgDfABjMGNOmkvMAlAIYr0u3AcAM9Rx06SZpQp3KdwCqAAwOqM4E0bCg3UOyimwW0LO5bQThl6CdJ/owxkoYYzWMsfmMsatM5w9W/y43HV8KoCWAbrp0KyQC5VIA/U35GZaGOedVEJo9fToi1dCAm7FEtSE0a6aMEdPWYN2W0kDzzGatFtnyEoQ1QQp28yEcLC6GsHfbBOBrxtiNujTtAFRxzitM1xarf9vr0pVIyijWpfGSjkgVNN5mPNliv5RJfDhqCe56aarn62yfEWleCaJBEthesZzz10yHRjLGJgN4HMAnQZWTLJYsWZKScgoKClJSjhvc1MVrfbfsrgYAVJSXx12bTm0Pg0xr/7x5BYF5HWZa2/UEVfcg+9uu3bsAAOvWFaIVdkjTVFSLBQ/FQ75BoS8vGWVv3rwHALBt+3YUFFQGnn9QuG17UPeoqEjoSNZv2ICCvN2B5JkImdzvEyXMtgcm2FnwLYC3GWOdOOdFEJq0PMZYU865vje2U/9qb2IxgB6S/Nrp0mjp2lqkW+GlogMHDkReXp6XSzxTUFCA/Pz8pJbhii83AYB9XdykkdBmYwkwfgeat2huuDZt2h4SGdV+9dkfdZSVj5Q3Mqrtenz2Af21erE4yP42eflcYP1m9O7dC/n53aVp9pZXA//bgoiHfBPG1I5kPfs1xSuBhXvQdb/9kJ8/IPD8g8BV2xN5xyTMWrsAWF2Gnj16ID+/dyB5+iVj+30AJLvtVVVVtsqoVAco1mzrDjYdHwBgL4DNunRM50yhT6cX2Jab81I9b/vAo2BHBIOShLXY14fPx0+/rQ88X4JoEGShraQSjWMXckXSFRc3ZmdJBd75biHq6oLyjSTShaQJdqpQdiWA9aq2DgBmQni7XqVLl6umG88516SCsRCauHN06bpDBCEeqytmLIAzGGMddMcuAZBnSkekiGTYZ038fQPe+GZBgDkSRPqQkCOAzQRO/gWEHW9+uwBjZxZi4aqdzomJjMLtzhPNEQsz0hNAa8bY5ervOerfTyGCCq+GEMpuAXAqgBu0fDjnVYyxpwA8wxgrAjBPTdcHwLW6dL8xxsYA+JAxNhTAHgBPANgAo73eexABjEcyxp4E0BnAywCGc87tAikTyYY+pQnCFb4EMBLaiATR3rt6+gLIOtza2HWGsJfTo/2+CcAoCE3cI2raGgih7ULO+Wj9RZzzFxljAHAXgC4QIUzO55wvNOV/DYAXIYIR50HsMHGFfqswznkJY+x0AK8D+B6xLcXud9kugiCIUElW6I6sDgmSxU1LGfTtnbW4EuzU/VmdXoOL3Baq7jrxokOavRA7S9zmkG4lgHPdlk0QBJFOJFtGyeb5O9P3ig0T7c5l9QdAAyXVzhNElqNQ7CyCiOJG8Ij2GR+dpqH2MxJFEkd7N+leZh8k2BEEQYSIH4cjN5NxQ1DENFTBNlAawHvS0CDBjkg7vp20EisKww+uaWbx6p0YMnQkdu+pRHllDW4YNh6LVhc5X0gQNmjzarKWFbNxtTIqtGZh24LAzW3J0TR2DeELoIFBgh0RKLFJyn8en41djvve+CWQ+gTJ6F/XAgCWF+7G+q17UbKvCv8da972mCC8odRrS7HBSinJiCmZNkRNPkiy84v2umXxW9JgIcGOSAo04DYctu0qo6/+BNDuXI6HLuPKLi/6SLKvL5rftlUbi1FUbN6CnHADdd3sgwQ7gnCJzDGkoY+JfP1u3PrMzxg3qzDsqmQsUaE4G9dMk4x2y+55dTr+9NRP4VYmY2noo1j2QYIdESxZPEbo59/oHJzF7XXD5qIyAGJ5mvBHsuS6bH4100nLdNXDYzD6l7VhV8Mz0aXYNLqXRDCQYEcESsMwao5E25fVdkyuaOjtTxw/S7EabkweslkRGHbTFEVBeWUt3h+xOOSaCLwIaRTuJHshwY5ICmEPuMkm29vnFbof/lF8fA25C3dCU3ayqa9Pz3vsSZhPzyYQCUCCHUF4RD9o0tzpTF29gro0nQDTAe0d8qOxSwd2labeaSGqKQ9ZHants5qTgQ8vQqsOWQsJdkSgNJRBgpYx3PPk15vxwJvpF74mXYh5uGaecLBi/W7c+MRPmDx3Q2oLTg+5DnV1qmAXdkV8EB3DaBDLOkiwI5JCKiepyupaLFu3K+nl0ADoH76+OOwqpC0JOU/YXONnRwuvrN+6BwCwdG1qnWfSpStmtMZO/UtL9tkHCXZEoIQxRrz5zUI88Oav2FFcntRyNG2kYQhv4INitjY/lZNd9L3yItml2X0PSzgIW5zSbOwyUK7LSA0x4Q4S7IiMZ+2WUgBARWVtUsuJaVYiFLXdRLZNEqmUU5IW7iQlbQjnuaeLlimInXbCJk1uJREgJNgRhFcitLOGGW2iLa+swazFW0KuTeKkcq5Luo1dJksdToTcNP3HXjrgReClUJzZCwl2hC3fTV6F8sqasKvhDoex9bOxyzBu5rpAi8zmr10/WpHXhs/HM5/Mwcbte5NQo+wkEa9YO3mioTgyeWXj9r0YPpEHkpdsNxonxs8qxCX3j0qyp7iLGsWM7JJYDyIMSLAjbPlkzDJ8NHqp6/Su9rB0cb2/i+1PfztpFd7+bpH//PXQ564BTWOxY7ewc6yoSu6yeNJJpY2djz1d00VoC3v3Aj+a84fenoHPx69AWUXiH6yKj7XYD0YuQW2dgpqauoTLNzPxd8072fmBaPcuPd4kIkhIsEszdu+pxJChI7Fg5Y7Q6mAWrvzYrvldmsiEj8cI9HJdBlTYJ26eRVyaNFmSSpSULsUiSQb4KWhEWE87EbvE6trgBCo//V97zvVJHOw87UCRtFoQYUGCXZrB14uwAWNmBLtkmCmks5ikF3jTxaYmmfh5FtkSQiEM54mgheJsMOy3IpH3K9Bn6yOkjBYaJZnvmKudSdJ6tCUSgQS7NCXMeTGJq6GW/Hfccnw1YYWvwlM1ccUmykjoS1DpSjYKEclGE1KStles92w901CFBD/CcywwcPw9W72pJJCPIld5pJnjhx8mz92A1ZtKwq5G2kGCHeFMCvYd/ObnlfjyJ57e00NaVy4J+BGys8ZuJ/U2dnYT7NadZfhs7DJPk75fAWHx6p2uPZszUSYIss5+9vnVBHiz88TiNTvxj1emYeT0tQHUyzmNthScLs+wdF8VKqvdm/3U1Nbjla/m44E3aFcbMyTYEXGEOSmTBizD0TSZ9eFWI1FSuhQL5wn2yY9+w7eTVmHrzrKk1+ehd2bgmU/mJL2cIPCjbUr2s527fDse+2CWpWBttRSrOR6t3VySzOpF0a9ApAPXDxuP+153L6Rp0RqqazN8sEkCJNilHenRycIjsyS7TLcls8Ndy4ypssWpJKW1d6Gxq60zTl7p9tqluj5BlDfs/Vn4+ff1CeVRrz6WHN1M+sSHs1GwYoflOxTVapsaEajtnYtMEo1gkAwK1S3q3FCjCnRNGucmqzoZS6OwK0CkIYkYJrvQPrgpOp0GGw39QBi1kwmzQmkIbSzuneiSmIu0cbc19H4SbgUSKZ1vKAbfUIwzj+npO4/YNoOSmlj0AW1sM3vFan0nCG9ZNyHyYnsJh/4S+ULzbm7ahAQ7M6SxIxxJZcdPt+UBGRHdHclmASab2+ZESveKdaGx89Mbsvn5pU3TJCZ2Th99Vpq5aBiUAAIXe9KYBzTUVlTVYsjQkZixKDU7z9TUCI1dHgl2cZBgR8QRro2d90ju0WuDrYp9/ukrd4aCJpPEvIXTZur1R0rDnThrub1UR1EUzFm2Le2M44MkXZYRZa4TTlWKaubq5Ro7P11nReFuY5+T5PH9lFVYu7k0liSBsVbGtl3C/vOrCSsCytEeTWOXR0uxcZBgRwRKwur9dHbBlzrAZbgAY4uf0DPZsUSd2gDFgqDe+ZmLt+KJD3/DiGlrtJwDyTc9Cbdtsg8Yp1BImsbOvORqddyJ35dtw31v/IKxutinMqXfxz8uw99fnhr9HfQ+tzkp7vuaYNwol8QYM3RH0ox0kGeCULb4trFL4Ppk37qYPU2srCCWTTIZy3elYd8WT7jZK9bqlOz4rtIKAMCO4vKE6uWGsDW06TBeAuZ62As4VmYcfpdiNU3Zph37dEfdOE+o9QnqHqb4XdBKyQl8y5bMhwQ7Iq2ILbH4CGMQdGVsmMeLAACbi5IffiIs/IzP0Yk+wyU7N5NTRVVtdFINoiw373yllz14XTyCRAPihiVYpctKv2yfX6d7EtVsxe/H57MS6tUeBRz9h2oQ5ESXmAPK0CXpItwDQE1tHYqKK8KuBgl2hIwERs2ABtx06qwydu+p9H3tnGXbUhKPLAxioRwSy+fbSSt1S4npycPvzMCtz/yccD5uNCfaubtfmaZek3hH+3XhZvzjlWmYNn9zwnmFhb9xIjipULbPbyR2UooWGiWovWJlFiJe9nn2KhBakXLtbZoI93pe/nIe/vTUT6itC7dyFO6EcMTL4JmopiZmo5d+6CfgROr3xIe/IRIBRr14USD1ShZenqQm0EXflQTHtc/GLgcAXHxKn8Qy8okbYWvVxpJAyoqFO7EuzGqulNZPcZEGsaW7Tdv3OlXRkTScY1OCdJ/fqK2Z/K4EHRJIlo8b4Spo54lU29hphKW9raiqxZ6yanRp3zx67Lel20SdwqlSFNLYEXGkh42d+wxSLQRGEElYoxj0YLRwZRFqVC+xdCDjl2LVvyl5t3wUlsrJrK6u3marp2Du0LotpRijM/53Il3eLz/OE1b4H1Piv4Y9bBUbnKNaqm3sQn4FHnz7V9zy9ETDMc0+MmzFBAl2aUrYL21opEkYA1siAQ6GAbBmUwkeeW8mPhy1NNB8/QzQfie1sCneU4lp8zbFn0jBc9Y0drbOEz6qEdQE+8RHv+GKB8c4FJZYGXe9NBXvfr/I/QUS27YwkGl2E62R38em1/h21mmRRJ4ytZ6/cqzICVgTme6s2VQad0zb/zfs6YEEOyKORPplop068K/IANG3LZ2qV1pWDQDYXLTPIaVH/DhPOHgEmqmuqcO6LfEDZKp57D+z8eIXBdhbLu4lAl6msmPDNrGNUrLe+URznbdih3XeIfWD6apdYIUXZ5IkYohjF3UgkhO04KPP77B+HQEATRrlWKaJHUuOEJI6r9j0lSDDnh5IsCOSgt84dmHb2G3cvhfPfTYnbm9OPZE009hp6v+cEOqkDa0/z9kg/uHRxu6t/y3EXS9NRXECzihO1NbVY/Qva1Fn80x3lghPtjrV6DmRsDteeW34AgBOz89LwBPB/JVFvuuU7pTsqwIAbN0ZzMfM356fhP+MXOL5OkWmsoODZBcwbuxBZVUJ+iM66hWbciO7FJeXAZBgl2akg7iQyBdXon0skXAnQfDa8Pn4deEWrHYwjE+H56SRtC9vH9fE5Dp3V/P1xQCAfRU1Pkpzx8hpa/D+iMUYP6vQMo05TIssjEXSsS0qvWevsJbfWjVvEkg+G7fvw8jp9l7YKwp346mPfosutwHyp+KksbPCa9+J1kE3ZlrHlYw/YXae2FFcjh9/XeupbD2xrdIaho2dLbQUS6QdCYRX0jq1X1f+wINmeq6A3Sn9Wmzyq+KWoCPI+ylbI2KxD6YV0Wj7SfzML6usUf9aL9tZaZhTeUuDKipZd1LqKJCkspw48dD9AQAH9WiXsjL//ekc/LZ0m1G7rN4SQ7iTkAYvu1KlGjv14MzFWwEAw96fhfd+WIxSVRvqufxUhzuJFhx/qHRfVUIhqTIdEuzSlHS2H7Bjwuz1AIBFq3f6uj4aNDMkY3G+QWiQ6iSChn67NN9bpiUBTYjODTgCuy/nCY/pc31uo+SWhSuL8OtCsSm5l/YEHQrCDfYCQQI1CUjQCDs2lx7tgyC1MlT82CQLVRPVvKVqKdZNGqmNnfg7flYh1m/bg33l4gPIb18MOoyLW2rr6qO7rWhcP2w8/vj4hJTVYduuMkNg4rBnBxLsiDgS6ZfGbW38Fx62DVtVtU3okEh6OU/Uh+iJZVWmWyFKuz5ZGrtH3psZDQbtqoQ4bXXqbmpOTgQjp6/BkKEjsU9z4nAgmdUbY1qWs7M7TfWHqNX7Vbh1TxLLtDkpfQ6W66LyLHw+TJkphuSz1DYP2/HOJSkXaNWCNmzbixuf+ClUR5pbn/kZf3rqp+jvsOcvEuyynJraOluj8aBxOyFZUR+CpkSG7S4ACL/j6kmrpViPQUqTrbHT42Zi1pK42b81aCIRYMLsQgDArgSWkeKeic983v1hseG3dBzxa1CWIFHDf1PrErERc1um3slFu9dbd5bpQl2401zpu+u+8mp8MHKxdeIEkUc7UWx/e0Vrd6J9ma/f7es6T9vtZTkk2KUZQU/Olz7wIx58e4anaxJZ0pQtYfqheG8VZqhLaE4kQ6Bx2lg6jeQ6XRy0NKqUy9cgOhmkwpXO5r229DlNqY1d/HJeWpGO75epSl620vJcpMRlXz9WLlgpwsL4uU2fjl2ObbvKfdVP/2Fnda2djZ0Zv2YmQdnY3fv6L67SmUtJxcdhpkCCXQNgeaG/L6Aw0G8g/exnc0Krh0xI0g9Y6TTHRcOdBG5j5/0a6w3O7UmFttGdLZIW7iT1k8TiNTtdC7h2t3fR6uSEOQnahjMRErHF9V2mzsY2Vg/JeR9KzERWVfT3Qvv3JC38ULRu1l6xcf/OUPvuoJQK2QAJdoQjqRzOUzGoTJ8v2WHAjN1SbCS9nCeUNNLY+V2ZS4UnnV0Rcbcumja193RzkbAHNAu6+p/3v/ELqtXt42S1m7Nsu/Tawq17ErJDapQbX1pozusSIUscT957JPXYlxQXDdLtoSpePmw2bt8r/QD4fsrqqNZv9pJtsWDbFvWMX7JP7Glq5bUMKASNI6b6p0TrnyGQYJemNFStsrndyRioX/i8wDGNbIhL150ntPEsEnBvTuTOu31sKdW4eGhRzJ4qOXVJhOWFuz05CeyrqMGu0grc+eKUhLTgOTlpOF2kVGNntKEDjO9U9HAS67R6Uwn+9vxkfD91ta5i4k9dvRINtG1GvhQbrI2dFouyZbPG0WOP/2c2PhzlPfCzHxrqnCkjDXsqEQZlFTWoqhFagDA7SNxgE1JdFABTCzZG70k6E67GzviAYlVIfEkxcGzLstD8pJME7xNFAW58Qnjs8QTMMuyE3OkLNvvON12prK5FTW2s/0fN+gw2drF/axovpzdG9s67fc22qxq5lRuKUbBiO1ZuKLZ8rY1OHrKlWN2/9XXxK5lGHY5i189dvh0jptkHft5TVo3rh43zUZyxTWRjF4MEOwIAcPUjY/H3l6aEXY20+epatGonXvpyHj4ZvTR6TB9WINHQBEESRLiTrTvL4u18/MSxS+ONwN3Z2Bl/p6tY5/c9atI4N+CahINVnMFE37tq3YfcFQ+Owd+enxyXecRKYFIPx5wI7MvyM4ZEHaVyInjsg9kY+tp0y3KCtrl1jcdil67dhdJ9iUVTAGgpVg8JdkQUzb4nFUtwltfHueCHgxa2pXhvfBT2CCK+l+iSIfDUS76UvVBUXIE///tnfDJmWWB1ctvM1Bq/23jFupyMg6S6pg41tXKjeaf74ree2SLYaZjvU6LLieZYfZrNGqAzeTCUp6uL6V/JsBfWhJdcK7WhDr1g5yrcSVDV9ZhPUGOpX+eJiqpaTJhdmPodM5IICXYhs2nHXrzw+Vzb4J9mvKTNONKkb9U57ubgU2Pnsz62eeq+4v2gbSG0eI1xtxA/dY3ONy4vTpex1Erzk0zB87J//oi/PjdJXh9dwdt3l8cF/vYbL1Im2A3/eaWra1P5rGYu2hK3m4C8Lql17QKsnSe0Dyu3fcDoZW+/bKqxUt0ZxyC0WaQ1LMVKzhvkoDTph4ngV2P37veL8Oa3C7FsXeZEj3CCBLuQefXr+Zg+fzNWbSgRBxzGqR27y3HJ/aPx02/rk1cp08CSysC3cXYSIc380RAiufLB0fctScZSrGSJyA9ubn3pvipTmASLvNJwpvBy64MSHJy0ANt3l0uP19crUW3eUx/9Fn/e5+3Naxwb8s1V27G7PC44bBjLW+WVNfj3p3Pw1Ee/YUvRPmwpit/NxkrwTuZwYbctl6iM+schSLesm7p9y0b9sjYuD8t4dAYB1NktNrBhPuDporqmTq6NM3vFenj4C1buwIX3jsTe8upoH5Rd/9VPHA+/4y0ObDpAgl3INMoVj2BywUZX6bUv918CMFYeN3MdPhq9NNStWMzECRfhVCMm2ElGO2Fj5y9fd3Zeird9TaN19Vcnt2zbVYbrh43HD1OtjaG92tily1JsNE3Ab5zf3F77ej4ufWA0AATaPxs3sl6KvfnpiXHBYYdP5NF/p6ovllWI9hbvrcJtz07Cbc/KtZpAMDKEWwcprf1GTVj8XfG1jO6xIW608w5RWQwfB0EIxF7z2FK0D9U1dY4fpJf980e8/EV8JINE+ur/Jq+CogBrNpVEn3+eRJv95YQVvvc9DxMS7EJGW+obP6sQG7fvdb4gwInw7e8W4Yepqw2DN5DoAJ7g1QmMMEF+rWtfb4alWL2dtG/nCec0F947ylNYCm3ZOFGv2Hjth7Gy2ibXc5Zvs84jenFCVUk9pgB8YWscubrk9sPU1ZZaPSD+Njttx6QXCNy8Lobg5gF0sJra+ujSv4w9ZdW6gLvWFbR6PvX1CuYu3+5pHNm+q8xVOu0D6relsfffLgSSF+cJJyHMjJNwaa6Qmxh8svqWVdRgyNCR+H7K6viTCVBRVYvbnp2E14cvcPWs3Hhda0oSN8S2P4vtk5vXJHvsTxu5ScQY6wvgXgDHARgIYAXnfKAk3WAATwMYAGAzgFc5529I0t0L4HYA+wFYCuABzvkkU5pWAF4AcDmApgCmALiTc15oStcPwBsATgJQAeBrNT/r0TCN0A+0ldW1+Fj1wmyortvmVptvw7R5m9Bjv1bovX8bd/kpCr4YvwInH9HNUz3qJLs5RAfQiH/52u2EM3PRVg95ir+RJKvstHtRV2fnhJA8w/FE8RegONFCE7v8I51XthsKt7r4OPRA0MPQc5/NwW9Lt2H0SxdJz1/36Di88o9TXNfLLPxNnbcJU+dtwiM3HYNjB3aVX2v67dY2VbvuxS8KcMqRB8Sdj777AXipO2Fot5Vcp/u3NNyJxYX647vVPYt/+m09Lj2tr+d6WlGiOqWtWL8bJx++v688zE2SBdC2Qr9DjhbSprEHwTDdcduSQwCcD2A1AKnrHGPseACjAMwHMBjAxwBeZYz9xZTuXgDPAHhLzXMVgDGMscNMWX4F4EIAdwK4CsD+ACYxxprr8moLIfC1ghAAhwK4BsBHLtsVOmYziKiRtNOAGuCA20IXUFKrh18S9op1uP7FLwpw10tTMWp6/HKgbPAqq6zF8J9X4sG3PNpJqFnJNvyORP9nzZaifRgydGRcINlkiDv6MCx6Zi7agiFDR0YDhzrnY3/eabuw9Vv3RG9LOn6XuFoGN/1NdHJO9m0w3+fGjYKVJvSTfBBt0Wu7rNijhr5wde8t0ui92Z3sBN1qumXvtP4D3Owz4eXd8boC4MZ5wsmWz7KPGpZovQupbpJqu1S0at44MBtuJ43dtl1lKNlbhbf/txDzuNjXV1F0S9LpGtvIB24Fu9Gc8+6c88sBzLNI8yiAeZzzmznnUzjnTwH4EMAwxlgOADDG8gA8AqHJe5FzPhnA9QDWAnhYy4gxdiyE0HcL5/wrzvkYAJcA6AHgRl2ZtwFoB+Aizvl4zvlnAO4CcBVj7BCXbQsV/UvtRkuXjHcvyMC2xw8SX8n7dWjukFJOvNAgvycfjIyPZm5nuuLbk9ji1jgF8dSWDsy2kMmJYyf+mp+jtnyyyWmJX3eZnb2RtvGA/j3Vt2brrrJoXmHLdcV7KqPaBjdky5ie67A7hNeubghi6+Gh7iguxw6bJWTbMrWlWF9XC8ora7CvvBrT5m3CRfeNwtad1sutrjV2uhsQFRalPgla/d23QJ9y3oodlmFwNAwWIm5sRx2kTQWKdDtAL0KqFy19eaX42GzetLFDSrvyjNi9+/P4Dtz6zM+44bHxGDerMJaHR3vmTMGVYMc5t33LVIHtdADDTae+hFhuPVL9fQKANhDLpVredQC+ATCYMaa9PucBKAUwXpduA4AZ6jno0k3inOutG78DUAWhNUx/9B00pCgm8cuf/l/0owd0AQD06NI6gRrp65LY9f9861ctJ28XOgxkTgOdFui0SWNTF0vCGBINWmqqVKNGomynSULPJz/qAzIbz0WXYi2srhVFv09muIPl/z0+AX98fILn67R6h11/vzi9l153FfBrEnLzUxNx89MTLc8nen+tAhRrfPzjMlzzr3HRDyu7LdicNEa/q1pGfY21PmDcUszoOJTI9/KE2YW25105T6gVGPPrWtzw2Pi48/Wmvit9JBZL3l6pqa3D4/+ZHX0OicbelGGXVeGWUulxRdEtnWfN511wzhN9ADRB/DKtNkv0V/8erP5dLknXEkA3XboVEoFyqS4vLZ2hTM55FYA1pnQZgWEQdfKs8ighvPPdQgwZOlJ6LkjTLD/j9QqdgXZC4RUkl2oDSSBeX+rfSCSCH39da5/WYuBK5lKsebBvogp21bUOXn+6ShXvsTZsd/J4/XTM0szdgStJFU++fGgswG95Uyy88o35xX7sKq3Au98vQl29v69Ru27upF3S1ySIZTwn4eJJLdyM7mZou7TInCcMJhsSpE03Ja528NT1IhB9OjY23bpZijVoJh0EaLes2liCucu3461vF6iFSCrkFVP9/bz79Uri1sDzVuxIMIfgceU84YJ26t8S0/Fi9W97Xboqzrk58qQ+3SY1nTkvLV173W+36RxZsiQ1GxUXFBjdtveUxr4kVvCYd2ppaWlcWgBYs00sL+3Zs1d63oqxMwul5QPApk2x5cKCggKUVRoHlV27drkua80GsfxSWlpiuKa8qh4zZs9BU7MGC8BjX26K/nvZcqPMP2/efEPd9Gi/KyoqoteW7mgirZd5uyyn9uzeJYTNoh1F0bTlZWI5Z8WKFdFdOqzy2rpNPNfNmzejoGBvNF1VTb3tdV7qqLFpkxBet2/fhoKCWNcqKxPlruCrgLJN0msBYMtuYe9SXl6O4pxY4NuFCxeiZbNcaTqtbus3xGKMbS4qQ4cWYphct24dWsM44MnaU6a7p/t25sWd9/KOO7Ft2zYUFMiXZ6ur1CDNS5Zgc8tG2L1XhNyora0N7DkF2RaNNWvWokl1zNFmXaH98ufefbFxY8vmmBbr5S/lFjb79sWW8QvmzUdj1UD9q2k7wTdXYsOmZrHzFu2THZ87t8DS2H3VqlUAYs9ElkepOm6uXr0KkXLrd7u4pAQAsHbtGjSp3gIAqDeNBYsXL4r+e/6CBdIxqqCgwCCMzps/H3mNc7B2W+x94pyjYnceqmtUm9aIvO1VlaJd+jmnaEeRIc3GTZtRUGCtZSwqivWtrVvldota2fX1uv1uFSVu3NTqXlsr3vlFixajdXPR77cWiz5fWVnp+P5uKBLtKisri0vL1bmtZM8+fPbDL9GPzr179mD16niPW+16vZBpznP1FmNfXrR4EVo3z8WCteVx16xcUyKt8+rVa1BVLdq4ZMkSbGklF4ns2v7GcHkEg2T0d7cEJdhlPAMHDkReXvzEEiQFBQXIz883HBs1bxawVXTSfv36AT+LDt6mdZu4tACQu3IHMHknWrduFXe+oqoWNbX1aN1CJ9x8aRz0Duh9MITsrDt2wAHAAjFQ5ufni3AE38cmiw4dOiA//0i4obLRFuDX3Wjbtm20fuWVNbjq4bEAIPeG09Wxf//+wPjYoHXEkUcA32yO1k2fVsu/2ZQpQEkNDj64P/p1bwcDanrxZa/EXWtVl44dOwCF5ejUqRPy84VfT/PpU4HdpaKOE2J1lOW1eOtSYNledOvWDfn5B0WffXllDfDtFld1kJ2fPHcjunVqAdazPX78dS2OOWQ/7LdrI7BoD/bv2hX5+QdjyZqdqKtX0KlDPVZu3oIePXsj/3Brr+C534uJbVtxDfr06ARsFAP+oYcdinatmkbTtdlYAozfgebNm0frtq1yHTCnJJqmQ4f2QGE5evXqhfz8Ho7t+fLXacCuEvTv3x+V1XWYUrARd199pO01GhNmF6J1i7yoXacx4/jJvkuXLsjPjznzb99djsqqWvTs2hp54ycCZeUYeMhAdO3YAlt27gNGb0Pjxo18PSdZ+XHpJGm8cuCBByL/0JhX4b7IJmCmdciT1q1i48aq3RxYbC08AECrVq2AHbsAAEcecUR054qxC34DNm9D23btgfXy/qlhaLd6/vAjjjDGDdNd17dvX2DaLjRtmgeUlcfnAWDk3JnA1iL069cP+f27WN7LNm3aAJsr0a9vX+Qfsh8AIOd/W4G6mLAzcOAgYKQQjo44/PCY3Zd5rNH9PuKII9Asr1F0PAaAg/v3R/9e7ZE7YgeAakQk9QaAvJ9+BvbV4pBDDgF+3A4A6NKlM7Ay9pEkxo1+8Q1S69ClSxdgxb7Yv5fF29FqZTf6YTuqVGEzJycSPZ43cRIAkQdjDI1m/Q5UVWPQoEHo2FYI7Ks3lQDjdqCFrs9b0XTtLmBiEVq2bIn8/HyDYMMYAyYWYVtxDb79dTeuOZsB2Ik2bdqgb9/ewPRd0rrX1yvAV5sNx6K02A5MjVlhDRo4CEvW7sKo3+bF5fPYl/LVqt4HHohGCxYBqMKgQQOxX4cWxgQu+njTpk2BPfFBtJ3uVyJUVVXZKqOCWorVNG5tTce1WXa3Ll0eY6ypi3TmvLR0+lHLbbq0xVIT7bQUK9Ef3/7CZFz36Djb62SBjZ3iL3lZ7YjFoIod21Pmfvuj+N0PnBXlm9XI9HZJfavbJVEFEvF2S3Rp7pWv5uHe139B6b4qvPfDYjz63kyd55oo68G3Z+CRd2eisUsbu+3FFloeta7bd5dbb2FlsUuJUzt37C7Hq1/PQ3llbbSoR96diUlz3AXqBoA3v12IZz753XV6M7c8PRF3vDgFQPJCU2SmpV4Mvd2Rvi2xZUd/LVRs1mK95OhkF+WmegZnAZeF3/3yVDzzye/SnSfsPEm37y6POnIYHFPMdXKoiBdnMGtTGw83J8H+YX5OZRUxraZd3lb3YcvOfVHbx2haeJtrohdleieVEJRgtwZANWI2dBoD1L8r1L/aOpss3V6I2HdaOqZzptCnW6H7vdycl+rI0ceULm0xBql0YRBrk0YLIGuHrJMHOalpDiD6dskielte72PrCTfOAUHaOrl5Tpb2KwHVQQucvHtPVTSERPFe49JETLBzF1kfEN6kZm55eiJuf2GKp8HdaWL6fupqTJqzMW4P1LQhQwb7OIHA5XVT523CF+Odh0jj1lWx3DV7Tr/96t7Xp1ufjNqoyV+48soaVFa7f6cBYzvMdrx+hNMtO8swa/FWSOQ6W/u/WYu3eC5Lhj6WpJdAyPpObCVbGwVOzTHLaw3tcWu7J6tiXV09bvv3pKh5UTStoqDeo82nAiUrY8YGItipDguTAVxpOnUNgG2IhUiZCeHtepWWgDGWq143nnOu3eGxEJq4c3TpukMEIR6ry38sgDMYYx10xy4BkGdKR6jIjY3NBv7+X3Sna52MgpPXxVLbebXSauvqsWh1zH4mKG9LbWKtqKrF2s1iGX3C7PWGNFpcJ+k+izr0E6h+I2z9VVahQ8w5u/1IKHMZWy/VaO9vTFGR4IyW6jnD4f3Sns9Lki2avKAZ7/udFNdvsw7BE+0jFrf+qofHxnbEcFzZMGqzAW9e4k7o+3NMW219T+yCexvztT+v19g55WjlaKGv56qNJVFtlyF2oWmZoqKqVpgpmFi5oTjuw9IOqwDTVnXUJ7Mazwwx6TyQqR7wdrjdeaI5YmFGegJozRi7XP09h3O+HsATAKYzxj4A8AWAEwHcCuB2zbuVc17FGHsKwDOMsSIIge8WCA3btVp5nPPfGGNjAHzIGBsKYI+a/wYAn+iq9h5EAOORjLEnAXQG8DKA4ZxzaSDlho5TRyreW5nQZCTrI/pDl/3zR3z37AVRWx3bxPE/Hcq2Th3EZuZW2a/bUop1W/bg9KO6x53TtCJDL+lqm0cyiC6X2bT9X+/OxIJVRZbnnTC3p0Z1DvGyvY8XXvqiAHx9Md5/6EzP19ruPGEKwBfUYJ/0HThM2e+xWjJX8SqoWm1BpR0Pol+Z0bKU1VSLfxath0Nebmone9SV1e7255WOd3aCXb3iKp3Te+P0sablH4lEDM/QqIGN/fvDUXJ7rajHvXrdYx/MwrJ1u+NspYe+ZqOBRfwHn9MHgVZ3swC4cFURtu2Sm44oUHy9j4m+wukYCcCt80RnAN+ajmm/bwLwCed8FmPsIohdJf4PwBYA/+Ccv6u/iHP+ImMMEIGEu0CEMDmfc77QlP81AF4E8DaEBm4KgCv0W4VxzksYY6cDeB3A94htKXa/y3aFRl1dPdZsNsbWWVa4yyK1Po34UvW7MbEshqNezb5qYwn6HdDWV96AOzu06po6S8HO3OG9TLB2KQOZfiwyueulqQDEzgs3DRFxsc31rlW/1AMTGFxko32p19kkthPqXAU+Nd2U0jLhGde6pX9HJKtyd5ZUYOo8/w4HtpOlUa5LCgtW7sDhB3VOYgnAByMcvPs9TkJmQbCouAJzV2wPNP6YZdmSIsxbrDlWIyoY2CXRCVvq33Wb7Z1KbIu0eYkMgp1tJvZlGDR2LrW0gPM2Y+ayY8pTcaVeow+IUFV2MQItcXgudfUKGuVGdDE6xfFH3p1pm6cbgddwidKANXbq/qyOPZlzPhYulkA55y9CCG12afZC7Cxxm0O6lQDOdSoz3fh64kp8PZGjTcuYB+vn4+JtXvaUVRu8XFeqm4P7JddhJDT6jvpAEnXd3HFsN/cOsI/py/War1bFcTMLcfOFA5HXONfxK/r7qaujgp1Vfma27NyHXSWVGNS3Y1ydE8Zs9JMElq41foxUqxo7za7Sysjb1tHF4ty/3rMZ1AMmGWP9v96bhe+fuwCNGwW32Xgq9+RVAAz7YCY2bt+HI5kQUL1OpO5LAmRTTkWlUZPm7Dwh8pq3YgeaNmmEQw7sIEnkq5KG/PX/jm5KIcnXEPfPptzPx6/A2cf2RLvWZj9DgVGws6qbNu5YLMValK0/rlg/CgDAfW/8YpGLPdruD3OWbceh6thnqINWsMslWy2pZ42dTrALamuzdCB7dr3NMAq3Cm1d6T7rpZPl63bjukfHYcbCmMFtohO/dKN4g6o+/ryX5Zto7Wwusf16TqR9pkuDmnNGTI2Ps+QXc/Nu+/ckPPRObB9bfZ2HDB1psM8z5OOiLG3SzQ3Y8nnt5lK8/d1CrN5YgpmLttqmfVMLSGpi5UbrDxSrtun3/0wZiZrYmRrj1sbKLc99NtdTes/NMTlP7C0TS6H1USEmeMHObueGOO1zxGlJUzDql7W6XWjkafSFu53jFdm/XZqEON25d39YZHnO4DxhkZP0qGTv67jrlPi8E9bQ2lz+4aillueiGjsXY5ii+HOESPwVTj+BkAS7kHDzdbBqk5j8lqyJLbt6+SKRaUucBLecSCQh4Upm92zOzm6QSKSP+QmVomfbrljQYf09qZFEmfdal5jyzNm2RM+YGevcF2pCe1dydPZupfuqXGtZ7No7bmahq9ACMxfFewHW1yu2e3daEbRnnp7o81GCXTI3E7YHnletxHrzMlvE8Md1hx0ydCS+nGBckVhRuBtDho7Ehm3GMuzMOfxoZByTSJ6J68ckkez0VTSPwUZvVqexwN25UdMtdsFxaoSLRu4scY60YCYIxVf8nAHsdbAfVRR/Np9JUTqHDAl2IbGv3L1XYPHeKjz2wSzsq6jxJFy8+338F59juJOENRTOam2p1tB0fex34nVxi9Wku7u0Eu/9sEi6P6QVI6evsaiU/LBsiyLZb+cTMbRJpZF6v8sqanD9sPH4yMJQ2isO+80DkA+ac1dst7/Ism2Je6iWV9Zg8Zp4+9RUrcKEPYlUVLlzCtAwa0m1rqv1by8T6Vc/ccNvLabmPO7svDNj4RaU7quSmHU4CEAu+qoxvIc35EKhOLZrTw0uuX+0wS7Uy9L1rMX22nDHukmOeX3PX/hceE+70Zglg6gjTSTiausu2f3dUmQdUkmBEpuz/FVRBPRPM2jniZCoqHIh2Knv6AxV6zF57gZPX/wFko4g05ZFDOddZy/FjcbOrog4wcZL2abUTmPoisLdKNlXheMGCo9VK03ixN83eKiFVd2Mf83U1NYjNzdHItj6lwSiS7Hq1k1lqkfhzEVbcOvFgxLSGAAul0ckD6HWIdzEtPnxQbSBxIUvBcBz/52LeSt24PPH5Wa5Wps320wGXsvUkwwvUi8sL9ztnMgCYbNlfAiJeFRrxD1XiT3Hs5/NwSEHdkALbVcIl/jtPm5tF/WPM2oWpv5jW4nob7MWb8GpRx7gKd8gcFTY+bzOD167bnS81CkKnMcrRZpm2AezHK9LBM9BkVMAaexSyIjZu1Ggaiscl0Qs3jUvgp1sEnGajO3s6eYu347Pxxn3ct22q8zUMSRfsKZjtk1PSENn+u0wid73xi94+uPYzgX6e+P0eJo28WcAbzmIaOEjXGos3dym+jrNPkV0c01wjQ2aLjKxIddGZRcNtSK9zv7mvvKVfN/SILRqhVuEbWu8mUIs8+XrduOZT+aYjvrEdAPqPAZQTTeC1NtEl1xNx7UlN/PzLiqpkI5/Mg2sp3q4cLKynvx116r/NodrsRoeEzMndnNxfBo3z0+Wd1h+BVFzkoiLECmQz3nVNfZ9LmwtejIgwS6FLFhbjsc+mA3AnTFq3PumuBsMFq/ZiXkrdkgHI6lAqd/9Ise6jMf/MxvDf14Z/b1uSylufeZnjNTZeEi9qDx0nLjO6+IrzXVeDrg1EJ7Pi3zLn1ZV0oKmBvm1bHae8Bp7zGnycOOUIXs+fpd1UuW1tlVna1m8twoLA9BKadTXK/j59w3YtMM6QG86MaB3++i/FejMKAKxo5Ko9wG89T9z5CtBbk5E+u7ahcBw059ciUiWAp+Liy3zjF08+hcLOzkL3PThAEzsosjGRqtdNKR23D77bsyRJgLHbyLFaqnbuqGGcCcRYaqxw2qLxQyCBLuQcPOeyzqeVYf+8dfYwPDQ2zMw7INZUsFm9C/xtl/GOEeyTimvn7ZctVwXfy8m1/nsyB7T241vdgPXlIL4PUndChz/HbccVR63NIot08jPX/uvcep583JyIkuxYiTUBLCoxk6ri++cBW7ul+z5+BbsfF0Vw+4jQP+Om6v3no13omOZpt/19cBrw+dHYx+GwYr17pdju3bUbYquKNFnEIiI7fACmsvIicgFOztc9R+DFk1R/xqTrN5U4nSpZyEvkf4nM7Oxyt8qQLFVDYZPXGm504yeNZtKHdP4xezEtKes2nkpFvL50YuAe8+r03HzUxNd1y9dIcEuJJwmt+27y7F6Y4nhmAL5QFVbV4/3flgcd1z2kq+WdEaD74SDMbIsf/3XnOxaLwsCcTZmTpWwC3Vgc+7lL+OX+1KhD7LTgr357QLXzhNunpH29bpx+148/p/Z8c4ffo1wVGy9m6NFyJ5+/HWlLkKZJMPbTkN7l/eV10j6pvj90eilGDJ0ZELla8J2kNtaeeW+1/3FHgOAHep+1EFoT50+As1l5OoC1kbTBNBr3Qx3ljsr6C6urK7F356fHDugmSOEJATIy41gzaYSWyeayXM3xm05J+vrqWiV/nk7yfSKooCvLzYcq6mt91RPt7a1aS7XkWCXKtbqdplYt6U0LoK3mc1F+zBtvjHKvkFtrCMuJIGK649b/VKshwE7KtgZJkJxLKJ7szztHuFtJda206aj7QQvLLY8N2H2+kAnAS2rz8evwNzl27FWtS/TljTMJR3er5On/GUfJ369mp/S2TomDV1dzO/5FjX8ysTf18ed037+EEA8w7DDnSSCf/MD+ZWyfUDtyInEC3ZBoK+fV0N4vSZv1cYSbNzuYYk92a+CJP+a2jrc/co0/PuT3237ZpVpT+/q2jrsM4UbSarAKvn2dN6GDNExTmNL0T7beurP2H0kmDcGII0dAQCGWE1O++rZIRNWrF54t8sWcRo71x5hIp3e1spun0c3eO0wxk24E8vLD7Ko6XY8/7l9QFnzI3OaFO0wp3n1q/mG4+YsWjY3ehwqANbr3luzHZDX+7t1Zxk+HRPuFs5WNT7q4C4AgEP7dYoTWBMJzmq+RWF7xYaBkwOQ27urKArWb/Vmm+j1HTVo3Fzwv8mrdGVZ1UH3b/1xj5Kd112HZLlrWntH72jTxcvW7cY1qrkIIByMUiHbGDR2Dn1H+qw9rEDZMfS16SjRrSqkey8mwS4E/C/DyDc5tnrh3Q5qTjZ2VtRFvS511+jc092UZ8Zc40RCcnj9uvfTWc3CEAAMn8jjjrnNO8g4fmbMAT65ydZK9szueGFK9N+T5xrtEu2qJntuT340G/+bvMrgnJAuaF7OOZEAouwbMN6H5GzBlRr0j9TLLXJssUVm5sPrt+11DFIbV3Yqb3fcMrGn5FIWrirC3OUikoJXhUAyP2zvf/OXlHw4G4RiFzZ2ZiJwuM8e2lBZHVu+dhN6JUxIsEsRQYzniiIX4qwmCz/agR9/XetaColp7GKvkexSu3d84m/rjWm9buKs+7dZKE1F35JtqfX5+Pg9f93eU3Od5/EdWGdaXkgUrYgH355hn87hBsrerxUmGxc9idqVWVVnV2kFvp200tPAv2N3zPPt20krDfZe5hXmSASGJbZEBm3zXqeZhO/JzEllZ0EQ9nNu4vYFNU7EZ2N0VhL/9lbYI+/OxOP/mZ1QveJrJOpqK+8koJOSjQtev5W0HPR5Oc1n1TXxDm3iY9W7gkN+3t6WXE/Yn28k2GUQVjZ2Vi+8exkp9sJOXyAPDivNX2JjJ3eesK7IhNlGwc6rLGq2+9DjeTAxLOsG70pR6SLqv+z56pd7AiGgmcyt7Yq52KCD2L/weQE+G7schRa2psZKiD/6zcs/G7vcaH9nqmAEEdslOr39bFxxphvxwJv+HRfSHa/ym3b8s7EWy/Mpip0WlHZlwUqrfZ1F/jW1ddiwLZgPBN/ox7UEi7d674O0g9TfI6e5QbYvdSQSnK21/nV0bGPIkh0Jdikj8Se9bVeZ9IWy1Nj5WIoF3NdUE+zGzyrEtf8aq16sLcVaZ6g/xePsRrwtRf770zmxfE3tSGRJLWi5ToG75+H2mXmM4mCgzEJr5LXJTnGlVm00PttYvKhgb265uqOGc1gDFzdNshQbMY2S5mze/k4ed02WNoNXYg3ItGmypq3cXGH5YaI9j/IUajFlzhFBPRIn7eCb3y7EotWJBVP2gqKIuWFXaSx0SUR3LtFdMOZbCbIBCHZaHvrx0CnfjdvjPVpzHMI8eKqph9isYXdzEuxSRBAB58fNKpTmY6mxczmLFBV72+hZ29dUL1DuVfe+1S9p+cHrxKd3b99ZWoE/PfUTtqk2XInsbxj4x3QiApvfuiTYBqcq2wqhiogJJatOsDZs3rCazPRtiRPsHOpr18/CHuCDxOgEIFPNxx/7ctoufCEzTXCBq9fE46t03aPj4g8m+SFpt2W5KRKCl2LNoa9clQsbT24fzldukXUH33FNdXn5cjyKONkC65IGaTcaMiTYZRiyzpaoQfbXJmN/pw7960IRcVw2sWvHbDX+kQjKK2uwq1QiUCrmn+7bNnnuRhQVV+An1W7PbT9Nxq4PfnFbh1TsOblT9nz0dbD9Eo4/p9lPBr0U6/aejZ1ZaHlOmzQikfgB3qm6dgJuOrxTiaC3i3TaeivopoYn/idnmTRuBz4PRfzj1WneC1SUuPArsViWri73RZD3zhjHznu+W3eWBRYzUi+cJrrPdrJpFG7xDYlgnrRVgOJkYtZYaDGONK9YDUXo9+OQdYLbX5iCnSXxgkP8lmLu6xlb6Yt4urRkbxU6tWtmOJaMpVhX6ZJkn+IKU5udYnr59Tr2vb2Qw3k32Vo5L+g/jsyaXidTBVuNXdgjfIJMnx+zuXWOI+bxfQjigQZAKj6SgNiezdFyk60phP+PDouh3F25sunI56M0Op54v17bwjMIDMqKNO/WJNiliKBsa2T5pDqEgjaRyTasl64kSg7KhDqR2Pjz9W8WWNbDvMSjD3g6duY6adgRGbm58d5rqUC2J6ZsIA5rDHGMG2XzPSHV6MjsLwMkEbtD7eNIeMXGO0/4LTfNx39PGJbFZO+px8ami9CbdAErGj3A5LWf5LdDUaz7qHDEc5GBD6TCpI+s1m/bg607Y3Zzyblf3vOsrqlDmWrXG1yuwUKCXYoIavCQhjtJssYurg6aYaupLgqMWjNFUfDpmGXo172dIZ3dNGnuvFoMJxnmJeQN6rJDTiSCd75zv79nWDZf42cVxh1z/Z4kIMS4vcBp4q3zKNEkyXfCm22MRZXrPWjsRCaxg3YfVmu3OW+VlikYDNlle3IGXF6oS7EB5KG9N1peidj8+kFRFEthyM0qj2+NnaLgv+OWo7kSe/e368ILucvDGEMzWejHg6kFm6wTIvY8//HqNIN3szzjBCuWICTYpYxgnrR+sl29sQR9u7dFbV2wb5Fs8tMHs9UG9TghU1GgtTMSEQ4V302JN961+9pJxB6iyOc+llGNXRJ7YyK2c9ozX7p2l8dCE0vuZm9GK2RCX+xIsBNcEB9N+onO3C7n98m6AttL7L/sMwmj84TsfLD9J1XfW+Z6+3FUcINZsKupSe4H+YZtex0c45JjJ1avAN/8vNJw7NnP5liklrNph0RwSrKw9IluV5y5y7dbxg90FOqQuuV9K8h5IkUkQ2O3bXeZeiz5Grt7dRuHaxoKs6ZCr7EDrAd6u44RhKGr1wkhSI2dfgsuP1i9J1U1dfjnW7/G0iVUijuclmLtzg97f1b8wQQ0dmN+XWvY0kdGIo8x+i5H4peSnGzs7Pp28b7MDUhsxhBTTKax8/hSWoXLSDU7SyoNv305Ktih3hfzUuxz/7XfXjBRHnpnhqsAzTJK91V53uVDIwgBXz/fhMFHo5fEHUtGbNNkQYJdighqItZPOtqLFrSNnflrwxxYVyvOzs4mWQbybvAaXiH6JR1A4XbLB27Gu0S3hzNck2CDXvlqnn3+njWCmjbX+7vx7g+LPV9jVQsZeo2d2SnIqb6KAqzZVILvJdrplZsrJVdkJkE7T1ja2aoEsfOEG8wmHYDYPD5RamrF/dDui/kD0q/gFBR2j2tHcQX++PgEX/kmYz/kJo1zUZMEk6PPxy2XHs+Nc2H2RtjmoyTYpYqAnrR+0smJAKs3ldhGvg8C824U9RZLscIgVxwbP6vQ4FHnljA2SU/F9OG2VTLbl1Qse/mhzqOmWEueYlMjA1a3LfpBAucPpTjHbUXB3a9Mw8c/LgUgdhhIJTMWbUlJOfp2m7VPycCN/F9Vnfi9lu1YsqKwOOF8xxWUJJxHJpKMoam6pg7DJ650TuiRnaXyDy+ZPWTm6OtIsEsZydDY1dYp+Mcr02zjc/nCobJaHeInQKOO6P0Rci2L3YBtJyzMWhy/L2sQvPntQtTXh20VIbjrpalxx+rrFXwwIn5pwIlky37vfu/eQQWIaSh27wnWocDVVmIqbm6J+ePCrGkxC9XmPC994EfX9QmCZz+dgwoX29Uliv6+VEtMJoJ+3xo1cp6eRk1fE2yhKhXVwd1PBcLBze+yaLJI1vCQHiNpYsg+XGYvcT//hH0HyHkiRSTDxu5Vh6UyvzhVdcS01dhZUhF1OtAoKq5IWJNkpwR65pPfE8rbimnzN+Gc43vGBfMMlARuy4rC3Sh2sC8LAq/Pzq+A9uEo90JqbV09GuXGT/DJCvETicTnbf4QeX34Agzq2zF2QJd8nMTTORVc+dAYjH7poqSWoX8/ZI48QT8RN7avyfp4mbEwOC3ogpVFqE3HveSSdPPCWHUJGplg9/Z3i/DLApfvBS3FNhAC6kR6D1jZV3MqKN1XjTEz1mGvKYDt8J/dqcr//Ym1h9SYGWsTqptfHnp7hqcQKV6prKnHGzYx+dKBdDQOvuT+0dLjK/xqPyzXYmP/NE9M5tsybf4mw4bjeg3F25LYhKliyNCRSc3faQgL2jygsQuNXU5u+r2zMtLBdCJVZIFch1zJxyQALF7jbq/fsG8BaexSRFAPOtm7TADuB6Epkrg/bi4t2Wet6dm2y1u8o2QRtJDz0/xSbNrpz1haZu9hdZ9vfCJm8OxVq5WK5Tw/yN5H2Rd1MM8sgrr6OtMR/wGKswkn54myimBDu7Rv3dQxTSps/QKhgbwjQHYIsam2kw0a0tiliCx4113xXwsvo0wj6MHJr1AHeBNYdumMgb22wS4YdJisksQVK97rz9vUjaw7ae5G4wGH299AurbjGHbrMz+npiI6UuU5myhrkuzg5oek2dhlQYdYuaEkoevDvgck2KWIVi2ahF0F1/h9KSebJ8QMZsLs9WFXIYpMKeFq+6wsGGABYOhr0+OOPWOznO+HtVvExBuJAPNW7DCccwxPnC032oHXhs9PaXnutojLjHuvj0GZ7TSU/mDHO2PD/UimpdgU0altM+dEBCEjDW3f0pXaunpUJhAC4+mP4x10Ig7LffbR/bOHZIdV8kO6mg9kAuWVybl3Tkv2DYG9FeEu5ZJgRxBpzg6P+yxqaFooGXrj/2whAmtni0TzJVKPG23ckjUet9kjkk7pvnADLxO0FJsyzHvnEUQiuJn07IK3ptNSc1AkS0+wbVdZknImCCIb6dEpXNMrEuxSxDy+wzlRmkA2EkQmkqz3toQ0EKFAwxCRqcgiGaS0/FBLJwjCHzTppQzzXskEQRB2hB2FhwQ7Ig6SGYhMJFkanmTtckEQRHYStl0uCXZEHJPmZE/YkmyFRI14GooJwdnH9gy7CgRBpDEk2BFxLF3rbtsUgkgnGoZY13Ci3zQUQZ0ggoYEOyIOWnlKf2jSk9BAbkk67ulLEIQOsrEj0o3Vki2cCCLdyZRdCBKloch1DeNpEkTwkGBHEERW0FCUmDkNRbIjiAwl7B5Kgh1BZCD1tF7eYAl70kgVm3fsC7sKBOETimNHEIRHvvqJh12FtGPrzoaxQ4TT3rXZwioyCSEIX5BgRxAZyKLVRWFXIe148YuCsKuQEpK9EjvsluOSWwBBZDlhW0uQYEcQGUhDsScj4km2jV2fA9okNX+CIJILCXYEkYGQXEckDXq5CCKjIcGOIHzQvUurUMsv2VsVavlEeJBXLEEQdpBgRxA+oLmVCIuDe7dPav6ksCOIxAh7eiDBjiB8EHbHJRouzfIaJTV/2tWEIBKEnCcIgiAIt5C2mCAIO0iwIzKesO3dCCKVkEKNINKbsL+9AtPpM8ZuBPCx5NRbnPM7dOkGA3gawAAAmwG8yjl/Q5LfvQBuB7AfgKUAHuCcTzKlaQXgBQCXA2gKYAqAOznnhQE0icgQBh/fC++PWJzSMmkjdiJbIcGRIDKbZGjszgVwvO6/F7UTjLHjAYwCMB/AYAhB8FXG2F/0GahC3TMA3gJwPoBVAMYwxg4zlfUVgAsB3AngKgD7A5jEGGsefLOIdCW/f2fb8+ed0Cs1FSEIgiCIkL/7k2GFW8A532lx7lEA8zjnN6u/pzDGegAYxhh7n3NezxjLA/AIhCbvRQBgjE0DsBjAwwCuVI8dCyH0nc85H6seWwxgDYAbAbydhLYRaYiT9oz1bIexMwtTUxmCyHBIY0f4pUv75miUG8HmooaxvV+6kjIbO1VgOx3AcNOpLyGWW49Uf58AoA2Ar7UEnPM6AN8AGMwY02bx8wCUAhivS7cBwAz1HBECRw/okvIynVdFadmUINyiUMATwidiLKbxNuw7kAyN3RLGWCcAGwB8AuBpznktgD4AmgBYZkq/VP3bH8BcAAerv5dL0rUE0A3AJjXdCs55vSTdOYk3g/BDJIRX2ilgazL2TCcTO4IgCCMRRJBDLpmhE6RgtxXAMAC/A6iDsKH7F4DeEEuj7dR0JabritW/WtTNdgCqOOcVNuk2qenMeWnpPEfwXLJkiddLCAklpSUpL3PxEnvHicLCwsDLLC83v54EkRpWrlyZ1PwXL06tIxKRPVRVVaG2NuxapAcFBQWhlR2YYMc5nwBggu7QRMZYKYDHGGNPBlVOshg4cCDy8vKSV8CXm5KXdxrRrm1bYPO2lJZ52KGHAiOtyzzwwN7AzN2BltmieXOgpDTQPAnCiUgEOOigg4DJVmbMiTNw4CDb/kQQVjRtmofGjXKA0r1hVyVUIpEI8vPzk5Z/VVWVrTIq2UrTb9S/RyKmcWtrSqNp8rSZtxhAHmOsqYt05ry0dMHO4kRa47QsSqFJiGwhAiR9zy/aeYLwiwIab9OBVK6GrwFQjZgNncYA9e8K9a9mWydLtxci9p2WjumcKfTpVoBoMDjZ2CVlnKGxiwiDBF/mRrlkAEUkF5LrwifZvfxqCCG+gHNeBWAy1HAlOq4BsA3APPX3TAhv16u0BIyxXPW68Zxz7XNyLITG7hxduu4ATlLPESEQRqd2+kJMhkNHTjI8MggiydCkSySbMBzoCCNB7jwxAUJwWwKgHsJ54m8APuScr1WTPQFgOmPsAwBfADgRwK0Abte8WznnVYyxpwA8wxgrghD4boHwqr1WK49z/htjbAyADxljQwHsUfPXvHGJBoLjZJWEcSaXZkgiBBJ969xc77QSmxMB6jNgtbZ1iybYU1YddjUaHBFSCodOkI9gOYA/QdjVjYCIWfcAgOiuEpzzWQAuAnA0hKPFLQD+wTl/V5+RGpj4IQB3ARgHEQrlfM75QlOZ1wD4ESIY8bcQmr8zOeflAbaL8ICd9uy7Zy9IeZkAhTshsodE37tIEJ0hQ17+p/5yQthVaJBkxtuRXHbtDdc1OEiv2LsB3O0i3Vi4WCpVhbsXHdLsBXCb+h+R5jRpnJuUfJ3nKlqKJRoOZx7dAz/P2SA950pj5+CdkSlvPhnxhwPdd2BHSU2o5ZPSNA0ZfHwvtGjWOOxqZAxhaOxIsCPSFTvBzNWc67DMmm7zdqPcNKtQAyfd3o+GCAl2aQjr2Q4XnXxg2NXwRTjOE07nk6Cxo9GLCIVIgh99Qby39O4T1pDzRPiQYJeG1NaZd0nLHFo0Tb2m0VFwo3GGyCL6dm+Lf/3pWMvzds4PkQhwzID9bPPPAL8IV2Rqt7//+qPCroJ/FKC2PnPnr2yBBLs0RFGAfZXhrtH75eqzWMrLJLmOaCho7/oxh9gLZ1YoCvDIn45xSGMv2qWbFUK2xVPusV8r2/OXnto3RTXxx5pN2bsjT9Mm7uzEe3dJ4i5WLiDBLk058+gentIf3Eu+Pe5RB3cJojquadw49a+Uc4Bi+/MtyZ4xaeQ1ycURBzYPuxopp1Xz5LxTbmSqehtJp6a23rE/OApKSTZDGP3SRUnN3yu3XjTQdVrWs51zIiciwBv3nmZ5Ol3se3NyIvjb5YcZBc0UVu2vlx2KT4ed45wwQE4+vJvhd67Fs7jipA6pqI4lJNilKb33bxNIPq1bNAkkH7eEISQ5Bih2GGwOOTDcTpjNRADkSYT9Vs3Fe3nWMd4+YIJk6HXJ28vxlCMPsDyXbMeo+jprySwIMw9zf/IqaNh9tJ53Qi/P9bESZP0o8i6RaMMuOOlAPHv7SfjqqfPw5n3WAhcAdOvU0kepwEmH7R/9dwTBfxhce07/QPMDgMP6dsTg43vhpiGH4NGbVdMABdivQ2o+5Nq0yEP71uadR/3hVng/oLPx+V5xxkEAgL9ceqjhePO8cEUrEuyyHNkXResWTZL2Vdy4UXJCmtjh1nmiRbPGGPXihbjstL6m8+7KCUoA7Nw+uQNf144tkpq/mTuuOMzyXCQSwWmHto473rpFE/zw/BDcccXhSayZNa1bNMGpRx6Ac47rKT3vdsnFEhupwosc9Pb9p3suWm/j1Lypc0SrCxN01BpyUnCOXieZNCJu0Mt1+mVMP3vetpF8COfkRHDIgR3Qsllj9Nwv/l3W41eZuW7LHtdp3bSrp2k5d0Bv+YpOIug/qLvpBJ5zj+sVeFnyCgSXlVkT55brzu2PkS9ciIN6tA2uMgFAgl2Wk8q9If9whL/OkSjOW4qpKAoikUhc1Hyn6x+5SdgklVXE7B4Tses5bqA/+yi3NG6U2m7dsW0zw2/9pBKJyDV27VrnoVFujqW257LT+qZEk9rfYunsDI+mEGYmWsSREyQwI+neVav7U1RcEf13n25tHbO88syDDL+dBAdz7d0IM6ccEdNg2i0VD0zwmV+XgGaqzwFtEn7ufj1CNxftw/+dJ7ZHb9e6KTq0aeZwhT3P/O0kw++ke/HrHmluBu5H3LpFE5x+VHfpuUF9Otpem5MTCUWhYUfmPYEsxEpr4IVO7eQDQa4kxpPbPj7sluM81eE+G2+uls3Eq3bCoV095ekGxx3F1ASN1QDJ5onLadBrp6r795RV4e6rj/BVRz11NktlAHD94P5o18q/8W2TFAt25nn6jisPtzzndLxX19b4+F9n44bzBqBZXmDx0y2pqpEvTd584SGJ5VtdZ3kuJ4HHo39VrQS7VRtLov92CjYsw6uJ3fkn9o5LY/640NfDTnDUPrL8yiF6odHp48u8JH7PNUeiaV5iE3Qi8tPlp/fDd89eEDVTkHH4QZ1c5WX+Xkq2XV5NrehHrVo0lq4S+f1Is4tRaNeiPw1x339vvXigrTAaXWa2oVdXe01uqiHBLg2444rDceP5AxLK44bBB0uPd2lvvSz3wUNn2uZ5aF/7LxUv/HWwcOK4/4aj0UKyPGSeSJ+87XjXeTsNpi3VgVLTKNabVHZO17dtKYSsft3bRQVoBYrtFmmP/9m6/v17tTdoMMyclt8dHdp4sx3RL7/Kvh69LMV4tQFrqbMHatsqD/17xsqqqva2tc4RrDM6tm2G3JyIr6U0r5xpYeOX6Be4+X0+UWdDVesg2Gv03j9+sjj72NhH4CkmDfkFJwkBy24s6XuAs+1uba1R2L3+XKMWTK/hHv3SRehk0tge1q8j/nHNkZb561tvXjKUlaHxxRODLfPUaK4Lt+T0/uT37xxXrya65965fXN0tOmH+mdhxklr/rfLDo07FolEHHfnefgma49mg1mD6f7ZjXFu7qsezVGvdcuYANpjv1a46syD8OAfj5EKY3q7Oy9C0N1Xx96jl/7+B9fXebLzc/yScZ9VukCCXQg8+Mejo//+6knRqS47vV/0mJ/pzGoguegPMfsXs+Hnfh1aYPRLF1l6zgYZ2LdFUzFg5eZEcLtuALrijH4Y/dJFuPgUo93b4QcZB10Zrw89FW/ee5pjPVs0bYzPhp2DP10gJluzYOeksWvcKAdv3Xca7jUZ2+sH4SNZZ3z/XEzQO8Lmy/rUIw/AvdcHa7j/xr2n4cBuYtKWaUW9LMmfeXQP9OraGh89crb0/BCTTVb/nu2jE5V2J6PLswm8Q7J+cCRzfi/MPP3XE3DxKX2k5/Ia5+ICicYJAIY/fR7+drm1/aAd5vdZb8Mj0+Y9d8dJUcFM4/iBxuf4+ePnGoy8e+zX2jCWdGrbHKNfusgwlui1wx8+fBae/uuJjnWvNmkxI5EIjuxvfd/N/e/YQ7rGPzzdb0WX/aH9OuGea+OFQJlQZuUINuyW4/DS3/+A+284Ckcc1AlfPDEYnw47x3Ycvf+Go+K174rQbGmCyYcPn4WPH7X2upRpzzTNmJOzwgCfGqymTeRa7Iv+0MdoAmFRLzOd2zf37GD31F9OwJCTD8SfLx4UKy8SwfWDD0bHts2QI1FJ33JRLK2dTa6ZZnmNog4pzfIaGYTQPNUO9o+SDxmZTD/yhQt1dTjcdR3080OmhNYhwS4E9J0sL1EjbQcikQi6dhDaHNmSif21yaiRe0NVq2XPvt3bYvRLF6H3/m3Q0+XXX7vWTaPqdvOA7GaZosd+rdFUtzRotqVp3CjHoOVJSCi2GDxusfHcymucG71uoMQmxMotX0b71k3xxr2nWS7v5/fvjC4mBxAtrprW7DvVgVOmdQKME/eb950WtW8xTOiy++DjtuZEInHPQ++1etul8doTQGh/nDSnfsItyDxTB/TugM7t7LUMbVrmeX6v9GV1bt/coNGyorrWKHial3PNH5E5ORHDx6ECJc6OTjH8O/YrAqC5ZMndbgI1a/laNmuMg3q0w8mHd0MkEkHrFk2Et6RNHicf3s1SK/vds0MwQicEWGH3KGSrEgYCFhBYj3aGvmHO3urj1esKbf+e7dCkcS7+fPEgyyVjmcauWZNc9OveFoC3sdEs4Ldu0QTfPHM+brtkUPQj73Ldh0z0Okle+nHeSzdKl/AyXiDBLgSSYchq11natRZLiVYvqNWSRdivs6Uhs8fPJvOtOdakCXF6HLLSzJOd287vOuyApFJul2dlNfFm0Ox8f51sfZxslfSPsOd+raXehjIjez/vZCRiXNZ9959n4OYLXcYmc7gVXgRmDbNXts+i49IcNyjeKafO7CkkwTx21JiWYs2PQeYgoF/6UpT4utfbCOzNPSz9f/jIWXj+zpNt6xcrxp/0lJMTcfVcZfdBG9sVAJedELwnql+sBTtv768bMw2re+dXltWqqPXhZnmNcMFJB9oLiA6Feem2+nu0dN0u9xeGCAl2KeLlu2P2Afr3MTcRS2odbt5Tq45g2QdSsB/qhm17PV8TtDZcJpRpHmpmrLze3C51XnO2y505ZEKNU2BZmzuT6o3SA9kvUtIcP5pQ80TTspncwFuGnQenLG83yJaOgPjupijOXVCzAbv2bIb9O8bHUDObHcho1byxQZtfXSNx/FCzufAPB8afg/F17dCmafzHou6n+Z568YTt3C5e62gZxy6MZTNNCKlXMKiXtQY2kapJ2xUxLnGbE1l9eHrtT27SS+e0SCRaJy9FKroyvTxPZ6Fet7zqkFJ/7+Ys2+6+EiFCgl2K6Nc9FlZB3zlSqeb1WpIsvdfwKUf274y7dF6SGprn5m9Lt3msVfCYv1pbNGuMP9g4NwDxg4yfCd4re8qqXaWTDb5BhyAwt79tyzwc3Kt91Ng5kiNPZ4XmZJGns1v0q3Exk5NjXIpNZClIlneiHJhAMHLWsz1e+vsfcKXFVn49usidE/ReipFIxBBgNW4ZVffzSNZZOjBo1xx7yH448dD94+6b0Ss2drxZXiPp8zDfk2NttlCzekZ2nsmJcPygeBvW5+44CQ/deLRBY2eHH693W6cDxeQRbDptZYMdkF7BgCwSAxCrk/6jz2kVomPbZjGNnYc6OI07siHAzbBgDkScrpBgFwJBLcUathFzkaVlsWonMAdZlKV32sfQzOO3Ho+zJN5jiUyIijxChW+0ieWmCw7Ba/eciq+fOg9d2jePDr6GicOi2vrBTLMB8rIVkRsqKr15mOppFOAIrijxk2lubg6ev/NkHKHavTg9XfP1eapRePcu+gCz8df56Trma7zk4aTwct2X1Xw6t2sWJ8g8pHo6mrVr/Xu1QycHuzsAOKhHO8sPi7uvOQK9urbGq/84xXDcSuADgOMGdsWlp/aNOXPEhQeKv+bKMw7CoX074u6rj1CXvo3njSuxsR9XmGLoaTz11xMMv+3scq0eUXebNnrJxxL1PuREhI3k8YP2j773ToJFm5Z5+N+zF3iysTZvM2YW1vTvj7l8q3vhdcXITRgR2cd/BLoxISKCYv/9qsOjbXj/QXmEhr4HtNXdU+ub+sD/GUNt1TnssCL7mHDySAaAwcf3wu2qQ1UqwjH5hQS7EAhqhbONzt1c/xVk9orU+kOOhUpbC/kwoLdxSUSq2ZD0rYF9OuCMo+XBHa3QjLplAp75i9gcly0S8Fur1aFZXm7UsxSQPyfN/qixafDSBsjXh56KZ28XwUEv/EMfTx5gg4/vBUDsFnDpqRKDYIfZwu50x7ZN8dCN9pu/a+zvYlskrSwr7zKvSzxDTu6N+284yhDkOqiltJyI/9Ap+usO72ftBelIREw+z90h7MNOPfIAHHFQJ1x6al90Vp1UzLZt+f27RDWYXiLb6+vcuFEu3rj3NPQ5wHi9zBGne5eWuOy0vmiUm4ObhhyC6889GGcc3R0XmTyKZRNgx7bN8PRfT4yGFiq3+QjR97E8i8m0VfMmeEUnjPoZM9u2yjPssHPducFsq6V5abZtmYerzjoIL98dq6dmL+hG25zXOBeKzZeDlXOCdoXZrEO/y4hM6JCZY1hp1/QxTD9THYT269DclbDc0mI7tPtvOBoXnNQbB+7fBrdePAhnHhP/wS/DzVhi9haukpkT6Ni/Uyw8lNZdrDyO9eTkRHDu8b3w2K3H4am/nOCYPizSV+TMYpzsdtyiKMDVZzGUV9YYjj/4x2MwZOjIuPTN1I5vnkBOPrwbTj68G0ZMW+1Y5p8vGYSnP/4de8vFsqDZ7d0tfQ5oC76+2BBC5Lk7TkIL07Y9r9x9Ctq1zsONT/wEQNgnnXjo/nH5aYx4fgguvn+04ZjT5KuNG27skbTlHfPkpmlMzHv86uPLmXdoMPPnSwbhijMOQsvmTXDyEd3w/OdzDeed3ptjB+6Hwq174pZ5br7wEJx/Ym80bpSL9q2bYveeyrhr+x7QBqs3lQIQGhuNa85m+OonHpdem7isbm1UuLA0bDfSuFFunFbmxEO7YvGanYZjfmz3zM/fS/fTP897rhPLzDW19aisqsVvS7cZ3oPjB3XFrMVbo7+bNMpBta6vnXRYrH2yfWpr1I+dS07ta3Cw+Pqp89BEsnuHFW7aJxPO3r7/DMPvFs0aG+KIaWzbVe6Y/wmHdsX7Ixajc9vG2FFSA0VRMPzp8zB/ZRGOH9gVn49bEXeNef7ue0Bb5PfvjIIVO+LSfjrsHPzx8QkAgEN6u7PR269DMFvtXXdufxzatyMOObBDXODd0/IPAF+/Gycd1g3rVpWgSeNcg83iFWf0M3jj2g05Hzx0Jq5+ZGzcce1DwKzN1QvvjRvlYOQLF+Ki+0ZFj33++OC4/Mwa57OO6YH8g7sYvJzbtW6KP188KOr57kT/nu3xt8sPQ5tIEf797RYA4tl27dgCt13ifynT7r3O798Z3Tq1xOaifQBEyJ03v11oW0cz2kfGgd3aYO3mUtu65PePDxF24P5tsHZLKbp1Su2WjjJIYxcC7VoJuwLN/duMl6Xa687tj1svHhQdFO1svbR4RVZBWbUtJo8ZsF80eLGmPTvq4C74/PFzcciBHQyBIv0IdUBMXd9VN9gO6N0hzjuyb/e26NCmGW66YADOOa4nLj+9n+VeqAN6t5faksnCSHymC1Oh3e9mLkJBaLEATzzMqFU8V9W2mTm0b0zL06dbG2kajUa5OYYQI6NfusjTnr7Xnt0fnz9+Ltq1bmrYTP3iU/pGJxOzkKPFaTvCIj6clW2TNshavaqa8NhetaH5dNg5+ORRXVw8F8LH+ScdiM8fP9dwTF/e3686wjkTyMOduKVbp5bRD4m8xrlo16opOrdrjh77tY5uAP72/afjnmuPjAvo7dWuUQsM3LpFE7RpGRPOWzRr7ClgctSWyaHJ157TH6fl29uSRlHzcuNlCwAd2jTD6JcuwmmDRH9WFBE+5sRD95d+aN115eF46774fXG1pTrzvdRv/u7FrMPOVk+2T6yMRrk5lv3lgM6t8NRfTozWz7yd3v+dN8CgadPGk1YSLVeLZo2lQeLPO6E3ht1yXFyAajPm+yLzaNXmi/Zq5ISuHVtIP5yHnHxgXIgju3IHH99LupWgH2IrTdbvXiQSMezg1LaVWOo2k5sTwUV/kMe01PaHlsUgffimY3C/zc5KAPDEbcfjyduOx7v/tA/8nwpIsEshD1/VDV89ORi992+N68/tj3/qgosCwH3X56Nj22bRmF53Xnk4/mWznYl56RSwF+xyc3Iw/OnzLEM9aB2nW+eW0a/bv191BG67ZBAevfnY6GQThG3BXy87FENOPhC9HYQdjUtP62cbVPLjf50d3e3BTYTydq2b4tC+HTH0unxce05/XH56v7i9bo8fJAY4fXu7d2mFb545H6cfZRSOD3TRjjtNTiSP3XpcNHq9m51HmMW+pgP7iPcgJycSfUZ/vUy+BHymLoRMs7xcnH1MD/xpyCG4+iyG14eeahS+ABzQpRW6d2mJq3XG+U0a50S1glbhD/Iai3umCbPt1f0vtWUst4o3vYCjp1O7ZoYPFP1+p41ycww7HzRpnIuObWOCQFMHuybz7gz3XHsk3nngdMsYcN27tMJp+d3jtDf6pa+uLrRFJ6gT6jED5AHD3ZITEfa3D5rGFzPXnM1wz7XuAmVfqgZc7tOtDf6qBqN20880ecy85PfYrccZTAPOOrandJlP26VDFuvODw/deIwhRp32AXTHFYdFtw4MEm3pHRABr81oO0nceaX8I0Ubq1vq+pkWN9DNx8pNFxyCoyXvk7abiGZ28e/bT0LblnmG+I5B8PwdJxs+Ms1o5bXSCdUnH94Npx55QOzjOxruxL4sTbB77FaxjCxb5h/xwoVxZghHMCHInXdib5xxdHdcdno/dO9iNEc5bmBXnOwgSLdpmecqsH5KUBSlQf83d+7cXnPnzlUqKyuVZDN37lxf123cvkeZt2K7UlZRrfy2ZKuyt7xaWbu5RKmvr4+mqa6pVW56coLy/ZRViqIoSsneSqWqulZRFEV545v5ygX3jFDWbSm1Laesolp5+csCZW9ZlWOdtu0qU2pr66TnVm0sVh59b6ZywT0josf8tt0vF9wzQrnv9enK+z8s8p1HbW2dsntPhW2aucu3KRu27bFNc9u/fzbcCz319fWO1/++dKvyn5GLFUVRlMWri5QL7hkR/a++vt7wHuhZs6lEWbu5xHCstq5e2bRjr3LL0z8pMxZuti3XjL7M6ppaZeJvhZZl19fXKzMXbVFqa+sMz762rl559/uFytad+1yXO3nuhmjZS9fuVPaWVysVVTWKoijKzEWblS/GL1dqa+uiaTRWFO5Svpu8UlEURamrq1cmzC5UyitrpGUMfXWa8ta3C5QL7hmhfDlhheu6mVm3pVTZXLRXURRF+e/YZXF1amjMnP278spXBcrOknJf1+/YXa68/8Mi6VizZM1OZcmanY55LFmzU7nt3z8re8ur485VVNYoo39ZY/keJ4L23tfV1Su1df7znzJ3g+U49Mg7M5QL7hmhlOwV89cF94xQbn16omVeKzfsVkr3ifF99uIt0X4k4+ff1ysvf1ngu95uxvy6unqlQu2T2jxlfk7T5m2UHnfDlqJ9yoVD5X2wYMV227HXbmx1ItnzXWVlpTJ37lxl7ty5vRSJXBNRQgn2kz4UFBT0ArBu4MCByMvzv/G6y7KQnx/sVlJuqK6pw9K1uyyXD1JBWG1PB8orazDr93k44w/Om0m7oaKqFne/PBWXntYX5xzXK5A83TBq+hoM7NPRlXZSTxDPvrKqFjuKy9FDEshY45f5m3FQz3aul4xkbC7ah64dWgQSxkRRFMz+fS76DxgUNb9oaDTkfh9G20v3VaFJ49y08NhMl2dfXVOH+nrFsHNQskl226uqqrBkyRIA6J2fn19oPh/+0yeSTpPGuaEKdQ2d5k0bo22L4Lpas7xGeM8iPEAyudDCNiUVNM1rZCvUAXBcKnFDNxcewW6JRCJo0iinwQp1ROqxMl1oyLgJY5JtkI0dQRAEQRBElkCCHUEQBEEQRJZAgh1BEARBEESWQIIdQRAEQRBElkCCHUEQBEEQRJZAgh1BEARBEESWQIIdQRAEQRBElkCCHUEQBEEQRJZAgh1BEARBEESWQDtPALkAUF1dnZLCqqqqUlJOOtKQ2w407PZT2xsuDbn9DbntQMNufzLbrpNXpNtq0F6xBQUnAfgl7HoQBEEQBEF44OT8/PxfzQdJYwfMAXAygK0A6kKuC0EQBEEQhB25ALpCyC9xNHiNHUEQBEEQRLZAzhMEQRAEQRBZAgl2BEEQBEEQWQIJdgRBEARBEFkCCXYEQRAEQRBZAgl2BEEQBEEQWQIJdgRBEARBEFkCCXYEQRAEQRBZAgl2BEEQBEEQWQLtPJFkGGP9ALwB4CQAFQC+BvAA57w81Iq5hDF2BYDrAOQDaA9gDYB3ALzHOa9X03wC4I+Sy6/gnP/PlN+9AG4HsB+ApRD3YpIpTSsALwC4HEBTAFMA3Mk5LwysYS5gjN0I4GPJqbc453fo0g0G8DSAAQA2A3iVc/6GJL+Mabtal6kATrE4/SDn/FnG2GMAhknO38c5f9GU3/8BeAhAL4j36AnO+XBTmsYAnoB4n9pCRFb/O+d8gd92uIEx1hfAvQCOAzAQwArO+UBJupQ/62SPIU5tZ4zlAhgK4HyIdjcCsBjA45I2FQLoKSmmE+d8py5dWrRdLcPx2Yc1xoX97NU0drsYHM85n62mmwr5eHE053yuLj9XfZwxth+A1wCcC0AB8COAu/XvUSK4mdvUdBnX50ljl0QYY20hHlwriAc5FMA1AD4KsVpeGQqgCsB9AC4AMALA6wCeM6VbC+B403+T9QnUF/8ZAG9BTBKrAIxhjB1myusrABcCuBPAVQD2BzCJMdY8qEZ55FwY2xUVWBhjxwMYBWA+gMEQguCrjLG/6DPI0Lb/DfHP9G313FhdugpJui/0GTHGLgfwKYAfIO7TzwC+UgdNPa9ADI7DAFwEoBqi/fsH1io5h0A8l9UAlskShPGsUzSGOLW9GYRAvgDATQCuhpjgJjLGLpCk/x/i34cSU5p0aTvg4tmrpHSMS5NnD8S3+XgAswHsADDXlHaGJO1yUxrHPs4YawRgPIBBAP4PwC0ATgAwijEW8dFOGY5zW6b2edLYJZfbALQDcLj2lcEYqwXwBWPsSc750lBr544hnPMi3e8pjLGWAO5gjD3COa9Sj1doX24yGGN5AB6B+Np5UT02DeLL/2EAV6rHjoXoGOdzzseqxxZDfE3diJhgkUoKbL4SHwUwj3N+s/p7CmOsB4BhjLH3Oef1mdp2znncQM8Yex3AYs75It3hertnr/IkgG855w+qv6cwxg4G8DiAcWre3QD8BcBdnPMP1GOzAawDcDeA+xNojhOjOecj1TI/AXCUJE0YzzoVY4hT2ysA9OacF2sHGGM/ATgIYtL50ZR+u8NYkE5tB9w9eyD1Y1w6PHuY26wKHkcAeJ9zXmtKXuJwj9z28csAHAZgoNZOxtgWCMFxMIwfln5xM7dlZJ8njV1yOQ/AJJNQ8B3EV4JZU5GWmF58jfkQauT2HrI6AUAbCJWylncdgG8ADNZ9hZ0HoBTia01LtwGiQ5/nqfJJRu3QpwMYbjr1JYQ6/kj1d1a0XV0eOBrA5x6v6w2gP3TtV/kSwNGMsU7q77MhNreO3k/O+V4IwSGp7dcvvcgI8VknfQxxajvnvE4v1KnHFAgNnh9Natq0XS3btv0eyLpnb8EVAPLgcRxQcdvHz4P4gFyqSzcTwHoENBY4zW2Z3OdJsEsuB8Ok3la/AtZATHSZyskAdkOo4jX6MMZKGGM1jLH5jLGrTNccrP41q+WXAmgJoJsu3QrJgLMU4d2zJYyxOsbYOsbYMHWZAAD6AGiC+CUMbTDS6pvJbddzPYB6iIFNTzPG2A7GWC1jbAVj7HbTea39VveJ6dJt55zvkqQ7iDEW5ngV1rNOyzFEfRYnIL6dAHAdY6ySMVbGGJvAGDvSdD5T257qMS7d2q9xPYCVnPPfJedOYYztU5//r4yxM0zn3fbxuLbr0iWz7fq5LWP7PAl2yaUd4m1LAKAY3rRdaQNj7CgIO5tX1K8SQHzl3AvgYgibgE0AvmbC+UCjHYAqznmFKUtNE9Bel65EUnQY92wrhB3IjRB2dj8A+BeA/6jn26l/S0zXydqUaW2XcR2AaZzzTbpjqwE8AGEDciGAWQDeZMKpQsPLfTKn0dI1hhgkwyKsZ52u78SdEAL5S6bjowDcAeAsiCWlAwD8whgboEuTiW0PY4xLp/YDANRlyJNhsqFVmQaxnHoegBsARAD8xBg7XZfGbR9Pedslc1vG9nmysSNcw4SX0ncAfofOwJRz/pop6UjG2GQI+6lPUlbBgOGcTwAwQXdoImOsFMBjjLEnQ6pWKDDGjoP4gn1Gf5xzbl6OGcsYA4AHGGMvcM7LUlRFIkUwxk4B8DyAFznnv+jPcc7v0v38hTE2DsAKAP+EMILPSLJ1jPPBtRACW9wyLOfc4B3PGBsFYCGAx2ByMkk3rOa2TIU0dsmlGMKd20w7CHVvxsAYawNh5F4O4ELOeY3DJd8C6KGznyoGkMcYa2pKp30V7dalayvJL13u2Tfq3yMR+yJra0oja1Omt/16AJUQHo9OfANhp6JpabzcJ3MaLV0NgH3uqpoUwnrWafVOMMYOBTASwoPwAaf06pLbZIiQEhoZ2XYJyR7j0rH91wGYxTlf65RQXT4cCffPXt/HU9Z2m7ktY/s8CXbJZTli6+8AokbYfSC+YjMC9YUdBaAzgHMl9hFu0OwPDjYdHwBgL0T4BC0dY/Eu7QOQfvdsDYSrvqxNQKy+Gd121abwKggPuj0+srBrPwBwXbrOjDHzcsMACJueoIzc/RDWs06bMYQx1gdCgz0PwA2qA4UfMq7tLsnaZ6+WfThErDs/ThMabvt4XNt16QJru8PclrF9ngS75DIWwBmMsQ66Y5dAeBQF4a6ddNRJ/RsAhwIYzDlf7+KaCISL93qd59FMCI+gq3TpctV043WTxFiIr5VzdOm6QwRqTId7djVEsMwC9Yt0MlR3dh3XANgGMQECmd/2cwB0hPsB/WqIEBlLAYBzvg5iQDIbm18DYI7uHfkJwjkjej/V8ANDEPKzD/FZp8UYoi5V/QTR1os559Uur+sI4AyIILQaGdV2GSka49Kt/ddBaNXMXqJSVGHkYhifvds+PhbAICZCImnpjoMIbh5I253mtkzu8xFF8fvRRTjBRLyfJQAKIeJ4dQbwMoQr89Xh1cw9jLH3APwZIr7QL6bTyyDUw59CBF5cDfHi3gLhbHCD3gaLxYI4PgjRKW6BiFd0LOd8oS7djxBxkoYC2AMRpbwdgEE8hTt2MMYmQHTsJRCD0WCIoL0fc85vVdMcD2A6hJ3NFwBOVOt7O+f8XV1eGdV2PYyxryCM4bual+AZYwUQz59DeJBdBTEBPMI5f1qX7gqICeHfACZCBCb9O0RMp3G6dG9CGF4PhQhtcC9EbK1BnPMtSWxjc8RCDtwO8XV8j/p7Dud8fRjPOhVjiFPbITwEZ6nHrwewXX89j+08cA1EoNdxEFqKXhDLtd0gdh+IahvSpe1qOU7tB0IY49Lh2WvCjuqtugEiptuFknxOhgj0+4Na3/0g+nc+gLM451N1aR37uCp0zYVwqHgQwh/gBYh378QEtMX6OtvObZzzPZna58l5IolwzktUj6DXAXyP2NYgyQy0GjTa18XzknOnAVgE8bXyCMQLWAPxYl/IOR+tT8w5f1E1rL8LQBcIjc75+hdf5RqI3R3ehvhKmQKxdU+qBZvlAP4E4dnXCCKa+AMAXtUScM5nMcYugujU/wdgC4B/6Du9mi7T2g4g+jV9IYBPLewqV0N4wnVVfy8F8CfOuWErNs75t+ok8hDEQL4GwLV6oU7lHxB2Nk9BxIaaA+DMZAp1Kp0hbKb0aL9vAvBJGM86RWOIU9unQgSLBYRtnRltaWkdRFy7lyEmq1IIT8nL9UKdSrq0HXBu/yiEMMalybP/RP33qRAC+j2QsxXiw+4ZAB0g7NVmAziVcz7DlNaxj3POaxlj50JsKfY5YluK/T0IoU7FaW6bmql9njR2BEEQBEEQWQLZ2BEEQRAEQWQJJNgRBEEQBEFkCSTYEQRBEARBZAkk2BEEQRAEQWQJJNgRBEEQBEFkCSTYEQRBEARBZAkk2BEEQRAEQWQJJNgRBEEQBEFkCf8Pn/9H/koRoC8AAAAASUVORK5CYII=\n",
+      "text/plain": [
+       "<Figure size 720x360 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "df1 = df[df[\"name\"].isin([\"Latency\"])]\n",
+    "ax = df1['issue_to_done'].plot.hist(bins=BINS, alpha=0.5, figsize=figsize)\n",
+    "ax.set_title('Inference time (usec)');\n",
+    "#ax.set(xlim=(0, 25000))\n",
+    "plt.xticks(rotation=60)\n",
+    "plt.show()\n",
+    "\n",
+    "ax = df1['issue_to_done'].plot(figsize=figsize)\n",
+    "ax.set_title('Individual inference time (usec)');\n",
+    "#ax.set(ylim=(0, 200))\n",
+    "plt.show()\n",
+    "\n",
+    "\n",
+    "# df1['issue_to_done'].describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAApwAAAFKCAYAAACwxI8KAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAA8y0lEQVR4nO3deZgkRZn48e8A0oBcw+24ooL4Ao6iDKyKuIsiIqDIKoegIigs7iIeC4oiIiIiyqh4gNeq4G9REF1PEA9AVJDDwUWG4wURmMVZEbkVbJCZ3x+RNZMkNdPdNZVd3T3fz/P0U12ZUZmRkUe9FRkROW3hwoVIkiRJbVlh0BmQJEnS1GbAKUmSpFYZcEqSJKlVBpySJElqlQGnJEmSWmXAKUmSpFYZcGpSi4hjI2JCje0VEQsj4nODzofUi4g4LSJuGXQ+JqqIeEp1jh8w6Lw0RcS0iLgqIj446LyMRkScHRHfGHQ+ND5WGnQGpKYxBJAHtpqRCSIiVgPeBfwsM3824Oy0KiLWA44EXglsDDwAXAF8MjPPHWTexiIitgSOAZ4LPAG4C7gRuDAzjx1g1iaMiHg58GbgH4G1gbuBy4CvZOa3B5i11kTEDOBfge9k5v+0sIp9gacBn2xh2W34MPDriNgqM68adGbULgNOTUSvb7z/V+B5wBsb0y8B/gs4cTwyNUCrAe+v/v/ZAPPRqogI4HxgPeArwBxgOrAfcE5EfCQz3z3ALI5KRDwfuBCYD5wG/AGYAcwC3g0cO6i8TQQRMQ34PHAw8Fvg05QyWh/YFfjviHhtZn5tcLlszQzKuXwL8D8tLP+dwDcz888tLLvvMvPKiPg1cASPve5rijHg1ISTmf9Vfx8RLwH+sTm95u/t50ptiojHAd+kBJj/lJmX1+Z9HDgDODIi5mTm2eOct8dn5l/H8JGjgb8C22bmnY1lbdjXzE1O76AEm58B3paZC2rzPhIRLwMeN5CcTWIR8Rzg2ZTjbzI5CzguIg7NzPsGnRm1x4BTk1pEHAu8PzOn1abdAlxPqfmcDTwDuAl4a2ZeEBF7AMcBTweuBQ7OzDmN5T4dOB7YEXg8cB3wocz85hjytg+lNmMTIIEjM/O8Rpq1qjR7AhsBt1Fq907IzEci4inAzVXy90dEp6bzdOBjlBqiPTPzW9Xyotr232XmZrX1/D9KIPfk2rRtgQ8ALwBWptQovi8zL2zk8QnAB4GXUwLC3wOfyszP1tLsQKnV2w94KvDvlJrKi4FDMvN3IxTXq4GZwDH1YBOgKodDgJ2r/J7dWOeL6k0NamV2YGaeVps+4j6t2uV9pUqzB7APsEH12RuAwzPz443yeRZwFfDvVZlsClzbDDarbbm98dndKcHXcyg1fH8EvkHZD3+rpTsNeA3lmD0V2AG4DzgxMz8VEc8APkW5E3An8N7M/H9dtuvFlLJ+DTAE/BA4rJmvbiJiP0qwOBP4G/BT4F2ZeXMtzdOAE4AXAutUefkV8JbM/L+IWBU4inI+vKMRbHbKqHmOrFctc3fKrfffUZpYfLGW5imUff6eqlyOoJxPlwBvAuZV8/6Nclz+BHhjvSawdt34KHAS5boxDzg+M786ivJZ6nlSO14BvhIRX6n+/0CnmcUyXnf2AB4BLmjk61ga18hq+gGUY+KpmXlLNW3rav3bAmsAtwM/B/41Mx+s0kwD3kK587QZpby/T7m+/bmxjp0o5b4NMI1yDn02M/+zluwnlOv0zlTntqYmOw1pqtoE+DpwDuU25trA96ovzU8BX6O0sdsEODsiVux8MCK2oLQleybly+dwyhfn2RHxulGu/wXAZynBw3uBVYDvR8T2tfWsSvkCOoDSNOAtlC+LYym3HAHuoHxJAnybctvp9dX8uZR2b/9UW+8/AQuAp1VfgB0vpHxxdNb9z8AvKEHBcZR2k0PAj6svxk66DYBLgZdRAp23Ves9NSK61aS8C3gV5Qvkw5QA6IwlltJir6heu36xZ+a9wHeBLSJi01Es71F62KefBrYGPkQJgm+kBE7d0r4OeIhSUwPldulzImKrUWTtQGCYcky+lbL/30G5Fd+0AnAu5fbzOykBzScj4kDgR8CVlP14H3BaFfw1fZJya/844AuUIOXHEbHy0jIZEe+mHKM3U8puNrA9cHFErF+leVyVj+2BUyg/Ok4FNqTcSoZyXqwLfC0zR7wzERGrUM6RA4Ezq+2+HfhClaem1wCHUWpPP0Y57s+m/FB5BWXff54SFH68y+c3Ab5FadpxJHAPcHr143Fp+RzNeXId5ZoDpew75/J/V8tY1uvOdpQfOg+OIm23bVifEvxtWq3/LZTjcCYl+O34LKXsLqu28wuUH8wXVvurs7zXU46HDavlvQu4HNitseprgQcpx4amMGs4NVVtRqnR+wVARFxHufh9GdiiUysTEfdQvoBeRKmxgfKlPB/YpnbxPiUifgycGBFnZOZIHZtmAttl5q+q9ZxG6TRyIuULGUpgsTmwdWZeX037QkTcDBwfESdlZkbENykX+d92aW5wMY8OOF9IqbXaoZp+VkQ8CXgyJXiqt6H7JbBTZ1uqnvW/odQmbVct73hKIPrMzLyjmva5iPgicFREfCYz76mtfxVgq8x8qFrm3ZSgaGZmzl1KeW0J3JuZty4lzVW1tDctJV03Y92nfwF2aARFXwU+GxFbZua1ABGxAqWjxjmZeVeV7qPATkCnfdovKEHT+fVay8prM/OB2vvPR8SNlP3/zsz839q8xwFnZeYHq3V/vdqmLwGvz8wzquk/odTUHUD326s7ZOZwlfaa6vP7A//ZJS0RsTGl5u7YzDyuNv1M4BrKcXwUZb9sAuzVqJE7vvb/ltXrb7utq4t/pZxLB2Tm6dV6T6Wcy8dGxBcbNcn/ADytc0xWPyTfQ2kH/ZzMfLiavgHwmog4pBGgbQbsl5lfr9J9gXJOnBQRZ3erka1t40jnye0R8UNKsP+rLk2ElvW6sznlLkWvtqP8AN05M39dm965q0JEbAccAryhXusbEedRjvP9KdewNSlB/5XAC+tlXF1/FsnMv0fE/7L42NAUZQ2npqobOsFm5bLq9Wf1W4C16ZsARMQ6wEsoNZOPj4j1On/AecATKbc1R/LrTrAJUH0pfg14QURMrybvTQn6/txYTyfw3WEU6/kF8Kzq1jyUIPMCSm1LJxB9YS0twFZAVPlZt7beNSk1HM+NiNWqL4Y9KbXECxt5/DGwKqUXdt1XO8FmY52bjLAdawD3j5CmM3+NEdI9So/79ItdauDOotRG1js37EAJchbdvs7MCyhl/gNKsHR49f/tVW0ktbQPVHlcISLWqvL0S8rtx627bM5/1j57D+XW9N8otfmd6UmpmetW5p/vBJuVr1ZpX94lbcerKJUTZzXK7l7gasqPNSg1qwA7R8TjuywHyjEGI+/rjt0otfyLgrPMfAQ4mRLgvaSR/luNH0Cd8/u/OsFmbfrjgCc1Pv8nFtdUUwVK/1mle1a3DPZ4njSX0Y/rzrqUOx69urd6fXlVW93N3pQfY+c18ng9pea5cyy8lLKvT2zWuC4haL6b0tRBU5gBp6aqefU31S1ZgP9tpOtM7wSBT6N82R9L+aKr/32sSrPBKNZ/Y5dpN1SvnXaUT6fUhDXXc+kY1vMLynm8fa0m8+fVXz3g/FOtFrXzxfWlLut+W7W8dSltCqdTRgdopuuMndfM47zG+84X4HSW7n5GDiQ78/80QrqmXvbpY2pQM/Nu4HvAfrVamtdRhjw6p5H2ksx8JaUpx7MpNY0LgS9HxIs76SJiZkScS/kSv6fK00XV7LV4tIcz8/8a0+4F/tCl5u1eupf5o47LKqi+GXhKl7QdnePleh5bfttQlV31Q+7jwEGUH1E/jYi3RcS6tWV1gtLR/mh4MqU98iON6ddVr818N4+/0Z73HTd1KcvOedtcV0cv50lTv64700ZOskQXUTruvR+4MyK+HxEHN348PB1YnRJcNvO5YS2PnWYvS7ur0cz3hBpPWf3nLXVNVc0vqJGmdy7UnR9hn6C0l+tmtBfRkaxAqY388BLm/34Uy/g1pf3TP1GCm/sptwDXoNxyXIcScP6ysV4obVuXdAvujmp5UGrPvryEdNc03o9UvktyLfDsiNg4M5tBQ0enhqlTLkv6glqx8b6XfbqkdnBfBfYC/ikiLqN0wPlao1Z3kapW7Srgqoj4FaVt4OuAC6pa6QspPdrfS+kM8yClNus0HlshsKTbub2W+Wh18rEL3UeEWFRWmXl4RHyZ0sHnpZRg6eiI+OeqGUInUHwm8J0+5a+u1/N+WXTKZyznyZKWsSzXnT/T/UfGqM6TquZxr4j4R0qN906U9pnviYjnZeafqnzeSWkr202vNazTWdw5UlOUAaf0aJ1g5u+Z+dOlply6zbpM69QUddop3gSsMYr1LPGXf2Y+HBGd2+drAZdUvbovpQQHr6S0jfpi7WOd2rv7l7buiLiDEsCutIxlMRrfp/Rw359Ht/nr5GVNyrZcmZmdfdT5clu7kfzJjff92qdQbm/+iXJbfUPKbcP/t9RPLNbpfd/pQPMiym3EPTOzU6vZ6dnbls0ot3k761qJMqrARUv8xOLjZV6n7erSZOY1lADrw1F68M9h8VBIv6Tst/0i4oQuNZdNt1I6YK3YSLt59XrLSPkZo00jYoVGLWfnvF3SusZynizpXO7HMXodZV823Q0QEWs3mhs0zxMAsowScTlwTETsQgmAD6a0Ab+JEohempl/WUpeOsfMTErN+BJVx+CTWHKgrSnCW+pSTfUr/kLg4Ih4YnN+1ZNzNLaJMgB453PrUgKqS6pbs1Daim0bEbt2Wc8aETFUve10KlnSbelfUHoe70TVE71qN/VrSk/badR6qFMCgN8B/xERj7m12dnG6gv+m8Ae0aXH9RjKYjS+RQlS3h0R2zTWsyKl09R0qo5PlVspNVf1TlNQekcv0sd92rkFfQalzd6bKLd7L2ks78VVZ6Kmzn7ufAF3Aqj6kF4rAP8x2vz04JDacQUlwF+bRpOAhm9R8npMs8MHLBq2iIhYswoe6q6j1ICuDYuOyw9TAsaPLWF5L43yFCIobV/Xp5w7nfkrUJp+DLO4vXO/bEAZCquzrlUpTQRuYwkdncZ4nnTGc33UudynY/RiYMsqz3Wd4G/ReVLdJn9DYx3Tu+yPK6vXtavXsyhxwzGNdETEirX26T+mNJ94dzM/XdaxJaWz4SVoSrOGU3qsf6NcvH9b9TK9ifJF9FzKxbHbcDNNc4EfRMSnKbUf/0q5zf2eWpqTKEO1fDciTqcEgqtSagX2otx2vCUzH4zSm/g1EXED5ZbWzZnZ6RDxCxYP8VQPLH/O4mFyFj02LjMXRMSbKLV111a3QG+j1Lz9MyUA6jT+fzelY8yvqrK4hvJl+WzgXyhfFMusqql9NaWJwS+rPNWfNPQcyniI/137zL0RcTZwWJTHod5EuRXYra1bP/Zpx1cpNXYvpftTgz4FrB4R36YEXCtQOgC9nrLvTq7SXVy9P706Th6mBLKrjyEvvbgwSg/3p1CGEJpLGde1q8z8fZQhiE4CnhwR36G0N30qpdb5LEo5vJjSq/qblM5M0yjB2xrUOuJQhlTanBI07lDtw/mUwHLnajmdAPOLlHPnS1EGNv89ZSinHYH3ZJexTpfRjZShjJ5DOSdeR+lg99ql9FCH0Z8nN1FqHP8tIv5CuTbMrUZwWNZj9LuU4Z9ezKN/QPyY0rb1SxFxEuXHQ6e96ca1dG8ADq2O25so16IDq/TfBMjMn0fEKcA7q9rrH1EC/6dRjt1jgNMy876IeBulicGvI+JrlGP9GZQmI6+qrXcnyo+SH42wfZrkrOGUGqpevttQOojsz+IxBVcC3jfKxVxcfWYfyjBDw8AembkoIKxqe3YAPkKpfTiZMrzMFpRhaP5YW96bKLf0PkZpK/ZvtXm/otw+/xuLb9vC4h7iFze/LKt8PI/SQenfKUOYvJHSAeYjtXR/onzh/Sfli/4zlGBrI0rv676pyn2rah07UcYzPIkSbL4hM7uV/WGUL9o3U27Fz6NRc1Nb9rLu086y/ofFtV3dnn51BKW3/86U4OqT1facQXn60C3Vcu6i9ML+X0qgcBSl1/f+Y8nPGL2N0sb3/ZThbb4HvHRJbVA7MnM2Zf8/ROkA9XFKIPUzFg/WfRVlSK5dKdv9QUrQuUd9mKTMXJiZb6IEq/OBt1PaCh5O6Ty1e2dYomoYqRdRAuLXUo7/J1AGIm/jkba/p7TL3ZEyvNV0ygMElvqYzdGeJ1Wb3tdTztVTKOfyntW8ZTpGszyL/EpKT/L69Icp++omyj55a5XPzzQWcRHl+rE35Zg9inINenHtxy2Z+RbK9Wgdyh2HEyk/vr5BbdD5LA9deDnlmnIUpTyfT2k+U7c38O1ax05NUdMWLrRjmKSJKSKeSQmcb6WMqzohvpQi4grgocycFINVx+Knyjw/My8dIflyKaonDWXmywadl15FxL6UWuEnt1D723dRnmz0a2BWZv5m0PlRu6zhlDRhZebVlJqwAL4dIzwRZzxExLMpNVFLvA0tDciZlJrMtw84H6P1HuCbBpvLB9twSprQqh7cfWkruiwiYialc9Y7KD3Vu91OlwamGtpoNI9UnRAyc69B50HjxxpOSRqdPSm3pVcFXpOPfiSlJGkpbMMpSZKkVnlLvQdz5swZArYF/o8lP8FCkiRpIliRMsLDFbNmzRoeRAYMOHuzLYuHnJEkSZoMmo86HjcGnL35P4CnP/3prLxye51m586dy8yZM1tb/mRhORSWQ2E5FJZDYTkUlsNilkVRL4eHHnqIG264Aar4ZRAMOHvzCMDKK6/M0NDQSGmXSdvLnywsh8JyKCyHwnIoLIfCcljMsii6lMPAmgHaS12SJEmtMuCUJElSqww4JUmS1CoDTkmSJLXKgFOSJEmtMuCUJElSqww4JUmS1CoDTkmSJLXKgFOSJEmt8klDE9jqa63H7Xc9MOhs9MVqQyuxxuPbewyoJEmauAw4J7BHFk7j/CvmDTobfbHjthsbcEqStJzylrokSZJaZcApSZKkVhlwSpIkqVUGnJIkSWqVAackSZJaZcApSZKkVhlwSpIkqVUGnJIkSWqVAackSZJaZcApSZKkVhlwSpIkqVUGnJIkSWqVAackSZJaZcApSZKkVhlwSpIkqVUGnJIkSWrVSoNacUTsBbwWmAWsA9wEfBb4fGYuqNKcBryhy8f3ysxvNpZ3BHAosBFwDXBkZp7fSLMGcBKwJ7AKcCFwWGbe0rcNkyRJ0qMMsobzcGAYeCfwcuA7wKeAjzTS/R54fuPvgnqCKtg8ATgF2A24ETgnIrZqLOvrwO7AYcA+wAzg/IhYrV8bJUmSpEcbWA0n8IrMvKP2/sKIWB14S0QcnZnD1fQHM/PSJS0kIoaAo4GTM3N2Ne0i4GrgvcDe1bTnUoLR3TLz3Gra1ZSa1QOAU/u5cZIkSSoGVsPZCDY7fkO51b3OGBa1HbAWcGZt2Y8A3wB2iYhp1eRdgXuB82rp5gEXV/MkSZLUgkHWcHbzQuAu4E+1aZtGxD3A44G5wImZeVZt/hbV63WNZV0DrA48EbitSnd9p31oI93Ofcm9JEmSHmPCBJwRsQ1wIPCBqoYSSo3nFZSgcC3gIODMiFg1M0+r0kwHhjPzwcYi765e16EEnNOBe7qs+m7GVqO6yNy5c3v52Kituub6zJ8/v9V1jJc771yN227uVqk9OnPmzOljbiYvy6GwHArLobAcCsthMcuimEjlMCECzojYCPgWcDm1TkOZ+clG0u9GxAXAB4DTxi2DSzBz5kyGhoZaW/61N85jxowZrS1/PK277npsuNnGPX12zpw5zJo1q885mnwsh8JyKCyHwnIoLIfFLIuiXg7Dw8OtV5KNZODjcEbEWsAPgQeA3TPz4RE+cjawcUSsX72/GxiKiFUa6aZXr3fV0q3dZXnTa2kkSZLUZwMNOKsg8XvABsDLMvPOHhbTabu5RWP6lsD9wB9q6aLWiaie7voe1itJkqRRGFjAGRErUXqSPwvYJTNvHcVnplGGObq11sv9Ekrv831q6Vas0p2XmQuryedSajh3rqV7ErB9NU+SJEktGGQbzlOAVwDvAlaLiOfV5l1LudV9OmWw9t9RgsWDgB2A13cSZuZwRBwPnBARdwBXVuk2BfarpbssIs4BvhQRhwP3AccB85gA7UElSZKmqkEGnJ2axo92mfci4LeUmsujKbfcH6YEk7tn5vfriTNzdkQAvBXYkNKrfbfMvKqx3H2B2ZRB3ocoj7bcKzMf6McGSZIk6bEGFnBm5lNGkeyVY1jebEowubQ09wOHVH+SJEkaBwPvpS5JkqSpzYBTkiRJrTLglCRJUqsMOCVJktQqA05JkiS1yoBTkiRJrTLglCRJUqsMOCVJktQqA05JkiS1yoBTkiRJrTLglCRJUqsMOCVJktQqA05JkiS1yoBTkiRJrTLglCRJUqsMOCVJktQqA05JkiS1yoBTkiRJrTLglCRJUqsMOCVJktQqA05JkiS1yoBTkiRJrTLglCRJUqsMOCVJktQqA05JkiS1yoBTkiRJrTLglCRJUqsMOCVJktQqA05JkiS1yoBTkiRJrTLglCRJUqsMOCVJktQqA05JkiS1yoBTkiRJrVppUCuOiL2A1wKzgHWAm4DPAp/PzAW1dLsAHwK2BP4AnJyZn+6yvCOAQ4GNgGuAIzPz/EaaNYCTgD2BVYALgcMy85Z+b58kSZKKQdZwHg4MA+8EXg58B/gU8JFOgoh4PvA94DfALsBXgJMj4s31BVXB5gnAKcBuwI3AORGxVWOdXwd2Bw4D9gFmAOdHxGp93jZJkiRVBlbDCbwiM++ovb8wIlYH3hIRR2fmMHAMcGVmvqmWZmPg/RHxhcxcEBFDwNGUms/ZABFxEXA18F5g72racynB6G6ZeW417WpKzeoBwKktb68kSdJyaWA1nI1gs+M3lFvd61SB5IuBsxppvka5bb519X47YC3gzNqyHwG+AewSEdOqybsC9wLn1dLNAy6u5kmSJKkFE63T0AuBu4A/AZsCKwPXNtJcU71uXr1uUb1e1yXd6sATa+mur7cPraXbHEmSJLViwgScEbENcCDwiaqGcno1655G0rur13Wq1+nAcGY+OIp0zWV10q3TZbokSZL6YJBtOBeJiI2AbwGXU+s0NNHNnTu31eWvuub6zJ8/v9V1jJc771yN227u1opidObMmdPH3ExelkNhORSWQ2E5FJbDYpZFMZHKYeABZ0SsBfwQeADYPTMfrmZ1aijXbnykU/N5Vy3dUESskpl/GyHdxl2yML2WZkxmzpzJ0NBQLx8dlWtvnMeMGTNaW/54Wnfd9dhws27FP7I5c+Ywa9asPudo8rEcCsuhsBwKy6GwHBazLIp6OQwPD7deSTaSgd5Sj4hVKMMebQC8LDPvrM2+CXiIxW00O7asXq+vXjttN7ulu58ydmcnXdQ6EdXTXY8kSZJaMbCAMyJWovQkfxawS2beWp9fDYt0AdWwRjX7An8ErqzeX0Lpfb5PbdkrVp87LzMXVpPPpdSW7lxL9yRg+2qeJEmSWjDIW+qnAK8A3gWsFhHPq827NjPvA44Dfh4RXwTOAF4AHAwc2ultnpnDEXE8cEJE3EEJRA+i9HLfr7PAzLwsIs4BvhQRhwOd5c8DTmt1SyVJkpZjgww4OzWNH+0y70XAzzLzVxHxSspThPYH5gPvyMzP1RNn5uyIAHgrsCFlqKPdMvOqxnL3BWZTBnkfojzacq/MfKA/myRJkqSmgQWcmfmUUaY7l1Hc8q6eMjR7hDT3A4dUf5IkSRoHE2YcTkmSJE1NBpySJElqlQGnJEmSWmXAKUmSpFYZcEqSJKlVBpySJElq1ZgDzojYucvjISVJkqSueqnh/CFwW0ScFBFb9TtDkiRJmlp6CTj3AC4GDgWujIjfRsQRETGjrzmTJEnSlDDmgDMzv5eZe1MeIXkwcAdwInBrRPw4Il4XEav1OZ+SJEmapHruNJSZ92fmlzNzR+DJwFHABsDpwO0R8dWI2LFP+ZQkSdIk1a9e6isCjwOGgGnAg8BLgJ9ExG8iYmaf1iNJkqRJZqVePxgRawF7A68DXgD8HTgHeHf1ugDYHfgE8BVg22XNrCRJkiafMQecEbEHJcjcFVgFuAJ4G/D1zLyrkfw7EbEecOoy5lOSJEmTVC81nP8N/AH4JHB6Zl4/QvrfAmf0sB5JkiRNAb0EnC8Fzs/MhaNJnJmXA5f3sB5JkiRNAWMOODPzp21kRJIkSVNTL4+2/ERE3LiU+TdExEnLli1JkiRNFb0Mi7QbcNZS5p8FvKK37EiSJGmq6SXgfBJwy1Lm31qlkSRJknoKOO8DnrqU+ZtQBn6XJEmSego4LwAOiYiNmzMi4inAIVUaSZIkqadhkY4BdgHmRsRXgGuq6TOBA4BHgPf1JXeSJEma9HoZFunGiHgBcApwWGP2RcBhmZn9yJwkSZImv56epZ6Z1wA7VI+t3KSafFNm3tm3nEmSJGlK6Cng7MjMPwN/7lNeJEmSNAX1FHBGxIrAzpTazenAtEaShZn5wWXMmyRJkqaAMQecEbEN8C3gH3hsoNmxEDDglCRJUk81nKcCqwJ7AL/IzHv6mSFJkiRNLb0EnM8C3puZ3+93ZiRJkjT19DLw+20s+Va6JEmS9Ci9BJwnAgdHxJr9zowkSZKmnl5uqa8D/BX4XUR8E/hfytOF6hZm5knLmjlJkiRNfr0EnCfW/n/zEtIsBAw4JUmS1FPA+dR+rTwingYcATyP8iz26zNzZiPNacAbunx8r8z8ZiPtEcChwEaUZ7wfmZnnN9KsQQmG9wRWAS6kPI7zlj5skiRJkhp6eZb6rX1c/zOA3YDLKO1Jl9Sm9PfAaxvTbqi/qYLNE4CjgCuBg4FzIuK5mXlVLenXga0pz4G/DzgOOD8inpmZDyzb5kiSJKmp50dbRsRmwA7ABsAZmXlLRKxMqV38Y2Y+NIrFfD8zv1st7zRgmyWkezAzL11KXoaAo4GTM3N2Ne0i4GrgvcDe1bTnUgLc3TLz3Gra1cBNwAGUMUYlSZLUR2PupR4RK0TEF4Drgc9Tagg3qWavTAnyDhvNsjJzwVjXvwTbAWsBZ9aW/QjwDWCXiOgM47QrcC9wXi3dPODiap4kSZL6rJdhkY4C3gi8D3g+tTE5M/MvlMdevqovuVts04i4JyIejojfRMQ+jflbVK/XNaZfA6wOPLGW7vouge41wOZ9zbEkSZKA3gLOA4EvZ+YJwO+6zL8a2GyZcvVov6F0LNqD0tHnNuDMiDiglmY6MJyZDzY+e3f1uk4t3T1d1nF3LY0kSZL6qJc2nP8AXL6U+Q8Ca/SWncfKzE82Jn03Ii4APgCc1q/19GLu3LmtLn/VNddn/vz5ra5jvNx552rcdvMdPX9+zpw5fczN5GU5FJZDYTkUlkNhOSxmWRQTqRx6CTj/CDx5KfNnAf3syd7N2cCpEbF+Zt5BqaEciohVMvNvtXTTq9e7qte7gY27LG96Lc2ozZw5k6GhobF+bNSuvXEeM2bMaG3542nddddjw826Ff3I5syZw6xZs/qco8nHcigsh8JyKCyHwnJYzLIo6uUwPDzceiXZSHq5pf4t4N+qXuodCwEiYhdgf0pnnfHUabu5RWP6lsD9wB9q6aLWiaie7vr2sidJkrT86iXgPBaYR2lbeQYl2DwqIi4FfgBcBXy4XxlsqoLFvYFbq9pNgEsovc/3qaVbsUp3XmYurCafC6wN7FxL9yRg+2qeJEmS+qyXgd/vi4jtgP8A9gL+RgnYbqIEoyc1bmsvUUSsxuLhiJ4MrBkRe1bvr6heT6cM1v47SrB4EGX8z9fX8jQcEccDJ0TEHZSB3w8CNgX2q6W7LCLOAb4UEYezeOD3eQy4PagkSdJU1dPA71VAeUL1tyw2oLTHrOu8PxD4HqXm8ugq7cOUYHL3zPx+I0+zIwLgrcCGlKGOdms8ZQhgX2A2ZZD3IcqjLffyKUOSJEnt6PlJQ/1QPb+82Z6y6ZVjWN5sSjC5tDT3A4dUf5IkSWrZmAPOiPjyKJItzMw39ZAfSZIkTTG91HC+mKpXes2KwBOq1zuAvy5jviRJkjRF9NJp6CndpkfE4yi3qd8O7LRMuZIkSdKU0cuwSF1l5sOZ+Rngx8Bn+rVcSZIkTW59CzhrrgL+qYXlSpIkaRJqI+DcCXCIIUmSJAG99VI/Zgmz1qbUbG4NnLgMeZIkSdIU0ksv9WOXMP1uytOG3gx8sdcMSZIkaWrppZd6G7fhJUmSNEUZPEqSJKlVvbTh3LiXFWXmvF4+J0mSpMmtlzact/DYJw2Nxoo9fEaSJEmTXC8B50HAW4EnAV8DbqimB7AvMA/4FLCgHxmUJEnS5NZLwPkEYAh4WmbeXZ8REe8HLgY2yswP9yF/kiRJmuR66TT0ZuALzWATIDPvpAyJ9G/LmjFJkiRNDb0EnOsCqy9l/uOrNJIkSVJPAeelwNsiYlZzRkRsA7wNuGxZMyZJkqSpoZc2nG8BfgZcHhFXADdW0zcDtgXuAg7rS+4kSZI06Y25hjMzrwWeSemJvjawZ/W3NvBJ4JmZeU3/sihJkqTJrJcaTjLzduAd1Z8kSZK0RD0FnB0RsRmwATA3M+/tT5YkSZI0lfQUcEbEfsCJwBOrSTsBF0TEesAlwNGZ+Y3+ZFFTwYIFC7n9rgd6+uyqa67f82fbsNrQSqzx+JUHnQ1JkiaNXp6l/mrgv4CfACcDszvzMvPPEXEdsD9gwKlFhh9+hEt+O7+nz86fP58ZMyZOwLnjthsbcEqSNAa9DIv0XuCnmbkzcHqX+ZcBWy1TriRJkjRl9BJwbgF8eynz/wSs31t2JEmSNNX0EnD+laU/aWhT4M+9ZUeSJElTTS8B5wXAARHxmEZsETEDOBj40bJmTJIkSVNDr204nwD8Gvh3YCGwa0ScCFwNLAA+0LccSpIkaVLr5UlDNwIvAP4IHAtMA/4DeBfwP8D2mTmvf1mUJEnSZDamYZEiYkXK2Ju3Z+ZLI2I68DRK4Pr7zLyjhTxKkiRpEhvrOJwrADcBRwIfz8y7gSv6nitJkiRNGWO6pZ6ZDwPzKe02JUmSpBH10mnoK5Re6qv0OzOSJEmaenp5lvoNwIrA9RFxOvB74MFmIp+lLkmSJOgt4Pyv2v/vW0KahfgsdUmSJDHKgDMiPgWcnplzgBdVk1en1Gw+0uvKI+JpwBHA84CZwPWZObNLul2ADwFbAn8ATs7MT3dJdwRwKLARcA1wZGae30izBnASsCewCnAhcFhm3tLrdkiSJGnJRlvD+RbgUmBOZl4UEetSnpm+U2ZetAzrfwawG3AZpT3pY9qURsTzge8BXwUOp4wBenJEPJyZn6ulOwI4ATgKuJLyxKNzIuK5mXlVbZFfB7YGDgPuA44Dzo+IZ2bmA8uwLZIkSeqil1vqHdP6sP7vZ+Z3ASLiNGCbLmmOAa7MzDdV7y+MiI2B90fEFzJzQUQMAUdTaj5nV8u7iPLko/cCe1fTnksJcHfLzHOraVdThno6ADi1D9skSZKkml56qfdNZi5Y2vwqkHwxcFZj1tcot823rt5vB6wFnFlb9iOUdqS7REQnON4VuBc4r5ZuHnBxNU+SJEl9NtCAcxQ2BVYGrm1Mv6Z63bx63aJ6va5LutUpT0fqpLu+S6B7TW1ZkiRJ6qOx3FLfJCL+sfp/rep184j4S7fEmXn5MuWsmF693tOYfnf1uk4t3XBmNodnqqe7rUrXXFYn3Tpdpi/V3Llzx/qRMVl1zfWZP39+q+sYL8OxzjJty0QqhzvvXI3bbh7MU1znzJkzkPVONJZDYTkUlkNhOSxmWRQTqRzGEnB+oPqre0xPcUrbzoWUsTqntJkzZzI0NNTa8q+9cR4zZsxobfnjaWholZ63Zf78+ROqHNZddz023GzjcV/vnDlzmDVr1rivd6KxHArLobAcCsthMcuiqJfD8PBw65VkIxltwHlgq7lYsk4N5dqN6Z2az7tq6YYiYpXM/NsI6bpFCtNraSRJktRHowo4M/P0tjOyBDcBD1HaXp5Xm75l9Xp99dppu7kF8JtGuvspY3d20u0UEdMyc2Ej3fVIkiSp7yZ0p6HMHAYuoBrWqGZf4I+U8TYBLqH0Pt+nkyAiVqw+d14tuDyXUlu6cy3dk4Dtq3mSJEnqs2UZh3OZRcRqLB6O6MnAmhGxZ/X+isy8lTIw+88j4ovAGZSB3w8GDu30Ns/M4Yg4HjghIu6gBKIHUXq579dZX2ZeFhHnAF+KiMNZPPD7POC0VjdWkiRpOTXQgBPYADi7Ma3z/kDgtMz8VUS8kvIUof2B+cA76k8ZAsjM2REB8FZgQ8pQR7s1njIEpXZ0NmWQ9yHKoy338ilDkiRJ7RhowFk9v3zEJxZVTwUa8ZZ39ZSh2SOkuR84pPqTJElSyyZ0G05JkiRNfgackiRJapUBpyRJklplwClJkqRWGXBKkiSpVQackiRJapUBpyRJklplwClJkqRWGXBKkiSpVQackiRJapUBpyRJklplwClJkqRWGXBKkiSpVQackiRJapUBpyRJklplwClJkqRWGXBKkiSpVQackiRJapUBpyRJklplwClJkqRWGXBKkiSpVQackiRJapUBpyRJklplwClJkqRWGXBKkiSpVQackiRJapUBpyRJklplwClJkqRWGXBKkiSpVQackiRJapUBpyRJklplwClJkqRWGXBKkiSpVSsNOgPSZLNgwUJuv+uBcV/vqmuu39f1rja0Ems8fuW+LU+SpCWZ8AFnRBwAfKXLrFMy8y21dLsAHwK2BP4AnJyZn+6yvCOAQ4GNgGuAIzPz/Bayrilq+OFHuOS388d9vfPnz2fGjP4FnDtuu7EBpyRpXEymW+ovA55f+5vdmRERzwe+B/wG2IUSoJ4cEW+uL6AKNk8ATgF2A24EzomIrcZjAyRJkpZHE76Gs2ZOZv55CfOOAa7MzDdV7y+MiI2B90fEFzJzQUQMAUdTaj5nA0TERcDVwHuBvVvOvyRJ0nJpMtVwdlUFki8GzmrM+hrltvnW1fvtgLWAMzsJMvMR4BvALhExrf3cSpIkLX8mUw3n3IhYH5gHnAZ8KDP/DmwKrAxc20h/TfW6OfBrYIvq/XVd0q0OPBG4rf/ZliRJWr5NhoDz/4D3A5cDj1DaaL4PeCpwADC9SndP43N3V6/rVK/TgeHMfHAp6cYUcM6dO3csycds1TXXZ/788e+c0obhWGeZtmUilcOybsuy6Od677xzNW67+Y6+LW88zZkzZ9BZmBAsh8JyKCyHxSyLYiKVw4QPODPzR8CPapN+EhH3AsdGxAcHlC0AZs6cydDQUGvLv/bGecyYMaO15Y+noaFVet6W0jt74pTDsmzLsuh3Oay77npsuNnGfVveeJkzZw6zZs0adDYGznIoLIfCcljMsijq5TA8PNx6JdlIJmsbzm9Ur1uzuIZy7UaaTs3nXdXr3cBQRKwyQjpJkiT10WQNOOtuAh5icRvNji2r1+ur107bzW7p7qeM3SlJkqQ+m6wB52uAhZShkoaBC3jssEb7An8ErqzeXwLcC+zTSRARK1afOy8zF7adaUmSpOXRhG/DGRE/ogSUc4EFlE5D/w58KTN/XyU7Dvh5RHwROAN4AXAwcGhmLgDIzOGIOB44ISLuoASiB1F6ue83jpskSZK0XJnwASflVvgbgX+g5PdG4Ejg5E6CzPxVRLyS8hSh/YH5wDsy83P1BWXm7IgAeCuwIWVIpN0y86r2N0OSJGn5NOEDzsx8O/D2UaQ7Fzh3FOlmU3sspiRJkto1WdtwSpIkaZIw4JQkSVKrDDglSZLUKgNOSZIktcqAU5IkSa0y4JQkSVKrDDglSZLUKgNOSZIktcqAU5IkSa0y4JQkSVKrDDglSZLUKgNOSZIktcqAU5IkSa0y4JQkSVKrDDglSZLUKgNOSZIktWqlQWdA0mAsWLCQ2+96YNDZGLNV11z/MflebWgl1nj8ygPKkSRpJAac0nJq+OFHuOS38wedjTGbP38+M2Y8OuDccduNDTglaQLzlrokSZJaZcApSZKkVhlwSpIkqVUGnJIkSWqVAackSZJaZcApSZKkVhlwSpIkqVUGnJIkSWqVAackSZJaZcApSZKkVhlwSpIkqVU+S13SpLdgwUJuv+uBkRNOAqsNreRz4SVNOQackia94Ycf4ZLfzh90Nvpix203NuCUNOV4S12SJEmtMuCUJElSq7ylLkkTyGjbo6665voTut2qbVEl1S13AWdEbAZ8GtgeeBA4EzgyMyfulVvScmO07VHnz5/PjBkT97JlW1RJdctVwBkRawMXArcCewIbAB8H1gdeM7icSZIkTV3LVcAJHAJMB56dmX8GiIi/A2dExAcz85qB5k6SpojxGqpqPJoW2DxAWnbLW8C5K3B+J9isfAv4MrALYMApSX0wXkNVjUfTApsHSMtueQs4t6AEl4tk5nBE3ARsPoblrAjw0EMP9TFrj7VgwSOstMKCVtcxXh75+8M9b8sqj5s2ocphWbZlWfS7HAa1HcuqWzlM1m3pZrTbMtHOi6bx2ifjUQ5/f/ghhodXbHUd/TA8PDzoLEwYlkXRKYdavDKwA3nawoULB7XucRcRDwPvy8wTG9N/CfwpM181muXMmTNne+AXLWRRkiSpLS+cNWvWLwex4uWthrNfrgBeCPwf8MiA8yJJkrQ0KwJPoMQvA7G8BZx3A2t3mT4duH60C5k1a9YwMJBfCJIkST24aZArX96eNHQdpR3nIhExBGzKGAJOSZIkjd7yFnCeC+wYEevWpv0LMFTNkyRJUp8tb52G1gbmArcAH2TxwO/nZ6YDv0uSJLVguarhzMx7gBcDfwH+G/gEcBbwxgFmS5IkaUpbrmo4JUmSNP6WqxpOSZIkjT8DTkmSJLXKgFOSJEmtWt4Gfp/wImIz4NPA9sCDwJnAkZn5wEAzNoKI2At4LTALWIcywOxngc9n5oJaul2ADwFbAn8ATs7MT3dZ3hHAocBGwDWUMji/kWYN4CRgT2AV4ELgsMy8pZFuYGUaEatTxnh9IrBtZv66Nm9/4CjgKZTyOi4zz2p8/nHAccAbKA8tuAJ4W2b+TyPdRsAngZcBC4EfAG/PzD830v0jZWSGWcBdwH9W623liVkR8Xrg7ZT9/QBwJbBvJ1/Lw/EQEXtQ9vMWwF+Bi4F3Z+aNjXRT5niIiKcBRwDPA2YC12fmzC7pJuz+H23elqUcImJF4HBgt2o9KwFXAx9obt9ULocu6WcBlwMPZubqjXkDOQdGc36OZAznxSrAu4HXA/8A/Bk4NzMPbqSbVMeDNZwTSDVs04XAGpSD43BgX+DLA8zWaB0ODAPvBF4OfAf4FPCRToKIeD7wPeA3wC7AV4CTI+LN9QVVJ9EJwCmUC/GNwDkRsVVjnV8HdgcOA/YBZgDnR8RqtWWtzWDL9Fi6/LCLiD2B04FvU8rip8DXq5O57hOUC8r7gVcCD1G2cUZtWSsB5wHPBPYHDgK2A74XEdNq6Tap1nMXZR+dQNlfH+rDdj5GRLyX8qPjvynb+CbKRXGomj/lj4eI2JGy/dcDr6rytjnw04hYs5Zuqh0Pz6Dsq98B13ZLMJH3/2jzNgojlcOqlCDmf4ADgddQvsR/EhEvb+RpKpdDfZ0rUK4bdywhybifA2M4P0cymvNiBcr35/5Vfl4KvIsyuk493aQ7HqzhnFgOoTxm89m1GqC/A2dExAcz85qB5m7pXpGZ9QvEhVXt3lsi4ujMHAaOAa7MzDfV0mwMvD8ivpCZC6onPx1N+dU0GyAiLqL86n8vsHc17bmUk2y3zDy3mnY15ZfnAcCp1ToGVqYRMRN4M/AfwOcbsz8InJ2Z76neXxgRWwAfAH5Yff6J1effmplfrKZdCtxMqTV8V/XZVwNbATM72xMR8yk1abuw+KEG7wTuAfaq9sf5EbEWcExEfDQz7+rjtgcl2P6XzPxBbdZ3av8vD8fDvsCtwBsyc2G1vluBy4AXUO1rpt7x8P3M/G617tOAbbqkmcj7f8S89akcHgSempl3dyZExI+Bp1O+9H9QTZvq5VB3MLAWJdh5a33GAM+BEc/PPpbDgcDzgS0z8w+16WfUymFSHg/WcE4su1IGoa9X+X+LUnM41l9S46oRbHb8hlKFv051gryYMu5p3dcotwO2rt5vR7nYnFlb9iPAN4Bdar9OdwXupfyK7aSbR7mg7Fpb/iDL9BTgM8AN9YkR8VRKLdeZjfRfA7aNiPWr9y8FVqRWZpl5P+VLqLmNV9eDpcy8hBLoNNN9p7qw1tfZ2Tf9dCBwayPYXGQ5Oh4eB9zfCTYr91Sv02BqHg8jffFM5P0/hryNaKRyyMxH6sFmNW0hpcZzRm3ylC6HjohYj1Jr9zZKzWXTuJ8DYzg/RzTKcjiYEtz+YSlpJuXxYMA5sWxBo5q9OhFuohzwk80LKbcq/kR5Xv3KPPY2QueC0Nm+zrPur+uSbnVKW8hOuuu7nMDX8OiyGkiZRmm7+DTg+C6zO9u4pLKIWrrbM/POLumeXt166aTrdntmUVlExOOBjZvpqnY8D9D/snge8NuIODoi/hgRD0fE5RHxz9X85eV4OA3YIiIOi4i1I+IpwGzK9nTaWi0Px0PTRN7/o81bK6r9uB2P3ublpRw+AvwyM89bwvxBnAOjPT+XWZT2qVsDt0TE6RHxl4j4a0R8p6pJ7JiUx4MB58QyncW1H3V3UzriTBoRsQ2llusT1S+v6dWsexpJO7/uO9s3HRjOzAdHka65rE66elmNe5lWt2ROAt6VmX/pkmQsZdFM00n3OMqFZaR0nWWtvYR1NtP1y0bATpRj4K3AK4D7gPOqoGu5OB4y80JK280PVeu4GXgqsFOtVmV5OB6aJvL+H23e2nIYJYj5WG3alC+Hqn3gvsA7lpJsEOfAeJbDupTtOJJyDX01pe37VsC5UdqmdvI06Y4HA071XZTegd+i9DL8yAjJp6LjgRsz84wRU05dK1Au/q/OzG9UNRa7U4LOdw40Z+MoIrYDvgp8iXI7ai9gAaXzwqqDzJsmnuoOwEeB2Zn5i0HnZ7xE6a1/KvDxzPz9oPMzQJ2Y7C/AHpn5o8w8k3LdeAbwLwPLWR8YcE4sd7P4V1fddMqt6Qmvqt37IeWWxO6Z+XA1q/MraO3GRzq/mu6qpRuKMizESOmay+qkq5fVuJZpRDyD0qj9fdUt1LVZ/Kt79ShDVIylLJppOukeZnGvxdFs4z1LWGczXb/cDdyZtaFKsgyvcSllOJDl4nigjNRwYWa+IzMvzMxvUhrxP4cy5EknT3TJ11Q6Hpom8v4fbd76KiKeBXyX0rHuyMbsqV4OBwNPAE6tXTdXgdKDuvbjbBDnwHiWwz2UIZwurtdeZhlO7z7KtbOTp0l3PBhwTizXsbhtBrCooe6mlGFVJrTq4P8esAHwskY7m5sojcC3aHxsy+q1s32dNind0t1PGTKkky5qjaPr6eplNd5luhll9IcLKSfo3cD3q3kXAr9g6dsIkNXrdcAGEdG8VbElcEOtXc5jtrGW7nqAzPwrMK+ZLiKeDKxG/8tiab29V2H5OR62pHQAWSQzb6OMq7dpLU8088XUOh6aJvL+H23e+iYiNgV+RBmn9vWNTmYw9cthc2BDynZ0rptHAo+v/v9wLd/jfQ6M9vxcZtWP8luWMHshVRA+Qp4m7PFgwDmxnAvsGBHr1qb9C6XH3LndPzIxVG1LvgE8C9glM2+tz6/aq11ANVxDzb7AHykXWoBLKL3q9qkte8Xqc+fVLsTnUn5p7VxL9yTKoLX1shrvMv0l8KLGX6dN0puBgzLzZsrJuU/js/sCV9R6/P+Ycvt1UZlFGWrqFTx2G59ZDdPRSfc8ygDFzXR7RMTKjXUOs7gDS7/8AFg3Ihb1XKwa6T8fmLMcHQ+3UgaUXqT6QluP6otlOTkeHmUi7/8x5K0vqiZIP66WvUdmduudPdXL4TM89rp5OvC36v/PVOnG/RwYw/nZLz8Atq83uYkyOP1awJxq0qQ8HqYtXNj8IaVBqW4jzKV8EX2QUlP4ccpwBa8ZXM5GFhGfB/6VMg5as+3RtZl5X9Uo/OeUnrtnUMYhPA44NDM/V1tWZ0Db91AO4oMojaefm5lX1dL9gHJr8nDK7YbjKNX7z6x+KU6IMo2IHSi1m4ueNBTlyUxnUX65/4QygPHbKOOl/bD22c9Qbr0eTglejqCM3fbMzJxfpVkJ+DWlsfl7KDWsJwG3Ay/IxeM/bkKpbbuA8kSJqNJ9OjPf3edtXgH4FbA+ZVy4+6tt2JYy1tvvlofjISLeQinrz1Bul65LGT9vfeAZnbsAU+14iDKodGfYlUMpNSX/Ub2/IjNvncj7f7R5W9ZyoIzg8atq+uso+2iRzLx0eSiHZgVF9ZljgSPysU8aGvdzYLTnZz/KoQoIr6Ls409QAsYTKPty604ztcl4PFjDOYFk5j2UjgV/oTyd5BOUg/yNA8zWaHV+QX2UcgGt/20NkJm/opyo21JuHx0EvKN5oGYZyPYoSu/mH1Jut+xWP4kq+1J+DZ4KnE35lfWSrD2Ka6KWaWaeTenBvSelLHYG9uty8XoH5akbx1OaK6xK2cb5tWX9nfL4trnAf1Ge/nAppQ3twlq63wMvoQQ751ACn49RAsJ+b98CSlvFn7N4/wDskJm/q9IsD8fDKZRBlV9IaZt3MuUpIy+qNzmZgsfDBpR9cDawA/Ck2vsXVeufsPt/tHnrQzlsSOmBvDrl+GheO5eXchiLcT8HxnB+jmQ058X/Vv9Pq6Z/hlKJ85Jc3CdiUh4P1nBKkiSpVdZwSpIkqVUGnJIkSWqVAackSZJaZcApSZKkVhlwSpIkqVUGnJIkSWqVAackSZJaZcApSZKkVv1/xVCuu0pwNnUAAAAASUVORK5CYII=\n",
+      "text/plain": [
+       "<Figure size 720x360 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAoIAAAFKCAYAAACJoz5RAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAw2ElEQVR4nO3debxtc/348deNXMksEpUy9Ea3+HWpDPVrki6l4WuIypeifJMkU5Mh5Kf4lpKiCRWRVF8iDcZKGU4D1/CmK3y5kXmI7sW9vz8+a3eWbZ9pn2Gfe9br+Xicx7p7rc9a+7M++3PPfp/PtKYtXLgQSZIkNc8zep0BSZIk9YaBoCRJUkMZCEqSJDWUgaAkSVJDGQhKkiQ1lIGgJElSQxkISuMgIg6NCNdm0tNExOsiYmFEvK7XeZmsIuLiiLi41/noJCL2jYibI2LxXudlKBGxdUQ8EhEr9zovmrwmfUWWJosRBHa7jmtGRiEipgHvA3YDNgCeCcwBzgC+mJmP9jB7wxYRzwb2AbYH1gQeB+4Afgd8KTNv6GH2JoWIWAPYH9gSeD7wJHA98FPg+Mx8oGeZG0cR8WHg0cw8eRyuvQzwSeCgzHxirK8/1jLz3IiYQ8nzx3udH01OBoLS8L2v7fUHgVcD72/bfxnwfeCoicjUcEXEYsBplODpN8DBwGPAa4HPAttHxJsy8x+9y+XQIuKZwCXADOB7wNeAZwHrAlsDvwcaHQhGxJbAWZTg73vA1ZTf9xtRgoL/C7y5ZxkcXx8G7gFOHodrv59S1747DtceLycCR0fEoZn5UK8zo8nHQFAapsz8fv11RLwJeGX7/prJ1mJwACUIPCYz96/t/0ZE/BD4CXASJZiaMFUr5ZKZ+dgwT3kHMBPYtb3Vp+quW25MM7iIiYgXAWdSWkjfkJl3tB3/FKVFWCP3fuC8zPxnrzMyAj8CvkL5v/+tHudFk5CBoDQOIuJQ4JDMnFbbdwulpeoo4BjgpZRu2Y9m5oUR8Q7gMOAlwHXA7pnZ13bdlwBHAG8Enk3p6vtcZv5oiPw8i9JNeCOlRegpMvPsiDgF2DUiXpmZV9TyfHFm7tJ2vYur815X2zcd+ATwXuCFlFaZHwKfrnc5V13sJwIXAZ8BAvhgROwOLJOZL++Q/z8Cj2fmq4C1qt2/6XAfTwD31s5bgxIAvwFYA5gP/Bb4ZGZeU0v3uio/7wHWobT2Lgf8EvgA8E/K5/YeSrmfBexRD17b7usQSpd1Agdm5vntee1wj0N+tlWg+wlgZ+AFlBbdm4DPZ+aPq2QHAMsAW7UHgVUZ3Vm9T/299wA+AqwNPAD8T1VG99XSXAysCmwHfBXYGLizSndGRGxOqdcbALdR6vUvaucfWpXLSymf+9bAAkod2TczHxmifKZVefwg5TN6CDiHUr73VGluoXzO9aEct2bmi6p9w6qjA7z/i4GXA8e37X8R8Dc6/2GyEPhsZh5avV4aOBT4D2C16h6uBQ7OzEtr521MaaXfDFgC6KN0R1/Udv3nVdfbGlgZ+DvwK+DjmfkwQGb+IyKuBt6JgaA6cLKINLHWBH4AnEv5QloeODsidqL81X4apct2TeDMqjsXgIhYD7gceBnwBWBfStBzZkS8d4j33RxYAThtkLFNre6ut430pqov6Z9QgpBzgb0oX7AfBn5aHa97LeUL9Szgo5QA+RTgZRHxlECwuu//U8vfLdV25w7Xbbdx9V4/AvYGvgS8Arik+hJtdwAwixL0fZvS+vgN4JvA+pQv57MogdjTAmrKF/fXq3v/NLAkcE4VJA1oBJ/tIZQ/Fi6hlNthlLJ7ZS3NNsDfMvO3g71n7b0/U+X5LsofC6dTWr4urAKnuuUon++VlLJ6FDg1It5NKePzKfV6qSrvnVpnT6fUxU9V53yQUl5D+TrwRUo57U35XLYFLoqIJas0HwNup5TJ+6qfj1X3OdI62m7TanvVMPI62D3sVeXjw8DngbspwTNVPv8v5Y+cFSmf74HAdOCX9QlGEbEqcAWlLp5VXfdkSl1Yqe19+4BNhnGPaiBbBKWJtQ7w2sz8DUBEXA/8AvgOsF5m/q3a/wClden1wK+rc78MzAU2qrVEHR8RvwSOiohTM3OgCS3rV9u/DJK31rH1B0kzkB2BtwCvz8xLWjsj4irKeMktKK1rLesCr8jMP9fS3kC5x/dSvqxb3keZDHJG9fqnlC/6gyktmBdTWvnO7dACdm57a2lEfI/S4voB2lrGKK0vr8zM+VXalYF3U1pZ3lKV79ciYh1KsHRw2/kzgE0z8/fV+SdTWuyOogTjAxnuZ/tWStfk7p0uEhHLAqtTWvSGVN3fQcAFwJaZ+WS1/8+UYQK7U1r/WlYFds7M71XpfkX5LE4DXpOZv6v2t+r1djy9FeoOSmvlwirt34GDqvGpv6aDiNgU+BDwn5n53dr+8ylB087ANzLzpxFxBHBPhyEbI62j7dattjcPkmYobwW+mZkdJ25UgdqJlPq8Ra2MTgD+BBxJf0B6FKVVcdPMvLx2mUM7BHw3U4Lv51HqmfRvtghKE+vGVhBYaf0Cv7gVBLbtXxMgIlYE3kRpwXh2RDyn9UNphVmd0qU8kGWq7cODpGkdW2aQNAPZntLtfG1b3i4BFlIC2rrL6kEgQDWL9Wxgp4h4Bvz7i3En4Oet7r/M/BclqDoGaM2CPhH434j4fhUMta5Z77pdKiJWonTHJWWcYbvvtYLAyuXVe5zUFmRfDqzWocXsqlYQWL3/vZQgabOIWKHD+430s30QeGnVjdxJ694H+5zr3kQJfr/cCgIr36O0ELaPF30MOLV2f0npSr6xFQRWnlJ/23y1rSy/Um3fOkg+twceAc5vK58bqny216+BrjGSOtpuJUpX9oPDeK+BPAi8KiJWH+D4BpShEqcBK9XyuCzlj5FXVfX4GZSu3p+3BYEAdPiD8P5q+5xR5F1TlC2C0sS6rf4iMx+MCID/bUvX+rJpBQ9rUwKSQ6ufTlahBDidDCfIax3rZtbwSyhfYHcPcHyVttdzBkh3CuUL+/WUVqrXUMZ87VdPVAVY+wP7R8TzKd2/e1PG8C2gtBBRdRkeRmllbO8Kvpenu63tdetz6PT5TKN07d9V239Th2veWG3XoP8LuW4kn+3BlBbRjIjrKK1up2Vmq7uyNSt0uMH8GtX2KfUmM5+MiJuAF7WlvyMzF7Tte5C28qnV607B701tae+JiPs7vFfdS4CleWpZ17XXr4GuMZI62sm06qfbNUL3p9Tx2yLiT5RA/3tVQN3KI5RhCQNZiTLWdVlg9jDft9VC6NqmehoDQWliPTnC/a1f4K3W+y8B5w2QdrAvheuq7cspgUQnrbF59a6vgb44FuOpeX5G9R57D5C+vTtqoBnCv6B82b+XEgi+l9LidM4A6cnM24HTIuJHlIH3746I91djIY+jdOEeR1nW5wFKoHgsnXtEuv18RmPYn21mXhoRa1HGcb6ZEvB+LCI+kZlfyMyHImIuZazheOhF+UApo3sp3fSddAqwO11jJHW03T2U+1mu7f06/h+pj+9tycwzI+I3wNspn99HgQMiYpfMPI3+uvAJyri+Tu5m5DPjWwH5PSM8Tw1gICgtGlrB2RMDjaMawu8oQdBOEfG5tm7Alp2r7Zm1ffdTWr3arcFTA8Y5lK7WCwYZpzikqiXqVGD3iNiHMhngzMycN4xz50fEXygtbM+hzGjdDvhuZn6snrbqph2PL8V1OuxrtfLcOsA5I/psM/N+ysSZ70aZDX4e8NmI+O/qcz0b2CMiNmvrru2klaegv+WSqutxHcq4tLG2Ttt7PYcSqNwyyDlzKGP4/jDU7GIG/uNltHX0+mr7Yp4aCLb+vXxb+jXooJq1fSJwYkQsD/yBMgnpNPpbyh8erC5ExHxK6++MYea9lec7h5leDeIYQWkRUC3yfBElQHra+KIY4hFS1dIYX6B84X+uw/lbA7sAZ9eXVaF8Mb06IpaopX0rZemSujOA5wL/1eHa06M8kWG4TqF0bZ5ICRCesnhvRGzQ6X6rL9VNgPvo7/57krZWqYjYkTLIfjxsFBGb1N5rJcoYx8uqAO5pRvLZVtern/sYZZzckpSFjgGOpoyn+3ZEPO0+I+K51UxhKOPO5gMfbY3LrLyH8nn+bPDb7cpH2iYzfLTanjvIOWdQvq/aJ+cQEYu1jb/8J527pEdbR1tB9Ub1ndUizfdQhifUfbhDPp/SkleNi/0b/UFkH/BX4OOd8tOqC1X3/E+AWRHxqg7p2ltiZ1KCaLuG9TS2CEqLjv+ifBldHRHfpARpqwCvosz0XXuI878AbAgcGBGvBn4MtCZevAe4hhIM1n2L0ip3fpRFp9eidNe2j/H7fpXu+CjLX/yWEoAFZczfdsDFw7nJzLy6atnbnvIl2d6qtQVweEScQ3mKyIOUwHRnSoC3V63F82zKMjMPUbpXNwR2YHQzPwczG/hZRBxHGZf5QUpQ22mpmbrhfrbXR8SllOVb7qFMLtgN+FmrpSwzb46IHSgtu9dVs6RbTxZ5BaV79bIq7T0RcThwOGV5kp9SJnh8hDKLfDzWnVsdOC8iflblf3fgl5n5q4FOqLrEj6eMCX05ZQjBPEq5bEsJEE+ukl8FfDgiDqG0PD6SmecwyjqambdVs6m3oCxdU/ct4BMR8a3q/V/L0ydvLQPcERFnUcr2IcpyQ2+hmpmdmQsi4gOUsYPXRcR3KMvhrEZ5Gsw0+ie1fLLKy8URcSKl2/u5wLsoE0luAYiIVSjDPk4Y6N7UbLYISouIakD5RlTBDWUdvg9TvuAPGsb5T1KCgF2qc46gtLq9j9Iy9Mr2VqtqQeB9KV9qx1Ja3N5K+XKqp1tA+QLanxK4HE3p7no15RFwV4/wdk+ptt/v0IpxFmXpjNUpa6ydSGlVuhl4Z2bWlzvZmzLwfgfKGLyXUr542yd/jJXfUT6THShLfcwD3lFfLLiTEXy2x1KeG3wgJXh4C6Usdmy73nmUbsPTgK0oYySPpgTCn6MEPa20R1AC0ecB/035o+Bk4I3D6ZLvwo6Ubsoj6X/axXaDnlHy+RHKkj8rUu7hKMo4ux8CF9aSHkYZU/pxyv0fV50/FnX0O8BWUZ51XXcYpZ5tS/mDazHKepR1j1I+15dRPtNjKZ/5fpTnZrfu89IqT3+g1IGvUsa53kdZd7CV7u+UPxROp5RpazzsFTx12MN/UFp9z0DqYNrChbYUS00V5bm951CevLH1YK0yEyki9qR8AUZm3jhU+skgqieLZOYevc7LZBT9TxZ5XjVObpFTddfeTHkSyNd7nZ/hqFoxL24fJyu12CIoNVhmPk5pMfgLcFZEvKLHWWrZDfj9ohIEqhmqx7YdReminvRDq6qxv2tTWl+ljiZ9RZY0vjLzn5RHsfVU1d22DWUs1IaUbjZpUsnM/6Z0oU96mXkuZf1FaUAGgpImi5UpY7oeAL6QmWf1NjuSNPU5RlCSJKmhbBHsQl9f33RKV9rfGXhFfUmSpMlgMcrKAFfOnDnzKasBGAh2Z2PgN73OhCRJ0gi8hrKG5r8ZCHbn7wAveclLWGKJJQZNOHv2bGbMGO5TgKYuy6GwHArLoZ9lUVgOheVQWA79xqIs5s+fz4033ghV/FJnINidJwGWWGIJpk+fPmTi4aRpAsuhsBwKy6GfZVFYDoXlUFgO/cawLJ42nM11BCVJkhrKQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhDAQlSZIayieLTGIP/3M+j857otfZGBNLL/ecXmdBkiS1mTSBYEQsDdwArA5snJlX1Y7tDHwKeBEwBzgsM89oO/+ZwGHAfwLLA1cCe2fmn9vSrQp8GXgLsBD4GfCxzLxnPO5rNB6d9wQXXHlbr7MxJjZcc6leZ0GSJLWZTF3Dh9IhMI2IbYFTgJ8As4BfAz+IiFltSb8E7AkcArwdmA9cEBGr1a61OHA+8DJgZ2A3YFPg7IiYNsb3I0mSNKlNihbBiJgB7AF8HDix7fDhwJmZ+cnq9UURsR7wWeDn1fmrV+d/NDO/We37A/A34GPAAdW5/wFsAMzIzGurdHOB31GCzPPG4/4kSZImo8nSIng88FXgxvrOiHgxsC5welv604CNI2Ll6vWbgcWAf3cXZ+bDlG7frWrnbQVc0woCq3SXAbe2pZMkSZryeh4IRsT7gLWBIzocXq/aXte2vxXIRS3dXZl5b4d0L4mIZ9TStV+rlW7dkeRbkiRpUdfTruGIWA44Gtg3Mx+JiPYkK1TbB9r2319tV6yla0/TSvdMYGngoSHSrT/8nBezZ88eVrq+vr6RXhqAZy27MnPnzu3q3MlmwzXX7rocphrLobAc+lkWheVQWA6F5dBvPMui12MEjwBuysxTe5yPrsyYMYPp06cPmqavr4+ZM2d2df277nuU1VZ7tKtzJ6Nuy2EqGU19mEosh36WRWE5FJZDYTn0G4uymDdv3oCNVz0LBCPipZQJHltExPLV7qVb24hYhv6Wv+WBO2unt1oK76u291dp2q0APA48Mox093XYL0mSNGX1cozgOpRA9CJKgHY/cE517CLgN8D11ev12s5tdeNmtb0eWCUiVuyQ7sbMXFBL136tVroburgHSZKkRVYvA8HfAq9v+9mnOrYHsFtm/o0SoO3Qdu6OwJWZeXf1+pfAAmD7VoJqgeq38dQlYc4DXlYtP9NK92rKQtUuHSNJkhqlZ13D1ZM8Lq7vq00W6as9WeRg4IyImAP8irJY9JuBrWvXuiMiTgA+HxFPUJaD2Q+YBhxbe4uzgKuBH0XEJyn3fzTwe6o1CSVJkpqi58vHDCUzzwR2BbYFfgFsCeyUme2B2z7A1ykTUM4GngW8KTPn1q71BOXRcrOB7wMnAX8AtsnMheN8K5IkSZNKr2cNP0VmXkxpxWvffwrlMXODnfs48InqZ7B0d/L0rmZJkqTGmfQtgpIkSRofBoKSJEkNZSAoSZLUUAaCkiRJDWUgKEmS1FAGgpIkSQ1lIChJktRQBoKSJEkNZSAoSZLUUAaCkiRJDWUgKEmS1FAGgpIkSQ1lIChJktRQBoKSJEkNZSAoSZLUUAaCkiRJDWUgKEmS1FAGgpIkSQ1lIChJktRQBoKSJEkNZSAoSZLUUAaCkiRJDWUgKEmS1FAGgpIkSQ1lIChJktRQBoKSJEkNZSAoSZLUUAaCkiRJDWUgKEmS1FAGgpIkSQ1lIChJktRQBoKSJEkNZSAoSZLUUAaCkiRJDWUgKEmS1FAGgpIkSQ1lIChJktRQBoKSJEkNZSAoSZLUUAaCkiRJDWUgKEmS1FAGgpIkSQ1lIChJktRQBoKSJEkNZSAoSZLUUAaCkiRJDWUgKEmS1FAGgpIkSQ1lIChJktRQBoKSJEkNtXiv3jgi3gV8HFgXWBq4A/gJcHhmPlhLNwv4HLB+lebYzDyuw/X2A/YEVgWuBQ7MzAva0iwDHA1sCywJXATslZm3jPX9SZIkTXa9bBFcEbgU+CDwFuDLwPuBM1sJImIT4GzgT8As4CTg2IjYo36hKgg8Ejge2Bq4CTg3IjZoe88fANsAewE7AKsBF0TEUmN9c5IkSZNdz1oEM/Nbbbsujoh/ASdGxGqZORc4GPhjZn6gSnNRRLwQOCQivpGZCyJiOvAZSkvhMQARcQlwDfBpYPtq36soQeLWmXlete8aYA6wC/C1cbxdSZKkSWeyjRG8p9ouUQV4bwDOaEtzGqX79xXV602B5YDTWwky80ngh8CsiJhW7d4KeBA4v5buNuB31TFJkqRG6VmLYEtELAY8E3gppQXw7My8JSLWB5YArms75dpquy5wFbBe9fr6DumWBlYHbq/S3ZCZCzqk23IMbkWSJGmR0vNAELiX0qIHpbVup+rfK1TbB9rS319tV6ylm5eZjw2S7vYqXfu1WulW7LB/SLNnzx5Wur6+vm4uz7OWXZm5c+d2de5ks+Gaa3ddDlON5VBYDv0si8JyKCyHwnLoN55lMRkCwdcBSwEzKGP9zomILXqao2GaMWMG06dPHzRNX18fM2fO7Or6d933KKut9mhX505G3ZbDVDKa+jCVWA79LIvCcigsh8Jy6DcWZTFv3rwBG696Hghm5p+rf14WEX2U7t530t8lvHzbKa2Wwvuq7f3A9IhYMjP/NUS6F3bIwgq1NJIkSY0x2SaL/BlYAKxNmc07n/4xgC3rV9sbqm1rbGCndA9T1h5spYva5JF6uhuQJElqmMkWCG5CydPNmTkPuJBq+ZeaHYE7gT9Wry+jzAbeoZWgmoCyPXB+Zi6sdp9HaV3cspbuBcDm1TFJkqRG6eWTRX4BXECZtfsvYENgf+Bq4KdVssOASyPim8CpwGbA7sCerdm/mTkvIo4AjoyIuykB4m7AWvRPPCEzL4+Ic4FvR8S+wEPV9W8DTh7Pe5UkSZqMejlG8ArgvcCLq9e3ACcAX8zM+QCZ+fuIeDvlqSE7A3OBfTLzhPqFMvOYiAD4KPBcSnC5dWb+pe09dwSOoSwePZ3yiLntMnPqzMiQJEkapl4+WeQg4KBhpDuPYXTdVk8VOWaINA8DH6p+JEmSGm2yjRGUJEnSBDEQlCRJaigDQUmSpIYyEJQkSWooA0FJkqSGMhCUJElqKANBSZKkhhpxIBgRW3Z4Xq8kSZIWMd20CP4cuD0ijo6IDcY6Q5IkSZoY3QSC7wB+B+wJ/DEiro6I/SJitTHNmSRJksbViAPBzDw7M7enPNN3d+Bu4Cjg1oj4ZUS8NyKWGuN8SpIkaYx1PVkkMx/OzO9k5huBNYBPAasApwB3RcR3I+KNY5RPSZIkjbGxmjW8GPBMYDowDXgMeBPwq4j4U0TMGKP3kSRJ0hhZvNsTI2I5YHvgvcBmwBPAucAnqu0CYBvgS8BJwMajzawkSZLGzogDwYh4ByX42wpYErgS2Bv4QWbe15b8pxHxHOBro8ynJEmSxlg3LYI/Bu4Avgyckpk3DJH+auDULt5HkiRJ46ibQPDNwAWZuXA4iTPzCuCKLt5HkiRJ42jEgWBm/no8MiJJkqSJ1c0j5r4UETcNcvzGiDh6dNmSJEnSeOtm+ZitgTMGOX4G8LbusiNJkqSJ0k0g+ALglkGO31qlkSRJ0iTWTSD4EPDiQY6vSVlQWpIkSZNYN4HghcCHIuKF7Qci4kXAh6o0kiRJmsS6WT7mYGAWMDsiTgKurfbPAHYBngQOGpPcSZIkadx0s3zMTRGxGXA8sFfb4UuAvTIzxyJzkiRJGj9dPWs4M68FXlc9Pm7NaveczLx3zHImSZKkcdVVINiSmfcA94xRXiRJkjSBugoEI2IxYEtKa+AKwLS2JAsz8/BR5k2SJEnjaMSBYERsBJwFPJ+nB4AtCwEDQUmSpEmsmxbBrwHPAt4B/CYzHxjLDEmSJGlidBMIvhz4dGaeM9aZkSRJ0sTpZkHp2xm4S1iSJEmLiG4CwaOA3SNi2bHOjCRJkiZON13DKwL/BP4aET8C/pfyNJG6hZl59GgzJ0mSpPHTTSB4VO3fewyQZiFgIChJkjSJdRMIvnjMcyFJkqQJ182zhm8dj4xIkiRpYnX9iLmIWAd4HbAKcGpm3hIRSwCrAndm5vyxyaIkSZLGQzdPFnkGcALwAcoyMguB3wO3AEsA1wCHAf89ZrmUJEnSmOtm+ZhPAe8HDgI2obamYGY+Qnn83LvGJHeSJEkaN90EgrsC38nMI4G/djh+DbDOqHIlSZKkcddNIPh84IpBjj8GLNNddiRJkjRRugkE7wTWGOT4TMCZxZIkSZNcN4HgWcB/VbOGWxYCRMQsYGfgh2OQN0mSJI2jbgLBQ4HbgD8Bp1KCwE9FxB+AnwF/Af7fWGVQkiRJ42PEgWBmPgRsChwJPBf4F7A5sDQlSHxtZj42hnmUJEnSOOhqQenM/BclEDxybLMjSZKkidJN17AkSZKmgG6eLPKdYSRbmJkf6CI/kiRJmiDddA2/gWqWcM1iwPOq7d3AP0eZL0mSJI2zEQeCmfmiTvsj4pnAh4CPAVsMdZ2I2A54D2XdwRWBOcDXgRMzc0Et3Szgc8D6wB3AsZl5XIfr7QfsCawKXAscmJkXtKVZBjga2BZYErgI2Cszbxkqv5IkSVPNmI0RzMzHM/OrwC+Brw7jlH2BecD+wFuBnwJfAT7fShARmwBnU5aqmQWcBBwbEXvUL1QFgUcCxwNbAzcB50bEBm3v+QNgG2AvYAdgNeCCiFhqJPcqSZI0FXQ1a3gIfwHeN4x0b8vMu2uvL4qIpYGPRMRnMnMecDDwx9p4w4si4oXAIRHxjcxcEBHTgc9QWgqPAYiISyjPPP40sH2171WUIHHrzDyv2ncNpSVyF+Bro7prSZKkRcx4zBreAnh0qERtQWDLnyhdtitWAd4bgDPa0pxG6f59RfV6U2A54PTatZ+kPN1kVkRMq3ZvBTwInF9Ldxvwu+qYJElSo3Qza/jgAQ4tD7yWEqAd1WV+XgPcB/wDCGAJ4Lq2NNdW23WBq4D1qtfXd0i3NLA6cHuV7ob6+MNaui27zK8kSdIiq5uu4UMH2H8/pZt1D+CbI71oRGwE7Ap8NjOfjIgVqkMPdHgfKBNMAFYA5nV4mkk93e1VuvZrtdKt2GG/JEnSlNbNrOEx706OiFWBs4ArqE0Wmexmz549rHR9fX1dXf9Zy67M3Llzuzp3stlwzbW7LoepxnIoLId+lkVhORSWQ2E59BvPshiPySIjEhHLAT+njCvcJjMfrw61WvSWbzul1VJ4Xy3d9IhYsnr03WDpXtghCyvU0ozIjBkzmD59+qBp+vr6mDlzZjeX5677HmW11YYcbrnI6LYcppLR1IepxHLoZ1kUlkNhORSWQ7+xKIt58+YN2HjVzRjBTsHUkKqJGe3XWpKyPMwqwKaZeW/t8BxgPmVs3/m1/etX2xuqbWts4HqUySb1dA9T1h5spdsiIqZl5sK2dDcgSZLUMN10894C/K2Ln6eIiMUpM3tfDszKzFvrx6vlYy6kWv6lZkfgTuCP1evLKLOBd6hde7HqvPNrQd95lNbFLWvpXgBsXh2TJElqlG66hncDPgq8gLKUy43V/qAEabdRFoZun53b7njgbcABwFIR8erasesy8yHgMODSiPgmcCqwGbA7sGdr9m9mzouII4AjI+JuSoC4G7AWsFPrgpl5eUScC3w7IvYFWte/DTi5i3KQJElapHUTCD4PmA6snZn31w9ExCGUdflWzcz/N8R1Wi1zX+hw7PXAxZn5+4h4O+WpITsDc4F9MvOEeuLMPCYioASoz6UsCbN1Zv6l7bo7AsdQFo+eTnnE3HaZOXUG4kmSJA1TN4HgHsAX24NAgMy8t2q92xsYNBAc6JnFHdKdxzC6bqunihwzRJqHKc9D/tBw3luSJGkq62aM4EqUhZoH8uwqjSRJkiaxbgLBPwB7R8TT5jJXi0LvDVw+2oxJkiRpfHXTNfwR4GLgioi4Erip2r8OsDFlTb69xiR3kiRJGjcjbhHMzOuAl1FmBi8PbFv9LA98GXhZZl470PmSJEmaHLp6skhm3gXsU/1IkiRpETSqR8xFxDqUp4LMzswHxyZLkiRJmgjdTBYhInaKiNsoj2a7FJhZ7X9ORNwYEe1PA5EkSdIkM+JAMCL+A/g+5dm9+wPTWscy855q/85jlUFJkiSNj25aBD8N/DoztwRO6XD8cmCDUeVKkiRJ466bQHA94CeDHP8HsHJ32ZEkSdJE6SYQ/CeDP1lkLeCe7rIjSZKkidJNIHghsEtELNF+ICJWA3YHfjHajEmSJGl8dTtG8HnAVcCHgYXAVhFxFHANsAD47JjlUJIkSeOimyeL3ARsBtwJHEqZNfxx4ADgz8DmmXnb2GVRkiRJ42FEC0pHxGLA6sBdmfnmiFgBWJsSUN6cmXePQx4lSZI0Dkb6ZJFnAHOAA4EvZub9wJVjnitJkiSNuxF1DWfm48BcyrhASZIkLcK6mSxyEmXW8JJjnRlJkiRNnJF2DQPcCCwG3BARpwA3A4+1J8rMH44yb5IkSRpH3QSC36/9+6AB0iwEDAQlSZImsWEFghHxFeCUzOwDXl/tXprSEvjkOOVNkiRJ42i4LYIfAf4A9GXmJRGxEuWZwltk5iXjljtJkiSNm24mi7RMG7NcSJIkacKNJhCUJEnSIsxAUJIkqaFGMmt4zYh4ZfXv5artuhHxSKfEmXnFqHImSZKkcTWSQPCz1U/dcR3STaMsH7NYt5mSJEnS+BtuILjruOZCkiRJE25YgWBmnjLeGZEkSdLEcrKIJElSQxkISpIkNZSBoCRJUkMZCEqSJDWUgaAkSVJDGQhKkiQ1lIGgJElSQxkISpIkNZSBoCRJUkMZCEqSJDWUgaAkSVJDGQhKkiQ1lIGgJElSQxkISpIkNZSBoCRJUkMZCEqSJDWUgaAkSVJDGQhKkiQ1lIGgJElSQxkISpIkNZSBoCRJUkMZCEqSJDXU4r1884hYG9gPeDUwA7ghM2d0SDcL+BywPnAHcGxmHtch3X7AnsCqwLXAgZl5QVuaZYCjgW2BJYGLgL0y85axuzNJkqTJr9ctgi8Ftgb+ClzXKUFEbAKcDfwJmAWcBBwbEXu0pdsPOBI4vrrmTcC5EbFB2yV/AGwD7AXsAKwGXBARS43RPUmSJC0SetoiCJyTmf8DEBEnAxt1SHMw8MfM/ED1+qKIeCFwSER8IzMXRMR04DOUlsJjqutdAlwDfBrYvtr3KkqQuHVmnlftuwaYA+wCfG1c7lKSJGkS6mmLYGYuGOx4FeC9ATij7dBplO7fV1SvNwWWA06vXftJ4IfArIiYVu3eCngQOL+W7jbgd9UxSZKkxuh11/BQ1gKW4OndxtdW23Wr7XrV9voO6ZYGVq+lu6FDAHpt7VqSJEmN0Ouu4aGsUG0faNt/f7VdsZZuXmY+Nki626t07ddqpVuxw/5BzZ49e1jp+vr6RnppAJ617MrMnTu3q3Mnmw3XXLvrcphqLIfCcuhnWRSWQ2E5FJZDv/Esi8keCE5qM2bMYPr06YOm6evrY+bMmV1d/677HmW11R7t6tzJqNtymEpGUx+mEsuhn2VRWA6F5VBYDv3GoizmzZs3YOPVZO8abrXoLd+2v9VSeF8t3fSIWHIY6dqv1Up3X4f9kiRJU9ZkDwTnAPPpHwPYsn61vaHatsYGdkr3MGXtwVa6qE0eqae7AUmSpAaZ1IFgZs4DLqRa/qVmR+BO4I/V68sos4F3aCWIiMWq887PzIXV7vMoLYJb1tK9ANi8OiZJktQYvX6yyFL0L9uyBrBsRGxbvb4yM28FDgMujYhvAqcCmwG7A3u2Zv9m5ryIOAI4MiLupgSIu1FmHe/Uer/MvDwizgW+HRH7Ag9V178NOHlcb1aSJGmS6fVkkVWAM9v2tV7vCpycmb+PiLdTnhqyMzAX2CczT6iflJnHRATAR4HnUpaE2Toz/9J2/R2BYyiLR0+nPGJuu8ycOrMyJqEll3wWd903NYp4qemLs8yzl+h1NiRJGrWeBoLV833bx+t1Sncew+i6rZ4qcswQaR4GPlT9aII8/uRCLrjytl5nY0y8ceMXGghKkqaEST1GUJIkSePHQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhFu91BiZaRKwDHAdsDjwGnA4cmJmP9jRjkiRJE6xRgWBELA9cBNwKbAusAnwRWBl4d+9yJkmSNPEaFQgCHwJWADbMzHsAIuIJ4NSIODwzr+1p7iRJkiZQ08YIbgVc0AoCK2cB84BZvcmSJElSbzStRXA94Dv1HZk5LyLmAOuO4DqLAcyfP39YiefNmzeCS/d74vH5LP6MBV2dO9ksePKJKXMvTzw+n3nzFuv6/G7rw1RjOfSzLArLobAcCsuh32jLohavPO3La9rChQtHdfFFSUQ8DhyUmUe17f8t8I/MfNdwrtPX17c58JtxyKIkSdJ4ec3MmTN/W9/RtBbBsXIl8Brg78CTPc6LJEnSYBYDnkeJX56iaYHg/cDyHfavANww3IvMnDlzHvDbIRNKkiRNDnM67WzaZJHrKeME/y0ipgNrMYJAUJIkaSpoWiB4HvDGiFiptu+dwPTqmCRJUmM0bbLI8sBs4BbgcPoXlL4gM11QWpIkNUqjWgQz8wHgDcAjwI+BLwFnAO/vYbYkSZJ6olEtgpIkSerXqBZBSZIk9TMQlCRJaigDQUmSpIZq2oLSEyIi1gGOAzYHHgNOBw7MzEd7mrEJFBG7ACd1OHR8Zn5kgrMzYSJibWA/4NXADOCGzJzRId0s4HPA+sAdwLGZedxE5nU8DaccIuJk4D87nL5dZv5o3DM5ASJiO+A9wExgRcqCrl8HTszMBbV0U70+DFkOTagPABHxLuDjlOfbL035vH8CHJ6ZD9bSTfU6MWQ5NKVO1EXE0pR1jVcHNs7Mq2rHdgY+BbyI8n/osMw8Y7TvaSA4xqolai4CbgW2pX+JmpWBJi5R8xbgwdrrO3uVkQnyUmBr4HJKi/vTWt0jYhPgbOC7wL7AZsCxEfF4Zp4wgXkdT0OWQ+VmSoBQd+M45mui7Uv5XbA/cBfweuArwJrVvqbUhyHLoTLV6wOUQPhSyvfCfcDLgUOr7ZuhMXViyHKoNKFO1B1Kh9gsIrYFTgGOAn4JvAP4QUQ8lJk/H80bOmt4jEXEgcDBwBqZeU+1byfgVGBGZl7by/xNlFqL4MqtcmiCiHhGWwvHRh1awn4OrJiZr6rt+wbwNmD1ekvRomqY5dBx/1QSEStn5t1t+74I/BewfGbOa0h9GE45nMwUrw8DiYgPAidSPu+5TagTnXQoh5NpUJ2IiBnAHygtpSdSaxGMiOuBazJz+1r6X1L+/7xyNO/rGMGxtxVlgep68HMWMA+Y1ZssaaIM9Qu6eqThGyjrV9adBqwKvGKcsjahpuoX1Ui1Bz+VPwFLAis2qD4MWg4TnJ3JqPV9sURT6sQA/l0OPc1F7xwPfJW2Fs+IeDGlC/30tvSnARtHxMqjeVO7hsfeesB36juqv3bnUD7IppldVdLbgJOBz2XmE73NUk+tRfkld13b/lZL8brAVTTHWhHxAPBsylN/jhqLMS+T3GsoXWH/AILm1od6ObQ0pj5ExGLAMynDKA4Gzs7MWyJifRpUJwYqh1qSRtSJiHgfsDZlSM1GbYfXq7YD1YkAOv2xNSy2CI69FYAHOuy/n2b95ft34BBgF8o4wZ8ABwHf6mGeJoMVqu0Dbfvvr7ZNqiN/okwoeQdlPO3twOnVsIIpKSI2AnYFvpSZT9LQ+tChHKB59eFeymTCqyi/L3eq9jetTgxUDtCQOhERywFHAwdk5iMdkoxrnbBFUOMiM38B/KK261cR8SBwaEQcnplzepQ1TRKZ+eW2Xf8TERcCn6W0Hk8pEbEqZZjIFcDne5ydnhmoHJpWH4DXAUtRZtV/BjgnIrboaY5643V0KIfMfLJBdeII4KbMPLUXb26L4Ni7H1i+w/4VKN0gTfbDajuVx7gMpfUX3PJt+1t/8TW9jpwJvHC0Y14mm+ov/p8DjwLbZObj1aFG1YdBymEgU7I+AGTmnzPzssz8BvBOykzqd9KwOjFIOQxkStWJiHgpsAdwUEQsX608snR1eOmIWIZxrhMGgmPvevr784F/TxBYi7I2kJptDjCftjpCWSsMrCNTTkQsSVkKZBXgLZl5b+1wY+rDEOXQdH8GFlDGiDWmTnTwZ/rLoSnWofTOXkQJ+O4HzqmOXQT8hhJXwMB1IkeTAQPBsXce8MaIWKm2753A9OpYk70bWAj09TojvZKZ84ALge3bDu1IWWPxjxOeqUkiIqZRyuXWAWaZLnIiYnFKS/jLgVmZeWv9eFPqw1DlMMA5U64+DGITyvfxzU2pEwP4dzl0OjhF68RvKa2g9Z99qmN7ALtl5t8ofwDs0HbujsCVoy0LxwiOvROBvShjGQ6nf0HpMzKzfcbPlBURv6D8MptN+QtvFvBh4NuZ2fE/+VQQEUtRlhACWANYtloIFMp/2FuBw4BLI+KblPUlNwN2B/acKsuuDFUO1fYU4AfAXyldHrtRxgu9b8IyOv6Op6z9dgCwVES8unbsusx8iAbUB4YoB0oXVxPqQ+t34wWUGZ//AjakLKp9NfDTKtmUrxNDlUNErEED6kS11NzF9X0R0fpnX+3JIgcDZ1QrkPwKeDtl4e2tR5sHA8ExlpkPRMQbKKvm/5j+R8wd0NOMTbzrgfcDz6fUs5uAA4Fje5inibAKZQxLXev1rsDJmfn7iHg7cCSwMzAX2GcKPTEAhi6HsylPnPlMlfZxSkvHNpl5DlPHltX2Cx2OvR64uCH1YahyuJpm1Acok2TeC7y4en0LcALwxcycD9CQOjFoOUTEwzSnTgwpM8+s/sD+FGUm9Rxgp9E+VQR8sogkSVJjOUZQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGur/A0w6fGYym9kTAAAAAElFTkSuQmCC\n",
+      "text/plain": [
+       "<Figure size 720x360 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "df1 = df[df[\"name\"].isin([\"QuerySamplesComplete\"])]\n",
+    "df1['delta'] = df1['ts'].diff()\n",
+    "ax = df1['delta'].plot.hist(bins=BINS, alpha=0.5, figsize=figsize)\n",
+    "ax.set_title('Time between QuerySamplesComplete (usec)');\n",
+    "plt.show()\n",
+    "\n",
+    "ax = df1['dur'].plot.hist(bins=BINS, alpha=0.5, figsize=figsize)\n",
+    "ax.set_title('Time QuerySamplesComplete (usec)');"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  },
+  "varInspector": {
+   "cols": {
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
+   },
+   "kernels_config": {
+    "python": {
+     "delete_cmd_postfix": "",
+     "delete_cmd_prefix": "del ",
+     "library": "var_list.py",
+     "varRefreshCmd": "print(var_dic_list())"
+    },
+    "r": {
+     "delete_cmd_postfix": ") ",
+     "delete_cmd_prefix": "rm(",
+     "library": "var_list.r",
+     "varRefreshCmd": "cat(var_dic_list()) "
+    }
+   },
+   "types_to_exclude": [
+    "module",
+    "function",
+    "builtin_function_or_method",
+    "instance",
+    "_Feature"
+   ],
+   "window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/utils.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/utils.cc
new file mode 100644
index 000000000..de74eb820
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/utils.cc
@@ -0,0 +1,124 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "utils.h"
+
+#include <chrono>
+#include <ctime>
+#include <fstream>
+#include <iomanip>
+#include <sstream>
+
+#include "logging.h"
+
+namespace mlperf {
+
+std::string DoubleToString(double value, int precision) {
+  std::stringstream ss;
+  ss.precision(precision);
+  ss << std::fixed << value;
+  return ss.str();
+}
+
+bool FileExists(const std::string filename) {
+  std::ifstream file_object(filename);
+  return file_object.good();
+}
+
+namespace {
+
+std::string DateTimeString(const char* format,
+                           std::chrono::system_clock::time_point tp,
+                           bool append_ms, bool utc) {
+  std::time_t tp_time_t = std::chrono::system_clock::to_time_t(tp);
+  std::tm date_time =
+      utc ? *std::gmtime(&tp_time_t) : *std::localtime(&tp_time_t);
+  constexpr size_t kDateTimeMaxSize = 256;
+  char date_time_cstring[kDateTimeMaxSize];
+  std::strftime(date_time_cstring, kDateTimeMaxSize, format, &date_time);
+  std::string date_time_string(date_time_cstring);
+  if (!append_ms) {
+    return date_time_string;
+  }
+
+  auto tp_time_t_part = std::chrono::system_clock::from_time_t(tp_time_t);
+  auto tp_remainder = tp - tp_time_t_part;
+  auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(tp_remainder)
+                .count();
+  if (ms < 0 || ms >= 1000) {
+    LogDetail([ms](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+      std::stringstream ss;
+      ss << "WARNING: Unexpected milliseconds getting date and time."
+         << " ms: " << ms;
+      MLPERF_LOG_WARNING(detail, "warning_generic_message", ss.str());
+#else
+      detail("WARNING: Unexpected milliseconds getting date and time.", "ms",
+             ms);
+#endif
+    });
+  }
+  std::string ms_string = std::to_string(ms);
+  // Prefix with zeros so length is always 3.
+  ms_string.insert(0, std::min<size_t>(2, 3 - ms_string.length()), '0');
+  return date_time_string + "." + ms_string;
+}
+
+}  // namespace
+
+std::string CurrentDateTimeISO8601() {
+  return DateTimeString("%FT%TZ", std::chrono::system_clock::now(), false,
+                        false);
+}
+
+std::string DateTimeStringForPower(std::chrono::system_clock::time_point tp) {
+  return DateTimeString("%m-%d-%Y %T", tp, true, true);
+}
+
+std::string EscapeStringJson(const std::string& in) {
+  std::stringstream ss;
+  for (auto c = in.cbegin(); c != in.cend(); c++) {
+    int c_val = static_cast<int>(*c);
+    switch (*c) {
+      case '"':
+        ss << "\\\"";
+        break;
+      case '\\':
+        ss << "\\\\";
+        break;
+      case '\b':
+        ss << "\\b";
+        break;
+      case '\f':
+        ss << "\\f";
+        break;
+      case '\n':
+        ss << "\\n";
+        break;
+      case '\r':
+        ss << "\\r";
+        break;
+      case '\t':
+        ss << "\\t";
+        break;
+      default:
+        if (c_val >= 0x00 && c_val < 0x20) {
+          ss << "\\u" << std::hex << std::setw(4) << std::setfill('0') << c_val;
+        } else {
+          ss << *c;
+        }
+    }
+  }
+  return ss.str();
+}
+
+}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/utils.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/utils.h
new file mode 100644
index 000000000..c587e0cbe
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/utils.h
@@ -0,0 +1,70 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Various shared utility functions.
+
+#ifndef MLPERF_LOADGEN_UTILS_H
+#define MLPERF_LOADGEN_UTILS_H
+
+#include <algorithm>
+#include <chrono>
+#include <string>
+
+#include "query_sample.h"
+
+namespace mlperf {
+
+template <typename T>
+void RemoveValue(T* container, const typename T::value_type& value_to_remove) {
+  container->erase(std::remove_if(container->begin(), container->end(),
+                                  [&](typename T::value_type v) {
+                                    return v == value_to_remove;
+                                  }),
+                   container->end());
+}
+
+template <typename CountT, typename RatioT>
+double DurationToSeconds(
+    const std::chrono::duration<CountT, RatioT>& chrono_duration) {
+  return std::chrono::duration_cast<std::chrono::duration<double>>(
+             chrono_duration)
+      .count();
+}
+
+inline double QuerySampleLatencyToSeconds(QuerySampleLatency qsl) {
+  return static_cast<double>(qsl) / std::nano::den;
+}
+
+template <typename DurationT>
+inline DurationT SecondsToDuration(double seconds) {
+  return std::chrono::duration_cast<DurationT>(
+      std::chrono::duration<double>(seconds));
+}
+
+std::string CurrentDateTimeISO8601();
+
+/// \brief Uses a format that matches the one used by SPEC power
+/// measurement logging.
+std::string DateTimeStringForPower(std::chrono::system_clock::time_point tp);
+
+std::string DoubleToString(double value, int precision = 2);
+
+bool FileExists(const std::string filename);
+
+// \brief Escape special characters in a string for JSON.
+// Don't use this in performance critical path.
+std::string EscapeStringJson(const std::string& in);
+
+}  // namespace mlperf
+
+#endif  // MLPERF_LOADGEN_UTILS_H
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version.cc
new file mode 100644
index 000000000..3216c9d72
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version.cc
@@ -0,0 +1,85 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Non-generated version logic.
+
+#include "version.h"
+
+#include "logging.h"
+#include "utils.h"
+
+namespace mlperf {
+
+/// Helper function to split a string based on a delimiting character.
+std::vector<std::string> splitString(const std::string& input,
+                                     const std::string& delimiter) {
+  std::vector<std::string> result;
+  size_t start = 0;
+  size_t next = 0;
+  while (next != std::string::npos) {
+    next = input.find(delimiter, start);
+    result.emplace_back(input, start, next - start);
+    start = next + 1;
+  }
+  return result;
+}
+
+/// Converts the hash-filename pairs to a dict.
+std::map<std::string, std::string> LoadgenSha1OfFilesToDict(
+    const std::string& in) {
+  std::map<std::string, std::string> result;
+  auto files = splitString(in, "\n");
+  for (const auto& file : files) {
+    auto hash_and_name = splitString(file, " ");
+    assert(hash_and_name.size() > 1);
+    result[hash_and_name[1]] = hash_and_name[0];
+  }
+  return result;
+}
+
+void LogLoadgenVersion() {
+  LogDetail([](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG(detail, "loadgen_version",
+               LoadgenVersion() + " @ " + LoadgenGitRevision());
+    MLPERF_LOG(detail, "loadgen_build_date_local", LoadgenBuildDateLocal());
+    MLPERF_LOG(detail, "loadgen_build_date_utc", LoadgenBuildDateUtc());
+    MLPERF_LOG(detail, "loadgen_git_commit_date", LoadgenGitCommitDate());
+    MLPERF_LOG(detail, "loadgen_git_log_message",
+               EscapeStringJson(LoadgenGitLog()));
+    MLPERF_LOG(detail, "loadgen_git_status_message",
+               EscapeStringJson(LoadgenGitStatus()));
+    if (!LoadgenGitStatus().empty() && LoadgenGitStatus() != "NA") {
+      MLPERF_LOG_ERROR(detail, "error_uncommitted_loadgen_changes",
+                       "Loadgen built with uncommitted changes!");
+      ;
+    }
+    MLPERF_LOG(detail, "loadgen_file_sha1",
+               LoadgenSha1OfFilesToDict(LoadgenSha1OfFiles()));
+#else
+    detail("LoadgenVersionInfo:");
+    detail("version : " + LoadgenVersion() + " @ " + LoadgenGitRevision());
+    detail("build_date_local : " + LoadgenBuildDateLocal());
+    detail("build_date_utc   : " + LoadgenBuildDateUtc());
+    detail("git_commit_date  : " + LoadgenGitCommitDate());
+    detail("git_log :\n\n" + LoadgenGitLog() + "\n");
+    detail("git_status :\n\n" + LoadgenGitStatus() + "\n");
+    if (!LoadgenGitStatus().empty() && LoadgenGitStatus() != "NA") {
+      detail.Error("Loadgen built with uncommitted changes!");
+    }
+    detail("SHA1 of files :\n\n" + LoadgenSha1OfFiles() + "\n");
+#endif
+  });
+}
+
+}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version.h
new file mode 100644
index 000000000..87c3409aa
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version.h
@@ -0,0 +1,39 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Declares the version-related strings that will be defined in
+/// a version_generated.cc as created by version_generator.py.
+
+#ifndef MLPERF_LOADGEN_VERSION_H
+#define MLPERF_LOADGEN_VERSION_H
+
+#include <string>
+
+namespace mlperf {
+
+// Non-generated.
+void LogLoadgenVersion();
+
+// Definitions generated at compile time.
+const std::string& LoadgenVersion();
+const std::string& LoadgenGitRevision();
+const std::string& LoadgenBuildDateLocal();
+const std::string& LoadgenBuildDateUtc();
+const std::string& LoadgenGitCommitDate();
+const std::string& LoadgenGitStatus();
+const std::string& LoadgenGitLog();
+const std::string& LoadgenSha1OfFiles();
+
+}  // namespace mlperf
+
+#endif  // MLPERF_LOADGEN_VERSION_H
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version_generator.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version_generator.py
new file mode 100644
index 000000000..2e7524330
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version_generator.py
@@ -0,0 +1,141 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+# \file
+#  \brief A script run by the build to generate the version definitions
+#  expected at link time.
+
+import datetime
+import errno
+import hashlib
+import os
+import sys
+import subprocess
+
+
+# Creates a C++ raw string literal using a delimiter that is very
+# unlikely to show up in a git stats.
+def make_raw_string(str):
+    delimeter = "LGVG_RSLD"
+    return 'R"' + delimeter + "(" + str + ")" + delimeter + '"'
+
+
+def func_def(name, string):
+    return (
+        "const std::string& Loadgen"
+        + name
+        + "() {\n"
+        + "  static const std::string str = "
+        + string
+        + ";\n"
+        + "  return str;\n"
+        + "}\n\n"
+    )
+
+
+# For clients that build the loadgen from the git respository without
+# any modifications.
+def generate_loadgen_version_definitions_git(ofile, git_command):
+    git_rev = os.popen(git_command + "rev-parse --short=10 HEAD").read()
+    git_commit_date = os.popen(
+        git_command +
+        "log --format=\"%cI\" -n 1").read()
+    git_status = os.popen(git_command + "status -s -uno .").read()
+    git_log = subprocess.Popen(
+        git_command + "log --pretty=oneline -n 16 --no-decorate", stdout=subprocess.PIPE, shell=True, encoding='ascii', errors="ignore").stdout.read()
+    ofile.write(func_def("GitRevision", "\"" + git_rev[0:-1] + "\""))
+    ofile.write(func_def("GitCommitDate", "\"" + git_commit_date[0:-1] + "\""))
+    ofile.write(func_def("GitStatus", make_raw_string(git_status[0:-1])))
+    ofile.write(func_def("GitLog", make_raw_string(git_log[0:-1])))
+
+
+# For clients that might not import the loadgen code as the original git
+# repository.
+def generate_loadgen_verstion_definitions_git_stubs(ofile):
+    na = '"NA"'
+    ofile.write(func_def("GitRevision", na))
+    ofile.write(func_def("GitCommitDate", na))
+    ofile.write(func_def("GitStatus", na))
+    ofile.write(func_def("GitLog", na))
+
+
+# Always log the sha1 of the loadgen files, regardless of whether we are
+# in the original git repository or not.
+def generate_loadgen_version_definitions_sha1(ofile, loadgen_root):
+    """Writes definition for Sha1OfFiles."""
+    sha1s = ""
+    loadgen_files = [
+        "/bindings/" + s for s in os.listdir(loadgen_root + "/bindings")
+    ] + ["/" + s for s in os.listdir(loadgen_root)]
+    for fn in sorted(loadgen_files):
+        full_fn = loadgen_root + fn
+        if not os.path.isfile(full_fn):
+            continue
+        file_data = open(full_fn, "rb").read()
+        sha1s += hashlib.sha1(file_data).hexdigest() + " " + fn + "\n"
+
+    ofile.write(func_def("Sha1OfFiles", make_raw_string(sha1s[0:-1])))
+
+
+# Outputs version function definitions to cc_filename.
+# Includes SHA1's of the relevant dirs in the loadgen_root directory.
+def generate_loadgen_version_definitions(cc_filename, loadgen_root):
+    """Generates the C++ source file with the loadgen version info."""
+    try:
+        os.makedirs(os.path.dirname(cc_filename))
+    except OSError as exc:
+        if exc.errno != errno.EEXIST:
+            raise
+    ofile = open(cc_filename, "w")
+    ofile.write("// DO NOT EDIT: Autogenerated by version_generator.py.\n\n")
+    ofile.write("#include <string>\n\n")
+    ofile.write("namespace mlperf {\n\n")
+    # Open and read the VERSION.txt file
+    with open(os.path.join(loadgen_root, "VERSION.txt"), "r") as version_file:
+        # Read and strip any extra whitespace/newlines
+        version_contents = version_file.read().strip()
+
+    # Write the version into the function definition
+    ofile.write(func_def("Version", f"\"{version_contents}\""))
+
+    date_time_now_local = datetime.datetime.now().isoformat()
+    date_time_now_utc = datetime.datetime.utcnow().isoformat()
+    ofile.write(func_def("BuildDateLocal", '"' + date_time_now_local + '"'))
+    ofile.write(func_def("BuildDateUtc", '"' + date_time_now_utc + '"'))
+
+    git_dir = '--git-dir="' + loadgen_root + '/../.git" '
+    git_work_tree = '--work-tree="' + loadgen_root + '/.." '
+    git_command = "git " + git_dir + git_work_tree
+    git_status = os.popen(git_command + "status")
+    git_status.read()
+    is_git_repo = git_status.close() is None
+    if is_git_repo:
+        generate_loadgen_version_definitions_git(ofile, git_command)
+    else:
+        generate_loadgen_verstion_definitions_git_stubs(ofile)
+    generate_loadgen_version_definitions_sha1(ofile, loadgen_root)
+
+    ofile.write("}  // namespace mlperf\n")
+    ofile.close()
+
+
+def main():
+    if len(sys.argv) != 3:
+        raise ValueError("Incorrect command-line arguments.")
+    generate_loadgen_version_definitions(sys.argv[1], sys.argv[2])
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/ts_types.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/ts_types.py
new file mode 100644
index 000000000..cb558726b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/ts_types.py
@@ -0,0 +1,70 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+
+"""
+TorchScript-friendly boundary types for the HSTU sparse <-> dense interface.
+
+The eager path uses ``Dict[str, SequenceEmbedding]`` (a NamedTuple of
+``lengths`` and ``embedding`` tensors). TorchScript supports ``NamedTuple`` but
+does not script cleanly through ``Dict[str, NamedTuple]`` once the dict crosses
+device boundaries. The packaged sparse / dense modules instead exchange two
+parallel ``Dict[str, Tensor]`` dicts -- one of jagged values, one of lengths.
+
+These helpers convert between the two representations so we can keep the
+existing eager code unchanged while the scripted modules use only TS-friendly
+types at their boundaries.
+"""
+
+from typing import Dict, Tuple
+
+import torch
+from generative_recommenders.modules.dlrm_hstu import SequenceEmbedding
+
+
+# Per-feature jagged values (concatenated across batch, [L_total, table_dim]).
+SeqEmbValues = Dict[str, torch.Tensor]
+# Per-feature per-batch lengths ([B]).
+SeqEmbLengths = Dict[str, torch.Tensor]
+
+
+def flatten_seq_embeddings(
+    seq_embeddings: Dict[str, SequenceEmbedding],
+) -> Tuple[SeqEmbValues, SeqEmbLengths]:
+    """Split ``Dict[str, SequenceEmbedding]`` into parallel value/length dicts.
+
+    Lossless and zero-copy -- the returned tensors alias the inputs.
+    """
+    values: Dict[str, torch.Tensor] = {}
+    lengths: Dict[str, torch.Tensor] = {}
+    for k, v in seq_embeddings.items():
+        values[k] = v.embedding
+        lengths[k] = v.lengths
+    return values, lengths
+
+
+def unflatten_seq_embeddings(
+    values: SeqEmbValues,
+    lengths: SeqEmbLengths,
+) -> Dict[str, SequenceEmbedding]:
+    """Inverse of :func:`flatten_seq_embeddings`.
+
+    Reconstructs ``Dict[str, SequenceEmbedding]`` for code paths (e.g.
+    ``DlrmHSTU.main_forward``) that still consume the NamedTuple form.
+    """
+    out: Dict[str, SequenceEmbedding] = {}
+    for k, val in values.items():
+        out[k] = SequenceEmbedding(lengths=lengths[k], embedding=val)
+    return out
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/user.conf b/recommendation_v4/generative_recommenders/dlrm_v3/inference/user.conf
new file mode 100644
index 000000000..c6ca854f9
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/user.conf
@@ -0,0 +1,5 @@
+# Please set these fields depending on the performance of your system to
+# override default LoadGen settings.
+*.Server.target_latency = 80
+# *.Server.min_duration = 20000
+# *.Offline.min_duration = 20000
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/preprocess_public_data.py b/recommendation_v4/generative_recommenders/dlrm_v3/preprocess_public_data.py
new file mode 100644
index 000000000..06b43aa48
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/preprocess_public_data.py
@@ -0,0 +1,539 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+import argparse
+import json
+import logging
+import os
+import tarfile
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+from urllib.request import urlretrieve
+
+import numpy as np
+import pandas as pd
+
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("main")
+
+"""
+Usage:
+    mkdir -p data/ && python3 preprocess_public_data.py --dataset kuairand-1k
+    python3 preprocess_public_data.py --dataset yambda-5b   --data-path <root>
+    python3 preprocess_public_data.py --dataset yambda-500m --data-path <root>
+    python3 preprocess_public_data.py --dataset yambda-50m  --data-path <root>
+"""
+
+SUPPORTED_DATASETS = [
+    "kuairand-1k",
+    "kuairand-27k",
+    "yambda-50m",
+    "yambda-500m",
+    "yambda-5b",
+]
+
+
+def get_feature_merge_weights(dataset: str = "debug") -> Dict[str, int]:
+    if "kuairand" in dataset:
+        return {
+            "is_click": 1,
+            "is_like": 2,
+            "is_follow": 4,
+            "is_comment": 8,
+            "is_forward": 16,
+            "is_hate": 32,
+            "long_view": 64,
+            "is_profile_enter": 128,
+        }
+    else:
+        return {"dummy": 1}
+
+
+class DataProcessor:
+    def __init__(
+        self,
+        download_url: str,
+        data_path: str,
+        file_name: str,
+        prefix: str,
+    ) -> None:
+        self._download_url = download_url
+        self._data_path = data_path
+        self._file_name = file_name
+        self._prefix = prefix
+
+    def download(self) -> None:
+        return
+
+    def preprocess(self) -> None:
+        return
+
+    def file_exists(self, name: str) -> bool:
+        return os.path.isfile("%s/%s" % (os.getcwd(), name))
+
+
+class DLRMKuaiRandProcessor(DataProcessor):
+    def __init__(
+        self,
+        download_url: str,
+        data_path: str,
+        file_name: str,
+        prefix: str,
+    ) -> None:
+        super().__init__(download_url, data_path, file_name, prefix)
+        if prefix == "KuaiRand-1K":
+            self._log_files: List[str] = [
+                f"{data_path}{prefix}/data/log_standard_4_08_to_4_21_1k.csv",
+                f"{data_path}{prefix}/data/log_standard_4_22_to_5_08_1k.csv",
+            ]
+            self._user_features_file: str = (
+                f"{data_path}{prefix}/data/user_features_1k.csv"
+            )
+        elif prefix == "KuaiRand-27K":
+            self._log_files: List[str] = [
+                f"{data_path}{prefix}/data/log_standard_4_08_to_4_21_27k_part1.csv",
+                f"{data_path}{prefix}/data/log_standard_4_08_to_4_21_27k_part2.csv",
+                f"{data_path}{prefix}/data/log_standard_4_22_to_5_08_27k_part1.csv",
+                f"{data_path}{prefix}/data/log_standard_4_22_to_5_08_27k_part2.csv",
+            ]
+            self._user_features_file: str = (
+                f"{data_path}{prefix}/data/user_features_27k.csv"
+            )
+        self._output_file: str = f"{data_path}{prefix}/data/processed_seqs.csv"
+        self._event_merge_weight: Dict[str, int] = get_feature_merge_weights(
+            prefix.lower()
+        )
+
+    def download(self) -> None:
+        file_path = f"{self._data_path}{self._file_name}"
+        if not self.file_exists(file_path):
+            log.info(f"Downloading {self._download_url}")
+            urlretrieve(self._download_url, file_path)
+            log.info(f"Downloaded to {file_path}")
+            with tarfile.open(file_path, "r:*") as tar_ref:
+                tar_ref.extractall(path=self._data_path)
+                log.info("Data files extracted")
+            os.remove(file_path)
+            log.info("Tar file removed")
+
+    def preprocess(self) -> None:
+        self.download()
+        log.info("Preprocessing data...")
+        seq_cols = [
+            "video_id",
+            "time_ms",
+            "action_weights",
+            "play_time_ms",
+            "duration_ms",
+        ]
+        df = None
+        for idx, log_file in enumerate(self._log_files):
+            log.info(f"Processing {log_file}...")
+            log_df = pd.read_csv(
+                log_file,
+                delimiter=",",
+            )
+            df_grouped_by_user = log_df.groupby("user_id").agg(list).reset_index()
+
+            for event, weight in self._event_merge_weight.items():
+                df_grouped_by_user[event] = df_grouped_by_user[event].apply(
+                    lambda seq: np.where(np.array(seq) == 0, 0, weight)
+                )
+
+            events = list(self._event_merge_weight.keys())
+            df_grouped_by_user["action_weights"] = df_grouped_by_user.apply(
+                lambda row: [int(sum(x)) for x in zip(*[row[col] for col in events])],
+                axis=1,
+            )
+            df_grouped_by_user = df_grouped_by_user[["user_id"] + seq_cols]
+
+            if idx == 0:
+                df = df_grouped_by_user
+            else:
+                df = df.merge(df_grouped_by_user, on="user_id", suffixes=("_x", "_y"))
+                for col in seq_cols:
+                    df[col] = df.apply(
+                        lambda row: row[col + "_x"] + row[col + "_y"], axis=1
+                    )
+                    df = df.drop(columns=[col + "_x", col + "_y"])
+
+        max_seq_len = df["video_id"].apply(len).max()
+        min_seq_len = df["video_id"].apply(len).min()
+        average_seq_len = df["video_id"].apply(len).mean()
+        log.info(f"{max_seq_len=}, {min_seq_len=}, {average_seq_len=}")
+
+        log.info("Merging user features...")
+        user_features_df = pd.read_csv(self._user_features_file, delimiter=",")
+
+        def _one_hot_encode(row):
+            mapping = {category: i + 1 for i, category in enumerate(row.unique())}
+            row = row.map(mapping)
+            return row
+
+        for col in [
+            "user_active_degree",
+            "follow_user_num_range",
+            "fans_user_num_range",
+            "friend_user_num_range",
+            "register_days_range",
+        ]:
+            user_features_df[col] = _one_hot_encode(user_features_df[col])
+
+        final_df = pd.merge(df, user_features_df, on="user_id")
+        final_df.to_csv(self._output_file, index=False, sep=",")
+        log.info(f"Processed file saved to {self._output_file}")
+
+
+# ----------------------------------------------------------------------------
+# Yambda processor
+# ----------------------------------------------------------------------------
+#
+# Yambda is hosted on HuggingFace at `yandex/yambda` and comes in three sizes:
+# 50m, 500m, 5b. Each size shares the same catalog metadata (embeddings,
+# artist/album mappings); only the interaction stream differs.
+#
+# This processor:
+#   1) Downloads `multi_event.parquet` for the chosen size + the catalog
+#      metadata files via the `datasets` library.
+#   2) Encodes event_type strings into uint8.
+#   3) Splits temporally into train + test (Global Temporal Split, GTS).
+#   4) Builds per-user sessions by inactivity gap.
+#   5) Computes item popularity counts.
+#   6) Writes the layout expected by `DLRMv3YambdaDataset`:
+#
+#      <data_path>/processed_<size>/
+#          train_sessions.parquet
+#          test_events.parquet
+#          session_index.parquet
+#          item_popularity.npy
+#          split_meta.json
+#
+#      <data_path>/shared_metadata/
+#          artist_item_mapping.parquet
+#          album_item_mapping.parquet
+#          embeddings.parquet           (optional; not used by HSTU training)
+#
+# The HSTU training path then auto-builds an `hstu_cache_L<N>/` mmap under
+# `processed_<size>/` on first use.
+# ----------------------------------------------------------------------------
+
+YAMBDA_HF_REPO = "yandex/yambda"
+YAMBDA_SIZES = {"yambda-50m": "50m", "yambda-500m": "500m", "yambda-5b": "5b"}
+YAMBDA_METADATA_FILES = (
+    "artist_item_mapping",
+    "album_item_mapping",
+    "embeddings",
+)
+
+# Yambda timestamps are seconds (rounded to 5s boundaries).
+SECONDS_PER_DAY = 86400
+# Polars chunk size for streaming the 5b parquet (~150 GB on disk).
+YAMBDA_CHUNK_SIZE = 10_000_000
+EVENT_TYPE_MAP = {"listen": 0, "like": 1, "dislike": 2, "unlike": 3, "undislike": 4}
+
+
+class DLRMYambdaProcessor(DataProcessor):
+    """Download + preprocess Yambda (50m / 500m / 5b) for DLRMv3YambdaDataset."""
+
+    def __init__(
+        self,
+        data_path: str,
+        size: str,
+        session_gap_seconds: int = 1800,
+        train_days: int = 300,
+        gap_minutes: int = 30,
+        test_days: int = 1,
+    ) -> None:
+        assert size in {"50m", "500m", "5b"}, f"unknown yambda size {size}"
+        super().__init__(
+            download_url="",  # download is via HuggingFace `datasets` lib
+            data_path=data_path.rstrip("/") + "/",
+            file_name=f"{size}/multi_event.parquet",
+            prefix=f"yambda-{size}",
+        )
+        self._size: str = size
+        self._raw_dir: Path = Path(self._data_path) / "raw"
+        self._processed_dir: Path = Path(self._data_path) / f"processed_{size}"
+        self._shared_dir: Path = Path(self._data_path) / "shared_metadata"
+        self._session_gap_seconds: int = session_gap_seconds
+        self._train_days: int = train_days
+        self._gap_minutes: int = gap_minutes
+        self._test_days: int = test_days
+
+    def download(self) -> None:
+        try:
+            from datasets import DatasetDict, load_dataset
+        except ImportError as e:
+            raise ImportError(
+                "Downloading Yambda requires the `datasets` package "
+                "(`pip install datasets`)."
+            ) from e
+
+        self._raw_dir.mkdir(parents=True, exist_ok=True)
+        self._shared_dir.mkdir(parents=True, exist_ok=True)
+
+        # Size-specific interaction stream.
+        event_path = self._raw_dir / self._size / "multi_event.parquet"
+        if not event_path.exists():
+            event_path.parent.mkdir(parents=True, exist_ok=True)
+            log.info(
+                f"Downloading multi_event.parquet for {self._size} "
+                f"from {YAMBDA_HF_REPO} ..."
+            )
+            ds = load_dataset(
+                YAMBDA_HF_REPO,
+                data_dir=f"flat/{self._size}",
+                data_files="multi_event.parquet",
+            )
+            assert isinstance(ds, DatasetDict)
+            ds["train"].to_parquet(str(event_path))
+            log.info(f"Saved {event_path}")
+        else:
+            log.info(f"Already exists: {event_path}")
+
+        # Catalog metadata files (shared across sizes).
+        for name in YAMBDA_METADATA_FILES:
+            shared_path = self._shared_dir / f"{name}.parquet"
+            if shared_path.exists():
+                log.info(f"Already exists: {shared_path}")
+                continue
+            log.info(f"Downloading {name}.parquet from {YAMBDA_HF_REPO} ...")
+            ds = load_dataset(YAMBDA_HF_REPO, data_files=f"{name}.parquet")
+            assert isinstance(ds, DatasetDict)
+            ds["train"].to_parquet(str(shared_path))
+            log.info(f"Saved {shared_path}")
+
+    def preprocess(self) -> None:
+        self.download()
+        try:
+            import polars as pl
+        except ImportError as e:
+            raise ImportError(
+                "Yambda preprocessing requires polars "
+                "(`pip install polars-u64-idx` is recommended for the 5b "
+                "variant — stock polars overflows its 32-bit row index)."
+            ) from e
+
+        self._processed_dir.mkdir(parents=True, exist_ok=True)
+        event_path = self._raw_dir / self._size / "multi_event.parquet"
+
+        log.info(f"Loading multi_event from {event_path} ...")
+        events = self._load_events(pl, event_path)
+        log.info(f"Loaded {len(events):,} events")
+
+        events = self._encode_event_types(pl, events)
+        t_min = int(events["timestamp"].min())
+        t_max = int(events["timestamp"].max())
+        log.info(
+            f"Timestamp range: {t_min}..{t_max} "
+            f"({(t_max - t_min) / SECONDS_PER_DAY:.1f} days)"
+        )
+
+        train_start, train_end, test_start, test_end = self._split_boundaries(t_max)
+        log.info(
+            f"GTS train=[{train_start},{train_end}) gap=[{train_end},{test_start}) "
+            f"test=[{test_start},{test_end})"
+        )
+        train_events, test_events = self._temporal_split(
+            pl, events, train_start, train_end, test_start, test_end
+        )
+        log.info(
+            f"Train: {len(train_events):,} events, Test: {len(test_events):,} events"
+        )
+
+        gap_units = self._session_gap_seconds  # 1 unit = 1 second
+        sessions = self._build_sessions(pl, train_events, gap_units)
+        log.info(f"Built {len(sessions):,} sessions")
+
+        session_index = self._build_session_index(pl, sessions)
+        log.info(f"Session index covers {len(session_index):,} users")
+
+        item_popularity = self._compute_item_popularity(train_events)
+
+        sessions.write_parquet(str(self._processed_dir / "train_sessions.parquet"))
+        test_events.write_parquet(str(self._processed_dir / "test_events.parquet"))
+        session_index.write_parquet(str(self._processed_dir / "session_index.parquet"))
+        np.save(self._processed_dir / "item_popularity.npy", item_popularity)
+
+        with open(self._processed_dir / "split_meta.json", "w") as f:
+            json.dump(
+                {
+                    "size": self._size,
+                    "t_min": t_min,
+                    "t_max": t_max,
+                    "train_start": train_start,
+                    "train_end": train_end,
+                    "test_start": test_start,
+                    "test_end": test_end,
+                    "train_days": self._train_days,
+                    "gap_minutes": self._gap_minutes,
+                    "test_days": self._test_days,
+                    "session_gap_seconds": self._session_gap_seconds,
+                    "num_train_events": int(len(train_events)),
+                    "num_test_events": int(len(test_events)),
+                    "num_sessions": int(len(sessions)),
+                    "num_users": int(len(session_index)),
+                },
+                f,
+                indent=2,
+            )
+        log.info(f"Preprocessing complete: {self._processed_dir}")
+
+    # ------- helpers --------
+
+    def _load_events(self, pl, parquet_path: Path):
+        # 5b is too large to load in one polars pass on most boxes (~150 GB
+        # peak in-RAM with eager read). Stream in 10M-row chunks for safety.
+        if self._size == "5b":
+            log.info(f"Streaming load (chunk_size={YAMBDA_CHUNK_SIZE:,})...")
+            lf = pl.scan_parquet(parquet_path)
+            n = lf.select(pl.len()).collect().item()
+            log.info(f"Total rows: {n:,}")
+            chunks = []
+            for off in range(0, n, YAMBDA_CHUNK_SIZE):
+                chunk = lf.slice(off, YAMBDA_CHUNK_SIZE).collect()
+                chunks.append(chunk)
+                log.info(f"  loaded {off:,}..{off + len(chunk):,}")
+            return pl.concat(chunks)
+        return pl.read_parquet(parquet_path)
+
+    def _encode_event_types(self, pl, events):
+        dt = events["event_type"].dtype
+        if dt == pl.Utf8 or isinstance(dt, (pl.Categorical, pl.Enum)):
+            events = events.with_columns(
+                pl.col("event_type")
+                .cast(pl.Utf8)
+                .replace_strict(EVENT_TYPE_MAP)
+                .cast(pl.UInt8)
+                .alias("event_type")
+            )
+        return events
+
+    def _split_boundaries(self, t_max: int) -> Tuple[int, int, int, int]:
+        test_end = t_max
+        test_start = test_end - self._test_days * SECONDS_PER_DAY
+        train_end = test_start - self._gap_minutes * 60
+        train_start = train_end - self._train_days * SECONDS_PER_DAY
+        return train_start, train_end, test_start, test_end
+
+    def _temporal_split(self, pl, events, train_start, train_end, test_start, test_end):
+        train = events.filter(
+            (pl.col("timestamp") >= train_start) & (pl.col("timestamp") < train_end)
+        )
+        test_all = events.filter(
+            (pl.col("timestamp") >= test_start) & (pl.col("timestamp") < test_end)
+        )
+        # Test users must also appear in train (next-item prediction setup).
+        train_users = train.select("uid").unique()
+        test = test_all.join(train_users, on="uid", how="inner")
+        return train, test
+
+    def _build_sessions(self, pl, events, session_gap_units: int):
+        sorted_events = events.sort(["uid", "timestamp"])
+        return (
+            sorted_events
+            .with_columns(
+                (
+                    (pl.col("timestamp").diff().fill_null(0) > session_gap_units)
+                    .cast(pl.UInt32)
+                    .cum_sum()
+                )
+                .over("uid")
+                .alias("session_id")
+            )
+            .group_by(["uid", "session_id"])
+            .agg(
+                pl.col("item_id").alias("item_ids"),
+                pl.col("timestamp").alias("timestamps"),
+                pl.col("event_type").alias("event_types"),
+                pl.col("is_organic").alias("is_organic"),
+                pl.col("played_ratio_pct").alias("played_ratio_pct"),
+                pl.col("track_length_seconds").alias("track_length_seconds"),
+            )
+            .sort(["uid", "session_id"])
+        )
+
+    def _build_session_index(self, pl, sessions):
+        return (
+            sessions
+            .with_columns(pl.col("item_ids").list.len().alias("session_len"))
+            .group_by("uid")
+            .agg(
+                pl.col("session_id").alias("session_ids"),
+                pl.col("session_len").alias("session_lens"),
+                pl.col("session_len").cum_sum().alias("session_offsets"),
+            )
+            .sort("uid")
+        )
+
+    def _compute_item_popularity(self, train_events) -> np.ndarray:
+        counts = (
+            train_events
+            .group_by("item_id")
+            .len()
+            .sort("item_id")
+        )
+        max_item = int(counts["item_id"].max())
+        popularity = np.zeros(max_item + 1, dtype=np.int64)
+        popularity[counts["item_id"].to_numpy()] = counts["len"].to_numpy()
+        return popularity
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--dataset",
+        choices=SUPPORTED_DATASETS,
+        required=True,
+        help="dataset",
+    )
+    parser.add_argument(
+        "--data-path",
+        default="data/",
+        help=(
+            "Root directory for raw + processed data. KuaiRand defaults to "
+            "the existing `data/` convention; Yambda defaults to `data/` too "
+            "but is commonly overridden to a shared filesystem location with "
+            "enough space for the 5b variant (~500 GB)."
+        ),
+    )
+    args = parser.parse_args()
+
+    data_path = args.data_path.rstrip("/") + "/"
+
+    if args.dataset == "kuairand-1k":
+        DLRMKuaiRandProcessor(
+            download_url="https://zenodo.org/records/10439422/files/KuaiRand-1K.tar.gz",
+            data_path=data_path,
+            file_name="KuaiRand-1K.tar.gz",
+            prefix="KuaiRand-1K",
+        ).preprocess()
+    elif args.dataset == "kuairand-27k":
+        DLRMKuaiRandProcessor(
+            download_url="https://zenodo.org/records/10439422/files/KuaiRand-27K.tar.gz",
+            data_path=data_path,
+            file_name="KuaiRand-27K.tar.gz",
+            prefix="KuaiRand-27K",
+        ).preprocess()
+    elif args.dataset in YAMBDA_SIZES:
+        DLRMYambdaProcessor(
+            data_path=data_path,
+            size=YAMBDA_SIZES[args.dataset],
+        ).preprocess()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/streaming_synthetic_data.py b/recommendation_v4/generative_recommenders/dlrm_v3/streaming_synthetic_data.py
new file mode 100644
index 000000000..bb9e508af
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/streaming_synthetic_data.py
@@ -0,0 +1,664 @@
+# pyre-strict
+"""
+Streaming synthetic data generator for DLRMv3.
+
+This module generates synthetic streaming recommendation data for benchmarking
+and testing purposes. It creates user-item interaction histories with timestamps,
+ratings, and category-based item distributions.
+"""
+
+import csv
+import logging
+import math
+import multiprocessing
+import os
+import random
+import shutil
+import time
+from typing import Dict, List, Tuple
+
+import numpy as np
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class StreamingSyntheticDataGenerator:
+    """
+    Generator for streaming synthetic recommendation data.
+
+    Creates realistic user-item interaction data with temporal dynamics,
+    category preferences, and rating distributions for benchmarking
+    recommendation systems.
+
+    Args:
+        num_categories: Number of item categories.
+        categories_per_user: Number of categories each user is interested in.
+        num_users: Total number of users to generate.
+        num_items: Total number of items in the catalog.
+        num_timestamps: Number of time periods in the streaming data.
+        avg_samples_per_item: Average number of interactions per item.
+        train_ratio: Fraction of timestamps used for training.
+        user_sampling_ratio: Probability of sampling a user at each timestamp.
+        num_eval_candidates: Number of candidates for evaluation.
+        num_inference_candidates: Number of candidates for inference.
+        debug: If True, use deterministic ratings for debugging.
+        rank: Process rank for distributed generation.
+    """
+
+    def __init__(
+        self,
+        num_categories: int,
+        categories_per_user: int,
+        num_users: int,
+        num_items: int,
+        num_timestamps: int,
+        avg_samples_per_item: int,
+        train_ratio: float,
+        user_sampling_ratio: float,
+        num_eval_candidates: int,
+        num_inference_candidates: int,
+        debug: bool = False,
+        rank: int = 0,
+    ) -> None:
+        self.num_categories = num_categories
+        self.categories_per_user = categories_per_user
+        self.num_users = num_users
+        self.num_items = num_items
+        self.num_timestamps = num_timestamps
+        self.avg_samples_per_item = avg_samples_per_item
+        self.avg_seq_len_per_timestamp = int(
+            num_items * avg_samples_per_item / num_users / num_timestamps
+        )
+        self.items_per_category: int = num_items // num_categories
+        self.category_to_start_end_item_idx: Dict[int, Tuple[int, int]] = {}
+        for i in range(num_categories):
+            start_idx = i * self.items_per_category
+            end_idx = (i + 1) * self.items_per_category
+            self.category_to_start_end_item_idx[i] = (start_idx, end_idx)
+        self.alpha_range = (1, 500)
+        self.min_seq_len: int = num_eval_candidates + 1
+        self.train_ratio = train_ratio
+        self.num_eval_candidates = num_eval_candidates
+        self.num_inference_candidates = num_inference_candidates
+        self.debug = debug
+        self.total_cnt = 0
+        self.rank = rank
+        logger.warning(f"rank {self.rank}: start generating item rating")
+        np.random.seed(1001)
+        self.item_rating = np.random.choice(  # pyre-ignore [4]
+            [5.0, 4.0, 3.0, 2.0, 1.0], size=num_items, p=[0.2, 0.25, 0.25, 0.2, 0.1]
+        )
+        logger.warning(f"rank {self.rank}: finish generating item rating")
+        self.user_sampling_ratio = user_sampling_ratio
+
+    def generate_one_timestamp(
+        self,
+        category_to_cnt: Dict[int, int],
+        categories: List[int],
+        t: int,
+        id: int,
+        output_folder: str,
+        uih_seq_len: int,
+        eval: bool,
+        inference: bool,
+        file_idx: int,
+        ts_buffers: Dict[int, List[int]],
+    ) -> Tuple[List[int], List[float], List[int], List[float], Dict[int, int]]:
+        """
+        Generate interaction data for a single user at one timestamp.
+
+        Args:
+            category_to_cnt: Running count of interactions per category.
+            categories: Categories this user is interested in.
+            t: Current timestamp index.
+            id: User ID.
+            output_folder: Output directory for files.
+            uih_seq_len: Length of user interaction history to generate.
+            eval: Whether this is for evaluation.
+            inference: Whether this is for inference.
+            file_idx: File index for output.
+            ts_buffers: Buffer for timestamp data.
+
+        Returns:
+            Tuple of (uih_item_ids, uih_ratings, candidate_ids, candidate_ratings,
+            updated_category_counts).
+        """
+        if t >= 0 and (not eval):
+            if t not in ts_buffers:
+                ts_buffers[t] = []
+            ts_buffers[t].append(id)
+        seq_len: int = self.num_inference_candidates if inference else uih_seq_len
+        self.total_cnt += seq_len
+        alpha = random.randint(self.alpha_range[0], self.alpha_range[1])
+        total_cnt = sum(category_to_cnt.values())
+        p = np.array(
+            [
+                (alpha / len(categories) + category_to_cnt[c]) / (alpha + total_cnt)
+                for c in categories
+            ]
+        )
+        item_categories = np.random.choice(categories, size=seq_len, p=p)
+        unique, counts = np.unique(item_categories, return_counts=True)
+        for cat, cnt in zip(unique, counts):
+            category_to_cnt[cat] += int(cnt)
+        sample_end_idx = int(
+            self.items_per_category * max((t + 1), 1) / self.num_timestamps
+        )
+        sample_inds = np.random.randint(0, sample_end_idx, size=seq_len)
+        offsets = np.array(
+            [self.category_to_start_end_item_idx[cat][0] for cat in item_categories]
+        )
+        sample_inds = sample_inds + offsets
+        num_categories = len(categories)
+        quarter = num_categories // 4
+        half = num_categories // 2
+        three_quarter = num_categories // 4 * 3
+        category_to_ratings = {}
+        cos1 = math.cos(t * math.pi / 4)
+        cos2 = math.cos((t + 2) * math.pi / 4)
+        cos3 = math.cos((t + 4) * math.pi / 4)
+        for i, cat in enumerate(categories):
+            if i < quarter:
+                if self.debug:
+                    ratings = np.full(seq_len, 5.0)
+                else:
+                    ratings = np.random.choice(
+                        [4.5 + 0.5 * cos1, 4.0 + 0.5 * cos2],
+                        size=seq_len,
+                        p=[0.8, 0.2],
+                    )
+            elif i < half:
+                if self.debug:
+                    ratings = np.full(seq_len, 4.0)
+                else:
+                    ratings = np.random.choice(
+                        [4.5 + 0.5 * cos1, 4.0 + 0.5 * cos2, 3.5 + 0.5 * cos3],
+                        size=seq_len,
+                        p=[0.1, 0.8, 0.1],
+                    )
+            elif i < three_quarter:
+                if self.debug:
+                    ratings = np.full(seq_len, 3.0)
+                else:
+                    ratings = np.random.choice(
+                        [3.5 + 0.5 * cos1, 3.0 + 0.5 * cos2, 2.5 + 0.5 * cos3],
+                        size=seq_len,
+                        p=[0.1, 0.8, 0.1],
+                    )
+            else:
+                if self.debug:
+                    ratings = np.full(seq_len, 2.0)
+                else:
+                    ratings = np.random.choice(
+                        [2.5 + 0.5 * cos1, 2.0 + 0.5 * cos2, 1.5 + 0.5 * cos3],
+                        size=seq_len,
+                        p=[0.1, 0.8, 0.1],
+                    )
+            category_to_ratings[cat] = ratings
+        sample_inds = sample_inds.tolist()
+        sample_ratings = [
+            (
+                category_to_ratings[item_categories[i]][i]
+                + self.item_rating[sample_inds[i]]
+            )
+            / 2
+            for i in range(seq_len)
+        ]
+        if not inference:
+            sub_indices = random.sample(range(seq_len), self.num_eval_candidates)
+            sample_candidate_inds = [sample_inds[i] for i in sub_indices]
+            sample_candidate_ratings = [sample_ratings[i] for i in sub_indices]
+            sample_uih_inds = sample_inds
+            sample_uih_ratings = sample_ratings
+        else:
+            sub_indices = random.sample(range(seq_len), uih_seq_len)
+            sample_uih_inds = [sample_inds[i] for i in sub_indices]
+            sample_uih_ratings = [sample_ratings[i] for i in sub_indices]
+            sample_candidate_inds = sample_inds
+            sample_candidate_ratings = sample_ratings
+        return (
+            sample_uih_inds,
+            sample_uih_ratings,
+            sample_candidate_inds,
+            sample_candidate_ratings,
+            category_to_cnt,
+        )
+
+    def gen_rand_seq_len(self) -> int:
+        """
+        Generate a random sequence length from a Gaussian distribution.
+
+        Returns:
+            Sequence length, guaranteed to be at least min_seq_len.
+        """
+        seq_len = round(
+            random.gauss(
+                self.avg_seq_len_per_timestamp, self.avg_seq_len_per_timestamp // 4
+            )
+        )
+        seq_len = self.min_seq_len if seq_len < self.min_seq_len else seq_len
+        return seq_len
+
+    def get_timestamp_sample(self, t: int) -> int:
+        """
+        Determine if a user should be sampled at this timestamp.
+
+        Args:
+            t: Timestamp index. Base timestamp (-1) is always sampled.
+
+        Returns:
+            1 if the user should be sampled, 0 otherwise.
+        """
+        if t == -1:
+            sample = 1
+        else:
+            sample = np.random.choice(
+                [1, 0],
+                size=1,
+                p=[self.user_sampling_ratio, 1 - self.user_sampling_ratio],
+            )[0]
+        return sample
+
+    def generate_one_user(
+        self,
+        id: int,
+        output_folder: str,
+        file_idx: int,
+        ts_buffers: Dict[int, List[int]],
+    ) -> List[str]:
+        """
+        Generate complete interaction history for one user.
+
+        Creates training, evaluation, and inference data for a single user
+        across all timestamps.
+
+        Args:
+            id: User ID.
+            output_folder: Output directory.
+            file_idx: File index for output.
+            ts_buffers: Buffer for timestamp metadata.
+
+        Returns:
+            List of CSV row values for this user's data.
+        """
+        categories = random.sample(range(self.num_categories), self.categories_per_user)
+        category_to_cnt = {c: 0 for c in categories}
+        out_list: List[str] = []
+        # t = -1 as base UIH
+        (
+            sample_inds,
+            sample_ratings,
+            sample_candidate_inds,
+            sample_candidate_ratings,
+            category_to_cnt,
+        ) = self.generate_one_timestamp(
+            category_to_cnt=category_to_cnt,
+            categories=categories,
+            t=-1,
+            id=id,
+            output_folder=output_folder,
+            uih_seq_len=self.gen_rand_seq_len(),
+            eval=False,
+            inference=False,
+            file_idx=file_idx,
+            ts_buffers=ts_buffers,
+        )
+        out_list.append(",".join([str(ind) for ind in sample_candidate_inds]))
+        out_list.append(",".join([str(rat) for rat in sample_candidate_ratings]))
+        out_list.append(",".join([str(ind) for ind in sample_inds]))
+        out_list.append(",".join([str(rat) for rat in sample_ratings]))
+        # train
+        for t in range(int(self.num_timestamps * self.train_ratio)):
+            if self.get_timestamp_sample(t):
+                (
+                    sample_inds,
+                    sample_ratings,
+                    sample_candidate_inds,
+                    sample_candidate_ratings,
+                    category_to_cnt,
+                ) = self.generate_one_timestamp(
+                    category_to_cnt=category_to_cnt,
+                    categories=categories,
+                    t=t,
+                    id=id,
+                    output_folder=output_folder,
+                    uih_seq_len=self.gen_rand_seq_len(),
+                    eval=False,
+                    inference=False,
+                    file_idx=file_idx,
+                    ts_buffers=ts_buffers,
+                )
+                out_list.append(",".join([str(ind) for ind in sample_candidate_inds]))
+                out_list.append(
+                    ",".join([str(rat) for rat in sample_candidate_ratings])
+                )
+                out_list.append(",".join([str(ind) for ind in sample_inds]))
+                out_list.append(",".join([str(rat) for rat in sample_ratings]))
+            else:
+                out_list += ["", "", "", ""]
+        # eval
+        (
+            sample_inds,
+            sample_ratings,
+            sample_candidate_inds,
+            sample_candidate_ratings,
+            category_to_cnt,
+        ) = self.generate_one_timestamp(
+            category_to_cnt=category_to_cnt,
+            categories=categories,
+            t=int(self.num_timestamps * self.train_ratio),
+            id=id,
+            output_folder=output_folder,
+            uih_seq_len=self.num_eval_candidates,
+            eval=True,
+            inference=False,
+            file_idx=file_idx,
+            ts_buffers=ts_buffers,
+        )
+        out_list.append(",".join([str(ind) for ind in sample_candidate_inds]))
+        out_list.append(",".join([str(rat) for rat in sample_candidate_ratings]))
+        out_list.append(",".join([str(ind) for ind in sample_inds]))
+        out_list.append(",".join([str(rat) for rat in sample_ratings]))
+        # inference
+        for t in range(
+            int(self.num_timestamps * self.train_ratio), self.num_timestamps
+        ):
+            if self.get_timestamp_sample(t):
+                (
+                    sample_inds,
+                    sample_ratings,
+                    sample_candidate_inds,
+                    sample_candidate_ratings,
+                    category_to_cnt,
+                ) = self.generate_one_timestamp(
+                    category_to_cnt=category_to_cnt,
+                    categories=categories,
+                    t=t,
+                    id=id,
+                    output_folder=output_folder,
+                    uih_seq_len=self.gen_rand_seq_len(),
+                    eval=False,
+                    inference=True,
+                    file_idx=file_idx,
+                    ts_buffers=ts_buffers,
+                )
+                out_list.append(",".join([str(ind) for ind in sample_candidate_inds]))
+                out_list.append(
+                    ",".join([str(rat) for rat in sample_candidate_ratings])
+                )
+                out_list.append(",".join([str(ind) for ind in sample_inds]))
+                out_list.append(",".join([str(rat) for rat in sample_ratings]))
+            else:
+                out_list += ["", "", "", ""]
+        return out_list
+
+    def write_dataset(
+        self, output_folder: str, num_files: int, file_idx: int, seed: int
+    ) -> None:
+        """
+        Write dataset for a single file partition.
+
+        Args:
+            output_folder: Output directory path.
+            num_files: Total number of files in the dataset.
+            file_idx: Index of this file partition.
+            seed: Random seed for reproducibility.
+        """
+        t0 = time.time()
+        num_users_per_file = self.num_users // num_files
+        user_id: int = num_users_per_file * file_idx
+        random.seed(seed + file_idx)
+        np.random.seed(seed + file_idx)
+        # Buffer timestamp data in memory to avoid excessive file I/O
+        ts_buffers: Dict[int, List[int]] = {}
+        output_file = output_folder + f"{file_idx}.csv"
+        with open(output_file, "w") as file:
+            writer = csv.writer(file)
+            for i in range(num_users_per_file):
+                out_list = self.generate_one_user(
+                    id=user_id,
+                    output_folder=output_folder,
+                    file_idx=file_idx,
+                    ts_buffers=ts_buffers,
+                )
+                user_id += 1
+                writer.writerow(out_list)
+                if i % 10000 == 0:
+                    logger.warning(
+                        f"rank {self.rank}: Done with users {i} for file {file_idx + 1} / {num_files}, total_cnt = {self.total_cnt}, spends {time.time() - t0} seconds."
+                    )
+        # Write buffered timestamp data after all users are processed
+        for ts, user_ids in ts_buffers.items():
+            ts_file = output_folder + f"ts_{file_idx}_{ts}.csv"
+            with open(ts_file, "w") as f:
+                writer = csv.writer(f)
+                for uid in user_ids:
+                    writer.writerow([uid])
+        logger.warning(
+            f"rank {self.rank}: Wrote {len(ts_buffers)} timestamp files for file {file_idx}"
+        )
+
+
+def worker(
+    rank: int,
+    world_size: int,
+    num_files: int,
+    num_users: int,
+    num_items: int,
+    num_categories: int,
+    categories_per_user: int,
+    num_timestamps: int,
+    avg_samples_per_item: int,
+    num_eval_candidates: int,
+    num_inference_candidates: int,
+    train_ratio: float,
+    user_sampling_ratio: float,
+    output_folder: str,
+) -> None:
+    """
+    Worker function for parallel data generation.
+
+    Each worker generates a subset of the dataset files.
+
+    Args:
+        rank: Worker rank.
+        world_size: Total number of workers.
+        num_files: Total files to generate.
+        num_users: Total users in dataset.
+        num_items: Total items in catalog.
+        num_categories: Number of item categories.
+        categories_per_user: Categories per user.
+        num_timestamps: Number of time periods.
+        avg_samples_per_item: Average interactions per item.
+        num_eval_candidates: Eval candidates count.
+        num_inference_candidates: Inference candidates count.
+        train_ratio: Training data fraction.
+        user_sampling_ratio: User sampling probability.
+        output_folder: Output directory.
+    """
+    generator = StreamingSyntheticDataGenerator(
+        num_categories=num_categories,
+        categories_per_user=categories_per_user,
+        num_users=num_users,
+        num_items=num_items,
+        num_timestamps=num_timestamps,
+        avg_samples_per_item=avg_samples_per_item,
+        train_ratio=train_ratio,
+        user_sampling_ratio=user_sampling_ratio,
+        num_eval_candidates=num_eval_candidates,
+        num_inference_candidates=num_inference_candidates,
+        debug=False,
+        rank=rank,
+    )
+    num_files_per_rank = num_files // world_size
+    file_indices = [i + rank * num_files_per_rank for i in range(num_files_per_rank)]
+    for file_idx in file_indices:
+        logger.warning(f"rank {rank}: start generating file {file_idx}")
+        generator.write_dataset(
+            output_folder=output_folder,
+            num_files=num_files,
+            file_idx=file_idx,
+            seed=1001,
+        )
+        logger.warning(f"rank {rank}: finish generating file {file_idx}")
+
+
+def write_offset(output_folder: str, num_files: int, num_users: int) -> None:
+    """
+    Write file byte offsets for random access to user data.
+
+    Creates an offset.csv file containing byte positions for each user
+    within their respective data files.
+
+    Args:
+        output_folder: Directory containing data files.
+        num_files: Number of data files.
+        num_users: Total number of users.
+    """
+    with open(output_folder + "offset.csv", "a") as output_file:
+        writer = csv.writer(output_file)
+        for i in range(num_files):
+            input_file = output_folder + f"{i}.csv"
+            offsets = []
+            with open(input_file, "r") as f:
+                while True:
+                    offset = f.tell()
+                    line = f.readline()
+                    if not line:
+                        break
+                    offsets.append(offset)
+            assert len(offsets) == num_users // num_files, (
+                f"num_users {num_users // num_files} != {len(offsets)}"
+            )
+            logger.warning(f"offsets for file {i} finished")
+            writer.writerow([",".join([str(offset) for offset in offsets])])
+
+
+def write_ts_metadata(output_folder: str, total_ts: int, num_files: int) -> None:
+    """
+    Write timestamp metadata for streaming simulation.
+
+    Creates files tracking which users are active at each timestamp
+    and cumulative counts for efficient streaming access.
+
+    Args:
+        output_folder: Output directory path.
+        total_ts: Total number of timestamps.
+        num_files: Number of data files.
+    """
+    with open(output_folder + "requests_per_ts.csv", "w") as file_requests:
+        with open(output_folder + "users_cumsum_per_ts.csv", "w") as file_cumsum:
+            requests_writer = csv.writer(file_requests)
+            cumsum_writer = csv.writer(file_cumsum)
+            for ts in range(total_ts):
+                requests = []
+                num_users_per_file = []
+                for file in range(num_files):
+                    with open(f"{output_folder}ts_{file}_{ts}.csv", "r") as file:
+                        reader = csv.reader(file)
+                        size = 0
+                        for row in reader:
+                            requests.append(int(row[0]))
+                            size += 1
+                        num_users_per_file.append(size)
+                cumsum = np.cumsum(num_users_per_file).tolist()
+                assert cumsum[-1] == len(requests)
+                requests_writer.writerow([",".join([str(r) for r in requests])])
+                cumsum_writer.writerow([",".join([str(s) for s in cumsum])])
+                logger.warning(f"ts {ts} finished")
+    with open(
+        output_folder + "requests_per_ts_offset.csv", "w"
+    ) as file_requests_offset:
+        writer = csv.writer(file_requests_offset)
+        input_file = output_folder + "requests_per_ts.csv"
+        offsets = []
+        with open(input_file, "r") as f:
+            while True:
+                offset = f.tell()
+                line = f.readline()
+                if not line:
+                    break
+                offsets.append(offset)
+        assert len(offsets) == total_ts, f"total_ts {total_ts} != {len(offsets)}"
+        logger.warning("offsets for file requests_per_ts.csv finished")
+        writer.writerow([",".join([str(offset) for offset in offsets])])
+
+
+def copy_sub_dataset(src_folder: str) -> None:
+    """
+    Copy a subset of dataset files for quick testing.
+
+    Creates a sampled_data subdirectory with essential files.
+
+    Args:
+        src_folder: Source folder containing full dataset.
+    """
+    dst_folder = src_folder + "sampled_data/"
+    files_to_copy = [
+        "0.csv",
+        "offset.csv",
+        "requests_per_ts.csv",
+        "requests_per_ts_offset.csv",
+        "users_cumsum_per_ts.csv",
+    ]
+    os.makedirs(dst_folder, exist_ok=True)
+    for filename in files_to_copy:
+        src_path = os.path.join(src_folder, filename)
+        dst_path = os.path.join(dst_folder, filename)
+        shutil.copy2(src_path, dst_path)
+    logger.warning("Files copied successfully.")
+
+
+def main() -> None:
+    """
+    Main entry point for synthetic data generation.
+
+    Configures and launches parallel workers to generate a complete
+    streaming recommendation dataset.
+    """
+    processes = []
+    num_files = 100
+    num_users = 5_000_000
+    num_items = 1_000_000_000
+    num_categories = 128
+    categories_per_user = 4
+    num_timestamps = 100
+    avg_samples_per_item = 50
+    num_eval_candidates = 32
+    num_inference_candidates = 2048
+    train_ratio = 0.9
+    user_sampling_ratio = 0.7
+    world_size = 5
+    username = os.getlogin()
+    output_folder = f"/home/{username}/data/streaming-100b/"
+    for i in range(world_size):
+        p = multiprocessing.Process(
+            target=worker,
+            args=(
+                i,
+                world_size,
+                num_files,
+                num_users,
+                num_items,
+                num_categories,
+                categories_per_user,
+                num_timestamps,
+                avg_samples_per_item,
+                num_eval_candidates,
+                num_inference_candidates,
+                train_ratio,
+                user_sampling_ratio,
+                output_folder,
+            ),
+        )
+        processes.append(p)
+        p.start()
+    for p in processes:
+        p.join()
+    write_offset(output_folder, num_files, num_users)
+    write_ts_metadata(output_folder, num_timestamps, num_files)
+    copy_sub_dataset(src_folder=output_folder)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/tests/test_lifetime_auc_resume.py b/recommendation_v4/generative_recommenders/dlrm_v3/tests/test_lifetime_auc_resume.py
new file mode 100644
index 000000000..3d5bbd1ee
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/tests/test_lifetime_auc_resume.py
@@ -0,0 +1,146 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+
+"""
+Round-trip correctness test for ``LifetimeAUCMetricComputation`` checkpoint
+serialization.
+
+Background: torchrec's ``AUCMetricComputation`` registers its
+PREDICTIONS/LABELS/WEIGHTS buffers with ``persistent=False``, so the default
+``state_dict()`` returns them empty and a separate ``_num_samples`` counter is
+dropped too. Without the overrides on ``LifetimeAUCMetricComputation`` every
+checkpoint resume would silently restart the lifetime AUC from an empty buffer.
+
+These tests assert:
+  1. update -> compute == A; state_dict -> load_state_dict on a fresh metric ->
+     compute == A (buffers survive the round trip).
+  2. ``_num_samples`` round-trips exactly (required so the next update() does
+     not take the init-sentinel branch and desync windowed eviction).
+  3. The shared-blob path (buffers stripped) leaves a fresh metric empty, so the
+     per-rank artifact is the sole authority for the trailing buffer.
+
+Runs in <1s on CPU. Skipped automatically if torchrec is unavailable.
+"""
+
+import unittest
+
+import torch
+
+try:
+    from generative_recommenders.dlrm_v3.utils import LifetimeAUCMetricComputation
+
+    _HAVE_DEPS = True
+except Exception:  # pragma: no cover - import guard for envs without torchrec
+    _HAVE_DEPS = False
+
+
+def _make_metric(n_tasks: int = 1, window: int = 10_000_000):
+    return LifetimeAUCMetricComputation(
+        my_rank=0,
+        batch_size=128,
+        n_tasks=n_tasks,
+        window_size=window,
+    )
+
+
+def _feed(metric, preds, labels, weights) -> None:
+    metric.update(
+        predictions=preds,
+        labels=labels,
+        weights=weights,
+    )
+
+
+@unittest.skipUnless(_HAVE_DEPS, "torchrec / generative_recommenders not importable")
+class LifetimeAUCResumeTest(unittest.TestCase):
+    def setUp(self) -> None:
+        torch.manual_seed(0)
+        self.n_tasks = 1
+        self.n = 4096
+        self.preds = torch.rand(self.n_tasks, self.n)
+        self.labels = (torch.rand(self.n_tasks, self.n) > 0.5).float()
+        self.weights = torch.ones(self.n_tasks, self.n)
+
+    def _compute_value(self, metric) -> float:
+        reports = metric._compute()
+        return float(reports[0].value.flatten()[0].item())
+
+    def test_state_dict_round_trip_preserves_auc(self) -> None:
+        m = _make_metric(self.n_tasks)
+        _feed(m, self.preds, self.labels, self.weights)
+        auc_a = self._compute_value(m)
+        n_a = m.lifetime_sample_count()
+        self.assertEqual(n_a, self.n)
+
+        sd = m.state_dict()
+
+        fresh = _make_metric(self.n_tasks)
+        fresh.load_state_dict(sd)
+        auc_b = self._compute_value(fresh)
+
+        self.assertEqual(fresh.lifetime_sample_count(), self.n)
+        self.assertAlmostEqual(auc_a, auc_b, places=6)
+
+    def test_num_samples_round_trips(self) -> None:
+        m = _make_metric(self.n_tasks)
+        _feed(m, self.preds, self.labels, self.weights)
+        sd = m.state_dict()
+        fresh = _make_metric(self.n_tasks)
+        fresh.load_state_dict(sd)
+        self.assertEqual(fresh._num_samples, m._num_samples)
+
+    def test_continued_update_after_resume_matches_uninterrupted(self) -> None:
+        # Splitting a stream and resuming in the middle must equal feeding it all
+        # at once (this is what fails when _num_samples is not restored).
+        half = self.n // 2
+        p1, p2 = self.preds[:, :half], self.preds[:, half:]
+        l1, l2 = self.labels[:, :half], self.labels[:, half:]
+        w1, w2 = self.weights[:, :half], self.weights[:, half:]
+
+        ref = _make_metric(self.n_tasks)
+        _feed(ref, p1, l1, w1)
+        _feed(ref, p2, l2, w2)
+        auc_ref = self._compute_value(ref)
+
+        part = _make_metric(self.n_tasks)
+        _feed(part, p1, l1, w1)
+        resumed = _make_metric(self.n_tasks)
+        resumed.load_state_dict(part.state_dict())
+        _feed(resumed, p2, l2, w2)
+        auc_resumed = self._compute_value(resumed)
+
+        self.assertAlmostEqual(auc_ref, auc_resumed, places=6)
+
+    def test_blob_state_dict_strips_buffers(self) -> None:
+        from generative_recommenders.dlrm_v3.checkpoint import (
+            _metric_blob_state_dict,
+        )
+
+        m = _make_metric(self.n_tasks)
+        _feed(m, self.preds, self.labels, self.weights)
+        blob = _metric_blob_state_dict(m)
+        prefix = LifetimeAUCMetricComputation._LIFETIME_KEY_PREFIX
+        self.assertFalse(any(k.startswith(prefix) for k in blob.keys()))
+
+        # A fresh metric loaded from the stripped blob must NOT have history —
+        # the per-rank artifact is the only source of the trailing buffer.
+        fresh = _make_metric(self.n_tasks)
+        fresh.load_state_dict(blob)
+        self.assertEqual(fresh.lifetime_sample_count(), 0)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/_env_bootstrap.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/_env_bootstrap.py
new file mode 100644
index 000000000..2890851de
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/_env_bootstrap.py
@@ -0,0 +1,28 @@
+"""Gin-driven env-var bootstrap.
+
+Some env vars must be set *before* certain modules import (e.g. Triton's
+`@triton.autotune` decorator reads `TRITON_FULL_AUTOTUNE` at module load
+time, well before `gin.parse_config_file` runs in the default ordering).
+
+`apply_env_bootstrap()` is `@gin.configurable`, so the gin file becomes the
+canonical source of truth. `train_ranker.py` parses gin with
+`skip_unknown=True` early in `_main_func`, calls this function to push the
+bindings into `os.environ`, then does the heavy imports.
+"""
+
+import logging
+import os
+from typing import Optional
+
+import gin
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+@gin.configurable
+def apply_env_bootstrap(
+    TRITON_FULL_AUTOTUNE: Optional[bool] = None,
+) -> None:
+    if TRITON_FULL_AUTOTUNE is not None:
+        os.environ["TRITON_FULL_AUTOTUNE"] = "1" if TRITON_FULL_AUTOTUNE else "0"
+        logger.info("env bootstrap: TRITON_FULL_AUTOTUNE=%s", os.environ["TRITON_FULL_AUTOTUNE"])
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/debug.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/debug.gin
new file mode 100644
index 000000000..9261dc222
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/debug.gin
@@ -0,0 +1,35 @@
+batch_size = 16
+dataset = "debug"
+
+# model parameters
+make_model.dataset = %dataset
+
+# dense model optimizer
+dense_optimizer_factory_and_class.learning_rate = 0.001
+dense_optimizer_factory_and_class.optimizer_name = "Adam"
+dense_optimizer_factory_and_class.momentum = 0
+dense_optimizer_factory_and_class.weight_decay = 0
+dense_optimizer_factory_and_class.eps = 1e-8
+dense_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# sparse model optimizer
+sparse_optimizer_factory_and_class.learning_rate = 0.001
+sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
+sparse_optimizer_factory_and_class.momentum = 0
+sparse_optimizer_factory_and_class.weight_decay = 0
+sparse_optimizer_factory_and_class.eps = 1e-8
+sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# dataloader configs
+make_train_test_dataloaders.batch_size = %batch_size
+make_train_test_dataloaders.dataset_type = %dataset
+make_train_test_dataloaders.train_split_percentage = 0.75
+
+# train loop variables
+train_loop.num_batches = 10
+train_loop.num_epochs = 1000
+train_loop.output_trace = True
+train_loop.metric_log_frequency = 10
+
+# logger variables
+MetricsLogger.tensorboard_log_path = "/tmp/tensorboard_log_path.log"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/kuairand_1k.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/kuairand_1k.gin
new file mode 100644
index 000000000..46d8e1272
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/kuairand_1k.gin
@@ -0,0 +1,41 @@
+batch_size = 16
+dataset = "kuairand-1k"
+
+# model parameters
+make_model.dataset = %dataset
+
+# dense model optimizer
+dense_optimizer_factory_and_class.learning_rate = 0.001
+dense_optimizer_factory_and_class.optimizer_name = "Adam"
+dense_optimizer_factory_and_class.momentum = 0
+dense_optimizer_factory_and_class.weight_decay = 0
+dense_optimizer_factory_and_class.eps = 1e-8
+dense_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# sparse model optimizer
+sparse_optimizer_factory_and_class.learning_rate = 0.001
+sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
+sparse_optimizer_factory_and_class.momentum = 0
+sparse_optimizer_factory_and_class.weight_decay = 0
+sparse_optimizer_factory_and_class.eps = 1e-8
+sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# dataloader configs
+make_train_test_dataloaders.batch_size = %batch_size
+make_train_test_dataloaders.dataset_type = %dataset
+make_train_test_dataloaders.train_split_percentage = 0.75
+make_train_test_dataloaders.new_path_prefix = "/home/linjianma"
+make_train_test_dataloaders.num_workers = 2
+make_train_test_dataloaders.prefetch_factor = 4
+
+# train loop variables
+train_loop.num_epochs = 5
+train_loop.output_trace = True
+train_loop.metric_log_frequency = 10
+
+# logger variables
+MetricsLogger.tensorboard_log_path = "/tmp/tensorboard_log_path.log"
+
+# checkpoint
+# save_dmp_checkpoint.path = "/home/linjianma/ckpts/kuairand_1k"
+# load_dmp_checkpoint.path = "/home/linjianma/ckpts/kuairand_1k/2025_01_12_17_56_43/"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_13b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_13b.gin
new file mode 100644
index 000000000..e2f371de4
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_13b.gin
@@ -0,0 +1,41 @@
+batch_size = 128
+dataset = "movielens-13b"
+
+# model parameters
+make_model.dataset = %dataset
+
+# dense model optimizer
+dense_optimizer_factory_and_class.learning_rate = 0.001
+dense_optimizer_factory_and_class.optimizer_name = "Adam"
+dense_optimizer_factory_and_class.momentum = 0
+dense_optimizer_factory_and_class.weight_decay = 0
+dense_optimizer_factory_and_class.eps = 1e-8
+dense_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# sparse model optimizer
+sparse_optimizer_factory_and_class.learning_rate = 0.001
+sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
+sparse_optimizer_factory_and_class.momentum = 0
+sparse_optimizer_factory_and_class.weight_decay = 0
+sparse_optimizer_factory_and_class.eps = 1e-8
+sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# dataloader configs
+make_train_test_dataloaders.batch_size = %batch_size
+make_train_test_dataloaders.dataset_type = %dataset
+make_train_test_dataloaders.train_split_percentage = 0.75
+make_train_test_dataloaders.new_path_prefix = "/home/linjianma"
+make_train_test_dataloaders.num_workers = 2
+make_train_test_dataloaders.prefetch_factor = 4
+
+# train loop variables
+train_loop.num_epochs = 1
+train_loop.output_trace = True
+train_loop.metric_log_frequency = 10
+train_eval_loop.num_epochs = 1
+train_eval_loop.output_trace = True
+train_eval_loop.metric_log_frequency = 10
+
+# logger variables
+MetricsLogger.tensorboard_log_path = "/tmp/tensorboard_log_path.log"
+save_dmp_checkpoint.path = "/home/linjianma/ckpts/movielens_13b"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_18b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_18b.gin
new file mode 100644
index 000000000..094271b57
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_18b.gin
@@ -0,0 +1,56 @@
+batch_size = 64
+dataset = "movielens-18b"
+
+# model parameters
+make_model.dataset = %dataset
+
+# dense model optimizer
+dense_optimizer_factory_and_class.learning_rate = 0.001
+dense_optimizer_factory_and_class.optimizer_name = "Adam"
+dense_optimizer_factory_and_class.momentum = 0
+dense_optimizer_factory_and_class.weight_decay = 0
+dense_optimizer_factory_and_class.eps = 1e-8
+dense_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# sparse model optimizer
+sparse_optimizer_factory_and_class.learning_rate = 0.001
+sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
+sparse_optimizer_factory_and_class.momentum = 0
+sparse_optimizer_factory_and_class.weight_decay = 0
+sparse_optimizer_factory_and_class.eps = 1e-8
+sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# dataloader configs
+make_train_test_dataloaders.batch_size = %batch_size
+make_train_test_dataloaders.dataset_type = %dataset
+make_train_test_dataloaders.train_split_percentage = 0.80
+make_train_test_dataloaders.new_path_prefix = "/home/linjianma"
+make_train_test_dataloaders.num_workers = 2
+make_train_test_dataloaders.prefetch_factor = 4
+make_train_test_dataloaders.num_blocks = 20
+
+# train loop variables
+train_loop.num_epochs = 200
+train_loop.output_trace = False
+train_loop.metric_log_frequency = 40
+train_loop.checkpoint_frequency = 4000
+train_loop.start_batch_idx = 0
+
+# eval loop variables
+eval_loop.metric_log_frequency = 40
+
+# train eval loop variables
+train_eval_loop.num_epochs = 20
+train_eval_loop.output_trace = False
+train_eval_loop.start_train_batch_idx = 0
+train_eval_loop.start_eval_batch_idx = 0
+train_eval_loop.num_eval_batches = 200
+train_eval_loop.metric_log_frequency = 40
+train_eval_loop.checkpoint_frequency = 2000
+train_eval_loop.eval_frequency = 500
+
+
+# logger variables
+MetricsLogger.tensorboard_log_path = "/home/linjianma/tensorboard/movielens_18b/"
+save_dmp_checkpoint.path = "/home/linjianma/ckpts/movielens_18b/"
+# load_dmp_checkpoint.path = "/home/linjianma/ckpts/movielens_18b/20000/"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_1m.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_1m.gin
new file mode 100644
index 000000000..2b6cd6b64
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_1m.gin
@@ -0,0 +1,38 @@
+batch_size = 128
+dataset = "movielens-1m"
+
+# model parameters
+make_model.dataset = %dataset
+
+# dense model optimizer
+dense_optimizer_factory_and_class.learning_rate = 0.001
+dense_optimizer_factory_and_class.optimizer_name = "Adam"
+dense_optimizer_factory_and_class.momentum = 0
+dense_optimizer_factory_and_class.weight_decay = 0
+dense_optimizer_factory_and_class.eps = 1e-8
+dense_optimizer_factory_and_class.betas = (0.9, 0.98)
+
+# sparse model optimizer
+sparse_optimizer_factory_and_class.learning_rate = 0.001
+sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
+sparse_optimizer_factory_and_class.momentum = 0
+sparse_optimizer_factory_and_class.weight_decay = 0
+sparse_optimizer_factory_and_class.eps = 1e-8
+sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# dataloader configs
+make_train_test_dataloaders.batch_size = %batch_size
+make_train_test_dataloaders.dataset_type = %dataset
+make_train_test_dataloaders.train_split_percentage = 0.75
+make_train_test_dataloaders.new_path_prefix = "/home/linjianma"
+make_train_test_dataloaders.num_workers = 2
+make_train_test_dataloaders.prefetch_factor = 4
+
+# train-eval loop variables
+train_eval_loop.num_epochs = 101
+train_eval_loop.output_trace = True
+train_eval_loop.metric_log_frequency = 10
+train_eval_loop.eval_frequency = 1
+
+# logger variables
+MetricsLogger.tensorboard_log_path = "/tmp/tensorboard_log_path.log"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_20m.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_20m.gin
new file mode 100644
index 000000000..c01fab5af
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_20m.gin
@@ -0,0 +1,56 @@
+batch_size = 64
+dataset = "movielens-20m"
+
+# model parameters
+make_model.dataset = %dataset
+
+# dense model optimizer
+dense_optimizer_factory_and_class.learning_rate = 0.001
+dense_optimizer_factory_and_class.optimizer_name = "Adam"
+dense_optimizer_factory_and_class.momentum = 0
+dense_optimizer_factory_and_class.weight_decay = 0
+dense_optimizer_factory_and_class.eps = 1e-8
+dense_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# sparse model optimizer
+sparse_optimizer_factory_and_class.learning_rate = 0.001
+sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
+sparse_optimizer_factory_and_class.momentum = 0
+sparse_optimizer_factory_and_class.weight_decay = 0
+sparse_optimizer_factory_and_class.eps = 1e-8
+sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# dataloader configs
+make_train_test_dataloaders.batch_size = %batch_size
+make_train_test_dataloaders.dataset_type = %dataset
+make_train_test_dataloaders.train_split_percentage = 0.80
+make_train_test_dataloaders.new_path_prefix = "/home/linjianma"
+make_train_test_dataloaders.num_workers = 2
+make_train_test_dataloaders.prefetch_factor = 4
+make_train_test_dataloaders.num_blocks = 1
+
+# train loop variables
+train_loop.num_epochs = 200
+train_loop.output_trace = False
+train_loop.metric_log_frequency = 40
+train_loop.checkpoint_frequency = 4000
+train_loop.start_batch_idx = 0
+
+# eval loop variables
+eval_loop.metric_log_frequency = 10
+
+# train eval loop variables
+train_eval_loop.num_epochs = 20
+train_eval_loop.output_trace = False
+train_eval_loop.start_train_batch_idx = 0
+train_eval_loop.start_eval_batch_idx = 0
+train_eval_loop.num_eval_batches = 100
+train_eval_loop.metric_log_frequency = 40
+train_eval_loop.checkpoint_frequency = 2000
+train_eval_loop.eval_frequency = 200
+
+
+# logger variables
+MetricsLogger.tensorboard_log_path = "/home/linjianma/tensorboard/movielens_20m/"
+# save_dmp_checkpoint.path = "/home/linjianma/ckpts/movielens_18b/0.5T"
+# load_dmp_checkpoint.path = "/home/linjianma/ckpts/movielens_18b/20000/"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_100b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_100b.gin
new file mode 100644
index 000000000..7d1df4bce
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_100b.gin
@@ -0,0 +1,52 @@
+batch_size = 64
+num_workers = 2
+prefetch_factor = 4
+dataset = "streaming-100b"
+
+# model parameters
+make_model.dataset = %dataset
+
+# dense model optimizer
+dense_optimizer_factory_and_class.learning_rate = 0.001
+dense_optimizer_factory_and_class.optimizer_name = "Adam"
+dense_optimizer_factory_and_class.momentum = 0
+dense_optimizer_factory_and_class.weight_decay = 0
+dense_optimizer_factory_and_class.eps = 1e-8
+dense_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# sparse model optimizer
+sparse_optimizer_factory_and_class.learning_rate = 0.001
+sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
+sparse_optimizer_factory_and_class.momentum = 0
+sparse_optimizer_factory_and_class.weight_decay = 0
+sparse_optimizer_factory_and_class.eps = 1e-8
+sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# dataloader configs
+make_train_test_dataloaders.batch_size = %batch_size
+make_train_test_dataloaders.dataset_type = %dataset
+make_train_test_dataloaders.train_split_percentage = 0.80
+make_train_test_dataloaders.new_path_prefix = "/home/linjianma"
+make_train_test_dataloaders.num_workers = %num_workers
+make_train_test_dataloaders.prefetch_factor = %prefetch_factor
+make_train_test_dataloaders.num_blocks = 20
+
+get_dataset.name = %dataset
+get_dataset.new_path_prefix = "/home/linjianma"
+
+make_streaming_dataloader.batch_size = %batch_size
+make_streaming_dataloader.num_workers = %num_workers
+make_streaming_dataloader.prefetch_factor = %prefetch_factor
+
+# train eval loop variables
+streaming_train_eval_loop.num_train_ts = 90
+streaming_train_eval_loop.output_trace = False
+streaming_train_eval_loop.num_eval_batches = 500
+streaming_train_eval_loop.metric_log_frequency = 40
+streaming_train_eval_loop.checkpoint_frequency = 3
+
+
+# logger variables
+MetricsLogger.tensorboard_log_path = "/home/linjianma/tensorboard/streaming_100b/run4/"
+save_dmp_checkpoint.path = "/home/linjianma/ckpts/streaming_100b/"
+# load_dmp_checkpoint.path = "/home/linjianma/ckpts/streaming_100b/20000/"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_200b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_200b.gin
new file mode 100644
index 000000000..872019962
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_200b.gin
@@ -0,0 +1,63 @@
+batch_size = 64
+num_workers = 2
+prefetch_factor = 4
+dataset = "streaming-200b"
+
+# model parameters
+make_model.dataset = %dataset
+
+# dense model optimizer
+dense_optimizer_factory_and_class.learning_rate = 0.001
+dense_optimizer_factory_and_class.optimizer_name = "Adam"
+dense_optimizer_factory_and_class.momentum = 0
+dense_optimizer_factory_and_class.weight_decay = 0
+dense_optimizer_factory_and_class.eps = 1e-8
+dense_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# sparse model optimizer
+sparse_optimizer_factory_and_class.learning_rate = 0.001
+sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
+sparse_optimizer_factory_and_class.momentum = 0
+sparse_optimizer_factory_and_class.weight_decay = 0
+sparse_optimizer_factory_and_class.eps = 1e-8
+sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# dataloader configs
+make_train_test_dataloaders.batch_size = %batch_size
+make_train_test_dataloaders.dataset_type = %dataset
+make_train_test_dataloaders.train_split_percentage = 0.80
+make_train_test_dataloaders.new_path_prefix = "/home/linjianma"
+make_train_test_dataloaders.num_workers = %num_workers
+make_train_test_dataloaders.prefetch_factor = %prefetch_factor
+make_train_test_dataloaders.num_blocks = 20
+
+get_dataset.name = %dataset
+get_dataset.new_path_prefix = "/home/linjianma"
+
+make_streaming_dataloader.batch_size = %batch_size
+make_streaming_dataloader.num_workers = %num_workers
+make_streaming_dataloader.prefetch_factor = %prefetch_factor
+
+# train loop variables
+train_loop.num_epochs = 200
+train_loop.output_trace = False
+train_loop.metric_log_frequency = 40
+train_loop.checkpoint_frequency = 4000
+train_loop.start_batch_idx = 0
+
+# eval loop variables
+eval_loop.metric_log_frequency = 40
+
+# train eval loop variables
+streaming_train_eval_loop.num_train_ts = 90
+streaming_train_eval_loop.output_trace = False
+streaming_train_eval_loop.num_train_batches = 5000
+streaming_train_eval_loop.num_eval_batches = 200
+streaming_train_eval_loop.metric_log_frequency = 40
+streaming_train_eval_loop.checkpoint_frequency = 2000
+
+
+# logger variables
+MetricsLogger.tensorboard_log_path = "/home/linjianma/tensorboard/streaming_200b/"
+# save_dmp_checkpoint.path = "/home/linjianma/ckpts/streaming_200b/"
+# load_dmp_checkpoint.path = "/home/linjianma/ckpts/streaming_400m/20000/"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_400m.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_400m.gin
new file mode 100644
index 000000000..eba17bc23
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_400m.gin
@@ -0,0 +1,61 @@
+batch_size = 64
+num_workers = 2
+prefetch_factor = 4
+dataset = "streaming-400m"
+
+# model parameters
+make_model.dataset = %dataset
+
+# dense model optimizer
+dense_optimizer_factory_and_class.learning_rate = 0.001
+dense_optimizer_factory_and_class.optimizer_name = "Adam"
+dense_optimizer_factory_and_class.momentum = 0
+dense_optimizer_factory_and_class.weight_decay = 0
+dense_optimizer_factory_and_class.eps = 1e-8
+dense_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# sparse model optimizer
+sparse_optimizer_factory_and_class.learning_rate = 0.001
+sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
+sparse_optimizer_factory_and_class.momentum = 0
+sparse_optimizer_factory_and_class.weight_decay = 0
+sparse_optimizer_factory_and_class.eps = 1e-8
+sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# dataloader configs
+make_train_test_dataloaders.batch_size = %batch_size
+make_train_test_dataloaders.dataset_type = %dataset
+make_train_test_dataloaders.train_split_percentage = 0.80
+make_train_test_dataloaders.new_path_prefix = "/home/linjianma"
+make_train_test_dataloaders.num_workers = %num_workers
+make_train_test_dataloaders.prefetch_factor = %prefetch_factor
+make_train_test_dataloaders.num_blocks = 20
+
+get_dataset.name = %dataset
+get_dataset.new_path_prefix = "/home/linjianma"
+
+make_streaming_dataloader.batch_size = %batch_size
+make_streaming_dataloader.num_workers = %num_workers
+make_streaming_dataloader.prefetch_factor = %prefetch_factor
+
+# train loop variables
+train_loop.num_epochs = 200
+train_loop.output_trace = False
+train_loop.metric_log_frequency = 40
+train_loop.checkpoint_frequency = 4000
+train_loop.start_batch_idx = 0
+
+# eval loop variables
+eval_loop.metric_log_frequency = 40
+
+# train eval loop variables
+streaming_train_eval_loop.num_train_ts = 8
+streaming_train_eval_loop.output_trace = False
+streaming_train_eval_loop.metric_log_frequency = 40
+streaming_train_eval_loop.checkpoint_frequency = 2000
+
+
+# logger variables
+MetricsLogger.tensorboard_log_path = "/home/linjianma/tensorboard/streaming_400m/"
+# save_dmp_checkpoint.path = "/home/linjianma/ckpts/streaming_400m/"
+# load_dmp_checkpoint.path = "/home/linjianma/ckpts/streaming_400m/20000/"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
new file mode 100644
index 000000000..dd645f06a
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -0,0 +1,541 @@
+# Per-rank batch size. Env-overridable so a denser per-sample shape can fit in
+# HBM by lowering it without editing gin. Default 1024 preserves prior runs;
+# streaming train+eval both read this macro (make_*streaming_dataloader.batch_size).
+batch_size = @bs/env_int()
+bs/env_int.key     = "BATCH_SIZE"
+bs/env_int.default = 1024
+# Dataloader parallelism. Env-overridable so a perf sweep can probe whether the
+# shuffle steady-state cost is CPU-gather latency (hidden by more workers) vs
+# GPU-side embedding work (not). Defaults preserve prior behavior (4 / 8).
+num_workers = @nw/env_int()
+nw/env_int.key     = "NUM_WORKERS"
+nw/env_int.default = 4
+prefetch_factor = @pf/env_int()
+pf/env_int.key     = "PREFETCH_FACTOR"
+pf/env_int.default = 8
+dataset = "yambda-5b"
+
+# model parameters
+make_model.dataset = %dataset
+make_model.bf16_training = True
+# HSTU attention/compute backend: "TRITON" (fused, flash-style — low HBM) or
+# "PYTORCH" (unfused; materializes the dense [B,H,N,N] score tensor, ~32 GiB at
+# N=2048/bs=1024). TRITON validated on MI350/ROCm. The HSTU_HAMMER_KERNEL env
+# var, if set, overrides this binding for one-off runs.
+make_model.hammer_kernel = "TRITON"
+
+# False = use pinned triton kernel configs (deterministic; whether that's
+# the fast or slow equilibrium depends on which config was pinned for the
+# current training shape + GPU). For a NEW training config (new shape,
+# new GPU, new triton/torch version), set True and run with
+# TRITON_PRINT_AUTOTUNING=1 to discover the fast configs, then update the
+# pinned constants in ops/triton/_autotune_pinning.py call sites.
+apply_env_bootstrap.TRITON_FULL_AUTOTUNE = False
+
+# =============================================================================
+# $SEED — global RNG seed for reproducible MODEL INITIALIZATION.
+#
+# WHAT IT CONTROLS (all weight init is a deterministic function of $SEED):
+#   1. Dense parameters (HSTU transformer blocks, MLPs, action embeddings, the
+#      postprocessor). seed_everything() seeds python/numpy/torch/torch.cuda
+#      with $SEED right before make_model(). The SAME seed is set on every rank,
+#      so dense weights are initialized identically across ranks AND reproducibly
+#      run-to-run.
+#   2. Sparse embedding tables (item_id, artist_id, album_id, uid, the cross
+#      tables). These are materialized on-device while DMP shards the model, NOT
+#      in make_model(), so seed_everything() alone does not pin them. Two
+#      mechanisms tie them to $SEED:
+#        (a) make_optimizer_and_shard() RE-SEEDS torch/torch.cuda from $SEED
+#            immediately before DistributedModelParallel(...), so the fused
+#            FBGEMM TBE init (which draws from the global RNG on-device) is
+#            reproducible. This is the path that actually applies here.
+#        (b) get_embedding_table_config() also attaches a per-table seeded
+#            init_fn (seed derived from sha256($SEED, table_name)) for the
+#            eager/non-meta code path. It no-ops on the meta device that DMP
+#            uses, so (a) is the effective guarantee for this setup.
+#   3. Any other seeded RNG consumers (e.g. dropout's init-time draws).
+#
+#   SCOPE ("Tier 1"): reproducible for a FIXED sharding plan — same GPU/world
+#   size AND same planner output. It is NOT invariant to changing the GPU count
+#   or sharding (the per-shard draw boundaries move). The init DISTRIBUTION is
+#   unchanged from stock (uniform +/-1/sqrt(num_embeddings), or a table's
+#   explicit weight_init_min/max), so $SEED affects determinism, not quality.
+#
+#   VERIFY: INIT_CHECKSUM=1 (OFF by default) logs a per-table fingerprint + a
+#   one-line "[init-checksum] SEED=.. digest=.." right after DMP. Two builds with
+#   the same $SEED + plan print the same digest; different seeds differ. It is OFF
+#   by default because the fp64 per-shard reductions materialize a full fp64 copy
+#   of each local embedding shard (>150 GiB for the big tables), leaving almost no
+#   HBM headroom after sharding and OOMing the build on any node with residual
+#   memory. Enable only for an explicit reproducibility check (ideally a clean
+#   node / smaller batch).
+#
+# WHAT IT DOES *NOT* CONTROL (separate, independent knobs):
+#   - Streaming data order / shuffle permutation -> $STREAMING_SHUFFLE_SEED
+#     (get_dataset.streaming_shuffle_seed, below).
+#   - Train/eval holdout user split               -> $SPLIT_SALT (below).
+#   So holding $STREAMING_SHUFFLE_SEED + $SPLIT_SALT fixed and varying ONLY $SEED
+#   isolates the effect of model initialization (a clean init-seed A/B / sweep).
+#
+# PARSE NOTE: seed_everything() runs right before make_model() in train_ranker
+# (after the full gin parse), so this binding resolves in the second parse where
+# env_int is registered. Override per-run via $SEED.
+# Default 1 gives a fixed, reproducible seed each run; override $SEED to vary it
+# (set $SEED = -1 to draw a fresh random seed per run).
+seed_everything.seed = @seed/env_int()
+seed/env_int.key     = "SEED"
+seed/env_int.default = 1
+
+# $DECORRELATE_DROPOUT — re-seed torch/cuda with $SEED + rank after init so HSTU
+# dropout masks differ per data-parallel rank. 1 = on, 0 = identical masks (default).
+decorrelate_runtime_rng.enabled = @drr/env_int()
+drr/env_int.key     = "DECORRELATE_DROPOUT"
+drr/env_int.default = 0
+
+# dense model optimizer
+# Learning rate is env-overridable via $DENSE_LR (default 1e-7).
+dense_optimizer_factory_and_class.learning_rate = @dlr/env_float()
+dlr/env_float.key     = "DENSE_LR"
+dlr/env_float.default = 0.0000001
+dense_optimizer_factory_and_class.optimizer_name = "Adam"
+dense_optimizer_factory_and_class.momentum = 0
+dense_optimizer_factory_and_class.weight_decay = 0
+dense_optimizer_factory_and_class.eps = 1e-8
+dense_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# sparse model optimizer
+# Learning rate is env-overridable via $SPARSE_LR (default 1e-7).
+sparse_optimizer_factory_and_class.learning_rate = @slr/env_float()
+slr/env_float.key     = "SPARSE_LR"
+slr/env_float.default = 0.0000001
+sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
+sparse_optimizer_factory_and_class.momentum = 0
+sparse_optimizer_factory_and_class.weight_decay = 0
+sparse_optimizer_factory_and_class.eps = 1e-8
+sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# Gradient clipping for the STREAMING path (clips dense params; sparse tables
+# use a fused optimizer and are unaffected). Env-overridable via $GRAD_CLIP_NORM.
+# Default 1.0 = ON (max_norm=1.0); set 0.0 via $GRAD_CLIP_NORM to disable.
+streaming_train_eval_loop.grad_clip_norm = @gcn/env_float()
+gcn/env_float.key     = "GRAD_CLIP_NORM"
+gcn/env_float.default = 1.0
+
+# Data root: resolved at runtime from $DLRM_DATA_PATH if set, else the literal
+# below. Used by both make_train_test_dataloaders and get_dataset.
+# Scoped (`data/env_path`) so this binding doesn't collide with the RUN_NAME
+# env_path binding below — every distinct env_path() call site needs its own
+# scope or the later `env_path.key=...` overrides earlier ones.
+DATA_PATH = @data/env_path()
+data/env_path.key = "DLRM_DATA_PATH"
+data/env_path.default = "/apps/chcai/dlrm_data"
+
+# Shared train:eval split: fraction of USERS used for training; the remaining
+# (1 - this) fraction are held out as a fixed eval set and NEVER trained.
+# Bound to BOTH the static train-eval path (make_train_test_dataloaders, a
+# positional split) and the streaming path (get_dataset, an explicit by-user
+# hash split), so one value configures the holdout in either mode.
+# 1.0 = no holdout (legacy streaming behavior). Override via $TRAIN_SPLIT_PERCENTAGE.
+# Default 1.0: all users are trained AND evaluated (full-coverage eval), matching
+# the alleval/qa2a production runs; set <1.0 (e.g. 0.90) for a clean held-out cohort.
+TRAIN_SPLIT_PERCENTAGE = @tsp/env_float()
+tsp/env_float.key     = "TRAIN_SPLIT_PERCENTAGE"
+tsp/env_float.default = 1.0
+
+# dataloader configs
+make_train_test_dataloaders.batch_size = %batch_size
+make_train_test_dataloaders.eval_batch_size = 1024
+make_train_test_dataloaders.dataset_type = %dataset
+make_train_test_dataloaders.train_split_percentage = %TRAIN_SPLIT_PERCENTAGE
+make_train_test_dataloaders.new_path_prefix = %DATA_PATH
+make_train_test_dataloaders.num_workers = %num_workers
+make_train_test_dataloaders.prefetch_factor = %prefetch_factor
+make_train_test_dataloaders.num_blocks = 1
+
+# embedding planner: per-rank HBM ceiling the torchrec sharder targets.
+# Override via $HBM_CAP_GB (e.g. lower to 150 to force more CW sharding).
+make_optimizer_and_shard.hbm_cap_gb = @env_int()
+env_int.key = "HBM_CAP_GB"
+env_int.default = 260
+
+# Sparse embedding all-to-all wire precision. The embedding shuffle is the
+# dominant, bandwidth-bound (esp. multi-node) collective; quantizing it via
+# TorchRec QCommsConfig halves (bf16/fp16, both 2 bytes) the wire volume.
+# Forward and backward are set independently (each: "fp32" | "bf16" | "fp16").
+# Both "fp32" = off (default; numerically identical to baseline trunk).
+# Per TorchRec golden_training, fwd=fp16 / bwd=bf16 is the recommended quantized
+# mix: fp16's mantissa suits bounded forward activations, while bf16's wider
+# exponent range avoids overflow on gradients.
+# Override via $SPARSE_A2A_FWD / $SPARSE_A2A_BWD.
+make_optimizer_and_shard.sparse_a2a_forward_precision = @saaf/env_str()
+saaf/env_str.key     = "SPARSE_A2A_FWD"
+saaf/env_str.default = "fp32"
+make_optimizer_and_shard.sparse_a2a_backward_precision = @saab/env_str()
+saab/env_str.key     = "SPARSE_A2A_BWD"
+saab/env_str.default = "fp32"
+
+get_dataset.name = %dataset
+get_dataset.new_path_prefix = %DATA_PATH
+# Total user-interaction-history (UIH) budget per sample, distributed evenly
+# across 3 behaviour pools (listen+ / like / skip) at L//3 events each.
+# Per-sample sequence the model sees =
+#     3 × (L // 3) + 8 contextual + 1 candidate
+# Choosing 4086 makes 3 × 1362 + 9 = 4095, the largest value that fits
+# get_hstu_configs.max_seq_len = 4096 with no dataset-side truncation (the
+# 4k analog of the previous 2039/2048 shape, where 3×679+9=2046 ≤ 2048).
+# Larger L overflows the budget; the dataset truncates UIH events to fit.
+# Note: like events are only 1.9% of the yambda corpus and max user lifetime
+# is ~28k events, so the like pool fills to ~105 events per anchor on
+# average (not 1362) — TRITON's jagged attention skips the unfilled slots,
+# so the under-fill costs sequence budget but not GPU compute.
+# Cache is keyed by L on disk under <processed_dir>/hstu_cache_L<N>/;
+# switching L reuses an existing cache or builds a new one. Override via
+# $HISTORY_LENGTH (default 4086 = the 4k-no-truncation shape; use 2039 with
+# MAX_SEQ_LEN=2048 to reuse the previous 2k single-task cache).
+get_dataset.history_length = @hl/env_int()
+hl/env_int.key     = "HISTORY_LENGTH"
+hl/env_int.default = 4086
+
+# UIH construction strategy — how a user's prior events become the sequence the
+# model attends over. Override via $HISTORY_STRATEGY. Both strategies scan the
+# last `scan_window` (20000) events before the anchor and consider ONLY the 3
+# behavior pools (listen+ / like / skip); dislike/unlike/undislike are excluded
+# (no action-weight bit), so the model's action_weights=[1,2,4] are unchanged.
+# The two differ only in HOW they pick events out of that scan window:
+#
+#   "interleaved" (default):
+#     - Budget = equal per-pool quota of HISTORY_LENGTH//3 (=1362) events; take
+#       the last 1362 of EACH pool independently, then merge + re-sort by time.
+#     - HISTORY_LENGTH is thus a PER-POOL cap (nominal total = 3 * L//3).
+#     - Consequence: likes are only ~1.9% of the corpus, so the like pool
+#       under-fills (~105 of its 1362 slots) and that budget is NOT reallocated
+#       — the sequence comes up short. Measured: ~2663 events effective, ~4.0%
+#       of them likes (i.e. likes OVER-represented vs their natural rate).
+#     - Use when you want guaranteed like exposure per sample (class-balanced UIH).
+#
+#   "last_n":
+#     - Budget = a single pool-agnostic cap: take the last HISTORY_LENGTH events
+#       of ANY pool type, already chronological (no per-pool split, no re-sort).
+#     - HISTORY_LENGTH is thus the LITERAL TOTAL UIH cap (not a per-pool L//3).
+#     - Consequence: the sequence fills to ~HISTORY_LENGTH (the only limit is how
+#       many pooled events exist in the scan window), so effective length is
+#       higher and the like share falls to its natural rate. Measured: ~4085
+#       events effective (~1.5x interleaved), ~1.2% of them likes.
+#     - Trade-off: more recent listen+/skip context (longer effective sequence)
+#       at the cost of fewer likes; ~1.4x eval/step compute (more non-skipped
+#       jagged-attention positions). Keep HISTORY_LENGTH=4086 to fill the 4k
+#       model budget.
+#
+# Both strategies are strategy-INDEPENDENT on disk: the gather runs at
+# sample-build time, so switching reuses the SAME hstu_cache_L<HISTORY_LENGTH>/
+# and positions cache — no rebuild needed.
+get_dataset.history_strategy = @hs/env_str()
+hs/env_str.key     = "HISTORY_STRATEGY"
+hs/env_str.default = "interleaved"
+
+# Anchor-eligibility floor: a LISTEN event becomes a trainable/eval anchor once
+# the user has >= MIN_HISTORY prior events. Decoupled from history_length (which
+# is only the gather/truncation cap) — jagged attention handles short UIH, so we
+# no longer need a full history_length of context to include a sample. The legacy
+# behavior (require a full history_length, which dropped ~60% of users) is
+# MIN_HISTORY=4086; the default below (0) includes ~all users AND their
+# cold-start first event (zero prior context). Positions/anchor-ts caches are
+# keyed by (L, MIN_HISTORY) so floors don't collide. Override via $MIN_HISTORY.
+get_dataset.min_history = @mh/env_int()
+mh/env_int.key     = "MIN_HISTORY"
+mh/env_int.default = 4086
+
+# Model-side attention budget. Dataset truncates UIH to fit this value if
+# `history_length + contextual + candidate` would overflow. Override via
+# $MAX_SEQ_LEN (default 4096, the 4k-no-truncation shape paired with
+# HISTORY_LENGTH=4086: 3*1362+9=4095 ≤ 4096). Set MAX_SEQ_LEN=2048 with
+# HISTORY_LENGTH=2039 for the previous 2k production single-task shape.
+get_hstu_configs.max_seq_len = @msl/env_int()
+msl/env_int.key     = "MAX_SEQ_LEN"
+msl/env_int.default = 4096
+
+# HSTU transformer depth (number of attention layers). Default 3.
+# Override per-run via $HSTU_NUM_LAYERS. NOTE: changing depth changes the model
+# shape, so a run with a new depth MUST use a FRESH CKPT_PATH (incompatible with
+# checkpoints of a different depth). Resolved in the full gin parse
+# (get_hstu_configs is not registered during the early skip_unknown parse), so
+# the @env_int reference is skipped on the first pass — same safe path as the LR knobs.
+get_hstu_configs.hstu_attn_num_layers = @nl/env_int()
+nl/env_int.key     = "HSTU_NUM_LAYERS"
+nl/env_int.default = 3
+
+# --- streaming (temporal-order) training -------------------------------------
+# Only consumed under `--mode streaming-train-eval`; the default train-eval
+# path above is unaffected. Trains time window T then evals window T+1,
+# advancing forward in wall-clock time (no future leakage). Window size is the
+# temporal-ordering granularity knob (default 1 day). num_train_ts is clamped
+# to the dataset's available window count at runtime; override via $NUM_TRAIN_TS.
+get_dataset.streaming_window_seconds = 86400
+get_dataset.streaming_sort_within_window = False
+# In-window shuffle to break user-major batching (consecutive sliding-window
+# anchors otherwise come from the same few users -> few unique embedding reads).
+# Diversity dial in [0,1] -- the AGREED, config-invariant benchmark knob. Maps to
+# a within-segment shuffle with K = round(fraction * per-window train-anchor
+# count): 0 = off (user-major, page-local mmap scans), 1 = full per-element
+# shuffle (max diversity), intermediate = interpolation. Same fraction => same
+# diversity regardless of world_size / #nodes / batch_size.
+#
+# Default 0.0 (no shuffle -> user-major, page-local mmap scans) so the standard
+# run matches the production streaming order. Together with the fixed seed below
+# the in-window order is fully DETERMINISTIC and identical across runs/resumes.
+# Override per-run via $STREAMING_SHUFFLE_FRACTION (e.g. 1.0 for a full
+# per-element shuffle / max diversity, 0.03 for the diversity/locality sweet spot).
+streaming_shuffle_fraction = 0.0
+get_dataset.streaming_shuffle_fraction = @ssf/env_float()
+ssf/env_float.key     = "STREAMING_SHUFFLE_FRACTION"
+ssf/env_float.default = %streaming_shuffle_fraction
+# Fixed shuffle seed -> reproducible permutation. Exposed as a knob; override via
+# $STREAMING_SHUFFLE_SEED only if you deliberately want a different draw.
+get_dataset.streaming_shuffle_seed = @ssfseed/env_int()
+ssfseed/env_int.key     = "STREAMING_SHUFFLE_SEED"
+ssfseed/env_int.default = 0
+# User-level train:eval holdout for the streaming path. With tsp<1.0, the top
+# (1 - tsp) fraction of users (by a deterministic hash of uid+split_salt) are
+# held out as a FIXED eval set and never trained -> no temporal/user leakage,
+# stable comparable eval curve, bounded eval time. split_salt lets you draw a
+# different holdout without changing the ratio. Override salt via $SPLIT_SALT.
+get_dataset.train_split_percentage = %TRAIN_SPLIT_PERCENTAGE
+get_dataset.split_salt = @ssalt/env_int()
+ssalt/env_int.key     = "SPLIT_SALT"
+ssalt/env_int.default = 0
+make_streaming_dataloader.batch_size = %batch_size
+make_streaming_dataloader.num_workers = %num_workers
+make_streaming_dataloader.prefetch_factor = %prefetch_factor
+make_persistent_streaming_dataloader.batch_size = %batch_size
+make_persistent_streaming_dataloader.num_workers = %num_workers
+make_persistent_streaming_dataloader.prefetch_factor = %prefetch_factor
+streaming_train_eval_loop.num_train_ts = @nts/env_int()
+nts/env_int.key = "NUM_TRAIN_TS"
+# 149 daily windows -> with start_ts=150 the run sweeps ts=150..298, the full
+# dense range of the corpus (matches the long e2e runs). Clamped to the
+# dataset's available window count at runtime. Override via $NUM_TRAIN_TS.
+nts/env_int.default = 149
+# Anchors need >= history_length prior events, so the first ~130 daily windows
+# are near-empty warm-up; start at a dense window. Override via $START_TS.
+streaming_train_eval_loop.start_ts = @sts/env_int()
+sts/env_int.key = "START_TS"
+sts/env_int.default = 150
+# Per-step metric logging cadence. Default 50 (one compute_and_log GPU->CPU
+# sync per 50 batches). The streaming-resume test sets METRIC_LOG_FREQ=1 so
+# every step emits a parseable "Step N metrics" line for trajectory comparison.
+streaming_train_eval_loop.metric_log_frequency = @mlf/env_int()
+mlf/env_int.key     = "METRIC_LOG_FREQ"
+mlf/env_int.default = 50
+# MLPerf train_loss event cadence (global train steps), INDEPENDENT of
+# METRIC_LOG_FREQ above. 0 (default) = fall back to METRIC_LOG_FREQ, preserving
+# the prior coupled behavior. Set $MLPERF_TRAIN_LOSS_LOG_FREQ>0 to log the MLPerf
+# train_loss event at a different rate than the console/TB metrics. Disable the
+# whole MLPerf stream with $MLPERF_LOGGING=0.
+streaming_train_eval_loop.mlperf_train_loss_log_frequency = @mltlf/env_int()
+mltlf/env_int.key     = "MLPERF_TRAIN_LOSS_LOG_FREQ"
+mltlf/env_int.default = 0
+# Diagnostic: log per-batch unique/total embedding-id counts on logged steps
+# (rank 0). Quantifies the user-major batching redundancy and the realized
+# diversity from get_dataset.streaming_shuffle_fraction. Off; set $DIAG_UNIQUE_EMB=1.
+streaming_train_eval_loop.streaming_diag_unique_emb = @due/env_int()
+due/env_int.key     = "DIAG_UNIQUE_EMB"
+due/env_int.default = 0
+# Chrome trace capture: reuses the shared Profiler.* bindings below (5-step
+# window at step 52). The streaming step counter advances across train+eval
+# batches, so step 52 lands in the first (train) window's compute. Off by
+# default to avoid profiler overhead in production runs; set $OUTPUT_TRACE=1.
+streaming_train_eval_loop.output_trace = @ot/env_int()
+ot/env_int.key     = "OUTPUT_TRACE"
+ot/env_int.default = 0
+# Reuse one DataLoader (persistent workers) across windows instead of respawning
+# per window. Skip eval to isolate window-reset cost. Override via env.
+streaming_train_eval_loop.persistent_loader = @pl/env_int()
+pl/env_int.key = "PERSISTENT_LOADER"
+pl/env_int.default = 1
+# ---- Eval cadence: choose EXACTLY ONE of the two knobs below ----------------
+# The streaming loop can decide WHEN to run the full-holdout eval pass in one of
+# two ways. They are MUTUALLY EXCLUSIVE — enabling the data-fraction cadence
+# (EVAL_EVERY_DATA_PCT>0) requires EVAL_EVERY_N_WINDOWS=0; setting both >0 raises
+# a ValueError at startup. The final end-of-run eval always runs in either mode.
+#
+# (1) PER-WINDOW cadence (EVAL_EVERY_N_WINDOWS, the default).
+# Full-holdout eval cadence (single knob; replaces the old EVAL_EACH_WINDOW
+# on/off switch). 0 = eval disabled (train-only, e.g. perf benchmarking or the
+# resume test; the eval dataloader isn't even built). 1 (default) = eval after
+# every window. N>1 (e.g. 5 via $EVAL_EVERY_N_WINDOWS) = eval every Nth window
+# (and always the final one) to amortize the cost of consuming the full next-day
+# eval window. The cadence is anchored to the absolute ts grid so eval points
+# stay stable across a mid-run resume.
+# NOTE: each daily window has a DIFFERENT number of training samples, so a
+# per-window cadence produces eval points that are UNEVENLY spaced in terms of
+# how much data was trained between them. Use the data-fraction cadence below if
+# you want evenly-spaced-by-data eval points instead.
+streaming_train_eval_loop.eval_every_n_windows = @evn/env_int()
+evn/env_int.key = "EVAL_EVERY_N_WINDOWS"
+evn/env_int.default = 1
+#
+# (2) DATA-FRACTION cadence (EVAL_EVERY_DATA_PCT).
+# Run the full-holdout eval every time the run has trained this FRACTION of the
+# run's TOTAL training data, so eval points are EVENLY spaced by data volume
+# (compute), independent of how many samples each daily window happens to hold.
+# This is the fix for the per-window cadence's uneven spacing noted above.
+#   value semantics (it is a fraction in (0, 1], NOT a percent number):
+#     0.0  (default) = OFF  -> fall back to the per-window EVAL_EVERY_N_WINDOWS.
+#     0.01           = eval every 1%  of the data  -> ~100 eval points total.
+#     0.05           = eval every 5%  of the data  -> ~20  eval points total.
+#     0.10           = eval every 10% of the data  -> ~10  eval points total.
+# How "fraction of data" becomes an eval trigger: at startup the fraction is
+# converted ONCE into a global train-step interval
+#     eval_interval_steps = round(pct * total_train_anchors / (batch_size * world_size))
+# where total_train_anchors is counted over the ORIGINAL requested window range
+# [start_ts, start_ts+num_train_ts). Eval then fires whenever the monotonic
+# train global_step crosses a multiple of that interval (it can fire MID-WINDOW,
+# i.e. partway through a daily window, and across window boundaries). Because the
+# interval is computed over the original range and global_step is
+# checkpoint-restored, the eval grid is identical on a cold start and on every
+# resume. Each eval record's label carries "@step=<global_step>" so the
+# trajectory can be plotted against data volume. Override via $EVAL_EVERY_DATA_PCT.
+streaming_train_eval_loop.eval_every_data_pct = @edp/env_float()
+edp/env_float.key     = "EVAL_EVERY_DATA_PCT"
+edp/env_float.default = 0.0
+# Double-buffer windows: prepare the next window (index mask + first-batch
+# prefetch) in a background thread during the current window's compute, hiding
+# the per-window reset. Needs persistent_loader=1. Override via env.
+streaming_train_eval_loop.double_buffer = @db/env_int()
+db/env_int.key = "DOUBLE_BUFFER"
+db/env_int.default = 1
+# Fixed eval-holdout window range (held-out users' anchors over these windows
+# form the eval set evaluated at EVERY eval step). EVAL_HOLDOUT_TS<0 (default)
+# resolves at runtime to start_ts+num_train_ts (the window just past training),
+# which is stable across resume. EVAL_HOLDOUT_NUM_WINDOWS widens the eval span.
+streaming_train_eval_loop.eval_holdout_ts = @eht/env_int()
+eht/env_int.key     = "EVAL_HOLDOUT_TS"
+eht/env_int.default = -1
+streaming_train_eval_loop.eval_holdout_num_windows = @ehnw/env_int()
+ehnw/env_int.key     = "EVAL_HOLDOUT_NUM_WINDOWS"
+ehnw/env_int.default = 1
+# num_train_batches / num_eval_batches unset => consume each full window.
+# Set them (e.g. via gin) to cap per-window steps for short experiments.
+
+# Default (non-streaming) train-eval loop variables; used unless
+# `--mode streaming-train-eval` selects the temporal-order path configured above.
+train_eval_loop.num_epochs = 1
+train_eval_loop.metric_log_frequency = 50
+train_eval_loop.eval_frequency = 500
+train_eval_loop.num_eval_batches = 200
+train_eval_loop.checkpoint_frequency = 1000000000  # disable mid-training checkpoints (disk-full guard)
+train_eval_loop.output_trace = True
+# 3-stage TorchRec pipeline: overlaps the embedding all-to-all (the dominant
+# exposed-comm bottleneck) with dense fwd/bwd. Set False to fall back to the
+# sequential fwd/bwd loop.
+train_eval_loop.use_pipeline = False
+
+# Run name → recommendation_v4/results/<run_name>/ (override via $RUN_NAME env).
+RUN_NAME = @run/env_path()
+run/env_path.key     = "RUN_NAME"
+run/env_path.default = "default"
+run_results_dir.run_name = %RUN_NAME
+
+# profiler: capture a 5-step window starting at step 52, every rank.
+# `trim_warmup = True` (default) post-filters the chrome trace so only the 5
+# `active` steps appear; the 1 WARMUP step still runs (lets CUPTI/ROCprof
+# settle) but its events are dropped from the output file.
+Profiler.trace_steps = [52]
+Profiler.warmup = 1
+Profiler.active = 5
+Profiler.trace_dir = @run_results_dir()
+
+# logger variables
+# TensorBoard event dir. DISABLED BY DEFAULT (empty path): the shared-NFS
+# tfevents writer is the only metrics sink whose I/O error is uncaught, and it
+# repeatedly crashed trainers on transient /apps `Errno 121` (Remote I/O) hiccups.
+# Nothing we consume reads TensorBoard — eval-window AUCs come from the durable
+# `<LOG>.metrics.jsonl` sink (try/except-guarded) and the text run log. An empty
+# path makes MetricsLogger install a no-op writer (see _NoOpSummaryWriter).
+# Re-enable for a run by setting $TENSORBOARD_LOG_PATH to a non-empty dir.
+MetricsLogger.tensorboard_log_path = @tbp/env_path()
+tbp/env_path.key     = "TENSORBOARD_LOG_PATH"
+tbp/env_path.default = ""
+MetricsLogger.world_size = 8
+# MLPerf convergence target: run stops when the selected eval AUC reaches it.
+# Default 1.0 is unreachable, so the run trains through all windows (no early
+# stop) out of the box; set $AUC_THRESHOLD=0.80275 for the MLPerf target.
+MetricsLogger.auc_threshold = @at/env_float()
+at/env_float.key     = "AUC_THRESHOLD"
+at/env_float.default = 1.0
+# EVAL_ACCURACY + early-stop are driven by per-window AUC (window_auc) vs the
+# threshold above.
+# Lifetime-AUC backend, selectable independently for the train cumulative AUC and
+# the eval cumulative ("lifetime_*") AUC. Both default to "binned":
+#   "binned" = BinnedCumulativeAUC: exact-cumulative AUC from an O(bins) score
+#     histogram (additive all-reduce, memory independent of #samples/#windows).
+#   "capped" = LifetimeAUCMetricComputation: AUC over a trailing buffer of
+#     `lifetime_auc_window` samples/rank (legacy; per-rank buffer all-gathered).
+# Override per-run via $TRAIN_LIFETIME_AUC_MODE / $EVAL_LIFETIME_AUC_MODE.
+MetricsLogger.train_lifetime_auc_mode = @tlam/env_str()
+tlam/env_str.key     = "TRAIN_LIFETIME_AUC_MODE"
+tlam/env_str.default = "binned"
+MetricsLogger.eval_lifetime_auc_mode = @elam/env_str()
+elam/env_str.key     = "EVAL_LIFETIME_AUC_MODE"
+elam/env_str.default = "binned"
+# Score-histogram resolution for the "binned" backend. Higher = finer AUC
+# resolution at O(bins) memory. Override via $CUMULATIVE_AUC_BINS.
+MetricsLogger.cumulative_auc_bins = @cab/env_int()
+cab/env_int.key     = "CUMULATIVE_AUC_BINS"
+cab/env_int.default = 100000
+# Trailing-buffer size (samples/rank) for the "capped" backend. Override via
+# $LIFETIME_AUC_WINDOW. Ignored when the backend is "binned".
+MetricsLogger.lifetime_auc_window = @law/env_int()
+law/env_int.key     = "LIFETIME_AUC_WINDOW"
+law/env_int.default = 10000000
+# Checkpointing disabled by default — a full DMP checkpoint is ~100s of GB and
+# the streaming loop always saves on the final window. save_dmp_checkpoint
+# no-ops on the empty path. Set $CKPT_PATH to a directory to re-enable; the
+# load path auto-resolves to the highest-numbered numeric subdir inside it.
+save_dmp_checkpoint.path = @ckpt/env_path()
+ckpt/env_path.key     = "CKPT_PATH"
+ckpt/env_path.default = ""
+load_dmp_checkpoint.path = @ckpt/env_path()
+# Retention: keep only the most-recent N numeric subdirs after each successful
+# save (atomic rename + prune-older). Override via $KEEP_LAST_N.
+save_dmp_checkpoint.keep_last_n = @kln/env_int()
+kln/env_int.key     = "KEEP_LAST_N"
+kln/env_int.default = 1
+# In-window checkpoint cadence for streaming-train-eval. 0 (default) = end-of-
+# window only; on crash mid-window the partial progress is lost. Set N>0 (e.g.
+# via $IN_WINDOW_CKPT_FREQ) for batch-granularity exact-once-on-resume — paired
+# with the auto-latest load above, the resumed run skips the K already-trained
+# batches of the partial window and continues with bit-equal trajectory.
+streaming_train_eval_loop.in_window_checkpoint_frequency = @iwcf/env_int()
+iwcf/env_int.key     = "IN_WINDOW_CKPT_FREQ"
+iwcf/env_int.default = 0
+# Global-step checkpoint cadence: save whenever the monotonic train global_step
+# crosses a multiple of N (a true "every 1000 steps" trigger that spans windows
+# and survives resume). 0 (default) = off. Override via $CKPT_STEP_FREQ.
+streaming_train_eval_loop.checkpoint_step_frequency = @csf/env_int()
+csf/env_int.key     = "CKPT_STEP_FREQ"
+csf/env_int.default = 0
+# Wall-clock checkpoint cadence in seconds: save when >= this many seconds have
+# elapsed since the last save (e.g. 3600 for hourly). Rank 0 owns the clock and
+# broadcasts the decision so all ranks save together. 0.0 (default) = off.
+# Override via $CKPT_TIME_INTERVAL_S.
+streaming_train_eval_loop.checkpoint_time_interval_s = @ctis/env_float()
+ctis/env_float.key     = "CKPT_TIME_INTERVAL_S"
+ctis/env_float.default = 0.0
+# Cap each train_ts window's batch count (mostly for the resume test driver).
+# Unset / 0 = use the full window.
+streaming_train_eval_loop.num_train_batches = @ntb/env_int()
+ntb/env_int.key     = "NUM_TRAIN_BATCHES"
+ntb/env_int.default = 0
+# Cap each eval (full-holdout) window's batch count. Unset / <=0 = consume the
+# full eval window (the genuine full-holdout NE/AUC; this is what the long run
+# uses). Set >0 via $NUM_EVAL_BATCHES to subsample eval for fast validation.
+streaming_train_eval_loop.num_eval_batches = @neb/env_int()
+neb/env_int.key     = "NUM_EVAL_BATCHES"
+neb/env_int.default = 0
+# Test-only failure injection: when >=0 and metric_logger.global_step['train']
+# reaches this, the process exits with code 42 right after the in-window save
+# fires. Used by the streaming resume test harness (train/tests/) to verify resume.
+streaming_train_eval_loop.die_at_step = @das/env_int()
+das/env_int.key     = "DIE_AT_STEP"
+das/env_int.default = -1
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/mlperf_logging_utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/mlperf_logging_utils.py
new file mode 100644
index 000000000..8a716f87b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/mlperf_logging_utils.py
@@ -0,0 +1,413 @@
+# Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+"""MLPerf Training compliance logging for the DLRMv3 streaming-train-eval path.
+
+Rank-0-gated wrapper around ``mlperf_logging.mllog`` so the streaming loop emits
+the MLPerf event stream without every call site re-checking rank or the dep.
+"""
+
+import logging
+import os
+from typing import Any, Dict, Optional
+
+import gin
+import torch
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+try:
+    from mlperf_logging import mllog
+    from mlperf_logging.mllog import constants as mllog_constants
+
+    _MLLOG_AVAILABLE = True
+except Exception as e:  # pragma: no cover - import-time guard
+    mllog = None  # type: ignore[assignment]
+    mllog_constants = None  # type: ignore[assignment]
+    _MLLOG_AVAILABLE = False
+    logger.warning(
+        "mlperf_logging not importable (%s); MLPerf logging disabled. "
+        "Install via `pip install git+https://github.com/mlcommons/logging.git`.",
+        e,
+    )
+
+
+def _rank() -> int:
+    if torch.distributed.is_available() and torch.distributed.is_initialized():
+        return torch.distributed.get_rank()
+    return 0
+
+
+def _barrier() -> None:
+    if torch.distributed.is_available() and torch.distributed.is_initialized():
+        torch.distributed.barrier()
+
+
+class MLPerfLogger:
+    """Rank-0-gated facade over ``mllog``.
+
+    Event methods no-op on non-zero ranks and when mlperf_logging is absent.
+    ``sync=True`` barriers before emit so the timestamp reflects the slowest rank
+    (required for INIT_STOP/RUN_START/RUN_STOP).
+    """
+
+    def __init__(
+        self,
+        rank: Optional[int] = None,
+        log_path: Optional[str] = None,
+        default_stack_offset: int = 2,
+        benchmark_name: str = "hstu",
+        submitter_name: str = "AMD",
+        submission_platform: str = "MI355X",
+        fresh: bool = True,
+    ):
+        self.enabled: bool = _MLLOG_AVAILABLE
+        # Use the EXPLICIT caller rank: this is built before init_process_group,
+        # when dist.get_rank() would return 0 on every rank (all would log).
+        self.rank: int = rank if rank is not None else _rank()
+        self.benchmark_name: str = benchmark_name
+        self.submitter_name: str = submitter_name
+        self.submission_platform: str = submission_platform
+        self._logger = None
+        if not self.enabled:
+            return
+        # Only rank 0 emits, so only rank 0 needs the file handler.
+        if log_path and self.rank == 0:
+            log_dir = os.path.dirname(log_path)
+            if log_dir:  # guard: os.makedirs("") raises for a bare filename
+                os.makedirs(log_dir, exist_ok=True)
+            # mllog's FileHandler APPENDS (mode "a"), which is what a resume needs
+            # so the single run's event stream accumulates across relaunches into
+            # one file. On a genuine cold start, truncate first so a re-used run
+            # dir / a previous crashed-cold-start's orphaned stream can't leave a
+            # second run_start in the file (the compliance checker requires
+            # EXACTLY_ONE). Resume (fresh=False) appends to continue the stream.
+            if fresh:
+                open(log_path, "w").close()
+            mllog.config(filename=log_path, default_stack_offset=default_stack_offset)
+        else:
+            mllog.config(default_stack_offset=default_stack_offset)
+        self._logger = mllog.get_mllogger()
+
+    @property
+    def constants(self):  # pyre-ignore[3]
+        return mllog_constants
+
+    def event(
+        self,
+        key: str,
+        value: Any = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        sync: bool = False,
+    ) -> None:
+        if sync:
+            _barrier()
+        if self.enabled and self.rank == 0:
+            self._logger.event(key=key, value=value, metadata=metadata or {})
+
+    def start(
+        self,
+        key: str,
+        value: Any = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        sync: bool = False,
+    ) -> None:
+        if sync:
+            _barrier()
+        if self.enabled and self.rank == 0:
+            self._logger.start(key=key, value=value, metadata=metadata or {})
+
+    def end(
+        self,
+        key: str,
+        value: Any = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        sync: bool = False,
+    ) -> None:
+        if sync:
+            _barrier()
+        if self.enabled and self.rank == 0:
+            self._logger.end(key=key, value=value, metadata=metadata or {})
+
+    def submission_info(self) -> None:
+        """Emit the five SUBMISSION_* events required for a valid submission."""
+        if not (self.enabled and self.rank == 0):
+            return
+        c = mllog_constants
+        self.event(key=c.SUBMISSION_BENCHMARK, value=self.benchmark_name)
+        self.event(key=c.SUBMISSION_ORG, value=self.submitter_name)
+        self.event(key=c.SUBMISSION_DIVISION, value=c.CLOSED)
+        self.event(key=c.SUBMISSION_STATUS, value=c.ONPREM)
+        self.event(key=c.SUBMISSION_PLATFORM, value=self.submission_platform)
+
+    def log_run_start(
+        self,
+        global_batch_size: int,
+        seed: int,
+        gradient_accumulation_steps: int = 1,
+    ) -> None:
+        """Emit submission info + core hyperparameters, then INIT_STOP + RUN_START.
+
+        Optimizer names/LRs are read from gin (dense Adam + sparse RowWiseAdagrad),
+        resolving env-macro refs to concrete values. Call once on a genuine cold
+        start, after the model is built. INIT_STOP/RUN_START barrier so the
+        timestamp reflects the slowest rank, so ALL ranks must call this together
+        (non-rank-0 / disabled calls no-op the emit but still hit the barrier).
+        """
+        c = self.constants
+        self.submission_info()
+        self.event(key=c.GLOBAL_BATCH_SIZE, value=int(global_batch_size))
+        self.event(
+            key=c.GRADIENT_ACCUMULATION_STEPS, value=int(gradient_accumulation_steps)
+        )
+        self.event(key=c.SEED, value=int(seed))
+        self.event(
+            key=c.OPT_NAME,
+            value=_gin_param("dense_optimizer_factory_and_class.optimizer_name", "Adam"),
+        )
+        self.event(
+            key=c.OPT_BASE_LR,
+            value=_gin_param("dense_optimizer_factory_and_class.learning_rate", None),
+        )
+        self.event(
+            key="opt_sparse_name",
+            value=_gin_param(
+                "sparse_optimizer_factory_and_class.optimizer_name", "RowWiseAdagrad"
+            ),
+        )
+        self.event(
+            key="opt_sparse_base_learning_rate",
+            value=_gin_param(
+                "sparse_optimizer_factory_and_class.learning_rate", None
+            ),
+        )
+        self.end(key=c.INIT_STOP, sync=True)
+        self.start(key=c.RUN_START, sync=True)
+
+
+def _gin_param(name: str, default: Any) -> Any:
+    """Read a gin-bound parameter, resolving env-macro refs to concrete values.
+
+    Returns ``default`` if the parameter is unbound or a macro ref cannot be
+    resolved (so env-overridden LRs log as numbers, not unencodable objects).
+    """
+    try:
+        value = gin.query_parameter(name)
+    except (ValueError, KeyError):
+        return default
+    if hasattr(value, "scoped_configurable_fn"):
+        try:
+            return value.scoped_configurable_fn()
+        except Exception:
+            return default
+    return value
+
+
+class MLPerfRunTracker:
+    """Centralized MLPerf run-boundary state machine for the streaming loop.
+
+    Owns the block/eval/run markers, the SAMPLES_COUNT/EPOCH_NUM progress
+    metadata, and the convergence decision (per-window AUC vs the configured
+    ``auc_threshold``). Every method no-ops when ``logger`` is None, so the
+    streaming loop can call them unconditionally. The convergence metric is
+    fixed to per-window AUC (higher-is-better).
+    """
+
+    # MetricsLogger.compute key short name for per-window AUC.
+    _EVAL_METRIC_SHORT = "window_auc"
+
+    def __init__(
+        self,
+        logger: Optional[MLPerfLogger],
+        metric_logger: Any,
+        total_train_samples: int,
+        rank: int,
+        device: Any,
+    ):
+        self.logger = logger
+        self.metric_logger = metric_logger
+        self.total_train_samples = int(total_train_samples)
+        self.rank = int(rank)
+        self.device = device
+        self.run_stopped: bool = False
+        # Idempotency flag so the boundary helpers and the outer loop can both
+        # call start/stop without risking a double BLOCK_START/STOP.
+        self._block_open: bool = False
+
+    @property
+    def enabled(self) -> bool:
+        return self.logger is not None
+
+    def _progress(self) -> Dict[str, Any]:
+        c = self.logger.constants
+        samples = self.metric_logger.cumulative_train_samples
+        epoch = (
+            samples / self.total_train_samples if self.total_train_samples > 0 else 0.0
+        )
+        return {c.SAMPLES_COUNT: samples, c.EPOCH_NUM: epoch}
+
+    def log_dataset_sizes(self, eval_samples: Optional[int] = None) -> None:
+        if not self.enabled:
+            return
+        c = self.logger.constants
+        self.logger.event(key=c.TRAIN_SAMPLES, value=self.total_train_samples)
+        if eval_samples is not None:
+            self.logger.event(key=c.EVAL_SAMPLES, value=int(eval_samples))
+
+    def block_start(self) -> None:
+        if self.enabled and not self._block_open:
+            self.logger.start(
+                key=self.logger.constants.BLOCK_START, metadata=self._progress()
+            )
+            self._block_open = True
+
+    def block_stop(self) -> None:
+        if self.enabled and self._block_open:
+            self.logger.end(
+                key=self.logger.constants.BLOCK_STOP, metadata=self._progress()
+            )
+            self._block_open = False
+
+    def eval_start(self) -> None:
+        if self.enabled:
+            self.logger.start(
+                key=self.logger.constants.EVAL_START, metadata=self._progress()
+            )
+
+    def _target_metric(self, metrics: Dict[str, float]) -> Optional[float]:
+        # Key format `metric/{prefix}_{name}/{task}` (see MetricsLogger.compute);
+        # match the per-window AUC short name.
+        for key, val in metrics.items():
+            short = key.split("/")[-2] if "/" in key else key
+            if short == self._EVAL_METRIC_SHORT:
+                return float(val)
+        return None
+
+    def _meets_target(self, value: Optional[float]) -> bool:
+        thr = self.metric_logger.auc_threshold
+        if value is None or thr is None:
+            return False
+        return value >= thr
+
+    def run_stop(self, status: object) -> None:
+        # Emit RUN_STOP exactly once, with an all-rank barrier so the timestamp
+        # reflects the slowest rank (MLPerf requirement).
+        if not self.enabled or self.run_stopped:
+            return
+        c = self.logger.constants
+        self.logger.end(
+            key=c.RUN_STOP,
+            metadata={c.STATUS: status, **self._progress()},
+            sync=True,
+        )
+        self.run_stopped = True
+
+    def eval_stop(self, eval_metrics: Dict[str, float]) -> bool:
+        # Emit EVAL_ACCURACY + EVAL_STOP, early SUCCESS RUN_STOP on target.
+        # Rank 0 decides + broadcasts the stop bool so all ranks break in lockstep
+        # (a per-rank test could diverge and hang the next all-to-all).
+        if not self.enabled:
+            return False
+        c = self.logger.constants
+        eval_value = self._target_metric(eval_metrics)
+        if eval_value is not None:
+            self.logger.event(
+                key=c.EVAL_ACCURACY, value=eval_value, metadata=self._progress()
+            )
+        self.logger.end(key=c.EVAL_STOP, metadata=self._progress())
+        decision = torch.zeros(1, device=self.device)
+        if self.rank == 0 and not self.run_stopped and self._meets_target(eval_value):
+            decision[0] = 1.0
+        if torch.distributed.is_available() and torch.distributed.is_initialized():
+            torch.distributed.broadcast(decision, src=0)
+        should_stop = bool(decision.item() > 0.5)
+        if should_stop:
+            # All ranks agree -> all reach the RUN_STOP barrier together.
+            self.run_stop(c.SUCCESS)
+        return should_stop
+
+    def finalize(self, final_metrics: Dict[str, float]) -> None:
+        # End-of-run RUN_STOP when the target was never crossed: SUCCESS iff the
+        # final eval metric meets the target, else ABORTED.
+        if not self.enabled or self.run_stopped:
+            return
+        c = self.logger.constants
+        success = self._meets_target(self._target_metric(final_metrics))
+        self.run_stop(c.SUCCESS if success else c.ABORTED)
+
+
+def mlperf_checkpoint_present(ckpt_path: str) -> bool:
+    """True iff ``ckpt_path`` resolves to an existing checkpoint (i.e. a resume).
+
+    A dependency-light mirror of ``checkpoint._resolve_latest_subdir`` so
+    ``train_ranker`` can decide cold-start vs resume BEFORE the heavy checkpoint
+    import + ``setup()``. This gates the one-time INIT_START/RUN_START markers:
+    emit them on a genuine cold start only, and never re-emit on a resume
+    relaunch (the MLPerf run spans the resume). Matches the loader's resolution:
+    empty path or a base dir with no numeric subdirs => cold start.
+    """
+    if not ckpt_path:
+        return False
+    base = ckpt_path.rstrip("/")
+    # A leaf save (numeric basename) is a resume iff that dir actually exists.
+    if os.path.basename(base).isdigit():
+        return os.path.isdir(base)
+    if not os.path.isdir(base):
+        return False
+    for name in os.listdir(base):
+        if name.isdigit() and os.path.isdir(os.path.join(base, name)):
+            return True
+    return False
+
+
+@gin.configurable
+def get_mlperf_logger(
+    rank: int = 0,
+    log_path: str = "",
+    benchmark_name: str = "hstu",
+    submitter_name: str = "AMD",
+    submission_platform: str = "MI355X",
+    fresh: bool = True,
+) -> Optional[MLPerfLogger]:
+    """Build a configured :class:`MLPerfLogger`, or ``None`` if unavailable.
+
+    Path defaults to ``$MLPERF_LOG_PATH``. Returns ``None`` (not a disabled
+    logger) so callers' ``is not None`` guards cleanly skip logging.
+
+    Disable knob: set ``$MLPERF_LOGGING=0`` (or false/no/off) to turn the whole
+    MLPerf event stream off — returns ``None`` on EVERY rank, so the train loop's
+    ``is not None`` guards skip emission AND the cross-rank train-loss all-reduce
+    in lockstep. Default (unset / "1") = enabled, preserving prior behavior.
+    """
+    if not _MLLOG_AVAILABLE:
+        return None
+    if os.environ.get("MLPERF_LOGGING", "1").strip().lower() in (
+        "0", "false", "no", "off",
+    ):
+        logger.info("MLPerf logging disabled via $MLPERF_LOGGING=0")
+        return None
+    resolved_path = os.environ.get("MLPERF_LOG_PATH", log_path)
+    # SUBMISSION_PLATFORM defaults to "MI355X"; override per-submitter via env.
+    resolved_platform = os.environ.get(
+        "MLPERF_SUBMISSION_PLATFORM", submission_platform
+    )
+    return MLPerfLogger(
+        rank=rank,
+        log_path=resolved_path,
+        benchmark_name=benchmark_name,
+        submitter_name=submitter_name,
+        submission_platform=resolved_platform,
+        fresh=fresh,
+    )
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/checkpoint_cadence_test.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/checkpoint_cadence_test.py
new file mode 100644
index 000000000..4a483c262
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/checkpoint_cadence_test.py
@@ -0,0 +1,131 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+"""Unit tests for `select_in_window_checkpoint_reason` — the pure decision that
+drives the streaming loop's three fine-grained checkpoint cadences:
+
+  * `in_window_checkpoint_frequency` — per-window-local batch count
+  * `checkpoint_step_frequency`      — monotonic global step ("every 1000 steps")
+  * `checkpoint_time_interval_s`     — wall-clock ("hourly")
+
+These run without a GPU / distributed init: the loop broadcasts a single
+`elapsed_since_last_save` from rank 0 and then calls this pure function, so
+exercising the function directly fully covers the trigger semantics.
+"""
+import unittest
+
+from generative_recommenders.dlrm_v3.train.utils import (
+    select_in_window_checkpoint_reason,
+)
+
+
+def _reason(
+    *,
+    batch: int = 1,
+    step: int = 1,
+    elapsed: float = 0.0,
+    in_window: int = 0,
+    step_freq: int = 0,
+    time_s: float = 0.0,
+) -> str | None:
+    return select_in_window_checkpoint_reason(
+        train_batch_idx=batch,
+        global_step=step,
+        elapsed_since_last_save=elapsed,
+        in_window_checkpoint_frequency=in_window,
+        checkpoint_step_frequency=step_freq,
+        checkpoint_time_interval_s=time_s,
+    )
+
+
+class CheckpointCadenceTest(unittest.TestCase):
+    def test_all_disabled_never_fires(self) -> None:
+        for batch in (1, 100, 1000):
+            for step in (1, 1000, 5000):
+                self.assertIsNone(_reason(batch=batch, step=step, elapsed=1e9))
+
+    def test_step_based_every_1000(self) -> None:
+        # Fires exactly on multiples of the step frequency.
+        self.assertEqual(_reason(step=1000, step_freq=1000), "global_step")
+        self.assertEqual(_reason(step=2000, step_freq=1000), "global_step")
+        # Does not fire just off a boundary.
+        self.assertIsNone(_reason(step=999, step_freq=1000))
+        self.assertIsNone(_reason(step=1001, step_freq=1000))
+
+    def test_step_zero_does_not_trigger(self) -> None:
+        # global_step==0 must not trivially satisfy `0 % N == 0`.
+        self.assertIsNone(_reason(step=0, step_freq=1000))
+
+    def test_time_based_interval(self) -> None:
+        # At/over the interval -> fires; under -> no save.
+        self.assertEqual(
+            _reason(step=3, elapsed=3600.0, time_s=3600.0), "time_interval"
+        )
+        self.assertEqual(
+            _reason(step=3, elapsed=4000.0, time_s=3600.0), "time_interval"
+        )
+        self.assertIsNone(_reason(step=3, elapsed=3599.9, time_s=3600.0))
+
+    def test_in_window_batch_cadence(self) -> None:
+        self.assertEqual(_reason(batch=5, in_window=5), "in_window_batch")
+        self.assertEqual(_reason(batch=10, in_window=5), "in_window_batch")
+        self.assertIsNone(_reason(batch=4, in_window=5))
+
+    def test_precedence_in_window_over_step_over_time(self) -> None:
+        # All three would fire this batch; precedence picks in_window first.
+        self.assertEqual(
+            _reason(
+                batch=5,
+                step=1000,
+                elapsed=9999.0,
+                in_window=5,
+                step_freq=1000,
+                time_s=3600.0,
+            ),
+            "in_window_batch",
+        )
+        # in_window not due this batch -> step wins over time.
+        self.assertEqual(
+            _reason(
+                batch=4,
+                step=1000,
+                elapsed=9999.0,
+                in_window=5,
+                step_freq=1000,
+                time_s=3600.0,
+            ),
+            "global_step",
+        )
+        # Neither batch nor step due -> time wins.
+        self.assertEqual(
+            _reason(
+                batch=4,
+                step=999,
+                elapsed=9999.0,
+                in_window=5,
+                step_freq=1000,
+                time_s=3600.0,
+            ),
+            "time_interval",
+        )
+
+    def test_step_and_time_combined_independent(self) -> None:
+        # Step frequency enabled, time disabled: only step boundaries fire.
+        self.assertEqual(_reason(step=1000, step_freq=1000, time_s=0.0), "global_step")
+        self.assertIsNone(_reason(step=1000, elapsed=1e9, step_freq=0, time_s=0.0))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py
new file mode 100644
index 000000000..b46da936b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py
@@ -0,0 +1,166 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""End-to-end failure-injection test for streaming resume.
+
+Validates the four resume features end-to-end on the yambda-5b stack:
+  1. Mid-window save (in_window_checkpoint_frequency)
+  2. Within-window exact-once skip (StreamingWindowSampler.set_window skip)
+  3. Auto-detect-latest checkpoint subdir
+  4. keep_last_n retention (default 1)
+
+Test flow (driven by the sibling `streaming_resume_test.sh`):
+  Phase 1 (baseline): Run streaming-train-eval for N=2 train_ts × K batches/window
+    with die_at_step=-1. Capture per-batch window_ne / window_auc into traj_baseline.json.
+  Phase 2 (interrupt): Same config but die_at_step=M (M mid-window-2). Expect
+    process to exit(42) after the in-window checkpoint at step M lands.
+  Phase 3 (resume): Re-launch with same CKPT_PATH (auto-latest picks the
+    in-window save). Continue to the same total step count. Capture
+    traj_resumed.json (which only contains the post-resume steps).
+
+  Correctness is proven by the FUNCTIONAL INVARIANTS checked in the shell
+  driver (resumed at exactly batch_idx_in_window, per-rank RNG restored, atomic
+  save + keep_last_n), NOT by bit-equal trajectory matching. The training stack
+  is nondeterministic across runs (non-deterministic atomic scatter-add in the
+  embedding/attention backward on ROCm): two independent *cold* runs already
+  drift ~7e-4 in window_ne over 20 steps, and early-training chaos amplifies
+  it, so resume-vs-baseline can differ by a few percent even when resume is
+  perfect. The trajectory comparison here is therefore a LOOSE closeness bound
+  (default atol below) that only flags gross divergence — wrong data slice or
+  unrestored model state — while tolerating nondeterministic drift.
+
+This module also provides a CLI entry point used by the shell driver to (a)
+parse a train.log into a step-keyed dict of metrics, and (b) compare two such
+dicts and fail loudly on mismatch.
+"""
+
+import argparse
+import json
+import re
+import sys
+from typing import Dict, Tuple
+
+# Per-step metrics from MetricsLogger.compute_and_log are emitted like:
+#   "train - Step 51 metrics: {'metric/lifetime_ne/listen_plus': tensor(1.0954, ...)
+#     'metric/window_ne/listen_plus': tensor(0.9940, ...),
+#     'metric/window_accuracy/listen_plus': tensor(0.6231, ...) ..."
+_STEP_RE = re.compile(r"train - Step (\d+) metrics:")
+_WNE_RE = re.compile(r"window_ne/listen_plus.*?tensor\(([0-9.]+)")
+_WAUC_RE = re.compile(r"window_auc/listen_plus.*?tensor\(([0-9.]+)")
+_WACC_RE = re.compile(r"window_accuracy/listen_plus.*?tensor\(([0-9.]+)")
+
+
+def parse_trajectory(log_path: str) -> Dict[int, Dict[str, float]]:
+    """Extract a {step: {window_ne, window_auc, window_accuracy}} dict from a
+    train.log. The grep is loose on the metric line itself — we accept the
+    very long truncated form MetricsLogger prints."""
+    out: Dict[int, Dict[str, float]] = {}
+    with open(log_path, "r", errors="replace") as f:
+        for line in f:
+            m = _STEP_RE.search(line)
+            if not m:
+                continue
+            step = int(m.group(1))
+            wne = _WNE_RE.search(line)
+            wauc = _WAUC_RE.search(line)
+            wacc = _WACC_RE.search(line)
+            if not (wne and wauc and wacc):
+                continue
+            # Only keep ONE entry per step — log can have duplicate per-rank
+            # prints; first one wins (they're identical).
+            if step in out:
+                continue
+            out[step] = {
+                "window_ne": float(wne.group(1)),
+                "window_auc": float(wauc.group(1)),
+                "window_accuracy": float(wacc.group(1)),
+            }
+    return out
+
+
+def compare_trajectories(
+    baseline: Dict[int, Dict[str, float]],
+    resumed: Dict[int, Dict[str, float]],
+    min_resume_step: int,
+    atol: float = 0.15,
+) -> Tuple[bool, str]:
+    """Compare baseline vs resumed trajectories for steps >= min_resume_step.
+
+    This is a LOOSE closeness bound, not a bit-equality check — see the module
+    docstring. `atol` defaults to a value that tolerates the nondeterministic
+    cross-run drift of this stack while still catching gross resume bugs.
+    Returns (ok, message). `ok=False` on any divergence outside `atol`."""
+    steps = sorted(s for s in resumed if s >= min_resume_step)
+    if not steps:
+        return False, f"No resumed steps >= {min_resume_step}"
+    mismatches = []
+    for s in steps:
+        if s not in baseline:
+            mismatches.append(f"step {s}: missing from baseline")
+            continue
+        b = baseline[s]
+        r = resumed[s]
+        for key in ("window_ne", "window_auc", "window_accuracy"):
+            if abs(b[key] - r[key]) > atol:
+                mismatches.append(
+                    f"step {s} {key}: baseline={b[key]:.6f} "
+                    f"resumed={r[key]:.6f} diff={b[key]-r[key]:+.6f}"
+                )
+    if mismatches:
+        return False, (
+            f"{len(mismatches)} mismatches across {len(steps)} resumed steps "
+            f"(atol={atol}):\n  " + "\n  ".join(mismatches[:10])
+        )
+    return True, (
+        f"{len(steps)} resumed steps match baseline within atol={atol} "
+        f"(range: step {steps[0]}..{steps[-1]})"
+    )
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    sub = ap.add_subparsers(dest="cmd", required=True)
+
+    p_parse = sub.add_parser("parse", help="Parse a train.log → traj JSON")
+    p_parse.add_argument("log")
+    p_parse.add_argument("out")
+
+    p_cmp = sub.add_parser("compare", help="Compare baseline vs resumed traj JSONs")
+    p_cmp.add_argument("baseline")
+    p_cmp.add_argument("resumed")
+    p_cmp.add_argument("--min-resume-step", type=int, required=True)
+    p_cmp.add_argument("--atol", type=float, default=0.15)
+
+    args = ap.parse_args()
+    if args.cmd == "parse":
+        traj = parse_trajectory(args.log)
+        with open(args.out, "w") as f:
+            json.dump(traj, f, indent=2)
+        print(f"Wrote {len(traj)} step entries to {args.out}", file=sys.stderr)
+        return 0
+    if args.cmd == "compare":
+        with open(args.baseline) as f:
+            baseline = {int(k): v for k, v in json.load(f).items()}
+        with open(args.resumed) as f:
+            resumed = {int(k): v for k, v in json.load(f).items()}
+        ok, msg = compare_trajectories(
+            baseline, resumed, args.min_resume_step, atol=args.atol
+        )
+        print(msg)
+        return 0 if ok else 1
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
new file mode 100755
index 000000000..e14e557e8
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
@@ -0,0 +1,242 @@
+#!/bin/bash
+# End-to-end failure-injection + resume test for streaming-train-eval.
+#
+# Validates exact-once mid-window resume on the yambda-5b stack:
+#   Phase 1 (baseline):     uninterrupted run for N=2 train_ts × K batches/window
+#   Phase 2 (interrupted):  same config but die_at_step=M → exits at step M
+#                           after the in-window checkpoint lands
+#   Phase 3 (resume):       re-launch with same CKPT_PATH → auto-latest picks
+#                           the in-window save → finishes the partial window
+#                           and the rest of the requested train_ts list
+# Assertion: traj_resumed[step].window_ne / window_auc / window_accuracy match
+# traj_baseline bit-equal (np.allclose atol=1e-4) for all step > die_at_step.
+#
+# Driven entirely via env-driven gin knobs defined in yambda_5b.gin:
+#   NUM_TRAIN_TS / NUM_TRAIN_BATCHES / IN_WINDOW_CKPT_FREQ / DIE_AT_STEP /
+#   CKPT_PATH / KEEP_LAST_N / EVAL_EVERY_N_WINDOWS
+#
+# Usage:
+#   bash generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh --jobid <slurm-jobid>
+#       [--container yambda_primus]
+#       [--num-train-batches 200]
+#       [--die-at-step 350]
+#       [--keep]    # retain LOG_DIR + CKPT after run for inspection
+
+set -uo pipefail
+
+JOBID=""
+CONTAINER="yambda_primus"
+NUM_TRAIN_BATCHES=200
+DIE_AT_STEP=350
+IN_WINDOW_FREQ=50
+KEEP=0
+# Trajectory closeness bound — NOT a bit-equality check. The ROCm training stack
+# is nondeterministic across runs (non-deterministic atomic scatter-add in the
+# embedding/attention backward): two independent *cold* runs already drift
+# ~7e-4 in window_ne over 20 steps, and early-training chaos (AUC~0.5) amplifies
+# any seed difference. So resume-vs-baseline can legitimately differ by a few
+# percent. This bound just catches GROSS divergence (wrong data skip, totally
+# unrestored state) while tolerating nondeterministic drift. The HARD resume
+# correctness gates are the functional-invariant checks below (RNG restored,
+# resumed-at-correct-step, atomic/keep_last_n), not this number.
+ATOL=0.15
+CKPT_ROOT=/apps/chcai/ckpts_resume_test
+LOG_DIR=/apps/chcai/streaming_resume_test
+REPO=/home/chcai/training/recommendation_v4
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --jobid) JOBID="$2"; shift 2;;
+        --container) CONTAINER="$2"; shift 2;;
+        --num-train-batches) NUM_TRAIN_BATCHES="$2"; shift 2;;
+        --die-at-step) DIE_AT_STEP="$2"; shift 2;;
+        --in-window-freq) IN_WINDOW_FREQ="$2"; shift 2;;
+        --atol) ATOL="$2"; shift 2;;
+        --keep) KEEP=1; shift;;
+        *) echo "Unknown arg: $1"; exit 1;;
+    esac
+done
+[[ -z "$JOBID" ]] && { echo "Error: --jobid required"; exit 1; }
+
+mkdir -p "$LOG_DIR"
+
+# Single-window mid-window resume: NUM_TRAIN_TS=1, so the whole test runs inside
+# train_ts=START_TS. die_at_step must land strictly inside that window, AT a
+# multiple of IN_WINDOW_FREQ so an in-window checkpoint is saved right before
+# the crash (resume then skips exactly DIE_AT_STEP already-trained batches).
+if (( DIE_AT_STEP <= 0 || DIE_AT_STEP >= NUM_TRAIN_BATCHES )); then
+    echo "Warning: die_at_step=$DIE_AT_STEP not strictly inside window (0, $NUM_TRAIN_BATCHES)" >&2
+fi
+if (( DIE_AT_STEP % IN_WINDOW_FREQ != 0 )); then
+    echo "Warning: die_at_step=$DIE_AT_STEP not a multiple of in_window_freq=$IN_WINDOW_FREQ; no save lands exactly at crash" >&2
+fi
+
+cleanup_workers() {
+    srun --jobid="$JOBID" --overlap docker exec "$CONTAINER" bash -lc \
+        "pkill -9 -f generative_recommenders 2>/dev/null; sleep 2; \
+         pkill -9 -f spawn_main 2>/dev/null; sleep 3; true" 2>/dev/null || true
+}
+clean_ckpt() {
+    srun --jobid="$JOBID" --overlap docker exec "$CONTAINER" rm -rf "$CKPT_ROOT" 2>/dev/null || true
+}
+
+# Wait for a log line to appear OR a crash sentinel. Returns 0 if target found,
+# 1 if crash sentinel found first.
+wait_for_log() {
+    local log="$1"; local target_re="$2"; local timeout_s="${3:-1500}"
+    local elapsed=0
+    while (( elapsed < timeout_s )); do
+        if grep -qE "$target_re" "$log" 2>/dev/null; then
+            return 0
+        fi
+        if grep -qE "Traceback|RuntimeError|OutOfMemoryError" "$log" 2>/dev/null; then
+            return 1
+        fi
+        sleep 5
+        elapsed=$((elapsed + 5))
+    done
+    return 2
+}
+
+# Single train window of NUM_TRAIN_BATCHES steps → last train step == NUM_TRAIN_BATCHES.
+LAST_STEP=$NUM_TRAIN_BATCHES
+
+run_phase() {
+    local name="$1"; shift
+    local log="$LOG_DIR/${name}.log"
+    # Join the per-phase env overrides into ONE word. Using `$*` (not `$@`) is
+    # essential: `$@` embedded mid-string in the double-quoted `bash -lc "..."`
+    # expands to *multiple* arguments, so bash -lc would only run up to the
+    # first override and treat the rest as positional params — launch_smoke
+    # would never execute (silent 0-byte log).
+    local env_overrides="$*"
+    : > "$log"
+    echo "[$(date)] === phase '$name' ==="
+    cleanup_workers
+    srun --jobid="$JOBID" --overlap docker exec -d "$CONTAINER" bash -lc "
+        cd $REPO &&
+        HSTU_HAMMER_KERNEL=TRITON \
+        $env_overrides \
+        RUN_NAME=resume_test_$name \
+        LOG=$log \
+        bash scripts/launch_smoke_8gpu.sh
+    "
+}
+
+# === Phase 1: baseline ===
+clean_ckpt
+run_phase baseline \
+    "NUM_TRAIN_TS=1" \
+    "EVAL_EVERY_N_WINDOWS=0" \
+    "METRIC_LOG_FREQ=1" \
+    "NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES" \
+    "DIE_AT_STEP=-1"
+wait_for_log "$LOG_DIR/baseline.log" "train - Step $LAST_STEP metrics" 1500
+rc=$?
+cleanup_workers
+[[ $rc -ne 0 ]] && { echo "FAIL: baseline didn't finish"; tail -20 "$LOG_DIR/baseline.log"; exit 1; }
+
+# === Phase 2: interrupted ===
+clean_ckpt
+run_phase interrupt \
+    "NUM_TRAIN_TS=1" \
+    "EVAL_EVERY_N_WINDOWS=0" \
+    "METRIC_LOG_FREQ=1" \
+    "NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES" \
+    "IN_WINDOW_CKPT_FREQ=$IN_WINDOW_FREQ" \
+    "KEEP_LAST_N=1" \
+    "DIE_AT_STEP=$DIE_AT_STEP" \
+    "CKPT_PATH=$CKPT_ROOT"
+wait_for_log "$LOG_DIR/interrupt.log" "die_at_step=$DIE_AT_STEP hit" 1500
+rc=$?
+cleanup_workers
+[[ $rc -ne 0 ]] && { echo "FAIL: interrupt didn't hit die_at_step"; tail -20 "$LOG_DIR/interrupt.log"; exit 1; }
+
+SAVED=$(srun --jobid="$JOBID" --overlap docker exec "$CONTAINER" ls "$CKPT_ROOT" 2>/dev/null | tr '\n' ' ')
+echo "Saved checkpoints after interrupt: $SAVED"
+
+# === Phase 3: resume ===
+run_phase resume \
+    "NUM_TRAIN_TS=1" \
+    "EVAL_EVERY_N_WINDOWS=0" \
+    "METRIC_LOG_FREQ=1" \
+    "NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES" \
+    "IN_WINDOW_CKPT_FREQ=$IN_WINDOW_FREQ" \
+    "KEEP_LAST_N=1" \
+    "DIE_AT_STEP=-1" \
+    "CKPT_PATH=$CKPT_ROOT"
+wait_for_log "$LOG_DIR/resume.log" "train - Step $LAST_STEP metrics" 1500
+rc=$?
+[[ $rc -ne 0 ]] && { cleanup_workers; echo "FAIL: resume didn't finish"; tail -20 "$LOG_DIR/resume.log"; exit 1; }
+# The resume run performs an end-of-window checkpoint save AFTER the final
+# step's metric line. That save (hundreds of GB) writes <ts>.tmp and then
+# atomically renames it onto <ts>, logging "checkpoint successfully saved" only
+# once the rename completes. If we kill workers right after the step line we'd
+# orphan a half-written <ts>.tmp and trip the stale-dir gate below — a harness
+# race, not a resume bug. Wait for the save to finish before tearing down.
+wait_for_log "$LOG_DIR/resume.log" "checkpoint successfully saved" 1500
+save_rc=$?
+cleanup_workers
+[[ $save_rc -ne 0 ]] && { echo "FAIL: resume end-of-window checkpoint save did not complete"; tail -20 "$LOG_DIR/resume.log"; exit 1; }
+
+# === HARD resume-correctness gates (functional invariants) ===
+# These — not the trajectory closeness check below — are the authoritative
+# proof the resume path is correct, because they're deterministic and immune
+# to the GPU nondeterminism that perturbs the metric trajectory.
+
+# (1) Re-entered the partial window at exactly the saved batch_idx_in_window.
+if ! grep -qE "Resuming mid-window at train_ts=[0-9]+ batch_idx_in_window=$DIE_AT_STEP\b" "$LOG_DIR/resume.log" 2>/dev/null; then
+    echo "FAIL: resume did not re-enter mid-window at batch_idx_in_window=$DIE_AT_STEP"
+    grep -E "Resuming" "$LOG_DIR/resume.log" 2>/dev/null | head -2
+    exit 1
+fi
+# (2) Per-rank RNG state was actually restored (dropout determinism path).
+RNG_RESTORED=$(grep -c "RNG state restored from" "$LOG_DIR/resume.log" 2>/dev/null || echo 0)
+echo "RNG state restored on $RNG_RESTORED ranks"
+[[ "$RNG_RESTORED" -lt 1 ]] && { echo "FAIL: no RNG state restored on resume"; exit 1; }
+# (3) The FIRST training step after resume is exactly die_at_step+1, i.e. the
+#     skip-already-trained-batches logic emitted the next unseen batch (not a
+#     restart from step 1, and not a gap).
+FIRST_RESUMED=$(grep -oE 'train - Step [0-9]+ metrics: \{.metric' "$LOG_DIR/resume.log" 2>/dev/null \
+    | grep -oE 'Step [0-9]+' | awk '{print $2}' | sort -n | head -1)
+echo "First resumed train step: $FIRST_RESUMED (expect $((DIE_AT_STEP + 1)))"
+[[ "$FIRST_RESUMED" != "$((DIE_AT_STEP + 1))" ]] && {
+    echo "FAIL: resume did not continue at step $((DIE_AT_STEP + 1)) (got $FIRST_RESUMED)"; exit 1; }
+
+# === Final on-disk state checks (atomic save + retention) ===
+NUM_CKPT=$(srun --jobid="$JOBID" --overlap docker exec "$CONTAINER" bash -lc \
+    "ls $CKPT_ROOT 2>/dev/null | grep -E '^[0-9]+$' | wc -l" | tr -d ' ')
+# Both .tmp (interrupted write) and .old (interrupted atomic-overwrite swap)
+# must be absent — their presence means a save crashed without clean recovery.
+STALE_CKPT=$(srun --jobid="$JOBID" --overlap docker exec "$CONTAINER" bash -lc \
+    "ls $CKPT_ROOT 2>/dev/null | grep -E '\\.(tmp|old)$' | wc -l" | tr -d ' ')
+echo "Final: $NUM_CKPT numeric ckpt subdirs, $STALE_CKPT stale (.tmp/.old) dirs (expect 1, 0)"
+[[ "$NUM_CKPT" != "1" ]] && { echo "FAIL: keep_last_n=1 violated"; exit 1; }
+[[ "$STALE_CKPT" != "0" ]] && { echo "FAIL: stale .tmp/.old dirs left behind"; exit 1; }
+echo "=== Resume functional invariants: ALL PASS ==="
+
+# === Trajectory closeness (sanity bound, NOT bit-equality) ===
+# Catches gross resume bugs (wrong data slice, unrestored model) that throw the
+# metric trajectory far off. Small drift is expected & tolerated (see ATOL note
+# at top). The functional invariants above are the real correctness proof.
+python3 $REPO/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py parse \
+    "$LOG_DIR/baseline.log" "$LOG_DIR/traj_baseline.json"
+python3 $REPO/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py parse \
+    "$LOG_DIR/resume.log" "$LOG_DIR/traj_resumed.json"
+
+python3 $REPO/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py compare \
+    "$LOG_DIR/traj_baseline.json" "$LOG_DIR/traj_resumed.json" \
+    --min-resume-step $((DIE_AT_STEP + 1)) --atol $ATOL
+RC=$?
+
+if [[ "$KEEP" != "1" ]]; then
+    rm -rf "$LOG_DIR"
+    clean_ckpt
+fi
+
+if [[ $RC -eq 0 ]]; then
+    echo "=== PASS: resume validated (functional invariants + trajectory within $ATOL of baseline) ==="
+else
+    echo "=== FAIL: trajectory diverged beyond $ATOL — likely a real resume bug (wrong data slice / unrestored state), not nondeterminism ==="
+fi
+exit $RC
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/train_test.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/train_test.py
new file mode 100644
index 000000000..dfd58f6e5
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/train_test.py
@@ -0,0 +1,29 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+import unittest
+
+from generative_recommenders.common import gpu_unavailable
+from generative_recommenders.dlrm_v3.train.train_ranker import main
+
+
+class DLRMV3TrainTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    def test_e2e(self) -> None:
+        main()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
new file mode 100644
index 000000000..d5797697b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
@@ -0,0 +1,369 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+import argparse
+import logging
+
+logging.basicConfig(level=logging.INFO)
+import os
+import sys
+import traceback
+
+import gin
+import torch
+from torch import multiprocessing as mp
+from torchrec.test_utils import get_free_port
+
+# NOTE: heavy imports of generative_recommenders.dlrm_v3.* are deferred to
+# inside _main_func so that gin-driven env-var bootstrap (see
+# _env_bootstrap.apply_env_bootstrap) can run BEFORE the triton kernel
+# modules evaluate their `@triton.autotune` decorators at module-load time.
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+SUPPORTED_CONFIGS = {
+    "debug": "debug.gin",
+    "kuairand-1k": "kuairand_1k.gin",
+    "movielens-1m": "movielens_1m.gin",
+    "movielens-20m": "movielens_20m.gin",
+    "movielens-13b": "movielens_13b.gin",
+    "movielens-18b": "movielens_18b.gin",
+    "streaming-400m": "streaming_400m.gin",
+    "streaming-200b": "streaming_200b.gin",
+    "streaming-100b": "streaming_100b.gin",
+    "yambda-5b": "yambda_5b.gin",
+}
+
+
+def _main_func(
+    local_rank: int,
+    world_size: int,
+    node_rank: int,
+    gpus_per_node: int,
+    master_addr: str,
+    master_port: int,
+    gin_file: str,
+    mode: str,
+) -> None:
+    # `local_rank` is the index handed out by mp.start_processes (0..gpus_per_node-1)
+    # and indexes this node's GPUs. The GLOBAL rank is what every downstream
+    # consumer wants (data sharding via StreamingWindowSampler, checkpoint I/O,
+    # metrics), so derive it once and pass it through as `rank`. Only the CUDA
+    # device must be node-local. Single-node (node_rank=0) → rank == local_rank,
+    # exactly as before.
+    rank = node_rank * gpus_per_node + local_rank
+    device = torch.device(f"cuda:{local_rank}")
+    logger.info(
+        f"rank: {rank} (node_rank={node_rank} local_rank={local_rank}), "
+        f"world_size: {world_size}, device: {device}"
+    )
+    # Phase 1: parse gin early with skip_unknown=True so env-bootstrap
+    # bindings take effect BEFORE any module-level @gin.configurable
+    # discovers itself. This is required because triton @triton.autotune
+    # decorators in generative_recommenders.ops.triton.* read env vars at
+    # module import time, and the heavy imports below pull those in.
+    from generative_recommenders.dlrm_v3.train._env_bootstrap import apply_env_bootstrap
+    from generative_recommenders.dlrm_v3.train.mlperf_logging_utils import (
+        get_mlperf_logger,
+        mlperf_checkpoint_present,
+    )
+
+    gin.parse_config_file(gin_file, skip_unknown=True)
+    apply_env_bootstrap()
+
+    # Cold-start vs resume, decided from the on-disk checkpoint BEFORE setup so
+    # the one-time INIT/RUN markers fire on a genuine cold start only and are NOT
+    # re-emitted on a resume relaunch — the MLPerf run (run_start..run_stop) spans
+    # the resume as a single coherent event stream in one appended log file.
+    mlperf_resume = mlperf_checkpoint_present(os.environ.get("CKPT_PATH", ""))
+    # Rank-0-gated MLPerf logger, only for the streaming-train-eval path. `fresh`
+    # truncates the log on cold start (one run_start per file) but appends on a
+    # resume so the pre-crash events are preserved and continued.
+    mlperf_logger = (
+        get_mlperf_logger(rank=rank, fresh=not mlperf_resume)
+        if mode == "streaming-train-eval"
+        else None
+    )
+    # INIT_START fires before setup on a cold start only (resume continues the
+    # already-open run, whose markers were emitted by the original process).
+    mlperf_cold_start = mlperf_logger is not None and not mlperf_resume
+    if mlperf_cold_start:
+        mlperf_logger.event(key=mlperf_logger.constants.CACHE_CLEAR, value=True)
+        mlperf_logger.start(key=mlperf_logger.constants.INIT_START)
+
+    # Phase 2: heavy imports. Triton kernel modules evaluate their autotune
+    # decorators here, using the env vars set above.
+    from generative_recommenders.dlrm_v3.checkpoint import load_dmp_checkpoint
+    from generative_recommenders.dlrm_v3.train.utils import (
+        cleanup,
+        decorrelate_runtime_rng,
+        eval_loop,
+        make_model,
+        make_optimizer_and_shard,
+        make_train_test_dataloaders,
+        seed_everything,
+        setup,
+        streaming_train_eval_loop,
+        train_eval_loop,
+        train_loop,
+    )
+    from generative_recommenders.dlrm_v3.utils import (
+        MetricsLogger,
+        get_gpu_peak_flops,
+    )
+
+    setup(
+        rank=rank,
+        world_size=world_size,
+        master_addr=master_addr,
+        master_port=master_port,
+        device=device,
+    )
+    # Phase 3: re-parse to bind the @gin.configurables now that they are
+    # registered. The earlier skip_unknown pass already consumed the
+    # env-bootstrap binding, but bindings are idempotent so re-applying is
+    # fine, and this pass is the one that actually wires up make_model,
+    # make_train_test_dataloaders, etc.
+    gin.parse_config_file(gin_file)
+
+    # Seed all RNGs (gin-configurable $SEED) BEFORE make_model() so weight init
+    # is reproducible run-to-run. Must follow the full parse above so the binding
+    # is wired, and precede make_model() below.
+    seed_everything(rank=rank)
+
+    model, model_configs, embedding_table_configs = make_model()
+    model, optimizer = make_optimizer_and_shard(
+        model=model,
+        device=device,
+        world_size=world_size,
+        local_world_size=gpus_per_node,
+    )
+    # Decorrelate forward-time stochasticity (HSTU dropout) per data-parallel
+    # rank. MUST run after make_model() + make_optimizer_and_shard() so the
+    # replicated dense weights and sharded embeddings stay init-identical across
+    # ranks; this only offsets the global RNG by rank so dropout masks differ.
+    decorrelate_runtime_rng(rank=rank)
+    train_dataloader, test_dataloader = make_train_test_dataloaders(
+        hstu_config=model_configs,
+        embedding_table_configs=embedding_table_configs,
+    )
+    # TFLOPS/MFU reporting: query the model's static dense estimate +
+    # current GPU's peak FLOPS. Both default to 0 if the model doesn't
+    # expose get_num_flops_per_sample, in which case MetricsLogger silently
+    # drops the tflops fields from the perf line.
+    inner_model = model.module if hasattr(model, "module") else model
+    num_flops_per_sample = (
+        float(inner_model.get_num_flops_per_sample())
+        if hasattr(inner_model, "get_num_flops_per_sample")
+        else 0.0
+    )
+    gpu_peak_flops = get_gpu_peak_flops(
+        "bf16" if getattr(model_configs, "bf16_training", True) else "fp32"
+    )
+    # Streaming fixed-holdout eval uses the dual fresh/cumulative metric sets:
+    # window_* = fresh per-pass full-holdout, lifetime_* = cumulative across
+    # passes (AUC via O(bins) histogram). Other modes keep the legacy single set.
+    metrics = MetricsLogger(
+        multitask_configs=model_configs.multitask_configs,
+        batch_size=train_dataloader.batch_size,
+        window_size=2500,
+        device=device,
+        rank=rank,
+        # Pass the live world_size so metric normalization is correct at any
+        # node count; the gin's MetricsLogger.world_size default (=8) is only a
+        # single-node fallback and would mis-normalize a multi-node run.
+        world_size=world_size,
+        num_flops_per_sample=num_flops_per_sample,
+        gpu_peak_flops=gpu_peak_flops,
+        model=model,
+        eval_cumulative=(mode == "streaming-train-eval"),
+        # Lifetime-AUC backend + bins/window come from gin (see yambda_5b.gin:
+        # MetricsLogger.{train,eval}_lifetime_auc_mode / cumulative_auc_bins /
+        # lifetime_auc_window), env-overridable. eval_cumulative stays explicit
+        # because it is runtime-mode dependent, not a config knob.
+    )
+    # Capture streaming resume hint (None for cold start / non-streaming
+    # checkpoints). For the streaming-train-eval mode, we forward this into
+    # streaming_train_eval_loop so it can advance past the last completed
+    # window OR re-enter the partial window and skip already-trained batches.
+    resume_train_ts, resume_batch_idx_in_window, resume_split_contract, resume_cold_start = (
+        load_dmp_checkpoint(
+            model=model,
+            optimizer=optimizer,
+            metric_logger=metrics,
+            device=device,
+            rank=rank,
+        )
+    )
+
+    # MLPerf run markers: open the run exactly once. On a cold start emit
+    # submission info + hyperparameters + INIT_STOP/RUN_START and mark the run as
+    # started (persisted in the checkpoint via metrics.mlperf_run_started). On a
+    # resume, load_dmp_checkpoint restored mlperf_run_started=True, so we skip the
+    # markers and just continue the stream. `metrics.mlperf_run_started` guards a
+    # double-emit even if cold/resume detection and the checkpoint ever disagree.
+    if mlperf_cold_start and not metrics.mlperf_run_started:
+        # Submission info + hyperparameters + INIT_STOP/RUN_START, all emitted by
+        # the logger (optimizer names/LRs read from gin internally). Seed is the
+        # value setup() resolved and exported to $SEED.
+        mlperf_logger.log_run_start(
+            global_batch_size=world_size * int(train_dataloader.batch_size),
+            seed=int(os.environ.get("SEED", "1")),
+        )
+        metrics.mlperf_run_started = True
+    # Pass the logger to the loop whenever MLPerf logging is enabled, so block /
+    # eval / train_loss / run_stop events emit on BOTH a cold start and a resume.
+    mlperf_run_active = mlperf_logger is not None and metrics.mlperf_run_started
+
+    # train loop
+    try:
+        if mode == "train":
+            train_loop(
+                rank=rank,
+                model=model,
+                dataloader=train_dataloader,
+                optimizer=optimizer,
+                metric_logger=metrics,
+                device=device,
+            )
+        elif mode == "eval":
+            # reinit metrics logger for eval
+            metrics = MetricsLogger(
+                multitask_configs=model_configs.multitask_configs,
+                batch_size=train_dataloader.batch_size,
+                window_size=1000,
+                device=device,
+                rank=rank,
+                world_size=world_size,
+            )
+            eval_loop(
+                rank=rank,
+                model=model,
+                dataloader=test_dataloader,
+                metric_logger=metrics,
+                device=device,
+            )
+        elif mode == "train-eval":
+            train_eval_loop(
+                rank=rank,
+                model=model,
+                train_dataloader=train_dataloader,
+                eval_dataloader=test_dataloader,
+                optimizer=optimizer,
+                metric_logger=metrics,
+                device=device,
+            )
+        elif mode == "streaming-train-eval":
+            streaming_train_eval_loop(
+                rank=rank,
+                model=model,
+                optimizer=optimizer,
+                metric_logger=metrics,
+                device=device,
+                hstu_config=model_configs,
+                embedding_table_configs=embedding_table_configs,
+                resume_train_ts=resume_train_ts,
+                resume_batch_idx_in_window=resume_batch_idx_in_window,
+                resume_split_contract=resume_split_contract,
+                resume_cold_start=resume_cold_start,
+                # Only pass the logger when run boundaries were emitted, so the
+                # loop never produces orphan block/eval events.
+                mlperf_logger=mlperf_logger if mlperf_run_active else None,
+            )
+    except Exception as e:
+        logger.info(traceback.format_exc())
+        raise Exception(e)
+    finally:
+        # Graceful distributed teardown on both success and failure: barrier so
+        # all ranks finish in lockstep, then destroy the process group (best-
+        # effort) to avoid noisy TCPStore/NCCL shutdown warnings at exit.
+        if torch.distributed.is_initialized():
+            try:
+                torch.distributed.barrier()
+            except Exception:
+                logger.info("teardown barrier failed (non-fatal)")
+            try:
+                cleanup()
+            except Exception:
+                logger.info("teardown destroy_process_group failed (non-fatal)")
+
+
+def get_args():  # pyre-ignore [3]
+    """Parse commandline."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--dataset", default="debug", choices=SUPPORTED_CONFIGS.keys(), help="dataset"
+    )
+    parser.add_argument(
+        "--mode",
+        default="train",
+        choices=["train", "eval", "train-eval", "streaming-train-eval"],
+        help="mode",
+    )
+    args, unknown_args = parser.parse_known_args()
+    logger.warning(f"unknown_args: {unknown_args}")
+    return args
+
+
+def main() -> None:
+    args = get_args()
+    logger.info(args)
+    assert args.dataset in SUPPORTED_CONFIGS, f"Unsupported dataset: {args.dataset}"
+    assert args.mode in [
+        "train",
+        "eval",
+        "train-eval",
+        "streaming-train-eval",
+    ], f"Unsupported mode: {args.mode}"
+    # Distributed topology (single-node defaults reproduce the legacy behavior):
+    #   GPUS_PER_NODE  local procs to spawn on THIS node (default: all visible GPUs)
+    #   NNODES/NODE_RANK  multi-node fan-out, set by the SLURM launcher
+    #   WORLD_SIZE     global rank count = NNODES * GPUS_PER_NODE
+    #   MASTER_ADDR/PORT  rank-0 rendezvous; the port MUST match across nodes, so
+    #                     honor it from the env when set and only fall back to a
+    #                     random free port for the standalone single-node path.
+    GPUS_PER_NODE = int(os.environ.get("GPUS_PER_NODE", 0)) or torch.cuda.device_count()
+    NNODES = int(os.environ.get("NNODES", 1))
+    NODE_RANK = int(os.environ.get("NODE_RANK", 0))
+    WORLD_SIZE = NNODES * GPUS_PER_NODE
+    MASTER_ADDR = os.environ.get("MASTER_ADDR", "localhost")
+    MASTER_PORT = str(os.environ.get("MASTER_PORT") or get_free_port())
+    gin_path = f"{os.path.dirname(__file__)}/gin/{SUPPORTED_CONFIGS[args.dataset]}"
+    logger.info(
+        f"launching: nnodes={NNODES} node_rank={NODE_RANK} "
+        f"gpus_per_node={GPUS_PER_NODE} world_size={WORLD_SIZE} "
+        f"master={MASTER_ADDR}:{MASTER_PORT}"
+    )
+
+    mp.start_processes(
+        _main_func,
+        args=(
+            WORLD_SIZE,
+            NODE_RANK,
+            GPUS_PER_NODE,
+            MASTER_ADDR,
+            MASTER_PORT,
+            gin_path,
+            args.mode,
+        ),
+        nprocs=GPUS_PER_NODE,
+        join=True,
+        start_method="spawn",
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
new file mode 100644
index 000000000..526a6f89d
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -0,0 +1,2557 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+import logging
+import os
+import threading
+import time
+from collections.abc import Iterator
+from datetime import timedelta
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Set,
+    Tuple,
+    Type,
+    Union,
+)
+
+import gin
+import torch
+import torchrec
+from generative_recommenders.dlrm_v3.checkpoint import save_dmp_checkpoint, WINDOW_COMPLETE
+from generative_recommenders.dlrm_v3.configs import (
+    get_embedding_table_config,
+    get_hstu_configs,
+)
+from generative_recommenders.dlrm_v3.datasets.dataset import (
+    collate_fn,
+    Dataset,
+    Samples,
+)
+from generative_recommenders.dlrm_v3.utils import get_dataset, MetricsLogger, Profiler
+from generative_recommenders.common import HammerKernel
+from generative_recommenders.modules.dlrm_hstu import DlrmHSTU, DlrmHSTUConfig
+from torch import distributed as dist
+from torch.distributed.optim import (
+    _apply_optimizer_in_backward as apply_optimizer_in_backward,
+)
+from torch.optim.optimizer import Optimizer
+from torch.utils.data import DataLoader, Dataset as TorchDataset
+from torch.utils.data.distributed import _T_co, DistributedSampler
+from torchrec.distributed.model_parallel import DistributedModelParallel
+from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology
+from torchrec.distributed.sharding_plan import get_default_sharders
+from torchrec.distributed.types import ShardedTensor, ShardingEnv
+from torchrec.modules.embedding_configs import EmbeddingConfig
+from torchrec.modules.embedding_modules import (
+    EmbeddingBagCollection,
+    EmbeddingCollection,
+)
+from torchrec.optim.keyed import CombinedOptimizer, KeyedOptimizerWrapper
+from torchrec.optim.optimizers import in_backward_optimizer_filter
+from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+TORCHREC_TYPES: Set[Type[Union[EmbeddingBagCollection, EmbeddingCollection]]] = {
+    EmbeddingBagCollection,
+    EmbeddingCollection,
+}
+
+
+@gin.configurable
+def seed_everything(seed: int = -1, rank: int = 0) -> None:
+    """Seed all RNGs (same value on every rank) for reproducible dense weight init.
+
+    Call right before make_model(), after setup() (process group needed for the
+    broadcast) and the gin parse. seed < 0 ($SEED unset) draws a fresh random seed
+    per run (rank 0 broadcasts; exported to $SEED); seed >= 0 reproduces a run.
+    Data order/split are independent of this seed (StreamingWindowSampler/$SPLIT_SALT).
+    """
+    import random
+
+    import numpy as np
+
+    pinned = seed >= 0
+    if not pinned:
+        # rank 0 draws a random seed and broadcasts it so all ranks agree (an
+        # identical seed on every rank is REQUIRED, else dense weight init
+        # diverges across ranks and DDP/AllReduce trains garbage).
+        if dist.is_available() and dist.is_initialized():
+            device = torch.device(f"cuda:{torch.cuda.current_device()}")
+            drawn = int.from_bytes(os.urandom(4), "little") if rank == 0 else 0
+            _seed_t = torch.tensor([drawn], dtype=torch.int64, device=device)
+            dist.broadcast(_seed_t, src=0)
+            seed = int(_seed_t.item())
+        else:
+            seed = int.from_bytes(os.urandom(4), "little")
+    # Export the resolved value so the run is reproducible after the fact.
+    os.environ["SEED"] = str(seed)
+
+    logger.info(
+        f"[rank {rank}] seeding all RNGs with SEED={seed} "
+        f"({'pinned via $SEED' if pinned else 'random per-run; set $SEED to reproduce'})"
+    )
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+
+
+@gin.configurable
+def decorrelate_runtime_rng(rank: int = 0, enabled: bool = True) -> None:
+    """Re-seed torch/cuda with $SEED + rank so HSTU dropout draws different masks
+    per data-parallel rank (seed_everything's identical seed would draw the same).
+
+    MUST run after make_model() + make_optimizer_and_shard() so init stays
+    identical across ranks; it perturbs only forward-time stochasticity.
+    Reproducible (pure fn of $SEED + rank; RNG state checkpointed). enabled=False
+    keeps the legacy identical-mask behavior.
+    """
+    if not enabled:
+        logger.info(
+            f"[rank {rank}] decorrelate_runtime_rng disabled; dropout masks "
+            f"identical across ranks"
+        )
+        return
+    base = int(os.environ.get("SEED", "1"))
+    offset_seed = base + int(rank)
+    torch.manual_seed(offset_seed)
+    torch.cuda.manual_seed_all(offset_seed)
+    logger.info(
+        f"[rank {rank}] decorrelated runtime RNG: SEED={base} + rank={rank} "
+        f"=> {offset_seed} (per-rank dropout masks)"
+    )
+
+
+def setup(
+    rank: int,
+    world_size: int,
+    master_port: int,
+    device: torch.device,
+    master_addr: str = "localhost",
+) -> dist.ProcessGroup:
+    # Default "localhost" keeps the single-node path unchanged; multi-node
+    # launches pass the rank-0 host so every node rendezvouses at the same addr.
+    os.environ["MASTER_ADDR"] = master_addr
+    os.environ["MASTER_PORT"] = str(master_port)
+
+    BACKEND = dist.Backend.NCCL
+    TIMEOUT = 1800
+
+    # set device BEFORE init_process_group so NCCL binds this rank to its
+    # own GPU; otherwise every rank's first CUDA context lands on GPU 0,
+    # leaving stale allocations and triggering OOMs on rank 0.
+    torch.cuda.set_device(device)
+
+    # NOTE: RNG seeding for reproducible weight init lives in seed_everything(),
+    # which train_ranker calls right before make_model() (after setup() so the
+    # process group is initialized for the cross-rank seed broadcast, and after
+    # the full gin parse so the gin-configurable $SEED is bound). Seeding here
+    # would be too early to be gin-configurable.
+
+    # initialize the process group
+    #
+    # The default PG timeout must match TIMEOUT (not the 600s NCCL default):
+    # checkpoint saves go through DCP collectives on *this* default PG, and the
+    # 560GB sparse-embedding write is both slow on shared NFS and badly skewed
+    # across ranks (shards range ~37GB..~95GB), so the fastest rank can sit in
+    # the post-write allgather/barrier well past 600s waiting for the slowest
+    # rank. The stock 600s watchdog then SIGABRTs an otherwise-healthy job.
+    if not dist.is_initialized():
+        dist.init_process_group(
+            "nccl",
+            rank=rank,
+            world_size=world_size,
+            device_id=device,
+            timeout=timedelta(seconds=TIMEOUT),
+        )
+
+    pg = dist.new_group(
+        backend=BACKEND,
+        timeout=timedelta(seconds=TIMEOUT),
+    )
+
+    return pg
+
+
+def cleanup() -> None:
+    dist.destroy_process_group()
+
+
+class HammerToTorchDataset(TorchDataset):
+    def __init__(
+        self,
+        dataset: Dataset,
+    ) -> None:
+        self.dataset: Dataset = dataset
+
+    def __getitem__(self, idx: int) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]:
+        self.dataset.load_query_samples([idx])
+        sample = self.dataset.get_sample(idx)
+        self.dataset.unload_query_samples([idx])
+        return sample
+
+    def __getitems__(
+        self, indices: List[int]
+    ) -> List[Tuple[KeyedJaggedTensor, KeyedJaggedTensor]]:
+        self.dataset.load_query_samples(indices)
+        samples = [self.dataset.get_sample(i) for i in indices]
+        self.dataset.unload_query_samples(indices)
+        return samples
+
+
+class _ChainedRanges:
+    """O(1) __len__ + O(log K) __getitem__ over a sequence of `range`s.
+
+    Lets `torch.utils.data.Subset(dataset, _ChainedRanges([r1, r2, ...]))`
+    avoid materializing a Python list of all per-block indices (which at
+    multi-billion totals is ~28 B/int and dominates host RAM).
+    """
+
+    def __init__(self, ranges: List[range]) -> None:
+        self._ranges: List[range] = list(ranges)
+        offsets = [0]
+        for r in self._ranges:
+            offsets.append(offsets[-1] + len(r))
+        self._offsets: List[int] = offsets
+
+    def __len__(self) -> int:
+        return self._offsets[-1]
+
+    def __getitem__(self, idx: int) -> int:
+        import bisect
+        if idx < 0:
+            idx += self._offsets[-1]
+        if idx < 0 or idx >= self._offsets[-1]:
+            raise IndexError(idx)
+        bucket = bisect.bisect_right(self._offsets, idx) - 1
+        return self._ranges[bucket][idx - self._offsets[bucket]]
+
+
+class ChunkDistributedSampler(DistributedSampler[_T_co]):
+    """
+    Each rank reads a contiguous chunk (trunk) of the input data
+    """
+
+    def __init__(
+        self,
+        dataset: TorchDataset,
+        num_replicas: Optional[int] = None,
+        rank: Optional[int] = None,
+        shuffle: bool = True,
+        seed: int = 1,
+        drop_last: bool = False,
+    ) -> None:
+        super().__init__(
+            dataset=dataset,
+            num_replicas=num_replicas,
+            rank=rank,
+            shuffle=shuffle,
+            seed=seed,
+            drop_last=drop_last,
+        )
+
+    def __iter__(self) -> Iterator[_T_co]:
+        if self.shuffle:
+            g = torch.Generator()
+            g.manual_seed(self.seed + self.epoch * 1001 + int(self.rank))
+            indices_t = torch.randperm(self.num_samples, generator=g)
+        else:
+            indices_t = torch.arange(self.num_samples)
+        assert self.drop_last is True, (
+            "drop_last must be True for ChunkDistributedSampler"
+        )
+        indices_t = indices_t + (self.num_samples * int(self.rank))
+        assert indices_t.numel() == self.num_samples
+        # Iterate via the numpy view, NOT directly over the tensor: iter(Tensor)
+        # calls Tensor.unbind(0) which eagerly materializes one zero-dim Tensor
+        # object per element (~600 B each). For 40 M eval / 525 M train samples
+        # that's 24 GB / 315 GB of [heap] growth per rank, blowing host RAM
+        # before the first batch. numpy's iter yields one Python int at a time
+        # with O(1) extra memory.
+        indices_np = indices_t.numpy()
+        return (int(x) for x in indices_np)
+
+    def set_epoch(self, epoch: int) -> None:
+        logger.warning(f"Setting epoch to {epoch}")
+        self.epoch = epoch
+
+
+@gin.configurable
+def make_model(
+    dataset: str,
+    bf16_training: bool = False,
+    hammer_kernel: Optional[str] = None,
+) -> Tuple[torch.nn.Module, DlrmHSTUConfig, Dict[str, EmbeddingConfig]]:
+    hstu_config = get_hstu_configs(dataset)
+    table_config = get_embedding_table_config(dataset)
+
+    # bf16 autocast is off by default: on the PYTORCH attn backend the
+    # pt_hstu_attention QK einsum backward overflows in bf16 at long
+    # sequences (NaN at step 1 when N>1k). Safe with TRITON; flip via
+    # `make_model.bf16_training = True` in the gin.
+    model = DlrmHSTU(
+        hstu_configs=hstu_config,
+        embedding_tables=table_config,
+        is_inference=False,
+        bf16_training=bf16_training,
+    )
+
+    # HSTU attention/compute kernel backend. Precedence:
+    #   HSTU_HAMMER_KERNEL env var  >  make_model.hammer_kernel gin  >  model default.
+    # The env var stays as an ad-hoc override (e.g. forcing PYTORCH for a one-off
+    # debug run) without editing the gin. Note: the fused TRITON path avoids
+    # materializing the dense [B, H, N, N] attention-score tensor that the PYTORCH
+    # path allocates (~32 GiB at N=2048, bs=1024), so TRITON is both faster and
+    # far lighter on HBM. On older ROCm, TRITON could hit PassManager errors at
+    # some shapes (make_ttgir) — fall back to PYTORCH via the gin/env if so.
+    kernel_choice = (
+        os.environ.get("HSTU_HAMMER_KERNEL", "").upper()
+        or (hammer_kernel.upper() if hammer_kernel else "")
+    )
+    if kernel_choice:
+        model.set_hammer_kernel(HammerKernel[kernel_choice])
+        logger.warning(f"HSTU hammer kernel set to: {kernel_choice}")
+
+    return (
+        model,
+        hstu_config,
+        table_config,
+    )
+
+
+@gin.configurable()
+def dense_optimizer_factory_and_class(
+    optimizer_name: str,
+    betas: Tuple[float, float],
+    eps: float,
+    weight_decay: float,
+    momentum: float,
+    learning_rate: float,
+) -> Tuple[
+    Type[Optimizer], Dict[str, Any], Callable[[Iterable[torch.Tensor]], Optimizer]
+]:
+    kwargs: Dict[str, Any] = {"lr": learning_rate}
+    if optimizer_name == "Adam":
+        optimizer_cls = torch.optim.Adam
+        kwargs.update({"betas": betas, "eps": eps, "weight_decay": weight_decay})
+    elif optimizer_name == "SGD":
+        optimizer_cls = torch.optim.SGD
+        kwargs.update({"weight_decay": weight_decay, "momentum": momentum})
+    elif optimizer_name == "AdamW":
+        optimizer_cls = torch.optim.AdamW
+        kwargs.update({"betas": betas, "eps": eps, "weight_decay": weight_decay})
+    else:
+        raise Exception("Unsupported optimizer!")
+
+    optimizer_factory = lambda params: optimizer_cls(params, **kwargs)
+
+    logger.info(
+        f"[dense optimizer] {optimizer_name} learning_rate={learning_rate} "
+        f"(resolved from gin; override via $DENSE_LR)"
+    )
+    return optimizer_cls, kwargs, optimizer_factory
+
+
+@gin.configurable()
+def sparse_optimizer_factory_and_class(
+    optimizer_name: str,
+    betas: Tuple[float, float],
+    eps: float,
+    weight_decay: float,
+    momentum: float,
+    learning_rate: float,
+) -> Tuple[
+    Type[Optimizer], Dict[str, Any], Callable[[Iterable[torch.Tensor]], Optimizer]
+]:
+    kwargs: Dict[str, Any] = {"lr": learning_rate}
+    if optimizer_name == "Adam":
+        optimizer_cls = torch.optim.Adam
+        beta1, beta2 = betas
+        kwargs.update(
+            {"beta1": beta1, "beta2": beta2, "eps": eps, "weight_decay": weight_decay}
+        )
+    elif optimizer_name == "SGD":
+        optimizer_cls = torchrec.optim.SGD
+        kwargs.update({"weight_decay": weight_decay, "momentum": momentum})
+    elif optimizer_name == "RowWiseAdagrad":
+        optimizer_cls = torchrec.optim.RowWiseAdagrad
+        beta1, beta2 = betas
+        kwargs.update(
+            {
+                "eps": eps,
+                "beta1": beta1,
+                "beta2": beta2,
+                "weight_decay": weight_decay,
+            }
+        )
+    else:
+        raise Exception("Unsupported optimizer!")
+
+    optimizer_factory = lambda params: optimizer_cls(params, **kwargs)
+
+    logger.info(
+        f"[sparse optimizer] {optimizer_name} learning_rate={learning_rate} "
+        f"(resolved from gin; override via $SPARSE_LR)"
+    )
+    return optimizer_cls, kwargs, optimizer_factory
+
+
+def _maybe_apply_qcomm_a2a(
+    sharders: List[Any],
+    device: torch.device,
+    forward_precision: str = "fp32",
+    backward_precision: str = "fp32",
+) -> List[Any]:
+    """Optionally quantize the embedding all-to-all payload via TorchRec qcomm.
+
+    The yambda-5b embedding shuffle is the dominant, bandwidth-bound (multi-node)
+    collective (~14.5 GB/rank fp32); a bf16/fp16 wire dtype halves it. Quant/
+    dequant happen inside the comm op, transparent to the lookup consumer. Ported
+    from the DLRMv2 R2 lever, retargeted from ``EmbeddingBagCollectionSharder`` to
+    the sequence ``EmbeddingCollectionSharder`` this model uses.
+
+    Forward and backward are configured independently because they have different
+    numerical needs (TorchRec golden_training/train_dlrm.py recommends
+    forward=fp16, backward=bf16): the forward carries bounded embedding
+    activations where fp16's extra mantissa helps, while gradients have a wider
+    range that can overflow fp16, so bf16 (fp32 exponent range) is safer there.
+    bf16 and fp16 are both 2 bytes, so the wire volume / perf is identical — the
+    choice is purely numerical.
+
+    Args (set via gin on ``make_optimizer_and_shard``, env-overridable). Each is
+    one of ``fp32`` (that direction unquantized) | ``bf16`` | ``fp16``. If BOTH
+    are fp32 the sharders are returned untouched (identical to baseline trunk).
+    """
+    _COMM = {"bf16": "BF16", "fp16": "FP16", "fp32": "FP32"}
+    fwd = (forward_precision or "fp32").strip().lower()
+    bwd = (backward_precision or "fp32").strip().lower()
+    rank0 = (not dist.is_initialized()) or dist.get_rank() == 0
+    for name, p in (("forward", fwd), ("backward", bwd)):
+        if p not in _COMM:
+            if rank0:
+                logger.warning(
+                    "DLRMV4 qcomm a2a: unknown %s precision %r (want "
+                    "fp32|bf16|fp16); using fp32 a2a",
+                    name,
+                    p,
+                )
+            return sharders
+    if fwd == "fp32" and bwd == "fp32":
+        return sharders
+    try:
+        from torchrec.distributed.embedding import EmbeddingCollectionSharder
+        from torchrec.distributed.fbgemm_qcomm_codec import (
+            CommType,
+            get_qcomm_codecs_registry,
+            QCommsConfig,
+        )
+
+        qcfg = QCommsConfig(
+            forward_precision=getattr(CommType, _COMM[fwd]),
+            backward_precision=getattr(CommType, _COMM[bwd]),
+        )
+        registry = get_qcomm_codecs_registry(qcfg, device=device)
+        new_sharders = []
+        replaced = False
+        for s in sharders:
+            if type(s).__name__ == "EmbeddingCollectionSharder" and not replaced:
+                new_sharders.append(
+                    EmbeddingCollectionSharder(qcomm_codecs_registry=registry)
+                )
+                replaced = True
+            else:
+                new_sharders.append(s)
+        if rank0:
+            logger.info(
+                "DLRMV4 qcomm a2a ENABLED: forward=%s backward=%s "
+                "replaced_ec_sharder=%s",
+                fwd,
+                bwd,
+                replaced,
+            )
+        return new_sharders
+    except Exception as e:  # noqa: BLE001 — fall back to fp32 a2a on any failure
+        if rank0:
+            logger.warning(
+                "DLRMV4 qcomm a2a: failed to enable (%s: %s); using fp32 a2a",
+                type(e).__name__,
+                e,
+            )
+        return sharders
+
+
+@gin.configurable
+def make_optimizer_and_shard(
+    model: torch.nn.Module,
+    device: torch.device,
+    world_size: int,
+    local_world_size: Optional[int] = None,
+    hbm_cap_gb: int = 260,
+    sparse_a2a_forward_precision: str = "fp32",
+    sparse_a2a_backward_precision: str = "fp32",
+) -> Tuple[DistributedModelParallel, torch.optim.Optimizer]:
+    dense_opt_cls, dense_opt_args, dense_opt_factory = (
+        dense_optimizer_factory_and_class()
+    )
+
+    sparse_opt_cls, sparse_opt_args, sparse_opt_factory = (
+        sparse_optimizer_factory_and_class()
+    )
+    # Fuse sparse optimizer to backward step
+    for k, module in model.named_modules():
+        if type(module) in TORCHREC_TYPES:
+            for _, param in module.named_parameters(prefix=k):
+                if param.requires_grad:
+                    apply_optimizer_in_backward(
+                        sparse_opt_cls, [param], sparse_opt_args
+                    )
+    sharders = get_default_sharders()
+    sharders = _maybe_apply_qcomm_a2a(
+        sharders,
+        device,
+        forward_precision=sparse_a2a_forward_precision,
+        backward_precision=sparse_a2a_backward_precision,
+    )
+    # local_world_size = GPUs per node so the planner respects the intra-node
+    # (xGMI/NVLink) vs inter-node hierarchy when placing shards. Defaults to
+    # world_size for the single-node case (no behavior change).
+    planner = EmbeddingShardingPlanner(
+        topology=Topology(
+            local_world_size=local_world_size or world_size,
+            world_size=world_size,
+            compute_device="cuda",
+            hbm_cap=hbm_cap_gb * 1024 * 1024 * 1024,
+            ddr_cap=0,
+        )
+    )
+    pg = dist.GroupMember.WORLD
+    env = ShardingEnv.from_process_group(pg)  # pyre-ignore [6]
+    pg = env.process_group
+
+    plan = planner.collective_plan(model, sharders, pg)
+
+    # Re-seed right before DMP materializes/inits the sharded embedding tables.
+    # The per-table seeded init_fn (configs.get_embedding_table_config) handles
+    # the eager path, but the fused FBGEMM TBE path inits weights on-device and
+    # may bypass init_fn, drawing from the global RNG instead. Re-seeding here
+    # (same value on every rank) makes embedding init reproducible run-to-run for
+    # a fixed sharding plan (Tier 1). Dense params are already initialized in
+    # make_model, so this does not perturb them.
+    _emb_seed = int(os.environ.get("SEED", "1"))
+    torch.manual_seed(_emb_seed)
+    torch.cuda.manual_seed_all(_emb_seed)
+    logger.info(f"[emb-init] re-seeded RNGs before DMP with SEED={_emb_seed}")
+
+    # Shard model
+    model = DistributedModelParallel(
+        module=model,
+        device=device,
+        plan=plan,
+        sharders=sharders,
+    )
+
+    # --- startup init checksum (reproducibility probe) -------------------------
+    # Right after DMP materializes real weights, log a deterministic fingerprint
+    # of every parameter so two builds with the same $SEED + sharding plan can be
+    # diffed for byte-level init reproducibility. For sharded embeddings we
+    # all-reduce the per-shard (count, sum, sumsq) so the fingerprint covers the
+    # WHOLE table independent of how rows split across ranks; replicated dense
+    # params use rank 0's local copy.
+    # OFF BY DEFAULT: the fp64 reductions below (.sum(dtype=float64) /
+    # vector_norm(dtype=float64)) materialize a full fp64 copy of each local
+    # embedding shard (~2x the fp32 shard, i.e. >150 GiB for the big tables),
+    # which leaves almost no HBM headroom after sharding and will OOM the build on
+    # any node with residual memory. Only enable for explicit reproducibility
+    # checks, ideally with a smaller batch / on a clean node. Enable with
+    # INIT_CHECKSUM=1.
+    if os.environ.get("INIT_CHECKSUM", "0") == "1":
+        import hashlib
+
+        _rank = dist.get_rank() if dist.is_initialized() else 0
+        _fps: List[str] = []
+        for _name, _p in sorted(model.named_parameters(), key=lambda kv: kv[0]):
+            _sharded = isinstance(_p, ShardedTensor)
+            if _sharded:
+                _shards = _p.local_shards()
+                _loc = _shards[0].tensor if _shards else None
+            else:
+                _loc = _p
+            if _loc is None or _loc.numel() == 0:
+                _cnt, _sm, _sq = 0.0, 0.0, 0.0
+            else:
+                _det = _loc.detach()
+                _cnt = float(_det.numel())
+                _sm = _det.sum(dtype=torch.float64).item()
+                _nrm = torch.linalg.vector_norm(
+                    _det, ord=2, dtype=torch.float64
+                ).item()
+                _sq = _nrm * _nrm
+            if _sharded and dist.is_initialized():
+                _stat = torch.tensor(
+                    [_cnt, _sm, _sq], dtype=torch.float64, device=device
+                )
+                dist.all_reduce(_stat, op=dist.ReduceOp.SUM)
+                _cnt, _sm, _sq = _stat.tolist()
+            _fps.append(f"{_name}|{int(_cnt)}|{_sm:.6f}|{_sq:.6f}")
+            if _rank == 0:
+                logger.info(
+                    f"[init-checksum] {'sharded' if _sharded else 'dense'} "
+                    f"{_name} n={int(_cnt)} sum={_sm:.6f} sumsq={_sq:.6f}"
+                )
+        if _rank == 0:
+            _digest = hashlib.sha256("\n".join(_fps).encode()).hexdigest()[:16]
+            logger.info(
+                f"[init-checksum] SEED={os.environ.get('SEED', '?')} "
+                f"params={len(_fps)} digest={_digest}"
+            )
+
+    # Create keyed optimizer
+    all_optimizers = []
+    all_params = {}
+    non_fused_sparse_params = {}
+    for k, v in in_backward_optimizer_filter(model.named_parameters()):
+        if v.requires_grad:
+            if isinstance(v, ShardedTensor):
+                non_fused_sparse_params[k] = v
+            else:
+                all_params[k] = v
+
+    if non_fused_sparse_params:
+        all_optimizers.append(
+            (
+                "sparse_non_fused",
+                KeyedOptimizerWrapper(
+                    params=non_fused_sparse_params, optim_factory=sparse_opt_factory
+                ),
+            )
+        )
+
+    if all_params:
+        all_optimizers.append(
+            (
+                "dense",
+                KeyedOptimizerWrapper(
+                    params=all_params,
+                    optim_factory=dense_opt_factory,
+                ),
+            )
+        )
+    output_optimizer = CombinedOptimizer(all_optimizers)
+    output_optimizer.init_state(set(model.sparse_grad_parameter_names()))
+    return model, output_optimizer
+
+
+@gin.configurable
+def make_streaming_dataloader(
+    dataset: HammerToTorchDataset,
+    ts: Optional[int] = None,
+    batch_size: int = 0,
+    num_workers: int = 0,
+    prefetch_factor: int = 0,
+    train_only: bool = False,
+    indices: Optional["np.ndarray"] = None,
+) -> DataLoader:
+    # `indices` (explicit anchor index array) is used by the eval path to
+    # iterate the FIXED user-holdout set, which spans a window range rather than
+    # a single ts. Otherwise restrict to window `ts`; train_only=True drops
+    # held-out eval users so the non-persistent TRAIN loader never trains on
+    # them (no-leakage guarantee).
+    if indices is not None:
+        dataset.dataset.set_active_indices(indices)  # pyre-ignore [16]
+    else:
+        assert ts is not None, "make_streaming_dataloader needs ts or indices"
+        dataset.dataset.set_ts(ts, train_only=train_only)  # pyre-ignore [16]
+    total_items = dataset.dataset.get_item_count()
+    subset = torch.utils.data.Subset(dataset, range(total_items))
+    # shuffle=False keeps temporal order within the window: a non-shuffling
+    # DistributedSampler hands rank r the strided slice indices[r::num_replicas]
+    # (round-robin), so all ranks stay on the same time front and consume the
+    # window in index order. Fork ctx mirrors the train path (COW-share the
+    # mmap'd store instead of pickling it into every worker).
+    mp_ctx = "fork" if num_workers and num_workers > 0 else None
+    dataloader = DataLoader(
+        dataset=subset,
+        batch_size=batch_size,
+        shuffle=False,
+        collate_fn=collate_fn,
+        drop_last=True,
+        num_workers=num_workers,
+        prefetch_factor=prefetch_factor,
+        sampler=DistributedSampler(subset, shuffle=False, drop_last=True),
+        multiprocessing_context=mp_ctx,
+    )
+    return dataloader
+
+
+class StreamingWindowSampler(torch.utils.data.Sampler):
+    """Per-rank sampler whose index list is swapped each window.
+
+    Yields this rank's round-robin slice of the active window's GLOBAL anchor
+    indices (into the dataset's ``_positions``). Because indices are global, a
+    single DataLoader with ``persistent_workers=True`` can be reused across all
+    windows: the main process re-iterates this sampler each window and ships the
+    new indices to the already-forked workers, which map any global index via
+    the shared mmap. No per-window worker respawn / dataset re-pickle.
+
+    Round-robin striding (rank r gets ``indices[r::world_size]``) over the
+    time-sorted window keeps every rank on the same time front; the window is
+    truncated to a multiple of ``world_size`` so all ranks get equal counts
+    (required for DDP collective lockstep).
+    """
+
+    def __init__(self, rank: int, world_size: int) -> None:
+        self._rank: int = rank
+        self._world_size: int = world_size
+        self._indices: List[int] = []
+
+    def set_window(self, global_indices, skip_samples: int = 0) -> None:
+        """Install this window's per-rank index list, optionally fast-forwarding.
+
+        ``skip_samples`` drops the first N per-rank samples from the list so the
+        next ``__iter__`` starts at sample N+1 in this rank's slice. Used on
+        resume to skip batches that were already trained: pass
+        ``skip_samples = batch_size * batches_completed`` and the dataloader
+        emits batches starting at exactly the next unseen batch.
+
+        The skip is safe because the sample order is fully deterministic given
+        (global_indices, rank, world_size): we re-derive the same per-rank list
+        as the pre-crash run, just hand back a tail slice of it.
+        """
+        n = (len(global_indices) // self._world_size) * self._world_size
+        per_rank = global_indices[:n][self._rank :: self._world_size].tolist()
+        if skip_samples < 0 or skip_samples > len(per_rank):
+            raise ValueError(
+                f"skip_samples={skip_samples} out of [0, {len(per_rank)}] "
+                f"for rank={self._rank} world_size={self._world_size}"
+            )
+        self._indices = per_rank[skip_samples:]
+
+    def __iter__(self):
+        return iter(self._indices)
+
+    def __len__(self) -> int:
+        return len(self._indices)
+
+
+@gin.configurable
+def make_persistent_streaming_dataloader(
+    dataset: HammerToTorchDataset,
+    sampler: StreamingWindowSampler,
+    batch_size: int,
+    num_workers: int,
+    prefetch_factor: int,
+) -> DataLoader:
+    """One reusable DataLoader for the whole streaming run. ``sampler`` is
+    mutated per window via ``set_window``; workers persist across windows."""
+    use_workers = bool(num_workers and num_workers > 0)
+    return DataLoader(
+        dataset=dataset,
+        batch_size=batch_size,
+        shuffle=False,
+        collate_fn=collate_fn,
+        drop_last=True,
+        num_workers=num_workers,
+        prefetch_factor=prefetch_factor if use_workers else None,
+        sampler=sampler,
+        persistent_workers=use_workers,
+        multiprocessing_context="fork" if use_workers else None,
+    )
+
+
+class _PrefetchingWindowLoader:
+    """Double-buffered window loader for the persistent streaming path.
+
+    Holds ``n_buffers`` pre-forked persistent worker pools that ping-pong: while
+    the current window trains on one pool, the *next* window's index selection
+    (``window_indices``) and first-batch prefetch are prepared on another pool
+    in a background thread. By the time training advances, that window is warm,
+    so the per-window reset (mask + first-batch stall) is hidden behind GPU
+    compute (~0 dead time at the boundary).
+
+    Worker pools are forked once on the main thread at the start of ``stream``;
+    afterwards only iterator resets happen (no forks), so background-thread
+    preparation cannot fork while other threads hold locks.
+    """
+
+    def __init__(
+        self,
+        dataset: "HammerToTorchDataset",
+        sampler_factory,
+        dl_factory,
+        n_buffers: int = 2,
+    ) -> None:
+        self._dataset = dataset
+        self._n = n_buffers
+        self._samplers = [sampler_factory() for _ in range(n_buffers)]
+        self._dls = [dl_factory(s) for s in self._samplers]
+        self._iters: List[Optional[object]] = [None] * n_buffers
+
+    def _prepare(self, buf: int, ts: int, skip_samples: int = 0) -> None:
+        # train_window_indices() is the O(N) mask (+ uid-hash filter for the
+        # holdout); numpy releases the GIL for it, so it overlaps the main
+        # thread's GPU dispatch. iter() then kicks off this pool's background
+        # prefetch. This is a TRAIN-only loader, so held-out eval users are
+        # excluded here. `skip_samples` is non-zero only for the very first
+        # window after a mid-window resume; subsequent windows always start at 0.
+        self._samplers[buf].set_window(
+            self._dataset.dataset.train_window_indices(ts), skip_samples=skip_samples
+        )
+        self._iters[buf] = iter(self._dls[buf])
+
+    def stream(self, ts_list: List[int], first_skip_samples: int = 0):
+        """Stream (ts, iterator) pairs. `first_skip_samples` is applied ONLY to
+        the first ts in ``ts_list`` (the mid-window-resumed window); every
+        subsequent window starts at sample 0 of its own per-rank list."""
+        n = len(ts_list)
+        if n == 0:
+            return
+        threads: List[Optional[threading.Thread]] = [None] * self._n
+        # Prime the first n_buffers windows on the main thread (forks all pools).
+        for b in range(min(self._n, n)):
+            skip = first_skip_samples if b == 0 else 0
+            self._prepare(b, ts_list[b], skip_samples=skip)
+        for i in range(n):
+            buf = i % self._n
+            if threads[buf] is not None:
+                threads[buf].join()
+                threads[buf] = None
+            yield ts_list[i], self._iters[buf]
+            # This pool is now free; prefetch the window n_buffers ahead.
+            # No skip on subsequent windows — only the first prepared window
+            # carries `first_skip_samples`.
+            j = i + self._n
+            if j < n:
+                th = threading.Thread(
+                    target=self._prepare, args=(buf, ts_list[j]), daemon=True
+                )
+                th.start()
+                threads[buf] = th
+
+
+@gin.configurable
+def make_train_test_dataloaders(
+    batch_size: int,
+    dataset_type: str,
+    hstu_config: DlrmHSTUConfig,
+    train_split_percentage: float,
+    embedding_table_configs: Dict[str, EmbeddingConfig],
+    new_path_prefix: str = "",
+    num_workers: int = 0,
+    num_blocks: int = 1,
+    prefetch_factor: Optional[int] = None,
+    eval_batch_size: Optional[int] = None,
+) -> Tuple[DataLoader, DataLoader]:
+    dataset_class, kwargs = get_dataset(
+        name=dataset_type, new_path_prefix=new_path_prefix
+    )
+    kwargs["embedding_config"] = embedding_table_configs
+
+    # Create dataset
+    dataset = HammerToTorchDataset(
+        dataset=dataset_class(hstu_config=hstu_config, is_inference=False, **kwargs)
+    )
+    total_items = dataset.dataset.get_item_count()
+    items_per_block = total_items // num_blocks
+    train_size_per_block = round(train_split_percentage * items_per_block)
+    # Avoid `extend(range(...))` which materializes a Python list of all sample
+    # indices — at 3.2B yambda samples × 28 bytes/int ≈ 90 GB/rank just for
+    # train_inds. Subset accepts any sequence with O(1) __len__ and __getitem__,
+    # so pass range objects (or a tiny chained view) directly.
+    if num_blocks == 1:
+        train_inds = range(0, train_size_per_block)
+        test_inds = range(train_size_per_block, items_per_block)
+    else:
+        train_inds = _ChainedRanges([
+            range(i * items_per_block, i * items_per_block + train_size_per_block)
+            for i in range(num_blocks)
+        ])
+        test_inds = _ChainedRanges([
+            range(i * items_per_block + train_size_per_block, (i + 1) * items_per_block)
+            for i in range(num_blocks)
+        ])
+    train_set = torch.utils.data.Subset(dataset, train_inds)
+    test_set = torch.utils.data.Subset(dataset, test_inds)
+
+    # When the parent rank is started via mp.start_processes(start_method="spawn"),
+    # torch.multiprocessing's default Process context is also "spawn". DataLoader
+    # then pickles `self._dataset` to send to each worker — which for our mmap'd
+    # 211 GB yambda store materializes the entire dataset into the parent's anon
+    # memory (~230 GB/rank). Forcing "fork" lets workers inherit the parent's
+    # mmap'd pages via COW with zero extra anon.
+    mp_ctx = "fork" if num_workers and num_workers > 0 else None
+    train_dataloader = DataLoader(
+        dataset=train_set,
+        batch_size=batch_size,
+        shuffle=False,
+        collate_fn=collate_fn,
+        drop_last=True,
+        num_workers=num_workers,
+        prefetch_factor=prefetch_factor,
+        sampler=ChunkDistributedSampler(train_set, drop_last=True, shuffle=True),
+        multiprocessing_context=mp_ctx,
+    )
+    test_dataloader = DataLoader(
+        dataset=test_set,
+        batch_size=eval_batch_size if eval_batch_size is not None else batch_size,
+        shuffle=False,
+        collate_fn=collate_fn,
+        drop_last=True,
+        num_workers=num_workers,
+        prefetch_factor=prefetch_factor,
+        sampler=ChunkDistributedSampler(test_set, drop_last=True, shuffle=True),
+        multiprocessing_context=mp_ctx,
+    )
+    return train_dataloader, test_dataloader
+
+
+# THROWAWAY DIAG state: per-embedding-table lookup stats accumulated across a
+# fixed window of steps (DIAG_EMB_STEPS, default 100) so the reported numbers are
+# averaged/aggregated rather than a single noisy batch. Rank-0 only.
+_EMB_DIAG_ACC: Dict[str, Dict[str, Any]] = {}
+_EMB_DIAG_NBATCH: int = 0
+# Cap (per-batch lookups) below which we also track a TRUE global-unique set
+# across the whole window (cheap for contextual/cross tables, total == batch
+# size). Sequential tables (item/artist/album) blow past this and only get the
+# per-batch averages.
+_EMB_DIAG_GLOBAL_CAP: int = 1 << 17  # 131072
+
+
+def _log_unique_embedding_diag(
+    sample, rank: int, step: int, max_steps: int = 100, log_every: int = 50
+) -> None:
+    """Diagnostic: aggregate per-embedding-table lookup stats over a step window.
+
+    Quantifies the user-major batching concern — when consecutive sliding-window
+    anchors come from the same few users, a batch reads very few UNIQUE embedding
+    rows (low unique/total), so embedding lookups are highly redundant. Covers the
+    base id tables AND the cross-feature tables (user_x_* / *_x_hour hashed
+    combos); the user_x_* tables should be the most redundant under shuffle OFF.
+
+    Accumulates over ``max_steps`` batches and emits an aggregate summary (mean
+    per-batch unique%/hot%/top10%, plus a true global-unique over the whole
+    window for the small contextual/cross tables). Rank-0 only, non-fatal.
+    """
+    if rank != 0:
+        return
+    global _EMB_DIAG_NBATCH
+    try:
+        from generative_recommenders.dlrm_v3.configs import YAMBDA_5B_CROSS_SPECS
+
+        cross_caps = {name: n for (name, _k, n, _s) in YAMBDA_5B_CROSS_SPECS}
+
+        def _table_of(key: str):
+            # cross tables match by exact name; resolve BEFORE substring fallbacks
+            # so e.g. 'user_x_artist' isn't misread as the artist_id table.
+            if key in cross_caps:
+                return key, cross_caps[key]
+            if key == "uid" or key.endswith("_uid") or key.endswith(".uid"):
+                return "uid", 0
+            if "artist" in key:
+                return "artist_id", 0
+            if "album" in key:
+                return "album_id", 0
+            if "item" in key and key.endswith("id"):
+                return "item_id", 0
+            return None, 0
+
+        for tag, kjt in (
+            ("uih", sample.uih_features_kjt),
+            ("cand", sample.candidates_features_kjt),
+        ):
+            for key in kjt.keys():
+                table, cap = _table_of(key)
+                if table is None:
+                    continue
+                vals = kjt[key].values()
+                total = int(vals.numel())
+                if total == 0:
+                    continue
+                u, counts = torch.unique(vals, return_counts=True)
+                uniq = int(u.numel())
+                hot1 = int(counts.max().item())
+                k = min(10, uniq)
+                topk = int(torch.topk(counts, k).values.sum().item())
+
+                slot = _EMB_DIAG_ACC.setdefault(
+                    f"{tag}.{key}",
+                    {
+                        "table": table,
+                        "cap": cap,
+                        "n": 0,
+                        "tot": 0,
+                        "uniq": 0,
+                        "upct": 0.0,
+                        "upct_min": 100.0,
+                        "upct_max": 0.0,
+                        "hot1pct": 0.0,
+                        "topkpct": 0.0,
+                        "glob": None,  # running global-unique id tensor (small tables)
+                    },
+                )
+                upct = 100.0 * uniq / total
+                slot["n"] += 1
+                slot["tot"] += total
+                slot["uniq"] += uniq
+                slot["upct"] += upct
+                slot["upct_min"] = min(slot["upct_min"], upct)
+                slot["upct_max"] = max(slot["upct_max"], upct)
+                slot["hot1pct"] += 100.0 * hot1 / total
+                slot["topkpct"] += 100.0 * topk / total
+                if total <= _EMB_DIAG_GLOBAL_CAP:
+                    prev = slot["glob"]
+                    merged = u if prev is None else torch.cat([prev, u])
+                    slot["glob"] = torch.unique(merged)
+
+        _EMB_DIAG_NBATCH += 1
+        n = _EMB_DIAG_NBATCH
+        if n % log_every == 0 or n >= max_steps:
+            lines = [f"emb-diag AGGREGATE over {n} batches (step<= {step}):"]
+            for name in sorted(_EMB_DIAG_ACC):
+                s = _EMB_DIAG_ACC[name]
+                c = max(1, s["n"])
+                cap_s = f" cap={s['cap']/1e6:.0f}M" if s["cap"] else ""
+                glob_s = ""
+                if s["glob"] is not None:
+                    g = int(s["glob"].numel())
+                    glob_s = (
+                        f" | global_uniq={g} over {s['tot']} seen "
+                        f"({s['tot']/max(1,g):.1f}x reuse)"
+                    )
+                lines.append(
+                    f"  {name}[{s['table']}]{cap_s}: "
+                    f"avg_tot={s['tot']/c:.0f} "
+                    f"avg_uniq%={s['upct']/c:.1f} "
+                    f"(min={s['upct_min']:.1f} max={s['upct_max']:.1f}) "
+                    f"avg_hot1%={s['hot1pct']/c:.1f} "
+                    f"avg_top10%={s['topkpct']/c:.1f}"
+                    f"{glob_s}"
+                )
+            logger.info("\n".join(lines))
+    except Exception as e:  # diagnostic must never break training
+        logger.warning(f"emb-diag failed: {e}")
+
+
+@gin.configurable
+def train_loop(
+    rank: int,
+    model: torch.nn.Module,
+    dataloader: torch.utils.data.DataLoader,
+    optimizer: Optimizer,
+    metric_logger: MetricsLogger,
+    device: torch.device,
+    num_epochs: int,
+    num_batches: Optional[int] = None,
+    output_trace: bool = False,
+    metric_log_frequency: int = 1,
+    checkpoint_frequency: int = 100,
+    start_batch_idx: int = 0,
+    streaming_diag_unique_emb: bool = False,
+    # lr_scheduler: to-do: Add a scheduler
+) -> None:
+    model.train()
+    batch_idx: int = start_batch_idx
+    profiler = Profiler(rank) if output_trace else None
+
+    for epoch in range(num_epochs):
+        dataloader.sampler.set_epoch(epoch)  # pyre-ignore [16]
+        for sample in dataloader:
+            if streaming_diag_unique_emb and batch_idx < int(
+                os.environ.get("DIAG_EMB_STEPS", "100")
+            ):
+                _log_unique_embedding_diag(
+                    sample,
+                    rank,
+                    batch_idx,
+                    max_steps=int(os.environ.get("DIAG_EMB_STEPS", "100")),
+                    log_every=metric_log_frequency,
+                )
+            optimizer.zero_grad()
+            sample.to(device)
+            (
+                _,
+                _,
+                aux_losses,
+                mt_target_preds,
+                mt_target_labels,
+                mt_target_weights,
+            ) = model.forward(
+                sample.uih_features_kjt,
+                sample.candidates_features_kjt,
+            )
+            # pyre-ignore
+            sum(aux_losses.values()).backward()
+            optimizer.step()
+            metric_logger.update(
+                mode="train",
+                predictions=mt_target_preds,
+                labels=mt_target_labels,
+                weights=mt_target_weights,
+                num_candidates=sample.candidates_features_kjt.lengths().view(
+                    len(sample.candidates_features_kjt.keys()), -1
+                )[0],
+            )
+            if batch_idx % metric_log_frequency != 0:
+                metric_logger.compute_and_log(
+                    mode="train",
+                    additional_logs={
+                        "losses": aux_losses,
+                    },
+                )
+            if batch_idx % checkpoint_frequency == 0 and batch_idx > 0:
+                save_dmp_checkpoint(
+                    model=model,
+                    optimizer=optimizer,
+                    metric_logger=metric_logger,
+                    rank=rank,
+                    batch_idx=batch_idx,
+                )
+            batch_idx += 1
+            if output_trace:
+                assert profiler is not None
+                profiler.step()
+            if num_batches is not None and batch_idx >= num_batches:
+                break
+        if num_batches is not None and batch_idx >= num_batches:
+            break
+
+
+@gin.configurable
+def eval_loop(
+    rank: int,
+    model: torch.nn.Module,
+    dataloader: torch.utils.data.DataLoader,
+    metric_logger: MetricsLogger,
+    device: torch.device,
+    metric_log_frequency: int = 1,
+    num_batches: Optional[int] = None,
+    output_trace: bool = False,
+    # lr_scheduler: to-do: Add a scheduler
+) -> None:
+    model.eval()
+    # Exclude eval wall-time from the train step-time window (see _run_eval_window).
+    metric_logger.pause_perf("eval")
+    batch_idx: int = 0
+    profiler = Profiler(rank) if output_trace else None
+    metric_logger.reset(mode="eval")
+    with torch.no_grad():
+        for sample in dataloader:
+            sample.to(device)
+            (
+                _,
+                _,
+                _,
+                mt_target_preds,
+                mt_target_labels,
+                mt_target_weights,
+            ) = model.forward(
+                sample.uih_features_kjt,
+                sample.candidates_features_kjt,
+            )
+            metric_logger.update(
+                mode="eval",
+                predictions=mt_target_preds,
+                labels=mt_target_labels,
+                weights=mt_target_weights,
+                num_candidates=sample.candidates_features_kjt.lengths().view(
+                    len(sample.candidates_features_kjt.keys()), -1
+                )[0],
+            )
+            if batch_idx % metric_log_frequency != 0:
+                metric_logger.compute_and_log(mode="eval")
+            batch_idx += 1
+            if output_trace:
+                assert profiler is not None
+                profiler.step()
+            if num_batches is not None and batch_idx >= num_batches:
+                break
+    metric_logger.compute_and_log(mode="eval")
+    for k, v in metric_logger.compute(mode="eval").items():
+        print(f"{k}: {v}")
+    metric_logger.resume_perf("eval")
+
+
+class _PipelineModelWrapper(torch.nn.Module):
+    """Adapt ``DlrmHSTU.forward`` to the ``(loss, output)`` contract that
+    ``TrainPipelineSparseDist`` expects.
+
+    The wrapped ``model`` is the same DMP instance handed to the pipeline as
+    ``model=``; the pipeline rewrites its sharded ``EmbeddingCollection`` in
+    place, so calling it here is what lets the embedding all-to-all overlap the
+    dense forward/backward compute.
+    """
+
+    def __init__(self, model: torch.nn.Module) -> None:
+        super().__init__()
+        self._model = model
+
+    def forward(
+        self, batch: Samples
+    ) -> Tuple[torch.Tensor, Tuple[Any, ...]]:
+        # The model runs in `_pipeline_mode`: it takes the whole batch as its
+        # single arg and reads the pre-merged sparse KJT off it. This keeps the
+        # EmbeddingCollection input a plain getattr on the batch placeholder so
+        # TorchRec pipelines its input_dist (instead of skipping it for "input
+        # modifications").
+        (
+            _,
+            _,
+            aux_losses,
+            mt_target_preds,
+            mt_target_labels,
+            mt_target_weights,
+        ) = self._model(batch)
+        loss = sum(aux_losses.values())
+        num_candidates = batch.candidates_features_kjt.lengths().view(
+            len(batch.candidates_features_kjt.keys()), -1
+        )[0]
+        output = (
+            aux_losses,
+            mt_target_preds,
+            mt_target_labels,
+            mt_target_weights,
+            num_candidates,
+        )
+        return loss, output
+
+
+def build_train_pipeline(
+    model: torch.nn.Module,
+    optimizer: Optimizer,
+    device: torch.device,
+    grad_clip_norm: float = 1.0,
+) -> Any:
+    """Build a ``TrainPipelineSparseDist`` for the DMP-wrapped HSTU model.
+
+    The 3-stage pipeline overlaps (1) H2D transfer of batch N+2, (2) the sparse
+    data-dist all-to-all of batch N+1's embedding lookup, and (3) dense fwd/bwd
+    of batch N, on separate CUDA streams. Requires the model to be wrapped with
+    ``DistributedModelParallel`` (see ``make_optimizer_and_shard``).
+    """
+    # Lazy import: keeps module import working on torchrec builds that move or
+    # rename the pipeline, and matches the reference Primus-DLRM setup.
+    from torchrec.distributed.train_pipeline import TrainPipelineSparseDist
+
+    # Switch the (DMP-wrapped) HSTU model into pipeline mode so both the fx trace
+    # and the live forward consume the batch as a single arg and read the
+    # pre-merged sparse KJT off it — required for the embedding input_dist to be
+    # pipelined. Eval call sites pass the batch the same way (see train_eval_loop).
+    underlying = model.module if hasattr(model, "module") else model
+    underlying._pipeline_mode = True
+
+    # The pipeline calls backward()+optimizer.step() internally inside
+    # progress(), leaving no in-loop hook point for gradient clipping. Clip via
+    # a full-backward hook (fires after autograd populates dense grads, before
+    # the optimizer step) to preserve parity with the sequential path's
+    # clip_grad_norm_(model.parameters(), max_norm=1.0).
+    if grad_clip_norm and grad_clip_norm > 0:
+
+        def _clip_grads(_m: torch.nn.Module, _gi: Any, _go: Any) -> None:
+            torch.nn.utils.clip_grad_norm_(
+                model.parameters(), max_norm=grad_clip_norm
+            )
+
+        model.register_full_backward_hook(_clip_grads)
+
+    return TrainPipelineSparseDist(
+        model=model,
+        optimizer=optimizer,
+        device=device,
+        execute_all_batches=True,
+        custom_model_fwd=_PipelineModelWrapper(model),
+    )
+
+
+@gin.configurable
+def train_eval_loop(
+    rank: int,
+    model: torch.nn.Module,
+    optimizer: Optimizer,
+    metric_logger: MetricsLogger,
+    device: torch.device,
+    num_epochs: int,
+    num_train_batches: Optional[int] = None,
+    num_eval_batches: Optional[int] = None,
+    train_dataloader: Optional[torch.utils.data.DataLoader] = None,
+    eval_dataloader: Optional[torch.utils.data.DataLoader] = None,
+    output_trace: bool = False,
+    metric_log_frequency: int = 1,
+    checkpoint_frequency: int = 100,
+    eval_frequency: int = 1,
+    start_train_batch_idx: int = 0,
+    start_eval_batch_idx: int = 0,
+    use_pipeline: bool = False,
+    # lr_scheduler: to-do: Add a scheduler
+) -> None:
+    train_batch_idx: int = start_train_batch_idx
+    eval_batch_idx: int = start_eval_batch_idx
+    profiler = Profiler(rank) if output_trace else None
+    assert train_dataloader is not None and eval_dataloader is not None
+
+    eval_data_iterator = iter(eval_dataloader)
+    train_data_iterator = iter(train_dataloader)
+
+    # 3-stage TorchRec pipeline (overlaps embedding a2a with dense compute).
+    # When enabled, progress() owns H2D copy, sparse-dist, fwd/bwd and the
+    # optimizer step; grad clipping moves to a full-backward hook (see builder).
+    train_pipeline = (
+        build_train_pipeline(model, optimizer, device) if use_pipeline else None
+    )
+
+    for epoch in range(num_epochs):
+        train_dataloader.sampler.set_epoch(epoch)  # pyre-ignore [16]
+        while True:
+            model.train()
+            if train_pipeline is not None:
+                try:
+                    (
+                        aux_losses,
+                        mt_target_preds,
+                        mt_target_labels,
+                        mt_target_weights,
+                        num_candidates,
+                    ) = train_pipeline.progress(train_data_iterator)
+                except StopIteration:
+                    train_data_iterator = iter(train_dataloader)
+                    break
+            else:
+                try:
+                    sample = next(train_data_iterator)
+                except StopIteration:
+                    train_data_iterator = iter(train_dataloader)
+                    break
+                optimizer.zero_grad()
+                sample.to(device)
+                (
+                    _,
+                    _,
+                    aux_losses,
+                    mt_target_preds,
+                    mt_target_labels,
+                    mt_target_weights,
+                ) = model.forward(
+                    sample.uih_features_kjt,
+                    sample.candidates_features_kjt,
+                )
+                # pyre-ignore
+                sum(aux_losses.values()).backward()
+                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
+                optimizer.step()
+                num_candidates = sample.candidates_features_kjt.lengths().view(
+                    len(sample.candidates_features_kjt.keys()), -1
+                )[0]
+            metric_logger.update(
+                mode="train",
+                predictions=mt_target_preds,
+                labels=mt_target_labels,
+                weights=mt_target_weights,
+                num_candidates=num_candidates,
+            )
+            if train_batch_idx % metric_log_frequency == 0:
+                metric_logger.compute_and_log(
+                    mode="train",
+                    additional_logs={
+                        "losses": aux_losses,
+                    },
+                )
+            if train_batch_idx % checkpoint_frequency == 0 and train_batch_idx > 0:
+                save_dmp_checkpoint(
+                    model=model,
+                    optimizer=optimizer,
+                    metric_logger=metric_logger,
+                    rank=rank,
+                    batch_idx=train_batch_idx,
+                )
+            train_batch_idx += 1
+            if output_trace:
+                assert profiler is not None
+                profiler.step()
+            if train_batch_idx % eval_frequency == 0:
+                model.eval()
+                eval_batch_idx: int = 0
+                with torch.no_grad():
+                    while True:
+                        try:
+                            sample = next(eval_data_iterator)
+                        except StopIteration:
+                            eval_data_iterator = iter(eval_dataloader)
+                            sample = next(eval_data_iterator)
+                        sample.to(device)
+                        (
+                            _,
+                            _,
+                            _,
+                            mt_target_preds,
+                            mt_target_labels,
+                            mt_target_weights,
+                        ) = (
+                            # In pipeline mode the model takes the batch as one
+                            # arg (see _PipelineModelWrapper / DlrmHSTU.forward).
+                            model.forward(sample)
+                            if use_pipeline
+                            else model.forward(
+                                sample.uih_features_kjt,
+                                sample.candidates_features_kjt,
+                            )
+                        )
+                        metric_logger.update(
+                            mode="eval",
+                            predictions=mt_target_preds,
+                            labels=mt_target_labels,
+                            weights=mt_target_weights,
+                            num_candidates=sample.candidates_features_kjt.lengths().view(
+                                len(sample.candidates_features_kjt.keys()), -1
+                            )[0],
+                        )
+                        eval_batch_idx += 1
+                        if output_trace:
+                            assert profiler is not None
+                            profiler.step()
+                        if eval_batch_idx % metric_log_frequency == 0:
+                            metric_logger.compute_and_log(mode="eval")
+                        if (
+                            num_eval_batches is not None
+                            and eval_batch_idx >= num_eval_batches
+                        ):
+                            break
+                    for k, v in metric_logger.compute(mode="eval").items():
+                        print(f"{k}: {v}")
+                model.train()
+            # `num_train_batches` cap: None or 0 = run the whole window. >0 caps
+            # batches per window (mostly the streaming-resume test driver uses
+            # this to keep test windows short).
+            if num_train_batches and train_batch_idx >= num_train_batches:
+                break
+
+
+def select_in_window_checkpoint_reason(
+    *,
+    train_batch_idx: int,
+    global_step: int,
+    elapsed_since_last_save: float,
+    in_window_checkpoint_frequency: int,
+    checkpoint_step_frequency: int,
+    checkpoint_time_interval_s: float,
+) -> Optional[str]:
+    """Decide which (if any) in-window checkpoint cadence fires this batch.
+
+    Pure / distributed-agnostic so it can be unit-tested without a real run.
+    The caller computes `elapsed_since_last_save` (broadcast from rank 0 in the
+    streaming loop) so all ranks pass the same value and reach the same verdict.
+
+    Precedence (at most one save per batch): per-window-local batch count >
+    monotonic global step > wall-clock interval. Returns the trigger reason
+    string, or None when no cadence fires. A cadence is disabled when its
+    frequency/interval is 0 / 0.0.
+
+    Counter conventions match the loop: `train_batch_idx` is already
+    post-incremented (>=1 on the first batch), and `global_step` is guarded
+    >0 so step 0 doesn't trivially satisfy `% N == 0`.
+    """
+    if (
+        in_window_checkpoint_frequency > 0
+        and train_batch_idx % in_window_checkpoint_frequency == 0
+    ):
+        return "in_window_batch"
+    if (
+        checkpoint_step_frequency > 0
+        and global_step > 0
+        and global_step % checkpoint_step_frequency == 0
+    ):
+        return "global_step"
+    if (
+        checkpoint_time_interval_s > 0
+        and elapsed_since_last_save >= checkpoint_time_interval_s
+    ):
+        return "time_interval"
+    return None
+
+
+def _validate_split_contract(
+    saved: Optional[Dict[str, Any]],
+    live: Dict[str, Any],
+    rank: int,
+) -> None:
+    """Guarantee the train:eval split (and the inputs the resume skip-offset
+    depends on) are unchanged across a crash/resume.
+
+    `saved` is the contract recovered from the checkpoint (None on cold start or
+    legacy pre-holdout checkpoints). Any mismatch is fatal: continuing would
+    either desync the mid-window skip (duplicate/skip batches) or reassign users
+    so that previously held-out eval users get trained (leakage). Set
+    ALLOW_SPLIT_MISMATCH=1 to override (e.g. intentionally resuming a legacy
+    checkpoint into a holdout run, accepting the risk).
+    """
+    allow = os.environ.get("ALLOW_SPLIT_MISMATCH", "0") == "1"
+    if saved is None:
+        # Legacy / cold-start checkpoint with no recorded contract. Only a
+        # problem if this run actually holds users out (tsp < 1.0): we cannot
+        # prove the earlier run used the same split.
+        if live.get("train_split_percentage", 1.0) < 1.0 and not allow:
+            raise RuntimeError(
+                "Resuming a checkpoint with NO saved split contract into a "
+                f"user-holdout run (train_split_percentage="
+                f"{live['train_split_percentage']}). The earlier run's split "
+                "cannot be verified, so held-out eval users may have been "
+                "trained. Set ALLOW_SPLIT_MISMATCH=1 to override."
+            )
+        return
+    mismatches = {
+        k: (saved.get(k), live.get(k))
+        for k in live
+        if saved.get(k) != live.get(k)
+    }
+    if mismatches:
+        msg = (
+            "Split/resume contract mismatch between checkpoint and current run: "
+            + ", ".join(
+                f"{k}: checkpoint={s!r} current={c!r}" for k, (s, c) in mismatches.items()
+            )
+            + ". Resuming would desync the skip offset and/or leak held-out "
+            "users into training."
+        )
+        if allow:
+            if rank == 0:
+                logger.warning("%s ALLOW_SPLIT_MISMATCH=1 set — continuing anyway.", msg)
+        else:
+            raise RuntimeError(msg + " Set ALLOW_SPLIT_MISMATCH=1 to override.")
+    elif rank == 0:
+        logger.info("Split/resume contract verified against checkpoint: %s", live)
+
+
+@gin.configurable
+def streaming_train_eval_loop(
+    rank: int,
+    model: torch.nn.Module,
+    optimizer: Optimizer,
+    metric_logger: MetricsLogger,
+    device: torch.device,
+    num_train_ts: int,
+    hstu_config: DlrmHSTUConfig,
+    embedding_table_configs: Dict[str, EmbeddingConfig],
+    num_train_batches: Optional[int] = None,
+    num_eval_batches: Optional[int] = None,
+    output_trace: bool = False,
+    metric_log_frequency: int = 1,
+    # MLPerf `train_loss` event cadence, in global train steps, INDEPENDENT of
+    # metric_log_frequency (the console/TB cadence). 0 = fall back to
+    # metric_log_frequency (preserves prior coupled behavior). Wired to
+    # $MLPERF_TRAIN_LOSS_LOG_FREQ via gin.
+    mlperf_train_loss_log_frequency: int = 0,
+    checkpoint_frequency: int = 100,
+    start_ts: int = 0,
+    persistent_loader: bool = False,
+    eval_every_n_windows: int = 1,
+    # Data-fraction eval cadence (mutually exclusive with eval_every_n_windows).
+    # >0 = eval every this FRACTION of the run's total training data, converted
+    # once to a global train-step interval. 0.0 = OFF (use the per-window
+    # cadence). Wired to $EVAL_EVERY_DATA_PCT via gin.
+    eval_every_data_pct: float = 0.0,
+    double_buffer: bool = False,
+    # --- fixed user-holdout eval set ---
+    # Window range the fixed eval set is drawn from. None -> default to
+    # original_end_ts (start_ts + num_train_ts), the window just past training.
+    eval_holdout_ts: Optional[int] = None,
+    eval_holdout_num_windows: int = 1,
+    # --- resume / mid-window-exact-once knobs ---
+    resume_train_ts: Optional[int] = None,
+    resume_batch_idx_in_window: int = WINDOW_COMPLETE,
+    # Split contract recovered from the checkpoint (None on cold start or
+    # legacy checkpoints). Validated below against the live split so a resumed
+    # run cannot silently train a different user-split (would leak).
+    resume_split_contract: Optional[Dict[str, Any]] = None,
+    # True iff no checkpoint was loaded (genuine fresh run). Distinguishes a
+    # cold start (safe to establish a new split) from a resume that merely lacks
+    # a contract (legacy/non-streaming checkpoint), which the guard must reject.
+    resume_cold_start: bool = False,
+    in_window_checkpoint_frequency: int = 0,
+    # --- global step / wall-clock checkpoint cadences ---
+    checkpoint_step_frequency: int = 0,
+    checkpoint_time_interval_s: float = 0.0,
+    # --- gradient clipping (streaming path, dense params). 0.0 = OFF, which
+    #     preserves legacy streaming behavior. Wired to $GRAD_CLIP_NORM via gin. ---
+    grad_clip_norm: float = 0.0,
+    # --- diagnostic: log per-batch unique/total embedding-id counts ---
+    streaming_diag_unique_emb: bool = False,
+    # --- test-only failure injection knob ---
+    die_at_step: int = -1,
+    # MLPerf logger (rank-0-gated); None disables all MLPerf event emission.
+    mlperf_logger: Optional[Any] = None,
+) -> None:
+    """Streaming train+eval loop with per-window (and optionally mid-window)
+    checkpoints.
+
+    Resume semantics (set by train_ranker after `load_dmp_checkpoint` returns):
+      - resume_train_ts=None: cold start; honor `start_ts` as-is.
+      - resume_train_ts=N, resume_batch_idx_in_window=WINDOW_COMPLETE(-1):
+        previous run finished window N cleanly. Start at N+1 from sample 0.
+      - resume_train_ts=N, resume_batch_idx_in_window=K (K>=0): previous run
+        crashed mid-window after K completed batches. Re-enter window N and
+        skip the first K batches of THIS rank's per-rank sample list (deterministic
+        slice since `window_indices(N)` is a pure function of the anchor_ts cache).
+
+    Checkpoint cadences (all independent; any combination may be enabled):
+      - `checkpoint_frequency`: window-granularity. End-of-window save every
+        Nth train_ts (and always on the final window). Uses WINDOW_COMPLETE.
+      - `in_window_checkpoint_frequency`: per-window-local batch count. Fires
+        every N batches *within* a window (counter resets each window).
+      - `checkpoint_step_frequency`: global-step granularity. Fires whenever
+        the monotonic `metric_logger.global_step['train']` hits a multiple of
+        N — i.e. a true "every 1000 steps" trigger that spans windows and
+        survives resume (global_step is restored from the checkpoint).
+      - `checkpoint_time_interval_s`: wall-clock granularity. Fires when at
+        least this many seconds have elapsed since the last save (e.g. 3600
+        for hourly). Rank 0 owns the clock and broadcasts the decision so all
+        ranks save together (avoids the collective barrier in
+        `save_dmp_checkpoint` deadlocking on a split decision).
+
+    All in-window triggers (`in_window_checkpoint_frequency`,
+    `checkpoint_step_frequency`, `checkpoint_time_interval_s`) route through
+    `_save_mid_window`, which stamps `batch_idx_in_window=K` so a crash leaves
+    a resumable partial-window checkpoint. End-of-window saves
+    (`checkpoint_frequency`) always use the WINDOW_COMPLETE sentinel. 0 / 0.0
+    disables a given cadence (the default for all three fine-grained ones).
+
+    `die_at_step` is a test-only hook: when `metric_logger.global_step['train']`
+    reaches this value, the process exits with code 42 right after the in-window
+    save fires. Used by the failure-injection test to crash at a deterministic
+    boundary and then resume.
+    """
+    # Exactly one eval cadence may be active. eval_every_n_windows defaults to 1
+    # (eval every window), so enabling the data-fraction cadence REQUIRES
+    # explicitly disabling the per-window one (EVAL_EVERY_N_WINDOWS=0). Fail fast
+    # on a contradictory config rather than silently picking one.
+    if (eval_every_data_pct and eval_every_data_pct > 0) and eval_every_n_windows > 0:
+        raise ValueError(
+            "Conflicting eval cadences: eval_every_data_pct="
+            f"{eval_every_data_pct} (>0) AND eval_every_n_windows="
+            f"{eval_every_n_windows} (>0). They are mutually exclusive. To use "
+            "the data-fraction cadence set EVAL_EVERY_N_WINDOWS=0; to use the "
+            "per-window cadence set EVAL_EVERY_DATA_PCT=0."
+        )
+    # MLPerf train_loss cadence: independent of metric_log_frequency. 0 (the
+    # env-binding default) falls back to metric_log_frequency so unset behavior
+    # matches the prior coupled implementation.
+    mlperf_loss_every = (
+        mlperf_train_loss_log_frequency
+        if mlperf_train_loss_log_frequency and mlperf_train_loss_log_frequency > 0
+        else metric_log_frequency
+    )
+    profiler = Profiler(rank) if output_trace else None
+    # Normalize the per-window caps: <=0 (the env-binding default) means "no cap
+    # = consume the full window". The eval-break check below is `is not None and
+    # eval_batch_idx >= num_eval_batches`, so a literal 0 would (wrongly) break
+    # after the first batch — map it to None instead for the full-holdout eval.
+    if num_eval_batches is not None and num_eval_batches <= 0:
+        num_eval_batches = None
+    if num_train_batches is not None and num_train_batches <= 0:
+        num_train_batches = None
+    dataset_class, kwargs = get_dataset()
+    kwargs["embedding_config"] = embedding_table_configs
+    dataset = HammerToTorchDataset(
+        dataset=dataset_class(hstu_config=hstu_config, is_inference=False, **kwargs)
+    )
+    # Persistent path: build ONE DataLoader + a stateful sampler whose indices
+    # are swapped per window, so workers fork once and are reused across all
+    # windows (eliminates the per-window dataloader respawn + first-batch
+    # warmup). The non-persistent path recreates a DataLoader per window.
+    window_sampler: Optional[StreamingWindowSampler] = None
+    persistent_dl: Optional[DataLoader] = None
+    world_size = (
+        torch.distributed.get_world_size()
+        if torch.distributed.is_initialized()
+        else 1
+    )
+    if persistent_loader:
+        window_sampler = StreamingWindowSampler(rank=rank, world_size=world_size)
+        persistent_dl = make_persistent_streaming_dataloader(
+            dataset=dataset, sampler=window_sampler
+        )
+
+    # The fixed user-holdout eval is yambda-specific (needs window_indices +
+    # the split API). Other streaming datasets (synthetic) keep the legacy
+    # per-window eval. Detect support once.
+    supports_holdout = hasattr(dataset.dataset, "eval_holdout_indices")
+
+    # Fixed eval-holdout window range. Captured from the REQUESTED (start_ts,
+    # num_train_ts) BEFORE the resume block mutates them, so it is identical on
+    # cold start and on every resume (the supervisor relaunches with the same
+    # START_TS / NUM_TRAIN_TS). Defaults to the window just past training.
+    requested_end_ts = start_ts + num_train_ts
+    # Eval-cadence anchor: the ORIGINAL requested start_ts, captured BEFORE the
+    # resume block rebases start_ts. `_should_eval` keys the every-N-windows
+    # cadence off the absolute window ts relative to THIS anchor, so the eval
+    # grid (e.g. 150,160,170,...) is identical on cold start and on every resume.
+    # (Keying off the per-call loop index instead would re-anchor the grid to
+    # whatever window a mid-run resume happens to restart from.)
+    eval_anchor_ts = start_ts
+    # None (Python default) or <0 (the env-binding default) both mean "use the
+    # window just past training", which is stable across resume.
+    eval_holdout_ts_resolved = (
+        eval_holdout_ts
+        if (eval_holdout_ts is not None and eval_holdout_ts >= 0)
+        else requested_end_ts
+    )
+
+    # Data-fraction eval cadence: convert eval_every_data_pct into a global
+    # train-step interval ONCE, over the ORIGINAL requested window range
+    # [eval_anchor_ts, requested_end_ts). Keying the later trigger off
+    # `global_step % eval_interval_steps` (global_step is monotonic and
+    # checkpoint-restored) makes the eval grid identical on cold start and on
+    # every resume, exactly like checkpoint_step_frequency. 0 => disabled.
+    eval_interval_steps = 0
+    if eval_every_data_pct and eval_every_data_pct > 0:
+        # Per-rank batch size: the persistent loader carries it directly; the
+        # per-window path uses the same gin %batch_size (env BATCH_SIZE,
+        # default 1024 — matches make_streaming_dataloader.batch_size).
+        bs = (
+            persistent_dl.batch_size
+            if persistent_dl is not None
+            else int(os.environ.get("BATCH_SIZE", "1024"))
+        )
+        if hasattr(dataset.dataset, "total_train_anchors"):
+            total_train_anchors = dataset.dataset.total_train_anchors(  # pyre-ignore[16]
+                eval_anchor_ts, requested_end_ts - eval_anchor_ts
+            )
+            total_train_steps = total_train_anchors // max(1, bs * world_size)
+            eval_interval_steps = max(
+                1, round(eval_every_data_pct * total_train_steps)
+            )
+            if rank == 0:
+                logger.info(
+                    "[data-pct-eval] eval_every_data_pct=%.6g -> "
+                    "eval_interval_steps=%d (total_train_anchors=%d bs=%d "
+                    "world_size=%d total_train_steps=%d over windows [%d, %d))",
+                    eval_every_data_pct,
+                    eval_interval_steps,
+                    total_train_anchors,
+                    bs,
+                    world_size,
+                    total_train_steps,
+                    eval_anchor_ts,
+                    requested_end_ts,
+                )
+        elif rank == 0:
+            logger.warning(
+                "[data-pct-eval] dataset %s has no total_train_anchors(); "
+                "data-fraction eval is DISABLED (no per-window eval either, "
+                "since EVAL_EVERY_N_WINDOWS must be 0 to reach here) — only the "
+                "final eval will run.",
+                type(dataset.dataset).__name__,
+            )
+
+    # The split is an immutable run contract: a silent change across resume
+    # would both desync the mid-window skip offset AND turn held-out eval users
+    # into trained users (leakage). Build the live contract and validate the
+    # one recovered from the checkpoint against it; abort on any mismatch unless
+    # ALLOW_SPLIT_MISMATCH=1 is set (e.g. deliberately resuming a legacy run).
+    live_split_contract: Optional[Dict[str, Any]] = None
+    if supports_holdout:
+        live_split_contract = {
+            "train_split_percentage": dataset.dataset._train_split_percentage,  # pyre-ignore[16]
+            "split_salt": dataset.dataset._split_salt,  # pyre-ignore[16]
+            "eval_holdout_ts": eval_holdout_ts_resolved,
+            "eval_holdout_num_windows": eval_holdout_num_windows,
+            "batch_size": persistent_dl.batch_size if persistent_dl is not None else None,
+            "world_size": world_size,
+        }
+        # Only validate on an actual resume. On a genuine cold start there is no
+        # prior split to verify and establishing this run's split is always safe;
+        # validating there would wrongly reject every fresh holdout run. A resume
+        # that lacks a contract (legacy/non-streaming checkpoint) is NOT a cold
+        # start and is still validated (and rejected) below.
+        if not resume_cold_start:
+            _validate_split_contract(resume_split_contract, live_split_contract, rank)
+
+    # Apply resume hint: advance start_ts past the last completed window, or
+    # re-enter the partial window with a per-rank skip on its first iter.
+    # Shrink num_train_ts by the same amount so the resumed run finishes at
+    # the same final timestamp (start_ts + num_train_ts) as a fresh run would
+    # — i.e. resumed and uninterrupted produce identical total work.
+    first_skip_samples = 0
+    if resume_train_ts is not None:
+        original_end_ts = start_ts + num_train_ts
+        if resume_batch_idx_in_window == WINDOW_COMPLETE:
+            new_start = resume_train_ts + 1
+            if rank == 0:
+                logger.info(
+                    "Resuming from completed train_ts=%d → start_ts=%d "
+                    "(num_train_ts %d → %d)",
+                    resume_train_ts, new_start,
+                    num_train_ts, max(0, original_end_ts - new_start),
+                )
+            start_ts = new_start
+        else:
+            if rank == 0:
+                logger.info(
+                    "Resuming mid-window at train_ts=%d batch_idx_in_window=%d "
+                    "(skipping batches already trained)",
+                    resume_train_ts,
+                    resume_batch_idx_in_window,
+                )
+            start_ts = resume_train_ts
+            # `batch_size` is per-rank from the persistent dataloader (set via
+            # gin `make_persistent_streaming_dataloader.batch_size`). The
+            # skip-samples-per-rank below maps "K batches done" → "K * bs
+            # samples in this rank's index list", since each batch draws bs
+            # samples from this rank's deterministic round-robin slice.
+            assert persistent_dl is not None, (
+                "Mid-window resume requires persistent_loader=True"
+            )
+            first_skip_samples = resume_batch_idx_in_window * persistent_dl.batch_size
+        num_train_ts = max(0, original_end_ts - start_ts)
+        if num_train_ts == 0 and rank == 0:
+            logger.info(
+                "Resume target already reached (end_ts=%d, start_ts=%d) — "
+                "no further training windows; skipping straight to final eval.",
+                original_end_ts, start_ts,
+            )
+
+    if rank == 0:
+        logger.info(
+            "[grad-clip] streaming path gradient clipping %s (max_norm=%.4g via $GRAD_CLIP_NORM)",
+            "ENABLED" if (grad_clip_norm and grad_clip_norm > 0) else "OFF",
+            grad_clip_norm,
+        )
+
+    def _window_iter(ts: int, skip_samples: int = 0):
+        # TRAIN-only iterator: both branches exclude held-out eval users via
+        # train_window_indices / set_ts(train_only=True). (Eval uses the fixed
+        # holdout set, never this helper.)
+        if persistent_loader:
+            assert window_sampler is not None and persistent_dl is not None
+            window_sampler.set_window(
+                dataset.dataset.train_window_indices(ts),  # pyre-ignore [16]
+                skip_samples=skip_samples,
+            )
+            return iter(persistent_dl)
+        if skip_samples != 0:
+            raise NotImplementedError(
+                "skip_samples>0 requires persistent_loader=True"
+            )
+        return iter(
+            make_streaming_dataloader(dataset=dataset, ts=ts, train_only=True)
+        )
+    # Windows are [start_ts, start_ts + num_train_ts); each step trains window T
+    # then evals window T+1, so the last eval window is start_ts + num_train_ts,
+    # which must be < num_windows(). Anchors require >= history_length prior
+    # events, so the earliest windows are near-empty warm-up — use start_ts to
+    # begin at a dense window. Clamp instead of failing.
+    if hasattr(dataset.dataset, "num_windows"):
+        available = dataset.dataset.num_windows()  # pyre-ignore [16]
+        max_count = max(0, available - 1 - start_ts)
+        if num_train_ts > max_count:
+            logger.warning(
+                f"start_ts={start_ts} + num_train_ts={num_train_ts} exceeds "
+                f"available windows ({available}); clamping num_train_ts to {max_count}."
+            )
+            num_train_ts = max_count
+    # Wall-clock anchor for time-based checkpointing. Mutable single-element
+    # list so the nested train loop can reset it after each save. Starts at
+    # loop entry so the first time-trigger fires ~interval seconds in.
+    last_ckpt_time = [time.time()]
+
+    def _broadcast_elapsed() -> float:
+        """Seconds since the last save, owned by rank 0 and broadcast to all
+        ranks. save_dmp_checkpoint runs a collective barrier, so every rank must
+        feed the same wall-clock value into the cadence decision — otherwise a
+        split verdict (rank 0 saves, rank 1 doesn't) would deadlock. Broadcasting
+        rank 0's elapsed keeps the (pure) decision identical everywhere."""
+        elapsed = time.time() - last_ckpt_time[0]
+        if torch.distributed.is_initialized() and world_size > 1:
+            t = torch.tensor([elapsed], device=device, dtype=torch.float64)
+            torch.distributed.broadcast(t, src=0)
+            elapsed = float(t.item())
+        return elapsed
+
+    def _save_mid_window(train_ts: int, batch_idx_in_window: int) -> None:
+        """In-window checkpoint helper. Snapshots the same state as the
+        end-of-window save but stamps `batch_idx_in_window=K` instead of
+        WINDOW_COMPLETE so the resume path knows to skip K batches.
+        Uses train_ts as the numeric subdir name — every save into the same
+        train_ts overwrites the previous in-window snapshot (via atomic
+        replace), so disk stays bounded to keep_last_n train_ts dirs."""
+        save_dmp_checkpoint(
+            model=model,
+            optimizer=optimizer,
+            metric_logger=metric_logger,
+            rank=rank,
+            batch_idx=train_ts,
+            train_ts=train_ts,
+            batch_idx_in_window=batch_idx_in_window,
+            device=device,
+            split_contract=live_split_contract,
+        )
+
+    def _run_train_window(
+        train_data_iterator,
+        train_ts: int,
+        start_batch_idx: int = 0,
+        label: Optional[str] = None,
+        do_eval: Optional[Callable[[int, int], None]] = None,
+    ) -> None:
+        # `start_batch_idx` is set when we're re-entering a window that was
+        # interrupted mid-way (in_window resume); the dataloader iterator was
+        # already advanced past those batches via the sampler skip, and we
+        # account for them in the local counter so in-window saves and the
+        # die_at_step hook fire at the right relative offsets.
+        train_batch_idx = start_batch_idx
+        first_wait: Optional[float] = None
+        while True:
+            model.train()
+            _t_next = time.perf_counter() if (label and rank == 0) else None
+            try:
+                sample = next(train_data_iterator)
+            except StopIteration:
+                break
+            if _t_next is not None and first_wait is None:
+                first_wait = time.perf_counter() - _t_next
+            if streaming_diag_unique_emb and train_batch_idx < int(
+                os.environ.get("DIAG_EMB_STEPS", "100")
+            ):
+                _log_unique_embedding_diag(
+                    sample,
+                    rank,
+                    train_batch_idx,
+                    max_steps=int(os.environ.get("DIAG_EMB_STEPS", "100")),
+                    log_every=metric_log_frequency,
+                )
+            optimizer.zero_grad()
+            sample.to(device)
+            (
+                _,
+                _,
+                aux_losses,
+                mt_target_preds,
+                mt_target_labels,
+                mt_target_weights,
+            ) = model.forward(
+                sample.uih_features_kjt,
+                sample.candidates_features_kjt,
+            )
+            # pyre-ignore
+            sum(aux_losses.values()).backward()
+            # Gradient clipping for the streaming path. Clips dense params (the
+            # sparse embedding tables use a fused optimizer and are unaffected,
+            # same as the non-streaming path's clip_grad_norm_). OFF by default
+            # (grad_clip_norm=0.0 via $GRAD_CLIP_NORM) so legacy streaming runs
+            # are byte-for-byte unchanged; set >0 to enable.
+            if grad_clip_norm and grad_clip_norm > 0:
+                torch.nn.utils.clip_grad_norm_(
+                    model.parameters(), max_norm=grad_clip_norm
+                )
+            optimizer.step()
+            metric_logger.update(
+                mode="train",
+                predictions=mt_target_preds,
+                labels=mt_target_labels,
+                weights=mt_target_weights,
+                num_candidates=sample.candidates_features_kjt.lengths().view(
+                    len(sample.candidates_features_kjt.keys()), -1
+                )[0],
+            )
+            # MLPerf train_loss event on its own cadence (decoupled from the
+            # console/TB metric cadence below). Called every step; the cross-rank
+            # all-reduce only fires on the cadence, gated by the rank-identical
+            # global_step inside the method, so it stays in lockstep.
+            metric_logger.maybe_log_mlperf_train_loss(aux_losses, every=mlperf_loss_every)
+            if train_batch_idx % metric_log_frequency == 0:
+                metric_logger.compute_and_log(
+                    mode="train",
+                    additional_logs={
+                        "losses": aux_losses,
+                    },
+                )
+            train_batch_idx += 1
+            if output_trace:
+                assert profiler is not None
+                profiler.step()
+            # Fine-grained in-window checkpoint triggers. All stamp
+            # batch_idx_in_window so a crash here leaves a resumable partial
+            # checkpoint, and all fire AFTER the metric update so restored
+            # state reflects the just-completed batch. Triggers are mutually
+            # short-circuited (one save per batch max) but evaluated on the
+            # same deterministic counters across all ranks, so the collective
+            # inside save_dmp_checkpoint stays in lockstep.
+            gstep = metric_logger.global_step["train"]
+            # Wall-clock elapsed is broadcast from rank 0 so every rank feeds
+            # the same value into the (otherwise pure) cadence decision.
+            elapsed = (
+                _broadcast_elapsed() if checkpoint_time_interval_s > 0 else 0.0
+            )
+            save_reason = select_in_window_checkpoint_reason(
+                train_batch_idx=train_batch_idx,
+                global_step=gstep,
+                elapsed_since_last_save=elapsed,
+                in_window_checkpoint_frequency=in_window_checkpoint_frequency,
+                checkpoint_step_frequency=checkpoint_step_frequency,
+                checkpoint_time_interval_s=checkpoint_time_interval_s,
+            )
+            if save_reason is not None:
+                if rank == 0:
+                    logger.info(
+                        "checkpoint trigger=%s train_ts=%d batch=%d global_step=%d",
+                        save_reason,
+                        train_ts,
+                        train_batch_idx,
+                        gstep,
+                    )
+                _save_mid_window(train_ts, train_batch_idx)
+                # Reset the wall-clock anchor on ANY save so the next time
+                # trigger is measured from the most recent checkpoint.
+                last_ckpt_time[0] = time.time()
+            # Data-fraction eval cadence: run the full-holdout eval whenever the
+            # monotonic global step crosses a multiple of eval_interval_steps
+            # (i.e. every eval_every_data_pct of the training data). Keyed off
+            # global_step (checkpoint-restored) so the eval grid is identical
+            # across resume. Mid-window-safe: eval sets model.eval(), so restore
+            # train mode + dataset.is_eval afterward. do_eval is None unless the
+            # data-pct cadence is enabled.
+            if (
+                do_eval is not None
+                and eval_interval_steps > 0
+                and gstep > 0
+                and gstep % eval_interval_steps == 0
+            ):
+                if rank == 0:
+                    logger.info(
+                        "[data-pct-eval] trigger eval train_ts=%d global_step=%d "
+                        "(interval=%d)",
+                        train_ts,
+                        gstep,
+                        eval_interval_steps,
+                    )
+                do_eval(train_ts, gstep)
+                model.train()
+                dataset.dataset.is_eval = False  # pyre-ignore [16]
+                # Data-fraction eval may hit the MLPerf target and emit RUN_STOP
+                # (via _do_eval_*). Stop the window immediately so we don't train
+                # past the convergence point; the outer window loop checks the
+                # same flag and breaks too.
+                if mlt.run_stopped:
+                    break
+            # Test-only: deterministic crash for the failure-injection test.
+            # Triggered AFTER the save above, so on resume we re-enter at
+            # batch_idx_in_window=train_batch_idx and emit batches [K+1, end).
+            if (
+                die_at_step >= 0
+                and metric_logger.global_step["train"] >= die_at_step
+            ):
+                if rank == 0:
+                    logger.warning(
+                        "die_at_step=%d hit at train_ts=%d batch=%d global_step=%d "
+                        "→ sys.exit(42)",
+                        die_at_step,
+                        train_ts,
+                        train_batch_idx,
+                        metric_logger.global_step["train"],
+                    )
+                # Distributed barrier so all ranks exit together rather than
+                # leaving a few ranks hanging on NCCL ops.
+                torch.distributed.barrier()
+                import sys
+                sys.exit(42)
+            # `num_train_batches` cap: None or 0 = run the whole window. >0 caps
+            # batches per window (mostly the streaming-resume test driver uses
+            # this to keep test windows short).
+            if num_train_batches and train_batch_idx >= num_train_batches:
+                break
+        if label and rank == 0 and first_wait is not None:
+            logger.info(
+                f"[boundary] {label} train first-batch data-wait={first_wait * 1000:.1f}ms"
+            )
+
+    def _run_eval_window(
+        eval_data_iterator, label: Optional[str] = None
+    ) -> Dict[str, float]:
+        # DO NOT add a checkpoint trigger anywhere inside this function. The eval
+        # data iterator's position is not serializable, so a checkpoint taken
+        # mid-eval could not be resumed deterministically. `_maybe_checkpoint`
+        # only fires after a completed eval window or mid-train-window, so any
+        # restored state always sits on a completed-eval boundary -- which is
+        # also why the eval reset below is safe across resume.
+        #
+        # Exclude this eval pass's wall-time from the train step-time window so
+        # step_ms stays canonical even when eval coincides with a train interval;
+        # the duration is reported separately (window_eval_time_ms + total_eval
+        # below). Resumed unconditionally at the end of this function.
+        metric_logger.pause_perf("eval")
+        model.eval()
+        # Reset eval metrics so each pass reports a clean number over the FIXED
+        # holdout set. Without this, lifetime/window eval metrics would keep
+        # accumulating across eval steps (the old behavior, made worse now that
+        # every step sees the identical set), making the eval-AUC trajectory
+        # uninterpretable. With the reset, each eval point == AUC over the whole
+        # fixed holdout at that train step -> directly comparable across steps.
+        metric_logger.reset(mode="eval")
+        eval_batch_idx = 0
+        first_wait: Optional[float] = None
+        _t_enter = time.perf_counter() if (label and rank == 0) else None
+        with torch.no_grad():
+            while True:
+                _t_next = time.perf_counter() if (label and rank == 0) else None
+                try:
+                    sample = next(eval_data_iterator)
+                except StopIteration:
+                    break
+                if _t_next is not None and first_wait is None:
+                    first_wait = time.perf_counter() - _t_next
+                sample.to(device)
+                (
+                    _,
+                    _,
+                    _,
+                    mt_target_preds,
+                    mt_target_labels,
+                    mt_target_weights,
+                ) = model.forward(
+                    sample.uih_features_kjt,
+                    sample.candidates_features_kjt,
+                )
+                metric_logger.update(
+                    mode="eval",
+                    predictions=mt_target_preds,
+                    labels=mt_target_labels,
+                    weights=mt_target_weights,
+                    num_candidates=sample.candidates_features_kjt.lengths().view(
+                        len(sample.candidates_features_kjt.keys()), -1
+                    )[0],
+                )
+                eval_batch_idx += 1
+                if output_trace:
+                    assert profiler is not None
+                    profiler.step()
+                if eval_batch_idx % metric_log_frequency == 0:
+                    metric_logger.compute_and_log(mode="eval")
+                if num_eval_batches is not None and eval_batch_idx >= num_eval_batches:
+                    break
+            _eval_metrics = metric_logger.compute(mode="eval")
+            for k, v in _eval_metrics.items():
+                print(f"{k}: {v}")
+        if label and rank == 0 and _t_enter is not None:
+            _eval_total = time.perf_counter() - _t_enter
+            _fw = (first_wait * 1000) if first_wait is not None else float("nan")
+            logger.info(
+                f"[boundary] {label} eval first-batch data-wait={_fw:.1f}ms "
+                f"total_eval={_eval_total * 1000:.1f}ms batches={eval_batch_idx}"
+            )
+            # Dedicated per-eval metrics sink. One JSON line per eval boundary
+            # capturing the END-OF-PASS metric over the FIXED holdout -- the single
+            # correct value for that eval point (no interim/averaging ambiguity).
+            # Rank 0 only; append-only so it survives restarts and the trajectory
+            # accumulates across resumes. Written next to the main run log
+            # ("<LOG>.metrics.jsonl"), falling back to cwd if LOG is unset.
+            import json
+            import re as _re
+
+            _log = os.environ.get("LOG")
+            if _log:
+                _base = _log[:-4] if _log.endswith(".log") else _log
+                _metrics_path = f"{_base}.metrics.jsonl"
+            else:
+                _metrics_path = "streaming_eval_metrics.jsonl"
+            _ts_m = _re.search(r"train_ts=(\d+)", label)
+            _rec = {
+                "label": label,
+                "train_ts": int(_ts_m.group(1)) if _ts_m else None,
+                "global_step": int(metric_logger.global_step.get("train", -1)),
+                "eval_batches": eval_batch_idx,
+                "total_eval_ms": round(_eval_total * 1000, 1),
+                "wall_time": time.time(),
+            }
+            for _k, _v in _eval_metrics.items():
+                try:
+                    _rec[_k] = float(_v)
+                except (TypeError, ValueError):
+                    pass
+            try:
+                with open(_metrics_path, "a") as _f:
+                    _f.write(json.dumps(_rec) + "\n")
+            except OSError as _e:
+                logger.warning("failed to write metrics sink %s: %s", _metrics_path, _e)
+        metric_logger.resume_perf("eval")
+        # Return metrics (on every rank) so the MLPerf eval hooks can consume them.
+        return _eval_metrics
+
+    def _maybe_checkpoint(train_ts: int) -> None:
+        if (
+            train_ts % checkpoint_frequency == 0 and train_ts > 0
+        ) or train_ts == start_ts + num_train_ts - 1:
+            # End-of-window save: stamp WINDOW_COMPLETE so resume advances past
+            # this train_ts. `device` enables per-rank RNG snapshot for
+            # bit-equal resume of dropout-bearing modules.
+            save_dmp_checkpoint(
+                model=model,
+                optimizer=optimizer,
+                metric_logger=metric_logger,
+                rank=rank,
+                batch_idx=train_ts,
+                train_ts=train_ts,
+                batch_idx_in_window=WINDOW_COMPLETE,
+                device=device,
+                split_contract=live_split_contract,
+            )
+            last_ckpt_time[0] = time.time()
+
+    # Apply start_ts shift from resume (may have moved past the original start).
+    # num_train_ts is the requested *count*; preserve it so the loop runs for
+    # the same total number of windows post-resume as a fresh run would have.
+    train_ts_list = list(range(start_ts, start_ts + num_train_ts))
+    n_train = len(train_ts_list)
+
+    def _should_eval(i: int) -> bool:
+        """Whether to run the full-holdout eval after training window index `i`.
+
+        Single cadence knob `eval_every_n_windows`:
+          * <=0 -> eval disabled entirely (train-only; e.g. perf benchmarking or
+            the resume test). The eval dataloader is not even built.
+          * 1 (default) -> eval after every window.
+          * K>1 -> eval when the ABSOLUTE window ts is on the grid anchored at
+            `eval_anchor_ts` (the original start_ts), i.e. ts in {anchor,
+            anchor+K, anchor+2K, ...}, and ALWAYS on the final window so the
+            trajectory ends with an eval point. Anchoring to the absolute ts
+            (not the per-call loop index `i`) keeps the eval grid (e.g.
+            150,160,170,...) stable across a mid-run resume, which rebases
+            start_ts/`train_ts_list` to the resume window.
+        """
+        if eval_every_n_windows <= 0:
+            return False
+        if eval_every_n_windows == 1:
+            return True
+        return (train_ts_list[i] - eval_anchor_ts) % eval_every_n_windows == 0 or i == n_train - 1
+
+    # Fixed eval set: held-out users' anchors over the resolved holdout window
+    # range, computed ONCE and reused at every eval step. Same anchors every
+    # step -> stable, comparable eval-AUC curve, and bounded eval time
+    # (~(1 - train_split_percentage) of a window). Cached inside the dataset so
+    # re-deriving it (e.g. on resume) returns the identical set. None for
+    # datasets without holdout support (synthetic) -> legacy per-window eval.
+    eval_global_indices: Optional["np.ndarray"] = None
+    if supports_holdout:
+        eval_global_indices = dataset.dataset.eval_holdout_indices(  # pyre-ignore [16]
+            eval_holdout_ts_resolved, eval_holdout_num_windows
+        )
+        if rank == 0:
+            logger.info(
+                "Fixed eval holdout: ts=[%d, %d) -> %d anchors (train_split_percentage=%s)",
+                eval_holdout_ts_resolved,
+                eval_holdout_ts_resolved + eval_holdout_num_windows,
+                len(eval_global_indices),
+                dataset.dataset._train_split_percentage,  # pyre-ignore[16]
+            )
+
+    # --- MLPerf run tracking --------------------------------------------------
+    # total_train_samples = epoch_num denominator (global trainable samples over
+    # the window range), computed once and logged as TRAIN_SAMPLES.
+    total_train_samples = 0
+    if mlperf_logger is not None:
+        _idx_fn = getattr(
+            dataset.dataset, "train_window_indices", None
+        ) or getattr(dataset.dataset, "window_indices", None)
+        if _idx_fn is not None:
+            for _ts in train_ts_list:
+                total_train_samples += int(_idx_fn(_ts).size)
+        if rank == 0:
+            logger.info(
+                "MLPerf: total_train_samples=%d over %d windows",
+                total_train_samples,
+                n_train,
+            )
+        # Wire the logger + LR getter so MetricsLogger.compute emits train_loss.
+        metric_logger.mlperf_logger = mlperf_logger
+
+        def _current_lr() -> float:
+            return float(optimizer.param_groups[0]["lr"])
+
+        metric_logger.lr_getter = _current_lr
+
+    # Centralized MLPerf run-boundary state machine: owns block/eval/run markers,
+    # SAMPLES_COUNT/EPOCH_NUM progress metadata, and the per-window-AUC vs
+    # auc_threshold convergence decision. Every method no-ops when mlperf_logger
+    # is None, so the loop below calls them unconditionally.
+    from generative_recommenders.dlrm_v3.train.mlperf_logging_utils import (
+        MLPerfRunTracker,
+    )
+
+    mlt = MLPerfRunTracker(
+        logger=mlperf_logger,
+        metric_logger=metric_logger,
+        total_train_samples=total_train_samples,
+        rank=rank,
+        device=device,
+    )
+    mlt.log_dataset_sizes(
+        eval_samples=eval_global_indices.size
+        if eval_global_indices is not None
+        else None
+    )
+
+    if persistent_loader and double_buffer:
+        # Double-buffered: next window prepared in the background during the
+        # current window's compute. Eval (if enabled) uses its own pre-forked
+        # pool, primed up front on the main thread so no fork races a bg thread.
+        prefetcher = _PrefetchingWindowLoader(
+            dataset=dataset,
+            sampler_factory=lambda: StreamingWindowSampler(rank, world_size),
+            dl_factory=lambda s: make_persistent_streaming_dataloader(
+                dataset=dataset, sampler=s
+            ),
+        )
+        eval_sampler: Optional[StreamingWindowSampler] = None
+        eval_dl: Optional[DataLoader] = None
+        # Eval iterator is built one window ahead: the eval pool (idle while the
+        # current train window runs) prefetches the next eval window's first
+        # batches concurrently with train compute, so eval starts warm. yambda's
+        # sample content depends only on the sampler window, not is_eval, so
+        # prefetching during train is safe.
+        eval_iter: Optional[Iterator] = None
+        # Build/fork the eval pool when EITHER cadence needs it: the per-window
+        # cadence (eval_every_n_windows>0) or the data-fraction cadence
+        # (eval_interval_steps>0). Both are never simultaneously on (validated
+        # at entry), so this is "eval is enabled at all".
+        if (eval_every_n_windows > 0 or eval_interval_steps > 0) and len(
+            train_ts_list
+        ) > 0:
+            eval_sampler = StreamingWindowSampler(rank, world_size)
+            eval_dl = make_persistent_streaming_dataloader(
+                dataset=dataset, sampler=eval_sampler
+            )
+            # CRITICAL: fork the eval worker pool HERE, on the main thread,
+            # BEFORE prefetcher.stream() below spins up its background prep
+            # thread. The pool is persistent_workers=True, so this first iter()
+            # is the ONLY fork; every later iter() merely resets and reuses these
+            # workers (no fork), so it can never deadlock against the background
+            # thread holding an allocator/GIL-released lock. (Deferring this
+            # first fork into the loop — as a sparse-eval cadence naively might —
+            # hangs the run.) _should_eval(0) is always True when eval is enabled
+            # (0 % K == 0). The eval set is the FIXED holdout (same every step),
+            # so we install it on the sampler ONCE here; later evals just call
+            # iter() again to replay the identical set (no set_window churn).
+            eval_sampler.set_window(eval_global_indices)
+            eval_iter = iter(eval_dl)
+
+        # Data-fraction eval callback (double-buffer path). Fired mid-window by
+        # _run_train_window on the global-step cadence. Reuses the already-forked
+        # persistent eval pool: iter(eval_dl) here runs on the MAIN thread (a
+        # reset, not a fork — the only fork was the up-front iter() above), so it
+        # stays safe alongside the background window-prefetch thread.
+        def _do_eval_db(train_ts: int, gstep: int) -> None:
+            # Data-fraction eval boundary: this closes the current MLPerf block,
+            # runs the holdout eval with full EVAL_START/EVAL_STOP + EVAL_ACCURACY
+            # + convergence, then opens the next block. The block thus brackets
+            # exactly one eval_interval_steps of training (MLPerf block == work
+            # between two evals), instead of one timestamp window.
+            dataset.dataset.is_eval = True  # pyre-ignore [16]
+            assert eval_dl is not None
+            mlt.block_stop()
+            mlt.eval_start()
+            eval_metrics = _run_eval_window(
+                iter(eval_dl),
+                label=f"eval_holdout@train_ts={train_ts}@step={gstep}",
+            )
+            # Emits RUN_STOP (sets mlt.run_stopped) if the target is met;
+            # _run_train_window / the window loop break on that flag.
+            mlt.eval_stop(eval_metrics)
+            if not mlt.run_stopped:
+                mlt.block_start()
+
+        _db_do_eval = _do_eval_db if eval_interval_steps > 0 else None
+
+        # Block placement depends on the eval cadence. Per-window cadence
+        # (eval_every_n_windows>0): one block per timestamp window. Otherwise
+        # (data-fraction cadence, or no eval): a single block spans the whole
+        # run, split at each data-fraction eval boundary by _do_eval_db. Open
+        # the first block here for the latter; the boundary helper + the
+        # post-loop stop handle the rest.
+        _per_window_blocks = eval_every_n_windows > 0
+        if not _per_window_blocks:
+            mlt.block_start()
+
+        for i, (train_ts, train_data_iterator) in enumerate(
+            # Only the FIRST window after a mid-window resume needs the skip
+            # (handed via prefetcher.stream's first_skip_samples). The skip is
+            # zero on cold start (resume_train_ts is None → first_skip_samples=0)
+            # and on completed-window resume (mid-window slice is 0 too).
+            prefetcher.stream(train_ts_list, first_skip_samples=first_skip_samples)
+        ):
+            dataset.dataset.is_eval = False  # pyre-ignore [16]
+            # First iteration after a mid-window resume carries
+            # resume_batch_idx_in_window so in-window saves and the die_at_step
+            # hook keep accurate counters; otherwise count from 0.
+            start_batch = (
+                resume_batch_idx_in_window
+                if i == 0 and resume_batch_idx_in_window > 0
+                else 0
+            )
+            if _per_window_blocks:
+                mlt.block_start()
+            _run_train_window(
+                train_data_iterator,
+                train_ts=train_ts,
+                start_batch_idx=start_batch,
+                label=f"train_ts={train_ts}",
+                do_eval=_db_do_eval,
+            )
+            if _per_window_blocks:
+                mlt.block_stop()
+            should_stop = False
+            if _should_eval(i):
+                dataset.dataset.is_eval = True  # pyre-ignore [16]
+                assert eval_sampler is not None and eval_dl is not None
+                mlt.eval_start()
+                eval_metrics = _run_eval_window(
+                    eval_iter, label=f"eval_holdout@train_ts={train_ts}"
+                )
+                should_stop = mlt.eval_stop(eval_metrics)
+                # Re-arm the (already-forked) eval pool for the NEXT eval. The
+                # holdout set is fixed, so the sampler window is unchanged; we
+                # only need a fresh iter() to replay it. iter() reuses the
+                # persistent workers — no fork, safe alongside the bg thread.
+                next_eval_i = next(
+                    (j for j in range(i + 1, n_train) if _should_eval(j)), None
+                )
+                if next_eval_i is not None:
+                    eval_iter = iter(eval_dl)
+            _maybe_checkpoint(train_ts)
+            # should_stop: per-window convergence. mlt.run_stopped:
+            # data-fraction convergence (RUN_STOP fired mid-window by _do_eval_db).
+            if should_stop or mlt.run_stopped:
+                # MLPerf target reached: RUN_STOP already emitted; stop training.
+                break
+
+        # Close the run-spanning block for the data-fraction / no-eval case.
+        # Idempotent: a no-op if the last eval boundary already closed it (i.e.
+        # convergence stopped the run) or if per-window blocks were used.
+        if not _per_window_blocks:
+            mlt.block_stop()
+    else:
+        # Data-fraction eval callback (non-double-buffer path). Builds a fresh
+        # eval dataloader per call over the FIXED holdout set (or the legacy
+        # next-window eval when the dataset has no holdout support).
+        def _do_eval_nb(train_ts: int, gstep: int) -> None:
+            # Data-fraction eval boundary (non-double-buffer path). See _do_eval_db:
+            # close the current MLPerf block, run the eval with full markers +
+            # convergence, then open the next block so a block brackets one
+            # eval_interval_steps of training rather than a timestamp window.
+            dataset.dataset.is_eval = True  # pyre-ignore [16]
+            mlt.block_stop()
+            mlt.eval_start()
+            if eval_global_indices is not None:
+                eval_metrics = _run_eval_window(
+                    iter(
+                        make_streaming_dataloader(
+                            dataset=dataset, indices=eval_global_indices
+                        )
+                    ),
+                    label=f"eval_holdout@train_ts={train_ts}@step={gstep}",
+                )
+            else:
+                eval_metrics = _run_eval_window(
+                    iter(make_streaming_dataloader(dataset=dataset, ts=train_ts + 1)),
+                    label=f"eval@train_ts={train_ts}@step={gstep}",
+                )
+            mlt.eval_stop(eval_metrics)
+            if not mlt.run_stopped:
+                mlt.block_start()
+
+        _nb_do_eval = _do_eval_nb if eval_interval_steps > 0 else None
+
+        # See the double-buffer branch: per-window blocks for the per-window
+        # cadence, else a single run-spanning block split at data-fraction eval
+        # boundaries by _do_eval_nb.
+        _per_window_blocks = eval_every_n_windows > 0
+        if not _per_window_blocks:
+            mlt.block_start()
+
+        for i, train_ts in enumerate(train_ts_list):
+            dataset.dataset.is_eval = False  # pyre-ignore [16]
+            skip = first_skip_samples if i == 0 else 0
+            start_batch = (
+                resume_batch_idx_in_window
+                if i == 0 and resume_batch_idx_in_window > 0
+                else 0
+            )
+            if _per_window_blocks:
+                mlt.block_start()
+            _run_train_window(
+                _window_iter(train_ts, skip_samples=skip),
+                train_ts=train_ts,
+                start_batch_idx=start_batch,
+                do_eval=_nb_do_eval,
+            )
+            if _per_window_blocks:
+                mlt.block_stop()
+            should_stop = False
+            if _should_eval(i):
+                dataset.dataset.is_eval = True  # pyre-ignore [16]
+                mlt.eval_start()
+                if eval_global_indices is not None:
+                    eval_metrics = _run_eval_window(
+                        iter(
+                            make_streaming_dataloader(
+                                dataset=dataset, indices=eval_global_indices
+                            )
+                        ),
+                        label=f"eval_holdout@train_ts={train_ts}",
+                    )
+                else:
+                    # Legacy per-window eval (datasets without user holdout).
+                    eval_metrics = _run_eval_window(
+                        iter(make_streaming_dataloader(dataset=dataset, ts=train_ts + 1))
+                    )
+                should_stop = mlt.eval_stop(eval_metrics)
+            _maybe_checkpoint(train_ts)
+            # should_stop: per-window convergence. mlt.run_stopped:
+            # data-fraction convergence (RUN_STOP fired mid-window by _do_eval_nb).
+            if should_stop or mlt.run_stopped:
+                # MLPerf target reached: RUN_STOP already emitted; stop training.
+                break
+
+        # Close the run-spanning block for the data-fraction / no-eval case
+        # (idempotent; no-op under per-window blocks or after a convergence stop).
+        if not _per_window_blocks:
+            mlt.block_stop()
+
+    # Final eval over the fixed user-holdout set (legacy final-window eval
+    # otherwise). Skipped if the MLPerf target already stopped the run mid-run.
+    if not mlt.run_stopped:
+        dataset.dataset.is_eval = True  # pyre-ignore [16]
+        mlt.eval_start()
+        if eval_global_indices is not None:
+            final_metrics = _run_eval_window(
+                iter(
+                    make_streaming_dataloader(
+                        dataset=dataset, indices=eval_global_indices
+                    )
+                ),
+                label="eval_holdout@final",
+            )
+        else:
+            final_metrics = _run_eval_window(
+                iter(make_streaming_dataloader(dataset=dataset, ts=num_train_ts)),
+                label="eval@final",
+            )
+        mlt.eval_stop(final_metrics)
+        if rank == 0:
+            for k, v in final_metrics.items():
+                print(f"{k}: {v}")
+        # End-of-run RUN_STOP: SUCCESS if final metric met target, else ABORTED.
+        mlt.finalize(final_metrics)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
new file mode 100644
index 000000000..ed456bde6
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
@@ -0,0 +1,1767 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+"""
+mlperf dlrm_v3 inference benchmarking tool.
+"""
+
+import contextlib
+import logging
+import os
+import time
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional
+
+import gin
+import tensorboard  # @manual=//tensorboard:lib  # noqa: F401 - required implicit dep when using torch.utils.tensorboard
+import torch
+from generative_recommenders.dlrm_v3.datasets.dataset import DLRMv3RandomDataset
+from generative_recommenders.dlrm_v3.datasets.kuairand import DLRMv3KuaiRandDataset
+from generative_recommenders.dlrm_v3.datasets.movie_lens import DLRMv3MovieLensDataset
+from generative_recommenders.dlrm_v3.datasets.synthetic_movie_lens import (
+    DLRMv3SyntheticMovieLensDataset,
+)
+from generative_recommenders.dlrm_v3.datasets.synthetic_streaming import (
+    DLRMv3SyntheticStreamingDataset,
+)
+from generative_recommenders.dlrm_v3.datasets.yambda import DLRMv3YambdaDataset
+from generative_recommenders.modules.multitask_module import (
+    MultitaskTaskType,
+    TaskConfig,
+)
+from torch.profiler import profile, profiler, ProfilerActivity  # pyre-ignore [21]
+from torch.utils.tensorboard import SummaryWriter
+from torchrec.metrics.accuracy import AccuracyMetricComputation
+from torchrec.metrics.auc import AUCMetricComputation, compute_auc
+from torchrec.metrics.gauc import GAUCMetricComputation
+from torchrec.metrics.mae import MAEMetricComputation
+from torchrec.metrics.metrics_namespace import MetricName, MetricPrefix
+from torchrec.metrics.mse import MSEMetricComputation
+from torchrec.metrics.ne import NEMetricComputation
+from torchrec.metrics.rec_metric import (
+    MetricComputationReport,
+    RecMetricComputation,
+)
+
+
+class LifetimeAUCMetricComputation(AUCMetricComputation):
+    """AUC over a 10M-sample (~5.5 eval-window) trailing buffer; emits with the
+    LIFETIME prefix.
+
+    NOTE: despite the name, this is NOT an uncapped since-step-0 AUC. The parent
+    ``AUCMetricComputation`` evicts the prediction/label/weight buffers down to
+    ``window_size`` in ``update()``; we instantiate it with
+    ``window_size=10_000_000``, so "lifetime" is a ~10M-sample trailing window.
+    Raise ``window_size`` (accepting unbounded buffer growth) if true cumulative
+    AUC is ever required.
+
+    Checkpoint correctness: torchrec registers the PREDICTIONS/LABELS/WEIGHTS
+    buffers with ``persistent=False`` (so the default ``state_dict()`` drops
+    them) and tracks a separate ``self._num_samples`` counter. Without the
+    overrides below, every checkpoint resume would silently restart this metric
+    from an empty buffer. We therefore serialize the buffers AND ``_num_samples``
+    explicitly; restoring ``_num_samples`` is mandatory, since leaving it at 0
+    makes the next ``update()`` take the init-sentinel branch and desync the
+    windowed eviction. These buffers are per-rank-local (cross-rank gather only
+    happens transiently at compute time), so the checkpoint layer MUST persist
+    and restore them per-rank — see ``checkpoint.py``.
+    """
+
+    # Prefix used for the explicitly-serialized non-persistent buffers so the
+    # keys can't collide with any persistent state the parent might register.
+    _LIFETIME_KEY_PREFIX: str = "_lifetime_"
+
+    def _compute(self) -> List[MetricComputationReport]:
+        from typing import cast as _cast
+        from torchrec.metrics.auc import LABELS, PREDICTIONS, WEIGHTS
+        return [
+            MetricComputationReport(
+                name=MetricName.AUC,
+                metric_prefix=MetricPrefix.LIFETIME,
+                value=compute_auc(
+                    self._n_tasks,
+                    _cast(List[torch.Tensor], getattr(self, PREDICTIONS)),
+                    _cast(List[torch.Tensor], getattr(self, LABELS)),
+                    _cast(List[torch.Tensor], getattr(self, WEIGHTS)),
+                    self._apply_bin,
+                ),
+            )
+        ]
+
+    def lifetime_sample_count(self) -> int:
+        """Current number of buffered samples (greppable for sanity logs)."""
+        return int(getattr(self, "_num_samples", 0))
+
+    def state_dict(
+        self,
+        destination: Optional[Dict[str, Any]] = None,
+        prefix: str = "",
+        keep_vars: bool = False,
+    ) -> Dict[str, Any]:
+        from torchrec.metrics.auc import LABELS, PREDICTIONS, WEIGHTS
+
+        destination = super().state_dict(
+            destination=destination, prefix=prefix, keep_vars=keep_vars
+        )
+        # The parent registers these buffers persistent=False, so they are absent
+        # from `destination`. Concatenate each buffer list to one (n_tasks, N)
+        # tensor and serialize it alongside the sample counter.
+        for attr in (PREDICTIONS, LABELS, WEIGHTS):
+            buf = getattr(self, attr)
+            if isinstance(buf, (list, tuple)) and len(buf) > 0:
+                flat = torch.cat([t for t in buf], dim=-1)
+            elif isinstance(buf, torch.Tensor):
+                flat = buf
+            else:
+                flat = torch.empty(0)
+            destination[prefix + self._LIFETIME_KEY_PREFIX + attr] = (
+                flat.detach().cpu().clone()
+            )
+        destination[prefix + self._LIFETIME_KEY_PREFIX + "num_samples"] = (
+            torch.tensor(int(getattr(self, "_num_samples", 0)), dtype=torch.long)
+        )
+        return destination
+
+    def load_state_dict(
+        self,
+        state_dict: Dict[str, Any],
+        strict: bool = True,
+    ) -> Any:
+        from torchrec.metrics.auc import LABELS, PREDICTIONS, WEIGHTS
+
+        # Copy so we can strip our custom keys before delegating to the parent
+        # (whose strict load would otherwise reject them as unexpected).
+        remaining = dict(state_dict)
+        saved_bufs: Dict[str, torch.Tensor] = {}
+        for attr in (PREDICTIONS, LABELS, WEIGHTS):
+            key = self._LIFETIME_KEY_PREFIX + attr
+            if key in remaining:
+                saved_bufs[attr] = remaining.pop(key)
+        num_key = self._LIFETIME_KEY_PREFIX + "num_samples"
+        saved_num = remaining.pop(num_key, None)
+
+        result = super().load_state_dict(remaining, strict=strict)
+
+        if saved_bufs:
+            # Device of the live (init-sentinel) buffers; keep restored buffers
+            # co-located so subsequent update()/compute() stay on-device.
+            existing = getattr(self, PREDICTIONS)
+            dev = (
+                existing[0].device
+                if isinstance(existing, (list, tuple)) and len(existing) > 0
+                else torch.device("cpu")
+            )
+            for attr, val in saved_bufs.items():
+                setattr(self, attr, [val.to(dev)])
+            if saved_num is not None:
+                self._num_samples = int(saved_num.item())
+        return result
+
+
+# Sentinel "window size" used for the FRESH eval metrics so torchrec's windowed
+# eviction never fires within a single eval pass (the per-pass reset bounds the
+# buffer to exactly one full holdout pass). 1<<60 is far above any realistic
+# per-rank sample count and avoids sys.maxsize overflow inside torchrec math.
+UNBOUNDED_WINDOW: int = 1 << 60
+
+
+class BinnedCumulativeAUC(RecMetricComputation):
+    """Cumulative AUC via a fixed-resolution score histogram (LIFETIME prefix).
+
+    Global AUC is a rank statistic, so it has no fixed-size additive sufficient
+    statistic the way NE/Accuracy do - exact cumulative AUC otherwise needs every
+    (score, label) pair retained and sorted (the buffer-based ``AUCMetricComputation``
+    / ``LifetimeAUCMetricComputation``). Instead we keep two weighted histograms of
+    positive/negative mass per score bin. This gives an AUC exact up to bin width
+    with O(num_bins) memory that does NOT grow with sample count, and - because
+    histograms are additive - cross-rank sync is a cheap all-reduce (dist_reduce_fx
+    "sum") rather than all-gathering millions of predictions. The state is truly
+    cumulative across all eval passes (never evicted, never reset on eval).
+
+    Predictions MUST be probabilities in [0, 1] (the same tensor feeds NE, which
+    requires probabilities; the model applies sigmoid in multitask_module). Values
+    are clamped into [0, 1] defensively.
+    """
+
+    def __init__(self, *args, num_bins: int = 100_000, **kwargs) -> None:
+        # window_size is irrelevant here (no windowed state); pass through.
+        super().__init__(*args, **kwargs)
+        self._num_bins: int = int(num_bins)
+        self._add_state(
+            "pos_hist",
+            torch.zeros((self._n_tasks, self._num_bins), dtype=torch.float64),
+            add_window_state=False,
+            dist_reduce_fx="sum",
+            persistent=True,
+        )
+        self._add_state(
+            "neg_hist",
+            torch.zeros((self._n_tasks, self._num_bins), dtype=torch.float64),
+            add_window_state=False,
+            dist_reduce_fx="sum",
+            persistent=True,
+        )
+
+    def cumulative_sample_count(self) -> int:
+        """Total weighted samples in the histograms (greppable for sanity logs)."""
+        return int((self.pos_hist.sum() + self.neg_hist.sum()).item())
+
+    def update(
+        self,
+        *,
+        predictions: Optional[torch.Tensor],
+        labels: torch.Tensor,
+        weights: Optional[torch.Tensor],
+        **kwargs: Dict[str, Any],
+    ) -> None:
+        if predictions is None or weights is None:
+            raise ValueError(
+                "BinnedCumulativeAUC.update requires predictions and weights"
+            )
+        preds = predictions.float().clamp_(0.0, 1.0)  # (n_tasks, n_examples)
+        labels = labels.float()
+        weights = weights.float()
+        # Bin index per example; the top edge (p==1.0) folds into the last bin.
+        idx = (preds * self._num_bins).long().clamp_(0, self._num_bins - 1)
+        pos_w = (weights * labels).to(self.pos_hist.dtype)
+        neg_w = (weights * (1.0 - labels)).to(self.neg_hist.dtype)
+        self.pos_hist.scatter_add_(1, idx, pos_w)
+        self.neg_hist.scatter_add_(1, idx, neg_w)
+
+    def _compute(self) -> List[MetricComputationReport]:
+        # By compute() time torchmetrics has all-reduced (summed) the histograms
+        # across ranks, so these are the global per-bin masses.
+        pos = self.pos_hist  # (n_tasks, num_bins)
+        neg = self.neg_hist
+        total_pos = pos.sum(dim=1)
+        total_neg = neg.sum(dim=1)
+        # Lower bin index == lower score. A positive in bin b outranks every
+        # negative in bins < b (exclusive prefix sum), and ties in bin b score
+        # 0.5. AUC = sum_b pos_b * (neg_below_b + 0.5*neg_b) / (P * N).
+        neg_below = torch.cumsum(neg, dim=1) - neg
+        numerator = (pos * (neg_below + 0.5 * neg)).sum(dim=1)
+        denom = total_pos * total_neg
+        auc = torch.where(
+            denom > 0,
+            numerator / denom,
+            torch.full_like(numerator, 0.5),
+        ).to(torch.float32)
+        return [
+            MetricComputationReport(
+                name=MetricName.AUC,
+                metric_prefix=MetricPrefix.LIFETIME,
+                value=auc,
+            )
+        ]
+
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("utils")
+
+
+def _trim_warmup_from_trace(path: str, keep_n_active: int) -> None:
+    """Post-process a chrome trace to drop events from WARMUP-phase steps.
+
+    torch.profiler captures events during BOTH the WARMUP and RECORD phases
+    of a schedule and writes them all to the exported trace. There is no
+    built-in flag to exclude WARMUP from the export. We approximate it by:
+
+      1) Finding all ``ProfilerStep#N`` spans in the file.
+      2) Keeping only the last ``keep_n_active`` of them (sorted by start
+         timestamp) as the "active" range.
+      3) Filtering ``traceEvents`` to events whose ``ts`` falls inside that
+         range. Metadata events (``ph='M'``) are always preserved.
+
+    Mutates the file in place.
+    """
+    import json as _json
+    with open(path) as f:
+        d = _json.load(f)
+    events = d.get("traceEvents", [])
+
+    # ProfilerStep spans mark training-step boundaries; we filter by their
+    # time ranges rather than by name index because step numbering can offset
+    # between schedule_fn argument and the value printed in the trace.
+    # torch.profiler emits one ProfilerStep#N span per CPU thread that ran
+    # during that step, so dedupe by name first so "5 active steps" means
+    # 5 distinct step numbers, not 5 spans.
+    name_to_span: Dict[str, tuple] = {}
+    for e in events:
+        nm = e.get("name", "")
+        if "ProfilerStep" not in nm or e.get("ph") != "X" or "ts" not in e:
+            continue
+        ts = e["ts"]
+        end = ts + e.get("dur", 0)
+        prev = name_to_span.get(nm)
+        if prev is None:
+            name_to_span[nm] = (ts, end)
+        else:
+            name_to_span[nm] = (min(prev[0], ts), max(prev[1], end))
+    if len(name_to_span) <= keep_n_active:
+        return
+    sorted_spans = sorted(name_to_span.values())
+    active = sorted_spans[-keep_n_active:]
+    t_start = min(s for s, _ in active)
+    t_end = max(e for _, e in active)
+
+    def _keep(e: dict) -> bool:
+        if e.get("ph") == "M":
+            return True
+        ts = e.get("ts")
+        if ts is None:
+            return True
+        return t_start <= ts < t_end
+
+    kept = [e for e in events if _keep(e)]
+    d["traceEvents"] = kept
+    with open(path, "w") as f:
+        _json.dump(d, f)
+    logger.warning(
+        f"Trimmed WARMUP events from {path}: {len(events):,} -> {len(kept):,} "
+        f"(kept active range [{t_start:.0f}, {t_end:.0f}] us)"
+    )
+
+
+# GPU activity categories used to detect GPU stream rows and their busy time.
+_GPU_KERNEL_CATS = frozenset({"kernel", "gpu_memcpy", "gpu_memset"})
+
+
+def _is_rocm() -> bool:
+    """True on ROCm/AMD builds (``torch.version.hip`` set), False on CUDA/B200.
+
+    The ProfilerStep-layout normalization and the sub-us kernel de-overlap are
+    workarounds for how roctracer projects annotations/kernels onto HIP streams;
+    CUDA/CUPTI traces don't have those artifacts, so these passes must be skipped
+    on NVIDIA to avoid touching otherwise-correct traces.
+    """
+    return getattr(torch.version, "hip", None) is not None
+
+
+def _normalize_profilerstep_layout(path: str) -> None:
+    """Collapse fragmented GPU-side ``ProfilerStep#N`` spans into one span/step.
+
+    ``torch.profiler`` emits ``ProfilerStep#N`` as a CPU ``user_annotation`` that
+    Kineto projects onto the GPU timeline as ``gpu_user_annotation`` spans. On
+    CUDA the blocking H2D copy shares the compute stream, so each step projects
+    onto a single GPU stream and renders as one full-width span. On ROCm a
+    blocking H2D copy lands on HIP's null stream (a different stream than the
+    non-null compute stream), so the step splits across two GPU rows and looks
+    truncated in Perfetto — a pure rendering artifact (every kernel is still
+    captured, and the underlying GPU is busy for the whole step).
+
+    This rewrites each per-step GPU ``ProfilerStep`` annotation to a single span
+    on the rank's busiest (compute) GPU stream, covering the kernel extent inside
+    that step's CPU window. Works on a raw per-rank trace (GPU streams are tids
+    under one pid) by keying the busiest stream on ``(pid, tid)``. No-op when the
+    annotation already lives on a single GPU stream (the CUDA case), so it is
+    safe to run on every platform. Mutates the file in place.
+    """
+    import json as _json
+
+    with open(path) as f:
+        d = _json.load(f)
+    events = d.get("traceEvents", [])
+
+    # Per (pid,tid) GPU busy time -> identify the busiest = compute stream.
+    stream_busy: Dict[tuple, int] = {}
+    for e in events:
+        if e.get("ph") == "X" and e.get("cat") in _GPU_KERNEL_CATS:
+            dur = e.get("dur", 0)
+            if dur > 0:
+                key = (e.get("pid"), e.get("tid"))
+                stream_busy[key] = stream_busy.get(key, 0) + dur
+    if not stream_busy:
+        return
+    busiest = max(stream_busy, key=lambda k: stream_busy[k])
+
+    # Existing GPU-side ProfilerStep spans and the streams they sit on.
+    gpu_ps_streams = set()
+    template = None
+    for e in events:
+        if e.get("cat") == "gpu_user_annotation" and str(
+            e.get("name", "")
+        ).startswith("ProfilerStep"):
+            gpu_ps_streams.add((e.get("pid"), e.get("tid")))
+            if template is None:
+                template = e
+    # No fragmentation (single stream or none) -> leave the trace untouched.
+    if len(gpu_ps_streams) <= 1:
+        return
+
+    # CPU ProfilerStep windows: step name -> [min ts, max end].
+    cpu_win: Dict[str, list] = {}
+    for e in events:
+        if (
+            e.get("cat") == "user_annotation"
+            and e.get("ph") == "X"
+            and str(e.get("name", "")).startswith("ProfilerStep")
+        ):
+            ts = e.get("ts", 0)
+            end = ts + e.get("dur", 0)
+            w = cpu_win.get(e["name"])
+            if w is None:
+                cpu_win[e["name"]] = [ts, end]
+            else:
+                w[0] = min(w[0], ts)
+                w[1] = max(w[1], end)
+
+    # GPU kernel extents (any stream) for clamping each step's span.
+    gpu_kernels = [
+        (e.get("ts", 0), e.get("ts", 0) + e.get("dur", 0))
+        for e in events
+        if e.get("ph") == "X"
+        and e.get("cat") in _GPU_KERNEL_CATS
+        and e.get("dur", 0) > 0
+    ]
+
+    new_spans = []
+    for sname, (cs, ce) in cpu_win.items():
+        ks = [(ts, end) for ts, end in gpu_kernels if end > cs and ts < ce]
+        if not ks:
+            continue
+        gmin = min(ts for ts, _ in ks)
+        gmax = max(end for _, end in ks)
+        span = dict(template) if template else {}
+        span.update(
+            {
+                "ph": "X",
+                "cat": "gpu_user_annotation",
+                "name": sname,
+                "pid": busiest[0],
+                "tid": busiest[1],
+                "ts": gmin,
+                "dur": gmax - gmin,
+                "args": {"normalized_profilerstep": True},
+            }
+        )
+        new_spans.append(span)
+
+    if not new_spans:
+        return
+
+    out = [
+        e
+        for e in events
+        if not (
+            e.get("cat") == "gpu_user_annotation"
+            and str(e.get("name", "")).startswith("ProfilerStep")
+        )
+    ]
+    dropped = len(events) - len(out)
+    out.extend(new_spans)
+    d["traceEvents"] = out
+    with open(path, "w") as f:
+        _json.dump(d, f)
+    logger.warning(
+        f"Normalized GPU ProfilerStep layout in {path}: dropped {dropped} "
+        f"fragmented span(s) across {len(gpu_ps_streams)} stream(s), wrote "
+        f"{len(new_spans)} span(s) on busiest stream pid={busiest[0]} "
+        f"tid={busiest[1]}"
+    )
+
+
+def _deoverlap_gpu_slices(path: str, max_snap_us: float = 5.0) -> None:
+    """Remove sub-microsecond kernel overlaps that break Perfetto's renderer.
+
+    Perfetto draws all ``ph=="X"`` slices on a single track (one ``(pid, tid)``)
+    as a strict nested stack ordered by start time: a slice that *opens* while a
+    previous slice on the same track is still open is treated as that slice's
+    child and is **clipped to the parent's end**. ROCm's roctracer reports
+    per-stream kernel timestamps at ns granularity, so two back-to-back kernels
+    on the same compute stream occasionally overlap by a fraction of a
+    microsecond (e.g. an 88 ns ``elementwise`` epilogue ending 0.075 us *after*
+    the next 21 ms ``_hstu_attn_bwd`` kernel begins). Perfetto then nests the
+    long kernel inside the tiny one and clips it to a sub-pixel sliver, so the
+    kernel "disappears" from the timeline even though it is fully present in the
+    JSON.
+
+    This pulls each slice's end back to just *before* the next slice's start
+    whenever they overlap by less than ``max_snap_us`` (a measurement artifact,
+    not real concurrency — kernels on one stream are serialized), leaving genuine
+    nesting (a small kernel fully contained in a larger one) untouched. The
+    adjustment is sub-microsecond and does not change any reported duration
+    meaningfully. Mutates the file in place; best-effort.
+
+    Critically, the slices are separated by a tiny ``_GAP_US`` (~1 ns) rather
+    than snapped to an *exactly equal* end==start timestamp. A coincident
+    end==start is just as fatal as an overlap in Perfetto: it nests the next
+    slice inside the previous one and clips it to zero width (this is the ~1 ns
+    gap that roctracer leaves between cleanly-rendered back-to-back kernels). So
+    we also fix exact-touch (``a_end == b.ts``) boundaries, not just overlaps.
+    """
+    import json as _json
+    from collections import defaultdict
+
+    # ~1 ns. Matches the natural inter-kernel gap roctracer leaves between
+    # back-to-back kernels that Perfetto already renders correctly. Must be
+    # strictly > 0 so end != start after the nudge.
+    _GAP_US = 0.001
+
+    with open(path) as f:
+        d = _json.load(f)
+    events = d.get("traceEvents", [])
+
+    tracks: Dict[tuple, list] = defaultdict(list)
+    for e in events:
+        if (
+            e.get("ph") == "X"
+            and e.get("cat") in _GPU_KERNEL_CATS
+            and e.get("dur", 0) > 0
+        ):
+            tracks[(e.get("pid"), e.get("tid"))].append(e)
+
+    snapped = 0
+    max_clip = 0.0
+    for sl in tracks.values():
+        # Sort by start, then longest-first so a container precedes the slices
+        # it nests; consecutive pairs are then either disjoint, properly nested,
+        # or a tiny artifact overlap.
+        sl.sort(key=lambda e: (e["ts"], -e["dur"]))
+        for i in range(len(sl) - 1):
+            a = sl[i]
+            b = sl[i + 1]
+            a_end = a["ts"] + a["dur"]
+            b_end = b["ts"] + b["dur"]
+            # Touching (a_end == b.ts) or partial overlap (a ends inside b) both
+            # break rendering; true containment (a_end >= b_end) is valid nesting
+            # and is left alone.
+            if b["ts"] <= a_end < b_end:
+                desired_end = b["ts"] - _GAP_US
+                clip = a_end - desired_end
+                if a["ts"] < desired_end and 0 < clip < max_snap_us:
+                    a["dur"] = desired_end - a["ts"]
+                    snapped += 1
+                    if clip > max_clip:
+                        max_clip = clip
+
+    if snapped:
+        with open(path, "w") as f:
+            _json.dump(d, f)
+        logger.warning(
+            f"De-overlapped GPU slices in {path}: snapped {snapped} sub-us "
+            f"overlap(s) (max {max_clip:.3f}us) so Perfetto renders every kernel"
+        )
+
+
+def _deoverlap_gpu_annotations(path: str, max_snap_us: float = 5.0) -> None:
+    """Separate touching/overlapping *sibling* GPU annotations so Perfetto draws
+    each one full width (the B200-style stacked layout).
+
+    Same root cause as :func:`_deoverlap_gpu_slices`, but at the annotation
+    boundary instead of the kernel boundary. The forward/backward phase
+    annotations Kineto projects onto the GPU stream (``## item_forward ##``,
+    ``## user_forward ##``, ``## multitask_module ##``, the ``## stu_* ##``
+    pairs, ...) are emitted as a chain of siblings laid end-to-end: each is meant
+    to end exactly where the next begins. Perfetto stores timestamps as int64 ns,
+    and the absolute step timestamps are ~5.4e12 us where a float64's quantum is
+    already ~1 ns, so a sibling boundary that should be coincident instead lands
+    a few ns off. When the earlier sibling's end falls *at or after* the next
+    sibling's start, Perfetto nests the next sibling inside it and clips it to a
+    sub-pixel sliver — so e.g. the 100+ ms ``## user_forward ##`` span vanishes on
+    some ranks/steps and renders on others purely by rounding luck.
+
+    Unlike kernels (all flat on one stream), annotations form a real nesting
+    hierarchy — ``## user_forward ##`` legitimately *contains* the ``## stu_* ##``
+    spans and their kernels — so this cannot blindly snap consecutive slices. It
+    walks the per-track slice stack (sorted by start, longest-first) and only
+    snaps a slice ``a`` back when the next slice ``b`` is **not** contained in it
+    (``b`` extends beyond ``a``'s end), i.e. they are siblings rather than
+    parent/child. Real containment is left untouched, and a snap is skipped if it
+    would clip into ``a``'s own descendants (kernels or child annotations).
+    Mutates the file in place; best-effort. Run after :func:`_deoverlap_gpu_slices`
+    so kernel boundaries are already clean.
+    """
+    import json as _json
+    from collections import defaultdict
+
+    # ~2 ns. The annotation boundaries sit at ~5.4e12 us where a float64's
+    # quantum is ~0.98 ns, so a 1 ns nudge can round back onto the neighbour's
+    # timestamp (an exact touch, which Perfetto still nests+clips). 2 ns (~2
+    # quanta) reliably separates them and is still far below any visible width.
+    _GAP_US = 0.002
+
+    with open(path) as f:
+        d = _json.load(f)
+    events = d.get("traceEvents", [])
+
+    # Stack the full per-track hierarchy over BOTH kernels and annotations so a
+    # parent annotation knows the extent of its descendants (the snap guard),
+    # but only annotation slices are ever trimmed.
+    _ANN = "gpu_user_annotation"
+    tracks: Dict[tuple, list] = defaultdict(list)
+    for e in events:
+        if (
+            e.get("ph") == "X"
+            and e.get("dur", 0) > 0
+            and (e.get("cat") in _GPU_KERNEL_CATS or e.get("cat") == _ANN)
+        ):
+            tracks[(e.get("pid"), e.get("tid"))].append(e)
+
+    snapped = 0
+    max_clip = 0.0
+    for sl in tracks.values():
+        # Longest-first on ties so a container precedes the slices it nests.
+        sl.sort(key=lambda e: (e["ts"], -e["dur"]))
+        # Each frame: [event, max_descendant_end]. The stack holds the chain of
+        # currently-open ancestors for the slice being placed.
+        stack: list = []
+        for b in sl:
+            b_ts = b["ts"]
+            b_end = b_ts + b["dur"]
+            while stack:
+                a = stack[-1][0]
+                a_end = a["ts"] + a["dur"]
+                if a_end < b_ts:
+                    # a closed strictly before b begins -> disjoint sibling, pop.
+                    frame = stack.pop()
+                    eff = frame[0]["ts"] + frame[0]["dur"]
+                    if stack:
+                        stack[-1][1] = max(stack[-1][1], eff, frame[1])
+                    continue
+                if a_end < b_end:
+                    # b starts at/inside a but extends past a's end => they are
+                    # siblings (not parent/child), and a's tail nests+clips b in
+                    # Perfetto. Snap a's end to just before b. This fires for both
+                    # annotation tails (## item_forward ## overhanging
+                    # ## user_forward ##) and kernel tails that straddle an
+                    # annotation boundary (a layer-norm kernel ending a few ns
+                    # past the start of the next phase span) -- both are sub-us
+                    # roctracer/rounding artifacts, since kernels on one stream
+                    # are serialized and phase spans are sequential.
+                    desired_end = b_ts - _GAP_US
+                    clip = a_end - desired_end
+                    # Guard: only snap when a's deepest descendant ends at or
+                    # before b's start. If a child (kernel or nested span)
+                    # actually extends *past* b.ts, trimming a wouldn't fix b's
+                    # clipping (the child would still nest b) and could drop a
+                    # real child into b's territory, so leave it. A descendant
+                    # ending exactly at the boundary is itself rounding noise and
+                    # is clipped by <=1 ns, which is fine.
+                    if (
+                        a["ts"] < desired_end
+                        and stack[-1][1] <= b_ts
+                        and 0 < clip < max_snap_us
+                    ):
+                        a["dur"] = desired_end - a["ts"]
+                        snapped += 1
+                        if clip > max_clip:
+                            max_clip = clip
+                    frame = stack.pop()
+                    eff = frame[0]["ts"] + frame[0]["dur"]
+                    if stack:
+                        stack[-1][1] = max(stack[-1][1], eff, frame[1])
+                    continue
+                # a_end >= b_end: a fully contains b -> b is a child, stop.
+                break
+            stack.append([b, b_ts])
+
+    if snapped:
+        with open(path, "w") as f:
+            _json.dump(d, f)
+        logger.warning(
+            f"De-overlapped GPU annotations in {path}: snapped {snapped} sub-us "
+            f"sibling overlap(s) (max {max_clip:.3f}us) so Perfetto renders every "
+            f"annotation full width"
+        )
+
+
+def _on_trace_ready_fn(
+    rank: Optional[int] = None,
+    trace_dir: str = "/tmp/dlrm_v3_traces",
+    keep_n_active: Optional[int] = None,
+    trace_steps: Optional[List[int]] = None,
+) -> Callable[[torch.profiler.profile], None]:
+    """Create the on_trace_ready callback that exports a chrome trace to disk.
+
+    Filename follows ``trace_step{step}_rank{rank}.json`` so multi-rank
+    captures don't collide and ``scripts/stitch_traces.py`` can merge them
+    by step number.
+
+    The ``{step}`` label:
+
+    * If ``trace_steps`` is provided (multi-window mode), the Nth callback
+      invocation labels its file with ``trace_steps[N]`` -- i.e. the
+      user-requested step that triggered the window. This is the most
+      intuitive labelling.
+    * Otherwise falls back to ``p.step_num`` (torch.profiler's internal
+      counter at trigger time, off by ~warmup+active from the schedule
+      arg).
+
+    If ``keep_n_active`` is set, the exported file is post-processed to keep
+    only the last N ProfilerStep-spans worth of events (i.e. drop WARMUP).
+    """
+    state = {"fire_count": 0}
+
+    def handle_fn(p: torch.profiler.profile) -> None:
+        os.makedirs(trace_dir, exist_ok=True)
+        if trace_steps:
+            i = state["fire_count"]
+            step_label = (
+                trace_steps[i] if i < len(trace_steps) else getattr(p, "step_num", 0)
+            )
+        else:
+            step_label = getattr(p, "step_num", 0)
+        state["fire_count"] += 1
+        rank_str = f"_rank{rank}" if rank is not None else ""
+        file_name = f"trace_step{step_label}{rank_str}.json"
+        path = os.path.join(trace_dir, file_name)
+        logger.warning(
+            p.key_averages(group_by_input_shape=True).table(
+                sort_by="self_cuda_time_total"
+            )
+        )
+        # Tracing is best-effort: a write/trim failure (permissions, disk full,
+        # malformed export) must never crash the training run. Degrade to a
+        # warning so the loop continues — especially important since streaming
+        # enables output_trace by default.
+        try:
+            p.export_chrome_trace(path)
+            logger.warning(f"Trace written to: {path}")
+            if keep_n_active is not None and keep_n_active > 0:
+                _trim_warmup_from_trace(path, keep_n_active)
+            # ROCm/AMD-only rendering fixes. CUDA/CUPTI (e.g. B200) traces don't
+            # exhibit the fragmented-ProfilerStep or sub-us kernel-overlap
+            # artifacts, so skip entirely on NVIDIA to avoid touching otherwise
+            # correct traces. Best-effort like trim above.
+            if _is_rocm():
+                # Normalize the GPU-side ProfilerStep layout so ROCm traces
+                # render with one full-width step span per stream like CUDA.
+                _normalize_profilerstep_layout(path)
+                # Snap roctracer's sub-us kernel overlaps so Perfetto doesn't
+                # mis-nest and hide long kernels.
+                _deoverlap_gpu_slices(path)
+                # Same fix at the annotation-sibling boundary so phase spans
+                # (## user_forward ##, ## stu_* ##, ...) render full width.
+                _deoverlap_gpu_annotations(path)
+        except Exception as exc:
+            logger.warning(f"Trace export/trim failed for {path}: {exc!r} (skipping)")
+
+    return handle_fn
+
+
+def profiler_or_nullcontext(
+    enabled: bool, with_stack: bool, trace_dir: str = "/tmp/dlrm_v3_traces"
+):
+    """One-shot profile context for ad-hoc captures (no scheduling)."""
+    return (
+        profile(
+            # pyre-fixme[16]: Module `profiler` has no attribute `ProfilerActivity`.
+            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+            on_trace_ready=_on_trace_ready_fn(trace_dir=trace_dir),
+            with_stack=with_stack,
+        )
+        if enabled
+        else contextlib.nullcontext()
+    )
+
+
+def _multi_window_schedule(
+    trace_steps,
+    warmup: int,
+    active: int,
+):
+    """Custom schedule that profiles around each step in ``trace_steps``.
+
+    Step s gets:
+        [s - warmup, s)        -> WARMUP
+        [s, s + active - 1)    -> RECORD
+        s + active - 1         -> RECORD_AND_SAVE
+    """
+    windows = [(s - warmup, s, s + active) for s in sorted(trace_steps)]
+
+    def schedule_fn(step: int) -> torch.profiler.ProfilerAction:
+        for warmup_start, active_start, active_end in windows:
+            if warmup_start <= step < active_start:
+                return torch.profiler.ProfilerAction.WARMUP
+            if active_start <= step < active_end - 1:
+                return torch.profiler.ProfilerAction.RECORD
+            if step == active_end - 1:
+                return torch.profiler.ProfilerAction.RECORD_AND_SAVE
+        return torch.profiler.ProfilerAction.NONE
+
+    return schedule_fn
+
+
+@gin.configurable
+class Profiler:
+    """Scheduled torch.profiler wrapper that writes Chrome traces to disk.
+
+    Two modes (set via gin):
+
+    * Single window (default): ``wait=10, warmup=20, active=50, repeat=1``.
+      Captures one contiguous window starting after ``wait`` steps.
+    * Multi-window: ``trace_steps=[500, 1000, 5000]`` (overrides wait+repeat).
+      Captures a separate window around each listed step.
+
+    All knobs are gin-tunable, e.g. in a gin file::
+
+        Profiler.trace_dir = "/path/to/results/exp42/trace"
+        Profiler.trace_steps = [500, 1000, 5000]
+        Profiler.warmup = 5
+        Profiler.active = 10
+    """
+
+    def __init__(
+        self,
+        rank: int,
+        active: int = 50,
+        wait: int = 10,
+        warmup: int = 20,
+        repeat: int = 1,
+        trace_steps: Optional[List[int]] = None,
+        trace_dir: str = "/tmp/dlrm_v3_traces",
+        trim_warmup: bool = True,
+        record_shapes: bool = True,
+        profile_memory: bool = False,
+        with_stack: bool = False,
+        with_flops: bool = False,
+        with_modules: bool = False,
+    ) -> None:
+        self.rank = rank
+        self.trace_dir = trace_dir
+        if trace_steps:
+            sched = _multi_window_schedule(trace_steps, warmup, active)
+        else:
+            sched = torch.profiler.schedule(
+                wait=wait, warmup=warmup, active=active, repeat=repeat
+            )
+        keep_n = active if trim_warmup else None
+        self._profiler: profiler.profile = torch.profiler.profile(
+            schedule=sched,
+            on_trace_ready=_on_trace_ready_fn(
+                self.rank, trace_dir, keep_n, trace_steps
+            ),
+            # pyre-fixme[16]: Module `profiler` has no attribute `ProfilerActivity`.
+            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+            record_shapes=record_shapes,
+            profile_memory=profile_memory,
+            with_stack=with_stack,
+            with_flops=with_flops,
+            with_modules=with_modules,
+        )
+
+    def step(self) -> None:
+        """Advance the profiler to the next step."""
+        self._profiler.step()
+
+
+class _NoOpSummaryWriter:
+    """Drop-in stand-in for SummaryWriter used when TensorBoard is disabled
+    (empty ``tensorboard_log_path``). All scalar writes become no-ops so the
+    metrics path (compute + text-log + ``.metrics.jsonl`` sinks) runs unchanged
+    and never crashes on TB file I/O. The shared ``/apps`` tfevents writer was a
+    crash source under transient filer ``Errno 121`` (Remote I/O) errors, and
+    nothing we consume reads TensorBoard — eval-window AUCs come from the JSONL
+    sink and the text log."""
+
+    def add_scalar(self, *args, **kwargs) -> None:
+        pass
+
+    def flush(self) -> None:
+        pass
+
+    def close(self) -> None:
+        pass
+
+
+@gin.configurable
+class MetricsLogger:
+    """
+    Logger for tracking and computing recommendation metrics.
+
+    Supports both classification metrics (NE, Accuracy, GAUC) and regression
+    metrics (MSE, MAE) based on multitask configuration.
+
+    Args:
+        multitask_configs: List of task configurations defining metric types.
+        batch_size: Batch size for metric computation.
+        window_size: Window size for running metric aggregation.
+        device: Device to place metric tensors on.
+        rank: Process rank for distributed training.
+        tensorboard_log_path: Optional path for TensorBoard logging.
+    """
+
+    def __init__(
+        self,
+        multitask_configs: List[TaskConfig],
+        batch_size: int,
+        window_size: int,
+        device: torch.device,
+        rank: int,
+        tensorboard_log_path: str = "",
+        world_size: int = 1,
+        auc_threshold: Optional[float] = None,
+        num_flops_per_sample: float = 0.0,
+        gpu_peak_flops: float = 0.0,
+        model: Optional[torch.nn.Module] = None,
+        eval_cumulative: bool = False,
+        cumulative_auc_bins: int = 100_000,
+        train_lifetime_auc_mode: str = "binned",
+        eval_lifetime_auc_mode: str = "binned",
+        lifetime_auc_window: int = 10_000_000,
+    ) -> None:
+        # tflops/mfu reporting state (optional — when both num_flops_per_sample
+        # and gpu_peak_flops are set, the train perf line gains tflops_algo/gpu,
+        # mfu, tflops_real/gpu, hfu, fill. The jagged ("real") numbers come
+        # from `model._last_jagged_flops_per_sample` stashed by DlrmHSTU.main_forward.
+        self._num_flops_per_sample: float = max(0.0, float(num_flops_per_sample))
+        self._gpu_peak_flops: float = max(0.0, float(gpu_peak_flops))
+        self._model_ref: Optional[torch.nn.Module] = model
+        if rank == 0 and self._num_flops_per_sample > 0 and self._gpu_peak_flops > 0:
+            logger.info(
+                f"FLOPS reporting enabled: {self._num_flops_per_sample / 1e9:.1f} "
+                f"GFLOP/sample (dense fwd+bwd), GPU peak {self._gpu_peak_flops / 1e12:.0f} TFLOPS"
+            )
+        self.multitask_configs: List[TaskConfig] = multitask_configs
+        all_classification_tasks: List[str] = [
+            task.task_name
+            for task in self.multitask_configs
+            if task.task_type != MultitaskTaskType.REGRESSION
+        ]
+        all_regression_tasks: List[str] = [
+            task.task_name
+            for task in self.multitask_configs
+            if task.task_type == MultitaskTaskType.REGRESSION
+        ]
+        assert all_classification_tasks + all_regression_tasks == [
+            task.task_name for task in multitask_configs
+        ]
+        self.task_names: List[str] = all_classification_tasks + all_regression_tasks
+
+        # Eval metric semantics:
+        #   eval_cumulative=False (default, legacy / static / non-streaming eval):
+        #     a single eval set with the configured window_size, including a
+        #     lifetime AUC. Unchanged behavior.
+        #   eval_cumulative=True (streaming fixed-holdout eval): a FRESH eval set
+        #     (window_size=UNBOUNDED, reset each pass -> per-pass full-holdout
+        #     "window_*") PLUS a CUMULATIVE set ("eval_cum", never reset ->
+        #     "lifetime_*"). NE/Accuracy/GAUC are cumulative for free via their
+        #     persistent scalar sums; AUC cumulative uses the selected backend.
+        #
+        # Lifetime-AUC backend is configurable independently for train and eval:
+        #   "binned" (default): BinnedCumulativeAUC - exact-cumulative AUC via an
+        #     O(num_bins) score histogram (additive all-reduce, no unbounded
+        #     buffer, memory independent of #samples/#windows).
+        #   "capped": LifetimeAUCMetricComputation - AUC over a trailing buffer of
+        #     `lifetime_auc_window` samples/rank (the legacy approach; per-rank
+        #     buffer all-gathered at compute).
+        self._eval_cumulative: bool = eval_cumulative
+        self._cumulative_auc_bins: int = int(cumulative_auc_bins)
+        self._train_lifetime_auc_mode: str = str(train_lifetime_auc_mode)
+        self._eval_lifetime_auc_mode: str = str(eval_lifetime_auc_mode)
+        self._lifetime_auc_window: int = int(lifetime_auc_window)
+        n_cls = len(all_classification_tasks)
+        n_reg = len(all_regression_tasks)
+
+        def _make_lifetime_auc(mode: str) -> RecMetricComputation:
+            if mode == "binned":
+                # window_size=0: no torchrec windowed state; histograms only.
+                return BinnedCumulativeAUC(
+                    my_rank=rank, batch_size=batch_size, n_tasks=n_cls,
+                    window_size=0, num_bins=self._cumulative_auc_bins,
+                ).to(device)
+            if mode == "capped":
+                return LifetimeAUCMetricComputation(
+                    my_rank=rank, batch_size=batch_size, n_tasks=n_cls,
+                    window_size=self._lifetime_auc_window,
+                ).to(device)
+            raise ValueError(
+                f"lifetime_auc_mode must be 'binned' or 'capped', got {mode!r}"
+            )
+
+        def _make_class(ws: int, lifetime_mode: Optional[str]) -> List[RecMetricComputation]:
+            mets: List[RecMetricComputation] = [
+                NEMetricComputation(my_rank=rank, batch_size=batch_size, n_tasks=n_cls, window_size=ws).to(device),
+                AccuracyMetricComputation(my_rank=rank, batch_size=batch_size, n_tasks=n_cls, window_size=ws).to(device),
+                GAUCMetricComputation(my_rank=rank, batch_size=batch_size, n_tasks=n_cls, window_size=ws).to(device),
+                AUCMetricComputation(my_rank=rank, batch_size=batch_size, n_tasks=n_cls, window_size=ws).to(device),
+            ]
+            if lifetime_mode is not None:
+                mets.append(_make_lifetime_auc(lifetime_mode))
+            return mets
+
+        def _make_class_cumulative() -> List[RecMetricComputation]:
+            # NE/Accuracy/GAUC: cumulative via persistent lifetime sums (window
+            # value ignored at compute). AUC: selected lifetime backend.
+            return [
+                NEMetricComputation(my_rank=rank, batch_size=batch_size, n_tasks=n_cls, window_size=window_size).to(device),
+                AccuracyMetricComputation(my_rank=rank, batch_size=batch_size, n_tasks=n_cls, window_size=window_size).to(device),
+                GAUCMetricComputation(my_rank=rank, batch_size=batch_size, n_tasks=n_cls, window_size=window_size).to(device),
+                _make_lifetime_auc(self._eval_lifetime_auc_mode),
+            ]
+
+        def _make_reg(ws: int) -> List[RecMetricComputation]:
+            return [
+                MSEMetricComputation(my_rank=rank, batch_size=batch_size, n_tasks=n_reg, window_size=ws).to(device),
+                MAEMetricComputation(my_rank=rank, batch_size=batch_size, n_tasks=n_reg, window_size=ws).to(device),
+            ]
+
+        self.class_metrics: Dict[str, List[RecMetricComputation]] = {"train": [], "eval": []}
+        self.regression_metrics: Dict[str, List[RecMetricComputation]] = {"train": [], "eval": []}
+        if eval_cumulative:
+            self.class_metrics["eval_cum"] = []
+            self.regression_metrics["eval_cum"] = []
+
+        if all_classification_tasks:
+            self.class_metrics["train"] = _make_class(window_size, lifetime_mode=self._train_lifetime_auc_mode)
+            if eval_cumulative:
+                self.class_metrics["eval"] = _make_class(UNBOUNDED_WINDOW, lifetime_mode=None)
+                self.class_metrics["eval_cum"] = _make_class_cumulative()
+            else:
+                self.class_metrics["eval"] = _make_class(window_size, lifetime_mode=self._eval_lifetime_auc_mode)
+
+        if all_regression_tasks:
+            self.regression_metrics["train"] = _make_reg(window_size)
+            if eval_cumulative:
+                self.regression_metrics["eval"] = _make_reg(UNBOUNDED_WINDOW)
+                self.regression_metrics["eval_cum"] = _make_reg(window_size)
+            else:
+                self.regression_metrics["eval"] = _make_reg(window_size)
+
+        self.global_step: Dict[str, int] = {"train": 0, "eval": 0}
+        # MLPerf `samples_count` progress unit: global trained samples, persisted
+        # alongside global_step so a resumed run continues the count.
+        self.cumulative_train_samples: int = 0
+        # Whether the MLPerf run markers (RUN_START etc.) were already emitted for
+        # this logical run. Checkpointed so a resume relaunch knows the run is
+        # already open and does NOT re-emit INIT_START/RUN_START (the compliance
+        # checker requires EXACTLY_ONE); the resumed process continues the same
+        # event stream and emits the single RUN_STOP at convergence/end.
+        self.mlperf_run_started: bool = False
+        self._rank: int = int(rank)
+        # Optional MLPerf logger + LR accessor wired by the streaming loop (duck-
+        # typed to avoid a train-module import cycle); drives the train_loss event.
+        self.mlperf_logger: Optional[Any] = None
+        self.lr_getter: Optional[Callable[[], float]] = None
+        self.tb_logger: Optional[SummaryWriter] = None
+        if tensorboard_log_path != "":
+            self.tb_logger = SummaryWriter(log_dir=tensorboard_log_path, purge_step=0)
+            self.tb_logger.flush()
+        else:
+            # TB disabled: use a no-op writer so the existing call sites (and the
+            # `assert self.tb_logger is not None` in compute_and_log) keep working
+            # while no tfevents are written to the fragile shared filer.
+            self.tb_logger = _NoOpSummaryWriter()
+
+        # Throughput / time-to-target tracking. Counters are train-only; eval
+        # samples are not relevant for headline samples/sec numbers.
+        self._world_size: int = max(1, int(world_size))
+        self._auc_threshold: Optional[float] = auc_threshold
+        self._time_to_target_logged: bool = False
+        self._perf_t_start: float = time.perf_counter()
+        self._perf_t_window: float = self._perf_t_start
+        self._perf_steps_in_window: int = 0
+        self._perf_total_samples: int = 0
+        self._perf_samples_counter: torch.Tensor = torch.zeros(
+            1, dtype=torch.long, device=device
+        )
+        # Non-train wall-time to exclude from the train step-time window so
+        # `step_ms` reports the canonical per-step compute latency even when an
+        # interval coincides with eval or checkpointing. Categorized so the
+        # excluded time is also reportable (eval_ms / ckpt_ms) rather than just
+        # discarded. The trainer brackets eval/ckpt regions via
+        # pause_perf(cat)/resume_perf(cat); accumulators reset each train-perf log.
+        self._perf_excluded: Dict[str, float] = {"eval": 0.0, "ckpt": 0.0}
+        self._perf_pause: Dict[str, Optional[float]] = {}
+
+    def pause_perf(self, category: str) -> None:
+        """Start excluding wall-time under `category` (e.g. "eval"/"ckpt") from
+        the train step-time window. Idempotent: a second pause without an
+        intervening resume is a no-op (keeps the earliest start)."""
+        if self._perf_pause.get(category) is None:
+            self._perf_pause[category] = time.perf_counter()
+
+    def resume_perf(self, category: str) -> None:
+        """Stop excluding `category` and fold the elapsed interval into the
+        per-category accumulator. No-op if not currently paused."""
+        t0 = self._perf_pause.get(category)
+        if t0 is not None:
+            self._perf_excluded[category] = (
+                self._perf_excluded.get(category, 0.0)
+                + (time.perf_counter() - t0)
+            )
+            self._perf_pause[category] = None
+
+    @property
+    def auc_threshold(self) -> Optional[float]:
+        """Configured time-to-target AUC threshold (None if unset)."""
+        return self._auc_threshold
+
+    @property
+    def all_metrics(self) -> Dict[str, List[RecMetricComputation]]:
+        """
+        Get all metrics for train and eval modes.
+
+        Returns:
+            Dictionary mapping mode ('train'/'eval') to list of metric computations.
+        """
+        out = {
+            "train": self.class_metrics["train"] + self.regression_metrics["train"],
+            "eval": self.class_metrics["eval"] + self.regression_metrics["eval"],
+        }
+        if "eval_cum" in self.class_metrics or "eval_cum" in self.regression_metrics:
+            out["eval_cum"] = self.class_metrics.get(
+                "eval_cum", []
+            ) + self.regression_metrics.get("eval_cum", [])
+        return out
+
+    def update(
+        self,
+        predictions: torch.Tensor,
+        weights: torch.Tensor,
+        labels: torch.Tensor,
+        num_candidates: torch.Tensor,
+        mode: str = "train",
+    ) -> None:
+        """
+        Update metrics with new batch of predictions and labels.
+
+        Args:
+            predictions: Model prediction tensor.
+            weights: Sample weight tensor.
+            labels: Ground truth label tensor.
+            num_candidates: Number of candidates per sample (for GAUC).
+            mode: Either 'train' or 'eval'.
+        """
+        # On eval, update BOTH the fresh set and the never-reset cumulative set
+        # (if enabled) from the same batch.
+        update_targets = list(self.all_metrics[mode])
+        if mode == "eval" and "eval_cum" in self.all_metrics:
+            update_targets = update_targets + self.all_metrics["eval_cum"]
+        for metric in update_targets:
+            if isinstance(metric, GAUCMetricComputation):
+                metric.update(
+                    predictions=predictions,
+                    labels=labels,
+                    weights=weights,
+                    num_candidates=num_candidates,
+                )
+            else:
+                metric.update(
+                    predictions=predictions,
+                    labels=labels,
+                    weights=weights,
+                )
+        self.global_step[mode] += 1
+        if mode == "train":
+            # Accumulate on-device to avoid a per-step GPU->CPU sync; we read
+            # the counter only at compute_and_log boundaries.
+            self._perf_samples_counter += num_candidates.sum().to(
+                self._perf_samples_counter.dtype
+            )
+            self._perf_steps_in_window += 1
+            # MLPerf progress counter: per-rank sample count scaled by world size
+            # approximates global trained samples without an extra collective.
+            self.cumulative_train_samples += int(num_candidates.numel()) * self._world_size
+
+    def compute(self, mode: str = "train") -> Dict[str, float]:
+        """
+        Compute and return all metrics for the current window.
+
+        Args:
+            mode: Either 'train' or 'eval'.
+
+        Returns:
+            Dictionary mapping metric names to their computed values.
+        """
+        all_computed_metrics = {}
+
+        if mode == "eval" and "eval_cum" in self.all_metrics:
+            # Dual-set eval: `window_*` (fresh per-pass) from the reset-each-pass
+            # set; `lifetime_*` (cumulative across passes) from the never-reset
+            # set. Filter each set to the matching prefix, and drop GAUC's
+            # auxiliary `*_num_samples` reports. Key names are unchanged
+            # (`window_auc`, `lifetime_ne`, ...) so dashboards keep working.
+            def _emit(
+                metrics: List[RecMetricComputation], keep_prefix: str
+            ) -> None:
+                for metric in metrics:
+                    for computed in metric.compute():
+                        pfx = str(computed.metric_prefix)
+                        name = str(computed.name)
+                        if pfx != keep_prefix or name.endswith("num_samples"):
+                            continue
+                        all_values = computed.value.cpu()
+                        for i, task_name in enumerate(self.task_names):
+                            if i >= len(all_values):
+                                break
+                            all_computed_metrics[f"metric/{pfx}{name}/{task_name}"] = (
+                                all_values[i]
+                            )
+
+            _emit(self.all_metrics["eval"], "window_")
+            _emit(self.all_metrics["eval_cum"], "lifetime_")
+        else:
+            for metric in self.all_metrics[mode]:
+                computed_metrics = metric.compute()
+                for computed in computed_metrics:
+                    all_values = computed.value.cpu()
+                    for i, task_name in enumerate(self.task_names):
+                        if i >= len(all_values):
+                            break
+                        key = f"metric/{str(computed.metric_prefix) + str(computed.name)}/{task_name}"
+                        all_computed_metrics[key] = all_values[i]
+
+        logger.info(
+            f"{mode} - Step {self.global_step[mode]} metrics: {all_computed_metrics}"
+        )
+        return all_computed_metrics
+
+    def _global_mean_loss(self, loss_terms: Dict[str, torch.Tensor]) -> float:
+        """Cross-rank mean of the summed per-task losses.
+
+        The 1-element all-reduce MUST run on every rank in lockstep; callers gate
+        it on a rank-identical counter (global_step / a deterministic frequency)
+        so it cannot desync.
+        """
+        loss_t = torch.stack(
+            [v.detach().float().sum() for v in loss_terms.values()]
+        ).sum()
+        if torch.distributed.is_available() and torch.distributed.is_initialized():
+            torch.distributed.all_reduce(loss_t, op=torch.distributed.ReduceOp.SUM)
+            loss_t = loss_t / self._world_size
+        return float(loss_t)
+
+    def maybe_log_mlperf_train_loss(
+        self, aux_losses: Dict[str, torch.Tensor], every: int
+    ) -> None:
+        """Emit the MLPerf ``train_loss`` event on its OWN cadence.
+
+        Decoupled from the console/TB metric cadence (``compute_and_log``): call
+        this every train step with the just-computed ``aux_losses`` and the
+        desired interval ``every`` (in global train steps). The cross-rank loss
+        all-reduce only fires on the cadence, gated by ``global_step["train"]``
+        which is incremented identically on all ranks in ``update()`` — so the
+        collective stays in lockstep. No-op when MLPerf logging is disabled
+        (``mlperf_logger is None`` on every rank) or ``every <= 0``.
+        """
+        if self.mlperf_logger is None or every <= 0 or not aux_losses:
+            return
+        if self.global_step["train"] % every != 0:
+            return
+        train_loss = self._global_mean_loss(aux_losses)
+        c = self.mlperf_logger.constants
+        md: Dict[str, Any] = {c.SAMPLES_COUNT: self.cumulative_train_samples}
+        if self.lr_getter is not None:
+            try:
+                md["lr"] = float(self.lr_getter())
+            except Exception:
+                pass
+        self.mlperf_logger.event(
+            key=getattr(c, "TRAIN_LOSS", "train_loss"),
+            value=train_loss,
+            metadata=md,
+        )
+
+    def compute_and_log(
+        self,
+        mode: str = "train",
+        additional_logs: Optional[Dict[str, Dict[str, torch.Tensor]]] = None,
+    ) -> Dict[str, float]:
+        """
+        Compute metrics and log to TensorBoard.
+
+        Args:
+            mode: Either 'train' or 'eval'.
+            additional_logs: Optional additional data to log.
+
+        Returns:
+            Dictionary mapping metric names to their computed values.
+
+        Raises:
+            AssertionError: If TensorBoard logger is not configured.
+        """
+        assert self.tb_logger is not None
+        all_computed_metrics = self.compute(mode)
+        for k, v in all_computed_metrics.items():
+            self.tb_logger.add_scalar(  # pyre-ignore [16]
+                f"{mode}_{k}",
+                v,
+                global_step=self.global_step[mode],
+            )
+
+        if additional_logs is not None:
+            for tag, data in additional_logs.items():
+                for data_name, data_value in data.items():
+                    self.tb_logger.add_scalar(
+                        f"{tag}/{mode}_{data_name}",
+                        data_value.detach().clone().cpu(),
+                        global_step=self.global_step[mode],
+                    )
+
+        # Global (cross-rank mean) train loss to console/TB at the metric-logging
+        # cadence. The 1-element all-reduce runs on every rank in lockstep, so it
+        # cannot desync. The MLPerf `train_loss` EVENT is emitted separately via
+        # ``maybe_log_mlperf_train_loss`` so its cadence can be tuned independently
+        # of this console/TB cadence (see that method).
+        if mode == "train" and additional_logs is not None and "losses" in additional_logs:
+            loss_terms = additional_logs["losses"]
+            if loss_terms:
+                train_loss = self._global_mean_loss(loss_terms)
+                self.tb_logger.add_scalar(
+                    "train_loss", train_loss, global_step=self.global_step["train"]
+                )
+                if self._rank == 0:
+                    logger.info(
+                        f"train - Step {self.global_step['train']} "
+                        f"train_loss={train_loss:.5f}"
+                    )
+
+        # Throughput metrics (train only). One GPU->CPU sync per call.
+        if mode == "train" and self._perf_steps_in_window > 0:
+            now = time.perf_counter()
+            wall_dt = max(now - self._perf_t_window, 1e-6)
+            # Subtract bracketed eval/checkpoint wall-time so step_ms / sps /
+            # MFU reflect canonical train-step compute, not eval+ckpt stalls
+            # that happened to land in this window. The excluded time is also
+            # surfaced separately below (eval_ms / ckpt_ms) rather than discarded.
+            eval_s = self._perf_excluded.get("eval", 0.0)
+            ckpt_s = self._perf_excluded.get("ckpt", 0.0)
+            dt = max(wall_dt - eval_s - ckpt_s, 1e-6)
+            n_samples = int(self._perf_samples_counter.item())
+            self._perf_total_samples += n_samples
+            local_sps = n_samples / dt
+            global_sps = local_sps * self._world_size
+            step_ms = dt * 1000.0 / self._perf_steps_in_window
+            wall_step_ms = wall_dt * 1000.0 / self._perf_steps_in_window
+            eval_ms = eval_s * 1000.0
+            ckpt_ms = ckpt_s * 1000.0
+            elapsed = now - self._perf_t_start
+            step = self.global_step["train"]
+            self.tb_logger.add_scalar(
+                "perf/train_samples_per_sec_local", local_sps, global_step=step
+            )
+            self.tb_logger.add_scalar(
+                "perf/train_samples_per_sec_global", global_sps, global_step=step
+            )
+            self.tb_logger.add_scalar(
+                "perf/train_step_time_ms", step_ms, global_step=step
+            )
+            # Inclusive (old-semantics) per-step wall time and the eval/ckpt
+            # breakdown that was excluded from step_ms above.
+            self.tb_logger.add_scalar(
+                "perf/train_wall_step_time_ms", wall_step_ms, global_step=step
+            )
+            self.tb_logger.add_scalar(
+                "perf/window_eval_time_ms", eval_ms, global_step=step
+            )
+            self.tb_logger.add_scalar(
+                "perf/window_ckpt_time_ms", ckpt_ms, global_step=step
+            )
+            self.tb_logger.add_scalar(
+                "perf/train_total_samples", self._perf_total_samples, global_step=step
+            )
+            self.tb_logger.add_scalar(
+                "perf/train_elapsed_sec", elapsed, global_step=step
+            )
+            # TFLOPS / MFU reporting (algo = dense yardstick, real = jagged).
+            #   tflops_algo/gpu, mfu  — uses max_seq_len^2 attention work (the
+            #     MFU yardstick: the FLOPs the workload would do if every
+            #     user's UIH filled the padded seq length).
+            #   tflops_real/gpu, hfu — uses this batch's mean(s_i^2) (actual
+            #     GPU work; hardware utilization).
+            #   fill                  — real / algo as a percent; how much of
+            #     the algo budget the model actually executed this batch.
+            # The jagged stash is read from the inner model; the model ref may
+            # be a DMP wrapper, so unwrap via .module if present.
+            tflops_str = ""
+            if self._num_flops_per_sample > 0 and self._gpu_peak_flops > 0:
+                local_flops = self._num_flops_per_sample * local_sps
+                tflops_algo = local_flops / 1e12
+                mfu = 100.0 * local_flops / self._gpu_peak_flops
+                self.tb_logger.add_scalar("perf/train_tflops_algo_gpu", tflops_algo, global_step=step)
+                self.tb_logger.add_scalar("perf/train_mfu_pct", mfu, global_step=step)
+                tflops_str = f" tflops_algo/gpu={tflops_algo:.1f} mfu={mfu:.1f}%"
+                jagged_t = None
+                m = self._model_ref
+                if m is not None:
+                    inner = m.module if hasattr(m, "module") else m
+                    jagged_t = getattr(inner, "_last_jagged_flops_per_sample", None)
+                if jagged_t is not None:
+                    jagged = float(jagged_t.item())
+                    if 0 < jagged < self._num_flops_per_sample:
+                        tflops_real = jagged * local_sps / 1e12
+                        hfu = 100.0 * jagged * local_sps / self._gpu_peak_flops
+                        fill = 100.0 * jagged / self._num_flops_per_sample
+                        self.tb_logger.add_scalar("perf/train_tflops_real_gpu", tflops_real, global_step=step)
+                        self.tb_logger.add_scalar("perf/train_hfu_pct", hfu, global_step=step)
+                        self.tb_logger.add_scalar("perf/train_fill_pct", fill, global_step=step)
+                        tflops_str += f" tflops_real/gpu={tflops_real:.1f} hfu={hfu:.1f}% fill={fill:.1f}%"
+            logger.info(
+                f"train - Step {step} perf: local_sps={local_sps:.1f} "
+                f"global_sps={global_sps:.1f} step_ms={step_ms:.2f} "
+                f"elapsed_sec={elapsed:.1f} total_samples={self._perf_total_samples} "
+                f"wall_step_ms={wall_step_ms:.2f} eval_ms={eval_ms:.1f} "
+                f"ckpt_ms={ckpt_ms:.1f}"
+                + tflops_str
+            )
+            self._perf_t_window = now
+            self._perf_steps_in_window = 0
+            self._perf_samples_counter.zero_()
+            # Reset the excluded-time accumulators for the next window. Any
+            # still-open pause (eval/ckpt straddling this log) is cleared so its
+            # remaining time is not double-counted; in practice perf logs fire
+            # only after a train step, never mid eval/ckpt.
+            self._perf_excluded = {"eval": 0.0, "ckpt": 0.0}
+            self._perf_pause = {}
+
+        # Time-to-target: latch wall-clock once any task's AUC crosses threshold.
+        # Matches MLPerf DLRM-DCNv2 reporting style (default upstream target 0.80275).
+        if (
+            self._auc_threshold is not None
+            and not self._time_to_target_logged
+        ):
+            for key, val in all_computed_metrics.items():
+                metric_short = key.split("/")[-2] if "/" in key else key
+                if metric_short.endswith("auc") and not metric_short.endswith("gauc"):
+                    if float(val) >= self._auc_threshold:
+                        ttt = time.perf_counter() - self._perf_t_start
+                        self.tb_logger.add_scalar(
+                            f"perf/time_to_auc_{self._auc_threshold:.5f}_sec",
+                            ttt,
+                            global_step=self.global_step[mode],
+                        )
+                        logger.info(
+                            f"REACHED AUC>={self._auc_threshold} on {key}="
+                            f"{float(val):.6f} at elapsed_sec={ttt:.2f} "
+                            f"step={self.global_step[mode]}"
+                        )
+                        self._time_to_target_logged = True
+                        break
+
+        return all_computed_metrics
+
+    def reset(self, mode: str = "train"):
+        """
+        Reset all metrics for a given mode.
+
+        Args:
+            mode: Either 'train' or 'eval'.
+        """
+        for metric in self.all_metrics[mode]:
+            metric.reset()
+
+
+# the datasets we support
+SUPPORTED_DATASETS = [
+    "debug",
+    "movielens-1m",
+    "movielens-20m",
+    "movielens-13b",
+    "movielens-18b",
+    "kuairand-1k",
+    "streaming-400m",
+    "streaming-200b",
+    "streaming-100b",
+    "sampled-streaming-100b",
+    "yambda-5b",
+]
+
+
+@gin.configurable
+def env_path(key: str = "", default: str = "") -> str:
+    """Resolve a path from os.environ[key], falling back to `default`.
+
+    Intended as a gin macro so paths can be overridden via env vars without
+    editing the gin file. Example gin usage:
+
+        DATA_PATH = @env_path()
+        env_path.key = "DLRM_DATA_PATH"
+        env_path.default = "/some/default/path"
+        make_train_test_dataloaders.new_path_prefix = %DATA_PATH
+    """
+    return os.environ.get(key, default) if key else default
+
+
+@gin.configurable
+def env_str(key: str = "", default: str = "") -> str:
+    """Resolve a string from os.environ[key], falling back to `default`.
+
+    Companion to `env_int`/`env_float` for categorical/string overrides (e.g. a
+    metric backend selector). Example gin usage:
+
+        MetricsLogger.train_lifetime_auc_mode = @tlam/env_str()
+        tlam/env_str.key     = "TRAIN_LIFETIME_AUC_MODE"
+        tlam/env_str.default = "binned"
+    """
+    raw = os.environ.get(key) if key else None
+    return raw if raw else default
+
+
+@gin.configurable
+def env_int(key: str = "", default: int = 0) -> int:
+    """Resolve an int from os.environ[key], falling back to `default`.
+
+    Companion to `env_path` for numeric overrides. Example gin usage:
+
+        make_optimizer_and_shard.hbm_cap_gb = @env_int()
+        env_int.key = "HBM_CAP_GB"
+        env_int.default = 260
+    """
+    raw = os.environ.get(key) if key else None
+    return int(raw) if raw else default
+
+
+@gin.configurable
+def env_float(key: str = "", default: float = 0.0) -> float:
+    """Resolve a float from os.environ[key], falling back to `default`.
+
+    Companion to `env_int` for fractional/duration overrides (e.g. a
+    checkpoint time interval in seconds). Example gin usage:
+
+        streaming_train_eval_loop.checkpoint_time_interval_s = @env_float()
+        env_float.key     = "CKPT_TIME_INTERVAL_S"
+        env_float.default = 3600.0
+    """
+    raw = os.environ.get(key) if key else None
+    return float(raw) if raw else default
+
+
+_GPU_PEAK_FLOPS_TABLE: Dict[str, Dict[str, float]] = {
+    # Per-GPU peak TFLOPS by dtype. Values from vendor datasheets / Primus-DLRM
+    # peak_table. Used as the denominator in MFU/HFU. Keyed by case-insensitive
+    # substring of torch.cuda.get_device_name(0).
+    "MI355X": {"bf16": 2300e12, "fp32": 575e12},
+    "MI350X": {"bf16": 2300e12, "fp32": 575e12},
+    "MI300X": {"bf16": 1300e12, "fp32": 653e12},
+    "MI325X": {"bf16": 1300e12, "fp32": 653e12},
+    "B200":   {"bf16": 2250e12, "fp32": 1125e12},
+    "H100":   {"bf16": 990e12,  "fp32": 67e12},
+    "A100":   {"bf16": 312e12,  "fp32": 19.5e12},
+}
+
+
+def get_gpu_peak_flops(dtype: str = "bf16") -> float:
+    """Peak FLOPS for the current GPU at the given dtype.
+
+    Falls back to MI350X's number with a warning when the device name doesn't
+    match any table entry — better to over-report MFU than to silently skip.
+    """
+    if not torch.cuda.is_available():
+        return 0.0
+    name = torch.cuda.get_device_name(0)
+    for gpu_key, peaks in _GPU_PEAK_FLOPS_TABLE.items():
+        if gpu_key in name:
+            return peaks.get(dtype, peaks["bf16"])
+    logger.warning(
+        f"Unknown GPU for peak FLOPS: {name}; defaulting to MI350X bf16 (2300 TF)"
+    )
+    return _GPU_PEAK_FLOPS_TABLE["MI350X"]["bf16"]
+
+
+@gin.configurable
+def run_results_dir(run_name: str = "default", subdir: str = "results") -> str:
+    """Resolve ``<recommendation_v4>/<subdir>/<run_name>`` from this file's location.
+
+    Used as a gin macro to give per-run output directories that persist on the
+    host (recommendation_v4 is bind-mounted into the training container).
+
+    Example gin usage::
+
+        RUN_NAME = @env_path()
+        env_path.key     = "RUN_NAME"
+        env_path.default = "default"
+        run_results_dir.run_name = %RUN_NAME
+        Profiler.trace_dir = @run_results_dir()
+    """
+    # utils.py lives at <recommendation_v4>/generative_recommenders/dlrm_v3/utils.py;
+    # parents[2] climbs to <recommendation_v4>/.
+    repo_root = Path(__file__).resolve().parents[2]
+    return str(repo_root / subdir / run_name)
+
+
+@gin.configurable
+def get_dataset(
+    name: str,
+    new_path_prefix: str = "",
+    history_length: Optional[int] = None,
+    min_history: Optional[int] = None,
+    history_strategy: str = "interleaved",
+    streaming_window_seconds: int = 86400,
+    streaming_sort_within_window: bool = False,
+    streaming_shuffle_fraction: float = 0.0,
+    streaming_shuffle_seed: int = 0,
+    train_split_percentage: float = 1.0,
+    split_salt: int = 0,
+):
+    """
+    Get dataset class and configuration by name.
+
+    Args:
+        name: Dataset identifier (must be in SUPPORTED_DATASETS).
+        new_path_prefix: Optional prefix to prepend to data paths.
+
+    Returns:
+        Tuple of (dataset_class, kwargs_dict) for dataset instantiation.
+
+    Raises:
+        AssertionError: If dataset name is not supported.
+    """
+    assert name in SUPPORTED_DATASETS, f"dataset {name} not supported"
+    if name == "debug":
+        return DLRMv3RandomDataset, {}
+    if name == "movielens-1m":
+        return (
+            DLRMv3MovieLensDataset,
+            {
+                "ratings_file": os.path.join(
+                    new_path_prefix, "data/ml-1m/sasrec_format.csv"
+                ),
+            },
+        )
+    if name == "movielens-20m":
+        return (
+            DLRMv3MovieLensDataset,
+            {
+                "ratings_file": os.path.join(
+                    new_path_prefix, "data/ml-20m/sasrec_format.csv"
+                ),
+            },
+        )
+    if name == "movielens-13b":
+        return (
+            DLRMv3SyntheticMovieLensDataset,
+            {
+                "ratings_file_prefix": os.path.join(
+                    new_path_prefix, "data/ml-13b/16x16384"
+                ),
+            },
+        )
+    if name == "movielens-18b":
+        return (
+            DLRMv3SyntheticMovieLensDataset,
+            {
+                "ratings_file_prefix": os.path.join(
+                    new_path_prefix, "data/ml-18b/20x36864"
+                ),
+            },
+        )
+    if name == "kuairand-1k":
+        return (
+            DLRMv3KuaiRandDataset,
+            {
+                "seq_logs_file": os.path.join(
+                    new_path_prefix, "data/KuaiRand-1K/data/processed_seqs.csv"
+                ),
+            },
+        )
+    if name == "streaming-400m":
+        return (
+            DLRMv3SyntheticStreamingDataset,
+            {
+                "ratings_file_prefix": os.path.join(
+                    new_path_prefix, "data/streaming-400m/"
+                ),
+                "train_ts": 8,
+                "total_ts": 10,
+                "num_files": 3,
+                "num_users": 150_000,
+                "num_items": 1_500_000,
+                "num_categories": 128,
+            },
+        )
+    if name == "streaming-200b":
+        return (
+            DLRMv3SyntheticStreamingDataset,
+            {
+                "ratings_file_prefix": os.path.join(
+                    new_path_prefix, "data/streaming-200b/"
+                ),
+                "train_ts": 90,
+                "total_ts": 100,
+                "num_files": 100,
+                "num_users": 10_000_000,
+                "num_items": 1_000_000_000,
+                "num_categories": 128,
+            },
+        )
+    if name == "streaming-100b":
+        return (
+            DLRMv3SyntheticStreamingDataset,
+            {
+                "ratings_file_prefix": os.path.join(
+                    new_path_prefix, "data/streaming-100b/"
+                ),
+                "train_ts": 90,
+                "total_ts": 100,
+                "num_files": 100,
+                "num_users": 5_000_000,
+                "num_items": 1_000_000_000,
+                "num_categories": 128,
+            },
+        )
+    if name == "yambda-5b":
+        from generative_recommenders.dlrm_v3.configs import YAMBDA_5B_CROSS_SPECS
+
+        return (
+            DLRMv3YambdaDataset,
+            {
+                # Layout: <new_path_prefix>/processed_5b/{train_sessions.parquet,...}
+                # and <new_path_prefix>/shared_metadata/{artist,album}_item_mapping.parquet.
+                # The dataset auto-builds a MAP_SHARED-mmap'd cache of the
+                # flat columns + LISTEN-anchor positions under
+                # <processed_dir>/hstu_cache_L<history_length>/ on first use;
+                # all ranks on a node share the same physical pages.
+                "processed_dir": os.path.join(new_path_prefix, "processed_5b"),
+                "metadata_dir": os.path.join(new_path_prefix, "shared_metadata"),
+                # Per-pool truncation cap; total interleaved UIH ~ 3*L/3 = L.
+                # Override via `get_dataset.history_length = N` in gin.
+                "history_length": history_length if history_length is not None else 4096,
+                "scan_window": 20000,
+                # Anchor-eligibility floor: a LISTEN event qualifies once the
+                # user has >= min_history prior events. Decoupled from
+                # history_length (gather cap) since jagged attention handles
+                # short UIH. None = legacy (require a full history_length).
+                # Override via `get_dataset.min_history = N` / $MIN_HISTORY.
+                "min_history": min_history,
+                # UIH construction: "interleaved" (per-pool L//3 cap) or
+                # "last_n" (last history_length pooled events, no per-pool
+                # split). Strategy-independent on disk — both reuse the same
+                # hstu_cache_L<history_length>/ and positions file (the gather
+                # runs at sample-construction time), so switching needs no
+                # rebuild. Override via $HISTORY_STRATEGY.
+                "history_strategy": history_strategy,
+                "cross_specs": YAMBDA_5B_CROSS_SPECS,
+                # Temporal-streaming knobs (only used under --mode
+                # streaming-train-eval; ignored by the default train-eval path).
+                "streaming_window_seconds": streaming_window_seconds,
+                "streaming_sort_within_window": streaming_sort_within_window,
+                # In-window shuffle diversity dial in [0,1]: K=round(frac*N) within-
+                # segment shuffle. 0=off/user-major, 1=full. Config-invariant and
+                # deterministic by (seed, ts).
+                "streaming_shuffle_fraction": streaming_shuffle_fraction,
+                "streaming_shuffle_seed": streaming_shuffle_seed,
+                # User-level train:eval holdout for the streaming path. 1.0 =
+                # no holdout (legacy). <1.0 holds out (1 - tsp) of users as a
+                # fixed eval set; those users are never trained.
+                "train_split_percentage": train_split_percentage,
+                "split_salt": split_salt,
+            },
+        )
+    if name == "sampled-streaming-100b":
+        return (
+            DLRMv3SyntheticStreamingDataset,
+            {
+                "ratings_file_prefix": os.path.join(
+                    new_path_prefix, "data/streaming-100b/sampled_data/"
+                ),
+                "train_ts": 90,
+                "total_ts": 100,
+                "num_files": 1,
+                "num_users": 50_000,
+                "num_items": 1_000_000_000,
+                "num_categories": 128,
+            },
+        )
diff --git a/recommendation_v4/generative_recommenders/modules/action_encoder.py b/recommendation_v4/generative_recommenders/modules/action_encoder.py
new file mode 100644
index 000000000..13b65557e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/action_encoder.py
@@ -0,0 +1,112 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from generative_recommenders.common import HammerModule
+from generative_recommenders.ops.jagged_tensors import concat_2D_jagged
+
+
+class ActionEncoder(HammerModule):
+    def __init__(
+        self,
+        action_embedding_dim: int,
+        action_feature_name: str,
+        action_weights: List[int],
+        watchtime_feature_name: str = "",
+        watchtime_to_action_thresholds_and_weights: Optional[
+            List[Tuple[int, int]]
+        ] = None,
+        embedding_init_std: float = 0.1,
+        is_inference: bool = False,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+        self._watchtime_feature_name: str = watchtime_feature_name
+        self._action_feature_name: str = action_feature_name
+        self._watchtime_to_action_thresholds_and_weights: List[Tuple[int, int]] = (
+            watchtime_to_action_thresholds_and_weights
+            if watchtime_to_action_thresholds_and_weights is not None
+            else []
+        )
+        self.register_buffer(
+            "_combined_action_weights",
+            torch.tensor(
+                action_weights
+                + [x[1] for x in self._watchtime_to_action_thresholds_and_weights]
+            ),
+        )
+        self._num_action_types: int = len(action_weights) + len(
+            self._watchtime_to_action_thresholds_and_weights
+        )
+        self._action_embedding_dim = action_embedding_dim
+        self._action_embedding_table: torch.nn.Parameter = torch.nn.Parameter(
+            torch.empty((self._num_action_types, action_embedding_dim)).normal_(
+                mean=0, std=embedding_init_std
+            ),
+        )
+        self._target_action_embedding_table: torch.nn.Parameter = torch.nn.Parameter(
+            torch.empty((1, self._num_action_types * action_embedding_dim)).normal_(
+                mean=0, std=embedding_init_std
+            ),
+        )
+
+    @property
+    def output_embedding_dim(self) -> int:
+        return self._action_embedding_dim * self._num_action_types
+
+    def forward(
+        self,
+        max_uih_len: int,
+        max_targets: int,
+        uih_offsets: torch.Tensor,
+        target_offsets: torch.Tensor,
+        seq_embeddings: torch.Tensor,
+        seq_payloads: Dict[str, torch.Tensor],
+    ) -> torch.Tensor:
+        seq_actions = seq_payloads[self._action_feature_name]
+        if len(self._watchtime_to_action_thresholds_and_weights) > 0:
+            watchtimes = seq_payloads[self._watchtime_feature_name]
+            for threshold, weight in self._watchtime_to_action_thresholds_and_weights:
+                seq_actions = torch.bitwise_or(
+                    seq_actions, (watchtimes >= threshold).to(torch.int64) * weight
+                )
+        exploded_actions = (
+            torch.bitwise_and(
+                seq_actions.unsqueeze(-1), self._combined_action_weights.unsqueeze(0)
+            )
+            > 0
+        )
+        action_embeddings = (
+            exploded_actions.unsqueeze(-1) * self._action_embedding_table.unsqueeze(0)
+        ).view(-1, self._num_action_types * self._action_embedding_dim)
+        total_targets: int = seq_embeddings.size(0) - action_embeddings.size(0)
+        action_embeddings = concat_2D_jagged(
+            max_seq_len=max_uih_len + max_targets,
+            values_left=action_embeddings,
+            values_right=self._target_action_embedding_table.tile(
+                total_targets,
+                1,
+            ),
+            max_len_left=max_uih_len,
+            max_len_right=max_targets,
+            offsets_left=uih_offsets,
+            offsets_right=target_offsets,
+            kernel=self.hammer_kernel(),
+        )
+        return action_embeddings
diff --git a/recommendation_v4/generative_recommenders/modules/content_encoder.py b/recommendation_v4/generative_recommenders/modules/content_encoder.py
new file mode 100644
index 000000000..acca82dbf
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/content_encoder.py
@@ -0,0 +1,109 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from typing import Dict, List, Optional
+
+import torch
+from generative_recommenders.common import HammerModule
+from generative_recommenders.ops.jagged_tensors import concat_2D_jagged
+
+
+class ContentEncoder(HammerModule):
+    def __init__(
+        self,
+        input_embedding_dim: int,
+        additional_content_features: Optional[Dict[str, int]] = None,
+        target_enrich_features: Optional[Dict[str, int]] = None,
+        is_inference: bool = False,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+        self._input_embedding_dim: int = input_embedding_dim
+        self._additional_content_features: Dict[str, int] = (
+            additional_content_features
+            if additional_content_features is not None
+            else {}
+        )
+        self._target_enrich_features: Dict[str, int] = (
+            target_enrich_features if target_enrich_features is not None else {}
+        )
+        self._target_enrich_dummy_embeddings: torch.nn.ParameterDict = (
+            torch.nn.ParameterDict(
+                {
+                    name: torch.nn.Parameter(
+                        torch.empty((1, dim)).normal_(mean=0, std=0.1),
+                    )
+                    for name, dim in self._target_enrich_features.items()
+                }
+            )
+        )
+
+    @property
+    def output_embedding_dim(self) -> int:
+        return self._input_embedding_dim + sum(
+            list(self._additional_content_features.values())
+            + list(self._target_enrich_features.values())
+        )
+
+    def forward(
+        self,
+        max_uih_len: int,
+        max_targets: int,
+        uih_offsets: torch.Tensor,
+        target_offsets: torch.Tensor,
+        seq_embeddings: torch.Tensor,
+        seq_payloads: Dict[str, torch.Tensor],
+    ) -> torch.Tensor:
+        content_embeddings_list: List[torch.Tensor] = [seq_embeddings]
+        if len(self._additional_content_features) > 0:
+            content_embeddings_list = content_embeddings_list + [
+                (seq_payloads[x].to(seq_embeddings.dtype))
+                for x in self._additional_content_features.keys()
+            ]
+
+        if self._target_enrich_dummy_embeddings:
+            total_seq_len: int = seq_embeddings.size(0)
+            for name, param in self._target_enrich_dummy_embeddings.items():
+                enrich_embeddings_target = seq_payloads[name].to(seq_embeddings.dtype)
+                total_targets: int = enrich_embeddings_target.size(0)
+                total_uih_len: int = total_seq_len - total_targets
+                enrich_embeddings_uih = param.tile(total_uih_len, 1).to(
+                    seq_embeddings.dtype
+                )
+                enrich_embeddings = concat_2D_jagged(
+                    max_seq_len=max_uih_len + max_targets,
+                    values_left=enrich_embeddings_uih,
+                    values_right=enrich_embeddings_target,
+                    max_len_left=max_uih_len,
+                    max_len_right=max_targets,
+                    offsets_left=uih_offsets,
+                    offsets_right=target_offsets,
+                    kernel=self.hammer_kernel(),
+                )
+                content_embeddings_list.append(enrich_embeddings)
+
+        if (
+            len(self._target_enrich_features) == 0
+            and len(self._additional_content_features) == 0
+        ):
+            return seq_embeddings
+        else:
+            content_embeddings = torch.cat(
+                content_embeddings_list,
+                dim=1,
+            )
+            return content_embeddings
diff --git a/recommendation_v4/generative_recommenders/modules/contextual_interleave_preprocessor.py b/recommendation_v4/generative_recommenders/modules/contextual_interleave_preprocessor.py
new file mode 100644
index 000000000..fff0d72f0
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/contextual_interleave_preprocessor.py
@@ -0,0 +1,357 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from math import sqrt
+from typing import Callable, Dict, Optional, Tuple
+
+import torch
+from generative_recommenders.common import fx_unwrap_optional_tensor
+from generative_recommenders.modules.action_encoder import ActionEncoder
+from generative_recommenders.modules.content_encoder import ContentEncoder
+from generative_recommenders.modules.contextualize_mlps import (
+    ContextualizedMLP,
+    ParameterizedContextualizedMLP,
+)
+from generative_recommenders.modules.preprocessors import (
+    get_contextual_input_embeddings,
+    InputPreprocessor,
+)
+from generative_recommenders.ops.jagged_tensors import concat_2D_jagged
+
+
+class ContextualInterleavePreprocessor(InputPreprocessor):
+    def __init__(
+        self,
+        input_embedding_dim: int,
+        output_embedding_dim: int,
+        contextual_feature_to_max_length: Dict[str, int],
+        contextual_feature_to_min_uih_length: Dict[str, int],
+        content_encoder: ContentEncoder,
+        content_contextualize_mlp_fn: Callable[
+            [int, int, int, bool], ContextualizedMLP
+        ],
+        action_encoder: ActionEncoder,
+        action_contextualize_mlp_fn: Callable[[int, int, int, bool], ContextualizedMLP],
+        pmlp_contextual_dropout_ratio: float = 0.0,
+        enable_interleaving: bool = False,
+        is_inference: bool = False,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+        self._input_embedding_dim: int = input_embedding_dim
+        self._output_embedding_dim: int = output_embedding_dim
+        self._contextual_feature_to_max_length: Dict[str, int] = (
+            contextual_feature_to_max_length
+        )
+        self._max_contextual_seq_len: int = sum(
+            contextual_feature_to_max_length.values()
+        )
+        self._contextual_feature_to_min_uih_length: Dict[str, int] = (
+            contextual_feature_to_min_uih_length
+        )
+        std = 1.0 * sqrt(2.0 / float(input_embedding_dim + output_embedding_dim))
+        self._batched_contextual_linear_weights = torch.nn.Parameter(
+            torch.empty(
+                (
+                    self._max_contextual_seq_len,
+                    input_embedding_dim,
+                    output_embedding_dim,
+                )
+            ).normal_(0.0, std)
+        )
+        self._pmlp_contextual_dropout_ratio: float = pmlp_contextual_dropout_ratio
+        self._batched_contextual_linear_bias = torch.nn.Parameter(
+            torch.empty((self._max_contextual_seq_len, 1, output_embedding_dim)).fill_(
+                0.0
+            )
+        )
+        contextual_embedding_dim: int = (
+            self._max_contextual_seq_len * input_embedding_dim
+        )
+        self._content_encoder: ContentEncoder = content_encoder
+        self._content_embedding_mlp: ContextualizedMLP = content_contextualize_mlp_fn(
+            self._content_encoder.output_embedding_dim,
+            output_embedding_dim,
+            contextual_embedding_dim,
+            is_inference,
+        )
+        self._action_encoder: ActionEncoder = action_encoder
+        self._action_embedding_mlp: ContextualizedMLP = action_contextualize_mlp_fn(
+            self._action_encoder.output_embedding_dim,
+            output_embedding_dim,
+            contextual_embedding_dim,
+            is_inference,
+        )
+        self._enable_interleaving: bool = enable_interleaving
+
+    def combine_embeddings(
+        self,
+        max_uih_len: int,
+        max_targets: int,
+        total_uih_len: int,
+        total_targets: int,
+        seq_lengths: torch.Tensor,
+        seq_timestamps: torch.Tensor,
+        content_embeddings: torch.Tensor,
+        action_embeddings: torch.Tensor,
+        contextual_embeddings: Optional[torch.Tensor],
+        num_targets: torch.Tensor,
+    ) -> Tuple[
+        int,
+        int,
+        int,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+    ]:
+        if self._enable_interleaving:
+            output_seq_timestamps = seq_timestamps.repeat_interleave(2)
+            output_seq_embeddings = torch.stack(
+                [content_embeddings, action_embeddings], dim=1
+            ).reshape(-1, self._output_embedding_dim)
+            if self.interleave_targets():
+                output_seq_lengths = seq_lengths * 2
+                output_max_seq_len = (max_uih_len + max_targets) * 2
+                output_num_targets = num_targets * 2
+                output_total_uih_len = total_uih_len * 2
+                output_total_targets = total_targets * 2
+            else:
+                seq_lengths_by_2 = seq_lengths * 2
+                output_seq_lengths = seq_lengths_by_2 - num_targets
+                output_max_seq_len = 2 * max_uih_len + max_targets
+                indices = torch.arange(
+                    2 * (max_uih_len + max_targets), device=seq_lengths.device
+                ).view(1, -1)
+                valid_mask = torch.logical_and(
+                    indices < seq_lengths_by_2.view(-1, 1),
+                    torch.logical_or(
+                        indices < (output_seq_lengths - num_targets).view(-1, 1),
+                        torch.remainder(indices, 2) == 0,
+                    ),
+                )
+                jagged_valid_mask = (
+                    torch.ops.fbgemm.dense_to_jagged(
+                        valid_mask.int().unsqueeze(-1),
+                        [
+                            torch.ops.fbgemm.asynchronous_complete_cumsum(
+                                seq_lengths_by_2
+                            )
+                        ],
+                    )[0]
+                    .to(torch.bool)
+                    .squeeze(1)
+                )
+                output_seq_embeddings = output_seq_embeddings[jagged_valid_mask]
+                output_seq_timestamps = output_seq_timestamps[jagged_valid_mask]
+                output_num_targets = num_targets
+                output_total_uih_len = total_uih_len * 2
+                output_total_targets = total_targets
+        else:
+            output_max_seq_len = max_uih_len + max_targets
+            output_seq_lengths = seq_lengths
+            output_num_targets = num_targets
+            output_seq_timestamps = seq_timestamps
+            output_seq_embeddings = content_embeddings + action_embeddings
+            output_total_uih_len = total_uih_len
+            output_total_targets = total_targets
+
+        # concat contextual embeddings
+        output_seq_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(
+            output_seq_lengths
+        )
+        if self._max_contextual_seq_len > 0:
+            output_seq_embeddings = concat_2D_jagged(
+                max_seq_len=self._max_contextual_seq_len + output_max_seq_len,
+                values_left=fx_unwrap_optional_tensor(contextual_embeddings).reshape(
+                    -1, self._output_embedding_dim
+                ),
+                values_right=output_seq_embeddings,
+                max_len_left=self._max_contextual_seq_len,
+                max_len_right=output_max_seq_len,
+                offsets_left=None,
+                offsets_right=output_seq_offsets,
+                kernel=self.hammer_kernel(),
+            )
+            output_seq_timestamps = concat_2D_jagged(
+                max_seq_len=self._max_contextual_seq_len + output_max_seq_len,
+                values_left=torch.zeros(
+                    (output_seq_lengths.size(0) * self._max_contextual_seq_len, 1),
+                    dtype=output_seq_timestamps.dtype,
+                    device=output_seq_timestamps.device,
+                ),
+                values_right=output_seq_timestamps.unsqueeze(-1),
+                max_len_left=self._max_contextual_seq_len,
+                max_len_right=output_max_seq_len,
+                offsets_left=None,
+                offsets_right=output_seq_offsets,
+                kernel=self.hammer_kernel(),
+            ).squeeze(-1)
+            output_max_seq_len = output_max_seq_len + self._max_contextual_seq_len
+            output_total_uih_len = (
+                output_total_uih_len
+                + self._max_contextual_seq_len * output_seq_lengths.size(0)
+            )
+            output_seq_lengths = output_seq_lengths + self._max_contextual_seq_len
+            output_seq_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(
+                output_seq_lengths
+            )
+
+        return (
+            output_max_seq_len,
+            output_total_uih_len,
+            output_total_targets,
+            output_seq_lengths,
+            output_seq_offsets,
+            output_seq_timestamps,
+            output_seq_embeddings,
+            output_num_targets,
+        )
+
+    def forward(  # noqa C901
+        self,
+        max_uih_len: int,
+        max_targets: int,
+        total_uih_len: int,
+        total_targets: int,
+        seq_lengths: torch.Tensor,
+        seq_timestamps: torch.Tensor,
+        seq_embeddings: torch.Tensor,
+        num_targets: torch.Tensor,
+        seq_payloads: Dict[str, torch.Tensor],
+    ) -> Tuple[
+        int,
+        int,
+        int,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        Dict[str, torch.Tensor],
+    ]:
+        max_seq_len = max_uih_len + max_targets
+        with torch.autocast(
+            "cuda",
+            dtype=torch.bfloat16,
+            enabled=(not self.is_inference and self._training_dtype == torch.bfloat16),
+        ):
+            # get contextual_embeddings
+            contextual_embeddings: Optional[torch.Tensor] = None
+            pmlp_contextual_embeddings: Optional[torch.Tensor] = None
+            if self._max_contextual_seq_len > 0:
+                contextual_input_embeddings = get_contextual_input_embeddings(
+                    seq_lengths=seq_lengths,
+                    seq_payloads=seq_payloads,
+                    contextual_feature_to_max_length=self._contextual_feature_to_max_length,
+                    contextual_feature_to_min_uih_length=self._contextual_feature_to_min_uih_length,
+                    dtype=seq_embeddings.dtype,
+                )
+                if isinstance(
+                    self._action_embedding_mlp, ParameterizedContextualizedMLP
+                ) or isinstance(
+                    self._action_embedding_mlp, ParameterizedContextualizedMLP
+                ):
+                    pmlp_contextual_embeddings = torch.nn.functional.dropout(
+                        contextual_input_embeddings,
+                        p=self._pmlp_contextual_dropout_ratio,
+                        training=self.training,
+                    )
+                contextual_embeddings = torch.baddbmm(
+                    self._batched_contextual_linear_bias.to(
+                        contextual_input_embeddings.dtype
+                    ),
+                    contextual_input_embeddings.view(
+                        -1, self._max_contextual_seq_len, self._input_embedding_dim
+                    ).transpose(0, 1),
+                    self._batched_contextual_linear_weights.to(
+                        contextual_input_embeddings.dtype
+                    ),
+                ).transpose(0, 1)
+
+            # content embeddings
+            seq_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(seq_lengths)
+            target_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(num_targets)
+            uih_offsets = seq_offsets - target_offsets
+            content_embeddings = self._content_encoder(
+                max_uih_len=max_uih_len,
+                max_targets=max_targets,
+                uih_offsets=uih_offsets,
+                target_offsets=target_offsets,
+                seq_embeddings=seq_embeddings,
+                seq_payloads=seq_payloads,
+            )
+            content_embeddings = self._content_embedding_mlp(
+                seq_embeddings=content_embeddings,
+                seq_offsets=seq_offsets,
+                max_seq_len=max_seq_len,
+                contextual_embeddings=pmlp_contextual_embeddings,
+            )
+
+            # action embeddings
+            action_embeddings = self._action_encoder(
+                max_uih_len=max_uih_len,
+                max_targets=max_targets,
+                uih_offsets=uih_offsets,
+                target_offsets=target_offsets,
+                seq_embeddings=seq_embeddings,
+                seq_payloads=seq_payloads,
+            ).to(seq_embeddings.dtype)
+            action_embeddings = self._action_embedding_mlp(
+                seq_embeddings=action_embeddings,
+                seq_offsets=seq_offsets,
+                max_seq_len=max_seq_len,
+                contextual_embeddings=pmlp_contextual_embeddings,
+            )
+
+            (
+                output_max_seq_len,
+                output_total_uih_len,
+                output_total_targets,
+                output_seq_lengths,
+                output_seq_offsets,
+                output_seq_timestamps,
+                output_seq_embeddings,
+                output_num_targets,
+            ) = self.combine_embeddings(
+                max_uih_len=max_uih_len,
+                max_targets=max_targets,
+                total_uih_len=total_uih_len,
+                total_targets=total_targets,
+                seq_lengths=seq_lengths,
+                seq_timestamps=seq_timestamps,
+                content_embeddings=content_embeddings,
+                action_embeddings=action_embeddings,
+                contextual_embeddings=contextual_embeddings,
+                num_targets=num_targets,
+            )
+
+        return (
+            output_max_seq_len,
+            output_total_uih_len,
+            output_total_targets,
+            output_seq_lengths,
+            output_seq_offsets,
+            output_seq_timestamps,
+            output_seq_embeddings,
+            output_num_targets,
+            seq_payloads,
+        )
+
+    def interleave_targets(self) -> bool:
+        return self.is_train and self._enable_interleaving
diff --git a/recommendation_v4/generative_recommenders/modules/contextualize_mlps.py b/recommendation_v4/generative_recommenders/modules/contextualize_mlps.py
new file mode 100644
index 000000000..dc49effeb
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/contextualize_mlps.py
@@ -0,0 +1,141 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+import abc
+from typing import Optional
+
+import torch
+from generative_recommenders.common import HammerModule, init_mlp_weights_optional_bias
+from generative_recommenders.ops.jagged_tensors import jagged_dense_bmm_broadcast_add
+from generative_recommenders.ops.layer_norm import LayerNorm, SwishLayerNorm
+from libfb.py.pyre import none_throws
+
+
+class ContextualizedMLP(HammerModule):
+    @abc.abstractmethod
+    def forward(
+        self,
+        max_seq_len: int,
+        seq_embeddings: torch.Tensor,
+        seq_offsets: torch.Tensor,
+        contextual_embeddings: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        """
+        Args:
+            seq_embeddings: (L, D)
+            seq_offsets: (B + 1,)
+            max_seq_len: int
+            contextual_embeddings: (B, D')
+        """
+        pass
+
+
+class SimpleContextualizedMLP(ContextualizedMLP):
+    def __init__(
+        self,
+        sequential_input_dim: int,
+        sequential_output_dim: int,
+        hidden_dim: int,
+        is_inference: bool = False,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+        self._mlp: torch.nn.Module = torch.nn.Sequential(
+            torch.nn.Linear(
+                in_features=sequential_input_dim,
+                out_features=hidden_dim,
+            ),
+            SwishLayerNorm(hidden_dim, is_inference=is_inference),
+            torch.nn.Linear(
+                in_features=hidden_dim,
+                out_features=sequential_output_dim,
+            ),
+            LayerNorm(sequential_output_dim),
+        ).apply(init_mlp_weights_optional_bias)
+
+    def forward(
+        self,
+        seq_embeddings: torch.Tensor,
+        seq_offsets: torch.Tensor,
+        max_seq_len: int,
+        contextual_embeddings: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        return self._mlp(seq_embeddings)
+
+
+class ParameterizedContextualizedMLP(ContextualizedMLP):
+    def __init__(
+        self,
+        contextual_embedding_dim: int,
+        sequential_input_dim: int,
+        sequential_output_dim: int,
+        hidden_dim: int,
+        is_inference: bool = False,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+
+        self._sequential_input_dim: int = sequential_input_dim
+        self._sequential_output_dim: int = sequential_output_dim
+
+        self._dense_features_compress: torch.nn.Module = torch.nn.Linear(
+            in_features=contextual_embedding_dim,
+            out_features=hidden_dim,
+        ).apply(init_mlp_weights_optional_bias)
+
+        self._attn_raw_weights: torch.nn.Module = torch.nn.Sequential(
+            torch.nn.Linear(
+                in_features=hidden_dim,
+                out_features=sequential_input_dim * sequential_output_dim,
+            ),
+        ).apply(init_mlp_weights_optional_bias)
+
+        self._attn_weights_norm: torch.nn.Module = torch.nn.LayerNorm(
+            [sequential_input_dim, sequential_output_dim]
+        )
+
+        self._res_weights: torch.nn.Module = torch.nn.Sequential(
+            torch.nn.Linear(
+                in_features=hidden_dim,
+                out_features=hidden_dim,
+            ),
+            SwishLayerNorm(hidden_dim),
+            torch.nn.Linear(
+                in_features=hidden_dim,
+                out_features=sequential_output_dim,
+            ),
+        ).apply(init_mlp_weights_optional_bias)
+
+    def forward(
+        self,
+        seq_embeddings: torch.Tensor,
+        seq_offsets: torch.Tensor,
+        max_seq_len: int,
+        contextual_embeddings: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        shared_input = self._dense_features_compress(none_throws(contextual_embeddings))
+        attn_weights = self._attn_weights_norm(
+            self._attn_raw_weights(shared_input).reshape(
+                -1, self._sequential_input_dim, self._sequential_output_dim
+            )
+        )
+        return jagged_dense_bmm_broadcast_add(
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            jagged=seq_embeddings,
+            dense=attn_weights.to(seq_embeddings.dtype),
+            bias=self._res_weights(shared_input),
+            kernel=self.hammer_kernel(),
+        )
diff --git a/recommendation_v4/generative_recommenders/modules/dlrm_hstu.py b/recommendation_v4/generative_recommenders/modules/dlrm_hstu.py
new file mode 100644
index 000000000..f11bb226a
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/dlrm_hstu.py
@@ -0,0 +1,757 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+
+import logging
+from dataclasses import dataclass, field
+from typing import Dict, List, NamedTuple, Optional, Tuple
+
+import torch
+from generative_recommenders.common import (
+    fx_infer_max_len,
+    fx_mark_length_features,
+    HammerKernel,
+    HammerModule,
+    init_mlp_weights_optional_bias,
+    set_static_max_seq_lens,
+)
+from generative_recommenders.modules.hstu_transducer import HSTUTransducer
+from generative_recommenders.modules.multitask_module import (
+    DefaultMultitaskModule,
+    MultitaskTaskType,
+    TaskConfig,
+)
+from generative_recommenders.modules.positional_encoder import HSTUPositionalEncoder
+from generative_recommenders.modules.postprocessors import (
+    LayerNormPostprocessor,
+    TimestampLayerNormPostprocessor,
+)
+from generative_recommenders.modules.preprocessors import ContextualPreprocessor
+from generative_recommenders.modules.stu import STU, STULayer, STULayerConfig, STUStack
+from generative_recommenders.ops.jagged_tensors import concat_2D_jagged
+from generative_recommenders.ops.layer_norm import LayerNorm, SwishLayerNorm
+from torch.autograd.profiler import record_function
+from torchrec import KeyedJaggedTensor
+from torchrec.modules.embedding_configs import EmbeddingConfig
+from torchrec.modules.embedding_modules import EmbeddingCollection
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+def fx_total_targets(num_candidates: torch.Tensor) -> int:
+    """Sum a per-sample candidate-count tensor to a Python int.
+
+    Wrapped with ``torch.fx.wrap`` so ``TrainPipelineSparseDist``'s symbolic
+    trace treats it as an opaque leaf instead of recursing into the data-
+    dependent ``int(Proxy.sum().item())`` (which raises during tracing).
+    """
+    return int(num_candidates.sum().item())
+
+
+torch.fx.wrap("fx_infer_max_len")
+torch.fx.wrap("fx_total_targets")
+torch.fx.wrap("len")
+
+
+class SequenceEmbedding(NamedTuple):
+    lengths: torch.Tensor
+    embedding: torch.Tensor
+
+
+@dataclass
+class DlrmHSTUConfig:
+    max_seq_len: int = 16384
+    max_num_candidates: int = 10
+    max_num_candidates_inference: int = 5
+    hstu_num_heads: int = 1
+    hstu_attn_linear_dim: int = 256
+    hstu_attn_qk_dim: int = 128
+    hstu_attn_num_layers: int = 12
+    hstu_embedding_table_dim: int = 192
+    hstu_preprocessor_hidden_dim: int = 256
+    hstu_transducer_embedding_dim: int = 0
+    hstu_group_norm: bool = False
+    hstu_input_dropout_ratio: float = 0.2
+    hstu_linear_dropout_rate: float = 0.2
+    contextual_feature_to_max_length: Dict[str, int] = field(default_factory=dict)
+    contextual_feature_to_min_uih_length: Dict[str, int] = field(default_factory=dict)
+    candidates_weight_feature_name: str = ""
+    candidates_watchtime_feature_name: str = ""
+    candidates_querytime_feature_name: str = ""
+    causal_multitask_weights: float = 0.2
+    multitask_configs: List[TaskConfig] = field(default_factory=list)
+    user_embedding_feature_names: List[str] = field(default_factory=list)
+    item_embedding_feature_names: List[str] = field(default_factory=list)
+    uih_post_id_feature_name: str = ""
+    uih_action_time_feature_name: str = ""
+    uih_weight_feature_name: str = ""
+    hstu_uih_feature_names: List[str] = field(default_factory=list)
+    hstu_candidate_feature_names: List[str] = field(default_factory=list)
+    merge_uih_candidate_feature_mapping: List[Tuple[str, str]] = field(
+        default_factory=list
+    )
+    action_weights: Optional[List[int]] = None
+    action_embedding_init_std: float = 0.1
+    enable_postprocessor: bool = True
+    use_layer_norm_postprocessor: bool = False
+
+
+def _get_supervision_labels_and_weights(
+    supervision_bitmasks: torch.Tensor,
+    watchtime_sequence: torch.Tensor,
+    task_configs: List[TaskConfig],
+) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]:
+    supervision_labels: Dict[str, torch.Tensor] = {}
+    supervision_weights: Dict[str, torch.Tensor] = {}
+    for task in task_configs:
+        if task.task_type == MultitaskTaskType.REGRESSION:
+            supervision_labels[task.task_name] = watchtime_sequence.to(torch.float32)
+        elif task.task_type == MultitaskTaskType.BINARY_CLASSIFICATION:
+            supervision_labels[task.task_name] = (
+                torch.bitwise_and(supervision_bitmasks, task.task_weight) > 0
+            ).to(torch.float32)
+        else:
+            raise RuntimeError("Unsupported MultitaskTaskType")
+    return supervision_labels, supervision_weights
+
+
+class DlrmHSTU(HammerModule):
+    def __init__(  # noqa C901
+        self,
+        hstu_configs: DlrmHSTUConfig,
+        embedding_tables: Dict[str, EmbeddingConfig],
+        is_inference: bool,
+        is_dense: bool = False,
+        bf16_training: bool = True,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+        logger.info(f"Initialize HSTU module with configs {hstu_configs}")
+        # When True, forward() takes the whole `Samples` batch as its single
+        # positional arg and reads the pre-merged sparse KJT off it. This keeps
+        # the EmbeddingCollection's input a plain getattr on the batch placeholder
+        # so TorchRec's TrainPipelineSparseDist can pipeline its input_dist. Set
+        # by build_train_pipeline(); leave False for eager / inference / eval.
+        self._pipeline_mode: bool = False
+        self._hstu_configs = hstu_configs
+        self._bf16_training: bool = bf16_training
+        # Last batch's jagged FLOPs/sample (0-d tensor on GPU). Populated by
+        # main_forward; MetricsLogger reads + .item()s on each compute_and_log
+        # to compute tflops_real/gpu and hfu (vs dense yardstick from
+        # get_num_flops_per_sample()).
+        self._last_jagged_flops_per_sample: Optional[torch.Tensor] = None
+        set_static_max_seq_lens([self._hstu_configs.max_seq_len])
+
+        if not is_dense:
+            self._embedding_collection: EmbeddingCollection = EmbeddingCollection(
+                tables=list(embedding_tables.values()),
+                need_indices=False,
+                device=torch.device("meta"),
+            )
+
+        # multitask configs must be sorted by task types
+        self._multitask_configs: List[TaskConfig] = hstu_configs.multitask_configs
+        self._multitask_module = DefaultMultitaskModule(
+            task_configs=self._multitask_configs,
+            embedding_dim=hstu_configs.hstu_transducer_embedding_dim,
+            prediction_fn=lambda in_dim, num_tasks: torch.nn.Sequential(
+                torch.nn.Linear(in_features=in_dim, out_features=512),
+                SwishLayerNorm(512),
+                torch.nn.Linear(in_features=512, out_features=num_tasks),
+            ).apply(init_mlp_weights_optional_bias),
+            causal_multitask_weights=hstu_configs.causal_multitask_weights,
+            is_inference=self._is_inference,
+        )
+        self._additional_embedding_features: List[str] = [
+            uih_feature_name
+            for (
+                uih_feature_name,
+                candidate_feature_name,
+            ) in self._hstu_configs.merge_uih_candidate_feature_mapping
+            if (
+                candidate_feature_name
+                in self._hstu_configs.item_embedding_feature_names
+            )
+            and (uih_feature_name in self._hstu_configs.user_embedding_feature_names)
+            and (uih_feature_name is not self._hstu_configs.uih_post_id_feature_name)
+        ]
+
+        # preprocessor setup
+        preprocessor = ContextualPreprocessor(
+            input_embedding_dim=hstu_configs.hstu_embedding_table_dim,
+            hidden_dim=hstu_configs.hstu_preprocessor_hidden_dim,
+            output_embedding_dim=hstu_configs.hstu_transducer_embedding_dim,
+            contextual_feature_to_max_length=hstu_configs.contextual_feature_to_max_length,
+            contextual_feature_to_min_uih_length=hstu_configs.contextual_feature_to_min_uih_length,
+            action_embedding_dim=8,
+            action_feature_name=self._hstu_configs.uih_weight_feature_name,
+            action_weights=self._hstu_configs.action_weights,
+            action_embedding_init_std=self._hstu_configs.action_embedding_init_std,
+            additional_embedding_features=self._additional_embedding_features,
+            is_inference=is_inference,
+        )
+
+        # positional encoder
+        positional_encoder = HSTUPositionalEncoder(
+            num_position_buckets=8192,
+            num_time_buckets=2048,
+            embedding_dim=hstu_configs.hstu_transducer_embedding_dim,
+            contextual_seq_len=sum(
+                dict(hstu_configs.contextual_feature_to_max_length).values()
+            ),
+            is_inference=self._is_inference,
+        )
+
+        if hstu_configs.enable_postprocessor:
+            if hstu_configs.use_layer_norm_postprocessor:
+                postprocessor = LayerNormPostprocessor(
+                    embedding_dim=hstu_configs.hstu_transducer_embedding_dim,
+                    eps=1e-5,
+                    is_inference=self._is_inference,
+                )
+            else:
+                postprocessor = TimestampLayerNormPostprocessor(
+                    embedding_dim=hstu_configs.hstu_transducer_embedding_dim,
+                    time_duration_features=[
+                        (60 * 60, 24),  # hour of day
+                        (24 * 60 * 60, 7),  # day of week
+                        # (24 * 60 * 60, 365), # time of year (approximate)
+                    ],
+                    eps=1e-5,
+                    is_inference=self._is_inference,
+                )
+        else:
+            postprocessor = None
+
+        # construct HSTU
+        stu_module: STU = STUStack(
+            stu_list=[
+                STULayer(
+                    config=STULayerConfig(
+                        embedding_dim=hstu_configs.hstu_transducer_embedding_dim,
+                        num_heads=hstu_configs.hstu_num_heads,
+                        hidden_dim=hstu_configs.hstu_attn_linear_dim,
+                        attention_dim=hstu_configs.hstu_attn_qk_dim,
+                        output_dropout_ratio=hstu_configs.hstu_linear_dropout_rate,
+                        use_group_norm=hstu_configs.hstu_group_norm,
+                        causal=True,
+                        target_aware=True,
+                        max_attn_len=None,
+                        attn_alpha=None,
+                        recompute_normed_x=True,
+                        recompute_uvqk=True,
+                        recompute_y=True,
+                        sort_by_length=True,
+                        contextual_seq_len=0,
+                    ),
+                    is_inference=is_inference,
+                )
+                for _ in range(hstu_configs.hstu_attn_num_layers)
+            ],
+            is_inference=is_inference,
+        )
+        self._hstu_transducer: HSTUTransducer = HSTUTransducer(
+            stu_module=stu_module,
+            input_preprocessor=preprocessor,
+            output_postprocessor=postprocessor,
+            input_dropout_ratio=hstu_configs.hstu_input_dropout_ratio,
+            positional_encoder=positional_encoder,
+            is_inference=self._is_inference,
+            return_full_embeddings=False,
+            listwise=False,
+        )
+
+        # item embeddings
+        self._item_embedding_mlp: torch.nn.Module = torch.nn.Sequential(
+            torch.nn.Linear(
+                in_features=hstu_configs.hstu_embedding_table_dim
+                * len(self._hstu_configs.item_embedding_feature_names),
+                out_features=512,
+            ),
+            SwishLayerNorm(512),
+            torch.nn.Linear(
+                in_features=512,
+                out_features=hstu_configs.hstu_transducer_embedding_dim,
+            ),
+            LayerNorm(hstu_configs.hstu_transducer_embedding_dim),
+        ).apply(init_mlp_weights_optional_bias)
+
+    # -- FLOPs estimation -----------------------------------------------------
+    # Convention matches TorchTitan / Primus-DLRM: matmul = 6 × M × N × K
+    # (×3 fwd+bwd, ×2 FMA), attention = 2 matmuls (Q·K^T + att·V).
+    # Embedding lookups excluded — they're memory-bound, not compute.
+    #
+    # HSTU vs OneTrans: HSTU collapses attention + FFN into a single UVQK
+    # projection plus SiLU(U) ⊙ y elementwise gating. There is NO separate
+    # FFN block (which dominates FLOPs in a standard transformer), so HSTU
+    # is intentionally compute-leaner per layer for the same N.
+    def _hstu_layer_flops(
+        self, n_tokens_linear: float, n_tokens_attn_sq: float
+    ) -> float:
+        """Per-layer FLOPs given linear-op token count and attention-token²
+        count. Dense estimate uses ``N`` and ``N²``; jagged estimate
+        substitutes ``mean(s_i)`` and ``mean(s_i²)``."""
+        cfg = self._hstu_configs
+        D = cfg.hstu_embedding_table_dim
+        H = cfg.hstu_num_heads
+        hd = cfg.hstu_attn_linear_dim  # V/U head dim
+        qd = cfg.hstu_attn_qk_dim       # Q/K head dim
+        uvqk = 6 * n_tokens_linear * D * (2 * hd + 2 * qd) * H
+        attn = 6 * n_tokens_attn_sq * H * (qd + hd)  # Q·K^T + att·V
+        out = 6 * n_tokens_linear * (3 * H * hd) * D
+        return uvqk + attn + out
+
+    def get_num_flops_per_sample(self) -> float:
+        """Dense-equivalent fwd+bwd FLOPs per sample at ``max_seq_len``.
+
+        Used as the MFU yardstick (peak utilization the workload could
+        theoretically reach if every sample's sequence were the full padded
+        length). The actual ``tflops_real``/``hfu`` reported per step uses
+        the jagged estimate stashed by ``main_forward``.
+        """
+        cfg = self._hstu_configs
+        N = float(cfg.max_seq_len)
+        n_layers = cfg.hstu_attn_num_layers
+        flops = n_layers * self._hstu_layer_flops(
+            n_tokens_linear=N, n_tokens_attn_sq=N * N
+        )
+        # Multitask head (Linear(D, n_tasks)) — negligible but cheap to add.
+        n_tasks = len(self._multitask_configs)
+        if n_tasks > 0:
+            flops += 6 * n_tasks * cfg.hstu_embedding_table_dim
+        return float(flops)
+
+    def _compute_jagged_flops_per_sample(
+        self,
+        uih_seq_lengths: torch.Tensor,
+        num_candidates: torch.Tensor,
+    ) -> torch.Tensor:
+        """Jagged fwd+bwd FLOPs per sample for THIS batch's actual lengths.
+
+        Per-sample merged sequence length s_i = uih_seq_lengths[i] +
+        num_candidates[i]. Returns a 0-d tensor on the batch's device;
+        caller should ``.item()`` it (one D→H sync per logging interval).
+        """
+        s = (uih_seq_lengths + num_candidates).float()
+        mean_s = s.mean()
+        mean_s_sq = (s * s).mean()
+        cfg = self._hstu_configs
+        n_layers = cfg.hstu_attn_num_layers
+        flops = n_layers * (
+            6 * mean_s * cfg.hstu_embedding_table_dim
+              * (2 * cfg.hstu_attn_linear_dim + 2 * cfg.hstu_attn_qk_dim)
+              * cfg.hstu_num_heads
+            + 6 * mean_s_sq * cfg.hstu_num_heads
+              * (cfg.hstu_attn_qk_dim + cfg.hstu_attn_linear_dim)
+            + 6 * mean_s * (3 * cfg.hstu_num_heads * cfg.hstu_attn_linear_dim)
+              * cfg.hstu_embedding_table_dim
+        )
+        n_tasks = len(self._multitask_configs)
+        if n_tasks > 0:
+            flops = flops + 6 * n_tasks * cfg.hstu_embedding_table_dim
+        return flops
+
+    def _construct_payload(
+        self,
+        payload_features: Dict[str, torch.Tensor],
+        seq_embeddings: Dict[str, SequenceEmbedding],
+    ) -> Dict[str, torch.Tensor]:
+        if len(self._hstu_configs.contextual_feature_to_max_length) > 0:
+            contextual_offsets: List[torch.Tensor] = []
+            for x in self._hstu_configs.contextual_feature_to_max_length.keys():
+                contextual_offsets.append(
+                    torch.ops.fbgemm.asynchronous_complete_cumsum(
+                        seq_embeddings[x].lengths
+                    )
+                )
+        else:
+            # Dummy, offsets are unused
+            contextual_offsets = torch.empty((0, 0))
+        if torch.jit.is_scripting():
+            # Explicit loops are TS-clean (avoid the dict-merge / dict-comp
+            # idioms below, which TorchScript cannot script).
+            out: Dict[str, torch.Tensor] = {}
+            for k, v in payload_features.items():
+                out[k] = v
+            for x in self._hstu_configs.contextual_feature_to_max_length.keys():
+                out[x] = seq_embeddings[x].embedding
+            i = 0
+            for x in self._hstu_configs.contextual_feature_to_max_length.keys():
+                # pyre-ignore[6]
+                out[x + "_offsets"] = contextual_offsets[i]
+                i += 1
+            for x in self._additional_embedding_features:
+                out[x] = seq_embeddings[x].embedding
+            return out
+        return {
+            **payload_features,
+            **{
+                x: seq_embeddings[x].embedding
+                for x in self._hstu_configs.contextual_feature_to_max_length.keys()
+            },
+            **{
+                x + "_offsets": contextual_offsets[i]
+                for i, x in enumerate(
+                    list(self._hstu_configs.contextual_feature_to_max_length.keys())
+                )
+            },
+            **{
+                x: seq_embeddings[x].embedding
+                for x in self._additional_embedding_features
+            },
+        }
+
+    def _user_forward(
+        self,
+        max_uih_len: int,
+        max_candidates: int,
+        seq_embeddings: Dict[str, SequenceEmbedding],
+        payload_features: Dict[str, torch.Tensor],
+        num_candidates: torch.Tensor,
+        total_uih_len: Optional[int] = None,
+        total_targets: Optional[int] = None,
+    ) -> torch.Tensor:
+        source_lengths = seq_embeddings[
+            self._hstu_configs.uih_post_id_feature_name
+        ].lengths
+        source_timestamps = concat_2D_jagged(
+            max_seq_len=max_uih_len + max_candidates,
+            max_len_left=max_uih_len,
+            offsets_left=payload_features["uih_offsets"],
+            values_left=payload_features[
+                self._hstu_configs.uih_action_time_feature_name
+            ].unsqueeze(-1),
+            max_len_right=max_candidates,
+            offsets_right=payload_features["candidate_offsets"],
+            values_right=payload_features[
+                self._hstu_configs.candidates_querytime_feature_name
+            ].unsqueeze(-1),
+            kernel=self.hammer_kernel(),
+        ).squeeze(-1)
+        if total_targets is None:
+            total_targets = fx_total_targets(num_candidates)
+        if total_uih_len is None:
+            total_uih_len = source_timestamps.numel() - total_targets
+        embedding = seq_embeddings[
+            self._hstu_configs.uih_post_id_feature_name
+        ].embedding
+        dtype = embedding.dtype
+        if (not self.is_inference) and self._bf16_training:
+            embedding = embedding.to(torch.bfloat16)
+        if torch.jit.is_scripting():
+            # TorchScript does not support ``with torch.autocast(...)``.
+            # In script-mode inference the dense path is already in bf16
+            # (move_sparse_output_to_device upcasts on the C++ side), so
+            # autocast is a no-op for the path the predictor exercises.
+            candidates_user_embeddings, _ = self._hstu_transducer(
+                max_uih_len=max_uih_len,
+                max_targets=max_candidates,
+                total_uih_len=total_uih_len,
+                total_targets=total_targets,
+                seq_embeddings=embedding,
+                seq_lengths=source_lengths,
+                seq_timestamps=source_timestamps,
+                seq_payloads=self._construct_payload(
+                    payload_features=payload_features,
+                    seq_embeddings=seq_embeddings,
+                ),
+                num_targets=num_candidates,
+            )
+        else:
+            with torch.autocast(
+                "cuda",
+                dtype=torch.bfloat16,
+                enabled=(not self.is_inference) and self._bf16_training,
+            ):
+                candidates_user_embeddings, _ = self._hstu_transducer(
+                    max_uih_len=max_uih_len,
+                    max_targets=max_candidates,
+                    total_uih_len=total_uih_len,
+                    total_targets=total_targets,
+                    seq_embeddings=embedding,
+                    seq_lengths=source_lengths,
+                    seq_timestamps=source_timestamps,
+                    seq_payloads=self._construct_payload(
+                        payload_features=payload_features,
+                        seq_embeddings=seq_embeddings,
+                    ),
+                    num_targets=num_candidates,
+                )
+        candidates_user_embeddings = candidates_user_embeddings.to(dtype)
+
+        return candidates_user_embeddings
+
+    def _item_forward(
+        self,
+        seq_embeddings: Dict[str, SequenceEmbedding],
+    ) -> torch.Tensor:  # [L, D]
+        all_embeddings = torch.cat(
+            [
+                seq_embeddings[name].embedding
+                for name in self._hstu_configs.item_embedding_feature_names
+            ],
+            dim=-1,
+        )
+        item_embeddings = self._item_embedding_mlp(all_embeddings)
+        return item_embeddings
+
+    def preprocess(
+        self,
+        uih_features: KeyedJaggedTensor,
+        candidates_features: KeyedJaggedTensor,
+        merged_sparse_features: Optional[KeyedJaggedTensor] = None,
+    ) -> Tuple[
+        Dict[str, SequenceEmbedding],
+        Dict[str, torch.Tensor],
+        int,
+        torch.Tensor,
+        int,
+        torch.Tensor,
+    ]:
+        # Embedding lookup for uih + candidates. When the caller (the pipeline
+        # path) supplies the pre-merged KJT from the batch, feed it straight to
+        # the EmbeddingCollection: that keeps the lookup's input a plain getattr
+        # off the batch so TorchRec's TrainPipelineSparseDist can hoist its
+        # input_dist into the prefetch stage. Building it here (cat +
+        # from_lengths_sync's .sync()) is an "input modification" that makes
+        # TorchRec skip pipelining the embedding collection.
+        if merged_sparse_features is None:
+            merged_sparse_features = KeyedJaggedTensor.from_lengths_sync(
+                keys=uih_features.keys() + candidates_features.keys(),
+                values=torch.cat(
+                    [uih_features.values(), candidates_features.values()],
+                    dim=0,
+                ),
+                lengths=torch.cat(
+                    [uih_features.lengths(), candidates_features.lengths()],
+                    dim=0,
+                ),
+            )
+        seq_embeddings_dict = self._embedding_collection(merged_sparse_features)
+        num_candidates = fx_mark_length_features(
+            candidates_features.lengths().view(len(candidates_features.keys()), -1)
+        )[0]
+        max_num_candidates = fx_infer_max_len(num_candidates)
+        uih_seq_lengths = uih_features[
+            self._hstu_configs.uih_post_id_feature_name
+        ].lengths()
+        max_uih_len = fx_infer_max_len(uih_seq_lengths)
+
+        # prepare payload features
+        payload_features: Dict[str, torch.Tensor] = {}
+        for (
+            uih_feature_name,
+            candidate_feature_name,
+        ) in self._hstu_configs.merge_uih_candidate_feature_mapping:
+            if (
+                candidate_feature_name
+                not in self._hstu_configs.item_embedding_feature_names
+                and uih_feature_name
+                not in self._hstu_configs.user_embedding_feature_names
+            ):
+                values_left = uih_features[uih_feature_name].values()
+                if self._is_inference and (
+                    candidate_feature_name
+                    == self._hstu_configs.candidates_weight_feature_name
+                    or candidate_feature_name
+                    == self._hstu_configs.candidates_watchtime_feature_name
+                ):
+                    total_candidates = torch.sum(num_candidates).item()
+                    values_right = torch.zeros(
+                        total_candidates,  # pyre-ignore
+                        dtype=torch.int64,
+                        device=values_left.device,
+                    )
+                else:
+                    values_right = candidates_features[candidate_feature_name].values()
+                payload_features[uih_feature_name] = values_left
+                payload_features[candidate_feature_name] = values_right
+        payload_features["uih_offsets"] = torch.ops.fbgemm.asynchronous_complete_cumsum(
+            uih_seq_lengths
+        )
+        payload_features["candidate_offsets"] = (
+            torch.ops.fbgemm.asynchronous_complete_cumsum(num_candidates)
+        )
+
+        seq_embeddings = {
+            k: SequenceEmbedding(
+                lengths=seq_embeddings_dict[k].lengths(),
+                embedding=seq_embeddings_dict[k].values(),
+            )
+            for k in self._hstu_configs.user_embedding_feature_names
+            + self._hstu_configs.item_embedding_feature_names
+        }
+
+        return (
+            seq_embeddings,
+            payload_features,
+            max_uih_len,
+            uih_seq_lengths,
+            max_num_candidates,
+            num_candidates,
+        )
+
+    def main_forward(
+        self,
+        seq_embeddings: Dict[str, SequenceEmbedding],
+        payload_features: Dict[str, torch.Tensor],
+        max_uih_len: int,
+        uih_seq_lengths: torch.Tensor,
+        max_num_candidates: int,
+        num_candidates: torch.Tensor,
+        total_uih_len: Optional[int] = None,
+        total_targets: Optional[int] = None,
+    ) -> Tuple[
+        torch.Tensor,
+        torch.Tensor,
+        Dict[str, torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+    ]:
+        # Stash this batch's jagged FLOPs/sample for MetricsLogger to read.
+        # No D->H sync: the .item() happens once per metric_log_frequency in
+        # the trainer, not on every step. Eval-mode batches also produce a
+        # stash but the trainer only consumes it on train batches.
+        if not torch.jit.is_scripting():
+            self._last_jagged_flops_per_sample = (
+                self._compute_jagged_flops_per_sample(
+                    uih_seq_lengths=uih_seq_lengths,
+                    num_candidates=num_candidates,
+                )
+            )
+
+        # merge uih and candidates embeddings
+        for (
+            uih_feature_name,
+            candidate_feature_name,
+        ) in self._hstu_configs.merge_uih_candidate_feature_mapping:
+            if uih_feature_name in seq_embeddings:
+                seq_embeddings[uih_feature_name] = SequenceEmbedding(
+                    lengths=uih_seq_lengths + num_candidates,
+                    embedding=concat_2D_jagged(
+                        max_seq_len=max_uih_len + max_num_candidates,
+                        max_len_left=max_uih_len,
+                        offsets_left=torch.ops.fbgemm.asynchronous_complete_cumsum(
+                            uih_seq_lengths
+                        ),
+                        values_left=seq_embeddings[uih_feature_name].embedding,
+                        max_len_right=max_num_candidates,
+                        offsets_right=torch.ops.fbgemm.asynchronous_complete_cumsum(
+                            num_candidates
+                        ),
+                        values_right=seq_embeddings[candidate_feature_name].embedding,
+                        kernel=self.hammer_kernel(),
+                    ),
+                )
+
+        with record_function("## item_forward ##"):
+            candidates_item_embeddings = self._item_forward(
+                seq_embeddings,
+            )
+        with record_function("## user_forward ##"):
+            candidates_user_embeddings = self._user_forward(
+                max_uih_len=max_uih_len,
+                max_candidates=max_num_candidates,
+                seq_embeddings=seq_embeddings,
+                payload_features=payload_features,
+                num_candidates=num_candidates,
+                total_uih_len=total_uih_len,
+                total_targets=total_targets,
+            )
+        with record_function("## multitask_module ##"):
+            supervision_labels, supervision_weights = (
+                _get_supervision_labels_and_weights(
+                    supervision_bitmasks=payload_features[
+                        self._hstu_configs.candidates_weight_feature_name
+                    ],
+                    watchtime_sequence=payload_features[
+                        self._hstu_configs.candidates_watchtime_feature_name
+                    ],
+                    task_configs=self._multitask_configs,
+                )
+            )
+            mt_target_preds, mt_target_labels, mt_target_weights, mt_losses = (
+                self._multitask_module(
+                    encoded_user_embeddings=candidates_user_embeddings,
+                    item_embeddings=candidates_item_embeddings,
+                    supervision_labels=supervision_labels,
+                    supervision_weights=supervision_weights,
+                )
+            )
+
+        aux_losses: Dict[str, torch.Tensor] = {}
+        if not self._is_inference and self.training:
+            for i, task in enumerate(self._multitask_configs):
+                aux_losses[task.task_name] = mt_losses[i]
+
+        return (
+            candidates_user_embeddings,
+            candidates_item_embeddings,
+            aux_losses,
+            mt_target_preds,
+            mt_target_labels,
+            mt_target_weights,
+        )
+
+    def forward(
+        self,
+        uih_features: KeyedJaggedTensor,
+        candidates_features: Optional[KeyedJaggedTensor] = None,
+        merged_sparse_features: Optional[KeyedJaggedTensor] = None,
+    ) -> Tuple[
+        torch.Tensor,
+        torch.Tensor,
+        Dict[str, torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+    ]:
+        # Pipeline mode: TorchRec fx-traces this forward (via DMP.module) and the
+        # pipeline calls it with the single `Samples` batch. Unpacking the KJTs
+        # here — rather than in the wrapper — makes the EmbeddingCollection's
+        # input `batch.merged_sparse_features` a getattr off the batch placeholder,
+        # which is what lets TrainPipelineSparseDist hoist the embedding input_dist
+        # into the prefetch stage. Guarded from TorchScript (inference path).
+        if not torch.jit.is_scripting() and self._pipeline_mode:
+            batch = uih_features
+            uih_features = batch.uih_features_kjt
+            candidates_features = batch.candidates_features_kjt
+            merged_sparse_features = batch.merged_sparse_features
+
+        with record_function("## preprocess ##"):
+            (
+                seq_embeddings,
+                payload_features,
+                max_uih_len,
+                uih_seq_lengths,
+                max_num_candidates,
+                num_candidates,
+            ) = self.preprocess(
+                uih_features=uih_features,
+                candidates_features=candidates_features,
+                merged_sparse_features=merged_sparse_features,
+            )
+
+        with record_function("## main_forward ##"):
+            return self.main_forward(
+                seq_embeddings=seq_embeddings,
+                payload_features=payload_features,
+                max_uih_len=max_uih_len,
+                uih_seq_lengths=uih_seq_lengths,
+                max_num_candidates=max_num_candidates,
+                num_candidates=num_candidates,
+            )
diff --git a/recommendation_v4/generative_recommenders/modules/dynamic_stu.py b/recommendation_v4/generative_recommenders/modules/dynamic_stu.py
new file mode 100644
index 000000000..e1fe8ad16
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/dynamic_stu.py
@@ -0,0 +1,304 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+import abc
+import contextlib
+from typing import Any, Generator, Optional, Tuple
+
+import torch
+from generative_recommenders.common import fx_infer_max_len
+from generative_recommenders.modules.stu import STU
+from generative_recommenders.ops.jagged_tensors import (
+    hstu_concat_l2_embeddings,
+    hstu_split_l2_embeddings,
+)
+
+
+try:
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+except OSError:
+    pass
+
+
+@contextlib.contextmanager
+# pyre-ignore[3]
+def _freeze_rng_state() -> Generator[Any, None, None]:
+    rng_state = torch.get_rng_state()
+    if torch.cuda.is_available():
+        cuda_rng_state = torch.cuda.get_rng_state()
+    try:
+        yield
+    finally:
+        if torch.cuda.is_available():
+            # pyre-ignore[61]
+            torch.cuda.set_rng_state(cuda_rng_state)
+        torch.set_rng_state(rng_state)
+
+
+class DynamicSTU(STU):
+    def __init__(self, stu: STU, is_inference: bool) -> None:
+        super().__init__(is_inference)
+        self._stu = stu
+
+    @abc.abstractmethod
+    def _preprocess(
+        self,
+        x: torch.Tensor,
+        x_lengths: torch.Tensor,
+        x_offsets: torch.Tensor,
+        max_seq_len: int,
+        num_targets: torch.Tensor,
+        max_kv_caching_len: int = 0,
+        kv_caching_lengths: Optional[torch.Tensor] = None,
+    ) -> Tuple[
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        int,
+        torch.Tensor,
+        int,
+        Optional[torch.Tensor],
+    ]:
+        pass
+
+    @abc.abstractmethod
+    def _postprocess(
+        self,
+        stu_output: torch.Tensor,
+    ) -> torch.Tensor:
+        pass
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_lengths: torch.Tensor,
+        x_offsets: torch.Tensor,
+        max_seq_len: int,
+        num_targets: torch.Tensor,
+        max_kv_caching_len: int = 0,
+        kv_caching_lengths: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        (
+            x,
+            x_lengths,
+            x_offsets,
+            max_seq_len,
+            num_targets,
+            max_kv_caching_len,
+            kv_caching_lengths,
+        ) = self._preprocess(
+            x=x,
+            x_lengths=x_lengths,
+            x_offsets=x_offsets,
+            max_seq_len=max_seq_len,
+            num_targets=num_targets,
+            max_kv_caching_len=max_kv_caching_len,
+            kv_caching_lengths=kv_caching_lengths,
+        )
+
+        stu_output = self._stu(
+            x=x,
+            x_lengths=x_lengths,
+            x_offsets=x_offsets,
+            max_seq_len=max_seq_len,
+            num_targets=num_targets,
+            max_kv_caching_len=max_kv_caching_len,
+            kv_caching_lengths=kv_caching_lengths,
+        )
+
+        return self._postprocess(
+            stu_output=stu_output,
+        )
+
+
+class SDSTU(DynamicSTU):
+    def __init__(
+        self,
+        stu: STU,
+        is_inference: bool,
+        dropout_ratio: float = 0.5,
+        seed: int = 0,
+    ) -> None:
+        """
+        Stochastic Depth STU
+        """
+        super().__init__(stu=stu, is_inference=is_inference)
+        self._dropout_ratio: float = dropout_ratio
+        self._iter: int = 0
+        self._seed: int = seed
+        self._skip_x: Optional[torch.Tensor] = None
+
+    def _preprocess(
+        self,
+        x: torch.Tensor,
+        x_lengths: torch.Tensor,
+        x_offsets: torch.Tensor,
+        max_seq_len: int,
+        num_targets: torch.Tensor,
+        max_kv_caching_len: int = 0,
+        kv_caching_lengths: Optional[torch.Tensor] = None,
+    ) -> Tuple[
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        int,
+        torch.Tensor,
+        int,
+        Optional[torch.Tensor],
+    ]:
+        if self.training:
+            with _freeze_rng_state():
+                torch.manual_seed(self._iter + self._seed)
+                prob = torch.rand(1)
+                if prob.item() <= self._dropout_ratio:
+                    new_x = torch.empty(size=(0, x.shape[1]), device=x.device)
+                    self._skip_x = x
+                    new_x_lengths = torch.zeros_like(x_lengths)
+                    new_x_offsets = torch.zeros_like(x_offsets)
+                    new_max_seq_len = 1
+                else:
+                    new_x = x
+                    new_x_lengths = x_lengths
+                    new_x_offsets = x_offsets
+                    new_max_seq_len = max_seq_len
+            self._iter += 1
+        else:
+            new_x = x
+            new_x_lengths = x_lengths
+            new_x_offsets = x_offsets
+            new_max_seq_len = max_seq_len
+        return (
+            new_x,
+            new_x_lengths,
+            new_x_offsets,
+            new_max_seq_len,
+            num_targets,
+            max_kv_caching_len,
+            kv_caching_lengths,
+        )
+
+    def _postprocess(
+        self,
+        stu_output: torch.Tensor,
+    ) -> torch.Tensor:
+        if self.training and self._skip_x is not None:
+            ret = self._skip_x
+            self._skip_x = None
+            return ret
+        else:
+            return stu_output
+
+
+@torch.fx.wrap
+def _fx_unwrap_optional_tuple_tensor(
+    optional: Optional[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]],
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    assert optional is not None, "Expected optional to be non-None"
+    return optional
+
+
+class L2STU(DynamicSTU):
+    def __init__(
+        self,
+        stu: STU,
+        max_l2_len: int,
+        is_inference: bool,
+        contextual_seq_len: int = 0,
+    ) -> None:
+        """
+        Stochastic Depth STU
+        """
+        super().__init__(stu=stu, is_inference=is_inference)
+        self._max_l2_len: int = max_l2_len
+        self._contextual_seq_len: int = contextual_seq_len
+        self._saved_tensors: Optional[
+            Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
+        ] = None
+        self._runtime_max_l2_len: int = 0
+        self._runtime_prefix_len: int = 0
+
+    def _preprocess(
+        self,
+        x: torch.Tensor,
+        x_lengths: torch.Tensor,
+        x_offsets: torch.Tensor,
+        max_seq_len: int,
+        num_targets: torch.Tensor,
+        max_kv_caching_len: int = 0,
+        kv_caching_lengths: Optional[torch.Tensor] = None,
+    ) -> Tuple[
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        int,
+        torch.Tensor,
+        int,
+        Optional[torch.Tensor],
+    ]:
+        prefix_lengths = (
+            x_lengths - self._max_l2_len - num_targets - self._contextual_seq_len
+        )
+        prefix_lengths = torch.clamp(prefix_lengths, min=0)
+        prefix_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(prefix_lengths)
+        l2_lengths = x_lengths - prefix_lengths
+        l2_offsets = x_offsets - prefix_offsets
+        self._runtime_max_l2_len: int = fx_infer_max_len(l2_lengths)
+        self._runtime_prefix_len: int = fx_infer_max_len(prefix_lengths)
+        prefix_x, l2_x = hstu_split_l2_embeddings(
+            max_seq_len=max_seq_len,
+            x=x,
+            prefix_offsets=prefix_offsets,
+            l2_offsets=l2_offsets,
+            contextual_seq_len=self._contextual_seq_len,
+            kernel=self.hammer_kernel(),
+        )
+        self._saved_tensors = (
+            prefix_offsets,
+            prefix_x,
+            l2_offsets,
+        )
+        return (
+            l2_x,
+            l2_lengths,
+            l2_offsets,
+            self._runtime_max_l2_len,
+            num_targets,
+            max_kv_caching_len,
+            kv_caching_lengths,
+        )
+
+    def _postprocess(
+        self,
+        stu_output: torch.Tensor,
+    ) -> torch.Tensor:
+        (
+            prefix_offsets,
+            prefix_x,
+            l2_offsets,
+        ) = _fx_unwrap_optional_tuple_tensor(self._saved_tensors)
+        self._saved_tensors = None
+        return hstu_concat_l2_embeddings(
+            max_prefix_len=self._runtime_prefix_len,
+            prefix_x=prefix_x,
+            prefix_offsets=prefix_offsets,
+            max_l2_len=self._runtime_max_l2_len,
+            l2_x=stu_output,
+            l2_offsets=l2_offsets,
+            contextual_seq_len=self._contextual_seq_len,
+            kernel=self.hammer_kernel(),
+        )
diff --git a/recommendation_v4/generative_recommenders/modules/hstu_transducer.py b/recommendation_v4/generative_recommenders/modules/hstu_transducer.py
new file mode 100644
index 000000000..ce91a67c9
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/hstu_transducer.py
@@ -0,0 +1,330 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import logging
+from typing import Dict, Optional, Tuple
+
+import torch
+from generative_recommenders.common import fx_unwrap_optional_tensor, HammerModule
+from generative_recommenders.modules.positional_encoder import HSTUPositionalEncoder
+from generative_recommenders.modules.postprocessors import (
+    L2NormPostprocessor,
+    OutputPostprocessor,
+)
+from generative_recommenders.modules.preprocessors import InputPreprocessor
+from generative_recommenders.modules.stu import STU
+from generative_recommenders.ops.jagged_tensors import split_2D_jagged
+from torch.profiler import record_function
+
+logger: logging.Logger = logging.getLogger(__name__)
+torch.fx.wrap("len")
+
+try:
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+except OSError:
+    pass
+
+
+@torch.fx.wrap
+def default_seq_payload(
+    seq_payloads: Optional[Dict[str, torch.Tensor]],
+) -> Dict[str, torch.Tensor]:
+    if seq_payloads is None:
+        return {}
+    else:
+        return torch.jit._unwrap_optional(seq_payloads)
+
+
+class HSTUTransducer(HammerModule):
+    def __init__(
+        self,
+        stu_module: STU,
+        input_preprocessor: InputPreprocessor,
+        output_postprocessor: Optional[OutputPostprocessor] = None,
+        input_dropout_ratio: float = 0.0,
+        positional_encoder: Optional[HSTUPositionalEncoder] = None,
+        is_inference: bool = True,
+        return_full_embeddings: bool = False,
+        listwise: bool = False,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+        self._stu_module = stu_module
+        self._input_preprocessor: InputPreprocessor = input_preprocessor
+        self._output_postprocessor: OutputPostprocessor = (
+            output_postprocessor
+            if output_postprocessor is not None
+            else L2NormPostprocessor(is_inference=is_inference)
+        )
+        assert self._is_inference == self._input_preprocessor._is_inference, (
+            f"input_preprocessor must have the same mode; self: {self._is_inference} vs input_preprocessor {self._input_preprocessor._is_inference}"
+        )
+        self._positional_encoder: Optional[HSTUPositionalEncoder] = positional_encoder
+        self._input_dropout_ratio: float = input_dropout_ratio
+        self._return_full_embeddings: bool = return_full_embeddings
+        self._listwise_training: bool = listwise and self.is_train
+
+        for name, m in self.named_modules():
+            if "_stu_module" in name:
+                continue
+            elif isinstance(m, torch.nn.Linear):
+                torch.nn.init.xavier_normal_(m.weight)
+            elif isinstance(m, torch.nn.LayerNorm):
+                if m.weight.dim() >= 2:
+                    torch.nn.init.xavier_normal_(m.weight)
+                if m.bias is not None and m.bias.dim() >= 2:
+                    torch.nn.init.xavier_normal_(m.bias)
+
+    def _preprocess(
+        self,
+        max_uih_len: int,
+        max_targets: int,
+        total_uih_len: int,
+        total_targets: int,
+        seq_lengths: torch.Tensor,
+        seq_timestamps: torch.Tensor,
+        seq_embeddings: torch.Tensor,
+        num_targets: torch.Tensor,
+        seq_payloads: Dict[str, torch.Tensor],
+    ) -> Tuple[
+        int,
+        int,
+        int,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        Dict[str, torch.Tensor],
+    ]:
+        seq_payloads = default_seq_payload(seq_payloads)
+
+        with record_function("hstu_input_preprocessor"):
+            (
+                output_max_seq_len,
+                output_total_uih_len,
+                output_total_targets,
+                output_seq_lengths,
+                output_seq_offsets,
+                output_seq_timestamps,
+                output_seq_embeddings,
+                output_num_targets,
+                output_seq_payloads,
+            ) = self._input_preprocessor(
+                max_uih_len=max_uih_len,
+                max_targets=max_targets,
+                total_uih_len=total_uih_len,
+                total_targets=total_targets,
+                seq_lengths=seq_lengths,
+                seq_timestamps=seq_timestamps,
+                seq_embeddings=seq_embeddings,
+                num_targets=num_targets,
+                seq_payloads=seq_payloads,
+            )
+
+        with record_function("hstu_positional_encoder"):
+            if self._positional_encoder is not None:
+                output_seq_embeddings = self._positional_encoder(
+                    max_seq_len=output_max_seq_len,
+                    seq_lengths=output_seq_lengths,
+                    seq_offsets=output_seq_offsets,
+                    seq_timestamps=output_seq_timestamps,
+                    seq_embeddings=output_seq_embeddings,
+                    num_targets=(
+                        None if self._listwise_training else output_num_targets
+                    ),
+                )
+
+        output_seq_embeddings = torch.nn.functional.dropout(
+            output_seq_embeddings,
+            p=self._input_dropout_ratio,
+            training=self.training,
+        )
+
+        return (
+            output_max_seq_len,
+            output_total_uih_len,
+            output_total_targets,
+            output_seq_lengths,
+            output_seq_offsets,
+            output_seq_timestamps,
+            output_seq_embeddings,
+            output_num_targets,
+            output_seq_payloads,
+        )
+
+    def _hstu_compute(
+        self,
+        max_seq_len: int,
+        seq_lengths: torch.Tensor,
+        seq_offsets: torch.Tensor,
+        seq_timestamps: torch.Tensor,
+        seq_embeddings: torch.Tensor,
+        num_targets: torch.Tensor,
+    ) -> torch.Tensor:
+        with record_function("hstu"):
+            seq_embeddings = self._stu_module(
+                max_seq_len=max_seq_len,
+                x=seq_embeddings,
+                x_lengths=seq_lengths,
+                x_offsets=seq_offsets,
+                num_targets=(None if self._listwise_training else num_targets),
+            )
+        return seq_embeddings
+
+    def _postprocess(
+        self,
+        max_seq_len: int,
+        max_uih_len: int,
+        max_targets: int,
+        total_uih_len: int,
+        total_targets: int,
+        seq_lengths: torch.Tensor,
+        seq_timestamps: torch.Tensor,
+        seq_embeddings: torch.Tensor,
+        num_targets: torch.Tensor,
+        seq_payloads: Dict[str, torch.Tensor],
+    ) -> Tuple[Optional[torch.Tensor], torch.Tensor]:
+        with record_function("hstu_output_postprocessor"):
+            if self._return_full_embeddings:
+                seq_embeddings = self._output_postprocessor(
+                    seq_embeddings=seq_embeddings,
+                    seq_timestamps=seq_timestamps,
+                    seq_payloads=seq_payloads,
+                )
+            uih_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(
+                seq_lengths - num_targets
+            )
+            candidates_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(
+                num_targets
+            )
+            _, candidate_embeddings = split_2D_jagged(
+                values=seq_embeddings,
+                max_seq_len=max_seq_len,
+                total_len_left=total_uih_len,
+                total_len_right=total_targets,
+                max_len_left=max_uih_len,
+                max_len_right=max_targets,
+                offsets_left=uih_offsets,
+                offsets_right=candidates_offsets,
+                kernel=self.hammer_kernel(),
+            )
+            interleave_targets: bool = self._input_preprocessor.interleave_targets()
+            if interleave_targets:
+                candidate_embeddings = candidate_embeddings.view(
+                    -1, 2, candidate_embeddings.size(-1)
+                )[:, 0, :]
+            if not self._return_full_embeddings:
+                _, candidate_timestamps = split_2D_jagged(
+                    values=seq_timestamps.unsqueeze(-1),
+                    max_seq_len=max_seq_len,
+                    total_len_left=total_uih_len,
+                    total_len_right=total_targets,
+                    max_len_left=max_uih_len,
+                    max_len_right=max_targets,
+                    offsets_left=uih_offsets,
+                    offsets_right=candidates_offsets,
+                    kernel=self.hammer_kernel(),
+                )
+                candidate_timestamps = candidate_timestamps.squeeze(-1)
+                if interleave_targets:
+                    candidate_timestamps = candidate_timestamps.view(-1, 2)[:, 0]
+                candidate_embeddings = self._output_postprocessor(
+                    seq_embeddings=candidate_embeddings,
+                    seq_timestamps=candidate_timestamps,
+                    seq_payloads=seq_payloads,
+                )
+
+            return (
+                seq_embeddings if self._return_full_embeddings else None,
+                candidate_embeddings,
+            )
+
+    def forward(
+        self,
+        max_uih_len: int,
+        max_targets: int,
+        total_uih_len: int,
+        total_targets: int,
+        seq_lengths: torch.Tensor,
+        seq_embeddings: torch.Tensor,
+        seq_timestamps: torch.Tensor,
+        num_targets: torch.Tensor,
+        seq_payloads: Dict[str, torch.Tensor],
+    ) -> Tuple[
+        torch.Tensor,
+        Optional[torch.Tensor],
+    ]:
+        orig_dtype = seq_embeddings.dtype
+        if not self._is_inference:
+            seq_embeddings = seq_embeddings.to(self._training_dtype)
+
+        (
+            max_seq_len,
+            total_uih_len,
+            total_targets,
+            seq_lengths,
+            seq_offsets,
+            seq_timestamps,
+            seq_embeddings,
+            num_targets,
+            seq_payloads,
+        ) = self._preprocess(
+            max_uih_len=max_uih_len,
+            max_targets=max_targets,
+            total_uih_len=total_uih_len,
+            total_targets=total_targets,
+            seq_lengths=seq_lengths,
+            seq_timestamps=seq_timestamps,
+            seq_embeddings=seq_embeddings,
+            num_targets=num_targets,
+            seq_payloads=seq_payloads,
+        )
+
+        encoded_embeddings = self._hstu_compute(
+            max_seq_len=max_seq_len,
+            seq_lengths=seq_lengths,
+            seq_offsets=seq_offsets,
+            seq_timestamps=seq_timestamps,
+            seq_embeddings=seq_embeddings,
+            num_targets=num_targets,
+        )
+
+        encoded_embeddings, encoded_candidate_embeddings = self._postprocess(
+            max_seq_len=max_seq_len,
+            max_uih_len=max_uih_len,
+            max_targets=max_targets,
+            total_uih_len=total_uih_len,
+            total_targets=total_targets,
+            seq_lengths=seq_lengths,
+            seq_embeddings=encoded_embeddings,
+            seq_timestamps=seq_timestamps,
+            num_targets=num_targets,
+            seq_payloads=seq_payloads,
+        )
+
+        if not self._is_inference:
+            encoded_candidate_embeddings = encoded_candidate_embeddings.to(orig_dtype)
+            if self._return_full_embeddings:
+                encoded_embeddings = fx_unwrap_optional_tensor(encoded_embeddings).to(
+                    orig_dtype
+                )
+        return (
+            encoded_candidate_embeddings,
+            encoded_embeddings,
+        )
diff --git a/recommendation_v4/generative_recommenders/modules/multitask_module.py b/recommendation_v4/generative_recommenders/modules/multitask_module.py
new file mode 100644
index 000000000..3cb11996f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/multitask_module.py
@@ -0,0 +1,288 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import abc
+import logging
+from dataclasses import dataclass
+from enum import IntEnum
+from typing import Callable, Dict, List, Optional, Tuple
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from generative_recommenders.common import HammerModule
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class MultitaskTaskType(IntEnum):
+    BINARY_CLASSIFICATION = 0
+    REGRESSION = 1
+
+
+@dataclass
+class TaskConfig:
+    task_name: str
+    task_weight: int
+    task_type: MultitaskTaskType
+
+
+class MultitaskModule(HammerModule):
+    @abc.abstractmethod
+    def forward(
+        self,
+        encoded_user_embeddings: torch.Tensor,
+        item_embeddings: torch.Tensor,
+        supervision_labels: Dict[str, torch.Tensor],
+        supervision_weights: Dict[str, torch.Tensor],
+    ) -> Tuple[
+        torch.Tensor,
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+    ]:
+        """
+        Computes multi-task predictions.
+
+        Args:
+            encoded_user_embeddings: (L, D) x float.
+            item_embeddings: (L, D) x float.
+            supervision_labels: Dict[T, L] x float or int
+            supervision_weights: Dict[T', L] x float or int, T' <= T
+        Returns:
+            (T, L) x float, predictions, labels, weights, losses
+        """
+        pass
+
+
+def _compute_pred_and_logits(
+    prediction_module: torch.nn.Module,
+    encoded_user_embeddings: torch.Tensor,
+    item_embeddings: torch.Tensor,
+    task_offsets: List[int],
+    has_multiple_task_types: bool,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    mt_logits = prediction_module(encoded_user_embeddings * item_embeddings).transpose(
+        0, 1
+    )
+    mt_preds_list: List[torch.Tensor] = []
+    for task_type in MultitaskTaskType:
+        logits = mt_logits[
+            task_offsets[task_type] : task_offsets[task_type + 1],
+            :,
+        ]
+        if task_offsets[task_type + 1] - task_offsets[task_type] > 0:
+            if task_type == MultitaskTaskType.REGRESSION:
+                mt_preds_list.append(logits)
+            else:
+                mt_preds_list.append(F.sigmoid(logits))
+    if has_multiple_task_types:
+        mt_preds: torch.Tensor = torch.concat(mt_preds_list, dim=0)
+    else:
+        mt_preds: torch.Tensor = mt_preds_list[0]
+
+    return mt_preds, mt_logits
+
+
+def _compute_labels_and_weights(
+    supervision_labels: Dict[str, torch.Tensor],
+    supervision_weights: Dict[str, torch.Tensor],
+    task_configs: List[TaskConfig],
+    device: torch.device,
+    dtype: torch.dtype = torch.float32,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    first_label: torch.Tensor = list(supervision_labels.values())[0]
+    default_supervision_weight = torch.ones_like(
+        first_label,
+        dtype=dtype,
+        device=device,
+    )
+    mt_lables_list: List[torch.Tensor] = []
+    mt_weights_list: List[torch.Tensor] = []
+    for task in task_configs:
+        mt_lables_list.append(supervision_labels[task.task_name])
+        mt_weights_list.append(
+            supervision_weights.get(task.task_name, default_supervision_weight)
+        )
+    if len(task_configs) > 1:
+        mt_labels = torch.stack(mt_lables_list, dim=0)
+        mt_weights = torch.stack(mt_weights_list, dim=0)
+    else:
+        mt_labels = mt_lables_list[0].unsqueeze(0)
+        mt_weights = mt_weights_list[0].unsqueeze(0)
+    return mt_labels, mt_weights
+
+
+def _compute_loss(
+    task_offsets: List[int],
+    causal_multitask_weights: float,
+    mt_logits: torch.Tensor,
+    mt_labels: torch.Tensor,
+    mt_weights: torch.Tensor,
+    has_multiple_task_types: bool,
+) -> torch.Tensor:
+    mt_losses_list: List[torch.Tensor] = []
+    for task_type in MultitaskTaskType:
+        if task_offsets[task_type + 1] - task_offsets[task_type] > 0:
+            logits = mt_logits[
+                task_offsets[task_type] : task_offsets[task_type + 1],
+                :,
+            ]
+            labels = mt_labels[
+                task_offsets[task_type] : task_offsets[task_type + 1],
+                :,
+            ]
+            weights = mt_weights[
+                task_offsets[task_type] : task_offsets[task_type + 1],
+                :,
+            ]
+            if task_type == MultitaskTaskType.REGRESSION:
+                mt_losses_list.append(
+                    F.mse_loss(logits, labels, reduction="none") * weights
+                )
+            else:
+                mt_losses_list.append(
+                    F.binary_cross_entropy_with_logits(
+                        input=logits, target=labels, reduction="none"
+                    )
+                    * weights
+                )
+
+    if has_multiple_task_types:
+        mt_losses = torch.concat(mt_losses_list, dim=0)
+    else:
+        mt_losses = mt_losses_list[0]
+    mt_losses = (
+        mt_losses.sum(-1) / mt_weights.sum(-1).clamp(min=1.0) * causal_multitask_weights
+    )
+    return mt_losses
+
+
+class DefaultMultitaskModule(MultitaskModule):
+    def __init__(
+        self,
+        task_configs: List[TaskConfig],
+        embedding_dim: int,
+        prediction_fn: Callable[[int, int], torch.nn.Module],
+        causal_multitask_weights: float,
+        is_inference: bool,
+    ) -> None:
+        super().__init__(is_inference)
+        assert sorted(task_configs, key=lambda x: x.task_type) == task_configs, (
+            "task_configs must be sorted by task_type."
+        )
+        assert len(task_configs) > 0, "task_configs must be non-empty."
+        self._task_configs: List[TaskConfig] = task_configs
+        self._task_offsets: List[int] = [0] * (len(MultitaskTaskType) + 1)
+        for task in self._task_configs:
+            self._task_offsets[task.task_type + 1] += 1
+        self._has_multiple_task_types: bool = self._task_offsets.count(0) < len(
+            MultitaskTaskType
+        )
+        self._task_offsets[1:] = np.cumsum(self._task_offsets[1:]).tolist()
+        self._causal_multitask_weights: float = causal_multitask_weights
+        self._prediction_module: torch.nn.Module = prediction_fn(
+            embedding_dim, len(task_configs)
+        )
+
+    def forward(
+        self,
+        encoded_user_embeddings: torch.Tensor,
+        item_embeddings: torch.Tensor,
+        supervision_labels: Dict[str, torch.Tensor],
+        supervision_weights: Dict[str, torch.Tensor],
+    ) -> Tuple[
+        torch.Tensor,
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+    ]:
+        orig_dtype = encoded_user_embeddings.dtype
+        if not self._is_inference:
+            encoded_user_embeddings = encoded_user_embeddings.to(self._training_dtype)
+            item_embeddings = item_embeddings.to(self._training_dtype)
+
+        if torch.jit.is_scripting():
+            # Script-mode fast path: skip torch.autocast (unsupported in TS)
+            # and inline _compute_pred_and_logits to avoid its
+            # `torch.nn.Module` parameter annotation (TS only knows
+            # concrete module types). The dense module is already in bf16
+            # at this point, so autocast is a no-op for the predictor path.
+            mt_logits = self._prediction_module(
+                encoded_user_embeddings * item_embeddings
+            ).transpose(0, 1)
+            mt_preds_list: List[torch.Tensor] = []
+            # MultitaskTaskType is an IntEnum (BINARY_CLASSIFICATION=0,
+            # REGRESSION=1) but TorchScript treats it as an opaque Enum.
+            # Iterate by the integer task indices directly.
+            for task_type in range(len(self._task_offsets) - 1):
+                start = self._task_offsets[task_type]
+                end = self._task_offsets[task_type + 1]
+                logits = mt_logits[start:end, :]
+                if end - start > 0:
+                    # 1 == MultitaskTaskType.REGRESSION
+                    if task_type == 1:
+                        mt_preds_list.append(logits)
+                    else:
+                        mt_preds_list.append(F.sigmoid(logits))
+            if self._has_multiple_task_types:
+                mt_preds: torch.Tensor = torch.concat(mt_preds_list, dim=0)
+            else:
+                mt_preds: torch.Tensor = mt_preds_list[0]
+            return mt_preds, None, None, None
+
+        with torch.autocast(
+            "cuda",
+            dtype=torch.bfloat16,
+            enabled=(not self.is_inference and self._training_dtype == torch.bfloat16),
+        ):
+            mt_preds, mt_logits = _compute_pred_and_logits(
+                prediction_module=self._prediction_module,
+                encoded_user_embeddings=encoded_user_embeddings,
+                item_embeddings=item_embeddings,
+                task_offsets=self._task_offsets,
+                has_multiple_task_types=self._has_multiple_task_types,
+            )
+
+        # losses are always computed in fp32
+        mt_labels: Optional[torch.Tensor] = None
+        mt_weights: Optional[torch.Tensor] = None
+        mt_losses: Optional[torch.Tensor] = None
+        if not self._is_inference:
+            mt_labels, mt_weights = _compute_labels_and_weights(
+                supervision_labels=supervision_labels,
+                supervision_weights=supervision_weights,
+                task_configs=self._task_configs,
+                device=encoded_user_embeddings.device,
+            )
+            mt_losses = _compute_loss(
+                task_offsets=self._task_offsets,
+                causal_multitask_weights=self._causal_multitask_weights,
+                mt_logits=mt_logits.to(mt_labels.dtype),
+                mt_labels=mt_labels,
+                mt_weights=mt_weights,
+                has_multiple_task_types=self._has_multiple_task_types,
+            )
+            mt_preds = mt_preds.to(orig_dtype)
+
+        return (
+            mt_preds,
+            mt_labels,
+            mt_weights,
+            mt_losses,
+        )
diff --git a/recommendation_v4/generative_recommenders/modules/positional_encoder.py b/recommendation_v4/generative_recommenders/modules/positional_encoder.py
new file mode 100644
index 000000000..99d904fd4
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/positional_encoder.py
@@ -0,0 +1,75 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from math import sqrt
+from typing import Optional
+
+import torch
+from generative_recommenders.common import HammerModule
+from generative_recommenders.ops.position import add_timestamp_positional_embeddings
+
+
+class HSTUPositionalEncoder(HammerModule):
+    def __init__(
+        self,
+        num_position_buckets: int,
+        num_time_buckets: int,
+        embedding_dim: int,
+        contextual_seq_len: int,
+        is_inference: bool = True,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+        self._embedding_dim: int = embedding_dim
+        self._contextual_seq_len: int = contextual_seq_len
+        self._position_embeddings_weight: torch.nn.Parameter = torch.nn.Parameter(
+            torch.empty(num_position_buckets, embedding_dim).uniform_(
+                -sqrt(1.0 / num_position_buckets),
+                sqrt(1.0 / num_position_buckets),
+            ),
+        )
+        self._timestamp_embeddings_weight: torch.nn.Parameter = torch.nn.Parameter(
+            torch.empty(num_time_buckets + 1, embedding_dim).uniform_(
+                -sqrt(1.0 / num_time_buckets),
+                sqrt(1.0 / num_time_buckets),
+            ),
+        )
+
+    def forward(
+        self,
+        max_seq_len: int,
+        seq_lengths: torch.Tensor,
+        seq_offsets: torch.Tensor,
+        seq_timestamps: torch.Tensor,
+        seq_embeddings: torch.Tensor,
+        num_targets: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        seq_embeddings = add_timestamp_positional_embeddings(
+            alpha=self._embedding_dim**0.5,
+            max_seq_len=max_seq_len,
+            max_contextual_seq_len=self._contextual_seq_len,
+            position_embeddings_weight=self._position_embeddings_weight,
+            timestamp_embeddings_weight=self._timestamp_embeddings_weight,
+            seq_offsets=seq_offsets,
+            seq_lengths=seq_lengths,
+            seq_embeddings=seq_embeddings,
+            timestamps=seq_timestamps,
+            num_targets=num_targets,
+            interleave_targets=False,
+            kernel=self.hammer_kernel(),
+        )
+        return seq_embeddings
diff --git a/recommendation_v4/generative_recommenders/modules/postprocessors.py b/recommendation_v4/generative_recommenders/modules/postprocessors.py
new file mode 100644
index 000000000..7958e3fa9
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/postprocessors.py
@@ -0,0 +1,182 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from abc import abstractmethod
+from typing import Dict, List, Tuple
+
+import torch
+from generative_recommenders.common import HammerModule, init_mlp_weights_optional_bias
+
+
+@torch.fx.wrap
+def _cast_dtype(t: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+    if t.dtype != dtype:
+        return t.to(dtype)
+    return t
+
+
+class OutputPostprocessor(HammerModule):
+    """An abstract class for post-processing user embeddings after HSTU layers."""
+
+    @abstractmethod
+    def forward(
+        self,
+        seq_embeddings: torch.Tensor,
+        seq_timestamps: torch.Tensor,
+        seq_payloads: Dict[str, torch.Tensor],
+    ) -> torch.Tensor:
+        """
+        Args:
+            seq_embeddings: (L, D)
+            seq_timestamps: (L, )
+            seq_payloads: str-keyed tensors. Implementation specific.
+
+        Returns:
+            postprocessed seq_embeddings, (L, D)
+        """
+        pass
+
+
+class L2NormPostprocessor(OutputPostprocessor):
+    """Postprocesses user embeddings with l2 norm."""
+
+    def __init__(self, is_inference: bool = False) -> None:
+        super().__init__(is_inference=is_inference)
+
+    def forward(
+        self,
+        seq_embeddings: torch.Tensor,
+        seq_timestamps: torch.Tensor,
+        seq_payloads: Dict[str, torch.Tensor],
+    ) -> torch.Tensor:
+        return seq_embeddings / torch.linalg.norm(
+            seq_embeddings, ord=2, dim=-1, keepdim=True
+        ).clamp(min=1e-6)
+
+
+class LayerNormPostprocessor(OutputPostprocessor):
+    """Postprocesses user embeddings with layer norm."""
+
+    def __init__(
+        self,
+        embedding_dim: int,
+        eps: float = 1e-5,
+        is_inference: bool = False,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+
+        self._layer_norm: torch.nn.Module = torch.nn.LayerNorm(
+            normalized_shape=[embedding_dim], eps=eps
+        )
+
+    def forward(
+        self,
+        seq_embeddings: torch.Tensor,
+        seq_timestamps: torch.Tensor,
+        seq_payloads: Dict[str, torch.Tensor],
+    ) -> torch.Tensor:
+        # pyre-fixme[6]: For 1st argument expected `dtype` but got `Union[dtype,
+        #  Tensor, Module]`.
+        return self._layer_norm(seq_embeddings.to(self._layer_norm.weight.dtype))
+
+
+@torch.fx.wrap
+def _unsqueeze_if_needed(t: torch.Tensor, embedding: torch.Tensor) -> torch.Tensor:
+    if embedding.dim() == 3:
+        return t.unsqueeze(0)
+    return t
+
+
+class TimestampLayerNormPostprocessor(OutputPostprocessor):
+    """Postprocesses user embeddings with timestamp-based MLP -> layer norm."""
+
+    def __init__(
+        self,
+        embedding_dim: int,
+        time_duration_features: List[Tuple[int, int]],
+        eps: float = 1e-5,
+        is_inference: bool = False,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+
+        self._layer_norm: torch.nn.Module = torch.nn.LayerNorm(
+            normalized_shape=[embedding_dim], eps=eps
+        )
+        self.register_buffer(
+            "_period_units",
+            torch.Tensor([f[0] for f in time_duration_features]).view(1, -1),
+        )
+        self.register_buffer(
+            "_units_per_period",
+            torch.Tensor([f[1] for f in time_duration_features]).view(1, -1),
+        )
+        self._time_feature_combiner: torch.nn.Module = torch.nn.Linear(
+            embedding_dim + 2 * len(time_duration_features),
+            embedding_dim,
+        ).apply(init_mlp_weights_optional_bias)
+
+    def _concat_time_features(
+        self,
+        combined_embeddings: torch.Tensor,
+        timestamps: torch.Tensor,  # [B] or [B, D]
+    ) -> torch.Tensor:
+        # concat time representation to combined embeddings
+        period_units = self._period_units
+        units_per_period = self._units_per_period
+
+        timestamps = timestamps.unsqueeze(-1)
+        period_units = _unsqueeze_if_needed(period_units, combined_embeddings)
+        units_per_period = _unsqueeze_if_needed(
+            units_per_period, combined_embeddings
+        ).float()
+        # Compute time features in float32 to avoid bf16 precision loss through
+        # discontinuous floor/remainder ops, matching Inductor fusion behavior.
+        _units_elapsed_type: torch.dtype = combined_embeddings.dtype
+        _units_since_epoch = torch.div(
+            timestamps.float(), period_units.float(), rounding_mode="floor"
+        )  # [sum(N_i), num_time_features] or [B, N, num_time_features]
+        _units_elapsed = (
+            (torch.remainder(_units_since_epoch, units_per_period) / units_per_period)
+            * 2
+            * 3.14
+        )
+        _units_elapsed = torch.view_as_real(
+            torch.polar(
+                _cast_dtype(torch.ones_like(_units_elapsed), torch.float32),
+                _cast_dtype(_units_elapsed, torch.float32),
+            )
+        ).flatten(
+            -2, -1
+        )  # [sum(N_i), num_time_features * 2] or [B, N, num_time_features * 2]
+        _units_elapsed = _cast_dtype(_units_elapsed, _units_elapsed_type)
+        combined_embeddings = torch.cat([combined_embeddings, _units_elapsed], dim=-1)
+        return combined_embeddings
+
+    def forward(
+        self,
+        seq_embeddings: torch.Tensor,
+        seq_timestamps: torch.Tensor,
+        seq_payloads: Dict[str, torch.Tensor],
+    ) -> torch.Tensor:
+        user_embeddings = self._time_feature_combiner(
+            self._concat_time_features(seq_embeddings, timestamps=seq_timestamps).to(
+                self._time_feature_combiner.weight.dtype  # pyre-fixme[6]: For 1st argument expected `dtype` but got `Union[dtype,
+                #  Tensor, Module]`.
+            )
+        )
+        return self._layer_norm(user_embeddings)
diff --git a/recommendation_v4/generative_recommenders/modules/preprocessors.py b/recommendation_v4/generative_recommenders/modules/preprocessors.py
new file mode 100644
index 000000000..dc7806bb4
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/preprocessors.py
@@ -0,0 +1,334 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import abc
+from math import sqrt
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from generative_recommenders.common import (
+    fx_unwrap_optional_tensor,
+    HammerModule,
+    init_mlp_weights_optional_bias,
+    jagged_to_padded_dense,
+)
+from generative_recommenders.modules.action_encoder import ActionEncoder
+from generative_recommenders.ops.jagged_tensors import concat_2D_jagged
+from generative_recommenders.ops.layer_norm import LayerNorm, SwishLayerNorm
+
+
+class InputPreprocessor(HammerModule):
+    """An abstract class for pre-processing sequence embeddings before HSTU layers."""
+
+    @abc.abstractmethod
+    def forward(
+        self,
+        max_uih_len: int,
+        max_targets: int,
+        total_uih_len: int,
+        total_targets: int,
+        seq_lengths: torch.Tensor,
+        seq_timestamps: torch.Tensor,
+        seq_embeddings: torch.Tensor,
+        num_targets: torch.Tensor,
+        seq_payloads: Dict[str, torch.Tensor],
+    ) -> Tuple[
+        int,
+        int,
+        int,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        Dict[str, torch.Tensor],
+    ]:
+        """
+        Args:
+            max_uih_len: int
+            max_targets: int
+            total_uih_len: int
+            total_targets: int
+            seq_lengths: (B,)
+            seq_embeddings: (L, D)
+            seq_timestamps: (B, N)
+            num_targets: (B,) Optional.
+            seq_payloads: str-keyed tensors. Implementation specific.
+
+        Returns:
+            (max_seq_len, total_uih_len, total_targets, lengths, offsets, timestamps, embeddings, num_targets, payloads) updated based on input preprocessor.
+        """
+        pass
+
+    def interleave_targets(self) -> bool:
+        return False
+
+
+def get_contextual_input_embeddings(
+    seq_lengths: torch.Tensor,
+    seq_payloads: Dict[str, torch.Tensor],
+    contextual_feature_to_max_length: Dict[str, int],
+    contextual_feature_to_min_uih_length: Dict[str, int],
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    padded_values: List[torch.Tensor] = []
+    for key, max_len in contextual_feature_to_max_length.items():
+        v = torch.flatten(
+            jagged_to_padded_dense(
+                values=seq_payloads[key].to(dtype),
+                offsets=[seq_payloads[key + "_offsets"]],
+                max_lengths=[max_len],
+                padding_value=0.0,
+            ),
+            1,
+            2,
+        )
+        min_uih_length = contextual_feature_to_min_uih_length.get(key, 0)
+        if min_uih_length > 0:
+            v = v * (seq_lengths.view(-1, 1) >= min_uih_length)
+        padded_values.append(v)
+    return torch.cat(padded_values, dim=1)
+
+
+class ContextualPreprocessor(InputPreprocessor):
+    def __init__(
+        self,
+        input_embedding_dim: int,
+        hidden_dim: int,
+        output_embedding_dim: int,
+        contextual_feature_to_max_length: Dict[str, int],
+        contextual_feature_to_min_uih_length: Dict[str, int],
+        action_embedding_dim: int = 8,
+        action_feature_name: str = "",
+        action_weights: Optional[List[int]] = None,
+        additional_embedding_features: List[str] = [],
+        action_embedding_init_std: float = 0.1,
+        is_inference: bool = True,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+        self._output_embedding_dim: int = output_embedding_dim
+        self._input_embedding_dim: int = input_embedding_dim
+        self._hidden_dim: int = hidden_dim
+        self._contextual_feature_to_max_length: Dict[str, int] = (
+            contextual_feature_to_max_length
+        )
+        self._max_contextual_seq_len: int = sum(
+            contextual_feature_to_max_length.values()
+        )
+        self._contextual_feature_to_min_uih_length: Dict[str, int] = (
+            contextual_feature_to_min_uih_length
+        )
+        if self._max_contextual_seq_len > 0:
+            std = 1.0 * sqrt(
+                2.0 / float(input_embedding_dim + self._output_embedding_dim)
+            )
+            self._batched_contextual_linear_weights: torch.nn.Parameter = (
+                torch.nn.Parameter(
+                    torch.empty(
+                        (
+                            self._max_contextual_seq_len,
+                            input_embedding_dim,
+                            self._output_embedding_dim,
+                        )
+                    ).normal_(0.0, std)
+                )
+            )
+            self._batched_contextual_linear_bias: torch.nn.Parameter = (
+                torch.nn.Parameter(
+                    torch.empty(
+                        (self._max_contextual_seq_len, self._output_embedding_dim)
+                    ).fill_(0.0)
+                )
+            )
+        self._content_embedding_mlp: torch.nn.Module = torch.nn.Sequential(
+            torch.nn.Linear(
+                in_features=self._input_embedding_dim,
+                out_features=self._hidden_dim,
+            ),
+            SwishLayerNorm(self._hidden_dim),
+            torch.nn.Linear(
+                in_features=self._hidden_dim,
+                out_features=self._output_embedding_dim,
+            ),
+            LayerNorm(self._output_embedding_dim),
+        ).apply(init_mlp_weights_optional_bias)
+        self._additional_embedding_features: List[str] = additional_embedding_features
+        self._additional_embedding_mlp: torch.nn.Module = torch.nn.Sequential(
+            torch.nn.Linear(
+                in_features=self._input_embedding_dim
+                * len(additional_embedding_features),
+                out_features=self._hidden_dim,
+            ),
+            SwishLayerNorm(self._hidden_dim),
+            torch.nn.Linear(
+                in_features=self._hidden_dim,
+                out_features=self._output_embedding_dim,
+            ),
+            LayerNorm(self._output_embedding_dim),
+        ).apply(init_mlp_weights_optional_bias)
+        self._action_feature_name: str = action_feature_name
+        self._action_weights: Optional[List[int]] = action_weights
+        if self._action_weights is not None:
+            self._action_encoder: ActionEncoder = ActionEncoder(
+                action_feature_name=action_feature_name,
+                action_weights=self._action_weights,
+                action_embedding_dim=action_embedding_dim,
+                embedding_init_std=action_embedding_init_std,
+                is_inference=is_inference,
+            )
+            self._action_embedding_mlp: torch.nn.Module = torch.nn.Sequential(
+                torch.nn.Linear(
+                    in_features=self._action_encoder.output_embedding_dim,
+                    out_features=self._hidden_dim,
+                ),
+                SwishLayerNorm(self._hidden_dim),
+                torch.nn.Linear(
+                    in_features=self._hidden_dim,
+                    out_features=self._output_embedding_dim,
+                ),
+                LayerNorm(self._output_embedding_dim),
+            ).apply(init_mlp_weights_optional_bias)
+
+    def forward(  # noqa C901
+        self,
+        max_uih_len: int,
+        max_targets: int,
+        total_uih_len: int,
+        total_targets: int,
+        seq_lengths: torch.Tensor,
+        seq_timestamps: torch.Tensor,
+        seq_embeddings: torch.Tensor,
+        num_targets: torch.Tensor,
+        seq_payloads: Dict[str, torch.Tensor],
+    ) -> Tuple[
+        int,
+        int,
+        int,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        Dict[str, torch.Tensor],
+    ]:
+        output_seq_embeddings = self._content_embedding_mlp(seq_embeddings)
+        if len(self._additional_embedding_features) > 0:
+            additional_embeddings = torch.cat(
+                [
+                    seq_payloads[feature]
+                    for feature in self._additional_embedding_features
+                ],
+                dim=1,
+            )
+            output_seq_embeddings = (
+                output_seq_embeddings
+                + self._additional_embedding_mlp(additional_embeddings)
+            )
+        max_seq_len = max_uih_len + max_targets
+        target_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(num_targets)
+        seq_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(seq_lengths)
+        uih_offsets = seq_offsets - target_offsets
+        if self._action_weights is not None:
+            action_embeddings = self._action_encoder(
+                max_uih_len=max_uih_len,
+                max_targets=max_targets,
+                uih_offsets=uih_offsets,
+                target_offsets=target_offsets,
+                seq_embeddings=seq_embeddings,
+                seq_payloads=seq_payloads,
+            )
+            output_seq_embeddings = output_seq_embeddings + self._action_embedding_mlp(
+                action_embeddings
+            )
+
+        output_max_seq_len = max_seq_len
+        output_total_uih_len = total_uih_len
+        output_total_targets = total_targets
+        output_seq_lengths = seq_lengths
+        output_num_targets = num_targets
+        output_seq_timestamps = seq_timestamps
+        output_seq_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(
+            output_seq_lengths
+        )
+        # concat contextual embeddings
+        if self._max_contextual_seq_len > 0:
+            contextual_input_embeddings = get_contextual_input_embeddings(
+                seq_lengths=seq_lengths,
+                seq_payloads=seq_payloads,
+                contextual_feature_to_max_length=self._contextual_feature_to_max_length,
+                contextual_feature_to_min_uih_length=self._contextual_feature_to_min_uih_length,
+                dtype=seq_embeddings.dtype,
+            )
+            contextual_embeddings = torch.baddbmm(
+                self._batched_contextual_linear_bias.view(
+                    -1, 1, self._output_embedding_dim
+                ).to(contextual_input_embeddings.dtype),
+                contextual_input_embeddings.view(
+                    -1, self._max_contextual_seq_len, self._input_embedding_dim
+                ).transpose(0, 1),
+                self._batched_contextual_linear_weights.to(
+                    contextual_input_embeddings.dtype
+                ),
+            ).transpose(0, 1)
+            output_seq_embeddings = concat_2D_jagged(
+                max_seq_len=self._max_contextual_seq_len + output_max_seq_len,
+                values_left=fx_unwrap_optional_tensor(contextual_embeddings).reshape(
+                    -1, self._output_embedding_dim
+                ),
+                values_right=output_seq_embeddings,
+                max_len_left=self._max_contextual_seq_len,
+                max_len_right=output_max_seq_len,
+                offsets_left=None,
+                offsets_right=output_seq_offsets,
+                kernel=self.hammer_kernel(),
+            )
+            output_seq_timestamps = concat_2D_jagged(
+                max_seq_len=self._max_contextual_seq_len + output_max_seq_len,
+                values_left=torch.zeros(
+                    (output_seq_lengths.size(0) * self._max_contextual_seq_len, 1),
+                    dtype=output_seq_timestamps.dtype,
+                    device=output_seq_timestamps.device,
+                ),
+                values_right=output_seq_timestamps.unsqueeze(-1),
+                max_len_left=self._max_contextual_seq_len,
+                max_len_right=output_max_seq_len,
+                offsets_left=None,
+                offsets_right=output_seq_offsets,
+                kernel=self.hammer_kernel(),
+            ).squeeze(-1)
+            output_max_seq_len = output_max_seq_len + self._max_contextual_seq_len
+            output_seq_lengths = output_seq_lengths + self._max_contextual_seq_len
+            output_seq_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(
+                output_seq_lengths
+            )
+            output_total_uih_len = (
+                output_total_uih_len
+                + self._max_contextual_seq_len * output_seq_lengths.size(0)
+            )
+
+        return (
+            output_max_seq_len,
+            output_total_uih_len,
+            output_total_targets,
+            output_seq_lengths,
+            output_seq_offsets,
+            output_seq_timestamps,
+            output_seq_embeddings,
+            output_num_targets,
+            seq_payloads,
+        )
diff --git a/recommendation_v4/generative_recommenders/modules/stu.py b/recommendation_v4/generative_recommenders/modules/stu.py
new file mode 100644
index 000000000..45c6ea5f3
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/stu.py
@@ -0,0 +1,471 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+import abc
+from dataclasses import dataclass
+from typing import List, Optional, Tuple
+
+import torch
+from generative_recommenders.common import fx_unwrap_optional_tensor, HammerModule
+from generative_recommenders.ops.hstu_attention import delta_hstu_mha
+from generative_recommenders.ops.hstu_compute import (
+    hstu_compute_output,
+    hstu_compute_uqvk,
+    hstu_preprocess_and_attention,
+)
+from generative_recommenders.ops.jagged_tensors import concat_2D_jagged, split_2D_jagged
+from torch.autograd.profiler import record_function
+
+
+try:
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+except OSError:
+    pass
+
+
+class STU(HammerModule, abc.ABC):
+    def cached_forward(
+        self,
+        delta_x: torch.Tensor,
+        num_targets: torch.Tensor,
+        max_kv_caching_len: int = 0,
+        kv_caching_lengths: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_lengths: torch.Tensor,
+        x_offsets: torch.Tensor,
+        max_seq_len: int,
+        num_targets: torch.Tensor,
+        max_kv_caching_len: int = 0,
+        kv_caching_lengths: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        pass
+
+
+@dataclass
+class STULayerConfig:
+    embedding_dim: int
+    num_heads: int
+    hidden_dim: int
+    attention_dim: int
+    output_dropout_ratio: float = 0.3
+    causal: bool = True
+    target_aware: bool = True
+    max_attn_len: Optional[int] = None
+    attn_alpha: Optional[float] = None
+    use_group_norm: bool = False
+    recompute_normed_x: bool = True
+    recompute_uvqk: bool = True
+    recompute_y: bool = True
+    sort_by_length: bool = True
+    contextual_seq_len: int = 0
+
+
+@torch.fx.wrap
+def _update_kv_cache(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    k: Optional[torch.Tensor],
+    v: Optional[torch.Tensor],
+    max_kv_caching_len: int,
+    kv_caching_lengths: Optional[torch.Tensor],
+    orig_k_cache: Optional[torch.Tensor],
+    orig_v_cache: Optional[torch.Tensor],
+    orig_max_kv_caching_len: int,
+    orig_kv_caching_offsets: Optional[torch.Tensor],
+) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], int, Optional[torch.Tensor]]:
+    if kv_caching_lengths is not None:
+        kv_caching_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(
+            kv_caching_lengths
+        )
+        delta_offsets = seq_offsets - kv_caching_offsets
+        k_cache, _ = split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=fx_unwrap_optional_tensor(k).flatten(1, 2),
+            max_len_left=None,
+            max_len_right=None,
+            offsets_left=kv_caching_offsets,
+            offsets_right=delta_offsets,
+        )
+        v_cache, _ = split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=fx_unwrap_optional_tensor(v).flatten(1, 2),
+            max_len_left=None,
+            max_len_right=None,
+            offsets_left=kv_caching_offsets,
+            offsets_right=delta_offsets,
+        )
+        if max_kv_caching_len == 0:
+            max_kv_caching_len = int(kv_caching_lengths.max().item())
+        return (
+            k_cache,
+            v_cache,
+            max_kv_caching_len,
+            kv_caching_offsets,
+        )
+    else:
+        return (
+            orig_k_cache,
+            orig_v_cache,
+            orig_max_kv_caching_len,
+            orig_kv_caching_offsets,
+        )
+
+
+@torch.fx.wrap
+def _construct_full_kv(
+    delta_k: torch.Tensor,
+    delta_v: torch.Tensor,
+    k_cache: torch.Tensor,
+    v_cache: torch.Tensor,
+    max_kv_caching_len: int,
+    kv_caching_offsets: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor, int, torch.Tensor]:
+    L, _ = delta_k.shape
+    B = kv_caching_offsets.shape[0] - 1
+    delta_size = L // B
+    full_k = concat_2D_jagged(
+        max_seq_len=max_kv_caching_len + delta_size,
+        values_left=k_cache,
+        values_right=delta_k,
+        max_len_left=max_kv_caching_len,
+        max_len_right=delta_size,
+        offsets_left=kv_caching_offsets,
+        offsets_right=None,
+    )
+    full_v = concat_2D_jagged(
+        max_seq_len=max_kv_caching_len + delta_size,
+        values_left=v_cache,
+        values_right=delta_v,
+        max_len_left=max_kv_caching_len,
+        max_len_right=delta_size,
+        offsets_left=kv_caching_offsets,
+        offsets_right=None,
+    )
+    full_kv_caching_offsets = kv_caching_offsets + delta_size * torch.arange(
+        B + 1, device=delta_k.device
+    )
+    return (
+        full_k,
+        full_v,
+        max_kv_caching_len + delta_size,
+        full_kv_caching_offsets,
+    )
+
+
+class STULayer(STU):
+    max_kv_caching_len: int
+    k_cache: Optional[torch.Tensor]
+    v_cache: Optional[torch.Tensor]
+    kv_caching_offsets: Optional[torch.Tensor]
+
+    def __init__(
+        self,
+        config: STULayerConfig,
+        is_inference: bool = False,
+    ) -> None:
+        super().__init__(
+            is_inference=is_inference,
+        )
+        self.reset_kv_cache()
+        self._num_heads: int = config.num_heads
+        self._embedding_dim: int = config.embedding_dim
+        self._hidden_dim: int = config.hidden_dim
+        self._attention_dim: int = config.attention_dim
+        self._output_dropout_ratio: float = config.output_dropout_ratio
+        self._target_aware: bool = config.target_aware
+        self._causal: bool = config.causal
+        self._max_attn_len: int = config.max_attn_len or 0
+        self._attn_alpha: float = config.attn_alpha or 1.0 / (self._attention_dim**0.5)
+        self._use_group_norm: bool = config.use_group_norm
+        self._recompute_normed_x: bool = config.recompute_normed_x
+        self._recompute_uvqk: bool = config.recompute_uvqk
+        self._recompute_y: bool = config.recompute_y
+        self._sort_by_length: bool = config.sort_by_length
+        self._contextual_seq_len: int = config.contextual_seq_len
+
+        self._uvqk_weight: torch.nn.Parameter = torch.nn.Parameter(
+            torch.empty(
+                (
+                    self._embedding_dim,
+                    (self._hidden_dim * 2 + self._attention_dim * 2) * self._num_heads,
+                )
+            ),
+        )
+        torch.nn.init.xavier_uniform_(self._uvqk_weight)
+        self._uvqk_beta: torch.nn.Parameter = torch.nn.Parameter(
+            torch.zeros(
+                (self._hidden_dim * 2 + self._attention_dim * 2) * self._num_heads,
+            ),
+        )
+        self._input_norm_weight: torch.nn.Parameter = torch.nn.Parameter(
+            torch.ones((self._embedding_dim,)),
+        )
+        self._input_norm_bias: torch.nn.Parameter = torch.nn.Parameter(
+            torch.zeros((self._embedding_dim,)),
+        )
+        self._output_weight = torch.nn.Parameter(
+            torch.empty(
+                (
+                    self._hidden_dim * self._num_heads * 3,
+                    self._embedding_dim,
+                )
+            ),
+        )
+        torch.nn.init.xavier_uniform_(self._output_weight)
+        output_norm_shape: int = (
+            self._hidden_dim * self._num_heads
+            if not self._use_group_norm
+            else self._num_heads
+        )
+        self._output_norm_weight: torch.nn.Parameter = torch.nn.Parameter(
+            torch.ones((output_norm_shape,)),
+        )
+        self._output_norm_bias: torch.nn.Parameter = torch.nn.Parameter(
+            torch.zeros((output_norm_shape,)),
+        )
+
+    def reset_kv_cache(self) -> None:
+        self.k_cache = None
+        self.v_cache = None
+        self.kv_caching_offsets = None
+        self.max_kv_caching_len = 0
+
+    def update_kv_cache(
+        self,
+        max_seq_len: int,
+        seq_offsets: torch.Tensor,
+        k: Optional[torch.Tensor],
+        v: Optional[torch.Tensor],
+        max_kv_caching_len: int,
+        kv_caching_lengths: Optional[torch.Tensor],
+    ) -> None:
+        self.k_cache, self.v_cache, self.max_kv_caching_len, self.kv_caching_offsets = (
+            _update_kv_cache(
+                max_seq_len=max_seq_len,
+                seq_offsets=seq_offsets,
+                k=k,
+                v=v,
+                max_kv_caching_len=max_kv_caching_len,
+                kv_caching_lengths=kv_caching_lengths,
+                orig_k_cache=self.k_cache,
+                orig_v_cache=self.v_cache,
+                orig_max_kv_caching_len=self.max_kv_caching_len,
+                orig_kv_caching_offsets=self.kv_caching_offsets,
+            )
+        )
+
+    def construct_full_kv(
+        self,
+        delta_k: torch.Tensor,
+        delta_v: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor, int, torch.Tensor]:
+        return _construct_full_kv(
+            delta_k=delta_k,
+            delta_v=delta_v,
+            k_cache=fx_unwrap_optional_tensor(self.k_cache),
+            v_cache=fx_unwrap_optional_tensor(self.v_cache),
+            max_kv_caching_len=self.max_kv_caching_len,
+            kv_caching_offsets=self.kv_caching_offsets,
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_lengths: torch.Tensor,
+        x_offsets: torch.Tensor,
+        max_seq_len: int,
+        num_targets: torch.Tensor,
+        max_kv_caching_len: int = 0,
+        kv_caching_lengths: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        with record_function("## stu_preprocess_and_attention ##"):
+            u, attn_output, k, v = hstu_preprocess_and_attention(
+                x=x,
+                norm_weight=self._input_norm_weight.to(x.dtype),
+                norm_bias=self._input_norm_bias.to(x.dtype),
+                norm_eps=1e-6,
+                num_heads=self._num_heads,
+                attn_dim=self._attention_dim,
+                hidden_dim=self._hidden_dim,
+                uvqk_weight=self._uvqk_weight.to(x.dtype),
+                uvqk_bias=self._uvqk_beta.to(x.dtype),
+                max_seq_len=max_seq_len,
+                seq_offsets=x_offsets,
+                attn_alpha=self._attn_alpha,
+                causal=self._causal,
+                num_targets=num_targets if self._target_aware else None,
+                max_attn_len=self._max_attn_len,
+                contextual_seq_len=self._contextual_seq_len,
+                recompute_uvqk_in_backward=self._recompute_uvqk,
+                recompute_normed_x_in_backward=self._recompute_normed_x,
+                sort_by_length=self._sort_by_length,
+                prefill=kv_caching_lengths is not None,
+                kernel=self.hammer_kernel(),
+            )
+
+        self.update_kv_cache(
+            max_seq_len=max_seq_len,
+            seq_offsets=x_offsets,
+            k=k,
+            v=v,
+            max_kv_caching_len=max_kv_caching_len,
+            kv_caching_lengths=kv_caching_lengths,
+        )
+
+        with record_function("## stu_compute_output ##"):
+            return hstu_compute_output(
+                attn=attn_output,
+                u=u,
+                x=x,
+                norm_weight=self._output_norm_weight.to(x.dtype),
+                norm_bias=self._output_norm_bias.to(x.dtype),
+                norm_eps=1e-6,
+                dropout_ratio=self._output_dropout_ratio,
+                output_weight=self._output_weight.to(x.dtype),
+                group_norm=self._use_group_norm,
+                num_heads=self._num_heads,
+                linear_dim=self._hidden_dim,
+                concat_u=True,
+                concat_x=True,
+                mul_u_activation_type="none",
+                training=self.training,
+                kernel=self.hammer_kernel(),
+                recompute_y_in_backward=self._recompute_y,
+            )
+
+    def cached_forward(
+        self,
+        delta_x: torch.Tensor,
+        num_targets: torch.Tensor,
+        max_kv_caching_len: int = 0,
+        kv_caching_lengths: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        with record_function("## stu_compute_uqvk ##"):
+            delta_u, delta_q, delta_k, delta_v = hstu_compute_uqvk(
+                x=delta_x,
+                norm_weight=self._input_norm_weight.to(delta_x.dtype),
+                norm_bias=self._input_norm_bias.to(delta_x.dtype),
+                norm_eps=1e-6,
+                num_heads=self._num_heads,
+                attn_dim=self._attention_dim,
+                hidden_dim=self._hidden_dim,
+                uvqk_weight=self._uvqk_weight.to(delta_x.dtype),
+                uvqk_bias=self._uvqk_beta.to(delta_x.dtype),
+                kernel=self.hammer_kernel(),
+            )
+        k, v, max_seq_len, seq_offsets = self.construct_full_kv(
+            delta_k=delta_k.flatten(1, 2),
+            delta_v=delta_v.flatten(1, 2),
+        )
+        self.update_kv_cache(
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            k=k,
+            v=v,
+            max_kv_caching_len=max_kv_caching_len,
+            kv_caching_lengths=kv_caching_lengths,
+        )
+        k = k.view(-1, self._num_heads, self._attention_dim)
+        v = v.view(-1, self._num_heads, self._hidden_dim)
+        with record_function("## delta_hstu_mha ##"):
+            delta_attn_output = delta_hstu_mha(
+                max_seq_len=max_seq_len,
+                alpha=self._attn_alpha,
+                delta_q=delta_q,
+                k=k,
+                v=v,
+                seq_offsets=seq_offsets,
+                num_targets=num_targets if self._target_aware else None,
+                max_attn_len=self._max_attn_len,
+                contextual_seq_len=self._contextual_seq_len,
+                kernel=self.hammer_kernel(),
+            ).view(-1, self._hidden_dim * self._num_heads)
+        with record_function("## stu_compute_output ##"):
+            return hstu_compute_output(
+                attn=delta_attn_output,
+                u=delta_u,
+                x=delta_x,
+                norm_weight=self._output_norm_weight.to(delta_x.dtype),
+                norm_bias=self._output_norm_bias.to(delta_x.dtype),
+                norm_eps=1e-6,
+                dropout_ratio=self._output_dropout_ratio,
+                output_weight=self._output_weight.to(delta_x.dtype),
+                group_norm=self._use_group_norm,
+                num_heads=self._num_heads,
+                linear_dim=self._hidden_dim,
+                concat_u=True,
+                concat_x=True,
+                mul_u_activation_type="none",
+                training=self.training,
+                kernel=self.hammer_kernel(),
+                recompute_y_in_backward=self._recompute_y,
+            )
+
+
+class STUStack(STU):
+    def __init__(
+        self,
+        stu_list: List[STU],
+        is_inference: bool = False,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+        self._stu_layers: torch.nn.ModuleList = torch.nn.ModuleList(modules=stu_list)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_lengths: torch.Tensor,
+        x_offsets: torch.Tensor,
+        max_seq_len: int,
+        num_targets: torch.Tensor,
+        max_kv_caching_len: int = 0,
+        kv_caching_lengths: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        for layer in self._stu_layers:
+            x = layer(
+                x=x,
+                x_lengths=x_lengths,
+                x_offsets=x_offsets,
+                max_seq_len=max_seq_len,
+                num_targets=num_targets,
+                max_kv_caching_len=max_kv_caching_len,
+                kv_caching_lengths=kv_caching_lengths,
+            )
+        return x
+
+    def cached_forward(
+        self,
+        delta_x: torch.Tensor,
+        num_targets: torch.Tensor,
+        max_kv_caching_len: int = 0,
+        kv_caching_lengths: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        for layer in self._stu_layers:
+            delta_x = layer.cached_forward(  # pyre-ignore [29]
+                delta_x=delta_x,
+                num_targets=num_targets,
+                max_kv_caching_len=max_kv_caching_len,
+                kv_caching_lengths=kv_caching_lengths,
+            )
+        return delta_x
diff --git a/recommendation_v4/generative_recommenders/modules/tests/action_encoder_test.py b/recommendation_v4/generative_recommenders/modules/tests/action_encoder_test.py
new file mode 100644
index 000000000..184b314ea
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/tests/action_encoder_test.py
@@ -0,0 +1,113 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import unittest
+
+import torch
+from generative_recommenders.common import gpu_unavailable
+from generative_recommenders.modules.action_encoder import ActionEncoder
+
+
+class ActionEncoderTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    def test_forward(self) -> None:
+        device = torch.device("cuda")
+        action_embedding_dim = 32
+        action_weights = [1, 2, 4, 8, 16]
+        watchtime_to_action_thresholds_and_weights = [
+            (30, 32),
+            (60, 64),
+            (100, 128),
+        ]
+        num_action_types = len(action_weights) + len(
+            watchtime_to_action_thresholds_and_weights
+        )
+        combined_action_weights = action_weights + [
+            x[1] for x in watchtime_to_action_thresholds_and_weights
+        ]
+        enabled_actions = [
+            [0],
+            [0, 1],
+            [1, 3, 4],
+            [1, 2, 3, 4],
+            [1, 2],
+            [2],
+        ]
+        watchtimes = [40, 20, 110, 31, 26, 55]
+        for i, wt in enumerate(watchtimes):
+            for j, w in enumerate(watchtime_to_action_thresholds_and_weights):
+                if wt > w[0]:
+                    enabled_actions[i].append(j + len(action_weights))
+        actions = [
+            sum([combined_action_weights[t] for t in x]) for x in enabled_actions
+        ]
+
+        encoder = ActionEncoder(
+            watchtime_feature_name="watchtimes",
+            action_feature_name="actions",
+            action_weights=action_weights,
+            watchtime_to_action_thresholds_and_weights=watchtime_to_action_thresholds_and_weights,
+            action_embedding_dim=action_embedding_dim,
+            is_inference=False,
+        ).to(device)
+
+        seq_lengths = [6, 3]
+        seq_offsets = [0, 6, 9]
+        num_targets = [2, 1]
+        uih_offsets = [0, 4, 6]
+        target_offsets = [0, 2, 3]
+        seq_embeddings = torch.rand(9, 128, device=device)
+        action_embeddings = encoder(
+            max_uih_len=4,
+            max_targets=2,
+            uih_offsets=torch.tensor(uih_offsets, device=device),
+            target_offsets=torch.tensor(target_offsets, device=device),
+            seq_embeddings=seq_embeddings,
+            seq_payloads={
+                "watchtimes": torch.tensor(watchtimes, device=device),
+                "actions": torch.tensor(actions, device=device),
+            },
+        )
+        self.assertEqual(
+            action_embeddings.shape, (9, action_embedding_dim * num_action_types)
+        )
+        for b in range(len(seq_lengths)):
+            b_start = seq_offsets[b]
+            b_end = seq_offsets[b + 1]
+            u_start = uih_offsets[b]
+            for j in range(b_start, b_end):
+                embedding = action_embeddings[j].view(num_action_types, -1)
+                for atype in range(num_action_types):
+                    if b_end - j <= num_targets[b]:
+                        torch.testing.assert_allclose(
+                            embedding[atype],
+                            encoder._target_action_embedding_table.view(
+                                num_action_types, -1
+                            )[atype],
+                        )
+                    else:
+                        if atype in enabled_actions[j - b_start + u_start]:
+                            torch.testing.assert_allclose(
+                                embedding[atype],
+                                encoder._action_embedding_table[atype],
+                            )
+                        else:
+                            torch.testing.assert_allclose(
+                                embedding[atype], torch.zeros_like(embedding[atype])
+                            )
+        action_embeddings.sum().backward()
diff --git a/recommendation_v4/generative_recommenders/modules/tests/content_encoder_test.py b/recommendation_v4/generative_recommenders/modules/tests/content_encoder_test.py
new file mode 100644
index 000000000..e67656388
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/tests/content_encoder_test.py
@@ -0,0 +1,74 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import unittest
+
+import torch
+from generative_recommenders.common import gpu_unavailable
+from generative_recommenders.modules.content_encoder import ContentEncoder
+
+
+class ContentEncoderTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    def test_forward(self) -> None:
+        device = torch.device("cuda")
+        input_embedding_dim = 32
+        additional_embedding_dim = 64
+        enrich_embedding_dim = 16
+        encoder = ContentEncoder(
+            input_embedding_dim=input_embedding_dim,
+            additional_content_features={
+                "a0": additional_embedding_dim,
+                "a1": additional_embedding_dim,
+            },
+            target_enrich_features={
+                "t0": enrich_embedding_dim,
+                "t1": enrich_embedding_dim,
+            },
+            is_inference=False,
+        ).to(device)
+        seq_lengths = [6, 3]
+        num_targets = [2, 1]
+        uih_offsets = [0, 4, 6]
+        target_offsets = [0, 2, 3]
+        seq_embeddings = torch.rand(
+            sum(seq_lengths), input_embedding_dim, device=device
+        ).requires_grad_(True)
+        seq_payloads = {
+            "a0": torch.rand(
+                sum(seq_lengths), additional_embedding_dim, device=device
+            ).requires_grad_(True),
+            "a1": torch.rand(
+                sum(seq_lengths), additional_embedding_dim, device=device
+            ).requires_grad_(True),
+            "t0": torch.rand(
+                sum(num_targets), enrich_embedding_dim, device=device
+            ).requires_grad_(True),
+            "t1": torch.rand(
+                sum(num_targets), enrich_embedding_dim, device=device
+            ).requires_grad_(True),
+        }
+        content_embeddings = encoder(
+            max_uih_len=4,
+            max_targets=2,
+            uih_offsets=torch.tensor(uih_offsets, device=device),
+            target_offsets=torch.tensor(target_offsets, device=device),
+            seq_embeddings=seq_embeddings,
+            seq_payloads=seq_payloads,
+        )
+        content_embeddings.sum().backward()
diff --git a/recommendation_v4/generative_recommenders/modules/tests/contextual_interleave_preprocessor_test.py b/recommendation_v4/generative_recommenders/modules/tests/contextual_interleave_preprocessor_test.py
new file mode 100644
index 000000000..c3202072c
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/tests/contextual_interleave_preprocessor_test.py
@@ -0,0 +1,499 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import unittest
+
+import torch
+from generative_recommenders.common import gpu_unavailable
+from generative_recommenders.modules.action_encoder import ActionEncoder
+from generative_recommenders.modules.content_encoder import ContentEncoder
+from generative_recommenders.modules.contextual_interleave_preprocessor import (
+    ContextualInterleavePreprocessor,
+)
+from generative_recommenders.modules.contextualize_mlps import (
+    ParameterizedContextualizedMLP,
+    SimpleContextualizedMLP,
+)
+from hypothesis import given, settings, strategies as st, Verbosity
+
+
+class ContextualInterleavePreprocessorTest(unittest.TestCase):
+    # pyre-ignore
+    @given(
+        enable_interleaving=st.sampled_from([True, False]),
+        enable_pmlp=st.sampled_from([True, False]),
+        is_train=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.float32, torch.bfloat16]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @unittest.skipIf(*gpu_unavailable)
+    @settings(verbosity=Verbosity.verbose, max_examples=50, deadline=None)
+    def test_forward(
+        self,
+        enable_interleaving: bool,
+        enable_pmlp: bool,
+        is_train: bool,
+        dtype: torch.dtype,
+    ) -> None:
+        device = torch.device("cuda")
+
+        input_embedding_dim = 64
+        output_embedding_dim = 32
+        action_embedding_dim = 16
+        action_encoder_hidden_dim = 256
+        content_encoder_hidden_dim = 128
+        contextual_len = 3
+
+        content_encoder = ContentEncoder(
+            input_embedding_dim=input_embedding_dim,
+            additional_content_features={
+                "a0": input_embedding_dim,
+                "a1": input_embedding_dim,
+            },
+            target_enrich_features={
+                "t0": input_embedding_dim,
+                "t1": input_embedding_dim,
+            },
+            is_inference=False,
+        ).to(device)
+        action_embedding_dim = 32
+        action_weights = [1, 2, 4, 8, 16]
+        watchtime_to_action_thresholds_and_weights = [
+            (30, 32),
+            (60, 64),
+            (100, 128),
+        ]
+        action_encoder = ActionEncoder(
+            watchtime_feature_name="watchtimes",
+            action_feature_name="actions",
+            action_weights=action_weights,
+            watchtime_to_action_thresholds_and_weights=watchtime_to_action_thresholds_and_weights,
+            action_embedding_dim=action_embedding_dim,
+            is_inference=False,
+        ).to(device)
+
+        preprocessor = ContextualInterleavePreprocessor(
+            input_embedding_dim=input_embedding_dim,
+            output_embedding_dim=output_embedding_dim,
+            contextual_feature_to_max_length={"c_0": 1, "c_1": 2},
+            contextual_feature_to_min_uih_length={"c_1": 4},
+            pmlp_contextual_dropout_ratio=0.2,
+            content_encoder=content_encoder,
+            content_contextualize_mlp_fn=lambda in_dim,
+            out_dim,
+            contextual_dim,
+            is_inference: ParameterizedContextualizedMLP(
+                contextual_embedding_dim=contextual_dim,
+                sequential_input_dim=in_dim,
+                sequential_output_dim=out_dim,
+                hidden_dim=content_encoder_hidden_dim,
+                is_inference=is_inference,
+            )
+            if enable_pmlp
+            else SimpleContextualizedMLP(
+                sequential_input_dim=in_dim,
+                sequential_output_dim=out_dim,
+                hidden_dim=content_encoder_hidden_dim,
+                is_inference=is_inference,
+            ),
+            action_encoder=action_encoder,
+            action_contextualize_mlp_fn=lambda in_dim,
+            out_dim,
+            contextual_dim,
+            is_inference: ParameterizedContextualizedMLP(
+                contextual_embedding_dim=contextual_dim,
+                sequential_input_dim=in_dim,
+                sequential_output_dim=out_dim,
+                hidden_dim=action_encoder_hidden_dim,
+                is_inference=is_inference,
+            )
+            if enable_pmlp
+            else SimpleContextualizedMLP(
+                sequential_input_dim=in_dim,
+                sequential_output_dim=out_dim,
+                hidden_dim=action_encoder_hidden_dim,
+                is_inference=is_inference,
+            ),
+            enable_interleaving=enable_interleaving,
+            is_inference=False,
+        ).to(device)
+        preprocessor.set_training_dtype(dtype)
+        if not is_train:
+            preprocessor.eval()
+
+        # inputs
+        seq_lengths = [6, 3]
+        num_targets = [2, 1]
+        seq_embeddings = torch.rand(
+            (sum(seq_lengths), input_embedding_dim),
+            device=device,
+            dtype=dtype,
+        )
+        seq_timestamps = torch.tensor(
+            [1, 2, 3, 4, 5, 6, 10, 20, 30],
+            device=device,
+        )
+        watchtimes = [40, 20, 110, 31, 26, 55]
+        actions = [1, 3, 26, 30, 6, 4]
+        (
+            output_max_seq_len,
+            output_total_uih_len,
+            output_total_targets,
+            output_seq_lengths,
+            output_seq_offsets,
+            output_seq_timestamps,
+            output_seq_embeddings,
+            output_num_targets,
+            _,
+        ) = preprocessor(
+            max_uih_len=4,
+            max_targets=2,
+            total_uih_len=sum(seq_lengths) - sum(num_targets),
+            total_targets=sum(num_targets),
+            seq_lengths=torch.tensor(seq_lengths, device=device),
+            seq_timestamps=seq_timestamps,
+            seq_embeddings=seq_embeddings,
+            seq_payloads={
+                # contextual
+                "c_0": torch.rand((2, input_embedding_dim), device=device, dtype=dtype),
+                "c_0_offsets": torch.tensor([0, 1, 1], device=device),
+                "c_1": torch.rand((4, input_embedding_dim), device=device, dtype=dtype),
+                "c_1_offsets": torch.tensor([0, 2, 3], device=device),
+                # action
+                "watchtimes": torch.tensor(watchtimes, device=device),
+                "actions": torch.tensor(actions, device=device),
+                # content
+                "a0": torch.rand_like(seq_embeddings).requires_grad_(True),
+                "a1": torch.rand_like(seq_embeddings).requires_grad_(True),
+                "t0": torch.rand(
+                    sum(num_targets), input_embedding_dim, device=device, dtype=dtype
+                ).requires_grad_(True),
+                "t1": torch.rand(
+                    sum(num_targets), input_embedding_dim, device=device, dtype=dtype
+                ).requires_grad_(True),
+            },
+            num_targets=torch.tensor(num_targets, device=device),
+        )
+        if enable_interleaving:
+            if is_train:
+                expected_output_seq_lengths = [
+                    2 * s + contextual_len for s in seq_lengths
+                ]
+                expected_max_seq_len = max(expected_output_seq_lengths)
+                expected_output_num_targets = [2 * s for s in num_targets]
+                expected_seq_embedding_size = (
+                    sum(expected_output_seq_lengths),
+                    output_embedding_dim,
+                )
+                expected_seq_timestamps_size = (sum(expected_output_seq_lengths),)
+                expected_output_seq_timestamps = [
+                    0,
+                    0,
+                    0,
+                    1,
+                    1,
+                    2,
+                    2,
+                    3,
+                    3,
+                    4,
+                    4,
+                    5,
+                    5,
+                    6,
+                    6,
+                    0,
+                    0,
+                    0,
+                    10,
+                    10,
+                    20,
+                    20,
+                    30,
+                    30,
+                ]
+            else:
+                expected_output_seq_lengths = [
+                    2 * s - n + contextual_len for s, n in zip(seq_lengths, num_targets)
+                ]
+                expected_max_seq_len = max(expected_output_seq_lengths)
+                expected_output_num_targets = num_targets
+                expected_seq_embedding_size = (
+                    sum(expected_output_seq_lengths),
+                    output_embedding_dim,
+                )
+                expected_seq_timestamps_size = (sum(expected_output_seq_lengths),)
+                expected_output_seq_timestamps = [
+                    0,
+                    0,
+                    0,
+                    1,
+                    1,
+                    2,
+                    2,
+                    3,
+                    3,
+                    4,
+                    4,
+                    5,
+                    6,
+                    0,
+                    0,
+                    0,
+                    10,
+                    10,
+                    20,
+                    20,
+                    30,
+                ]
+        else:
+            expected_output_seq_lengths = [s + contextual_len for s in seq_lengths]
+            expected_max_seq_len = max(expected_output_seq_lengths)
+            expected_output_num_targets = num_targets
+            expected_seq_embedding_size = (
+                sum(expected_output_seq_lengths),
+                output_embedding_dim,
+            )
+            expected_seq_timestamps_size = (sum(expected_output_seq_lengths),)
+            expected_output_seq_timestamps = [
+                0,
+                0,
+                0,
+                1,
+                2,
+                3,
+                4,
+                5,
+                6,
+                0,
+                0,
+                0,
+                10,
+                20,
+                30,
+            ]
+
+        self.assertEqual(output_max_seq_len, expected_max_seq_len)
+        self.assertEqual(output_seq_lengths.tolist(), expected_output_seq_lengths)
+        torch.testing.assert_allclose(
+            torch.ops.fbgemm.asynchronous_complete_cumsum(output_seq_lengths),
+            output_seq_offsets,
+        )
+        self.assertEqual(output_num_targets.tolist(), expected_output_num_targets)
+        self.assertEqual(
+            output_seq_embeddings.size(),
+            expected_seq_embedding_size,
+        )
+        self.assertEqual(
+            output_seq_timestamps.size(),
+            expected_seq_timestamps_size,
+        )
+        self.assertEqual(
+            output_seq_timestamps.tolist(),
+            expected_output_seq_timestamps,
+        )
+
+        # test combine embeddings
+        batch_size = 10
+        max_uih_len = 100
+        max_targets = 20
+        max_seq_len = max_uih_len + max_targets
+        seq_lengths = torch.randint(0, max_uih_len, (batch_size,), device=device)
+        total_uih_len = int(seq_lengths.sum().item())
+        num_targets = torch.randint(1, max_targets, (batch_size,), device=device)
+        total_targets = int(num_targets.sum().item())
+        seq_lengths = seq_lengths + num_targets
+        seq_offsets = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        seq_offsets[1:] = torch.cumsum(seq_lengths, dim=0)
+        total_seq_len = int(torch.sum(seq_lengths).item())
+        seq_timestamps = torch.randint(0, 1000000, (total_seq_len,), device=device)
+        content_embeddings = torch.rand(
+            (total_seq_len, output_embedding_dim),
+            device=device,
+        ).requires_grad_(True)
+        action_embeddings = torch.rand(
+            (total_seq_len, output_embedding_dim),
+            device=device,
+        ).requires_grad_(True)
+        contextual_embeddings = torch.rand(
+            (total_seq_len, 3 * output_embedding_dim), device=device
+        ).requires_grad_(True)
+        (
+            output_max_seq_len,
+            output_total_uih_len,
+            output_total_targets,
+            output_seq_lengths,
+            output_seq_offsets,
+            output_seq_timestamps,
+            output_seq_embeddings,
+            output_num_targets,
+        ) = preprocessor.combine_embeddings(
+            max_uih_len=max_uih_len,
+            max_targets=max_targets,
+            total_uih_len=total_uih_len,
+            total_targets=total_targets,
+            seq_lengths=seq_lengths,
+            seq_timestamps=seq_timestamps,
+            content_embeddings=content_embeddings,
+            action_embeddings=action_embeddings,
+            contextual_embeddings=contextual_embeddings,
+            num_targets=num_targets,
+        )
+        seq_embeddings = action_embeddings + content_embeddings
+        if enable_interleaving:
+            if is_train:
+                self.assertEqual(
+                    output_max_seq_len,
+                    max_seq_len * 2 + contextual_len,
+                )
+                self.assertEqual(
+                    output_total_uih_len,
+                    total_uih_len * 2 + contextual_len * batch_size,
+                )
+                self.assertEqual(
+                    output_total_targets,
+                    total_targets * 2,
+                )
+                torch.testing.assert_allclose(
+                    output_seq_lengths, seq_lengths * 2 + contextual_len
+                )
+                torch.testing.assert_allclose(output_num_targets, num_targets * 2)
+            else:
+                self.assertEqual(
+                    output_max_seq_len,
+                    max_uih_len * 2 + max_targets + contextual_len,
+                )
+                self.assertEqual(
+                    output_total_uih_len,
+                    total_uih_len * 2 + contextual_len * batch_size,
+                )
+                self.assertEqual(
+                    output_total_targets,
+                    total_targets,
+                )
+                torch.testing.assert_allclose(
+                    output_seq_lengths, seq_lengths * 2 - num_targets + contextual_len
+                )
+                torch.testing.assert_allclose(output_num_targets, num_targets)
+        else:
+            self.assertEqual(
+                output_max_seq_len,
+                max_seq_len + contextual_len,
+            )
+            self.assertEqual(
+                output_total_uih_len,
+                total_uih_len + contextual_len * batch_size,
+            )
+            self.assertEqual(
+                output_total_targets,
+                total_targets,
+            )
+            torch.testing.assert_allclose(
+                output_seq_lengths, seq_lengths + contextual_len
+            )
+            torch.testing.assert_allclose(output_num_targets, num_targets)
+        for b in range(batch_size):
+            input_start = int(seq_offsets[b].item())
+            input_end = int(seq_offsets[b + 1].item())
+            output_start = int(output_seq_offsets[b].item())
+            output_end = int(output_seq_offsets[b + 1].item())
+            input_targets = int(num_targets[b].item())
+            output_targets = int(output_num_targets[b].item())
+            torch.testing.assert_allclose(
+                output_seq_timestamps[output_start : output_start + contextual_len],
+                torch.zeros(3, device=device),
+            )
+            torch.testing.assert_allclose(
+                output_seq_embeddings[
+                    output_start : output_start + contextual_len
+                ].view(-1),
+                contextual_embeddings[b],
+            )
+            if enable_interleaving:
+                torch.testing.assert_allclose(
+                    output_seq_timestamps[
+                        output_start + contextual_len : output_end - output_targets
+                    ].view(-1, 2)[:, 0],
+                    seq_timestamps[input_start : input_end - input_targets],
+                )
+                torch.testing.assert_allclose(
+                    output_seq_timestamps[
+                        output_start + contextual_len : output_end - output_targets
+                    ].view(-1, 2)[:, 1],
+                    seq_timestamps[input_start : input_end - input_targets],
+                )
+                torch.testing.assert_allclose(
+                    output_seq_embeddings[
+                        output_start + contextual_len : output_end - output_targets
+                    ].view(-1, 2, output_embedding_dim)[:, 0, :],
+                    content_embeddings[input_start : input_end - input_targets],
+                )
+                torch.testing.assert_allclose(
+                    output_seq_embeddings[
+                        output_start + contextual_len : output_end - output_targets
+                    ].view(-1, 2, output_embedding_dim)[:, 1, :],
+                    action_embeddings[input_start : input_end - input_targets],
+                )
+                if is_train:
+                    torch.testing.assert_allclose(
+                        output_seq_timestamps[
+                            output_end - output_targets : output_end
+                        ].view(-1, 2)[:, 0],
+                        seq_timestamps[input_end - input_targets : input_end],
+                    )
+                    torch.testing.assert_allclose(
+                        output_seq_timestamps[
+                            output_end - output_targets : output_end
+                        ].view(-1, 2)[:, 1],
+                        seq_timestamps[input_end - input_targets : input_end],
+                    )
+                    torch.testing.assert_allclose(
+                        output_seq_embeddings[
+                            output_end - output_targets : output_end
+                        ].view(-1, 2, output_embedding_dim)[:, 0, :],
+                        content_embeddings[input_end - input_targets : input_end],
+                    )
+                    torch.testing.assert_allclose(
+                        output_seq_embeddings[
+                            output_end - output_targets : output_end
+                        ].view(-1, 2, output_embedding_dim)[:, 1, :],
+                        action_embeddings[input_end - input_targets : input_end],
+                    )
+                else:
+                    torch.testing.assert_allclose(
+                        output_seq_timestamps[output_end - output_targets : output_end],
+                        seq_timestamps[input_end - input_targets : input_end],
+                    )
+                    torch.testing.assert_allclose(
+                        output_seq_embeddings[output_end - output_targets : output_end],
+                        content_embeddings[input_end - input_targets : input_end],
+                    )
+            else:
+                torch.testing.assert_allclose(
+                    output_seq_timestamps[output_start + contextual_len : output_end],
+                    seq_timestamps[input_start:input_end],
+                )
+                torch.testing.assert_allclose(
+                    output_seq_embeddings[output_start + contextual_len : output_end],
+                    seq_embeddings[input_start:input_end],
+                )
diff --git a/recommendation_v4/generative_recommenders/modules/tests/dynamic_stu_test.py b/recommendation_v4/generative_recommenders/modules/tests/dynamic_stu_test.py
new file mode 100644
index 000000000..c1c598f1f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/tests/dynamic_stu_test.py
@@ -0,0 +1,279 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import copy
+import unittest
+from typing import List
+
+import torch
+from generative_recommenders.common import gpu_unavailable, HammerKernel, set_dev_mode
+from generative_recommenders.modules.dynamic_stu import L2STU, SDSTU
+from generative_recommenders.modules.stu import STU, STULayer, STULayerConfig, STUStack
+from hypothesis import given, settings, strategies as st, Verbosity
+
+
+class DynamicStuTest(unittest.TestCase):
+    # pyre-ignore
+    @given(
+        causal=st.sampled_from([True]),
+        num_layers=st.sampled_from([2]),
+        num_heads=st.sampled_from([2]),
+        max_uih_len=st.sampled_from([300]),
+        batch_size=st.sampled_from([8]),
+        embedding_dim=st.sampled_from([16]),
+        attention_dim=st.sampled_from([32]),
+        linear_hidden_dim=st.sampled_from([64]),
+        has_multiple_targets=st.sampled_from([True, False]),
+        contextual_seq_len=st.sampled_from([0, 10]),
+        dtype=st.sampled_from(
+            [torch.float32, torch.bfloat16]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @unittest.skipIf(*gpu_unavailable)
+    @settings(verbosity=Verbosity.verbose, max_examples=20, deadline=None)
+    def test_l2_stu(
+        self,
+        causal: bool,
+        num_layers: int,
+        num_heads: int,
+        max_uih_len: int,
+        batch_size: int,
+        embedding_dim: int,
+        attention_dim: int,
+        linear_hidden_dim: int,
+        has_multiple_targets: bool,
+        contextual_seq_len: int,
+        dtype: torch.dtype,
+    ) -> None:
+        set_dev_mode(True)
+        device = torch.device("cuda")
+        l3_stu_layers: List[STU] = [
+            STULayer(
+                config=STULayerConfig(
+                    embedding_dim=embedding_dim,
+                    num_heads=num_heads,
+                    hidden_dim=linear_hidden_dim,
+                    attention_dim=attention_dim,
+                    output_dropout_ratio=0.0,
+                    causal=causal,
+                    target_aware=has_multiple_targets,
+                    contextual_seq_len=contextual_seq_len,
+                ),
+                is_inference=False,
+            )
+            for _ in range(num_layers)
+        ]
+        l3_stu: List[STU] = [
+            L2STU(
+                stu=STUStack(
+                    stu_list=l3_stu_layers,
+                    is_inference=False,
+                ),
+                max_l2_len=100,
+                contextual_seq_len=contextual_seq_len,
+                is_inference=False,
+            )
+        ]
+        l2_stu_layers: List[STU] = [
+            STULayer(
+                config=STULayerConfig(
+                    embedding_dim=embedding_dim,
+                    num_heads=num_heads,
+                    hidden_dim=linear_hidden_dim,
+                    attention_dim=attention_dim,
+                    output_dropout_ratio=0.0,
+                    causal=causal,
+                    target_aware=has_multiple_targets,
+                    contextual_seq_len=contextual_seq_len,
+                ),
+                is_inference=False,
+            )
+            for _ in range(num_layers)
+        ] + l3_stu
+        l2_stu: List[STU] = [
+            L2STU(
+                stu=STUStack(
+                    stu_list=l2_stu_layers,
+                    is_inference=False,
+                ),
+                max_l2_len=200,
+                contextual_seq_len=contextual_seq_len,
+                is_inference=False,
+            )
+        ]
+        stu_layers: List[STU] = [
+            STULayer(
+                config=STULayerConfig(
+                    embedding_dim=embedding_dim,
+                    num_heads=num_heads,
+                    hidden_dim=linear_hidden_dim,
+                    attention_dim=attention_dim,
+                    output_dropout_ratio=0.0,
+                    causal=causal,
+                    target_aware=has_multiple_targets,
+                    contextual_seq_len=contextual_seq_len,
+                ),
+                is_inference=False,
+            )
+            for _ in range(num_layers)
+        ] + l2_stu
+        stu = STUStack(
+            stu_list=stu_layers,
+            is_inference=False,
+        ).to(device)
+        stu.recursive_setattr("_hammer_kernel", HammerKernel.TRITON)
+
+        x_lengths = torch.randint(max_uih_len + 1, (batch_size,), device=device)
+        x_lengths = x_lengths + contextual_seq_len
+        max_seq_len = max_uih_len + contextual_seq_len
+        max_targets = 20
+        num_targets = torch.randint(1, max_targets, size=(batch_size,), device=device)
+        if has_multiple_targets:
+            x_lengths = x_lengths + num_targets
+            max_seq_len = max_seq_len + max_targets
+        x_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(x_lengths)
+        total_seq_len = int(x_offsets[-1].cpu().item())
+        x = torch.randn(
+            int(total_seq_len),
+            embedding_dim,
+            device=device,
+            dtype=dtype,
+        ).requires_grad_(True)
+        stu_output = stu(
+            x=x,
+            x_lengths=x_lengths,
+            x_offsets=x_offsets,
+            max_seq_len=max_seq_len,
+            num_targets=num_targets,
+        )
+        dout = torch.randn_like(stu_output)
+        stu_output.backward(dout)
+        self.assertTrue(stu_output.shape == x.shape)
+
+    # pyre-ignore
+    @given(
+        causal=st.sampled_from([True]),
+        num_layers=st.sampled_from([2]),
+        num_heads=st.sampled_from([2]),
+        max_uih_len=st.sampled_from([300]),
+        batch_size=st.sampled_from([8]),
+        embedding_dim=st.sampled_from([16]),
+        attention_dim=st.sampled_from([32]),
+        linear_hidden_dim=st.sampled_from([64]),
+        has_multiple_targets=st.sampled_from([True, False]),
+        contextual_seq_len=st.sampled_from([0, 10]),
+        dropout_ratio=st.sampled_from([0.0, 0.3, 1.0]),
+        dtype=st.sampled_from(
+            [torch.float32, torch.bfloat16]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @unittest.skipIf(*gpu_unavailable)
+    @settings(verbosity=Verbosity.verbose, max_examples=20, deadline=None)
+    def test_sd_stu(
+        self,
+        causal: bool,
+        num_layers: int,
+        num_heads: int,
+        max_uih_len: int,
+        batch_size: int,
+        embedding_dim: int,
+        attention_dim: int,
+        linear_hidden_dim: int,
+        has_multiple_targets: bool,
+        contextual_seq_len: int,
+        dropout_ratio: float,
+        dtype: torch.dtype,
+    ) -> None:
+        set_dev_mode(True)
+        device = torch.device("cuda")
+        stu_layers: List[STU] = [
+            STULayer(
+                config=STULayerConfig(
+                    embedding_dim=embedding_dim,
+                    num_heads=num_heads,
+                    hidden_dim=linear_hidden_dim,
+                    attention_dim=attention_dim,
+                    output_dropout_ratio=0.0,
+                    causal=causal,
+                    target_aware=has_multiple_targets,
+                    contextual_seq_len=contextual_seq_len,
+                ),
+                is_inference=False,
+            )
+            for _ in range(num_layers)
+        ]
+        stu = STUStack(
+            stu_list=stu_layers,
+            is_inference=False,
+        ).to(device)
+        sd_stu = SDSTU(
+            stu=copy.deepcopy(stu),
+            dropout_ratio=dropout_ratio,
+            is_inference=False,
+        ).to(device)
+        stu.recursive_setattr("_hammer_kernel", HammerKernel.PYTORCH)
+        sd_stu.recursive_setattr("_hammer_kernel", HammerKernel.PYTORCH)
+        x_lengths = torch.randint(max_uih_len + 1, (batch_size,), device=device)
+        x_lengths = x_lengths + contextual_seq_len
+        max_seq_len = max_uih_len + contextual_seq_len
+        max_targets = 20
+        num_targets = torch.randint(1, max_targets, size=(batch_size,), device=device)
+        if has_multiple_targets:
+            x_lengths = x_lengths + num_targets
+            max_seq_len = max_seq_len + max_targets
+        x_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(x_lengths)
+        total_seq_len = int(x_offsets[-1].cpu().item())
+        x = torch.randn(
+            int(total_seq_len),
+            embedding_dim,
+            device=device,
+            dtype=dtype,
+        ).requires_grad_(True)
+        stu_output = stu(
+            x=x,
+            x_lengths=x_lengths,
+            x_offsets=x_offsets,
+            max_seq_len=max_seq_len,
+            num_targets=num_targets,
+        )
+        dout = torch.randn_like(stu_output)
+        stu_output.backward(dout)
+        assert x.grad is not None
+        d_x, x.grad = x.grad.detach().clone(), None
+        x = x.detach().clone().requires_grad_(True)
+        sd_stu_output = sd_stu(
+            x=x,
+            x_lengths=x_lengths,
+            x_offsets=x_offsets,
+            max_seq_len=max_seq_len,
+            num_targets=num_targets,
+        )
+        dout = dout.detach().clone()
+        sd_stu_output.backward(dout)
+        d_sd_x = x.grad.detach().clone()
+
+        self.assertTrue(sd_stu_output.shape == x.shape)
+        if dropout_ratio == 0.0:
+            torch.testing.assert_close(stu_output, sd_stu_output)
+            torch.testing.assert_close(d_x, d_sd_x)
+        if dropout_ratio == 1.0:
+            torch.testing.assert_close(x, sd_stu_output)
diff --git a/recommendation_v4/generative_recommenders/modules/tests/multitask_module_test.py b/recommendation_v4/generative_recommenders/modules/tests/multitask_module_test.py
new file mode 100644
index 000000000..66f2db185
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/tests/multitask_module_test.py
@@ -0,0 +1,233 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import unittest
+from typing import Dict, List, Tuple
+
+import torch
+from generative_recommenders.common import gpu_unavailable, set_dev_mode
+from generative_recommenders.modules.multitask_module import (
+    DefaultMultitaskModule,
+    MultitaskTaskType,
+    TaskConfig,
+)
+from generative_recommenders.ops.layer_norm import SwishLayerNorm
+from hypothesis import given, settings, strategies as st, Verbosity
+
+
+_task_configs: List[List[TaskConfig]] = [
+    [
+        TaskConfig(
+            task_name="is_click",
+            task_weight=1,
+            task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+        ),
+    ],
+    [
+        TaskConfig(
+            task_name="vvp",
+            task_weight=2,
+            task_type=MultitaskTaskType.REGRESSION,
+        ),
+    ],
+    [
+        TaskConfig(
+            task_name="is_click",
+            task_weight=1,
+            task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+        ),
+        TaskConfig(
+            task_name="is_like",
+            task_weight=2,
+            task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+        ),
+        TaskConfig(
+            task_name="is_follow",
+            task_weight=4,
+            task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+        ),
+    ],
+    [
+        TaskConfig(
+            task_name="rating",
+            task_weight=1,
+            task_type=MultitaskTaskType.REGRESSION,
+        ),
+        TaskConfig(
+            task_name="vvp",
+            task_weight=2,
+            task_type=MultitaskTaskType.REGRESSION,
+        ),
+    ],
+    [
+        TaskConfig(
+            task_name="type_1",
+            task_weight=2,
+            task_type=MultitaskTaskType.REGRESSION,
+        ),
+    ],
+    [
+        TaskConfig(
+            task_name="is_click",
+            task_weight=1,
+            task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+        ),
+        TaskConfig(
+            task_name="is_like",
+            task_weight=2,
+            task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+        ),
+        TaskConfig(
+            task_name="is_follow",
+            task_weight=4,
+            task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+        ),
+        TaskConfig(
+            task_name="rating",
+            task_weight=1,
+            task_type=MultitaskTaskType.REGRESSION,
+        ),
+        TaskConfig(
+            task_name="vvp",
+            task_weight=2,
+            task_type=MultitaskTaskType.REGRESSION,
+        ),
+    ],
+    [
+        TaskConfig(
+            task_name="is_click",
+            task_weight=1,
+            task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+        ),
+        TaskConfig(
+            task_name="is_like",
+            task_weight=2,
+            task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+        ),
+        TaskConfig(
+            task_name="is_follow",
+            task_weight=4,
+            task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+        ),
+        TaskConfig(
+            task_name="rating",
+            task_weight=1,
+            task_type=MultitaskTaskType.REGRESSION,
+        ),
+        TaskConfig(
+            task_name="vvp",
+            task_weight=2,
+            task_type=MultitaskTaskType.REGRESSION,
+        ),
+    ],
+]
+
+
+def _get_random_supervision_labels_and_weights(
+    num_examples: int,
+    task_configs: List[TaskConfig],
+    device: torch.device,
+) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]:
+    supervision_labels: Dict[str, torch.Tensor] = {}
+    supervision_weights: Dict[str, torch.Tensor] = {}
+    for task in task_configs:
+        if task.task_type == MultitaskTaskType.REGRESSION:
+            supervision_labels[task.task_name] = torch.randn(
+                num_examples, device=device
+            ).to(torch.float32)
+        else:
+            supervision_labels[task.task_name] = torch.randint(
+                0,
+                11,
+                (num_examples,),
+                device=device,
+            ).to(torch.float32)
+
+    return supervision_labels, supervision_weights
+
+
+class MultiTaskModuleTest(unittest.TestCase):
+    # pyre-ignore
+    @given(
+        task_config_idx=st.sampled_from(range(len(_task_configs))),
+        training=st.booleans(),
+        is_inference=st.booleans(),
+        dtype=st.sampled_from(
+            [torch.float32, torch.bfloat16]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @unittest.skipIf(*gpu_unavailable)
+    @settings(verbosity=Verbosity.verbose, max_examples=50, deadline=None)
+    def test_default_multitask_module(
+        self,
+        task_config_idx: int,
+        training: bool,
+        is_inference: bool,
+        dtype: torch.dtype,
+    ) -> None:
+        set_dev_mode(True)
+        device = torch.device("cuda")
+
+        L = 200
+        embedding_dim = 64
+        causal_multitask_weights = 0.3
+
+        task_configs: List[TaskConfig] = _task_configs[task_config_idx]
+        task_configs.sort(key=lambda x: x.task_type)
+        multitask_module = DefaultMultitaskModule(
+            task_configs=task_configs,
+            embedding_dim=embedding_dim,
+            prediction_fn=lambda in_dim, num_tasks: torch.nn.Sequential(
+                torch.nn.Linear(in_features=in_dim, out_features=512),
+                SwishLayerNorm(512),
+                torch.nn.Linear(in_features=512, out_features=num_tasks),
+            ),
+            causal_multitask_weights=causal_multitask_weights,
+            is_inference=is_inference,
+        ).to(device)
+        multitask_module.set_training_dtype(dtype)
+        supervision_labels, supervision_weights = (
+            _get_random_supervision_labels_and_weights(
+                num_examples=L,
+                task_configs=task_configs,
+                device=device,
+            )
+        )
+        encoded_user_embeddings = torch.rand(L, embedding_dim, device=device)
+        item_embeddings = torch.rand(L, embedding_dim, device=device)
+
+        (
+            mt_preds,
+            mt_labels,
+            mt_weights,
+            mt_losses,
+        ) = multitask_module(
+            encoded_user_embeddings=encoded_user_embeddings,
+            item_embeddings=item_embeddings,
+            supervision_labels=supervision_labels,
+            supervision_weights=supervision_weights,
+        )
+
+        self.assertEqual(mt_preds.size(), (len(task_configs), L))
+        if not is_inference:
+            self.assertEqual(mt_labels.size(), (len(task_configs), L))
+            self.assertEqual(mt_weights.size(), (len(task_configs), L))
+            if training:
+                self.assertEqual(mt_losses.size(), (len(task_configs),))
diff --git a/recommendation_v4/generative_recommenders/modules/tests/stu_test.py b/recommendation_v4/generative_recommenders/modules/tests/stu_test.py
new file mode 100644
index 000000000..f6440e55e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/tests/stu_test.py
@@ -0,0 +1,453 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import copy
+import unittest
+from typing import List
+
+import torch
+from generative_recommenders.common import gpu_unavailable, HammerKernel, set_dev_mode
+from generative_recommenders.modules.stu import STU, STULayer, STULayerConfig, STUStack
+from generative_recommenders.ops.jagged_tensors import split_2D_jagged
+from hypothesis import given, settings, strategies as st, Verbosity
+
+
+def _inplace_swap(
+    batch_size: int,
+    x: torch.Tensor,
+    swap_from: torch.Tensor,
+    swap_to: torch.Tensor,
+) -> torch.Tensor:
+    for i in range(batch_size):
+        tmp = x[i, swap_from[i], :].detach().clone()
+        x[i, swap_from[i], :] = x[i, swap_to[i], :]
+        x[i, swap_to[i], :] = tmp
+    return x
+
+
+class StuTest(unittest.TestCase):
+    # pyre-ignore
+    @given(
+        causal=st.sampled_from([True]),
+        num_layers=st.sampled_from([2]),
+        num_heads=st.sampled_from([1, 2]),
+        max_uih_len=st.sampled_from([20, 64]),
+        batch_size=st.sampled_from([8]),
+        embedding_dim=st.sampled_from([16]),
+        attention_dim=st.sampled_from([32]),
+        linear_hidden_dim=st.sampled_from([64]),
+        has_multiple_targets=st.sampled_from([True, False]),
+        contextual_seq_len=st.sampled_from([0, 10]),
+        use_group_norm=st.sampled_from([True, False]),
+        recompute_uvqk_in_backward=st.sampled_from([True, False]),
+        recompute_normed_x_in_backward=st.sampled_from([True, False]),
+        recompute_y_in_backward=st.sampled_from([True, False]),
+        empty_inputs=st.sampled_from([False]),
+        dtype=st.sampled_from(
+            [torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @unittest.skipIf(*gpu_unavailable)
+    @settings(verbosity=Verbosity.verbose, max_examples=100, deadline=None)
+    def test_triton(
+        self,
+        causal: bool,
+        num_layers: int,
+        num_heads: int,
+        max_uih_len: int,
+        batch_size: int,
+        embedding_dim: int,
+        attention_dim: int,
+        linear_hidden_dim: int,
+        has_multiple_targets: bool,
+        contextual_seq_len: int,
+        use_group_norm: bool,
+        recompute_uvqk_in_backward: bool,
+        recompute_normed_x_in_backward: bool,
+        recompute_y_in_backward: bool,
+        empty_inputs: bool,  # test the case where all the seqlen in the batch are 0
+        dtype: torch.dtype,
+    ) -> None:
+        set_dev_mode(True)
+        device = torch.device("cuda")
+
+        stu_layers: List[STU] = [
+            STULayer(
+                config=STULayerConfig(
+                    embedding_dim=embedding_dim,
+                    num_heads=num_heads,
+                    hidden_dim=linear_hidden_dim,
+                    attention_dim=attention_dim,
+                    output_dropout_ratio=0.0,
+                    causal=causal,
+                    target_aware=has_multiple_targets,
+                    max_attn_len=None,
+                    attn_alpha=None,
+                    use_group_norm=use_group_norm,
+                    recompute_normed_x=recompute_normed_x_in_backward,
+                    recompute_uvqk=recompute_uvqk_in_backward,
+                    recompute_y=recompute_y_in_backward,
+                    sort_by_length=True,
+                    contextual_seq_len=contextual_seq_len,
+                ),
+                is_inference=False,
+            )
+            for _ in range(num_layers)
+        ]
+        stu = STUStack(
+            stu_list=stu_layers,
+            is_inference=False,
+        ).to(device)
+        stu.recursive_setattr("_hammer_kernel", HammerKernel.PYTORCH)
+        stu_triton = copy.deepcopy(stu)
+        stu_triton.recursive_setattr("_hammer_kernel", HammerKernel.TRITON)
+
+        if empty_inputs:
+            x_lengths = torch.zeros(batch_size, dtype=torch.int32, device=device)
+            num_targets = torch.zeros(batch_size, dtype=torch.int32, device=device)
+            contextual_seq_len = 0
+            max_seq_len = 16
+        else:
+            x_lengths = torch.randint(max_uih_len + 1, (batch_size,), device=device)
+            x_lengths = x_lengths + contextual_seq_len
+            max_seq_len = max_uih_len + contextual_seq_len
+            max_targets = 20
+            num_targets = torch.randint(
+                1, max_targets, size=(batch_size,), device=device
+            )
+            if has_multiple_targets:
+                x_lengths = x_lengths + num_targets
+                max_seq_len = max_seq_len + max_targets
+        x_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(x_lengths)
+        total_seq_len = int(x_offsets[-1].cpu().item())
+        x = torch.randn(
+            int(total_seq_len),
+            embedding_dim,
+            device=device,
+            dtype=dtype,
+        ).requires_grad_(True)
+        x_triton = x.clone().detach().requires_grad_()
+        stu_output = stu(
+            x=x,
+            x_lengths=x_lengths,
+            x_offsets=x_offsets,
+            max_seq_len=max_seq_len,
+            num_targets=num_targets,
+        )
+        stu_triton_output = stu_triton(
+            x=x_triton,
+            x_lengths=x_lengths,
+            x_offsets=x_offsets,
+            max_seq_len=max_seq_len,
+            num_targets=num_targets,
+        )
+        atol = 5e-3 if dtype == torch.bfloat16 else None
+        rtol = 1e-2 if dtype == torch.bfloat16 else None
+        torch.testing.assert_close(stu_triton_output, stu_output, atol=atol, rtol=rtol)
+        dout = torch.randn_like(stu_output)
+        stu_output.backward(dout)
+        dout = dout.detach().clone()
+        stu_triton_output.backward(dout)
+        torch.testing.assert_close(x.grad, x_triton.grad, atol=atol, rtol=rtol)
+
+    # pyre-ignore
+    @given(
+        dtype=st.sampled_from(
+            [torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @unittest.skipIf(*gpu_unavailable)
+    @settings(verbosity=Verbosity.verbose, max_examples=8, deadline=None)
+    def test_target_invariance(
+        self,
+        dtype: torch.dtype,
+    ) -> None:
+        set_dev_mode(True)
+        device = torch.device("cuda")
+        num_layers = 2
+        num_heads = 2
+        max_seq_len = 32
+        batch_size = 8
+        embedding_dim = 16
+        attention_dim = 32
+        linear_hidden_dim = 32
+        causal = True
+        use_group_norm = False
+        recompute_normed_x_in_backward = False
+        recompute_uvqk_in_backward = False
+        recompute_y_in_backward = False
+        max_attn_len = None
+        stu_layers: List[STU] = [
+            STULayer(
+                config=STULayerConfig(
+                    embedding_dim=embedding_dim,
+                    num_heads=num_heads,
+                    hidden_dim=linear_hidden_dim,
+                    attention_dim=attention_dim,
+                    output_dropout_ratio=0.0,
+                    causal=causal,
+                    target_aware=True,
+                    max_attn_len=max_attn_len,
+                    attn_alpha=None,
+                    use_group_norm=use_group_norm,
+                    recompute_normed_x=recompute_normed_x_in_backward,
+                    recompute_uvqk=recompute_uvqk_in_backward,
+                    recompute_y=recompute_y_in_backward,
+                    sort_by_length=True,
+                    contextual_seq_len=0,
+                ),
+                is_inference=False,
+            )
+            for _ in range(num_layers)
+        ]
+        stu = STUStack(
+            stu_list=stu_layers,
+            is_inference=False,
+        ).to(device)
+
+        x_lengths = torch.randint(
+            low=2, high=max_seq_len + 1, size=(batch_size,), device=device
+        )
+        num_targets = torch.randint(low=2, high=10, size=(batch_size,), device=device)
+        x_lengths = x_lengths + num_targets
+        x_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(x_lengths)
+        total_seq_len = int(x_offsets[-1].cpu())
+
+        swap_from = torch.remainder(
+            torch.randint(20, (batch_size,), device=device), num_targets
+        )
+        swap_to = torch.remainder(
+            torch.randint(20, (batch_size,), device=device), num_targets
+        )
+        swap_from = x_lengths - 1 - swap_from
+        swap_to = x_lengths - 1 - swap_to
+        max_seq_len = int(x_lengths.max().item())
+
+        # forward()
+        x = torch.randn(
+            int(total_seq_len),
+            embedding_dim,
+            device=device,
+            dtype=dtype,
+        ).requires_grad_(True)
+        stu_output = stu(
+            x=x,
+            x_lengths=x_lengths,
+            x_offsets=x_offsets,
+            max_seq_len=max_seq_len,
+            num_targets=num_targets,
+        )
+        stu_output_dense = torch.ops.fbgemm.jagged_to_padded_dense(
+            values=stu_output,
+            offsets=[x_offsets],
+            max_lengths=[max_seq_len],
+            padding_value=0.0,
+        )
+
+        # swapped forward().
+        dense_x = torch.ops.fbgemm.jagged_to_padded_dense(
+            x.detach(),
+            [x_offsets],
+            [max_seq_len],
+        )
+        swapped_dense_x = _inplace_swap(batch_size, dense_x, swap_from, swap_to)
+        swapped_x = torch.ops.fbgemm.dense_to_jagged(
+            swapped_dense_x,
+            [x_offsets],
+        )[0].requires_grad_(True)
+        swapped_stu_output = stu(
+            x=swapped_x,
+            x_lengths=x_lengths,
+            x_offsets=x_offsets,
+            max_seq_len=max_seq_len,
+            num_targets=num_targets,
+        )
+        swapped_stu_output_dense = torch.ops.fbgemm.jagged_to_padded_dense(
+            values=swapped_stu_output,
+            offsets=[x_offsets],
+            max_lengths=[max_seq_len],
+            padding_value=0.0,
+        )
+
+        # backward
+        dout = torch.randn_like(stu_output_dense)
+        stu_output_dense.backward(dout)
+        dout = dout.detach().clone()
+        swapped_stu_output_dense.backward(
+            _inplace_swap(batch_size, dout, swap_from, swap_to)
+        )
+
+        swapped_swapped_stu_output_dense = _inplace_swap(
+            batch_size, swapped_stu_output_dense, swap_from, swap_to
+        )
+        torch.testing.assert_close(stu_output_dense, swapped_swapped_stu_output_dense)
+
+        # backward
+        torch.testing.assert_close(
+            torch.ops.fbgemm.jagged_to_padded_dense(
+                swapped_x.grad,
+                [x_offsets],
+                [max_seq_len],
+            ),
+            _inplace_swap(
+                batch_size,
+                torch.ops.fbgemm.jagged_to_padded_dense(
+                    x.grad,
+                    [x_offsets],
+                    [max_seq_len],
+                ),
+                swap_from,
+                swap_to,
+            ),
+        )
+
+    # pyre-ignore[56]
+    @given(
+        num_layers=st.sampled_from([1, 2, 4]),
+        num_heads=st.sampled_from([1, 4]),
+        max_uih_len=st.sampled_from([20, 128]),
+        batch_size=st.sampled_from([4, 8]),
+        embedding_dim=st.sampled_from([32]),
+        attention_dim=st.sampled_from([16]),
+        linear_hidden_dim=st.sampled_from([64]),
+        contextual_seq_len=st.sampled_from([0, 10]),
+    )
+    @settings(verbosity=Verbosity.verbose, max_examples=20, deadline=None)
+    @unittest.skipIf(*gpu_unavailable)
+    @torch.inference_mode()
+    def test_cached_forward(
+        self,
+        num_layers: int,
+        num_heads: int,
+        max_uih_len: int,
+        batch_size: int,
+        embedding_dim: int,
+        attention_dim: int,
+        linear_hidden_dim: int,
+        contextual_seq_len: int,
+    ) -> None:
+        set_dev_mode(True)
+        device = torch.device("cuda")
+
+        torch.backends.cudnn.allow_tf32 = True
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+        use_group_norm = False
+        recompute_normed_x_in_backward = False
+        recompute_uvqk_in_backward = False
+        recompute_y_in_backward = False
+        max_attn_len = None
+        stu_layers: List[STU] = [
+            STULayer(
+                config=STULayerConfig(
+                    embedding_dim=embedding_dim,
+                    num_heads=num_heads,
+                    hidden_dim=linear_hidden_dim,
+                    attention_dim=attention_dim,
+                    output_dropout_ratio=0.0,
+                    causal=True,
+                    target_aware=True,
+                    max_attn_len=max_attn_len,
+                    attn_alpha=None,
+                    use_group_norm=use_group_norm,
+                    recompute_normed_x=recompute_normed_x_in_backward,
+                    recompute_uvqk=recompute_uvqk_in_backward,
+                    recompute_y=recompute_y_in_backward,
+                    sort_by_length=True,
+                    contextual_seq_len=contextual_seq_len,
+                ),
+                is_inference=True,
+            )
+            for _ in range(num_layers)
+        ]
+        stu = STUStack(
+            stu_list=stu_layers,
+            is_inference=True,
+        ).to(device)
+        stu.recursive_setattr("_hammer_kernel", HammerKernel.TRITON)
+        stu.eval()
+
+        x_lengths = torch.randint(
+            max_uih_len, max_uih_len + 1, (batch_size,), device=device
+        )
+        x_lengths = x_lengths + contextual_seq_len
+        max_seq_len = max_uih_len + contextual_seq_len
+        delta_size = 20
+        max_targets = delta_size * 2
+        num_targets = torch.randint(
+            delta_size, max_targets + 1, size=(batch_size,), device=device
+        )
+        x_lengths = x_lengths + num_targets + contextual_seq_len
+        max_seq_len = max_seq_len + max_targets + contextual_seq_len
+        x_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(x_lengths)
+        total_seq_len = int(x_offsets[-1].cpu().item())
+        x = torch.randn(
+            int(total_seq_len),
+            embedding_dim,
+            device=device,
+        ).requires_grad_(True)
+
+        # default forward().
+        ref_y = stu(
+            x=x,
+            x_lengths=x_lengths,
+            x_offsets=x_offsets,
+            max_seq_len=max_seq_len,
+            num_targets=num_targets,
+        )
+        prime_lengths = x_lengths - delta_size
+        prime_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(prime_lengths)
+        _, ref_delta_y = split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=ref_y,
+            max_len_left=None,
+            max_len_right=delta_size,
+            offsets_left=prime_offsets,
+            offsets_right=None,
+            kernel=HammerKernel.TRITON,
+        )
+
+        # cached forward().
+        prime_x, delta_x = split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=x,
+            max_len_left=None,
+            max_len_right=delta_size,
+            offsets_left=prime_offsets,
+            offsets_right=None,
+            kernel=HammerKernel.TRITON,
+        )
+        _ = stu(
+            x=prime_x,
+            x_lengths=prime_lengths,
+            x_offsets=prime_offsets,
+            max_seq_len=max_seq_len,
+            num_targets=num_targets - delta_size,
+            max_kv_caching_len=max_seq_len - delta_size,
+            kv_caching_lengths=x_lengths - delta_size,
+        )
+        delta_y = stu.cached_forward(
+            delta_x=delta_x,
+            num_targets=num_targets,
+        )
+
+        torch.testing.assert_close(ref_delta_y, delta_y)
diff --git a/recommendation_v4/generative_recommenders/ops/benchmarks/addmm_bench.py b/recommendation_v4/generative_recommenders/ops/benchmarks/addmm_bench.py
new file mode 100644
index 000000000..b1be3a803
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/benchmarks/addmm_bench.py
@@ -0,0 +1,174 @@
+# pyre-unsafe
+import time
+from typing import List, Optional, Tuple
+
+import click
+import pandas as pd
+import torch
+
+# @manual=//triton:triton
+import triton
+from generative_recommenders.common import HammerKernel
+from generative_recommenders.ops.triton.triton_addmm import (
+    triton_addmm_fwd,
+    triton_addmm_fwd_tma_persistent,
+    triton_addmm_fwd_tma_ws_persistent_tlx,
+    triton_addmm_fwd_tma_ws_tlx,
+)
+from generative_recommenders.ops.utils import is_sm100
+
+try:
+    # @manual=//triton:triton
+    import triton.language.extra.tlx as tlx  # type: ignore
+
+    HAS_TLX = True
+except ImportError:
+    tlx = None
+    HAS_TLX = False
+
+
+def get_kernel(provider: str) -> HammerKernel:
+    if provider == "triton":
+        return HammerKernel.TRITON
+    elif provider == "pytorch":
+        return HammerKernel.PYTORCH
+    else:
+        raise ValueError(f"Unknown provider {provider}")
+
+
+def get_dtype(dtype: str) -> torch.dtype:
+    if dtype == "bfloat16":
+        return torch.bfloat16
+    elif dtype == "float32":
+        return torch.float32
+    elif dtype == "float16":
+        return torch.float16
+    else:
+        raise ValueError(f"Not supported dtype {dtype}")
+
+
+@click.command()
+@click.option("--m", type=int, default=0)
+@click.option("--k", type=int, default=4096)
+@click.option("--n", type=int, default=4096)
+@click.option("--dtype", type=str, default="bfloat16")
+@click.option("--return-result", type=bool, default=False)
+@click.option("--broadcast-y", type=bool, is_flag=True, default=False)
+def main(
+    m: int,
+    k: int,
+    n: int,
+    dtype: str,
+    return_result: bool,
+    broadcast_y: bool,
+) -> Optional[Tuple[List[triton.testing.Benchmark], List[pd.DataFrame]]]:
+    if m == 0:
+        batch_sizes = [64, 128, 256, 512]
+    else:
+        batch_sizes = [m]
+    line_vals = [
+        "pytorch",
+        "triton",
+        "triton_tma_persistent",
+        "triton_tma_persistent_ws",
+    ]
+    line_names = [
+        "PyTorch",
+        "Triton",
+        "Triton TMA Persistent",
+        "Triton TMA Persistent WS",
+    ]
+    styles = [
+        ("red", "-"),
+        ("green", "-"),
+        ("orange", "-"),
+        ("purple", "-"),
+    ]
+    if is_sm100() and HAS_TLX:  # tmem is only supported on Blackwell
+        line_vals.append("triton_tma_ws_tlx")
+        line_names.append("Triton TMA WS TLX")
+        styles.append(("cyan", "-"))
+        line_vals.append("triton_tma_persistent_ws_tlx")
+        line_names.append("Triton TMA Persistent WS TLX")
+        styles.append(("magenta", "-"))
+    configs: List[triton.testing.Benchmark] = [
+        triton.testing.Benchmark(
+            x_names=["batch_size"],
+            x_vals=batch_sizes,
+            line_arg="provider",
+            line_vals=line_vals,
+            line_names=line_names,
+            styles=styles,
+            ylabel="ms",
+            plot_name=f"addmm-K-{k}-N-{n}-mode-{mode}-dtype-{dtype}-broadcast_y-{broadcast_y}",
+            args={
+                "K": k,
+                "N": n,
+                "dtype": dtype,
+                "broadcast_y": broadcast_y,
+            },
+        )
+        for mode in ["fwd"]
+    ]
+
+    @triton.testing.perf_report(configs)
+    def bench_addmm(
+        batch_size: int,
+        K: int,
+        N: int,
+        dtype: str,
+        provider: str,
+        broadcast_y: bool,
+    ) -> float:
+        warmup = 20
+        rep = 2000
+        x = torch.randn(
+            (batch_size, K), dtype=get_dtype(dtype), device=torch.device("cuda")
+        ).requires_grad_(True)
+        weight = torch.randn(
+            (N, K), dtype=get_dtype(dtype), device=torch.device("cuda")
+        ).requires_grad_(True)
+        if broadcast_y:
+            y = torch.randn(
+                (N), dtype=get_dtype(dtype), device=torch.device("cuda")
+            ).requires_grad_(True)
+        else:
+            y = torch.randn(
+                (batch_size, N), dtype=get_dtype(dtype), device=torch.device("cuda")
+            ).requires_grad_(True)
+
+        # Make sure tensors are contiguous for TMA kernels
+        weight_t_contiguous = weight.T.contiguous()
+
+        if provider == "pytorch":
+            fn = lambda: torch.addmm(y, x, weight.T)  # noqa E731
+        elif provider == "triton_tma_persistent":
+            fn = lambda: triton_addmm_fwd_tma_persistent(
+                x, weight_t_contiguous, y, warp_specialize=False
+            )  # noqa E731
+        elif provider == "triton_tma_persistent_ws":
+            fn = lambda: triton_addmm_fwd_tma_persistent(
+                x, weight_t_contiguous, y, warp_specialize=True
+            )  # noqa E731
+        elif provider == "triton_tma_persistent_ws_tlx":
+            fn = lambda: triton_addmm_fwd_tma_ws_persistent_tlx(
+                x, weight_t_contiguous, y
+            )  # noqa E731
+        elif provider == "triton_tma_ws_tlx":
+            fn = lambda: triton_addmm_fwd_tma_ws_tlx(x, weight_t_contiguous, y)  # noqa E731
+        elif provider == "triton":
+            fn = lambda: triton_addmm_fwd(x, weight_t_contiguous, y)  # noqa E731
+        else:
+            raise ValueError(f"Unknown provider: {provider}")
+        time.sleep(2)
+        ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+        return ms
+
+    df = bench_addmm.run(print_data=True, return_df=return_result)
+
+    if return_result:
+        return configs, df
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/ops/benchmarks/hstu_attention_bench.py b/recommendation_v4/generative_recommenders/ops/benchmarks/hstu_attention_bench.py
new file mode 100644
index 000000000..cc7fbede7
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/benchmarks/hstu_attention_bench.py
@@ -0,0 +1,406 @@
+# pyre-strict
+import os
+from typing import List, Optional, Tuple
+
+import click
+import pandas as pd
+import torch
+
+# @manual=//triton:triton
+import triton
+from generative_recommenders.common import (
+    apply_sampling,
+    blackwell_tlx_unavailable,
+    generate_sparse_seq_len,
+    HammerKernel,
+)
+from generative_recommenders.ops.cpp.cuda_hstu_attention import cuda_hstu_mha
+from generative_recommenders.ops.hstu_attention import delta_hstu_mha, hstu_mha
+
+try:
+    from hammer.ops.ragged_hstu_attention import ragged_hstu_mha
+    from hammer.utils import HammerKernel as HammerKernel2
+except ImportError:
+    pass
+
+
+def _get_kernel(provider: str) -> HammerKernel:
+    if provider == "triton":
+        return HammerKernel.TRITON
+    elif provider == "tlx":
+        return HammerKernel.TLX
+    elif provider == "pytorch":
+        return HammerKernel.PYTORCH
+    else:
+        raise ValueError(f"Unknown provider {provider}")
+
+
+def _flops(
+    batch_size: int,
+    max_seqlen: int,
+    attn_dim: int,
+    hidden_dim: int,
+    nheads: int,
+    seq_offsets: torch.Tensor,
+    mode: str = "fwd",
+) -> float:
+    assert mode in ["fwd", "bwd", "fwd_bwd"]
+    ratio = 2.0  # triangular masking
+    f1 = 0.0
+    f2 = 0.0
+    for i in range(batch_size):
+        seq_len = int((seq_offsets[i + 1] - seq_offsets[i]).item())
+        # (QK^T), dQ = d(QK^T)K, dK^T = Q^Td(QK^T)
+        f1 += 2 * nheads * attn_dim * seq_len**2 // ratio
+        # (QK^T)V, d(QK^T) = dOV^T, dV = (QK^T)^TdO,
+        f2 += 2 * nheads * hidden_dim * seq_len**2 // ratio
+    if mode == "fwd":
+        return f1 + f2  # computes (QK^T) and (QK^T)V
+    elif mode == "bwd":
+        return 3 * f1 + 2 * f2  # computes (QK^T), dQ, dK, dV, d(QK^T)
+    else:
+        return 4 * f1 + 3 * f2
+
+
+@click.command()
+@click.option(
+    "--batch-size",
+    type=int,
+    default=512,
+)
+@click.option("--heads", type=int, default=4)
+@click.option("--attn-dim", type=int, default=128)
+@click.option("--hidden-dim", type=int, default=128)
+@click.option("--max-seq-len-log2", type=int, default=13)
+@click.option("--data-type", type=str, default="bf16")
+@click.option("--seq-sparsity", type=float, default=0.95)
+@click.option("--has-delta-q", type=bool, default=False)
+@click.option("--delta-size", type=int, default=256)
+@click.option("--target-size", type=int, default=20)
+@click.option("--bench-backward", type=bool, default=True)
+@click.option("--bench-forward", type=bool, default=True)
+@click.option("--bench-tlx", type=bool, default=False)
+@click.option("--bench-pytorch", type=bool, default=False)
+@click.option("--bench-ragged", type=bool, default=True)
+@click.option("--report-flops", type=bool, default=False)
+@click.option("--return-result", type=bool, default=False)
+@click.option("--max-attn-len", type=int, default=0)
+@click.option("--min-full-attn-seq-len", type=int, default=0)
+@click.option("--contextual-seq-len", type=int, default=0)
+@click.option("--sampling-alpha", type=float, default=2.0)
+@click.option("--triton-enable-tma", type=bool, default=False)
+@click.option("--dynamic-attn-scale", type=bool, default=False)
+@click.option("--num-softmax-heads", type=int, default=0)
+def main(  # noqa: C901
+    batch_size: int,
+    heads: int,
+    attn_dim: int,
+    hidden_dim: int,
+    max_seq_len_log2: int,
+    data_type: str,
+    seq_sparsity: float,
+    has_delta_q: bool,
+    delta_size: int,
+    target_size: int,
+    bench_backward: bool,
+    bench_forward: bool,
+    bench_tlx: bool,
+    bench_pytorch: bool,
+    bench_ragged: bool,
+    report_flops: bool,
+    return_result: bool,
+    max_attn_len: int,
+    min_full_attn_seq_len: int,
+    contextual_seq_len: int,
+    sampling_alpha: float,
+    triton_enable_tma: bool,
+    dynamic_attn_scale: bool,
+    num_softmax_heads: int,
+) -> Optional[Tuple[List[triton.testing.Benchmark], List[pd.DataFrame]]]:
+    torch.backends.cudnn.allow_tf32 = True
+    torch.backends.cuda.matmul.allow_tf32 = True
+    if data_type == "fp32":
+        dtype = torch.float32
+    elif data_type == "fp16":
+        dtype = torch.float16
+    elif data_type == "bf16":
+        dtype = torch.bfloat16
+    else:
+        raise ValueError(f"Unsupported data type: {data_type}.")
+
+    line_vals = ["triton", "flash_cuda_jagged"]
+    line_names = ["triton", "flash_cuda_jagged"]
+    styles = [("blue", "-"), ("green", "-")]
+    if bench_pytorch:
+        line_vals.append("pytorch")
+        line_names.append("PyTorch")
+        styles.append(("green", "-"))
+    if bench_ragged:
+        line_vals.append("ragged")
+        line_names.append("ragged")
+        styles.append(("red", "-"))
+    if bench_tlx and not blackwell_tlx_unavailable[0]:
+        line_vals.append("tlx")
+        line_names.append("tlx")
+        styles.append(("cyan", "-"))
+
+    bench_backward = False if has_delta_q else bench_backward
+    modes = []
+    if bench_forward:
+        modes.append("fwd")
+    if bench_backward:
+        modes.append("bwd")
+    assert len(modes) > 0
+
+    configs: List[triton.testing.Benchmark] = [
+        triton.testing.Benchmark(
+            x_names=["seq_len"],
+            x_vals=[2**i for i in range(8, max_seq_len_log2)],
+            line_arg="provider",
+            line_vals=line_vals,
+            line_names=line_names,
+            styles=styles,
+            ylabel="ms",
+            plot_name=f"hstu-attn-b{batch_size}-h{heads}-d{attn_dim}-v{hidden_dim}--sparsity{seq_sparsity}-{mode}-{dtype}-target{target_size}-mattn{max_attn_len}-full{min_full_attn_seq_len}-c{contextual_seq_len}-sl_alpha{sampling_alpha}-triton_tma{triton_enable_tma}-dynamic_scale{dynamic_attn_scale}-num_softmax_heads{num_softmax_heads}",
+            args={
+                "batch_size": batch_size,
+                "heads": heads,
+                "attn_dim": attn_dim,
+                "hidden_dim": hidden_dim,
+                "dtype": dtype,
+                "mode": mode,
+                "seq_sparsity": seq_sparsity,
+                "has_delta_q": has_delta_q,
+                "delta_size": delta_size,
+                "target_size": target_size,
+                "bench_backward": bench_backward,
+                "report_flops": report_flops,
+                "max_attn_len": max_attn_len,
+                "min_full_attn_seq_len": min_full_attn_seq_len,
+                "contextual_seq_len": contextual_seq_len,
+                "sampling_alpha": sampling_alpha,
+                "triton_enable_tma": triton_enable_tma,
+                "dynamic_attn_scale": dynamic_attn_scale,
+                "num_softmax_heads": num_softmax_heads,
+            },
+        )
+        for mode in modes
+    ]
+
+    @triton.testing.perf_report(configs)
+    def _bench_hstu_attention(
+        batch_size: int,
+        heads: int,
+        seq_len: int,
+        attn_dim: int,
+        hidden_dim: int,
+        mode: str,
+        provider: str,
+        dtype: torch.dtype,
+        seq_sparsity: float,
+        has_delta_q: bool,
+        delta_size: int,
+        target_size: int,
+        bench_backward: bool,
+        report_flops: bool,
+        max_attn_len: int,
+        min_full_attn_seq_len: int,
+        contextual_seq_len: int,
+        sampling_alpha: float,
+        triton_enable_tma: bool,
+        dynamic_attn_scale: bool,
+        num_softmax_heads: int,
+    ) -> float:
+        assert mode in ["fwd", "bwd"]
+        warmup = 25
+        rep = 1000
+        torch.manual_seed(1001)  # for reproducibility
+        alpha = 1.0 / attn_dim
+        causal = True
+        lengths = generate_sparse_seq_len(
+            size=batch_size,
+            max_seq_len=seq_len,
+            sparsity=seq_sparsity,
+            device=torch.device("cuda"),
+        )
+        lengths = apply_sampling(lengths, sampling_alpha, max_seq_len=seq_len)
+        if has_delta_q:
+            lengths = lengths + delta_size
+            num_targets = torch.ones_like(lengths) * delta_size
+            seq_len = seq_len + delta_size
+        else:
+            delta_size = 0
+            num_targets = None
+            if target_size != 0:
+                num_targets = torch.randint(
+                    1,
+                    target_size + 1,
+                    (batch_size,),
+                    device=lengths.device,
+                    dtype=lengths.dtype,
+                )
+                num_targets = torch.where(
+                    num_targets > lengths, lengths, num_targets
+                ).to(torch.int32)
+        max_attn_len = max_attn_len if max_attn_len < seq_len else seq_len
+        seq_offsets = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        seq_offsets[1:] = torch.cumsum(lengths, dim=0)
+        L = int(seq_offsets[-1].item())
+        x = torch.empty(
+            (L, heads, attn_dim * 2 + hidden_dim),
+            dtype=dtype,
+            device=torch.device("cuda"),
+        ).uniform_(-0.01, 0.01)
+        q, k, v = torch.split(x, [attn_dim, attn_dim, hidden_dim], dim=-1)
+        delta_q = torch.empty(
+            (batch_size * delta_size, heads, attn_dim),
+            dtype=dtype,
+            device=torch.device("cuda"),
+        ).uniform_(-0.1, 0.1)
+        delta_x_offsets = torch.arange(0, delta_size, device=torch.device("cuda"))
+        delta_x_offsets = (seq_offsets[1:] - delta_size).view(
+            batch_size, 1
+        ) + delta_x_offsets.view(1, delta_size)
+        delta_x_offsets = delta_x_offsets.view(-1)
+        attn_scale = torch.empty(
+            (L,),
+            dtype=torch.float32,
+            device=torch.device("cuda"),
+        ).uniform_(0.5, 1.0)
+
+        if bench_backward:
+            q = q.requires_grad_(True)
+            k = k.requires_grad_(True)
+            v = v.requires_grad_(True)
+        assert provider in [
+            "triton",
+            "pytorch",
+            "flash_cuda_jagged",
+            "flash_cuda",
+            "tlx",
+            "ragged",
+        ]
+        if has_delta_q:
+            fn = lambda: delta_hstu_mha(  # noqa E731
+                max_seq_len=seq_len,
+                alpha=alpha,
+                delta_q=delta_q,
+                k=k,
+                v=v,
+                seq_offsets=seq_offsets,
+                num_targets=num_targets,
+                kernel=_get_kernel(provider),
+            )
+        else:
+            if provider == "flash_cuda_jagged":
+                fn = lambda: cuda_hstu_mha(  # noqa E731
+                    q=q,
+                    k=k,
+                    v=v,
+                    alpha=alpha,
+                    causal=True,
+                    seq_offsets=seq_offsets.to(torch.int32),
+                    attn_scale=attn_scale if dynamic_attn_scale else None,
+                    max_seq_len=seq_len,
+                    max_attn_len=max_attn_len,
+                    min_full_attn_seq_len=min_full_attn_seq_len,
+                    contextual_seq_len=contextual_seq_len,
+                    num_targets=num_targets,
+                    sort_by_length=False,
+                    num_softmax_heads=num_softmax_heads,
+                )
+            elif provider == "flash_cuda":
+                q, k, v = [
+                    torch.randn(
+                        batch_size,
+                        seq_len,
+                        heads,
+                        attn_dim,
+                        device="cuda",
+                        dtype=dtype,
+                        requires_grad=True,
+                    )
+                    for _ in range(3)
+                ]
+                fn = lambda: cuda_hstu_mha(  # noqa E731
+                    q=q,
+                    k=k,
+                    v=v,
+                    alpha=alpha,
+                    causal=True,
+                    max_seq_len=seq_len,
+                    max_attn_len=max_attn_len,
+                    min_full_attn_seq_len=min_full_attn_seq_len,
+                    contextual_seq_len=contextual_seq_len,
+                    num_targets=num_targets,
+                    sort_by_length=False,
+                    num_softmax_heads=num_softmax_heads,
+                )
+            elif provider == "ragged":
+                fn = lambda: ragged_hstu_mha(  # noqa E731
+                    max_seq_len=seq_len,
+                    alpha=alpha,
+                    q=q,
+                    k=k,
+                    v=v,
+                    seq_offsets=seq_offsets,
+                    dropout_pr=0.0,
+                    training=True,
+                    invalid_attn_mask_type="lower_triangular",
+                    num_targets=num_targets,
+                    attn_scale=attn_scale if dynamic_attn_scale else None,
+                    max_attn_len=max_attn_len,
+                    contextual_seq_len=contextual_seq_len,
+                    full_attn_size=min_full_attn_seq_len,
+                    sort_by_length=True,
+                    kernel=HammerKernel2.TRITON,
+                    num_softmax_heads=num_softmax_heads,
+                )
+            else:
+                fn = lambda: hstu_mha(  # noqa E731
+                    max_seq_len=seq_len,
+                    alpha=alpha,
+                    q=q,
+                    k=k,
+                    v=v,
+                    seq_offsets=seq_offsets,
+                    causal=causal,
+                    dropout_pr=0.0,
+                    training=True,
+                    num_targets=num_targets,
+                    max_attn_len=max_attn_len,
+                    contextual_seq_len=contextual_seq_len,
+                    sort_by_length=True,
+                    kernel=_get_kernel(provider),
+                    enable_tma=triton_enable_tma,
+                )
+        if mode == "bwd":
+            o = fn()
+            do = torch.randn_like(o)
+            fn = lambda: o.backward(do, retain_graph=True)  # noqa E731
+        ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+        all_flops = _flops(
+            batch_size, seq_len, attn_dim, hidden_dim, heads, seq_offsets, mode
+        )
+        if has_delta_q:
+            all_flops = all_flops / seq_len * delta_size
+        if report_flops:
+            return all_flops / ms / 1e9
+        else:
+            return ms
+
+    df = _bench_hstu_attention.run(
+        print_data=True,
+        show_plots=False,
+        save_path="/tmp/" + os.environ["USER"],
+        return_df=return_result,
+    )
+
+    if return_result:
+        return configs, df
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/ops/benchmarks/jagged_dense_bmm_bench.py b/recommendation_v4/generative_recommenders/ops/benchmarks/jagged_dense_bmm_bench.py
new file mode 100644
index 000000000..dcfb9819e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/benchmarks/jagged_dense_bmm_bench.py
@@ -0,0 +1,199 @@
+# pyre-strict
+import math
+from typing import List, Optional, Tuple
+
+import click
+import pandas as pd
+import torch
+
+# @manual=//triton:triton
+import triton
+from generative_recommenders.common import HammerKernel
+from generative_recommenders.ops.triton.triton_jagged import triton_jagged_dense_bmm
+
+# buck2 run @mode/{opt,inplace} //generative_recommenders/ops/benchmarks:jagged_dense_bmm_bench -- --fwd-only
+
+
+def jagged_dense_bmm(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    jagged: torch.Tensor,
+    dense: torch.Tensor,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> torch.Tensor:
+    """
+    Wrapper function for jagged_dense_bmm with kernel selection.
+    Computing out = jagged x dense
+    jagged has shape (sum_B(M_i), K), dense has shape (B, K, N)
+    out has shape (sum_B(M_i), N)
+    """
+    if kernel == HammerKernel.TRITON:
+        return triton_jagged_dense_bmm(
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            jagged=jagged,
+            dense=dense,
+        )
+    elif kernel == HammerKernel.PYTORCH:
+        # PyTorch implementation - manual implementation using standard operations
+        B, K, N = dense.shape
+        outputs = []
+        for i in range(B):
+            start_idx = seq_offsets[i]
+            end_idx = seq_offsets[i + 1]
+            if start_idx < end_idx:
+                jagged_seq = jagged[start_idx:end_idx]  # (seq_len, K)
+                dense_seq = dense[i]  # (K, N)
+                output_seq = torch.mm(jagged_seq, dense_seq)  # (seq_len, N)
+                outputs.append(output_seq)
+        return (
+            torch.cat(outputs, dim=0)
+            if outputs
+            else torch.empty(0, N, device=jagged.device, dtype=jagged.dtype)
+        )
+    else:
+        raise ValueError(f"Unsupported kernel: {kernel}")
+
+
+@click.command()
+@click.option(
+    "--batch-size",
+    type=int,
+    default=512,
+)
+@click.option(
+    "--max-seq-len",
+    type=int,
+    default=8192,
+    show_default=True,
+)
+@click.option(
+    "-d",
+    type=int,
+    default=64,
+    show_default=True,
+)
+@click.option(
+    "-k",
+    type=int,
+    default=64,
+    show_default=True,
+)
+@click.option("--dtype", type=str, default="bf16")
+@click.option("--fwd-only", is_flag=True)
+@click.option("--return-result", type=bool, default=False)
+def main(
+    batch_size: int,
+    max_seq_len: int,
+    d: int,
+    k: int,
+    dtype: str,
+    fwd_only: bool,
+    return_result: bool,
+) -> Optional[Tuple[List[triton.testing.Benchmark], List[pd.DataFrame]]]:
+    torch.backends.cudnn.allow_tf32 = True
+    torch.backends.cuda.matmul.allow_tf32 = True
+    max_seq_len_log2 = int(round(math.log2(max_seq_len)))
+    if dtype == "fp32":
+        pt_dtype = torch.float32
+    elif dtype == "fp16":
+        pt_dtype = torch.float16
+    elif dtype == "bf16":
+        pt_dtype = torch.bfloat16
+    else:
+        raise ValueError(f"Unsupported data type: {dtype}.")
+
+    configs: List[triton.testing.Benchmark] = [
+        triton.testing.Benchmark(
+            x_names=["seq_len"],
+            x_vals=[2**i for i in range(5, max_seq_len_log2 + 1)],
+            line_arg="provider",
+            line_vals=["triton", "pytorch"],
+            line_names=["Triton", "Pytorch"],
+            styles=[("red", "-"), ("blue", "-")],
+            ylabel="ms",
+            plot_name=f"jagged_dense_bmm-b{batch_size}-D{d}-K{k}-{dtype}",
+            args={
+                "batch_size": batch_size,
+                "D": d,
+                "K": k,
+                "dtype": pt_dtype,
+                "mode": mode,
+            },
+        )
+        for mode in (["fwd"] if fwd_only else ["fwd", "bwd"])
+    ]
+
+    @triton.testing.perf_report(configs)
+    def bench_jagged_dense_bmm(
+        batch_size: int,
+        seq_len: int,
+        D: int,
+        K: int,
+        mode: str,
+        provider: str,
+        dtype: torch.dtype,
+    ) -> float:
+        assert mode in ["fwd", "bwd"]
+        warmup = 25
+        rep = 100
+
+        max_seq_len = seq_len
+        lengths = torch.randint(
+            max_seq_len + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+        seq_offsets = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        seq_offsets[1:] = torch.cumsum(lengths, dim=0)
+        jagged_size = int(seq_offsets[-1].item())
+        jagged = (
+            torch.empty((jagged_size, D), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        dense = (
+            torch.empty((batch_size, D, K), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+
+        if provider == "triton":
+            fn = lambda: jagged_dense_bmm(  # noqa E731
+                max_seq_len=max_seq_len,
+                seq_offsets=seq_offsets,
+                jagged=jagged,
+                dense=dense,
+                kernel=HammerKernel.TRITON,
+            )
+            if mode == "bwd":
+                o = fn()
+                do = torch.randn_like(o)
+                fn = lambda: o.backward(do, retain_graph=True)  # noqa E731
+            ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+            return ms
+        elif provider == "pytorch":
+            fn = lambda: jagged_dense_bmm(  # noqa E731
+                max_seq_len=max_seq_len,
+                seq_offsets=seq_offsets,
+                jagged=jagged,
+                dense=dense,
+                kernel=HammerKernel.PYTORCH,
+            )
+            if mode == "bwd":
+                o = fn()
+                do = torch.randn_like(o)
+                fn = lambda: o.backward(do, retain_graph=True)  # noqa E731
+            ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+            return ms
+        else:
+            raise ValueError(f"unsupported provider: {provider}")
+
+    df = bench_jagged_dense_bmm.run(print_data=True, return_df=return_result)
+
+    if return_result:
+        return configs, df
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/ops/benchmarks/jagged_dense_bmm_broadcast_add_bench.py b/recommendation_v4/generative_recommenders/ops/benchmarks/jagged_dense_bmm_broadcast_add_bench.py
new file mode 100644
index 000000000..193704bf9
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/benchmarks/jagged_dense_bmm_broadcast_add_bench.py
@@ -0,0 +1,270 @@
+# pyre-strict
+import math
+import pickle
+from typing import List
+
+import click
+import torch
+
+# @manual=//triton:triton
+import triton
+from generative_recommenders.common import HammerKernel
+from generative_recommenders.ops.jagged_tensors import jagged_dense_bmm_broadcast_add
+from generative_recommenders.ops.triton.triton_jagged import (
+    jagged_dense_bmm_broadcast_add_kernel,
+    triton_jagged_dense_bmm,
+    triton_jagged_dense_broadcast_add,
+)
+
+# buck2 run @mode/{opt,inplace} //generative_recommenders/ops/benchmarks:jagged_dense_bmm_broadcast_add_bench -- --fwd-only
+
+# To dump the jagged_dense_bmm_broadcast_add_kernel cache
+# buck2 run @mode/opt //generative_recommenders/ops/benchmarks:jagged_dense_bmm_broadcast_add_bench -- --fwd-only --dump-cache-dir=/home/${USER}/fbsource/fbcode/generative_recommenders/ops/triton/jagged_dense_bmm_broadcast_add_kernel_cache.pkl
+
+
+def get_kernel(provider: str) -> HammerKernel:
+    if provider == "triton":
+        return HammerKernel.TRITON
+    elif provider == "pytorch":
+        return HammerKernel.PYTORCH
+    else:
+        raise ValueError(f"Unknown provider {provider}")
+
+
+def jagged_dense_broadcast_add(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    jagged: torch.Tensor,
+    dense: torch.Tensor,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> torch.Tensor:
+    """
+    Wrapper function for jagged_dense_broadcast_add with kernel selection.
+    Computing out = jagged + dense (broadcasted)
+    jagged has shape (sum_B(M_i), N), dense has shape (B, N)
+    out has shape (sum_B(M_i), N)
+    """
+    if kernel == HammerKernel.TRITON:
+        return triton_jagged_dense_broadcast_add(
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            jagged=jagged,
+            dense=dense,
+        )
+    elif kernel == HammerKernel.PYTORCH:
+        # PyTorch implementation - manual implementation using standard operations
+        B, N = dense.shape
+        outputs = []
+        for i in range(B):
+            start_idx = seq_offsets[i]
+            end_idx = seq_offsets[i + 1]
+            if start_idx < end_idx:
+                jagged_seq = jagged[start_idx:end_idx]  # (seq_len, N)
+                dense_seq = dense[i]  # (N,)
+                output_seq = jagged_seq + dense_seq  # (seq_len, N)
+                outputs.append(output_seq)
+        return (
+            torch.cat(outputs, dim=0)
+            if outputs
+            else torch.empty(0, N, device=jagged.device, dtype=jagged.dtype)
+        )
+    else:
+        raise ValueError(f"Unsupported kernel: {kernel}")
+
+
+def jagged_dense_bmm(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    jagged: torch.Tensor,
+    dense: torch.Tensor,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> torch.Tensor:
+    """
+    Wrapper function for jagged_dense_bmm with kernel selection.
+    Computing out = jagged x dense
+    jagged has shape (sum_B(M_i), K), dense has shape (B, K, N)
+    out has shape (sum_B(M_i), N)
+    """
+    if kernel == HammerKernel.TRITON:
+        return triton_jagged_dense_bmm(
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            jagged=jagged,
+            dense=dense,
+        )
+    elif kernel == HammerKernel.PYTORCH:
+        # PyTorch implementation - manual implementation using standard operations
+        B, K, N = dense.shape
+        outputs = []
+        for i in range(B):
+            start_idx = seq_offsets[i]
+            end_idx = seq_offsets[i + 1]
+            if start_idx < end_idx:
+                jagged_seq = jagged[start_idx:end_idx]  # (seq_len, K)
+                dense_seq = dense[i]  # (K, N)
+                output_seq = torch.mm(jagged_seq, dense_seq)  # (seq_len, N)
+                outputs.append(output_seq)
+        return (
+            torch.cat(outputs, dim=0)
+            if outputs
+            else torch.empty(0, N, device=jagged.device, dtype=jagged.dtype)
+        )
+    else:
+        raise ValueError(f"Unsupported kernel: {kernel}")
+
+
+@click.command()
+@click.option(
+    "--batch-size",
+    type=int,
+    default=384,
+)
+@click.option(
+    "--max-seq-len",
+    type=int,
+    default=4096,
+    show_default=True,
+)
+@click.option(
+    "-d",
+    type=int,
+    default=512,
+    show_default=True,
+)
+@click.option(
+    "-k",
+    type=int,
+    default=512,
+    show_default=True,
+)
+@click.option("--dtype", type=str, default="bf16")
+@click.option("--fwd-only", is_flag=True)
+@click.option("--dump-cache-dir", type=str, default="")
+def main(
+    batch_size: int,
+    max_seq_len: int,
+    d: int,
+    k: int,
+    dtype: str,
+    fwd_only: bool,
+    dump_cache_dir: str,
+) -> None:
+    torch.backends.cudnn.allow_tf32 = True
+    torch.backends.cuda.matmul.allow_tf32 = True
+    max_seq_len_log2 = int(round(math.log2(max_seq_len)))
+    if dtype == "fp32":
+        pt_dtype = torch.float32
+    elif dtype == "fp16":
+        pt_dtype = torch.float16
+    elif dtype == "bf16":
+        pt_dtype = torch.bfloat16
+    else:
+        raise ValueError(f"Unsupported data type: {dtype}.")
+
+    configs: List[triton.testing.Benchmark] = [
+        triton.testing.Benchmark(
+            x_names=["seq_len"],
+            x_vals=[2**i for i in range(8, max_seq_len_log2 + 1)],
+            line_arg="provider",
+            line_vals=["triton", "pytorch", "triton_nonfused"],
+            line_names=["Triton", "Pytorch", "Triton_Nonfused"],
+            styles=[("red", "-"), ("blue", "-"), ("green", "-")],
+            ylabel="ms",
+            plot_name=f"jagged_dense_bmm_broadcast_add-{mode}-b{batch_size}-D{d}-K{k}-{dtype}",
+            args={
+                "batch_size": batch_size,
+                "D": d,
+                "K": k,
+                "dtype": pt_dtype,
+                "mode": mode,
+            },
+        )
+        for mode in (["fwd"] if fwd_only else ["fwd", "bwd"])
+    ]
+
+    @triton.testing.perf_report(configs)
+    def bench_jagged_dense_bmm_broadcast_add(
+        batch_size: int,
+        seq_len: int,
+        D: int,
+        K: int,
+        mode: str,
+        provider: str,
+        dtype: torch.dtype,
+    ) -> float:
+        assert mode in ["fwd", "bwd"]
+        warmup = 25
+        rep = 100
+
+        max_seq_len = seq_len
+        lengths = torch.randint(
+            max_seq_len + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+        seq_offsets = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        seq_offsets[1:] = torch.cumsum(lengths, dim=0)
+        jagged_size = int(seq_offsets[-1].item())
+        jagged = (
+            torch.empty((jagged_size, D), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        dense = (
+            torch.empty((batch_size, D, K), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        bias = (
+            torch.empty((batch_size, K), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+
+        if provider in ["triton", "pytorch"]:
+            fn = lambda: jagged_dense_bmm_broadcast_add(  # noqa E731
+                max_seq_len=max_seq_len,
+                seq_offsets=seq_offsets,
+                jagged=jagged,
+                dense=dense,
+                bias=bias,
+                kernel=get_kernel(provider),
+            )
+            if mode == "bwd":
+                o = fn()
+                do = torch.randn_like(o)
+                fn = lambda: o.backward(do, retain_graph=True)  # noqa E731
+            ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+            return ms
+        elif provider == "triton_nonfused":
+            fn = lambda: jagged_dense_broadcast_add(  # noqa E731
+                max_seq_len=max_seq_len,
+                seq_offsets=seq_offsets,
+                jagged=jagged_dense_bmm(
+                    max_seq_len=max_seq_len,
+                    seq_offsets=seq_offsets,
+                    jagged=jagged,
+                    dense=dense,
+                    kernel=HammerKernel.TRITON,
+                ),
+                dense=bias,
+                kernel=HammerKernel.TRITON,
+            )
+            if mode == "bwd":
+                o = fn()
+                do = torch.randn_like(o)
+                fn = lambda: o.backward(do, retain_graph=True)  # noqa E731
+            ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+            return ms
+        else:
+            raise ValueError(f"unsupported provider: {provider}")
+
+    bench_jagged_dense_bmm_broadcast_add.run(print_data=True)
+    if dump_cache_dir:
+        with open(dump_cache_dir, "wb") as data:
+            # @lint-ignore PYTHONPICKLEISBAD
+            pickle.dump(jagged_dense_bmm_broadcast_add_kernel.cache, data)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/ops/benchmarks/jagged_dense_broadcast_add_bench.py b/recommendation_v4/generative_recommenders/ops/benchmarks/jagged_dense_broadcast_add_bench.py
new file mode 100644
index 000000000..049258e3d
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/benchmarks/jagged_dense_broadcast_add_bench.py
@@ -0,0 +1,205 @@
+# pyre-strict
+import math
+import pickle
+from typing import List, Optional, Tuple
+
+import click
+import pandas as pd
+import torch
+
+# @manual=//triton:triton
+import triton
+from generative_recommenders.common import HammerKernel
+from generative_recommenders.ops.triton.triton_jagged import (
+    jagged_dense_broadcast_add_kernel,
+    triton_jagged_dense_broadcast_add,
+)
+
+# buck2 run @mode/{opt,inplace} //generative_recommenders/ops/benchmarks:jagged_dense_broadcast_add_bench
+
+
+# To dump the jagged_dense_broadcast_add_kernel cache, run:
+# buck2 run @mode/{opt,inplace} //generative_recommenders/ops/benchmarks:jagged_dense_broadcast_add_bench -- --dump-ragged-tuner-cache-dir=/home/${USER}/fbsource/fbcode/generative_recommenders/ops/triton/jagged_dense_broadcast_add_kernel_cache.pkl
+
+
+def jagged_dense_broadcast_add(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    jagged: torch.Tensor,
+    dense: torch.Tensor,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> torch.Tensor:
+    """
+    Wrapper function for jagged_dense_broadcast_add with kernel selection.
+    Computing out = jagged + dense (broadcasted)
+    jagged has shape (sum_B(M_i), N), dense has shape (B, N)
+    out has shape (sum_B(M_i), N)
+    """
+    if kernel == HammerKernel.TRITON:
+        return triton_jagged_dense_broadcast_add(
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            jagged=jagged,
+            dense=dense,
+        )
+    elif kernel == HammerKernel.PYTORCH:
+        # PyTorch implementation - manual implementation using standard operations
+        B, N = dense.shape
+        outputs = []
+        for i in range(B):
+            start_idx = seq_offsets[i]
+            end_idx = seq_offsets[i + 1]
+            if start_idx < end_idx:
+                jagged_seq = jagged[start_idx:end_idx]  # (seq_len, N)
+                dense_seq = dense[i]  # (N,)
+                output_seq = jagged_seq + dense_seq  # (seq_len, N)
+                outputs.append(output_seq)
+        return (
+            torch.cat(outputs, dim=0)
+            if outputs
+            else torch.empty(0, N, device=jagged.device, dtype=jagged.dtype)
+        )
+    else:
+        raise ValueError(f"Unsupported kernel: {kernel}")
+
+
+@click.command()
+@click.option(
+    "--batch-size",
+    type=int,
+    default=512,
+)
+@click.option(
+    "--max-seq-len",
+    type=int,
+    default=8192,
+    show_default=True,
+)
+@click.option(
+    "-d",
+    type=int,
+    default=64,
+    show_default=True,
+)
+@click.option("--dtype", type=str, default="fp32")
+@click.option("--fwd-only", is_flag=True)
+@click.option("--dump-ragged-tuner-cache-dir", type=str, default="")
+@click.option("--return-result", type=bool, default=False)
+def main(
+    batch_size: int,
+    max_seq_len: int,
+    d: int,
+    dtype: str,
+    fwd_only: bool,
+    dump_ragged_tuner_cache_dir: str,
+    return_result: bool,
+) -> Optional[Tuple[List[triton.testing.Benchmark], List[pd.DataFrame]]]:
+    torch.backends.cudnn.allow_tf32 = True
+    torch.backends.cuda.matmul.allow_tf32 = True
+    max_seq_len_log2 = int(round(math.log2(max_seq_len)))
+    if dtype == "fp32":
+        pt_dtype = torch.float32
+    elif dtype == "fp16":
+        pt_dtype = torch.float16
+    elif dtype == "bf16":
+        pt_dtype = torch.bfloat16
+    else:
+        raise ValueError(f"Unsupported data type: {dtype}.")
+
+    configs: List[triton.testing.Benchmark] = [
+        triton.testing.Benchmark(
+            x_names=["seq_len"],
+            x_vals=[2**i for i in range(5, max_seq_len_log2 + 1)],
+            line_arg="provider",
+            line_vals=["triton", "pytorch"],
+            line_names=["Triton", "Pytorch"],
+            styles=[("red", "-"), ("blue", "-")],
+            ylabel="ms",
+            plot_name=f"jagged_dense_broadcast_add-b{batch_size}-D{d}-{dtype}-{mode}",
+            args={
+                "batch_size": batch_size,
+                "D": d,
+                "dtype": pt_dtype,
+                "mode": mode,
+            },
+        )
+        for mode in (["fwd"] if fwd_only else ["fwd", "fwd+bwd"])
+    ]
+
+    @triton.testing.perf_report(configs)
+    def bench_jagged_dense_broadcast_add(
+        batch_size: int,
+        seq_len: int,
+        D: int,
+        mode: str,
+        provider: str,
+        dtype: torch.dtype,
+    ) -> float:
+        assert mode in ["fwd", "bwd", "fwd+bwd"]
+        warmup = 25
+        rep = 100
+
+        max_seq_len = seq_len
+        lengths = torch.randint(
+            max_seq_len + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+        seq_offsets = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        seq_offsets[1:] = torch.cumsum(lengths, dim=0)
+        jagged_size = int(seq_offsets[-1].item())
+        jagged = (
+            torch.empty((jagged_size, D), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        dense = (
+            torch.empty((batch_size, D), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+
+        if provider == "triton":
+            fn = lambda: jagged_dense_broadcast_add(  # noqa E731
+                max_seq_len=max_seq_len,
+                seq_offsets=seq_offsets,
+                jagged=jagged,
+                dense=dense,
+                kernel=HammerKernel.TRITON,
+            )
+            if mode == "bwd":
+                o = fn()
+                do = torch.randn_like(o)
+                fn = lambda: o.backward(do, retain_graph=True)  # noqa E731
+            ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+            return ms
+        elif provider == "pytorch":
+            fn = lambda: jagged_dense_broadcast_add(  # noqa E731
+                max_seq_len=max_seq_len,
+                seq_offsets=seq_offsets,
+                jagged=jagged,
+                dense=dense,
+                kernel=HammerKernel.PYTORCH,
+            )
+            if mode == "bwd":
+                o = fn()
+                do = torch.randn_like(o)
+                fn = lambda: o.backward(do, retain_graph=True)  # noqa E731
+            ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+            return ms
+        else:
+            raise ValueError(f"unsupported provider: {provider}")
+
+    df = bench_jagged_dense_broadcast_add.run(print_data=True, return_df=return_result)
+
+    if dump_ragged_tuner_cache_dir:
+        with open(dump_ragged_tuner_cache_dir, "wb") as data:
+            # @lint-ignore PYTHONPICKLEISBAD
+            pickle.dump(jagged_dense_broadcast_add_kernel.cache, data)
+
+    if return_result:
+        return configs, df
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/concat_1d_jagged_jagged_bench.py b/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/concat_1d_jagged_jagged_bench.py
new file mode 100644
index 000000000..95c43853f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/concat_1d_jagged_jagged_bench.py
@@ -0,0 +1,125 @@
+# pyre-strict
+from typing import List
+
+import click
+import torch
+
+# @manual=//triton:triton
+import triton
+from hammer.ops.jagged import concat_1D_jagged_jagged
+
+# buck2 run @//mode/opt -c fbcode.nvcc_arch=h100 //generative_recommenders/ops/cpp/benchmarks:concat_1d_jagged_jagged_bench
+
+torch.ops.load_library("//generative_recommenders/ops/cpp:cpp_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+
+
+@click.command()
+@click.option("--data-type", type=str, default="float32")
+@click.option("--batch-size", type=int, default=512)
+@click.option("--max-seq-len-log2", type=int, default=20)
+@click.option("--seq-sparsity", type=float, default=0.8)
+def main(
+    data_type: str,
+    batch_size: int,
+    max_seq_len_log2: int,
+    seq_sparsity: float,
+) -> None:
+    if data_type == "float32":
+        dtype = torch.float32
+    elif data_type == "float16":
+        dtype = torch.float16
+    elif data_type == "bfloat16":
+        dtype = torch.bfloat16
+    else:
+        raise ValueError(f"Unsupported data type: {data_type}.")
+
+    configs: List[triton.testing.Benchmark] = [
+        triton.testing.Benchmark(
+            x_names=["max_seq_len"],
+            x_vals=[2**i for i in range(6, max_seq_len_log2)],
+            line_arg="method",
+            line_vals=[
+                "custom_cuda",
+                "hammer_pytorch",
+            ],
+            line_names=["Custom CUDA", "Hammer PyTorch"],
+            styles=[("green", "-"), ("orange", "--")],
+            ylabel="ms",
+            plot_name=f"concat_1d_jagged_jagged_batch{batch_size}_sparsity{seq_sparsity}_{data_type}",
+            args={
+                "dtype": dtype,
+                "batch_size": batch_size,
+                "seq_sparsity": seq_sparsity,
+            },
+        )
+    ]
+
+    @triton.testing.perf_report(configs)
+    def bench_concat_1d_jagged_jagged(
+        max_seq_len: int,
+        batch_size: int,
+        method: str,
+        dtype: torch.dtype,
+        seq_sparsity: float,
+    ) -> float:
+        warmup = 50
+        rep = 500
+        torch.manual_seed(1001)
+
+        lengths_left = torch.randint(
+            1, int(max_seq_len * seq_sparsity) + 1, (batch_size,), dtype=torch.int32
+        )
+        lengths_right = torch.randint(
+            1, int(max_seq_len * seq_sparsity) + 1, (batch_size,), dtype=torch.int32
+        )
+
+        total_left = int(lengths_left.sum().item())
+        total_right = int(lengths_right.sum().item())
+
+        values_left = torch.randn(total_left, dtype=dtype)
+        values_right = torch.randn(total_right, dtype=dtype)
+
+        offsets_left = torch.zeros(
+            (batch_size + 1,), dtype=lengths_left.dtype, device=lengths_left.device
+        )
+        offsets_left[1:] = torch.cumsum(lengths_left.view(-1), dim=0)
+        offsets_right = torch.zeros(
+            (batch_size + 1,), dtype=lengths_right.dtype, device=lengths_right.device
+        )
+        offsets_right[1:] = torch.cumsum(lengths_right.view(-1), dim=0)
+        max_seq_len_left = int(lengths_left.max().item())
+        max_seq_len_right = int(lengths_right.max().item())
+
+        lengths_left = lengths_left.cuda()
+        lengths_right = lengths_right.cuda()
+        values_left = values_left.cuda()
+        values_right = values_right.cuda()
+        offsets_left = offsets_left.cuda()
+        offsets_right = offsets_right.cuda()
+
+        if method == "custom_cuda":
+            fn = lambda: torch.ops.hstu.concat_1d_jagged_jagged(  # noqa E731
+                lengths_left, values_left, lengths_right, values_right
+            )
+        elif method == "hammer_pytorch":
+            fn = lambda: concat_1D_jagged_jagged(  # noqa E731
+                max_seq_len_left,
+                offsets_left,
+                values_left,
+                max_seq_len_right,
+                offsets_right,
+                values_right,
+            )
+        else:
+            raise ValueError(f"unknown method: {method}")
+
+        ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+        return ms
+
+    bench_concat_1d_jagged_jagged.run(print_data=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/jagged_transpose_1d_bench.py b/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/jagged_transpose_1d_bench.py
new file mode 100644
index 000000000..7806d6970
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/jagged_transpose_1d_bench.py
@@ -0,0 +1,117 @@
+# pyre-strict
+from typing import List
+
+import click
+import torch
+
+# @manual=//triton:triton
+import triton
+from hammer.ops.jagged import jagged_transpose_1D
+
+# buck2 run @//mode/opt -c fbcode.nvcc_arch=h100 //generative_recommenders/ops/cpp/benchmarks:jagged_transpose_1d_bench
+
+torch.ops.load_library("//generative_recommenders/ops/cpp:cpp_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+
+
+@click.command()
+@click.option("--data-type", type=str, default="float32")
+@click.option("--size1", type=int, default=32)
+@click.option("--size2", type=int, default=16)
+@click.option("--max-len-log2", type=int, default=19)
+@click.option("--seq-sparsity", type=float, default=0.8)
+def main(
+    data_type: str,
+    size1: int,
+    size2: int,
+    max_len_log2: int,
+    seq_sparsity: float,
+) -> None:
+    if data_type == "float32":
+        dtype = torch.float32
+    elif data_type == "float16":
+        dtype = torch.float16
+    elif data_type == "bfloat16":
+        dtype = torch.bfloat16
+    else:
+        raise ValueError(f"Unsupported data type: {data_type}.")
+
+    configs: List[triton.testing.Benchmark] = [
+        triton.testing.Benchmark(
+            x_names=["max_len"],
+            x_vals=[2**i for i in range(4, max_len_log2)],
+            line_arg="method",
+            line_vals=[
+                "custom_cuda",
+                "hammer_pytorch",
+            ],
+            line_names=["Custom CUDA", "Hammer PyTorch"],
+            styles=[("green", "-"), ("orange", "--")],
+            ylabel="ms",
+            plot_name=f"jagged_transpose_1d_size1_{size1}_size2_{size2}_sparsity{seq_sparsity}_{data_type}",
+            args={
+                "dtype": dtype,
+                "size1": size1,
+                "size2": size2,
+                "seq_sparsity": seq_sparsity,
+            },
+        )
+    ]
+
+    @triton.testing.perf_report(configs)
+    def bench_jagged_transpose_1d(
+        max_len: int,
+        size1: int,
+        size2: int,
+        method: str,
+        dtype: torch.dtype,
+        seq_sparsity: float,
+    ) -> float:
+        warmup = 50
+        rep = 500
+        torch.manual_seed(1001)
+
+        lengths = torch.randint(
+            1, int(max_len * seq_sparsity) + 1, (size1 * size2,), dtype=torch.int32
+        )
+        offsets = torch.zeros(
+            (size1 * size2 + 1,), dtype=lengths.dtype, device=lengths.device
+        )
+        offsets[1:] = torch.cumsum(lengths.view(-1), dim=0)
+
+        values = torch.randn(int(offsets[-1].item()), dtype=dtype)
+
+        lengths = lengths.cuda()
+        offsets = offsets.cuda()
+        values = values.cuda()
+
+        if method == "custom_cuda":
+            fn = lambda: torch.ops.hstu.jagged_transpose_1d(  # noqa E731
+                values=values,
+                offsets=offsets,
+                lengths=lengths,
+                max_len=max_len,
+                size1=size1,
+                size2=size2,
+            )
+        elif method == "hammer_pytorch":
+            fn = lambda: jagged_transpose_1D(  # noqa E731
+                values=values,
+                offsets=offsets,
+                lengths=lengths,
+                max_len=max_len,
+                size1=size1,
+                size2=size2,
+            )
+        else:
+            raise ValueError(f"unknown method: {method}")
+
+        ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+        return ms
+
+    bench_jagged_transpose_1d.run(print_data=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/replace_last_n_with_jagged_bench.py b/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/replace_last_n_with_jagged_bench.py
new file mode 100644
index 000000000..a3f2483fa
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/replace_last_n_with_jagged_bench.py
@@ -0,0 +1,150 @@
+# pyre-strict
+from typing import List
+
+import click
+import torch
+
+# @manual=//triton:triton
+import triton
+from hammer.ops.jagged import replace_last_n_with_jagged
+from hammer.utils import HammerKernel
+
+# buck2 run @//mode/opt -c fbcode.nvcc_arch=h100 //generative_recommenders/ops/cpp/benchmarks:replace_last_n_with_jagged_bench
+
+torch.ops.load_library("//generative_recommenders/ops/cpp:cpp_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+
+
+@click.command()
+@click.option("--data-type", type=str, default="float32")
+@click.option("--batch-size", type=int, default=512)
+@click.option("--embedding-dim", type=int, default=64)
+@click.option("--max-seq-len-log2", type=int, default=16)
+@click.option("--seq-sparsity", type=float, default=0.8)
+def main(
+    data_type: str,
+    batch_size: int,
+    embedding_dim: int,
+    max_seq_len_log2: int,
+    seq_sparsity: float,
+) -> None:
+    if data_type == "float32":
+        dtype = torch.float32
+    elif data_type == "float16":
+        dtype = torch.float16
+    elif data_type == "bfloat16":
+        dtype = torch.bfloat16
+    else:
+        raise ValueError(f"Unsupported data type: {data_type}.")
+
+    configs: List[triton.testing.Benchmark] = [
+        triton.testing.Benchmark(
+            x_names=["max_seq_len"],
+            x_vals=[2**i for i in range(6, max_seq_len_log2)],
+            line_arg="method",
+            line_vals=[
+                "custom_cuda",
+                "hammer_pytorch",
+                "hammer_triton",
+            ],
+            line_names=[
+                "Custom CUDA",
+                "Hammer PyTorch",
+                "Hammer Triton",
+            ],
+            styles=[
+                ("green", "-"),
+                ("orange", "--"),
+                ("purple", "-."),
+            ],
+            ylabel="ms",
+            plot_name=f"replace_last_n_with_jagged_batch{batch_size}_dim{embedding_dim}_sparsity{seq_sparsity}_{data_type}",
+            args={
+                "dtype": dtype,
+                "batch_size": batch_size,
+                "embedding_dim": embedding_dim,
+                "seq_sparsity": seq_sparsity,
+            },
+        )
+    ]
+
+    @triton.testing.perf_report(configs)
+    def bench_replace_last_n_with_jagged(
+        max_seq_len: int,
+        batch_size: int,
+        method: str,
+        dtype: torch.dtype,
+        embedding_dim: int,
+        seq_sparsity: float,
+    ) -> float:
+        warmup = 50
+        rep = 500
+        torch.manual_seed(1001)
+
+        min_left_len = max(1, int(max_seq_len * seq_sparsity * 0.3))
+        max_left_len = int(max_seq_len * seq_sparsity)
+
+        lengths_left = torch.randint(
+            min_left_len, max_left_len + 1, (batch_size,), dtype=torch.int32
+        )
+        lengths_right = torch.randint(
+            1, min_left_len + 1, (batch_size,), dtype=torch.int32
+        )
+
+        lengths_right = torch.min(lengths_right, lengths_left)
+
+        total_left = int(lengths_left.sum().item())
+        total_right = int(lengths_right.sum().item())
+
+        values_left = torch.randn(total_left, embedding_dim, dtype=dtype)
+        values_right = torch.randn(total_right, embedding_dim, dtype=dtype)
+
+        offsets_left = torch.zeros(
+            (batch_size + 1,), dtype=lengths_left.dtype, device=lengths_left.device
+        )
+        offsets_left[1:] = torch.cumsum(lengths_left.view(-1), dim=0)
+        offsets_right = torch.zeros(
+            (batch_size + 1,), dtype=lengths_right.dtype, device=lengths_right.device
+        )
+        offsets_right[1:] = torch.cumsum(lengths_right.view(-1), dim=0)
+
+        lengths_left = lengths_left.cuda()
+        lengths_right = lengths_right.cuda()
+        values_left = values_left.cuda()
+        values_right = values_right.cuda()
+        offsets_left = offsets_left.cuda()
+        offsets_right = offsets_right.cuda()
+
+        if method == "custom_cuda":
+            fn = lambda: torch.ops.hstu.replace_last_n_with_jagged(  # noqa E731
+                lengths_left, values_left, lengths_right, values_right
+            )
+        elif method == "hammer_pytorch":
+            fn = lambda: replace_last_n_with_jagged(  # noqa E731
+                max_seq_len_left=max_seq_len,
+                offsets_left=offsets_left,
+                values_left=values_left,
+                offsets_right=offsets_right,
+                values_right=values_right,
+            )
+        elif method == "hammer_triton":
+            fn = lambda: replace_last_n_with_jagged(  # noqa E731
+                max_seq_len_left=max_seq_len,
+                offsets_left=offsets_left,
+                values_left=values_left,
+                offsets_right=offsets_right,
+                values_right=values_right,
+                kernel=HammerKernel.TRITON,
+            )
+        else:
+            raise ValueError(f"unknown method: {method}")
+
+        ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+        return ms
+
+    bench_replace_last_n_with_jagged.run(print_data=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/split_1d_jagged_jagged_bench.py b/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/split_1d_jagged_jagged_bench.py
new file mode 100644
index 000000000..4aaa9d77c
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/split_1d_jagged_jagged_bench.py
@@ -0,0 +1,116 @@
+# pyre-strict
+from typing import List
+
+import click
+import torch
+
+# @manual=//triton:triton
+import triton
+from hammer.ops.jagged import split_1D_jagged_jagged
+
+# buck2 run @//mode/opt -c fbcode.nvcc_arch=h100 //generative_recommenders/ops/cpp/benchmarks:split_1d_jagged_jagged_bench
+
+torch.ops.load_library("//generative_recommenders/ops/cpp:cpp_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+
+
+@click.command()
+@click.option("--data-type", type=str, default="float32")
+@click.option("--batch-size", type=int, default=512)
+@click.option("--max-seq-len-log2", type=int, default=20)
+@click.option("--seq-sparsity", type=float, default=0.8)
+def main(
+    data_type: str,
+    batch_size: int,
+    max_seq_len_log2: int,
+    seq_sparsity: float,
+) -> None:
+    if data_type == "float32":
+        dtype = torch.float32
+    elif data_type == "float16":
+        dtype = torch.float16
+    elif data_type == "bfloat16":
+        dtype = torch.bfloat16
+    else:
+        raise ValueError(f"Unsupported data type: {data_type}.")
+
+    configs: List[triton.testing.Benchmark] = [
+        triton.testing.Benchmark(
+            x_names=["max_seq_len"],
+            x_vals=[2**i for i in range(6, max_seq_len_log2)],
+            line_arg="method",
+            line_vals=[
+                "custom_cuda",
+                "hammer_pytorch",
+            ],
+            line_names=["Custom CUDA", "Hammer PyTorch"],
+            styles=[("green", "-"), ("orange", "--")],
+            ylabel="ms",
+            plot_name=f"split_1d_jagged_jagged_batch{batch_size}_sparsity{seq_sparsity}_{data_type}",
+            args={
+                "dtype": dtype,
+                "batch_size": batch_size,
+                "seq_sparsity": seq_sparsity,
+            },
+        )
+    ]
+
+    @triton.testing.perf_report(configs)
+    def bench_split_1d_jagged_jagged(
+        max_seq_len: int,
+        batch_size: int,
+        method: str,
+        dtype: torch.dtype,
+        seq_sparsity: float,
+    ) -> float:
+        warmup = 50
+        rep = 500
+        torch.manual_seed(1001)
+
+        lengths_left = torch.randint(
+            1, int(max_seq_len * seq_sparsity) + 1, (batch_size,), dtype=torch.int32
+        )
+        lengths_right = torch.randint(
+            1, int(max_seq_len * seq_sparsity) + 1, (batch_size,), dtype=torch.int32
+        )
+
+        offsets_left = torch.zeros(
+            (batch_size + 1,), dtype=lengths_left.dtype, device=lengths_left.device
+        )
+        offsets_left[1:] = torch.cumsum(lengths_left.view(-1), dim=0)
+        offsets_right = torch.zeros(
+            (batch_size + 1,), dtype=lengths_right.dtype, device=lengths_right.device
+        )
+        offsets_right[1:] = torch.cumsum(lengths_right.view(-1), dim=0)
+
+        combined_offsets = offsets_left + offsets_right
+        combined_values = torch.randn(int(combined_offsets[-1].item()), dtype=dtype)
+
+        max_seq_len_combined = int((lengths_left + lengths_right).max().item())
+
+        lengths_left = lengths_left.cuda()
+        lengths_right = lengths_right.cuda()
+        combined_values = combined_values.cuda()
+        offsets_left = offsets_left.cuda()
+        offsets_right = offsets_right.cuda()
+
+        if method == "custom_cuda":
+            fn = lambda: torch.ops.hstu.split_1d_jagged_jagged(  # noqa E731
+                lengths_left, lengths_right, combined_values
+            )
+        elif method == "hammer_pytorch":
+            fn = lambda: split_1D_jagged_jagged(  # noqa E731
+                max_seq_len_combined, combined_values, offsets_left, offsets_right
+            )
+        else:
+            raise ValueError(f"unknown method: {method}")
+
+        ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+        return ms
+
+    bench_split_1d_jagged_jagged.run(print_data=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/common.h b/recommendation_v4/generative_recommenders/ops/cpp/common.h
new file mode 100644
index 000000000..1c4b43768
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/common.h
@@ -0,0 +1,60 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#define AT_DISPATCH_CASE_FLOATING_TYPES_AND4(                \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, ...) \
+  AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)               \
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)                 \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)                 \
+  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)                 \
+  AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__)
+
+#define AT_DISPATCH_FLOATING_TYPES_AND4(                                 \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                                    \
+      TYPE,                                                              \
+      NAME,                                                              \
+      AT_DISPATCH_CASE_FLOATING_TYPES_AND4(                              \
+          SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, __VA_ARGS__))
+
+inline __attribute__((always_inline)) uint32_t
+div_round_up(uint32_t a, uint32_t b) {
+  return (a + b - 1) / b;
+};
+
+inline __attribute__((always_inline)) uint32_t next_power_of_2(uint32_t n) {
+  n--;
+  n |= n >> 1;
+  n |= n >> 2;
+  n |= n >> 4;
+  n |= n >> 8;
+  n |= n >> 16;
+  n++;
+  return n;
+}
+
+/*
+ * Because different .SO may include the same CUDA CUB kernels, this results in
+ * confusion, where libA may end up calling libB's cub kernel and causing
+ * failures when we static link libcudart_static.a. To avoid this, we annotate
+ * only the public functions and hide the rest.
+ */
+#define DLL_PUBLIC __attribute__((visibility("default")))
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/complete_cumsum.cpp b/recommendation_v4/generative_recommenders/ops/cpp/complete_cumsum.cpp
new file mode 100644
index 000000000..4ebd426d7
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/complete_cumsum.cpp
@@ -0,0 +1,44 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/extension.h>
+
+#include "fbgemm_gpu/sparse_ops.h" // @manual
+
+namespace hstu {
+
+at::Tensor complete_cumsum_cpu(const at::Tensor& values) {
+  TORCH_CHECK(values.dim() == 1);
+  auto len = values.size(0);
+  const torch::Tensor index = at::range(0, len, at::kLong).cpu();
+  auto output = fbgemm_gpu::asynchronous_complete_cumsum_cpu(values);
+  return output;
+}
+
+at::Tensor complete_cumsum_meta(const at::Tensor& values) {
+  auto len = values.sym_size(0);
+  auto output = at::native::empty_meta_symint(
+      {len + 1},
+      /*dtype=*/::std::make_optional(values.scalar_type()),
+      /*layout=*/::std::make_optional(values.layout()),
+      /*device=*/::std::make_optional(c10::Device(c10::kMeta)),
+      /*pin_memory=*/::std::nullopt);
+  return output;
+}
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/complete_cumsum.cu b/recommendation_v4/generative_recommenders/ops/cpp/complete_cumsum.cu
new file mode 100644
index 000000000..06d1abdd6
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/complete_cumsum.cu
@@ -0,0 +1,51 @@
+#include "common.h"
+
+#include <cub/device/device_scan.cuh>
+
+namespace hstu {
+
+DLL_PUBLIC at::Tensor complete_cumsum_cuda(const at::Tensor& values) {
+  at::cuda::OptionalCUDAGuard device_guard;
+  device_guard.set_index(values.get_device());
+
+  TORCH_CHECK(values.numel() < std::numeric_limits<int32_t>::max());
+  TORCH_CHECK(values.dim() == 1);
+  const auto values_contig = values.contiguous();
+
+  auto cumsum = at::empty({values_contig.numel() + 1}, values_contig.options());
+  cumsum[0].zero_();
+
+  AT_DISPATCH_FLOATING_TYPES_AND4(
+      at::ScalarType::Int,
+      at::ScalarType::Long,
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      values_contig.scalar_type(),
+      "complete_cumsum_cuda",
+      [&] {
+        size_t temp_storage_bytes = 0;
+        AT_CUDA_CHECK(
+            cub::DeviceScan::InclusiveSum(
+                nullptr,
+                temp_storage_bytes,
+                values_contig.data_ptr<scalar_t>(),
+                cumsum.data_ptr<scalar_t>() + 1,
+                values_contig.numel(),
+                at::cuda::getCurrentCUDAStream()));
+        auto temp_storage = at::empty(
+            {static_cast<int64_t>(temp_storage_bytes)},
+            values_contig.options().dtype(at::kByte));
+        AT_CUDA_CHECK(
+            cub::DeviceScan::InclusiveSum(
+                temp_storage.data_ptr(),
+                temp_storage_bytes,
+                values_contig.data_ptr<scalar_t>(),
+                cumsum.data_ptr<scalar_t>() + 1,
+                values_contig.numel(),
+                at::cuda::getCurrentCUDAStream()));
+      });
+
+  return cumsum;
+}
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/concat_1d_jagged_jagged.cpp b/recommendation_v4/generative_recommenders/ops/cpp/concat_1d_jagged_jagged.cpp
new file mode 100644
index 000000000..51b313443
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/concat_1d_jagged_jagged.cpp
@@ -0,0 +1,111 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/Dispatch.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/extension.h>
+#include <torch/library.h>
+
+#include "fbgemm_gpu/sparse_ops.h" // @manual
+
+namespace hstu {
+
+template <typename index_t, typename val_t>
+void _concat_1d_jagged_jagged_cpu_kernel(
+    int32_t B,
+    const at::TensorAccessor<index_t, 1>& offsets_left,
+    const at::TensorAccessor<val_t, 1>& values_left,
+    const at::TensorAccessor<index_t, 1>& offsets_right,
+    const at::TensorAccessor<val_t, 1>& values_right,
+    at::TensorAccessor<val_t, 1> combined_values) {
+  for (auto b : c10::irange(B)) {
+    auto left_start = offsets_left[b];
+    auto left_len = offsets_left[b + 1] - left_start;
+    auto right_start = offsets_right[b];
+    auto right_len = offsets_right[b + 1] - right_start;
+    auto combined_start = left_start + right_start;
+    for (auto i = 0; i < left_len; ++i) {
+      combined_values[combined_start + i] = values_left[left_start + i];
+    }
+    for (auto i = 0; i < right_len; ++i) {
+      combined_values[left_len + combined_start + i] =
+          values_right[right_start + i];
+    }
+  }
+}
+
+at::Tensor concat_1d_jagged_jagged_cpu(
+    const at::Tensor& lengths_left,
+    const at::Tensor& values_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& values_right) {
+  TORCH_INTERNAL_ASSERT(lengths_left.device().type() == at::DeviceType::CPU);
+  TORCH_INTERNAL_ASSERT(values_left.device().type() == at::DeviceType::CPU);
+  TORCH_INTERNAL_ASSERT(lengths_right.device().type() == at::DeviceType::CPU);
+  TORCH_INTERNAL_ASSERT(values_right.device().type() == at::DeviceType::CPU);
+  auto L = values_left.numel() + values_right.numel();
+  TORCH_CHECK(L < std::numeric_limits<int32_t>::max());
+  TORCH_CHECK(lengths_left.size(0) == lengths_right.size(0));
+  auto B = lengths_left.size(0);
+  auto combined_values = at::empty({L}, values_left.options());
+  if (L == 0) {
+    return combined_values;
+  }
+  const auto offsets_left =
+      fbgemm_gpu::asynchronous_complete_cumsum_cpu(lengths_left.view({-1}));
+  const auto offsets_right =
+      fbgemm_gpu::asynchronous_complete_cumsum_cpu(lengths_right.view({-1}));
+  AT_DISPATCH_INTEGRAL_TYPES(
+      lengths_left.scalar_type(),
+      "concat_1d_jagged_jagged_values_cpu_kernel_input1",
+      [&] {
+        using index_t = scalar_t;
+        AT_DISPATCH_ALL_TYPES_AND2(
+            at::ScalarType::BFloat16,
+            at::ScalarType::Half,
+            values_left.scalar_type(),
+            "concat_1d_jagged_jagged_values_cpu_kernel_input2",
+            [&] {
+              using val_t = scalar_t;
+              _concat_1d_jagged_jagged_cpu_kernel<index_t, val_t>(
+                  B,
+                  offsets_left.accessor<index_t, 1>(),
+                  values_left.accessor<val_t, 1>(),
+                  offsets_right.accessor<index_t, 1>(),
+                  values_right.accessor<val_t, 1>(),
+                  combined_values.accessor<val_t, 1>());
+            });
+      });
+  return combined_values;
+}
+
+at::Tensor concat_1d_jagged_jagged_meta(
+    const at::Tensor& lengths_left,
+    const at::Tensor& values_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& values_right) {
+  auto L = values_left.numel() + values_right.numel();
+  return at::native::empty_meta_symint(
+      {L},
+      /*dtype=*/::std::make_optional(values_left.scalar_type()),
+      /*layout=*/::std::make_optional(values_left.layout()),
+      /*device=*/::std::make_optional(c10::Device(c10::kMeta)),
+      /*pin_memory=*/::std::nullopt);
+}
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/concat_1d_jagged_jagged.cu b/recommendation_v4/generative_recommenders/ops/cpp/concat_1d_jagged_jagged.cu
new file mode 100644
index 000000000..8eeae9d59
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/concat_1d_jagged_jagged.cu
@@ -0,0 +1,130 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/Dispatch.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/library.h>
+
+#include "common.h"
+#include "fbgemm_gpu/sparse_ops.h" // @manual
+#include "fbgemm_gpu/utils/fixed_divisor.cuh" // @manual
+
+namespace hstu {
+
+static constexpr int32_t kMaxThreads = 1024;
+
+template <typename index_t, typename val_t>
+__global__
+__launch_bounds__(kMaxThreads) void _concat_1d_jagged_jagged_cuda_kernel(
+    int32_t B,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+        offsets_left,
+    const at::PackedTensorAccessor32<val_t, 1, at::RestrictPtrTraits>
+        values_left,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+        offsets_right,
+    const at::PackedTensorAccessor32<val_t, 1, at::RestrictPtrTraits>
+        values_right,
+    at::PackedTensorAccessor32<val_t, 1, at::RestrictPtrTraits>
+        combined_values) {
+  for (auto b = blockIdx.x * blockDim.y + threadIdx.y;
+       b < static_cast<uint32_t>(B);
+       b += gridDim.x * blockDim.y) {
+    auto left_start = offsets_left[b];
+    auto left_len = offsets_left[b + 1] - left_start;
+    auto right_start = offsets_right[b];
+    auto right_len = offsets_right[b + 1] - right_start;
+    auto combined_start = left_start + right_start;
+    for (auto i = threadIdx.x; i < static_cast<uint32_t>(left_len + right_len);
+         i += blockDim.x) {
+      if (i < static_cast<uint32_t>(left_len)) {
+        combined_values[combined_start + i] = values_left[left_start + i];
+      } else {
+        combined_values[combined_start + i] =
+            values_right[right_start + i - left_len];
+      }
+    }
+  }
+}
+
+at::Tensor concat_1d_jagged_jagged_cuda(
+    const at::Tensor& lengths_left,
+    const at::Tensor& values_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& values_right) {
+  at::cuda::OptionalCUDAGuard device_guard;
+  device_guard.set_index(values_left.get_device());
+  TORCH_INTERNAL_ASSERT(lengths_left.device().type() == at::DeviceType::CUDA);
+  TORCH_INTERNAL_ASSERT(values_left.device().type() == at::DeviceType::CUDA);
+  TORCH_INTERNAL_ASSERT(lengths_right.device().type() == at::DeviceType::CUDA);
+  TORCH_INTERNAL_ASSERT(values_right.device().type() == at::DeviceType::CUDA);
+  auto L = values_left.numel() + values_right.numel();
+  TORCH_CHECK(L < std::numeric_limits<int32_t>::max());
+  TORCH_CHECK(values_left.get_device() == lengths_left.get_device());
+  TORCH_CHECK(values_left.get_device() == lengths_right.get_device());
+  TORCH_CHECK(values_left.get_device() == values_right.get_device());
+  auto B = lengths_left.size(0);
+  auto combined_values = at::empty({L}, values_left.options());
+  if (L == 0) {
+    return combined_values;
+  }
+  const auto offsets_left =
+      fbgemm_gpu::asynchronous_complete_cumsum_gpu(lengths_left.view({-1}));
+  const auto offsets_right =
+      fbgemm_gpu::asynchronous_complete_cumsum_gpu(lengths_right.view({-1}));
+  // Optimized thread block configuration based on benchmark results
+  uint32_t B_blocks = 4;
+  dim3 threads(256, B_blocks);
+  auto blocks = div_round_up(B, B_blocks);
+  AT_DISPATCH_INTEGRAL_TYPES(
+      lengths_left.scalar_type(),
+      "concat_1d_jagged_jagged_values_cuda_kernel_input1",
+      [&] {
+        using index_t = scalar_t;
+        AT_DISPATCH_ALL_TYPES_AND2(
+            at::ScalarType::BFloat16,
+            at::ScalarType::Half,
+            values_left.scalar_type(),
+            "concat_1d_jagged_jagged_values_cuda_kernel_input2",
+            [&] {
+              using val_t = scalar_t;
+              _concat_1d_jagged_jagged_cuda_kernel<index_t, val_t>
+                  <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
+                      B,
+                      offsets_left.packed_accessor32<
+                          index_t,
+                          1,
+                          at::RestrictPtrTraits>(),
+                      values_left
+                          .packed_accessor32<val_t, 1, at::RestrictPtrTraits>(),
+                      offsets_right.packed_accessor32<
+                          index_t,
+                          1,
+                          at::RestrictPtrTraits>(),
+                      values_right
+                          .packed_accessor32<val_t, 1, at::RestrictPtrTraits>(),
+                      combined_values.packed_accessor32<
+                          val_t,
+                          1,
+                          at::RestrictPtrTraits>());
+            });
+      });
+  return combined_values;
+}
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/cpp_ops.cpp b/recommendation_v4/generative_recommenders/ops/cpp/cpp_ops.cpp
new file mode 100644
index 000000000..155cc7572
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/cpp_ops.cpp
@@ -0,0 +1,207 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/Dispatch.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/extension.h>
+#include <torch/library.h>
+
+/*
+ * Because different .SO may include the same CUDA CUB kernels, this results in
+ * confusion, where libA may end up calling libB's cub kernel and causing
+ * failures when we static link libcudart_static.a. To avoid this, we annotate
+ * only the public functions and hide the rest.
+ */
+#define DLL_PUBLIC __attribute__((visibility("default")))
+
+namespace hstu {
+at::Tensor expand_1d_jagged_to_dense_cpu(
+    const at::Tensor& values,
+    const at::Tensor& offsets,
+    const int64_t max_len);
+
+at::Tensor expand_1d_jagged_to_dense_meta(
+    const at::Tensor& values,
+    const at::Tensor& offsets,
+    const c10::SymInt max_len);
+
+at::Tensor expand_1d_jagged_to_dense_cuda(
+    const at::Tensor& values,
+    const at::Tensor& offsets,
+    const int64_t max_len);
+
+at::Tensor complete_cumsum_cpu(const at::Tensor& values);
+
+at::Tensor complete_cumsum_cuda(const at::Tensor& values);
+
+at::Tensor complete_cumsum_meta(const at::Tensor& values);
+
+at::Tensor concat_1d_jagged_jagged_cpu(
+    const at::Tensor& lengths_left,
+    const at::Tensor& values_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& values_right);
+
+at::Tensor concat_1d_jagged_jagged_cuda(
+    const at::Tensor& lengths_left,
+    const at::Tensor& values_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& values_right);
+
+at::Tensor concat_1d_jagged_jagged_meta(
+    const at::Tensor& lengths_left,
+    const at::Tensor& values_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& values_right);
+
+std::tuple<at::Tensor, at::Tensor> split_1d_jagged_jagged_cpu(
+    const at::Tensor& lengths_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& combined_values);
+
+std::tuple<at::Tensor, at::Tensor> split_1d_jagged_jagged_cuda(
+    const at::Tensor& lengths_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& combined_values);
+
+std::tuple<at::Tensor, at::Tensor> split_1d_jagged_jagged_meta(
+    const at::Tensor& lengths_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& combined_values);
+
+at::Tensor replace_last_n_with_jagged_cpu(
+    const at::Tensor& lengths_left,
+    const at::Tensor& values_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& values_right);
+
+at::Tensor replace_last_n_with_jagged_cuda(
+    const at::Tensor& lengths_left,
+    const at::Tensor& values_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& values_right);
+
+at::Tensor replace_last_n_with_jagged_meta(
+    const at::Tensor& lengths_left,
+    const at::Tensor& values_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& values_right);
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> jagged_transpose_1d_cpu(
+    const at::Tensor& values,
+    const at::Tensor& offsets,
+    const at::Tensor& lengths,
+    const int64_t max_len,
+    const int64_t size1,
+    const int64_t size2);
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> jagged_transpose_1d_cuda(
+    const at::Tensor& values,
+    const at::Tensor& offsets,
+    const at::Tensor& lengths,
+    const int64_t max_len,
+    const int64_t size1,
+    const int64_t size2);
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> jagged_transpose_1d_meta(
+    const at::Tensor& values,
+    const at::Tensor& offsets,
+    const at::Tensor& lengths,
+    const int64_t max_len,
+    const int64_t size1,
+    const int64_t size2);
+
+DLL_PUBLIC std::tuple<at::Tensor, at::Tensor> sort_kv_pairs_meta(
+    const at::Tensor& keys,
+    const at::Tensor& values,
+    const std::optional<int64_t>& end_bit,
+    const bool descending = false) {
+  TORCH_CHECK(
+      keys.dtype() == at::kInt || keys.dtype() == at::kLong ||
+      keys.dtype() == at::kByte || keys.dtype() == at::kShort);
+  TORCH_CHECK(keys.dim() == 1);
+  TORCH_CHECK(values.dim() == 1);
+  return {at::empty_like(keys), at::empty_like(values)};
+}
+
+std::tuple<at::Tensor, at::Tensor> sort_kv_pairs_cuda(
+    const at::Tensor& keys,
+    const at::Tensor& values,
+    const std::optional<int64_t>& end_bit,
+    const bool descending = false);
+
+} // namespace hstu
+
+TORCH_LIBRARY_FRAGMENT(hstu, m) {
+  m.def(
+      "expand_1d_jagged_to_dense(Tensor values, Tensor offsets, SymInt max_len) -> Tensor");
+  m.def(
+      "concat_1d_jagged_jagged(Tensor lengths_left, Tensor values_left, Tensor lengths_right, Tensor values_right) -> Tensor");
+  m.def(
+      "split_1d_jagged_jagged(Tensor lengths_left, Tensor lengths_right, Tensor combined_values) -> (Tensor, Tensor)");
+  m.def(
+      "replace_last_n_with_jagged(Tensor lengths_left, Tensor values_left, Tensor lengths_right, Tensor values_right) -> Tensor");
+  m.def(
+      "jagged_transpose_1d(Tensor values, Tensor offsets, Tensor lengths, int max_len, int size1, int size2) -> (Tensor, Tensor, Tensor)");
+  m.def("complete_cumsum(Tensor values) -> Tensor");
+  m.def(
+      "sort_kv_pairs(Tensor keys, Tensor values, int? end_bit=None, bool descending=False) -> (Tensor, Tensor)");
+}
+
+TORCH_LIBRARY_IMPL(hstu, CPU, m) {
+  m.impl("expand_1d_jagged_to_dense", hstu::expand_1d_jagged_to_dense_cpu);
+  m.impl("concat_1d_jagged_jagged", hstu::concat_1d_jagged_jagged_cpu);
+  m.impl("split_1d_jagged_jagged", hstu::split_1d_jagged_jagged_cpu);
+  m.impl("replace_last_n_with_jagged", hstu::replace_last_n_with_jagged_cpu);
+  m.impl("jagged_transpose_1d", hstu::jagged_transpose_1d_cpu);
+  m.impl("complete_cumsum", hstu::complete_cumsum_cpu);
+}
+
+TORCH_LIBRARY_IMPL(hstu, CUDA, m) {
+  m.impl("expand_1d_jagged_to_dense", hstu::expand_1d_jagged_to_dense_cuda);
+  m.impl("concat_1d_jagged_jagged", hstu::concat_1d_jagged_jagged_cuda);
+  m.impl("split_1d_jagged_jagged", hstu::split_1d_jagged_jagged_cuda);
+  m.impl("replace_last_n_with_jagged", hstu::replace_last_n_with_jagged_cuda);
+  m.impl("jagged_transpose_1d", hstu::jagged_transpose_1d_cuda);
+  m.impl("complete_cumsum", hstu::complete_cumsum_cuda);
+  m.impl(
+      "sort_kv_pairs",
+      torch::dispatch(
+          c10::DispatchKey::CUDA, TORCH_FN(hstu::sort_kv_pairs_cuda)));
+}
+
+TORCH_LIBRARY_IMPL(hstu, Meta, m) {
+  m.impl("expand_1d_jagged_to_dense", hstu::expand_1d_jagged_to_dense_meta);
+  m.impl("concat_1d_jagged_jagged", hstu::concat_1d_jagged_jagged_meta);
+  m.impl("split_1d_jagged_jagged", hstu::split_1d_jagged_jagged_meta);
+  m.impl("replace_last_n_with_jagged", hstu::replace_last_n_with_jagged_meta);
+  m.impl("jagged_transpose_1d", hstu::jagged_transpose_1d_meta);
+  m.impl("complete_cumsum", hstu::complete_cumsum_meta);
+  m.impl(
+      "sort_kv_pairs",
+      torch::dispatch(
+          c10::DispatchKey::Meta, TORCH_FN(hstu::sort_kv_pairs_meta)));
+}
+
+TORCH_LIBRARY_IMPL(hstu, Autograd, m) {
+  m.impl(
+      "expand_1d_jagged_to_dense",
+      torch::autograd::autogradNotImplementedFallback());
+  m.impl("complete_cumsum", torch::autograd::autogradNotImplementedFallback());
+}
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/cuda_hstu_attention.py b/recommendation_v4/generative_recommenders/ops/cpp/cuda_hstu_attention.py
new file mode 100644
index 000000000..0f9458c8b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/cuda_hstu_attention.py
@@ -0,0 +1,193 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+
+from typing import Optional
+
+import torch
+from generative_recommenders.ops.utils import is_sm100_plus
+
+try:
+    # We need to import the CUDA kernels after importing torch
+    import hstu._C  # pyre-ignore [21]
+except:
+    pass
+try:
+    torch.ops.load_library(
+        "//generative_recommenders/fb/ultra/ops/blackwell/hstu_attention:hstu_flash_attention"
+    )
+    torch.ops.load_library(
+        "//generative_recommenders/ops/cpp/hstu_attention:hstu_flash_attention"
+    )
+except:
+    pass
+
+
+def cuda_hstu_mha(
+    max_seq_len: int,
+    alpha: float,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: Optional[torch.Tensor] = None,
+    causal: bool = False,
+    num_targets: Optional[torch.Tensor] = None,
+    attn_scale: Optional[torch.Tensor] = None,
+    max_attn_len: int = 0,
+    min_full_attn_seq_len: int = 0,
+    contextual_seq_len: int = 0,
+    q_descale: Optional[torch.Tensor] = None,
+    k_descale: Optional[torch.Tensor] = None,
+    v_descale: Optional[torch.Tensor] = None,
+    sort_by_length: bool = False,
+    deterministic: bool = False,
+    sm_margin: int = 0,
+    max_q_len: int = 0,
+    seq_offsets_q: Optional[torch.Tensor] = None,
+    num_softmax_heads: int = 0,
+    training: bool = True,
+    max_seq_len_tensor: Optional[torch.Tensor] = None,
+    contextual_seq_len_tensor: Optional[torch.Tensor] = None,
+    max_attn_len_tensor: Optional[torch.Tensor] = None,
+    min_full_attn_seq_len_tensor: Optional[torch.Tensor] = None,
+    num_groups: int = 1,
+    is_inference: bool = False,
+) -> torch.Tensor:
+    """
+    Arguments:
+        q, k, v: (batch_size, seqlen, nheads, headdim) or (total_seqlen, nheads, headdim)
+    """
+    if is_sm100_plus() and not is_inference:
+        return torch.ops.bw_hstu.bw_hstu_mha(
+            max_seq_len,
+            alpha,
+            q,
+            k,
+            v,
+            seq_offsets,
+            causal,
+            num_targets,
+            attn_scale,
+            max_attn_len,
+            min_full_attn_seq_len,
+            contextual_seq_len,
+            q_descale,
+            k_descale,
+            v_descale,
+            sort_by_length,
+            deterministic,
+            sm_margin,
+            max_q_len,
+            seq_offsets_q,
+            max_seq_len_tensor,
+            contextual_seq_len_tensor,
+            max_attn_len_tensor,
+            min_full_attn_seq_len_tensor,
+            num_groups,
+            num_softmax_heads,
+        )
+    else:
+        return cuda_hstu_mha_inference_wrapper(
+            max_seq_len,
+            alpha,
+            q,
+            k,
+            v,
+            seq_offsets,
+            causal,
+            num_targets,
+            attn_scale,
+            max_attn_len,
+            min_full_attn_seq_len,
+            contextual_seq_len,
+            q_descale,
+            k_descale,
+            v_descale,
+            sort_by_length,
+            deterministic,
+            sm_margin,
+            max_q_len,
+            seq_offsets_q,
+            num_softmax_heads,
+            training,
+            max_seq_len_tensor,
+            contextual_seq_len_tensor,
+            max_attn_len_tensor,
+            min_full_attn_seq_len_tensor,
+            num_groups,
+        )
+
+
+@torch.fx.wrap
+def cuda_hstu_mha_inference_wrapper(
+    max_seq_len: int,
+    alpha: float,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: Optional[torch.Tensor] = None,
+    causal: bool = False,
+    num_targets: Optional[torch.Tensor] = None,
+    attn_scale: Optional[torch.Tensor] = None,
+    max_attn_len: int = 0,
+    min_full_attn_seq_len: int = 0,
+    contextual_seq_len: int = 0,
+    q_descale: Optional[torch.Tensor] = None,
+    k_descale: Optional[torch.Tensor] = None,
+    v_descale: Optional[torch.Tensor] = None,
+    sort_by_length: bool = False,
+    deterministic: bool = False,
+    sm_margin: int = 0,
+    max_q_len: int = 0,
+    seq_offsets_q: Optional[torch.Tensor] = None,
+    num_softmax_heads: int = 0,
+    training: bool = True,
+    max_seq_len_tensor: Optional[torch.Tensor] = None,
+    contextual_seq_len_tensor: Optional[torch.Tensor] = None,
+    max_attn_len_tensor: Optional[torch.Tensor] = None,
+    min_full_attn_seq_len_tensor: Optional[torch.Tensor] = None,
+    num_groups: int = 1,
+) -> torch.Tensor:
+    attn_scale = attn_scale.to(torch.float32) if attn_scale is not None else attn_scale
+
+    return torch.ops.hstu.hstu_mha(
+        max_seq_len,
+        alpha,
+        q,
+        k,
+        v,
+        seq_offsets,
+        causal,
+        num_targets,
+        attn_scale,
+        max_attn_len,
+        min_full_attn_seq_len,
+        contextual_seq_len,
+        q_descale,
+        k_descale,
+        v_descale,
+        sort_by_length,
+        deterministic,
+        sm_margin,
+        max_q_len,
+        seq_offsets_q,
+        num_softmax_heads,
+        training,
+        max_seq_len_tensor,
+        contextual_seq_len_tensor,
+        max_attn_len_tensor,
+        min_full_attn_seq_len_tensor,
+        num_groups,
+    )
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/cuda_hstu_preprocess_and_attention.py b/recommendation_v4/generative_recommenders/ops/cpp/cuda_hstu_preprocess_and_attention.py
new file mode 100644
index 000000000..2184ef2a5
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/cuda_hstu_preprocess_and_attention.py
@@ -0,0 +1,668 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from typing import Optional, Tuple
+
+import torch
+from generative_recommenders.ops.triton.triton_addmm import (
+    maybe_triton_addmm_fwd,
+    triton_addmm_bwd,
+)
+from generative_recommenders.ops.triton.triton_layer_norm import (
+    triton_weighted_layer_norm_bwd,
+)
+from generative_recommenders.ops.utils import copy_if_different_ptr, is_sm100_plus
+from torch.nn import functional as F
+
+try:
+    from generative_recommenders.fb.ultra.ops.fp8.fp8_addmm import (
+        fp8_rowwise_quantize_addmm,
+    )
+    from generative_recommenders.fb.ultra.ops.fp8.layer_norm_quantization import (
+        triton_weighted_layer_norm_quantization_fwd,
+    )
+    from hammer.ops.triton.triton_apply_rope import (
+        triton_apply_rope_bwd,
+        triton_apply_rope_fwd,
+    )
+
+    if is_sm100_plus():
+        print("is sm100_plus architecture, loading hstu flash attention for blackwell")
+        torch.ops.load_library(
+            "//generative_recommenders/fb/ultra/ops/blackwell/hstu_attention:hstu_flash_attention"
+        )
+    print("loading hstu flash attention for general architecture")
+    torch.ops.load_library(
+        "//generative_recommenders/ops/cpp/hstu_attention:hstu_flash_attention"
+    )
+except Exception as ex:
+    print(f"Library importing error when importing library: {ex}")
+
+
+class _HSTUPreprocessAndAttentionFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore [14]
+    def forward(
+        ctx,  # pyre-ignore [2]
+        x: torch.Tensor,
+        norm_weight: torch.Tensor,
+        norm_bias: torch.Tensor,
+        norm_eps: float,
+        num_heads: int,
+        attn_dim: int,
+        hidden_dim: int,
+        uvqk_weight: torch.Tensor,
+        uvqk_bias: Optional[torch.Tensor],
+        max_seq_len: int,
+        seq_offsets: torch.Tensor,
+        alpha: float,
+        invalid_attn_mask_type: str,
+        num_targets: Optional[torch.Tensor],
+        rotary_weights: Optional[
+            Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
+        ] = None,
+        attn_scale: Optional[torch.Tensor] = None,
+        recompute_uvqk_in_backward: bool = False,
+        recompute_normed_x_in_backward: bool = False,
+        contextual_seq_len: int = 0,
+        sort_by_length: bool = False,
+        max_attn_len: Optional[int] = None,
+        full_attn_size: Optional[int] = None,
+        silu_u: bool = True,
+        fp8_in_addmm_fwd: bool = False,
+        num_softmax_heads: int = 0,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        max_attn_len = max_attn_len or 0
+        full_attn_size = full_attn_size or 0
+        normed_x, x_mean, x_rstd, BLOCK_D, x_scale, normed_x_fp8 = (
+            triton_weighted_layer_norm_quantization_fwd(
+                x=x,
+                weight=norm_weight,
+                bias=norm_bias,
+                eps=norm_eps,
+                quantize_output=fp8_in_addmm_fwd,
+            )
+        )
+        # When silu_u is False and we want to recompute in backward, we split the weight
+        # for u and vqk separately during training to compute them independently.
+        # This avoids needing to clone u (which would otherwise keep the whole uvqk alive).
+        if not silu_u and recompute_uvqk_in_backward:
+            # Split the weights/biases to compute u and vqk separately
+            u_weight, vqk_weight = uvqk_weight.split(
+                [
+                    hidden_dim * num_heads,
+                    hidden_dim * num_heads
+                    + attn_dim * num_heads
+                    + attn_dim * num_heads,
+                ],
+                dim=1,
+            )
+            if uvqk_bias is not None:
+                u_bias, vqk_bias = uvqk_bias.split(
+                    [
+                        hidden_dim * num_heads,
+                        hidden_dim * num_heads
+                        + attn_dim * num_heads
+                        + attn_dim * num_heads,
+                    ],
+                    dim=0,
+                )
+            else:
+                u_bias, vqk_bias = None, None
+            if fp8_in_addmm_fwd:
+                assert x_scale is not None and normed_x_fp8 is not None
+                u = fp8_rowwise_quantize_addmm(
+                    x=normed_x,
+                    x_fp8=normed_x_fp8,
+                    w=u_weight,
+                    y=u_bias,
+                    x_scale=x_scale,
+                    custom_kernel=False,
+                    is_inference=False,
+                ).contiguous()
+                vqk = fp8_rowwise_quantize_addmm(
+                    x=normed_x,
+                    x_fp8=normed_x_fp8,
+                    w=vqk_weight,
+                    y=vqk_bias,
+                    x_scale=x_scale,
+                    custom_kernel=False,
+                    is_inference=False,
+                ).contiguous()
+            else:
+                u = maybe_triton_addmm_fwd(normed_x, u_weight, u_bias).contiguous()
+                vqk = maybe_triton_addmm_fwd(
+                    normed_x, vqk_weight, vqk_bias
+                ).contiguous()
+            v, q, k = vqk.split(
+                [
+                    hidden_dim * num_heads,
+                    attn_dim * num_heads,
+                    attn_dim * num_heads,
+                ],
+                dim=1,
+            )
+            # uvqk is not used since we split the computation, but we need it
+            # for saving in case recompute_uvqk_in_backward is False in a
+            # different code path. Set to None to satisfy type checker.
+            uvqk = None
+        else:
+            if fp8_in_addmm_fwd:
+                assert (
+                    x_scale is not None
+                    and normed_x_fp8 is not None
+                    and uvqk_bias is not None
+                )
+                uvqk = fp8_rowwise_quantize_addmm(
+                    x=normed_x,
+                    x_fp8=normed_x_fp8,
+                    w=uvqk_weight,
+                    y=uvqk_bias,
+                    x_scale=x_scale,
+                    custom_kernel=False,
+                    is_inference=False,
+                ).contiguous()
+            else:
+                uvqk = maybe_triton_addmm_fwd(
+                    normed_x, uvqk_weight, uvqk_bias
+                ).contiguous()
+            u, v, q, k = uvqk.split(
+                [
+                    hidden_dim * num_heads,
+                    hidden_dim * num_heads,
+                    attn_dim * num_heads,
+                    attn_dim * num_heads,
+                ],
+                dim=1,
+            )
+            if silu_u:
+                u = F.silu(u)
+        if rotary_weights is not None:
+            q_cos_weights = rotary_weights[0]
+            q_sin_weights = rotary_weights[1]
+            k_cos_weights = rotary_weights[2]
+            k_sin_weights = rotary_weights[3]
+            _q = triton_apply_rope_fwd(
+                x=q.view(-1, num_heads, attn_dim),
+                N=max_seq_len,
+                seq_offsets=seq_offsets,
+                cos_rope=q_cos_weights,
+                sin_rope=q_sin_weights,
+            ).view(-1, num_heads * attn_dim)
+            _k = triton_apply_rope_fwd(
+                x=k.view(-1, num_heads, attn_dim),
+                N=max_seq_len,
+                seq_offsets=seq_offsets,
+                cos_rope=k_cos_weights,
+                sin_rope=k_sin_weights,
+            ).view(-1, num_heads * attn_dim)
+            copy_if_different_ptr(q, _q)
+            copy_if_different_ptr(k, _k)
+        q = q.view(-1, num_heads, attn_dim)
+        k = k.view(-1, num_heads, attn_dim)
+        v = v.view(-1, num_heads, hidden_dim)
+        if is_sm100_plus():
+            out, softmax_lse = torch.ops.bw_hstu.bw_hstu_mha_fwd(
+                max_seq_len,
+                alpha,
+                q,
+                k,
+                v,
+                seq_offsets,
+                True,  # causal
+                num_targets,
+                attn_scale,
+                max_attn_len,
+                full_attn_size,
+                contextual_seq_len,
+                None,  # q_descale
+                None,  # k_descale
+                None,  # v_descale
+                0,  # sm_margin
+                max_seq_len,  # max_q_len,
+                None,  # seq_offsets_q,
+                None,  # max_seq_len_tensor,
+                None,  # contextual_seq_len_tensor,
+                None,  # max_attn_len_tensor,
+                None,  # min_full_attn_seq_len_tensor,
+                1,  # num_groups
+                num_softmax_heads,  # num_softmax_heads
+            )
+        else:
+            out, softmax_lse = torch.ops.hstu.hstu_mha_fwd(
+                max_seq_len,
+                alpha,
+                q,
+                k,
+                v,
+                seq_offsets,
+                True,  # causal
+                num_targets,
+                attn_scale,
+                max_attn_len,
+                full_attn_size,
+                contextual_seq_len,
+                None,  # q_descale
+                None,  # k_descale
+                None,  # v_descale
+                0,  # sm_margin
+                0,  # max_q_len,
+                None,  # seq_offsets_q,
+                num_softmax_heads,  # num_softmax_heads,
+            )
+        # update ctx
+        saved_tensors = [
+            x,
+            norm_weight,
+            norm_bias,
+            x_mean,
+            x_rstd,
+            uvqk_weight,
+            seq_offsets,
+            out,
+        ]
+        if num_softmax_heads > 0:
+            saved_tensors.append(softmax_lse)
+        if num_targets is not None:
+            saved_tensors.append(num_targets)
+        if attn_scale is not None:
+            saved_tensors.append(attn_scale)
+        if not recompute_normed_x_in_backward:
+            saved_tensors.append(normed_x)
+        if recompute_uvqk_in_backward:
+            if uvqk_bias is not None:
+                saved_tensors.append(uvqk_bias)
+            if fp8_in_addmm_fwd:
+                saved_tensors.append(x_scale)  # pyre-ignore
+                saved_tensors.append(normed_x_fp8)  # pyre-ignore
+        else:
+            saved_tensors.append(uvqk)
+        if rotary_weights is not None:
+            saved_tensors.append(rotary_weights[0])
+            saved_tensors.append(rotary_weights[1])
+            saved_tensors.append(rotary_weights[2])
+            saved_tensors.append(rotary_weights[3])
+        ctx.save_for_backward(*saved_tensors)
+        ctx.alpha = alpha
+        ctx.invalid_attn_mask_type = invalid_attn_mask_type
+        ctx.has_multiple_targets = num_targets is not None
+        ctx.has_rotary_weights = rotary_weights is not None
+        ctx.has_attn_scale = attn_scale is not None
+        ctx.max_seq_len = max_seq_len
+        ctx.max_attn_len = max_attn_len
+        ctx.full_attn_size = full_attn_size
+        ctx.recompute_normed_x_in_backward = recompute_normed_x_in_backward
+        ctx.recompute_uvqk_in_backward = recompute_uvqk_in_backward
+        ctx.hidden_dim = hidden_dim
+        ctx.attn_dim = attn_dim
+        ctx.num_heads = num_heads
+        ctx.has_uvqk_bias = uvqk_bias is not None
+        ctx.uvqk_bias_1d = uvqk_bias.dim() == 1 if uvqk_bias is not None else False
+        ctx.norm_eps = norm_eps
+        ctx.norm_BLOCK_D = BLOCK_D
+        ctx.contextual_seq_len = contextual_seq_len
+        ctx.sort_by_length = sort_by_length
+        ctx.silu_u = silu_u
+        ctx.fp8_in_addmm_fwd = fp8_in_addmm_fwd
+        ctx.num_softmax_heads = num_softmax_heads
+        return u, out
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx,  # pyre-ignore[2]
+        _du: torch.Tensor,
+        dout: torch.Tensor,
+    ) -> Tuple[
+        torch.Tensor,  # d_x
+        torch.Tensor,  # d_norm_weight
+        torch.Tensor,  # d_norm_bias
+        None,
+        None,
+        None,
+        None,
+        torch.Tensor,  # d_uvqk_weight
+        torch.Tensor,  # d_uvqk_bias
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+    ]:
+        x, norm_weight, norm_bias, x_mean, x_rstd, uvqk_weight, seq_offsets, out = (
+            ctx.saved_tensors[:8]
+        )
+        idx = 8
+        if ctx.num_softmax_heads > 0:
+            softmax_lse = ctx.saved_tensors[idx]
+            idx += 1
+        else:
+            softmax_lse = None
+        if ctx.has_multiple_targets:
+            num_targets = ctx.saved_tensors[idx]
+            idx += 1
+        else:
+            num_targets = None
+        if ctx.has_attn_scale:
+            attn_scale = ctx.saved_tensors[idx]
+            idx += 1
+        else:
+            attn_scale = None
+        if ctx.recompute_normed_x_in_backward:
+            normed_x, _, _, _, _, _ = triton_weighted_layer_norm_quantization_fwd(
+                x=x,
+                weight=norm_weight,
+                bias=norm_bias,
+                eps=ctx.norm_eps,
+                mean=x_mean,
+                rstd=x_rstd,
+                quantize_output=ctx.fp8_in_addmm_fwd,
+            )
+        else:
+            normed_x = ctx.saved_tensors[idx]
+            idx += 1
+        if ctx.recompute_uvqk_in_backward:
+            if ctx.has_uvqk_bias:
+                uvqk_bias = ctx.saved_tensors[idx]
+                idx += 1
+            else:
+                uvqk_bias = None
+            if not ctx.silu_u:
+                # When silu_u is False, we only recompute vqk (not u)
+                # Split the weights/biases to extract vqk portion
+                _, vqk_weight = uvqk_weight.split(
+                    [
+                        ctx.hidden_dim * ctx.num_heads,
+                        ctx.hidden_dim * ctx.num_heads
+                        + ctx.attn_dim * ctx.num_heads
+                        + ctx.attn_dim * ctx.num_heads,
+                    ],
+                    dim=1,
+                )
+                vqk_bias = None
+                if ctx.has_uvqk_bias:
+                    _, vqk_bias = uvqk_bias.split(
+                        [
+                            ctx.hidden_dim * ctx.num_heads,
+                            ctx.hidden_dim * ctx.num_heads
+                            + ctx.attn_dim * ctx.num_heads
+                            + ctx.attn_dim * ctx.num_heads,
+                        ],
+                        dim=0,
+                    )
+                if ctx.fp8_in_addmm_fwd:
+                    x_scale, normed_x_fp8 = ctx.saved_tensors[idx : idx + 2]
+                    vqk = fp8_rowwise_quantize_addmm(
+                        x=normed_x,
+                        x_fp8=normed_x_fp8,
+                        w=vqk_weight,
+                        y=vqk_bias,
+                        x_scale=x_scale,
+                        custom_kernel=False,
+                        is_inference=False,
+                    )
+                    idx += 2
+                else:
+                    vqk = maybe_triton_addmm_fwd(
+                        normed_x, vqk_weight, vqk_bias
+                    ).contiguous()
+                # Split vqk into v, q, k components
+                v, q, k = vqk.split(
+                    [
+                        ctx.hidden_dim * ctx.num_heads,
+                        ctx.attn_dim * ctx.num_heads,
+                        ctx.attn_dim * ctx.num_heads,
+                    ],
+                    dim=1,
+                )
+                u = None
+            else:
+                # When silu_u is True, we recompute uvqk (all components)
+                if ctx.fp8_in_addmm_fwd:
+                    x_scale, normed_x_fp8 = ctx.saved_tensors[idx : idx + 2]
+                    uvqk = fp8_rowwise_quantize_addmm(
+                        x=normed_x,
+                        x_fp8=normed_x_fp8,
+                        w=uvqk_weight,
+                        y=uvqk_bias,
+                        x_scale=x_scale,
+                        custom_kernel=False,
+                        is_inference=False,
+                    )
+                    idx += 2
+                else:
+                    uvqk = maybe_triton_addmm_fwd(
+                        normed_x, uvqk_weight, uvqk_bias
+                    ).contiguous()
+                # Split uvqk into u, v, q, k components
+                u, v, q, k = uvqk.split(
+                    [
+                        ctx.hidden_dim * ctx.num_heads,
+                        ctx.hidden_dim * ctx.num_heads,
+                        ctx.attn_dim * ctx.num_heads,
+                        ctx.attn_dim * ctx.num_heads,
+                    ],
+                    dim=1,
+                )
+        else:
+            uvqk = ctx.saved_tensors[idx]
+            idx += 1
+            # Split saved uvqk into u, v, q, k components
+            u, v, q, k = uvqk.split(
+                [
+                    ctx.hidden_dim * ctx.num_heads,
+                    ctx.hidden_dim * ctx.num_heads,
+                    ctx.attn_dim * ctx.num_heads,
+                    ctx.attn_dim * ctx.num_heads,
+                ],
+                dim=1,
+            )
+        if ctx.has_rotary_weights:
+            q_cos_weights, q_sin_weights, k_cos_weights, k_sin_weights = (
+                ctx.saved_tensors[idx : idx + 4]
+            )
+            idx += 4
+        else:
+            q_cos_weights, q_sin_weights, k_cos_weights, k_sin_weights = (
+                None,
+                None,
+                None,
+                None,
+            )
+
+        duvqk = torch.empty(
+            [
+                x.size(0),
+                ctx.hidden_dim * ctx.num_heads * 2 + ctx.attn_dim * ctx.num_heads * 2,
+            ],
+            device=x.device,
+            dtype=x.dtype,
+        )
+        du, dv, dq, dk = duvqk.split(
+            [
+                ctx.hidden_dim * ctx.num_heads,
+                ctx.hidden_dim * ctx.num_heads,
+                ctx.attn_dim * ctx.num_heads,
+                ctx.attn_dim * ctx.num_heads,
+            ],
+            dim=1,
+        )
+        q = q.view(-1, ctx.num_heads, ctx.attn_dim)
+        k = k.view(-1, ctx.num_heads, ctx.attn_dim)
+        v = v.view(-1, ctx.num_heads, ctx.hidden_dim)
+        dq = dq.view(-1, ctx.num_heads, ctx.attn_dim)
+        dk = dk.view(-1, ctx.num_heads, ctx.attn_dim)
+        dv = dv.view(-1, ctx.num_heads, ctx.hidden_dim)
+        if (
+            ctx.recompute_uvqk_in_backward and ctx.has_rotary_weights
+        ):  # recompute ROPE on qk
+            q = triton_apply_rope_fwd(
+                x=q,
+                N=ctx.max_seq_len,
+                seq_offsets=seq_offsets,
+                cos_rope=q_cos_weights,
+                sin_rope=q_sin_weights,
+            )
+            k = triton_apply_rope_fwd(
+                x=k,
+                N=ctx.max_seq_len,
+                seq_offsets=seq_offsets,
+                cos_rope=k_cos_weights,
+                sin_rope=k_sin_weights,
+            )
+        dq = dq.view(-1, ctx.num_heads, ctx.attn_dim)
+        dk = dk.view(-1, ctx.num_heads, ctx.attn_dim)
+        dv = dv.view(-1, ctx.num_heads, ctx.hidden_dim)
+        # Note: the two operations below update duvqk in place
+        if is_sm100_plus():
+            _dq, _dk, _dv = torch.ops.bw_hstu.bw_hstu_mha_bwd(
+                ctx.max_seq_len,
+                ctx.alpha,
+                dout,
+                q,
+                k,
+                v,
+                dq,
+                dk,
+                dv,
+                seq_offsets,
+                True,  # causal
+                num_targets,
+                attn_scale,
+                ctx.max_attn_len,
+                ctx.full_attn_size,
+                ctx.contextual_seq_len,
+                ctx.sort_by_length,
+                False,  # deterministic
+                0,  # sm_margin
+                ctx.max_seq_len,  # max_q_len,
+                None,  # seq_offsets_q,
+                None,  # max_seq_len_tensor,
+                None,  # contextual_seq_len_tensor,
+                None,  # max_attn_len_tensor,
+                None,  # min_full_attn_seq_len_tensor,
+                1,  # num_groups
+                ctx.num_softmax_heads,  # num_softmax_heads
+                out,  # out
+                softmax_lse,  # lse
+            )
+        else:
+            _dq, _dk, _dv = torch.ops.hstu.hstu_mha_bwd(
+                ctx.max_seq_len,
+                ctx.alpha,
+                dout,
+                q,
+                k,
+                v,
+                dq,
+                dk,
+                dv,
+                out,
+                seq_offsets,
+                True,  # causal
+                num_targets,
+                attn_scale,
+                ctx.max_attn_len,
+                ctx.full_attn_size,
+                ctx.contextual_seq_len,
+                ctx.sort_by_length,
+                False,  # deterministic
+                0,  # sm_margin
+                0,  # max_q_len,
+                None,  # seq_offsets_q,
+                ctx.num_softmax_heads,  # num_softmax_heads,
+                softmax_lse,
+            )
+        if ctx.has_rotary_weights:
+            _dq = triton_apply_rope_bwd(
+                grad=_dq,
+                N=ctx.max_seq_len,
+                seq_offsets=seq_offsets,
+                cos_rope=q_cos_weights,
+                sin_rope=q_sin_weights,
+            )
+            _dk = triton_apply_rope_bwd(
+                grad=_dk,
+                N=ctx.max_seq_len,
+                seq_offsets=seq_offsets,
+                cos_rope=k_cos_weights,
+                sin_rope=k_sin_weights,
+            )
+        copy_if_different_ptr(dq, _dq)
+        copy_if_different_ptr(dk, _dk)
+        copy_if_different_ptr(dv, _dv)
+        if ctx.silu_u:
+            torch.ops.aten.silu_backward(_du, u, grad_input=du)
+        else:
+            copy_if_different_ptr(du, _du)
+        d_normed_x, d_uvqk_weight, d_uvqk_bias = triton_addmm_bwd(
+            x=normed_x,
+            w=uvqk_weight,
+            dz=duvqk,
+            is_y_1d=ctx.uvqk_bias_1d and ctx.has_uvqk_bias,
+        )
+        d_x, d_norm_weight, d_norm_bias = triton_weighted_layer_norm_bwd(
+            dy=d_normed_x,
+            x=x,
+            weight=norm_weight,
+            bias=norm_bias,
+            mean=x_mean,
+            rstd=x_rstd,
+            learnable=True,
+            eps=ctx.norm_eps,
+            BLOCK_D=ctx.norm_BLOCK_D,
+        )
+        # pyre-ignore[7]
+        return (
+            d_x,
+            d_norm_weight,
+            d_norm_bias,
+            None,
+            None,
+            None,
+            None,
+            d_uvqk_weight,
+            d_uvqk_bias if ctx.has_uvqk_bias else None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/expand_1d_jagged_to_dense.cpp b/recommendation_v4/generative_recommenders/ops/cpp/expand_1d_jagged_to_dense.cpp
new file mode 100644
index 000000000..4730078e6
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/expand_1d_jagged_to_dense.cpp
@@ -0,0 +1,97 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/Dispatch.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/extension.h>
+#include <torch/library.h>
+
+namespace hstu {
+
+template <typename index_t, typename val_t>
+void expand_1d_jagged_to_dense_cpu_kernel_(
+    int64_t B,
+    int64_t max_len,
+    const at::TensorAccessor<val_t, 1>& values,
+    const at::TensorAccessor<index_t, 1>& offsets,
+    at::TensorAccessor<val_t, 2> output) {
+  for (auto i : c10::irange(B)) {
+    int64_t begin = offsets[i];
+    int64_t end = offsets[i + 1];
+    if (end - begin == 0) {
+      for (int64_t j : c10::irange(max_len)) {
+        output[i][j] = 0;
+        continue;
+      }
+    } else {
+      int64_t j = 0;
+      for (; j < std::min(end - begin, max_len); ++j) {
+        output[i][j] = values[begin + j];
+      }
+      for (; j < max_len; ++j) {
+        output[i][j] = values[end - 1];
+      }
+    }
+  } // for each i
+}
+
+at::Tensor expand_1d_jagged_to_dense_cpu(
+    const at::Tensor& values,
+    const at::Tensor& offsets,
+    const int64_t max_len) {
+  TORCH_INTERNAL_ASSERT(values.device().type() == at::DeviceType::CPU);
+  TORCH_INTERNAL_ASSERT(offsets.device().type() == at::DeviceType::CPU);
+  TORCH_CHECK(values.numel() < std::numeric_limits<int32_t>::max());
+  TORCH_CHECK(max_len >= 0);
+  auto B = offsets.size(0) - 1;
+  auto output = at::empty({B, max_len}, values.options());
+  if (values.numel() == 0 || max_len == 0) {
+    return output;
+  }
+  AT_DISPATCH_ALL_TYPES_AND2(
+      at::ScalarType::BFloat16,
+      at::ScalarType::Half,
+      values.scalar_type(),
+      "expand_1d_jagged_to_dense_cpu_input1",
+      [&] {
+        using val_t = scalar_t;
+        AT_DISPATCH_INTEGRAL_TYPES(
+            offsets.scalar_type(), "expand_1d_jagged_to_dense_cpu_input2", [&] {
+              using index_t = scalar_t;
+              expand_1d_jagged_to_dense_cpu_kernel_<index_t, val_t>(
+                  B,
+                  max_len,
+                  values.accessor<val_t, 1>(),
+                  offsets.accessor<index_t, 1>(),
+                  output.accessor<val_t, 2>());
+            });
+      });
+  return output;
+}
+
+at::Tensor expand_1d_jagged_to_dense_meta(
+    const at::Tensor& values,
+    const at::Tensor& offsets,
+    const c10::SymInt max_len) {
+  auto B = offsets.sym_size(0) - 1;
+  auto output = at::empty_symint({B, max_len}, values.options());
+  return output;
+}
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/expand_1d_jagged_to_dense.cu b/recommendation_v4/generative_recommenders/ops/cpp/expand_1d_jagged_to_dense.cu
new file mode 100644
index 000000000..aa3678d2b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/expand_1d_jagged_to_dense.cu
@@ -0,0 +1,103 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/Dispatch.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/library.h>
+
+#include "common.h"
+
+static constexpr int32_t kMaxThreads = 1024;
+
+namespace hstu {
+
+template <typename index_t, typename val_t>
+__global__
+__launch_bounds__(kMaxThreads) void expand_1d_jagged_to_dense_cuda_kernel_(
+    int64_t B,
+    int64_t max_len,
+    const at::PackedTensorAccessor32<val_t, 1, at::RestrictPtrTraits> values,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits> offsets,
+    at::PackedTensorAccessor32<val_t, 2, at::RestrictPtrTraits> output) {
+  int64_t b = blockIdx.y;
+  int64_t begin = offsets[b];
+  int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
+  int64_t end = offsets[b + 1];
+  if (end - begin == 0) {
+    if (i < max_len) {
+      output[b][i] = 0;
+    }
+  } else {
+    if (i < std::min(end - begin, max_len)) {
+      output[b][i] = values[i + begin];
+    } else if (i < max_len) {
+      output[b][i] = values[end - 1];
+    }
+  }
+}
+
+at::Tensor expand_1d_jagged_to_dense_cuda(
+    const at::Tensor& values,
+    const at::Tensor& offsets,
+    const int64_t max_len) {
+  at::cuda::OptionalCUDAGuard device_guard;
+  device_guard.set_index(values.get_device());
+  TORCH_INTERNAL_ASSERT(values.device().type() == at::DeviceType::CUDA);
+  TORCH_INTERNAL_ASSERT(offsets.device().type() == at::DeviceType::CUDA);
+  TORCH_CHECK(values.numel() < std::numeric_limits<int32_t>::max());
+  TORCH_CHECK(values.get_device() == offsets.get_device());
+  TORCH_CHECK(max_len >= 0);
+  auto B = offsets.size(0) - 1;
+  auto output = at::empty({B, max_len}, values.options());
+  if (values.numel() == 0 || max_len == 0) {
+    return output;
+  }
+  uint32_t nthreads_per_block = max_len > 64 ? 64 : max_len;
+  dim3 grid_size = dim3(div_round_up(max_len, nthreads_per_block), B);
+  AT_DISPATCH_ALL_TYPES_AND2(
+      at::ScalarType::BFloat16,
+      at::ScalarType::Half,
+      values.scalar_type(),
+      "expand_1d_jagged_to_dense_cuda_input1",
+      [&] {
+        using val_t = scalar_t;
+        AT_DISPATCH_INTEGRAL_TYPES(
+            offsets.scalar_type(),
+            "expand_1d_jagged_to_dense_cuda_input2",
+            [&] {
+              using index_t = scalar_t;
+              expand_1d_jagged_to_dense_cuda_kernel_<index_t, val_t><<<
+                  grid_size,
+                  nthreads_per_block,
+                  0,
+                  at::cuda::getCurrentCUDAStream()>>>(
+                  B,
+                  max_len,
+                  values.packed_accessor32<val_t, 1, at::RestrictPtrTraits>(),
+                  offsets
+                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+                  output.packed_accessor32<val_t, 2, at::RestrictPtrTraits>());
+              C10_CUDA_KERNEL_LAUNCH_CHECK();
+            });
+      });
+
+  return output;
+}
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/copy_sm90_bulk_reduce.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/copy_sm90_bulk_reduce.h
new file mode 100644
index 000000000..a22ae7745
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/copy_sm90_bulk_reduce.h
@@ -0,0 +1,66 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <cute/arch/copy_sm90_tma.hpp>
+
+namespace cute {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct SM90_BULK_REDUCE_ADD {
+  CUTE_HOST_DEVICE static void
+  copy(float const* smem_ptr, float* gmem_ptr, int32_t store_bytes) {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile(
+        "cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [%0], [%1], %2;\n"
+        :
+        : "l"(gmem_ptr), "r"(smem_int_ptr), "r"(store_bytes)
+        : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH(
+        "Trying to use BULK_REDUCE_ADD without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+  }
+
+  CUTE_HOST_DEVICE static void copy(
+      float const* smem_ptr,
+      float* gmem_ptr,
+      int32_t store_bytes,
+      uint64_t cache_hint) {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile(
+        "cp.reduce.async.bulk.global.shared::cta.bulk_group.L2::cache_hint.add.f32 [%0], [%1], %2, %3;\n"
+        :
+        : "l"(gmem_ptr), "r"(smem_int_ptr), "r"(store_bytes), "l"(cache_hint)
+        : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH(
+        "Trying to use BULK_REDUCE_ADD without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // end namespace cute
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/epilogue_bwd.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/epilogue_bwd.h
new file mode 100644
index 000000000..833f3ae28
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/epilogue_bwd.h
@@ -0,0 +1,481 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <cutlass/barrier.h>
+#include <cutlass/cutlass.h>
+
+#include "cutlass/gemm/collective/builders/sm90_common.inl"
+
+#include "copy_sm90_bulk_reduce.h"
+#include "named_barrier.h"
+#include "seqlen.h"
+#include "utils.h"
+
+namespace hstu {
+
+using namespace cute;
+
+template <
+    class TileShape_MNK_,
+    class Element_,
+    class ArchTag_,
+    int NumEpilogueThreads_,
+    bool Jagged,
+    bool dKV_swapAB_,
+    int AtomLayoutKdKV = 1>
+struct CollectiveEpilogueBwd {
+  using TileShape_MNK = TileShape_MNK_;
+  using Element = Element_;
+  using ArchTag = ArchTag_;
+  static constexpr int NumEpilogueThreads = NumEpilogueThreads_;
+  static constexpr bool dKV_swapAB = dKV_swapAB_;
+  static constexpr bool Use_TMA =
+      !Jagged && ArchTag::kMinComputeCapability >= 90;
+
+  static_assert(ArchTag::kMinComputeCapability >= 80);
+
+  using GmemTiledCopydKVTMA = cute::SM90_TMA_STORE;
+
+  // These are for storing the output tensor without TMA (e.g., for setting
+  // output to zero)
+  static constexpr int kGmemElemsPerLoad =
+      sizeof(cute::uint128_t) / sizeof(Element);
+  static_assert(
+      get<2>(TileShape_MNK{}) % kGmemElemsPerLoad == 0,
+      "Headdim must be a multiple of kGmemElemsPerLoad");
+  static constexpr int kHeadDim = get<2>(TileShape_MNK{});
+  static constexpr int kGmemThreadsPerRow =
+      cutlass::gcd(kHeadDim / kGmemElemsPerLoad, NumEpilogueThreads);
+  static_assert(
+      NumEpilogueThreads % kGmemThreadsPerRow == 0,
+      "NumEpilogueThreads must be a multiple of kGmemThreadsPerRow");
+  using GmemLayoutAtom = Layout<
+      Shape<
+          Int<NumEpilogueThreads / kGmemThreadsPerRow>,
+          Int<kGmemThreadsPerRow>>,
+      Stride<Int<kGmemThreadsPerRow>, _1>>;
+  using GmemTiledCopydKV = decltype(make_tiled_copy(
+      Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, Element>{},
+      GmemLayoutAtom{},
+      Layout<Shape<_1, Int<kGmemElemsPerLoad>>>{})); // Val layout, 8 or 16 vals
+                                                     // per store
+
+  using SmemLayoutAtomdKVTMA =
+      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
+               GMMA::Major::K,
+               Element,
+               // TODO: do we have to change this if dKV_swapAB is true?
+               decltype(cute::get<1>(TileShape_MNK{})),
+               Int<CUTE_STATIC_V(cute::get<2>(TileShape_MNK{})) /
+                   AtomLayoutKdKV>>());
+  using SmemLayoutdKVTMA = decltype(tile_to_shape(
+      SmemLayoutAtomdKVTMA{},
+      select<1, 2>(TileShape_MNK{})));
+  using SmemLayoutdKVtTMA = decltype(cute::composition(
+      SmemLayoutdKVTMA{},
+      make_layout(
+          make_shape(get<2>(TileShape_MNK{}), get<1>(TileShape_MNK{})),
+          make_stride(decltype(get<1>(TileShape_MNK{})){}, _1{}))));
+
+  // If we don't use TMA
+  static constexpr int kBlockKSmem =
+      kHeadDim % 64 == 0 ? 64 : (kHeadDim % 32 == 0 ? 32 : 16);
+  static constexpr int kSwizzle =
+      kBlockKSmem == 64 ? 3 : (kBlockKSmem == 32 ? 2 : 1);
+  using SmemLayoutAtomdKVSTG = decltype(composition(
+      Swizzle<kSwizzle, 3, 3>{},
+      Layout<Shape<Int<8>, Int<kBlockKSmem>>, Stride<Int<kBlockKSmem>, _1>>{}));
+
+  using SmemLayoutAtomdKV =
+      std::conditional_t<Use_TMA, SmemLayoutAtomdKVTMA, SmemLayoutAtomdKVSTG>;
+  using SmemLayoutdKV = decltype(tile_to_shape(
+      SmemLayoutAtomdKV{},
+      select<1, 2>(TileShape_MNK{})));
+  using SmemLayoutdKVt = decltype(cute::composition(
+      SmemLayoutdKV{},
+      make_layout(
+          make_shape(get<2>(TileShape_MNK{}), get<1>(TileShape_MNK{})),
+          make_stride(decltype(get<1>(TileShape_MNK{})){}, _1{}))));
+
+  using SmemCopyAtomdKV = Copy_Atom<
+      std::conditional_t<
+          ArchTag::kMinComputeCapability >= 90,
+          std::conditional_t<
+              !dKV_swapAB,
+              cute::SM90_U32x4_STSM_N,
+              cute::SM90_U16x8_STSM_T>,
+          AutoVectorizingCopyWithAssumedAlignment<128>>,
+      Element>;
+
+  static constexpr size_t SmemAlignmentdKV =
+      ArchTag::kMinComputeCapability >= 90
+      ? cutlass::detail::alignment_for_swizzle(SmemLayoutdKV{})
+      : 128;
+  static_assert(SmemAlignmentdKV >= 128, "Require at least 128B alignment");
+
+  struct TensorStorage : cute::aligned_struct<SmemAlignmentdKV> {
+    cute::
+        array_aligned<Element, cute::cosize_v<SmemLayoutdKV>, SmemAlignmentdKV>
+            smem_dk;
+    cute::
+        array_aligned<Element, cute::cosize_v<SmemLayoutdKV>, SmemAlignmentdKV>
+            smem_dv;
+  };
+
+  using ShapedKV =
+      cute::Shape<int32_t, int32_t, int32_t, int32_t>; // (seqlen_k, d, head,
+                                                       // batch)
+  using StridedKV = cute::Stride<int64_t, _1, int64_t, int64_t>;
+
+  using TMA_dKV = std::conditional_t<
+      Use_TMA,
+      decltype(make_tma_copy(
+          GmemTiledCopydKVTMA{},
+          make_tensor(
+              make_gmem_ptr(static_cast<Element*>(nullptr)),
+              ShapedKV{},
+              StridedKV{}),
+          SmemLayoutdKVTMA{},
+          select<1, 2>(TileShape_MNK{}),
+          _1{})), // no mcast for dKV
+      std::nullptr_t>;
+
+  // Host side kernel arguments
+  struct Arguments {
+    Element* ptr_dK;
+    ShapedKV const shape_dK;
+    StridedKV const stride_dK;
+    Element* ptr_dV;
+    StridedKV const stride_dV;
+    int const num_heads_q;
+    int const* seq_offsets;
+  };
+
+  // Device side kernel params
+  struct Params {
+    Element* ptr_dK;
+    ShapedKV const shape_dK;
+    StridedKV const stride_dK;
+    Element* ptr_dV;
+    StridedKV const stride_dV;
+    TMA_dKV tma_store_dK, tma_store_dV;
+    int const* seq_offsets = nullptr;
+  };
+
+  static Params to_underlying_arguments(Arguments const& args) {
+    Tensor mdK =
+        make_tensor(make_gmem_ptr(args.ptr_dK), args.shape_dK, args.stride_dK);
+    Tensor mdV =
+        make_tensor(make_gmem_ptr(args.ptr_dV), args.shape_dK, args.stride_dV);
+    TMA_dKV tma_store_dK = [&] {
+      if constexpr (Use_TMA) {
+        return make_tma_copy(
+            GmemTiledCopydKVTMA{},
+            mdK,
+            SmemLayoutdKVTMA{},
+            select<1, 2>(TileShape_MNK{}),
+            _1{}); // no mcast for dKV
+      } else {
+        return nullptr;
+      }
+    }();
+    TMA_dKV tma_store_dV = [&] {
+      if constexpr (Use_TMA) {
+        return make_tma_copy(
+            GmemTiledCopydKVTMA{},
+            mdV,
+            SmemLayoutdKVTMA{},
+            select<1, 2>(TileShape_MNK{}),
+            _1{}); // no mcast for dKV
+      } else {
+        return nullptr;
+      }
+    }();
+    return {
+        args.ptr_dK,
+        args.shape_dK,
+        args.stride_dK,
+        args.ptr_dV,
+        args.stride_dV,
+        tma_store_dK,
+        tma_store_dV,
+        args.seq_offsets};
+  }
+
+  /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best
+  /// performance
+  CUTLASS_DEVICE
+  static void prefetch_tma_descriptors(Params const& params) {
+    if constexpr (Use_TMA) {
+      cute::prefetch_tma_descriptor(params.tma_store_dK.get_tma_descriptor());
+      cute::prefetch_tma_descriptor(params.tma_store_dV.get_tma_descriptor());
+    }
+  }
+
+  template <typename SharedStorage, typename FrgTensorO, typename TiledMma>
+  CUTLASS_DEVICE void store(
+      Params const& params,
+      FrgTensorO const& tdKrdK,
+      FrgTensorO const& tdVrdV,
+      SharedStorage& shared_storage,
+      TiledMma tiled_mma,
+      int thread_idx,
+      cute::tuple<int32_t, int32_t, int32_t> const& block_coord) {
+    auto [n_block, bidh, bidb] = block_coord;
+    Tensor sdK = cute::as_position_independent_swizzle_tensor(make_tensor(
+        make_smem_ptr(shared_storage.tensors.epilogue.smem_dk.data()),
+        SmemLayoutdKV{}));
+    Tensor sdV = cute::as_position_independent_swizzle_tensor(make_tensor(
+        make_smem_ptr(shared_storage.tensors.epilogue.smem_dv.data()),
+        SmemLayoutdKV{}));
+    Tensor sdKt = cute::as_position_independent_swizzle_tensor(make_tensor(
+        make_smem_ptr(shared_storage.tensors.epilogue.smem_dk.data()),
+        SmemLayoutdKVt{}));
+    Tensor sdVt = cute::as_position_independent_swizzle_tensor(make_tensor(
+        make_smem_ptr(shared_storage.tensors.epilogue.smem_dv.data()),
+        SmemLayoutdKVt{}));
+    auto smem_tiled_copy_dKV = make_tiled_copy_C(SmemCopyAtomdKV{}, tiled_mma);
+    auto smem_thr_copy_dKV = smem_tiled_copy_dKV.get_thread_slice(thread_idx);
+
+    Tensor tdVrdV_out = make_tensor_like<Element>(tdVrdV);
+    hstu::convert_type_out(tdVrdV, tdVrdV_out);
+    Tensor tdKrdK_out = make_tensor_like<Element>(tdKrdK);
+    hstu::convert_type_out(tdKrdK, tdKrdK_out);
+    Tensor taccdKrdK = smem_thr_copy_dKV.retile_S(
+        tdKrdK_out); // ((Atom,AtomNum), MMA_M, MMA_N)
+    Tensor taccdVrdV = smem_thr_copy_dKV.retile_S(
+        tdVrdV_out); // ((Atom,AtomNum), MMA_M, MMA_N)
+    // if (blockIdx.x == 0 && threadIdx.x == 128) { print(smem_thr_copy_dKV);
+    // print(sdK); printf("\n"); print(sdKt); printf("\n"); }
+    Tensor taccdKsdK = smem_thr_copy_dKV.partition_D(
+        cute::conditional_return<!dKV_swapAB>(
+            sdK, sdKt)); // ((Atom,AtomNum),PIPE_M,PIPE_N)
+    Tensor taccdVsdV = smem_thr_copy_dKV.partition_D(
+        cute::conditional_return<!dKV_swapAB>(
+            sdV, sdVt)); // ((Atom,AtomNum),PIPE_M,PIPE_N)
+
+    // Make sure all WGs have finished reading K and V
+    hstu::named_barrier_sync(
+        NumEpilogueThreads,
+        cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
+    cute::copy(smem_tiled_copy_dKV, taccdVrdV, taccdVsdV);
+    cute::copy(smem_tiled_copy_dKV, taccdKrdK, taccdKsdK);
+    if constexpr (Use_TMA) {
+      cutlass::arch::fence_view_async_shared(); // ensure smem writes are
+                                                // visible to TMA
+      cutlass::arch::NamedBarrier::arrive(
+          NumEpilogueThreads + cutlass::NumThreadsPerWarp,
+          cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
+
+      Tensor mdK = params.tma_store_dK.get_tma_tensor(params.shape_dK);
+      Tensor mdV = params.tma_store_dV.get_tma_tensor(params.shape_dK);
+      Tensor gdK = local_tile(
+          mdK(_, _, bidh, bidb),
+          select<1, 2>(TileShape_MNK{}),
+          make_coord(n_block, _0{})); // (M, K)
+      Tensor gdV = local_tile(
+          mdV(_, _, bidh, bidb),
+          select<1, 2>(TileShape_MNK{}),
+          make_coord(n_block, _0{})); // (M, K)
+      auto block_tma_dK = params.tma_store_dK.get_slice(_0{});
+      auto block_tma_dV = params.tma_store_dV.get_slice(_0{});
+      Tensor tdKgdK = block_tma_dK.partition_D(gdK); // (TMA, TMA_M, TMA_K)
+      Tensor tdKsdK = block_tma_dK.partition_S(sdK); // (TMA, TMA_M, TMA_K)
+      Tensor tdVgdV = block_tma_dV.partition_D(gdV); // (TMA, TMA_M, TMA_K)
+      Tensor tdVsdV = block_tma_dV.partition_S(sdV); // (TMA, TMA_M, TMA_K)
+      int warp_idx_sync =
+          __shfl_sync(0xffffffff, thread_idx / cutlass::NumThreadsPerWarp, 0);
+      if (warp_idx_sync ==
+          NumEpilogueThreads / cutlass::NumThreadsPerWarp - 1) {
+        cutlass::arch::NamedBarrier::sync(
+            NumEpilogueThreads + cutlass::NumThreadsPerWarp,
+            cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
+        if (cute::elect_one_sync()) {
+          cute::copy(params.tma_store_dV, tdVsdV, tdVgdV);
+          cute::copy(params.tma_store_dK, tdKsdK, tdKgdK);
+          tma_store_arrive();
+        }
+      }
+      tma_store_wait<0>();
+      // // Tell warp 0 that smem_k and smem_v are ready
+      // cutlass::arch::NamedBarrier::arrive(NumEpilogueThreads +
+      // cutlass::NumThreadsPerWarp,
+      // static_cast<uint32_t>(BwdNamedBarriers::KVEmpty) /*id*/);
+
+    } else {
+      hstu::named_barrier_sync(
+          NumEpilogueThreads,
+          cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
+      static constexpr int kBlockN = get<1>(TileShape_MNK{});
+      hstu::SeqlenInfo<Jagged, kBlockN> seqlen_info{
+          bidb, size<0>(params.shape_dK), params.seq_offsets};
+      Tensor mdK = make_tensor(
+          make_gmem_ptr(params.ptr_dK), params.shape_dK, params.stride_dK)(
+          _, _, bidh, !Jagged ? bidb : 0);
+      Tensor gdK = local_tile(
+          cute::domain_offset(make_coord(seqlen_info.offset, _0{}), mdK),
+          select<1, 2>(TileShape_MNK{}),
+          make_coord(n_block, _0{})); // (M, K)
+      Tensor mdV = make_tensor(
+          make_gmem_ptr(params.ptr_dV), params.shape_dK, params.stride_dV)(
+          _, _, bidh, !Jagged ? bidb : 0);
+      Tensor gdV = local_tile(
+          cute::domain_offset(make_coord(seqlen_info.offset, _0{}), mdV),
+          select<1, 2>(TileShape_MNK{}),
+          make_coord(n_block, _0{})); // (M, K)
+
+      GmemTiledCopydKV gmem_tiled_copy_dKV;
+      auto gmem_thr_copy_dKV = gmem_tiled_copy_dKV.get_thread_slice(thread_idx);
+      Tensor tdKVgdV = gmem_thr_copy_dKV.partition_D(gdV);
+      Tensor tdKVsdV =
+          gmem_thr_copy_dKV.partition_S(sdV); // (TMA, TMA_M, TMA_K)
+      Tensor tdKVgdK = gmem_thr_copy_dKV.partition_D(gdK);
+      Tensor tdKVsdK =
+          gmem_thr_copy_dKV.partition_S(sdK); // (TMA, TMA_M, TMA_K)
+      Tensor tdKVrdV = make_fragment_like(tdKVgdV);
+      Tensor tdKVrdK = make_fragment_like(tdKVgdK);
+      Tensor cdKV = cute::make_identity_tensor(
+          select<1, 2>(TileShape_MNK{})); // (BLK_M,BLK_K) -> (blk_m,blk_k)
+      // Repeat the partitioning with identity layouts
+      Tensor tdKVcdKV = gmem_thr_copy_dKV.partition_D(cdKV);
+      Tensor tdKVpdKV = make_tensor<bool>(make_shape(size<2>(tdKVgdV)));
+#pragma unroll
+      for (int k = 0; k < size(tdKVpdKV); ++k) {
+        tdKVpdKV(k) = get<1>(tdKVcdKV(_0{}, _0{}, k)) < get<1>(params.shape_dK);
+      }
+      // Need to check OOB when reading from smem if kBlockN isn't evenly tiled
+      static constexpr bool EvenN =
+          kBlockN % CUTE_STATIC_V(size<0>(GmemLayoutAtom{})) == 0;
+      hstu::copy<
+          /*Is_even_MN=*/EvenN,
+          /*Is_even_K=*/true,
+          /*Clear_OOB_MN=*/false>(
+          gmem_tiled_copy_dKV, tdKVsdV, tdKVrdV, tdKVcdKV, tdKVpdKV, kBlockN);
+      hstu::copy<
+          /*Is_even_MN=*/EvenN,
+          /*Is_even_K=*/true,
+          /*Clear_OOB_MN=*/false>(
+          gmem_tiled_copy_dKV, tdKVsdK, tdKVrdK, tdKVcdKV, tdKVpdKV, kBlockN);
+      // // Tell warp 0 that smem_k and smem_v are ready
+      // cutlass::arch::fence_view_async_shared(); // ensure smem reads are done
+      // before next TMA to smem_k/v
+      // hstu::named_barrier_arrive(NumEpilogueThreads +
+      // cutlass::NumThreadsPerWarp,
+      // static_cast<uint32_t>(BwdNamedBarriers::KVEmpty) /*id*/); Construct
+      // identity layout for gdKV Clear_OOB_K must be false since we don't want
+      // to write zeros to gmem
+      hstu::copy<
+          /*Is_even_MN=*/false,
+          /*Is_even_K=*/false,
+          /*Clear_OOB_MN=*/false,
+          /*Clear_OOB_K=*/false>(
+          gmem_tiled_copy_dKV,
+          tdKVrdV,
+          tdKVgdV,
+          tdKVcdKV,
+          tdKVpdKV,
+          std::min(seqlen_info.seqlen - n_block * kBlockN, kBlockN));
+      hstu::copy<
+          /*Is_even_MN=*/false,
+          /*Is_even_K=*/false,
+          /*Clear_OOB_MN=*/false,
+          /*Clear_OOB_K=*/false>(
+          gmem_tiled_copy_dKV,
+          tdKVrdK,
+          tdKVgdK,
+          tdKVcdKV,
+          tdKVpdKV,
+          std::min(seqlen_info.seqlen - n_block * kBlockN, kBlockN));
+    }
+  }
+
+  CUTLASS_DEVICE void store_tail() {
+    // if constexpr (Use_TMA) { tma_store_wait<0>(); }
+  }
+
+  // Write 0 to dK and dV
+  CUTLASS_DEVICE void store_zero(
+      Params const& params,
+      int thread_idx,
+      cute::tuple<int32_t, int32_t, int32_t> const& block_coord) {
+    static constexpr int kBlockN = get<1>(TileShape_MNK{});
+    auto [n_block, bidh, bidb] = block_coord;
+    hstu::SeqlenInfo<Jagged, kBlockN> seqlen_info{
+        bidb, size<0>(params.shape_dK), params.seq_offsets};
+    Tensor mdK = make_tensor(
+        make_gmem_ptr(params.ptr_dK), params.shape_dK, params.stride_dK)(
+        _, _, bidh, !Jagged ? bidb : 0);
+    Tensor gdK = local_tile(
+        cute::domain_offset(make_coord(seqlen_info.offset, _0{}), mdK),
+        select<1, 2>(TileShape_MNK{}),
+        make_coord(n_block, _0{})); // (M, K)
+    Tensor mdV = make_tensor(
+        make_gmem_ptr(params.ptr_dV), params.shape_dK, params.stride_dV)(
+        _, _, bidh, !Jagged ? bidb : 0);
+    Tensor gdV = local_tile(
+        cute::domain_offset(make_coord(seqlen_info.offset, _0{}), mdV),
+        select<1, 2>(TileShape_MNK{}),
+        make_coord(n_block, _0{})); // (M, K)
+
+    GmemTiledCopydKV gmem_tiled_copy_dKV;
+    auto gmem_thr_copy_dKV = gmem_tiled_copy_dKV.get_thread_slice(thread_idx);
+    Tensor tdKVgdK = gmem_thr_copy_dKV.partition_D(gdK);
+    Tensor tdKVgdV = gmem_thr_copy_dKV.partition_D(gdV);
+    Tensor tdKVrdKV = make_fragment_like(tdKVgdK);
+    clear(tdKVrdKV);
+    // Construct identity layout for gdKV
+    Tensor cdKV = cute::make_identity_tensor(
+        select<1, 2>(TileShape_MNK{})); // (BLK_M,BLK_K) -> (blk_m,blk_k)
+    // Repeat the partitioning with identity layouts
+    Tensor tdKVcdKV = gmem_thr_copy_dKV.partition_D(cdKV);
+    Tensor tdKVpdKV = make_tensor<bool>(make_shape(size<2>(tdKVgdK)));
+#pragma unroll
+    for (int k = 0; k < size(tdKVpdKV); ++k) {
+      tdKVpdKV(k) = get<1>(tdKVcdKV(_0{}, _0{}, k)) < get<1>(params.shape_dK);
+    }
+    // Clear_OOB_K must be false since we don't want to write zeros to gmem
+    hstu::copy<
+        /*Is_even_MN=*/false,
+        /*Is_even_K=*/false,
+        /*Clear_OOB_MN=*/false,
+        /*Clear_OOB_K=*/false>(
+        gmem_tiled_copy_dKV,
+        tdKVrdKV,
+        tdKVgdK,
+        tdKVcdKV,
+        tdKVpdKV,
+        seqlen_info.seqlen - n_block * kBlockN);
+    hstu::copy<
+        /*Is_even_MN=*/false,
+        /*Is_even_K=*/false,
+        /*Clear_OOB_MN=*/false,
+        /*Clear_OOB_K=*/false>(
+        gmem_tiled_copy_dKV,
+        tdKVrdKV,
+        tdKVgdV,
+        tdKVcdKV,
+        tdKVpdKV,
+        seqlen_info.seqlen - n_block * kBlockN);
+  }
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/epilogue_fwd.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/epilogue_fwd.h
new file mode 100644
index 000000000..c794a114c
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/epilogue_fwd.h
@@ -0,0 +1,550 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <cutlass/cutlass.h>
+#include <cutlass/fast_math.h> // For FastDivMod
+#include "cute/tensor.hpp"
+
+#include "cutlass/epilogue/collective/builders/sm90_common.inl"
+#include "cutlass/gemm/collective/builders/sm90_common.inl"
+
+#include "named_barrier.h"
+#include "seqlen.h"
+#include "utils.h"
+
+namespace hstu {
+
+using namespace cute;
+
+template <
+    class TileShape_MNK_,
+    class ClusterShape_,
+    class Element_,
+    class ArchTag_,
+    int NumEpilogueThreads_,
+    bool Jagged,
+    bool FP8PermuteCol = false>
+struct CollectiveEpilogueFwd {
+  using TileShape_MNK = TileShape_MNK_;
+  using ClusterShape = ClusterShape_;
+  using Element = Element_;
+  using ArchTag = ArchTag_;
+  static constexpr int NumEpilogueThreads = NumEpilogueThreads_;
+  static constexpr bool Use_smem = sizeof(Element) <= 2;
+  static constexpr bool Use_TMA_O =
+      ArchTag::kMinComputeCapability >= 90 && !Jagged && Use_smem;
+
+  static_assert(ArchTag::kMinComputeCapability >= 80);
+  static_assert(
+      ArchTag::kMinComputeCapability >= 90 ||
+      CUTE_STATIC_V(size(ClusterShape{})) == 1);
+
+  static constexpr int kBlockM = get<0>(TileShape_MNK{});
+  static constexpr int kHeadDim = get<2>(TileShape_MNK{});
+
+  using GmemTiledCopyOTMA = cute::SM90_TMA_STORE;
+
+  // These are for storing the output tensor without TMA (e.g., for setting
+  // output to zero)
+  static constexpr int kGmemElemsPerStore =
+      sizeof(cute::uint128_t) / sizeof(Element);
+  static_assert(
+      kHeadDim % kGmemElemsPerStore == 0,
+      "Headdim must be a multiple of kGmemElemsPerStore");
+  // We want each "row" to have 64 elements (128 bytes, i.e. 1 cache line). We
+  // want each thread to have 4 elements in the M direction and 2 elements in
+  // the K direction. In the case of PackGQA, this reduces the number of times
+  // we need to call divmod.
+  static constexpr int kBytePerRow = kHeadDim * sizeof(Element);
+  static constexpr int kBlockKGmem =
+      (kBytePerRow % 128 == 0 ? 128 : (kBytePerRow % 64 == 0 ? 64 : 32)) /
+      sizeof(Element);
+  // static constexpr int kBlockKGmem = kHeadDim % 128 == 0 ? 128 : (kHeadDim %
+  // 64 == 0 ? 64 : 32); static constexpr int kGmemThreadsPerRow =
+  // cutlass::gcd(kHeadDim / kGmemElemsPerStore, NumEpilogueThreads);
+  static constexpr int kGmemThreadsPerRow = kBlockKGmem / kGmemElemsPerStore;
+  // If PackGQA, we split the work of compute O_ptr among threads in the same
+  // row, so we need this to within a warp
+  static_assert(cutlass::NumThreadsPerWarp % kGmemThreadsPerRow == 0);
+  static_assert(
+      NumEpilogueThreads % kGmemThreadsPerRow == 0,
+      "NumEpilogueThreads must be a multiple of kGmemThreadsPerRow");
+  using GmemLayoutAtom = Layout<
+      Shape<
+          Int<NumEpilogueThreads / kGmemThreadsPerRow>,
+          Int<kGmemThreadsPerRow>>,
+      Stride<Int<kGmemThreadsPerRow>, _1>>;
+  static_assert(
+      kBlockM % CUTE_STATIC_V(shape<0>(GmemLayoutAtom{})) == 0,
+      "kBlockM must be a multiple of NumEpilogueThreads / kGmemThreadsPerRow");
+  using GmemTiledCopyO = decltype(make_tiled_copy(
+      Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, Element>{},
+      GmemLayoutAtom{},
+      Layout<Shape<_1, Int<kGmemElemsPerStore>>>{})); // Val layout, 8 or 16
+                                                      // vals per store
+
+  using SmemLayoutAtomOTMA =
+      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
+               GMMA::Major::K,
+               Element,
+               decltype(cute::get<0>(TileShape_MNK{})),
+               decltype(cute::get<2>(TileShape_MNK{}))>());
+  using SmemLayoutOTMA = decltype(tile_to_shape(
+      SmemLayoutAtomOTMA{},
+      select<0, 2>(TileShape_MNK{})));
+  static constexpr int kSwizzle = kBlockKGmem == 128
+      ? 4
+      : (kBlockKGmem == 64 ? 3 : (kBlockKGmem == 32 ? 2 : 1));
+  static constexpr int kSwizzleBase =
+      sizeof(Element) == 4 ? 2 : (sizeof(Element) == 2 ? 3 : 4);
+  using SmemLayoutAtomO = decltype(composition(
+      Swizzle<kSwizzle, kSwizzleBase, kSwizzleBase>{},
+      Layout<Shape<_8, Int<kBlockKGmem>>, Stride<Int<kBlockKGmem>, _1>>{}));
+  using SmemLayoutOSTS =
+      decltype(tile_to_shape(SmemLayoutAtomO{}, select<0, 2>(TileShape_MNK{})));
+  using SmemLayoutO = std::conditional_t<
+      ArchTag::kMinComputeCapability >= 90,
+      SmemLayoutOTMA,
+      SmemLayoutOSTS>;
+
+  using ShapeO =
+      cute::Shape<int32_t, int32_t, int32_t, int32_t, int32_t>; // (seqlen_q, d,
+                                                                // head, batch,
+                                                                // num_splits)
+  using StrideO = cute::Stride<int64_t, _1, int64_t, int64_t, int64_t>;
+  // ((qhead_per_khead, seqlen_q), d, nheads, batch, num_splits)
+  using ShapeOPacked = ShapeO;
+  using StrideOPacked = StrideO;
+  // ((qhead_per_khead, seqlen_q), nheads, batch, num_splits)
+  using StrideLSE =
+      cute::Stride<_1, int64_t, int64_t, int64_t>; // (seqlen_q, head, batch,
+  // num_splits)
+  using ShapeLSEPacked = cute::Shape<int32_t, int32_t, int32_t, int32_t>;
+  using StrideLSEPacked = StrideLSE;
+  using EpilogueTile_MN = decltype(select<0, 1>(TileShape_MNK{}));
+  using CopyOpR2S = std::conditional_t<
+      ArchTag::kMinComputeCapability >= 90,
+      // cute::SM90_U32x4_STSM_N if Element size is 2 bytes (fp16, bf16)
+      decltype(cutlass::epilogue::collective::detail::
+                   sm90_get_smem_store_op_for_accumulator<
+                       StrideO,
+                       Element,
+                       EpilogueTile_MN>()),
+      AutoVectorizingCopyWithAssumedAlignment<128>>;
+  using SmemCopyAtomO = Copy_Atom<CopyOpR2S, Element>;
+
+  // static constexpr size_t SmemAlignmentO =
+  // cutlass::detail::alignment_for_swizzle(SmemLayoutO{});
+  // static_assert(SmemAlignmentO >= 128, "Require at least 128B alignment");
+  // struct TensorStorage : cute::aligned_struct<SmemAlignmentO> {
+  //     cute::array_aligned<Element, Use_smem ? cute::cosize_v<SmemLayoutO> :
+  //     0, SmemAlignmentO> smem_o;
+  // };
+  struct TensorStorage : cute::aligned_struct<128> {
+    cute::array_aligned<Element, Use_smem ? cute::cosize_v<SmemLayoutO> : 0>
+        smem_o;
+  };
+
+  using TMA_O = std::conditional_t<
+      Use_TMA_O,
+      decltype(make_tma_copy(
+          GmemTiledCopyOTMA{},
+          make_tensor(
+              make_gmem_ptr(static_cast<Element*>(nullptr)),
+              ShapeO{},
+              StrideO{}),
+          SmemLayoutOTMA{},
+          select<0, 2>(TileShape_MNK{}),
+          _1{})), // no mcast for O
+      std::nullptr_t>;
+
+  // Host side kernel arguments
+  struct Arguments {
+    Element* ptr_O;
+    ShapeO const shape_O;
+    StrideO const stride_O;
+    int32_t const nheads;
+    int32_t const num_softmax_heads;
+    StrideLSE const stride_lse;
+    float* ptr_lse = nullptr;
+    int const* seq_offsets = nullptr;
+  };
+
+  // Device side kernel params
+  struct Params {
+    Element* ptr_O;
+    ShapeO const shape_O;
+    StrideO const stride_O;
+    ShapeOPacked const shape_O_packed;
+    StrideOPacked const stride_O_packed;
+    float* ptr_lse;
+    StrideLSE const stride_lse;
+    ShapeLSEPacked const shape_lse_packed;
+    StrideLSEPacked const stride_lse_packed;
+    TMA_O tma_store_O;
+    int const* seq_offsets = nullptr;
+  };
+
+  static Params to_underlying_arguments(Arguments const& args) {
+    Tensor mO =
+        make_tensor(make_gmem_ptr(args.ptr_O), args.shape_O, args.stride_O);
+    TMA_O tma_store_O = [&] {
+      if constexpr (Use_TMA_O) {
+        return make_tma_copy(
+            GmemTiledCopyOTMA{},
+            mO,
+            SmemLayoutO{},
+            select<0, 2>(TileShape_MNK{}),
+            _1{}); // no mcast
+      } else {
+        return nullptr;
+      }
+    }();
+    // If PackGQA, reshape O to be ((qhead_per_khead, seqlen_q), head_size,
+    // nhead_k, batch_size, num_splits)
+    int const qhead_per_khead = 1;
+    auto const shape_O_packed = cute::conditional_return<true>(
+        args.shape_O,
+        make_shape(
+            make_shape(qhead_per_khead, get<0>(args.shape_O)),
+            get<1>(args.shape_O),
+            args.nheads,
+            get<3>(args.shape_O),
+            get<4>(args.shape_O)));
+    auto const stride_O_packed = cute::conditional_return<true>(
+        args.stride_O,
+        make_stride(
+            make_stride(get<2>(args.stride_O), get<0>(args.stride_O)),
+            get<1>(args.stride_O),
+            get<2>(args.stride_O) * qhead_per_khead,
+            get<3>(args.stride_O),
+            get<4>(args.stride_O)));
+    auto const shape_lse_packed = select<0, 2, 3, 4>(args.shape_O);
+    auto const stride_lse_packed = args.stride_lse;
+    return {
+        args.ptr_O,
+        args.shape_O,
+        args.stride_O,
+        shape_O_packed,
+        stride_O_packed,
+        args.ptr_lse,
+        args.stride_lse,
+        shape_lse_packed,
+        stride_lse_packed,
+        tma_store_O,
+        args.seq_offsets};
+  }
+
+  /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best
+  /// performance
+  CUTLASS_DEVICE
+  static void prefetch_tma_descriptors(Params const& params) {
+    if constexpr (Use_TMA_O) {
+      cute::prefetch_tma_descriptor(params.tma_store_O.get_tma_descriptor());
+    }
+  }
+
+  template <typename SharedStorage, typename FrgTensorO, typename TiledMma>
+  CUTLASS_DEVICE void store(
+      Params const& params,
+      FrgTensorO const& tOrO,
+      SharedStorage& shared_storage,
+      TiledMma tiled_mma,
+      int thread_idx,
+      cute::tuple<int32_t, int32_t, int32_t, int32_t> const& block_coord) {
+    auto [m_block, bidh, bidb, split_idx] = block_coord;
+    Tensor sO = make_tensor(
+        make_smem_ptr(shared_storage.tensors.epilogue.smem_o.data()),
+        SmemLayoutO{});
+    // Tensor sO_pi = cute::as_position_independent_swizzle_tensor(sO);
+
+    Tensor tOrO_out = make_tensor_like<Element>(tOrO);
+    hstu::convert_type_out(tOrO, tOrO_out);
+    if constexpr (
+        FP8PermuteCol && (sizeof(Element) == 2 || sizeof(Element) == 4)) {
+      hstu::permute_output_fp8_Vcolmajor(tOrO_out);
+    }
+
+    // Make sure all WGs have finished reading V
+    // Technically we don't need this if we're not using smem, but the mainloop
+    // makes the assumption that all epilogue threads sync at least once during
+    // the epilogue (so that we can start loading Q with cp.async if we need).
+    hstu::named_barrier_sync(
+        NumEpilogueThreads,
+        cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
+
+    // Step 1: Write O from rmem -> smem
+    if constexpr (Use_smem) {
+      auto smem_tiled_copy_O = make_tiled_copy_C(SmemCopyAtomO{}, tiled_mma);
+      auto smem_thr_copy_O = smem_tiled_copy_O.get_thread_slice(thread_idx);
+      Tensor taccOrO =
+          smem_thr_copy_O.retile_S(tOrO_out); // ((Atom,AtomNum), MMA_M, MMA_N)
+      Tensor taccOsO =
+          smem_thr_copy_O.partition_D(sO); // ((Atom,AtomNum),PIPE_M,PIPE_N)
+      // Tensor taccOsO = smem_thr_copy_O.partition_D(sO_pi);     //
+      // ((Atom,AtomNum),PIPE_M,PIPE_N)
+      cute::copy(smem_tiled_copy_O, taccOrO, taccOsO);
+      if constexpr (Use_TMA_O) {
+        cutlass::arch::fence_view_async_shared(); // ensure smem writes are
+                                                  // visible to TMA
+        cutlass::arch::NamedBarrier::arrive(
+            NumEpilogueThreads + cutlass::NumThreadsPerWarp,
+            cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
+      } else {
+        hstu::named_barrier_sync(
+            NumEpilogueThreads,
+            cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
+      }
+    } else {
+      if constexpr (ArchTag::kMinComputeCapability >= 90) {
+#pragma unroll
+        for (uint32_t cta_id = 0; cta_id < size(ClusterShape{}); ++cta_id) {
+          shared_storage.pipelines.barrier_O.arrive(cta_id);
+        }
+      }
+    }
+
+    hstu::SeqlenInfo<Jagged, kBlockM> seqlen_info{
+        bidb, size<0>(params.shape_O), params.seq_offsets};
+    int offset_o = seqlen_info.offset;
+    int seqlen_o = seqlen_info.seqlen;
+
+    // Step 2: Write LSE from rmem -> gmem
+    auto thread_mma = tiled_mma.get_thread_slice(thread_idx);
+    // (MMA,MMA_M,MMA_K)
+    Tensor taccOcO = thread_mma.partition_C(
+        cute::make_identity_tensor(select<0, 2>(TileShape_MNK{})));
+    static_assert(decltype(size<0, 0>(taccOcO))::value == 2);
+    static_assert(decltype(size<0, 1>(taccOcO))::value == 2);
+    Tensor taccOcO_rowcol = make_tensor(
+        taccOcO.data(), hstu::convert_layout_acc_rowcol(taccOcO.layout()));
+    Tensor taccOcO_row = taccOcO_rowcol(_, _0{});
+    // Step 3: Write O from smem -> gmem
+    if constexpr (Use_TMA_O) {
+      Tensor mO = params.tma_store_O.get_tma_tensor(params.shape_O)(
+          _, _, bidh, bidb, split_idx);
+      Tensor gO = local_tile(
+          mO,
+          select<0, 2>(TileShape_MNK{}),
+          make_coord(m_block, _0{})); // (M, K)
+      auto block_tma_O = params.tma_store_O.get_slice(_0{});
+      Tensor tOgO = block_tma_O.partition_D(gO); // (TMA, TMA_M, TMA_K)
+      Tensor tOsO = block_tma_O.partition_S(sO); // (TMA, TMA_M, TMA_K)
+      int warp_idx_sync =
+          __shfl_sync(0xffffffff, thread_idx / cutlass::NumThreadsPerWarp, 0);
+      if (warp_idx_sync ==
+          NumEpilogueThreads / cutlass::NumThreadsPerWarp - 1) {
+        cutlass::arch::NamedBarrier::sync(
+            NumEpilogueThreads + cutlass::NumThreadsPerWarp,
+            cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
+        if (cute::elect_one_sync()) {
+          cute::copy(params.tma_store_O, tOsO, tOgO);
+          tma_store_arrive();
+          tma_store_wait<0>();
+#pragma unroll
+          for (uint32_t cta_id = 0; cta_id < size(ClusterShape{}); ++cta_id) {
+            shared_storage.pipelines.barrier_O.arrive(cta_id);
+          }
+        }
+      }
+    } else { // Don't use TMA in Jagged case since we don't want to overwrite
+             // the output of another sequence
+      Tensor mO = make_tensor(
+          make_gmem_ptr(params.ptr_O + offset_o * get<0>(params.stride_O)),
+          params.shape_O_packed,
+          params.stride_O_packed)(_, _, bidh, !Jagged ? bidb : 0, split_idx);
+      Tensor gO = local_tile(
+          mO,
+          select<0, 2>(TileShape_MNK{}),
+          make_coord(m_block, _0{})); // (M, K)
+      // if (thread_idx == 0) { printf("Before O write, m_block: %d, bidh: %d,
+      // bidb: %d, split_idx: %d, offset_o: %d, seqlen_o: %d, mO_addr = %p, addr
+      // diff = %d\n", m_block, bidh, bidb, split_idx, offset_o, seqlen_o,
+      // mO.data(), reinterpret_cast<int>(&mO(0)) -
+      // reinterpret_cast<int>(params.ptr_O)); }
+      if constexpr (Use_smem) {
+        GmemTiledCopyO gmem_tiled_copy_O;
+        auto gmem_thr_copy_O = gmem_tiled_copy_O.get_thread_slice(thread_idx);
+        Tensor tOsO =
+            gmem_thr_copy_O.partition_S(sO); // ((Atom,AtomNum),ATOM_M,ATOM_N)
+        // Tensor tOsO = gmem_thr_copy_O.partition_S(sO_pi);        //
+        // ((Atom,AtomNum),ATOM_M,ATOM_N)
+        Tensor tOrO = make_fragment_like(tOsO);
+        cute::copy(gmem_tiled_copy_O, tOsO, tOrO);
+        if constexpr (ArchTag::kMinComputeCapability >= 90) {
+          cutlass::arch::fence_view_async_shared(); // ensure smem reads are
+                                                    // done before next TMA to
+                                                    // smem_v
+#pragma unroll
+          for (uint32_t cta_id = 0; cta_id < size(ClusterShape{}); ++cta_id) {
+            shared_storage.pipelines.barrier_O.arrive(cta_id);
+          }
+        }
+        // (BLK_M,BLK_K) -> (blk_m,blk_k)
+        Tensor tOcO = gmem_thr_copy_O.partition_D(
+            cute::make_identity_tensor(select<0, 2>(TileShape_MNK{})));
+        Tensor tOpO = make_tensor<bool>(make_shape(size<2>(tOsO)));
+#pragma unroll
+        for (int k = 0; k < size(tOpO); ++k) {
+          tOpO(k) = get<1>(tOcO(_0{}, _0{}, k)) < get<1>(params.shape_O);
+        }
+        Tensor tOgO = gmem_thr_copy_O.partition_D(gO);
+        // Clear_OOB_K must be false since we don't want to write zeros to
+        // gmem
+        hstu::copy<
+            /*Is_even_MN=*/false,
+            /*Is_even_K=*/false,
+            /*Clear_OOB_MN=*/false,
+            /*Clear_OOB_K=*/false>(
+            gmem_tiled_copy_O,
+            tOrO,
+            tOgO,
+            tOcO,
+            tOpO,
+            seqlen_o - m_block * kBlockM);
+      } else {
+        // We already arrived on barrier_O earlier
+        static constexpr int kGmemElemsPerStoreDirect = 2;
+        cute::Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, Element>
+            gmem_copy_direct;
+        // Reshape acc from ((2, 2, V), MMA_M, MMA_N) to (nrow=(2, MMA_M),
+        // ncol=(2, V, MMA_N))
+        Tensor tOrO_rowcol = make_tensor(
+            tOrO_out.data(), hstu::convert_layout_acc_rowcol(tOrO.layout()));
+        Tensor tOrO_copy = cute::tiled_divide(
+            tOrO_rowcol, Shape<_1, Int<kGmemElemsPerStoreDirect>>{});
+        Tensor tOgO = thread_mma.partition_C(gO);
+        Tensor tOgO_rowcol = make_tensor(
+            tOgO.data(), hstu::convert_layout_acc_rowcol(tOgO.layout()));
+        Tensor tOgO_copy = cute::tiled_divide(
+            tOgO_rowcol, Shape<_1, Int<kGmemElemsPerStoreDirect>>{});
+        Tensor taccOcO_col = taccOcO_rowcol(_0{}, _);
+#pragma unroll
+        for (int m = 0; m < size(taccOcO_row); ++m) {
+          if (get<0>(taccOcO_row(m)) < seqlen_o - m_block * kBlockM) {
+#pragma unroll
+            for (int k = 0; k < size(taccOcO_col) / kGmemElemsPerStoreDirect;
+                 ++k) {
+              if (get<1>(taccOcO_col(k * kGmemElemsPerStoreDirect)) <
+                  get<1>(params.shape_O)) {
+                cute::copy(
+                    gmem_copy_direct, tOrO_copy(_, m, k), tOgO_copy(_, m, k));
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  template <typename FrgTensorLSE, typename TiledMma>
+  CUTLASS_DEVICE void store_softmax(
+      Params const& params,
+      FrgTensorLSE const& lse,
+      TiledMma tiled_mma,
+      int thread_idx,
+      cute::tuple<int32_t, int32_t, int32_t, int32_t> const& block_coord) {
+    auto [m_block, bidh, bidb, split_idx] = block_coord;
+    hstu::SeqlenInfo<Jagged, kBlockM> seqlen_info{
+        bidb, size<0>(params.shape_O), params.seq_offsets};
+    int offset_o = seqlen_info.offset;
+    int seqlen_o = seqlen_info.seqlen;
+    // Step 2: Write LSE from rmem -> gmem
+    auto thread_mma = tiled_mma.get_thread_slice(thread_idx);
+    // (MMA,MMA_M,MMA_K)
+    Tensor taccOcO = thread_mma.partition_C(
+        cute::make_identity_tensor(select<0, 2>(TileShape_MNK{})));
+    static_assert(decltype(size<0, 0>(taccOcO))::value == 2);
+    static_assert(decltype(size<0, 1>(taccOcO))::value == 2);
+    Tensor taccOcO_rowcol = make_tensor(
+        taccOcO.data(), hstu::convert_layout_acc_rowcol(taccOcO.layout()));
+    Tensor taccOcO_row = taccOcO_rowcol(_, _0{});
+    CUTE_STATIC_ASSERT_V(size(lse) == size(taccOcO_row)); // MMA_M
+    Tensor mLSE = make_tensor(
+        make_gmem_ptr(params.ptr_lse + offset_o * get<0>(params.stride_lse)),
+        params.shape_lse_packed,
+        params.stride_lse_packed)(_, bidh, !Jagged ? bidb : 0, 0);
+#pragma unroll
+    for (int mi = 0; mi < size(lse); ++mi) {
+      int const row = m_block * kBlockM + get<0>(taccOcO_row(mi));
+      if (get<1>(taccOcO_row(_0{})) == 0 && row < seqlen_o) {
+        mLSE(row) = lse(mi);
+      }
+    }
+  }
+
+  CUTLASS_DEVICE void store_tail() {
+    // Don't need to do tma_store_wait<0>() here since we already did in @store
+  }
+
+  // Write 0 to output and -inf to LSE
+  template <bool Clear_O = true>
+  CUTLASS_DEVICE void store_zero(
+      Params const& params,
+      int thread_idx,
+      cute::tuple<int32_t, int32_t, int32_t, int32_t> const& block_coord) {
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+    auto [m_block, bidh, bidb, split_idx] = block_coord;
+    hstu::SeqlenInfo<Jagged, kBlockM> seqlen_info{
+        bidb, size<0>(params.shape_O), params.seq_offsets};
+    int offset_o = seqlen_info.offset;
+    int seqlen_o = seqlen_info.seqlen;
+    Tensor mO = make_tensor(
+        make_gmem_ptr(params.ptr_O + offset_o * get<0>(params.stride_O)),
+        params.shape_O_packed,
+        params.stride_O_packed)(_, _, bidh, !Jagged ? bidb : 0, split_idx);
+
+    static_assert(kBlockM <= NumEpilogueThreads);
+    if constexpr (!Clear_O) {
+      return;
+    }
+
+    GmemTiledCopyO gmem_tiled_copy_O;
+    auto gmem_thr_copy_O = gmem_tiled_copy_O.get_thread_slice(thread_idx);
+    Tensor tOcO = gmem_thr_copy_O.partition_D(
+        cute::make_identity_tensor(select<0, 2>(TileShape_MNK{})));
+    Tensor tOpO = make_tensor<bool>(make_shape(size<2>(tOcO)));
+#pragma unroll
+    for (int k = 0; k < size(tOpO); ++k) {
+      tOpO(k) = get<1>(tOcO(_0{}, _0{}, k)) < get<1>(params.shape_O);
+    }
+    Tensor gO = local_tile(
+        mO, select<0, 2>(TileShape_MNK{}), make_coord(m_block, _0{})); // (M, K)
+    Tensor tOgO = gmem_thr_copy_O.partition_D(gO);
+    Tensor tOrO = make_fragment_like(tOgO);
+    cute::clear(tOrO);
+    // Clear_OOB_K must be false since we don't want to write zeros to gmem
+    hstu::copy<
+        /*Is_even_MN=*/false,
+        /*Is_even_K=*/false,
+        /*Clear_OOB_MN=*/false,
+        /*Clear_OOB_K=*/false>(
+        gmem_tiled_copy_O,
+        tOrO,
+        tOgO,
+        tOcO,
+        tOpO,
+        seqlen_o - m_block * kBlockM);
+  }
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash.h
new file mode 100644
index 000000000..ef37e3408
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash.h
@@ -0,0 +1,157 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuda.h>
+#include <vector>
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+namespace hstu {
+
+struct Qkv_params {
+  using index_t = int64_t;
+  // The QKV matrices.
+  void* __restrict__ q_ptr;
+  void* __restrict__ k_ptr;
+  void* __restrict__ v_ptr;
+
+  // The stride between rows of the Q, K and V matrices.
+  index_t q_batch_stride;
+  index_t k_batch_stride;
+  index_t v_batch_stride;
+  index_t q_row_stride;
+  index_t k_row_stride;
+  index_t v_row_stride;
+  index_t q_head_stride;
+  index_t k_head_stride;
+  index_t v_head_stride;
+  index_t v_dim_stride;
+
+  // The number of heads.
+  int h;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct Flash_fwd_params : public Qkv_params {
+  using index_t = int64_t;
+
+  // The O matrix (output).
+  void* __restrict__ o_ptr;
+
+  // The stride between rows of O.
+  index_t o_batch_stride;
+  index_t o_row_stride;
+  index_t o_head_stride;
+
+  // For FP8 scaling
+  float* __restrict__ q_descale_ptr;
+  float* __restrict__ k_descale_ptr;
+  float* __restrict__ v_descale_ptr;
+  index_t q_descale_batch_stride;
+  index_t q_descale_head_stride;
+  index_t k_descale_batch_stride;
+  index_t k_descale_head_stride;
+  index_t v_descale_batch_stride;
+  index_t v_descale_head_stride;
+
+  // The dimensions.
+  int b, max_kv_len, max_q_len, qk_d, v_d, total_seq_len_q, total_seq_len_kv;
+
+  // groups
+  int num_groups, batch_size_per_group;
+  int* __restrict__ max_seq_len_tensor;
+  int* __restrict__ contextual_seq_len_tensor;
+  int* __restrict__ max_attn_len_tensor;
+  int* __restrict__ min_full_attn_seq_len_tensor;
+
+  // The scaling factors for the kernel.
+  float alpha;
+
+  int* __restrict__ seq_offsets;
+  int* __restrict__ seq_offsets_q;
+  float* __restrict__ softmax_lse;
+  int* __restrict__ num_targets;
+  float* __restrict__ attn_scale;
+
+  // Local window size
+  int max_attn_len, contextual_seq_len, min_full_attn_seq_len,
+      num_softmax_heads;
+
+  // Pointer to the RNG seed (idx 0) and offset (idx 1).
+  uint64_t* rng_state;
+
+  bool is_bf16;
+  bool is_fp32;
+  bool is_e4m3;
+  bool is_causal;
+  bool is_local;
+  bool has_contexual_mask;
+  bool scalar_scale;
+  bool training;
+
+  int* __restrict__ tile_count_semaphore;
+
+  int arch;
+  int num_sm;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct Flash_bwd_params : public Flash_fwd_params {
+  using index_t = int64_t;
+
+  // The dO and dQKV matrices.
+  void* __restrict__ do_ptr;
+  void* __restrict__ dq_ptr;
+  void* __restrict__ dk_ptr;
+  void* __restrict__ dv_ptr;
+  float* __restrict__ softmax_lse_log2;
+  float* __restrict__ softmax_d;
+
+  // To accumulate dQ
+  void* __restrict__ dq_accum_ptr;
+  int* __restrict__ dq_semaphore;
+
+  // The stride between rows of the dO, dQ, dK and dV matrices.
+  index_t do_batch_stride;
+  index_t do_row_stride;
+  index_t do_head_stride;
+  index_t dq_batch_stride;
+  index_t dk_batch_stride;
+  index_t dv_batch_stride;
+  index_t dq_row_stride;
+  index_t dk_row_stride;
+  index_t dv_row_stride;
+  index_t dq_head_stride;
+  index_t dk_head_stride;
+  index_t dv_head_stride;
+
+  int* __restrict__ sort_by_length_indices;
+
+  int max_q_len_rounded, qk_d_rounded, v_d_rounded;
+
+  bool deterministic;
+  index_t dq_accum_split_stride;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <int Arch, typename T, int Headdim, bool Softmax>
+void run_mha_fwd_(Flash_fwd_params& params, cudaStream_t stream);
+template <int Arch, typename T, int Headdim, bool Softmax>
+void run_mha_bwd_(Flash_bwd_params& params, cudaStream_t stream);
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_api.cpp b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_api.cpp
new file mode 100644
index 000000000..389e6620f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_api.cpp
@@ -0,0 +1,322 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#include <ATen/cuda/CUDAContext.h>
+#include <Python.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <cutlass/numeric_types.h>
+#include <torch/csrc/autograd/custom_function.h>
+#include <torch/csrc/autograd/function.h>
+#include <torch/library.h> // @manual
+#include <torch/nn/functional.h>
+#include "flash_common.h"
+
+extern "C" {
+/* Creates a dummy empty _C module that can be imported from Python.
+    The import from Python will load the .so consisting of this file
+    in this extension, so that the TORCH_LIBRARY static initializers
+    below are run. */
+PyObject* PyInit__C(void) {
+  static struct PyModuleDef module_def = {
+      PyModuleDef_HEAD_INIT,
+      "_C", /* name of module */
+      NULL, /* module documentation, may be NULL */
+      -1, /* size of per-interpreter state of the module,
+              or -1 if the module keeps state in global variables. */
+      NULL, /* methods */
+  };
+  return PyModule_Create(&module_def);
+}
+}
+
+namespace hstu {
+
+class HSTUFlashAttentionFunctionGPU
+    : public torch::autograd::Function<HSTUFlashAttentionFunctionGPU> {
+ public:
+  static at::Tensor forward(
+      torch::autograd::AutogradContext* ctx,
+      int64_t max_seq_len,
+      double alpha,
+      at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
+      at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
+      at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
+      const std::optional<at::Tensor>& seq_offsets,
+      bool causal,
+      const std::optional<at::Tensor>& num_targets,
+      const std::optional<at::Tensor>& attn_scale,
+      int64_t max_attn_len,
+      int64_t min_full_attn_seq_len,
+      int64_t contextual_seq_len,
+      const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
+      const std::optional<at::Tensor>& k_descale, // (b, h_k)
+      const std::optional<at::Tensor>& v_descale, // (b, h_k)
+      bool sort_by_length,
+      bool deterministic,
+      const int64_t sm_margin,
+      int64_t max_q_len,
+      const std::optional<at::Tensor>& seq_offsets_q,
+      int64_t num_softmax_heads,
+      bool training,
+      const std::optional<at::Tensor>& max_seq_len_tensor = std::nullopt,
+      const std::optional<at::Tensor>& contextual_seq_len_tensor = std::nullopt,
+      const std::optional<at::Tensor>& max_attn_len_tensor = std::nullopt,
+      const std::optional<at::Tensor>& min_full_attn_seq_len_tensor =
+          std::nullopt,
+      int64_t num_groups = 1) {
+    ctx->saved_data["max_seq_len"] = max_seq_len;
+    ctx->saved_data["alpha"] = alpha;
+    ctx->saved_data["causal"] = causal;
+    ctx->saved_data["max_attn_len"] = max_attn_len;
+    ctx->saved_data["min_full_attn_seq_len"] = min_full_attn_seq_len;
+    ctx->saved_data["contextual_seq_len"] = contextual_seq_len;
+    ctx->saved_data["deterministic"] = deterministic;
+    ctx->saved_data["sort_by_length"] = sort_by_length;
+    ctx->saved_data["sm_margin"] = sm_margin;
+    ctx->saved_data["max_q_len"] = max_q_len;
+    ctx->saved_data["num_softmax_heads"] = num_softmax_heads;
+    ctx->saved_data["num_groups"] = num_groups;
+    auto fwd_out = hstu::hstu_mha_fwd(
+        max_seq_len, // max_seq_len
+        alpha, // alpha
+        q, // q
+        k, // k
+        v, // v
+        seq_offsets, // seq_offsets
+        causal, // causal
+        num_targets, // num_targets
+        attn_scale, // attn_scale
+        max_attn_len, // max_attn_len
+        min_full_attn_seq_len, // min_full_attn_seq_len
+        contextual_seq_len, // contextual_seq_len
+        q_descale, // q_descale
+        k_descale, // k_descale
+        v_descale, // v_descale
+        sm_margin, // sm_margin
+        max_q_len, // max_q_len
+        seq_offsets_q, // seq_offsets_q
+        num_softmax_heads, // num_softmax_heads
+        training,
+        max_seq_len_tensor,
+        contextual_seq_len_tensor,
+        max_attn_len_tensor,
+        min_full_attn_seq_len_tensor,
+        num_groups);
+    auto out = get<0>(fwd_out);
+    auto softmax_lse = get<1>(fwd_out);
+    ctx->save_for_backward(
+        {q,
+         k,
+         v,
+         out,
+         seq_offsets.value_or(at::Tensor()),
+         num_targets.value_or(at::Tensor()),
+         attn_scale.value_or(at::Tensor()),
+         seq_offsets_q.value_or(at::Tensor()),
+         softmax_lse.value_or(at::Tensor()),
+         max_seq_len_tensor.value_or(at::Tensor()),
+         contextual_seq_len_tensor.value_or(at::Tensor()),
+         max_attn_len_tensor.value_or(at::Tensor()),
+         min_full_attn_seq_len_tensor.value_or(at::Tensor())});
+    return out;
+  }
+
+  static torch::autograd::variable_list backward(
+      torch::autograd::AutogradContext* ctx,
+      torch::autograd::variable_list grad_outputs) {
+    auto saved_tensors = ctx->get_saved_variables();
+    auto saved_data = ctx->saved_data;
+    auto q = saved_tensors[0];
+    auto k = saved_tensors[1];
+    auto v = saved_tensors[2];
+    auto out = saved_tensors[3];
+    auto seq_offsets = saved_tensors[4];
+    auto num_targets = saved_tensors[5];
+    auto attn_scale = saved_tensors[6];
+    auto seq_offsets_q = saved_tensors[7];
+    auto softmax_lse = saved_tensors[8];
+    auto max_seq_len_tensor = saved_tensors[9];
+    auto contextual_seq_len_tensor = saved_tensors[10];
+    auto max_attn_len_tensor = saved_tensors[11];
+    auto min_full_attn_seq_len_tensor = saved_tensors[12];
+    auto seq_offsets_opt =
+        seq_offsets.defined() ? std::optional(seq_offsets) : std::nullopt;
+    auto num_targets_opt =
+        num_targets.defined() ? std::optional(num_targets) : std::nullopt;
+    auto attn_scale_opt =
+        attn_scale.defined() ? std::optional(attn_scale) : std::nullopt;
+    auto seq_offsets_q_opt =
+        seq_offsets_q.defined() ? std::optional(seq_offsets_q) : std::nullopt;
+    auto softmax_lse_opt =
+        softmax_lse.defined() ? std::optional(softmax_lse) : std::nullopt;
+    auto max_seq_len_tensor_opt = max_seq_len_tensor.defined()
+        ? std::optional(max_seq_len_tensor)
+        : std::nullopt;
+    auto contextual_seq_len_tensor_opt = contextual_seq_len_tensor.defined()
+        ? std::optional(contextual_seq_len_tensor)
+        : std::nullopt;
+    auto max_attn_len_tensor_opt = max_attn_len_tensor.defined()
+        ? std::optional(max_attn_len_tensor)
+        : std::nullopt;
+    auto min_full_attn_seq_len_tensor_opt =
+        min_full_attn_seq_len_tensor.defined()
+        ? std::optional(min_full_attn_seq_len_tensor)
+        : std::nullopt;
+
+    auto dq = at::empty_like(q);
+    auto dk = at::empty_like(k);
+    auto dv = at::empty_like(v);
+
+    auto bwd_res = hstu::hstu_mha_bwd(
+        saved_data["max_seq_len"].toInt(), // max_seq_len
+        saved_data["alpha"].toDouble(), // alpha
+        grad_outputs[0], // dout
+        q, // q
+        k, // k
+        v, // v
+        dq, // dq
+        dk, // dk
+        dv, // dv
+        out, // out
+        seq_offsets_opt, // seq_offsets
+        saved_data["causal"].toBool(), // causal
+        num_targets_opt, // num_targets
+        attn_scale_opt, // attn_scale
+        saved_data["max_attn_len"].toInt(), // max_attn_len
+        saved_data["min_full_attn_seq_len"].toInt(), // min_full_attn_seq_len
+        saved_data["contextual_seq_len"].toInt(), // contextual_seq_len
+        saved_data["sort_by_length"].toBool(), // sort_by_length
+        saved_data["deterministic"].toBool(), // deterministic
+        saved_data["sm_margin"].toInt(), // sm_margin
+        saved_data["max_q_len"].toInt(), // max_q_len
+        seq_offsets_q_opt, // seq_offsets_q
+        saved_data["num_softmax_heads"].toInt(), // num_softmax_heads
+        softmax_lse_opt,
+        max_seq_len_tensor_opt,
+        contextual_seq_len_tensor_opt,
+        max_attn_len_tensor_opt,
+        min_full_attn_seq_len_tensor_opt,
+        saved_data["num_groups"].toInt());
+
+    return {
+        torch::autograd::Variable(), // max_seq_len
+        torch::autograd::Variable(), // alpha
+        bwd_res[0], // dq
+        bwd_res[1], // dk
+        bwd_res[2], // dv
+        torch::autograd::Variable(), // seq_offsets
+        torch::autograd::Variable(), // causal
+        torch::autograd::Variable(), // num_targets
+        torch::autograd::Variable(), // attn_scale
+        torch::autograd::Variable(), // max_attn_len
+        torch::autograd::Variable(), // min_full_attn_seq_len
+        torch::autograd::Variable(), // contextual_seq_len
+        torch::autograd::Variable(), // q_descale
+        torch::autograd::Variable(), // k_descale
+        torch::autograd::Variable(), // v_descale
+        torch::autograd::Variable(), // sort_by_length
+        torch::autograd::Variable(), // deterministic
+        torch::autograd::Variable(), // sm_margin
+        torch::autograd::Variable(), // max_q_len
+        torch::autograd::Variable(), // seq_offsets_q
+        torch::autograd::Variable(), // num_softmax_heads
+        torch::autograd::Variable(), // training
+        torch::autograd::Variable(), // max_seq_len_tensor
+        torch::autograd::Variable(), // contextual_seq_len_tensor
+        torch::autograd::Variable(), // max_attn_len_tensor
+        torch::autograd::Variable(), // min_full_attn_seq_len_tensor
+        torch::autograd::Variable(), // num_groups
+    };
+  }
+};
+
+at::Tensor cuda_hstu_mha(
+    int64_t max_seq_len,
+    double alpha,
+    at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
+    const std::optional<at::Tensor>& seq_offsets,
+    bool causal,
+    const std::optional<at::Tensor>& num_targets,
+    const std::optional<at::Tensor>& attn_scale,
+    int64_t max_attn_len,
+    int64_t min_full_attn_seq_len,
+    int64_t contextual_seq_len,
+    const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
+    const std::optional<at::Tensor>& k_descale, // (b, h_k)
+    const std::optional<at::Tensor>& v_descale, // (b, h_k)
+    bool sort_by_length,
+    bool deterministic,
+    const int64_t sm_margin = 0,
+    int64_t max_q_len = 0,
+    const std::optional<at::Tensor>& seq_offsets_q = std::nullopt,
+    int64_t num_softmax_heads = 0,
+    bool training = true,
+    const std::optional<at::Tensor>& max_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& contextual_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& max_attn_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor =
+        std::nullopt,
+    int64_t num_groups = 1) {
+  return hstu::HSTUFlashAttentionFunctionGPU::apply(
+      max_seq_len,
+      alpha,
+      q,
+      k,
+      v,
+      seq_offsets,
+      causal,
+      num_targets,
+      attn_scale,
+      max_attn_len,
+      min_full_attn_seq_len,
+      contextual_seq_len,
+      q_descale,
+      k_descale,
+      v_descale,
+      sort_by_length,
+      deterministic,
+      sm_margin,
+      max_q_len,
+      seq_offsets_q,
+      num_softmax_heads,
+      training,
+      max_seq_len_tensor,
+      contextual_seq_len_tensor,
+      max_attn_len_tensor,
+      min_full_attn_seq_len_tensor,
+      num_groups);
+}
+
+TORCH_LIBRARY_FRAGMENT(hstu, m) {
+  m.impl(
+      "hstu_mha",
+      torch::dispatch(c10::DispatchKey::CUDA, TORCH_FN(cuda_hstu_mha)));
+
+  m.impl(
+      "hstu_mha_fwd",
+      torch::dispatch(c10::DispatchKey::CUDA, TORCH_FN(hstu::hstu_mha_fwd)));
+
+  m.impl(
+      "hstu_mha_bwd",
+      torch::dispatch(c10::DispatchKey::CUDA, TORCH_FN(hstu::hstu_mha_bwd)));
+}
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_api_cpu.cpp b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_api_cpu.cpp
new file mode 100644
index 000000000..c02424efe
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_api_cpu.cpp
@@ -0,0 +1,256 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#include <torch/library.h> // @manual
+#include <torch/nn/functional.h>
+#include "flash_common_cpu.h"
+
+namespace hstu {
+
+at::Tensor hstu_mha_cpu(
+    int64_t max_seq_len,
+    double alpha,
+    at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
+    const std::optional<at::Tensor>& seq_offsets,
+    bool causal,
+    const std::optional<at::Tensor>& num_targets,
+    const std::optional<at::Tensor>& attn_scale,
+    int64_t max_attn_len,
+    int64_t min_full_attn_seq_len,
+    int64_t contextual_seq_len,
+    const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
+    const std::optional<at::Tensor>& k_descale, // (b, h_k)
+    const std::optional<at::Tensor>& v_descale, // (b, h_k)
+    bool sort_by_length,
+    bool deterministic,
+    const int64_t sm_margin = 0,
+    int64_t max_q_len = 0,
+    const std::optional<at::Tensor>& seq_offsets_q = std::nullopt,
+    int64_t num_softmax_heads = 0,
+    bool training = true,
+    const std::optional<at::Tensor>& max_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& contextual_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& max_attn_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor =
+        std::nullopt,
+    int64_t num_groups = 1) {
+  auto fwd_out = hstu::hstu_mha_fwd_dummy(
+      max_seq_len,
+      alpha,
+      q,
+      k,
+      v,
+      seq_offsets,
+      causal,
+      num_targets,
+      attn_scale,
+      max_attn_len,
+      min_full_attn_seq_len,
+      contextual_seq_len,
+      q_descale,
+      k_descale,
+      v_descale,
+      sm_margin,
+      max_q_len,
+      seq_offsets_q,
+      num_softmax_heads,
+      training);
+  return get<0>(fwd_out);
+}
+
+at::Tensor hstu_mha_meta(
+    const at::SymInt max_seq_len,
+    double alpha,
+    at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
+    const std::optional<at::Tensor>& seq_offsets,
+    bool causal,
+    const std::optional<at::Tensor>& num_targets,
+    const std::optional<at::Tensor>& attn_scale,
+    int64_t max_attn_len,
+    int64_t min_full_attn_seq_len,
+    int64_t contextual_seq_len,
+    const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
+    const std::optional<at::Tensor>& k_descale, // (b, h_k)
+    const std::optional<at::Tensor>& v_descale, // (b, h_k)
+    bool sort_by_length,
+    bool deterministic,
+    const int64_t sm_margin = 0,
+    int64_t max_q_len = 0,
+    const std::optional<at::Tensor>& seq_offsets_q = std::nullopt,
+    int64_t num_softmax_heads = 0,
+    bool training = true,
+    const std::optional<at::Tensor>& max_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& contextual_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& max_attn_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor =
+        std::nullopt,
+    int64_t num_groups = 1) {
+  auto fwd_out = hstu::hstu_mha_fwd_meta(
+      max_seq_len,
+      alpha,
+      q,
+      k,
+      v,
+      seq_offsets,
+      causal,
+      num_targets,
+      attn_scale,
+      max_attn_len,
+      min_full_attn_seq_len,
+      contextual_seq_len,
+      q_descale,
+      k_descale,
+      v_descale,
+      sm_margin,
+      max_q_len,
+      seq_offsets_q,
+      num_softmax_heads,
+      training);
+  return get<0>(fwd_out);
+}
+
+// CPU-only implementation that registers under main hstu namespace
+// This provides fallback implementations when GPU code is not compiled
+TORCH_LIBRARY_FRAGMENT(hstu, m) {
+  // Only register operators if they haven't been registered by GPU code
+  // This allows CPU-only builds to work while GPU builds use GPU
+  // implementations
+
+  m.def(
+      "hstu_mha_fwd("
+      "SymInt max_seq_len, "
+      "float alpha, "
+      "Tensor q, "
+      "Tensor k, "
+      "Tensor v, "
+      "Tensor? seq_offsets, "
+      "bool causal, "
+      "Tensor? num_targets, "
+      "Tensor? attn_scale, "
+      "int max_attn_len, "
+      "int min_full_attn_seq_len, "
+      "int contextual_seq_len, "
+      "Tensor? q_descale, "
+      "Tensor? k_descale, "
+      "Tensor? v_descale, "
+      "int sm_margin = 0,"
+      "int max_q_len = 0,"
+      "Tensor? seq_offsets_q = None,"
+      "int num_softmax_heads = 0,"
+      "bool training = True,"
+      "Tensor? max_seq_len_tensor = None,"
+      "Tensor? contextual_seq_len_tensor = None,"
+      "Tensor? max_attn_len_tensor = None,"
+      "Tensor? min_full_attn_seq_len_tensor = None,"
+      "int num_groups = 1"
+      ") -> (Tensor, Tensor?)");
+
+  m.def(
+      "hstu_mha_bwd("
+      "int max_seq_len, "
+      "float alpha, "
+      "Tensor dout, "
+      "Tensor q, "
+      "Tensor k, "
+      "Tensor v, "
+      "Tensor dq, "
+      "Tensor dk, "
+      "Tensor dv, "
+      "Tensor out, "
+      "Tensor? seq_offsets, "
+      "bool causal, "
+      "Tensor? num_targets, "
+      "Tensor? attn_scale, "
+      "int max_attn_len, "
+      "int min_full_attn_seq_len, "
+      "int contextual_seq_len, "
+      "bool sort_by_length,"
+      "bool deterministic,"
+      "int sm_margin = 0,"
+      "int max_q_len = 0,"
+      "Tensor? seq_offsets_q = None,"
+      "int num_softmax_heads = 0,"
+      "Tensor? softmax_lse = None,"
+      "Tensor? max_seq_len_tensor = None,"
+      "Tensor? contextual_seq_len_tensor = None,"
+      "Tensor? max_attn_len_tensor = None,"
+      "Tensor? min_full_attn_seq_len_tensor = None,"
+      "int num_groups = 1"
+      ") -> Tensor[]");
+
+  m.def(
+      "hstu_mha("
+      "SymInt max_seq_len, "
+      "float alpha, "
+      "Tensor q, "
+      "Tensor k, "
+      "Tensor v, "
+      "Tensor? seq_offsets, "
+      "bool causal, "
+      "Tensor? num_targets, "
+      "Tensor? attn_scale, "
+      "int max_attn_len, "
+      "int min_full_attn_seq_len, "
+      "int contextual_seq_len, "
+      "Tensor? q_descale, "
+      "Tensor? k_descale, "
+      "Tensor? v_descale, "
+      "bool sort_by_length, "
+      "bool deterministic, "
+      "int sm_margin = 0,"
+      "int max_q_len = 0,"
+      "Tensor? seq_offsets_q = None,"
+      "int num_softmax_heads = 0,"
+      "bool training = True,"
+      "Tensor? max_seq_len_tensor = None,"
+      "Tensor? contextual_seq_len_tensor = None,"
+      "Tensor? max_attn_len_tensor = None,"
+      "Tensor? min_full_attn_seq_len_tensor = None,"
+      "int num_groups = 1"
+      ") -> Tensor");
+
+  // Register CPU implementations
+  m.impl(
+      "hstu_mha",
+      torch::dispatch(c10::DispatchKey::CPU, TORCH_FN(hstu_mha_cpu)));
+  m.impl(
+      "hstu_mha",
+      torch::dispatch(c10::DispatchKey::Meta, TORCH_FN(hstu_mha_meta)));
+
+  m.impl(
+      "hstu_mha_fwd",
+      torch::dispatch(
+          c10::DispatchKey::CPU, TORCH_FN(hstu::hstu_mha_fwd_dummy)));
+  m.impl(
+      "hstu_mha_fwd",
+      torch::dispatch(
+          c10::DispatchKey::Meta, TORCH_FN(hstu::hstu_mha_fwd_meta)));
+
+  m.impl(
+      "hstu_mha_bwd",
+      torch::dispatch(
+          c10::DispatchKey::CPU, TORCH_FN(hstu::hstu_mha_bwd_dummy)));
+}
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_kernel_sm90.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_kernel_sm90.h
new file mode 100644
index 000000000..051d5141b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_kernel_sm90.h
@@ -0,0 +1,402 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include "cute/tensor.hpp"
+
+#include <cutlass/arch/reg_reconfig.h>
+#include <cutlass/array.h>
+#include <cutlass/cutlass.h>
+#include <cutlass/kernel_hardware_info.h>
+#include <cutlass/numeric_conversion.h>
+#include <cutlass/numeric_types.h>
+#include "cutlass/pipeline/pipeline.hpp"
+
+#include "tile_scheduler.h"
+#include "utils.h"
+
+namespace hstu {
+
+using namespace cute;
+
+template <
+    bool Softmax,
+    class CollectiveMainloop_,
+    class CollectiveEpilogue_,
+    class TileScheduler_>
+class FlashAttnBwdSm90 {
+ public:
+  // Mainloop derived types
+  using CollectiveMainloop = CollectiveMainloop_;
+  using TileShape_MNK = typename CollectiveMainloop::TileShape_MNK;
+  using TiledMmaSdP = typename CollectiveMainloop::TiledMmaSdP;
+  using TiledMmadKV = typename CollectiveMainloop::TiledMmadKV;
+  using ArchTag = typename CollectiveMainloop::ArchTag;
+  using ClusterShape = typename CollectiveMainloop::ClusterShape;
+  using MainloopArguments = typename CollectiveMainloop::Arguments;
+  using MainloopParams = typename CollectiveMainloop::Params;
+  static constexpr bool dKV_swapAB = CollectiveMainloop::dKV_swapAB;
+
+  // Epilogue derived types
+  using CollectiveEpilogue = CollectiveEpilogue_;
+  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
+  using EpilogueParams = typename CollectiveEpilogue::Params;
+
+  static_assert(ArchTag::kMinComputeCapability >= 90);
+
+  using TileScheduler = TileScheduler_;
+  using TileSchedulerArguments = typename hstu::TileSchedulerArguments;
+  using TileSchedulerParams = typename TileScheduler::Params;
+
+  static constexpr uint32_t NumLoadWarpGroups = 1;
+  static constexpr uint32_t NumMmaWarpGroups =
+      CUTE_STATIC_V(size(TiledMmaSdP{})) / cutlass::NumThreadsPerWarpGroup;
+  static constexpr uint32_t MaxThreadsPerBlock =
+      CUTE_STATIC_V(size(TiledMmaSdP{})) +
+      (NumLoadWarpGroups * cutlass::NumThreadsPerWarpGroup);
+  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
+  static_assert(NumMmaWarpGroups == 2 || NumMmaWarpGroups == 3);
+
+  /// Register requirement for Load and Math WGs
+  static constexpr uint32_t LoadRegisterRequirement =
+      NumMmaWarpGroups == 2 ? 24 : 32;
+  static constexpr uint32_t MmaRegisterRequirement =
+      NumMmaWarpGroups == 2 ? 240 : 160;
+  // If you want to print from the producer warp, you'd need to increase the
+  // number of registers Otherwise you'll get CUDA error. static constexpr
+  // uint32_t LoadRegisterRequirement = 40; static constexpr uint32_t
+  // MmaRegisterRequirement = NumMmaWarpGroups == 2 ? 232 : 152;
+
+  // Kernel level shared memory storage
+  struct SharedStorage {
+    struct TensorStorage : cute::aligned_struct<128> {
+      union {
+        typename CollectiveMainloop::TensorStorage mainloop;
+        typename CollectiveEpilogue::TensorStorage epilogue;
+      };
+    } tensors;
+
+    struct PipelineStorage : cute::aligned_struct<16> {
+      alignas(16) cutlass::arch::ClusterTransactionBarrier barrier_KV;
+      alignas(16) typename CollectiveMainloop::MainloopPipeline::SharedStorage
+          pipeline_q;
+      alignas(16)
+          typename CollectiveMainloop::MainloopPipeline_dO::SharedStorage
+          pipeline_do;
+      alignas(16) typename TileScheduler::SharedStorage smem_scheduler;
+    } pipelines;
+  };
+
+  static constexpr int SharedStorageSize = sizeof(SharedStorage);
+
+  // Device side arguments
+  struct Arguments {
+    MainloopArguments mainloop{};
+    EpilogueArguments epilogue{};
+    cutlass::KernelHardwareInfo hw_info{};
+    TileSchedulerArguments scheduler{};
+  };
+
+  // Kernel entry point API
+  struct Params {
+    MainloopParams mainloop{};
+    EpilogueParams epilogue{};
+    cutlass::KernelHardwareInfo hw_info{};
+    TileSchedulerParams scheduler{};
+  };
+
+  //
+  // Methods
+  //
+
+  // Convert to underlying arguments. In this case, a simple copy for the
+  // aliased type.
+  static Params to_underlying_arguments(Arguments const& args) {
+    CUTLASS_TRACE_HOST("to_underlying_arguments():");
+
+    // Get SM count if needed, otherwise use user supplied SM count
+    int sm_count = args.hw_info.sm_count;
+    if (sm_count <= 0) {
+      CUTLASS_TRACE_HOST(
+          "  WARNING: Arguments do not include a valid SM count.\n"
+          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
+      sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
+          args.hw_info.device_id);
+    }
+
+    CUTLASS_TRACE_HOST(
+        "to_underlying_arguments(): Setting persistent grid SM count to "
+        << sm_count);
+
+    cutlass::KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count};
+    return {
+        CollectiveMainloop::to_underlying_arguments(args.mainloop),
+        CollectiveEpilogue::to_underlying_arguments(args.epilogue),
+        hw_info,
+        TileScheduler::to_underlying_arguments(args.scheduler)};
+  }
+
+  // Computes the kernel launch grid shape based on runtime parameters
+  static dim3 get_grid_shape(Params const& params) {
+    return TileScheduler::get_grid_shape(
+        params.scheduler, params.hw_info.sm_count);
+  }
+
+  static dim3 get_block_shape() {
+    return dim3(MaxThreadsPerBlock, 1, 1);
+  }
+
+  CUTLASS_DEVICE
+  void operator()(Params const& params, char* smem_buf) {
+    static constexpr int NumMmaThreads =
+        NumMmaWarpGroups * cutlass::NumThreadsPerWarpGroup;
+    static constexpr int NumCopyThreads =
+        NumLoadWarpGroups * cutlass::NumThreadsPerWarpGroup;
+
+    using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline;
+    using PipelineParams = typename MainloopPipeline::Params;
+    using PipelineState = typename MainloopPipeline::PipelineState;
+    using MainloopPipeline_dO =
+        typename CollectiveMainloop::MainloopPipeline_dO;
+    using PipelineParams_dO = typename MainloopPipeline_dO::Params;
+    using PipelineState_dO = typename MainloopPipeline_dO::PipelineState;
+    static constexpr bool Q_dO_same_stages =
+        std::is_same_v<MainloopPipeline, MainloopPipeline_dO>;
+
+    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+
+    int const lane_predicate = cute::elect_one_sync();
+    int const warp_idx = cutlass::canonical_warp_idx_sync();
+
+    // Issue Tma Descriptor Prefetch from a single thread
+    if (warp_idx == 0 && lane_predicate) {
+      CollectiveMainloop::prefetch_tma_descriptors(params.mainloop);
+      CollectiveEpilogue::prefetch_tma_descriptors(params.epilogue);
+    }
+
+    // Obtain warp index
+    int const warp_group_thread_idx =
+        threadIdx.x % cutlass::NumThreadsPerWarpGroup;
+
+    PipelineParams pipeline_params;
+    if constexpr (Softmax) {
+      pipeline_params.transaction_bytes =
+          CollectiveMainloop::TmaTransactionBytesQ +
+          CollectiveMainloop::TmaTransactionBytesLSE;
+    } else {
+      pipeline_params.transaction_bytes =
+          CollectiveMainloop::TmaTransactionBytesQ;
+    }
+    int warp_group_idx = cutlass::canonical_warp_group_idx();
+    pipeline_params.role = warp_group_idx == 0
+        ? MainloopPipeline::ThreadCategory::Producer
+        : MainloopPipeline::ThreadCategory::Consumer;
+    pipeline_params.is_leader = warp_group_thread_idx == 0;
+    pipeline_params.num_consumers = NumMmaThreads;
+
+    if (warp_idx == 0 && lane_predicate) {
+      shared_storage.pipelines.barrier_KV.init(1 /*numThreads*/);
+    }
+    // We're counting on pipeline_q to call cutlass::arch::fence_barrier_init();
+    MainloopPipeline pipeline_q(
+        shared_storage.pipelines.pipeline_q, pipeline_params, ClusterShape{});
+    auto role_dO = warp_group_idx == 0
+        ? MainloopPipeline_dO::ThreadCategory::Producer
+        : MainloopPipeline_dO::ThreadCategory::Consumer;
+    PipelineParams_dO pipeline_params_dO{
+        pipeline_params.transaction_bytes,
+        role_dO,
+        pipeline_params.is_leader,
+        pipeline_params.num_consumers};
+    MainloopPipeline_dO pipeline_do(
+        shared_storage.pipelines.pipeline_do,
+        cute::conditional_return<Q_dO_same_stages>(
+            pipeline_params, pipeline_params_dO),
+        ClusterShape{});
+
+    CollectiveMainloop collective_mainloop;
+    CollectiveEpilogue collective_epilogue;
+
+    // We need this to guarantee that the Pipeline init is visible to all
+    // producers and consumer blocks in the Cluster
+    if constexpr (size(ClusterShape{}) > 1) {
+      cute::cluster_arrive_relaxed();
+      cute::cluster_wait();
+    } else {
+      __syncthreads();
+    }
+
+    if (warp_group_idx == 0) { // Producer
+      cutlass::arch::warpgroup_reg_dealloc<LoadRegisterRequirement>();
+
+      int warp_idx_in_warpgroup =
+          __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0);
+      if (warp_idx_in_warpgroup == 0) { // Load K, V, and do TMA on Q and dO
+        PipelineState smem_pipe_write =
+            cutlass::make_producer_start_state<MainloopPipeline>();
+        PipelineState_dO smem_pipe_write_do =
+            cutlass::make_producer_start_state<MainloopPipeline_dO>();
+
+        TileScheduler scheduler(
+            reinterpret_cast<typename TileScheduler::SharedStorage*>(
+                &shared_storage.pipelines.smem_scheduler));
+        for (auto work_tile_info =
+                 scheduler.template get_initial_work</*IsProducerWarp=*/true>(
+                     params.scheduler);
+             work_tile_info.is_valid(params.scheduler);
+             work_tile_info =
+                 scheduler.template get_next_work</*IsProducerWarp=*/true>(
+                     params.scheduler, work_tile_info)) {
+          auto block_coord_ = work_tile_info.get_block_coord(params.scheduler);
+          auto [n_block, bidh, bidb, _ /*split_idx*/] = block_coord_;
+          cute::tuple<int32_t, int32_t, int32_t> block_coord = {
+              n_block, bidh, bidb};
+          auto scheduler_prefetch = [&scheduler, &params, &work_tile_info]() {
+            scheduler.prefetch_next_work(params.scheduler, work_tile_info);
+          };
+          collective_mainloop.load(
+              params.mainloop,
+              pipeline_q,
+              pipeline_do,
+              smem_pipe_write,
+              smem_pipe_write_do,
+              shared_storage,
+              scheduler_prefetch,
+              block_coord);
+        }
+        collective_mainloop.load_tail(
+            pipeline_q, pipeline_do, smem_pipe_write, smem_pipe_write_do);
+      } else if (warp_idx_in_warpgroup == 1) {
+        TileScheduler scheduler(
+            reinterpret_cast<typename TileScheduler::SharedStorage*>(
+                &shared_storage.pipelines.smem_scheduler));
+        for (auto work_tile_info =
+                 scheduler.template get_initial_work</*IsProducerWarp=*/false>(
+                     params.scheduler);
+             work_tile_info.is_valid(params.scheduler);
+             work_tile_info =
+                 scheduler.template get_next_work</*IsProducerWarp=*/false>(
+                     params.scheduler, work_tile_info)) {
+          auto block_coord_ = work_tile_info.get_block_coord(params.scheduler);
+          auto [n_block, bidh, bidb, _ /*split_idx*/] = block_coord_;
+          cute::tuple<int32_t, int32_t, int32_t> block_coord = {
+              n_block, bidh, bidb};
+          collective_mainloop.store_dq(
+              params.mainloop, shared_storage, block_coord);
+        }
+      }
+    } else { // Consumer
+      cutlass::arch::warpgroup_reg_alloc<MmaRegisterRequirement>();
+
+      TileScheduler scheduler(
+          reinterpret_cast<typename TileScheduler::SharedStorage*>(
+              &shared_storage.pipelines.smem_scheduler));
+      // Initialize matmul objects.
+      TiledMmadKV tiled_mma_dKV;
+
+      PipelineState smem_pipe_read;
+      PipelineState_dO smem_pipe_read_do;
+
+      collective_mainloop.mma_init();
+      scheduler.init_consumer();
+
+      int work_idx = 0;
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (auto work_tile_info =
+               scheduler.template get_initial_work</*IsProducerWarp=*/false>(
+                   params.scheduler);
+           work_tile_info.is_valid(params.scheduler);
+           work_tile_info =
+               scheduler.template get_next_work</*IsProducerWarp=*/false>(
+                   params.scheduler, work_tile_info)) {
+        auto block_coord_ = work_tile_info.get_block_coord(params.scheduler);
+        auto [n_block, bidh, bidb, _ /*split_idx*/] = block_coord_;
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+        if (threadIdx.x == 0 || threadIdx.x == 128) {
+          std::printf(
+              "n_block: (%d), bidh: (%d), bidb: (%d), blockIdx.x: (%d), blockIdx.y: (%d), blockIdx.z: (%d)\n",
+              n_block,
+              bidh,
+              bidb,
+              blockIdx.x,
+              blockIdx.y,
+              blockIdx.z);
+        }
+#endif
+        cute::tuple<int32_t, int32_t, int32_t> block_coord = {
+            n_block, bidh, bidb};
+
+        // dK and dV output accumulator.
+        Tensor tdKrdK = partition_fragment_C(
+            tiled_mma_dKV,
+            select<!dKV_swapAB ? 1 : 2, !dKV_swapAB ? 2 : 1>(TileShape_MNK{}));
+        Tensor tdVrdV = partition_fragment_C(
+            tiled_mma_dKV,
+            select<!dKV_swapAB ? 1 : 2, !dKV_swapAB ? 2 : 1>(TileShape_MNK{}));
+
+        bool tile_valid;
+        if constexpr (Softmax) {
+          tile_valid = collective_mainloop.mma_softmax(
+              params.mainloop,
+              pipeline_q,
+              pipeline_do,
+              smem_pipe_read,
+              smem_pipe_read_do,
+              tdKrdK,
+              tdVrdV,
+              threadIdx.x - NumCopyThreads,
+              work_idx,
+              block_coord,
+              shared_storage);
+        } else {
+          tile_valid = collective_mainloop.mma(
+              params.mainloop,
+              pipeline_q,
+              pipeline_do,
+              smem_pipe_read,
+              smem_pipe_read_do,
+              tdKrdK,
+              tdVrdV,
+              threadIdx.x - NumCopyThreads,
+              work_idx,
+              block_coord,
+              shared_storage);
+        }
+        if (tile_valid) {
+          collective_epilogue.store(
+              params.epilogue,
+              tdKrdK,
+              tdVrdV,
+              shared_storage,
+              tiled_mma_dKV,
+              threadIdx.x - NumCopyThreads,
+              block_coord);
+        } else {
+          collective_epilogue.store_zero(
+              params.epilogue, threadIdx.x - NumCopyThreads, block_coord);
+        }
+      }
+      collective_epilogue.store_tail();
+    }
+  }
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_launch_template.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_launch_template.h
new file mode 100644
index 000000000..6900852df
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_launch_template.h
@@ -0,0 +1,492 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include "cute/tensor.hpp"
+
+#include "cutlass/cluster_launch.hpp" // For ClusterLauncher
+#include "cutlass/device_kernel.h" // For device_kernel
+#include "cutlass/kernel_launch.h" // For kernel_launch
+
+#include "epilogue_bwd.h"
+#include "flash.h"
+#include "flash_bwd_kernel_sm90.h"
+#include "flash_bwd_postprocess_kernel.h"
+#include "flash_bwd_preprocess_kernel.h"
+#include "mainloop_bwd_sm90_tma_gmma_ws.h"
+#include "static_switch.h"
+#include "tile_scheduler.h"
+#include "tile_size.h"
+
+namespace hstu {
+
+using namespace cute;
+
+template <
+    int Arch,
+    int kHeadDim,
+    int kBlockM,
+    int kBlockN,
+    typename Element,
+    bool Causal,
+    bool Local,
+    bool Contexual_mask,
+    bool Jagged,
+    bool Has_targets,
+    bool Deterministic,
+    int Stages_dO = 2,
+    int Stages_dS_or_QSm80 = 2,
+    bool SdP_swapAB = true,
+    bool dKV_swapAB = false,
+    bool dQ_swapAB = false,
+    int NumMmaWarpGroups = 2,
+    int AtomLayoutMSdP = 1,
+    int AtomLayoutNdKV = 2,
+    int AtomLayoutMdQ = 1,
+    bool V_in_regs = false,
+    bool Cross = false,
+    bool Softmax = false>
+void run_flash_bwd(hstu::Flash_bwd_params& params, cudaStream_t stream) {
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+  std::printf(
+      "[flash_bwd_launch_template] Local: (%d), Jagged: (%d), Has_targets: (%d), Causal: (%d), max_kv_len: (%d), kHeadDim: (%d), kBlockM: (%d), kBlockN: (%d)\n",
+      Local,
+      Jagged,
+      Has_targets,
+      Causal,
+      params.max_kv_len,
+      kHeadDim,
+      kBlockM,
+      kBlockN);
+#endif
+  static_assert(
+      !(Causal && Local), "Causal and Local cannot be true at the same time.");
+  using ElementAccum = float;
+  using ArchTag =
+      std::conditional_t<Arch >= 90, cutlass::arch::Sm90, cutlass::arch::Sm80>;
+
+  int const total_q_padded_rounded =
+      cute::round_up(params.total_seq_len_q + params.b * kBlockM, kBlockM);
+  int seqlen_q = !Jagged ? params.max_q_len : params.total_seq_len_q;
+  int seqlen_kv = !Jagged ? params.max_kv_len : params.total_seq_len_kv;
+  int seqlen_q_rounded =
+      !Jagged ? params.max_q_len_rounded : total_q_padded_rounded;
+  int batch = !Jagged ? params.b : 1;
+
+  using TileShape_MK = cute::Shape<Int<kBlockM>, Int<kHeadDim>>;
+  using PreprocessKernel = hstu::FlashAttnBwdPreprocess<
+      TileShape_MK,
+      Element,
+      ElementAccum,
+      ArchTag,
+      /*Clear_dQaccum=*/true,
+      Jagged,
+      Softmax>;
+  typename PreprocessKernel::Arguments preprocess_args{
+      static_cast<Element const*>(params.o_ptr),
+      {seqlen_q, params.v_d, params.h, batch}, // shape_O
+      {params.o_row_stride,
+       _1{},
+       params.o_head_stride,
+       !Jagged ? params.o_batch_stride : 0}, // stride_O
+      static_cast<Element const*>(params.do_ptr),
+      {params.do_row_stride,
+       _1{},
+       params.do_head_stride,
+       !Jagged ? params.do_batch_stride : 0}, // stride_dO
+      static_cast<float*>(params.softmax_d),
+      {seqlen_q_rounded, params.num_softmax_heads, batch}, // shape_dPsum
+      {_1{},
+       seqlen_q_rounded,
+       !Jagged ? params.num_softmax_heads * params.max_q_len_rounded
+               : 0}, // stride_dPsum
+      static_cast<float*>(params.softmax_lse),
+      {_1{},
+       seqlen_q,
+       !Jagged ? params.num_softmax_heads * params.max_q_len_rounded
+               : 0}, // stride_LSE
+      static_cast<float*>(params.softmax_lse_log2),
+      {_1{},
+       seqlen_q_rounded,
+       !Jagged ? params.num_softmax_heads * params.max_q_len_rounded
+               : 0}, // stride_LSE_log2
+      static_cast<ElementAccum*>(params.dq_accum_ptr),
+      {seqlen_q_rounded * params.qk_d_rounded,
+       params.h,
+       batch}, // shape_dQaccum
+      {_1{},
+       seqlen_q_rounded * params.qk_d_rounded,
+       !Jagged ? params.qk_d_rounded * params.max_q_len_rounded * params.h
+               : 0}, // stride_dQaccum
+      params.b,
+      params.h,
+      params.num_softmax_heads,
+      params.max_q_len,
+      params.dq_semaphore,
+      Cross ? params.seq_offsets_q : params.seq_offsets};
+  typename PreprocessKernel::Params preprocess_params =
+      PreprocessKernel::to_underlying_arguments(preprocess_args);
+  int num_m_block = cute::ceil_div(params.max_q_len, kBlockM);
+  dim3 grid_m(num_m_block, params.h, params.b);
+  cutlass::kernel_launch<PreprocessKernel>(
+      grid_m,
+      PreprocessKernel::MaxThreadsPerBlock,
+      PreprocessKernel::SharedStorageSize,
+      stream,
+      preprocess_params,
+      false /*launch_with_pdl*/);
+  CHECK_CUDA_KERNEL_LAUNCH();
+
+  using TileShape_MNK = cute::Shape<Int<kBlockM>, Int<kBlockN>, Int<kHeadDim>>;
+  using ClusterShape =
+      cute::Shape<_1, Int<1>, _1>; // Currently doesn't not support cluster
+  // Stages_dS_or_QSm80 is Stages_dS if Sm90 and Stages if Sm80
+  static constexpr int Stages = Arch >= 90 ? 2 : Stages_dS_or_QSm80;
+  static constexpr int Stages_dS = Arch >= 90 ? Stages_dS_or_QSm80 : 1;
+  using CollectiveMainloop = hstu::CollectiveMainloopBwdSm90<
+      Stages,
+      Stages_dO,
+      Stages_dS,
+      ClusterShape,
+      TileShape_MNK,
+      Element,
+      ElementAccum,
+      cutlass::arch::Sm90,
+      Causal,
+      Local,
+      Contexual_mask,
+      Jagged,
+      Has_targets,
+      Deterministic,
+      SdP_swapAB,
+      dKV_swapAB,
+      dQ_swapAB,
+      NumMmaWarpGroups,
+      AtomLayoutMSdP,
+      AtomLayoutNdKV,
+      AtomLayoutMdQ,
+      V_in_regs,
+      Cross,
+      Softmax>;
+  using CollectiveEpilogue = hstu::CollectiveEpilogueBwd<
+      TileShape_MNK,
+      Element,
+      ArchTag,
+      CollectiveMainloop::NumMmaThreads,
+      Jagged,
+      dKV_swapAB,
+      NumMmaWarpGroups*(Arch >= 90 ? 1 : cutlass::NumWarpsPerWarpGroup) /
+          AtomLayoutNdKV>;
+  using Scheduler =
+      hstu::SingleTileScheduler<Jagged, kBlockN, false /*Sort_by_length*/>;
+  using AttnKernel = hstu::enable_sm90_or_later<hstu::FlashAttnBwdSm90<
+      Softmax,
+      CollectiveMainloop,
+      CollectiveEpilogue,
+      Scheduler>>;
+
+  typename CollectiveMainloop::Arguments mainloop_args{
+      static_cast<Element const*>(params.q_ptr),
+      {seqlen_q, params.qk_d, params.h, batch}, // shape_Q
+      {params.q_row_stride,
+       _1{},
+       params.q_head_stride,
+       !Jagged ? params.q_batch_stride : 0}, // stride_Q
+      static_cast<Element const*>(params.k_ptr),
+      {seqlen_kv, params.qk_d, params.h, batch}, // shape_K
+      {params.k_row_stride,
+       _1{},
+       params.k_head_stride,
+       !Jagged ? params.k_batch_stride : 0}, // stride_K
+      static_cast<Element const*>(params.v_ptr),
+      {seqlen_kv, params.v_d, params.h, batch}, // shape_V
+      {params.v_row_stride,
+       _1{},
+       params.v_head_stride,
+       !Jagged ? params.v_batch_stride : 0}, // stride_V
+      static_cast<Element const*>(params.do_ptr),
+      {seqlen_q, params.v_d, params.h, batch}, // shape_dO
+      {params.do_row_stride,
+       _1{},
+       params.do_head_stride,
+       !Jagged ? params.do_batch_stride : 0}, // stride_dO
+      static_cast<ElementAccum*>(params.dq_accum_ptr),
+      {seqlen_q_rounded * params.qk_d_rounded,
+       params.h,
+       batch}, // shape_dQaccum
+      {_1{},
+       seqlen_q_rounded * params.qk_d_rounded,
+       !Jagged ? params.qk_d_rounded * params.max_q_len_rounded * params.h
+               : 0}, // stride_dQaccum
+      static_cast<float*>(params.softmax_lse_log2),
+      {seqlen_q_rounded, params.num_softmax_heads, batch}, // shape_LSE
+      {_1{},
+       seqlen_q_rounded,
+       !Jagged ? params.num_softmax_heads * params.max_q_len_rounded
+               : 0}, // stride_LSE_log2
+      static_cast<float*>(params.softmax_d),
+      {_1{},
+       seqlen_q_rounded,
+       !Jagged ? params.num_softmax_heads * params.max_q_len_rounded
+               : 0}, // stride_dPsum
+      params.max_attn_len,
+      params.min_full_attn_seq_len,
+      params.contextual_seq_len,
+      1.0f / params.max_kv_len,
+      params.alpha,
+      params.b,
+      params.num_softmax_heads,
+      params.num_groups,
+      params.batch_size_per_group,
+      params.dq_semaphore,
+      params.seq_offsets,
+      params.seq_offsets_q,
+      params.num_targets,
+      params.max_seq_len_tensor,
+      params.contextual_seq_len_tensor,
+      params.max_attn_len_tensor,
+      params.min_full_attn_seq_len_tensor,
+      params.attn_scale,
+      params.scalar_scale};
+  typename CollectiveEpilogue::Arguments epilogue_args{
+      static_cast<typename CollectiveEpilogue::Element*>(params.dk_ptr),
+      [&] {
+        return typename CollectiveEpilogue::ShapedKV{
+            seqlen_kv, params.qk_d, params.h, batch}; // shape_dK
+      }(),
+      [&] {
+        return typename CollectiveEpilogue::StridedKV{
+            params.dk_row_stride,
+            _1{},
+            params.dk_head_stride,
+            !Jagged ? params.dk_batch_stride : 0}; // stride_dK
+      }(),
+      static_cast<typename CollectiveEpilogue::Element*>(params.dv_ptr),
+      [&] {
+        return typename CollectiveEpilogue::StridedKV{
+            params.dv_row_stride,
+            _1{},
+            params.dv_head_stride,
+            !Jagged ? params.dv_batch_stride : 0}; // stride_dV
+      }(),
+      params.h,
+      params.seq_offsets};
+
+  int num_blocks_n =
+      cutlass::ceil_div(params.max_kv_len, get<1>(TileShape_MNK{}));
+  num_blocks_n = cutlass::round_up(num_blocks_n, size<1>(ClusterShape{}));
+  typename hstu::TileSchedulerArguments scheduler_args{
+      num_blocks_n,
+      params.h,
+      params.b,
+      params.max_kv_len,
+      params.qk_d,
+      sizeof(Element),
+      params.tile_count_semaphore,
+      params.seq_offsets,
+      params.sort_by_length_indices};
+
+  int device;
+  cudaGetDevice(&device);
+  typename AttnKernel::Params kernel_params =
+      AttnKernel::to_underlying_arguments(
+          {mainloop_args,
+           epilogue_args,
+           {device, params.num_sm},
+           scheduler_args});
+
+  dim3 grid_dims = AttnKernel::get_grid_shape(kernel_params);
+  dim3 block_dims = AttnKernel::get_block_shape();
+  int smem_size = AttnKernel::SharedStorageSize;
+  if constexpr (size(ClusterShape{}) > 1) {
+    void const* kernel = (void const*)cutlass::device_kernel<AttnKernel>;
+    if (smem_size >= 48 * 1024) {
+      CHECK_CUDA(cudaFuncSetAttribute(
+          kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+    }
+    dim3 cluster_dims(
+        size<0>(ClusterShape{}),
+        size<1>(ClusterShape{}),
+        size<2>(ClusterShape{}));
+    cutlass::ClusterLauncher::launch(
+        grid_dims,
+        cluster_dims,
+        block_dims,
+        smem_size,
+        stream,
+        kernel,
+        kernel_params,
+        false /*launch_with_pdl*/);
+  } else {
+    if (smem_size >= 48 * 1024) {
+      CHECK_CUDA(cudaFuncSetAttribute(
+          cutlass::device_kernel<AttnKernel>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize,
+          smem_size));
+    }
+    cutlass::kernel_launch<AttnKernel>(
+        grid_dims,
+        block_dims,
+        smem_size,
+        stream,
+        kernel_params,
+        false /*launch_with_pdl*/);
+  }
+  CHECK_CUDA_KERNEL_LAUNCH();
+
+  using PostprocessKernel = hstu::FlashAttnBwdPostprocessConvertdQ<
+      TileShape_MK,
+      Element,
+      ElementAccum,
+      ArchTag,
+      AttnKernel::CollectiveMainloop::NumMmaThreads,
+      typename AttnKernel::CollectiveMainloop::TiledMmadQ,
+      AttnKernel::CollectiveMainloop::dQ_swapAB,
+      Jagged,
+      Softmax>;
+  typename PostprocessKernel::Arguments postprocess_args{
+      static_cast<ElementAccum const*>(params.dq_accum_ptr),
+      {seqlen_q_rounded * params.qk_d_rounded,
+       params.h,
+       batch}, // shape_dQaccum
+      {_1{},
+       seqlen_q_rounded * params.qk_d_rounded,
+       !Jagged ? params.qk_d_rounded * params.max_q_len_rounded * params.h
+               : 0}, // stride_dQaccum
+      static_cast<Element*>(params.dq_ptr),
+      {seqlen_q, params.qk_d, params.h, batch}, // shape_dQ
+      {params.dq_row_stride,
+       _1{},
+       params.dq_head_stride,
+       params.dq_batch_stride}, // stride_dQ
+      Cross ? params.seq_offsets_q : params.seq_offsets};
+  typename PostprocessKernel::Params postprocess_params =
+      PostprocessKernel::to_underlying_arguments(postprocess_args);
+  int num_m_block_postprocess =
+      cute::ceil_div(params.max_q_len, get<0>(TileShape_MK{}));
+  dim3 grid_m_postprocess(num_m_block_postprocess, params.h, params.b);
+  int smem_size_postprocess = PostprocessKernel::SharedStorageSize;
+  if (smem_size_postprocess >= 48 * 1024) {
+    CHECK_CUDA(cudaFuncSetAttribute(
+        cutlass::device_kernel<PostprocessKernel>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize,
+        smem_size_postprocess));
+  }
+  cutlass::kernel_launch<PostprocessKernel>(
+      grid_m_postprocess,
+      PostprocessKernel::MaxThreadsPerBlock,
+      smem_size_postprocess,
+      stream,
+      postprocess_params,
+      false /*launch_with_pdl*/);
+  CHECK_CUDA_KERNEL_LAUNCH();
+}
+
+template <
+    int Arch,
+    typename T,
+    int kBlockM,
+    int kBlockN,
+    int kHeadDim,
+    bool Causal,
+    bool Local,
+    int Stages_dO = 2,
+    int Stages_dS_or_QSm80 = 2,
+    bool SdP_swapAB = true,
+    bool dKV_swapAB = false,
+    bool dQ_swapAB = false,
+    int NumMmaWarpGroups = 2,
+    int AtomLayoutMSdP = 1,
+    int AtomLayoutNdKV = 2,
+    int AtomLayoutMdQ = 1,
+    bool V_in_regs = false,
+    bool Softmax = false>
+void run_mha_bwd_dispatch(hstu::Flash_bwd_params& params, cudaStream_t stream) {
+  BOOL_SWITCH(params.seq_offsets != nullptr, Jagged, [&] {
+    BOOL_SWITCH(params.num_targets != nullptr, Has_targets, [&] {
+      BOOL_SWITCH(params.has_contexual_mask, Contexual_mask, [&] {
+        BOOL_SWITCH(params.seq_offsets_q, Cross, [&] {
+          run_flash_bwd<
+              Arch,
+              kHeadDim,
+              kBlockM,
+              kBlockN,
+              T,
+              Causal,
+              Local,
+              Contexual_mask,
+              Jagged,
+              Has_targets,
+              false /*Deterministic*/,
+              Stages_dO,
+              Stages_dS_or_QSm80,
+              SdP_swapAB,
+              dKV_swapAB,
+              dQ_swapAB,
+              NumMmaWarpGroups,
+              AtomLayoutMSdP,
+              AtomLayoutNdKV,
+              AtomLayoutMdQ,
+              V_in_regs,
+              Cross,
+              Softmax>(params, stream);
+        });
+      });
+    });
+  });
+}
+
+template <int Arch, typename T, int kHeadDim, bool Softmax>
+void run_mha_bwd_(hstu::Flash_bwd_params& params, cudaStream_t stream) {
+  CAUSAL_LOCAL_SWITCH(params.is_causal, params.is_local, Causal, Local, [&] {
+    int const kBlockM = hstu::kBlockM_bwd(Arch, kHeadDim, Causal, Local);
+    int const kBlockN = hstu::kBlockN_bwd(Arch, kHeadDim);
+    bool const V_in_regs = hstu::V_in_regs_bwd(Arch, kHeadDim);
+    static constexpr std::tuple<int, int> Stages =
+        hstu::Stages_bwd(Arch, kHeadDim);
+    static constexpr std::tuple<bool, bool, bool> swapAB =
+        hstu::swapAB_bwd(Arch, kHeadDim, Causal, Local);
+    int const NumMmaWarpGroups = hstu::NumMmaWarpGroups_bwd(Arch, kHeadDim);
+    static constexpr std::tuple<int, int, int> AtomLayout =
+        hstu::AtomLayout_bwd(Arch, kHeadDim);
+    run_mha_bwd_dispatch<
+        Arch,
+        T,
+        kBlockM,
+        kBlockN,
+        kHeadDim,
+        Causal,
+        Local,
+        std::get<0>(Stages), /*Stages_dO*/
+        std::get<1>(Stages), /*Stages_dS_or_QSm80*/
+        std::get<0>(swapAB), /*SdP_swapAB*/
+        std::get<1>(swapAB), /*dKV_swapAB*/
+        std::get<2>(swapAB), /*dQ_swapAB*/
+        NumMmaWarpGroups,
+        std::get<0>(AtomLayout), /*AtomLayoutMSdP*/
+        std::get<1>(AtomLayout), /*AtomLayoutNdKV*/
+        std::get<2>(AtomLayout), /*AtomLayoutMdQ*/
+        V_in_regs,
+        Softmax>(params, stream);
+  });
+}
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_postprocess_kernel.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_postprocess_kernel.h
new file mode 100644
index 000000000..ca04a1456
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_postprocess_kernel.h
@@ -0,0 +1,348 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include "cute/tensor.hpp"
+
+#include <cutlass/array.h>
+#include <cutlass/cutlass.h>
+#include <cutlass/numeric_conversion.h>
+#include <cutlass/numeric_types.h>
+#include "cutlass/arch/barrier.h"
+
+#include "seqlen.h"
+#include "utils.h"
+
+namespace hstu {
+
+using namespace cute;
+
+template <
+    class TileShape_MK_,
+    class Element,
+    class ElementAccum,
+    class ArchTag_,
+    int kNThreads,
+    class TiledMma,
+    bool dQ_swapAB,
+    bool Jagged,
+    bool Softmax>
+class FlashAttnBwdPostprocessConvertdQ {
+ public:
+  // Type Aliases
+  using TileShape_MK = TileShape_MK_;
+  using ArchTag = ArchTag_;
+
+  static_assert(ArchTag::kMinComputeCapability >= 75);
+  static constexpr bool IsSm90 = ArchTag::kMinComputeCapability >= 90;
+
+  static constexpr uint32_t MaxThreadsPerBlock = kNThreads;
+  static constexpr uint32_t MinBlocksPerMultiprocessor = 2;
+
+  static constexpr int kBlockM = get<0>(TileShape_MK{});
+  static constexpr int kHeadDim = get<1>(TileShape_MK{});
+  static_assert(
+      !IsSm90 || kNThreads % cutlass::NumThreadsPerWarpGroup == 0,
+      "kNThreads must be a multiple of NumThreadsPerWarpGroup");
+  static constexpr int NumdQWarpGgroups =
+      kNThreads / cutlass::NumThreadsPerWarpGroup;
+  using R2SLayoutAtomdQaccum = std::conditional_t<
+      IsSm90,
+      Layout<
+          Shape<Int<cutlass::NumThreadsPerWarpGroup>, Int<NumdQWarpGgroups>>>,
+      Layout<Shape<Int<kNThreads>>>>;
+  using R2STiledCopydQaccum = decltype(make_tiled_copy(
+      Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, ElementAccum>{},
+      R2SLayoutAtomdQaccum{},
+      Layout<Shape<Int<IsSm90 ? 4 : 1>>>{})); // Val layout, 1 or 4 vals per
+                                              // read
+  using G2SLayoutAtomdQaccum = Layout<Shape<Int<kNThreads>>>;
+  // UniversalCopy instead of AutoVectorizingCopyWithAssumedAlignment as the
+  // latter generates cp.async instructions
+  using G2STiledCopydQaccum = decltype(make_tiled_copy(
+      Copy_Atom<UniversalCopy<uint128_t>, ElementAccum>{},
+      G2SLayoutAtomdQaccum{},
+      Layout<Shape<_4>>{})); // Val layout, 4 vals per read
+  // We don't do bound checking for the gmem -> smem load so we just assert
+  // here.
+  static_assert(IsSm90 || (kBlockM * kHeadDim) % (kNThreads * 4) == 0);
+  static constexpr int SmemdQaccumSize = size(TileShape_MK{});
+  using SmemLayoutdQaccumFlat = Layout<Shape<Int<SmemdQaccumSize>>>;
+  using SmemLayoutdQaccum = std::conditional_t<
+      IsSm90,
+      Layout<Shape<
+          Int<kBlockM * kHeadDim / NumdQWarpGgroups>,
+          Int<NumdQWarpGgroups>>>,
+      Layout<Shape<Int<kBlockM * kHeadDim>>>>;
+
+  // We can't just use kHeadDim here. E.g. if MMA shape is 64 x 96 but split
+  // across 2 WGs, then setting kBlockKSmem to 32 will cause "Static shape_div
+  // failure". We want to treat it as 64 x 48, so kBlockKSmem should be 16.
+  static constexpr int MmaShapeN = get<1>(typename TiledMma::AtomShape_MNK{});
+  static constexpr int kBlockKSmem =
+      MmaShapeN % 64 == 0 ? 64 : (MmaShapeN % 32 == 0 ? 32 : 16);
+  static constexpr int kSwizzle =
+      kBlockKSmem == 64 ? 3 : (kBlockKSmem == 32 ? 2 : 1);
+  using SmemLayoutAtomdQ = decltype(composition(
+      Swizzle<kSwizzle, 3, 3>{},
+      Layout<Shape<Int<8>, Int<kBlockKSmem>>, Stride<Int<kBlockKSmem>, _1>>{}));
+  using SmemLayoutdQ =
+      decltype(tile_to_shape(SmemLayoutAtomdQ{}, TileShape_MK{}));
+  using SmemLayoutdQt = decltype(cute::composition(
+      SmemLayoutdQ{},
+      make_layout(
+          make_shape(get<1>(TileShape_MK{}), get<0>(TileShape_MK{})),
+          make_stride(Int<get<0>(TileShape_MK{})>{}, _1{}))));
+
+  using SmemCopyAtomdQ = Copy_Atom<
+      std::conditional_t<
+          IsSm90,
+          std::conditional_t<
+              !dQ_swapAB,
+              cute::SM90_U32x4_STSM_N,
+              cute::SM90_U16x8_STSM_T>,
+          AutoVectorizingCopyWithAssumedAlignment<128>>,
+      Element>;
+
+  static constexpr int kGmemElemsPerLoad =
+      sizeof(cute::uint128_t) / sizeof(Element);
+  static_assert(
+      kHeadDim % kGmemElemsPerLoad == 0,
+      "Headdim must be a multiple of kGmemElemsPerLoad");
+  static constexpr int kGmemThreadsPerRow =
+      cutlass::gcd(kHeadDim / kGmemElemsPerLoad, int(MaxThreadsPerBlock));
+  static_assert(
+      MaxThreadsPerBlock % kGmemThreadsPerRow == 0,
+      "MaxThreadsPerBlock must be a multiple of kGmemThreadsPerRow");
+  using GmemLayoutAtom = Layout<
+      Shape<
+          Int<MaxThreadsPerBlock / kGmemThreadsPerRow>,
+          Int<kGmemThreadsPerRow>>,
+      Stride<Int<kGmemThreadsPerRow>, _1>>;
+  using GmemTiledCopy = decltype(make_tiled_copy(
+      Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, Element>{},
+      GmemLayoutAtom{},
+      Layout<Shape<_1, Int<kGmemElemsPerLoad>>>{})); // Val layout, 8 or 16 vals
+                                                     // per load
+
+  struct SharedStorage : cute::aligned_struct<128> {
+    cute::array_aligned<ElementAccum, cute::cosize_v<SmemLayoutdQaccum>>
+        smem_dqacc;
+    cute::array_aligned<Element, cute::cosize_v<SmemLayoutdQ>> smem_dq;
+    alignas(16) cutlass::arch::ClusterTransactionBarrier barrier_dQaccum;
+  };
+
+  static constexpr int SharedStorageSize = sizeof(SharedStorage);
+
+  using ShapedQ =
+      cute::Shape<int32_t, int32_t, int32_t, int32_t>; // (seqlen_q, d, head,
+                                                       // batch)
+  using StridedQ = cute::Stride<int64_t, _1, int64_t, int64_t>;
+  using ShapedQaccum =
+      cute::Shape<int32_t, int32_t, int32_t>; // (seqlen_q * d, head, batch)
+  using StridedQaccum = cute::Stride<_1, int64_t, int64_t>;
+
+  // Device side arguments
+  struct Arguments {
+    ElementAccum const* ptr_dQaccum;
+    ShapedQaccum const shape_dQaccum;
+    StridedQaccum const stride_dQaccum;
+    Element* ptr_dQ;
+    ShapedQ const shape_dQ;
+    StridedQ const stride_dQ;
+    int const* seq_offsets = nullptr;
+  };
+
+  // Kernel entry point API
+  struct Params {
+    ElementAccum const* ptr_dQaccum;
+    ShapedQaccum const shape_dQaccum;
+    StridedQaccum const stride_dQaccum;
+    Element* ptr_dQ;
+    ShapedQ const shape_dQ;
+    StridedQ const stride_dQ;
+    int const* seq_offsets = nullptr;
+  };
+
+  // Convert to underlying arguments. In this case, a simple copy for the
+  // aliased type.
+  static Params to_underlying_arguments(Arguments const& args) {
+    return {
+        args.ptr_dQaccum,
+        args.shape_dQaccum,
+        args.stride_dQaccum,
+        args.ptr_dQ,
+        args.shape_dQ,
+        args.stride_dQ,
+        args.seq_offsets};
+  }
+
+  CUTLASS_DEVICE
+  void operator()(Params const& params, char* smem_buf) {
+    static constexpr int kBlockM = get<0>(TileShape_MK{});
+    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+
+    Tensor sdQaccum = make_tensor(
+        make_smem_ptr(shared_storage.smem_dqacc.data()), SmemLayoutdQaccum{});
+    Tensor sdQaccum_flat = make_tensor(
+        make_smem_ptr(shared_storage.smem_dqacc.data()),
+        SmemLayoutdQaccumFlat{});
+    Tensor sdQ = make_tensor(
+        make_smem_ptr(shared_storage.smem_dq.data()), SmemLayoutdQ{});
+    Tensor sdQt = make_tensor(
+        make_smem_ptr(shared_storage.smem_dq.data()), SmemLayoutdQt{});
+
+    int const thread_idx = threadIdx.x;
+    int const m_block = blockIdx.x;
+    int const bidh = blockIdx.y;
+    int const bidb = blockIdx.z;
+
+    hstu::SeqlenInfo<Jagged, kBlockM> seqlen_info(
+        bidb, size<0>(params.shape_dQ), params.seq_offsets);
+    if (Jagged && m_block * kBlockM >= seqlen_info.seqlen) {
+      return;
+    }
+
+    // Step 1: load dQaccum from gmem to smem
+    Tensor mdQaccum = make_tensor(
+        make_gmem_ptr(
+            reinterpret_cast<ElementAccum const*>(params.ptr_dQaccum)),
+        params.shape_dQaccum,
+        params.stride_dQaccum)(_, bidh, !Jagged ? bidb : 0);
+    Tensor gdQaccum = local_tile(
+        domain_offset(
+            make_coord(seqlen_info.offset_padded * kHeadDim), mdQaccum),
+        Shape<Int<kBlockM * kHeadDim>>{},
+        make_coord(m_block)); // (M * K)
+    if constexpr (IsSm90) { // Use BulkCopy
+      static constexpr uint32_t TmaTransactionBytesdQaccum =
+          static_cast<uint32_t>(
+              size(SmemLayoutdQaccumFlat{}) *
+              cute::sizeof_bits_v<ElementAccum> / 8);
+      auto bulk_copy = Copy_Traits<SM90_BULK_COPY_AUTO>{};
+      // if (thread0()) { print(gdQaccum); printf("\n"); print(sdQaccum_flat);
+      // printf("\n"); }
+      if (thread_idx == 0) {
+        shared_storage.barrier_dQaccum.init(1 /*numThreads*/);
+        shared_storage.barrier_dQaccum.arrive_and_expect_tx(
+            TmaTransactionBytesdQaccum);
+        copy(
+            bulk_copy.with(
+                *reinterpret_cast<uint64_t*>(&shared_storage.barrier_dQaccum)),
+            gdQaccum,
+            sdQaccum_flat);
+      }
+      __syncthreads();
+      shared_storage.barrier_dQaccum.wait(0);
+    } else {
+      G2STiledCopydQaccum g2s_tiled_copy_dQaccum;
+      auto g2s_thr_copy_dQaccum =
+          g2s_tiled_copy_dQaccum.get_thread_slice(thread_idx);
+      Tensor tdQgdQaccumg2s = g2s_thr_copy_dQaccum.partition_S(gdQaccum);
+      Tensor tdQsdQaccumg2s = g2s_thr_copy_dQaccum.partition_D(sdQaccum);
+      cute::copy(g2s_tiled_copy_dQaccum, tdQgdQaccumg2s, tdQsdQaccumg2s);
+      __syncthreads();
+    }
+
+    // __syncthreads(); if (cute::thread0()) { print_tensor(sdQaccum); }
+
+    // Step 2: Load dQaccum from smem to register, then convert fp32 ->
+    // fp16/bf16
+    R2STiledCopydQaccum s2r_tiled_copy_dQaccum;
+    auto s2r_thr_copy_dQaccum =
+        s2r_tiled_copy_dQaccum.get_thread_slice(thread_idx);
+    Tensor tdQsdQaccum = s2r_thr_copy_dQaccum.partition_S(sdQaccum);
+    TiledMma tiled_mma_dQ;
+    Tensor taccdQrdQaccum = partition_fragment_C(
+        tiled_mma_dQ,
+        select<!dQ_swapAB ? 0 : 1, !dQ_swapAB ? 1 : 0>(TileShape_MK{}));
+    // if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 1) {
+    // print(tiled_mma_dQ); printf("\n"); } if (blockIdx.x == 0 && blockIdx.y ==
+    // 0 && threadIdx.x == 1) { print(tdQsdQaccum); } if (blockIdx.x == 0 &&
+    // blockIdx.y == 0 && threadIdx.x == 1) { print(taccdQrdQaccum); }
+    CUTE_STATIC_ASSERT_V(size(taccdQrdQaccum) == size(tdQsdQaccum));
+    Tensor tdQrdQaccum = s2r_thr_copy_dQaccum.retile_D(taccdQrdQaccum);
+    cute::copy(s2r_tiled_copy_dQaccum, tdQsdQaccum, tdQrdQaccum);
+    // Convert tdQrdQ from fp32 to fp16
+    Tensor rdQ = make_tensor_like<Element>(taccdQrdQaccum);
+    hstu::convert_type_out(taccdQrdQaccum, rdQ);
+
+    // Step 3: Copy dQ from register to smem
+    auto smem_tiled_copy_dQ = make_tiled_copy_C(SmemCopyAtomdQ{}, tiled_mma_dQ);
+    auto smem_thr_copy_dQ = smem_tiled_copy_dQ.get_thread_slice(thread_idx);
+    Tensor taccdQrdQ =
+        smem_thr_copy_dQ.retile_S(rdQ); // ((Atom,AtomNum), MMA_N, MMA_N)
+    // if (cute::thread0()) { print(smem_tiled_copy_dQ); }
+    // if (cute::thread0()) { print(smem_thr_copy_dQ); }
+    // if (cute::thread0()) { print(sdQ); }
+    Tensor taccdQsdQ = smem_thr_copy_dQ.partition_D(
+        cute::conditional_return<!dQ_swapAB>(
+            sdQ, sdQt)); // ((Atom,AtomNum),PIPE_M,PIPE_N)
+    cute::copy(smem_tiled_copy_dQ, taccdQrdQ, taccdQsdQ);
+    __syncthreads();
+
+    // Step 4: Copy dQ from smem to register to prepare for coalesced write to
+    // gmem
+    Tensor mdQ = make_tensor(
+        make_gmem_ptr(params.ptr_dQ), params.shape_dQ, params.stride_dQ)(
+        _, _, bidh, !Jagged ? bidb : 0);
+    Tensor gdQ = local_tile(
+        domain_offset(make_coord(seqlen_info.offset, _0{}), mdQ),
+        TileShape_MK{},
+        make_coord(m_block, _0{})); // (M, K)
+    GmemTiledCopy gmem_tiled_copy_dQ;
+    auto gmem_thr_copy_dQ = gmem_tiled_copy_dQ.get_thread_slice(thread_idx);
+    Tensor tdQsdQ =
+        gmem_thr_copy_dQ.partition_S(sdQ); // ((Atom,AtomNum),ATOM_M,ATOM_N)
+    Tensor tdQgdQ = gmem_thr_copy_dQ.partition_D(gdQ);
+
+    Tensor tdQrdQ = make_fragment_like(tdQsdQ);
+    Tensor tdQcdQ = gmem_thr_copy_dQ.partition_D(
+        cute::make_identity_tensor(TileShape_MK{}));
+    Tensor tdQpdQ = make_tensor<bool>(make_shape(size<2>(tdQgdQ)));
+#pragma unroll
+    for (int k = 0; k < size(tdQpdQ); ++k) {
+      tdQpdQ(k) = get<1>(tdQcdQ(_0{}, _0{}, k)) < get<1>(params.shape_dQ);
+    }
+    // Need to check OOB when reading from smem if kBlockM isn't evenly tiled
+    static constexpr bool EvenM =
+        kBlockM % CUTE_STATIC_V(size<0>(GmemLayoutAtom{})) == 0;
+    hstu::
+        copy</*Is_even_MN=*/EvenM, /*Is_even_K=*/true, /*Clear_OOB_MN=*/false>(
+            gmem_tiled_copy_dQ, tdQsdQ, tdQrdQ, tdQcdQ, tdQpdQ, kBlockM);
+
+    // Step 5: Copy dQ from register to gmem
+    // Clear_OOB_K must be false since we don't want to write zeros to gmem
+    hstu::copy<
+        /*Is_even_MN=*/false,
+        /*Is_even_K=*/false,
+        /*Clear_OOB_MN=*/false,
+        /*Clear_OOB_K=*/false>(
+        gmem_tiled_copy_dQ,
+        tdQrdQ,
+        tdQgdQ,
+        tdQcdQ,
+        tdQpdQ,
+        std::min(seqlen_info.seqlen - m_block * kBlockM, kBlockM));
+  }
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_preprocess_kernel.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_preprocess_kernel.h
new file mode 100644
index 000000000..8d29778af
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_preprocess_kernel.h
@@ -0,0 +1,349 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include "cute/tensor.hpp"
+
+#include <cutlass/array.h>
+#include <cutlass/cutlass.h>
+#include <cutlass/numeric_conversion.h>
+#include <cutlass/numeric_types.h>
+
+#include "seqlen.h"
+
+namespace hstu {
+
+using namespace cute;
+
+template <
+    class TileShape_MK_,
+    class Element,
+    class ElementAccum,
+    class ArchTag_,
+    bool Clear_dQaccum,
+    bool Jagged,
+    bool Softmax>
+class FlashAttnBwdPreprocess {
+ public:
+  // Type Aliases
+  using TileShape_MK = TileShape_MK_;
+  using ArchTag = ArchTag_;
+
+  static_assert(
+      std::is_same_v<Element, cutlass::half_t> &&
+          ArchTag::kMinComputeCapability >= 75 ||
+      std::is_same_v<Element, cutlass::bfloat16_t> &&
+          ArchTag::kMinComputeCapability >= 80 ||
+      std::is_same_v<Element, cutlass::float_e4m3_t> &&
+          ArchTag::kMinComputeCapability >= 89);
+
+  static constexpr uint32_t MaxThreadsPerBlock = 256;
+  static constexpr uint32_t MinBlocksPerMultiprocessor = 2;
+  static constexpr int SharedStorageSize = 0;
+
+  static constexpr int kGmemElemsPerLoad =
+      sizeof(cute::uint128_t) / sizeof(Element);
+  static_assert(
+      get<1>(TileShape_MK{}) % kGmemElemsPerLoad == 0,
+      "Headdim must be a multiple of kGmemElemsPerLoad");
+  static constexpr int kBlockM = get<0>(TileShape_MK{});
+  static constexpr int kHeadDim = get<1>(TileShape_MK{});
+  // We want kBlockKGmem to be a power of 2 so that when we do the summing,
+  // it's just between threads in the same warp
+  static constexpr int kBlockKGmem =
+      kHeadDim % 128 == 0 ? 128 : (kHeadDim % 64 == 0 ? 64 : 32);
+  static constexpr int kGmemThreadsPerRow = kBlockKGmem / kGmemElemsPerLoad;
+  static_assert(
+      MaxThreadsPerBlock % kGmemThreadsPerRow == 0,
+      "MaxThreadsPerBlock must be a multiple of kGmemThreadsPerRow");
+  using GmemLayoutAtom = Layout<
+      Shape<
+          Int<MaxThreadsPerBlock / kGmemThreadsPerRow>,
+          Int<kGmemThreadsPerRow>>,
+      Stride<Int<kGmemThreadsPerRow>, _1>>;
+  using GmemTiledCopy = decltype(make_tiled_copy(
+      Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, Element>{},
+      GmemLayoutAtom{},
+      Layout<Shape<_1, Int<kGmemElemsPerLoad>>>{})); // Val layout, 8 or 16 vals
+                                                     // per load
+
+  static constexpr int kGmemElemsPerLoadAccum =
+      sizeof(cute::uint128_t) / sizeof(ElementAccum);
+  static_assert(
+      (kBlockM * kHeadDim / kGmemElemsPerLoadAccum) % MaxThreadsPerBlock == 0,
+      "MaxThreadsPerBlock must divide kBlockM * kHeadDim / kGmemElemsPerLoadAccum");
+  using GmemLayoutAtomAccum = Layout<Shape<Int<MaxThreadsPerBlock>>>;
+  using GmemTiledCopyAccum = decltype(make_tiled_copy(
+      Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, ElementAccum>{},
+      GmemLayoutAtomAccum{},
+      Layout<Shape<Int<kGmemElemsPerLoadAccum>>>{})); // Val layout, 4 vals per
+                                                      // store
+
+  using ShapeO =
+      cute::Shape<int32_t, int32_t, int32_t, int32_t>; // (seqlen_q, d, head,
+                                                       // batch)
+  using StrideO = cute::Stride<int64_t, _1, int64_t, int64_t>;
+  using ShapedPsum =
+      cute::Shape<int32_t, int32_t, int32_t>; // (seqlen_q, head, batch)
+  using StridedPsum = cute::Stride<_1, int64_t, int64_t>;
+  using ShapedQaccum =
+      cute::Shape<int32_t, int32_t, int32_t>; // (seqlen_q * d, head, batch)
+  using StridedQaccum = cute::Stride<_1, int64_t, int64_t>;
+
+  // Device side arguments
+  struct Arguments {
+    Element const* ptr_O;
+    ShapeO const shape_O;
+    StrideO const stride_O;
+    Element const* ptr_dO;
+    StrideO const stride_dO;
+    float* ptr_dPsum;
+    ShapedPsum const shape_dPsum;
+    StridedPsum const stride_dPsum;
+    float const* ptr_LSE;
+    StridedPsum const stride_LSE;
+    float* ptr_LSE_log2;
+    StridedPsum const stride_LSE_log2;
+    ElementAccum* ptr_dQaccum;
+    ShapedQaccum const shape_dQaccum;
+    StridedQaccum const stride_dQaccum;
+    int num_batch; // We need this to know the size of dq_semaphore in case of
+                   // jagged
+    int num_heads;
+    int num_softmax_heads;
+    int max_seq_len;
+    int* dq_semaphore;
+    int const* seq_offsets = nullptr;
+  };
+
+  // Kernel entry point API
+  struct Params {
+    Element const* ptr_O;
+    ShapeO const shape_O;
+    StrideO const stride_O;
+    Element const* ptr_dO;
+    StrideO const stride_dO;
+    float* ptr_dPsum;
+    ShapedPsum const shape_dPsum;
+    StridedPsum const stride_dPsum;
+    float const* ptr_LSE;
+    StridedPsum const stride_LSE;
+    float* ptr_LSE_log2;
+    StridedPsum const stride_LSE_log2;
+    ElementAccum* ptr_dQaccum;
+    ShapedQaccum const shape_dQaccum;
+    StridedQaccum const stride_dQaccum;
+    int num_batch;
+    int num_heads;
+    int num_softmax_heads;
+    int max_seq_len;
+    int* dq_semaphore;
+    int const* seq_offsets = nullptr;
+  };
+
+  // Convert to underlying arguments. In this case, a simple copy for the
+  // aliased type.
+  static Params to_underlying_arguments(Arguments const& args) {
+    return {args.ptr_O,       args.shape_O,       args.stride_O,
+            args.ptr_dO,      args.stride_dO,     args.ptr_dPsum,
+            args.shape_dPsum, args.stride_dPsum,  args.ptr_LSE,
+            args.stride_LSE,  args.ptr_LSE_log2,  args.stride_LSE_log2,
+            args.ptr_dQaccum, args.shape_dQaccum, args.stride_dQaccum,
+            args.num_batch,   args.num_heads,     args.num_softmax_heads,
+            args.max_seq_len, args.dq_semaphore,  args.seq_offsets};
+  }
+
+  CUTLASS_DEVICE
+  void operator()(Params const& params, [[maybe_unused]] char* smem_buf) {
+    static constexpr int kBlockM = get<0>(TileShape_MK{});
+
+    int const thread_idx = threadIdx.x;
+    int const m_block = blockIdx.x;
+    int const bidh = blockIdx.y;
+    int const bidb = blockIdx.z;
+
+    hstu::SeqlenInfo<Jagged, kBlockM> seqlen_info(
+        bidb, params.max_seq_len, params.seq_offsets);
+    int const seqlen_o = seqlen_info.seqlen;
+    if (Jagged && m_block * kBlockM >= seqlen_o) {
+      return;
+    }
+
+    if constexpr (Softmax) {
+      Tensor mO = make_tensor(
+          make_gmem_ptr(params.ptr_O), params.shape_O, params.stride_O)(
+          _, _, bidh, !Jagged ? bidb : 0);
+      Tensor gO = local_tile(
+          cute::domain_offset(make_coord(seqlen_info.offset, _0{}), mO),
+          TileShape_MK{},
+          make_coord(m_block, _0{})); // (M, K)
+      Tensor mdO = make_tensor(
+          make_gmem_ptr(params.ptr_dO), params.shape_O, params.stride_dO)(
+          _, _, bidh, !Jagged ? bidb : 0);
+      Tensor gdO = local_tile(
+          cute::domain_offset(make_coord(seqlen_info.offset, _0{}), mdO),
+          TileShape_MK{},
+          make_coord(m_block, _0{})); // (M, K)
+
+      auto shape_LSE = select<0, 2, 3>(params.shape_O);
+      Tensor mLSE = make_tensor(
+          make_gmem_ptr(params.ptr_LSE), shape_LSE, params.stride_LSE)(
+          _, bidh, !Jagged ? bidb : 0);
+      Tensor gLSE = local_tile(
+          cute::domain_offset(make_coord(seqlen_info.offset), mLSE),
+          Shape<Int<kBlockM>>{},
+          make_coord(m_block));
+      static_assert(kBlockM <= MaxThreadsPerBlock);
+      float lse =
+          thread_idx < seqlen_o - m_block * kBlockM && thread_idx < kBlockM
+          ? gLSE(thread_idx)
+          : 0.0f;
+
+      GmemTiledCopy gmem_tiled_copy_O;
+      auto gmem_thr_copy_O = gmem_tiled_copy_O.get_thread_slice(thread_idx);
+
+      Tensor tOgO = gmem_thr_copy_O.partition_S(gO);
+      Tensor tOgdO = gmem_thr_copy_O.partition_S(gdO);
+      // Construct identity layout for gO
+      Tensor cO = cute::make_identity_tensor(
+          TileShape_MK{}); // (BLK_M,BLK_K) -> (blk_m,blk_k)
+      // Repeat the partitioning with identity layouts
+      Tensor tOcO = gmem_thr_copy_O.partition_D(cO);
+      Tensor tOpO = make_tensor<bool>(make_shape(size<2>(tOgO)));
+#pragma unroll
+      for (int k = 0; k < size(tOpO); ++k) {
+        tOpO(k) = get<1>(tOcO(_0{}, _0{}, k)) < get<1>(params.shape_O);
+      }
+
+      // (8, kBlockM / 32, kHeadDim / 64) or (8, kBlockM / 16, kHeadDim / 128)
+      Tensor tOrO = make_fragment_like(tOgO);
+      Tensor tOrdO = make_fragment_like(tOgdO);
+      hstu::copy<
+          /*Is_even_MN=*/false,
+          /*Is_even_K=*/false,
+          /*Clear_OOB_MN=*/true,
+          /*Clearn_OOB_K=*/true>(
+          gmem_tiled_copy_O,
+          tOgO,
+          tOrO,
+          tOcO,
+          tOpO,
+          seqlen_o - m_block * kBlockM);
+      hstu::copy<
+          /*Is_even_MN=*/false,
+          /*Is_even_K=*/false,
+          /*Clear_OOB_MN=*/true,
+          /*Clearn_OOB_K=*/true>(
+          gmem_tiled_copy_O,
+          tOgdO,
+          tOrdO,
+          tOcO,
+          tOpO,
+          seqlen_o - m_block * kBlockM);
+      // Reshape from e.g. (8, kBlockM / 32, kHeadDim / 64) to (kBlockM / 32,
+      // (8, kHeadDim / 64))
+      Layout l = make_layout(
+          get<1>(tOrO.layout()),
+          make_layout(get<0>(tOrO.layout()), get<2>(tOrO.layout())));
+      Tensor tOrO_l = make_tensor(tOrO.data(), l);
+      Tensor o_fp32 = make_tensor_like<float>(tOrO_l);
+      hstu::convert_type_out(tOrO_l, o_fp32);
+      Tensor tOrdO_l = make_tensor(tOrdO.data(), l);
+      Tensor do_fp32 = make_tensor_like<float>(tOrdO_l);
+      hstu::convert_type_out(tOrdO_l, do_fp32);
+      // Sum across the last dimension
+      Tensor dP_sum = make_tensor<float>(make_shape(size<0>(o_fp32)));
+#pragma unroll
+      for (int mi = 0; mi < size<0>(o_fp32); ++mi) {
+        float dP_sum_cur = do_fp32(mi, 0) * o_fp32(mi, 0);
+#pragma unroll
+        for (int ni = 1; ni < size<1>(o_fp32); ni++) {
+          dP_sum_cur += do_fp32(mi, ni) * o_fp32(mi, ni);
+        }
+        hstu::SumOp<float> sum_op;
+        dP_sum(mi) =
+            hstu::Allreduce<kGmemThreadsPerRow>::run(dP_sum_cur, sum_op);
+      }
+
+      Tensor mdPsum = make_tensor(
+          make_gmem_ptr(params.ptr_dPsum),
+          params.shape_dPsum,
+          params.stride_dPsum)(_, bidh, !Jagged ? bidb : 0);
+      Tensor gdPsum = local_tile(
+          cute::domain_offset(make_coord(seqlen_info.offset_padded), mdPsum),
+          Shape<Int<kBlockM>>{},
+          make_coord(m_block));
+      if (get<1>(tOcO(_0{}, _0{}, _0{})) == 0) {
+#pragma unroll
+        for (int mi = 0; mi < size(dP_sum); ++mi) {
+          int const row = get<0>(tOcO(_0{}, mi, _0{}));
+          gdPsum(row) = row < seqlen_o - m_block * kBlockM ? dP_sum(mi) : 0;
+        }
+      }
+
+      int const seqlen_rounded = cute::round_up(seqlen_o, kBlockM);
+      Tensor mLSElog2 = make_tensor(
+          make_gmem_ptr(params.ptr_LSE_log2),
+          params.shape_dPsum,
+          params.stride_LSE_log2)(_, bidh, !Jagged ? bidb : 0);
+      Tensor gLSElog2 = local_tile(
+          cute::domain_offset(make_coord(seqlen_info.offset_padded), mLSElog2),
+          Shape<Int<kBlockM>>{},
+          make_coord(m_block));
+      if (thread_idx < seqlen_rounded - m_block * kBlockM &&
+          thread_idx < kBlockM) {
+        gLSElog2(thread_idx) = lse * float(M_LOG2E);
+      }
+    }
+    if constexpr (Clear_dQaccum) {
+      Tensor mdQaccum = make_tensor(
+          make_gmem_ptr(params.ptr_dQaccum),
+          params.shape_dQaccum,
+          params.stride_dQaccum)(_, bidh, !Jagged ? bidb : 0);
+      Tensor gdQaccum = local_tile(
+          cute::domain_offset(
+              make_coord(seqlen_info.offset_padded * kHeadDim), mdQaccum),
+          Shape<Int<kBlockM * kHeadDim>>{},
+          make_coord(m_block));
+      GmemTiledCopyAccum gmem_tiled_copy_dQaccum;
+      auto gmem_thr_copy_dQaccum =
+          gmem_tiled_copy_dQaccum.get_thread_slice(thread_idx);
+      Tensor tdQgdQaccum = gmem_thr_copy_dQaccum.partition_D(gdQaccum);
+      Tensor zero = make_fragment_like(tdQgdQaccum);
+      clear(zero);
+      cute::copy(
+          Copy_Atom<
+              AutoVectorizingCopyWithAssumedAlignment<128>,
+              ElementAccum>{},
+          zero,
+          tdQgdQaccum);
+    }
+
+    if (params.dq_semaphore != nullptr && thread_idx == 0) {
+      int const num_batch = params.num_batch;
+      int const num_head = params.num_heads;
+      params.dq_semaphore
+          [bidh + bidb * num_head + m_block * num_head * num_batch] = 0;
+    }
+  }
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common.cpp b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common.cpp
new file mode 100644
index 000000000..66ec445be
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common.cpp
@@ -0,0 +1,1165 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+// Include these 2 headers instead of torch/extension.h since we don't need all
+// of the torch headers.
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <c10/util/Optional.h>
+#include <torch/nn/functional.h>
+#include <torch/version.h> // For TORCH_VERSION* macros
+
+#include <cutlass/numeric_types.h>
+
+#include "flash.h"
+#include "flash_common.h"
+#include "static_switch.h"
+#include "tile_size.h"
+
+#define CHECK_DEVICE(x) TORCH_CHECK(x.is_cuda(), #x " must be on CUDA")
+#define CHECK_SHAPE(x, ...)                           \
+  TORCH_CHECK(                                        \
+      x.sizes() == torch::IntArrayRef({__VA_ARGS__}), \
+      #x " must have shape (" #__VA_ARGS__ ")")
+#define CHECK_CONTIGUOUS(x) \
+  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+
+at::Tensor switch_to_contiguous_if_needed(const at::Tensor& x) {
+  if (x.stride(x.dim() - 1) == 1) {
+    return x;
+  }
+  return x.contiguous();
+}
+
+namespace hstu {
+
+void set_params_fprop(
+    hstu::Flash_fwd_params& params,
+    // sizes
+    const size_t b,
+    const size_t total_seq_len_kv,
+    const size_t total_seq_len_q,
+    const size_t max_seq_len,
+    const size_t max_q_len,
+    const size_t h,
+    const size_t qk_d,
+    const size_t v_d,
+    // device pointers
+    const at::Tensor& q,
+    const at::Tensor& k,
+    const at::Tensor& v,
+    const at::Tensor& out,
+    void* seq_offsets,
+    void* num_targets,
+    void* attn_scale,
+    void* seq_offsets_q,
+    void* softmax_lse,
+    void* max_seq_len_tensor,
+    void* contextual_seq_len_tensor,
+    void* max_attn_len_tensor,
+    void* min_full_attn_seq_len_tensor,
+    const int num_groups,
+    bool causal,
+    float alpha,
+    const bool scalar_scale,
+    const int max_attn_len,
+    const int min_full_attn_seq_len,
+    const int contextual_seq_len,
+    const int num_softmax_heads,
+    const bool training,
+    const int sm_margin = 0) {
+  // Reset the parameters
+  params = {};
+
+  params.is_bf16 = q.dtype() == torch::kBFloat16;
+  params.is_e4m3 = q.dtype() == torch::kFloat8_e4m3fn;
+
+  // Set the pointers and strides.
+  params.q_ptr = q.data_ptr();
+  params.k_ptr = k.data_ptr();
+  params.v_ptr = v.data_ptr();
+  params.o_ptr = out.data_ptr();
+  // All stride are in elements, not bytes.
+  params.q_row_stride = q.stride(-3);
+  params.k_row_stride = k.stride(-3);
+  params.v_row_stride = v.stride(-3);
+  params.o_row_stride = out.stride(-3);
+  params.q_head_stride = q.stride(-2);
+  params.k_head_stride = k.stride(-2);
+  params.v_head_stride = v.stride(-2);
+  params.o_head_stride = out.stride(-2);
+  params.v_dim_stride = v.stride(-1);
+
+  if (seq_offsets == nullptr) {
+    params.q_batch_stride = q.stride(0);
+    params.k_batch_stride = k.stride(0);
+    params.v_batch_stride = v.stride(0);
+    params.o_batch_stride = out.stride(0);
+  }
+
+  params.seq_offsets = static_cast<int*>(seq_offsets);
+  params.seq_offsets_q = static_cast<int*>(seq_offsets_q);
+  params.num_targets = static_cast<int*>(num_targets);
+  params.attn_scale = static_cast<float*>(attn_scale);
+  params.softmax_lse = static_cast<float*>(softmax_lse);
+  params.max_seq_len_tensor = static_cast<int*>(max_seq_len_tensor);
+  params.contextual_seq_len_tensor =
+      static_cast<int*>(contextual_seq_len_tensor);
+  params.max_attn_len_tensor = static_cast<int*>(max_attn_len_tensor);
+  params.min_full_attn_seq_len_tensor =
+      static_cast<int*>(min_full_attn_seq_len_tensor);
+  params.num_groups = num_groups;
+  params.batch_size_per_group = b / num_groups;
+
+  // Set the dimensions.
+  params.b = b;
+  params.h = h;
+  params.total_seq_len_q = total_seq_len_q;
+  params.total_seq_len_kv = total_seq_len_kv;
+  params.max_kv_len = max_seq_len;
+  params.max_q_len = max_q_len;
+  params.qk_d = qk_d;
+  params.v_d = v_d;
+
+  params.alpha = alpha;
+
+  // Note: when num_groups > 1, max_attn_len, contextual_seq_len,
+  // min_full_attn_seq_len represent the max value in the tensor.
+  params.is_local = max_attn_len > 0;
+  params.is_causal = causal && (!params.is_local);
+  params.has_contexual_mask = contextual_seq_len > 0;
+  params.scalar_scale = scalar_scale;
+  params.num_softmax_heads = num_softmax_heads;
+  params.training = training;
+
+  params.max_attn_len = max_attn_len;
+  params.min_full_attn_seq_len = min_full_attn_seq_len;
+  params.contextual_seq_len = contextual_seq_len;
+
+  params.arch = at::cuda::getCurrentDeviceProperties()->major * 10 +
+      at::cuda::getCurrentDeviceProperties()->minor;
+  params.num_sm =
+      at::cuda::getCurrentDeviceProperties()->multiProcessorCount - sm_margin;
+
+#ifdef FLASHATTENTION_DISABLE_LOCAL
+  TORCH_CHECK(
+      !params.is_local,
+      "This flash attention build does not support local attention.");
+#endif
+}
+
+void set_params_dgrad(
+    hstu::Flash_bwd_params& params,
+    // sizes
+    const size_t b,
+    const size_t total_seq_len_kv,
+    const size_t total_seq_len_q,
+    const size_t max_seq_len,
+    const size_t max_q_len,
+    const size_t max_q_len_rounded,
+    const size_t h,
+    const size_t qk_d,
+    const size_t v_d,
+    const size_t qk_d_rounded,
+    const size_t v_d_rounded,
+    // device pointers
+    const at::Tensor& q,
+    const at::Tensor& k,
+    const at::Tensor& v,
+    const at::Tensor& out,
+    const at::Tensor& dout,
+    const at::Tensor& dq,
+    const at::Tensor& dk,
+    const at::Tensor& dv,
+    void* dq_accum_d,
+    void* seq_offsets,
+    void* num_targets,
+    void* attn_scale,
+    void* sort_by_length_indices,
+    void* seq_offsets_q,
+    void* softmax_lse,
+    void* softmax_d,
+    void* softmax_lse_log2,
+    void* max_seq_len_tensor,
+    void* contextual_seq_len_tensor,
+    void* max_attn_len_tensor,
+    void* min_full_attn_seq_len_tensor,
+    const int num_groups,
+    const bool scalar_scale,
+    const bool causal,
+    const float alpha,
+    const int max_attn_len,
+    const int min_full_attn_seq_len,
+    const int contextual_seq_len,
+    const int num_softmax_heads,
+    bool deterministic = false,
+    int const sm_margin = 0) {
+  hstu::set_params_fprop(
+      params,
+      b,
+      total_seq_len_kv,
+      total_seq_len_q,
+      max_seq_len,
+      max_q_len,
+      h,
+      qk_d,
+      v_d,
+      q,
+      k,
+      v,
+      out,
+      seq_offsets,
+      num_targets,
+      attn_scale,
+      seq_offsets_q,
+      softmax_lse,
+      max_seq_len_tensor,
+      contextual_seq_len_tensor,
+      max_attn_len_tensor,
+      min_full_attn_seq_len_tensor,
+      num_groups,
+      causal,
+      alpha,
+      scalar_scale,
+      max_attn_len,
+      min_full_attn_seq_len,
+      contextual_seq_len,
+      num_softmax_heads,
+      false /* training */,
+      sm_margin);
+
+  // Set the pointers and strides.
+  params.do_ptr = dout.data_ptr();
+  params.do_row_stride = dout.stride(-3);
+  params.do_head_stride = dout.stride(-2);
+  params.dq_ptr = dq.data_ptr();
+  params.dk_ptr = dk.data_ptr();
+  params.dv_ptr = dv.data_ptr();
+  params.dq_row_stride = dq.stride(-3);
+  params.dk_row_stride = dk.stride(-3);
+  params.dv_row_stride = dv.stride(-3);
+  params.dq_head_stride = dq.stride(-2);
+  params.dk_head_stride = dk.stride(-2);
+  params.dv_head_stride = dv.stride(-2);
+
+  params.qk_d_rounded = qk_d_rounded;
+  params.v_d_rounded = v_d_rounded;
+  params.max_q_len_rounded = max_q_len_rounded;
+
+  params.sort_by_length_indices = static_cast<int*>(sort_by_length_indices);
+
+  if (seq_offsets == nullptr) {
+    params.do_batch_stride = dout.stride(0);
+    params.dq_batch_stride = dq.stride(0);
+    params.dk_batch_stride = dk.stride(0);
+    params.dv_batch_stride = dv.stride(0);
+  }
+  params.dq_accum_ptr = dq_accum_d;
+  params.softmax_lse_log2 = static_cast<float*>(softmax_lse_log2);
+  params.softmax_d = static_cast<float*>(softmax_d);
+  params.deterministic = deterministic;
+}
+
+void run_mha_fwd(hstu::Flash_fwd_params& params, cudaStream_t stream) {
+  // HEADDIM_SWITCH(params.d, [&] {
+  //     hstu::run_mha_fwd_<cutlass::half_t, kHeadSize>(params, stream);
+  // });
+  ARCH_SWITCH(params.arch, Arch, [&] {
+    BOOL_SWITCH(params.num_softmax_heads == params.h, Softmax, [&] {
+      if (!params.is_e4m3) {
+        if (params.is_bf16) {
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+          if (params.qk_d <= 64) {
+            return hstu::run_mha_fwd_<Arch, cutlass::bfloat16_t, 64, Softmax>(
+                params, stream);
+          }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+          if (params.qk_d <= 96) {
+            return hstu::run_mha_fwd_<Arch, cutlass::bfloat16_t, 96, Softmax>(
+                params, stream);
+          }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+          if (params.qk_d <= 128) {
+            return hstu::run_mha_fwd_<Arch, cutlass::bfloat16_t, 128, Softmax>(
+                params, stream);
+          }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+          if (params.qk_d <= 192) {
+            return hstu::run_mha_fwd_<Arch, cutlass::bfloat16_t, 192, Softmax>(
+                params, stream);
+          }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+          if (params.qk_d <= 256) {
+            return hstu::run_mha_fwd_<Arch, cutlass::bfloat16_t, 256, Softmax>(
+                params, stream);
+          }
+#endif
+        } else {
+#ifndef FLASHATTENTION_DISABLE_FP16
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+          if (params.qk_d <= 64) {
+            return hstu::run_mha_fwd_<Arch, cutlass::half_t, 64, Softmax>(
+                params, stream);
+          }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+          if (params.qk_d <= 96) {
+            return hstu::run_mha_fwd_<Arch, cutlass::half_t, 96, Softmax>(
+                params, stream);
+          }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+          if (params.qk_d <= 128) {
+            return hstu::run_mha_fwd_<Arch, cutlass::half_t, 128, Softmax>(
+                params, stream);
+          }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+          if (params.qk_d <= 192) {
+            return hstu::run_mha_fwd_<Arch, cutlass::half_t, 192, Softmax>(
+                params, stream);
+          }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+          if (params.qk_d <= 256) {
+            return hstu::run_mha_fwd_<Arch, cutlass::half_t, 256, Softmax>(
+                params, stream);
+          }
+#endif
+#else
+                                TORCH_CHECK(false, "This flash attention build does not support FP16.");
+#endif
+        }
+      } else {
+#ifndef FLASHATTENTION_DISABLE_FP8
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+        if (params.qk_d <= 64) {
+          return hstu::run_mha_fwd_<90, cutlass::float_e4m3_t, 64, Softmax>(
+              params, stream);
+        }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+        if (params.qk_d <= 96) {
+          return hstu::run_mha_fwd_<90, cutlass::float_e4m3_t, 96, Softmax>(
+              params, stream);
+        }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+        if (params.qk_d <= 128) {
+          return hstu::run_mha_fwd_<90, cutlass::float_e4m3_t, 128, Softmax>(
+              params, stream);
+        }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+        if (params.qk_d <= 192) {
+          return hstu::run_mha_fwd_<90, cutlass::float_e4m3_t, 192, Softmax>(
+              params, stream);
+        }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+        if (params.qk_d <= 256) {
+          return hstu::run_mha_fwd_<90, cutlass::float_e4m3_t, 256, Softmax>(
+              params, stream);
+        }
+#endif
+#else
+                            TORCH_CHECK(false, "This flash attention build does not support FP8.");
+#endif
+      }
+    });
+  });
+}
+
+std::tuple<at::Tensor, std::optional<at::Tensor>> hstu_mha_fwd(
+    int64_t max_seq_len,
+    double alpha,
+    at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
+    const std::optional<at::Tensor>& seq_offsets,
+    bool causal,
+    const std::optional<at::Tensor>& num_targets,
+    const std::optional<at::Tensor>& attn_scale,
+    int64_t max_attn_len,
+    int64_t min_full_attn_seq_len,
+    int64_t contextual_seq_len,
+    const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
+    const std::optional<at::Tensor>& k_descale, // (b, h_k)
+    const std::optional<at::Tensor>& v_descale, // (b, h_k)
+    const int64_t sm_margin,
+    int64_t max_q_len,
+    const std::optional<at::Tensor>& seq_offsets_q,
+    int64_t num_softmax_heads,
+    bool training,
+    const std::optional<at::Tensor>& max_seq_len_tensor,
+    const std::optional<at::Tensor>& contextual_seq_len_tensor,
+    const std::optional<at::Tensor>& max_attn_len_tensor,
+    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor,
+    int64_t num_groups) {
+  auto dprops = at::cuda::getCurrentDeviceProperties();
+  bool is_sm9x = dprops->major >= 9;
+  TORCH_CHECK(is_sm9x, "HSTU Attention only supports Hopper GPUs or newer.");
+
+  q = switch_to_contiguous_if_needed(q);
+  k = switch_to_contiguous_if_needed(k);
+  v = switch_to_contiguous_if_needed(v);
+
+  auto q_type = q.scalar_type();
+  TORCH_CHECK(
+      q_type == at::ScalarType::Half || q_type == at::ScalarType::BFloat16 ||
+          q_type == at::ScalarType::Float8_e4m3fn,
+      "FlashAttention only supports fp16, bf16, and fp8_e4m3 data type");
+  if (dprops->major < 9) {
+    TORCH_CHECK(
+        q_type == at::ScalarType::Half || q_type == at::ScalarType::BFloat16,
+        "FlashAttention on Ampere/Ada cards only supports fp16 and bf16 data type");
+  }
+  TORCH_CHECK(
+      k.scalar_type() == q_type, "query and key must have the same dtype");
+  TORCH_CHECK(
+      v.scalar_type() == q_type, "query and value must have the same dtype");
+
+  CHECK_DEVICE(q);
+  CHECK_DEVICE(k);
+  CHECK_DEVICE(v);
+
+  TORCH_CHECK(
+      q.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+  TORCH_CHECK(
+      k.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+  TORCH_CHECK(
+      v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+
+  at::Tensor seq_offsets_;
+  bool const is_jagged = seq_offsets.has_value();
+  if (is_jagged) {
+    seq_offsets_ = seq_offsets.value();
+    CHECK_DEVICE(seq_offsets_);
+    CHECK_CONTIGUOUS(seq_offsets_);
+    TORCH_CHECK(
+        seq_offsets_.dtype() == torch::kInt32,
+        "seq_offsets_ must have dtype torch.int32");
+  }
+  at::Tensor num_targets_;
+  bool const has_multiple_targets = num_targets.has_value();
+  if (has_multiple_targets) {
+    num_targets_ = num_targets.value();
+    CHECK_DEVICE(num_targets_);
+    CHECK_CONTIGUOUS(num_targets_);
+    TORCH_CHECK(
+        num_targets_.dtype() == torch::kInt32,
+        "num_targets_ must have dtype torch.int32");
+  }
+  at::Tensor seq_offsets_q_;
+  bool const is_cross_attn = seq_offsets_q.has_value();
+  if (is_cross_attn) {
+    seq_offsets_q_ = seq_offsets_q.value();
+    CHECK_DEVICE(seq_offsets_q_);
+    CHECK_CONTIGUOUS(seq_offsets_q_);
+    TORCH_CHECK(
+        seq_offsets_q_.dtype() == torch::kInt32,
+        "seq_offsets_q_ must have dtype torch.int32");
+  } else {
+    max_q_len = max_seq_len;
+  }
+  at::Tensor attn_scale_;
+  bool scalar_scale = true;
+  bool const has_attn_scale = attn_scale.has_value();
+  if (has_attn_scale) {
+    attn_scale_ = attn_scale.value();
+    scalar_scale = attn_scale_.numel() == num_groups;
+    CHECK_DEVICE(attn_scale_);
+    TORCH_CHECK(
+        attn_scale_.dtype() == torch::kFloat32,
+        "attn_scale_ must have dtype torch.float32");
+  }
+  at::Tensor max_seq_len_tensor_;
+  at::Tensor contextual_seq_len_tensor_;
+  at::Tensor max_attn_len_tensor_;
+  at::Tensor min_full_attn_seq_len_tensor_;
+  if (num_groups > 1) {
+    TORCH_CHECK(
+        max_seq_len_tensor.has_value(),
+        "max_seq_len_tensor cannot be empty for num_groups > 1.");
+    max_seq_len_tensor_ = max_seq_len_tensor.value();
+    CHECK_DEVICE(max_seq_len_tensor_);
+    TORCH_CHECK(max_seq_len_tensor_.dtype() == torch::kInt32);
+    if (!is_cross_attn) {
+      TORCH_CHECK(
+          contextual_seq_len_tensor.has_value(),
+          "contextual_seq_len_tensor cannot be empty for num_groups > 1 and not cross_attn.");
+      TORCH_CHECK(
+          max_attn_len_tensor.has_value(),
+          "max_attn_len_tensor cannot be empty for num_groups > 1 and not cross_attn.");
+      TORCH_CHECK(
+          min_full_attn_seq_len_tensor.has_value(),
+          "min_full_attn_seq_len_tensor cannot be empty for num_groups > 1 and not cross_attn.");
+      contextual_seq_len_tensor_ = contextual_seq_len_tensor.value();
+      max_attn_len_tensor_ = max_attn_len_tensor.value();
+      min_full_attn_seq_len_tensor_ = min_full_attn_seq_len_tensor.value();
+      CHECK_DEVICE(contextual_seq_len_tensor_);
+      CHECK_DEVICE(max_attn_len_tensor_);
+      CHECK_DEVICE(min_full_attn_seq_len_tensor_);
+      TORCH_CHECK(contextual_seq_len_tensor_.dtype() == torch::kInt32);
+      TORCH_CHECK(max_attn_len_tensor_.dtype() == torch::kInt32);
+      TORCH_CHECK(min_full_attn_seq_len_tensor_.dtype() == torch::kInt32);
+    }
+  }
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+  if (is_jagged && has_multiple_targets) {
+    auto uih_lengths = seq_offsets_.slice(0, 1)
+                           .sub(seq_offsets_.slice(0, 0, -1))
+                           .sub(num_targets_);
+    TORCH_CHECK(
+        (uih_lengths.gt(0)).sum().item<int64_t>() == num_targets_.size(0),
+        "some uih seqlen is 0");
+    TORCH_CHECK(
+        (uih_lengths.greater_equal(contextual_seq_len)).sum().item<int64_t>() ==
+            num_targets_.size(0),
+        "some uih seqlen is less than contextual_seq_len");
+  }
+#endif
+  TORCH_CHECK(
+      q.size(-1) == k.size(-1) && k.size(-1) == v.size(-1),
+      "only attndim == hidden_dim is supported");
+
+  auto const sizes_q = q.sizes();
+  auto const sizes_k = k.sizes();
+  const int batch_size = !is_jagged ? sizes_q[0] : seq_offsets_.size(0) - 1;
+  TORCH_CHECK(
+      batch_size % num_groups == 0, "batch_size not divisible by num_groups");
+  int total_seq_len_q = !is_jagged ? batch_size * max_q_len : sizes_q[0];
+  int total_seq_len_kv = !is_jagged ? batch_size * max_seq_len : sizes_k[0];
+  int num_heads = q.size(-2);
+  int const qk_head_size = q.size(-1);
+  int const v_head_size = v.size(-1);
+  int const max_headdim = get_max_headdim();
+  TORCH_CHECK(
+      qk_head_size <= max_headdim && v_head_size <= max_headdim,
+      "FlashAttention forward only supports head dimension at most " +
+          std::to_string(max_headdim));
+  TORCH_CHECK(max_attn_len >= 0, "max_attn_len must be at least 0");
+  TORCH_CHECK(
+      min_full_attn_seq_len >= 0, "min_full_attn_seq_len must be at least 0");
+  TORCH_CHECK(contextual_seq_len >= 0, "contextual_seq_len must be at least 0");
+  if (max_attn_len > 0) {
+    TORCH_CHECK(
+        min_full_attn_seq_len > 0,
+        "min_full_attn_seq_len=0 not supported when max_attn_len > 0");
+  }
+  TORCH_CHECK(
+      0 == num_softmax_heads || num_softmax_heads == num_heads,
+      "num_softmax_heads must be either 0 or num_heads");
+  if (!is_jagged) {
+    CHECK_SHAPE(q, batch_size, max_q_len, num_heads, qk_head_size);
+    CHECK_SHAPE(k, batch_size, max_seq_len, num_heads, qk_head_size);
+    CHECK_SHAPE(v, batch_size, max_seq_len, num_heads, v_head_size);
+  } else {
+    CHECK_SHAPE(q, total_seq_len_q, num_heads, qk_head_size);
+    CHECK_SHAPE(k, total_seq_len_kv, num_heads, qk_head_size);
+    CHECK_SHAPE(v, total_seq_len_kv, num_heads, v_head_size);
+    CHECK_SHAPE(seq_offsets_, batch_size + 1);
+  }
+  if (has_multiple_targets) {
+    CHECK_SHAPE(num_targets_, batch_size);
+  }
+  if (is_cross_attn) {
+    CHECK_SHAPE(seq_offsets_q_, batch_size + 1);
+  }
+
+  int const alignment = q_type == torch::kFloat8_e4m3fn ? 16 : 8;
+  TORCH_CHECK(
+      qk_head_size % alignment == 0 && v_head_size % alignment == 0,
+      "head_size should be a multiple of " + std::to_string(alignment));
+
+  auto opts = q.options();
+  auto out_type = q_type == at::ScalarType::Float8_e4m3fn
+      ? at::ScalarType::BFloat16
+      : q_type;
+  at::Tensor out;
+  if (!is_jagged) {
+    out = torch::empty(
+        {batch_size, max_q_len, num_heads, v_head_size}, opts.dtype(out_type));
+  } else {
+    out = torch::empty(
+        {total_seq_len_q, num_heads, v_head_size}, opts.dtype(out_type));
+  }
+  std::optional<at::Tensor> softmax_lse = std::nullopt;
+
+  // Early return for empty sequences to avoid TMA descriptor
+  // initialization failure
+  if (total_seq_len_kv == 0 || total_seq_len_q == 0) {
+    return {out, std::nullopt};
+  }
+
+  if (num_softmax_heads > 0) {
+    if (!is_jagged) {
+      softmax_lse = torch::empty(
+          {batch_size, num_softmax_heads, max_q_len}, opts.dtype(at::kFloat));
+    } else {
+      softmax_lse = torch::empty(
+          {num_softmax_heads, total_seq_len_q}, opts.dtype(at::kFloat));
+    }
+  }
+
+  // Otherwise the kernel will be launched from cuda:0 device
+  // Cast to char to avoid compiler warning about narrowing
+  at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+  hstu::Flash_fwd_params params;
+  hstu::set_params_fprop(
+      params,
+      batch_size,
+      total_seq_len_kv,
+      total_seq_len_q,
+      max_seq_len,
+      max_q_len,
+      num_heads,
+      qk_head_size,
+      v_head_size,
+      q,
+      k,
+      v,
+      out,
+      !is_jagged ? nullptr : seq_offsets_.data_ptr(),
+      !has_multiple_targets ? nullptr : num_targets_.data_ptr(),
+      !has_attn_scale ? nullptr : attn_scale_.data_ptr(),
+      !is_cross_attn ? nullptr : seq_offsets_q_.data_ptr(),
+      (num_softmax_heads == 0) ? nullptr : softmax_lse.value().data_ptr(),
+      num_groups > 1 ? max_seq_len_tensor_.data_ptr() : nullptr,
+      ((num_groups > 1) && (!is_cross_attn))
+          ? contextual_seq_len_tensor_.data_ptr()
+          : nullptr,
+      ((num_groups > 1) && (!is_cross_attn)) ? max_attn_len_tensor_.data_ptr()
+                                             : nullptr,
+      ((num_groups > 1) && (!is_cross_attn))
+          ? min_full_attn_seq_len_tensor_.data_ptr()
+          : nullptr,
+      num_groups,
+      causal,
+      alpha,
+      scalar_scale,
+      max_attn_len,
+      min_full_attn_seq_len,
+      contextual_seq_len,
+      num_softmax_heads,
+      training,
+      sm_margin);
+  at::Tensor tile_count_semaphore;
+  // We don't use the persistent scheduler if not jagged
+  bool const persistent_scheduler = params.arch >= 90
+      ? (params.is_causal || params.is_local || is_jagged)
+      : (params.is_causal || is_jagged);
+  if (persistent_scheduler) {
+    tile_count_semaphore = torch::zeros({1}, opts.dtype(torch::kInt32));
+    params.tile_count_semaphore = tile_count_semaphore.data_ptr<int>();
+  } else {
+    params.tile_count_semaphore = nullptr;
+  }
+
+  if (q_type == at::ScalarType::Float8_e4m3fn) {
+    if (q_descale.has_value()) {
+      auto q_descale_ = q_descale.value();
+      CHECK_DEVICE(q_descale_);
+      CHECK_SHAPE(q_descale_, batch_size, num_heads);
+      params.q_descale_ptr = q_descale_.data_ptr<float>();
+      params.q_descale_batch_stride = q_descale_.stride(0);
+      params.q_descale_head_stride = q_descale_.stride(1);
+    } else {
+      params.q_descale_ptr = nullptr;
+    }
+    if (k_descale.has_value()) {
+      auto k_descale_ = k_descale.value();
+      CHECK_DEVICE(k_descale_);
+      CHECK_SHAPE(k_descale_, batch_size, num_heads);
+      params.k_descale_ptr = k_descale_.data_ptr<float>();
+      params.k_descale_batch_stride = k_descale_.stride(0);
+      params.k_descale_head_stride = k_descale_.stride(1);
+    } else {
+      params.k_descale_ptr = nullptr;
+    }
+    if (v_descale.has_value()) {
+      auto v_descale_ = v_descale.value();
+      CHECK_DEVICE(v_descale_);
+      CHECK_SHAPE(v_descale_, batch_size, num_heads);
+      params.v_descale_ptr = v_descale_.data_ptr<float>();
+      params.v_descale_batch_stride = v_descale_.stride(0);
+      params.v_descale_head_stride = v_descale_.stride(1);
+    } else {
+      params.v_descale_ptr = nullptr;
+    }
+  }
+
+#ifdef FLASHATTENTION_DISABLE_LOCAL
+  TORCH_CHECK(
+      !params.is_local,
+      "This flash attention build does not support local attention.");
+#endif
+
+  if (total_seq_len_q > 0 && num_heads > 0) {
+    auto stream = at::cuda::getCurrentCUDAStream().stream();
+    run_mha_fwd(params, stream);
+  }
+  return {out, softmax_lse};
+}
+
+void run_mha_bwd(hstu::Flash_bwd_params& params, cudaStream_t stream) {
+#ifndef FLASHATTENTION_DISABLE_BACKWARD
+  // FP16_SWITCH(!params.is_bf16, [&] {
+  //     HEADDIM_SWITCH(params.d, [&] {
+  //         hstu::run_mha_bwd_<elem_type, kHeadDim>(params, stream);
+  //     });
+  // });
+  ARCH_SWITCH(params.arch, Arch, [&] {
+    BOOL_SWITCH(params.num_softmax_heads == params.h, Softmax, [&] {
+      if (!params.is_bf16) {
+#ifndef FLASHATTENTION_DISABLE_FP16
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+        if (params.qk_d <= 64) {
+          return hstu::run_mha_bwd_<Arch, cutlass::half_t, 64, Softmax>(
+              params, stream);
+        }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+        if (params.qk_d <= 96) {
+          return hstu::run_mha_bwd_<Arch, cutlass::half_t, 96, Softmax>(
+              params, stream);
+        }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+        if (params.qk_d <= 128) {
+          return hstu::run_mha_bwd_<Arch, cutlass::half_t, 128, Softmax>(
+              params, stream);
+        }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+        if (params.qk_d <= 192) {
+          return hstu::run_mha_bwd_<Arch, cutlass::half_t, 192, Softmax>(
+              params, stream);
+        }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+        if (params.qk_d <= 256) {
+          return hstu::run_mha_bwd_<Arch, cutlass::half_t, 256, Softmax>(
+              params, stream);
+        }
+#endif
+#else
+                TORCH_CHECK(false, "This flash attention build does not support FP16.");
+#endif
+      } else {
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+        if (params.qk_d <= 64) {
+          return hstu::run_mha_bwd_<Arch, cutlass::bfloat16_t, 64, Softmax>(
+              params, stream);
+        }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+        if (params.qk_d <= 96) {
+          return hstu::run_mha_bwd_<Arch, cutlass::bfloat16_t, 96, Softmax>(
+              params, stream);
+        }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+        if (params.qk_d <= 128) {
+          return hstu::run_mha_bwd_<Arch, cutlass::bfloat16_t, 128, Softmax>(
+              params, stream);
+        }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+        if (params.qk_d <= 192) {
+          return hstu::run_mha_bwd_<Arch, cutlass::bfloat16_t, 192, Softmax>(
+              params, stream);
+        }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+        if (params.qk_d <= 256) {
+          return hstu::run_mha_bwd_<Arch, cutlass::bfloat16_t, 256, Softmax>(
+              params, stream);
+        }
+#endif
+      }
+    });
+  });
+#endif
+}
+
+std::vector<at::Tensor> hstu_mha_bwd(
+    int64_t max_seq_len,
+    double alpha,
+    at::Tensor& dout,
+    at::Tensor& q,
+    at::Tensor& k,
+    at::Tensor& v,
+    at::Tensor& dq,
+    at::Tensor& dk,
+    at::Tensor& dv,
+    at::Tensor& out,
+    const std::optional<at::Tensor>& seq_offsets,
+    bool causal,
+    const std::optional<at::Tensor>& num_targets,
+    const std::optional<at::Tensor>& attn_scale,
+    int64_t max_attn_len,
+    int64_t min_full_attn_seq_len,
+    int64_t contextual_seq_len,
+    bool sort_by_length,
+    bool const deterministic,
+    const int64_t sm_margin,
+    int64_t max_q_len,
+    const std::optional<at::Tensor>& seq_offsets_q,
+    int64_t num_softmax_heads,
+    const std::optional<at::Tensor>& softmax_lse,
+    const std::optional<at::Tensor>& max_seq_len_tensor,
+    const std::optional<at::Tensor>& contextual_seq_len_tensor,
+    const std::optional<at::Tensor>& max_attn_len_tensor,
+    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor,
+    int64_t num_groups) {
+#ifdef FLASHATTENTION_DISABLE_BACKWARD
+  TORCH_CHECK(false, "This flash attention build does not support backward.");
+#endif
+
+  auto dprops = at::cuda::getCurrentDeviceProperties();
+  bool is_sm9x = dprops->major >= 9;
+  TORCH_CHECK(is_sm9x, "HSTU Attention only supports Hopper GPUs or newer.");
+
+  q = switch_to_contiguous_if_needed(q);
+  k = switch_to_contiguous_if_needed(k);
+  v = switch_to_contiguous_if_needed(v);
+  out = switch_to_contiguous_if_needed(out);
+  dout = switch_to_contiguous_if_needed(dout);
+
+  auto q_type = q.dtype();
+  TORCH_CHECK(
+      q_type == torch::kFloat16 || q_type == torch::kBFloat16,
+      "FlashAttention only support fp16 and bf16 data type");
+  TORCH_CHECK(k.dtype() == q_type, "query and key must have the same dtype");
+  TORCH_CHECK(v.dtype() == q_type, "query and value must have the same dtype");
+  TORCH_CHECK(
+      dout.dtype() == q_type, "query and dout must have the same dtype");
+
+  CHECK_DEVICE(q);
+  CHECK_DEVICE(k);
+  CHECK_DEVICE(v);
+  CHECK_DEVICE(dout);
+
+  TORCH_CHECK(
+      q.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+  TORCH_CHECK(
+      k.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+  TORCH_CHECK(
+      v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+  TORCH_CHECK(
+      dout.stride(-1) == 1, "dout tensor must have contiguous last dimension");
+
+  at::Tensor seq_offsets_;
+  bool const is_jagged = seq_offsets.has_value();
+  if (is_jagged) {
+    seq_offsets_ = seq_offsets.value();
+    CHECK_DEVICE(seq_offsets_);
+    CHECK_CONTIGUOUS(seq_offsets_);
+    TORCH_CHECK(
+        seq_offsets_.dtype() == torch::kInt32,
+        "seq_offsets_ must have dtype torch.int32");
+  }
+  at::Tensor sort_by_length_indices_;
+  if (sort_by_length && is_jagged) {
+    auto seq_lengths =
+        seq_offsets_.slice(0, 1).sub(seq_offsets_.slice(0, 0, -1));
+    std::tuple<torch::Tensor, torch::Tensor> sort_result = torch::sort(
+        seq_lengths, false /*stable*/, 0 /*dim*/, true /*descending*/);
+    sort_by_length_indices_ = std::get<1>(sort_result).to(torch::kInt32);
+    CHECK_DEVICE(sort_by_length_indices_);
+    CHECK_CONTIGUOUS(sort_by_length_indices_);
+    TORCH_CHECK(
+        sort_by_length_indices_.dtype() == torch::kInt32,
+        "sort_by_length_indices_ must have dtype torch.int32");
+  }
+  at::Tensor num_targets_;
+  bool const has_multiple_targets = num_targets.has_value();
+  if (has_multiple_targets) {
+    num_targets_ = num_targets.value();
+    CHECK_DEVICE(num_targets_);
+    CHECK_CONTIGUOUS(num_targets_);
+    TORCH_CHECK(
+        num_targets_.dtype() == torch::kInt32,
+        "num_targets_ must have dtype torch.int32");
+  }
+  at::Tensor attn_scale_;
+  bool scalar_scale = true;
+  bool const has_attn_scale = attn_scale.has_value();
+  if (has_attn_scale) {
+    attn_scale_ = attn_scale.value();
+    scalar_scale = attn_scale_.numel() == num_groups;
+    CHECK_DEVICE(attn_scale_);
+    TORCH_CHECK(
+        attn_scale_.dtype() == torch::kFloat32,
+        "attn_scale_ must have dtype torch.float32");
+  }
+  at::Tensor seq_offsets_q_;
+  bool const is_cross_attn = seq_offsets_q.has_value();
+  if (is_cross_attn) {
+    seq_offsets_q_ = seq_offsets_q.value();
+    CHECK_DEVICE(seq_offsets_q_);
+    CHECK_CONTIGUOUS(seq_offsets_q_);
+    TORCH_CHECK(
+        seq_offsets_q_.dtype() == torch::kInt32,
+        "seq_offsets_q_ must have dtype torch.int32");
+  } else {
+    max_q_len = max_seq_len;
+  }
+  at::Tensor max_seq_len_tensor_;
+  at::Tensor contextual_seq_len_tensor_;
+  at::Tensor max_attn_len_tensor_;
+  at::Tensor min_full_attn_seq_len_tensor_;
+  if (num_groups > 1) {
+    TORCH_CHECK(
+        max_seq_len_tensor.has_value(),
+        "max_seq_len_tensor cannot be empty for num_groups > 1.");
+    max_seq_len_tensor_ = max_seq_len_tensor.value();
+    CHECK_DEVICE(max_seq_len_tensor_);
+    TORCH_CHECK(max_seq_len_tensor_.dtype() == torch::kInt32);
+    if (!is_cross_attn) {
+      TORCH_CHECK(
+          contextual_seq_len_tensor.has_value(),
+          "contextual_seq_len_tensor cannot be empty for num_groups > 1 and not cross_attn.");
+      TORCH_CHECK(
+          max_attn_len_tensor.has_value(),
+          "max_attn_len_tensor cannot be empty for num_groups > 1 and not cross_attn.");
+      TORCH_CHECK(
+          min_full_attn_seq_len_tensor.has_value(),
+          "min_full_attn_seq_len_tensor cannot be empty for num_groups > 1 and not cross_attn.");
+      contextual_seq_len_tensor_ = contextual_seq_len_tensor.value();
+      max_attn_len_tensor_ = max_attn_len_tensor.value();
+      min_full_attn_seq_len_tensor_ = min_full_attn_seq_len_tensor.value();
+      CHECK_DEVICE(contextual_seq_len_tensor_);
+      CHECK_DEVICE(max_attn_len_tensor_);
+      CHECK_DEVICE(min_full_attn_seq_len_tensor_);
+      TORCH_CHECK(contextual_seq_len_tensor_.dtype() == torch::kInt32);
+      TORCH_CHECK(max_attn_len_tensor_.dtype() == torch::kInt32);
+      TORCH_CHECK(min_full_attn_seq_len_tensor_.dtype() == torch::kInt32);
+    }
+  }
+  auto const sizes_q = q.sizes();
+  auto const sizes_kv = k.sizes();
+  int const batch_size = !is_jagged ? sizes_q[0] : seq_offsets_.size(0) - 1;
+  TORCH_CHECK(
+      batch_size % num_groups == 0, "batch_size not divisible by num_groups");
+  if (!is_jagged) {
+    max_seq_len = sizes_kv[1];
+  }
+  int const total_seq_len_q = !is_jagged ? batch_size * sizes_q[1] : sizes_q[0];
+  int const total_seq_len_kv =
+      !is_jagged ? batch_size * sizes_kv[1] : sizes_kv[0];
+  int const num_heads = q.size(-2);
+  int const qk_head_size = q.size(-1);
+  int const v_head_size = v.size(-1);
+  TORCH_CHECK(
+      qk_head_size % 8 == 0 && v_head_size % 8 == 0,
+      "head_size should be a multiple of 8");
+  int const max_headdim = get_max_headdim();
+  TORCH_CHECK(
+      qk_head_size <= max_headdim && v_head_size <= max_headdim,
+      "FlashAttention backward only supports head dimension at most " +
+          std::to_string(max_headdim));
+  TORCH_CHECK(max_attn_len >= 0, "max_attn_len must be at least 0");
+  TORCH_CHECK(
+      min_full_attn_seq_len >= 0, "min_full_attn_seq_len must be at least 0");
+  TORCH_CHECK(contextual_seq_len >= 0, "contextual_seq_len must be at least 0");
+  if (!is_jagged) {
+    CHECK_SHAPE(q, batch_size, max_q_len, num_heads, qk_head_size);
+    CHECK_SHAPE(k, batch_size, max_seq_len, num_heads, qk_head_size);
+    CHECK_SHAPE(v, batch_size, max_seq_len, num_heads, v_head_size);
+    CHECK_SHAPE(dout, batch_size, max_q_len, num_heads, v_head_size);
+    CHECK_SHAPE(dq, batch_size, max_q_len, num_heads, qk_head_size);
+    CHECK_SHAPE(dk, batch_size, max_seq_len, num_heads, qk_head_size);
+    CHECK_SHAPE(dv, batch_size, max_seq_len, num_heads, v_head_size);
+  } else {
+    CHECK_SHAPE(q, total_seq_len_q, num_heads, qk_head_size);
+    CHECK_SHAPE(k, total_seq_len_kv, num_heads, qk_head_size);
+    CHECK_SHAPE(v, total_seq_len_kv, num_heads, v_head_size);
+    CHECK_SHAPE(dout, total_seq_len_q, num_heads, v_head_size);
+    CHECK_SHAPE(dq, total_seq_len_q, num_heads, qk_head_size);
+    CHECK_SHAPE(dk, total_seq_len_kv, num_heads, qk_head_size);
+    CHECK_SHAPE(dv, total_seq_len_kv, num_heads, v_head_size);
+    CHECK_SHAPE(seq_offsets_, batch_size + 1);
+  }
+  if (has_multiple_targets) {
+    CHECK_SHAPE(num_targets_, batch_size);
+  }
+  if (is_cross_attn) {
+    CHECK_SHAPE(seq_offsets_q_, batch_size + 1);
+  }
+  int const arch = at::cuda::getCurrentDeviceProperties()->major * 10 +
+      at::cuda::getCurrentDeviceProperties()->minor;
+  int const qk_head_size_rounded = round_up_headdim(qk_head_size);
+  int const v_head_size_rounded = round_up_headdim(v_head_size);
+  // Very important that these match the kernel configs
+  bool const is_local = max_attn_len > 0;
+  int const kBlockM =
+      hstu::kBlockM_bwd(arch, qk_head_size_rounded, causal, is_local);
+  auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
+  int const max_q_len_rounded = round_multiple(max_q_len, kBlockM);
+  int const total_seq_len_q_padded_rounded =
+      round_multiple(total_seq_len_q + batch_size * kBlockM, kBlockM);
+
+  TORCH_CHECK(dq.dtype() == q_type, "dq must have the same dtype as q");
+  CHECK_DEVICE(dq);
+  TORCH_CHECK(dq.stride(-1) == 1, "dq must have contiguous last dimension");
+  if (!is_jagged) {
+    CHECK_SHAPE(dq, batch_size, max_q_len, num_heads, qk_head_size);
+  } else {
+    CHECK_SHAPE(dq, total_seq_len_q, num_heads, qk_head_size);
+  }
+  TORCH_CHECK(dk.dtype() == q_type, "dk must have the same dtype as q");
+  CHECK_DEVICE(dk);
+  TORCH_CHECK(dk.stride(-1) == 1, "dk must have contiguous last dimension");
+  if (!is_jagged) {
+    CHECK_SHAPE(dk, batch_size, max_seq_len, num_heads, qk_head_size);
+  } else {
+    CHECK_SHAPE(dk, total_seq_len_kv, num_heads, qk_head_size);
+  }
+  TORCH_CHECK(dv.dtype() == q_type, "dv must have the same dtype as q");
+  CHECK_DEVICE(dv);
+  TORCH_CHECK(dv.stride(-1) == 1, "dv must have contiguous last dimension");
+  if (!is_jagged) {
+    CHECK_SHAPE(dv, batch_size, max_seq_len, num_heads, v_head_size);
+  } else {
+    CHECK_SHAPE(dv, total_seq_len_kv, num_heads, v_head_size);
+  }
+
+  // Otherwise the kernel will be launched from cuda:0 device
+  // Cast to char to avoid compiler warning about narrowing
+  at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+  auto opts = q.options();
+
+  at::Tensor dq_accum;
+  if (!is_jagged) {
+    dq_accum = torch::empty(
+        {batch_size, num_heads, max_q_len_rounded * qk_head_size_rounded},
+        opts.dtype(at::kFloat));
+  } else {
+    dq_accum = torch::empty(
+        {num_heads, total_seq_len_q_padded_rounded * qk_head_size_rounded},
+        opts.dtype(at::kFloat));
+  }
+  at::Tensor softmax_d, softmax_lse_log2;
+  if (!is_jagged) {
+    // Need softmax_d to have seqlen_q_rounded since we want its address to be
+    // aligned by 16/8 bytes for TMA / LDG.64
+    softmax_d = torch::empty(
+        {batch_size, num_softmax_heads, max_q_len_rounded},
+        opts.dtype(at::kFloat));
+    softmax_lse_log2 = torch::empty(
+        {batch_size, num_softmax_heads, max_q_len_rounded},
+        opts.dtype(at::kFloat));
+  } else {
+    softmax_d = torch::empty(
+        {num_softmax_heads, total_seq_len_q_padded_rounded},
+        opts.dtype(at::kFloat));
+    softmax_lse_log2 = torch::empty(
+        {num_softmax_heads, total_seq_len_q_padded_rounded},
+        opts.dtype(at::kFloat));
+  }
+
+  // Early return for empty sequences; analog to TMA prevention guard
+  // in hstu_mha_fwd
+  if (total_seq_len_kv == 0 || total_seq_len_q == 0) {
+    return {dq, dk, dv};
+  }
+
+  hstu::Flash_bwd_params params;
+  hstu::set_params_dgrad(
+      params,
+      batch_size,
+      total_seq_len_kv,
+      total_seq_len_q,
+      max_seq_len,
+      max_q_len,
+      max_q_len_rounded,
+      num_heads,
+      qk_head_size,
+      v_head_size,
+      qk_head_size_rounded,
+      v_head_size_rounded,
+      q,
+      k,
+      v,
+      out,
+      dout,
+      dq,
+      dk,
+      dv,
+      dq_accum.data_ptr(),
+      !is_jagged ? nullptr : seq_offsets_.data_ptr(),
+      !has_multiple_targets ? nullptr : num_targets_.data_ptr(),
+      !has_attn_scale ? nullptr : attn_scale_.data_ptr(),
+      !(sort_by_length && is_jagged) ? nullptr
+                                     : sort_by_length_indices_.data_ptr(),
+      !is_cross_attn ? nullptr : seq_offsets_q_.data_ptr(),
+      num_softmax_heads == 0 ? nullptr : softmax_lse.value().data_ptr(),
+      num_softmax_heads == 0 ? nullptr : softmax_d.data_ptr(),
+      num_softmax_heads == 0 ? nullptr : softmax_lse_log2.data_ptr(),
+      num_groups > 1 ? max_seq_len_tensor_.data_ptr() : nullptr,
+      ((num_groups > 1) && (!is_cross_attn))
+          ? contextual_seq_len_tensor_.data_ptr()
+          : nullptr,
+      ((num_groups > 1) && (!is_cross_attn)) ? max_attn_len_tensor_.data_ptr()
+                                             : nullptr,
+      ((num_groups > 1) && (!is_cross_attn))
+          ? min_full_attn_seq_len_tensor_.data_ptr()
+          : nullptr,
+      num_groups,
+      scalar_scale,
+      causal,
+      alpha,
+      max_attn_len,
+      min_full_attn_seq_len,
+      contextual_seq_len,
+      num_softmax_heads,
+      deterministic,
+      sm_margin);
+
+  // auto tile_count_semaphore = (params.is_causal || params.is_local) ?
+  // torch::zeros({1}, opts.dtype(torch::kInt32)) : torch::empty({1},
+  // opts.dtype(torch::kInt32)); params.tile_count_semaphore =
+  // tile_count_semaphore.data_ptr<int>(); Will be zero'ed out in the
+  // backward preprocess kernel
+  at::Tensor dq_semaphore = torch::empty(
+      {(max_seq_len + kBlockM - 1) / kBlockM, batch_size, num_heads},
+      opts.dtype(torch::kInt32));
+  params.dq_semaphore = dq_semaphore.data_ptr<int>();
+
+#ifdef FLASHATTENTION_DISABLE_LOCAL
+  TORCH_CHECK(
+      !params.is_local,
+      "This flash attention build does not support local attention.");
+#endif
+
+  if (total_seq_len_q > 0 && num_heads > 0) {
+    auto stream = at::cuda::getCurrentCUDAStream().stream();
+    run_mha_bwd(params, stream);
+  }
+  return {dq, dk, dv};
+}
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common.h
new file mode 100644
index 000000000..98ca009f5
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common.h
@@ -0,0 +1,149 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+// Include these 2 headers instead of torch/extension.h since we don't need all
+// of the torch headers.
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <c10/util/Optional.h>
+
+#include <torch/nn/functional.h>
+
+#include <torch/library.h> // @manual
+
+#define CHECK_DEVICE(x) TORCH_CHECK(x.is_cuda(), #x " must be on CUDA")
+#define CHECK_SHAPE(x, ...)                           \
+  TORCH_CHECK(                                        \
+      x.sizes() == torch::IntArrayRef({__VA_ARGS__}), \
+      #x " must have shape (" #__VA_ARGS__ ")")
+#define CHECK_CONTIGUOUS(x) \
+  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+
+inline int round_up_headdim(int head_size) {
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+  if (head_size <= 64) {
+    return 64;
+  }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+  if (head_size <= 96) {
+    return 96;
+  }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+  if (head_size <= 128) {
+    return 128;
+  }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+  if (head_size <= 192) {
+    return 192;
+  }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+  if (head_size <= 256) {
+    return 256;
+  }
+#endif
+  return 256;
+}
+
+inline int get_max_headdim() {
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+  return 256;
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+  return 192;
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+  return 128;
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+  return 96;
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+  return 64;
+#endif
+  return 0;
+}
+
+namespace hstu {
+
+std::tuple<at::Tensor, std::optional<at::Tensor>> hstu_mha_fwd(
+    int64_t max_seq_len,
+    double alpha,
+    at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
+    const std::optional<at::Tensor>& seq_offsets,
+    bool causal,
+    const std::optional<at::Tensor>& num_targets,
+    const std::optional<at::Tensor>& attn_scale,
+    int64_t max_attn_len,
+    int64_t min_full_attn_seq_len,
+    int64_t contextual_seq_len,
+    const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
+    const std::optional<at::Tensor>& k_descale, // (b, h_k)
+    const std::optional<at::Tensor>& v_descale, // (b, h_k)
+    const int64_t sm_margin = 0,
+    int64_t max_q_len = 0,
+    const std::optional<at::Tensor>& seq_offsets_q = std::nullopt,
+    int64_t num_softmax_heads = 0,
+    bool training = true,
+    const std::optional<at::Tensor>& max_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& contextual_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& max_attn_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor =
+        std::nullopt,
+    int64_t num_groups = 1);
+
+std::vector<at::Tensor> hstu_mha_bwd(
+    int64_t max_seq_len,
+    double alpha,
+    at::Tensor& dout,
+    at::Tensor& q,
+    at::Tensor& k,
+    at::Tensor& v,
+    at::Tensor& dq,
+    at::Tensor& dk,
+    at::Tensor& dv,
+    at::Tensor& out,
+    const std::optional<at::Tensor>& seq_offsets,
+    bool causal,
+    const std::optional<at::Tensor>& num_targets,
+    const std::optional<at::Tensor>& attn_scale,
+    int64_t max_attn_len,
+    int64_t min_full_attn_seq_len,
+    int64_t contextual_seq_len,
+    bool sort_by_length,
+    bool const deterministic,
+    const int64_t sm_margin = 0,
+    int64_t max_q_len = 0,
+    const std::optional<at::Tensor>& seq_offsets_q = std::nullopt,
+    int64_t num_softmax_heads = 0,
+    const std::optional<at::Tensor>& softmax_lse = std::nullopt,
+    const std::optional<at::Tensor>& max_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& contextual_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& max_attn_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor =
+        std::nullopt,
+    int64_t num_groups = 1);
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common_cpu.cpp b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common_cpu.cpp
new file mode 100644
index 000000000..30d4f792c
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common_cpu.cpp
@@ -0,0 +1,172 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#include <c10/util/Optional.h>
+#include <torch/nn/functional.h>
+
+#include "flash_common_cpu.h"
+
+namespace hstu {
+
+std::tuple<at::Tensor, std::optional<at::Tensor>> hstu_mha_fwd_meta(
+    const at::SymInt max_seq_len,
+    double alpha,
+    at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
+    const std::optional<at::Tensor>& seq_offsets,
+    bool causal,
+    const std::optional<at::Tensor>& num_targets,
+    const std::optional<at::Tensor>& attn_scale,
+    int64_t max_attn_len,
+    int64_t min_full_attn_seq_len,
+    int64_t contextual_seq_len,
+    const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
+    const std::optional<at::Tensor>& k_descale, // (b, h_k)
+    const std::optional<at::Tensor>& v_descale, // (b, h_k)
+    const int64_t sm_margin,
+    int64_t max_q_len,
+    const std::optional<at::Tensor>& seq_offsets_q,
+    int64_t num_softmax_heads,
+    bool training,
+    const std::optional<at::Tensor>& max_seq_len_tensor,
+    const std::optional<at::Tensor>& contextual_seq_len_tensor,
+    const std::optional<at::Tensor>& max_attn_len_tensor,
+    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor,
+    int64_t num_groups) {
+  auto q_type = q.scalar_type();
+  auto const sizes = q.sym_sizes();
+  at::Tensor seq_offsets_;
+  bool const is_jagged = seq_offsets.has_value();
+  if (is_jagged) {
+    seq_offsets_ = seq_offsets.value();
+  }
+  const c10::SymInt batch_size =
+      !is_jagged ? sizes[0] : seq_offsets_.sym_sizes()[0] - 1;
+  auto total_seq_len = !is_jagged ? batch_size * max_seq_len : sizes[0];
+  const auto& num_heads = sizes[sizes.size() - 2];
+  auto v_head_size = v.sym_sizes()[v.sym_sizes().size() - 1];
+  auto out_type = q_type == at::ScalarType::Float8_e4m3fn
+      ? at::ScalarType::BFloat16
+      : q_type;
+  auto opts = q.options();
+
+  at::Tensor out;
+  if (!is_jagged) {
+    out = at::empty_symint(
+        {batch_size, max_seq_len, num_heads, v_head_size},
+        opts.dtype(out_type));
+  } else {
+    out = at::empty_symint(
+        {total_seq_len, num_heads, v_head_size}, opts.dtype(out_type));
+  }
+  return {out, std::nullopt};
+};
+
+std::tuple<at::Tensor, std::optional<at::Tensor>> hstu_mha_fwd_dummy(
+    int64_t max_seq_len,
+    double alpha,
+    at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
+    const std::optional<at::Tensor>& seq_offsets,
+    bool causal,
+    const std::optional<at::Tensor>& num_targets,
+    const std::optional<at::Tensor>& attn_scale,
+    int64_t max_attn_len,
+    int64_t min_full_attn_seq_len,
+    int64_t contextual_seq_len,
+    const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
+    const std::optional<at::Tensor>& k_descale, // (b, h_k)
+    const std::optional<at::Tensor>& v_descale, // (b, h_k)
+    const int64_t sm_margin,
+    const int64_t max_q_len,
+    const std::optional<at::Tensor>& seq_offsets_q,
+    int64_t num_softmax_heads,
+    bool training,
+    const std::optional<at::Tensor>& max_seq_len_tensor,
+    const std::optional<at::Tensor>& contextual_seq_len_tensor,
+    const std::optional<at::Tensor>& max_attn_len_tensor,
+    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor,
+    int64_t num_groups) {
+  auto q_type = q.scalar_type();
+  auto const sizes = q.sizes();
+  at::Tensor seq_offsets_;
+  bool const is_jagged = seq_offsets.has_value();
+  if (is_jagged) {
+    seq_offsets_ = seq_offsets.value();
+  }
+  const int batch_size = !is_jagged ? sizes[0] : seq_offsets_.size(0) - 1;
+  int total_seq_len = !is_jagged ? batch_size * max_seq_len : sizes[0];
+  int num_heads = q.size(-2);
+  //   int const qk_head_size = q.size(-1);
+  int const v_head_size = v.size(-1);
+  //   int const max_headdim = get_max_headdim();
+  auto out_type = q_type == at::ScalarType::Float8_e4m3fn
+      ? at::ScalarType::BFloat16
+      : q_type;
+  auto opts = q.options();
+
+  at::Tensor out;
+  if (!is_jagged) {
+    out = torch::empty(
+        {batch_size, max_seq_len, num_heads, v_head_size},
+        opts.dtype(out_type));
+  } else {
+    out = torch::empty(
+        {total_seq_len, num_heads, v_head_size}, opts.dtype(out_type));
+  }
+  return {out, std::nullopt};
+};
+
+std::vector<at::Tensor> hstu_mha_bwd_dummy(
+    int64_t max_seq_len,
+    double alpha,
+    at::Tensor& dout,
+    at::Tensor& q,
+    at::Tensor& k,
+    at::Tensor& v,
+    at::Tensor& dq,
+    at::Tensor& dk,
+    at::Tensor& dv,
+    at::Tensor& out,
+    const std::optional<at::Tensor>& seq_offsets,
+    bool causal,
+    const std::optional<at::Tensor>& num_targets,
+    const std::optional<at::Tensor>& attn_scale,
+    int64_t max_attn_len,
+    int64_t min_full_attn_seq_len,
+    int64_t contextual_seq_len,
+    bool sort_by_length,
+    bool const deterministic,
+    const int64_t sm_margin,
+    const int64_t max_q_len,
+    const std::optional<at::Tensor>& seq_offsets_q,
+    int64_t num_softmax_heads,
+    const std::optional<at::Tensor>& softmax_lse,
+    const std::optional<at::Tensor>& max_seq_len_tensor,
+    const std::optional<at::Tensor>& contextual_seq_len_tensor,
+    const std::optional<at::Tensor>& max_attn_len_tensor,
+    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor,
+    int64_t num_groups) {
+  return {dq, dk, dv};
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common_cpu.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common_cpu.h
new file mode 100644
index 000000000..9d0e18a71
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common_cpu.h
@@ -0,0 +1,114 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#include <c10/util/Optional.h>
+#include <torch/library.h> // @manual
+#include <torch/nn/functional.h>
+
+namespace hstu {
+
+std::tuple<at::Tensor, std::optional<at::Tensor>> hstu_mha_fwd_dummy(
+    int64_t max_seq_len,
+    double alpha,
+    at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
+    const std::optional<at::Tensor>& seq_offsets,
+    bool causal,
+    const std::optional<at::Tensor>& num_targets,
+    const std::optional<at::Tensor>& attn_scale,
+    int64_t max_attn_len,
+    int64_t min_full_attn_seq_len,
+    int64_t contextual_seq_len,
+    const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
+    const std::optional<at::Tensor>& k_descale, // (b, h_k)
+    const std::optional<at::Tensor>& v_descale, // (b, h_k)
+    const int64_t sm_margin = 0,
+    int64_t max_q_len = 0,
+    const std::optional<at::Tensor>& seq_offsets_q = std::nullopt,
+    int64_t num_softmax_heads = 0,
+    bool training = true,
+    const std::optional<at::Tensor>& max_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& contextual_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& max_attn_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor =
+        std::nullopt,
+    int64_t num_groups = 1);
+
+std::vector<at::Tensor> hstu_mha_bwd_dummy(
+    int64_t max_seq_len,
+    double alpha,
+    at::Tensor& dout,
+    at::Tensor& q,
+    at::Tensor& k,
+    at::Tensor& v,
+    at::Tensor& dq,
+    at::Tensor& dk,
+    at::Tensor& dv,
+    at::Tensor& out,
+    const std::optional<at::Tensor>& seq_offsets,
+    bool causal,
+    const std::optional<at::Tensor>& num_targets,
+    const std::optional<at::Tensor>& attn_scale,
+    int64_t max_attn_len,
+    int64_t min_full_attn_seq_len,
+    int64_t contextual_seq_len,
+    bool sort_by_length,
+    bool const deterministic,
+    const int64_t sm_margin = 0,
+    int64_t max_q_len = 0,
+    const std::optional<at::Tensor>& seq_offsets_q = std::nullopt,
+    int64_t num_softmax_heads = 0,
+    const std::optional<at::Tensor>& softmax_lse = std::nullopt,
+    const std::optional<at::Tensor>& max_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& contextual_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& max_attn_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor =
+        std::nullopt,
+    int64_t num_groups = 1);
+
+std::tuple<at::Tensor, std::optional<at::Tensor>> hstu_mha_fwd_meta(
+    const at::SymInt max_seq_len,
+    double alpha,
+    at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
+    const std::optional<at::Tensor>& seq_offsets,
+    bool causal,
+    const std::optional<at::Tensor>& num_targets,
+    const std::optional<at::Tensor>& attn_scale,
+    int64_t max_attn_len,
+    int64_t min_full_attn_seq_len,
+    int64_t contextual_seq_len,
+    const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
+    const std::optional<at::Tensor>& k_descale, // (b, h_k)
+    const std::optional<at::Tensor>& v_descale, // (b, h_k)
+    const int64_t sm_margin = 0,
+    int64_t max_q_len = 0,
+    const std::optional<at::Tensor>& seq_offsets_q = std::nullopt,
+    int64_t num_softmax_heads = 0,
+    bool training = true,
+    const std::optional<at::Tensor>& max_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& contextual_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& max_attn_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor =
+        std::nullopt,
+    int64_t num_groups = 1);
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_fwd_kernel_sm90.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_fwd_kernel_sm90.h
new file mode 100644
index 000000000..2e3d0916b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_fwd_kernel_sm90.h
@@ -0,0 +1,511 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include "cute/tensor.hpp"
+
+#include <cutlass/arch/reg_reconfig.h>
+#include <cutlass/array.h>
+#include <cutlass/cutlass.h>
+#include <cutlass/kernel_hardware_info.h>
+#include <cutlass/numeric_conversion.h>
+#include <cutlass/numeric_types.h>
+#include <cutlass/pipeline/pipeline.hpp>
+
+#include "seqlen.h"
+#include "softmax.h"
+#include "tile_scheduler.h"
+#include "utils.h"
+
+namespace hstu {
+
+using namespace cute;
+
+template <
+    bool Softmax,
+    class CollectiveMainloop_,
+    class CollectiveEpilogue_,
+    class TileScheduler_>
+class FlashAttnFwdSm90 {
+ public:
+  // Type Aliases
+  using CollectiveMainloop = CollectiveMainloop_;
+  using CollectiveEpilogue = CollectiveEpilogue_;
+  static constexpr bool Is_FP8 = CollectiveMainloop::Is_FP8;
+  static constexpr bool Transpose_V = CollectiveMainloop::Transpose_V;
+  static constexpr bool Use_TMA_O = CollectiveEpilogue::Use_TMA_O;
+  static constexpr int NumProducerThreads =
+      CollectiveMainloop::NumProducerThreads;
+  using SeqlenInfo_t = typename CollectiveMainloop::SeqlenInfo_t;
+
+  // Mainloop derived types
+  using TileShape_MNK = typename CollectiveMainloop::TileShape_MNK;
+  using TiledMma0 = typename CollectiveMainloop::TiledMma0;
+  using TiledMma1 = typename CollectiveMainloop::TiledMma1;
+  using ArchTag = typename CollectiveMainloop::ArchTag;
+  using ClusterShape = typename CollectiveMainloop::ClusterShape;
+  using MainloopArguments = typename CollectiveMainloop::Arguments;
+  using MainloopParams = typename CollectiveMainloop::Params;
+  using BarrierQ = cutlass::arch::ClusterTransactionBarrier;
+
+  // Epilogue derived types
+  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
+  using EpilogueParams = typename CollectiveEpilogue::Params;
+
+  static_assert(ArchTag::kMinComputeCapability >= 90);
+
+  using TileScheduler = TileScheduler_;
+  using TileSchedulerArguments = typename hstu::TileSchedulerArguments;
+  using TileSchedulerParams = typename TileScheduler::Params;
+
+  static constexpr uint32_t NumLoadWarpGroups = 1;
+  static constexpr uint32_t NumMmaWarpGroups =
+      CUTE_STATIC_V(size(TiledMma0{})) / cutlass::NumThreadsPerWarpGroup;
+  static constexpr uint32_t MaxThreadsPerBlock =
+      CUTE_STATIC_V(size(TiledMma0{})) +
+      (NumLoadWarpGroups * cutlass::NumThreadsPerWarpGroup);
+  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
+  static_assert(
+      NumMmaWarpGroups == 1 || NumMmaWarpGroups == 2 || NumMmaWarpGroups == 3);
+
+  /// Register requirement for Load and Math WGs
+  // If we use cp.async to load K and V, we need more registers for the producer
+  // WG.
+  static constexpr uint32_t LoadRegisterRequirement =
+      NumMmaWarpGroups == 1 ? 56 : (NumMmaWarpGroups == 2 ? 24 : 32);
+  static constexpr uint32_t MmaRegisterRequirement =
+      NumMmaWarpGroups == 1 ? 256 : (NumMmaWarpGroups == 2 ? 240 : 160);
+  // If you want to print from the producer warp, you'd need to increase the
+  // number of registers Otherwise you'll get CUDA error. static constexpr
+  // uint32_t LoadRegisterRequirement = 40; static constexpr uint32_t
+  // MmaRegisterRequirement = NumMmaWarpGroups == 2 ? 232 : 152;
+
+  // Kernel level shared memory storage
+  // We overlap the shared memory for the mainloop and epilogue. However, we
+  // only want smem_o to overlap with smem_v and nothing else, so we'll pad in
+  // case sizeof(smem_o) > sizeof(smem_v).
+  static constexpr int mainloop_smem_padding_ =
+      int(sizeof(typename CollectiveEpilogue::TensorStorage)) -
+      int(sizeof(
+          decltype((typename CollectiveMainloop::TensorStorage{}).smem_v)));
+  static constexpr int mainloop_smem_padding =
+      mainloop_smem_padding_ < 0 ? 0 : mainloop_smem_padding_;
+  struct SharedStorage {
+    struct TensorStorage : cute::aligned_struct<128> {
+      union {
+        struct {
+          cute::array<uint32_t, mainloop_smem_padding / sizeof(uint32_t)>
+              padding_;
+          typename CollectiveMainloop::TensorStorage mainloop;
+        };
+        // We want smem_o to line up with the start of smem_v
+        typename CollectiveEpilogue::TensorStorage epilogue;
+      };
+    } tensors;
+
+    struct PipelineStorage : cute::aligned_struct<16> {
+      alignas(16) BarrierQ barrier_Q;
+      alignas(16) cutlass::arch::ClusterBarrier barrier_O;
+      alignas(16) typename CollectiveMainloop::MainloopPipelineK::SharedStorage
+          pipeline_k;
+      alignas(16) typename CollectiveMainloop::MainloopPipelineV::SharedStorage
+          pipeline_v;
+      alignas(16) typename CollectiveMainloop::MainloopPipelineVt::SharedStorage
+          pipeline_vt;
+      alignas(16) typename TileScheduler::SharedStorage smem_scheduler;
+    } pipelines;
+  };
+
+  static constexpr int SharedStorageSize = sizeof(SharedStorage);
+
+  // Device side arguments
+  struct Arguments {
+    MainloopArguments mainloop{};
+    EpilogueArguments epilogue{};
+    cutlass::KernelHardwareInfo hw_info{};
+    TileSchedulerArguments scheduler{};
+  };
+
+  // Kernel entry point API
+  struct Params {
+    MainloopParams mainloop{};
+    EpilogueParams epilogue{};
+    cutlass::KernelHardwareInfo hw_info{};
+    TileSchedulerParams scheduler{};
+  };
+
+  //
+  // Methods
+  //
+
+  // Convert to underlying arguments. In this case, a simple copy for the
+  // aliased type.
+  static Params to_underlying_arguments(Arguments const& args) {
+    CUTLASS_TRACE_HOST("to_underlying_arguments():");
+
+    // Get SM count if needed, otherwise use user supplied SM count
+    int sm_count = args.hw_info.sm_count;
+    if (sm_count <= 0) {
+      CUTLASS_TRACE_HOST(
+          "  WARNING: Arguments do not include a valid SM count.\n"
+          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
+      sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
+          args.hw_info.device_id);
+    }
+
+    CUTLASS_TRACE_HOST(
+        "to_underlying_arguments(): Setting persistent grid SM count to "
+        << sm_count);
+
+    cutlass::KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count};
+    return {
+        CollectiveMainloop::to_underlying_arguments(args.mainloop),
+        CollectiveEpilogue::to_underlying_arguments(args.epilogue),
+        hw_info,
+        TileScheduler::to_underlying_arguments(args.scheduler)};
+  }
+
+  // Computes the kernel launch grid shape based on runtime parameters
+  static dim3 get_grid_shape(Params const& params) {
+    return TileScheduler::get_grid_shape(
+        params.scheduler, params.hw_info.sm_count);
+  }
+
+  static dim3 get_block_shape() {
+    return dim3(MaxThreadsPerBlock, 1, 1);
+  }
+
+  CUTLASS_DEVICE
+  void operator()(Params const& params, char* smem_buf) {
+    static constexpr int NumMmaThreads =
+        NumMmaWarpGroups * cutlass::NumThreadsPerWarpGroup;
+    static constexpr int MmaThreadOffset =
+        NumLoadWarpGroups * cutlass::NumThreadsPerWarpGroup;
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+
+    using MainloopPipelineK = typename CollectiveMainloop::MainloopPipelineK;
+    using MainloopPipelineV = typename CollectiveMainloop::MainloopPipelineV;
+    using MainloopPipelineVt = typename CollectiveMainloop::MainloopPipelineVt;
+    using MainloopPipelineKVNew =
+        typename CollectiveMainloop::MainloopPipelineKVNew;
+    using PipelineState = typename CollectiveMainloop::PipelineState;
+    using PipelineParamsK = typename MainloopPipelineK::Params;
+    using PipelineParamsV = typename MainloopPipelineV::Params;
+    using PipelineParamsVt = typename MainloopPipelineVt::Params;
+    using PipelineParamsKVNew = typename MainloopPipelineKVNew::Params;
+
+    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+
+    int const lane_predicate = cute::elect_one_sync();
+    int const warp_idx = cutlass::canonical_warp_idx_sync();
+
+    // Issue Tma Descriptor Prefetch from a single thread
+    if (warp_idx == 0 && lane_predicate) {
+      CollectiveMainloop::prefetch_tma_descriptors(params.mainloop);
+      CollectiveEpilogue::prefetch_tma_descriptors(params.epilogue);
+    }
+
+    // Obtain warp index
+    int const warp_group_thread_idx =
+        threadIdx.x % cutlass::NumThreadsPerWarpGroup;
+    int warp_group_idx = cutlass::canonical_warp_group_idx();
+
+    if (warp_idx == 0 && lane_predicate) {
+      shared_storage.pipelines.barrier_Q.init(1 /*numThreads*/);
+      shared_storage.pipelines.barrier_O.init(
+          size(ClusterShape{}) *
+          (Use_TMA_O ? 1 : NumMmaThreads) /*numThreads*/);
+    }
+
+    // We're counting on pipeline_k to call cutlass::arch::fence_barrier_init();
+    PipelineParamsK pipeline_params_k;
+    pipeline_params_k.role = warp_group_idx == 0
+        ? MainloopPipelineK::ThreadCategory::Producer
+        : MainloopPipelineK::ThreadCategory::Consumer;
+    pipeline_params_k.transaction_bytes =
+        CollectiveMainloop::TmaTransactionBytesK;
+    pipeline_params_k.is_leader = warp_group_thread_idx == 0;
+    pipeline_params_k.num_consumers = NumMmaThreads;
+
+    MainloopPipelineK pipeline_k = [&] {
+      return MainloopPipelineK(
+          shared_storage.pipelines.pipeline_k,
+          pipeline_params_k,
+          ClusterShape{});
+    }();
+    // MainloopPipelineV pipeline_v(shared_storage.pipelines.pipeline_v,
+    // pipeline_params_v, ClusterShape{});
+    MainloopPipelineV pipeline_v = [&] {
+      if constexpr (!Transpose_V) {
+        static_assert(is_same_v<PipelineParamsK, PipelineParamsV>);
+        return MainloopPipelineV(
+            shared_storage.pipelines.pipeline_v,
+            pipeline_params_k,
+            ClusterShape{});
+      } else {
+        PipelineParamsV pipeline_params_v;
+        pipeline_params_v.role = warp_group_idx == 0
+            ? MainloopPipelineV::ThreadCategory::Producer
+            : MainloopPipelineV::ThreadCategory::Consumer;
+        pipeline_params_v.producer_arv_count = NumProducerThreads;
+        pipeline_params_v.consumer_arv_count = NumMmaThreads;
+        return MainloopPipelineV(
+            shared_storage.pipelines.pipeline_v, pipeline_params_v);
+      }
+    }();
+    static_assert(is_same_v<PipelineParamsK, PipelineParamsVt>);
+    // If we need to transpose V (e.g. FP8 and V is row-major), we use
+    // pipeline_vt for the TMA, then the producer WG will read from pipeline_vt
+    // and write to pipeline_v. If we don't need to transpose V, we use
+    // pipeline_v for the TMA, and pipeline_vt won't be used. Technically for
+    // pipeline_params_vt, warp0 of WG0 is the producer and all of WG0 are
+    // consumers. However, the thread role isn't used in the pipeline
+    // implementation.
+    MainloopPipelineVt pipeline_vt = [&] {
+      pipeline_params_k.num_consumers =
+          NumProducerThreads; // TMA_V is only consumed by the producer WG
+      return MainloopPipelineVt(
+          shared_storage.pipelines.pipeline_vt,
+          pipeline_params_k,
+          ClusterShape{});
+    }();
+
+    CollectiveMainloop collective_mainloop;
+    CollectiveEpilogue collective_epilogue;
+
+    // We need this to guarantee that the Pipeline init is visible to all
+    // producers and consumer blocks in the Cluster
+    if constexpr (size(ClusterShape{}) > 1) {
+      cute::cluster_arrive_relaxed();
+      cute::cluster_wait();
+    } else {
+      __syncthreads();
+    }
+
+    if (warp_group_idx == 0) { // Producer
+      cutlass::arch::warpgroup_reg_dealloc<LoadRegisterRequirement>();
+
+      // The pipelines for AppendKV and main attention are different, since e.g.
+      // main attention might use cp.async to load KV (if PagedKV) while
+      // AppendKV always uses TMA to load KV_new. Since the pipeline states are
+      // different, we have to manually sync to make sure the two pipelines
+      // don't race when accessing smem_k and smem_v.
+      PipelineState smem_pipe_write =
+          cutlass::make_producer_start_state<MainloopPipelineK>();
+      PipelineState smem_pipe_write_new =
+          cutlass::make_producer_start_state<MainloopPipelineKVNew>();
+      int work_idx = 0;
+
+      TileScheduler scheduler(
+          reinterpret_cast<typename TileScheduler::SharedStorage*>(
+              &shared_storage.pipelines.smem_scheduler));
+      int warp_idx_in_warpgroup =
+          __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0);
+      static constexpr bool SingleProducerWarp =
+          NumProducerThreads == cutlass::NumThreadsPerWarp;
+      if constexpr (SingleProducerWarp) {
+        if (warp_idx_in_warpgroup != 0) {
+          return;
+        }
+      }
+      if (!SingleProducerWarp && warp_idx_in_warpgroup != 0) {
+        scheduler.init_consumer();
+      }
+
+      // Load Q, K, V
+      for (auto work_tile_info = SingleProducerWarp ||
+                   warp_idx_in_warpgroup == 0
+               ? scheduler.template get_initial_work</*IsProducerWarp=*/true>(
+                     params.scheduler)
+               : scheduler.template get_initial_work</*IsProducerWarp=*/false>(
+                     params.scheduler);
+           work_tile_info.is_valid(params.scheduler);
+           work_tile_info = SingleProducerWarp || warp_idx_in_warpgroup == 0
+               ? scheduler.template get_next_work</*IsProducerWarp=*/true>(
+                     params.scheduler, work_tile_info)
+               : scheduler.template get_next_work</*IsProducerWarp=*/false>(
+                     params.scheduler, work_tile_info)) {
+        auto block_coord = work_tile_info.get_block_coord(params.scheduler);
+        SeqlenInfo_t seqlen_info{
+            get<2>(block_coord) /*bidb*/,
+            get<0>(params.mainloop.shape_Q),
+            get<0>(params.mainloop.shape_K),
+            params.mainloop.seq_offsets,
+            params.mainloop.seq_offsets_q,
+            params.mainloop.num_targets,
+        };
+        auto scheduler_prefetch = [&scheduler, &params, &work_tile_info]() {
+          scheduler.prefetch_next_work(params.scheduler, work_tile_info);
+        };
+        // pipeline_vt won't be used if we don't need to transpose V.
+        collective_mainloop.load(
+            params.mainloop,
+            pipeline_k,
+            pipeline_v,
+            pipeline_vt,
+            smem_pipe_write,
+            shared_storage,
+            scheduler_prefetch,
+            seqlen_info,
+            block_coord,
+            work_idx);
+      }
+      collective_mainloop.load_tail(
+          pipeline_k,
+          pipeline_v,
+          pipeline_vt,
+          smem_pipe_write,
+          shared_storage,
+          work_idx);
+    } else { // Consumer
+      cutlass::arch::warpgroup_reg_alloc<MmaRegisterRequirement>();
+
+      TileScheduler scheduler(
+          reinterpret_cast<typename TileScheduler::SharedStorage*>(
+              &shared_storage.pipelines.smem_scheduler));
+      // Initialize matmul objects.
+      TiledMma1 tiled_mma1;
+
+      PipelineState smem_pipe_read;
+      // We don't need separate variables smem_pipe_release_k and
+      // smem_pipe_release_v (like in Cutlass's gemm) because the read and
+      // release pipeline states are always the same.
+
+      scheduler.init_consumer();
+      collective_mainloop.mma_init();
+
+      int work_idx = 0;
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (auto work_tile_info =
+               scheduler.template get_initial_work</*IsProducerWarp=*/false>(
+                   params.scheduler);
+           work_tile_info.is_valid(params.scheduler);
+           work_tile_info =
+               scheduler.template get_next_work</*IsProducerWarp=*/false>(
+                   params.scheduler, work_tile_info)) {
+        // Attention output (GEMM-II) accumulator.
+        Tensor tOrO =
+            partition_fragment_C(tiled_mma1, select<0, 2>(TileShape_MNK{}));
+        // If there's tanh softcap, the scaling will be done before tanh.
+        auto block_coord = work_tile_info.get_block_coord(params.scheduler);
+        int const bidb = get<2>(block_coord);
+        int const bidh = get<1>(block_coord);
+        if constexpr (Is_FP8) {
+          int const bidh_kv = bidh;
+          float const q_descale = params.mainloop.ptr_q_descale == nullptr
+              ? 1.0f
+              : params.mainloop.ptr_q_descale
+                    [bidb * get<0>(params.mainloop.stride_q_descale) +
+                     bidh_kv * get<1>(params.mainloop.stride_q_descale)];
+          float const k_descale = params.mainloop.ptr_k_descale == nullptr
+              ? 1.0f
+              : params.mainloop.ptr_k_descale
+                    [bidb * get<0>(params.mainloop.stride_k_descale) +
+                     bidh_kv * get<1>(params.mainloop.stride_k_descale)];
+        }
+
+        SeqlenInfo_t seqlen_info{
+            bidb,
+            get<0>(params.mainloop.shape_Q),
+            get<0>(params.mainloop.shape_K),
+            params.mainloop.seq_offsets,
+            params.mainloop.seq_offsets_q,
+            params.mainloop.num_targets,
+        };
+        float alpha_log2 = params.mainloop.alpha_log2;
+        bool tile_valid;
+        if constexpr (Softmax) {
+          hstu::Softmax<
+              2 * (2 * kBlockM / NumMmaThreads),
+              /*Max_offset=*/!Is_FP8 ? 0 : 8>
+              softmax(alpha_log2);
+          tile_valid = collective_mainloop.mma_softmax(
+              params.mainloop,
+              pipeline_k,
+              pipeline_v,
+              smem_pipe_read,
+              tOrO,
+              softmax,
+              threadIdx.x - MmaThreadOffset,
+              work_idx,
+              seqlen_info,
+              block_coord,
+              shared_storage);
+          if (tile_valid) {
+            collective_epilogue.store(
+                params.epilogue,
+                tOrO,
+                shared_storage,
+                tiled_mma1,
+                threadIdx.x - MmaThreadOffset,
+                block_coord);
+            collective_epilogue.store_softmax(
+                params.epilogue,
+                softmax.row_sum,
+                tiled_mma1,
+                threadIdx.x - MmaThreadOffset,
+                block_coord);
+          } else {
+            // Write 0 to gO and -inf to gLSE.
+            // If Split, we don't have to write 0 to O if the mha_combine kernel
+            // is used, since it will not use the value of O if LSE is -inf.
+            collective_epilogue.template store_zero<true /*Clear_O*/>(
+                params.epilogue, threadIdx.x - MmaThreadOffset, block_coord);
+            // collective_epilogue.store_zero(params.epilogue, threadIdx.x -
+            // MmaThreadOffset, block_coord);
+          }
+        } else {
+          tile_valid = collective_mainloop.mma(
+              params.mainloop,
+              pipeline_k,
+              pipeline_v,
+              smem_pipe_read,
+              tOrO,
+              threadIdx.x - MmaThreadOffset,
+              work_idx,
+              seqlen_info,
+              block_coord,
+              shared_storage);
+          if (tile_valid) {
+            collective_epilogue.store(
+                params.epilogue,
+                tOrO,
+                shared_storage,
+                tiled_mma1,
+                threadIdx.x - MmaThreadOffset,
+                block_coord);
+          } else {
+            // Write 0 to gO and -inf to gLSE.
+            // If Split, we don't have to write 0 to O if the mha_combine kernel
+            // is used, since it will not use the value of O if LSE is -inf.
+            collective_epilogue.template store_zero<true /*Clear_O*/>(
+                params.epilogue, threadIdx.x - MmaThreadOffset, block_coord);
+            // collective_epilogue.store_zero(params.epilogue, threadIdx.x -
+            // MmaThreadOffset, block_coord);
+          }
+        }
+      }
+      collective_epilogue.store_tail();
+    }
+  }
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_fwd_launch_template.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_fwd_launch_template.h
new file mode 100644
index 000000000..c79ea3a3f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_fwd_launch_template.h
@@ -0,0 +1,376 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+// clang-format off
+#include "cute/tensor.hpp"
+
+#include "cutlass/cutlass.h"
+#include "cutlass/device_kernel.h"  // For device_kernel
+#include <cutlass/kernel_hardware_info.h>
+#include "cutlass/cluster_launch.hpp"
+#include "cutlass/kernel_launch.h"
+
+#include "static_switch.h"
+#include "flash.h"
+#include "tile_size.h"
+#include "tile_scheduler.h"
+#include "flash_fwd_kernel_sm90.h"
+#include "mainloop_fwd_sm90_tma_gmma_ws.h"
+#include "epilogue_fwd.h"
+// clang-format on
+
+namespace hstu {
+
+using namespace cute;
+
+template <
+    int Arch,
+    int kHeadDim,
+    int ClusterM,
+    typename Element,
+    typename ElementOut,
+    bool Causal,
+    bool Local,
+    bool Contexual_mask,
+    bool Jagged,
+    bool Has_targets,
+    bool V_colmajor,
+    bool Cross,
+    bool Softmax,
+    bool Training>
+void run_flash_fwd(hstu::Flash_fwd_params& params, cudaStream_t stream) {
+  static_assert(
+      !(Causal && Local),
+      "Causal and Local cannot be enabled at the same time");
+  static constexpr bool Is_FP8 =
+      cute::is_same_v<Element, cutlass::float_e4m3_t> ||
+      cute::is_same_v<Element, cutlass::float_e5m2_t>;
+  static constexpr bool FP8_TransposeV = Is_FP8 && !V_colmajor;
+  using ArchTag =
+      std::conditional_t<Arch >= 90, cutlass::arch::Sm90, cutlass::arch::Sm80>;
+
+  // Can't use structured binding since it's not compatible with constexpr
+  static constexpr std::tuple<int, int, bool> kBlockMN_RS =
+      hstu::tile_size_fwd_sm90(
+          kHeadDim,
+          Causal,
+          Local,
+          sizeof(Element) /*element_size*/,
+          V_colmajor,
+          Cross,
+          Training);
+  static constexpr std::tuple<int, int, int, int, bool>
+      kBlockMN_kNWarps_Stages_RS = hstu::tile_size_fwd_sm8x(
+          Arch == 86 || Arch == 89,
+          kHeadDim,
+          Causal,
+          Local,
+          sizeof(Element) /*element_size*/);
+  static constexpr int kBlockM = Arch >= 90
+      ? std::get<0>(kBlockMN_RS)
+      : std::get<0>(kBlockMN_kNWarps_Stages_RS);
+  static constexpr int kBlockN = Arch >= 90
+      ? std::get<1>(kBlockMN_RS)
+      : std::get<1>(kBlockMN_kNWarps_Stages_RS);
+  static constexpr bool Mma1_is_RS = std::get<2>(kBlockMN_RS);
+  static constexpr int kNWarps = std::get<2>(kBlockMN_kNWarps_Stages_RS);
+  static constexpr int kStages =
+      Arch >= 90 ? 2 : std::get<3>(kBlockMN_kNWarps_Stages_RS);
+  static constexpr bool Q_in_regs =
+      Arch >= 90 ? false : std::get<4>(kBlockMN_kNWarps_Stages_RS);
+
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+  std::printf(
+      "kBlockM: (%d), kBlockN: (%d), Mma1_is_RS: (%d), kNWarps: (%d), kStages: (%d), Q_in_regs: (%d)\n",
+      kBlockM,
+      kBlockN,
+      Mma1_is_RS,
+      kNWarps,
+      kStages,
+      Q_in_regs);
+#endif
+
+  using TileShape_MNK = cute::Shape<Int<kBlockM>, Int<kBlockN>, Int<kHeadDim>>;
+  using ClusterShape = cute::Shape<Int<ClusterM>, _1, _1>;
+  using CollectiveMainloop = hstu::CollectiveMainloopFwdSm90<
+      kStages,
+      ClusterShape,
+      TileShape_MNK,
+      Element,
+      float,
+      cutlass::arch::Sm90,
+      Causal,
+      Local,
+      Contexual_mask,
+      Jagged,
+      Has_targets,
+      Mma1_is_RS,
+      V_colmajor,
+      Cross>;
+  using CollectiveEpilogue = hstu::CollectiveEpilogueFwd<
+      TileShape_MNK,
+      ClusterShape,
+      ElementOut,
+      ArchTag,
+      CollectiveMainloop::NumMmaThreads,
+      Jagged,
+      FP8_TransposeV>;
+
+  static constexpr int NumProducerThreads = Arch >= 90
+      ? CollectiveMainloop::NumProducerThreads
+      : CollectiveMainloop::NumMmaThreads;
+  using SchedulerPersistent = std::conditional_t<
+      Jagged,
+      hstu::VarlenDynamicPersistentTileScheduler<
+          kBlockM,
+          CollectiveMainloop::NumMmaThreads,
+          NumProducerThreads,
+          Arch >= 90 /*WarpSpecialized*/>,
+      std::conditional_t<
+          !Causal && !Local,
+          hstu::StaticPersistentTileScheduler,
+          hstu::DynamicPersistentTileScheduler<
+              CollectiveMainloop::NumMmaThreads,
+              NumProducerThreads,
+              Arch >= 90 /*WarpSpecialized*/>>>;
+  using SchedulerSingleTile = hstu::
+      SingleTileScheduler<Jagged, kBlockM, false /*Sort_by_length_indices*/>;
+  // If Split then we probably don't have enough work for PersistentScheduler to
+  // be useful. However, if Jagged (e.g., during decode where we have
+  // max_seqlens), using PersistentScheduler is better since we'll avoid
+  // launching a bunch of thread blocks that immediately exit. On Sm80,
+  // noncausal persistent seems a bit slower.
+  using Scheduler = std::conditional_t<
+      Arch >= 90 ? false : !(Causal && !Jagged),
+      SchedulerSingleTile,
+      SchedulerPersistent>;
+  using AttnKernel = hstu::enable_sm90_or_later<hstu::FlashAttnFwdSm90<
+      Softmax,
+      CollectiveMainloop,
+      CollectiveEpilogue,
+      Scheduler>>;
+
+  int seqlen_q = !Jagged ? params.max_q_len : params.total_seq_len_q;
+  int seqlen_kv = !Jagged ? params.max_kv_len : params.total_seq_len_kv;
+  int batch = !Jagged ? params.b : 1;
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+  std::printf("max/total seqlen: (%d), batch: (%d)\n", seqlen, batch);
+#endif
+  typename CollectiveMainloop::StrideV v_strides =
+      cute::conditional_return<!V_colmajor>(
+          make_stride(
+              params.v_row_stride,
+              _1{},
+              params.v_head_stride,
+              !Jagged ? params.v_batch_stride : 0),
+          make_stride(
+              _1{},
+              params.v_dim_stride,
+              params.v_head_stride,
+              !Jagged ? params.v_batch_stride : 0));
+  typename CollectiveMainloop::Arguments mainloop_args{
+      static_cast<Element const*>(params.q_ptr),
+      {seqlen_q, params.qk_d, params.h, batch}, // shape_Q
+      {params.q_row_stride,
+       _1{},
+       params.q_head_stride,
+       !Jagged ? params.q_batch_stride : 0}, // stride_Q
+      static_cast<Element*>(params.k_ptr),
+      {seqlen_kv, params.qk_d, params.h, batch}, // shape_K
+      {params.k_row_stride,
+       _1{},
+       params.k_head_stride,
+       !Jagged ? params.k_batch_stride : 0}, // stride_K
+      static_cast<Element*>(params.v_ptr),
+      v_strides, // stride_V
+      params.q_descale_ptr,
+      params.k_descale_ptr,
+      params.v_descale_ptr,
+      {params.q_descale_batch_stride, params.q_descale_head_stride},
+      {params.k_descale_batch_stride, params.k_descale_head_stride},
+      {params.v_descale_batch_stride, params.v_descale_head_stride},
+      1.0f / params.max_kv_len,
+      params.alpha,
+      params.max_attn_len,
+      params.min_full_attn_seq_len,
+      params.contextual_seq_len,
+      params.num_softmax_heads,
+      params.num_groups,
+      params.batch_size_per_group,
+      params.seq_offsets,
+      params.seq_offsets_q,
+      params.num_targets,
+      params.max_seq_len_tensor,
+      params.contextual_seq_len_tensor,
+      params.max_attn_len_tensor,
+      params.min_full_attn_seq_len_tensor,
+      params.attn_scale,
+      params.scalar_scale,
+  };
+  typename CollectiveEpilogue::Arguments epilogue_args{
+      static_cast<ElementOut*>(params.o_ptr),
+      {seqlen_q, params.v_d, params.h, batch, 1}, // shape_O
+      {params.o_row_stride,
+       _1{},
+       params.o_head_stride,
+       !Jagged ? params.o_batch_stride : 0,
+       0}, // stride_O
+      params.h,
+      params.num_softmax_heads,
+      {_1{}, seqlen_q, !Jagged ? params.h * seqlen_q : 0, 0}, // stride_LSE}
+      static_cast<float*>(params.softmax_lse),
+      Cross ? params.seq_offsets_q : params.seq_offsets};
+
+  int num_blocks_m =
+      cutlass::ceil_div(params.max_q_len, get<0>(TileShape_MNK{}));
+  num_blocks_m = cutlass::round_up(num_blocks_m, size<0>(ClusterShape{}));
+  typename hstu::TileSchedulerArguments scheduler_args{
+      num_blocks_m,
+      params.h,
+      params.b,
+      params.max_q_len,
+      params.qk_d,
+      sizeof(Element),
+      params.tile_count_semaphore,
+      Cross ? params.seq_offsets_q : params.seq_offsets,
+      nullptr /*sort_by_length_indices*/};
+
+  int device;
+  CHECK_CUDA(cudaGetDevice(&device));
+  typename AttnKernel::Params kernel_params =
+      AttnKernel::to_underlying_arguments(
+          {mainloop_args,
+           epilogue_args,
+           {device, params.num_sm},
+           scheduler_args});
+
+  dim3 grid_dims = AttnKernel::get_grid_shape(kernel_params);
+  dim3 block_dims = AttnKernel::get_block_shape();
+  int smem_size = AttnKernel::SharedStorageSize;
+  // int smem_size_q = sizeof(decltype((typename
+  // CollectiveMainloop::TensorStorage{}).smem_q)); int smem_size_k =
+  // sizeof(decltype((typename CollectiveMainloop::TensorStorage{}).smem_k));
+  // int smem_size_v = sizeof(decltype((typename
+  // CollectiveMainloop::TensorStorage{}).smem_v)); printf("smem_size = %d, q =
+  // %d, k = %d, v = %d\n", smem_size, smem_size_q, smem_size_k, smem_size_v);
+  // Get the ptr to kernel function.
+  if constexpr (size(ClusterShape{}) > 1) {
+    void const* kernel = (void const*)cutlass::device_kernel<AttnKernel>;
+    if (smem_size >= 48 * 1024) {
+      CHECK_CUDA(cudaFuncSetAttribute(
+          kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+    }
+    dim3 cluster_dims(
+        size<0>(ClusterShape{}),
+        size<1>(ClusterShape{}),
+        size<2>(ClusterShape{}));
+    cutlass::ClusterLaunchParams launch_params{
+        grid_dims, block_dims, cluster_dims, smem_size, stream};
+    cutlass::launch_kernel_on_cluster(launch_params, kernel, kernel_params);
+  } else {
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+    std::cout << "ClusterShape = 1" << std::endl;
+    std::cout << "grid_dims = " << grid_dims << std::endl;
+    std::cout << "block_dims = " << block_dims << std::endl;
+    std::cout << "smem_size = " << smem_size << std::endl;
+#endif
+    auto kernel = cutlass::device_kernel<AttnKernel>;
+    if (smem_size >= 48 * 1024) {
+      CHECK_CUDA(cudaFuncSetAttribute(
+          kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+    }
+    kernel<<<grid_dims, block_dims, smem_size, stream>>>(kernel_params);
+  }
+  CHECK_CUDA_KERNEL_LAUNCH();
+}
+
+template <
+    int Arch,
+    int kHeadDim,
+    bool Causal,
+    bool Local,
+    bool Softmax,
+    typename T,
+    typename T_out>
+void run_mha_fwd_dispatch(hstu::Flash_fwd_params& params, cudaStream_t stream) {
+  static constexpr bool V_colmajor = false; // V_colmajor_ && sizeof(T) == 1;
+  BOOL_SWITCH(params.num_targets, Has_targets, [&] {
+    BOOL_SWITCH(params.seq_offsets, Jagged, [&] {
+      BOOL_SWITCH(params.seq_offsets_q, Cross, [&] {
+        BOOL_SWITCH(params.has_contexual_mask, Contexual_mask, [&] {
+          BOOL_SWITCH(params.training, Training, [&] {
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+            std::printf(
+                "[flash_fwd_launch_template] Local: (%d), Jagged: (%d), Has_targets: (%d), Causal: (%d), max_kv_len: (%d), kHeadDim: (%d)\n",
+                Local,
+                Jagged,
+                Has_targets,
+                Causal,
+                params.max_kv_len,
+                kHeadDim);
+#endif
+            // static constexpr bool Enable_cluster = Arch >= 90 &&
+            //     (sizeof(T) == 2 ? (kHeadDim >= 128) : (kHeadDim == 192)) &&
+            //     !Causal && !Local && !Jagged;
+            // static constexpr bool Enable_cluster = false;
+            // CLUSTER_SWITCH(
+            //     cutlass::ceil_div(params.max_q_len, kBlockM) % 2 == 0,
+            //     Use_cluster,
+            //     [&] {
+            // static constexpr int ClusterM =
+            //     Enable_cluster && Use_cluster ? 2 : 1;
+            run_flash_fwd<
+                Arch,
+                kHeadDim,
+                1, // ClusterM,
+                T,
+                T_out,
+                Causal,
+                Local,
+                Contexual_mask,
+                Jagged,
+                Has_targets,
+                V_colmajor,
+                Cross,
+                Softmax,
+                Training>(params, stream);
+          });
+        });
+      });
+    });
+  });
+}
+
+template <int Arch, typename T, int kHeadDim, bool Softmax>
+void run_mha_fwd_(hstu::Flash_fwd_params& params, cudaStream_t stream) {
+  static_assert(
+      sizeof(T) == 2 || sizeof(T) == 1, "Only 16bit and 8bit are supported");
+  static constexpr bool Is_FP8 = cute::is_same_v<T, cutlass::float_e4m3_t> ||
+      cute::is_same_v<T, cutlass::float_e5m2_t>;
+  using T_out = std::conditional_t<!Is_FP8, T, cutlass::bfloat16_t>;
+  CAUSAL_LOCAL_SWITCH(params.is_causal, params.is_local, Causal, Local, [&] {
+    // VCOLMAJOR_SWITCH(params.v_dim_stride != 1, V_colmajor_, [&] {
+    run_mha_fwd_dispatch<Arch, kHeadDim, Causal, Local, Softmax, T, T_out>(
+        params, stream);
+  });
+}
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/generate_kernels.py b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/generate_kernels.py
new file mode 100644
index 000000000..6c3a03188
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/generate_kernels.py
@@ -0,0 +1,236 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+
+# Copied from Driss Guessous's PR in PyTorch: https://github.com/pytorch/pytorch/pull/105602
+
+# This file is run to generate the kernel instantiations for the flash_attn kernels
+# They are written to several files in order to speed up compilation
+
+import argparse
+import itertools
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Optional, Union
+
+
+DTYPE_MAP = {
+    "fp16": "cutlass::half_t",
+    "bf16": "cutlass::bfloat16_t",
+    "e4m3": "cutlass::float_e4m3_t",
+}
+
+DTYPE_MAP_FWD_SM8x = {
+    "fp16": "cutlass::half_t",
+    "bf16": "cutlass::bfloat16_t",
+}
+
+DTYPE_MAP_BWD = {
+    "fp16": "cutlass::half_t",
+    "bf16": "cutlass::bfloat16_t",
+}
+
+SM = [90]  # Sm kernels support up to
+SOFTMAX = ["true", "false"]
+HEAD_DIMENSIONS = [64, 96, 128, 192, 256]
+
+KERNEL_IMPL_TEMPLATE_FWD_SM90 = """
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {{
+#ifndef FLASHATTENTION_DISABLE_HDIM{HEAD_DIM}
+template void run_mha_fwd_<{ARCH}, {DTYPE}, {HEAD_DIM}, {SOFTMAX}>(Flash_fwd_params &params, cudaStream_t stream);
+#endif
+}} // namespace hstu
+"""
+
+KERNEL_IMPL_TEMPLATE_FWD_SM8x = """
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {{
+#ifndef FLASHATTENTION_DISABLE_SM8x
+#ifndef FLASHATTENTION_DISABLE_HDIM{HEAD_DIM}
+template void run_mha_fwd_<80, {DTYPE}, {HEAD_DIM}, {SOFTMAX}>(Flash_fwd_params &params, cudaStream_t stream);
+#endif
+#endif
+}} // namespace hstu
+"""
+
+KERNEL_IMPL_TEMPLATE_BWD_SM90 = """
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {{
+#ifndef FLASHATTENTION_DISABLE_HDIM{HEAD_DIM}
+template void run_mha_bwd_<{ARCH}, {DTYPE}, {HEAD_DIM}, {SOFTMAX}>(Flash_bwd_params &params, cudaStream_t stream);
+#endif
+}} // namespace hstu
+"""
+
+KERNEL_IMPL_TEMPLATE_BWD_SM8x = """
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {{
+#ifndef FLASHATTENTION_DISABLE_SM8x
+#ifndef FLASHATTENTION_DISABLE_HDIM{HEAD_DIM}
+template void run_mha_bwd_<80, {DTYPE}, {HEAD_DIM}, {SOFTMAX}>(Flash_bwd_params &params, cudaStream_t stream);
+#endif
+#endif
+}} // namespace hstu
+"""
+
+
+@dataclass
+class Kernel:
+    sm: int
+    dtype: str
+    head_dim: int
+    softmax: str
+    direction: str
+
+    @property
+    def template(self) -> str:
+        if self.direction == "fwd":
+            if self.sm == 90:
+                return KERNEL_IMPL_TEMPLATE_FWD_SM90.format(
+                    ARCH=str(self.sm),
+                    DTYPE=DTYPE_MAP[self.dtype],
+                    HEAD_DIM=self.head_dim,
+                    SOFTMAX=self.softmax,
+                )
+            else:
+                # Always enable PackGQA for Sm8x to reduce compilation
+                return KERNEL_IMPL_TEMPLATE_FWD_SM8x.format(
+                    DTYPE=DTYPE_MAP[self.dtype],
+                    HEAD_DIM=self.head_dim,
+                    SOFTMAX=self.softmax,
+                )
+        else:
+            assert self.direction == "bwd"
+            if self.sm == 90:
+                return KERNEL_IMPL_TEMPLATE_BWD_SM90.format(
+                    ARCH=str(self.sm),
+                    DTYPE=DTYPE_MAP[self.dtype],
+                    HEAD_DIM=self.head_dim,
+                    SOFTMAX=self.softmax,
+                )
+            else:
+                return KERNEL_IMPL_TEMPLATE_BWD_SM8x.format(
+                    DTYPE=DTYPE_MAP[self.dtype],
+                    HEAD_DIM=self.head_dim,
+                    SOFTMAX=self.softmax,
+                )
+
+    @property
+    def filename(self) -> str:
+        return f"flash_{self.direction}_hdim{self.head_dim}_{self.dtype}_softmax{self.softmax}_sm{self.sm}.cu"
+
+
+def get_all_kernels() -> List[Kernel]:
+    kernels: List[Kernel] = []
+    for dtype, head_dim, sm, softmax in itertools.product(
+        DTYPE_MAP.keys(), HEAD_DIMENSIONS, SM, SOFTMAX
+    ):
+        # We always enable PackGQA for Sm8x or Split
+        # so we should just pass in packgqa=False to avoid the `_packgqa` in the filename.
+        if sm >= 90 or dtype in DTYPE_MAP_FWD_SM8x:
+            kernels.append(
+                Kernel(
+                    sm=sm,
+                    dtype=dtype,
+                    head_dim=head_dim,
+                    direction="fwd",
+                    softmax=softmax,
+                )
+            )
+    for dtype, head_dim, sm, softmax in itertools.product(
+        DTYPE_MAP_BWD.keys(), HEAD_DIMENSIONS, SM, SOFTMAX
+    ):
+        kernels.append(
+            Kernel(
+                sm=sm,
+                dtype=dtype,
+                head_dim=head_dim,
+                direction="bwd",
+                softmax=softmax,
+            )
+        )
+    return kernels
+
+
+def write_kernel(kernel: Union[Kernel], autogen_dir: Path) -> None:
+    prelude = """
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ \n
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
+// Splitting the different template instantiations to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"\n
+"""
+    (autogen_dir / kernel.filename).write_text(prelude + kernel.template)
+
+
+def main(output_dir_name: Optional[str]) -> None:
+    output_dir = (
+        Path(output_dir_name) if output_dir_name is not None else Path(__file__).parent
+    )
+    output_dir.mkdir(parents=True, exist_ok=True)
+    kernels_all = list(get_all_kernels())
+    for kernel in kernels_all:
+        write_kernel(kernel, output_dir)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog="generate_kernels",
+        description="Generate the flash_attention kernels template instantiations",
+    )
+    # Set an optional output directory
+    parser.add_argument(
+        "-o",
+        "--output_dir",
+        default="instantiations",
+        required=False,
+        help="Where to generate the kernels  will default to the current directory ",
+    )
+    args = parser.parse_args()
+    main(args.output_dir)
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_bf16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..da0eeb2df
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_bf16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+template void run_mha_bwd_<90, cutlass::bfloat16_t, 128, false>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_bf16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..8d85c2235
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_bf16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+template void run_mha_bwd_<90, cutlass::bfloat16_t, 128, true>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_fp16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..09226cd80
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_fp16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+template void run_mha_bwd_<90, cutlass::half_t, 128, false>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_fp16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..63e451d14
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_fp16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+template void run_mha_bwd_<90, cutlass::half_t, 128, true>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_bf16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..e379d9918
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_bf16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+template void run_mha_bwd_<90, cutlass::bfloat16_t, 192, false>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_bf16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..7faa31376
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_bf16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+template void run_mha_bwd_<90, cutlass::bfloat16_t, 192, true>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_fp16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..5ddc7d7fc
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_fp16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+template void run_mha_bwd_<90, cutlass::half_t, 192, false>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_fp16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..530deae2b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_fp16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+template void run_mha_bwd_<90, cutlass::half_t, 192, true>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_bf16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..185907c5e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_bf16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+template void run_mha_bwd_<90, cutlass::bfloat16_t, 256, false>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_bf16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..39df173bb
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_bf16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+template void run_mha_bwd_<90, cutlass::bfloat16_t, 256, true>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_fp16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..cdc0a9f7e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_fp16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+template void run_mha_bwd_<90, cutlass::half_t, 256, false>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_fp16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..6f3182d34
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_fp16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+template void run_mha_bwd_<90, cutlass::half_t, 256, true>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_bf16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..89285d0be
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_bf16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+template void run_mha_bwd_<90, cutlass::bfloat16_t, 64, false>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_bf16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..ab39c7e06
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_bf16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+template void run_mha_bwd_<90, cutlass::bfloat16_t, 64, true>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_fp16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..8d62b8827
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_fp16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+template void run_mha_bwd_<90, cutlass::half_t, 64, false>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_fp16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..5192d945f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_fp16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+template void run_mha_bwd_<90, cutlass::half_t, 64, true>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_bf16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..cbeeac64a
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_bf16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+template void run_mha_bwd_<90, cutlass::bfloat16_t, 96, false>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_bf16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..b654969e4
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_bf16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+template void run_mha_bwd_<90, cutlass::bfloat16_t, 96, true>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_fp16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..ea81f7ee4
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_fp16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+template void run_mha_bwd_<90, cutlass::half_t, 96, false>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_fp16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..7439f322e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_fp16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+template void run_mha_bwd_<90, cutlass::half_t, 96, true>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_bf16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..a39bcd505
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_bf16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+template void run_mha_fwd_<90, cutlass::bfloat16_t, 128, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_bf16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..464a0f443
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_bf16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+template void run_mha_fwd_<90, cutlass::bfloat16_t, 128, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_e4m3_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_e4m3_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..3075657bb
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_e4m3_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+template void run_mha_fwd_<90, cutlass::float_e4m3_t, 128, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_e4m3_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_e4m3_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..1ab6e4394
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_e4m3_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+template void run_mha_fwd_<90, cutlass::float_e4m3_t, 128, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_fp16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..be5a6cb0d
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_fp16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+template void run_mha_fwd_<90, cutlass::half_t, 128, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_fp16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..7c303e7ef
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_fp16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+template void run_mha_fwd_<90, cutlass::half_t, 128, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_bf16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..6e8d906d5
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_bf16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+template void run_mha_fwd_<90, cutlass::bfloat16_t, 192, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_bf16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..80367708f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_bf16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+template void run_mha_fwd_<90, cutlass::bfloat16_t, 192, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_e4m3_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_e4m3_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..67ade004b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_e4m3_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+template void run_mha_fwd_<90, cutlass::float_e4m3_t, 192, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_e4m3_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_e4m3_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..9f40d2726
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_e4m3_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+template void run_mha_fwd_<90, cutlass::float_e4m3_t, 192, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_fp16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..1779657c0
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_fp16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+template void run_mha_fwd_<90, cutlass::half_t, 192, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_fp16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..0037dbc17
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_fp16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+template void run_mha_fwd_<90, cutlass::half_t, 192, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_bf16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..93440571c
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_bf16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+template void run_mha_fwd_<90, cutlass::bfloat16_t, 256, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_bf16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..c0634db8f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_bf16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+template void run_mha_fwd_<90, cutlass::bfloat16_t, 256, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_e4m3_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_e4m3_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..a0eb625f5
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_e4m3_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+template void run_mha_fwd_<90, cutlass::float_e4m3_t, 256, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_e4m3_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_e4m3_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..8b7216302
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_e4m3_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+template void run_mha_fwd_<90, cutlass::float_e4m3_t, 256, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_fp16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..fe89b532f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_fp16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+template void run_mha_fwd_<90, cutlass::half_t, 256, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_fp16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..c0857f941
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_fp16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+template void run_mha_fwd_<90, cutlass::half_t, 256, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_bf16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..841e9359e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_bf16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+template void run_mha_fwd_<90, cutlass::bfloat16_t, 64, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_bf16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..3da54d69f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_bf16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+template void run_mha_fwd_<90, cutlass::bfloat16_t, 64, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_e4m3_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_e4m3_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..4761ca635
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_e4m3_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+template void run_mha_fwd_<90, cutlass::float_e4m3_t, 64, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_e4m3_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_e4m3_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..33e66d0a7
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_e4m3_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+template void run_mha_fwd_<90, cutlass::float_e4m3_t, 64, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_fp16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..fab2951ee
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_fp16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+template void run_mha_fwd_<90, cutlass::half_t, 64, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_fp16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..2ef1f29c9
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_fp16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+template void run_mha_fwd_<90, cutlass::half_t, 64, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_bf16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..bc52514e9
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_bf16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+template void run_mha_fwd_<90, cutlass::bfloat16_t, 96, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_bf16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..11ea3bb20
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_bf16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+template void run_mha_fwd_<90, cutlass::bfloat16_t, 96, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_e4m3_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_e4m3_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..9e0b05a31
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_e4m3_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+template void run_mha_fwd_<90, cutlass::float_e4m3_t, 96, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_e4m3_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_e4m3_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..7fa79aa76
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_e4m3_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+template void run_mha_fwd_<90, cutlass::float_e4m3_t, 96, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_fp16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..83a25a649
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_fp16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+template void run_mha_fwd_<90, cutlass::half_t, 96, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_fp16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..e0526dec8
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_fp16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+template void run_mha_fwd_<90, cutlass::half_t, 96, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mainloop_bwd_sm90_tma_gmma_ws.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mainloop_bwd_sm90_tma_gmma_ws.h
new file mode 100644
index 000000000..e702faf0b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mainloop_bwd_sm90_tma_gmma_ws.h
@@ -0,0 +1,3166 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <cutlass/array.h>
+#include <cutlass/barrier.h>
+#include <cutlass/cutlass.h>
+#include <cutlass/numeric_conversion.h>
+#include <cutlass/numeric_types.h>
+#include "cutlass/pipeline/pipeline.hpp"
+
+#include "cute/tensor.hpp"
+
+#include "cutlass/gemm/collective/builders/sm90_common.inl"
+
+#include "copy_sm90_bulk_reduce.h"
+#include "mask.h"
+#include "named_barrier.h"
+#include "seqlen.h"
+#include "utils.h"
+
+namespace hstu {
+
+using namespace cute;
+
+template <
+    int Stages,
+    int Stages_dO,
+    int Stages_dS,
+    class ClusterShape_,
+    class TileShape_MNK_,
+    class Element_,
+    class ElementAccum_,
+    class ArchTag_,
+    bool Causal,
+    bool Local,
+    bool Contexual_mask,
+    bool Jagged,
+    bool Has_targets,
+    bool Deterministic,
+    bool SdP_swapAB_,
+    bool dKV_swapAB_,
+    bool dQ_swapAB_,
+    int NumMmaWarpGroups = 2,
+    int AtomLayoutMSdP = 1,
+    int AtomLayoutNdKV = 2,
+    int AtomLayoutMdQ = 1,
+    bool Mma_dP_is_RS = false,
+    bool Cross = false,
+    bool Softmax = false>
+struct CollectiveMainloopBwdSm90 {
+  static constexpr int kStages = Stages;
+  static constexpr int kStages_dO = Stages_dO;
+  static constexpr int kStages_dS = Stages_dS;
+  static_assert(kStages >= kStages_dO);
+  static_assert(Stages_dS == 1 || Stages_dS == kStages);
+  static_assert(
+      !Mma_dP_is_RS || SdP_swapAB_); // If Mma_dP_is_RS, we need SdP_SwapAB
+  using ClusterShape = ClusterShape_;
+  using TileShape_MNK = TileShape_MNK_;
+  using Element = Element_;
+  using ElementAccum = ElementAccum_;
+  using ArchTag = ArchTag_;
+  using SeqlenInfo_t = hstu::SeqlenInfoQKBwd<
+      Jagged,
+      Cross,
+      Has_targets,
+      CUTE_STATIC_V(get<0>(TileShape_MNK{}))>;
+
+  static constexpr bool SdP_swapAB = SdP_swapAB_;
+  static constexpr bool dKV_swapAB = dKV_swapAB_;
+  static constexpr bool dQ_swapAB = dQ_swapAB_;
+
+  static constexpr bool Q_dO_same_stages = kStages == kStages_dO;
+
+  static constexpr int kBlockM = get<0>(TileShape_MNK{});
+  static constexpr int kBlockN = get<1>(TileShape_MNK{});
+  static constexpr int kHeadDim = get<2>(TileShape_MNK{});
+
+  static_assert(ArchTag::kMinComputeCapability >= 90);
+  static_assert(get<0>(ClusterShape{}) == 1 && get<2>(ClusterShape{}) == 1);
+
+  static constexpr int NumMmaThreads =
+      NumMmaWarpGroups * cutlass::NumThreadsPerWarpGroup;
+  static constexpr int NumProducerThreads = cutlass::NumThreadsPerWarp * 2;
+
+  static_assert(NumMmaWarpGroups % AtomLayoutMSdP == 0);
+  static_assert(NumMmaWarpGroups % AtomLayoutNdKV == 0);
+  static_assert(NumMmaWarpGroups % AtomLayoutMdQ == 0);
+  static constexpr bool Mma_dKV_is_RS = AtomLayoutMSdP == 1 &&
+      AtomLayoutNdKV == NumMmaWarpGroups && SdP_swapAB && !dKV_swapAB;
+  static constexpr bool Mma_dQ_is_RS = AtomLayoutMSdP == NumMmaWarpGroups &&
+      AtomLayoutMdQ == NumMmaWarpGroups && !SdP_swapAB &&
+      !dQ_swapAB; // If dQ_swapAB we can't use RS
+
+  static constexpr GMMA::Major PdS_Major = GMMA::Major::K;
+  // static constexpr GMMA::Major PdS_Major = GMMA::Major::MN;
+  static constexpr GMMA::Major PdSt_Major =
+      PdS_Major == GMMA::Major::K ? GMMA::Major::MN : GMMA::Major::K;
+
+  using TileShapeAtomSdP = std::conditional_t<
+      !SdP_swapAB,
+      Shape<
+          Int<kBlockM>,
+          Int<kBlockN / (NumMmaWarpGroups / AtomLayoutMSdP)>,
+          Int<kHeadDim>>,
+      Shape<Int<kBlockN>, Int<kBlockM / AtomLayoutMSdP>, Int<kHeadDim>>>;
+  using AtomLayoutSdP = std::conditional_t<
+      !SdP_swapAB,
+      Layout<Shape<
+          Int<AtomLayoutMSdP>,
+          Int<NumMmaWarpGroups / AtomLayoutMSdP>,
+          _1>>,
+      Layout<Shape<
+          Int<NumMmaWarpGroups / AtomLayoutMSdP>,
+          Int<AtomLayoutMSdP>,
+          _1>>>;
+  using TiledMmaSdP = decltype(cute::make_tiled_mma(
+      cute::GMMA::
+          ss_op_selector<Element, Element, ElementAccum, TileShapeAtomSdP>(),
+      AtomLayoutSdP{}));
+
+  using TiledMmadPRS = decltype(cute::make_tiled_mma(
+      cute::GMMA::
+          rs_op_selector<Element, Element, ElementAccum, TileShapeAtomSdP>(),
+      AtomLayoutSdP{}));
+
+  using TileShapeAtomdKV = std::conditional_t<
+      !dKV_swapAB,
+      Shape<
+          Int<kBlockN>,
+          Int<kHeadDim / (NumMmaWarpGroups / AtomLayoutNdKV)>,
+          Int<kBlockM>>,
+      Shape<Int<kHeadDim>, Int<kBlockN / AtomLayoutNdKV>, Int<kBlockM>>>;
+  using AtomLayoutdKV = std::conditional_t<
+      !dKV_swapAB,
+      Layout<Shape<
+          Int<AtomLayoutNdKV>,
+          Int<NumMmaWarpGroups / AtomLayoutNdKV>,
+          _1>>,
+      Layout<Shape<
+          Int<NumMmaWarpGroups / AtomLayoutNdKV>,
+          Int<AtomLayoutNdKV>,
+          _1>>>;
+  using TiledMmadKV = decltype(cute::make_tiled_mma(
+      std::conditional_t<
+          Mma_dKV_is_RS,
+          decltype(cute::GMMA::rs_op_selector<
+                   Element,
+                   Element,
+                   ElementAccum,
+                   TileShapeAtomdKV,
+                   GMMA::Major::K,
+                   GMMA::Major::MN>()),
+          decltype(cute::GMMA::ss_op_selector<
+                   Element,
+                   Element,
+                   ElementAccum,
+                   TileShapeAtomdKV,
+                   !dKV_swapAB ? PdSt_Major : GMMA::Major::MN,
+                   !dKV_swapAB ? GMMA::Major::MN : PdSt_Major>())>{},
+      AtomLayoutdKV{}));
+
+  static constexpr bool dQacc_use_TMA = kHeadDim < 256;
+  // For hdim256, we want to slice the dQ MMA (64 x 256 on 2 WGs) into two (64 x
+  // 128 on 2 WGs) so that we can do atomic add on one half before doing the
+  // other half of the MMA, to reduce register pressure.
+  static constexpr bool Slice_dQKV_Mma = kHeadDim == 256 && !dQacc_use_TMA &&
+      dQ_swapAB && AtomLayoutMdQ == 1 && NumMmaWarpGroups == 2;
+  static_assert(
+      !(Deterministic && Slice_dQKV_Mma),
+      "Deterministic mode not supported with Slice_dQKV_Mma");
+
+  static constexpr int TileShapeAtomdQ_BlockM = kBlockM / AtomLayoutMdQ;
+  static constexpr int TileShapeAtomdQ_HeadDim =
+      (Slice_dQKV_Mma ? kHeadDim / 2 : kHeadDim) /
+      (NumMmaWarpGroups / AtomLayoutMdQ);
+  static_assert(
+      !dQ_swapAB ? TileShapeAtomdQ_BlockM == 64 : TileShapeAtomdQ_HeadDim == 64,
+      "Tile_M must be 64.");
+  using TileShapeAtomdQ = std::conditional_t<
+      !dQ_swapAB,
+      Shape<
+          Int<TileShapeAtomdQ_BlockM>,
+          Int<TileShapeAtomdQ_HeadDim>,
+          Int<kBlockN>>,
+      Shape<
+          Int<TileShapeAtomdQ_HeadDim>,
+          Int<TileShapeAtomdQ_BlockM>,
+          Int<kBlockN>>>;
+  using AtomLayoutdQ = std::conditional_t<
+      !dQ_swapAB,
+      Layout<
+          Shape<Int<AtomLayoutMdQ>, Int<NumMmaWarpGroups / AtomLayoutMdQ>, _1>>,
+      Layout<Shape<
+          Int<NumMmaWarpGroups / AtomLayoutMdQ>,
+          Int<AtomLayoutMdQ>,
+          _1>>>;
+  using TiledMmadQ = decltype(cute::make_tiled_mma(
+      std::conditional_t<
+          Mma_dQ_is_RS,
+          decltype(cute::GMMA::rs_op_selector<
+                   Element,
+                   Element,
+                   ElementAccum,
+                   TileShapeAtomdQ,
+                   GMMA::Major::K,
+                   GMMA::Major::MN>()),
+          decltype(cute::GMMA::ss_op_selector<
+                   Element,
+                   Element,
+                   ElementAccum,
+                   TileShapeAtomdQ,
+                   !dQ_swapAB ? PdS_Major : GMMA::Major::MN,
+                   !dQ_swapAB ? GMMA::Major::MN : PdS_Major>())>{},
+      AtomLayoutdQ{}));
+
+  // We need to accommodate both Q and Q^T (and dO and dO^T) in shared memory.
+  // Q & dO are used in the SdP Mma and Q^T and dO^T are used in the dKV Mma.
+  // Since this is GMMA::Major::K, the M dimension (kBlockM) doesn't matter for
+  // the layout, only the K dimension changes the layout.
+  using SmemLayoutAtomQdO =
+      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
+               GMMA::Major::K,
+               Element,
+               Int<kBlockM>,
+               Int<kHeadDim /
+                   (NumMmaWarpGroups / AtomLayoutNdKV)>>()); // for dKV_Mma
+  using SmemLayoutQ = decltype(tile_to_shape(
+      SmemLayoutAtomQdO{},
+      make_shape(
+          shape<0>(TileShape_MNK{}),
+          shape<2>(TileShape_MNK{}),
+          Int<kStages>{})));
+  using SmemLayoutdO = decltype(tile_to_shape(
+      SmemLayoutAtomQdO{},
+      make_shape(
+          shape<0>(TileShape_MNK{}),
+          shape<2>(TileShape_MNK{}),
+          Int<kStages_dO>{})));
+
+  using SmemLayoutAtomK =
+      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
+               GMMA::Major::K,
+               Element,
+               Int<kBlockN>,
+               Int<kHeadDim / (NumMmaWarpGroups / AtomLayoutMdQ)>>());
+  using SmemLayoutK =
+      decltype(tile_to_shape(SmemLayoutAtomK{}, select<1, 2>(TileShape_MNK{})));
+
+  using SmemLayoutAtomV =
+      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
+               GMMA::Major::K,
+               Element,
+               decltype(cute::get<1>(TileShape_MNK{})),
+               decltype(cute::get<2>(TileShape_MNK{}))>());
+  using SmemLayoutV =
+      decltype(tile_to_shape(SmemLayoutAtomV{}, select<1, 2>(TileShape_MNK{})));
+
+  using SmemLayoutAtomPdS =
+      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
+               PdS_Major,
+               Element,
+               Int<kBlockM / AtomLayoutMSdP>,
+               Int<kBlockN / (NumMmaWarpGroups / AtomLayoutMSdP)>>());
+  using SmemLayoutPdS = decltype(tile_to_shape(
+      SmemLayoutAtomPdS{},
+      make_shape(Int<kBlockM>{}, Int<kBlockN>{}, Int<kStages_dS>{}),
+      std::conditional_t<
+          PdS_Major == GMMA::Major::K,
+          cute::Step<_1, _2, _3>,
+          cute::Step<_2, _1, _3>>{}));
+  // Need stride to be multiple of 32, otherwise we get error (misaligned
+  // address) when doing TMA if e.g. kBlockM=80 We set stride to be multiple of
+  // 64 so that if ShuffleLSE, even if threads read from sLSE but out of bounds,
+  // it's still a valid smem address.
+  using SmemLayoutLSE = cute::Layout<
+      cute::Shape<Int<kBlockM>, Int<kStages>>,
+      cute::Stride<_1, Int<cute::round_up(kBlockM, 64)>>>;
+  using SmemLayoutLSEMma = std::conditional_t<
+      SdP_swapAB,
+      cute::Layout<
+          cute::Shape<Int<kBlockN>, Int<kBlockM>, Int<kStages>>,
+          cute::Stride<_0, _1, Int<cute::round_up(kBlockM, 64)>>>,
+      cute::Layout<
+          cute::Shape<Int<kBlockM>, Int<kBlockN>, Int<kStages>>,
+          cute::Stride<_1, _0, Int<cute::round_up(kBlockM, 64)>>>>;
+
+  // Note this is the transpose in terms of the view, not in terms of memory.
+  using SmemLayoutQt = decltype(cute::composition(
+      SmemLayoutQ{},
+      make_layout(
+          make_shape(
+              get<2>(TileShape_MNK{}),
+              get<0>(TileShape_MNK{}),
+              Int<kStages>{}),
+          make_stride(Int<kBlockM>{}, _1{}, Int<kBlockM * kHeadDim>{}))));
+  using SmemLayoutdOt = decltype(cute::composition(
+      SmemLayoutdO{},
+      make_layout(
+          make_shape(
+              get<2>(TileShape_MNK{}),
+              get<0>(TileShape_MNK{}),
+              Int<kStages_dO>{}),
+          make_stride(Int<kBlockM>{}, _1{}, Int<kBlockM * kHeadDim>{}))));
+  using SmemLayoutKt = decltype(cute::composition(
+      SmemLayoutK{},
+      make_layout(
+          make_shape(get<2>(TileShape_MNK{}), get<1>(TileShape_MNK{})),
+          make_stride(Int<kBlockN>{}, _1{}))));
+  using SmemLayoutPdSt = decltype(cute::composition(
+      SmemLayoutPdS{},
+      make_layout(
+          make_shape(Int<kBlockN>{}, Int<kBlockM>{}, Int<kStages_dS>{}),
+          make_stride(Int<kBlockM>{}, _1{}, Int<kBlockM * kBlockN>{}))));
+
+  // Thread layout, 256 or 384 threads per row
+  // We split into NumMmaWarpGroups so that we can do Bulk reduce add for each
+  // WG separately.
+  using R2SLayoutAtomdQaccum = Layout<
+      Shape<Int<cutlass::NumThreadsPerWarpGroup>, Int<NumMmaWarpGroups>>>;
+  using R2STiledCopydQaccum = decltype(make_tiled_copy(
+      Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, ElementAccum>{},
+      R2SLayoutAtomdQaccum{},
+      Layout<Shape<_4>>{})); // Val layout, 4 vals per store
+  using SmemLayoutdQaccum = Layout<
+      Shape<Int<kBlockM * kHeadDim / NumMmaWarpGroups>, Int<NumMmaWarpGroups>>>;
+
+  static constexpr int kNumPdSStore = kBlockM * kBlockN / NumMmaThreads;
+  // If !SdP_swapAB, the accum registers hold P / dS, otherwise they hold Pt /
+  // dSt. If PdS_major is MN, then we need to "transpose" the write.
+  using SmemCopyAtomPdS = Copy_Atom<
+      std::conditional_t<
+          (!SdP_swapAB) ^ (PdS_Major == GMMA::Major::MN),
+          std::conditional_t<
+              kNumPdSStore % 8 == 0,
+              cute::SM90_U32x4_STSM_N,
+              cute::SM90_U32x2_STSM_N>,
+          std::conditional_t<
+              kNumPdSStore % 8 == 0,
+              cute::SM90_U16x8_STSM_T,
+              cute::SM90_U16x4_STSM_T>>,
+      Element>;
+
+  using GmemTiledCopyQdO =
+      decltype(cutlass::gemm::collective::detail::
+                   sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape{})));
+  using GmemTiledCopyKV = cute::SM90_TMA_LOAD;
+
+  using ShapeQKV =
+      cute::Shape<int32_t, int32_t, int32_t, int32_t>; // (seqlen, d, head,
+                                                       // batch)
+  using StrideQKV = cute::Stride<int64_t, _1, int64_t, int64_t>;
+  using ShapeLSE =
+      cute::Shape<int32_t, int32_t, int32_t>; // (seqlen, head, batch)
+  using StrideLSE = cute::Stride<_1, int64_t, int64_t>; // (seqlen, head, batch)
+  using ShapedQaccum =
+      cute::Shape<int32_t, int32_t, int32_t>; // (seqlen * d, head, batch)
+  using StridedQaccum = cute::Stride<_1, int64_t, int64_t>;
+
+  using TMA_QdO = decltype(make_tma_copy_A_sm90(
+      GmemTiledCopyQdO{},
+      make_tensor(
+          make_gmem_ptr(static_cast<Element const*>(nullptr)),
+          ShapeQKV{},
+          StrideQKV{}),
+      take<0, 2>(SmemLayoutQ{}),
+      TileShape_MNK{},
+      ClusterShape{})); // mcast along N mode for this M load, if any
+
+  using TMA_K = decltype(make_tma_copy_B_sm90(
+      GmemTiledCopyKV{},
+      make_tensor(
+          make_gmem_ptr(static_cast<Element const*>(nullptr)),
+          ShapeQKV{},
+          StrideQKV{}),
+      SmemLayoutK{},
+      TileShape_MNK{},
+      ClusterShape{})); // no mcast for KV
+
+  using TMA_V = decltype(make_tma_copy_B_sm90(
+      GmemTiledCopyKV{},
+      make_tensor(
+          make_gmem_ptr(static_cast<Element const*>(nullptr)),
+          ShapeQKV{},
+          StrideQKV{}),
+      SmemLayoutV{},
+      TileShape_MNK{},
+      ClusterShape{})); // no mcast for KV
+
+  using MainloopPipeline = typename cutlass::PipelineTmaAsync<kStages>;
+  using PipelineState = typename MainloopPipeline::PipelineState;
+  using MainloopPipeline_dO = typename cutlass::PipelineTmaAsync<kStages_dO>;
+  using PipelineState_dO = typename MainloopPipeline_dO::PipelineState;
+
+  // Set the bytes transferred in this TMA transaction (may involve multiple
+  // issues)
+  static constexpr uint32_t TmaTransactionBytesQ = static_cast<uint32_t>(
+      size(take<0, 2>(SmemLayoutQ{})) * cutlass::sizeof_bits_v<Element> / 8);
+  static constexpr uint32_t TmaTransactionBytesK = static_cast<uint32_t>(
+      size(SmemLayoutK{}) * cutlass::sizeof_bits_v<Element> / 8);
+  static constexpr uint32_t TmaTransactionBytesV = static_cast<uint32_t>(
+      size(SmemLayoutV{}) * cutlass::sizeof_bits_v<Element> / 8);
+  static constexpr uint32_t TmaTransactionBytesLSE = static_cast<uint32_t>(
+      size(select<0>(SmemLayoutLSE{})) * cutlass::sizeof_bits_v<ElementAccum> /
+      8);
+
+  // These are tuned for speed. They don't affect correctness.
+  // We have separate iterations with causal masking. Not necessary for hdim 128
+  // but for hdim 64 this helps quite a bit to not have to do causal masking for
+  // most of the iterations. For hdim 192, separating masking iterations results
+  // in register spills.
+  static constexpr bool SeparateMaskingIterations = false;
+  // Do we keep the LSE and dPsum in each thread, or split them across 8 threads
+  // that share them and then shuffle to get the value whenever we need? This
+  // can reduce register pressure when SdP_swapAB, where each thread needs to
+  // keep statistics for (kBlockM / 4) rows. If !SdP_swapAB, each thread only
+  // needs to keep statistic for 2 rows.
+  static constexpr bool ShuffleLSE = SdP_swapAB && kHeadDim <= 64;
+  static constexpr bool ShuffledPsum = SdP_swapAB && kHeadDim <= 64;
+  static constexpr size_t SmemAlignmentP =
+      cutlass::detail::alignment_for_swizzle(SmemLayoutPdS{});
+  static constexpr size_t SmemAlignmentdS =
+      cutlass::detail::alignment_for_swizzle(SmemLayoutPdS{});
+  // Without this SmemAlignment, with hdim 256 we get "misaligned address" error
+  // in TMA
+  static constexpr size_t SmemAlignmentQKVdO = kHeadDim % 256 == 0 ? 256 : 128;
+  static constexpr size_t SmemAlignmentV = !Mma_dP_is_RS
+      ? SmemAlignmentQKVdO
+      : cutlass::detail::alignment_for_swizzle(SmemLayoutV{});
+  static_assert(
+      SmemAlignmentP >= 128 && SmemAlignmentdS >= 128,
+      "Require at least 128B alignment");
+
+  // TODO: do we have to worry that smem_dk and smem_dv in the epilogue don't
+  // line up w smem_k and smem_v due to alignment?
+  using SmemdQacc_t = std::conditional_t<
+      !dQacc_use_TMA,
+      cute::array<ElementAccum, 0>,
+      cute::array_aligned<ElementAccum, cute::cosize_v<SmemLayoutdQaccum>>>;
+  using SmemP_t = std::conditional_t<
+      Mma_dKV_is_RS,
+      cute::array<Element, 0>,
+      cute::array_aligned<
+          Element,
+          cute::cosize_v<SmemLayoutPdS>,
+          SmemAlignmentP>>;
+  struct TensorStorage
+      : cute::aligned_struct<
+            cute::max(SmemAlignmentP, SmemAlignmentdS, SmemAlignmentQKVdO)> {
+    cute::
+        array_aligned<Element, cute::cosize_v<SmemLayoutK>, SmemAlignmentQKVdO>
+            smem_k;
+    cute::array_aligned<Element, cute::cosize_v<SmemLayoutV>, SmemAlignmentV>
+        smem_v;
+    SmemdQacc_t smem_dqacc;
+    cute::
+        array_aligned<Element, cute::cosize_v<SmemLayoutQ>, SmemAlignmentQKVdO>
+            smem_q;
+    cute::
+        array_aligned<Element, cute::cosize_v<SmemLayoutdO>, SmemAlignmentQKVdO>
+            smem_do;
+    cute::array_aligned<ElementAccum, cute::cosize_v<SmemLayoutLSE>, 128>
+        smem_lse;
+    cute::array_aligned<ElementAccum, cute::cosize_v<SmemLayoutLSE>, 128>
+        smem_dpsum;
+    SmemP_t smem_p;
+    cute::array_aligned<Element, cute::cosize_v<SmemLayoutPdS>, SmemAlignmentdS>
+        smem_ds;
+  };
+
+  // Host side kernel arguments
+  struct Arguments {
+    Element const* const ptr_Q;
+    ShapeQKV const shape_Q;
+    StrideQKV const stride_Q;
+    Element const* const ptr_K;
+    ShapeQKV const shape_K;
+    StrideQKV const stride_K;
+    Element const* const ptr_V;
+    ShapeQKV const shape_V;
+    StrideQKV const stride_V;
+    Element const* const ptr_dO;
+    ShapeQKV const shape_dO;
+    StrideQKV const stride_dO;
+    ElementAccum* const ptr_dQaccum;
+    ShapedQaccum const shape_dQaccum;
+    StridedQaccum const stride_dQaccum;
+    float const* const ptr_LSE_log2;
+    ShapeLSE const shape_LSE;
+    StrideLSE const stride_LSE_log2;
+    float const* const ptr_dPsum;
+    StrideLSE const stride_dPsum;
+    int const max_attn_len;
+    int const min_full_attn_seq_len;
+    int const contextual_seq_len;
+    float const max_seq_len_inv;
+    float const alpha;
+    int const num_batch;
+    int const num_softmax_heads;
+    int const num_groups;
+    int const batch_size_per_group;
+    int* const dq_semaphore;
+    int const* const seq_offsets = nullptr;
+    int const* const seq_offsets_q = nullptr;
+    int const* const num_targets = nullptr;
+    int const* const max_seq_len_tensor = nullptr;
+    int const* const contextual_seq_len_tensor = nullptr;
+    int const* const max_attn_len_tensor = nullptr;
+    int const* const min_full_attn_seq_len_tensor = nullptr;
+    float const* const attn_scale = nullptr;
+    bool const scalar_scale = true;
+  };
+
+  // Device side kernel params
+  struct Params {
+    ShapeQKV const shape_Q;
+    ShapeQKV const shape_K;
+    ShapeQKV const shape_V;
+    ShapeQKV const shape_dO;
+    ElementAccum* const ptr_dQaccum;
+    ShapedQaccum const shape_dQaccum;
+    StridedQaccum stride_dQaccum;
+    TMA_QdO tma_load_Q, tma_load_dO;
+    TMA_K tma_load_K;
+    TMA_V tma_load_V;
+    float const* const ptr_LSE_log2;
+    ShapeLSE const shape_LSE;
+    StrideLSE const stride_LSE_log2;
+    float const* const ptr_dPsum;
+    StrideLSE const stride_dPsum;
+    int const max_attn_len;
+    int const min_full_attn_seq_len;
+    int const contextual_seq_len;
+    float const max_seq_len_inv;
+    float const alpha;
+    float const alpha_log2;
+    int const num_batch;
+    int const num_softmax_heads;
+    int const num_groups;
+    int const batch_size_per_group;
+    int* const dq_semaphore;
+    int const* const seq_offsets = nullptr;
+    int const* const seq_offsets_q = nullptr;
+    int const* const num_targets;
+    int const* const max_seq_len_tensor = nullptr;
+    int const* const contextual_seq_len_tensor = nullptr;
+    int const* const max_attn_len_tensor = nullptr;
+    int const* const min_full_attn_seq_len_tensor = nullptr;
+    float const* const attn_scale;
+    bool const scalar_scale = true;
+  };
+
+  static Params to_underlying_arguments(Arguments const& args) {
+    Tensor mQ =
+        make_tensor(make_gmem_ptr(args.ptr_Q), args.shape_Q, args.stride_Q);
+    TMA_QdO tma_load_Q = make_tma_copy_A_sm90(
+        GmemTiledCopyQdO{},
+        mQ,
+        SmemLayoutQ{}(_, _, _0{}),
+        TileShape_MNK{},
+        ClusterShape{}); // mcast along N mode for this M load, if any
+    Tensor mdO =
+        make_tensor(make_gmem_ptr(args.ptr_dO), args.shape_Q, args.stride_dO);
+    TMA_QdO tma_load_dO = make_tma_copy_A_sm90(
+        GmemTiledCopyQdO{},
+        mdO,
+        SmemLayoutdO{}(_, _, _0{}),
+        TileShape_MNK{},
+        ClusterShape{}); // mcast along N mode for this M load, if any
+    Tensor mK =
+        make_tensor(make_gmem_ptr(args.ptr_K), args.shape_K, args.stride_K);
+    TMA_K tma_load_K = make_tma_copy_B_sm90(
+        GmemTiledCopyKV{},
+        mK,
+        SmemLayoutK{},
+        TileShape_MNK{},
+        ClusterShape{}); // no mcast for KV
+    Tensor mV =
+        make_tensor(make_gmem_ptr(args.ptr_V), args.shape_K, args.stride_V);
+    TMA_V tma_load_V = make_tma_copy_B_sm90(
+        GmemTiledCopyKV{},
+        mV,
+        SmemLayoutV{},
+        TileShape_MNK{},
+        ClusterShape{}); // no mcast for KV
+    if constexpr (Deterministic) {
+      assert(args.dq_semaphore != nullptr);
+    }
+    return {
+        args.shape_Q,
+        args.shape_K,
+        args.shape_V,
+        args.shape_dO,
+        args.ptr_dQaccum,
+        args.shape_dQaccum,
+        args.stride_dQaccum,
+        tma_load_Q,
+        tma_load_dO,
+        tma_load_K,
+        tma_load_V,
+        args.ptr_LSE_log2,
+        args.shape_LSE,
+        args.stride_LSE_log2,
+        args.ptr_dPsum,
+        args.stride_dPsum,
+        args.max_attn_len,
+        args.min_full_attn_seq_len,
+        args.contextual_seq_len,
+        args.max_seq_len_inv,
+        args.alpha,
+        float(args.alpha * M_LOG2E),
+        args.num_batch,
+        args.num_softmax_heads,
+        args.num_groups,
+        args.batch_size_per_group,
+        args.dq_semaphore,
+        args.seq_offsets,
+        args.seq_offsets_q,
+        args.num_targets,
+        args.max_seq_len_tensor,
+        args.contextual_seq_len_tensor,
+        args.max_attn_len_tensor,
+        args.min_full_attn_seq_len_tensor,
+        args.attn_scale,
+        args.scalar_scale};
+  }
+
+  /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best
+  /// performance
+  CUTLASS_DEVICE
+  static void prefetch_tma_descriptors(Params const& params) {
+    cute::prefetch_tma_descriptor(params.tma_load_Q.get_tma_descriptor());
+    cute::prefetch_tma_descriptor(params.tma_load_dO.get_tma_descriptor());
+    cute::prefetch_tma_descriptor(params.tma_load_K.get_tma_descriptor());
+    cute::prefetch_tma_descriptor(params.tma_load_V.get_tma_descriptor());
+  }
+
+  CUTLASS_DEVICE
+  cute::tuple<int, int> get_m_block_min_max(
+      int const max_attn_len,
+      int const contextual_seq_len,
+      int const uihlen,
+      int const seqlen,
+      int const n_block) {
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+    static constexpr int kBlockN = get<1>(TileShape_MNK{});
+    if constexpr (Has_targets) {
+      int n_idx_min = n_block * kBlockN;
+      if (n_idx_min >= uihlen) {
+        int n_idx_max = (n_block + 1) * kBlockN;
+        return {
+            std::max(0, n_idx_min / kBlockM),
+            cute::ceil_div(std::min(n_idx_max, seqlen), kBlockM)};
+      }
+    }
+    // uih part
+    int m_block_max = cute::ceil_div(seqlen, kBlockM);
+    if constexpr (Local) {
+      int local_m_block_max =
+          cute::ceil_div((n_block + 1) * kBlockN + max_attn_len, kBlockM);
+      if constexpr (Contexual_mask) {
+        // row contexual without sink
+        if (n_block * kBlockN < contextual_seq_len) {
+          local_m_block_max = std::max(
+              local_m_block_max,
+              cute::ceil_div(contextual_seq_len + max_attn_len, kBlockM));
+        }
+      }
+      m_block_max = std::min(m_block_max, local_m_block_max);
+    }
+    int m_block_min = 0;
+    if constexpr (Causal || Local) {
+      m_block_min = std::max(m_block_min, (n_block * kBlockN) / kBlockM);
+    }
+    return {m_block_min, m_block_max};
+  }
+
+  CUTLASS_DEVICE
+  cute::tuple<int, int> get_full_m_block_min_max(
+      int const uihlen,
+      int const seqlen,
+      int const min_full_attn_seq_len,
+      int const m_block_max,
+      int const n_block) {
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+    static constexpr int kBlockN = get<1>(TileShape_MNK{});
+    if constexpr (Cross) {
+      return {0, 0};
+    }
+    if constexpr (!Local) {
+      return {0, 0};
+    }
+    if constexpr (Has_targets) {
+      int n_idx_min = n_block * kBlockN;
+      if (n_idx_min >= uihlen) {
+        return {0, 0};
+      }
+    }
+    if constexpr (Local) {
+      int full_m_block_max = cute::ceil_div(seqlen, kBlockM);
+      int full_m_block_min =
+          std::max(m_block_max, (uihlen - min_full_attn_seq_len) / kBlockM);
+      return {full_m_block_min, full_m_block_max};
+    }
+    return {0, 0};
+  }
+
+  CUTLASS_DEVICE
+  int get_contexual_m_block_max(
+      int const uihlen,
+      int const contextual_seq_len,
+      int const m_block_min,
+      int const n_block) {
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+    static constexpr int kBlockN = get<1>(TileShape_MNK{});
+    if constexpr (Cross) {
+      return 0;
+    }
+    if constexpr (!Contexual_mask) {
+      return 0;
+    }
+    if constexpr (Has_targets) {
+      int n_idx_min = n_block * kBlockN;
+      if (n_idx_min >= uihlen) {
+        return 0;
+      }
+    }
+    if constexpr (Causal || Local) {
+      int contexual_m_block_max =
+          std::min(m_block_min, cute::ceil_div(contextual_seq_len, kBlockM));
+      return contexual_m_block_max;
+    }
+    return 0;
+  }
+
+  CUTLASS_DEVICE
+  int get_next_m_block(
+      int const m_block,
+      int const m_block_min,
+      int const m_block_max,
+      int const contexual_m_block_max,
+      int const full_m_block_min,
+      int const full_m_block_max) {
+    int const out_m_block = m_block + 1;
+    if constexpr (Contexual_mask || Local) {
+      if (out_m_block == m_block_max) {
+        if (contexual_m_block_max > 0) {
+          return 0;
+        }
+        if (full_m_block_max > full_m_block_min) {
+          return full_m_block_min;
+        }
+        return -1;
+      }
+      if (out_m_block == contexual_m_block_max) {
+        if (full_m_block_max > full_m_block_min) {
+          return full_m_block_min;
+        }
+        return -1;
+      }
+      if (out_m_block == full_m_block_max) {
+        return -1;
+      }
+      return out_m_block;
+    }
+    if (out_m_block == m_block_max) {
+      return -1;
+    }
+    return out_m_block;
+  }
+
+  CUTLASS_DEVICE
+  cute::tuple<int, int> get_cross_m_block_min_max(
+      int const uihlen_q,
+      int const seqlen_q,
+      int const seqlen_kv,
+      int const n_block) {
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+    static constexpr int kBlockN = get<1>(TileShape_MNK{});
+    int m_block_max = cute::ceil_div(seqlen_q, kBlockM);
+    if constexpr (!Causal) {
+      return {0, m_block_max};
+    }
+    int m_block_min =
+        std::max(0, (n_block * kBlockN + uihlen_q - seqlen_kv) / kBlockM);
+    return {m_block_min, m_block_max};
+  }
+
+  template <typename SchedulerPrefetch, typename SharedStorage>
+  CUTLASS_DEVICE void load(
+      Params const& params,
+      MainloopPipeline pipeline_q,
+      MainloopPipeline_dO pipeline_do,
+      PipelineState& smem_pipe_write,
+      PipelineState_dO& smem_pipe_write_do,
+      SharedStorage& shared_storage,
+      SchedulerPrefetch const& scheduler_prefetch,
+      cute::tuple<int32_t, int32_t, int32_t> block_coord) {
+    auto [n_block, bidh, bidb] = block_coord;
+    SeqlenInfo_t seqlen_info{
+        bidb,
+        get<0>(params.shape_Q),
+        get<0>(params.shape_K),
+        params.seq_offsets,
+        params.seq_offsets_q,
+        params.num_targets};
+    if constexpr (Jagged) {
+      static constexpr int kBlockN = get<1>(TileShape_MNK{});
+      if (n_block * kBlockN >= seqlen_info.seqlen_kv) {
+        scheduler_prefetch();
+        return;
+      }
+    }
+    int min_full_attn_seq_len_, max_attn_len_, contextual_seq_len_;
+    if constexpr (!Cross) {
+      if (params.num_groups > 1) {
+        int group_id = bidb / params.batch_size_per_group;
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len_tensor[group_id];
+        max_attn_len_ = params.max_attn_len_tensor[group_id];
+        contextual_seq_len_ = params.contextual_seq_len_tensor[group_id];
+      } else {
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len;
+        max_attn_len_ = params.max_attn_len;
+        contextual_seq_len_ = params.contextual_seq_len;
+      }
+    }
+    int m_block_min, m_block_max;
+    if constexpr (Cross) {
+      auto m_block_min_max = get_cross_m_block_min_max(
+          seqlen_info.uihlen_q,
+          seqlen_info.seqlen_q,
+          seqlen_info.seqlen_kv,
+          n_block);
+      m_block_min = get<0>(m_block_min_max);
+      m_block_max = get<1>(m_block_min_max);
+    } else {
+      auto m_block_min_max = get_m_block_min_max(
+          max_attn_len_,
+          contextual_seq_len_,
+          seqlen_info.uihlen_q,
+          seqlen_info.seqlen_q,
+          n_block);
+      m_block_min = get<0>(m_block_min_max);
+      m_block_max = get<1>(m_block_min_max);
+    }
+    auto full_m_block_min_max = get_full_m_block_min_max(
+        seqlen_info.uihlen_q,
+        seqlen_info.seqlen_q,
+        min_full_attn_seq_len_,
+        m_block_max,
+        n_block);
+    int const full_m_block_min = get<0>(full_m_block_min_max);
+    int const full_m_block_max = get<1>(full_m_block_min_max);
+    int contexual_m_block_max = get_contexual_m_block_max(
+        seqlen_info.uihlen_q, contextual_seq_len_, m_block_min, n_block);
+
+    Tensor sQ = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
+        SmemLayoutQ{});
+    Tensor sdO = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_do.data()),
+        SmemLayoutdO{});
+    Tensor sK = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_k.data()),
+        SmemLayoutK{});
+    Tensor sV = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_v.data()),
+        SmemLayoutV{});
+    Tensor sLSE = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_lse.data()),
+        SmemLayoutLSE{});
+    Tensor sdPsum = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_dpsum.data()),
+        SmemLayoutLSE{});
+
+    // Prepare the TMA loads
+    uint32_t block_rank_in_cluster = cute::block_rank_in_cluster();
+    constexpr uint32_t cluster_shape_x = get<0>(ClusterShape());
+    uint2 cluster_local_block_id = {
+        block_rank_in_cluster % cluster_shape_x,
+        block_rank_in_cluster / cluster_shape_x};
+    Tensor mQ = params.tma_load_Q.get_tma_tensor(params.shape_Q)(
+        _, _, bidh, !Jagged ? bidb : 0);
+    Tensor mdO = params.tma_load_dO.get_tma_tensor(params.shape_Q)(
+        _, _, bidh, !Jagged ? bidb : 0);
+    Tensor mK = params.tma_load_K.get_tma_tensor(params.shape_K)(
+        _, _, bidh, !Jagged ? bidb : 0);
+    Tensor mV = params.tma_load_V.get_tma_tensor(params.shape_K)(
+        _, _, bidh, !Jagged ? bidb : 0);
+    Tensor mLSE = make_tensor(
+        make_gmem_ptr(params.ptr_LSE_log2),
+        params.shape_LSE,
+        params.stride_LSE_log2)(_, bidh, !Jagged ? bidb : 0);
+    Tensor mdPsum = make_tensor(
+        make_gmem_ptr(params.ptr_dPsum), params.shape_LSE, params.stride_dPsum)(
+        _, bidh, !Jagged ? bidb : 0);
+
+    Tensor gQ = local_tile(
+        domain_offset(make_coord(seqlen_info.offset_q, _0{}), mQ),
+        select<0, 2>(TileShape_MNK{}),
+        make_coord(_, _0{})); // (M, K, _)
+    Tensor gdO = local_tile(
+        domain_offset(make_coord(seqlen_info.offset_q, _0{}), mdO),
+        select<0, 2>(TileShape_MNK{}),
+        make_coord(_, _0{})); // (M, K, _)
+    Tensor gK = local_tile(
+        domain_offset(make_coord(seqlen_info.offset_k, _0{}), mK),
+        select<1, 2>(TileShape_MNK{}),
+        make_coord(n_block, _0{})); // (N, K)
+    Tensor gV = local_tile(
+        domain_offset(make_coord(seqlen_info.offset_k, _0{}), mV),
+        select<1, 2>(TileShape_MNK{}),
+        make_coord(n_block, _0{})); // (N, K)
+    Tensor gLSE = local_tile(
+        domain_offset(make_coord(seqlen_info.offset_q_padded), mLSE),
+        select<0>(TileShape_MNK{}),
+        make_coord(_)); // (M, _)
+    Tensor gdPsum = local_tile(
+        domain_offset(make_coord(seqlen_info.offset_q_padded), mdPsum),
+        select<0>(TileShape_MNK{}),
+        make_coord(_)); // (M, _)
+
+    Tensor sK_x =
+        make_tensor(sK.data(), make_layout(sK.layout(), Layout<_1>{}));
+    Tensor gK_x =
+        make_tensor(gK.data(), make_layout(gK.layout(), Layout<_1>{}));
+    Tensor sV_x =
+        make_tensor(sV.data(), make_layout(sV.layout(), Layout<_1>{}));
+    Tensor gV_x =
+        make_tensor(gV.data(), make_layout(gV.layout(), Layout<_1>{}));
+    // auto [tQgQ, tQsQ] = tma_partition(params.tma_load_Q,
+    // block_rank_in_cluster, Layout<ClusterShape>{},
+    //                                   group_modes<0, 2>(sQ), group_modes<0,
+    //                                   2>(gQ));  // (TMA, k), (TMA, PIPE)
+    // auto [tdOgdO, tdOsdO] = tma_partition(params.tma_load_dO,
+    // block_rank_in_cluster, Layout<ClusterShape>{},
+    //                                   group_modes<0, 2>(sdO), group_modes<0,
+    //                                   2>(gdO));  // (TMA, k), (TMA, PIPE)
+    auto block_tma_Q = params.tma_load_Q.get_slice(cluster_local_block_id.y);
+    auto block_tma_dO = params.tma_load_dO.get_slice(cluster_local_block_id.y);
+    Tensor tQgQ = group_modes<0, 3>(block_tma_Q.partition_S(gQ));
+    Tensor tQsQ = group_modes<0, 3>(block_tma_Q.partition_D(sQ));
+    Tensor tdOgdO = group_modes<0, 3>(block_tma_dO.partition_S(gdO));
+    Tensor tdOsdO = group_modes<0, 3>(block_tma_dO.partition_D(sdO));
+    auto [tKgK, tKsK] = tma_partition(
+        params.tma_load_K,
+        _0{},
+        Layout<_1>{},
+        group_modes<0, 2>(sK_x),
+        group_modes<0, 2>(gK_x)); // (TMA), (TMA)
+    auto [tVgV, tVsV] = tma_partition(
+        params.tma_load_V,
+        _0{},
+        Layout<_1>{},
+        group_modes<0, 2>(sV_x),
+        group_modes<0, 2>(gV_x)); // (TMA), (TMA)
+    auto bulk_copy = Copy_Traits<SM90_BULK_COPY_AUTO>{};
+
+    uint16_t mcast_mask_qdo = 0;
+    if constexpr (cute::is_same_v<GmemTiledCopyQdO, SM90_TMA_LOAD_MULTICAST>) {
+      auto block_layout = Layout<ClusterShape>{}; // (m,n) -> block_id
+      for (int n = 0; n < size<1>(block_layout); ++n) {
+        mcast_mask_qdo |=
+            (uint16_t(1) << block_layout(cluster_local_block_id.x, n, _0{}));
+      }
+    }
+
+    int m_block = m_block_min;
+    int next_m_block = -1;
+    int lane_predicate = cute::elect_one_sync();
+
+    if (lane_predicate) {
+      pipeline_q.producer_acquire(smem_pipe_write);
+      copy(
+          params.tma_load_Q.with(
+              *pipeline_q.producer_get_barrier(smem_pipe_write),
+              mcast_mask_qdo,
+              TMA::CacheHintSm90::EVICT_LAST),
+          tQgQ(_, m_block),
+          tQsQ(_, smem_pipe_write.index()));
+      if constexpr (Softmax) {
+        copy(
+            bulk_copy.with(*pipeline_q.producer_get_barrier(smem_pipe_write)),
+            gLSE(_, m_block),
+            sLSE(_, smem_pipe_write.index()));
+      }
+    }
+
+    // // Wait for the MMA warpgroups to say that smem_k and smem_v are ready
+    // cutlass::arch::NamedBarrier::sync(NumMmaThreads +
+    // cutlass::NumThreadsPerWarp,
+    // static_cast<uint32_t>(BwdNamedBarriers::KVEmpty) /*id*/);
+
+    auto load_step = [&](int m_block) {
+      // If Q and dO have the same number of stages, we can use the same
+      // pipeline state variable to reduce registers
+      PipelineState_dO smem_pipe_write_do_cur =
+          cute::conditional_return<Q_dO_same_stages>(
+              smem_pipe_write, smem_pipe_write_do);
+      pipeline_do.producer_acquire(smem_pipe_write_do_cur);
+      copy(
+          params.tma_load_dO.with(
+              *pipeline_do.producer_get_barrier(smem_pipe_write_do_cur),
+              mcast_mask_qdo,
+              TMA::CacheHintSm90::EVICT_LAST),
+          tdOgdO(_, m_block),
+          tdOsdO(_, smem_pipe_write_do_cur.index()));
+      if constexpr (Softmax) {
+        copy(
+            bulk_copy.with(
+                *pipeline_do.producer_get_barrier(smem_pipe_write_do_cur)),
+            gdPsum(_, m_block),
+            sdPsum(_, smem_pipe_write_do_cur.index()));
+      }
+      if constexpr (!Q_dO_same_stages) {
+        ++smem_pipe_write_do;
+      }
+      ++smem_pipe_write;
+      next_m_block = get_next_m_block(
+          m_block,
+          m_block_min,
+          m_block_max,
+          contexual_m_block_max,
+          full_m_block_min,
+          full_m_block_max);
+      if (next_m_block != -1) {
+        pipeline_q.producer_acquire(smem_pipe_write);
+        copy(
+            params.tma_load_Q.with(
+                *pipeline_q.producer_get_barrier(smem_pipe_write),
+                mcast_mask_qdo,
+                TMA::CacheHintSm90::EVICT_LAST),
+            tQgQ(_, next_m_block),
+            tQsQ(_, smem_pipe_write.index()));
+        if constexpr (Softmax) {
+          copy(
+              bulk_copy.with(*pipeline_q.producer_get_barrier(smem_pipe_write)),
+              gLSE(_, next_m_block),
+              sLSE(_, smem_pipe_write.index()));
+        }
+      }
+    };
+
+    if (lane_predicate) {
+      // Copy K tile and V tile from GMEM to SMEM.
+      shared_storage.pipelines.barrier_KV.arrive_and_expect_tx(
+          TmaTransactionBytesK + TmaTransactionBytesV);
+      copy(
+          params.tma_load_K.with(
+              reinterpret_cast<
+                  cutlass::arch::ClusterTransactionBarrier::ValueType&>(
+                  shared_storage.pipelines.barrier_KV),
+              0 /*mcast_mask*/),
+          tKgK,
+          tKsK);
+      copy(
+          params.tma_load_V.with(
+              reinterpret_cast<
+                  cutlass::arch::ClusterTransactionBarrier::ValueType&>(
+                  shared_storage.pipelines.barrier_KV),
+              0 /*mcast_mask*/),
+          tVgV,
+          tVsV);
+
+#pragma unroll(kHeadDim < 256 ? 2 : 1)
+      for (; m_block < m_block_max; ++m_block) {
+        load_step(m_block);
+      }
+    }
+    scheduler_prefetch();
+    m_block = next_m_block;
+    if constexpr (Contexual_mask) {
+      if (lane_predicate) {
+        if (m_block >= 0) {
+#pragma unroll(kHeadDim < 256 ? 2 : 1)
+          for (; m_block < contexual_m_block_max; ++m_block) {
+            load_step(m_block);
+          }
+        }
+      }
+    }
+    m_block = next_m_block;
+    if constexpr (Local) {
+      if (lane_predicate) {
+        if (m_block >= 0) {
+#pragma unroll(kHeadDim < 256 ? 2 : 1)
+          for (; m_block < full_m_block_max; ++m_block) {
+            load_step(m_block);
+          }
+        }
+      }
+    }
+    if constexpr (Q_dO_same_stages) {
+      smem_pipe_write_do = smem_pipe_write;
+    }
+  }
+
+  /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster
+  CUTLASS_DEVICE void load_tail(
+      MainloopPipeline pipeline_q,
+      MainloopPipeline_dO pipeline_do,
+      PipelineState& smem_pipe_write) {
+    static_assert(
+        Q_dO_same_stages, "Q and dO must have the same number of stages");
+    // Need to copy since pipeline_q.producer_tail(smem_pipe_write) will
+    // increment smem_pipe_write
+    PipelineState smem_pipe_write_do = smem_pipe_write;
+    // Issue the epilogue waits
+    if (cute::elect_one_sync()) {
+      /* This helps avoid early exit of blocks in Cluster
+       * Waits for all stages to either be released (all Consumer UNLOCKs), or
+       * if the stage was never used then would just be acquired since the phase
+       * was still inverted from make_producer_start_state
+       */
+      pipeline_q.producer_tail(smem_pipe_write);
+      pipeline_do.producer_tail(smem_pipe_write_do);
+    }
+  }
+
+  /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster
+  CUTLASS_DEVICE void load_tail(
+      MainloopPipeline pipeline_q,
+      MainloopPipeline_dO pipeline_do,
+      PipelineState& smem_pipe_write,
+      PipelineState_dO& smem_pipe_write_do) {
+    // Issue the epilogue waits
+    if (cute::elect_one_sync()) {
+      /* This helps avoid early exit of blocks in Cluster
+       * Waits for all stages to either be released (all Consumer UNLOCKs), or
+       * if the stage was never used then would just be acquired since the phase
+       * was still inverted from make_producer_start_state
+       */
+      pipeline_q.producer_tail(smem_pipe_write);
+      pipeline_do.producer_tail(smem_pipe_write_do);
+    }
+  }
+
+  template <typename SharedStorage>
+  CUTLASS_DEVICE void store_dq(
+      Params const& params,
+      SharedStorage& shared_storage,
+      cute::tuple<int32_t, int32_t, int32_t> block_coord) {
+    if constexpr (!dQacc_use_TMA) {
+      return;
+    }
+
+    auto [n_block, bidh, bidb] = block_coord;
+    SeqlenInfo_t seqlen_info{
+        bidb,
+        get<0>(params.shape_Q),
+        get<0>(params.shape_K),
+        params.seq_offsets,
+        params.seq_offsets_q,
+        params.num_targets};
+    if constexpr (Jagged) {
+      static constexpr int kBlockN = get<1>(TileShape_MNK{});
+      if (n_block * kBlockN >= seqlen_info.seqlen_kv) {
+        return;
+      }
+    }
+    int min_full_attn_seq_len_, max_attn_len_, contextual_seq_len_;
+    if constexpr (!Cross) {
+      if (params.num_groups > 1) {
+        int group_id = bidb / params.batch_size_per_group;
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len_tensor[group_id];
+        max_attn_len_ = params.max_attn_len_tensor[group_id];
+        contextual_seq_len_ = params.contextual_seq_len_tensor[group_id];
+      } else {
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len;
+        max_attn_len_ = params.max_attn_len;
+        contextual_seq_len_ = params.contextual_seq_len;
+      }
+    }
+    int m_block_min, m_block_max;
+    if constexpr (Cross) {
+      auto m_block_min_max = get_cross_m_block_min_max(
+          seqlen_info.uihlen_q,
+          seqlen_info.seqlen_q,
+          seqlen_info.seqlen_kv,
+          n_block);
+      m_block_min = get<0>(m_block_min_max);
+      m_block_max = get<1>(m_block_min_max);
+    } else {
+      auto m_block_min_max = get_m_block_min_max(
+          max_attn_len_,
+          contextual_seq_len_,
+          seqlen_info.uihlen_q,
+          seqlen_info.seqlen_q,
+          n_block);
+      m_block_min = get<0>(m_block_min_max);
+      m_block_max = get<1>(m_block_min_max);
+    }
+    auto full_m_block_min_max = get_full_m_block_min_max(
+        seqlen_info.uihlen_q,
+        seqlen_info.seqlen_q,
+        min_full_attn_seq_len_,
+        m_block_max,
+        n_block);
+    int const full_m_block_min = get<0>(full_m_block_min_max);
+    int const full_m_block_max = get<1>(full_m_block_min_max);
+    int contexual_m_block_max = get_contexual_m_block_max(
+        seqlen_info.uihlen_q, contextual_seq_len_, m_block_min, n_block);
+
+    Tensor sdQ = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_dqacc.data()),
+        SmemLayoutdQaccum{});
+    static constexpr int dQ_TMA_num_bytes =
+        CUTE_STATIC_V(size<0>(sdQ)) * sizeof(ElementAccum);
+
+    Tensor mdQaccum = make_tensor(
+        make_gmem_ptr(reinterpret_cast<ElementAccum*>(params.ptr_dQaccum)),
+        params.shape_dQaccum,
+        params.stride_dQaccum)(_, bidh, !Jagged ? bidb : 0);
+    Tensor gdQaccum_ = local_tile(
+        domain_offset(
+            make_coord(seqlen_info.offset_q_padded * kHeadDim), mdQaccum),
+        Shape<Int<kBlockM * kHeadDim>>{},
+        make_coord(_)); // (M * K, _)
+    Tensor gdQaccum = cute::flat_divide(
+        gdQaccum_,
+        Int<kBlockM * kHeadDim / NumMmaWarpGroups>{}); // (M * K / WG, WG, _)
+
+    int const num_batch = params.num_batch;
+    int const num_head = get<2>(params.shape_Q);
+    int* lock_ptr =
+        !Deterministic ? nullptr : params.dq_semaphore + bidb * num_head + bidh;
+    using Barrier = cutlass::GenericBarrier<cutlass::detail::SyncwarpSync>;
+    bool const lane_predicate = cute::elect_one_sync();
+
+    auto store_dq_step = [&](int m_block) {
+      if constexpr (Deterministic) {
+        Barrier::wait_eq(
+            lock_ptr,
+            threadIdx.x % cutlass::NumThreadsPerWarp,
+            m_block * num_batch * num_head,
+            n_block);
+      }
+#pragma unroll
+      for (int warpgroup_idx = 0; warpgroup_idx < NumMmaWarpGroups;
+           ++warpgroup_idx) {
+        cutlass::arch::NamedBarrier::sync(
+            cutlass::NumThreadsPerWarpGroup + cutlass::NumThreadsPerWarp,
+            static_cast<uint32_t>(BwdNamedBarriers::dQFullWG1) +
+                warpgroup_idx /*id*/); // sdQ full, to be written to gmem
+        if (lane_predicate) {
+          SM90_BULK_REDUCE_ADD::copy(
+              raw_pointer_cast(sdQ(_, warpgroup_idx).data()),
+              raw_pointer_cast(gdQaccum(_, warpgroup_idx, m_block).data()),
+              dQ_TMA_num_bytes,
+              static_cast<uint64_t>(TMA::CacheHintSm90::EVICT_LAST));
+          tma_store_arrive();
+        }
+      }
+      // Note, the for_each() function is required here to ensure
+      // `warpgroup_idx` is of type Int<x>.
+      for_each(make_int_sequence<NumMmaWarpGroups>{}, [&](auto warpgroup_idx) {
+        if (lane_predicate) {
+          tma_store_wait<NumMmaWarpGroups - 1 - CUTE_STATIC_V(warpgroup_idx)>();
+        }
+        cutlass::arch::NamedBarrier::arrive(
+            cutlass::NumThreadsPerWarpGroup + cutlass::NumThreadsPerWarp,
+            static_cast<uint32_t>(BwdNamedBarriers::dQEmptyWG1) +
+                warpgroup_idx /*id*/); // sdQ empty, ready to be written to
+      });
+      if constexpr (Deterministic) {
+        Barrier::arrive_inc(
+            lock_ptr,
+            threadIdx.x % cutlass::NumThreadsPerWarp,
+            m_block * num_batch * num_head);
+      }
+    };
+
+#pragma unroll 2
+    for (int m_block = m_block_min; m_block < m_block_max; ++m_block) {
+      store_dq_step(m_block);
+    }
+    if constexpr (Contexual_mask) {
+#pragma unroll 2
+      for (int m_block = 0; m_block < contexual_m_block_max; ++m_block) {
+        store_dq_step(m_block);
+      }
+    }
+    if constexpr (Local) {
+#pragma unroll 2
+      for (int m_block = full_m_block_min; m_block < full_m_block_max;
+           ++m_block) {
+        store_dq_step(m_block);
+      }
+    }
+    if constexpr (Local && Deterministic) {
+      constexpr int kBlockM = get<0>(TileShape_MNK{});
+      int const m_block_global_max =
+          cute::ceil_div(seqlen_info.seqlen_q, kBlockM);
+#pragma unroll 2
+      for (int m_block = m_block_max; m_block < m_block_global_max; ++m_block) {
+        Barrier::arrive_inc(
+            lock_ptr,
+            threadIdx.x % cutlass::NumThreadsPerWarp,
+            m_block * num_batch * num_head);
+      }
+    }
+  }
+
+  CUTLASS_DEVICE void mma_init() {
+    // We're not currently using this bc we're not using persistent scheduler
+    // // Tell producer (warp 0) that smem_k and smem_v are ready
+    // cutlass::arch::NamedBarrier::arrive(NumMmaThreads +
+    // cutlass::NumThreadsPerWarp,
+    // static_cast<uint32_t>(BwdNamedBarriers::KVEmpty) /*id*/);
+    int warp_idx_in_warpgroup =
+        __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0);
+    if constexpr (dQacc_use_TMA) {
+      if (warp_idx_in_warpgroup == 0) {
+        cutlass::arch::NamedBarrier::arrive(
+            cutlass::NumThreadsPerWarpGroup + cutlass::NumThreadsPerWarp,
+            static_cast<uint32_t>(BwdNamedBarriers::dQEmptyWG1) - 1 +
+                hstu::canonical_warp_group_idx_nosync() /*id*/); // sdQ empty,
+                                                                 // ready to be
+                                                                 // written to
+      }
+    }
+  }
+
+  template <typename SharedStorage, typename FrgTensordKV>
+  CUTLASS_DEVICE bool mma(
+      Params const& params,
+      MainloopPipeline pipeline_q,
+      MainloopPipeline_dO pipeline_do,
+      PipelineState& smem_pipe_read,
+      PipelineState_dO& smem_pipe_read_do,
+      FrgTensordKV& tdKrdK,
+      FrgTensordKV& tdVrdV,
+      int thread_idx,
+      int& work_idx,
+      cute::tuple<int32_t, int32_t, int32_t> block_coord,
+      SharedStorage& shared_storage) {
+    static_assert(
+        is_rmem<FrgTensordKV>::value,
+        "dK and dV tensor must be rmem resident.");
+
+    int n_block = get<0>(block_coord);
+    int bidb = get<2>(block_coord);
+    SeqlenInfo_t seqlen_info{
+        bidb,
+        get<0>(params.shape_Q),
+        get<0>(params.shape_K),
+        params.seq_offsets,
+        params.seq_offsets_q,
+        params.num_targets};
+    if constexpr (Jagged) {
+      static constexpr int kBlockN = get<1>(TileShape_MNK{});
+      if (n_block * kBlockN >= seqlen_info.seqlen_kv) {
+        return false;
+      }
+    }
+    int min_full_attn_seq_len_, max_attn_len_, contextual_seq_len_;
+    float scalar_scale_val_;
+    if constexpr (!Cross) {
+      if (params.num_groups > 1) {
+        int group_id = bidb / params.batch_size_per_group;
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len_tensor[group_id];
+        max_attn_len_ = params.max_attn_len_tensor[group_id];
+        contextual_seq_len_ = params.contextual_seq_len_tensor[group_id];
+      } else {
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len;
+        max_attn_len_ = params.max_attn_len;
+        contextual_seq_len_ = params.contextual_seq_len;
+      }
+    }
+    if (params.num_groups > 1) {
+      int group_id = bidb / params.batch_size_per_group;
+      int max_seq_len_per_group = params.max_seq_len_tensor[group_id];
+      // attention scale
+      scalar_scale_val_ = params.scalar_scale
+          ? (params.attn_scale == nullptr ? 1.0f / max_seq_len_per_group
+                                          : params.attn_scale[group_id])
+          : 0;
+    } else {
+      // attention scale
+      scalar_scale_val_ = params.scalar_scale
+          ? (params.attn_scale == nullptr ? params.max_seq_len_inv
+                                          : params.attn_scale[0])
+          : 0;
+    }
+    int m_block_min, m_block_max;
+    if constexpr (Cross) {
+      auto m_block_min_max = get_cross_m_block_min_max(
+          seqlen_info.uihlen_q,
+          seqlen_info.seqlen_q,
+          seqlen_info.seqlen_kv,
+          n_block);
+      m_block_min = get<0>(m_block_min_max);
+      m_block_max = get<1>(m_block_min_max);
+    } else {
+      auto m_block_min_max = get_m_block_min_max(
+          max_attn_len_,
+          contextual_seq_len_,
+          seqlen_info.uihlen_q,
+          seqlen_info.seqlen_q,
+          n_block);
+      m_block_min = get<0>(m_block_min_max);
+      m_block_max = get<1>(m_block_min_max);
+    }
+    auto full_m_block_min_max = get_full_m_block_min_max(
+        seqlen_info.uihlen_q,
+        seqlen_info.seqlen_q,
+        min_full_attn_seq_len_,
+        m_block_max,
+        n_block);
+    int const full_m_block_min = get<0>(full_m_block_min_max);
+    int const full_m_block_max = get<1>(full_m_block_min_max);
+    int contexual_m_block_max = get_contexual_m_block_max(
+        seqlen_info.uihlen_q, contextual_seq_len_, m_block_min, n_block);
+
+    Tensor sQ = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
+        SmemLayoutQ{});
+    Tensor sdO = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_do.data()),
+        SmemLayoutdO{});
+    Tensor sK = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_k.data()),
+        SmemLayoutK{});
+    Tensor sV = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_v.data()),
+        SmemLayoutV{});
+    Tensor sQt = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
+        SmemLayoutQt{});
+    Tensor sdOt = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_do.data()),
+        SmemLayoutdOt{});
+    Tensor sKt = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_k.data()),
+        SmemLayoutKt{});
+    Tensor sP = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_p.data()),
+        SmemLayoutPdS{});
+    Tensor sP_pi = cute::as_position_independent_swizzle_tensor(sP);
+    Tensor sPt = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_p.data()),
+        SmemLayoutPdSt{});
+    Tensor sPt_pi = cute::as_position_independent_swizzle_tensor(sPt);
+    Tensor sdS = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_ds.data()),
+        SmemLayoutPdS{});
+    Tensor sdS_pi = cute::as_position_independent_swizzle_tensor(sdS);
+    Tensor sdSt = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_ds.data()),
+        SmemLayoutPdSt{});
+    Tensor sdSt_pi = cute::as_position_independent_swizzle_tensor(sdSt);
+    Tensor sdQ = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_dqacc.data()),
+        SmemLayoutdQaccum{});
+
+    static_assert(
+        stride<0>(typename TiledMmaSdP::ALayout{}) == 0 and
+            stride<0>(typename TiledMmaSdP::BLayout{}) == 0 and
+            size<0>(typename TiledMmaSdP::ALayout{}) ==
+                cutlass::NumThreadsPerWarpGroup and
+            size<0>(typename TiledMmaSdP::BLayout{}) ==
+                cutlass::NumThreadsPerWarpGroup,
+        "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
+    constexpr int MmaWarpGroups =
+        NumMmaThreads / cutlass::NumThreadsPerWarpGroup;
+    Layout warp_group_thread_layout = make_layout(
+        make_shape(Int<MmaWarpGroups>{}),
+        make_stride(Int<cutlass::NumThreadsPerWarpGroup>{}));
+    Layout warp_group_thread_layout_dq = make_layout(
+        make_shape(Int<NumMmaWarpGroups>{}),
+        make_stride(Int<cutlass::NumThreadsPerWarpGroup>{}));
+
+    int warp_group_idx = __shfl_sync(
+        0xFFFFFFFF, thread_idx / cutlass::NumThreadsPerWarpGroup, 0);
+    TiledMmaSdP tiled_mma_SdP;
+    using TiledMmadP =
+        std::conditional_t<!Mma_dP_is_RS, TiledMmaSdP, TiledMmadPRS>;
+    TiledMmadP tiled_mma_dP;
+    TiledMmadKV tiled_mma_dKV;
+    TiledMmadQ tiled_mma_dQ;
+
+    auto wg_mma_SdP =
+        tiled_mma_SdP.get_slice(warp_group_thread_layout(warp_group_idx));
+    auto wg_mma_dP =
+        tiled_mma_dP.get_slice(warp_group_thread_layout(warp_group_idx));
+    auto thread_mma_SdP = tiled_mma_SdP.get_thread_slice(thread_idx);
+    auto wg_mma_dKV =
+        tiled_mma_dKV.get_slice(warp_group_thread_layout(warp_group_idx));
+    auto wg_mma_dQ =
+        tiled_mma_dQ.get_slice(warp_group_thread_layout_dq(warp_group_idx));
+
+    auto smem_tiled_copy_PdS =
+        make_tiled_copy_C(SmemCopyAtomPdS{}, tiled_mma_SdP);
+    auto smem_thr_copy_PdS = smem_tiled_copy_PdS.get_thread_slice(thread_idx);
+
+    R2STiledCopydQaccum r2s_tiled_copy_dQaccum;
+    auto r2s_thr_copy_dQaccum =
+        r2s_tiled_copy_dQaccum.get_thread_slice(thread_idx);
+    Tensor tdQsdQaccum = r2s_thr_copy_dQaccum.partition_D(sdQ);
+    // if (thread_idx == 0) { print(sdQ); printf("\n"); print(tdQsdQaccum);
+    // printf("\n"); }
+
+    // Allocate "fragments/descriptors"
+    // We have to use the templated mma_partition_fragment_AB instead of
+    // cute::conditional_return or lambda, because some partition_fragment_A/B
+    // don't compile.
+    // https://stackoverflow.com/questions/50051473/if-constexpr-in-c17-does-not-work-in-a-non-templated-function
+    Tensor tSrQ = mma_partition_fragment_AB</*A=*/!SdP_swapAB>(wg_mma_SdP, sQ);
+    Tensor tSrK = mma_partition_fragment_AB</*A=*/SdP_swapAB>(wg_mma_SdP, sK);
+    Tensor tdPrdO =
+        mma_partition_fragment_AB</*A=*/!SdP_swapAB>(wg_mma_SdP, sdO);
+    Tensor tdPrV = mma_partition_fragment_AB</*A=*/SdP_swapAB>(wg_mma_dP, sV);
+    Tensor tdVrdO =
+        mma_partition_fragment_AB</*A=*/dKV_swapAB>(wg_mma_dKV, sdOt);
+    Tensor tdKrQ = mma_partition_fragment_AB</*A=*/dKV_swapAB>(wg_mma_dKV, sQt);
+    Tensor tdQrdS = mma_partition_fragment_AB</*A=*/!dQ_swapAB>(wg_mma_dQ, sdS);
+    Tensor tdQrK = mma_partition_fragment_AB</*A=*/dQ_swapAB>(wg_mma_dQ, sKt);
+
+    Tensor tPsP = smem_thr_copy_PdS.partition_D(
+        cute::conditional_return<!SdP_swapAB>(
+            sP_pi, sPt_pi)); // ((Atom,AtomNum),PIPE_M,PIPE_N)
+    Tensor tdSsdS = smem_thr_copy_PdS.partition_D(
+        cute::conditional_return<!SdP_swapAB>(
+            sdS_pi, sdSt_pi)); // ((Atom,AtomNum),PIPE_M,PIPE_N)
+    // if (blockIdx.x == 0 && threadIdx.x == 128) { print(smem_thr_copy_PdS);
+    // print(sP_pi); printf("\n"); print(sPt_pi); printf("\n"); print(tPsP);
+    // printf("\n"); print(tdSsdS); printf("\n"); }
+
+    auto consumer_wait = [](auto& pipeline, auto& smem_pipe_read) {
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+    };
+
+    int bidh = get<1>(block_coord);
+    // For the case where we do atomicAdd directly to gdQaccum instead of using
+    // TMA
+    Tensor mdQaccum = make_tensor(
+        make_gmem_ptr(reinterpret_cast<ElementAccum*>(params.ptr_dQaccum)),
+        params.shape_dQaccum,
+        params.stride_dQaccum)(_, bidh, !Jagged ? bidb : 0);
+    Tensor gdQaccum_ = local_tile(
+        domain_offset(
+            make_coord(seqlen_info.offset_q_padded * kHeadDim), mdQaccum),
+        Shape<Int<kBlockM * kHeadDim>>{},
+        make_coord(_)); // (M * K, _)
+    Tensor gdQaccum = cute::flat_divide(
+        gdQaccum_,
+        Int<kBlockM * kHeadDim / NumMmaWarpGroups>{}); // (M * K / WG, WG, _)
+    // We can reuse r2s_thr_copy_dQaccum for this partitioning
+    Tensor tdQgdQaccum = r2s_thr_copy_dQaccum.partition_D(gdQaccum);
+    // if (blockIdx.x == 0 && threadIdx.x == 128) { print(mdQaccum);
+    // printf("\n"); print(gdQaccum_); printf("\n"); print(gdQaccum);
+    // printf("\n"); print(tdQgdQaccum); printf("\n"); }
+
+    hstu::Mask<kBlockM, kBlockN, TiledMmaSdP, SdP_swapAB> mask(
+        thread_idx,
+        seqlen_info.seqlen_q,
+        seqlen_info.seqlen_kv,
+        max_attn_len_,
+        min_full_attn_seq_len_,
+        contextual_seq_len_,
+        seqlen_info.uihlen_q);
+
+    int m_block = m_block_min;
+
+    clear(tdKrdK);
+    clear(tdVrdV);
+    // tiled_mma_dKV.accumulate_ = GMMA::ScaleOut::Zero;
+
+    cutlass::ConsumerToken barrier_token = static_cast<cutlass::BarrierStatus>(
+        shared_storage.pipelines.barrier_KV.try_wait(work_idx % 2));
+    if (barrier_token == cutlass::BarrierStatus::WaitAgain) {
+      shared_storage.pipelines.barrier_KV.wait(work_idx % 2);
+    }
+
+    if constexpr (Mma_dP_is_RS) {
+      using SmemCopyAtomV = Copy_Atom<cute::SM75_U32x4_LDSM_N, Element>;
+      auto smem_tiled_copy_V = make_tiled_copy_A(SmemCopyAtomV{}, tiled_mma_dP);
+      auto smem_thr_copy_V = smem_tiled_copy_V.get_thread_slice(thread_idx);
+      Tensor tdPrV_copy_view = smem_thr_copy_V.retile_D(tdPrV);
+      Tensor tdPsV_copy_view = smem_thr_copy_V.partition_S(
+          cute::as_position_independent_swizzle_tensor(sV));
+      cute::copy(smem_tiled_copy_V, tdPsV_copy_view, tdPrV_copy_view);
+    }
+    static constexpr int Qdim = !SdP_swapAB ? 0 : 1;
+    auto thread0_mma_SdP = tiled_mma_SdP.get_thread_slice(_0{});
+    Tensor cS = cute::make_identity_tensor(
+        Shape<
+            Int<!SdP_swapAB ? kBlockM : kBlockN>,
+            Int<!SdP_swapAB ? kBlockN : kBlockM>>{});
+    Tensor tScS = thread_mma_SdP.partition_C(cS);
+    Tensor tScS_rowcol = make_tensor(
+        tScS.data(),
+        hstu::convert_layout_acc_rowcol</*Transposed=*/SdP_swapAB>(
+            tScS.layout()));
+    Tensor t0ScS = thread0_mma_SdP.partition_C(cS);
+    Tensor t0ScS_rowcol = make_tensor(
+        t0ScS.data(),
+        hstu::convert_layout_acc_rowcol</*Transposed=*/SdP_swapAB>(
+            t0ScS.layout()));
+    int const thread_qdim_offset = get<Qdim>(tScS_rowcol(_0{}, _0{}));
+
+    auto bwd_step = [&](int m_block, auto mask_fn) {
+      Tensor tSrS = partition_fragment_C(
+          tiled_mma_SdP,
+          select<!SdP_swapAB ? 0 : 1, !SdP_swapAB ? 1 : 0>(TileShape_MNK{}));
+      consumer_wait(pipeline_q, smem_pipe_read);
+      hstu::gemm</*zero_init=*/true, /*wg_wait=*/-1, /*SwapAB=*/SdP_swapAB>(
+          tiled_mma_SdP, tSrQ(_, _, _, smem_pipe_read.index()), tSrK, tSrS);
+      Tensor tdPrdP = partition_fragment_C(
+          tiled_mma_SdP,
+          select<!SdP_swapAB ? 0 : 1, !SdP_swapAB ? 1 : 0>(TileShape_MNK{}));
+      PipelineState_dO smem_pipe_read_do_cur =
+          cute::conditional_return<Q_dO_same_stages>(
+              smem_pipe_read, smem_pipe_read_do);
+      consumer_wait(pipeline_do, smem_pipe_read_do_cur);
+      hstu::gemm</*zero_init=*/true, /*wg_wait=*/-1, /*SwapAB=*/SdP_swapAB>(
+          tiled_mma_dP,
+          tdPrdO(_, _, _, smem_pipe_read_do_cur.index()),
+          tdPrV,
+          tdPrdP);
+      warpgroup_wait<1>();
+      // Reshape tSrS from ((2, 2, V), MMA_N, MMA_M) to (nrow=(2, V, MMA_M),
+      // ncol=(2, MMA_N))
+      Tensor scores = make_tensor(
+          tSrS.data(),
+          hstu::convert_layout_acc_rowcol</*Transposed=*/SdP_swapAB>(
+              tSrS.layout()));
+      Tensor tSrS_sigmoid = make_tensor_like(tSrS);
+      Tensor sigmoid = make_tensor(
+          tSrS_sigmoid.data(),
+          hstu::convert_layout_acc_rowcol</*Transposed=*/SdP_swapAB>(
+              tSrS_sigmoid.layout()));
+      int qdim_offset = params.scalar_scale
+          ? 0
+          : m_block * kBlockM + thread_qdim_offset + seqlen_info.offset_q;
+      mask_fn(tSrS, m_block);
+#pragma unroll
+      for (int mi = 0; mi < size<0>(scores); ++mi) {
+        float scale = scalar_scale_val_;
+        if (!params.scalar_scale) {
+          int q_index = qdim_offset + int(get<Qdim>(t0ScS_rowcol(mi, _0{})));
+          int q_local = q_index - seqlen_info.offset_q;
+          if (q_local < seqlen_info.seqlen_q) {
+            scale = params.attn_scale[q_index];
+          }
+        }
+#pragma unroll
+        for (int ni = 0; ni < size<1>(scores); ++ni) {
+          scores(mi, ni) = scores(mi, ni) * params.alpha;
+          sigmoid(mi, ni) =
+              __fdividef(1., 1.0f + cutlass::fast_exp(-scores(mi, ni)));
+          scores(mi, ni) = sigmoid(mi, ni) * scores(mi, ni) * scale;
+        }
+      }
+      mask_fn(tSrS_sigmoid, m_block);
+
+      warpgroup_wait<0>();
+      // Reshape tdPrdP from ((2, 2, V), MMA_N, MMA_M) to (nrow=(2, V, MMA_M),
+      // ncol=(2, MMA_N))
+      Tensor dS = make_tensor(tdPrdP.data(), scores.layout());
+#pragma unroll
+      for (int mi = 0; mi < size<0>(dS); ++mi) {
+        float scale = scalar_scale_val_;
+        if (!params.scalar_scale) {
+          int q_index = qdim_offset + int(get<Qdim>(t0ScS_rowcol(mi, _0{})));
+          int q_local = q_index - seqlen_info.offset_q;
+          if (q_local < seqlen_info.seqlen_q) {
+            scale = params.attn_scale[q_index];
+          }
+        }
+#pragma unroll
+        for (int ni = 0; ni < size<1>(dS); ++ni) {
+          dS(mi, ni) = dS(mi, ni) * sigmoid(mi, ni) * scale +
+              dS(mi, ni) * scores(mi, ni) * (1.f - sigmoid(mi, ni));
+          dS(mi, ni) = dS(mi, ni) * params.alpha;
+          //   if (dS(mi, ni) > 0.0001) {
+          //     std::printf(
+          //         "dS(mi, ni) is (%f), (m, n) is (%d, %d), thread_idx is
+          //         (%d), blockIdx.z is (%d)\n", dS(mi, ni), mi, ni,
+          //         threadIdx.x,
+          //         blockIdx.z);
+          //   }
+        }
+      }
+      // Convert scores from fp32 to fp16/bf16
+      Tensor rP = make_tensor_like<Element>(tSrS);
+      hstu::convert_type_out(tSrS, rP);
+      if constexpr (!Mma_dKV_is_RS) {
+        // Need to sync to make sure P has already been used in the previous
+        // iteration before writing new values
+        if constexpr (kStages_dS == 1) {
+          cutlass::arch::NamedBarrier::sync(
+              NumMmaThreads,
+              static_cast<uint32_t>(BwdNamedBarriers::PdS) /*id*/);
+        }
+        Tensor tPaP =
+            smem_thr_copy_PdS.retile_S(rP); // ((Atom,AtomNum), MMA_N, MMA_N)
+        cute::copy(
+            smem_tiled_copy_PdS,
+            tPaP,
+            tPsP(
+                _,
+                _,
+                _,
+                cute::conditional_return<kStages_dS == 1>(
+                    _0{}, smem_pipe_read.index())));
+      }
+      Tensor rdS = make_tensor_like<Element>(tdPrdP);
+      hstu::convert_type_out(tdPrdP, rdS);
+      // If there's double buffering on dS, we don't need to sync here.
+      // Otherwise we might have WG1 writing to dS before WG2 is done reading
+      // from it during MmadQ. But because both WGs have to sync at the end of
+      // the loop and double buffering, this race condition is not possible.
+      // This sync is to ensure (1) P is written in case of !Mma_dKV_is_RS and
+      // (2) dS is already read by the Mma in the previous iteration in case of
+      // Mma_dKV_is_RS.
+      if constexpr (!Mma_dKV_is_RS || (kStages_dS == 1 && Mma_dKV_is_RS)) {
+        cutlass::arch::fence_view_async_shared();
+        cutlass::arch::NamedBarrier::sync(
+            NumMmaThreads, static_cast<uint32_t>(BwdNamedBarriers::PdS) /*id*/);
+      }
+      // For hdim 64, It's faster to write to smem_dS first before the dV gemm
+      Tensor tdSadS =
+          smem_thr_copy_PdS.retile_S(rdS); // ((Atom,AtomNum), MMA_N, MMA_N)
+      cute::copy(
+          smem_tiled_copy_PdS,
+          tdSadS,
+          tdSsdS(
+              _,
+              _,
+              _,
+              cute::conditional_return<kStages_dS == 1>(
+                  _0{}, smem_pipe_read.index())));
+
+      if constexpr (!Slice_dQKV_Mma) {
+        // Most cases take this path, except for hdim256 where we want to slice
+        // to reduce register pressure
+        if constexpr (Mma_dKV_is_RS) {
+          Tensor tdVrP = make_tensor(
+              rP.data(), convert_layout_acc_Aregs<TiledMmadKV>(tSrS.layout()));
+          hstu::gemm</*zero_init=*/false, /*wg_wait=*/-1>(
+              tiled_mma_dKV,
+              tdVrP,
+              tdVrdO(_, _, _, smem_pipe_read_do_cur.index()),
+              tdVrdV);
+        } else {
+          Tensor tdVrP =
+              mma_partition_fragment_AB</*A=*/!dKV_swapAB>(wg_mma_dKV, sPt);
+          Tensor tdVrP_cur = tdVrP(
+              _,
+              _,
+              _,
+              cute::conditional_return<kStages_dS == 1>(
+                  _0{}, smem_pipe_read.index()));
+          hstu::
+              gemm</*zero_init=*/false, /*wg_wait=*/-1, /*SwapAB=*/dKV_swapAB>(
+                  tiled_mma_dKV,
+                  tdVrP_cur,
+                  tdVrdO(_, _, _, smem_pipe_read_do_cur.index()),
+                  tdVrdV);
+        }
+        // SMEM fence to make sure sdS is written before it's read by WGMMA
+        cutlass::arch::fence_view_async_shared();
+        cutlass::arch::NamedBarrier::sync(
+            NumMmaThreads, static_cast<uint32_t>(BwdNamedBarriers::PdS) /*id*/);
+        Tensor tdQrdQ = partition_fragment_C(
+            tiled_mma_dQ,
+            select<!dQ_swapAB ? 0 : 2, !dQ_swapAB ? 2 : 0>(TileShape_MNK{}));
+        Tensor tdQrdS_cur = tdQrdS(
+            _,
+            _,
+            _,
+            cute::conditional_return<kStages_dS == 1>(
+                _0{}, smem_pipe_read.index()));
+        hstu::gemm</*zero_init=*/true, /*wg_wait=*/1, /*SwapAB=*/dQ_swapAB>(
+            tiled_mma_dQ, tdQrdS_cur, tdQrK, tdQrdQ);
+        pipeline_do.consumer_release(smem_pipe_read_do_cur); // release dQ
+
+        if constexpr (Mma_dKV_is_RS) {
+          Tensor tdKrdS = make_tensor(
+              rdS.data(),
+              convert_layout_acc_Aregs<TiledMmadKV>(tdPrdP.layout()));
+          hstu::gemm</*zero_init=*/false, /*wg_wait=*/1>(
+              tiled_mma_dKV,
+              tdKrdS,
+              tdKrQ(_, _, _, smem_pipe_read.index()),
+              tdKrdK);
+        } else {
+          Tensor tdKrdS =
+              mma_partition_fragment_AB</*A=*/!dKV_swapAB>(wg_mma_dKV, sdSt);
+          Tensor tdKrdS_cur = tdKrdS(
+              _,
+              _,
+              _,
+              cute::conditional_return<kStages_dS == 1>(
+                  _0{}, smem_pipe_read.index()));
+          hstu::gemm</*zero_init=*/false, /*wg_wait=*/1, /*SwapAB=*/dKV_swapAB>(
+              tiled_mma_dKV,
+              tdKrdS_cur,
+              tdKrQ(_, _, _, smem_pipe_read.index()),
+              tdKrdK);
+        }
+        if constexpr (dQacc_use_TMA) {
+          int const warp_group_idx =
+              hstu::canonical_warp_group_idx_nosync() - 1;
+          cutlass::arch::NamedBarrier::sync(
+              cutlass::NumThreadsPerWarpGroup + cutlass::NumThreadsPerWarp,
+              static_cast<uint32_t>(BwdNamedBarriers::dQEmptyWG1) +
+                  warp_group_idx /*id*/); // sdQ full, to be written to gmem
+          Tensor taccdQrdQ = r2s_thr_copy_dQaccum.retile_S(tdQrdQ);
+          cute::copy(r2s_tiled_copy_dQaccum, taccdQrdQ, tdQsdQaccum);
+          cutlass::arch::fence_view_async_shared();
+          cutlass::arch::NamedBarrier::arrive(
+              cutlass::NumThreadsPerWarpGroup + cutlass::NumThreadsPerWarp,
+              static_cast<uint32_t>(BwdNamedBarriers::dQFullWG1) +
+                  warp_group_idx /*id*/); // sdQ full, to be written to gmem
+        } else {
+          // We can reuse r2s_thr_copy_dQaccum for this partitioning
+          Tensor tdQrdQ_atomic =
+              recast<float4>(r2s_thr_copy_dQaccum.retile_S(tdQrdQ));
+          Tensor tdQgdQaccum_atomic =
+              recast<float4>(tdQgdQaccum(_, _, _, m_block));
+          static_assert(
+              CUTE_STATIC_V(size(tdQrdQ_atomic)) ==
+              CUTE_STATIC_V(size(tdQgdQaccum_atomic)));
+#pragma unroll
+          for (int i = 0; i < size(tdQrdQ_atomic); ++i) {
+            atomicAdd(&tdQgdQaccum_atomic(i), tdQrdQ_atomic(i));
+          }
+        }
+
+      } else { // Slice_dQKV_Mma
+
+        static_assert(!(Slice_dQKV_Mma && Mma_dKV_is_RS));
+        Tensor tdVrP =
+            mma_partition_fragment_AB</*A=*/!dKV_swapAB>(wg_mma_dKV, sPt);
+        Tensor tdVrP_cur = tdVrP(
+            _,
+            _,
+            _,
+            cute::conditional_return<kStages_dS == 1>(
+                _0{}, smem_pipe_read.index()));
+        hstu::gemm<
+            /*zero_init=*/false,
+            /*wg_wait=*/-1,
+            /*SwapAB=*/dKV_swapAB,
+            /*M_slice=*/0>(
+            tiled_mma_dKV,
+            tdVrP_cur,
+            tdVrdO(_, _, _, smem_pipe_read_do_cur.index()),
+            tdVrdV);
+
+        cutlass::arch::fence_view_async_shared();
+        cutlass::arch::NamedBarrier::sync(
+            NumMmaThreads, static_cast<uint32_t>(BwdNamedBarriers::PdS) /*id*/);
+        Tensor tdQrdQ = partition_fragment_C(
+            tiled_mma_dQ,
+            select<!dQ_swapAB ? 0 : 2, !dQ_swapAB ? 2 : 0>(TileShape_MNK{}));
+        Tensor tdQrdS_cur = tdQrdS(
+            _,
+            _,
+            _,
+            cute::conditional_return<kStages_dS == 1>(
+                _0{}, smem_pipe_read.index()));
+        hstu::gemm<
+            /*zero_init=*/true,
+            /*wg_wait=*/-1,
+            /*SwapAB=*/dQ_swapAB,
+            /*M_slice=*/0>(tiled_mma_dQ, tdQrdS_cur, tdQrK, tdQrdQ);
+        hstu::gemm<
+            /*zero_init=*/false,
+            /*wg_wait=*/1,
+            /*SwapAB=*/dKV_swapAB,
+            /*M_slice=*/1>(
+            tiled_mma_dKV,
+            tdVrP_cur,
+            tdVrdO(_, _, _, smem_pipe_read_do_cur.index()),
+            tdVrdV);
+        Tensor tdQrdQ_atomic =
+            recast<float4>(r2s_thr_copy_dQaccum.retile_S(tdQrdQ));
+        Tensor tdQgdQaccum_atomic =
+            recast<float4>(tdQgdQaccum(_, _, _, m_block));
+#pragma unroll
+        for (int i = 0; i < size(tdQrdQ_atomic) / 2; ++i) {
+          atomicAdd(&tdQgdQaccum_atomic(i), tdQrdQ_atomic(i));
+        }
+
+        Tensor tdKrdS =
+            mma_partition_fragment_AB</*A=*/!dKV_swapAB>(wg_mma_dKV, sdSt);
+        Tensor tdKrdS_cur = tdKrdS(
+            _,
+            _,
+            _,
+            cute::conditional_return<kStages_dS == 1>(
+                _0{}, smem_pipe_read.index()));
+        hstu::gemm<
+            /*zero_init=*/false,
+            /*wg_wait=*/1,
+            /*SwapAB=*/dKV_swapAB,
+            /*M_slice=*/0>(
+            tiled_mma_dKV,
+            tdKrdS_cur,
+            tdKrQ(_, _, _, smem_pipe_read.index()),
+            tdKrdK);
+        pipeline_do.consumer_release(smem_pipe_read_do_cur); // release dO
+
+        hstu::gemm<
+            /*zero_init=*/true,
+            /*wg_wait=*/0,
+            /*SwapAB=*/dQ_swapAB,
+            /*M_slice=*/1>(tiled_mma_dQ, tdQrdS_cur, tdQrK, tdQrdQ);
+#pragma unroll
+        for (int i = size(tdQrdQ_atomic) / 2; i < size(tdQrdQ_atomic); ++i) {
+          atomicAdd(&tdQgdQaccum_atomic(i), tdQrdQ_atomic(i));
+        }
+
+        hstu::gemm<
+            /*zero_init=*/false,
+            /*wg_wait=*/-1,
+            /*SwapAB=*/dKV_swapAB,
+            /*M_slice=*/1>(
+            tiled_mma_dKV,
+            tdKrdS_cur,
+            tdKrQ(_, _, _, smem_pipe_read.index()),
+            tdKrdK);
+      }
+
+      warpgroup_wait<0>();
+      pipeline_q.consumer_release(smem_pipe_read); // release Q
+      ++smem_pipe_read;
+      if constexpr (!Q_dO_same_stages) {
+        ++smem_pipe_read_do;
+      }
+    };
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+    static constexpr int kBlockN = get<1>(TileShape_MNK{});
+
+    if constexpr (Cross) {
+      auto mask_fn = [&](auto& tSrS, int m_block) {
+        mask.template apply<
+            true /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            Causal,
+            false /*Local*/,
+            false /*Contexual_mask*/,
+            false /*Target_mask*/,
+            Cross,
+            false /*Softmax*/>(tSrS, m_block, n_block);
+      };
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (; m_block < m_block_max; ++m_block) {
+        bwd_step(m_block, mask_fn);
+      }
+      if constexpr (Q_dO_same_stages) {
+        smem_pipe_read_do = smem_pipe_read;
+      }
+      ++work_idx;
+      return true;
+    }
+    if constexpr (Has_targets) {
+      if (n_block * kBlockN >= seqlen_info.uihlen_q) {
+        auto mask_fn = [&](auto& tSrS, int m_block) {
+          mask.template apply<
+              true /*Seqlenq_mask*/,
+              true /*Seqlenk_mask*/,
+              false /*Causal*/,
+              false /*Local*/,
+              false /*Contexual_mask*/,
+              Has_targets /*Target_mask*/,
+              false /*Cross*/,
+              false /*Softmax*/>(tSrS, m_block, n_block);
+        };
+        CUTLASS_PRAGMA_NO_UNROLL
+        for (; m_block < m_block_max; ++m_block) {
+          bwd_step(m_block, mask_fn);
+        }
+        if constexpr (Q_dO_same_stages) {
+          smem_pipe_read_do = smem_pipe_read;
+        }
+        ++work_idx;
+        return true;
+      } else if ((n_block + 1) * kBlockN >= seqlen_info.uihlen_q) {
+        if constexpr ((Causal || Local) && SeparateMaskingIterations) {
+          auto mask_fn = [&](auto& tSrS, int m_block) {
+            mask.template apply<
+                true /*Seqlenq_mask*/,
+                true /*Seqlenk_mask*/,
+                Causal,
+                Local,
+                Contexual_mask,
+                Has_targets /*Target_mask*/,
+                false /*Cross*/,
+                false /*Softmax*/>(tSrS, m_block, n_block);
+          };
+          int const m_block_masking_max =
+              ((n_block + 1) * kBlockN - 1) / kBlockM + 1;
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (; m_block < std::min(m_block_max, m_block_masking_max);
+               ++m_block) {
+            bwd_step(m_block, mask_fn);
+          }
+        }
+
+        auto mask_fn = [&](auto& tSrS, int m_block) {
+          mask.template apply<
+              true /*Seqlenq_mask*/,
+              true /*Seqlenk_mask*/,
+              Causal && !SeparateMaskingIterations,
+              Local && !SeparateMaskingIterations,
+              Contexual_mask,
+              Has_targets /*Target_mask*/,
+              false /*Cross*/,
+              false /*Softmax*/>(tSrS, m_block, n_block);
+        };
+        if constexpr (SeparateMaskingIterations) {
+          int const m_block_max_before_local_mask =
+              !Local || !SeparateMaskingIterations
+              ? m_block_max
+              : std::min(
+                    m_block_max, (n_block * kBlockN + max_attn_len_) / kBlockM);
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (; m_block < m_block_max_before_local_mask; ++m_block) {
+            bwd_step(m_block, mask_fn);
+          }
+        } else {
+          int num_m_block = m_block_max - m_block_min;
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (int i = 0; i < num_m_block + full_m_block_max -
+                   full_m_block_min + contexual_m_block_max;
+               ++i) {
+            if (i < num_m_block) {
+              m_block = m_block_min + i;
+            } else if (i < num_m_block + contexual_m_block_max) {
+              m_block = i - num_m_block;
+            } else {
+              m_block =
+                  i - num_m_block - contexual_m_block_max + full_m_block_min;
+            }
+            bwd_step(m_block, mask_fn);
+          }
+        }
+
+        if constexpr (Local && SeparateMaskingIterations) {
+          auto mask_fn = [&](auto& tSrS, int m_block) {
+            mask.template apply<
+                true /*Seqlenq_mask*/,
+                true /*Seqlenk_mask*/,
+                false /*Causal_mask*/,
+                Local,
+                Contexual_mask,
+                Has_targets /*Target_mask*/,
+                false /*Cross*/,
+                false /*Softmax*/>(tSrS, m_block, n_block);
+          };
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (; m_block < m_block_max; ++m_block) {
+            bwd_step(m_block, mask_fn);
+          }
+        }
+        if constexpr (Contexual_mask && SeparateMaskingIterations) {
+          auto mask_fn = [&](auto& tSrS, int m_block) {
+            mask.template apply<
+                true /*Seqlenq_mask*/,
+                true /*Seqlenk_mask*/,
+                Causal /*Causal_mask*/,
+                Local /*Local_mask*/,
+                Contexual_mask,
+                Has_targets,
+                false /*Cross*/,
+                false /*Softmax*/>(tSrS, m_block, n_block);
+          };
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (m_block = 0; m_block < contexual_m_block_max; ++m_block) {
+            bwd_step(m_block, mask_fn);
+          }
+        }
+
+        if constexpr (Local && SeparateMaskingIterations) {
+          auto mask_fn = [&](auto& tSrS, int m_block) {
+            mask.template apply<
+                true /*Seqlenq_mask*/,
+                true /*Seqlenk_mask*/,
+                false /*Causal_mask*/,
+                Local,
+                Contexual_mask,
+                Has_targets,
+                false /*Cross*/,
+                false /*Softmax*/>(tSrS, m_block, n_block);
+          };
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (m_block = full_m_block_min; m_block < full_m_block_max;
+               ++m_block) {
+            bwd_step(m_block, mask_fn);
+          }
+        }
+        if constexpr (Q_dO_same_stages) {
+          smem_pipe_read_do = smem_pipe_read;
+        }
+        ++work_idx;
+        return true;
+      }
+    }
+    // We have separate iterations with causal masking. Not necessary for hdim
+    // 128 but for hdim 64 this helps quite a bit to not have to do causal
+    // masking for most of the iterations.
+    if constexpr ((Causal || Local) && SeparateMaskingIterations) {
+      auto mask_fn = [&](auto& tSrS, int m_block) {
+        mask.template apply<
+            true /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            Causal,
+            Local,
+            Contexual_mask,
+            false /*Target_mask*/,
+            false /*Cross*/,
+            false /*Softmax*/>(tSrS, m_block, n_block);
+      };
+      static constexpr int kBlockM = get<0>(TileShape_MNK{});
+      int const m_block_masking_max =
+          ((n_block + 1) * kBlockN - 1) / kBlockM + 1;
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (; m_block < std::min(m_block_max, m_block_masking_max); ++m_block) {
+        bwd_step(m_block, mask_fn);
+      }
+    }
+
+    auto mask_fn = [&](auto& tSrS, int m_block) {
+      mask.template apply<
+          true /*Seqlenq_mask*/,
+          true /*Seqlenk_mask*/,
+          Causal && !SeparateMaskingIterations,
+          Local && !SeparateMaskingIterations,
+          Contexual_mask,
+          false /*Target_mask*/,
+          false /*Cross*/,
+          false /*Softmax*/>(tSrS, m_block, n_block);
+    };
+    if constexpr (SeparateMaskingIterations) {
+      int const m_block_max_before_local_mask =
+          !Local || !SeparateMaskingIterations
+          ? m_block_max
+          : std::min(
+                m_block_max, (n_block * kBlockN + max_attn_len_) / kBlockM);
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (; m_block < m_block_max_before_local_mask; ++m_block) {
+        bwd_step(m_block, mask_fn);
+      }
+    } else {
+      int num_m_block = m_block_max - m_block_min;
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (int i = 0; i < num_m_block + full_m_block_max - full_m_block_min +
+               contexual_m_block_max;
+           ++i) {
+        if (i < num_m_block) {
+          m_block = m_block_min + i;
+        } else if (i < num_m_block + contexual_m_block_max) {
+          m_block = i - num_m_block;
+        } else {
+          m_block = i - num_m_block - contexual_m_block_max + full_m_block_min;
+        }
+        bwd_step(m_block, mask_fn);
+      }
+    }
+
+    if constexpr (Local && SeparateMaskingIterations) {
+      auto mask_fn = [&](auto& tSrS, int m_block) {
+        mask.template apply<
+            true /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            false /*Causal_mask*/,
+            Local,
+            Contexual_mask,
+            false /*Target_mask*/,
+            false /*Cross*/,
+            false /*Softmax*/>(tSrS, m_block, n_block);
+      };
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (; m_block < m_block_max; ++m_block) {
+        bwd_step(m_block, mask_fn);
+      }
+    }
+    if constexpr (Contexual_mask && SeparateMaskingIterations) {
+      auto mask_fn = [&](auto& tSrS, int m_block) {
+        mask.template apply<
+            true /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            Causal /*Causal_mask*/,
+            Local /*Local_mask*/,
+            Contexual_mask,
+            false /*Target_mask*/,
+            false /*Cross*/,
+            false /*Softmax*/>(tSrS, m_block, n_block);
+      };
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (m_block = 0; m_block < contexual_m_block_max; ++m_block) {
+        bwd_step(m_block, mask_fn);
+      }
+    }
+
+    if constexpr (Local && SeparateMaskingIterations) {
+      auto mask_fn = [&](auto& tSrS, int m_block) {
+        mask.template apply<
+            true /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            false /*Causal_mask*/,
+            Local,
+            Contexual_mask,
+            false /*Target_mask*/,
+            false /*Cross*/,
+            false /*Softmax*/>(tSrS, m_block, n_block);
+      };
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (m_block = full_m_block_min; m_block < full_m_block_max; ++m_block) {
+        bwd_step(m_block, mask_fn);
+      }
+    }
+
+    // if (blockIdx.x == 0 && threadIdx.x == 128) { print_tensor(tdVrdV); }
+    if constexpr (Q_dO_same_stages) {
+      smem_pipe_read_do = smem_pipe_read;
+    }
+    ++work_idx;
+    return true;
+  }
+
+  template <typename SharedStorage, typename FrgTensordKV>
+  CUTLASS_DEVICE bool mma_softmax(
+      Params const& params,
+      MainloopPipeline pipeline_q,
+      MainloopPipeline_dO pipeline_do,
+      PipelineState& smem_pipe_read,
+      PipelineState_dO& smem_pipe_read_do,
+      FrgTensordKV& tdKrdK,
+      FrgTensordKV& tdVrdV,
+      int thread_idx,
+      int& work_idx,
+      cute::tuple<int32_t, int32_t, int32_t> block_coord,
+      SharedStorage& shared_storage) {
+    static_assert(
+        is_rmem<FrgTensordKV>::value,
+        "dK and dV tensor must be rmem resident.");
+
+    int n_block = get<0>(block_coord);
+    int bidb = get<2>(block_coord);
+    SeqlenInfo_t seqlen_info{
+        bidb,
+        get<0>(params.shape_Q),
+        get<0>(params.shape_K),
+        params.seq_offsets,
+        params.seq_offsets_q,
+        params.num_targets};
+    if constexpr (Jagged) {
+      static constexpr int kBlockN = get<1>(TileShape_MNK{});
+      if (n_block * kBlockN >= seqlen_info.seqlen_kv) {
+        return false;
+      }
+    }
+    int min_full_attn_seq_len_, max_attn_len_, contextual_seq_len_;
+    if constexpr (!Cross) {
+      if (params.num_groups > 1) {
+        int group_id = bidb / params.num_groups;
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len_tensor[group_id];
+        max_attn_len_ = params.max_attn_len_tensor[group_id];
+        contextual_seq_len_ = params.contextual_seq_len_tensor[group_id];
+      } else {
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len;
+        max_attn_len_ = params.max_attn_len;
+        contextual_seq_len_ = params.contextual_seq_len;
+      }
+    }
+    int m_block_min, m_block_max;
+    if constexpr (Cross) {
+      auto m_block_min_max = get_cross_m_block_min_max(
+          seqlen_info.uihlen_q,
+          seqlen_info.seqlen_q,
+          seqlen_info.seqlen_kv,
+          n_block);
+      m_block_min = get<0>(m_block_min_max);
+      m_block_max = get<1>(m_block_min_max);
+    } else {
+      auto m_block_min_max = get_m_block_min_max(
+          max_attn_len_,
+          contextual_seq_len_,
+          seqlen_info.uihlen_q,
+          seqlen_info.seqlen_q,
+          n_block);
+      m_block_min = get<0>(m_block_min_max);
+      m_block_max = get<1>(m_block_min_max);
+    }
+    auto full_m_block_min_max = get_full_m_block_min_max(
+        seqlen_info.uihlen_q,
+        seqlen_info.seqlen_q,
+        min_full_attn_seq_len_,
+        m_block_max,
+        n_block);
+    int const full_m_block_min = get<0>(full_m_block_min_max);
+    int const full_m_block_max = get<1>(full_m_block_min_max);
+    int contexual_m_block_max = get_contexual_m_block_max(
+        seqlen_info.uihlen_q, contextual_seq_len_, m_block_min, n_block);
+
+    Tensor sQ = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
+        SmemLayoutQ{});
+    Tensor sdO = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_do.data()),
+        SmemLayoutdO{});
+    Tensor sK = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_k.data()),
+        SmemLayoutK{});
+    Tensor sV = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_v.data()),
+        SmemLayoutV{});
+    Tensor sQt = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
+        SmemLayoutQt{});
+    Tensor sdOt = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_do.data()),
+        SmemLayoutdOt{});
+    Tensor sKt = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_k.data()),
+        SmemLayoutKt{});
+    Tensor sP = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_p.data()),
+        SmemLayoutPdS{});
+    Tensor sP_pi = cute::as_position_independent_swizzle_tensor(sP);
+    Tensor sPt = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_p.data()),
+        SmemLayoutPdSt{});
+    Tensor sPt_pi = cute::as_position_independent_swizzle_tensor(sPt);
+    Tensor sdS = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_ds.data()),
+        SmemLayoutPdS{});
+    Tensor sdS_pi = cute::as_position_independent_swizzle_tensor(sdS);
+    Tensor sdSt = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_ds.data()),
+        SmemLayoutPdSt{});
+    Tensor sdSt_pi = cute::as_position_independent_swizzle_tensor(sdSt);
+    Tensor sdQ = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_dqacc.data()),
+        SmemLayoutdQaccum{});
+    Tensor sLSEMma = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_lse.data()),
+        SmemLayoutLSEMma{});
+    Tensor sdPsumMma = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_dpsum.data()),
+        SmemLayoutLSEMma{});
+
+    static_assert(
+        stride<0>(typename TiledMmaSdP::ALayout{}) == 0 and
+            stride<0>(typename TiledMmaSdP::BLayout{}) == 0 and
+            size<0>(typename TiledMmaSdP::ALayout{}) ==
+                cutlass::NumThreadsPerWarpGroup and
+            size<0>(typename TiledMmaSdP::BLayout{}) ==
+                cutlass::NumThreadsPerWarpGroup,
+        "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
+    constexpr int MmaWarpGroups =
+        NumMmaThreads / cutlass::NumThreadsPerWarpGroup;
+    Layout warp_group_thread_layout = make_layout(
+        make_shape(Int<MmaWarpGroups>{}),
+        make_stride(Int<cutlass::NumThreadsPerWarpGroup>{}));
+    Layout warp_group_thread_layout_dq = make_layout(
+        make_shape(Int<NumMmaWarpGroups>{}),
+        make_stride(Int<cutlass::NumThreadsPerWarpGroup>{}));
+
+    int warp_group_idx = __shfl_sync(
+        0xFFFFFFFF, thread_idx / cutlass::NumThreadsPerWarpGroup, 0);
+    TiledMmaSdP tiled_mma_SdP;
+    using TiledMmadP =
+        std::conditional_t<!Mma_dP_is_RS, TiledMmaSdP, TiledMmadPRS>;
+    TiledMmadP tiled_mma_dP;
+    TiledMmadKV tiled_mma_dKV;
+    TiledMmadQ tiled_mma_dQ;
+
+    auto wg_mma_SdP =
+        tiled_mma_SdP.get_slice(warp_group_thread_layout(warp_group_idx));
+    auto wg_mma_dP =
+        tiled_mma_dP.get_slice(warp_group_thread_layout(warp_group_idx));
+    auto thread_mma_SdP = tiled_mma_SdP.get_thread_slice(thread_idx);
+    auto wg_mma_dKV =
+        tiled_mma_dKV.get_slice(warp_group_thread_layout(warp_group_idx));
+    auto wg_mma_dQ =
+        tiled_mma_dQ.get_slice(warp_group_thread_layout_dq(warp_group_idx));
+
+    auto smem_tiled_copy_PdS =
+        make_tiled_copy_C(SmemCopyAtomPdS{}, tiled_mma_SdP);
+    auto smem_thr_copy_PdS = smem_tiled_copy_PdS.get_thread_slice(thread_idx);
+
+    R2STiledCopydQaccum r2s_tiled_copy_dQaccum;
+    auto r2s_thr_copy_dQaccum =
+        r2s_tiled_copy_dQaccum.get_thread_slice(thread_idx);
+    Tensor tdQsdQaccum = r2s_thr_copy_dQaccum.partition_D(sdQ);
+    // if (thread_idx == 0) { print(sdQ); printf("\n"); print(tdQsdQaccum);
+    // printf("\n"); }
+
+    // Allocate "fragments/descriptors"
+    // We have to use the templated mma_partition_fragment_AB instead of
+    // cute::conditional_return or lambda, because some partition_fragment_A/B
+    // don't compile.
+    // https://stackoverflow.com/questions/50051473/if-constexpr-in-c17-does-not-work-in-a-non-templated-function
+    Tensor tSrQ = mma_partition_fragment_AB</*A=*/!SdP_swapAB>(wg_mma_SdP, sQ);
+    Tensor tSrK = mma_partition_fragment_AB</*A=*/SdP_swapAB>(wg_mma_SdP, sK);
+    Tensor tdPrdO =
+        mma_partition_fragment_AB</*A=*/!SdP_swapAB>(wg_mma_SdP, sdO);
+    Tensor tdPrV = mma_partition_fragment_AB</*A=*/SdP_swapAB>(wg_mma_dP, sV);
+    Tensor tdVrdO =
+        mma_partition_fragment_AB</*A=*/dKV_swapAB>(wg_mma_dKV, sdOt);
+    Tensor tdKrQ = mma_partition_fragment_AB</*A=*/dKV_swapAB>(wg_mma_dKV, sQt);
+    Tensor tdQrdS = mma_partition_fragment_AB</*A=*/!dQ_swapAB>(wg_mma_dQ, sdS);
+    Tensor tdQrK = mma_partition_fragment_AB</*A=*/dQ_swapAB>(wg_mma_dQ, sKt);
+
+    Tensor tPsP = smem_thr_copy_PdS.partition_D(
+        cute::conditional_return<!SdP_swapAB>(
+            sP_pi, sPt_pi)); // ((Atom,AtomNum),PIPE_M,PIPE_N)
+    Tensor tdSsdS = smem_thr_copy_PdS.partition_D(
+        cute::conditional_return<!SdP_swapAB>(
+            sdS_pi, sdSt_pi)); // ((Atom,AtomNum),PIPE_M,PIPE_N)
+    // if (blockIdx.x == 0 && threadIdx.x == 128) { print(smem_thr_copy_PdS);
+    // print(sP_pi); printf("\n"); print(sPt_pi); printf("\n"); print(tPsP);
+    // printf("\n"); print(tdSsdS); printf("\n"); }
+
+    // thread_mma_SdP.partition_C(sLSEMma) has shape ((2, 2, V), MMA_M, MMA_N,
+    // PIPE), we only take the col indices or row indices, depending on whether
+    // SdP_swapAB.
+    Tensor tLSEsLSE = cute::conditional_return<!SdP_swapAB>(
+        group_modes<0, 2>(thread_mma_SdP.partition_C(sLSEMma)(
+            make_coord(_0{}, _, _0{}), _, _0{}, _)), // (2, MMA_M, PIPE)
+        group_modes<0, 3>(thread_mma_SdP.partition_C(sLSEMma)(
+            make_coord(_, _0{}, _), _0{}, _, _))); // (2, V, MMA_N, PIPE)
+    Tensor tLSEsdPsum = cute::conditional_return<!SdP_swapAB>(
+        group_modes<0, 2>(thread_mma_SdP.partition_C(sdPsumMma)(
+            make_coord(_0{}, _, _0{}), _, _0{}, _)),
+        group_modes<0, 3>(thread_mma_SdP.partition_C(sdPsumMma)(
+            make_coord(_, _0{}, _), _0{}, _, _)));
+    // if (blockIdx.x == 0 && threadIdx.x == 128) { print(sLSEMma);
+    // printf("\n"); print(tLSEsLSE); printf("\n"); } If we want to split the
+    // stats among the 8 threads that share the same rows.
+    static constexpr int kStatsPerThread =
+        cute::ceil_div(decltype(size(tLSEsLSE))::value, 8);
+
+    auto consumer_wait = [](auto& pipeline, auto& smem_pipe_read) {
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+    };
+
+    int bidh = get<1>(block_coord);
+    // For the case where we do atomicAdd directly to gdQaccum instead of using
+    // TMA
+    Tensor mdQaccum = make_tensor(
+        make_gmem_ptr(reinterpret_cast<ElementAccum*>(params.ptr_dQaccum)),
+        params.shape_dQaccum,
+        params.stride_dQaccum)(_, bidh, !Jagged ? bidb : 0);
+    Tensor gdQaccum_ = local_tile(
+        domain_offset(
+            make_coord(seqlen_info.offset_q_padded * kHeadDim), mdQaccum),
+        Shape<Int<kBlockM * kHeadDim>>{},
+        make_coord(_)); // (M * K, _)
+    Tensor gdQaccum = cute::flat_divide(
+        gdQaccum_,
+        Int<kBlockM * kHeadDim / NumMmaWarpGroups>{}); // (M * K / WG, WG, _)
+    // We can reuse r2s_thr_copy_dQaccum for this partitioning
+    Tensor tdQgdQaccum = r2s_thr_copy_dQaccum.partition_D(gdQaccum);
+    // if (blockIdx.x == 0 && threadIdx.x == 128) { print(mdQaccum);
+    // printf("\n"); print(gdQaccum_); printf("\n"); print(gdQaccum);
+    // printf("\n"); print(tdQgdQaccum); printf("\n"); }
+
+    hstu::Mask<kBlockM, kBlockN, TiledMmaSdP, SdP_swapAB> mask(
+        thread_idx,
+        seqlen_info.seqlen_q,
+        seqlen_info.seqlen_kv,
+        max_attn_len_,
+        min_full_attn_seq_len_,
+        contextual_seq_len_,
+        seqlen_info.uihlen_q);
+
+    int m_block = m_block_min;
+
+    clear(tdKrdK);
+    clear(tdVrdV);
+    // tiled_mma_dKV.accumulate_ = GMMA::ScaleOut::Zero;
+
+    cutlass::ConsumerToken barrier_token = static_cast<cutlass::BarrierStatus>(
+        shared_storage.pipelines.barrier_KV.try_wait(work_idx % 2));
+    if (barrier_token == cutlass::BarrierStatus::WaitAgain) {
+      shared_storage.pipelines.barrier_KV.wait(work_idx % 2);
+    }
+
+    if constexpr (Mma_dP_is_RS) {
+      using SmemCopyAtomV = Copy_Atom<cute::SM75_U32x4_LDSM_N, Element>;
+      auto smem_tiled_copy_V = make_tiled_copy_A(SmemCopyAtomV{}, tiled_mma_dP);
+      auto smem_thr_copy_V = smem_tiled_copy_V.get_thread_slice(thread_idx);
+      Tensor tdPrV_copy_view = smem_thr_copy_V.retile_D(tdPrV);
+      Tensor tdPsV_copy_view = smem_thr_copy_V.partition_S(
+          cute::as_position_independent_swizzle_tensor(sV));
+      cute::copy(smem_tiled_copy_V, tdPsV_copy_view, tdPrV_copy_view);
+    }
+    static constexpr int Qdim = !SdP_swapAB ? 0 : 1;
+    auto thread0_mma_SdP = tiled_mma_SdP.get_thread_slice(_0{});
+    Tensor cS = cute::make_identity_tensor(
+        Shape<
+            Int<!SdP_swapAB ? kBlockM : kBlockN>,
+            Int<!SdP_swapAB ? kBlockN : kBlockM>>{});
+    Tensor tScS = thread_mma_SdP.partition_C(cS);
+    Tensor tScS_rowcol = make_tensor(
+        tScS.data(),
+        hstu::convert_layout_acc_rowcol</*Transposed=*/SdP_swapAB>(
+            tScS.layout()));
+    Tensor t0ScS = thread0_mma_SdP.partition_C(cS);
+    Tensor t0ScS_rowcol = make_tensor(
+        t0ScS.data(),
+        hstu::convert_layout_acc_rowcol</*Transposed=*/SdP_swapAB>(
+            t0ScS.layout()));
+    int const thread_qdim_offset = get<Qdim>(tScS_rowcol(_0{}, _0{}));
+
+    auto bwd_step = [&](int m_block, auto mask_fn) {
+      Tensor tSrS = partition_fragment_C(
+          tiled_mma_SdP,
+          select<!SdP_swapAB ? 0 : 1, !SdP_swapAB ? 1 : 0>(TileShape_MNK{}));
+      consumer_wait(pipeline_q, smem_pipe_read);
+      hstu::gemm</*zero_init=*/true, /*wg_wait=*/-1, /*SwapAB=*/SdP_swapAB>(
+          tiled_mma_SdP, tSrQ(_, _, _, smem_pipe_read.index()), tSrK, tSrS);
+      Tensor tLSErLSE = cute::conditional_return<!ShuffleLSE>(
+          make_fragment_like(tLSEsLSE(_, _0{})),
+          make_tensor<ElementAccum>(Int<kStatsPerThread>{}));
+      if constexpr (!ShuffleLSE) {
+        cute::copy(tLSEsLSE(_, smem_pipe_read.index()), tLSErLSE);
+      } else {
+#pragma unroll
+        for (int i = 0; i < kStatsPerThread; ++i) {
+          // It's ok to read OOB, since we made sure sLSE is large enough and we
+          // won't use the OOB values
+          tLSErLSE(i) =
+              tLSEsLSE((thread_idx % 32) / 4 + i * 8, smem_pipe_read.index());
+        }
+      }
+      Tensor tdPrdP = partition_fragment_C(
+          tiled_mma_SdP,
+          select<!SdP_swapAB ? 0 : 1, !SdP_swapAB ? 1 : 0>(TileShape_MNK{}));
+      PipelineState_dO smem_pipe_read_do_cur =
+          cute::conditional_return<Q_dO_same_stages>(
+              smem_pipe_read, smem_pipe_read_do);
+      consumer_wait(pipeline_do, smem_pipe_read_do_cur);
+      hstu::gemm</*zero_init=*/true, /*wg_wait=*/-1, /*SwapAB=*/SdP_swapAB>(
+          tiled_mma_dP,
+          tdPrdO(_, _, _, smem_pipe_read_do_cur.index()),
+          tdPrV,
+          tdPrdP);
+      warpgroup_wait<1>();
+      // Reshape tSrS from ((2, 2, V), MMA_N, MMA_M) to (nrow=(2, V, MMA_M),
+      // ncol=(2, MMA_N))
+      Tensor scores = make_tensor(
+          tSrS.data(),
+          hstu::convert_layout_acc_rowcol</*Transposed=*/SdP_swapAB>(
+              tSrS.layout()));
+      mask_fn(tSrS, m_block);
+#pragma unroll
+      for (int mi = 0; mi < size<0>(scores); ++mi) {
+        float const lse_scaled = [&] {
+          if constexpr (!ShuffleLSE)
+            return tLSErLSE(mi);
+          else
+            return __shfl_sync(
+                0xffffffff, tLSErLSE(mi / 8), (mi % 8) * 4 + (thread_idx % 4));
+        }();
+#pragma unroll
+        for (int ni = 0; ni < size<1>(scores); ++ni) {
+          scores(mi, ni) =
+              exp2f(scores(mi, ni) * params.alpha_log2 - lse_scaled);
+        }
+      }
+      Tensor tLSErdPsum = cute::conditional_return<!ShuffledPsum>(
+          make_fragment_like(tLSEsdPsum(_, _0{})),
+          make_tensor<ElementAccum>(Int<kStatsPerThread>{}));
+      if constexpr (!ShuffledPsum) {
+        cute::copy(tLSEsdPsum(_, smem_pipe_read_do_cur.index()), tLSErdPsum);
+      } else {
+#pragma unroll
+        for (int i = 0; i < kStatsPerThread; ++i) {
+          tLSErdPsum(i) = tLSEsdPsum(
+              (thread_idx % 32) / 4 + i * 8, smem_pipe_read_do_cur.index());
+        }
+      }
+
+      warpgroup_wait<0>();
+      // Reshape tdPrdP from ((2, 2, V), MMA_N, MMA_M) to (nrow=(2, V, MMA_M),
+      // ncol=(2, MMA_N))
+      Tensor dS = make_tensor(tdPrdP.data(), scores.layout());
+#pragma unroll
+      for (int mi = 0; mi < size<0>(dS); ++mi) {
+        float const dP_sum_cur = [&] {
+          if constexpr (!ShuffledPsum)
+            return tLSErdPsum(mi);
+          else
+            return __shfl_sync(
+                0xffffffff,
+                tLSErdPsum(mi / 8),
+                (mi % 8) * 4 + (thread_idx % 4));
+        }();
+#pragma unroll
+        for (int ni = 0; ni < size<1>(dS); ++ni) {
+          dS(mi, ni) =
+              scores(mi, ni) * (dS(mi, ni) - dP_sum_cur) * params.alpha;
+        }
+      }
+      // Convert scores from fp32 to fp16/bf16
+      Tensor rP = make_tensor_like<Element>(tSrS);
+      hstu::convert_type_out(tSrS, rP);
+      if constexpr (!Mma_dKV_is_RS) {
+        // Need to sync to make sure P has already been used in the previous
+        // iteration before writing new values
+        if constexpr (kStages_dS == 1) {
+          cutlass::arch::NamedBarrier::sync(
+              NumMmaThreads,
+              static_cast<uint32_t>(BwdNamedBarriers::PdS) /*id*/);
+        }
+        Tensor tPaP =
+            smem_thr_copy_PdS.retile_S(rP); // ((Atom,AtomNum), MMA_N, MMA_N)
+        cute::copy(
+            smem_tiled_copy_PdS,
+            tPaP,
+            tPsP(
+                _,
+                _,
+                _,
+                cute::conditional_return<kStages_dS == 1>(
+                    _0{}, smem_pipe_read.index())));
+      }
+      Tensor rdS = make_tensor_like<Element>(tdPrdP);
+      hstu::convert_type_out(tdPrdP, rdS);
+      // If there's double buffering on dS, we don't need to sync here.
+      // Otherwise we might have WG1 writing to dS before WG2 is done reading
+      // from it during MmadQ. But because both WGs have to sync at the end of
+      // the loop and double buffering, this race condition is not possible.
+      // This sync is to ensure (1) P is written in case of !Mma_dKV_is_RS and
+      // (2) dS is already read by the Mma in the previous iteration in case of
+      // Mma_dKV_is_RS.
+      if constexpr (!Mma_dKV_is_RS || (kStages_dS == 1 && Mma_dKV_is_RS)) {
+        cutlass::arch::fence_view_async_shared();
+        cutlass::arch::NamedBarrier::sync(
+            NumMmaThreads, static_cast<uint32_t>(BwdNamedBarriers::PdS) /*id*/);
+      }
+      // For hdim 64, It's faster to write to smem_dS first before the dV gemm
+      Tensor tdSadS =
+          smem_thr_copy_PdS.retile_S(rdS); // ((Atom,AtomNum), MMA_N, MMA_N)
+      cute::copy(
+          smem_tiled_copy_PdS,
+          tdSadS,
+          tdSsdS(
+              _,
+              _,
+              _,
+              cute::conditional_return<kStages_dS == 1>(
+                  _0{}, smem_pipe_read.index())));
+
+      if constexpr (!Slice_dQKV_Mma) {
+        // Most cases take this path, except for hdim256 where we want to slice
+        // to reduce register pressure
+        if constexpr (Mma_dKV_is_RS) {
+          Tensor tdVrP = make_tensor(
+              rP.data(), convert_layout_acc_Aregs<TiledMmadKV>(tSrS.layout()));
+          hstu::gemm</*zero_init=*/false, /*wg_wait=*/-1>(
+              tiled_mma_dKV,
+              tdVrP,
+              tdVrdO(_, _, _, smem_pipe_read_do_cur.index()),
+              tdVrdV);
+        } else {
+          Tensor tdVrP =
+              mma_partition_fragment_AB</*A=*/!dKV_swapAB>(wg_mma_dKV, sPt);
+          Tensor tdVrP_cur = tdVrP(
+              _,
+              _,
+              _,
+              cute::conditional_return<kStages_dS == 1>(
+                  _0{}, smem_pipe_read.index()));
+          hstu::
+              gemm</*zero_init=*/false, /*wg_wait=*/-1, /*SwapAB=*/dKV_swapAB>(
+                  tiled_mma_dKV,
+                  tdVrP_cur,
+                  tdVrdO(_, _, _, smem_pipe_read_do_cur.index()),
+                  tdVrdV);
+        }
+        // SMEM fence to make sure sdS is written before it's read by WGMMA
+        cutlass::arch::fence_view_async_shared();
+        cutlass::arch::NamedBarrier::sync(
+            NumMmaThreads, static_cast<uint32_t>(BwdNamedBarriers::PdS) /*id*/);
+        Tensor tdQrdQ = partition_fragment_C(
+            tiled_mma_dQ,
+            select<!dQ_swapAB ? 0 : 2, !dQ_swapAB ? 2 : 0>(TileShape_MNK{}));
+        Tensor tdQrdS_cur = tdQrdS(
+            _,
+            _,
+            _,
+            cute::conditional_return<kStages_dS == 1>(
+                _0{}, smem_pipe_read.index()));
+        hstu::gemm</*zero_init=*/true, /*wg_wait=*/1, /*SwapAB=*/dQ_swapAB>(
+            tiled_mma_dQ, tdQrdS_cur, tdQrK, tdQrdQ);
+        pipeline_do.consumer_release(smem_pipe_read_do_cur); // release dQ
+
+        if constexpr (Mma_dKV_is_RS) {
+          Tensor tdKrdS = make_tensor(
+              rdS.data(),
+              convert_layout_acc_Aregs<TiledMmadKV>(tdPrdP.layout()));
+          hstu::gemm</*zero_init=*/false, /*wg_wait=*/1>(
+              tiled_mma_dKV,
+              tdKrdS,
+              tdKrQ(_, _, _, smem_pipe_read.index()),
+              tdKrdK);
+        } else {
+          Tensor tdKrdS =
+              mma_partition_fragment_AB</*A=*/!dKV_swapAB>(wg_mma_dKV, sdSt);
+          Tensor tdKrdS_cur = tdKrdS(
+              _,
+              _,
+              _,
+              cute::conditional_return<kStages_dS == 1>(
+                  _0{}, smem_pipe_read.index()));
+          hstu::gemm</*zero_init=*/false, /*wg_wait=*/1, /*SwapAB=*/dKV_swapAB>(
+              tiled_mma_dKV,
+              tdKrdS_cur,
+              tdKrQ(_, _, _, smem_pipe_read.index()),
+              tdKrdK);
+        }
+        if constexpr (dQacc_use_TMA) {
+          int const warp_group_idx =
+              hstu::canonical_warp_group_idx_nosync() - 1;
+          cutlass::arch::NamedBarrier::sync(
+              cutlass::NumThreadsPerWarpGroup + cutlass::NumThreadsPerWarp,
+              static_cast<uint32_t>(BwdNamedBarriers::dQEmptyWG1) +
+                  warp_group_idx /*id*/); // sdQ full, to be written to gmem
+          Tensor taccdQrdQ = r2s_thr_copy_dQaccum.retile_S(tdQrdQ);
+          cute::copy(r2s_tiled_copy_dQaccum, taccdQrdQ, tdQsdQaccum);
+          cutlass::arch::fence_view_async_shared();
+          cutlass::arch::NamedBarrier::arrive(
+              cutlass::NumThreadsPerWarpGroup + cutlass::NumThreadsPerWarp,
+              static_cast<uint32_t>(BwdNamedBarriers::dQFullWG1) +
+                  warp_group_idx /*id*/); // sdQ full, to be written to gmem
+        } else {
+          // We can reuse r2s_thr_copy_dQaccum for this partitioning
+          Tensor tdQrdQ_atomic =
+              recast<float4>(r2s_thr_copy_dQaccum.retile_S(tdQrdQ));
+          Tensor tdQgdQaccum_atomic =
+              recast<float4>(tdQgdQaccum(_, _, _, m_block));
+          static_assert(
+              CUTE_STATIC_V(size(tdQrdQ_atomic)) ==
+              CUTE_STATIC_V(size(tdQgdQaccum_atomic)));
+#pragma unroll
+          for (int i = 0; i < size(tdQrdQ_atomic); ++i) {
+            atomicAdd(&tdQgdQaccum_atomic(i), tdQrdQ_atomic(i));
+          }
+        }
+
+      } else { // Slice_dQKV_Mma
+
+        static_assert(!(Slice_dQKV_Mma && Mma_dKV_is_RS));
+        Tensor tdVrP =
+            mma_partition_fragment_AB</*A=*/!dKV_swapAB>(wg_mma_dKV, sPt);
+        Tensor tdVrP_cur = tdVrP(
+            _,
+            _,
+            _,
+            cute::conditional_return<kStages_dS == 1>(
+                _0{}, smem_pipe_read.index()));
+        hstu::gemm<
+            /*zero_init=*/false,
+            /*wg_wait=*/-1,
+            /*SwapAB=*/dKV_swapAB,
+            /*M_slice=*/0>(
+            tiled_mma_dKV,
+            tdVrP_cur,
+            tdVrdO(_, _, _, smem_pipe_read_do_cur.index()),
+            tdVrdV);
+
+        cutlass::arch::fence_view_async_shared();
+        cutlass::arch::NamedBarrier::sync(
+            NumMmaThreads, static_cast<uint32_t>(BwdNamedBarriers::PdS) /*id*/);
+        Tensor tdQrdQ = partition_fragment_C(
+            tiled_mma_dQ,
+            select<!dQ_swapAB ? 0 : 2, !dQ_swapAB ? 2 : 0>(TileShape_MNK{}));
+        Tensor tdQrdS_cur = tdQrdS(
+            _,
+            _,
+            _,
+            cute::conditional_return<kStages_dS == 1>(
+                _0{}, smem_pipe_read.index()));
+        hstu::gemm<
+            /*zero_init=*/true,
+            /*wg_wait=*/-1,
+            /*SwapAB=*/dQ_swapAB,
+            /*M_slice=*/0>(tiled_mma_dQ, tdQrdS_cur, tdQrK, tdQrdQ);
+        hstu::gemm<
+            /*zero_init=*/false,
+            /*wg_wait=*/1,
+            /*SwapAB=*/dKV_swapAB,
+            /*M_slice=*/1>(
+            tiled_mma_dKV,
+            tdVrP_cur,
+            tdVrdO(_, _, _, smem_pipe_read_do_cur.index()),
+            tdVrdV);
+        Tensor tdQrdQ_atomic =
+            recast<float4>(r2s_thr_copy_dQaccum.retile_S(tdQrdQ));
+        Tensor tdQgdQaccum_atomic =
+            recast<float4>(tdQgdQaccum(_, _, _, m_block));
+#pragma unroll
+        for (int i = 0; i < size(tdQrdQ_atomic) / 2; ++i) {
+          atomicAdd(&tdQgdQaccum_atomic(i), tdQrdQ_atomic(i));
+        }
+
+        Tensor tdKrdS =
+            mma_partition_fragment_AB</*A=*/!dKV_swapAB>(wg_mma_dKV, sdSt);
+        Tensor tdKrdS_cur = tdKrdS(
+            _,
+            _,
+            _,
+            cute::conditional_return<kStages_dS == 1>(
+                _0{}, smem_pipe_read.index()));
+        hstu::gemm<
+            /*zero_init=*/false,
+            /*wg_wait=*/1,
+            /*SwapAB=*/dKV_swapAB,
+            /*M_slice=*/0>(
+            tiled_mma_dKV,
+            tdKrdS_cur,
+            tdKrQ(_, _, _, smem_pipe_read.index()),
+            tdKrdK);
+        pipeline_do.consumer_release(smem_pipe_read_do_cur); // release dO
+
+        hstu::gemm<
+            /*zero_init=*/true,
+            /*wg_wait=*/0,
+            /*SwapAB=*/dQ_swapAB,
+            /*M_slice=*/1>(tiled_mma_dQ, tdQrdS_cur, tdQrK, tdQrdQ);
+#pragma unroll
+        for (int i = size(tdQrdQ_atomic) / 2; i < size(tdQrdQ_atomic); ++i) {
+          atomicAdd(&tdQgdQaccum_atomic(i), tdQrdQ_atomic(i));
+        }
+
+        hstu::gemm<
+            /*zero_init=*/false,
+            /*wg_wait=*/-1,
+            /*SwapAB=*/dKV_swapAB,
+            /*M_slice=*/1>(
+            tiled_mma_dKV,
+            tdKrdS_cur,
+            tdKrQ(_, _, _, smem_pipe_read.index()),
+            tdKrdK);
+      }
+
+      warpgroup_wait<0>();
+      pipeline_q.consumer_release(smem_pipe_read); // release Q
+      ++smem_pipe_read;
+      if constexpr (!Q_dO_same_stages) {
+        ++smem_pipe_read_do;
+      }
+    };
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+    static constexpr int kBlockN = get<1>(TileShape_MNK{});
+
+    if constexpr (Cross) {
+      auto mask_fn = [&](auto& tSrS, int m_block) {
+        mask.template apply<
+            true /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            Causal,
+            false /*Local*/,
+            false /*Contexual_mask*/,
+            false /*Target_mask*/,
+            Cross,
+            true /*Softmax*/>(tSrS, m_block, n_block);
+      };
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (; m_block < m_block_max; ++m_block) {
+        bwd_step(m_block, mask_fn);
+      }
+      if constexpr (Q_dO_same_stages) {
+        smem_pipe_read_do = smem_pipe_read;
+      }
+      ++work_idx;
+      return true;
+    }
+    if constexpr (Has_targets) {
+      if (n_block * kBlockN >= seqlen_info.uihlen_q) {
+        auto mask_fn = [&](auto& tSrS, int m_block) {
+          mask.template apply<
+              true /*Seqlenq_mask*/,
+              true /*Seqlenk_mask*/,
+              false /*Causal*/,
+              false /*Local*/,
+              false /*Contexual_mask*/,
+              Has_targets /*Target_mask*/,
+              false /*Cross*/,
+              true /*Softmax*/>(tSrS, m_block, n_block);
+        };
+        CUTLASS_PRAGMA_NO_UNROLL
+        for (; m_block < m_block_max; ++m_block) {
+          bwd_step(m_block, mask_fn);
+        }
+        if constexpr (Q_dO_same_stages) {
+          smem_pipe_read_do = smem_pipe_read;
+        }
+        ++work_idx;
+        return true;
+      } else if ((n_block + 1) * kBlockN >= seqlen_info.uihlen_q) {
+        if constexpr ((Causal || Local) && SeparateMaskingIterations) {
+          auto mask_fn = [&](auto& tSrS, int m_block) {
+            mask.template apply<
+                true /*Seqlenq_mask*/,
+                true /*Seqlenk_mask*/,
+                Causal,
+                Local,
+                Contexual_mask,
+                Has_targets /*Target_mask*/,
+                false /*Cross*/,
+                true /*Softmax*/>(tSrS, m_block, n_block);
+          };
+          int const m_block_masking_max =
+              ((n_block + 1) * kBlockN - 1) / kBlockM + 1;
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (; m_block < std::min(m_block_max, m_block_masking_max);
+               ++m_block) {
+            bwd_step(m_block, mask_fn);
+          }
+        }
+
+        auto mask_fn = [&](auto& tSrS, int m_block) {
+          mask.template apply<
+              true /*Seqlenq_mask*/,
+              true /*Seqlenk_mask*/,
+              Causal && !SeparateMaskingIterations,
+              Local && !SeparateMaskingIterations,
+              Contexual_mask,
+              Has_targets /*Target_mask*/,
+              false /*Cross*/,
+              true /*Softmax*/>(tSrS, m_block, n_block);
+        };
+        if constexpr (SeparateMaskingIterations) {
+          int const m_block_max_before_local_mask =
+              !Local || !SeparateMaskingIterations
+              ? m_block_max
+              : std::min(
+                    m_block_max, (n_block * kBlockN + max_attn_len_) / kBlockM);
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (; m_block < m_block_max_before_local_mask; ++m_block) {
+            bwd_step(m_block, mask_fn);
+          }
+        } else {
+          int num_m_block = m_block_max - m_block_min;
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (int i = 0; i < num_m_block + full_m_block_max -
+                   full_m_block_min + contexual_m_block_max;
+               ++i) {
+            if (i < num_m_block) {
+              m_block = m_block_min + i;
+            } else if (i < num_m_block + contexual_m_block_max) {
+              m_block = i - num_m_block;
+            } else {
+              m_block =
+                  i - num_m_block - contexual_m_block_max + full_m_block_min;
+            }
+            bwd_step(m_block, mask_fn);
+          }
+        }
+
+        if constexpr (Local && SeparateMaskingIterations) {
+          auto mask_fn = [&](auto& tSrS, int m_block) {
+            mask.template apply<
+                true /*Seqlenq_mask*/,
+                true /*Seqlenk_mask*/,
+                false /*Causal_mask*/,
+                Local,
+                Contexual_mask,
+                Has_targets /*Target_mask*/,
+                false /*Cross*/,
+                true /*Softmax*/>(tSrS, m_block, n_block);
+          };
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (; m_block < m_block_max; ++m_block) {
+            bwd_step(m_block, mask_fn);
+          }
+        }
+        if constexpr (Contexual_mask && SeparateMaskingIterations) {
+          auto mask_fn = [&](auto& tSrS, int m_block) {
+            mask.template apply<
+                true /*Seqlenq_mask*/,
+                true /*Seqlenk_mask*/,
+                Causal /*Causal_mask*/,
+                Local /*Local_mask*/,
+                Contexual_mask,
+                Has_targets,
+                false /*Cross*/,
+                true /*Softmax*/>(tSrS, m_block, n_block);
+          };
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (m_block = 0; m_block < contexual_m_block_max; ++m_block) {
+            bwd_step(m_block, mask_fn);
+          }
+        }
+
+        if constexpr (Local && SeparateMaskingIterations) {
+          auto mask_fn = [&](auto& tSrS, int m_block) {
+            mask.template apply<
+                true /*Seqlenq_mask*/,
+                true /*Seqlenk_mask*/,
+                false /*Causal_mask*/,
+                Local,
+                Contexual_mask,
+                Has_targets,
+                false /*Cross*/,
+                true /*Softmax*/>(tSrS, m_block, n_block);
+          };
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (m_block = full_m_block_min; m_block < full_m_block_max;
+               ++m_block) {
+            bwd_step(m_block, mask_fn);
+          }
+        }
+        if constexpr (Q_dO_same_stages) {
+          smem_pipe_read_do = smem_pipe_read;
+        }
+        ++work_idx;
+        return true;
+      }
+    }
+    // We have separate iterations with causal masking. Not necessary for hdim
+    // 128 but for hdim 64 this helps quite a bit to not have to do causal
+    // masking for most of the iterations.
+    if constexpr ((Causal || Local) && SeparateMaskingIterations) {
+      auto mask_fn = [&](auto& tSrS, int m_block) {
+        mask.template apply<
+            true /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            Causal,
+            Local,
+            Contexual_mask,
+            false /*Target_mask*/,
+            false /*Cross*/,
+            true /*Softmax*/>(tSrS, m_block, n_block);
+      };
+      static constexpr int kBlockM = get<0>(TileShape_MNK{});
+      int const m_block_masking_max =
+          ((n_block + 1) * kBlockN - 1) / kBlockM + 1;
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (; m_block < std::min(m_block_max, m_block_masking_max); ++m_block) {
+        bwd_step(m_block, mask_fn);
+      }
+    }
+
+    auto mask_fn = [&](auto& tSrS, int m_block) {
+      mask.template apply<
+          true /*Seqlenq_mask*/,
+          true /*Seqlenk_mask*/,
+          Causal && !SeparateMaskingIterations,
+          Local && !SeparateMaskingIterations,
+          Contexual_mask,
+          false /*Target_mask*/,
+          false /*Cross*/,
+          true /*Softmax*/>(tSrS, m_block, n_block);
+    };
+    if constexpr (SeparateMaskingIterations) {
+      int const m_block_max_before_local_mask =
+          !Local || !SeparateMaskingIterations
+          ? m_block_max
+          : std::min(
+                m_block_max, (n_block * kBlockN + max_attn_len_) / kBlockM);
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (; m_block < m_block_max_before_local_mask; ++m_block) {
+        bwd_step(m_block, mask_fn);
+      }
+    } else {
+      int num_m_block = m_block_max - m_block_min;
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (int i = 0; i < num_m_block + full_m_block_max - full_m_block_min +
+               contexual_m_block_max;
+           ++i) {
+        if (i < num_m_block) {
+          m_block = m_block_min + i;
+        } else if (i < num_m_block + contexual_m_block_max) {
+          m_block = i - num_m_block;
+        } else {
+          m_block = i - num_m_block - contexual_m_block_max + full_m_block_min;
+        }
+        bwd_step(m_block, mask_fn);
+      }
+    }
+
+    if constexpr (Local && SeparateMaskingIterations) {
+      auto mask_fn = [&](auto& tSrS, int m_block) {
+        mask.template apply<
+            true /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            false /*Causal_mask*/,
+            Local,
+            Contexual_mask,
+            false /*Target_mask*/,
+            false /*Cross*/,
+            true /*Softmax*/>(tSrS, m_block, n_block);
+      };
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (; m_block < m_block_max; ++m_block) {
+        bwd_step(m_block, mask_fn);
+      }
+    }
+    if constexpr (Contexual_mask && SeparateMaskingIterations) {
+      auto mask_fn = [&](auto& tSrS, int m_block) {
+        mask.template apply<
+            true /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            Causal /*Causal_mask*/,
+            Local /*Local_mask*/,
+            Contexual_mask,
+            false /*Target_mask*/,
+            false /*Cross*/,
+            true /*Softmax*/>(tSrS, m_block, n_block);
+      };
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (m_block = 0; m_block < contexual_m_block_max; ++m_block) {
+        bwd_step(m_block, mask_fn);
+      }
+    }
+
+    if constexpr (Local && SeparateMaskingIterations) {
+      auto mask_fn = [&](auto& tSrS, int m_block) {
+        mask.template apply<
+            true /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            false /*Causal_mask*/,
+            Local,
+            Contexual_mask,
+            false /*Target_mask*/,
+            false /*Cross*/,
+            true /*Softmax*/>(tSrS, m_block, n_block);
+      };
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (m_block = full_m_block_min; m_block < full_m_block_max; ++m_block) {
+        bwd_step(m_block, mask_fn);
+      }
+    }
+
+    // if (blockIdx.x == 0 && threadIdx.x == 128) { print_tensor(tdVrdV); }
+    if constexpr (Q_dO_same_stages) {
+      smem_pipe_read_do = smem_pipe_read;
+    }
+    ++work_idx;
+    return true;
+  }
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mainloop_fwd_sm90_tma_gmma_ws.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mainloop_fwd_sm90_tma_gmma_ws.h
new file mode 100644
index 000000000..7c8a447af
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mainloop_fwd_sm90_tma_gmma_ws.h
@@ -0,0 +1,2180 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <cutlass/array.h>
+#include <cutlass/cutlass.h>
+#include <cutlass/numeric_conversion.h>
+#include <cutlass/numeric_types.h>
+#include "cutlass/pipeline/pipeline.hpp"
+
+#include "cute/tensor.hpp"
+
+#include "cutlass/gemm/collective/builders/sm90_common.inl"
+
+#include "mask.h"
+#include "named_barrier.h"
+#include "seqlen.h"
+#include "sm90_pipeline_no_cluster.h"
+#include "utils.h"
+
+namespace hstu {
+
+using namespace cute;
+
+template <
+    int Stages,
+    class ClusterShape_,
+    class TileShape_MNK_,
+    class Element_,
+    class ElementAccum_,
+    class ArchTag_,
+    bool Causal,
+    bool Local,
+    bool Contexual_mask,
+    bool Jagged,
+    bool Has_targets,
+    bool Mma1_is_RS,
+    bool V_colmajor_,
+    bool Cross>
+struct CollectiveMainloopFwdSm90 {
+  static constexpr int kStages = Stages;
+  using ClusterShape = ClusterShape_;
+  using TileShape_MNK = TileShape_MNK_;
+  using Element = Element_;
+  using ElementAccum = ElementAccum_;
+  using ArchTag = ArchTag_;
+  static constexpr bool Is_FP8 =
+      cute::is_same_v<Element, cutlass::float_e4m3_t> ||
+      cute::is_same_v<Element, cutlass::float_e5m2_t>;
+  ;
+  static constexpr bool V_colmajor = V_colmajor_;
+  static constexpr bool Transpose_V = Is_FP8 && !V_colmajor;
+  using SeqlenInfo_t = hstu::SeqlenInfoQKFwd<Jagged, Cross, Has_targets>;
+
+  static_assert(ArchTag::kMinComputeCapability >= 90);
+
+  static constexpr cute::GMMA::Major MmaMajorV =
+      !Is_FP8 && !V_colmajor ? GMMA::Major::MN : GMMA::Major::K;
+  static constexpr cute::GMMA::Major TmaMajorV =
+      !V_colmajor ? GMMA::Major::MN : GMMA::Major::K;
+
+  static constexpr int kBlockM = get<0>(TileShape_MNK{});
+  static constexpr int kBlockN = get<1>(TileShape_MNK{});
+  static constexpr int kHeadDim = get<2>(TileShape_MNK{});
+
+  // Register bandwidth is actually a bottleneck so we don't want Q to be in
+  // registers. Leaving this option here for reference.
+  static constexpr bool Mma0_is_RS = false;
+  // We can have Mma1 (P @ V) with P in smem in rmem to reduce register pressure
+  // at the cost of more smem.
+  static_assert(!(!Mma1_is_RS && Is_FP8), "Mma1 must be RS if FP8");
+  static_assert(
+      !(!Mma1_is_RS && Transpose_V),
+      "Mma1 must be RS if Transpose_V");
+
+  using AtomLayoutMNK = Layout<Shape<Int<kBlockM / 64>, _1, _1>>;
+  using TiledMma0 = decltype(cute::make_tiled_mma(
+      std::conditional_t<
+          !Mma0_is_RS,
+          decltype(cute::GMMA::ss_op_selector<
+                   Element,
+                   Element,
+                   ElementAccum,
+                   TileShape_MNK>()),
+          decltype(cute::GMMA::rs_op_selector<
+                   Element,
+                   Element,
+                   ElementAccum,
+                   TileShape_MNK>())>{},
+      AtomLayoutMNK{}));
+  using TiledMma1 = decltype(cute::make_tiled_mma(
+      std::conditional_t<
+          !Mma1_is_RS,
+          decltype(cute::GMMA::ss_op_selector<
+                   Element,
+                   Element,
+                   ElementAccum,
+                   decltype(select<0, 2, 1>(TileShape_MNK{})),
+                   GMMA::Major::K,
+                   MmaMajorV>()),
+          decltype(cute::GMMA::rs_op_selector<
+                   Element,
+                   Element,
+                   ElementAccum,
+                   decltype(select<0, 2, 1>(TileShape_MNK{})),
+                   GMMA::Major::K,
+                   MmaMajorV>())>{},
+      AtomLayoutMNK{}));
+
+  static constexpr int NumMmaThreads = size(TiledMma0{});
+  static constexpr int NumProducerThreads = !Transpose_V
+      ? cutlass::NumThreadsPerWarp
+      : cutlass::NumThreadsPerWarpGroup;
+  static_assert(NumMmaThreads % cutlass::NumThreadsPerWarpGroup == 0);
+  static constexpr int NumMmaWarpGroups =
+      NumMmaThreads / cutlass::NumThreadsPerWarpGroup;
+  static_assert(
+      NumMmaWarpGroups == 1 || NumMmaWarpGroups == 2 || NumMmaWarpGroups == 3);
+
+  using SmemLayoutAtomQ =
+      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
+               GMMA::Major::K,
+               Element,
+               decltype(cute::get<0>(TileShape_MNK{})),
+               decltype(cute::get<2>(TileShape_MNK{}))>());
+  using SmemLayoutQ =
+      decltype(tile_to_shape(SmemLayoutAtomQ{}, select<0, 2>(TileShape_MNK{})));
+
+  using SmemLayoutAtomK =
+      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
+               GMMA::Major::K,
+               Element,
+               decltype(cute::get<1>(TileShape_MNK{})),
+               decltype(cute::get<2>(TileShape_MNK{}))>());
+  using SmemLayoutK = decltype(tile_to_shape(
+      SmemLayoutAtomK{},
+      make_shape(
+          shape<1>(TileShape_MNK{}),
+          shape<2>(TileShape_MNK{}),
+          Int<kStages>{})));
+
+  using SmemLayoutAtomVt =
+      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
+               TmaMajorV,
+               Element,
+               decltype(cute::get<2>(TileShape_MNK{})),
+               decltype(cute::get<1>(TileShape_MNK{}))>());
+  using SmemLayoutVt = decltype(tile_to_shape(
+      SmemLayoutAtomVt{},
+      make_shape(
+          shape<2>(TileShape_MNK{}),
+          shape<1>(TileShape_MNK{}),
+          Int<kStages>{}),
+      std::conditional_t<
+          TmaMajorV == GMMA::Major::K,
+          cute::Step<_1, _2, _3>,
+          cute::Step<_2, _1, _3>>{}));
+
+  using SmemLayoutAtomVtMma =
+      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
+               MmaMajorV,
+               Element,
+               decltype(cute::get<2>(TileShape_MNK{})),
+               decltype(cute::get<1>(TileShape_MNK{}))>());
+  using SmemLayoutVtMma = decltype(tile_to_shape(
+      SmemLayoutAtomVtMma{},
+      make_shape(
+          shape<2>(TileShape_MNK{}),
+          shape<1>(TileShape_MNK{}),
+          Int<kStages>{}),
+      std::conditional_t<
+          MmaMajorV == GMMA::Major::K,
+          cute::Step<_1, _2, _3>,
+          cute::Step<_2, _1, _3>>{}));
+
+  // Only used if we're using cp.async to load V
+  using SmemLayoutAtomVCpAsync =
+      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
+               GMMA::Major::K,
+               Element,
+               decltype(cute::get<1>(TileShape_MNK{})),
+               decltype(cute::get<2>(TileShape_MNK{}))>());
+  using SmemLayoutVCpAsync = decltype(tile_to_shape(
+      SmemLayoutAtomVCpAsync{},
+      make_shape(
+          shape<1>(TileShape_MNK{}),
+          shape<2>(TileShape_MNK{}),
+          Int<kStages>{})));
+
+  using SmemLayoutAtomP =
+      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
+               GMMA::Major::K,
+               Element,
+               decltype(cute::get<0>(TileShape_MNK{})),
+               decltype(cute::get<1>(TileShape_MNK{}))>());
+  using SmemLayoutP =
+      decltype(tile_to_shape(SmemLayoutAtomP{}, select<0, 1>(TileShape_MNK{})));
+
+  using SmemCopyAtomP = Copy_Atom<cute::SM90_U32x4_STSM_N, Element>;
+
+  // Use LDSM.T and STSM to transpose V in the case of FP8 and V being
+  // row-major. For FP16/BF16 we don't do any transposing.
+  static_assert(!Transpose_V || (kHeadDim % 32 == 0 && kBlockN % 32 == 0));
+  static constexpr bool kHeadDim_multiple_64 = kHeadDim % 64 == 0;
+  // Either kHeadDim is a multiple of 64 (in which case we use a block size of
+  // 64 x 32 for the transpose), or we need kBlockN to be a multiple of 64 (in
+  // which case we use a block size of 32 x 64 for the transpose).
+  static_assert(!Transpose_V || (kHeadDim_multiple_64 || kBlockN % 64 == 0));
+  using LDSM_thread_shape = std::conditional_t<
+      kHeadDim_multiple_64,
+      Shape<_32, _4, _1, _1>,
+      Shape<_16, _4, _1, _2>>;
+  using LDSM_thread_stride = std::conditional_t<
+      kHeadDim_multiple_64,
+      Stride<_4, _1, _0, _0>,
+      Stride<_4, _1, _0, _64>>;
+  using LDSM_value_shape = Shape<_2, _2, _1, _4>;
+  using LDSM_value_stride = Stride<_1, _2, _16, _4>;
+  using LDSM_divide_shape =
+      std::conditional_t<kHeadDim_multiple_64, Shape<_64, _8>, Shape<_32, _8>>;
+  using S2RTiledCopyVt = decltype(make_tiled_copy(
+      Copy_Atom<SM75_U16x8_LDSM_T, Element>{},
+      Layout<LDSM_thread_shape, LDSM_thread_stride>{},
+      Layout<LDSM_value_shape, LDSM_value_stride>{}));
+
+  using STSM_thread_shape = std::conditional_t<
+      kHeadDim_multiple_64,
+      Shape<_8, _4, _4, _1>,
+      Shape<_8, _4, _2, _2>>;
+  using STSM_thread_stride = std::conditional_t<
+      kHeadDim_multiple_64,
+      Stride<_4, _1, _32, _0>,
+      Stride<_4, _1, _32, _64>>;
+  using STSM_value_shape = Shape<_1, _4, _2, _2>;
+  using STSM_value_stride = Stride<_0, _1, _4, _8>;
+  using STSM_divide_shape = Shape<_8, _16>;
+  // These will not permute the columns of V (the kHeadDim dimension) but incur
+  // bank conflicts so a little slower (e.g. 1150 TFLOPS for hdim 256 instead of
+  // 1200 TFLOPS). Instead we will permute the cols of V, and un-permute the
+  // cols of O in the epilogue. using STSM_value_shape = Shape<_2, _4, _1, _2>;
+  // using STSM_value_stride = Stride<_4, _1, _0, _8>;
+  // using STSM_divide_shape = Shape<_16, _16>;
+  using R2STiledCopyV = decltype(make_tiled_copy(
+      Copy_Atom<SM90_U32x4_STSM_N, Element>{},
+      Layout<STSM_thread_shape, STSM_thread_stride>{},
+      Layout<STSM_value_shape, STSM_value_stride>{}));
+
+  using GmemTiledCopyQ = cute::SM90_TMA_LOAD;
+  using GmemTiledCopyKV =
+      decltype(cutlass::gemm::collective::detail::
+                   sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape{})));
+
+  // We use CpAsync for K and V if PagedKV and AppendKV, since TMA doesn't work
+  // there
+  static constexpr int kGmemElemsPerLoad =
+      sizeof(cute::uint128_t) / sizeof(Element);
+  static_assert(
+      kHeadDim % kGmemElemsPerLoad == 0,
+      "Headdim must be a multiple of kGmemElemsPerLoad");
+  // We want each "row" to have 64 elements (128 bytes, i.e. 1 cache line). E.g.
+  // if hdim=128, we want each thread to have 4 loads in the M direction and 2
+  // vectorized load in the K direction. We want each thread to have at least 2
+  // loads in the K direction since in the case of non-interleaved rotary
+  // (combining elements at indices 0 and rotary_dim/2, 1 and rotary_dim/2+1,
+  // etc), each thread will load twice from the same row.
+  static constexpr int kBytePerHalfRow = kHeadDim / 2 * sizeof(Element);
+  static constexpr int kBlockKGmem =
+      (kBytePerHalfRow % 128 == 0 ? 128
+                                  : (kBytePerHalfRow % 64 == 0 ? 64 : 32)) /
+      sizeof(Element);
+  static constexpr int kGmemThreadsPerRow = kBlockKGmem / kGmemElemsPerLoad;
+  static_assert(
+      NumMmaThreads % kGmemThreadsPerRow == 0,
+      "NumMmaThreads must be a multiple of kGmemThreadsPerRow");
+  // We assume threads loading the same row are in the same warp. This is for an
+  // optimization in PagedKV where these threads share the same page table entry
+  // and share the work of computing pointers to paged K and paged V.
+  static_assert(
+      cutlass::NumThreadsPerWarp % kGmemThreadsPerRow == 0,
+      "kGmemThreadsPerRow must divide NumThreadsPerWarp");
+  using GmemLayoutAtom = Layout<
+      Shape<Int<NumMmaThreads / kGmemThreadsPerRow>, Int<kGmemThreadsPerRow>>,
+      Stride<Int<kGmemThreadsPerRow>, _1>>;
+  // If AppendKV, we'll be loading Q for rotary, and we assume divisibility to
+  // avoid predication
+  static_assert(
+      kBlockM % CUTE_STATIC_V(shape<0>(GmemLayoutAtom{})) == 0,
+      "kBlockM must be a multiple of NumMmaThreads / kGmemThreadsPerRow");
+
+  using ShapeQKV =
+      cute::Shape<int32_t, int32_t, int32_t, int32_t>; // (seqlen, d, head,
+                                                       // batch)
+  using StrideQK = cute::Stride<int64_t, _1, int64_t, int64_t>;
+  using StrideV = std::conditional_t<
+      !V_colmajor,
+      StrideQK,
+      cute::Stride<_1, int64_t, int64_t, int64_t>>;
+  // ((qhead_per_khead, seqlen), d, nheads_kv, batch, num_splits)
+  using ShapeQPacked = ShapeQKV;
+  using StrideQPacked = StrideQK;
+  using StrideDescale = cute::Stride<int64_t, int64_t>;
+
+  using TMA_Q = decltype(make_tma_copy_A_sm90(
+      GmemTiledCopyQ{},
+      make_tensor(
+          make_gmem_ptr(static_cast<Element const*>(nullptr)),
+          ShapeQKV{},
+          StrideQK{}),
+      SmemLayoutQ{},
+      TileShape_MNK{},
+      ClusterShape{}));
+
+  using TMA_K = decltype(make_tma_copy_B_sm90(
+      GmemTiledCopyKV{},
+      make_tensor(
+          make_gmem_ptr(static_cast<Element const*>(nullptr)),
+          ShapeQKV{},
+          StrideQK{}),
+      take<0, 2>(SmemLayoutK{}),
+      TileShape_MNK{},
+      ClusterShape{})); // mcast along M mode for this N load, if any
+
+  using TMA_V = decltype(make_tma_copy(
+      GmemTiledCopyKV{},
+      make_tensor(
+          make_gmem_ptr(static_cast<Element const*>(nullptr)),
+          ShapeQKV{},
+          select<1, 0, 2, 3>(StrideV{})),
+      take<0, 2>(SmemLayoutVt{}),
+      select<2, 1>(TileShape_MNK{}),
+      size<0>(ClusterShape{}))); // mcast along M mode for this N load, if any
+
+  // Set the bytes transferred in this TMA transaction (may involve multiple
+  // issues)
+  static constexpr uint32_t TmaTransactionBytesQ = static_cast<uint32_t>(
+      size(SmemLayoutQ{}) * cutlass::sizeof_bits_v<Element> / 8);
+  static constexpr uint32_t TmaTransactionBytesK = static_cast<uint32_t>(
+      size(take<0, 2>(SmemLayoutK{})) * cutlass::sizeof_bits_v<Element> / 8);
+  static constexpr uint32_t TmaTransactionBytesV = static_cast<uint32_t>(
+      size(take<0, 2>(SmemLayoutVt{})) * cutlass::sizeof_bits_v<Element> / 8);
+  static_assert(TmaTransactionBytesK == TmaTransactionBytesV);
+
+  using PipelineTmaAsync = std::conditional_t<
+      CUTE_STATIC_V(size(ClusterShape{})) == 1,
+      typename cutlass::PipelineTmaAsyncNoCluster<kStages>,
+      typename cutlass::PipelineTmaAsync<kStages>>;
+  using MainloopPipelineK = PipelineTmaAsync;
+  using MainloopPipelineV = std::conditional_t<
+      !Transpose_V,
+      PipelineTmaAsync,
+      typename cutlass::PipelineAsync<kStages>>;
+  using MainloopPipelineVt = PipelineTmaAsync;
+  // We always use TMA for K_new and V_new
+  using MainloopPipelineKVNew = PipelineTmaAsync;
+  using PipelineState = cutlass::PipelineState<kStages>;
+
+  // If PackGQA, we use cp.async (instead of TMA) to load Q, so we want smem_q
+  // to be aligned and have sQ being position_independent_swizzle_tensor. If
+  // !Use_TMA_KV, we use cp.async (instead of TMA) to load K & V, so we want
+  // smem_k and smem_v to be aligned.
+  static constexpr size_t SmemAlignmentQ =
+      !Mma0_is_RS ? 128 : cutlass::detail::alignment_for_swizzle(SmemLayoutQ{});
+  static constexpr size_t SmemAlignmentK = 128;
+  static constexpr size_t SmemAlignmentVtNoTranspose =
+      cutlass::detail::alignment_for_swizzle(SmemLayoutVt{});
+  static_assert(
+      SmemAlignmentQ >= 128 and SmemAlignmentK >= 128 &&
+          SmemAlignmentVtNoTranspose >= 128,
+      "Require at least 128B alignment");
+  static constexpr size_t SmemAlignmentP =
+      cutlass::detail::alignment_for_swizzle(SmemLayoutP{});
+  static_assert(SmemAlignmentP >= 128, "Require at least 128B alignment");
+
+  using SmemP_t = std::conditional_t<
+      Mma1_is_RS,
+      cute::array<Element, 0>,
+      cute::
+          array_aligned<Element, cute::cosize_v<SmemLayoutP>, SmemAlignmentP>>;
+  // Sometimes even with SmemP_t = cute::array<Element, 0>, putting it in the
+  // TensorStorage struct causes smem size to go from 227KB to 228KB and we get
+  // "invalid argument".
+
+  struct TensorStorageWithoutPNoTranspose : cute::aligned_struct<cute::max(
+                                                SmemAlignmentQ,
+                                                SmemAlignmentK,
+                                                SmemAlignmentVtNoTranspose)> {
+    cute::array_aligned<
+        Element,
+        cute::cosize_v<SmemLayoutVt>,
+        SmemAlignmentVtNoTranspose>
+        smem_v;
+    cute::array_aligned<Element, cute::cosize_v<SmemLayoutQ>, SmemAlignmentQ>
+        smem_q;
+    cute::array_aligned<Element, cute::cosize_v<SmemLayoutK>, SmemAlignmentK>
+        smem_k;
+  };
+
+  struct TensorStorageWithPNoTranspose : cute::aligned_struct<cute::max(
+                                             SmemAlignmentQ,
+                                             SmemAlignmentK,
+                                             SmemAlignmentVtNoTranspose,
+                                             SmemAlignmentP)> {
+    cute::array_aligned<
+        Element,
+        cute::cosize_v<SmemLayoutVt>,
+        SmemAlignmentVtNoTranspose>
+        smem_v;
+    cute::array_aligned<Element, cute::cosize_v<SmemLayoutQ>, SmemAlignmentQ>
+        smem_q;
+    cute::array_aligned<Element, cute::cosize_v<SmemLayoutK>, SmemAlignmentK>
+        smem_k;
+    SmemP_t smem_p;
+  };
+
+  using TensorStorageNoTranspose = std::conditional_t<
+      Mma1_is_RS,
+      TensorStorageWithoutPNoTranspose,
+      TensorStorageWithPNoTranspose>;
+
+  static constexpr size_t SmemAlignmentVt =
+      cutlass::detail::alignment_for_swizzle(SmemLayoutVt{});
+  static constexpr size_t SmemAlignmentV =
+      cutlass::detail::alignment_for_swizzle(SmemLayoutVtMma{});
+  static_assert(
+      SmemAlignmentVt >= 128 and SmemAlignmentV >= 128,
+      "Require at least 128B alignment");
+  struct TensorStorageTransposeV
+      : cute::aligned_struct<
+            cute::max(SmemAlignmentQ, SmemAlignmentK, SmemAlignmentV)> {
+    cute::
+        array_aligned<Element, cute::cosize_v<SmemLayoutVtMma>, SmemAlignmentV>
+            smem_v;
+    cute::array_aligned<Element, cute::cosize_v<SmemLayoutVt>, SmemAlignmentVt>
+        smem_vt;
+    cute::array_aligned<Element, cute::cosize_v<SmemLayoutQ>, SmemAlignmentQ>
+        smem_q;
+    cute::array_aligned<Element, cute::cosize_v<SmemLayoutK>, SmemAlignmentK>
+        smem_k;
+  };
+
+  using TensorStorage = std::conditional_t<
+      !Transpose_V,
+      TensorStorageNoTranspose,
+      TensorStorageTransposeV>;
+
+  // These are tuned for speed. They don't affect correctness.
+  static constexpr bool UseSchedulerBarrier =
+      (NumMmaWarpGroups >= 2) && (!Is_FP8 ? kHeadDim <= 128 : kHeadDim >= 128);
+  static constexpr bool RescaleOBeforeGemm =
+      kHeadDim > 128 && (!Is_FP8 || V_colmajor);
+
+  // Host side kernel arguments
+  struct Arguments {
+    Element const* const ptr_Q;
+    ShapeQKV const shape_Q;
+    StrideQK const stride_Q;
+    Element* const
+        ptr_K; // Not Element const* since we might append to KV cache in-place
+    ShapeQKV const shape_K;
+    StrideQK const stride_K;
+    Element* const ptr_V;
+    StrideV const stride_V;
+    float const *ptr_q_descale, *ptr_k_descale, *ptr_v_descale;
+    StrideDescale const stride_q_descale, stride_k_descale, stride_v_descale;
+    float const max_seq_len_inv;
+    float const alpha;
+    int const max_attn_len;
+    int const min_full_attn_seq_len;
+    int const contextual_seq_len;
+    int const num_softmax_heads;
+    int const num_groups;
+    int const batch_size_per_group;
+    int const* const seq_offsets = nullptr;
+    int const* const seq_offsets_q = nullptr;
+    int const* const num_targets = nullptr;
+    int const* const max_seq_len_tensor = nullptr;
+    int const* const contextual_seq_len_tensor = nullptr;
+    int const* const max_attn_len_tensor = nullptr;
+    int const* const min_full_attn_seq_len_tensor = nullptr;
+    float const* const attn_scale = nullptr;
+    bool const scalar_scale = true;
+  };
+
+  // Device side kernel params
+  struct Params {
+    Element const* const ptr_Q;
+    ShapeQKV const shape_Q;
+    StrideQK const stride_Q;
+    ShapeQPacked const shape_Q_packed;
+    StrideQPacked const stride_Q_packed;
+    Element* const ptr_K;
+    ShapeQKV const shape_K;
+    StrideQK const stride_K;
+    Element* const ptr_V;
+    StrideV const stride_V;
+    TMA_Q tma_load_Q;
+    TMA_K tma_load_K;
+    TMA_V tma_load_V;
+    float const *ptr_q_descale, *ptr_k_descale, *ptr_v_descale;
+    StrideDescale const stride_q_descale, stride_k_descale, stride_v_descale;
+    float const max_seq_len_inv;
+    float const alpha;
+    float const alpha_log2;
+    int const max_attn_len;
+    int const min_full_attn_seq_len;
+    int const contextual_seq_len;
+    int const num_softmax_heads;
+    int const num_groups;
+    int const batch_size_per_group;
+    int const* const seq_offsets = nullptr;
+    int const* const seq_offsets_q = nullptr;
+    int const* const num_targets = nullptr;
+    int const* const max_seq_len_tensor = nullptr;
+    int const* const contextual_seq_len_tensor = nullptr;
+    int const* const max_attn_len_tensor = nullptr;
+    int const* const min_full_attn_seq_len_tensor = nullptr;
+    float const* const attn_scale = nullptr;
+    bool const scalar_scale = true;
+  };
+
+  static Params to_underlying_arguments(Arguments const& args) {
+    Tensor mQ =
+        make_tensor(make_gmem_ptr(args.ptr_Q), args.shape_Q, args.stride_Q);
+    TMA_Q tma_load_Q = make_tma_copy_A_sm90(
+        GmemTiledCopyQ{},
+        mQ,
+        SmemLayoutQ{},
+        TileShape_MNK{},
+        ClusterShape{}); // no mcast for Q
+    Tensor mK =
+        make_tensor(make_gmem_ptr(args.ptr_K), args.shape_K, args.stride_K);
+    TMA_K tma_load_K = make_tma_copy_B_sm90(
+        GmemTiledCopyKV{},
+        mK,
+        take<0, 2>(SmemLayoutK{}),
+        TileShape_MNK{},
+        ClusterShape{}); // mcast along M mode for this N load, if any
+    Tensor mV = make_tensor(
+        make_gmem_ptr(args.ptr_V),
+        select<1, 0, 2, 3>(args.shape_K),
+        select<1, 0, 2, 3>(args.stride_V));
+    TMA_V tma_load_V = make_tma_copy(
+        GmemTiledCopyKV{},
+        mV,
+        take<0, 2>(SmemLayoutVt{}),
+        select<2, 1>(TileShape_MNK{}),
+        size<0>(ClusterShape{})); // mcast along M mode for this N load, if any
+    auto const shape_Q_packed = cute::conditional_return<true>(
+        args.shape_Q,
+        make_shape(
+            make_shape(1, get<0>(args.shape_Q)),
+            get<1>(args.shape_Q),
+            get<2>(args.shape_K),
+            get<3>(args.shape_Q)));
+    auto const stride_Q_packed = cute::conditional_return<true>(
+        args.stride_Q,
+        make_stride(
+            make_stride(get<2>(args.stride_Q), get<0>(args.stride_Q)),
+            get<1>(args.stride_Q),
+            get<2>(args.stride_Q),
+            get<3>(args.stride_Q)));
+    return {
+        args.ptr_Q,
+        args.shape_Q,
+        args.stride_Q,
+        shape_Q_packed,
+        stride_Q_packed,
+        args.ptr_K,
+        args.shape_K,
+        args.stride_K,
+        args.ptr_V,
+        args.stride_V,
+        tma_load_Q,
+        tma_load_K,
+        tma_load_V,
+        args.ptr_q_descale,
+        args.ptr_k_descale,
+        args.ptr_v_descale,
+        args.stride_q_descale,
+        args.stride_k_descale,
+        args.stride_v_descale,
+        args.max_seq_len_inv,
+        args.alpha,
+        float(args.alpha * M_LOG2E),
+        args.max_attn_len,
+        args.min_full_attn_seq_len,
+        args.contextual_seq_len,
+        args.num_softmax_heads,
+        args.num_groups,
+        args.batch_size_per_group,
+        args.seq_offsets,
+        args.seq_offsets_q,
+        args.num_targets,
+        args.max_seq_len_tensor,
+        args.contextual_seq_len_tensor,
+        args.max_attn_len_tensor,
+        args.min_full_attn_seq_len_tensor,
+        args.attn_scale,
+        args.scalar_scale};
+  }
+
+  /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best
+  /// performance
+  CUTLASS_DEVICE
+  static void prefetch_tma_descriptors(Params const& params) {
+    cute::prefetch_tma_descriptor(params.tma_load_Q.get_tma_descriptor());
+    cute::prefetch_tma_descriptor(params.tma_load_K.get_tma_descriptor());
+    cute::prefetch_tma_descriptor(params.tma_load_V.get_tma_descriptor());
+  }
+
+  CUTLASS_DEVICE
+  cute::tuple<int, int> get_n_block_min_max(
+      int max_attn_len,
+      int min_full_attn_seq_len,
+      int contextual_seq_len,
+      int uihlen,
+      int m_block) {
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+    static constexpr int kBlockN = get<1>(TileShape_MNK{});
+    if constexpr (Contexual_mask) {
+      if (m_block * kBlockM < contextual_seq_len) {
+        return {0, cute::ceil_div(uihlen, kBlockN)};
+      }
+    }
+    if constexpr (Has_targets) {
+      int m_idx_max = (m_block + 1) * kBlockM;
+      if (m_idx_max > uihlen) {
+        return {0, cute::ceil_div(uihlen, kBlockN)};
+      }
+    }
+    int n_block_max;
+    int n_block_min;
+    // Non-target part, n_block_max
+    if constexpr (Causal || Local) {
+      int m_idx_max = (m_block + 1) * kBlockM;
+      n_block_max = cute::ceil_div(std::min(m_idx_max, uihlen), kBlockN);
+    } else {
+      n_block_max = cute::ceil_div(uihlen, kBlockN);
+    }
+    // Non-target part, n_block_min
+    if constexpr (Local) {
+      int m_idx_min = m_block * kBlockM;
+      int m_idx_max = (m_block + 1) * kBlockM;
+      if (min_full_attn_seq_len == 0 ||
+          m_idx_max <= uihlen - min_full_attn_seq_len) {
+        n_block_min = std::max(int(0), (m_idx_min - max_attn_len) / kBlockN);
+        if constexpr (Contexual_mask) {
+          // row contexual without sink
+          if (n_block_min * kBlockN < contextual_seq_len) {
+            n_block_min = 0;
+          }
+        }
+      } else {
+        n_block_min = 0;
+      }
+    } else {
+      n_block_min = 0;
+    }
+    return {n_block_min, n_block_max};
+  }
+
+  CUTLASS_DEVICE
+  cute::tuple<int, int> get_target_n_block_min_max(
+      int n_block_max,
+      int uihlen,
+      int seqlen,
+      int m_block) {
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+    static constexpr int kBlockN = get<1>(TileShape_MNK{});
+    int m_idx_max = (m_block + 1) * kBlockM;
+    if (m_idx_max <= uihlen) { // Non-target part
+      return {n_block_max, n_block_max};
+    } else { // Target part
+      int m_idx_min = m_block * kBlockM;
+      return {
+          std::max(n_block_max, m_idx_min / kBlockN),
+          cute::ceil_div(std::min(m_idx_max, seqlen), kBlockN)};
+    }
+  }
+
+  CUTLASS_DEVICE
+  int get_contexual_n_block_max(
+      int n_block_min,
+      int min_full_attn_seq_len,
+      int contextual_seq_len,
+      int uihlen,
+      int m_block) {
+    return 0;
+    // TODO: reenable below once contexual + semi local implementation is
+    // finalized
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+    static constexpr int kBlockN = get<1>(TileShape_MNK{});
+    if constexpr (!Local) {
+      return 0;
+    }
+    if (m_block * kBlockM < contextual_seq_len) {
+      return 0;
+    }
+    int m_idx_max = (m_block + 1) * kBlockM;
+    if constexpr (Has_targets) {
+      if (m_idx_max > uihlen) {
+        return 0;
+      }
+    }
+    if (min_full_attn_seq_len == 0 ||
+        m_idx_max <= uihlen - min_full_attn_seq_len) {
+      return std::min(n_block_min, cute::ceil_div(contextual_seq_len, kBlockN));
+    }
+    return 0;
+  }
+
+  CUTLASS_DEVICE
+  cute::tuple<int, int> get_cross_n_block_min_max(
+      int const uihlen_q,
+      int const seqlen_q,
+      int const seqlen_kv,
+      int const m_block) {
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+    static constexpr int kBlockN = get<1>(TileShape_MNK{});
+    if constexpr (!Causal) {
+      return {0, cute::ceil_div(seqlen_kv, kBlockN)};
+    }
+    int n_block_max =
+        std::min(seqlen_kv, (m_block + 1) * kBlockM + seqlen_kv - uihlen_q);
+    return {0, cute::ceil_div(n_block_max, kBlockN)};
+  }
+
+  template <typename SchedulerPrefetch, typename SharedStorage>
+  CUTLASS_DEVICE void load(
+      Params const& params,
+      MainloopPipelineK pipeline_k,
+      MainloopPipelineV pipeline_v,
+      MainloopPipelineVt pipeline_vt,
+      PipelineState& smem_pipe_write,
+      SharedStorage& shared_storage,
+      SchedulerPrefetch const& scheduler_prefetch,
+      SeqlenInfo_t const& seqlen_info,
+      cute::tuple<int32_t, int32_t, int32_t, int32_t> block_coord,
+      int& work_idx) {
+    auto [m_block, bidh, bidb, split_idx] = block_coord;
+    if constexpr (Jagged) {
+      static constexpr int kBlockM = get<0>(TileShape_MNK{});
+      if (m_block * kBlockM >= seqlen_info.seqlen_q) {
+        scheduler_prefetch();
+        return;
+      }
+    }
+    int min_full_attn_seq_len_, max_attn_len_, contextual_seq_len_;
+    if constexpr (!Cross) {
+      if (params.num_groups > 1) {
+        int group_id = bidb / params.batch_size_per_group;
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len_tensor[group_id];
+        max_attn_len_ = params.max_attn_len_tensor[group_id];
+        contextual_seq_len_ = params.contextual_seq_len_tensor[group_id];
+      } else {
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len;
+        max_attn_len_ = params.max_attn_len;
+        contextual_seq_len_ = params.contextual_seq_len;
+      }
+    }
+    int n_block_min, n_block_max;
+    if constexpr (Cross) {
+      auto n_block_min_max = get_cross_n_block_min_max(
+          seqlen_info.uihlen_q,
+          seqlen_info.seqlen_q,
+          seqlen_info.seqlen_kv,
+          m_block);
+      n_block_min = get<0>(n_block_min_max);
+      n_block_max = get<1>(n_block_min_max);
+    } else {
+      auto n_block_min_max = get_n_block_min_max(
+          max_attn_len_,
+          min_full_attn_seq_len_,
+          contextual_seq_len_,
+          seqlen_info.uihlen_q,
+          m_block);
+      n_block_min = get<0>(n_block_min_max);
+      n_block_max = get<1>(n_block_min_max);
+    }
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+    if (n_block_max <= n_block_min) {
+      std::printf(
+          "mainloop_fwd_sm90: n_block_max <= n_block_min not expected.");
+      scheduler_prefetch();
+      return;
+    }
+#endif
+
+    Tensor sQ = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
+        SmemLayoutQ{});
+    Tensor sK = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_k.data()),
+        SmemLayoutK{});
+    Tensor sK_pi = as_position_independent_swizzle_tensor(sK);
+    // as_position_independent_swizzle_tensor makes address calculation easier
+    // when we do LDSM & STSM to transpose. But it requires smem_vt and smem_v
+    // to be aligned to e.g 512 bytes.
+    Tensor sVt = [&] {
+      if constexpr (!Transpose_V) {
+        return make_tensor(
+            make_smem_ptr(shared_storage.tensors.mainloop.smem_v.data()),
+            SmemLayoutVt{});
+      } else {
+        return cute::as_position_independent_swizzle_tensor(make_tensor(
+            make_smem_ptr(shared_storage.tensors.mainloop.smem_vt.data()),
+            SmemLayoutVt{}));
+      }
+    }();
+    // Only used if Transpose_V
+    Tensor sV = cute::as_position_independent_swizzle_tensor(make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_v.data()),
+        SmemLayoutVtMma{}));
+
+    int const thread_idx = threadIdx.x % NumProducerThreads;
+
+    // Prepare the TMA loads
+    uint32_t block_rank_in_cluster = cute::block_rank_in_cluster();
+    constexpr uint32_t cluster_shape_x = get<0>(ClusterShape());
+    uint2 cluster_local_block_id = {
+        block_rank_in_cluster % cluster_shape_x,
+        block_rank_in_cluster / cluster_shape_x};
+
+    Tensor mQ = params.tma_load_Q.get_tma_tensor(params.shape_Q)(
+        _, _, bidh, !Jagged ? bidb : 0);
+    Tensor mK_TMA = params.tma_load_K.get_tma_tensor(params.shape_K)(
+        _, _, bidh, !Jagged ? bidb : 0);
+    Tensor mVt_TMA = params.tma_load_V.get_tma_tensor(
+        select<1, 0, 2, 3>(params.shape_K))(_, _, bidh, !Jagged ? bidb : 0);
+
+    Tensor gQ = local_tile(
+        domain_offset(make_coord(seqlen_info.offset_q, _0{}), mQ),
+        select<0, 2>(TileShape_MNK{}),
+        make_coord(m_block, _0{})); // (M, K)
+    // if (cute::thread0()) { printf("Jagged = %d, params.leftpad_k = %p,
+    // leftpad_k = %d\n", Jagged, params.leftpad_k, leftpad_k); }
+    Tensor gK_TMA = local_tile(
+        domain_offset(make_coord(seqlen_info.offset_k, _0{}), mK_TMA),
+        select<1, 2>(TileShape_MNK{}),
+        make_coord(_, _0{})); // (N, K, _)
+    Tensor gVt_TMA = local_tile(
+        domain_offset(make_coord(_0{}, seqlen_info.offset_k), mVt_TMA),
+        select<2, 1>(TileShape_MNK{}),
+        make_coord(_0{}, _)); // (K, N, _)
+
+    auto block_tma_Q = params.tma_load_Q.get_slice(_0{});
+    Tensor tQgQ = group_modes<0, 3>(block_tma_Q.partition_S(gQ)); // (TMA)
+    Tensor tQsQ = group_modes<0, 3>(block_tma_Q.partition_D(sQ)); // (TMA)
+    // tma_partition doesn't handle position_independent_swizzle_tensor
+    // correctly, so we need to do it manually
+    auto block_tma_K = params.tma_load_K.get_slice(cluster_local_block_id.x);
+    Tensor tKgK_TMA =
+        group_modes<0, 3>(block_tma_K.partition_S(gK_TMA)); // (TMA, k)
+    Tensor tKsK_TMA =
+        group_modes<0, 3>(block_tma_K.partition_D(sK)); // (TMA, PIPE)
+    auto block_tma_V = params.tma_load_V.get_slice(cluster_local_block_id.x);
+    Tensor tVgVt_TMA =
+        group_modes<0, 3>(block_tma_V.partition_S(gVt_TMA)); // (TMA, k)
+    Tensor tVsVt_TMA =
+        group_modes<0, 3>(block_tma_V.partition_D(sVt)); // (TMA, PIPE)
+
+    // Set up for transposing V, only used if Transpose_V
+    S2RTiledCopyVt s2r_tiled_copy_vt;
+    R2STiledCopyV r2s_tiled_copy_v;
+    auto s2r_thr_copy_vt = s2r_tiled_copy_vt.get_thread_slice(thread_idx);
+    auto r2s_thr_copy_v = r2s_tiled_copy_v.get_thread_slice(thread_idx);
+    // flat_divide(sVt, LDSM_divide_shape{}):  (64, 8, kHeadDim / 64, kBlockN /
+    // 8, kStages)
+    Tensor tTranssVt_ = s2r_thr_copy_vt.partition_S(
+        flat_divide(sVt, LDSM_divide_shape{})); // ((16, 1), 1, 1, kHeadDim /
+                                                // 64, kBlockN / 32, kStages)
+    // flat_divide(sV, STSM_divide_shape{}):  (8, 16, kHeadDim / 8, (4, kBlockN
+    // / 64), kStages)
+    Tensor tTranssV_ = r2s_thr_copy_v.partition_D(
+        flat_divide(sV, STSM_divide_shape{})); // ((16, 1), 1, 1, kHeadDim / 64,
+                                               // (2, kBlockN / 64), kStages)
+    CUTE_STATIC_ASSERT_V(rank(tTranssVt_) == rank(tTranssV_));
+    CUTE_STATIC_ASSERT_V(size<0>(tTranssVt_) == size<0>(tTranssV_));
+    CUTE_STATIC_ASSERT_V(size<1>(tTranssVt_) == size<1>(tTranssV_));
+    CUTE_STATIC_ASSERT_V(size<2>(tTranssVt_) == size<2>(tTranssV_));
+    CUTE_STATIC_ASSERT_V(size<3>(tTranssVt_) == size<3>(tTranssV_));
+    CUTE_STATIC_ASSERT_V(size<4>(tTranssVt_) == size<4>(tTranssV_));
+    // Faster to have 2 LDSM.T, byte permute, STSM for better ILP
+    static constexpr int Transpose_ILP =
+        (size<2>(tTranssVt_) * size<3>(tTranssVt_)) % 2 == 0 ? 2 : 1;
+    Tensor tTranssVt = logical_divide(
+        group_modes<1, rank(tTranssVt_) - 1>(tTranssVt_),
+        Shape<Underscore, Int<Transpose_ILP>>{}); // ((16, 1), (2, kHeadDim / 64
+                                                  // * kBlockN / 32 / 2),
+                                                  // kStages)
+    Tensor tTranssV = logical_divide(
+        group_modes<1, rank(tTranssV_) - 1>(tTranssV_),
+        Shape<Underscore, Int<Transpose_ILP>>{}); // ((16, 1), (2, kHeadDim / 64
+                                                  // * kBlockN / 32 / 2),
+                                                  // kStages)
+    auto transpose_V = [&](int stage) {
+      if constexpr (Transpose_V) {
+#pragma unroll
+        for (int i = 0; i < size<1, 1>(tTranssVt); ++i) {
+          Tensor tTransrV =
+              make_fragment_like(tTranssV(_, make_coord(_, _0{}), _0{}));
+          static_assert(size<0>(tTransrV) == 16);
+          Tensor tTransrV_64 = recast<uint2>(tTransrV);
+          cute::copy(
+              s2r_tiled_copy_vt,
+              tTranssVt(_, make_coord(_, i), stage),
+              tTransrV);
+#pragma unroll
+          for (int j = 0; j < size(tTransrV_64); ++j) {
+            uint32_t upper = tTransrV_64[j].x;
+            uint32_t lower = tTransrV_64[j].y;
+            tTransrV_64[j].x = __byte_perm(upper, lower, 0x6420);
+            tTransrV_64[j].y = __byte_perm(upper, lower, 0x7531);
+          }
+          cute::copy(
+              r2s_tiled_copy_v, tTransrV, tTranssV(_, make_coord(_, i), stage));
+        }
+      }
+    };
+
+    uint16_t mcast_mask_kv = 0;
+    if constexpr (cute::is_same_v<GmemTiledCopyKV, SM90_TMA_LOAD_MULTICAST>) {
+      auto block_layout = Layout<ClusterShape>{}; // (m,n) -> block_id
+      for (int m = 0; m < size<0>(block_layout); ++m) {
+        mcast_mask_kv |=
+            (uint16_t(1) << block_layout(m, cluster_local_block_id.y, _0{}));
+      }
+    }
+
+    auto load_K = [&](int const n_block, auto const& smem_pipe_write) {
+      pipeline_k.producer_acquire(smem_pipe_write);
+      copy(
+          params.tma_load_K.with(
+              *pipeline_k.producer_get_barrier(smem_pipe_write),
+              mcast_mask_kv,
+              TMA::CacheHintSm90::EVICT_LAST),
+          tKgK_TMA(_, n_block),
+          tKsK_TMA(_, smem_pipe_write.index()));
+    };
+
+    auto load_V = [&](int const n_block, auto const& smem_pipe_write) {
+      auto pipeline_v_load =
+          cute::conditional_return<!Transpose_V>(pipeline_v, pipeline_vt);
+      pipeline_v_load.producer_acquire(smem_pipe_write);
+      copy(
+          params.tma_load_V.with(
+              *pipeline_v_load.producer_get_barrier(smem_pipe_write),
+              mcast_mask_kv,
+              TMA::CacheHintSm90::EVICT_LAST),
+          tVgVt_TMA(_, n_block),
+          tVsVt_TMA(_, smem_pipe_write.index()));
+    };
+
+    auto copy_Vt_to_V = [&](auto const& smem_pipe_write) {
+      // Instead of maintaining smem_pipe_read as a separate variable, we can
+      // just use smem_pipe_write, and exploit the invariance that
+      // smem_pipe_write.phase() == smem_pipe_read.phase() ^ 1. This saves 1 or
+      // 2 registers.
+      PipelineState smem_pipe_read{
+          smem_pipe_write.index(),
+          smem_pipe_write.phase() ^ 1,
+          smem_pipe_write.count()};
+      pipeline_vt.consumer_wait(smem_pipe_read);
+      pipeline_v.producer_acquire(smem_pipe_write);
+      transpose_V(smem_pipe_write.index());
+      // SMEM fence to make sure V is transposed before math
+      cutlass::arch::fence_view_async_shared();
+      pipeline_v.producer_commit(smem_pipe_write);
+      // Very important: PipelineTmaAsync::consumer_release assumes that the
+      // warpgroup is synchronized before calling. Without this we get race
+      // conditions.
+      cutlass::arch::NamedBarrier::sync(
+          cutlass::NumThreadsPerWarpGroup,
+          static_cast<uint32_t>(FwdNamedBarriers::ProducerWG) /*id*/);
+      pipeline_vt.consumer_release(smem_pipe_read);
+    };
+
+    int n_block = n_block_max - 1;
+
+    int warp_idx_in_warpgroup =
+        __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0);
+    // If this is true, we're guaranteed that only the first warp will execute
+    // this function
+    static constexpr bool SingleProducerWarp =
+        NumProducerThreads == cutlass::NumThreadsPerWarp;
+    bool should_load_KV =
+        ((SingleProducerWarp || warp_idx_in_warpgroup == 0) &&
+         cute::elect_one_sync());
+
+    if (should_load_KV) {
+      if constexpr (Transpose_V) {
+        load_V(n_block, smem_pipe_write);
+      }
+      // if (thread_idx == 0) { printf("Producer: main load, before load_K,
+      // index = %d\n", smem_pipe_write.index());}
+      load_K(n_block, smem_pipe_write);
+      // if (thread_idx == 0) { printf("Producer: main load, after load K, index
+      // = %d\n", smem_pipe_write.index());}
+    }
+
+    // TMA_Q, Wait for the MMA warpgroups to signal that smem_q is ready
+    if (SingleProducerWarp || warp_idx_in_warpgroup == 0) {
+      cutlass::arch::NamedBarrier::sync(
+          NumMmaThreads + cutlass::NumThreadsPerWarp,
+          static_cast<uint32_t>(FwdNamedBarriers::QueryEmpty) /*id*/);
+    }
+    if ((SingleProducerWarp || warp_idx_in_warpgroup == 0) &&
+        cute::elect_one_sync()) {
+      shared_storage.pipelines.barrier_Q.arrive_and_expect_tx(
+          TmaTransactionBytesQ);
+      copy(
+          params.tma_load_Q.with(
+              reinterpret_cast<typename cutlass::arch::
+                                   ClusterTransactionBarrier::ValueType&>(
+                  shared_storage.pipelines.barrier_Q),
+              0 /*mcast_mask*/,
+              TMA::CacheHintSm90::EVICT_FIRST),
+          tQgQ,
+          tQsQ);
+    }
+
+    // Wait for the MMA WGs to signal that smem_v are ready and V can be copied
+    // from gmem Need ClusterBarrier, not just NamedBarrier. Otherwise we might
+    // have CTA 0 finishing the TMA store on O first, call TMA multicast load on
+    // V, before CTA 1 can finishing TMA store on O. if (thread_idx == 0) {
+    // printf("Producer: main load, before barrier_O, work_idx = %d\n",
+    // work_idx);}
+    shared_storage.pipelines.barrier_O.wait((work_idx + 1) % 2);
+    // if (thread_idx == 0) { printf("Producer: main load, after barrier_O\n");}
+
+    int n_block_prev = n_block;
+    --n_block;
+#pragma unroll(!Transpose_V ? 2 : 1)
+    for (; n_block >= n_block_min; --n_block) {
+      PipelineState smem_pipe_write_v =
+          smem_pipe_write; // copy the state, write_v is always 1 step behind
+      ++smem_pipe_write;
+      if (should_load_KV) {
+        if constexpr (Transpose_V) {
+          load_V(n_block, smem_pipe_write);
+        } else {
+          load_V(n_block_prev, smem_pipe_write_v);
+        }
+        load_K(n_block, smem_pipe_write);
+      }
+      n_block_prev = n_block;
+      if constexpr (Transpose_V) {
+        copy_Vt_to_V(smem_pipe_write_v);
+      }
+    }
+    scheduler_prefetch();
+    if constexpr (!Transpose_V) {
+      if (should_load_KV) {
+        load_V(n_block_prev, smem_pipe_write);
+      }
+    }
+    if constexpr (Transpose_V) {
+      copy_Vt_to_V(smem_pipe_write);
+    }
+    ++smem_pipe_write;
+    if constexpr (!Cross) {
+      if constexpr (Has_targets) {
+        auto [target_n_block_min, target_n_block_max] =
+            get_target_n_block_min_max(
+                n_block_max,
+                seqlen_info.uihlen_q,
+                seqlen_info.seqlen_kv,
+                m_block);
+#pragma unroll 1
+        for (n_block = target_n_block_max - 1; n_block >= target_n_block_min;
+             --n_block) {
+          if (should_load_KV) {
+            load_V(n_block, smem_pipe_write);
+            load_K(n_block, smem_pipe_write);
+          }
+          if constexpr (Transpose_V) {
+            copy_Vt_to_V(smem_pipe_write);
+          }
+          ++smem_pipe_write;
+        }
+      }
+      if constexpr (Contexual_mask) {
+        int contexual_n_block_max = get_contexual_n_block_max(
+            n_block_min,
+            min_full_attn_seq_len_,
+            contextual_seq_len_,
+            seqlen_info.uihlen_q,
+            m_block);
+#pragma unroll 1
+        for (n_block = contexual_n_block_max - 1; n_block >= 0; --n_block) {
+          if (should_load_KV) {
+            load_V(n_block, smem_pipe_write);
+            load_K(n_block, smem_pipe_write);
+          }
+          if constexpr (Transpose_V) {
+            copy_Vt_to_V(smem_pipe_write);
+          }
+          ++smem_pipe_write;
+        }
+      }
+    }
+    // At the end, all threads have the correct smem_pipe_write.
+    ++work_idx;
+  }
+
+  template <typename SharedStorage>
+  CUTLASS_DEVICE void load_tail(
+      MainloopPipelineK pipeline_k,
+      MainloopPipelineV pipeline_v,
+      MainloopPipelineVt pipeline_vt,
+      PipelineState& smem_pipe_write,
+      SharedStorage& shared_storage,
+      int const work_idx) {
+    // If we don't wait for barrier_O here, when using Cluster, CTA0 might exit
+    // early and CTA1 will try to arrive on barrier_O of CTA0, causing
+    // "unspecified launch failure".
+    shared_storage.pipelines.barrier_O.wait((work_idx + 1) % 2);
+    int warp_idx_in_warpgroup =
+        __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0);
+    // Issue the epilogue waits
+    // TODO: check if this should be called by 1 thread or more
+    if (warp_idx_in_warpgroup == 0 && cute::elect_one_sync()) {
+      /* This helps avoid early exit of blocks in Cluster
+       *  Waits for all stages to either be released (all Consumer UNLOCKs), or
+       * if the stage was never used then would just be acquired since the phase
+       * was still inverted from make_producer_start_state
+       */
+      pipeline_k.producer_tail(smem_pipe_write);
+      pipeline_v.producer_tail(smem_pipe_write);
+      if constexpr (Transpose_V) {
+        pipeline_vt.producer_tail(smem_pipe_write);
+      }
+    }
+  }
+
+  CUTLASS_DEVICE void warp_scheduler_barrier_sync() {
+    if constexpr (UseSchedulerBarrier) {
+      cutlass::arch::NamedBarrier::sync(
+          2 * cutlass::NumThreadsPerWarpGroup,
+          static_cast<uint32_t>(FwdNamedBarriers::WarpSchedulerWG1) - 1 +
+              hstu::canonical_warp_group_idx_nosync() /*id*/);
+    }
+  }
+
+  CUTLASS_DEVICE void warp_scheduler_barrier_arrive() {
+    if constexpr (UseSchedulerBarrier) {
+      static_assert(NumMmaWarpGroups == 2 || NumMmaWarpGroups == 3);
+      int const cur_WG = hstu::canonical_warp_group_idx_nosync() - 1;
+      int const next_WG = NumMmaWarpGroups == 2
+          ? 1 - cur_WG
+          : (cur_WG < NumMmaWarpGroups - 1 ? cur_WG + 1 : 0);
+      cutlass::arch::NamedBarrier::arrive(
+          2 * cutlass::NumThreadsPerWarpGroup,
+          static_cast<uint32_t>(FwdNamedBarriers::WarpSchedulerWG1) +
+              next_WG /*id*/);
+    }
+  }
+
+  CUTLASS_DEVICE void mma_init() {
+    // Tell producers that smem_q is ready
+    cutlass::arch::NamedBarrier::arrive(
+        NumMmaThreads + cutlass::NumThreadsPerWarp,
+        static_cast<uint32_t>(FwdNamedBarriers::QueryEmpty) /*id*/);
+    if constexpr (UseSchedulerBarrier) {
+      // We have NamedBarrier for up to 3 WGs
+      static_assert(NumMmaWarpGroups == 2 || NumMmaWarpGroups == 3);
+      // WG1 needs the very first signal to start
+      if (hstu::canonical_warp_group_idx_nosync() == 1) {
+        cutlass::arch::NamedBarrier::arrive(
+            2 * cutlass::NumThreadsPerWarpGroup,
+            static_cast<uint32_t>(FwdNamedBarriers::WarpSchedulerWG1) /*id*/);
+      }
+    }
+  }
+
+  template <typename SharedStorage, typename FrgTensorO>
+  CUTLASS_DEVICE bool mma(
+      Params const& params,
+      MainloopPipelineK pipeline_k,
+      MainloopPipelineV pipeline_v,
+      PipelineState& smem_pipe_read,
+      FrgTensorO& tOrO,
+      int const thread_idx,
+      int& work_idx,
+      SeqlenInfo_t const& seqlen_info,
+      cute::tuple<int32_t, int32_t, int32_t, int32_t> block_coord,
+      SharedStorage& shared_storage) {
+    static_assert(
+        is_rmem<FrgTensorO>::value, "O tensor must be rmem resident.");
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+    static constexpr int kBlockN = get<1>(TileShape_MNK{});
+
+    // can't use auto [m_block, ...] = block_coord since structured binding
+    // cannot be captured in lambda
+    int const m_block = get<0>(block_coord);
+    int const bidh = get<1>(block_coord);
+    int const bidb = get<2>(block_coord);
+    int const split_idx = get<3>(block_coord);
+    if constexpr (Jagged) {
+      static constexpr int kBlockM = get<0>(TileShape_MNK{});
+      if (m_block * kBlockM >= seqlen_info.seqlen_q) {
+        return false;
+      }
+    }
+    int min_full_attn_seq_len_, max_attn_len_, contextual_seq_len_;
+    float scalar_scale_val_;
+    if constexpr (!Cross) {
+      if (params.num_groups > 1) {
+        int group_id = bidb / params.batch_size_per_group;
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len_tensor[group_id];
+        max_attn_len_ = params.max_attn_len_tensor[group_id];
+        contextual_seq_len_ = params.contextual_seq_len_tensor[group_id];
+      } else {
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len;
+        max_attn_len_ = params.max_attn_len;
+        contextual_seq_len_ = params.contextual_seq_len;
+      }
+    }
+    if (params.num_groups > 1) {
+      int group_id = bidb / params.batch_size_per_group;
+      int max_seq_len_per_group = params.max_seq_len_tensor[group_id];
+      // attention scale
+      scalar_scale_val_ = params.scalar_scale
+          ? (params.attn_scale == nullptr ? 1.0f / max_seq_len_per_group
+                                          : params.attn_scale[group_id])
+          : 0;
+    } else {
+      // attention scale
+      scalar_scale_val_ = params.scalar_scale
+          ? (params.attn_scale == nullptr ? params.max_seq_len_inv
+                                          : params.attn_scale[0])
+          : 0;
+    }
+    int n_block_min, n_block_max;
+    if constexpr (Cross) {
+      auto n_block_min_max = get_cross_n_block_min_max(
+          seqlen_info.uihlen_q,
+          seqlen_info.seqlen_q,
+          seqlen_info.seqlen_kv,
+          m_block);
+      n_block_min = get<0>(n_block_min_max);
+      n_block_max = get<1>(n_block_min_max);
+    } else {
+      auto n_block_min_max = get_n_block_min_max(
+          max_attn_len_,
+          min_full_attn_seq_len_,
+          contextual_seq_len_,
+          seqlen_info.uihlen_q,
+          m_block);
+      n_block_min = get<0>(n_block_min_max);
+      n_block_max = get<1>(n_block_min_max);
+    }
+
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+    if (n_block_max <= n_block_min) {
+      std::printf(
+          "mainloop_fwd_sm90: n_block_max <= n_block_min not expected.");
+      return false;
+    }
+#endif
+
+    Tensor sQ = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
+        SmemLayoutQ{});
+    Tensor sK = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_k.data()),
+        SmemLayoutK{});
+    Tensor sV = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_v.data()),
+        SmemLayoutVtMma{});
+    Tensor sP = [&] {
+      if constexpr (Mma1_is_RS) {
+        // We might not have smem_p if !Mma1_is_RS1, just use smem_q as a
+        // placeholder since we don't use it
+        return make_tensor(
+            make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
+            SmemLayoutP{});
+      } else {
+        return make_tensor(
+            make_smem_ptr(shared_storage.tensors.mainloop.smem_p.data()),
+            SmemLayoutP{});
+      }
+    }();
+
+    if constexpr (!Mma0_is_RS) {
+      static_assert(
+          stride<0>(typename TiledMma0::ALayout{}) == 0 and
+              stride<0>(typename TiledMma0::BLayout{}) == 0 and
+              size<0>(typename TiledMma0::ALayout{}) ==
+                  cutlass::NumThreadsPerWarpGroup and
+              size<0>(typename TiledMma0::BLayout{}) ==
+                  cutlass::NumThreadsPerWarpGroup,
+          "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
+    }
+    constexpr int MmaWarpGroups =
+        size(TiledMma0{}) / cutlass::NumThreadsPerWarpGroup;
+    Layout warp_group_thread_layout = make_layout(
+        make_shape(Int<MmaWarpGroups>{}),
+        make_stride(Int<cutlass::NumThreadsPerWarpGroup>{}));
+
+    int warp_group_idx = __shfl_sync(
+        0xFFFFFFFF, thread_idx / cutlass::NumThreadsPerWarpGroup, 0);
+    TiledMma0 tiled_mma0;
+    TiledMma1 tiled_mma1;
+    auto wg_mma0 =
+        tiled_mma0.get_slice(warp_group_thread_layout(warp_group_idx));
+    auto wg_mma1 =
+        tiled_mma1.get_slice(warp_group_thread_layout(warp_group_idx));
+
+    auto smem_tiled_copy_P = make_tiled_copy_C(SmemCopyAtomP{}, tiled_mma0);
+    auto smem_thr_copy_P = smem_tiled_copy_P.get_thread_slice(thread_idx);
+
+    // Allocate "fragments/descriptors"
+    Tensor tSrQ = wg_mma0.partition_fragment_A(sQ);
+    Tensor tSrK = wg_mma0.partition_fragment_B(sK);
+    Tensor tOrV = wg_mma1.partition_fragment_B(sV);
+    Tensor tOsP = wg_mma1.partition_fragment_A(sP);
+    Tensor tPsP = smem_thr_copy_P.partition_D(
+        cute::as_position_independent_swizzle_tensor(sP));
+
+    auto consumer_wait = [](auto& pipeline, auto& smem_pipe_read) {
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+    };
+
+    clear(tOrO);
+
+    int n_block = n_block_max - 1;
+
+    hstu::Mask<kBlockM, kBlockN, TiledMma0> mask(
+        thread_idx,
+        seqlen_info.seqlen_q,
+        seqlen_info.seqlen_kv,
+        max_attn_len_,
+        min_full_attn_seq_len_,
+        contextual_seq_len_,
+        seqlen_info.uihlen_q);
+
+    auto& barrier_Q = shared_storage.pipelines.barrier_Q;
+    barrier_Q.wait(work_idx % 2);
+
+    static constexpr int Qdim = 0;
+    auto thread_mma = tiled_mma0.get_thread_slice(thread_idx);
+    auto thread0_mma = tiled_mma0.get_thread_slice(_0{});
+    Tensor cS = cute::make_identity_tensor(Shape<Int<kBlockM>, Int<kBlockN>>{});
+    Tensor tScS = thread_mma.partition_C(cS);
+    Tensor tScS_rowcol = make_tensor(
+        tScS.data(),
+        hstu::convert_layout_acc_rowcol</*Transposed=*/false>(tScS.layout()));
+    Tensor t0ScS = thread0_mma.partition_C(cS);
+    Tensor t0ScS_rowcol = make_tensor(
+        t0ScS.data(),
+        hstu::convert_layout_acc_rowcol</*Transposed=*/false>(t0ScS.layout()));
+    int const thread_qdim_offset = get<Qdim>(tScS_rowcol(_0{}, _0{}));
+    SiluScaleOp<float> silu_scale_op;
+    int qdim_offset = params.scalar_scale
+        ? 0
+        : m_block * kBlockM + thread_qdim_offset + seqlen_info.offset_q;
+
+    if constexpr (Mma0_is_RS) {
+      using SmemCopyAtomQ = Copy_Atom<cute::SM75_U32x4_LDSM_N, Element>;
+      auto smem_tiled_copy_Q = make_tiled_copy_A(SmemCopyAtomQ{}, tiled_mma0);
+      auto smem_thr_copy_Q = smem_tiled_copy_Q.get_thread_slice(thread_idx);
+      Tensor tSrQ_copy_view = smem_thr_copy_Q.retile_D(tSrQ);
+      Tensor tSsQ_copy_view = smem_thr_copy_Q.partition_S(
+          cute::as_position_independent_swizzle_tensor(sQ));
+      cute::copy(smem_tiled_copy_Q, tSsQ_copy_view, tSrQ_copy_view);
+    }
+
+    Tensor tSrS =
+        partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{}));
+    Tensor tSrS_rowcol = make_tensor(
+        tSrS.data(),
+        hstu::convert_layout_acc_rowcol</*Transposed=*/false>(tSrS.layout()));
+    consumer_wait(pipeline_k, smem_pipe_read);
+    hstu::gemm</*zero_init=*/true, /*wg_wait=*/-1>(
+        tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read.index()), tSrS);
+    warpgroup_wait<0>();
+    pipeline_k.consumer_release(smem_pipe_read);
+#pragma unroll
+    for (int mi = 0; mi < size<0>(tSrS_rowcol); ++mi) {
+      float scale = scalar_scale_val_;
+      if (!params.scalar_scale) {
+        int q_index = qdim_offset + int(get<Qdim>(t0ScS_rowcol(mi, _0{})));
+        // Convert global index to local sequence position for bounds checking
+        int q_local = q_index - seqlen_info.offset_q;
+        if (q_local < seqlen_info.seqlen_q) {
+          scale = params.attn_scale[q_index];
+        }
+      }
+#pragma unroll
+      for (int ni = 0; ni < size<1>(tSrS_rowcol); ++ni) {
+        tSrS_rowcol(mi, ni) =
+            silu_scale_op(tSrS_rowcol(mi, ni) * params.alpha, scale);
+      }
+    }
+    int const m_idx_max = (m_block + 1) * kBlockM;
+    if constexpr (Cross) {
+      mask.template apply<
+          false /*Seqlenq_mask*/,
+          true /*Seqlenk_mask*/,
+          Causal,
+          false /*Local*/,
+          false /*Contexual_mask*/,
+          false /*Target_mask*/,
+          Cross,
+          false /*Softmax*/>(tSrS, m_block, n_block);
+    } else {
+      if (m_idx_max <= seqlen_info.uihlen_q) {
+        mask.template apply<
+            false /*Seqlenq_mask*/,
+            false /*Seqlenk_mask*/,
+            Causal,
+            Local,
+            Contexual_mask,
+            false /*Target_mask*/,
+            Cross,
+            false /*Softmax*/>(tSrS, m_block, n_block);
+      } else if (
+          m_idx_max <=
+          cute::ceil_div(seqlen_info.uihlen_q, kBlockM) * kBlockM) {
+        mask.template apply<
+            false /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            Causal,
+            Local,
+            Contexual_mask,
+            Has_targets,
+            Cross,
+            false /*Softmax*/>(tSrS, m_block, n_block);
+      } else {
+        mask.template apply<
+            false /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            false /*Causal*/,
+            false,
+            Contexual_mask,
+            Has_targets,
+            Cross,
+            false /*Softmax*/>(tSrS, m_block, n_block);
+      }
+    }
+    if constexpr (Is_FP8 && !V_colmajor) {
+      hstu::permute_Cregs_fp8(tSrS);
+    }
+    Tensor tOrP_acc = make_tensor(
+        tSrS.data(), hstu::convert_layout_acc_Aregs<TiledMma1>(tSrS.layout()));
+    Tensor tOrP = make_tensor_like<Element>(tOrP_acc);
+    convert_type_out(tOrP_acc, tOrP);
+    if constexpr (Is_FP8 && V_colmajor) {
+      hstu::permute_Aregs_fp8(tOrP);
+    }
+    if constexpr (!Mma1_is_RS) {
+      cute::copy(smem_tiled_copy_P, smem_thr_copy_P.retile_S(tOrP), tPsP);
+      cutlass::arch::fence_view_async_shared();
+      __syncwarp(); // Only need syncwarp since each warp is using its own P
+                    // values for Mma1
+    }
+    --n_block;
+
+    // Each step does gemm0 and silu for iter n_block and gemm1 for prev iter.
+    auto fwd_step_intra_warp_pipeline = [&](int const n_block, auto mask_fn) {
+      PipelineState smem_pipe_read_v(
+          smem_pipe_read.index(),
+          smem_pipe_read.phase(),
+          smem_pipe_read.count());
+      ++smem_pipe_read;
+      Tensor tSrS =
+          partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{}));
+      Tensor tSrS_rowcol = make_tensor(
+          tSrS.data(),
+          hstu::convert_layout_acc_rowcol</*Transposed=*/false>(tSrS.layout()));
+      if (!UseSchedulerBarrier || warp_group_idx == 0) {
+        consumer_wait(pipeline_k, smem_pipe_read);
+      }
+      warp_scheduler_barrier_sync();
+      hstu::gemm</*zero_init=*/true, /*wg_wait=*/-1>(
+          tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read.index()), tSrS);
+      if (!UseSchedulerBarrier || warp_group_idx == 0) {
+        consumer_wait(pipeline_v, smem_pipe_read_v);
+      }
+      hstu::gemm</*zero_init=*/false, /*wg_wait=*/-1>(
+          tiled_mma1,
+          cute::conditional_return<Mma1_is_RS>(tOrP, tOsP),
+          tOrV(_, _, _, smem_pipe_read_v.index()),
+          tOrO);
+      warp_scheduler_barrier_arrive();
+      warpgroup_wait<1>();
+      pipeline_k.consumer_release(smem_pipe_read); // release K
+#pragma unroll
+      for (int mi = 0; mi < size<0>(tSrS_rowcol); ++mi) {
+        float scale = scalar_scale_val_;
+        if (!params.scalar_scale) {
+          int q_index = qdim_offset + int(get<Qdim>(t0ScS_rowcol(mi, _0{})));
+          // Convert global index to local sequence position for bounds checking
+          int q_local = q_index - seqlen_info.offset_q;
+          if (q_local < seqlen_info.seqlen_q) {
+            scale = params.attn_scale[q_index];
+          }
+        }
+#pragma unroll
+        for (int ni = 0; ni < size<1>(tSrS_rowcol); ++ni) {
+          tSrS_rowcol(mi, ni) =
+              silu_scale_op(tSrS_rowcol(mi, ni) * params.alpha, scale);
+        }
+      }
+      mask_fn(tSrS, n_block);
+      warpgroup_wait<0>();
+      pipeline_v.consumer_release(smem_pipe_read_v); // release V
+      if constexpr (Is_FP8 && !V_colmajor) {
+        hstu::permute_Cregs_fp8(tSrS);
+      }
+      convert_type_out(make_tensor(tSrS.data(), tOrP.layout()), tOrP);
+      if constexpr (Is_FP8 && V_colmajor) {
+        hstu::permute_Aregs_fp8(tOrP);
+      }
+      if constexpr (!Mma1_is_RS) {
+        cute::copy(smem_tiled_copy_P, smem_thr_copy_P.retile_S(tOrP), tPsP);
+      }
+      if constexpr (!Mma1_is_RS) {
+        cutlass::arch::fence_view_async_shared();
+        __syncwarp();
+      }
+    };
+
+    if constexpr (Cross) {
+      if constexpr (Causal) {
+        if (m_idx_max <=
+            cute::ceil_div(seqlen_info.uihlen_q, kBlockM) * kBlockM) {
+          auto mask_fn = [&](auto& tSrS, int n_block) {
+            mask.template apply<
+                false /*Seqlenq_mask*/,
+                false /*Seqlenk_mask*/,
+                Causal,
+                false /*Local*/,
+                false /*Contexual_mask*/,
+                false /*Target_mask*/,
+                Cross,
+                false /*Softmax*/>(tSrS, m_block, n_block);
+          };
+          int const m_idx_min = m_block * kBlockM;
+          int const n_block_min_causal_mask = std::max(
+              n_block_min,
+              (m_idx_min + seqlen_info.seqlen_kv - seqlen_info.uihlen_q) /
+                  kBlockN);
+#pragma unroll 1
+          for (; n_block >= n_block_min_causal_mask; --n_block) {
+            fwd_step_intra_warp_pipeline(n_block, mask_fn);
+          }
+        }
+      }
+      auto no_mask_fn = [](auto& tSrS, int n_block) {};
+#pragma unroll 1
+      for (; n_block >= n_block_min; --n_block) {
+        fwd_step_intra_warp_pipeline(n_block, no_mask_fn);
+      }
+    } else {
+      if constexpr (Causal || Local) { // Separate iterations with causal
+                                       // or local masking
+        if (m_idx_max <=
+            cute::ceil_div(seqlen_info.uihlen_q, kBlockM) * kBlockM) {
+          auto mask_fn = [&](auto& tSrS, int n_block) {
+            mask.template apply<
+                false /*Seqlenq_mask*/,
+                false /*Seqlenk_mask*/,
+                Causal,
+                Local,
+                Contexual_mask,
+                false /*Has_targets*/,
+                Cross,
+                false /*Softmax*/>(tSrS, m_block, n_block);
+          };
+          int const m_idx_min = m_block * kBlockM;
+          int const n_block_min_causal_local_mask =
+              std::max(n_block_min, m_idx_min / kBlockN);
+#pragma unroll 1
+          for (; n_block >= n_block_min_causal_local_mask; --n_block) {
+            fwd_step_intra_warp_pipeline(n_block, mask_fn);
+          }
+        }
+      }
+      int n_block_min_before_local_mask = n_block_min;
+      if constexpr (Local) {
+        if (m_idx_max <=
+            cute::ceil_div(
+                seqlen_info.uihlen_q - min_full_attn_seq_len_, kBlockM) *
+                kBlockM) {
+          n_block_min_before_local_mask = std::max(
+              n_block_min, cute::ceil_div(m_idx_max - max_attn_len_, kBlockN));
+        }
+      }
+      auto no_mask_fn = [](auto& tSrS, int n_block) {};
+#pragma unroll 1
+      for (; n_block >= n_block_min_before_local_mask; --n_block) {
+        fwd_step_intra_warp_pipeline(n_block, no_mask_fn);
+      }
+      // Separate masking iterations on the left for local attention
+      if constexpr (Local) {
+        auto local_mask_fn = [&](auto& tSrS, int n_block) {
+          mask.template apply<
+              false /*Seqlenq_mask*/,
+              false /*Seqlenk_mask*/,
+              false /*Causal_mask*/,
+              Local,
+              Contexual_mask,
+              false /*Has_targets*/,
+              Cross,
+              false /*Softmax*/>(tSrS, m_block, n_block);
+        };
+#pragma unroll 1
+        for (; n_block >= n_block_min; --n_block) {
+          fwd_step_intra_warp_pipeline(n_block, local_mask_fn);
+        }
+      }
+      // Target part GEMM
+      if constexpr (Has_targets) {
+        auto [target_n_block_min, target_n_block_max] =
+            get_target_n_block_min_max(
+                n_block_max,
+                seqlen_info.uihlen_q,
+                seqlen_info.seqlen_kv,
+                m_block);
+        auto target_mask_fn = [&](auto& tSrS, int n_block) {
+          mask.template apply<
+              false /*Seqlenq_mask*/,
+              true /*Seqlenk_mask*/,
+              false /*Causal_mask*/,
+              false /*Local*/,
+              Contexual_mask,
+              Has_targets,
+              Cross,
+              false /*Softmax*/>(tSrS, m_block, n_block);
+        };
+#pragma unroll 1
+        for (n_block = target_n_block_max - 1; n_block >= target_n_block_min;
+             --n_block) {
+          fwd_step_intra_warp_pipeline(n_block, target_mask_fn);
+        }
+      }
+      if constexpr (Contexual_mask) {
+        int contexual_n_block_max = get_contexual_n_block_max(
+            n_block_min,
+            min_full_attn_seq_len_,
+            contextual_seq_len_,
+            seqlen_info.uihlen_q,
+            m_block);
+        auto contexual_mask_fn = [&](auto& tSrS, int n_block) {
+          mask.template apply<
+              false /*Seqlenq_mask*/,
+              true /*Seqlenk_mask*/,
+              false /*Causal_mask*/,
+              Local,
+              Contexual_mask,
+              Has_targets,
+              Cross,
+              false /*Softmax*/>(tSrS, m_block, n_block);
+        };
+#pragma unroll 1
+        for (n_block = contexual_n_block_max - 1; n_block >= 0; --n_block) {
+          fwd_step_intra_warp_pipeline(n_block, contexual_mask_fn);
+        }
+      }
+    }
+    // Tell producers that smem_q is ready
+    cutlass::arch::NamedBarrier::arrive(
+        NumMmaThreads + cutlass::NumThreadsPerWarp,
+        static_cast<uint32_t>(FwdNamedBarriers::QueryEmpty) /*id*/);
+    consumer_wait(pipeline_v, smem_pipe_read);
+    hstu::gemm</*zero_init=*/false, /*wg_wait=*/-1>(
+        tiled_mma1,
+        cute::conditional_return<Mma1_is_RS>(tOrP, tOsP),
+        tOrV(_, _, _, smem_pipe_read.index()),
+        tOrO);
+    warpgroup_wait<0>();
+    pipeline_v.consumer_release(
+        smem_pipe_read); // release V, otherwise producers will hang
+    if constexpr (Is_FP8 && !V_colmajor) {
+      hstu::permute_output_fp8(tOrO);
+    }
+    ++smem_pipe_read;
+    ++work_idx;
+    return true;
+  }
+
+  template <typename SharedStorage, typename FrgTensorO, typename Softmax>
+  CUTLASS_DEVICE bool mma_softmax(
+      Params const& params,
+      MainloopPipelineK pipeline_k,
+      MainloopPipelineV pipeline_v,
+      PipelineState& smem_pipe_read,
+      FrgTensorO& tOrO,
+      Softmax& softmax,
+      int const thread_idx,
+      int& work_idx,
+      SeqlenInfo_t const& seqlen_info,
+      cute::tuple<int32_t, int32_t, int32_t, int32_t> block_coord,
+      SharedStorage& shared_storage) {
+    static_assert(
+        is_rmem<FrgTensorO>::value, "O tensor must be rmem resident.");
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+    static constexpr int kBlockN = get<1>(TileShape_MNK{});
+
+    // can't use auto [m_block, ...] = block_coord since structured binding
+    // cannot be captured in lambda
+    int const m_block = get<0>(block_coord);
+    int const bidh = get<1>(block_coord);
+    int const bidb = get<2>(block_coord);
+    int const split_idx = get<3>(block_coord);
+    if constexpr (Jagged) {
+      static constexpr int kBlockM = get<0>(TileShape_MNK{});
+      if (m_block * kBlockM >= seqlen_info.seqlen_q) {
+        return false;
+      }
+    }
+    int min_full_attn_seq_len_, max_attn_len_, contextual_seq_len_;
+    if constexpr (!Cross) {
+      if (params.num_groups > 1) {
+        int group_id = bidb / params.batch_size_per_group;
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len_tensor[group_id];
+        max_attn_len_ = params.max_attn_len_tensor[group_id];
+        contextual_seq_len_ = params.contextual_seq_len_tensor[group_id];
+      } else {
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len;
+        max_attn_len_ = params.max_attn_len;
+        contextual_seq_len_ = params.contextual_seq_len;
+      }
+    }
+    int n_block_min, n_block_max;
+    if constexpr (Cross) {
+      auto n_block_min_max = get_cross_n_block_min_max(
+          seqlen_info.uihlen_q,
+          seqlen_info.seqlen_q,
+          seqlen_info.seqlen_kv,
+          m_block);
+      n_block_min = get<0>(n_block_min_max);
+      n_block_max = get<1>(n_block_min_max);
+    } else {
+      auto n_block_min_max = get_n_block_min_max(
+          max_attn_len_,
+          min_full_attn_seq_len_,
+          contextual_seq_len_,
+          seqlen_info.uihlen_q,
+          m_block);
+      n_block_min = get<0>(n_block_min_max);
+      n_block_max = get<1>(n_block_min_max);
+    }
+
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+    if (n_block_max <= n_block_min) {
+      std::printf(
+          "mainloop_fwd_sm90: n_block_max <= n_block_min not expected.");
+      return false;
+    }
+#endif
+
+    Tensor sQ = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
+        SmemLayoutQ{});
+    Tensor sK = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_k.data()),
+        SmemLayoutK{});
+    Tensor sV = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_v.data()),
+        SmemLayoutVtMma{});
+    Tensor sP = [&] {
+      if constexpr (Mma1_is_RS) {
+        // We might not have smem_p if !Mma1_is_RS1, just use smem_q as a
+        // placeholder since we don't use it
+        return make_tensor(
+            make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
+            SmemLayoutP{});
+      } else {
+        return make_tensor(
+            make_smem_ptr(shared_storage.tensors.mainloop.smem_p.data()),
+            SmemLayoutP{});
+      }
+    }();
+
+    if constexpr (!Mma0_is_RS) {
+      static_assert(
+          stride<0>(typename TiledMma0::ALayout{}) == 0 and
+              stride<0>(typename TiledMma0::BLayout{}) == 0 and
+              size<0>(typename TiledMma0::ALayout{}) ==
+                  cutlass::NumThreadsPerWarpGroup and
+              size<0>(typename TiledMma0::BLayout{}) ==
+                  cutlass::NumThreadsPerWarpGroup,
+          "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
+    }
+    constexpr int MmaWarpGroups =
+        size(TiledMma0{}) / cutlass::NumThreadsPerWarpGroup;
+    Layout warp_group_thread_layout = make_layout(
+        make_shape(Int<MmaWarpGroups>{}),
+        make_stride(Int<cutlass::NumThreadsPerWarpGroup>{}));
+
+    int warp_group_idx = __shfl_sync(
+        0xFFFFFFFF, thread_idx / cutlass::NumThreadsPerWarpGroup, 0);
+    TiledMma0 tiled_mma0;
+    TiledMma1 tiled_mma1;
+    auto wg_mma0 =
+        tiled_mma0.get_slice(warp_group_thread_layout(warp_group_idx));
+    auto wg_mma1 =
+        tiled_mma1.get_slice(warp_group_thread_layout(warp_group_idx));
+
+    auto smem_tiled_copy_P = make_tiled_copy_C(SmemCopyAtomP{}, tiled_mma0);
+    auto smem_thr_copy_P = smem_tiled_copy_P.get_thread_slice(thread_idx);
+
+    // Allocate "fragments/descriptors"
+    Tensor tSrQ = wg_mma0.partition_fragment_A(sQ);
+    Tensor tSrK = wg_mma0.partition_fragment_B(sK);
+    Tensor tOrV = wg_mma1.partition_fragment_B(sV);
+    Tensor tOsP = wg_mma1.partition_fragment_A(sP);
+    Tensor tPsP = smem_thr_copy_P.partition_D(
+        cute::as_position_independent_swizzle_tensor(sP));
+
+    auto consumer_wait = [](auto& pipeline, auto& smem_pipe_read) {
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+    };
+
+    clear(tOrO);
+
+    int n_block = n_block_max - 1;
+
+    hstu::Mask<kBlockM, kBlockN, TiledMma0> mask(
+        thread_idx,
+        seqlen_info.seqlen_q,
+        seqlen_info.seqlen_kv,
+        max_attn_len_,
+        min_full_attn_seq_len_,
+        contextual_seq_len_,
+        seqlen_info.uihlen_q);
+
+    auto& barrier_Q = shared_storage.pipelines.barrier_Q;
+    barrier_Q.wait(work_idx % 2);
+    static constexpr int Qdim = 0;
+    auto thread_mma = tiled_mma0.get_thread_slice(thread_idx);
+    auto thread0_mma = tiled_mma0.get_thread_slice(_0{});
+    Tensor cS = cute::make_identity_tensor(Shape<Int<kBlockM>, Int<kBlockN>>{});
+    Tensor tScS = thread_mma.partition_C(cS);
+    Tensor tScS_rowcol = make_tensor(
+        tScS.data(),
+        hstu::convert_layout_acc_rowcol</*Transposed=*/false>(tScS.layout()));
+    Tensor t0ScS = thread0_mma.partition_C(cS);
+    Tensor t0ScS_rowcol = make_tensor(
+        t0ScS.data(),
+        hstu::convert_layout_acc_rowcol</*Transposed=*/false>(t0ScS.layout()));
+    int const thread_qdim_offset = get<Qdim>(tScS_rowcol(_0{}, _0{}));
+    int qdim_offset = params.scalar_scale
+        ? 0
+        : m_block * kBlockM + thread_qdim_offset + seqlen_info.offset_q;
+
+    if constexpr (Mma0_is_RS) {
+      using SmemCopyAtomQ = Copy_Atom<cute::SM75_U32x4_LDSM_N, Element>;
+      auto smem_tiled_copy_Q = make_tiled_copy_A(SmemCopyAtomQ{}, tiled_mma0);
+      auto smem_thr_copy_Q = smem_tiled_copy_Q.get_thread_slice(thread_idx);
+      Tensor tSrQ_copy_view = smem_thr_copy_Q.retile_D(tSrQ);
+      Tensor tSsQ_copy_view = smem_thr_copy_Q.partition_S(
+          cute::as_position_independent_swizzle_tensor(sQ));
+      cute::copy(smem_tiled_copy_Q, tSsQ_copy_view, tSrQ_copy_view);
+    }
+
+    Tensor tSrS =
+        partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{}));
+    Tensor tSrS_rowcol = make_tensor(
+        tSrS.data(),
+        hstu::convert_layout_acc_rowcol</*Transposed=*/false>(tSrS.layout()));
+    consumer_wait(pipeline_k, smem_pipe_read);
+    hstu::gemm</*zero_init=*/true, /*wg_wait=*/-1>(
+        tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read.index()), tSrS);
+    warpgroup_wait<0>();
+    pipeline_k.consumer_release(smem_pipe_read);
+    int const m_idx_max = (m_block + 1) * kBlockM;
+    if constexpr (Cross) {
+      mask.template apply<
+          false /*Seqlenq_mask*/,
+          true /*Seqlenk_mask*/,
+          Causal,
+          false /*Local*/,
+          false /*Contexual_mask*/,
+          false /*Target_mask*/,
+          Cross,
+          true /*Softmax*/>(tSrS, m_block, n_block);
+    } else {
+      if (m_idx_max <= seqlen_info.uihlen_q) {
+        mask.template apply<
+            false /*Seqlenq_mask*/,
+            false /*Seqlenk_mask*/,
+            Causal,
+            Local,
+            Contexual_mask,
+            false /*Target_mask*/,
+            Cross,
+            true /*Softmax*/>(tSrS, m_block, n_block);
+      } else if (
+          m_idx_max <=
+          cute::ceil_div(seqlen_info.uihlen_q, kBlockM) * kBlockM) {
+        mask.template apply<
+            false /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            Causal,
+            Local,
+            Contexual_mask,
+            Has_targets,
+            Cross,
+            true /*Softmax*/>(tSrS, m_block, n_block);
+      } else {
+        mask.template apply<
+            false /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            false /*Causal*/,
+            false,
+            Contexual_mask,
+            Has_targets,
+            Cross,
+            true /*Softmax*/>(tSrS, m_block, n_block);
+      }
+    }
+    Tensor scores_scale = softmax.template max_get_scale<
+        /*Is_first=*/true,
+        /*Check_inf=*/true>(tSrS);
+    softmax.template online_softmax</*Is_first=*/true, /*Check_inf=*/true>(
+        tSrS);
+    if constexpr (Is_FP8 && !V_colmajor) {
+      hstu::permute_Cregs_fp8(tSrS);
+    }
+    Tensor tOrP_acc = make_tensor(
+        tSrS.data(), hstu::convert_layout_acc_Aregs<TiledMma1>(tSrS.layout()));
+    Tensor tOrP = make_tensor_like<Element>(tOrP_acc);
+    convert_type_out(tOrP_acc, tOrP);
+    if constexpr (Is_FP8 && V_colmajor) {
+      hstu::permute_Aregs_fp8(tOrP);
+    }
+    if constexpr (!Mma1_is_RS) {
+      cute::copy(smem_tiled_copy_P, smem_thr_copy_P.retile_S(tOrP), tPsP);
+      cutlass::arch::fence_view_async_shared();
+      __syncwarp(); // Only need syncwarp since each warp is using its own P
+                    // values for Mma1
+    }
+    --n_block;
+
+    // Each step does gemm0 and softmax for iter n_block and gemm1 for prev
+    auto fwd_step_intra_warp_pipeline = [&](int const n_block,
+                                            auto mask_fn,
+                                            auto check_inf_type) {
+      static constexpr bool Check_inf = decltype(check_inf_type)::value;
+      PipelineState smem_pipe_read_v(
+          smem_pipe_read.index(),
+          smem_pipe_read.phase(),
+          smem_pipe_read.count());
+      ++smem_pipe_read;
+      Tensor tSrS =
+          partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{}));
+      Tensor tSrS_rowcol = make_tensor(
+          tSrS.data(),
+          hstu::convert_layout_acc_rowcol</*Transposed=*/false>(tSrS.layout()));
+      if (!UseSchedulerBarrier || warp_group_idx == 0) {
+        consumer_wait(pipeline_k, smem_pipe_read);
+      }
+      warp_scheduler_barrier_sync();
+      hstu::gemm</*zero_init=*/true, /*wg_wait=*/-1>(
+          tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read.index()), tSrS);
+      if (!UseSchedulerBarrier || warp_group_idx == 0) {
+        consumer_wait(pipeline_v, smem_pipe_read_v);
+      }
+      hstu::gemm</*zero_init=*/false, /*wg_wait=*/-1>(
+          tiled_mma1,
+          cute::conditional_return<Mma1_is_RS>(tOrP, tOsP),
+          tOrV(_, _, _, smem_pipe_read_v.index()),
+          tOrO);
+      warp_scheduler_barrier_arrive();
+      warpgroup_wait<1>();
+      pipeline_k.consumer_release(smem_pipe_read); // release K
+      mask_fn(tSrS, n_block);
+      cute::copy(
+          softmax.template max_get_scale</*Is_first=*/false, Check_inf>(tSrS),
+          scores_scale);
+      softmax.template online_softmax</*Is_first=*/false, Check_inf>(tSrS);
+      warpgroup_wait<0>();
+      pipeline_v.consumer_release(smem_pipe_read_v); // release V
+      if constexpr (Is_FP8 && !V_colmajor) {
+        hstu::permute_Cregs_fp8(tSrS);
+      }
+      convert_type_out(make_tensor(tSrS.data(), tOrP.layout()), tOrP);
+      if constexpr (Is_FP8 && V_colmajor) {
+        hstu::permute_Aregs_fp8(tOrP);
+      }
+      softmax.rescale_o(tOrO, scores_scale);
+      if constexpr (!Mma1_is_RS) {
+        cute::copy(smem_tiled_copy_P, smem_thr_copy_P.retile_S(tOrP), tPsP);
+      }
+      if constexpr (!Mma1_is_RS) {
+        cutlass::arch::fence_view_async_shared();
+        __syncwarp();
+      }
+    };
+
+    if constexpr (Cross) {
+      if constexpr (Causal) {
+        if (m_idx_max <=
+            cute::ceil_div(seqlen_info.uihlen_q, kBlockM) * kBlockM) {
+          auto mask_fn = [&](auto& tSrS, int n_block) {
+            mask.template apply<
+                false /*Seqlenq_mask*/,
+                false /*Seqlenk_mask*/,
+                Causal,
+                false /*Local*/,
+                false /*Contexual_mask*/,
+                false /*Target_mask*/,
+                Cross,
+                true /*Softmax*/>(tSrS, m_block, n_block);
+          };
+          int const m_idx_min = m_block * kBlockM;
+          int const n_block_min_causal_mask = std::max(
+              n_block_min,
+              (m_idx_min + seqlen_info.seqlen_kv - seqlen_info.uihlen_q) /
+                  kBlockN);
+#pragma unroll 1
+          for (; n_block >= n_block_min_causal_mask; --n_block) {
+            fwd_step_intra_warp_pipeline(n_block, mask_fn, cute::true_type{});
+          }
+        }
+      }
+      auto no_mask_fn = [](auto& tSrS, int n_block) {};
+#pragma unroll 1
+      for (; n_block >= n_block_min; --n_block) {
+        fwd_step_intra_warp_pipeline(n_block, no_mask_fn, cute::false_type{});
+      }
+    } else {
+      if constexpr (Causal || Local) { // Separate iterations with causal
+                                       // or local masking
+        if (m_idx_max <=
+            cute::ceil_div(seqlen_info.uihlen_q, kBlockM) * kBlockM) {
+          auto mask_fn = [&](auto& tSrS, int n_block) {
+            mask.template apply<
+                false /*Seqlenq_mask*/,
+                false /*Seqlenk_mask*/,
+                Causal,
+                Local,
+                Contexual_mask,
+                false /*Has_targets*/,
+                Cross,
+                true /*Softmax*/>(tSrS, m_block, n_block);
+          };
+          int const m_idx_min = m_block * kBlockM;
+          int const n_block_min_causal_local_mask =
+              std::max(n_block_min, m_idx_min / kBlockN);
+#pragma unroll 1
+          for (; n_block >= n_block_min_causal_local_mask; --n_block) {
+            fwd_step_intra_warp_pipeline(n_block, mask_fn, cute::true_type{});
+          }
+        }
+      }
+      int n_block_min_before_local_mask = n_block_min;
+      if constexpr (Local) {
+        if (m_idx_max <=
+            cute::ceil_div(
+                seqlen_info.uihlen_q - min_full_attn_seq_len_, kBlockM) *
+                kBlockM) {
+          n_block_min_before_local_mask = std::max(
+              n_block_min, cute::ceil_div(m_idx_max - max_attn_len_, kBlockN));
+        }
+      }
+      auto no_mask_fn = [](auto& tSrS, int n_block) {};
+#pragma unroll 1
+      for (; n_block >= n_block_min_before_local_mask; --n_block) {
+        fwd_step_intra_warp_pipeline(n_block, no_mask_fn, cute::false_type{});
+      }
+      // Separate masking iterations on the left for local attention
+      if constexpr (Local) {
+        auto local_mask_fn = [&](auto& tSrS, int n_block) {
+          mask.template apply<
+              false /*Seqlenq_mask*/,
+              false /*Seqlenk_mask*/,
+              false /*Causal_mask*/,
+              Local,
+              Contexual_mask,
+              false /*Has_targets*/,
+              Cross,
+              true /*Softmax*/>(tSrS, m_block, n_block);
+        };
+#pragma unroll 1
+        for (; n_block >= n_block_min; --n_block) {
+          fwd_step_intra_warp_pipeline(
+              n_block, local_mask_fn, cute::true_type{});
+        }
+      }
+      // Target part GEMM
+      if constexpr (Has_targets) {
+        auto [target_n_block_min, target_n_block_max] =
+            get_target_n_block_min_max(
+                n_block_max,
+                seqlen_info.uihlen_q,
+                seqlen_info.seqlen_kv,
+                m_block);
+        auto target_mask_fn = [&](auto& tSrS, int n_block) {
+          mask.template apply<
+              false /*Seqlenq_mask*/,
+              true /*Seqlenk_mask*/,
+              false /*Causal_mask*/,
+              false /*Local*/,
+              Contexual_mask,
+              Has_targets,
+              Cross,
+              true /*Softmax*/>(tSrS, m_block, n_block);
+        };
+#pragma unroll 1
+        for (n_block = target_n_block_max - 1; n_block >= target_n_block_min;
+             --n_block) {
+          fwd_step_intra_warp_pipeline(
+              n_block, target_mask_fn, cute::true_type{});
+        }
+      }
+      if constexpr (Contexual_mask) {
+        int contexual_n_block_max = get_contexual_n_block_max(
+            n_block_min,
+            min_full_attn_seq_len_,
+            contextual_seq_len_,
+            seqlen_info.uihlen_q,
+            m_block);
+        auto contexual_mask_fn = [&](auto& tSrS, int n_block) {
+          mask.template apply<
+              false /*Seqlenq_mask*/,
+              true /*Seqlenk_mask*/,
+              false /*Causal_mask*/,
+              Local,
+              Contexual_mask,
+              Has_targets,
+              Cross,
+              true /*Softmax*/>(tSrS, m_block, n_block);
+        };
+#pragma unroll 1
+        for (n_block = contexual_n_block_max - 1; n_block >= 0; --n_block) {
+          fwd_step_intra_warp_pipeline(
+              n_block, contexual_mask_fn, cute::true_type{});
+        }
+      }
+    }
+    // Tell producers that smem_q is ready
+    cutlass::arch::NamedBarrier::arrive(
+        NumMmaThreads + cutlass::NumThreadsPerWarp,
+        static_cast<uint32_t>(FwdNamedBarriers::QueryEmpty) /*id*/);
+    consumer_wait(pipeline_v, smem_pipe_read);
+    hstu::gemm</*zero_init=*/false, /*wg_wait=*/-1>(
+        tiled_mma1,
+        cute::conditional_return<Mma1_is_RS>(tOrP, tOsP),
+        tOrV(_, _, _, smem_pipe_read.index()),
+        tOrO);
+    cute::copy(softmax.finalize(1.0f), scores_scale);
+    warpgroup_wait<0>();
+    pipeline_v.consumer_release(
+        smem_pipe_read); // release V, otherwise producers will hang
+    softmax.rescale_o(tOrO, scores_scale);
+    if constexpr (Is_FP8 && !V_colmajor) {
+      hstu::permute_output_fp8(tOrO);
+    }
+    ++smem_pipe_read;
+    ++work_idx;
+    return true;
+  }
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mask.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mask.h
new file mode 100644
index 000000000..e35af5193
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mask.h
@@ -0,0 +1,396 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <cute/tensor.hpp>
+
+#include "utils.h"
+
+namespace hstu {
+
+using namespace cute;
+
+template <int BlockM, int BlockN, typename TiledMma, bool SwapAB = false>
+struct Mask {
+  int const thread_idx;
+  int const max_q_len;
+  int const max_kv_len;
+  int const max_attn_len;
+  int const min_full_attn_seq_len;
+  int const contextual_seq_len;
+  int const max_uih_len;
+
+  CUTLASS_DEVICE
+  Mask(
+      const int thread_idx,
+      const int max_q_len,
+      const int max_kv_len,
+      const int max_attn_len,
+      const int min_full_attn_seq_len,
+      const int contextual_seq_len,
+      const int max_uih_len)
+      : thread_idx(thread_idx),
+        max_q_len(max_q_len),
+        max_kv_len(max_kv_len),
+        max_attn_len(max_attn_len),
+        min_full_attn_seq_len(min_full_attn_seq_len),
+        contextual_seq_len(contextual_seq_len),
+        max_uih_len(max_uih_len) {};
+
+  template <
+      bool Seqlenq_mask = false,
+      bool Seqlenk_mask = false,
+      bool Causal_mask = false,
+      bool Local_mask = false,
+      bool Contexual_mask = false,
+      bool Target_mask = false, // If Target_mask, Seqlenk_mask will be disabled
+      bool Cross = false,
+      bool Softmax = false,
+      typename Engine,
+      typename Layout>
+  CUTLASS_DEVICE void apply(
+      Tensor<Engine, Layout>& tSrS,
+      const int m_block,
+      const int n_block) const {
+    static_assert(
+        !(Causal_mask && Local_mask), "Cannot be both causal and local");
+    static_assert(Layout::rank == 3, "Only support 3D Tensor");
+    if constexpr (Cross) {
+      static_assert(
+          (!Local_mask) && (!Contexual_mask) && (!Target_mask),
+          "Local, contexual, and target masks not supported under cross attention");
+    }
+    if (!Seqlenq_mask && !Seqlenk_mask && !Causal_mask && !Local_mask &&
+        !Target_mask) {
+      return;
+    }
+
+    auto thread_mma = TiledMma{}.get_thread_slice(thread_idx);
+    auto thread0_mma = TiledMma{}.get_thread_slice(_0{});
+
+    static constexpr int Qdim = !SwapAB ? 0 : 1, Kdim = !SwapAB ? 1 : 0;
+
+    Tensor cS = cute::make_identity_tensor(
+        Shape<
+            Int<!SwapAB ? BlockM : BlockN>,
+            Int<!SwapAB ? BlockN : BlockM>>{});
+    Tensor tScS = thread_mma.partition_C(cS);
+    Tensor tSrS_rowcol = make_tensor(
+        tSrS.data(),
+        hstu::convert_layout_acc_rowcol</*Transposed=*/SwapAB>(tSrS.layout()));
+    Tensor tScS_rowcol = make_tensor(
+        tScS.data(),
+        hstu::convert_layout_acc_rowcol</*Transposed=*/SwapAB>(tScS.layout()));
+    Tensor t0ScS = thread0_mma.partition_C(cS);
+    Tensor t0ScS_rowcol = make_tensor(
+        t0ScS.data(),
+        hstu::convert_layout_acc_rowcol</*Transposed=*/SwapAB>(t0ScS.layout()));
+    // We want to use the col indices of thread0 to compare, since that is known
+    // at compile time. So we subtract the limit by the first col index of this
+    // thread
+    int const thread_kdim_offset = get<Kdim>(tScS_rowcol(_0{}, _0{}));
+    int const thread_qdim_offset = get<Qdim>(tScS_rowcol(_0{}, _0{}));
+    int const seqlen_k_limit =
+        max_kv_len - n_block * BlockN - thread_kdim_offset;
+    int const uihlen_k_limit =
+        max_uih_len - n_block * BlockN - thread_kdim_offset;
+    int const seqlen_q_limit =
+        max_q_len - m_block * BlockM - thread_qdim_offset;
+    if constexpr (Seqlenq_mask) {
+#pragma unroll
+      for (int m = 0; m < size<0>(tSrS_rowcol); ++m) {
+        if (int(get<Qdim>(t0ScS_rowcol(m, _0{}))) >= seqlen_q_limit) {
+#pragma unroll
+          for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
+            if constexpr (Softmax) {
+              tSrS_rowcol(m, n) = -INFINITY;
+            } else {
+              tSrS_rowcol(m, n) = 0.0f;
+            }
+          }
+        }
+      }
+    }
+    if constexpr (Cross) {
+      if constexpr (!Causal_mask) {
+        if constexpr (Seqlenk_mask) {
+#pragma unroll
+          for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
+            int const t0_col_idx = int(get<Kdim>(t0ScS_rowcol(_0{}, n)));
+            if (t0_col_idx >= seqlen_k_limit) {
+#pragma unroll
+              for (int m = 0; m < size<0>(tSrS_rowcol); ++m) {
+                if constexpr (Softmax) {
+                  tSrS_rowcol(m, n) = -INFINITY;
+                } else {
+                  tSrS_rowcol(m, n) = 0.0f;
+                }
+              }
+            }
+          }
+        }
+      } else {
+        int const causal_row_offset = max_kv_len - max_uih_len + 1 -
+            n_block * BlockN - thread_kdim_offset;
+#pragma unroll
+        for (int m = 0; m < size<0>(tSrS_rowcol); ++m) {
+          if constexpr (Seqlenq_mask) {
+            if (int(get<Qdim>(t0ScS_rowcol(m, _0{}))) >= seqlen_q_limit) {
+              continue;
+            }
+          }
+          int const row_idx = get<Qdim>(t0ScS_rowcol(m, _0{})) +
+              m_block * BlockM + thread_qdim_offset;
+          int const col_limit_right = !Seqlenk_mask
+              ? row_idx + causal_row_offset
+              : __viaddmin_s32(row_idx, causal_row_offset, seqlen_k_limit);
+#pragma unroll
+          for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
+            int const t0_col_idx = int(get<Kdim>(t0ScS_rowcol(_0{}, n)));
+            if (t0_col_idx >= col_limit_right) {
+              if constexpr (Softmax) {
+                tSrS_rowcol(m, n) = -INFINITY;
+              } else {
+                tSrS_rowcol(m, n) = 0.0f;
+              }
+            }
+          }
+        }
+      }
+    } else {
+      if constexpr (!Causal_mask && !Local_mask) {
+        if constexpr (Seqlenk_mask || Target_mask) {
+#pragma unroll
+          for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
+            int const t0_col_idx = int(get<Kdim>(t0ScS_rowcol(_0{}, n)));
+            if constexpr (Target_mask) {
+              if (t0_col_idx >= uihlen_k_limit) {
+                bool const oob_predicate = (t0_col_idx >= seqlen_k_limit);
+                int const col_offset =
+                    t0_col_idx - seqlen_k_limit + seqlen_q_limit;
+#pragma unroll
+                for (int m = 0; m < size<0>(tSrS_rowcol); ++m) {
+                  int const t0_row_idx = int(get<Qdim>(t0ScS_rowcol(m, _0{})));
+                  if ((t0_row_idx != col_offset) || oob_predicate) {
+                    if constexpr (Softmax) {
+                      tSrS_rowcol(m, n) = -INFINITY;
+                    } else {
+                      tSrS_rowcol(m, n) = 0.0f;
+                    }
+                  }
+                }
+              }
+            } else if constexpr (Seqlenk_mask) {
+              if (t0_col_idx >= seqlen_k_limit) {
+#pragma unroll
+                for (int m = 0; m < size<0>(tSrS_rowcol); ++m) {
+                  if constexpr (Softmax) {
+                    tSrS_rowcol(m, n) = -INFINITY;
+                  } else {
+                    tSrS_rowcol(m, n) = 0.0f;
+                  }
+                }
+              }
+            }
+          }
+        }
+      } else { // Causal_mask or Local_mask
+        int const causal_row_offset = 1 - n_block * BlockN - thread_kdim_offset;
+        if constexpr (Causal_mask) {
+#pragma unroll
+          for (int m = 0; m < size<0>(tSrS_rowcol); ++m) {
+            if constexpr (Seqlenq_mask) {
+              if (int(get<Qdim>(t0ScS_rowcol(m, _0{}))) >= seqlen_q_limit) {
+                continue;
+              }
+            }
+            if constexpr (Contexual_mask) {
+              if (int(get<Qdim>(t0ScS_rowcol(m, _0{}))) <
+                  contextual_seq_len - m_block * BlockM - thread_qdim_offset) {
+#pragma unroll
+                for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
+                  int const t0_col_idx = int(get<Kdim>(t0ScS_rowcol(_0{}, n)));
+                  if (t0_col_idx >= uihlen_k_limit) {
+                    if constexpr (Softmax) {
+                      tSrS_rowcol(m, n) = -INFINITY;
+                    } else {
+                      tSrS_rowcol(m, n) = 0.0f;
+                    }
+                  }
+                }
+                continue;
+              }
+            }
+            int const row_idx = get<Qdim>(t0ScS_rowcol(m, _0{})) +
+                m_block * BlockM + thread_qdim_offset;
+            if constexpr (!Target_mask) {
+              int const col_limit_right = !Seqlenk_mask
+                  ? row_idx + causal_row_offset
+                  : __viaddmin_s32(row_idx, causal_row_offset, seqlen_k_limit);
+#pragma unroll
+              for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
+                int const t0_col_idx = int(get<Kdim>(t0ScS_rowcol(_0{}, n)));
+                if (t0_col_idx >= col_limit_right) {
+                  if constexpr (Softmax) {
+                    tSrS_rowcol(m, n) = -INFINITY;
+                  } else {
+                    tSrS_rowcol(m, n) = 0.0f;
+                  }
+                }
+              }
+            } else {
+#pragma unroll
+              for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
+                int const t0_col_idx = int(get<Kdim>(t0ScS_rowcol(_0{}, n)));
+                int const col_idx =
+                    t0_col_idx + n_block * BlockN + thread_kdim_offset;
+                bool const uih_cond =
+                    (t0_col_idx >= row_idx + causal_row_offset) &&
+                    (row_idx < max_uih_len);
+                bool const target_cond = (row_idx != col_idx) &&
+                    (row_idx >= max_uih_len) && (col_idx >= max_uih_len);
+                bool const seqlen_k_cond = (t0_col_idx >= seqlen_k_limit);
+                if (uih_cond || target_cond || seqlen_k_cond) {
+                  if constexpr (Softmax) {
+                    tSrS_rowcol(m, n) = -INFINITY;
+                  } else {
+                    tSrS_rowcol(m, n) = 0.0f;
+                  }
+                }
+              }
+            }
+          }
+        } else { // Local_mask
+          int const local_row_offset_left =
+              causal_row_offset - 1 - max_attn_len;
+          int const col_limit_sink = 0 - n_block * BlockN - thread_kdim_offset;
+#pragma unroll
+          for (int m = 0; m < size<0>(tSrS_rowcol); ++m) {
+            if constexpr (Seqlenq_mask) {
+              if (int(get<Qdim>(t0ScS_rowcol(m, _0{}))) >= seqlen_q_limit) {
+                continue;
+              }
+            }
+            if constexpr (Contexual_mask) {
+              if (int(get<Qdim>(t0ScS_rowcol(m, _0{}))) <
+                  contextual_seq_len - m_block * BlockM - thread_qdim_offset) {
+#pragma unroll
+                for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
+                  int const t0_col_idx = int(get<Kdim>(t0ScS_rowcol(_0{}, n)));
+                  if (t0_col_idx >= uihlen_k_limit) {
+                    if constexpr (Softmax) {
+                      tSrS_rowcol(m, n) = -INFINITY;
+                    } else {
+                      tSrS_rowcol(m, n) = 0.0f;
+                    }
+                  }
+                }
+                continue;
+              }
+            }
+            int const row_idx = get<Qdim>(t0ScS_rowcol(m, _0{})) +
+                m_block * BlockM + thread_qdim_offset;
+            int col_limit_left = row_idx + local_row_offset_left;
+            if constexpr (Contexual_mask) {
+              // row contexual without sink
+              if (col_limit_left + n_block * BlockN + thread_kdim_offset <
+                  contextual_seq_len) {
+                col_limit_left = 0;
+              }
+            }
+            if constexpr (!Target_mask) {
+              int const col_limit_right = !Seqlenk_mask
+                  ? row_idx + causal_row_offset
+                  : __viaddmin_s32(row_idx, causal_row_offset, seqlen_k_limit);
+#pragma unroll
+              for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
+                int const t0_col_idx = int(get<Kdim>(t0ScS_rowcol(m, n)));
+                if (row_idx < max_uih_len - min_full_attn_seq_len) {
+                  bool const local_left_cond = Contexual_mask
+                      ? (t0_col_idx < col_limit_left &&
+                         t0_col_idx >= col_limit_sink)
+                      : (t0_col_idx < col_limit_left);
+                  if (local_left_cond) {
+                    if constexpr (Softmax) {
+                      tSrS_rowcol(m, n) = -INFINITY;
+                    } else {
+                      tSrS_rowcol(m, n) = 0.0f;
+                    }
+                  }
+                }
+                if (t0_col_idx >= col_limit_right) {
+                  if constexpr (Softmax) {
+                    tSrS_rowcol(m, n) = -INFINITY;
+                  } else {
+                    tSrS_rowcol(m, n) = 0.0f;
+                  }
+                }
+              }
+            } else {
+              int const col_limit_right = row_idx + causal_row_offset;
+#pragma unroll
+              for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
+                int const t0_col_idx = int(get<Kdim>(t0ScS_rowcol(_0{}, n)));
+                if (row_idx < max_uih_len) {
+                  if (row_idx < max_uih_len - min_full_attn_seq_len) {
+                    bool const local_left_cond = Contexual_mask
+                        ? (t0_col_idx < col_limit_left &&
+                           t0_col_idx >= col_limit_sink)
+                        : (t0_col_idx < col_limit_left);
+                    if (local_left_cond) {
+                      if constexpr (Softmax) {
+                        tSrS_rowcol(m, n) = -INFINITY;
+                      } else {
+                        tSrS_rowcol(m, n) = 0.0f;
+                      }
+                    }
+                  }
+                  if (t0_col_idx >= col_limit_right) {
+                    if constexpr (Softmax) {
+                      tSrS_rowcol(m, n) = -INFINITY;
+                    } else {
+                      tSrS_rowcol(m, n) = 0.0f;
+                    }
+                  }
+                } else {
+                  int const col_idx =
+                      t0_col_idx + n_block * BlockN + thread_kdim_offset;
+                  bool const target_cond = (row_idx != col_idx) &&
+                      (row_idx >= max_uih_len) && (col_idx >= max_uih_len);
+                  bool const seqlen_k_cond = (t0_col_idx >= seqlen_k_limit);
+                  if (target_cond || seqlen_k_cond) {
+                    if constexpr (Softmax) {
+                      tSrS_rowcol(m, n) = -INFINITY;
+                    } else {
+                      tSrS_rowcol(m, n) = 0.0f;
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  };
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/named_barrier.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/named_barrier.h
new file mode 100644
index 000000000..79dce0dd9
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/named_barrier.h
@@ -0,0 +1,101 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include "cutlass/arch/barrier.h"
+
+namespace hstu {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// cutlass::arch::NamedBarrier::sync/arrive are only enabled Sm90 even though
+// they work for Sm80 as well. We reimplement them here, enabled for both Sm90
+// and Sm80.
+
+CUTLASS_DEVICE
+static void named_barrier_sync(uint32_t num_threads, uint32_t barrier_id_) {
+  static constexpr uint32_t ReservedNamedBarrierCount = static_cast<uint32_t>(
+      cutlass::arch::ReservedNamedBarriers::FirstUserBarrier);
+  uint32_t barrier_id = barrier_id_ + ReservedNamedBarrierCount;
+  asm volatile("bar.sync %0, %1;" : : "r"(barrier_id), "r"(num_threads));
+  cutlass::arch::synclog_emit_named_barrier_arrive_and_wait(
+      __LINE__, num_threads, barrier_id);
+}
+
+CUTLASS_DEVICE
+static void named_barrier_sync(
+    uint32_t num_threads,
+    cutlass::arch::ReservedNamedBarriers reserved_named_barriers) {
+  uint32_t barrier_id = static_cast<uint32_t>(reserved_named_barriers);
+  asm volatile("bar.sync %0, %1;" : : "r"(barrier_id), "r"(num_threads));
+  cutlass::arch::synclog_emit_named_barrier_arrive_and_wait(
+      __LINE__, num_threads, barrier_id);
+}
+
+CUTLASS_DEVICE
+static void named_barrier_arrive(uint32_t num_threads, uint32_t barrier_id_) {
+  static constexpr uint32_t ReservedNamedBarrierCount = static_cast<uint32_t>(
+      cutlass::arch::ReservedNamedBarriers::FirstUserBarrier);
+  uint32_t barrier_id = barrier_id_ + ReservedNamedBarrierCount;
+  cutlass::arch::synclog_emit_named_barrier_arrive(
+      __LINE__, num_threads, barrier_id);
+  asm volatile("bar.arrive %0, %1;" : : "r"(barrier_id), "r"(num_threads));
+}
+
+CUTLASS_DEVICE
+static void named_barrier_arrive(
+    uint32_t num_threads,
+    cutlass::arch::ReservedNamedBarriers reserved_named_barriers) {
+  uint32_t barrier_id = static_cast<uint32_t>(reserved_named_barriers);
+  cutlass::arch::synclog_emit_named_barrier_arrive(
+      __LINE__, num_threads, barrier_id);
+  asm volatile("bar.arrive %0, %1;" : : "r"(barrier_id), "r"(num_threads));
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Enumerates the reserved named barriers to avoid potential conflicts
+
+enum class FwdNamedBarriers {
+  QueryEmpty = 0,
+  ProducerWG = 1,
+  TileCountSmemEmpty = 2,
+  TileCountSmemFull = 3,
+  WarpSchedulerWG1 = 4,
+  WarpSchedulerWG2 = 5,
+  WarpSchedulerWG3 = 6,
+};
+
+enum class BwdNamedBarriers {
+  KVEmpty = 0,
+  PdS = 1,
+  // This needs to match FwdNamedBarriers::TileCountSmemEmpty since
+  // TileScheduler uses it
+  TileCountSmemEmpty = 2,
+  TileCountSmemFull = 3,
+  dQEmptyWG1 = 4,
+  dQEmptyWG2 = 5,
+  dQEmptyWG3 = 6,
+  dQFullWG1 = 7,
+  dQFullWG2 = 8,
+  dQFullWG3 = 9,
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/seqlen.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/seqlen.h
new file mode 100644
index 000000000..c5721b272
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/seqlen.h
@@ -0,0 +1,134 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <cute/tensor.hpp>
+
+namespace hstu {
+
+// We consolidate all the info related to sequence length here. This is so that
+// we can do all the gmem reads once at the beginning of each tile, rather than
+// having to repeat these reads to compute various things like n_block_min,
+// n_block_max, etc.
+
+template <bool Jagged, int kBlock>
+struct SeqlenInfo {
+  int const offset, offset_padded;
+  int const seqlen;
+
+  CUTLASS_DEVICE
+  SeqlenInfo(
+      int const bidb,
+      int const seqlen_static,
+      int const* const seq_offsets)
+      : offset(!Jagged ? 0 : seq_offsets[bidb]),
+        offset_padded(
+            !Jagged ? 0
+                    : (seq_offsets[bidb] + bidb * kBlock) / kBlock * kBlock),
+        seqlen(
+            !Jagged ? seqlen_static
+                    : (seq_offsets[bidb + 1] - seq_offsets[bidb])) {}
+};
+
+template <bool Jagged, bool Cross, bool Has_targets, int kBlockM>
+struct SeqlenInfoQKBwd {
+  int const offset_q, offset_k, offset_q_padded;
+  int const seqlen_q, seqlen_kv, uihlen_q;
+
+  CUTLASS_DEVICE
+  SeqlenInfoQKBwd(
+      int const bidb,
+      int const max_q_len,
+      int const max_kv_len,
+      int const* const seq_offsets,
+      int const* const seq_offsets_q,
+      int const* const num_targets)
+      : offset_q(
+            !Jagged ? 0 : (Cross ? seq_offsets_q[bidb] : seq_offsets[bidb])),
+        offset_k(!Jagged ? 0 : seq_offsets[bidb])
+        // If jagged, the layout for dQaccum is that we pad
+        // each sequence in the batch by an extra kBlockM, so that the write for
+        // each sequence doesn't touch the next sequence. Sequence i starts at
+        // seq_offsets[i] + i * kBlockM and ends at seq_offsets[i + 1] + i *
+        // kBlockM However, the start must align to multiples of kBlockM.
+        ,
+        offset_q_padded(
+            !Jagged ? 0
+                : Cross
+                ? ((seq_offsets_q[bidb] + bidb * kBlockM) / kBlockM * kBlockM)
+                : ((seq_offsets[bidb] + bidb * kBlockM) / kBlockM * kBlockM)),
+        seqlen_q(
+            !Jagged ? max_q_len
+                    : (Cross ? (seq_offsets_q[bidb + 1] - seq_offsets_q[bidb])
+                             : (seq_offsets[bidb + 1] - seq_offsets[bidb]))),
+        seqlen_kv(
+            !Jagged ? max_kv_len : (seq_offsets[bidb + 1] - seq_offsets[bidb])),
+        uihlen_q(
+            !Jagged
+                ? (Has_targets ? max_q_len - num_targets[bidb] : max_q_len)
+                : (Has_targets
+                       ? (Cross ? (seq_offsets_q[bidb + 1] -
+                                   seq_offsets_q[bidb] - num_targets[bidb])
+                                : (seq_offsets[bidb + 1] - seq_offsets[bidb] -
+                                   num_targets[bidb]))
+                       : (Cross
+                              ? (seq_offsets_q[bidb + 1] - seq_offsets_q[bidb])
+                              : (seq_offsets[bidb + 1] - seq_offsets[bidb])))) {
+  }
+};
+
+template <bool Jagged, bool Cross, bool Has_targets>
+struct SeqlenInfoQKFwd {
+  int const offset_q, offset_k;
+  int const seqlen_q, seqlen_kv, uihlen_q;
+
+  CUTLASS_DEVICE
+  SeqlenInfoQKFwd(
+      int const bidb,
+      int const max_q_len,
+      int const max_kv_len,
+      int const* const seq_offsets,
+      int const* const seq_offsets_q,
+      int const* const num_targets)
+      : offset_q(
+            !Jagged ? 0 : (Cross ? seq_offsets_q[bidb] : seq_offsets[bidb])),
+        offset_k(!Jagged ? 0 : seq_offsets[bidb]),
+        seqlen_q(
+            !Jagged ? max_q_len
+                    : (Cross ? (seq_offsets_q[bidb + 1] - seq_offsets_q[bidb])
+                             : (seq_offsets[bidb + 1] - seq_offsets[bidb]))),
+        seqlen_kv(
+            !Jagged ? max_kv_len : (seq_offsets[bidb + 1] - seq_offsets[bidb])),
+        uihlen_q(
+            !Jagged
+                ? (Has_targets ? max_q_len - num_targets[bidb] : max_q_len)
+                : (Has_targets
+                       ? (Cross ? (seq_offsets_q[bidb + 1] -
+                                   seq_offsets_q[bidb] - num_targets[bidb])
+                                : (seq_offsets[bidb + 1] - seq_offsets[bidb] -
+                                   num_targets[bidb]))
+                       : (Cross
+                              ? (seq_offsets_q[bidb + 1] - seq_offsets_q[bidb])
+                              : (seq_offsets[bidb + 1] - seq_offsets[bidb])))) {
+  }
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/sm90_pipeline_no_cluster.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/sm90_pipeline_no_cluster.h
new file mode 100644
index 000000000..b6428e79c
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/sm90_pipeline_no_cluster.h
@@ -0,0 +1,150 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <cutlass/pipeline/sm90_pipeline.hpp>
+
+namespace cutlass {
+
+using namespace cute;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// As of Cutlass v3.6.0, if size(ClusterShape) == 1, PipelineTmaAsync has all
+// threads signaling the barrier during consumer_release. This causes a perf
+// regression in FA3 forward pass (especially hdim 128 causal). We instead
+// reimplement the version of PipelineTmaAsync before v3.6.0 where only 1 out of
+// 128 threads signals the barrier.
+//
+// Assumption: params.num_consumers % NumThreadsPerWarpGroup == 0
+template <int Stages_, class Base = cutlass::PipelineTmaAsync<Stages_>>
+class PipelineTmaAsyncNoCluster : public Base {
+ public:
+  using FullBarrier = typename Base::FullBarrier;
+  using EmptyBarrier = typename Base::EmptyBarrier;
+  static constexpr uint32_t Stages = Stages_;
+  using PipelineState = typename Base::PipelineState;
+
+  using SharedStorage = typename Base::SharedStorage;
+  using ThreadCategory = typename Base::ThreadCategory;
+  using Params = typename Base::Params;
+
+  static CUTLASS_DEVICE void init_barriers(
+      SharedStorage& storage,
+      Params params) {
+    int warp_idx = canonical_warp_idx_sync();
+    bool is_initializing_warp = (warp_idx == 0);
+    if (is_initializing_warp) {
+      // Barrier FULL and EMPTY init
+      constexpr int producer_arv_cnt = 1;
+      uint32_t const num_consumer_warpgroups_per_cluster =
+          params.num_consumers / NumThreadsPerWarpGroup;
+      uint32_t const multicast_consumer_arrival_count =
+          num_consumer_warpgroups_per_cluster;
+
+      cutlass::arch::detail::initialize_barrier_array_pair_aligned<
+          decltype(storage.full_barrier_),
+          decltype(storage.empty_barrier_),
+          Stages>(
+          storage.full_barrier_,
+          storage.empty_barrier_,
+          producer_arv_cnt,
+          multicast_consumer_arrival_count);
+    }
+    cutlass::arch::fence_barrier_init();
+  }
+
+  template <class ClusterShape, class InitBarriers, class InitMasks>
+  CUTLASS_DEVICE PipelineTmaAsyncNoCluster(
+      SharedStorage& storage,
+      Params params,
+      ClusterShape cluster_shape,
+      InitBarriers = {},
+      InitMasks = {})
+      : Base(
+            storage,
+            params,
+            make_shape(_1{}, _1{}, _1{}) /*cluster_shape*/,
+            cute::false_type{} /*init_barriers*/,
+            cute::false_type{} /*init_masks*/),
+        empty_barrier_ptr_(&storage.empty_barrier_[0]) {
+    int warp_idx = canonical_warp_idx_sync();
+    int lane_predicate = cute::elect_one_sync();
+
+    static_assert(
+        cute::is_same_v<InitBarriers, cute::true_type> ||
+        cute::is_same_v<InitBarriers, cute::false_type>);
+    static_assert(
+        cute::is_same_v<InitMasks, cute::true_type> ||
+        cute::is_same_v<InitMasks, cute::false_type>);
+    if constexpr (cute::is_same_v<InitBarriers, cute::true_type>) {
+      init_barriers(storage, params);
+    }
+  }
+
+  // Constructor
+  template <class ClusterShape>
+  CUTLASS_DEVICE PipelineTmaAsyncNoCluster(
+      SharedStorage& storage,
+      Params params,
+      ClusterShape cluster_shape)
+      : PipelineTmaAsyncNoCluster(
+            storage,
+            params,
+            cluster_shape,
+            cute::true_type{},
+            cute::true_type{}) {}
+
+  template <class ClusterShape, class InitBarriers>
+  CUTLASS_DEVICE PipelineTmaAsyncNoCluster(
+      SharedStorage& storage,
+      Params params,
+      ClusterShape cluster_shape,
+      InitBarriers = {})
+      : PipelineTmaAsyncNoCluster(
+            storage,
+            params,
+            cluster_shape,
+            InitBarriers{},
+            cute::true_type{}) {}
+
+  CUTLASS_DEVICE
+  void consumer_release(PipelineState state) {
+    consumer_release(state.index());
+  }
+
+ private:
+  EmptyBarrier* const empty_barrier_ptr_ = nullptr;
+
+  // Consumer signalling Producer of completion
+  // Ensures all blocks in the Same Row and Column get notifed.
+  CUTLASS_DEVICE
+  void consumer_release(uint32_t stage, uint32_t skip = false) {
+    empty_barrier_ptr_[stage].arrive(
+        0 /*dst_blockid_*/,
+        uint32_t(threadIdx.x % cutlass::NumThreadsPerWarpGroup == 0) &
+            (!skip) /*is_signaling_thread*/);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // end namespace cutlass
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/softmax.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/softmax.h
new file mode 100644
index 000000000..1bd6131c6
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/softmax.h
@@ -0,0 +1,256 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <cute/tensor.hpp>
+#include <cutlass/numeric_types.h>
+#include <cmath>
+#include "utils.h"
+
+namespace hstu {
+
+using namespace cute;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    bool zero_init = true,
+    typename Engine0,
+    typename Layout0,
+    typename Engine1,
+    typename Layout1,
+    typename Operator>
+__device__ __forceinline__ void thread_reduce_(
+    Tensor<Engine0, Layout0> const& tensor,
+    Tensor<Engine1, Layout1>& summary,
+    Operator& op) {
+  static_assert(Layout0::rank == 2, "Only support 2D Tensor");
+  static_assert(Layout1::rank == 1, "Only support 1D Tensor");
+  CUTE_STATIC_ASSERT_V(size<0>(summary) == size<0>(tensor));
+#pragma unroll
+  for (int ni = 0; ni < size<1>(tensor); ni++) {
+#pragma unroll
+    for (int mi = 0; mi < size<0>(tensor); mi++) {
+      summary(mi) = zero_init && ni == 0 ? tensor(mi, ni)
+                                         : op(summary(mi), tensor(mi, ni));
+    }
+  }
+}
+
+template <
+    typename Engine0,
+    typename Layout0,
+    typename Engine1,
+    typename Layout1,
+    typename Operator>
+__device__ __forceinline__ void quad_allreduce_(
+    Tensor<Engine0, Layout0>& dst,
+    Tensor<Engine1, Layout1>& src,
+    Operator& op) {
+  CUTE_STATIC_ASSERT_V(size(dst) == size(src));
+#pragma unroll
+  for (int i = 0; i < size(dst); i++) {
+    dst(i) = Allreduce<4>::run(src(i), op);
+  }
+}
+
+template <
+    bool zero_init = true,
+    typename Engine0,
+    typename Layout0,
+    typename Engine1,
+    typename Layout1,
+    typename Operator>
+__device__ __forceinline__ void reduce_(
+    Tensor<Engine0, Layout0> const& tensor,
+    Tensor<Engine1, Layout1>& summary,
+    Operator& op) {
+  thread_reduce_<zero_init>(tensor, summary, op);
+  quad_allreduce_(summary, summary, op);
+}
+
+template <
+    bool zero_init = true,
+    typename Engine0,
+    typename Layout0,
+    typename Engine1,
+    typename Layout1>
+__device__ __forceinline__ void reduce_max(
+    Tensor<Engine0, Layout0> const& tensor,
+    Tensor<Engine1, Layout1>& max) {
+  MaxOp<float> max_op;
+  reduce_<zero_init>(tensor, max, max_op);
+}
+
+template <
+    bool zero_init = true,
+    bool warp_reduce = true,
+    typename Engine0,
+    typename Layout0,
+    typename Engine1,
+    typename Layout1>
+__device__ __forceinline__ void reduce_sum(
+    Tensor<Engine0, Layout0> const& tensor,
+    Tensor<Engine1, Layout1>& sum) {
+  SumOp<float> sum_op;
+  thread_reduce_<zero_init>(tensor, sum, sum_op);
+  if constexpr (warp_reduce) {
+    quad_allreduce_(sum, sum, sum_op);
+  }
+}
+
+// Apply the exp to all the elements.
+template <
+    bool Scale_max = true,
+    bool Check_inf = true,
+    int Max_offset = 0,
+    typename Engine0,
+    typename Layout0,
+    typename Engine1,
+    typename Layout1>
+__forceinline__ __device__ void scale_apply_exp2(
+    Tensor<Engine0, Layout0>& tensor,
+    Tensor<Engine1, Layout1> const& max,
+    const float scale) {
+  // For FP8, we can subtract max by 8.0 so that the value after exp2 is in the
+  // range of [0, 256]. This lets us use more of the FP8 range (instead of just
+  // [0, 1]) to reduce underflow.
+  static constexpr float max_offset =
+      float(Max_offset); // We can only template on int, not float
+  static_assert(Layout0::rank == 2, "Only support 2D Tensor");
+  static_assert(Layout1::rank == 1, "Only support 1D Tensor");
+  CUTE_STATIC_ASSERT_V(size<0>(max) == size<0>(tensor));
+#pragma unroll
+  for (int mi = 0; mi < size<0>(tensor); ++mi) {
+    // If max is -inf, then all elements must have been -inf (possibly due to
+    // masking). We don't want (-inf - (-inf)) since that would give NaN.
+    const float max_scaled = Check_inf
+        ? (max(mi) == -INFINITY
+               ? 0.f
+               : (!Scale_max ? max(mi) : max(mi) * scale) - max_offset)
+        : (!Scale_max ? max(mi) : max(mi) * scale) - max_offset;
+#pragma unroll
+    for (int ni = 0; ni < size<1>(tensor); ++ni) {
+      // Instead of computing exp(x - max), we compute exp2(x * log_2(e) -
+      // max * log_2(e)). This allows the compiler to use the ffma
+      // instruction instead of fadd and fmul separately.
+      tensor(mi, ni) = exp2f(tensor(mi, ni) * scale - max_scaled);
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <int kNRows, int Max_offset = 0>
+struct Softmax {
+  using TensorT = decltype(make_tensor<float>(Shape<Int<kNRows>>{}));
+  TensorT row_max, row_sum;
+  float const softmax_scale_log2;
+
+  CUTLASS_DEVICE Softmax(float const softmax_scale_log2_)
+      : softmax_scale_log2(softmax_scale_log2_) {};
+
+  template <bool Is_first, bool Check_inf = false, typename Tensor0>
+  __forceinline__ __device__ TensorT max_get_scale(Tensor0& acc_s) {
+    // Reshape acc_s from ((2, 2, V), MMA_M, MMA_N) to (nrow=(2, MMA_M),
+    // ncol=(2, V, MMA_N))
+    Tensor scores = make_tensor(
+        acc_s.data(), hstu::convert_layout_acc_rowcol(acc_s.layout()));
+    static_assert(CUTE_STATIC_V(size<0>(scores)) == kNRows);
+    TensorT scores_scale;
+    if constexpr (Is_first) {
+      hstu::template reduce_max</*zero_init=*/true>(scores, row_max);
+      cute::fill(scores_scale, 1.f);
+    } else {
+      Tensor scores_max_prev = make_fragment_like(row_max);
+      cute::copy(row_max, scores_max_prev);
+      hstu::template reduce_max</*zero_init=*/false>(scores, row_max);
+#pragma unroll
+      for (int mi = 0; mi < size(row_max); ++mi) {
+        float scores_max_cur = !Check_inf
+            ? row_max(mi)
+            : (row_max(mi) == -INFINITY ? 0.0f : row_max(mi));
+        scores_scale(mi) =
+            exp2f((scores_max_prev(mi) - scores_max_cur) * softmax_scale_log2);
+        row_sum(mi) *= scores_scale(mi);
+      }
+    }
+    return scores_scale;
+  };
+
+  template <bool Is_first, bool Check_inf = false, typename Tensor0>
+  __forceinline__ __device__ void online_softmax(Tensor0& acc_s) {
+    // Reshape acc_s from ((2, 2, V), MMA_M, MMA_N) to (nrow=(2, MMA_M),
+    // ncol=(2, V, MMA_N))
+    Tensor scores = make_tensor(
+        acc_s.data(), hstu::convert_layout_acc_rowcol(acc_s.layout()));
+    static_assert(CUTE_STATIC_V(size<0>(scores)) == kNRows);
+    hstu::template scale_apply_exp2</*Scale_max=*/true, Check_inf, Max_offset>(
+        scores, row_max, softmax_scale_log2);
+    // We don't do the reduce across threads here since we don't need to use the
+    // row_sum. We do that reduce at the end when we need to normalize the
+    // softmax.
+    hstu::reduce_sum</*zero_init=*/Is_first, /*warp_reduce=*/false>(
+        scores, row_sum);
+  };
+
+  __forceinline__ __device__ TensorT finalize(float const final_scale = 1.f) {
+    SumOp<float> sum_op;
+    quad_allreduce_(row_sum, row_sum, sum_op);
+    TensorT scores_scale;
+#pragma unroll
+    for (int mi = 0; mi < size(row_sum); ++mi) {
+      float sum = row_sum(mi);
+      float inv_sum = (sum == 0.f || sum != sum) ? 0.f : 1.f / sum;
+      scores_scale(mi) = inv_sum * final_scale;
+      // For FP8, we might have scaled the output of exp by 2**8 so we need to
+      // divide sum by that amount.
+      if constexpr (Max_offset != 0) {
+        static constexpr float sum_scale = 1.f / float(1 << Max_offset);
+        sum *= sum_scale;
+      }
+      row_sum(mi) = (sum == 0.f || sum != sum)
+          ? -INFINITY
+          : row_max(mi) * (softmax_scale_log2 * float(M_LN2)) + __logf(sum);
+    }
+    return scores_scale;
+  };
+
+  template <typename Tensor1>
+  __forceinline__ __device__ void rescale_o(
+      Tensor1& acc_o,
+      TensorT const& scores_scale) {
+    // Reshape acc_o from (MMA=4, MMA_M, MMA_K) to (nrow=(2, MMA_M), ncol=(2,
+    // MMA_K))
+    Tensor acc_o_rowcol = make_tensor(
+        acc_o.data(), hstu::convert_layout_acc_rowcol(acc_o.layout()));
+    static_assert(CUTE_STATIC_V(size<0>(acc_o_rowcol)) == kNRows);
+#pragma unroll
+    for (int mi = 0; mi < size<0>(acc_o_rowcol); ++mi) {
+#pragma unroll
+      for (int ni = 0; ni < size<1>(acc_o_rowcol); ++ni) {
+        acc_o_rowcol(mi, ni) *= scores_scale(mi);
+      }
+    }
+  };
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/static_switch.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/static_switch.h
new file mode 100644
index 000000000..c5759c9d2
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/static_switch.h
@@ -0,0 +1,135 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Inspired by
+// https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h
+// and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h
+
+#pragma once
+
+/// @param COND       - a boolean expression to switch by
+/// @param CONST_NAME - a name given for the constexpr bool variable.
+/// @param ...       - code to execute for true and false
+///
+/// Usage:
+/// ```
+/// BOOL_SWITCH(flag, BoolConst, [&] {
+///     some_function<BoolConst>(...);
+/// });
+/// ```
+//
+
+#define BOOL_SWITCH(COND, CONST_NAME, ...)      \
+  [&] {                                         \
+    if (COND) {                                 \
+      constexpr static bool CONST_NAME = true;  \
+      return __VA_ARGS__();                     \
+    } else {                                    \
+      constexpr static bool CONST_NAME = false; \
+      return __VA_ARGS__();                     \
+    }                                           \
+  }()
+
+#ifdef FLASHATTENTION_DISABLE_LOCAL
+#define CAUSAL_LOCAL_SWITCH(                                           \
+    CAUSAL_COND, LOCAL_COND, CAUSAL_CONST_NAME, LOCAL_CONST_NAME, ...) \
+  [&] {                                                                \
+    constexpr static bool LOCAL_CONST_NAME = false;                    \
+    if (CAUSAL_COND) {                                                 \
+      constexpr static bool CAUSAL_CONST_NAME = true;                  \
+      return __VA_ARGS__();                                            \
+    } else {                                                           \
+      constexpr static bool CAUSAL_CONST_NAME = false;                 \
+      return __VA_ARGS__();                                            \
+    }                                                                  \
+  }()
+#else
+#define CAUSAL_LOCAL_SWITCH(                                           \
+    CAUSAL_COND, LOCAL_COND, CAUSAL_CONST_NAME, LOCAL_CONST_NAME, ...) \
+  [&] {                                                                \
+    if (CAUSAL_COND) {                                                 \
+      constexpr static bool CAUSAL_CONST_NAME = true;                  \
+      constexpr static bool LOCAL_CONST_NAME = false;                  \
+      return __VA_ARGS__();                                            \
+    } else if (LOCAL_COND) {                                           \
+      constexpr static bool CAUSAL_CONST_NAME = false;                 \
+      constexpr static bool LOCAL_CONST_NAME = true;                   \
+      return __VA_ARGS__();                                            \
+    } else {                                                           \
+      constexpr static bool CAUSAL_CONST_NAME = false;                 \
+      constexpr static bool LOCAL_CONST_NAME = false;                  \
+      return __VA_ARGS__();                                            \
+    }                                                                  \
+  }()
+#endif
+
+#ifdef FLASHATTENTION_DISABLE_CLUSTER
+#define CLUSTER_SWITCH(COND, CONST_NAME, ...) \
+  [&] {                                       \
+    constexpr static bool CONST_NAME = false; \
+    return __VA_ARGS__();                     \
+  }()
+#else
+#define CLUSTER_SWITCH BOOL_SWITCH
+#endif
+
+// #ifdef FLASHATTENTION_DISABLE_SM8x
+#define ARCH_SWITCH(ARCH, ARCH_NAME, ...) \
+  [&] {                                   \
+    constexpr static int ARCH_NAME = 90;  \
+    return __VA_ARGS__();                 \
+  }()
+// #else
+// #define ARCH_SWITCH(ARCH, ARCH_NAME, ...)  \
+//   [&] {                                    \
+//     if (ARCH < 90) {                       \
+//       constexpr static int ARCH_NAME = 80; \
+//       return __VA_ARGS__();                \
+//     } else {                               \
+//       constexpr static int ARCH_NAME = 90; \
+//       return __VA_ARGS__();                \
+//     }                                      \
+//   }()
+// #endif
+
+#ifndef FLASHATTENTION_ENABLE_VCOLMAJOR
+#define VCOLMAJOR_SWITCH(COND, CONST_NAME, ...) \
+  [&] {                                         \
+    constexpr static bool CONST_NAME = false;   \
+    return __VA_ARGS__();                       \
+  }()
+#else
+#define VCOLMAJOR_SWITCH BOOL_SWITCH
+#endif
+
+#define HEADDIM_SWITCH(HEADDIM, ...)        \
+  [&] {                                     \
+    if (HEADDIM == 64) {                    \
+      constexpr static int kHeadSize = 64;  \
+      return __VA_ARGS__();                 \
+    } else if (HEADDIM == 96) {             \
+      constexpr static int kHeadSize = 96;  \
+      return __VA_ARGS__();                 \
+    } else if (HEADDIM == 128) {            \
+      constexpr static int kHeadSize = 128; \
+      return __VA_ARGS__();                 \
+    } else if (HEADDIM == 96) {             \
+      constexpr static int kHeadSize = 96;  \
+      return __VA_ARGS__();                 \
+    } else if (HEADDIM == 256) {            \
+      constexpr static int kHeadSize = 256; \
+      return __VA_ARGS__();                 \
+    }                                       \
+  }()
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/tile_scheduler.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/tile_scheduler.h
new file mode 100644
index 000000000..cde7837ce
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/tile_scheduler.h
@@ -0,0 +1,616 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include "cute/tensor.hpp"
+#include "cutlass/arch/barrier.h"
+#include "cutlass/fast_math.h"
+
+#include "named_barrier.h"
+
+namespace hstu {
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Host side kernel arguments
+struct TileSchedulerArguments {
+  int const num_blocks, num_head, num_batch;
+  int const max_seq_len, headdim,
+      element_size; // Used to calculate L2 swizzling
+  int* const tile_count_semaphore = nullptr;
+  int* const seq_offsets = nullptr;
+  int* const sort_by_length_indices = nullptr;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+    bool Jagged = false,
+    int kBlock = 128,
+    bool Sort_by_length_indices = false>
+class SingleTileScheduler {
+ public:
+  using SharedStorage = int;
+
+  // Device side kernel params
+  struct Params {
+    int const num_blocks, num_head, num_batch;
+    int const max_seq_len;
+    int* const seq_offsets;
+    int* const sort_by_length_indices;
+  };
+
+  static Params to_underlying_arguments(TileSchedulerArguments const& args) {
+    return {
+        args.num_blocks,
+        args.num_head,
+        args.num_batch,
+        args.max_seq_len,
+        !Jagged ? nullptr : args.seq_offsets,
+        !Sort_by_length_indices ? nullptr : args.sort_by_length_indices};
+  }
+
+  static dim3 get_grid_shape(Params const& params, int num_sm) {
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+    std::printf(
+        "SingleTileScheduler::get_grid_shape: %d, %d, %d\n",
+        params.num_blocks,
+        params.num_head,
+        params.num_batch);
+#endif
+    return {
+        uint32_t(params.num_blocks),
+        uint32_t(params.num_head),
+        uint32_t(params.num_batch)};
+  }
+
+  struct WorkTileInfo {
+    int block_idx = 0;
+    int bidh = 0;
+    int bidb = 0;
+    bool is_valid_tile = false;
+
+    CUTLASS_DEVICE
+    bool is_valid(Params const& params) const {
+      return is_valid_tile;
+    }
+
+    CUTLASS_DEVICE
+    cute::tuple<int32_t, int32_t, int32_t, int32_t> get_block_coord(
+        Params const& params) const {
+      return {block_idx, bidh, bidb, 0 /*split_idx*/};
+    }
+  };
+
+  CUTLASS_DEVICE
+  SingleTileScheduler(SharedStorage* const smem_scheduler) {}
+
+  template <bool IsProducerWarp = false>
+  CUTLASS_DEVICE WorkTileInfo get_initial_work(Params const& params) const {
+    int bidb = int(blockIdx.z);
+    if constexpr (Sort_by_length_indices) {
+      bidb = params.sort_by_length_indices[bidb];
+    }
+    WorkTileInfo work_info{int(blockIdx.x), int(blockIdx.y), bidb, true};
+    if constexpr (Jagged) {
+      int seqlen =
+          (params.seq_offsets ? params.seq_offsets[work_info.bidb + 1] -
+                   params.seq_offsets[work_info.bidb]
+                              : params.max_seq_len);
+      work_info.is_valid_tile = work_info.block_idx * kBlock < seqlen;
+    }
+    return work_info;
+  }
+
+  CUTLASS_DEVICE
+  void init_consumer() const {}
+
+  CUTLASS_DEVICE
+  void prefetch_next_work(Params const& params, WorkTileInfo& current_work)
+      const {}
+
+  template <bool IsProducerWarp = false>
+  CUTLASS_DEVICE WorkTileInfo
+  get_next_work(Params const& params, WorkTileInfo const& current_work) const {
+    return {-1, -1, -1, false};
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+class StaticPersistentTileScheduler {
+ public:
+  using SharedStorage = int;
+
+  // Device side kernel params
+  struct Params {
+    int total_blocks;
+    cutlass::FastDivmod m_block_divmod, head_divmod;
+    cutlass::FastDivmod nsplits_divmod;
+  };
+
+  static Params to_underlying_arguments(TileSchedulerArguments const& args) {
+    return {
+        args.num_blocks * args.num_head * args.num_batch,
+        cutlass::FastDivmod(args.num_blocks),
+        cutlass::FastDivmod(args.num_head),
+        cutlass::FastDivmod(1)};
+  }
+
+  static dim3 get_grid_shape(Params const& params, int num_sm) {
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+    std::printf("StaticPersistentTileScheduler::get_grid_shape %d\n", num_sm);
+#endif
+    return {uint32_t(num_sm)};
+  }
+
+  struct WorkTileInfo {
+    int tile_idx;
+
+    CUTLASS_DEVICE
+    bool is_valid(Params const& params) const {
+      return tile_idx < params.total_blocks;
+    }
+
+    CUTLASS_DEVICE
+    cute::tuple<int32_t, int32_t, int32_t, int32_t> get_block_coord(
+        Params const& params) const {
+      int block, bidh, bidb;
+      bidb = params.head_divmod.divmod(
+          bidh, params.m_block_divmod.divmod(block, tile_idx));
+      int split_idx = 0;
+      return {block, bidh, bidb, split_idx};
+    }
+  };
+
+  CUTLASS_DEVICE
+  StaticPersistentTileScheduler(SharedStorage* const smem_scheduler) {};
+
+  template <bool IsProducerWarp = false>
+  CUTLASS_DEVICE WorkTileInfo get_initial_work(Params const& params) const {
+    return {int(blockIdx.x)};
+  }
+
+  CUTLASS_DEVICE
+  void init_consumer() const {}
+
+  CUTLASS_DEVICE
+  void prefetch_next_work(Params const& params, WorkTileInfo& current_work)
+      const {}
+
+  template <bool IsProducerWarp = false>
+  CUTLASS_DEVICE WorkTileInfo
+  get_next_work(Params const& params, WorkTileInfo const& current_work) const {
+    return {current_work.tile_idx + int(gridDim.x)};
+  }
+};
+
+template <
+    int NumMmaThreads = 2 * cutlass::NumThreadsPerWarpGroup,
+    int NumProducerThreads = cutlass::NumThreadsPerWarp,
+    bool WarpSpecialized = true>
+class DynamicPersistentTileScheduler {
+  // This scheduler targets the causal (or local) case where each tile takes
+  // different amount of time. We use longest-processing-time-first scheduling:
+  // the longest remaining tile is assigned to the first SM that's free.
+  // SM indicates they are free by incrementing a semaphore.
+  // However, we have to make sure K & V still fit into L2 cache, so we perform
+  // scheduling on "sections" of the head & batch dimension, each section
+  // consisting of e.g. 8 heads. This is the L2 swizzling part. The size of each
+  // section is precomputed based on the size of K & V and the L2 cache size.
+
+  static_assert(WarpSpecialized || NumProducerThreads == NumMmaThreads);
+  static constexpr int NumThreads =
+      WarpSpecialized ? NumMmaThreads + NumProducerThreads : NumMmaThreads;
+
+ public:
+  using SharedStorage = int;
+
+ protected:
+  SharedStorage* const tile_count_smem;
+
+ public:
+  // Device side kernel params
+  struct Params {
+    int const total_blocks;
+    cutlass::FastDivmod const m_block_divmod, head_divmod;
+    cutlass::FastDivmod const l2_minor_divmod, l2_major_divmod;
+    cutlass::FastDivmod const l2_minor_residual_divmod;
+    int const num_hb_quotient;
+    int* const tile_count_semaphore;
+  };
+
+  static Params to_underlying_arguments(TileSchedulerArguments const& args) {
+    int const size_one_kv_head =
+        args.max_seq_len * args.headdim * args.element_size * 2;
+    int const size_l2 = 32 * 1024 * 1024; // 32 MB for K & V
+    // Swizzle is the size of each "section". Round swizzle to a power of 2
+    // If not PackGQA already, the size of each section can increase by
+    // qhead_per_khead
+    int const swizzle = (1 << cutlass::find_log2(size_l2 / size_one_kv_head));
+    // If we're in the last section (called residual), we don't want to divide
+    // by swizzle. Instead we want to divide by the remainder.
+    int const num_hb_remainder = (args.num_head * args.num_batch) % swizzle;
+    int const num_split_blocks = args.num_blocks;
+    // printf("num_split_blocks = %d, num_head = %d, num_batch = %d, swizzle =
+    // %d, PackGQA = %d, qhead_per_khead = %d, num_hb_remainder = %d\n",
+    // num_split_blocks, args.num_head, args.num_batch, swizzle, int(PackGQA),
+    // args.qhead_per_khead, num_hb_remainder);
+    assert(args.tile_count_semaphore != nullptr);
+    return {
+        num_split_blocks * args.num_head * args.num_batch,
+        cutlass::FastDivmod(args.num_blocks),
+        cutlass::FastDivmod(args.num_head),
+        cutlass::FastDivmod(swizzle),
+        cutlass::FastDivmod(swizzle * num_split_blocks),
+        // don't divide by 0
+        cutlass::FastDivmod(num_hb_remainder > 0 ? num_hb_remainder : 1),
+        (args.num_head * args.num_batch) / swizzle,
+        args.tile_count_semaphore};
+  }
+
+  static dim3 get_grid_shape(Params const& params, int num_sm) {
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+    std::printf("DynamicPersistentTileScheduler::get_grid_shape %d\n", num_sm);
+#endif
+    return {uint32_t(num_sm)};
+  }
+
+  struct WorkTileInfo {
+    int tile_idx;
+
+    CUTLASS_DEVICE
+    bool is_valid(Params const& params) const {
+      return tile_idx < params.total_blocks;
+    }
+
+    CUTLASS_DEVICE
+    cute::tuple<int32_t, int32_t, int32_t, int32_t> get_block_coord(
+        Params const& params) const {
+      int block, bidh, bidb;
+      int l2_mod, bidhb, bidhb_residual;
+      bidhb = params.l2_major_divmod.divmod(l2_mod, tile_idx);
+      // If we're in the last section (called residual), we don't want to divide
+      // by swizzle. Instead we want to divide by the remainder.
+      if (bidhb < params.num_hb_quotient) {
+        block = params.l2_minor_divmod.divmod(bidhb_residual, l2_mod);
+      } else {
+        block = params.l2_minor_residual_divmod.divmod(bidhb_residual, l2_mod);
+      }
+      bidb = params.head_divmod.divmod(
+          bidh, bidhb * params.l2_minor_divmod.divisor + bidhb_residual);
+      int split_idx = 0;
+      // Longest-processing-time-first
+      block = params.m_block_divmod.divisor - 1 - block;
+      return {block, bidh, bidb, split_idx};
+    }
+  };
+
+  CUTLASS_DEVICE
+  DynamicPersistentTileScheduler(SharedStorage* const smem_scheduler)
+      : tile_count_smem(smem_scheduler) {};
+
+  template <bool IsProducerWarp = false>
+  CUTLASS_DEVICE WorkTileInfo get_initial_work(Params const& params) const {
+    return {int(blockIdx.x)};
+  }
+
+  CUTLASS_DEVICE
+  void init_consumer() const {
+    if (WarpSpecialized || cutlass::canonical_warp_idx_sync() > 0) {
+      hstu::named_barrier_arrive(
+          NumThreads,
+          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemEmpty) /*id*/);
+    }
+  }
+
+  CUTLASS_DEVICE
+  void prefetch_next_work(Params const& params, WorkTileInfo& current_work)
+      const {
+    if (threadIdx.x % NumProducerThreads == 0) {
+      current_work.tile_idx =
+          atomicAdd(params.tile_count_semaphore, 1) + int(gridDim.x);
+    }
+  }
+
+  template <bool IsProducerWarp = false>
+  CUTLASS_DEVICE WorkTileInfo
+  get_next_work(Params const& params, WorkTileInfo const& current_work) const {
+    if constexpr (IsProducerWarp) {
+      // thread 0 already has the right tile_idx, just need to broadcast to the
+      // rest of warp 0
+      int new_tile_idx =
+          __shfl_sync(0xffffffff, current_work.tile_idx, 0 /*lane*/);
+      hstu::named_barrier_sync(
+          NumThreads,
+          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemEmpty) /*id*/);
+      if (threadIdx.x % NumProducerThreads == 0) {
+        *tile_count_smem = current_work.tile_idx;
+      }
+      hstu::named_barrier_arrive(
+          NumThreads,
+          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemFull) /*id*/);
+      return {new_tile_idx};
+    } else {
+      hstu::named_barrier_sync(
+          NumThreads,
+          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemFull) /*id*/);
+      int tile_idx = *tile_count_smem;
+      hstu::named_barrier_arrive(
+          NumThreads,
+          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemEmpty) /*id*/);
+      return {tile_idx};
+    }
+  }
+};
+
+template <
+    int kBlock,
+    int NumMmaThreads = 2 * cutlass::NumThreadsPerWarpGroup,
+    int NumProducerThreads = cutlass::NumThreadsPerWarp,
+    bool WarpSpecialized = true>
+class VarlenDynamicPersistentTileScheduler {
+  static_assert(WarpSpecialized || NumProducerThreads == NumMmaThreads);
+  static constexpr int NumThreads =
+      WarpSpecialized ? NumMmaThreads + NumProducerThreads : NumMmaThreads;
+
+ public:
+  using SharedStorage = int4;
+
+ protected:
+  SharedStorage* const work_info_smem;
+
+ public:
+  // Device side kernel params
+  struct Params {
+    int num_head, num_batch;
+    int const max_seq_len;
+    cutlass::FastDivmod nsplits_divmod;
+    int* const tile_count_semaphore;
+    int* const seq_offsets;
+  };
+
+  static Params to_underlying_arguments(TileSchedulerArguments const& args) {
+    // If Split, for the purpose of scheduling, we pretend that instead there
+    // are (args.num_splits * args.num_head) number of heads.
+    assert(args.tile_count_semaphore != nullptr);
+    return {
+        args.num_head,
+        args.num_batch,
+        args.max_seq_len,
+        cutlass::FastDivmod(1),
+        args.tile_count_semaphore,
+        args.seq_offsets};
+  }
+
+  static dim3 get_grid_shape(Params const& params, int num_sm) {
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+    std::printf(
+        "VarlenDynamicPersistentTileScheduler::get_grid_shape %d\n", num_sm);
+#endif
+    return {uint32_t(num_sm)};
+  }
+
+  struct WorkTileInfo {
+    int tile_idx, block, bidh, bidb;
+
+    CUTLASS_DEVICE
+    bool is_valid(Params const& params) const {
+      // if (blockIdx.x >= 0 && (threadIdx.x == 128 || threadIdx.x == 0)) {
+      // printf("blockIdx.x = %d, threadIdx.x = %d, checking valid, bidb = %d,
+      // params.num_batch = %d\n", blockIdx.x, threadIdx.x, bidb,
+      // params.num_batch); }
+      return bidb < params.num_batch;
+    }
+
+    CUTLASS_DEVICE
+    cute::tuple<int32_t, int32_t, int32_t, int32_t> get_block_coord(
+        Params const& params) const {
+      return {block, bidh, bidb, 0 /*split_idx*/};
+    }
+  };
+
+  CUTLASS_DEVICE
+  VarlenDynamicPersistentTileScheduler(SharedStorage* const smem_scheduler)
+      : work_info_smem(smem_scheduler) {};
+
+  CUTLASS_DEVICE
+  WorkTileInfo tile_idx_to_work_tile(
+      Params const& params,
+      int next_tile_idx,
+      WorkTileInfo const& current_work) const {
+    auto prefix_sum = [](int val) {
+      auto lane = threadIdx.x % cutlass::NumThreadsPerWarp;
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 1; i < cutlass::NumThreadsPerWarp; i <<= 1) {
+        int32_t partial_sum = __shfl_up_sync(0xffffffff, val, i);
+        if (lane >= i) {
+          val += partial_sum;
+        }
+      }
+      return val;
+    };
+
+    auto get_num_m_blocks = [&](int bidb_start) {
+      auto lane = threadIdx.x % cutlass::NumThreadsPerWarp;
+      int seqlen;
+      if (params.seq_offsets) {
+        int cur_cu_seqlen = lane + bidb_start <= params.num_batch
+            ? params.seq_offsets[lane + bidb_start]
+            : 0;
+        int next_cu_seqlen = __shfl_down_sync(0xffffffff, cur_cu_seqlen, 1);
+        seqlen = next_cu_seqlen - cur_cu_seqlen;
+      } else {
+        seqlen = params.max_seq_len;
+      }
+      return lane + bidb_start < params.num_batch &&
+              lane < cutlass::NumThreadsPerWarp - 1
+          ? cute::ceil_div(seqlen, kBlock)
+          : 0;
+    };
+
+    int num_m_blocks =
+        get_num_m_blocks(current_work.bidb); // Different for each lane
+    // Cumulative number of blocks for the next 31 batches
+    int num_m_blocks_cumulative = prefix_sum(num_m_blocks);
+    // Total number of blocks for the next 31 batches
+    int m_blocks_in_group = __shfl_sync(
+        0xffffffff, num_m_blocks_cumulative, cutlass::NumThreadsPerWarp - 1);
+    int group_end_tile = current_work.tile_idx - current_work.block -
+        current_work.bidh * __shfl_sync(0xffffffff, num_m_blocks, 0 /*lane*/) +
+        m_blocks_in_group * params.num_head; // Same for all lanes
+    int bidb = current_work.bidb;
+    // if (blockIdx.x <= 9 && threadIdx.x == 0) {
+    //     printf("Before while, blockIdx.x = %d, threadIdx.x = %d, bidb = %d,
+    //     num_m_blocks = %d, next_tile_idx = %d, group_end_tile = %d,
+    //     m_blocks_in_group = %d\n", blockIdx.x, threadIdx.x, bidb,
+    //     num_m_blocks, next_tile_idx, group_end_tile, m_blocks_in_group);
+    // }
+    while (group_end_tile <= next_tile_idx) {
+      bidb += cutlass::NumThreadsPerWarp - 1;
+      if (bidb >= params.num_batch) {
+        // if (blockIdx.x <= 9 && threadIdx.x == 0) {
+        //     printf("Returning early, blockIdx.x = %d, threadIdx.x = %d, bidb
+        //     = %d, num_m_blocks = %d, next_tile_idx = %d, group_end_tile = %d,
+        //     m_blocks_in_group = %d\n", blockIdx.x, threadIdx.x, bidb,
+        //     num_m_blocks, next_tile_idx, group_end_tile, m_blocks_in_group);
+        // }
+        return {next_tile_idx, 0, 0, params.num_batch};
+      }
+      num_m_blocks = get_num_m_blocks(bidb);
+      num_m_blocks_cumulative = prefix_sum(num_m_blocks);
+      m_blocks_in_group = __shfl_sync(
+          0xffffffff, num_m_blocks_cumulative, cutlass::NumThreadsPerWarp - 1);
+      group_end_tile += m_blocks_in_group * params.num_head;
+      // if (blockIdx.x <= 9 && threadIdx.x == 0) {
+      //     printf("Bottom of while, blockIdx.x = %d, threadIdx.x = %d, bidb =
+      //     %d, num_m_blocks = %d, next_tile_idx = %d, group_end_tile = %d,
+      //     m_blocks_in_group = %d\n", blockIdx.x, threadIdx.x, bidb,
+      //     num_m_blocks, next_tile_idx, group_end_tile, m_blocks_in_group);
+      // }
+    }
+    int group_start_tile = group_end_tile - m_blocks_in_group * params.num_head;
+    // The next problem to process is the first one that does not have ending
+    // tile position that is greater than or equal to tile index.
+    int batch_idx_in_group = __popc(__ballot_sync(
+        0xffffffff,
+        group_start_tile + num_m_blocks_cumulative * params.num_head <=
+            next_tile_idx));
+    bidb += batch_idx_in_group;
+    num_m_blocks = __shfl_sync(0xffffffff, num_m_blocks, batch_idx_in_group);
+    int mh_block = next_tile_idx - group_start_tile -
+        (batch_idx_in_group == 0 ? 0
+                                 : __shfl_sync(
+                                       0xffffffff,
+                                       num_m_blocks_cumulative,
+                                       batch_idx_in_group - 1)) *
+            params.num_head;
+    int bidh = mh_block / num_m_blocks;
+    int block = mh_block - bidh * num_m_blocks;
+    // if (blockIdx.x <= 9 && threadIdx.x == 0) {
+    //     printf("blockIdx.x = %d, threadIdx.x = %d, batch_idx_in_group = %d,
+    //     bidb = %d, num_m_blocks = %d, next_tile_idx = %d, group_end_tile =
+    //     %d, m_blocks_in_group = %d, mh_block = %d, bidh = %d, block = %d\n",
+    //     blockIdx.x, threadIdx.x, batch_idx_in_group, bidb, num_m_blocks,
+    //     next_tile_idx, group_end_tile, m_blocks_in_group, mh_block, bidh,
+    //     block);
+    // }
+    return {next_tile_idx, block, bidh, bidb};
+  }
+
+  template <bool IsProducerWarp = false>
+  CUTLASS_DEVICE WorkTileInfo get_initial_work(Params const& params) const {
+    if constexpr (IsProducerWarp) {
+      WorkTileInfo work_info =
+          tile_idx_to_work_tile(params, int(blockIdx.x), {0, 0, 0, 0});
+      if (threadIdx.x % cutlass::NumThreadsPerWarp == 0) {
+        *work_info_smem = make_int4(
+            work_info.tile_idx,
+            work_info.block,
+            work_info.bidh,
+            work_info.bidb);
+      }
+      hstu::named_barrier_arrive(
+          NumThreads,
+          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemFull) /*id*/);
+      return work_info;
+    } else {
+      return get_next_work<false>(params, {0, 0, 0, 0});
+    }
+  }
+
+  CUTLASS_DEVICE
+  void init_consumer() const {
+    // Don't arrive at the TileCountSmemEmpty barrier here, because
+    // get_initial_work will do that
+  }
+
+  CUTLASS_DEVICE
+  void prefetch_next_work(Params const& params, WorkTileInfo& current_work)
+      const {
+    if (threadIdx.x % NumProducerThreads == 0) {
+      current_work.tile_idx =
+          atomicAdd(params.tile_count_semaphore, 1) + int(gridDim.x);
+    }
+  }
+
+  template <bool IsProducerWarp = false>
+  CUTLASS_DEVICE WorkTileInfo
+  get_next_work(Params const& params, WorkTileInfo const& current_work) const {
+    if constexpr (IsProducerWarp) {
+      // thread 0 has the next tile_idx, just need to broadcast to the rest of
+      // warp 0
+      int new_tile_idx =
+          __shfl_sync(0xffffffff, current_work.tile_idx, 0 /*lane*/);
+      WorkTileInfo work_info = {
+          __shfl_sync(0xffffffff, current_work.tile_idx, 1 /*lane*/),
+          current_work.block,
+          current_work.bidh,
+          current_work.bidb};
+      work_info = tile_idx_to_work_tile(params, new_tile_idx, work_info);
+      hstu::named_barrier_sync(
+          NumThreads,
+          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemEmpty) /*id*/);
+      if (threadIdx.x % cutlass::NumThreadsPerWarp == 0) {
+        *work_info_smem = make_int4(
+            work_info.tile_idx,
+            work_info.block,
+            work_info.bidh,
+            work_info.bidb);
+      }
+      hstu::named_barrier_arrive(
+          NumThreads,
+          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemFull) /*id*/);
+      return work_info;
+    } else {
+      hstu::named_barrier_sync(
+          NumThreads,
+          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemFull) /*id*/);
+      int4 work_info = *work_info_smem;
+      hstu::named_barrier_arrive(
+          NumThreads,
+          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemEmpty) /*id*/);
+      return WorkTileInfo{work_info.x, work_info.y, work_info.z, work_info.w};
+    }
+  }
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/tile_size.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/tile_size.h
new file mode 100644
index 000000000..3c8968bda
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/tile_size.h
@@ -0,0 +1,220 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <tuple>
+
+namespace hstu {
+
+constexpr int kBlockM_bwd(
+    const int arch,
+    const int headdim,
+    const bool causal,
+    const bool is_local) {
+  int const kBlockM_sm90 = headdim <= 64
+      ? 64
+      : (headdim <= 96
+             ? 64
+             : (headdim <= 128 ? (causal || is_local ? 64 : 80) : 64));
+  int const kBlockM_sm80 = headdim <= 64 ? 128 : 64;
+  int const kBlockM = arch >= 90 ? kBlockM_sm90 : kBlockM_sm80;
+  return kBlockM;
+}
+
+constexpr int kBlockN_bwd(const int arch, const int headdim) {
+  int const kBlockN_sm90 = headdim <= 128 ? 128 : (headdim <= 192 ? 96 : 80);
+  int const kBlockN_sm80 = headdim <= 128 ? 128 : (headdim <= 192 ? 80 : 64);
+  int const kBlockN = arch >= 90 ? kBlockN_sm90 : kBlockN_sm80;
+  return kBlockN;
+}
+
+constexpr int NumMmaWarpGroups_bwd(const int arch, const int headdim) {
+  if (headdim <= 128) {
+    return 2;
+  } else if (headdim == 192) {
+    return arch >= 90 ? 3 : 2;
+  } else {
+    return 2;
+  }
+}
+
+constexpr bool V_in_regs_bwd(const int arch, const int headdim) {
+  if (arch >= 90 && headdim == 96) {
+    return true;
+  }
+  return false;
+}
+
+// Stages_dO, Stages_dS_or_QSm80
+constexpr std::tuple<int, int> Stages_bwd(const int arch, const int headdim) {
+  if (headdim <= 128) {
+    return {2, 2};
+  }
+  if (headdim == 192) {
+    if (arch >= 90) {
+      return {1, 1};
+    } else {
+      return {1, 2};
+    }
+  } else {
+    return {1, 1};
+  }
+}
+
+// AtomLayoutMSdP, AtomLayoutNdKV, AtomLayoutMdQ
+constexpr std::tuple<int, int, int> AtomLayout_bwd(
+    const int arch,
+    const int headdim) {
+  if (headdim <= 64) {
+    if (arch >= 90) {
+      return {1, 2, 1};
+    } else {
+      return {4, 4, 4};
+    }
+  } else if (headdim <= 96) {
+    if (arch >= 90) {
+      return {1, 2, 1};
+    } else {
+      return {2, 4, 2};
+    }
+  } else if (headdim <= 128) {
+    if (arch >= 90) {
+      return {1, 2, 1};
+    } else {
+      return {2, 2, 2};
+    }
+  } else {
+    if (arch >= 90) {
+      return {1, 1, 1};
+    } else {
+      return {4, 2, 2};
+    }
+  }
+}
+
+// SdP_swapAB, dKV_swapAB, dQ_swapAB
+constexpr std::tuple<bool, bool, bool> swapAB_bwd(
+    const int arch,
+    const int headdim,
+    const bool causal,
+    const bool local) {
+  if (headdim <= 96) {
+    return {arch >= 90 ? true : false, false, false};
+  } else if (headdim == 128) {
+    bool SdP_swapAB = arch >= 90 ? true : false;
+    bool dKV_swapAB = false;
+    bool dQ_swapAB = arch >= 90 ? ((causal || local) ? false : true) : false;
+    return {SdP_swapAB, dKV_swapAB, dQ_swapAB};
+  } else if (headdim == 192) {
+    return {false, true, false};
+  } else {
+    return {false, arch >= 90 ? true : false, arch >= 90 ? true : false};
+  }
+}
+
+// Return {kBlockM, kBlockN, Mma1_is_RS}
+constexpr std::tuple<int, int, bool> tile_size_fwd_sm90(
+    int headdim,
+    bool is_causal,
+    bool is_local,
+    int element_size = 2,
+    bool v_colmajor = false,
+    bool Cross = false,
+    bool Training = true) {
+  // for cross attention, q is usually much smaller than k/v, so we reduce the
+  // BlockM size to increase parallelism
+  bool small_blockm = Cross && (!Training);
+  if (element_size == 2) {
+    if (headdim <= 64) {
+      return {small_blockm ? 64 : 192, 128, true};
+      // Good for long seqlen (>= 4k) but suffers from tile quantization at
+      // short seqlen return {192, is_causal || is_local ? 192 : 176, true,
+      // false};
+    } else if (headdim <= 96) {
+      return {small_blockm ? 64 : 192, is_local ? 128 : 144, false};
+    } else if (headdim <= 128) {
+      return {small_blockm ? 64 : 128, is_causal || is_local ? 128 : 176, true};
+      // {128, 192, false, false} and {192, 128, false, true} are quite good too
+      // 128 x 192 hits the limit of smem if Mma1_is_RS, 128 x 144 hits the
+      // limit if !Mma1_is_RS
+    } else if (headdim <= 192) {
+      return {
+          small_blockm ? 64 : 128,
+          is_local ? 96 : 112,
+          true}; // 128 x 112 hits the limit of smem
+    } else {
+      return {
+          small_blockm ? 64 : 128,
+          is_local ? 64 : 80,
+          true}; // 128 x 80 hits the limit of smem
+    }
+  } else {
+    if (headdim <= 64) {
+      return {192, 160, true};
+    } else if (headdim <= 96) {
+      return {192, 128, true};
+    } else if (headdim <= 128) {
+      return {128, (v_colmajor ? 192 : 224), true};
+    } else if (headdim <= 192) {
+      return {128, 160, true};
+    } else {
+      return {128, is_local ? 64 : 128, true};
+    }
+  }
+}
+
+// Return {kBlockM, kBlockN, kNWarps, kStages, Q_in_regs}
+constexpr std::tuple<int, int, int, int, bool> tile_size_fwd_sm8x(
+    bool sm86_or_89,
+    int headdim,
+    bool is_causal,
+    bool is_local,
+    int element_size = 2) {
+  if (element_size == 2) {
+    if (headdim <= 64) {
+      return {128, (is_local ? 96 : 112), 4, 1, false};
+    } else if (headdim <= 96) {
+      return {128, is_local ? 48 : 64, 4, 1, false};
+    } else if (headdim <= 128) {
+      bool const use_8_warps = sm86_or_89;
+      return {
+          128,
+          use_8_warps ? (is_local ? 96 : 128) : (is_local ? 48 : 64),
+          use_8_warps ? 8 : 4,
+          1,
+          use_8_warps};
+    } else if (headdim <= 192) {
+      bool const kBlockN_64 = is_local;
+      return {128, kBlockN_64 ? 64 : 96, 8, sm86_or_89 ? 1 : 2, !kBlockN_64};
+    } else {
+      return {
+          128,
+          sm86_or_89 ? (is_local ? 48 : 64) : (is_local ? 64 : 96),
+          8,
+          1,
+          false};
+    }
+  } else {
+    // Placeholder for now
+    return {128, 64, 8, 2, false};
+  }
+}
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/utils.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/utils.h
new file mode 100644
index 000000000..50a065ed4
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/utils.h
@@ -0,0 +1,789 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <cuda_fp16.h>
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+#include <cuda_bf16.h>
+#endif
+
+#include <cute/tensor.hpp>
+#include <cutlass/gemm/device/gemm.h>
+
+#include <cutlass/array.h>
+#include <cutlass/cutlass.h>
+#include <cutlass/numeric_conversion.h>
+#include <cutlass/numeric_types.h>
+
+#define CHECK_CUDA(call)                \
+  do {                                  \
+    cudaError_t status_ = call;         \
+    if (status_ != cudaSuccess) {       \
+      fprintf(                          \
+          stderr,                       \
+          "CUDA error (%s:%d): %s\n",   \
+          __FILE__,                     \
+          __LINE__,                     \
+          cudaGetErrorString(status_)); \
+      exit(1);                          \
+    }                                   \
+  } while (0)
+
+#define CHECK_CUDA_KERNEL_LAUNCH() CHECK_CUDA(cudaGetLastError())
+
+#ifndef M_LOG2E
+#define M_LOG2E 1.44269504088896340735992468100 /* log_2 (e) */
+#endif
+
+namespace hstu {
+
+using namespace cute;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// A wrapper for the kernel that is used to guard against compilation on
+// architectures that will never use the kernel. The purpose of this is to
+// reduce the size of the compiled binary.
+// Adapted from
+// https://github.com/vllm-project/vllm/blob/4d29e91be84d27ca313d657eee92c067439a4c23/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh#L55
+template <typename Kernel>
+struct enable_sm90_or_later : Kernel {
+  template <typename... Args>
+  CUTLASS_DEVICE void operator()(Args&&... args) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    Kernel::operator()(std::forward<Args>(args)...);
+#endif
+  }
+};
+
+template <typename Kernel>
+struct enable_sm80_to_sm89 : Kernel {
+  template <typename... Args>
+  CUTLASS_DEVICE void operator()(Args&&... args) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && (__CUDA_ARCH__ <= 890)
+    Kernel::operator()(std::forward<Args>(args)...);
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct MaxOp {
+  __device__ __forceinline__ T operator()(T const& x, T const& y) {
+    return x > y ? x : y;
+  }
+};
+
+template <>
+struct MaxOp<float> {
+  // This is slightly faster
+  __device__ __forceinline__ float operator()(float const& x, float const& y) {
+    return max(x, y);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct SumOp {
+  __device__ __forceinline__ T operator()(T const& x, T const& y) {
+    return x + y;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct SiluScaleOp {
+  cutlass::epilogue::thread::SiLu<T> silu;
+  __device__ __forceinline__ T operator()(T const& t, T const& scale) {
+    float t2 = t / 2;
+    return t2 * (1 + cutlass::fast_tanh(t2)) *
+        scale; // __fdividef(t, 1.0f + cutlass::fast_exp(-t)) * scale
+  }
+};
+
+template <typename T, typename Engine, typename Layout>
+CUTLASS_DEVICE void inplace_silu_scale(
+    Tensor<Engine, Layout>& tensor,
+    T const& scale_before,
+    T const& scale_after) {
+  SiluScaleOp<T> silu_scale_op;
+#pragma unroll
+  for (int i = 0; i < size(tensor); ++i) {
+    tensor(i) = silu_scale_op(tensor(i) * scale_before, scale_after);
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <int THREADS>
+struct Allreduce {
+  static_assert(THREADS == 32 || THREADS == 16 || THREADS == 8 || THREADS == 4);
+  template <typename T, typename Operator>
+  static __device__ __forceinline__ T run(T x, Operator& op) {
+    constexpr int OFFSET = THREADS / 2;
+    x = op(x, __shfl_xor_sync(uint32_t(-1), x, OFFSET));
+    return Allreduce<OFFSET>::run(x, op);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <>
+struct Allreduce<2> {
+  template <typename T, typename Operator>
+  static __device__ __forceinline__ T run(T x, Operator& op) {
+    x = op(x, __shfl_xor_sync(uint32_t(-1), x, 1));
+    return x;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// For SM80, convert acc_layout from (MMA=4, MMA_M, MMA_N) to (nrow=(2,
+// MMA_M), ncol=(2, MMA_N)). For SM90, convert acc_layout from ((2, 2, V),
+// MMA_M, MMA_N) to (nrow=(2, MMA_M), ncol=(2, V, MMA_N))
+template <bool Transposed = false, typename Layout0>
+CUTLASS_DEVICE auto convert_layout_acc_rowcol(Layout0 acc_layout) {
+  if constexpr (decltype(rank<0>(acc_layout))::value == 3) { // SM90
+    static_assert(decltype(size<0, 0>(acc_layout))::value == 2);
+    static_assert(decltype(size<0, 1>(acc_layout))::value == 2);
+    static_assert(decltype(rank(acc_layout))::value == 3);
+    auto l = acc_layout;
+    if constexpr (!Transposed) {
+      return make_layout(
+          make_layout(get<0, 1>(l), get<1>(l)),
+          make_layout(get<0, 0>(l), get<0, 2>(l), get<2>(l)));
+    } else {
+      return make_layout(
+          make_layout(get<0, 0>(l), get<0, 2>(l), get<2>(l)),
+          make_layout(get<0, 1>(l), get<1>(l)));
+    }
+
+  } else { // SM80
+    static_assert(decltype(size<0>(acc_layout))::value == 4);
+    static_assert(decltype(rank(acc_layout))::value == 3);
+    auto l = logical_divide(acc_layout, Shape<_2>{}); // ((2, 2), MMA_M, MMA_N)
+    if constexpr (!Transposed) {
+      return make_layout(
+          make_layout(get<0, 1>(l), get<1>(l)),
+          make_layout(get<0, 0>(l), get<2>(l)));
+    } else {
+      return make_layout(
+          make_layout(get<0, 0>(l), get<2>(l)),
+          make_layout(get<0, 1>(l), get<1>(l)));
+    }
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// For SM80, convert acc_layout from (MMA=4, MMA_M, MMA_N) to ((4, 2), MMA_M,
+// MMA_N / 2) if using m16n8k16, or to (4, MMA_M, MMA_N) if using m16n8k8. For
+// SM90, FP16/BF16, convert acc_layout from ((2, 2, N / 8), MMA_M, MMA_N) to
+// ((2, 2, 2), MMA_M, (N / 16, MMA_N)) For SM90, FP8, convert acc_layout from
+// ((2, 2, N / 8), MMA_M, MMA_N) to ((4, 2, 2), MMA_M, (N / 32, MMA_N))
+template <typename MMA_Traits, typename Layout0>
+CUTLASS_DEVICE auto convert_layout_acc_Aregs(Layout0 acc_layout) {
+  using X = Underscore;
+  if constexpr (decltype(rank<0>(acc_layout))::value == 3) { // SM90
+    static_assert(decltype(size<0, 0>(acc_layout))::value == 2);
+    static_assert(decltype(size<0, 1>(acc_layout))::value == 2);
+    static_assert(decltype(rank(acc_layout))::value == 3);
+    static_assert(decltype(rank(get<0>(acc_layout)))::value == 3);
+    if constexpr (sizeof(typename MMA_Traits::ValTypeA) == 2) {
+      auto l =
+          logical_divide(get<0, 2>(acc_layout), Tile<_2>{}); // ((2, N / 16))
+      return make_layout(
+          make_layout(
+              get<0, 0>(acc_layout), get<0, 1>(acc_layout), get<0, 0>(l)),
+          get<1>(acc_layout),
+          coalesce(make_layout(get<0, 1>(l), get<2>(acc_layout))));
+    } else {
+      static_assert(sizeof(typename MMA_Traits::ValTypeA) == 1);
+      static_assert(decltype(stride<0, 0>(acc_layout))::value == 1);
+      static_assert(decltype(stride<0, 1>(acc_layout))::value == 2);
+      auto l = logical_divide(
+          get<0, 2>(acc_layout),
+          Tile<Layout<Shape<_2, _2>>>{}); // (((2, 2), N / 32))
+      // This combines the first two modes (<0, 0> and <0, 1>) into one mode.
+      // Will require register shuffling later to be correct.
+      return make_layout(
+          make_layout(Layout<_4>{}, get<0, 0, 0>(l), get<0, 0, 1>(l)),
+          get<1>(acc_layout),
+          coalesce(make_layout(
+              get<0, 1>(l),
+              get<2>(acc_layout)))); // ((4, 2, 2), MMA_M, N / 32 * MMA_N)
+      // This combination is right but doesn't work with register shuffling.
+      // return make_layout(make_layout(coalesce(make_layout(get<0,
+      // 0>(acc_layout), get<0, 0, 0>(l))), get<0, 1>(acc_layout), get<0, 0,
+      // 1>(l)),
+      //                    get<1>(acc_layout),
+      //                    coalesce(make_layout(get<0, 1>(l),
+      //                    get<2>(acc_layout))));
+    }
+  } else { // SM80
+    static_assert(decltype(size<0>(acc_layout))::value == 4);
+    static_assert(decltype(rank(acc_layout))::value == 3);
+    constexpr int mma_shape_K = get<2>(typename MMA_Traits::Shape_MNK{});
+    static_assert(mma_shape_K == 8 || mma_shape_K == 16);
+    if constexpr (mma_shape_K == 8) {
+      return acc_layout;
+    } else {
+      auto l = logical_divide(
+          acc_layout, Shape<X, X, _2>{}); // (4, MMA_M, (2, MMA_N / 2)))
+      return make_layout(
+          make_layout(get<0>(l), get<2, 0>(l)), get<1>(l), get<2, 1>(l));
+    }
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename To_type, typename Engine, typename Layout>
+CUTLASS_DEVICE auto convert_type_unsafe(Tensor<Engine, Layout> const& tensor) {
+  using From_type = typename Engine::value_type;
+  static constexpr int numel = decltype(size(tensor))::value;
+  cutlass::NumericArrayConverter<To_type, From_type, numel> convert_op;
+  // HACK: this requires tensor to be "contiguous"
+  auto frag =
+      convert_op(*reinterpret_cast<const cutlass::Array<From_type, numel>*>(
+          tensor.data()));
+  return make_tensor(make_rmem_ptr<To_type>(&frag), tensor.layout());
+  // Unsafe because we're returning a tensor with memory allocated on the
+  // stack. If the compiler does not inline this function, then the memory
+  // might not be valid.
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Engine, typename Layout, typename EngineOut>
+CUTLASS_DEVICE void convert_type_out(
+    Tensor<Engine, Layout> const& tensor,
+    Tensor<EngineOut, Layout>& out) {
+  // Somehow if we allocate out inside this function and return it, e2e is
+  // slower and the output can be wrong.
+  using From_type = typename Engine::value_type;
+  using To_type = typename EngineOut::value_type;
+  static constexpr int FragmentSize = std::max(
+      sizeof(From_type) / sizeof(To_type), sizeof(To_type) / sizeof(From_type));
+  static_assert(
+      CUTE_STATIC_V(size(tensor)) % FragmentSize == 0,
+      "Fragment size does not vectorize properly");
+  Tensor frag = recast<cutlass::Array<From_type, FragmentSize> const>(tensor);
+  Tensor out_frg = recast<cutlass::Array<To_type, FragmentSize>>(out);
+  static_assert(size(frag) == size(out_frg));
+  cutlass::NumericArrayConverter<To_type, From_type, FragmentSize> convert_op;
+#pragma unroll
+  for (int i = 0; i < size(frag); ++i) {
+    out_frg[i] = convert_op(frag[i]);
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Blocks until all but N previous cp.async.commit_group operations have
+// committed. This differs from cute::cp_async_wait in that when N = 0 we
+// don't call cp.async.wait_all (which is equivalent to commit_group then
+// wait_group 0). Instead we just call cp.async.wait_group 0, which is
+// slightly faster.
+// https://github.com/NVIDIA/cutlass/blob/master/include/cute/arch/copy_sm80.hpp#L113
+template <int N>
+CUTE_HOST_DEVICE void cp_async_wait() {
+#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED)
+  asm volatile("cp.async.wait_group %0;\n" ::"n"(N));
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <bool A, class Mma, class Tensor0>
+CUTLASS_DEVICE auto mma_partition_fragment_AB(
+    Mma const& mma,
+    Tensor0 const& tensor0) {
+  if constexpr (A) {
+    return mma.partition_fragment_A(tensor0);
+  } else {
+    return mma.partition_fragment_B(tensor0);
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    bool zero_init = false,
+    int wg_wait = 0,
+    bool SwapAB = false,
+    int M_slice = -1,
+    typename Tensor0,
+    typename Tensor1,
+    typename Tensor2,
+    typename TiledMma>
+CUTLASS_DEVICE void gemm(
+    TiledMma& tiled_mma,
+    Tensor0 const& tCrA,
+    Tensor1 const& tCrB,
+    Tensor2& tCrC) {
+  if constexpr (M_slice >= 0) {
+    static constexpr int MMA_M = decltype(size<1>(tCrC))::value;
+    static_assert(M_slice < MMA_M);
+    // After logical_divide, C has shape ((2,2,V), (MMA_M, 1), MMA_N)
+    Tensor tCrC_slice =
+        cute::logical_divide(tCrC, Shape<cute::Underscore, Int<MMA_M>>{})(
+            _, make_coord(Int<M_slice>{}, _), _);
+    if constexpr (!SwapAB) {
+      Tensor tCrA_slice =
+          cute::logical_divide(tCrA, Shape<cute::Underscore, Int<MMA_M>>{})(
+              _, make_coord(Int<M_slice>{}, _), _);
+      gemm<zero_init, wg_wait, SwapAB, /*M_slice=*/-1>(
+          tiled_mma, tCrA_slice, tCrB, tCrC_slice);
+    } else {
+      Tensor tCrB_slice =
+          cute::logical_divide(tCrB, Shape<cute::Underscore, Int<MMA_M>>{})(
+              _, make_coord(Int<M_slice>{}, _), _);
+      gemm<zero_init, wg_wait, SwapAB, /*M_slice=*/-1>(
+          tiled_mma, tCrA, tCrB_slice, tCrC_slice);
+    }
+  } else {
+    constexpr bool Is_RS = !cute::is_base_of<
+        cute::GMMA::DescriptorIterator,
+        typename TiledMma::FrgTypeA>::value;
+    // Need to cast away const on tCrA since warpgroup_fence_operand doesn't
+    // take const
+    if constexpr (Is_RS) {
+      if constexpr (!SwapAB) {
+        warpgroup_fence_operand(const_cast<Tensor0&>(tCrA));
+      } else {
+        warpgroup_fence_operand(const_cast<Tensor1&>(tCrB));
+      }
+    }
+    warpgroup_fence_operand(tCrC);
+    warpgroup_arrive();
+    if constexpr (zero_init) {
+      tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+    }
+    // Unroll the K mode manually to set scale D to 1
+    CUTLASS_PRAGMA_UNROLL
+    for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+      if constexpr (!SwapAB) {
+        cute::gemm(tiled_mma, tCrA(_, _, k_block), tCrB(_, _, k_block), tCrC);
+      } else {
+        cute::gemm(tiled_mma, tCrB(_, _, k_block), tCrA(_, _, k_block), tCrC);
+      }
+      tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+    }
+    warpgroup_commit_batch();
+    if constexpr (wg_wait >= 0) {
+      warpgroup_wait<wg_wait>();
+    }
+    warpgroup_fence_operand(tCrC);
+    if constexpr (Is_RS) {
+      if constexpr (!SwapAB) {
+        warpgroup_fence_operand(const_cast<Tensor0&>(tCrA));
+      } else {
+        warpgroup_fence_operand(const_cast<Tensor1&>(tCrB));
+      }
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    bool A_in_regs = false,
+    bool B_in_regs = false,
+    bool SwapAB = false,
+    typename Tensor0,
+    typename Tensor1,
+    typename Tensor2,
+    typename Tensor3,
+    typename Tensor4,
+    typename TiledMma,
+    typename TiledCopyA,
+    typename TiledCopyB,
+    typename ThrCopyA,
+    typename ThrCopyB,
+    typename Hook>
+CUTLASS_DEVICE void gemm_sm80(
+    Tensor0& acc,
+    Tensor1& tCrA,
+    Tensor2& tCrB,
+    Tensor3 const& tCsA,
+    Tensor4 const& tCsB,
+    TiledMma tiled_mma,
+    TiledCopyA smem_tiled_copy_A,
+    TiledCopyB smem_tiled_copy_B,
+    ThrCopyA smem_thr_copy_A,
+    ThrCopyB smem_thr_copy_B,
+    Hook fn) {
+  if constexpr (SwapAB) {
+    gemm_sm80<B_in_regs, A_in_regs>(
+        acc,
+        tCrB,
+        tCrA,
+        tCsB,
+        tCsA,
+        tiled_mma,
+        smem_tiled_copy_B,
+        smem_tiled_copy_A,
+        smem_thr_copy_B,
+        smem_thr_copy_A,
+        fn);
+  } else {
+    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(acc)); // MMA_M
+    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(acc)); // MMA_N
+    CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB)); // MMA_K
+    Tensor tCrA_copy_view = smem_thr_copy_A.retile_D(tCrA);
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view)); // M
+    Tensor tCrB_copy_view = smem_thr_copy_B.retile_D(tCrB);
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view)); // N
+    if (!A_in_regs) {
+      cute::copy(
+          smem_tiled_copy_A, tCsA(_, _, _0{}), tCrA_copy_view(_, _, _0{}));
+    }
+    if (!B_in_regs) {
+      cute::copy(
+          smem_tiled_copy_B, tCsB(_, _, _0{}), tCrB_copy_view(_, _, _0{}));
+    }
+#pragma unroll
+    for (int i = 0; i < size<2>(tCrA); ++i) {
+      if (i < size<2>(tCrA) - 1) {
+        if (!A_in_regs) {
+          cute::copy(
+              smem_tiled_copy_A,
+              tCsA(_, _, i + 1),
+              tCrA_copy_view(_, _, i + 1));
+        }
+        if (!B_in_regs) {
+          cute::copy(
+              smem_tiled_copy_B,
+              tCsB(_, _, i + 1),
+              tCrB_copy_view(_, _, i + 1));
+        }
+      }
+      if constexpr (!std::is_same_v<Hook, std::nullptr_t>) {
+        if (i == 0) {
+          fn();
+        }
+      }
+      cute::gemm(tiled_mma, tCrA(_, _, i), tCrB(_, _, i), acc);
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    typename Tensor0,
+    typename Tensor1,
+    typename Tensor2,
+    typename Tensor3,
+    typename TiledMma,
+    typename TiledCopy,
+    typename ThrCopy>
+CUTLASS_DEVICE void gemm_rs_sm80(
+    Tensor0& acc,
+    Tensor1& tCrA,
+    Tensor2& tCrB,
+    Tensor3 const& tCsB,
+    TiledMma tiled_mma,
+    TiledCopy smem_tiled_copy_B,
+    ThrCopy smem_thr_copy_B) {
+  CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(acc)); // MMA_M
+  CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(acc)); // MMA_N
+  CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB)); // MMA_K
+  Tensor tCrB_copy_view = smem_thr_copy_B.retile_D(tCrB);
+  CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view)); // N
+  cute::copy(smem_tiled_copy_B, tCsB(_, _, _0{}), tCrB_copy_view(_, _, _0{}));
+#pragma unroll
+  for (int i = 0; i < size<2>(tCrA); ++i) {
+    if (i < size<2>(tCrA) - 1) {
+      cute::copy(
+          smem_tiled_copy_B, tCsB(_, _, i + 1), tCrB_copy_view(_, _, i + 1));
+    }
+    cute::gemm(tiled_mma, tCrA(_, _, i), tCrB(_, _, i), acc);
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    bool Is_even_MN = true,
+    bool Is_even_K = true,
+    bool Clear_OOB_MN = false,
+    bool Clear_OOB_K = true,
+    class CopyAtom,
+    class TV,
+    class Tiler,
+    typename Engine0,
+    typename Layout0,
+    typename Engine1,
+    typename Layout1,
+    typename Engine2,
+    typename Layout2,
+    typename Engine3,
+    typename Layout3>
+CUTLASS_DEVICE void copy(
+    TiledCopy<CopyAtom, TV, Tiler> const& tiled_copy,
+    Tensor<Engine0, Layout0> const& S,
+    Tensor<Engine1, Layout1>& D,
+    Tensor<Engine2, Layout2> const& identity_MN,
+    Tensor<Engine3, Layout3> const& predicate_K,
+    const int max_MN = 0) {
+  // Decay TiledCopy to CopyAtom
+  auto copy_atom = static_cast<CopyAtom const&>(tiled_copy);
+  CUTE_STATIC_ASSERT_V(rank(S) == Int<3>{});
+  CUTE_STATIC_ASSERT_V(rank(D) == Int<3>{});
+  CUTE_STATIC_ASSERT_V(size<0>(S) == size<0>(D)); // MMA
+  CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(D)); // MMA_M
+  CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(D)); // MMA_K
+  // There's no case where !Clear_OOB_K && Clear_OOB_MN
+  static_assert(!(Clear_OOB_MN && !Clear_OOB_K));
+  auto has_with_bool = cute::is_valid(
+      [](auto t) -> void_t<decltype(declval<typename decltype(t)::Traits>()
+                                        .with(true))> {},
+      copy_atom);
+#pragma unroll
+  for (int m = 0; m < size<1>(S); ++m) {
+    bool predicate_mn =
+        Is_even_MN || get<0>(identity_MN(_0{}, m, _0{})) < max_MN;
+    if constexpr (Is_even_MN || !Clear_OOB_MN) {
+      if (Is_even_MN || predicate_mn) {
+#pragma unroll
+        for (int k = 0; k < size<2>(S); ++k) {
+          if constexpr (Is_even_K || !Clear_OOB_K) {
+            if (Is_even_K || predicate_K(k)) {
+              cute::copy(copy_atom, S(_, m, k), D(_, m, k));
+            }
+          } else { // Clear_OOB_K == true && Is_even_K == false
+            // If copy traits can be transformed with a predicate value, do
+            // it, otherwise branch here
+            if constexpr (has_with_bool) {
+              cute::copy(
+                  copy_atom.with(predicate_K(k)), S(_, m, k), D(_, m, k));
+            } else {
+              if (predicate_K(k)) {
+                cute::copy(copy_atom, S(_, m, k), D(_, m, k));
+              } else {
+                cute::clear(D(_, m, k));
+              }
+            }
+          }
+        }
+      }
+    } else { // Clear_OOB_MN == true && Is_even_MN == false, also implies
+             // Clear_OOB_K == true
+      if constexpr (!has_with_bool) {
+        if (predicate_mn) {
+#pragma unroll
+          for (int k = 0; k < size<2>(S); ++k) {
+            if (Is_even_K || predicate_K(k)) {
+              cute::copy(copy_atom, S(_, m, k), D(_, m, k));
+            } else if (Clear_OOB_K) {
+              cute::clear(D(_, m, k));
+            }
+          }
+        } else {
+          cute::clear(D(_, m, _));
+        }
+      } else { // combine the mn predicate with the k predicate
+#pragma unroll
+        for (int k = 0; k < size<2>(S); ++k) {
+          cute::copy(
+              copy_atom.with(predicate_mn && (Is_even_K || predicate_K(k))),
+              S(_, m, k),
+              D(_, m, k));
+        }
+      }
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Byte permute and shuffle to match register layout of
+// (FP8 downcasted) accumulator of GEMM-I to FP8 operand A of GEMM-II.
+template <typename Fragment>
+CUTLASS_DEVICE void permute_Aregs_fp8(Fragment& frag) {
+  // frag has shape ((4, 2, 2), MMA_M, MMA_N), each element is 8 bits
+  static_assert(decltype(size<0, 0>(frag))::value == 4);
+  static_assert(decltype(size<0, 1>(frag))::value == 2);
+  static_assert(decltype(stride<0, 0>(frag))::value == 1);
+  static_assert(decltype(stride<0, 1>(frag))::value == 4);
+  static_assert(sizeof(typename Fragment::value_type) == 1);
+
+  auto quad_idx = threadIdx.x % 4;
+  bool lane_03 = quad_idx == 0 || quad_idx == 3;
+  int selector_upper = lane_03 ? 0x5410 : 0x1054;
+  int selector_lower = lane_03 ? 0x7632 : 0x3276;
+
+  static constexpr int upper_map[4] = {0, 3, 1, 2};
+  // static constexpr int lower_map[4] = {1, 2, 0, 3};
+
+  Tensor frag_64b = recast<uint2>(frag); // ((1, 1, 2), MMA_M, MMA_N)
+#pragma unroll
+  for (int i = 0; i < size(frag_64b); ++i) {
+    uint32_t upper = frag_64b[i].x;
+    uint32_t lower = frag_64b[i].y;
+    uint32_t upper0 = lane_03 ? upper : lower;
+    uint32_t lower0 = lane_03 ? lower : upper;
+    upper0 = __shfl_sync(uint32_t(-1), upper0, upper_map[quad_idx], 4);
+    // lower0 = __shfl_sync(uint32_t(-1), lower0, lower_map[quad_idx], 4);
+    lower0 = __shfl_sync(uint32_t(-1), lower0, upper_map[quad_idx] ^ 1, 4);
+    frag_64b[i].x = __byte_perm(upper0, lower0, selector_upper);
+    frag_64b[i].y = __byte_perm(upper0, lower0, selector_lower);
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Fragment>
+CUTLASS_DEVICE void permute_Cregs_fp8(Fragment& frag) {
+  // frag has shape ((2, 2, N / 8), MMA_M, MMA_N), each element is 32 bits
+  static_assert(decltype(size<0, 0>(frag))::value == 2);
+  static_assert(decltype(size<0, 1>(frag))::value == 2);
+  static_assert(decltype(size<0, 2>(frag))::value % 2 == 0);
+  static_assert(decltype(stride<0, 0>(frag))::value == 1);
+  static_assert(sizeof(typename Fragment::value_type) == 4);
+  Tensor frag_64b =
+      group_modes<1, 3>(recast<uint2>(frag)); // ((1, 2, N / 8), (MMA_M, MMA_N))
+#pragma unroll
+  for (int mi = 0; mi < size<1>(frag_64b); ++mi) {
+#pragma unroll
+    for (int i = 0; i < size<0, 2>(frag_64b) / 2; ++i) {
+      cutlass::swap(
+          frag_64b(make_coord(_0{}, _1{}, 2 * i), mi),
+          frag_64b(make_coord(_0{}, _0{}, 2 * i + 1), mi));
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Fragment>
+CUTLASS_DEVICE void permute_output_fp8(Fragment& out) {
+  // out has shape ((2, 2, N / 8), MMA_M, MMA_N), each element is 32 bits
+  static_assert(decltype(size<0, 0>(out))::value == 2);
+  static_assert(decltype(size<0, 1>(out))::value == 2);
+  static_assert(decltype(size<0, 2>(out))::value % 2 == 0);
+  static_assert(decltype(stride<0, 0>(out))::value == 1);
+  static_assert(sizeof(typename Fragment::value_type) == 4);
+  Tensor frag = group_modes<1, 3>(out); // ((2, 2, N / 8), (MMA_M, MMA_N))
+#pragma unroll
+  for (int mi = 0; mi < size<1>(frag); ++mi) {
+#pragma unroll
+    for (int j = 0; j < size<0, 1>(frag); ++j) {
+#pragma unroll
+      for (int i = 0; i < size<0, 2>(frag) / 2; ++i) {
+        cutlass::swap(
+            frag(make_coord(_1{}, j, 2 * i), mi),
+            frag(make_coord(_0{}, j, 2 * i + 1), mi));
+      }
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Fragment>
+CUTLASS_DEVICE void permute_output_fp8_Vcolmajor(Fragment& frag) {
+  // frag has shape ((2, 2, N / 8), MMA_M, MMA_N), each element is 16 bits
+  static_assert(decltype(size<0, 0>(frag))::value == 2);
+  static_assert(decltype(size<0, 1>(frag))::value == 2);
+  static_assert(decltype(stride<0, 0>(frag))::value == 1);
+  static_assert(
+      sizeof(typename Fragment::value_type) == 2 ||
+      sizeof(typename Fragment::value_type) == 4);
+
+  auto quad_idx = threadIdx.x % 4;
+  bool lane_03 = quad_idx == 0 || quad_idx == 3;
+
+  static constexpr int upper_map[4] = {0, 2, 3, 1};
+  // static constexpr int lower_map[4] = {2, 0, 1, 3};
+
+  // if (blockIdx.x == 0 && threadIdx.x == 128) { print_tensor(frag); }
+  using type2 = std::conditional_t<
+      sizeof(typename Fragment::value_type) == 2,
+      uint32_t,
+      uint64_t>;
+  Tensor frag_2 =
+      group_modes<1, 3>(recast<type2>(frag)); // ((1, 2, N / 8), (MMA_M, MMA_N))
+// if (blockIdx.x == 0 && threadIdx.x == 128) { print(frag); printf("\n");
+// print(frag_2); }
+#pragma unroll
+  for (int mi = 0; mi < size<1>(frag_2); ++mi) {
+#pragma unroll
+    for (int j = 0; j < size<0, 1>(frag_2); ++j) {
+#pragma unroll
+      for (int i = 0; i < size<0, 2>(frag_2) / 2; ++i) {
+        type2 upper = frag_2(make_coord(_0{}, j, 2 * i), mi);
+        type2 lower = frag_2(make_coord(_0{}, j, 2 * i + 1), mi);
+        type2 upper0 = lane_03 ? upper : lower;
+        type2 lower0 = lane_03 ? lower : upper;
+        upper0 = __shfl_sync(uint32_t(-1), upper0, upper_map[quad_idx], 4);
+        // lower0 = __shfl_sync(uint32_t(-1), lower0, lower_map[quad_idx], 4);
+        lower0 = __shfl_sync(uint32_t(-1), lower0, upper_map[quad_idx] ^ 2, 4);
+        frag_2(make_coord(_0{}, j, 2 * i), mi) = lane_03 ? upper0 : lower0;
+        frag_2(make_coord(_0{}, j, 2 * i + 1), mi) = lane_03 ? lower0 : upper0;
+      }
+    }
+  }
+  // if (blockIdx.x == 0 && threadIdx.x == 128) { print_tensor(frag); }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Engine, typename Layout>
+CUTLASS_DEVICE void apply_softcap(
+    Tensor<Engine, Layout>& tensor,
+    float const softcap) {
+#pragma unroll
+  for (int i = 0; i < size(tensor); ++i) {
+    tensor(i) = cutlass::fast_tanh(tensor(i) * softcap);
+  }
+}
+
+template <typename Engine, typename Layout>
+CUTLASS_DEVICE auto calculate_dtanh(Tensor<Engine, Layout>& tensor) {
+  Tensor out = make_fragment_like<float>(tensor);
+#pragma unroll
+  for (int i = 0; i < size(tensor); ++i) {
+    out(i) = 1.f - (tensor(i) * tensor(i));
+  }
+  return out;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+CUTLASS_DEVICE
+int canonical_warp_group_idx_nosync() {
+  return threadIdx.x / cutlass::NumThreadsPerWarpGroup;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/version.txt b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/version.txt
new file mode 100644
index 000000000..04d34e1a3
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/version.txt
@@ -0,0 +1 @@
+5231d95
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/jagged_transpose_1d.cpp b/recommendation_v4/generative_recommenders/ops/cpp/jagged_transpose_1d.cpp
new file mode 100644
index 000000000..d3a2e9421
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/jagged_transpose_1d.cpp
@@ -0,0 +1,130 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/Dispatch.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/extension.h>
+#include <torch/library.h>
+
+#include "fbgemm_gpu/sparse_ops.h" // @manual
+
+namespace hstu {
+
+template <typename index_t, typename val_t>
+void _jagged_transpose_1d_cpu_kernel(
+    int32_t size1,
+    int32_t size2,
+    int32_t max_len,
+    const at::TensorAccessor<index_t, 1>& offsets,
+    const at::TensorAccessor<val_t, 1>& values,
+    const at::TensorAccessor<index_t, 1>& lengths,
+    const at::TensorAccessor<index_t, 1>& trans_offsets,
+    at::TensorAccessor<val_t, 1> trans_values) {
+  for (auto i : c10::irange(size1)) {
+    for (auto j : c10::irange(size2)) {
+      auto src_idx = i * size2 + j;
+      auto dst_idx = j * size1 + i;
+      auto src_offset = offsets[src_idx];
+      auto src_length = lengths[src_idx];
+      auto dst_offset = trans_offsets[dst_idx];
+
+      for (auto k = 0; k < src_length; ++k) {
+        trans_values[dst_offset + k] = values[src_offset + k];
+      }
+    }
+  }
+}
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> jagged_transpose_1d_cpu(
+    const at::Tensor& values,
+    const at::Tensor& offsets,
+    const at::Tensor& lengths,
+    const int64_t max_len,
+    const int64_t size1,
+    const int64_t size2) {
+  TORCH_INTERNAL_ASSERT(values.device().type() == at::DeviceType::CPU);
+  TORCH_INTERNAL_ASSERT(offsets.device().type() == at::DeviceType::CPU);
+  TORCH_INTERNAL_ASSERT(lengths.device().type() == at::DeviceType::CPU);
+  TORCH_CHECK(offsets.size(0) == size1 * size2 + 1);
+  TORCH_CHECK(lengths.size(0) == size1 * size2);
+
+  auto trans_lengths =
+      lengths.view({size1, size2}).transpose(0, 1).contiguous().view({-1});
+  auto trans_offsets =
+      fbgemm_gpu::asynchronous_complete_cumsum_cpu(trans_lengths);
+  auto L_out = trans_offsets[-1].item<int64_t>();
+  auto trans_values = at::empty({L_out}, values.options());
+
+  if (L_out == 0) {
+    return std::make_tuple(trans_values, trans_offsets, trans_lengths);
+  }
+
+  AT_DISPATCH_INTEGRAL_TYPES(
+      lengths.scalar_type(), "jagged_transpose_1d_cpu_kernel_input1", [&] {
+        using index_t = scalar_t;
+        AT_DISPATCH_ALL_TYPES_AND2(
+            at::ScalarType::BFloat16,
+            at::ScalarType::Half,
+            values.scalar_type(),
+            "jagged_transpose_1d_cpu_kernel_input2",
+            [&] {
+              using val_t = scalar_t;
+              _jagged_transpose_1d_cpu_kernel<index_t, val_t>(
+                  size1,
+                  size2,
+                  max_len,
+                  offsets.accessor<index_t, 1>(),
+                  values.accessor<val_t, 1>(),
+                  lengths.accessor<index_t, 1>(),
+                  trans_offsets.accessor<index_t, 1>(),
+                  trans_values.accessor<val_t, 1>());
+            });
+      });
+
+  return std::make_tuple(trans_values, trans_offsets, trans_lengths);
+}
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> jagged_transpose_1d_meta(
+    const at::Tensor& values,
+    const at::Tensor& offsets,
+    const at::Tensor& lengths,
+    const int64_t max_len,
+    const int64_t size1,
+    const int64_t size2) {
+  auto trans_lengths =
+      lengths.view({size1, size2}).transpose(0, 1).contiguous().view({-1});
+  auto L_out = trans_lengths.sum().item<int64_t>();
+
+  auto trans_values = at::native::empty_meta_symint(
+      {L_out},
+      /*dtype=*/::std::make_optional(values.scalar_type()),
+      /*layout=*/::std::make_optional(values.layout()),
+      /*device=*/::std::make_optional(c10::Device(c10::kMeta)),
+      /*pin_memory=*/::std::nullopt);
+
+  auto trans_offsets = at::native::empty_meta_symint(
+      {size1 * size2 + 1},
+      /*dtype=*/::std::make_optional(lengths.scalar_type()),
+      /*layout=*/::std::make_optional(lengths.layout()),
+      /*device=*/::std::make_optional(c10::Device(c10::kMeta)),
+      /*pin_memory=*/::std::nullopt);
+
+  return std::make_tuple(trans_values, trans_offsets, trans_lengths);
+}
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/jagged_transpose_1d.cu b/recommendation_v4/generative_recommenders/ops/cpp/jagged_transpose_1d.cu
new file mode 100644
index 000000000..380100962
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/jagged_transpose_1d.cu
@@ -0,0 +1,127 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/Dispatch.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/library.h>
+
+#include "common.h"
+#include "fbgemm_gpu/sparse_ops.h" // @manual
+#include "fbgemm_gpu/utils/fixed_divisor.cuh" // @manual
+
+namespace hstu {
+
+static constexpr int32_t kMaxThreads = 1024;
+
+template <typename index_t, typename val_t>
+__global__ __launch_bounds__(kMaxThreads) void _jagged_transpose_1d_cuda_kernel(
+    int32_t size1,
+    int32_t size2,
+    int32_t max_len,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits> offsets,
+    const at::PackedTensorAccessor32<val_t, 1, at::RestrictPtrTraits> values,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits> lengths,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+        trans_offsets,
+    at::PackedTensorAccessor32<val_t, 1, at::RestrictPtrTraits> trans_values) {
+  for (auto idx = blockIdx.x * blockDim.y + threadIdx.y;
+       idx < static_cast<uint32_t>(size1 * size2);
+       idx += gridDim.x * blockDim.y) {
+    auto i = idx / size2;
+    auto j = idx % size2;
+    auto src_idx = i * size2 + j;
+    auto dst_idx = j * size1 + i;
+    auto src_offset = offsets[src_idx];
+    auto src_length = lengths[src_idx];
+    auto dst_offset = trans_offsets[dst_idx];
+
+    for (auto k = threadIdx.x; k < static_cast<uint32_t>(src_length);
+         k += blockDim.x) {
+      trans_values[dst_offset + k] = values[src_offset + k];
+    }
+  }
+}
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> jagged_transpose_1d_cuda(
+    const at::Tensor& values,
+    const at::Tensor& offsets,
+    const at::Tensor& lengths,
+    const int64_t max_len,
+    const int64_t size1,
+    const int64_t size2) {
+  at::cuda::OptionalCUDAGuard device_guard;
+  device_guard.set_index(values.get_device());
+  TORCH_INTERNAL_ASSERT(values.device().type() == at::DeviceType::CUDA);
+  TORCH_INTERNAL_ASSERT(offsets.device().type() == at::DeviceType::CUDA);
+  TORCH_INTERNAL_ASSERT(lengths.device().type() == at::DeviceType::CUDA);
+  TORCH_CHECK(offsets.size(0) == size1 * size2 + 1);
+  TORCH_CHECK(lengths.size(0) == size1 * size2);
+  TORCH_CHECK(values.get_device() == offsets.get_device());
+  TORCH_CHECK(values.get_device() == lengths.get_device());
+
+  auto trans_lengths =
+      lengths.view({size1, size2}).transpose(0, 1).contiguous().view({-1});
+  auto trans_offsets =
+      fbgemm_gpu::asynchronous_complete_cumsum_gpu(trans_lengths);
+  auto L_out = trans_offsets[-1].item<int64_t>();
+  TORCH_CHECK(L_out < std::numeric_limits<int32_t>::max());
+  auto trans_values = at::empty({L_out}, values.options());
+
+  if (L_out == 0) {
+    return std::make_tuple(trans_values, trans_offsets, trans_lengths);
+  }
+
+  // Optimized thread block configuration based on benchmark results
+  uint32_t B_blocks = 4;
+  dim3 threads(256, B_blocks);
+  auto blocks = div_round_up(size1 * size2, B_blocks);
+
+  AT_DISPATCH_INTEGRAL_TYPES(
+      lengths.scalar_type(), "jagged_transpose_1d_cuda_kernel_input1", [&] {
+        using index_t = scalar_t;
+        AT_DISPATCH_ALL_TYPES_AND2(
+            at::ScalarType::BFloat16,
+            at::ScalarType::Half,
+            values.scalar_type(),
+            "jagged_transpose_1d_cuda_kernel_input2",
+            [&] {
+              using val_t = scalar_t;
+              _jagged_transpose_1d_cuda_kernel<index_t, val_t><<<
+                  blocks,
+                  threads,
+                  0,
+                  at::cuda::getCurrentCUDAStream()>>>(
+                  size1,
+                  size2,
+                  max_len,
+                  offsets
+                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+                  values.packed_accessor32<val_t, 1, at::RestrictPtrTraits>(),
+                  lengths
+                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+                  trans_offsets
+                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+                  trans_values
+                      .packed_accessor32<val_t, 1, at::RestrictPtrTraits>());
+            });
+      });
+
+  return std::make_tuple(trans_values, trans_offsets, trans_lengths);
+}
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/replace_last_n_with_jagged.cpp b/recommendation_v4/generative_recommenders/ops/cpp/replace_last_n_with_jagged.cpp
new file mode 100644
index 000000000..fa8eaac09
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/replace_last_n_with_jagged.cpp
@@ -0,0 +1,139 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/Dispatch.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/extension.h>
+#include <torch/library.h>
+
+#include "fbgemm_gpu/sparse_ops.h" // @manual
+
+namespace hstu {
+
+template <typename index_t, typename val_t>
+void _replace_last_n_with_jagged_cpu_kernel(
+    int32_t B,
+    const at::TensorAccessor<index_t, 1>& lengths_left,
+    const at::TensorAccessor<index_t, 1>& offsets_left,
+    const at::TensorAccessor<val_t, 2>& values_left,
+    const at::TensorAccessor<index_t, 1>& lengths_right,
+    const at::TensorAccessor<index_t, 1>& offsets_right,
+    const at::TensorAccessor<val_t, 2>& values_right,
+    const at::TensorAccessor<index_t, 1>& output_offsets,
+    at::TensorAccessor<val_t, 2> output) {
+  for (auto b : c10::irange(B)) {
+    auto left_start = offsets_left[b];
+    auto left_len = lengths_left[b];
+    auto right_start = offsets_right[b];
+    auto right_len = lengths_right[b];
+    auto output_start = output_offsets[b];
+
+    auto keep_len = left_len - right_len;
+
+    for (auto i = 0; i < left_len; ++i) {
+      for (auto d = 0; d < values_left.size(1); ++d) {
+        if (i < keep_len) {
+          output[output_start + i][d] = values_left[left_start + i][d];
+        } else {
+          auto right_idx = i - keep_len;
+          if (right_idx < right_len) {
+            output[output_start + i][d] =
+                values_right[right_start + right_idx][d];
+          }
+        }
+      }
+    }
+  }
+}
+
+at::Tensor replace_last_n_with_jagged_cpu(
+    const at::Tensor& lengths_left,
+    const at::Tensor& values_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& values_right) {
+  TORCH_INTERNAL_ASSERT(lengths_left.device().type() == at::DeviceType::CPU);
+  TORCH_INTERNAL_ASSERT(lengths_right.device().type() == at::DeviceType::CPU);
+  TORCH_INTERNAL_ASSERT(values_left.device().type() == at::DeviceType::CPU);
+  TORCH_INTERNAL_ASSERT(values_right.device().type() == at::DeviceType::CPU);
+  TORCH_CHECK(lengths_left.size(0) == lengths_right.size(0));
+  TORCH_CHECK(values_left.size(1) == values_right.size(1));
+
+  auto B = lengths_left.size(0);
+  auto D = values_left.size(1);
+
+  auto L_out = lengths_left.sum().item<int64_t>();
+
+  auto output = at::empty({L_out, D}, values_left.options());
+
+  if (L_out == 0) {
+    return output;
+  }
+
+  const auto offsets_left =
+      fbgemm_gpu::asynchronous_complete_cumsum_cpu(lengths_left.view({-1}));
+  const auto offsets_right =
+      fbgemm_gpu::asynchronous_complete_cumsum_cpu(lengths_right.view({-1}));
+  const auto output_offsets = offsets_left;
+
+  AT_DISPATCH_INTEGRAL_TYPES(
+      lengths_left.scalar_type(),
+      "replace_last_n_with_jagged_cpu_kernel_input1",
+      [&] {
+        using index_t = scalar_t;
+        AT_DISPATCH_ALL_TYPES_AND2(
+            at::ScalarType::BFloat16,
+            at::ScalarType::Half,
+            values_left.scalar_type(),
+            "replace_last_n_with_jagged_cpu_kernel_input2",
+            [&] {
+              using val_t = scalar_t;
+              _replace_last_n_with_jagged_cpu_kernel<index_t, val_t>(
+                  B,
+                  lengths_left.accessor<index_t, 1>(),
+                  offsets_left.accessor<index_t, 1>(),
+                  values_left.accessor<val_t, 2>(),
+                  lengths_right.accessor<index_t, 1>(),
+                  offsets_right.accessor<index_t, 1>(),
+                  values_right.accessor<val_t, 2>(),
+                  output_offsets.accessor<index_t, 1>(),
+                  output.accessor<val_t, 2>());
+            });
+      });
+
+  return output;
+}
+
+at::Tensor replace_last_n_with_jagged_meta(
+    const at::Tensor& lengths_left,
+    const at::Tensor& values_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& values_right) {
+  auto L_out = lengths_left.sum().item<int64_t>();
+  auto D = values_left.size(1);
+
+  auto output = at::native::empty_meta_symint(
+      {L_out, D},
+      /*dtype=*/::std::make_optional(values_left.scalar_type()),
+      /*layout=*/::std::make_optional(values_left.layout()),
+      /*device=*/::std::make_optional(c10::Device(c10::kMeta)),
+      /*pin_memory=*/::std::nullopt);
+
+  return output;
+}
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/replace_last_n_with_jagged.cu b/recommendation_v4/generative_recommenders/ops/cpp/replace_last_n_with_jagged.cu
new file mode 100644
index 000000000..00a589eb9
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/replace_last_n_with_jagged.cu
@@ -0,0 +1,156 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/Dispatch.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/library.h>
+
+#include "common.h"
+#include "fbgemm_gpu/sparse_ops.h" // @manual
+#include "fbgemm_gpu/utils/fixed_divisor.cuh" // @manual
+
+namespace hstu {
+
+static constexpr int32_t kMaxThreads = 1024;
+
+template <typename index_t, typename val_t>
+__global__
+__launch_bounds__(kMaxThreads) void _replace_last_n_with_jagged_cuda_kernel(
+    int32_t B,
+    int32_t D,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+        lengths_left,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+        offsets_left,
+    const at::PackedTensorAccessor32<val_t, 2, at::RestrictPtrTraits>
+        values_left,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+        lengths_right,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+        offsets_right,
+    const at::PackedTensorAccessor32<val_t, 2, at::RestrictPtrTraits>
+        values_right,
+    at::PackedTensorAccessor32<val_t, 2, at::RestrictPtrTraits> output) {
+  for (auto b = blockIdx.x * blockDim.y + threadIdx.y;
+       b < static_cast<uint32_t>(B);
+       b += gridDim.x * blockDim.y) {
+    auto left_start = offsets_left[b];
+    auto left_len = lengths_left[b];
+    auto right_start = offsets_right[b];
+    auto right_len = lengths_right[b];
+    auto output_start = offsets_left[b];
+    auto keep_len = left_len - right_len;
+
+    for (auto i = threadIdx.x; i < static_cast<uint32_t>(left_len * D);
+         i += blockDim.x) {
+      auto seq_pos = i / D;
+      auto dim_pos = i % D;
+      if (seq_pos < static_cast<uint32_t>(keep_len)) {
+        output[output_start + seq_pos][dim_pos] =
+            values_left[left_start + seq_pos][dim_pos];
+      } else {
+        auto right_idx = seq_pos - keep_len;
+        if (right_idx < static_cast<uint32_t>(right_len)) {
+          output[output_start + seq_pos][dim_pos] =
+              values_right[right_start + right_idx][dim_pos];
+        }
+      }
+    }
+  }
+}
+
+at::Tensor replace_last_n_with_jagged_cuda(
+    const at::Tensor& lengths_left,
+    const at::Tensor& values_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& values_right) {
+  at::cuda::OptionalCUDAGuard device_guard;
+  device_guard.set_index(values_left.get_device());
+  TORCH_INTERNAL_ASSERT(lengths_left.device().type() == at::DeviceType::CUDA);
+  TORCH_INTERNAL_ASSERT(lengths_right.device().type() == at::DeviceType::CUDA);
+  TORCH_INTERNAL_ASSERT(values_left.device().type() == at::DeviceType::CUDA);
+  TORCH_INTERNAL_ASSERT(values_right.device().type() == at::DeviceType::CUDA);
+  TORCH_CHECK(lengths_left.size(0) == lengths_right.size(0));
+  TORCH_CHECK(values_left.size(1) == values_right.size(1));
+
+  auto B = lengths_left.size(0);
+  auto D = values_left.size(1);
+  auto L_out = lengths_left.sum().item<int64_t>();
+  TORCH_CHECK(L_out < std::numeric_limits<int32_t>::max());
+  TORCH_CHECK(values_left.get_device() == lengths_left.get_device());
+  TORCH_CHECK(values_left.get_device() == lengths_right.get_device());
+  TORCH_CHECK(values_left.get_device() == values_right.get_device());
+
+  auto output = at::empty({L_out, D}, values_left.options());
+
+  if (L_out == 0) {
+    return output;
+  }
+
+  const auto offsets_left =
+      fbgemm_gpu::asynchronous_complete_cumsum_gpu(lengths_left.view({-1}));
+  const auto offsets_right =
+      fbgemm_gpu::asynchronous_complete_cumsum_gpu(lengths_right.view({-1}));
+
+  // Optimized thread block configuration based on benchmark results
+  uint32_t B_blocks, threads_x;
+  B_blocks = 4;
+  threads_x = 256;
+
+  dim3 threads(threads_x, B_blocks);
+  auto blocks = div_round_up(B, B_blocks);
+
+  AT_DISPATCH_INTEGRAL_TYPES(
+      lengths_left.scalar_type(),
+      "replace_last_n_with_jagged_cuda_kernel_input1",
+      [&] {
+        using index_t = scalar_t;
+        AT_DISPATCH_ALL_TYPES_AND2(
+            at::ScalarType::BFloat16,
+            at::ScalarType::Half,
+            values_left.scalar_type(),
+            "replace_last_n_with_jagged_cuda_kernel_input2",
+            [&] {
+              using val_t = scalar_t;
+              _replace_last_n_with_jagged_cuda_kernel<index_t, val_t><<<
+                  blocks,
+                  threads,
+                  0,
+                  at::cuda::getCurrentCUDAStream()>>>(
+                  B,
+                  D,
+                  lengths_left
+                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+                  offsets_left
+                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+                  values_left
+                      .packed_accessor32<val_t, 2, at::RestrictPtrTraits>(),
+                  lengths_right
+                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+                  offsets_right
+                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+                  values_right
+                      .packed_accessor32<val_t, 2, at::RestrictPtrTraits>(),
+                  output.packed_accessor32<val_t, 2, at::RestrictPtrTraits>());
+            });
+      });
+
+  return output;
+}
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/setup.py b/recommendation_v4/generative_recommenders/ops/cpp/setup.py
new file mode 100644
index 000000000..2a06a9d05
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/setup.py
@@ -0,0 +1,487 @@
+# pyre-unsafe
+"""
+Modified from https://github.com/Dao-AILab/flash-attention/blob/main/hopper/setup.py
+"""
+
+import itertools
+import os
+import platform
+import subprocess
+import sys
+import sysconfig
+import warnings
+from pathlib import Path
+
+import torch
+from packaging.version import parse, Version
+from setuptools import find_packages, setup
+from torch.utils.cpp_extension import BuildExtension, CUDA_HOME, CUDAExtension
+
+# ninja build does not work unless include_dirs are abs path
+this_dir = os.path.dirname(os.path.abspath(__file__))
+PACKAGE_NAME = "hstu"
+# FORCE_BUILD: Force a fresh build locally, instead of attempting to find prebuilt wheels
+# SKIP_CUDA_BUILD: Intended to allow CI to use a simple `python setup.py sdist` run to copy over raw files, without any cuda compilation
+FORCE_BUILD = os.getenv("FLASH_ATTENTION_FORCE_BUILD", "FALSE") == "TRUE"
+SKIP_CUDA_BUILD = os.getenv("FLASH_ATTENTION_SKIP_CUDA_BUILD", "FALSE") == "TRUE"
+# For CI, we want the option to build with C++11 ABI since the nvcr images use C++11 ABI
+FORCE_CXX11_ABI = os.getenv("FLASH_ATTENTION_FORCE_CXX11_ABI", "FALSE") == "TRUE"
+
+# HACK: we monkey patch pytorch's _write_ninja_file to pass
+# "-gencode arch=compute_sm90a,code=sm_90a" to files ending in '_sm90.cu',
+# and pass "-gencode arch=compute_sm80,code=sm_80" to files ending in '_sm80.cu'
+from torch.utils.cpp_extension import (
+    _is_cuda_file,
+    _join_cuda_home,
+    _join_rocm_home,
+    _maybe_write,
+    COMMON_HIP_FLAGS,
+    get_cxx_compiler,
+    IS_HIP_EXTENSION,
+    IS_WINDOWS,
+    SUBPROCESS_DECODE_ARGS,
+)
+
+DISABLE_BACKWARD = os.getenv("FLASH_ATTENTION_DISABLE_BACKWARD", "FALSE") == "TRUE"
+DISABLE_FP16 = os.getenv("FLASH_ATTENTION_DISABLE_FP16", "TRUE") == "TRUE"
+DISABLE_HDIM64 = os.getenv("FLASH_ATTENTION_DISABLE_HDIM64", "TRUE") == "TRUE"
+DISABLE_HDIM96 = os.getenv("FLASH_ATTENTION_DISABLE_HDIM96", "TRUE") == "TRUE"
+DISABLE_HDIM128 = os.getenv("FLASH_ATTENTION_DISABLE_HDIM128", "FALSE") == "TRUE"
+DISABLE_HDIM192 = os.getenv("FLASH_ATTENTION_DISABLE_HDIM192", "TRUE") == "TRUE"
+DISABLE_HDIM256 = os.getenv("FLASH_ATTENTION_DISABLE_HDIM256", "TRUE") == "TRUE"
+DISABLE_SM8x = os.getenv("FLASH_ATTENTION_DISABLE_SM80", "TRUE") == "TRUE"
+
+
+def _write_ninja_file(
+    path,
+    cflags,
+    post_cflags,
+    cuda_cflags,
+    cuda_post_cflags,
+    cuda_dlink_post_cflags,
+    sources,
+    objects,
+    ldflags,
+    library_target,
+    with_cuda,
+    **kwargs,  # kwargs (ignored) to absorb new flags in torch.utils.cpp_extension
+) -> None:
+    r"""Write a ninja file that does the desired compiling and linking.
+
+    `path`: Where to write this file
+    `cflags`: list of flags to pass to $cxx. Can be None.
+    `post_cflags`: list of flags to append to the $cxx invocation. Can be None.
+    `cuda_cflags`: list of flags to pass to $nvcc. Can be None.
+    `cuda_postflags`: list of flags to append to the $nvcc invocation. Can be None.
+    `sources`: list of paths to source files
+    `objects`: list of desired paths to objects, one per source.
+    `ldflags`: list of flags to pass to linker. Can be None.
+    `library_target`: Name of the output library. Can be None; in that case,
+                      we do no linking.
+    `with_cuda`: If we should be compiling with CUDA.
+    """
+
+    def sanitize_flags(flags):
+        if flags is None:
+            return []
+        else:
+            return [flag.strip() for flag in flags]
+
+    cflags = sanitize_flags(cflags)
+    post_cflags = sanitize_flags(post_cflags)
+    cuda_cflags = sanitize_flags(cuda_cflags)
+    cuda_post_cflags = sanitize_flags(cuda_post_cflags)
+    cuda_dlink_post_cflags = sanitize_flags(cuda_dlink_post_cflags)
+    ldflags = sanitize_flags(ldflags)
+
+    # Sanity checks...
+    assert len(sources) == len(objects)
+    assert len(sources) > 0
+
+    compiler = get_cxx_compiler()
+
+    # Version 1.3 is required for the `deps` directive.
+    config = ["ninja_required_version = 1.3"]
+    config.append(f"cxx = {compiler}")
+    if with_cuda or cuda_dlink_post_cflags:
+        if IS_HIP_EXTENSION:
+            nvcc = _join_rocm_home("bin", "hipcc")
+        else:
+            nvcc = _join_cuda_home("bin", "nvcc")
+        if "PYTORCH_NVCC" in os.environ:
+            nvcc_from_env = os.getenv(
+                "PYTORCH_NVCC"
+            )  # user can set nvcc compiler with ccache using the environment variable here
+        else:
+            nvcc_from_env = nvcc
+        config.append(f"nvcc_from_env = {nvcc_from_env}")
+        config.append(f"nvcc = {nvcc}")
+
+    if IS_HIP_EXTENSION:
+        post_cflags = COMMON_HIP_FLAGS + post_cflags
+    flags = [f"cflags = {' '.join(cflags)}"]
+    flags.append(f"post_cflags = {' '.join(post_cflags)}")
+    if with_cuda:
+        flags.append(f"cuda_cflags = {' '.join(cuda_cflags)}")
+        flags.append(f"cuda_post_cflags = {' '.join(cuda_post_cflags)}")
+        cuda_post_cflags_sm80 = [
+            s if s != "arch=compute_90a,code=sm_90a" else "arch=compute_80,code=sm_80"
+            for s in cuda_post_cflags
+        ]
+        flags.append(f"cuda_post_cflags_sm80 = {' '.join(cuda_post_cflags_sm80)}")
+        cuda_post_cflags_sm80_sm90 = cuda_post_cflags + [
+            "-gencode",
+            "arch=compute_80,code=sm_80",
+        ]
+        flags.append(
+            f"cuda_post_cflags_sm80_sm90 = {' '.join(cuda_post_cflags_sm80_sm90)}"
+        )
+        cuda_post_cflags_sm100 = [
+            s
+            if s != "arch=compute_90a,code=sm_90a"
+            else "arch=compute_100a,code=sm_100a"
+            for s in cuda_post_cflags
+        ]
+        flags.append(f"cuda_post_cflags_sm100 = {' '.join(cuda_post_cflags_sm100)}")
+    flags.append(f"cuda_dlink_post_cflags = {' '.join(cuda_dlink_post_cflags)}")
+    flags.append(f"ldflags = {' '.join(ldflags)}")
+
+    # Turn into absolute paths so we can emit them into the ninja build
+    # file wherever it is.
+    sources = [os.path.abspath(file) for file in sources]
+
+    # See https://ninja-build.org/build.ninja.html for reference.
+    compile_rule = ["rule compile"]
+    if IS_WINDOWS:
+        compile_rule.append(
+            "  command = cl /showIncludes $cflags -c $in /Fo$out $post_cflags"
+        )
+        compile_rule.append("  deps = msvc")
+    else:
+        compile_rule.append(
+            "  command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags"
+        )
+        compile_rule.append("  depfile = $out.d")
+        compile_rule.append("  deps = gcc")
+
+    if with_cuda:
+        cuda_compile_rule = ["rule cuda_compile"]
+        nvcc_gendeps = ""
+        # --generate-dependencies-with-compile is not supported by ROCm
+        # Nvcc flag `--generate-dependencies-with-compile` is not supported by sccache, which may increase build time.
+        if (
+            torch.version.cuda is not None
+            and os.getenv("TORCH_EXTENSION_SKIP_NVCC_GEN_DEPENDENCIES", "0") != "1"
+        ):
+            cuda_compile_rule.append("  depfile = $out.d")
+            cuda_compile_rule.append("  deps = gcc")
+            # Note: non-system deps with nvcc are only supported
+            # on Linux so use --generate-dependencies-with-compile
+            # to make this work on Windows too.
+            nvcc_gendeps = (
+                "--generate-dependencies-with-compile --dependency-output $out.d"
+            )
+        cuda_compile_rule_sm80 = (
+            ["rule cuda_compile_sm80"]
+            + cuda_compile_rule[1:]
+            + [
+                f"  command = $nvcc_from_env {nvcc_gendeps} $cuda_cflags -c $in -o $out $cuda_post_cflags_sm80"
+            ]
+        )
+        cuda_compile_rule_sm80_sm90 = (
+            ["rule cuda_compile_sm80_sm90"]
+            + cuda_compile_rule[1:]
+            + [
+                f"  command = $nvcc_from_env {nvcc_gendeps} $cuda_cflags -c $in -o $out $cuda_post_cflags_sm80_sm90"
+            ]
+        )
+        cuda_compile_rule_sm100 = (
+            ["rule cuda_compile_sm100"]
+            + cuda_compile_rule[1:]
+            + [
+                f"  command = $nvcc_from_env {nvcc_gendeps} $cuda_cflags -c $in -o $out $cuda_post_cflags_sm100"
+            ]
+        )
+        cuda_compile_rule.append(
+            f"  command = $nvcc_from_env {nvcc_gendeps} $cuda_cflags -c $in -o $out $cuda_post_cflags"
+        )
+
+    # Emit one build rule per source to enable incremental build.
+    build = []
+    for source_file, object_file in zip(sources, objects):
+        is_cuda_source = _is_cuda_file(source_file) and with_cuda
+        if is_cuda_source:
+            if source_file.endswith("_sm90.cu"):
+                rule = "cuda_compile"
+            elif source_file.endswith("_sm80.cu"):
+                rule = "cuda_compile_sm80"
+            elif source_file.endswith("_sm100.cu"):
+                rule = "cuda_compile_sm100"
+            else:
+                rule = "cuda_compile_sm80_sm90"
+        else:
+            rule = "compile"
+        if IS_WINDOWS:
+            source_file = source_file.replace(":", "$:")
+            object_file = object_file.replace(":", "$:")
+        source_file = source_file.replace(" ", "$ ")
+        object_file = object_file.replace(" ", "$ ")
+        build.append(f"build {object_file}: {rule} {source_file}")
+
+    if cuda_dlink_post_cflags:
+        devlink_out = os.path.join(os.path.dirname(objects[0]), "dlink.o")
+        devlink_rule = ["rule cuda_devlink"]
+        devlink_rule.append("  command = $nvcc $in -o $out $cuda_dlink_post_cflags")
+        devlink = [f"build {devlink_out}: cuda_devlink {' '.join(objects)}"]
+        objects += [devlink_out]
+    else:
+        devlink_rule, devlink = [], []
+
+    if library_target is not None:
+        link_rule = ["rule link"]
+        if IS_WINDOWS:
+            cl_paths = (
+                subprocess.check_output(["where", "cl"])
+                .decode(*SUBPROCESS_DECODE_ARGS)
+                .split("\r\n")
+            )
+            if len(cl_paths) >= 1:
+                cl_path = os.path.dirname(cl_paths[0]).replace(":", "$:")
+            else:
+                raise RuntimeError("MSVC is required to load C++ extensions")
+            link_rule.append(
+                f'  command = "{cl_path}/link.exe" $in /nologo $ldflags /out:$out'
+            )
+        else:
+            link_rule.append("  command = $cxx $in $ldflags -o $out")
+
+        link = [f"build {library_target}: link {' '.join(objects)}"]
+
+        default = [f"default {library_target}"]
+    else:
+        link_rule, link, default = [], [], []
+
+    # 'Blocks' should be separated by newlines, for visual benefit.
+    blocks = [config, flags, compile_rule]
+    if with_cuda:
+        blocks.append(cuda_compile_rule)  # type: ignore[possibly-undefined]
+        blocks.append(cuda_compile_rule_sm80)  # type: ignore[possibly-undefined]
+        blocks.append(cuda_compile_rule_sm80_sm90)  # type: ignore[possibly-undefined]
+        blocks.append(cuda_compile_rule_sm100)  # type: ignore[possibly-undefined]
+    blocks += [devlink_rule, link_rule, build, devlink, link, default]
+    content = "\n\n".join("\n".join(b) for b in blocks)
+    # Ninja requires a new lines at the end of the .ninja file
+    content += "\n"
+    _maybe_write(path, content)
+
+
+# Monkey patching
+torch.utils.cpp_extension._write_ninja_file = _write_ninja_file
+
+
+def get_platform():
+    """
+    Returns the platform name as used in wheel filenames.
+    """
+    if sys.platform.startswith("linux"):
+        return "linux_x86_64"
+    elif sys.platform == "darwin":
+        mac_version = ".".join(platform.mac_ver()[0].split(".")[:2])
+        return f"macosx_{mac_version}_x86_64"
+    elif sys.platform == "win32":
+        return "win_amd64"
+    else:
+        raise ValueError("Unsupported platform: {}".format(sys.platform))
+
+
+def get_cuda_bare_metal_version(cuda_dir):
+    raw_output = subprocess.check_output(
+        [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True
+    )
+    output = raw_output.split()
+    release_idx = output.index("release") + 1
+    bare_metal_version = parse(output[release_idx].split(",")[0])
+
+    return raw_output, bare_metal_version
+
+
+def check_if_cuda_home_none(global_option: str) -> None:
+    if CUDA_HOME is not None:
+        return
+    # warn instead of error because user could be downloading prebuilt wheels, so nvcc won't be necessary
+    # in that case.
+    warnings.warn(
+        f"{global_option} was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  "
+        "If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, "
+        "only images whose names contain 'devel' will provide nvcc."
+    )
+
+
+def nvcc_threads_args():
+    nvcc_threads = os.getenv("NVCC_THREADS") or "4"
+    return ["--threads", nvcc_threads]
+
+
+exe_extension = sysconfig.get_config_var("EXE")
+
+
+cmdclass = {}
+ext_modules = []
+
+# We want this even if SKIP_CUDA_BUILD because when we run python setup.py sdist we want the .hpp
+# files included in the source distribution, in case the user compiles from source.
+subprocess.run(["git", "submodule", "update", "--init", "cutlass"])
+
+if not SKIP_CUDA_BUILD:
+    print("\n\ntorch.__version__  = {}\n\n".format(torch.__version__))
+    TORCH_MAJOR = int(torch.__version__.split(".")[0])
+    TORCH_MINOR = int(torch.__version__.split(".")[1])
+
+    check_if_cuda_home_none(PACKAGE_NAME)
+    _, bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME)
+    if bare_metal_version < Version("12.3"):
+        raise RuntimeError(
+            f"FlashAttention-3 is only supported on CUDA 12.3 and above, get {bare_metal_version} from {CUDA_HOME}"
+        )
+
+    cc_flag = []
+    cc_flag.append("-gencode")
+    cc_flag.append("arch=compute_90a,code=sm_90a")
+
+    # HACK: The compiler flag -D_GLIBCXX_USE_CXX11_ABI is set to be the same as
+    # torch._C._GLIBCXX_USE_CXX11_ABI
+    # https://github.com/pytorch/pytorch/blob/8472c24e3b5b60150096486616d98b7bea01500b/torch/utils/cpp_extension.py#L920
+    if FORCE_CXX11_ABI:
+        torch._C._GLIBCXX_USE_CXX11_ABI = True
+    repo_dir = Path(this_dir).parent
+    cutlass_dir = repo_dir / "cpp" / "cutlass"
+
+    feature_args = (
+        []
+        + ["-DOSS_ENV"]
+        + (["-DFLASHATTENTION_DISABLE_BACKWARD"] if DISABLE_BACKWARD else [])
+        + (["-DFLASHATTENTION_DISABLE_FP16"] if DISABLE_FP16 else [])
+        + ["-DFLASHATTENTION_DISABLE_FP8"]
+        + (["-DFLASHATTENTION_DISABLE_HDIM64"] if DISABLE_HDIM64 else [])
+        + (["-DFLASHATTENTION_DISABLE_HDIM96"] if DISABLE_HDIM96 else [])
+        + (["-DFLASHATTENTION_DISABLE_HDIM128"] if DISABLE_HDIM128 else [])
+        + (["-DFLASHATTENTION_DISABLE_HDIM192"] if DISABLE_HDIM192 else [])
+        + (["-DFLASHATTENTION_DISABLE_HDIM256"] if DISABLE_HDIM256 else [])
+        + (["-DFLASHATTENTION_DISABLE_SM8x"] if DISABLE_SM8x else [])
+    )
+
+    DTYPE = ["bf16"] + (["fp16"] if not DISABLE_FP16 else [])
+    HEAD_DIMENSIONS = (
+        []
+        + ([64] if not DISABLE_HDIM64 else [])
+        + ([96] if not DISABLE_HDIM96 else [])
+        + ([128] if not DISABLE_HDIM128 else [])
+        + ([192] if not DISABLE_HDIM192 else [])
+        + ([256] if not DISABLE_HDIM256 else [])
+    )
+    sources_fwd_sm80 = [
+        f"hstu_attention/instantiations/flash_fwd_hdim{hdim}_{dtype}_sm80.cu"
+        for hdim, dtype in itertools.product(HEAD_DIMENSIONS, DTYPE)
+    ]
+    sources_bwd_sm80 = [
+        f"hstu_attention/instantiations/flash_bwd_hdim{hdim}_{dtype}_sm80.cu"
+        for hdim, dtype in itertools.product(HEAD_DIMENSIONS, DTYPE)
+    ]
+    sources_fwd_sm90 = [
+        f"hstu_attention/instantiations/flash_fwd_hdim{hdim}_{dtype}_sm90.cu"
+        for hdim, dtype in itertools.product(HEAD_DIMENSIONS, DTYPE)
+    ]
+    sources_bwd_sm90 = [
+        f"hstu_attention/instantiations/flash_bwd_hdim{hdim}_{dtype}_sm90.cu"
+        for hdim, dtype in itertools.product(HEAD_DIMENSIONS, DTYPE)
+    ]
+    if DISABLE_BACKWARD:
+        sources_bwd_sm90 = []
+        sources_bwd_sm80 = []
+    sources = (
+        [
+            "hstu_attention/flash_api.cpp",
+            "hstu_attention/flash_common.cpp",
+            "hstu_attention/flash_cpu_dummy.cpp",
+            "hstu_attention/flash_meta.cpp",
+        ]
+        + (sources_fwd_sm80 if not DISABLE_SM8x else [])
+        + sources_fwd_sm90
+        + (sources_bwd_sm80 if not DISABLE_SM8x else [])
+        + sources_bwd_sm90
+    )
+    nvcc_flags = [
+        "-O3",
+        "-std=c++17",
+        "--ftemplate-backtrace-limit=0",  # To debug template code
+        "--use_fast_math",
+        # "--keep",
+        # "--ptxas-options=--verbose,--register-usage-level=5,--warn-on-local-memory-usage",  # printing out number of registers
+        "--resource-usage",  # printing out number of registers
+        # f"--split-compile={os.getenv('NVCC_THREADS', '4')}",  # split-compile is faster
+        "-lineinfo",  # TODO: disable this for release to reduce binary size
+        "-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED",  # Necessary for the WGMMA shapes that we use
+        "-DCUTLASS_ENABLE_GDC_FOR_SM90",  # For PDL
+        "-DCUTLASS_DEBUG_TRACE_LEVEL=0",  # Can toggle for debugging
+        "-DNDEBUG",  # Important, otherwise performance is severely impacted
+        "-Xfatbin",  # compress all binary sections
+        "-compress-all",
+    ]
+    if get_platform() == "win_amd64":
+        nvcc_flags.extend(
+            [
+                "-D_USE_MATH_DEFINES",  # for M_LN2
+                "-Xcompiler=/Zc:__cplusplus",  # sets __cplusplus correctly, CUTLASS_CONSTEXPR_IF_CXX17 needed for cutlass::gcd
+            ]
+        )
+    include_dirs = [
+        Path(this_dir),
+        cutlass_dir / "include",
+    ]
+
+    ext_modules.append(
+        CUDAExtension(
+            name=f"{PACKAGE_NAME}._C",
+            sources=sources,
+            extra_compile_args={
+                "cxx": ["-O3", "-std=c++17", "-DPy_LIMITED_API=0x03090000"]
+                + feature_args,
+                "nvcc": nvcc_threads_args() + nvcc_flags + cc_flag + feature_args,
+            },
+            include_dirs=include_dirs,
+            py_limited_api=True,
+        )
+    )
+
+
+setup(
+    name=PACKAGE_NAME,
+    version="0.1.0",
+    packages=find_packages(
+        exclude=(
+            "build",
+            "csrc",
+            "include",
+            "tests",
+            "dist",
+            "docs",
+            "benchmarks",
+        )
+    ),
+    py_modules=["cuda_hstu_attention"],
+    description="FlashAttention HSTU",
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: Apache Software License",
+        "Operating System :: Unix",
+    ],
+    ext_modules=ext_modules,
+    cmdclass={"build_ext": BuildExtension},
+    python_requires=">=3.8",
+    install_requires=[
+        "torch",
+        "einops",
+        "packaging",
+        "ninja==1.11.1.1",
+    ],
+)
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda.cpp b/recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda.cpp
new file mode 100644
index 000000000..3925aefd7
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda.cpp
@@ -0,0 +1,40 @@
+#include "common.h"
+#include "sort_kv_pairs_cuda_kernels_template.h"
+
+namespace hstu {
+
+DLL_PUBLIC std::tuple<at::Tensor, at::Tensor> sort_kv_pairs_cuda(
+    const at::Tensor& keys,
+    const at::Tensor& values,
+    const std::optional<int64_t>& end_bit,
+    const bool descending = false) {
+  at::cuda::OptionalCUDAGuard device_guard;
+  device_guard.set_index(keys.get_device());
+  TORCH_CHECK(
+      keys.dtype() == at::kInt || keys.dtype() == at::kLong ||
+      keys.dtype() == at::kByte || keys.dtype() == at::kShort);
+  TORCH_CHECK(keys.numel() < std::numeric_limits<int32_t>::max());
+  TORCH_CHECK(keys.dim() == 1);
+  TORCH_CHECK(values.dim() == 1);
+  at::Tensor sorted_keys;
+  at::Tensor sorted_values;
+
+  AT_DISPATCH_INTEGRAL_TYPES(keys.scalar_type(), "sort_pairs_cuda_input1", [&] {
+    using key_t = scalar_t;
+    AT_DISPATCH_ALL_TYPES_AND2(
+        at::ScalarType::Half,
+        at::ScalarType::BFloat16,
+        values.scalar_type(),
+        "sort_pairs_cuda_input2",
+        [&] {
+          using val_t = scalar_t;
+          std::tie(sorted_keys, sorted_values) =
+              sort_kv_pairs_cuda_dispatched<key_t, val_t>(
+                  keys, values, end_bit, descending);
+        });
+  });
+
+  return {std::move(sorted_keys), std::move(sorted_values)};
+}
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda_kernels_template.cu b/recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda_kernels_template.cu
new file mode 100644
index 000000000..8cd175c71
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda_kernels_template.cu
@@ -0,0 +1,82 @@
+#include <generative_recommenders/ops/cpp/common.h>
+#include <generative_recommenders/ops/cpp/sort_kv_pairs_cuda_kernels_template.h>
+
+#include <cub/device/device_radix_sort.cuh>
+
+namespace hstu {
+
+template <>
+DLL_PUBLIC std::tuple<at::Tensor, at::Tensor>
+sort_kv_pairs_cuda_dispatched<SUB_KEY_T, SUB_VALUE_T>(
+    const at::Tensor& keys,
+    const at::Tensor& values,
+    const std::optional<int64_t>& end_bit,
+    const bool descending) {
+  size_t temp_storage_bytes = 0;
+  auto keys_contig = keys.contiguous();
+  auto values_contig = values.contiguous();
+  auto sorted_keys = at::empty_like(keys_contig);
+  auto sorted_values = at::empty_like(values_contig);
+
+  if (descending) {
+    AT_CUDA_CHECK(
+        cub::DeviceRadixSort::SortPairsDescending(
+            nullptr,
+            temp_storage_bytes,
+            keys_contig.data_ptr<SUB_KEY_T>(),
+            sorted_keys.data_ptr<SUB_KEY_T>(),
+            values_contig.data_ptr<SUB_VALUE_T>(),
+            sorted_values.data_ptr<SUB_VALUE_T>(),
+            keys_contig.numel(),
+            0,
+            end_bit.has_value() ? end_bit.value() : sizeof(SUB_KEY_T) * 8,
+            at::cuda::getCurrentCUDAStream()));
+    auto temp_storage = at::empty(
+        {static_cast<int64_t>(temp_storage_bytes)},
+        keys_contig.options().dtype(at::kByte));
+    AT_CUDA_CHECK(
+        cub::DeviceRadixSort::SortPairsDescending(
+            temp_storage.data_ptr(),
+            temp_storage_bytes,
+            keys_contig.data_ptr<SUB_KEY_T>(),
+            sorted_keys.data_ptr<SUB_KEY_T>(),
+            values_contig.data_ptr<SUB_VALUE_T>(),
+            sorted_values.data_ptr<SUB_VALUE_T>(),
+            keys_contig.numel(),
+            0,
+            end_bit.has_value() ? end_bit.value() : sizeof(SUB_KEY_T) * 8,
+            at::cuda::getCurrentCUDAStream()));
+  } else {
+    AT_CUDA_CHECK(
+        cub::DeviceRadixSort::SortPairs(
+            nullptr,
+            temp_storage_bytes,
+            keys_contig.data_ptr<SUB_KEY_T>(),
+            sorted_keys.data_ptr<SUB_KEY_T>(),
+            values_contig.data_ptr<SUB_VALUE_T>(),
+            sorted_values.data_ptr<SUB_VALUE_T>(),
+            keys_contig.numel(),
+            0,
+            end_bit.has_value() ? end_bit.value() : sizeof(SUB_KEY_T) * 8,
+            at::cuda::getCurrentCUDAStream()));
+    auto temp_storage = at::empty(
+        {static_cast<int64_t>(temp_storage_bytes)},
+        keys_contig.options().dtype(at::kByte));
+    AT_CUDA_CHECK(
+        cub::DeviceRadixSort::SortPairs(
+            temp_storage.data_ptr(),
+            temp_storage_bytes,
+            keys_contig.data_ptr<SUB_KEY_T>(),
+            sorted_keys.data_ptr<SUB_KEY_T>(),
+            values_contig.data_ptr<SUB_VALUE_T>(),
+            sorted_values.data_ptr<SUB_VALUE_T>(),
+            keys_contig.numel(),
+            0,
+            end_bit.has_value() ? end_bit.value() : sizeof(SUB_KEY_T) * 8,
+            at::cuda::getCurrentCUDAStream()));
+  }
+
+  return {std::move(sorted_keys), std::move(sorted_values)};
+}
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda_kernels_template.h b/recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda_kernels_template.h
new file mode 100644
index 000000000..e599eccb0
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda_kernels_template.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <ATen/ATen.h>
+#include <optional>
+
+namespace hstu {
+
+template <typename key_t, typename val_t>
+std::tuple<at::Tensor, at::Tensor> sort_kv_pairs_cuda_dispatched(
+    const at::Tensor& keys_contig,
+    const at::Tensor& values_contig,
+    const std::optional<int64_t>& end_bit,
+    const bool descending);
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/split_1d_jagged_jagged.cpp b/recommendation_v4/generative_recommenders/ops/cpp/split_1d_jagged_jagged.cpp
new file mode 100644
index 000000000..c361488fa
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/split_1d_jagged_jagged.cpp
@@ -0,0 +1,136 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/Dispatch.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/extension.h>
+#include <torch/library.h>
+
+#include "fbgemm_gpu/sparse_ops.h" // @manual
+
+namespace hstu {
+
+template <typename index_t, typename val_t>
+void _split_1d_jagged_jagged_cpu_kernel(
+    int32_t B,
+    const at::TensorAccessor<index_t, 1>& combined_offsets,
+    const at::TensorAccessor<val_t, 1>& combined_values,
+    const at::TensorAccessor<index_t, 1>& lengths_left,
+    const at::TensorAccessor<index_t, 1>& offsets_left,
+    const at::TensorAccessor<index_t, 1>& offsets_right,
+    at::TensorAccessor<val_t, 1> values_left,
+    at::TensorAccessor<val_t, 1> values_right) {
+  for (auto b : c10::irange(B)) {
+    auto combined_start = combined_offsets[b];
+    auto left_len = lengths_left[b];
+    auto left_start = offsets_left[b];
+    auto right_start = offsets_right[b];
+
+    for (auto i = 0; i < left_len; ++i) {
+      values_left[left_start + i] = combined_values[combined_start + i];
+    }
+
+    auto right_len = combined_offsets[b + 1] - combined_offsets[b] - left_len;
+    for (auto i = 0; i < right_len; ++i) {
+      values_right[right_start + i] =
+          combined_values[combined_start + left_len + i];
+    }
+  }
+}
+
+std::tuple<at::Tensor, at::Tensor> split_1d_jagged_jagged_cpu(
+    const at::Tensor& lengths_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& combined_values) {
+  TORCH_INTERNAL_ASSERT(lengths_left.device().type() == at::DeviceType::CPU);
+  TORCH_INTERNAL_ASSERT(lengths_right.device().type() == at::DeviceType::CPU);
+  TORCH_INTERNAL_ASSERT(combined_values.device().type() == at::DeviceType::CPU);
+  TORCH_CHECK(lengths_left.size(0) == lengths_right.size(0));
+  auto B = lengths_left.size(0);
+
+  auto L_left = lengths_left.sum().item<int64_t>();
+  auto L_right = lengths_right.sum().item<int64_t>();
+  TORCH_CHECK(L_left + L_right == combined_values.numel());
+
+  auto values_left = at::empty({L_left}, combined_values.options());
+  auto values_right = at::empty({L_right}, combined_values.options());
+
+  if (L_left == 0 && L_right == 0) {
+    return std::make_tuple(values_left, values_right);
+  }
+
+  const auto combined_lengths = lengths_left + lengths_right;
+  const auto combined_offsets =
+      fbgemm_gpu::asynchronous_complete_cumsum_cpu(combined_lengths.view({-1}));
+  const auto offsets_left =
+      fbgemm_gpu::asynchronous_complete_cumsum_cpu(lengths_left.view({-1}));
+  const auto offsets_right =
+      fbgemm_gpu::asynchronous_complete_cumsum_cpu(lengths_right.view({-1}));
+
+  AT_DISPATCH_INTEGRAL_TYPES(
+      lengths_left.scalar_type(),
+      "split_1d_jagged_jagged_values_cpu_kernel_input1",
+      [&] {
+        using index_t = scalar_t;
+        AT_DISPATCH_ALL_TYPES_AND2(
+            at::ScalarType::BFloat16,
+            at::ScalarType::Half,
+            combined_values.scalar_type(),
+            "split_1d_jagged_jagged_values_cpu_kernel_input2",
+            [&] {
+              using val_t = scalar_t;
+              _split_1d_jagged_jagged_cpu_kernel<index_t, val_t>(
+                  B,
+                  combined_offsets.accessor<index_t, 1>(),
+                  combined_values.accessor<val_t, 1>(),
+                  lengths_left.accessor<index_t, 1>(),
+                  offsets_left.accessor<index_t, 1>(),
+                  offsets_right.accessor<index_t, 1>(),
+                  values_left.accessor<val_t, 1>(),
+                  values_right.accessor<val_t, 1>());
+            });
+      });
+
+  return std::make_tuple(values_left, values_right);
+}
+
+std::tuple<at::Tensor, at::Tensor> split_1d_jagged_jagged_meta(
+    const at::Tensor& lengths_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& combined_values) {
+  auto L_left = lengths_left.sum().item<int64_t>();
+  auto L_right = lengths_right.sum().item<int64_t>();
+
+  auto values_left = at::native::empty_meta_symint(
+      {L_left},
+      /*dtype=*/::std::make_optional(combined_values.scalar_type()),
+      /*layout=*/::std::make_optional(combined_values.layout()),
+      /*device=*/::std::make_optional(c10::Device(c10::kMeta)),
+      /*pin_memory=*/::std::nullopt);
+
+  auto values_right = at::native::empty_meta_symint(
+      {L_right},
+      /*dtype=*/::std::make_optional(combined_values.scalar_type()),
+      /*layout=*/::std::make_optional(combined_values.layout()),
+      /*device=*/::std::make_optional(c10::Device(c10::kMeta)),
+      /*pin_memory=*/::std::nullopt);
+
+  return std::make_tuple(values_left, values_right);
+}
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/split_1d_jagged_jagged.cu b/recommendation_v4/generative_recommenders/ops/cpp/split_1d_jagged_jagged.cu
new file mode 100644
index 000000000..181489bae
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/split_1d_jagged_jagged.cu
@@ -0,0 +1,147 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/Dispatch.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/library.h>
+
+#include "common.h"
+#include "fbgemm_gpu/sparse_ops.h" // @manual
+#include "fbgemm_gpu/utils/fixed_divisor.cuh" // @manual
+
+namespace hstu {
+
+static constexpr int32_t kMaxThreads = 1024;
+
+template <typename index_t, typename val_t>
+__global__
+__launch_bounds__(kMaxThreads) void _split_1d_jagged_jagged_cuda_kernel(
+    int32_t B,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+        combined_offsets,
+    const at::PackedTensorAccessor32<val_t, 1, at::RestrictPtrTraits>
+        combined_values,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+        lengths_left,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+        offsets_left,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+        offsets_right,
+    at::PackedTensorAccessor32<val_t, 1, at::RestrictPtrTraits> values_left,
+    at::PackedTensorAccessor32<val_t, 1, at::RestrictPtrTraits> values_right) {
+  for (auto b = blockIdx.x * blockDim.y + threadIdx.y;
+       b < static_cast<uint32_t>(B);
+       b += gridDim.x * blockDim.y) {
+    auto combined_start = combined_offsets[b];
+    auto left_len = lengths_left[b];
+    auto right_len = combined_offsets[b + 1] - combined_offsets[b] - left_len;
+    auto left_start = offsets_left[b];
+    auto right_start = offsets_right[b];
+
+    for (auto i = threadIdx.x; i < static_cast<uint32_t>(left_len + right_len);
+         i += blockDim.x) {
+      if (i < static_cast<uint32_t>(left_len)) {
+        values_left[left_start + i] = combined_values[combined_start + i];
+      } else {
+        values_right[right_start + i - left_len] =
+            combined_values[combined_start + i];
+      }
+    }
+  }
+}
+
+std::tuple<at::Tensor, at::Tensor> split_1d_jagged_jagged_cuda(
+    const at::Tensor& lengths_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& combined_values) {
+  at::cuda::OptionalCUDAGuard device_guard;
+  device_guard.set_index(combined_values.get_device());
+  TORCH_INTERNAL_ASSERT(lengths_left.device().type() == at::DeviceType::CUDA);
+  TORCH_INTERNAL_ASSERT(lengths_right.device().type() == at::DeviceType::CUDA);
+  TORCH_INTERNAL_ASSERT(
+      combined_values.device().type() == at::DeviceType::CUDA);
+  TORCH_CHECK(lengths_left.size(0) == lengths_right.size(0));
+
+  auto B = lengths_left.size(0);
+  auto L_left = lengths_left.sum().item<int64_t>();
+  auto L_right = lengths_right.sum().item<int64_t>();
+  TORCH_CHECK(L_left + L_right == combined_values.numel());
+  TORCH_CHECK(L_left < std::numeric_limits<int32_t>::max());
+  TORCH_CHECK(L_right < std::numeric_limits<int32_t>::max());
+  TORCH_CHECK(combined_values.get_device() == lengths_left.get_device());
+  TORCH_CHECK(combined_values.get_device() == lengths_right.get_device());
+
+  auto values_left = at::empty({L_left}, combined_values.options());
+  auto values_right = at::empty({L_right}, combined_values.options());
+
+  if (L_left == 0 && L_right == 0) {
+    return std::make_tuple(values_left, values_right);
+  }
+
+  const auto combined_lengths = lengths_left + lengths_right;
+  const auto combined_offsets =
+      fbgemm_gpu::asynchronous_complete_cumsum_gpu(combined_lengths.view({-1}));
+  const auto offsets_left =
+      fbgemm_gpu::asynchronous_complete_cumsum_gpu(lengths_left.view({-1}));
+  const auto offsets_right =
+      fbgemm_gpu::asynchronous_complete_cumsum_gpu(lengths_right.view({-1}));
+
+  // Optimized thread block configuration based on benchmark results
+  uint32_t B_blocks = 4;
+  dim3 threads(256, B_blocks);
+  auto blocks = div_round_up(B, B_blocks);
+
+  AT_DISPATCH_INTEGRAL_TYPES(
+      lengths_left.scalar_type(),
+      "split_1d_jagged_jagged_values_cuda_kernel_input1",
+      [&] {
+        using index_t = scalar_t;
+        AT_DISPATCH_ALL_TYPES_AND2(
+            at::ScalarType::BFloat16,
+            at::ScalarType::Half,
+            combined_values.scalar_type(),
+            "split_1d_jagged_jagged_values_cuda_kernel_input2",
+            [&] {
+              using val_t = scalar_t;
+              _split_1d_jagged_jagged_cuda_kernel<index_t, val_t><<<
+                  blocks,
+                  threads,
+                  0,
+                  at::cuda::getCurrentCUDAStream()>>>(
+                  B,
+                  combined_offsets
+                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+                  combined_values
+                      .packed_accessor32<val_t, 1, at::RestrictPtrTraits>(),
+                  lengths_left
+                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+                  offsets_left
+                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+                  offsets_right
+                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+                  values_left
+                      .packed_accessor32<val_t, 1, at::RestrictPtrTraits>(),
+                  values_right
+                      .packed_accessor32<val_t, 1, at::RestrictPtrTraits>());
+            });
+      });
+
+  return std::make_tuple(values_left, values_right);
+}
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/tests/concat_1d_jagged_jagged_test.py b/recommendation_v4/generative_recommenders/ops/cpp/tests/concat_1d_jagged_jagged_test.py
new file mode 100644
index 000000000..8c27a787b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/tests/concat_1d_jagged_jagged_test.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import unittest
+
+import torch
+from generative_recommenders.common import gpu_unavailable
+from hammer.ops.jagged import concat_1D_jagged_jagged
+from hypothesis import given, settings, strategies as st, Verbosity
+
+# buck2 test @mode/opt -c fbcode.nvcc_arch=h100 fbcode//generative_recommenders/ops/cpp/tests:concat_1d_jagged_jagged_test
+
+torch.ops.load_library("//generative_recommenders/ops/cpp:cpp_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+
+
+class OpsTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(10, 500),
+        max_seq_len_left=st.integers(10, 1000),
+        max_seq_len_right=st.integers(10, 1000),
+        val_dtype=st.sampled_from([torch.float32, torch.float16, torch.bfloat16]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=100,
+        deadline=None,
+    )
+    def test_concat_1d_jagged_jagged(
+        self,
+        batch_size: int,
+        max_seq_len_left: int,
+        max_seq_len_right: int,
+        val_dtype: torch.dtype,
+    ) -> None:
+        batch_size = 3
+        max_seq_len_left = 4
+        max_seq_len_right = 2
+        lengths_left = torch.randint(
+            0, max_seq_len_left + 1, (batch_size,), device="cpu"
+        )
+        values_left = torch.rand(
+            (int(torch.sum(lengths_left).cpu().item()),), dtype=val_dtype, device="cpu"
+        )
+        offsets_left = torch.zeros(
+            (batch_size + 1,),
+            dtype=lengths_left.dtype,
+            device=lengths_left.device,
+        )
+        offsets_left[1:] = torch.cumsum(lengths_left.view(-1), dim=0)
+        lengths_right = torch.randint(
+            0, max_seq_len_right + 1, (batch_size,), device="cpu"
+        )
+        values_right = torch.rand(
+            (int(torch.sum(lengths_right).cpu().item()),), dtype=val_dtype, device="cpu"
+        )
+        offsets_right = torch.zeros(
+            (batch_size + 1,),
+            dtype=lengths_right.dtype,
+            device=lengths_right.device,
+        )
+        offsets_right[1:] = torch.cumsum(lengths_right.view(-1), dim=0)
+        custom_cpu_result = torch.ops.hstu.concat_1d_jagged_jagged(
+            lengths_left=lengths_left,
+            values_left=values_left,
+            lengths_right=lengths_right,
+            values_right=values_right,
+        )
+
+        custom_cuda_result = torch.ops.hstu.concat_1d_jagged_jagged(
+            lengths_left=lengths_left.cuda(),
+            values_left=values_left.cuda(),
+            lengths_right=lengths_right.cuda(),
+            values_right=values_right.cuda(),
+        )
+        torch.testing.assert_close(custom_cuda_result.cpu(), custom_cpu_result)
+
+    @unittest.skipIf(*gpu_unavailable)
+    def test_concat_1d_jagged_jagged_vs_hammer(self) -> None:
+        torch.manual_seed(42)
+        batch_size = 8
+        max_seq_len_left = 50
+        max_seq_len_right = 30
+
+        lengths_left = torch.randint(
+            0, max_seq_len_left + 1, (batch_size,), dtype=torch.int32
+        )
+        lengths_right = torch.randint(
+            0, max_seq_len_right + 1, (batch_size,), dtype=torch.int32
+        )
+
+        total_left = int(lengths_left.sum().item())
+        total_right = int(lengths_right.sum().item())
+
+        values_left = (
+            torch.randn(total_left, dtype=torch.float32)
+            if total_left > 0
+            else torch.empty(0, dtype=torch.float32)
+        )
+        values_right = (
+            torch.randn(total_right, dtype=torch.float32)
+            if total_right > 0
+            else torch.empty(0, dtype=torch.float32)
+        )
+
+        offsets_left = torch.zeros(
+            (batch_size + 1,), dtype=lengths_left.dtype, device=lengths_left.device
+        )
+        offsets_left[1:] = torch.cumsum(lengths_left.view(-1), dim=0)
+        offsets_right = torch.zeros(
+            (batch_size + 1,), dtype=lengths_right.dtype, device=lengths_right.device
+        )
+        offsets_right[1:] = torch.cumsum(lengths_right.view(-1), dim=0)
+
+        combined_values_ref = concat_1D_jagged_jagged(
+            max_seq_len_left=max_seq_len_left,
+            offsets_left=offsets_left,
+            values_left=values_left,
+            max_seq_len_right=max_seq_len_right,
+            offsets_right=offsets_right,
+            values_right=values_right,
+        )
+
+        custom_cuda_result = torch.ops.hstu.concat_1d_jagged_jagged(
+            lengths_left=lengths_left.cuda(),
+            values_left=values_left.cuda(),
+            lengths_right=lengths_right.cuda(),
+            values_right=values_right.cuda(),
+        )
+
+        torch.testing.assert_close(custom_cuda_result.cpu(), combined_values_ref)
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/tests/hstu_mha_cpu_test.py b/recommendation_v4/generative_recommenders/ops/cpp/tests/hstu_mha_cpu_test.py
new file mode 100644
index 000000000..cb787ea61
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/tests/hstu_mha_cpu_test.py
@@ -0,0 +1,39 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+# pyre-strict
+
+# cmd: buck2 run @//mode/opt -c fbcode.enable_gpu_sections=true -c fbcode.platform010_cuda_version=12.8 -c fbcode.nvcc_arch=b200a  //generative_recommenders/ops/cpp/tests:hstu_mha_cpu_test
+
+import unittest
+
+import torch
+
+torch.ops.load_library(
+    "//generative_recommenders/ops/cpp/hstu_attention:hstu_flash_attention"
+)
+
+
+class TestHstuMhaFwd(unittest.TestCase):
+    def test_hstu_mha_fwd(self) -> None:
+        q: torch.Tensor = torch.randn([100, 4, 64], dtype=torch.bfloat16, device="cpu")
+        k: torch.Tensor = torch.randn([100, 4, 64], dtype=torch.bfloat16, device="cpu")
+        v: torch.Tensor = torch.randn([100, 4, 64], dtype=torch.bfloat16, device="cpu")
+        res = torch.ops.hstu.hstu_mha_fwd(
+            10,
+            0.25,
+            q,
+            k,
+            v,
+            torch.empty([0], dtype=torch.int32, device="cpu"),
+            True,  # causal
+            None,
+            None,
+            0,
+            0,
+            0,
+            None,  # q_descale
+            None,  # k_descale
+            None,  # v_descale
+            0,  # sm_margin
+        )
+        self.assertIsNotNone(res)
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/tests/jagged_transpose_1d_test.py b/recommendation_v4/generative_recommenders/ops/cpp/tests/jagged_transpose_1d_test.py
new file mode 100644
index 000000000..6a5f5997b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/tests/jagged_transpose_1d_test.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import unittest
+
+import torch
+from generative_recommenders.common import gpu_unavailable
+from hammer.ops.jagged import jagged_transpose_1D
+from hypothesis import given, settings, strategies as st, Verbosity
+
+# buck2 test @mode/opt -c fbcode.nvcc_arch=h100 fbcode//generative_recommenders/ops/cpp/tests:jagged_transpose_1d_test
+
+torch.ops.load_library("//generative_recommenders/ops/cpp:cpp_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+
+
+class OpsTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        size1=st.integers(2, 10),
+        size2=st.integers(2, 10),
+        max_len=st.integers(5, 50),
+        val_dtype=st.sampled_from([torch.float32, torch.float16, torch.bfloat16]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=100,
+        deadline=None,
+    )
+    def test_jagged_transpose_1d(
+        self,
+        size1: int,
+        size2: int,
+        max_len: int,
+        val_dtype: torch.dtype,
+    ) -> None:
+        lengths = torch.randint(
+            0, max_len + 1, (size1 * size2,), dtype=torch.int32, device="cpu"
+        )
+        offsets = torch.zeros(
+            (size1 * size2 + 1,), dtype=lengths.dtype, device=lengths.device
+        )
+        offsets[1:] = torch.cumsum(lengths.view(-1), dim=0)
+
+        values = torch.randn(int(offsets[-1].item()), dtype=val_dtype, device="cpu")
+
+        (
+            custom_cpu_values,
+            custom_cpu_offsets,
+            custom_cpu_lengths,
+        ) = torch.ops.hstu.jagged_transpose_1d(
+            values=values,
+            offsets=offsets,
+            lengths=lengths,
+            max_len=max_len,
+            size1=size1,
+            size2=size2,
+        )
+
+        (
+            custom_cuda_values,
+            custom_cuda_offsets,
+            custom_cuda_lengths,
+        ) = torch.ops.hstu.jagged_transpose_1d(
+            values=values.cuda(),
+            offsets=offsets.cuda(),
+            lengths=lengths.cuda(),
+            max_len=max_len,
+            size1=size1,
+            size2=size2,
+        )
+
+        torch.testing.assert_close(custom_cuda_values.cpu(), custom_cpu_values)
+        torch.testing.assert_close(custom_cuda_offsets.cpu(), custom_cpu_offsets)
+        torch.testing.assert_close(custom_cuda_lengths.cpu(), custom_cpu_lengths)
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        size1=st.integers(2, 10),
+        size2=st.integers(2, 10),
+        max_len=st.integers(5, 50),
+        val_dtype=st.sampled_from([torch.float32, torch.float16, torch.bfloat16]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=100,
+        deadline=None,
+    )
+    def test_jagged_transpose_1d_vs_hammer(
+        self,
+        size1: int,
+        size2: int,
+        max_len: int,
+        val_dtype: torch.dtype,
+    ) -> None:
+        lengths = torch.randint(0, max_len + 1, (size1 * size2,), dtype=torch.int32)
+        offsets = torch.zeros(
+            (size1 * size2 + 1,), dtype=lengths.dtype, device=lengths.device
+        )
+        offsets[1:] = torch.cumsum(lengths.view(-1), dim=0)
+
+        values = torch.randn(int(offsets[-1].item()), dtype=val_dtype)
+
+        values_ref, offsets_ref, lengths_ref = jagged_transpose_1D(
+            values=values,
+            offsets=offsets,
+            lengths=lengths,
+            max_len=max_len,
+            size1=size1,
+            size2=size2,
+        )
+
+        (
+            custom_cuda_values,
+            custom_cuda_offsets,
+            custom_cuda_lengths,
+        ) = torch.ops.hstu.jagged_transpose_1d(
+            values=values.cuda(),
+            offsets=offsets.cuda(),
+            lengths=lengths.cuda(),
+            max_len=max_len,
+            size1=size1,
+            size2=size2,
+        )
+
+        torch.testing.assert_close(custom_cuda_values.cpu(), values_ref)
+        torch.testing.assert_close(custom_cuda_offsets.cpu(), offsets_ref)
+        torch.testing.assert_close(custom_cuda_lengths.cpu(), lengths_ref)
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/tests/replace_last_n_with_jagged_test.py b/recommendation_v4/generative_recommenders/ops/cpp/tests/replace_last_n_with_jagged_test.py
new file mode 100644
index 000000000..9826f199d
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/tests/replace_last_n_with_jagged_test.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import unittest
+
+import torch
+from generative_recommenders.common import gpu_unavailable
+from hammer.ops.jagged import replace_last_n_with_jagged
+
+# buck2 test @mode/opt -c fbcode.nvcc_arch=h100 fbcode//generative_recommenders/ops/cpp/tests:replace_last_n_with_jagged_test
+
+torch.ops.load_library("//generative_recommenders/ops/cpp:cpp_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+
+
+class OpsTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    def test_replace_last_n_with_jagged(self) -> None:
+        torch.manual_seed(42)
+        batch_size = 8
+        embedding_dim = 64
+        max_seq_len_left = 25
+        max_seq_len_right = 10
+
+        lengths_left = torch.randint(
+            max_seq_len_right, max_seq_len_left + 1, (batch_size,), dtype=torch.int32
+        )
+        lengths_right = torch.randint(
+            1, max_seq_len_right + 1, (batch_size,), dtype=torch.int32
+        )
+
+        lengths_right = torch.min(lengths_right, lengths_left)
+
+        total_left = int(lengths_left.sum().item())
+        total_right = int(lengths_right.sum().item())
+
+        values_left = torch.randn(total_left, embedding_dim, dtype=torch.float32)
+        values_right = torch.randn(total_right, embedding_dim, dtype=torch.float32)
+
+        custom_cpu_result = torch.ops.hstu.replace_last_n_with_jagged(
+            lengths_left=lengths_left,
+            values_left=values_left,
+            lengths_right=lengths_right,
+            values_right=values_right,
+        )
+
+        custom_cuda_result = torch.ops.hstu.replace_last_n_with_jagged(
+            lengths_left=lengths_left.cuda(),
+            values_left=values_left.cuda(),
+            lengths_right=lengths_right.cuda(),
+            values_right=values_right.cuda(),
+        )
+
+        torch.testing.assert_close(custom_cuda_result.cpu(), custom_cpu_result)
+
+    @unittest.skipIf(*gpu_unavailable)
+    def test_replace_last_n_with_jagged_vs_hammer(self) -> None:
+        torch.manual_seed(42)
+        batch_size = 8
+        embedding_dim = 32
+        max_seq_len_left = 20
+        max_seq_len_right = 8
+
+        lengths_left = torch.randint(
+            max_seq_len_right, max_seq_len_left + 1, (batch_size,), dtype=torch.int32
+        )
+        lengths_right = torch.randint(
+            1, max_seq_len_right + 1, (batch_size,), dtype=torch.int32
+        )
+
+        lengths_right = torch.min(lengths_right, lengths_left)
+
+        total_left = int(lengths_left.sum().item())
+        total_right = int(lengths_right.sum().item())
+
+        values_left = torch.randn(total_left, embedding_dim, dtype=torch.float32)
+        values_right = torch.randn(total_right, embedding_dim, dtype=torch.float32)
+
+        offsets_left = torch.zeros(
+            (batch_size + 1,), dtype=lengths_left.dtype, device=lengths_left.device
+        )
+        offsets_left[1:] = torch.cumsum(lengths_left.view(-1), dim=0)
+        offsets_right = torch.zeros(
+            (batch_size + 1,), dtype=lengths_right.dtype, device=lengths_right.device
+        )
+        offsets_right[1:] = torch.cumsum(lengths_right.view(-1), dim=0)
+
+        result_ref = replace_last_n_with_jagged(
+            max_seq_len_left=max_seq_len_left,
+            offsets_left=offsets_left,
+            values_left=values_left,
+            offsets_right=offsets_right,
+            values_right=values_right,
+        )
+
+        custom_cuda_result = torch.ops.hstu.replace_last_n_with_jagged(
+            lengths_left=lengths_left.cuda(),
+            values_left=values_left.cuda(),
+            lengths_right=lengths_right.cuda(),
+            values_right=values_right.cuda(),
+        )
+
+        torch.testing.assert_close(custom_cuda_result.cpu(), result_ref)
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/tests/split_1d_jagged_jagged_test.py b/recommendation_v4/generative_recommenders/ops/cpp/tests/split_1d_jagged_jagged_test.py
new file mode 100644
index 000000000..24f12c4a2
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/tests/split_1d_jagged_jagged_test.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import unittest
+
+import torch
+from generative_recommenders.common import gpu_unavailable
+from hammer.ops.jagged import split_1D_jagged_jagged
+
+# buck2 test @mode/opt -c fbcode.nvcc_arch=h100 fbcode//generative_recommenders/ops/cpp/tests:split_1d_jagged_jagged_test
+
+torch.ops.load_library("//generative_recommenders/ops/cpp:cpp_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+
+
+class OpsTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    def test_split_1d_jagged_jagged(self) -> None:
+        torch.manual_seed(42)
+        batch_size = 8
+        max_seq_len_left = 25
+        max_seq_len_right = 20
+
+        lengths_left = torch.randint(
+            0, max_seq_len_left + 1, (batch_size,), dtype=torch.int32
+        )
+        lengths_right = torch.randint(
+            0, max_seq_len_right + 1, (batch_size,), dtype=torch.int32
+        )
+
+        combined_lengths = lengths_left + lengths_right
+        combined_offsets = torch.zeros(
+            (batch_size + 1,), dtype=lengths_left.dtype, device=lengths_left.device
+        )
+        combined_offsets[1:] = torch.cumsum(combined_lengths.view(-1), dim=0)
+
+        combined_values = torch.randn(
+            int(combined_offsets[-1].item()), dtype=torch.float32
+        )
+
+        custom_cpu_left, custom_cpu_right = torch.ops.hstu.split_1d_jagged_jagged(
+            lengths_left=lengths_left,
+            lengths_right=lengths_right,
+            combined_values=combined_values,
+        )
+
+        custom_cuda_left, custom_cuda_right = torch.ops.hstu.split_1d_jagged_jagged(
+            lengths_left=lengths_left.cuda(),
+            lengths_right=lengths_right.cuda(),
+            combined_values=combined_values.cuda(),
+        )
+
+        torch.testing.assert_close(custom_cuda_left.cpu(), custom_cpu_left)
+        torch.testing.assert_close(custom_cuda_right.cpu(), custom_cpu_right)
+
+    @unittest.skipIf(*gpu_unavailable)
+    def test_split_1d_jagged_jagged_vs_hammer(self) -> None:
+        torch.manual_seed(42)
+        batch_size = 8
+        max_seq_len_left = 25
+        max_seq_len_right = 20
+
+        lengths_left = torch.randint(
+            0, max_seq_len_left + 1, (batch_size,), dtype=torch.int32
+        )
+        lengths_right = torch.randint(
+            0, max_seq_len_right + 1, (batch_size,), dtype=torch.int32
+        )
+
+        offsets_left = torch.zeros(
+            (batch_size + 1,), dtype=lengths_left.dtype, device=lengths_left.device
+        )
+        offsets_left[1:] = torch.cumsum(lengths_left.view(-1), dim=0)
+        offsets_right = torch.zeros(
+            (batch_size + 1,), dtype=lengths_right.dtype, device=lengths_right.device
+        )
+        offsets_right[1:] = torch.cumsum(lengths_right.view(-1), dim=0)
+
+        combined_offsets = offsets_left + offsets_right
+        combined_values = torch.randn(
+            int(combined_offsets[-1].item()), dtype=torch.float32
+        )
+
+        left_ref, right_ref = split_1D_jagged_jagged(
+            max_seq_len=max_seq_len_left + max_seq_len_right,
+            values=combined_values,
+            offsets_left=offsets_left,
+            offsets_right=offsets_right,
+        )
+
+        custom_cuda_left, custom_cuda_right = torch.ops.hstu.split_1d_jagged_jagged(
+            lengths_left=lengths_left.cuda(),
+            lengths_right=lengths_right.cuda(),
+            combined_values=combined_values.cuda(),
+        )
+
+        torch.testing.assert_close(custom_cuda_left.cpu(), left_ref)
+        torch.testing.assert_close(custom_cuda_right.cpu(), right_ref)
diff --git a/recommendation_v4/generative_recommenders/ops/hstu_attention.py b/recommendation_v4/generative_recommenders/ops/hstu_attention.py
new file mode 100644
index 000000000..137482227
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/hstu_attention.py
@@ -0,0 +1,353 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from typing import Optional
+
+import torch
+from generative_recommenders.common import HammerKernel, switch_to_contiguous_if_needed
+from generative_recommenders.ops.pytorch.pt_hstu_attention import (
+    pytorch_cached_hstu_mha,
+    pytorch_hstu_mha,
+)
+from generative_recommenders.ops.triton.triton_hstu_attention import (
+    triton_cached_hstu_mha,
+    triton_hstu_mha,
+)
+
+try:
+    # @manual=//generative_recommenders/ops/triton_aot:triton_ragged_hstu_attention
+    from generative_recommenders.ops.triton_aot.triton_ragged_hstu_attention import (  # pyre-ignore[21]
+        aot_triton_kernel_wrapper_cached_hstu_mha,
+        aot_triton_kernel_wrapper_ragged_hstu_mha,
+    )
+except ImportError:
+
+    def aot_triton_kernel_wrapper_cached_hstu_mha(
+        *args: object,
+        **kwargs: object,
+    ) -> torch.Tensor:
+        raise ImportError(
+            "AOT-T is required for the TRITON_INFERENCE cached_hstu_mha kernel."
+        )
+
+    def aot_triton_kernel_wrapper_ragged_hstu_mha(
+        *args: object,
+        **kwargs: object,
+    ) -> torch.Tensor:
+        raise ImportError(
+            "AOT-T is required for the TRITON_INFERENCE ragged_hstu_mha kernel."
+        )
+
+
+try:
+    from hammer.ops.triton.cc.hstu_attention.triton_cc_hstu_attention import (
+        triton_cc_hstu_mha,
+    )
+    from hammer.v2.ops.triton.template.tlx_bw_hstu_attention import (
+        tlx_bw_hstu_mha_wrapper,
+    )
+except ImportError:
+    tlx_bw_hstu_mha_wrapper = None
+    from generative_recommenders.ops.triton.triton_hstu_attention import (
+        triton_hstu_mha as triton_cc_hstu_mha,
+    )
+from torch.fx._symbolic_trace import is_fx_tracing
+
+torch.fx.wrap("triton_hstu_mha")
+torch.fx.wrap("triton_cached_hstu_mha")
+
+
+@torch.fx.wrap
+def hstu_mha_cuda(
+    max_seq_len: int,
+    alpha: float,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    num_targets: Optional[torch.Tensor] = None,
+    max_attn_len: int = 0,
+    contextual_seq_len: int = 0,
+) -> torch.Tensor:
+    """TorchScript-friendly inference forwarder onto ``torch.ops.hstu.hstu_mha``.
+
+    Bypasses the ``HammerKernel`` enum dispatch in :func:`hstu_mha` so the
+    scripted graph has a single concrete C++ op to call. Mirrors the
+    inference-only path of
+    :func:`generative_recommenders.ops.cpp.cuda_hstu_attention.cuda_hstu_mha_inference_wrapper`
+    with the subset of arguments :class:`STULayer` actually uses.
+    """
+    return torch.ops.hstu.hstu_mha(
+        max_seq_len,
+        alpha,
+        q,
+        k,
+        v,
+        seq_offsets,
+        True,  # causal
+        num_targets,
+        None,  # attn_scale
+        max_attn_len,
+        0,  # min_full_attn_seq_len
+        contextual_seq_len,
+        None,  # q_descale
+        None,  # k_descale
+        None,  # v_descale
+        False,  # sort_by_length
+        False,  # deterministic
+        0,  # sm_margin
+        0,  # max_q_len
+        None,  # seq_offsets_q
+        0,  # num_softmax_heads
+        False,  # training
+        None,  # max_seq_len_tensor
+        None,  # contextual_seq_len_tensor
+        None,  # max_attn_len_tensor
+        None,  # min_full_attn_seq_len_tensor
+        1,  # num_groups
+    )
+
+
+def hstu_mha(
+    max_seq_len: int,
+    alpha: float,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    causal: bool = True,
+    dropout_pr: float = 0.0,
+    training: bool = True,
+    num_targets: Optional[torch.Tensor] = None,
+    attn_scale: Optional[torch.Tensor] = None,
+    max_attn_len: int = 0,
+    contextual_seq_len: int = 0,
+    min_full_attn_seq_len: int = 0,
+    sort_by_length: bool = False,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+    enable_tma: bool = False,
+) -> torch.Tensor:
+    _, H, _ = q.shape
+    if not is_fx_tracing():
+        torch._assert(max_seq_len > 0, "max_seq_len must be larger than 0")
+        torch._assert(q.dim() == 3, "q must be 3-D")
+        torch._assert(k.shape == q.shape, "k must be the same shape as q")
+        torch._assert(v.dim() == 3, "v must be 3-D")
+        torch._assert(v.shape[0] == q.shape[0], "wrong v shape[0]")
+        torch._assert(v.shape[1] == H, "wrong v shape[1]")
+        torch._assert(causal, "only support causal attention")
+
+    if kernel in [
+        HammerKernel.TRITON,
+        HammerKernel.TLX,
+        HammerKernel.TRITON_CC,
+        HammerKernel.TRITON_INFERENCE,
+    ]:
+        if not is_fx_tracing() and kernel == HammerKernel.TRITON:
+            torch._assert(q.is_cuda, "q must be CUDA tensor")
+            torch._assert(k.is_cuda, "k must be CUDA tensor")
+            torch._assert(v.is_cuda, "v must be CUDA tensor")
+            torch._assert(seq_offsets.is_cuda, "seq_offsets must be CUDA tensor")
+            torch._assert(dropout_pr < 1e-6, "dropout for triton path not implemented")
+            torch._assert(
+                min_full_attn_seq_len == 0, "min_full_attn_seq_len not implemented"
+            )
+        assert attn_scale is None, "attn_scale not implemented"
+        q = switch_to_contiguous_if_needed(q)
+        k = switch_to_contiguous_if_needed(k)
+        v = switch_to_contiguous_if_needed(v)
+        seq_offsets = seq_offsets.contiguous()
+
+    if kernel == HammerKernel.TRITON:
+        return triton_hstu_mha(
+            N=max_seq_len,
+            alpha=alpha,
+            q=q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            sort_by_length=sort_by_length,
+            enable_tma=enable_tma,
+        )
+    elif kernel == HammerKernel.TLX:
+        if tlx_bw_hstu_mha_wrapper is None:
+            raise ImportError(
+                "hammer.v2 is required for the TLX kernel. "
+                "Falling back to TRITON or PYTORCH kernel instead."
+            )
+        return tlx_bw_hstu_mha_wrapper(
+            max_seq_len=max_seq_len,
+            alpha=alpha,
+            q=q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            attn_scale=torch.tensor(1.0 / max_seq_len, device=q.device),
+            num_targets=num_targets,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            sort_by_length=sort_by_length,
+        )
+    elif kernel == HammerKernel.TRITON_CC:
+        return triton_cc_hstu_mha(
+            N=max_seq_len,
+            alpha=alpha,
+            q=q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+        )
+    elif kernel == HammerKernel.TRITON_INFERENCE:
+        return aot_triton_kernel_wrapper_ragged_hstu_mha(
+            N=max_seq_len,
+            alpha=alpha,
+            q=q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            invalid_attn_mask_type="causal",
+            num_targets=num_targets,
+            attn_scale=attn_scale,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            full_attn_size=min_full_attn_seq_len,
+            num_softmax_heads=0,
+        )
+    else:
+        return pytorch_hstu_mha(
+            max_seq_len=max_seq_len,
+            alpha=alpha,
+            q=q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            causal=True,
+            dropout_pr=dropout_pr,
+            training=training,
+            num_targets=num_targets,
+            attn_scale=attn_scale,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            min_full_attn_seq_len=min_full_attn_seq_len,
+        )
+
+
+def delta_hstu_mha(
+    max_seq_len: int,
+    alpha: float,
+    delta_q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    num_targets: Optional[torch.Tensor] = None,
+    max_attn_len: int = 0,
+    contextual_seq_len: int = 0,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+    enable_tma: bool = False,
+) -> torch.Tensor:
+    L, H, D = delta_q.shape
+    B = seq_offsets.size(0) - 1
+    DeltaSize = L // B
+    if not is_fx_tracing():
+        torch._assert(max_seq_len > 0, "max_seq_len must be larger than 0")
+        torch._assert(delta_q.dim() == 3, "delta_q must be 3-D")
+        torch._assert(L % B == 0, "delta_q must be padded")
+        torch._assert(k.dim() == 3, "k must be 3-D")
+        torch._assert(k.shape[1] == H, "wrong k shape[1]")
+        torch._assert(k.shape[2] == D, "wrong k shape[2]")
+        torch._assert(v.dim() == 3, "v must be 3-D")
+        torch._assert(v.shape[1] == H, "wrong v shape[1]")
+    if kernel in [
+        HammerKernel.TRITON,
+        HammerKernel.TRITON_CC,
+        HammerKernel.TRITON_INFERENCE,
+    ]:
+        if not is_fx_tracing() and kernel == HammerKernel.TRITON:
+            torch._assert(delta_q.is_cuda, "q must be CUDA tensor")
+            torch._assert(seq_offsets.is_cuda, "seq_offsets must be CUDA tensor")
+            if num_targets is not None:
+                torch._assert(num_targets.is_cuda, "num_targets must be CUDA tensor")
+        seq_offsets = seq_offsets.contiguous()
+        delta_q = switch_to_contiguous_if_needed(delta_q)
+        k = switch_to_contiguous_if_needed(k)
+        v = switch_to_contiguous_if_needed(v)
+
+    if kernel == HammerKernel.TRITON:
+        return triton_cached_hstu_mha(
+            N=max_seq_len,
+            alpha=alpha,
+            delta_q=delta_q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            enable_tma=enable_tma,
+        )
+    elif kernel == HammerKernel.TRITON_CC:
+        return triton_cc_hstu_mha(
+            N=max_seq_len,
+            alpha=alpha,
+            q=delta_q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            is_delta_q=True,
+            delta_size=DeltaSize,
+        )
+    elif kernel == HammerKernel.TRITON_INFERENCE:
+        delta_x_offsets = torch.arange(
+            0,
+            L + 1,
+            DeltaSize,
+            device=delta_q.device,
+            dtype=seq_offsets.dtype,
+        )
+        return aot_triton_kernel_wrapper_cached_hstu_mha(
+            N=max_seq_len,
+            alpha=alpha,
+            delta_q=delta_q,
+            k=k,
+            v=v,
+            delta_x_offsets=delta_x_offsets,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            attn_scale=None,
+            max_attn_len=max_attn_len,
+            full_attn_size=0,
+        )
+    else:
+        return pytorch_cached_hstu_mha(
+            max_seq_len=max_seq_len,
+            alpha=alpha,
+            delta_q=delta_q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/hstu_compute.py b/recommendation_v4/generative_recommenders/ops/hstu_compute.py
new file mode 100644
index 000000000..7728c1454
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/hstu_compute.py
@@ -0,0 +1,390 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from typing import Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from generative_recommenders.ops.layer_norm import layer_norm
+from generative_recommenders.ops.mm import addmm
+from generative_recommenders.ops.pytorch.pt_hstu_linear import (
+    pytorch_hstu_compute_output,
+)
+
+try:
+    from hammer.ops.triton.cc.addmm.triton_cc_addmm import triton_cc_addmm
+    from hammer.ops.triton.cc.group_norm_mul_dropout.triton_cc_group_norm_mul_dropout import (
+        triton_cc_group_norm_mul_dropout_wrapper,
+    )
+    from hammer.ops.triton.cc.layer_norm_mul_dropout.triton_cc_layer_norm_mul_dropout import (
+        triton_cc_layer_norm_mul_dropout_wrapper,
+    )
+except ImportError:
+    triton_cc_addmm = None
+    triton_cc_group_norm_mul_dropout_wrapper = None
+    triton_cc_layer_norm_mul_dropout_wrapper = None
+from generative_recommenders.common import HammerKernel
+from generative_recommenders.ops.hstu_attention import hstu_mha, hstu_mha_cuda
+from generative_recommenders.ops.triton.triton_hstu_linear import (
+    triton_hstu_compute_output,
+)
+from generative_recommenders.ops.triton.triton_hstu_preprocess_and_attention import (
+    triton_hstu_preprocess_and_attention,
+)
+from torch.fx._symbolic_trace import is_fx_tracing
+
+try:
+    # @manual=//generative_recommenders/ops/triton_aot:triton_group_norm_mul_dropout
+    from generative_recommenders.ops.triton_aot.triton_group_norm_mul_dropout import (  # pyre-ignore[21]
+        aot_triton_kernel_wrapper_group_norm_mul_dropout,
+    )
+
+    # @manual=//generative_recommenders/ops/triton_aot:triton_layer_norm_mul_dropout
+    from generative_recommenders.ops.triton_aot.triton_layer_norm_mul_dropout import (  # pyre-ignore[21]
+        aot_triton_kernel_wrapper_layer_norm_mul_dropout,
+    )
+except ImportError:
+
+    def aot_triton_kernel_wrapper_group_norm_mul_dropout(
+        *args: object,
+        **kwargs: object,
+    ) -> torch.Tensor:
+        raise ImportError(
+            "AOT-T is required for the TRITON_INFERENCE group_norm_mul_dropout kernel."
+        )
+
+    def aot_triton_kernel_wrapper_layer_norm_mul_dropout(
+        *args: object,
+        **kwargs: object,
+    ) -> torch.Tensor:
+        raise ImportError(
+            "AOT-T is required for the TRITON_INFERENCE layer_norm_mul_dropout kernel."
+        )
+
+
+torch.fx.wrap("triton_hstu_compute_output")
+
+
+def hstu_compute_uqvk(
+    x: torch.Tensor,
+    norm_weight: torch.Tensor,
+    norm_bias: torch.Tensor,
+    norm_eps: float,
+    num_heads: int,
+    attn_dim: int,
+    hidden_dim: int,
+    uvqk_weight: torch.Tensor,
+    uvqk_bias: torch.Tensor,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    if torch.jit.is_scripting():
+        # Script-mode fast path: pure PyTorch, no HammerKernel dispatch.
+        normed_x = F.layer_norm(
+            x,
+            normalized_shape=(x.shape[-1],),
+            weight=norm_weight,
+            bias=norm_bias,
+            eps=norm_eps,
+        )
+        uvqk = torch.addmm(uvqk_bias, normed_x, uvqk_weight)
+    else:
+        normed_x = layer_norm(
+            x,
+            weight=norm_weight,
+            bias=norm_bias,
+            eps=norm_eps,
+            kernel=kernel,
+        )
+        # NOTE: for AMD training, we go with torch.addmm instead of the triton
+        # version before Triton on AMD achieves on-par perf with NV GPU.
+        if torch.version.hip and kernel == HammerKernel.TRITON:
+            uvqk = torch.addmm(uvqk_bias, normed_x, uvqk_weight)
+        else:
+            uvqk = addmm(uvqk_bias, normed_x, uvqk_weight, kernel)
+    u, v, q, k = torch.split(
+        uvqk,
+        [
+            hidden_dim * num_heads,
+            hidden_dim * num_heads,
+            attn_dim * num_heads,
+            attn_dim * num_heads,
+        ],
+        dim=1,
+    )
+    u = F.silu(u)
+    q = q.view(-1, num_heads, attn_dim)
+    k = k.view(-1, num_heads, attn_dim)
+    v = v.view(-1, num_heads, hidden_dim)
+    return u, q, k, v
+
+
+def hstu_compute_output(
+    attn: torch.Tensor,
+    u: torch.Tensor,
+    x: torch.Tensor,
+    norm_weight: torch.Tensor,
+    norm_bias: torch.Tensor,
+    norm_eps: float,
+    output_weight: torch.Tensor,
+    num_heads: int,
+    linear_dim: int,
+    dropout_ratio: float,
+    training: bool,
+    concat_u: bool,
+    concat_x: bool,
+    mul_u_activation_type: str,
+    group_norm: bool,
+    recompute_y_in_backward: bool,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> torch.Tensor:
+    if torch.jit.is_scripting():
+        return pytorch_hstu_compute_output(
+            attn=attn,
+            u=u,
+            x=x,
+            norm_weight=norm_weight,
+            norm_bias=norm_bias,
+            output_weight=output_weight,
+            eps=norm_eps,
+            dropout_ratio=dropout_ratio,
+            training=training,
+            concat_u=concat_u,
+            concat_x=concat_x,
+            mul_u_activation_type=mul_u_activation_type,
+            group_norm=group_norm,
+            num_heads=num_heads,
+            linear_dim=linear_dim,
+        )
+    if kernel == HammerKernel.TRITON:
+        return triton_hstu_compute_output(
+            attn=attn,
+            u=u,
+            x=x,
+            norm_weight=norm_weight,
+            norm_bias=norm_bias,
+            output_weight=output_weight,
+            eps=norm_eps,
+            dropout_ratio=dropout_ratio,
+            training=training,
+            concat_u=concat_u,
+            concat_x=concat_x,
+            mul_u_activation_type=mul_u_activation_type,
+            group_norm=group_norm,
+            num_heads=num_heads,
+            linear_dim=linear_dim,
+            seed=None,
+            recompute_y_in_backward=recompute_y_in_backward,
+        )
+    elif kernel == HammerKernel.TRITON_INFERENCE:
+        if group_norm:
+            y = aot_triton_kernel_wrapper_group_norm_mul_dropout(
+                x=attn,
+                u=u,
+                weight=norm_weight,
+                bias=norm_bias,
+                eps=norm_eps,
+                silu_u=mul_u_activation_type == "silu",
+                concat_ux=concat_u and concat_x,
+                num_heads=num_heads,
+                linear_dim=linear_dim,
+            )
+        else:
+            y = aot_triton_kernel_wrapper_layer_norm_mul_dropout(
+                x=attn,
+                u=u,
+                weight=norm_weight,
+                bias=norm_bias,
+                eps=norm_eps,
+                dropout_ratio=dropout_ratio,
+                training=training,
+                silu_u=mul_u_activation_type == "silu",
+                concat_ux=concat_u and concat_x,
+                mul_u_activation_type=mul_u_activation_type,
+            )
+        return addmm(x, y, output_weight, kernel)
+    elif kernel == HammerKernel.TRITON_CC:
+        if triton_cc_group_norm_mul_dropout_wrapper is None or triton_cc_addmm is None:
+            raise ImportError(
+                "hammer is required for the TRITON_CC kernel in hstu_compute_output."
+            )
+        if group_norm:
+            y = triton_cc_group_norm_mul_dropout_wrapper(
+                x=attn,
+                u=u,
+                weight=norm_weight,
+                bias=norm_bias,
+                eps=norm_eps,
+                dropout_ratio=dropout_ratio,
+                training=training,
+                concat_ux=concat_u and concat_x,
+                num_heads=num_heads,
+                linear_dim=linear_dim,
+            )
+        else:
+            y = triton_cc_layer_norm_mul_dropout_wrapper(
+                x=attn,
+                u=u,
+                weight=norm_weight,
+                bias=norm_bias,
+                eps=norm_eps,
+                dropout_ratio=dropout_ratio,
+                training=training,
+                concat_u=concat_u,
+                concat_x=concat_x,
+                mul_u_activation_type=mul_u_activation_type,
+            )
+        return triton_cc_addmm(x, y, output_weight)
+    else:
+        return pytorch_hstu_compute_output(
+            attn=attn,
+            u=u,
+            x=x,
+            norm_weight=norm_weight,
+            norm_bias=norm_bias,
+            output_weight=output_weight,
+            eps=norm_eps,
+            dropout_ratio=dropout_ratio,
+            training=training,
+            concat_u=concat_u,
+            concat_x=concat_x,
+            mul_u_activation_type=mul_u_activation_type,
+            group_norm=group_norm,
+            num_heads=num_heads,
+            linear_dim=linear_dim,
+        )
+
+
+def hstu_preprocess_and_attention(
+    x: torch.Tensor,
+    norm_weight: torch.Tensor,
+    norm_bias: torch.Tensor,
+    norm_eps: float,
+    num_heads: int,
+    attn_dim: int,
+    hidden_dim: int,
+    uvqk_weight: torch.Tensor,
+    uvqk_bias: torch.Tensor,
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    attn_alpha: float,
+    causal: bool,
+    num_targets: Optional[torch.Tensor],
+    max_attn_len: int,
+    contextual_seq_len: int,
+    recompute_uvqk_in_backward: bool,
+    recompute_normed_x_in_backward: bool,
+    sort_by_length: bool,
+    prefill: bool = False,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+    enable_tma: Optional[bool] = None,
+) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+    if not is_fx_tracing():
+        torch._assert(max_seq_len > 0, "max_seq_len must be larger than 0")
+        torch._assert(x.dim() == 2, "x must be 2-D")
+        torch._assert(
+            x.shape[1] == uvqk_weight.shape[0],
+            "x.shape[1] must equal uvqk_weight.shape[0]",
+        )
+        torch._assert(
+            uvqk_weight.shape[1] == 2 * num_heads * (hidden_dim + attn_dim),
+            "uvqk_weight.shape[1] must equal 2 * num_heads * (hidden_dim + attn_dim)",
+        )
+        torch._assert(causal is True, "only causal attention is supported.")
+    if torch.jit.is_scripting():
+        # Script-mode: compute uvqk via PyTorch fallback then call the
+        # libtorch-callable CUDA HSTU MHA op directly. Avoids both the
+        # HammerKernel enum dispatch and the Triton-only fused path.
+        u, q, k, v = hstu_compute_uqvk(
+            x=x,
+            norm_weight=norm_weight,
+            norm_bias=norm_bias,
+            norm_eps=norm_eps,
+            num_heads=num_heads,
+            attn_dim=attn_dim,
+            hidden_dim=hidden_dim,
+            uvqk_weight=uvqk_weight,
+            uvqk_bias=uvqk_bias,
+            kernel=HammerKernel.PYTORCH,
+        )
+        attn_output = hstu_mha_cuda(
+            max_seq_len=max_seq_len,
+            alpha=attn_alpha,
+            q=q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+        ).view(-1, hidden_dim * num_heads)
+        return u, attn_output, k, v
+    if kernel == HammerKernel.TRITON and prefill is False:
+        u, attn_output = triton_hstu_preprocess_and_attention(
+            x=x,
+            norm_weight=norm_weight,
+            norm_bias=norm_bias,
+            norm_eps=norm_eps,
+            num_heads=num_heads,
+            attn_dim=attn_dim,
+            hidden_dim=hidden_dim,
+            uvqk_weight=uvqk_weight,
+            uvqk_bias=uvqk_bias,
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            attn_alpha=attn_alpha,
+            num_targets=num_targets,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            recompute_uvqk_in_backward=recompute_uvqk_in_backward,
+            recompute_normed_x_in_backward=recompute_normed_x_in_backward,
+            sort_by_length=sort_by_length,
+            enable_tma=enable_tma,
+        )
+        attn_output = attn_output.view(-1, hidden_dim * num_heads)
+        k = None
+        v = None
+    else:
+        u, q, k, v = hstu_compute_uqvk(
+            x=x,
+            norm_weight=norm_weight,
+            norm_bias=norm_bias,
+            norm_eps=norm_eps,
+            num_heads=num_heads,
+            attn_dim=attn_dim,
+            hidden_dim=hidden_dim,
+            uvqk_weight=uvqk_weight,
+            uvqk_bias=uvqk_bias,
+            kernel=kernel,
+        )
+        attn_output = hstu_mha(
+            max_seq_len=max_seq_len,
+            alpha=attn_alpha,
+            q=q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            causal=causal,
+            dropout_pr=0.0,
+            training=False,
+            num_targets=num_targets,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            sort_by_length=sort_by_length,
+            kernel=kernel,
+        ).view(-1, hidden_dim * num_heads)
+    return u, attn_output, k, v
diff --git a/recommendation_v4/generative_recommenders/ops/jagged_tensors.py b/recommendation_v4/generative_recommenders/ops/jagged_tensors.py
new file mode 100644
index 000000000..73e3c4a73
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/jagged_tensors.py
@@ -0,0 +1,451 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from typing import Optional, Tuple
+
+import torch
+from generative_recommenders.common import HammerKernel
+from generative_recommenders.ops.pytorch.pt_jagged import pytorch_jagged_dense_bmm_add
+from generative_recommenders.ops.pytorch.pt_jagged_tensors import (
+    pytorch_concat_2D_jagged,
+    pytorch_hstu_concat_l2_embeddings,
+    pytorch_hstu_split_l2_embeddings,
+    pytorch_split_2D_jagged,
+)
+from generative_recommenders.ops.triton.triton_jagged import triton_jagged_dense_bmm_add
+from generative_recommenders.ops.triton.triton_jagged_tensors import (
+    triton_concat_2D_jagged,
+    triton_concat_2D_jagged_multirow,
+    triton_split_2D_jagged,
+    triton_split_2D_jagged_multirow,
+)
+from torch.fx._symbolic_trace import is_fx_tracing
+
+try:
+    # @manual=//generative_recommenders/ops/triton_aot:triton_concat_2d_jagged
+    from generative_recommenders.ops.triton_aot.triton_concat_2d_jagged import (  # pyre-ignore[21]
+        aot_triton_kernel_wrapper_concat_2D_jagged,
+    )
+
+    # @manual=//generative_recommenders/ops/triton_aot:triton_split_2d_jagged
+    from generative_recommenders.ops.triton_aot.triton_split_2d_jagged import (  # pyre-ignore[21]
+        aot_triton_kernel_wrapper_split_2D_jagged,
+    )
+except ImportError:
+
+    def aot_triton_kernel_wrapper_concat_2D_jagged(
+        *args: object,
+        **kwargs: object,
+    ) -> torch.Tensor:
+        raise ImportError(
+            "AOT-T is required for the TRITON_INFERENCE concat_2D_jagged kernel."
+        )
+
+    def aot_triton_kernel_wrapper_split_2D_jagged(
+        *args: object,
+        **kwargs: object,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        raise ImportError(
+            "AOT-T is required for the TRITON_INFERENCE split_2D_jagged kernel."
+        )
+
+
+torch.fx.wrap("triton_jagged_dense_bmm_add")
+
+try:
+    from hammer.ops.triton.cc.jagged_dense_bmm.triton_cc_jagged_dense_bmm import (
+        triton_cc_jagged_dense_bmm,
+    )
+except ImportError:
+    triton_cc_jagged_dense_bmm = None
+
+
+torch.fx.wrap("triton_concat_2D_jagged")
+torch.fx.wrap("triton_split_2D_jagged")
+torch.fx.wrap("triton_concat_2D_jagged_multirow")
+torch.fx.wrap("triton_split_2D_jagged_multirow")
+
+
+def concat_2D_jagged(
+    max_seq_len: int,
+    values_left: torch.Tensor,
+    values_right: torch.Tensor,
+    max_len_left: Optional[int] = None,
+    max_len_right: Optional[int] = None,
+    offsets_left: Optional[torch.Tensor] = None,
+    offsets_right: Optional[torch.Tensor] = None,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> torch.Tensor:
+    if torch.jit.is_scripting():
+        return pytorch_concat_2D_jagged(
+            values_left=values_left,
+            values_right=values_right,
+            max_len_left=max_len_left,
+            max_len_right=max_len_right,
+            offsets_left=offsets_left,
+            offsets_right=offsets_right,
+        )
+    if not is_fx_tracing():
+        torch._assert(values_left.dim() == 2, "values_left must be 2D")
+        torch._assert(values_right.dim() == 2, "values_right must be 2D")
+        torch._assert(
+            values_right.shape[1] == values_left.shape[1],
+            f"values_left shape[1] must be equal to values_right shape[1] {values_left.shape[1]} vs {values_right.shape[1]}",
+        )
+    if kernel == HammerKernel.TRITON:
+        return triton_concat_2D_jagged(
+            max_seq_len=max_seq_len,
+            values_left=values_left,
+            values_right=values_right,
+            max_len_left=max_len_left,
+            max_len_right=max_len_right,
+            offsets_left=offsets_left,
+            offsets_right=offsets_right,
+        )
+    elif kernel == HammerKernel.TRITON_INFERENCE:
+        aott_values_left = values_left
+        aott_values_right = values_right
+        if offsets_left is None:
+            assert max_len_left is not None
+            aott_values_left = values_left.reshape(
+                -1,
+                max_len_left,
+                values_left.shape[-1],
+            )
+        if offsets_right is None:
+            assert max_len_right is not None
+            aott_values_right = values_right.reshape(
+                -1,
+                max_len_right,
+                values_right.shape[-1],
+            )
+        return aot_triton_kernel_wrapper_concat_2D_jagged(
+            max_seq_len=max_seq_len,
+            values_a=aott_values_left,
+            values_b=aott_values_right,
+            offsets_a=offsets_left,
+            offsets_b=offsets_right,
+        )
+    else:
+        return pytorch_concat_2D_jagged(
+            values_left=values_left,
+            values_right=values_right,
+            max_len_left=max_len_left,
+            max_len_right=max_len_right,
+            offsets_left=offsets_left,
+            offsets_right=offsets_right,
+        )
+
+
+def split_2D_jagged(
+    max_seq_len: int,
+    values: torch.Tensor,
+    total_len_left: Optional[int] = None,
+    total_len_right: Optional[int] = None,
+    max_len_left: Optional[int] = None,
+    max_len_right: Optional[int] = None,
+    offsets_left: Optional[torch.Tensor] = None,
+    offsets_right: Optional[torch.Tensor] = None,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    if torch.jit.is_scripting():
+        return pytorch_split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=values,
+            max_len_left=max_len_left,
+            max_len_right=max_len_right,
+            offsets_left=offsets_left,
+            offsets_right=offsets_right,
+        )
+    if not is_fx_tracing():
+        torch._assert(values.dim() == 2, "values must be 2D")
+        torch._assert(
+            offsets_left is not None or offsets_right is not None,
+            "offsets_left and offsets_right cannot be None at the same time",
+        )
+        if offsets_left is None:
+            torch._assert(
+                max_len_left is not None,
+                "max_len_left must be provided when offsets_left is None",
+            )
+        if offsets_right is None:
+            torch._assert(
+                max_len_right is not None,
+                "max_len_right must be provided when offsets_right is None",
+            )
+        if offsets_left is not None and offsets_right is not None:
+            torch._assert(
+                offsets_left.shape[0] == offsets_right.shape[0],
+                "offsets_left shape[0] must be equal to offsets_right shape[0]",
+            )
+    if kernel == HammerKernel.TRITON:
+        return triton_split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=values,
+            total_len_left=total_len_left,
+            total_len_right=total_len_right,
+            max_len_left=max_len_left,
+            max_len_right=max_len_right,
+            offsets_left=offsets_left,
+            offsets_right=offsets_right,
+        )
+    elif kernel == HammerKernel.TRITON_INFERENCE:
+        dense_size = 0
+        if offsets_left is None and max_len_left is not None:
+            dense_size = max_len_left
+        elif offsets_right is None and max_len_right is not None:
+            dense_size = max_len_right
+        split_left, split_right = aot_triton_kernel_wrapper_split_2D_jagged(
+            values=values,
+            max_seq_len=max_seq_len,
+            offsets_a=offsets_left,
+            offsets_b=offsets_right,
+            dense_size=dense_size,
+        )
+        if offsets_left is None:
+            split_left = split_left.reshape(-1, split_left.shape[-1])
+        if offsets_right is None:
+            split_right = split_right.reshape(-1, split_right.shape[-1])
+        return split_left, split_right
+    else:
+        return pytorch_split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=values,
+            max_len_left=max_len_left,
+            max_len_right=max_len_right,
+            offsets_left=offsets_left,
+            offsets_right=offsets_right,
+        )
+
+
+def hstu_split_l2_embeddings(
+    max_seq_len: int,
+    x: torch.Tensor,
+    prefix_offsets: torch.Tensor,
+    l2_offsets: torch.Tensor,
+    contextual_seq_len: int,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    if kernel == HammerKernel.TRITON:
+        return triton_split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=x,
+            total_len_right=None,
+            total_len_left=None,
+            max_len_left=None,
+            max_len_right=None,
+            offsets_left=prefix_offsets,
+            offsets_right=l2_offsets,
+            n_prefix_to_right=contextual_seq_len,
+        )
+    else:
+        return pytorch_hstu_split_l2_embeddings(
+            max_seq_len=max_seq_len,
+            x=x,
+            prefix_offsets=prefix_offsets,
+            l2_offsets=l2_offsets,
+            contextual_seq_len=contextual_seq_len,
+        )
+
+
+def hstu_concat_l2_embeddings(
+    max_prefix_len: int,
+    prefix_x: torch.Tensor,
+    prefix_offsets: torch.Tensor,
+    max_l2_len: int,
+    l2_x: torch.Tensor,
+    l2_offsets: torch.Tensor,
+    contextual_seq_len: int,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> torch.Tensor:
+    if kernel == HammerKernel.TRITON:
+        return triton_concat_2D_jagged(
+            max_seq_len=max_prefix_len + max_l2_len,
+            values_left=prefix_x,
+            values_right=l2_x,
+            max_len_left=max_prefix_len,
+            max_len_right=max_l2_len,
+            offsets_left=prefix_offsets,
+            offsets_right=l2_offsets,
+            n_prefix_from_right=contextual_seq_len,
+        )
+    else:
+        return pytorch_hstu_concat_l2_embeddings(
+            contextual_seq_len=contextual_seq_len,
+            max_prefix_len=max_prefix_len,
+            prefix_x=prefix_x,
+            prefix_offsets=prefix_offsets,
+            max_l2_len=max_l2_len,
+            l2_x=l2_x,
+            l2_offsets=l2_offsets,
+        )
+
+
+def jagged_dense_bmm_broadcast_add(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    jagged: torch.Tensor,
+    dense: torch.Tensor,
+    bias: torch.Tensor,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> torch.Tensor:
+    """
+    Computing out = jagged x dense + bias
+    jagged has shape (sum_B(M_i), K), dense has shape (B, K, N), and bias has shape (B, N)
+    out has shape (sum_B(M_i), N)
+    """
+    if not is_fx_tracing():
+        _, K = jagged.shape
+        B, _, N = dense.shape
+        torch._assert(dense.shape[1] == K, "wrong dense shape[1]")
+        torch._assert(seq_offsets.shape[0] == B + 1, "wrong seq_offsets shape[0]")
+        torch._assert(bias.shape[0] == B, "wrong bias shape[0]")
+        torch._assert(bias.shape[1] == N, "wrong bias shape[1]")
+    if kernel == HammerKernel.TRITON:
+        return triton_jagged_dense_bmm_add(
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            jagged=jagged,
+            dense=dense,
+            bias=bias,
+            elementwise=False,
+        )
+    elif kernel == HammerKernel.TRITON_CC:
+        if triton_cc_jagged_dense_bmm is None:
+            raise ImportError(
+                "hammer is required for the TRITON_CC kernel in jagged_dense_bmm_broadcast_add."
+            )
+        return triton_cc_jagged_dense_bmm(
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            jagged=jagged,
+            dense=dense,
+            bias=bias,
+        )
+    else:
+        return pytorch_jagged_dense_bmm_add(
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            jagged=jagged,
+            dense=dense,
+            bias=bias,
+        )
+
+
+def concat_2D_jagged_multirow(
+    max_seq_len: int,
+    values_left: torch.Tensor,
+    values_right: torch.Tensor,
+    offsets_left: Optional[torch.Tensor],
+    offsets_right: Optional[torch.Tensor],
+    max_len_left: int,
+    max_len_right: int,
+    kernel: HammerKernel = HammerKernel.TRITON,
+) -> torch.Tensor:
+    if not is_fx_tracing():
+        torch._assert(values_left.dim() == 2, "values_left must be 2D")
+        torch._assert(values_right.dim() == 2, "values_right must be 2D")
+        torch._assert(
+            values_right.shape[1] == values_left.shape[1],
+            f"values_left shape[1] must be equal to values_right shape[1] {values_left.shape[1]} vs {values_right.shape[1]}",
+        )
+        if offsets_left is not None and offsets_right is not None:
+            torch._assert(
+                offsets_left.shape[0] == offsets_right.shape[0],
+                "offsets_left and offsets_right must have the same batch dimension",
+            )
+
+    if kernel == HammerKernel.TRITON:
+        return triton_concat_2D_jagged_multirow(
+            max_seq_len=max_seq_len,
+            values_a=values_left,
+            values_b=values_right,
+            offsets_a=offsets_left,
+            offsets_b=offsets_right,
+            max_len_a=max_len_left,
+            max_len_b=max_len_right,
+        )
+    else:
+        return concat_2D_jagged(
+            max_seq_len=max_seq_len,
+            values_left=values_left,
+            values_right=values_right,
+            max_len_left=max_len_left,
+            max_len_right=max_len_right,
+            offsets_left=offsets_left,
+            offsets_right=offsets_right,
+            kernel=kernel,
+        )
+
+
+def split_2D_jagged_multirow(
+    max_seq_len: int,
+    values: torch.Tensor,
+    total_len_left: Optional[int] = None,
+    total_len_right: Optional[int] = None,
+    max_len_left: Optional[int] = None,
+    max_len_right: Optional[int] = None,
+    offsets_left: Optional[torch.Tensor] = None,
+    offsets_right: Optional[torch.Tensor] = None,
+    kernel: HammerKernel = HammerKernel.TRITON,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    if not is_fx_tracing():
+        torch._assert(values.dim() == 2, "values must be 2D")
+        torch._assert(
+            offsets_left is not None or offsets_right is not None,
+            "offsets_left and offsets_right cannot be None at the same time",
+        )
+        if offsets_left is None:
+            torch._assert(
+                max_len_left is not None,
+                "max_len_left must be provided when offsets_left is None",
+            )
+        if offsets_right is None:
+            torch._assert(
+                max_len_right is not None,
+                "max_len_right must be provided when offsets_right is None",
+            )
+        if offsets_left is not None and offsets_right is not None:
+            torch._assert(
+                offsets_left.shape[0] == offsets_right.shape[0],
+                "offsets_left and offsets_right must have the same batch dimension",
+            )
+
+    if kernel == HammerKernel.TRITON:
+        return triton_split_2D_jagged_multirow(
+            max_seq_len=max_seq_len,
+            values=values,
+            total_len_left=total_len_left,
+            total_len_right=total_len_right,
+            max_len_left=max_len_left,
+            max_len_right=max_len_right,
+            offsets_left=offsets_left,
+            offsets_right=offsets_right,
+        )
+    else:
+        return split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=values,
+            total_len_left=total_len_left,
+            total_len_right=total_len_right,
+            max_len_left=max_len_left,
+            max_len_right=max_len_right,
+            offsets_left=offsets_left,
+            offsets_right=offsets_right,
+            kernel=kernel,
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/layer_norm.py b/recommendation_v4/generative_recommenders/ops/layer_norm.py
new file mode 100644
index 000000000..74ed377d6
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/layer_norm.py
@@ -0,0 +1,330 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+
+from typing import List
+
+import torch
+from generative_recommenders.ops.pytorch.pt_layer_norm import (
+    pytorch_layer_norm,
+    pytorch_rms_norm,
+    pytorch_swish_layer_norm,
+)
+from generative_recommenders.ops.triton.triton_layer_norm import triton_rms_norm
+
+try:
+    from hammer.ops.triton.cc.rms_norm.triton_cc_rms_norm import triton_cc_rms_norm
+    from hammer.ops.triton.cc.swish_layer_norm.triton_cc_swish_layer_norm import (
+        triton_cc_swish_layer_norm,
+    )
+except ImportError:
+    triton_cc_swish_layer_norm = None
+    triton_cc_rms_norm = None
+from generative_recommenders.common import HammerKernel, HammerModule
+from generative_recommenders.ops.triton.triton_layer_norm import (
+    triton_layer_norm,
+    triton_swish_layer_norm,
+)
+from torch.fx._symbolic_trace import is_fx_tracing
+
+try:
+    # @manual=//generative_recommenders/ops/triton_aot:triton_layer_norm
+    from generative_recommenders.ops.triton_aot.triton_layer_norm import (  # pyre-ignore[21]
+        aot_triton_kernel_wrapper_swish_layer_norm,
+    )
+
+    # @manual=//generative_recommenders/ops/triton_aot:triton_rms_norm
+    from generative_recommenders.ops.triton_aot.triton_rms_norm import (  # pyre-ignore[21]
+        aot_triton_kernel_wrapper_rms_norm,
+    )
+except ImportError:
+
+    def aot_triton_kernel_wrapper_swish_layer_norm(
+        x: torch.Tensor,
+        weight: torch.Tensor,
+        bias: torch.Tensor,
+        eps: float,
+        is_swish: bool,
+    ) -> torch.Tensor:
+        raise ImportError(
+            "AOT-T is required for the TRITON_INFERENCE swish_layer_norm kernel."
+        )
+
+    def aot_triton_kernel_wrapper_rms_norm(
+        x: torch.Tensor,
+        weight: torch.Tensor,
+        eps: float,
+        silu: bool,
+    ) -> torch.Tensor:
+        raise ImportError("AOT-T is required for the TRITON_INFERENCE rms_norm kernel.")
+
+
+torch.fx.wrap("triton_layer_norm")
+torch.fx.wrap("triton_swish_layer_norm")
+torch.fx.wrap("triton_rms_norm")
+
+
+def layer_norm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float = 1e-5,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> torch.Tensor:
+    if torch.jit.is_scripting():
+        # Script-mode fast path: bypass the HammerKernel ladder (which would
+        # drag in is_fx_tracing()'s closed-over global bool).
+        return torch.nn.functional.layer_norm(
+            x,
+            normalized_shape=(x.shape[-1],),
+            weight=weight,
+            bias=bias,
+            eps=eps,
+        )
+    if kernel == HammerKernel.TRITON:
+        if not is_fx_tracing():
+            torch._assert(not x.is_cpu, "x must be device tensor")
+            torch._assert(not weight.is_cpu, "weight must be device tensor")
+            torch._assert(not bias.is_cpu, "bias must be device tensor")
+        return triton_layer_norm(x, weight, bias, eps)
+    elif kernel == HammerKernel.TRITON_INFERENCE:
+        return aot_triton_kernel_wrapper_swish_layer_norm(
+            x,
+            weight,
+            bias,
+            eps,
+            is_swish=False,
+        )
+    elif kernel == HammerKernel.TRITON_CC:
+        if triton_cc_swish_layer_norm is None:
+            raise ImportError(
+                "hammer is required for the TRITON_CC kernel in layer_norm."
+            )
+        return triton_cc_swish_layer_norm(
+            x,
+            weight,
+            bias,
+            eps,
+            is_swish=False,
+        )
+    else:
+        return pytorch_layer_norm(
+            x,
+            [
+                x.shape[-1],
+            ],
+            weight,
+            bias,
+            eps,
+        )
+
+
+def rms_norm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float = 1e-5,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+    silu: bool = False,
+) -> torch.Tensor:
+    if torch.jit.is_scripting():
+        # Script-mode fast path: bypass the HammerKernel ladder.
+        x_f = x.float()
+        norm = torch.rsqrt(x_f.pow(2).mean(-1, keepdim=True) + eps)
+        out = (x_f * norm * weight.float()).to(x.dtype)
+        if silu:
+            out = torch.nn.functional.silu(out)
+        return out
+    if kernel == HammerKernel.TRITON:
+        if not is_fx_tracing():
+            torch._assert(not x.is_cpu, "x must be device tensor")
+            torch._assert(not weight.is_cpu, "weight must be device tensor")
+        return triton_rms_norm(x, weight, eps, silu)
+    elif kernel == HammerKernel.TRITON_INFERENCE:
+        return aot_triton_kernel_wrapper_rms_norm(x, weight, eps, silu)
+    elif kernel == HammerKernel.TRITON_CC:
+        if triton_cc_rms_norm is None:
+            raise ImportError(
+                "hammer is required for the TRITON_CC kernel in rms_norm."
+            )
+        return triton_cc_rms_norm(
+            x,
+            weight,
+            eps,
+            silu=silu,
+        )
+    else:
+        return pytorch_rms_norm(
+            x,
+            [
+                x.shape[-1],
+            ],
+            weight,
+            eps,
+            silu,
+        )
+
+
+def swish_layer_norm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float = 1e-5,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> torch.Tensor:
+    if torch.jit.is_scripting():
+        # Script-mode fast path: bypass the HammerKernel ladder (which
+        # otherwise drags in is_fx_tracing(), Triton/Triton_CC closures,
+        # etc.) and call pure PyTorch directly.
+        return pytorch_swish_layer_norm(
+            x,
+            [x.shape[-1]],
+            weight,
+            bias,
+            eps,
+        )
+    if kernel == HammerKernel.TRITON:
+        if not is_fx_tracing():
+            torch._assert(not x.is_cpu, "x must be device tensor")
+            torch._assert(not weight.is_cpu, "weight must be device tensor")
+            torch._assert(not bias.is_cpu, "bias must be device tensor")
+        return triton_swish_layer_norm(x, [x.shape[-1]], weight, bias, eps)
+    elif kernel == HammerKernel.TRITON_INFERENCE:
+        return aot_triton_kernel_wrapper_swish_layer_norm(
+            x,
+            weight,
+            bias,
+            eps,
+            is_swish=True,
+        )
+    elif kernel == HammerKernel.TRITON_CC:
+        if triton_cc_swish_layer_norm is None:
+            raise ImportError(
+                "hammer is required for the TRITON_CC kernel in swish_layer_norm."
+            )
+        return triton_cc_swish_layer_norm(
+            x,
+            weight,
+            bias,
+            eps,
+            is_swish=True,
+        )
+    else:
+        return pytorch_swish_layer_norm(
+            x,
+            [
+                x.shape[-1],
+            ],
+            weight,
+            bias,
+            eps,
+        )
+
+
+class LayerNorm(HammerModule):
+    def __init__(
+        self,
+        dim: int,
+        eps: float = 1e-5,
+        is_inference: bool = False,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+        self._normalized_shape: List[int] = [dim]
+        self._eps = eps
+        self.weight = torch.nn.Parameter(
+            torch.ones(self._normalized_shape),
+        )
+        self.bias = torch.nn.Parameter(
+            torch.zeros(self._normalized_shape),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return layer_norm(
+            x=x,
+            weight=self.weight,
+            bias=self.bias,
+            eps=self._eps,
+            kernel=self.hammer_kernel(),
+        )
+
+
+class RMSNorm(HammerModule):
+    def __init__(
+        self,
+        dim: int,
+        eps: float = 1e-5,
+        is_inference: bool = False,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+        self._eps = eps
+        self.weight = torch.nn.Parameter(torch.ones(dim))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return rms_norm(
+            x,
+            self.weight,
+            self._eps,
+            silu=False,
+            kernel=self.hammer_kernel(),
+        )
+
+
+class RMSNormSilu(HammerModule):
+    def __init__(
+        self,
+        dim: int,
+        eps: float = 1e-5,
+        is_inference: bool = False,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+        self._eps = eps
+        self.weight = torch.nn.Parameter(torch.ones(dim))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return rms_norm(
+            x,
+            self.weight,
+            self._eps,
+            silu=True,
+            kernel=self.hammer_kernel(),
+        )
+
+
+class SwishLayerNorm(HammerModule):
+    def __init__(
+        self,
+        dim: int,
+        eps: float = 1e-5,
+        is_inference: bool = False,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+        self._normalized_shape: List[int] = [dim]
+        self.weight = torch.nn.Parameter(torch.ones(self._normalized_shape))
+        self.bias = torch.nn.Parameter(torch.zeros(self._normalized_shape))
+        self._eps = eps
+
+    def forward(
+        self,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        return swish_layer_norm(
+            x=x,
+            weight=self.weight,
+            bias=self.bias,
+            eps=self._eps,
+            kernel=self.hammer_kernel(),
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/mm.py b/recommendation_v4/generative_recommenders/ops/mm.py
new file mode 100644
index 000000000..31a5c5d36
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/mm.py
@@ -0,0 +1,60 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import torch
+
+try:
+    from hammer.ops.triton.cc.addmm.triton_cc_addmm import triton_cc_addmm
+except ImportError:
+    triton_cc_addmm = None
+from generative_recommenders.common import HammerKernel
+from generative_recommenders.ops.triton.triton_addmm import triton_addmm
+
+try:
+    # @manual=//generative_recommenders/ops/triton_aot:triton_addmm
+    from generative_recommenders.ops.triton_aot.triton_addmm import (  # pyre-ignore[21]
+        aot_triton_kernel_wrapper_addmm,
+    )
+except ImportError:
+
+    def aot_triton_kernel_wrapper_addmm(
+        input: torch.Tensor,
+        mat1: torch.Tensor,
+        mat2: torch.Tensor,
+    ) -> torch.Tensor:
+        raise ImportError("AOT-T is required for the TRITON_INFERENCE addmm kernel.")
+
+
+def addmm(
+    input: torch.Tensor,
+    mat1: torch.Tensor,
+    mat2: torch.Tensor,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> torch.Tensor:
+    if torch.jit.is_scripting():
+        return torch.addmm(input, mat1, mat2)
+    if kernel == HammerKernel.TRITON:
+        return triton_addmm(input, mat1, mat2)
+    elif kernel == HammerKernel.TRITON_INFERENCE:
+        return aot_triton_kernel_wrapper_addmm(input, mat1, mat2)
+    elif kernel == HammerKernel.TRITON_CC:
+        if triton_cc_addmm is None:
+            raise ImportError("hammer is required for the TRITON_CC kernel in addmm.")
+        return triton_cc_addmm(input, mat1, mat2)
+    else:
+        return torch.addmm(input, mat1, mat2)
diff --git a/recommendation_v4/generative_recommenders/ops/position.py b/recommendation_v4/generative_recommenders/ops/position.py
new file mode 100644
index 000000000..e090827e3
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/position.py
@@ -0,0 +1,147 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from typing import Optional
+
+import torch
+from generative_recommenders.ops.pytorch.pt_position import (
+    pytorch_add_timestamp_positional_embeddings,
+)
+
+try:
+    from hammer.ops.triton.cc.add_timestamp_position_embeddings.triton_cc_add_timestamp_position_embeddings import (
+        triton_cc_add_timestamp_position_embeddings,
+    )
+except ImportError:
+    triton_cc_add_timestamp_position_embeddings = None
+from generative_recommenders.common import HammerKernel
+from generative_recommenders.ops.triton.triton_position import (
+    triton_add_timestamp_positional_embeddings,
+)
+
+try:
+    # @manual=//generative_recommenders/ops/triton_aot:triton_position
+    from generative_recommenders.ops.triton_aot.triton_position import (  # pyre-ignore[21]
+        aot_triton_kernel_wrapper_position,
+    )
+except ImportError:
+
+    def aot_triton_kernel_wrapper_position(
+        *args: object,
+        **kwargs: object,
+    ) -> torch.Tensor:
+        raise ImportError("AOT-T is required for the TRITON_INFERENCE position kernel.")
+
+
+torch.fx.wrap("triton_add_timestamp_positional_embeddings")
+
+
+def add_timestamp_positional_embeddings(
+    alpha: float,
+    max_seq_len: int,
+    max_contextual_seq_len: int,
+    position_embeddings_weight: torch.Tensor,
+    timestamp_embeddings_weight: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    seq_lengths: torch.Tensor,
+    seq_embeddings: torch.Tensor,
+    timestamps: torch.Tensor,
+    num_targets: Optional[torch.Tensor],
+    interleave_targets: bool,
+    time_bucket_fn: str = "sqrt",
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> torch.Tensor:
+    if torch.jit.is_scripting():
+        # Script-mode fast path: bypass the HammerKernel ladder.
+        seq_embeddings = seq_embeddings * alpha
+        return pytorch_add_timestamp_positional_embeddings(
+            seq_embeddings=seq_embeddings,
+            seq_offsets=seq_offsets,
+            pos_embeddings=position_embeddings_weight,
+            ts_embeddings=timestamp_embeddings_weight,
+            timestamps=timestamps,
+            max_seq_len=max_seq_len,
+            max_contextual_seq_len=max_contextual_seq_len,
+            seq_lengths=seq_lengths,
+            num_targets=num_targets,
+            interleave_targets=interleave_targets,
+            time_bucket_fn=time_bucket_fn,
+        )
+    assert time_bucket_fn in ["sqrt", "log"]
+    seq_embeddings = seq_embeddings * alpha
+    if kernel == HammerKernel.TRITON:
+        return triton_add_timestamp_positional_embeddings(
+            seq_embeddings=seq_embeddings,
+            seq_offsets=seq_offsets,
+            pos_embeddings=position_embeddings_weight,
+            ts_embeddings=timestamp_embeddings_weight,
+            timestamps=timestamps,
+            max_seq_len=max_seq_len,
+            max_contextual_seq_len=max_contextual_seq_len,
+            seq_lengths=seq_lengths,
+            num_targets=num_targets,
+            interleave_targets=interleave_targets,
+            time_bucket_fn=time_bucket_fn,
+        )
+    elif kernel == HammerKernel.TRITON_INFERENCE:
+        return aot_triton_kernel_wrapper_position(
+            alpha=1.0,
+            max_seq_len=max_seq_len,
+            max_contextual_seq_len=max_contextual_seq_len,
+            position_embeddings_weight=position_embeddings_weight.to(torch.float32),
+            timestamp_embeddings_weight=timestamp_embeddings_weight.to(torch.float32),
+            seq_offsets=seq_offsets,
+            seq_lengths=seq_lengths,
+            seq_embeddings=seq_embeddings,
+            timestamps=timestamps,
+            num_targets=num_targets,
+            interleave_targets=interleave_targets,
+            time_bucket_fn=time_bucket_fn,
+        )
+    elif kernel == HammerKernel.TRITON_CC:
+        if triton_cc_add_timestamp_position_embeddings is None:
+            raise ImportError(
+                "hammer is required for the TRITON_CC kernel in add_timestamp_positional_embeddings."
+            )
+        return triton_cc_add_timestamp_position_embeddings(
+            seq_embeddings=seq_embeddings,
+            seq_offsets=seq_offsets,
+            pos_embeddings=position_embeddings_weight,
+            ts_embeddings=timestamp_embeddings_weight,
+            timestamps=timestamps,
+            max_seq_len=max_seq_len,
+            max_contextual_seq_len=max_contextual_seq_len,
+            seq_lengths=seq_lengths,
+            num_targets=num_targets,
+            interleave_targets=interleave_targets,
+            time_bucket_fn=time_bucket_fn,
+        )
+    else:
+        return pytorch_add_timestamp_positional_embeddings(
+            seq_embeddings=seq_embeddings,
+            seq_offsets=seq_offsets,
+            pos_embeddings=position_embeddings_weight,
+            ts_embeddings=timestamp_embeddings_weight,
+            timestamps=timestamps,
+            max_seq_len=max_seq_len,
+            max_contextual_seq_len=max_contextual_seq_len,
+            seq_lengths=seq_lengths,
+            num_targets=num_targets,
+            interleave_targets=interleave_targets,
+            time_bucket_fn=time_bucket_fn,
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/pytorch/pt_hstu_attention.py b/recommendation_v4/generative_recommenders/ops/pytorch/pt_hstu_attention.py
new file mode 100644
index 000000000..32575c4db
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/pytorch/pt_hstu_attention.py
@@ -0,0 +1,251 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from typing import Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+
+
+try:
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+except OSError:
+    pass
+
+
+@torch.fx.wrap
+def _get_valid_attn_mask(
+    device: torch.device,
+    causal: bool,
+    N: int,
+    seq_lengths: torch.Tensor,
+    num_targets: Optional[torch.Tensor] = None,
+    max_attn_len: int = 0,
+    contextual_seq_len: int = 0,
+    min_full_attn_seq_len: int = 0,
+) -> torch.Tensor:
+    ids = torch.arange(0, N, device=device).view(1, N)
+    max_ids = seq_lengths.view(-1, 1, 1)
+    if contextual_seq_len > 0:
+        ids = ids - contextual_seq_len + 1
+        ids = torch.clamp(ids, min=0)
+        max_ids = max_ids - contextual_seq_len + 1
+    if num_targets is not None:
+        max_ids = max_ids - num_targets.view(-1, 1, 1)
+        ids = torch.clamp(
+            ids,
+            max=max_ids,
+        )
+        row_ids = ids.view(-1, N, 1).expand(-1, N, N)
+        col_ids = ids.view(-1, 1, N).expand(-1, N, N)
+    else:
+        row_ids = ids.view(N, 1).expand(N, N)
+        col_ids = row_ids.t()
+        row_ids = row_ids.view(1, N, N)
+        col_ids = col_ids.view(1, N, N)
+    row_col_dist = row_ids - col_ids
+    valid_attn_mask = torch.eye(N, device=device, dtype=torch.bool).view(1, N, N)
+    if not causal:
+        row_col_dist = torch.where(row_col_dist > 0, row_col_dist, -row_col_dist)
+    valid_attn_mask = torch.logical_or(valid_attn_mask, row_col_dist > 0)
+    if max_attn_len > 0:
+        if min_full_attn_seq_len > 0:
+            valid_attn_mask = torch.logical_and(
+                valid_attn_mask,
+                torch.logical_or(
+                    row_col_dist <= max_attn_len,
+                    row_ids >= max_ids - min_full_attn_seq_len,
+                ),
+            )
+        else:
+            valid_attn_mask = torch.logical_and(
+                valid_attn_mask, row_col_dist <= max_attn_len
+            )
+    if contextual_seq_len > 0:
+        valid_attn_mask = torch.logical_or(
+            valid_attn_mask, torch.logical_and(row_ids == 0, col_ids < max_ids)
+        )
+    return valid_attn_mask
+
+
+def _pad_qkv(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    N: int,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    L, H, D = q.shape
+    V = v.shape[2]
+    padded_q = (
+        torch.ops.fbgemm.jagged_to_padded_dense(
+            values=q.reshape(L, H * D),
+            offsets=[seq_offsets],
+            max_lengths=[N],
+            padding_value=0.0,
+        )
+        .view(-1, N, H, D)
+        .transpose(1, 2)
+    )  # [B, H, N, A]
+    padded_k = (
+        torch.ops.fbgemm.jagged_to_padded_dense(
+            values=k.reshape(L, H * D),
+            offsets=[seq_offsets],
+            max_lengths=[N],
+            padding_value=0.0,
+        )
+        .view(-1, N, H, D)
+        .transpose(1, 2)
+    )  # [B, H, N, A]
+    padded_v = (
+        torch.ops.fbgemm.jagged_to_padded_dense(
+            values=v.reshape(L, H * V),
+            offsets=[seq_offsets],
+            max_lengths=[N],
+            padding_value=0.0,
+        )
+        .view(-1, N, H, V)
+        .transpose(1, 2)
+    )  # [B, H, N, D]
+    return padded_q, padded_k, padded_v
+
+
+@torch.fx.wrap
+def pytorch_hstu_mha(
+    max_seq_len: int,
+    alpha: float,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    causal: bool = True,
+    dropout_pr: float = 0.0,
+    training: bool = True,
+    num_targets: Optional[torch.Tensor] = None,
+    attn_scale: Optional[torch.Tensor] = None,
+    max_attn_len: int = 0,
+    contextual_seq_len: int = 0,
+    min_full_attn_seq_len: int = 0,
+) -> torch.Tensor:
+    L, H, _ = q.shape
+    V = v.shape[2]
+    q, k, v = _pad_qkv(
+        q, k, v, seq_offsets, max_seq_len
+    )  # [B, H, N, D) and [B, H, N, V]
+    qk_attn = torch.einsum("bhxa,bhya->bhxy", q, k) * alpha
+    if attn_scale is not None:
+        if attn_scale.ndim > 0:
+            attn_scale = (
+                torch.ops.fbgemm.jagged_to_padded_dense(
+                    values=attn_scale.unsqueeze(-1),
+                    offsets=[seq_offsets],
+                    max_lengths=[max_seq_len],
+                    padding_value=0.0,
+                )
+                .unsqueeze(1)
+                .to(qk_attn.dtype)
+            )
+
+        qk_attn = F.silu(qk_attn) * attn_scale
+    else:
+        qk_attn = F.silu(qk_attn) / max_seq_len
+    valid_attn_mask = _get_valid_attn_mask(
+        device=q.device,
+        causal=causal,
+        N=max_seq_len,
+        seq_lengths=seq_offsets[1:] - seq_offsets[:-1],
+        num_targets=num_targets,
+        max_attn_len=max_attn_len,
+        contextual_seq_len=contextual_seq_len,
+        min_full_attn_seq_len=min_full_attn_seq_len,
+    )
+    # raise NotImplementedError(valid_attn_mask[0, :, :].to(torch.int32))
+    qk_attn = qk_attn * valid_attn_mask.unsqueeze(1)
+    if dropout_pr > 0.0:
+        qk_attn = F.dropout(qk_attn, p=dropout_pr, training=training)
+    attn_dense = torch.einsum("bhxd,bhdv->bhxv", qk_attn, v)  # [B, H, N, V]
+    return torch.ops.fbgemm.dense_to_jagged(
+        attn_dense.transpose(1, 2).flatten(2, 3),  # [B, N, H, V]->[B, N, H * V]
+        [seq_offsets],
+        L,
+    )[0].view(L, H, V)
+
+
+@torch.fx.wrap
+def pytorch_cached_hstu_mha(
+    max_seq_len: int,
+    alpha: float,
+    delta_q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    num_targets: Optional[torch.Tensor] = None,
+    max_attn_len: int = 0,
+    contextual_seq_len: int = 0,
+) -> torch.Tensor:
+    L, H, D = delta_q.shape
+    _, _, V = v.shape
+    B = seq_offsets.size(0) - 1
+    delta_size = L // B
+    delta_q = delta_q.view(B, -1, H, D).transpose(1, 2)
+    full_k = (
+        torch.ops.fbgemm.jagged_to_padded_dense(
+            values=k.reshape(-1, H * D),
+            offsets=[seq_offsets],
+            max_lengths=[max_seq_len],
+            padding_value=0.0,
+        )
+        .view(B, -1, H, D)
+        .transpose(1, 2)
+    )
+    full_v = (
+        torch.ops.fbgemm.jagged_to_padded_dense(
+            values=v.reshape(-1, H * V),
+            offsets=[seq_offsets],
+            max_lengths=[max_seq_len],
+            padding_value=0.0,
+        )
+        .view(B, -1, H, V)
+        .transpose(1, 2)
+    )
+    qk_attn = torch.einsum("bhxa,bhya->bhxy", delta_q, full_k) * alpha
+    qk_attn = F.silu(qk_attn) / max_seq_len
+    full_valid_attn_mask = _get_valid_attn_mask(
+        device=delta_q.device,
+        causal=True,
+        N=max_seq_len,
+        seq_lengths=seq_offsets[1:] - seq_offsets[:-1],
+        num_targets=num_targets,
+        max_attn_len=max_attn_len,
+        contextual_seq_len=contextual_seq_len,
+    )
+    seq_lengths = seq_offsets[1:] - seq_offsets[:-1]
+    mask = torch.arange(max_seq_len, device=delta_q.device).view(1, -1)
+    mask = torch.logical_and(
+        mask >= (seq_lengths - delta_size).view(-1, 1),
+        mask < seq_lengths.view(-1, 1),
+    )
+    valid_attn_mask = (
+        full_valid_attn_mask.expand(B, -1, -1)
+        .flatten(0, 1)[mask.view(-1), :]
+        .view(-1, delta_size, max_seq_len)
+    )
+    qk_attn = qk_attn * valid_attn_mask.unsqueeze(1)
+    attn_output = torch.einsum("bhxd,bhdv->bhxv", qk_attn, full_v)
+    return attn_output.transpose(1, 2).reshape(-1, H, V)
diff --git a/recommendation_v4/generative_recommenders/ops/pytorch/pt_hstu_linear.py b/recommendation_v4/generative_recommenders/ops/pytorch/pt_hstu_linear.py
new file mode 100644
index 000000000..6ea94a565
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/pytorch/pt_hstu_linear.py
@@ -0,0 +1,130 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import torch
+import torch.nn.functional as F
+
+
+def pytorch_norm_mul_dropout(
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    dropout_ratio: float,
+    training: bool,
+    silu_u: bool = False,
+    concat_u: bool = False,
+    concat_x: bool = False,
+    mul_u_activation_type: str = "none",
+    group_norm: bool = False,
+    num_heads: int = 1,
+    linear_dim: int = -1,
+) -> torch.Tensor:
+    dtype = x.dtype
+    x = x.to(torch.float32)
+    u = u.to(torch.float32)
+    if group_norm:
+        if silu_u:
+            u = F.silu(u)
+            u = u.to(torch.float32)
+        y = u * F.group_norm(
+            x.view(-1, num_heads, linear_dim),
+            num_groups=num_heads,
+            weight=weight.to(torch.float32),
+            bias=bias.to(torch.float32),
+            eps=eps,
+        ).view(-1, num_heads * linear_dim)
+        if concat_u and concat_x:
+            y = torch.cat([u, x, y], dim=1)
+    else:
+        mul_u = u
+        if mul_u_activation_type == "sigmoid":
+            mul_u = torch.sigmoid(u)
+        elif mul_u_activation_type == "silu":
+            mul_u = F.silu(u)
+        y = mul_u * F.layer_norm(
+            x,
+            normalized_shape=(x.shape[-1],),
+            weight=weight.to(torch.float32),
+            bias=bias.to(torch.float32),
+            eps=eps,
+        )
+        if concat_u:
+            if silu_u:
+                u = F.silu(u)
+            if concat_x:
+                y = torch.cat([u, x, y], dim=1)
+            else:
+                y = torch.cat([u, y], dim=1)
+        elif concat_x:
+            y = torch.cat([x, y], dim=1)
+    y = F.dropout(
+        y,
+        p=dropout_ratio,
+        training=training,
+    )
+    return y.to(dtype)
+
+
+def pytorch_hstu_compute_output(
+    attn: torch.Tensor,
+    u: torch.Tensor,
+    x: torch.Tensor,
+    norm_weight: torch.Tensor,
+    norm_bias: torch.Tensor,
+    output_weight: torch.Tensor,
+    eps: float,
+    dropout_ratio: float,
+    training: bool,
+    silu_u: bool = False,
+    concat_u: bool = False,
+    concat_x: bool = False,
+    mul_u_activation_type: str = "none",
+    group_norm: bool = False,
+    num_heads: int = 1,
+    linear_dim: int = -1,
+) -> torch.Tensor:
+    dtype = x.dtype
+    y = pytorch_norm_mul_dropout(
+        x=attn,
+        u=u,
+        weight=norm_weight,
+        bias=norm_bias,
+        eps=eps,
+        dropout_ratio=dropout_ratio,
+        training=training,
+        silu_u=silu_u,
+        concat_u=concat_u,
+        concat_x=concat_x,
+        mul_u_activation_type=mul_u_activation_type,
+        group_norm=group_norm,
+        num_heads=num_heads,
+        linear_dim=linear_dim,
+    )
+    return torch.addmm(x, y, output_weight.to(x.dtype)).to(dtype)
+
+
+def pytorch_swiglu(
+    x: torch.Tensor,
+    w_gate: torch.Tensor,
+    w_up: torch.Tensor,
+) -> torch.Tensor:
+    gate = F.silu(F.linear(x, w_gate))
+    up = F.linear(x, w_up)
+    return gate * up
diff --git a/recommendation_v4/generative_recommenders/ops/pytorch/pt_jagged.py b/recommendation_v4/generative_recommenders/ops/pytorch/pt_jagged.py
new file mode 100644
index 000000000..82d82f402
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/pytorch/pt_jagged.py
@@ -0,0 +1,258 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from typing import Tuple
+
+import torch
+
+
+try:
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+except OSError:
+    pass
+
+
+def pytorch_jagged_dense_bmm(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    jagged: torch.Tensor,
+    dense: torch.Tensor,
+) -> torch.Tensor:
+    dtype = jagged.dtype
+    jagged = jagged.to(torch.float32)
+    dense = dense.to(torch.float32)
+    padded_jagged = torch.ops.fbgemm.jagged_to_padded_dense(
+        values=jagged,
+        offsets=[seq_offsets],
+        max_lengths=[max_seq_len],
+        padding_value=0.0,
+    )
+    bmm_out = torch.bmm(padded_jagged, dense)
+    jagged_bmm_out = torch.ops.fbgemm.dense_to_jagged(
+        bmm_out, [seq_offsets], total_L=jagged.shape[0]
+    )[0]
+    jagged_bmm_out = jagged_bmm_out.to(dtype)
+    return jagged_bmm_out
+
+
+def pytorch_jagged_dense_broadcast_add(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    jagged: torch.Tensor,
+    dense: torch.Tensor,
+) -> torch.Tensor:
+    dtype = jagged.dtype
+    jagged = jagged.to(torch.float32)
+    dense = dense.to(torch.float32)
+    padded_jagged = torch.ops.fbgemm.jagged_to_padded_dense(
+        values=jagged,
+        offsets=[seq_offsets],
+        max_lengths=[max_seq_len],
+        padding_value=0.0,
+    )
+    out = padded_jagged + dense.unsqueeze(1)
+    jagged_out = torch.ops.fbgemm.dense_to_jagged(
+        out, [seq_offsets], total_L=jagged.shape[0]
+    )[0]
+    jagged_out = jagged_out.to(dtype)
+    return jagged_out
+
+
+def pytorch_jagged_dense_bmm_add(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    jagged: torch.Tensor,
+    dense: torch.Tensor,
+    bias: torch.Tensor,
+    elementwise: bool = False,
+) -> torch.Tensor:
+    dtype = jagged.dtype
+    jagged = jagged.to(torch.float32)
+    dense = dense.to(torch.float32)
+    padded_jagged = torch.ops.fbgemm.jagged_to_padded_dense(
+        values=jagged,
+        offsets=[seq_offsets],
+        max_lengths=[max_seq_len],
+        padding_value=0.0,
+    )
+    bmm_out = torch.bmm(padded_jagged, dense)
+
+    if elementwise:
+        jagged_out = (
+            torch.ops.fbgemm.dense_to_jagged(
+                bmm_out, [seq_offsets], total_L=jagged.shape[0]
+            )[0]
+            + bias
+        )
+    else:
+        jagged_out = torch.ops.fbgemm.dense_to_jagged(
+            bmm_out + bias.unsqueeze(1), [seq_offsets], total_L=jagged.shape[0]
+        )[0]
+
+    jagged_out = jagged_out.to(dtype)
+    return jagged_out
+
+
+@torch.fx.wrap
+def _arange(len: int, device: torch.device) -> torch.Tensor:
+    return torch.arange(len, device=device)
+
+
+def pytorch_concat_2D_dense_jagged(
+    jagged_max_seq_len: int,
+    jagged_offsets: torch.Tensor,
+    jagged_values: torch.Tensor,
+    dense_values: torch.Tensor,
+) -> torch.Tensor:
+    B, dense_size, D = dense_values.size()
+    jagged_dense = torch.ops.fbgemm.jagged_to_padded_dense(
+        values=jagged_values,
+        offsets=[jagged_offsets],
+        max_lengths=[jagged_max_seq_len],
+        padding_value=0.0,
+    )
+    concatted_dense = torch.cat([dense_values, jagged_dense], dim=1)
+    concatted_offsets = (
+        dense_size * _arange(B + 1, device=jagged_offsets.device) + jagged_offsets
+    )
+    return torch.ops.fbgemm.dense_to_jagged(
+        concatted_dense,
+        [concatted_offsets],
+        total_L=jagged_values.shape[0] + dense_size * B,
+    )[0]
+
+
+def pytorch_concat_2D_jagged_jagged(
+    max_seq_len_left: int,
+    offsets_left: torch.Tensor,
+    values_left: torch.Tensor,
+    max_seq_len_right: int,
+    offsets_right: torch.Tensor,
+    values_right: torch.Tensor,
+    is_replace: bool = False,
+    n_prefix_from_right: int = 0,
+) -> torch.Tensor:
+    # is_replace with n_prefix_from_right != 0 is not supported yet (neither in triton)
+    if is_replace:
+        return pytorch_replace_last_n_with_jagged(
+            max_seq_len_left,
+            offsets_left,
+            values_left,
+            offsets_right,
+            values_right,
+        )
+
+    lengths_a = offsets_left[1:] - offsets_left[:-1]
+    lengths_b = offsets_right[1:] - offsets_right[:-1]
+
+    # Compute output offsets via cumsum (no dynamic shapes).
+    output_lengths = lengths_a + lengths_b
+    output_offsets = torch.nn.functional.pad(
+        torch.cumsum(output_lengths, dim=0), (1, 0)
+    )
+
+    total_len = values_left.shape[0] + values_right.shape[0]
+    positions = torch.arange(total_len, device=values_left.device)
+    batch_idx = torch.searchsorted(output_offsets[1:], positions, right=True)
+    local_pos = positions - output_offsets[batch_idx]
+
+    per_batch_lengths_a = lengths_a[batch_idx]
+
+    # Classify each output position into prefix / left / suffix.
+    is_prefix = local_pos < n_prefix_from_right
+    is_left = (local_pos >= n_prefix_from_right) & (
+        local_pos < n_prefix_from_right + per_batch_lengths_a
+    )
+
+    # Pad with a sentinel zero row so index_select works on empty tensors
+    values_left_safe = torch.nn.functional.pad(values_left, (0, 0, 0, 1))
+    values_right_safe = torch.nn.functional.pad(values_right, (0, 0, 0, 1))
+
+    left_idx = (offsets_left[batch_idx] + (local_pos - n_prefix_from_right)).clamp(
+        min=0, max=values_left.shape[0]
+    )
+    right_prefix_idx = offsets_right[batch_idx] + local_pos
+    right_suffix_idx = offsets_right[batch_idx] + (local_pos - per_batch_lengths_a)
+    right_idx = torch.where(is_prefix, right_prefix_idx, right_suffix_idx).clamp(
+        min=0, max=values_right.shape[0]
+    )
+
+    left_values = values_left_safe.index_select(0, left_idx)
+    right_values = values_right_safe.index_select(0, right_idx)
+
+    return torch.where(is_left.unsqueeze(-1), left_values, right_values)
+
+
+def pytorch_jagged_remove_first_or_last_1D(
+    values: torch.Tensor,
+    lengths: torch.Tensor,
+    offsets: torch.Tensor,
+    max_seq_len: int,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    values = values.view(-1, 1)
+    shrunk_lengths = lengths - 1
+    k_lengths = torch.stack([shrunk_lengths, torch.ones_like(lengths)], dim=1).view(-1)
+    q_lengths = torch.stack([torch.ones_like(lengths), shrunk_lengths], dim=1).view(-1)
+    all_indices = torch.arange(
+        start=0, end=q_lengths.numel(), device=values.device
+    ).reshape(-1, 2)
+    q_indices, k_indices = all_indices[:, 1], all_indices[:, 0]
+    values_no_first, _ = torch.ops.fbgemm.jagged_index_select(
+        values, q_lengths, q_indices
+    )
+    values_no_last, _ = torch.ops.fbgemm.jagged_index_select(
+        values, k_lengths, k_indices
+    )
+    return values_no_first.squeeze(), values_no_last.squeeze()
+
+
+@torch.fx.wrap
+def fx_apply_mask(
+    tensor: torch.Tensor, mask: torch.Tensor, fill_value: torch.Tensor
+) -> torch.Tensor:
+    tensor[mask] = fill_value
+    return tensor
+
+
+def pytorch_replace_last_n_with_jagged(
+    max_seq_len_left: int,
+    offsets_left: torch.Tensor,
+    values_left: torch.Tensor,
+    offsets_right: torch.Tensor,
+    values_right: torch.Tensor,
+) -> torch.Tensor:
+    lengths_a = offsets_left[1:] - offsets_left[:-1]
+    lengths_b = offsets_right[1:] - offsets_right[:-1]
+
+    total_len = values_left.shape[0]
+    positions = torch.arange(total_len, device=values_left.device)
+    batch_idx = torch.searchsorted(offsets_left[1:], positions, right=True)
+    local_pos = positions - offsets_left[batch_idx]
+
+    # Positions >= (lengths_a - lengths_b) within each batch are in the replace zone.
+    threshold = lengths_a[batch_idx] - lengths_b[batch_idx]
+    in_replace_zone = local_pos >= threshold
+
+    # Pad with a sentinel zero row so index_select works on empty tensors
+    values_right_safe = torch.nn.functional.pad(values_right, (0, 0, 0, 1))
+    right_idx = (offsets_right[batch_idx] + (local_pos - threshold)).clamp(
+        min=0, max=values_right.shape[0]
+    )
+    right_values = values_right_safe.index_select(0, right_idx)
+    return torch.where(in_replace_zone.unsqueeze(-1), right_values, values_left)
diff --git a/recommendation_v4/generative_recommenders/ops/pytorch/pt_jagged_tensors.py b/recommendation_v4/generative_recommenders/ops/pytorch/pt_jagged_tensors.py
new file mode 100644
index 000000000..27817f7fb
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/pytorch/pt_jagged_tensors.py
@@ -0,0 +1,246 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from typing import Optional, Tuple
+
+import torch
+from generative_recommenders.common import fx_arange
+
+try:
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+except OSError:
+    pass
+
+
+def _concat_2D_jagged_jagged(
+    values_left: torch.Tensor,
+    values_right: torch.Tensor,
+    max_len_left: int,
+    max_len_right: int,
+    offsets_left: torch.Tensor,
+    offsets_right: torch.Tensor,
+) -> torch.Tensor:
+    max_seq_len = max_len_left + max_len_right
+    lengths_left = offsets_left[1:] - offsets_left[:-1]
+    lengths_right = offsets_right[1:] - offsets_right[:-1]
+    padded_left = torch.ops.fbgemm.jagged_to_padded_dense(
+        values=values_left,
+        offsets=[offsets_left],
+        max_lengths=[max_len_left],
+        padding_value=0.0,
+    )
+    padded_right = torch.ops.fbgemm.jagged_to_padded_dense(
+        values=values_right,
+        offsets=[offsets_right],
+        max_lengths=[max_len_right],
+        padding_value=0.0,
+    )
+    concatted_dense = torch.cat([padded_left, padded_right], dim=1)
+    mask = fx_arange(max_seq_len, device=offsets_left.device).view(1, -1)
+    mask = torch.logical_or(
+        mask < lengths_left.view(-1, 1),
+        torch.logical_and(
+            mask >= max_len_left,
+            mask < max_len_left + lengths_right.view(-1, 1),
+        ),
+    )
+    return concatted_dense.flatten(0, 1)[mask.view(-1), :]
+
+
+@torch.fx.wrap
+def pytorch_concat_2D_jagged(
+    values_left: torch.Tensor,
+    values_right: torch.Tensor,
+    max_len_left: Optional[int],
+    max_len_right: Optional[int],
+    offsets_left: Optional[torch.Tensor],
+    offsets_right: Optional[torch.Tensor],
+) -> torch.Tensor:
+    if offsets_left is None:
+        assert max_len_left is not None
+        B = values_left.shape[0] // max_len_left
+        offsets_left_non_optional = max_len_left * torch.arange(
+            B + 1, device=values_left.device
+        )
+    else:
+        offsets_left_non_optional = offsets_left
+    if offsets_right is None:
+        assert max_len_right is not None
+        B = values_right.shape[0] // max_len_right
+        offsets_right_non_optional = max_len_right * torch.arange(
+            B + 1, device=values_left.device
+        )
+    else:
+        offsets_right_non_optional = offsets_right
+    max_len_left = (
+        int(
+            (offsets_left_non_optional[1:] - offsets_left_non_optional[:-1])
+            .max()
+            .item()
+        )
+        if max_len_left is None
+        else max_len_left
+    )
+    max_len_right = (
+        int(
+            (offsets_right_non_optional[1:] - offsets_right_non_optional[:-1])
+            .max()
+            .item()
+        )
+        if max_len_right is None
+        else max_len_right
+    )
+    return _concat_2D_jagged_jagged(
+        values_left=values_left,
+        values_right=values_right,
+        max_len_left=max_len_left,
+        max_len_right=max_len_right,
+        offsets_left=offsets_left_non_optional,
+        offsets_right=offsets_right_non_optional,
+    )
+
+
+def _split_2D_jagged_jagged(
+    max_seq_len: int,
+    values: torch.Tensor,
+    offsets_left: torch.Tensor,
+    offsets_right: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    offsets = offsets_left + offsets_right
+    padded_values = torch.ops.fbgemm.jagged_to_padded_dense(
+        values=values,
+        offsets=[offsets],
+        max_lengths=[max_seq_len],
+        padding_value=0.0,
+    ).flatten(0, 1)
+    lengths_left = offsets_left[1:] - offsets_left[:-1]
+    lengths_right = offsets_right[1:] - offsets_right[:-1]
+    mask = fx_arange(max_seq_len, device=values.device).view(1, -1)
+    mask_left = mask < lengths_left.view(-1, 1)
+    mask_right = torch.logical_and(
+        mask >= lengths_left.view(-1, 1),
+        mask < (lengths_left + lengths_right).view(-1, 1),
+    )
+    return padded_values[mask_left.view(-1), :], padded_values[mask_right.view(-1), :]
+
+
+@torch.fx.wrap
+def pytorch_split_2D_jagged(
+    max_seq_len: int,
+    values: torch.Tensor,
+    max_len_left: Optional[int],
+    max_len_right: Optional[int],
+    offsets_left: Optional[torch.Tensor],
+    offsets_right: Optional[torch.Tensor],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    if offsets_left is None:
+        assert max_len_left is not None
+        assert offsets_right is not None
+        offsets_left_non_optional = max_len_left * torch.arange(
+            offsets_right.shape[0], device=values.device
+        )
+    else:
+        offsets_left_non_optional = offsets_left
+    if offsets_right is None:
+        assert max_len_right is not None
+        assert offsets_left is not None
+        offsets_right_non_optional = max_len_right * torch.arange(
+            offsets_left.shape[0], device=values.device
+        )
+    else:
+        offsets_right_non_optional = offsets_right
+    return _split_2D_jagged_jagged(
+        max_seq_len=max_seq_len,
+        values=values,
+        offsets_left=offsets_left_non_optional,
+        offsets_right=offsets_right_non_optional,
+    )
+
+
+def pytorch_hstu_split_l2_embeddings(
+    max_seq_len: int,
+    x: torch.Tensor,
+    prefix_offsets: torch.Tensor,
+    l2_offsets: torch.Tensor,
+    contextual_seq_len: int,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    x_offsets = prefix_offsets + l2_offsets
+    x_lengths = x_offsets[1:] - x_offsets[:-1]
+    padded_x = torch.ops.fbgemm.jagged_to_padded_dense(
+        values=x,
+        offsets=[x_offsets],
+        max_lengths=[max_seq_len],
+        padding_value=0.0,
+    ).flatten(0, 1)
+    prefix_lengths = prefix_offsets[1:] - prefix_offsets[:-1]
+    mask = fx_arange(max_seq_len, device=x_offsets.device).view(1, -1)
+    mask_prefix = torch.logical_and(
+        mask >= contextual_seq_len,
+        mask < prefix_lengths.view(-1, 1) + contextual_seq_len,
+    )
+    mask_l2 = torch.logical_or(
+        mask < contextual_seq_len,
+        torch.logical_and(
+            mask >= prefix_lengths.view(-1, 1) + contextual_seq_len,
+            mask < x_lengths.view(-1, 1),
+        ),
+    )
+    return padded_x[mask_prefix.view(-1), :], padded_x[mask_l2.view(-1), :]
+
+
+def pytorch_hstu_concat_l2_embeddings(
+    max_prefix_len: int,
+    prefix_x: torch.Tensor,
+    prefix_offsets: torch.Tensor,
+    max_l2_len: int,
+    l2_x: torch.Tensor,
+    l2_offsets: torch.Tensor,
+    contextual_seq_len: int,
+) -> torch.Tensor:
+    padded_prefix_x = torch.ops.fbgemm.jagged_to_padded_dense(
+        values=prefix_x,
+        offsets=[prefix_offsets],
+        max_lengths=[max_prefix_len],
+        padding_value=0.0,
+    )
+    padded_l2_x = torch.ops.fbgemm.jagged_to_padded_dense(
+        values=l2_x,
+        offsets=[l2_offsets],
+        max_lengths=[max_l2_len],
+        padding_value=0.0,
+    )
+    padded_x = torch.cat(
+        [
+            padded_l2_x[:, 0:contextual_seq_len, :],
+            padded_prefix_x,
+            padded_l2_x[:, contextual_seq_len:, :],
+        ],
+        dim=1,
+    )
+    mask = fx_arange(max_prefix_len + max_l2_len, device=prefix_x.device).view(1, -1)
+    prefix_lengths = prefix_offsets[1:] - prefix_offsets[:-1]
+    l2_lengths = l2_offsets[1:] - l2_offsets[:-1]
+    mask = torch.logical_or(
+        mask < prefix_lengths.view(-1, 1) + contextual_seq_len,
+        torch.logical_and(
+            mask >= max_prefix_len + contextual_seq_len,
+            mask < max_prefix_len + l2_lengths.view(-1, 1),
+        ),
+    )
+    return padded_x.flatten(0, 1)[mask.view(-1), :]
diff --git a/recommendation_v4/generative_recommenders/ops/pytorch/pt_layer_norm.py b/recommendation_v4/generative_recommenders/ops/pytorch/pt_layer_norm.py
new file mode 100644
index 000000000..0666212ce
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/pytorch/pt_layer_norm.py
@@ -0,0 +1,81 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# pyre-strict
+
+
+from typing import List
+
+import torch
+
+
+def pytorch_layer_norm(
+    x: torch.Tensor,
+    normalized_shape: List[int],
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+) -> torch.Tensor:
+    dtype = x.dtype
+    return torch.nn.functional.layer_norm(
+        x.to(torch.float32),
+        normalized_shape,
+        weight.to(torch.float32),
+        bias.to(torch.float32),
+        eps,
+    ).to(dtype)
+
+
+def pytorch_rms_norm(
+    x: torch.Tensor,
+    normalized_shape: List[int],
+    weight: torch.Tensor,
+    eps: float,
+    silu: bool = False,
+) -> torch.Tensor:
+    dtype = x.dtype
+    x_float = x.to(torch.float32)
+    normalized = torch.nn.functional.rms_norm(
+        x_float,
+        normalized_shape,
+        weight.to(torch.float32),
+        eps,
+    )
+    if silu:
+        normalized = torch.nn.functional.silu(normalized)
+    return normalized.to(dtype)
+
+
+def pytorch_swish_layer_norm(
+    x: torch.Tensor,
+    normalized_shape: List[int],
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+) -> torch.Tensor:
+    dtype = x.dtype
+    x = x.to(torch.float32)
+    return (
+        x
+        * torch.sigmoid(
+            torch.nn.functional.layer_norm(
+                x,
+                normalized_shape,
+                weight.to(torch.float32),
+                bias.to(torch.float32),
+                eps,
+            )
+        )
+    ).to(dtype)
diff --git a/recommendation_v4/generative_recommenders/ops/pytorch/pt_position.py b/recommendation_v4/generative_recommenders/ops/pytorch/pt_position.py
new file mode 100644
index 000000000..fced57e4f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/pytorch/pt_position.py
@@ -0,0 +1,142 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from typing import Optional
+
+import torch
+from generative_recommenders.common import (
+    fx_unwrap_optional_tensor,
+    jagged_to_padded_dense,
+)
+
+try:
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+except OSError:
+    pass
+
+
+@torch.fx.wrap
+def torch_arange(end: int, device: torch.device) -> torch.Tensor:
+    return torch.arange(end, device=device)
+
+
+@torch.fx.wrap
+def _get_col_indices(
+    max_seq_len: int,
+    max_contextual_seq_len: int,
+    max_pos_ind: int,
+    seq_lengths: torch.Tensor,
+    num_targets: Optional[torch.Tensor],
+    interleave_targets: bool,
+) -> torch.Tensor:
+    B = seq_lengths.size(0)
+    col_indices = torch.arange(max_seq_len, device=seq_lengths.device).expand(
+        B, max_seq_len
+    )
+    if num_targets is not None:
+        if interleave_targets:
+            high_inds = seq_lengths - fx_unwrap_optional_tensor(num_targets) * 2
+        else:
+            high_inds = seq_lengths - fx_unwrap_optional_tensor(num_targets)
+        col_indices = torch.clamp(col_indices, max=high_inds.view(-1, 1))
+        col_indices = high_inds.view(-1, 1) - col_indices
+    else:
+        col_indices = seq_lengths.view(-1, 1) - col_indices
+    col_indices = col_indices + max_contextual_seq_len
+    col_indices = torch.clamp(col_indices, max=max_pos_ind - 1)
+    if max_contextual_seq_len > 0:
+        col_indices[:, :max_contextual_seq_len] = torch.arange(
+            0,
+            max_contextual_seq_len,
+            device=col_indices.device,
+            dtype=col_indices.dtype,
+        ).view(1, -1)
+    return col_indices
+
+
+def pytorch_add_timestamp_positional_embeddings(
+    seq_embeddings: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    pos_embeddings: torch.Tensor,
+    ts_embeddings: torch.Tensor,
+    timestamps: torch.Tensor,
+    max_seq_len: int,
+    max_contextual_seq_len: int,
+    seq_lengths: torch.Tensor,
+    num_targets: Optional[torch.Tensor],
+    interleave_targets: bool,
+    time_bucket_fn: str,
+) -> torch.Tensor:
+    max_pos_ind = int(pos_embeddings.size(0))
+    # position encoding
+    pos_inds = _get_col_indices(
+        max_seq_len=max_seq_len,
+        max_contextual_seq_len=max_contextual_seq_len,
+        max_pos_ind=max_pos_ind,
+        seq_lengths=seq_lengths,
+        num_targets=num_targets,
+        interleave_targets=interleave_targets,
+    )
+    B, _ = pos_inds.shape
+    # timestamp encoding
+    num_time_buckets = ts_embeddings.size(1) - 1
+    time_bucket_increments = 60.0
+    time_bucket_divisor = 1.0
+    time_delta = 0
+    timestamps = jagged_to_padded_dense(
+        values=timestamps.unsqueeze(-1),
+        offsets=[seq_offsets],
+        max_lengths=[max_seq_len],
+        padding_value=0.0,
+    ).squeeze(-1)
+    query_time = torch.gather(
+        timestamps, dim=1, index=(seq_lengths - 1).unsqueeze(1).clamp(min=0)
+    )
+    ts = query_time - timestamps
+    ts = ts + time_delta
+    ts = ts.clamp(min=1e-6) / time_bucket_increments
+    if time_bucket_fn == "log":
+        ts = torch.log(ts)
+    else:
+        ts = torch.sqrt(ts)
+    ts = (ts / time_bucket_divisor).clamp(min=0).int()
+    ts = torch.clamp(
+        ts,
+        min=0,
+        max=num_time_buckets,
+    )
+    position_embeddings = torch.index_select(
+        pos_embeddings, 0, pos_inds.reshape(-1)
+    ).view(B, max_seq_len, -1)
+    time_embeddings = torch.index_select(ts_embeddings, 0, ts.reshape(-1)).view(
+        B, max_seq_len, -1
+    )
+    padded_emb = torch.ops.fbgemm.jagged_to_padded_dense(
+        values=seq_embeddings,
+        offsets=[seq_offsets],
+        max_lengths=[max_seq_len],
+        padding_value=0.0,
+    )
+    summed = padded_emb + (time_embeddings + position_embeddings).to(
+        seq_embeddings.dtype
+    )
+    result, _ = torch.ops.fbgemm.dense_to_jagged(
+        summed, [seq_offsets], seq_embeddings.shape[0]
+    )
+    return result
diff --git a/recommendation_v4/generative_recommenders/ops/tests/fake_signature_test.py b/recommendation_v4/generative_recommenders/ops/tests/fake_signature_test.py
new file mode 100644
index 000000000..fb9047454
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/tests/fake_signature_test.py
@@ -0,0 +1,162 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+"""
+Tests to ensure fake and real implementations of triton functions
+have the same function signatures. This is critical for PT2 compile compatibility.
+"""
+
+import inspect
+import unittest
+from typing import Any, Callable, List
+
+
+def get_custom_op_params(func: Callable[..., object]) -> List[str]:
+    """
+    Get parameter names from a function, handling custom_op decorated functions.
+
+    For maybe_register_custom_op decorated functions, inspect.signature may return
+    *args, **kwargs instead of the actual parameters. In this case, we need to
+    access the underlying schema to get the real parameter names.
+    """
+    sig = inspect.signature(func)
+    params = list(sig.parameters.keys())
+
+    if params == ["args", "kwargs"]:
+        func_any: Any = func
+        if hasattr(func_any, "_opoverload"):
+            schema = func_any._opoverload._schema
+            return [arg.name for arg in schema.arguments]
+
+    return params
+
+
+class FakeSignatureTest(unittest.TestCase):
+    """Test to ensure fake and real implementations have the same function signatures."""
+
+    def test_triton_addmm_fwd_and_fake_have_same_signature(self) -> None:
+        """Verify triton_addmm_fwd and triton_addmm_fwd_fake have the same arguments."""
+        from generative_recommenders.ops.triton.triton_addmm import (
+            triton_addmm_fwd,
+            triton_addmm_fwd_fake,
+        )
+
+        real_params = get_custom_op_params(triton_addmm_fwd)
+        fake_params = get_custom_op_params(triton_addmm_fwd_fake)
+
+        self.assertEqual(
+            real_params,
+            fake_params,
+            f"triton_addmm_fwd and triton_addmm_fwd_fake have different arguments.\n"
+            f"Real: {real_params}\n"
+            f"Fake: {fake_params}",
+        )
+
+    def test_maybe_triton_addmm_fwd_and_fake_have_same_signature(self) -> None:
+        """Verify maybe_triton_addmm_fwd and maybe_triton_addmm_fwd_fake have the same arguments."""
+        from generative_recommenders.ops.triton.triton_addmm import (
+            maybe_triton_addmm_fwd,
+            maybe_triton_addmm_fwd_fake,
+        )
+
+        real_params = get_custom_op_params(maybe_triton_addmm_fwd)
+        fake_params = get_custom_op_params(maybe_triton_addmm_fwd_fake)
+
+        self.assertEqual(
+            real_params,
+            fake_params,
+            f"maybe_triton_addmm_fwd and maybe_triton_addmm_fwd_fake have different arguments.\n"
+            f"Real: {real_params}\n"
+            f"Fake: {fake_params}",
+        )
+
+    def test_triton_hstu_attention_fwd_and_fake_have_same_signature(self) -> None:
+        """Verify triton_hstu_attention_fwd and _triton_hstu_attention_fwd_fake have the same arguments."""
+        from generative_recommenders.ops.triton.triton_hstu_attention import (
+            _triton_hstu_attention_fwd_fake,
+            triton_hstu_attention_fwd,
+        )
+
+        real_params = get_custom_op_params(triton_hstu_attention_fwd)
+        fake_params = get_custom_op_params(_triton_hstu_attention_fwd_fake)
+
+        self.assertEqual(
+            real_params,
+            fake_params,
+            f"triton_hstu_attention_fwd and _triton_hstu_attention_fwd_fake have different arguments.\n"
+            f"Real: {real_params}\n"
+            f"Fake: {fake_params}",
+        )
+
+    def test_triton_hstu_attention_bwd_and_fake_have_same_signature(self) -> None:
+        """Verify triton_hstu_attention_bwd and _triton_hstu_attention_bwd_fake have the same arguments."""
+        from generative_recommenders.ops.triton.triton_hstu_attention import (
+            _triton_hstu_attention_bwd_fake,
+            triton_hstu_attention_bwd,
+        )
+
+        real_params = get_custom_op_params(triton_hstu_attention_bwd)
+        fake_params = get_custom_op_params(_triton_hstu_attention_bwd_fake)
+
+        self.assertEqual(
+            real_params,
+            fake_params,
+            f"triton_hstu_attention_bwd and _triton_hstu_attention_bwd_fake have different arguments.\n"
+            f"Real: {real_params}\n"
+            f"Fake: {fake_params}",
+        )
+
+    def test_triton_layer_norm_mul_dropout_fwd_impl_and_fake_have_same_signature(
+        self,
+    ) -> None:
+        """Verify _triton_layer_norm_mul_dropout_fwd_impl and its fake have the same arguments."""
+        from generative_recommenders.ops.triton.triton_hstu_linear import (
+            _triton_layer_norm_mul_dropout_fwd_impl,
+            _triton_layer_norm_mul_dropout_fwd_impl_fake,
+        )
+
+        real_params = get_custom_op_params(_triton_layer_norm_mul_dropout_fwd_impl)
+        fake_params = get_custom_op_params(_triton_layer_norm_mul_dropout_fwd_impl_fake)
+
+        self.assertEqual(
+            real_params,
+            fake_params,
+            f"_triton_layer_norm_mul_dropout_fwd_impl and _triton_layer_norm_mul_dropout_fwd_impl_fake have different arguments.\n"
+            f"Real: {real_params}\n"
+            f"Fake: {fake_params}",
+        )
+
+    def test_triton_layer_norm_mul_dropout_bwd_impl_and_fake_have_same_signature(
+        self,
+    ) -> None:
+        """Verify _triton_layer_norm_mul_dropout_bwd_impl and its fake have the same arguments."""
+        from generative_recommenders.ops.triton.triton_hstu_linear import (
+            _triton_layer_norm_mul_dropout_bwd_impl,
+            _triton_layer_norm_mul_dropout_bwd_impl_fake,
+        )
+
+        real_params = get_custom_op_params(_triton_layer_norm_mul_dropout_bwd_impl)
+        fake_params = get_custom_op_params(_triton_layer_norm_mul_dropout_bwd_impl_fake)
+
+        self.assertEqual(
+            real_params,
+            fake_params,
+            f"_triton_layer_norm_mul_dropout_bwd_impl and _triton_layer_norm_mul_dropout_bwd_impl_fake have different arguments.\n"
+            f"Real: {real_params}\n"
+            f"Fake: {fake_params}",
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/tests/hstu_attention_test.py b/recommendation_v4/generative_recommenders/ops/tests/hstu_attention_test.py
new file mode 100644
index 000000000..ef70b8cc2
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/tests/hstu_attention_test.py
@@ -0,0 +1,485 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import random
+import unittest
+from typing import Optional
+
+import torch
+from generative_recommenders.common import (
+    generate_sparse_seq_len,
+    gpu_unavailable,
+    HammerKernel,
+    set_dev_mode,
+)
+from generative_recommenders.ops.jagged_tensors import split_2D_jagged
+from hypothesis import given, settings, strategies as st, Verbosity
+
+
+def test_attn(
+    batch_size: int,
+    heads: int,
+    max_uih_len: int,
+    max_targets: int,
+    attn_dim: int,
+    hidden_dim: int,
+    causal: bool,
+    has_multiple_targets: bool,
+    has_max_attn_len: bool,
+    dtype: torch.dtype,
+    test_backward: bool,
+    ref_kernel: HammerKernel,
+    real_kernel: HammerKernel,
+    skip_comparisons: bool = False,
+    sparsity: float = -1.0,
+    contextual_seq_len: int = 0,
+    atol: Optional[float] = None,
+    rtol: Optional[float] = None,
+    enable_tma: bool = False,
+) -> None:
+    set_dev_mode(True)
+    torch.backends.cudnn.allow_tf32 = True
+    torch.backends.cuda.matmul.allow_tf32 = True
+    from generative_recommenders.ops.hstu_attention import hstu_mha
+
+    alpha = 1.0 / (attn_dim**0.5)
+    if sparsity > 0.0:
+        lengths = generate_sparse_seq_len(
+            size=batch_size,
+            max_seq_len=max_uih_len,
+            sparsity=sparsity,
+            device=torch.device("cuda"),
+        )
+    else:
+        lengths = torch.randint(
+            max_uih_len + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+    num_targets = torch.randint(
+        1, max_targets + 1, size=(batch_size,), device=torch.device("cuda")
+    )
+    lengths = lengths + num_targets + contextual_seq_len
+    max_seq_len = max_uih_len + max_targets + contextual_seq_len
+    if has_max_attn_len:
+        max_attn_len = random.randint(1, max_uih_len // 5)
+    else:
+        max_attn_len = 0
+    seq_offsets = torch.zeros(
+        (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+    )
+    seq_offsets[1:] = torch.cumsum(lengths, dim=0)
+
+    L = int(seq_offsets[-1].item())
+    q = (
+        torch.empty((L, heads, attn_dim), dtype=dtype, device=torch.device("cuda"))
+        .uniform_(-0.1, 0.1)
+        .requires_grad_()
+    )
+    k = (
+        torch.empty((L, heads, attn_dim), dtype=dtype, device=torch.device("cuda"))
+        .uniform_(-0.1, 0.1)
+        .requires_grad_()
+    )
+    v = (
+        torch.empty((L, heads, hidden_dim), dtype=dtype, device=torch.device("cuda"))
+        .uniform_(-0.1, 0.1)
+        .requires_grad_()
+    )
+
+    # ref implementation
+    ref_out = hstu_mha(
+        max_seq_len=max_seq_len,
+        alpha=alpha,
+        q=q,
+        k=k,
+        v=v,
+        seq_offsets=seq_offsets,
+        causal=causal,
+        num_targets=num_targets if has_multiple_targets else None,
+        dropout_pr=0.0,
+        max_attn_len=max_attn_len,
+        contextual_seq_len=contextual_seq_len,
+        kernel=ref_kernel,
+        enable_tma=enable_tma,
+    )
+    dout = torch.randn_like(ref_out)
+    ref_out.backward(dout)
+
+    if skip_comparisons:
+        return
+
+    # pyre-ignore
+    ref_dv, v.grad = v.grad.clone(), None
+    ref_dk, k.grad = k.grad.clone(), None
+    ref_dq, q.grad = q.grad.clone(), None
+
+    # triton implementation
+    q = q.detach().clone().requires_grad_()
+    k = k.detach().clone().requires_grad_()
+    v = v.detach().clone().requires_grad_()
+    dout = dout.detach().clone()
+    real_out = hstu_mha(
+        max_seq_len=max_seq_len,
+        alpha=alpha,
+        q=q,
+        k=k,
+        v=v,
+        seq_offsets=seq_offsets,
+        causal=causal,
+        num_targets=num_targets if has_multiple_targets else None,
+        dropout_pr=0.0,
+        max_attn_len=max_attn_len,
+        contextual_seq_len=contextual_seq_len,
+        kernel=real_kernel,
+        enable_tma=enable_tma,
+    )
+
+    torch.testing.assert_close(
+        ref_out,
+        real_out,
+        atol=atol,
+        rtol=rtol,
+    )
+    if test_backward:
+        real_out.backward(dout)
+        real_dq, real_dk, real_dv = q.grad.clone(), k.grad.clone(), v.grad.clone()
+        torch.testing.assert_close(ref_dv, real_dv, atol=atol, rtol=rtol)
+        torch.testing.assert_close(ref_dk, real_dk, atol=atol, rtol=rtol)
+        torch.testing.assert_close(ref_dq, real_dq, atol=atol, rtol=rtol)
+
+
+def test_delta_attn(
+    batch_size: int,
+    heads: int,
+    max_uih_len: int,
+    max_targets: int,
+    delta_size: int,
+    attn_dim: int,
+    hidden_dim: int,
+    has_multiple_targets: bool,
+    has_max_attn_len: bool,
+    dtype: torch.dtype,
+    ref_kernel: HammerKernel,
+    real_kernel: HammerKernel,
+    contextual_seq_len: int = 0,
+    atol: Optional[float] = None,
+    rtol: Optional[float] = None,
+    enable_tma: bool = False,
+) -> None:
+    set_dev_mode(True)
+    torch.backends.cudnn.allow_tf32 = True
+    torch.backends.cuda.matmul.allow_tf32 = True
+    from generative_recommenders.ops.hstu_attention import delta_hstu_mha
+
+    alpha = 1.0 / (attn_dim**0.5)
+    lengths = torch.randint(
+        max_uih_len + 1, size=(batch_size,), device=torch.device("cuda")
+    )
+    num_targets = torch.randint(
+        1, delta_size + 1, size=(batch_size,), device=torch.device("cuda")
+    )
+    lengths = lengths + delta_size + contextual_seq_len
+    max_seq_len = max_uih_len + delta_size + contextual_seq_len
+    if has_max_attn_len:
+        max_attn_len = random.randint(1, max_uih_len // 5)
+    else:
+        max_attn_len = 0
+    seq_offsets = torch.zeros(
+        (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+    )
+    seq_offsets[1:] = torch.cumsum(lengths, dim=0)
+
+    L = int(seq_offsets[-1].item())
+    delta_q = torch.empty(
+        (batch_size * delta_size, heads, attn_dim),
+        dtype=dtype,
+        device=torch.device("cuda"),
+    ).uniform_(-0.1, 0.1)
+    k = torch.empty(
+        (L, heads, attn_dim), dtype=dtype, device=torch.device("cuda")
+    ).uniform_(-0.1, 0.1)
+    v = torch.empty(
+        (L, heads, hidden_dim), dtype=dtype, device=torch.device("cuda")
+    ).uniform_(-0.1, 0.1)
+
+    # ref implementation
+    ref_out = delta_hstu_mha(
+        max_seq_len=max_seq_len,
+        alpha=alpha,
+        delta_q=delta_q,
+        k=k,
+        v=v,
+        seq_offsets=seq_offsets,
+        num_targets=num_targets if has_multiple_targets else None,
+        max_attn_len=max_attn_len,
+        contextual_seq_len=contextual_seq_len,
+        kernel=ref_kernel,
+        enable_tma=enable_tma,
+    )
+
+    # real implementation
+    real_out = delta_hstu_mha(
+        max_seq_len=max_seq_len,
+        alpha=alpha,
+        delta_q=delta_q,
+        k=k,
+        v=v,
+        seq_offsets=seq_offsets,
+        num_targets=num_targets if has_multiple_targets else None,
+        max_attn_len=max_attn_len,
+        contextual_seq_len=contextual_seq_len,
+        kernel=real_kernel,
+        enable_tma=enable_tma,
+    )
+    torch.testing.assert_close(
+        ref_out,
+        real_out,
+        atol=atol,
+        rtol=rtol,
+    )
+
+
+class HSTUAttentionTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(4, 8),
+        heads=st.integers(1, 4),
+        max_uih_len=st.sampled_from([20, 100, 128, 256]),
+        max_targets=st.sampled_from([20, 512]),
+        attn_dim=st.sampled_from([16, 32, 64, 128]),
+        hidden_dim=st.sampled_from([16, 32, 64, 128]),
+        causal=st.sampled_from([True]),
+        has_multiple_targets=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+        has_max_attn_len=st.sampled_from([True, False]),
+        contextual_seq_len=st.sampled_from([0, 10]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=200,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_attn_triton(self, *args, **kwargs) -> None:
+        test_attn(
+            *args,
+            **kwargs,
+            test_backward=True,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.just(64),
+        heads=st.just(4),
+        max_uih_len=st.sampled_from([32768]),
+        max_targets=st.sampled_from([32]),
+        attn_dim=st.just(128),
+        hidden_dim=st.just(128),
+        causal=st.sampled_from([True]),
+        has_multiple_targets=st.sampled_from([True, False]),
+        dtype=st.sampled_from([torch.bfloat16]),
+        has_max_attn_len=st.sampled_from([True, False]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=5,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_attn_triton_long_seqs(self, *args, **kwargs) -> None:
+        test_attn(
+            *args,
+            **kwargs,
+            test_backward=True,
+            ref_kernel=HammerKernel.TRITON,
+            real_kernel=HammerKernel.TRITON,
+            skip_comparisons=True,
+            sparsity=1.0,
+        )
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(4, 8),
+        heads=st.integers(1, 4),
+        max_uih_len=st.sampled_from([100, 128, 256]),
+        max_targets=st.sampled_from([20, 512]),
+        delta_size=st.sampled_from([20, 512]),
+        attn_dim=st.sampled_from([16, 32, 64, 128]),
+        hidden_dim=st.sampled_from([16, 32, 64, 128]),
+        has_multiple_targets=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+        has_max_attn_len=st.sampled_from([False, True]),
+        contextual_seq_len=st.sampled_from([0, 10]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=200,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_delta_attn_triton(self, *args, **kwargs) -> None:
+        test_delta_attn(
+            *args,
+            **kwargs,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(4, 8),
+        heads=st.integers(1, 4),
+        max_uih_len=st.sampled_from([20, 100, 128]),
+        max_targets=st.sampled_from([20, 512]),
+        delta_size=st.sampled_from([20, 512]),
+        attn_dim=st.sampled_from([16, 32, 64]),
+        hidden_dim=st.sampled_from([16, 32, 64]),
+        has_multiple_targets=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+        has_max_attn_len=st.sampled_from([False, True]),
+        contextual_seq_len=st.sampled_from([0, 10]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=200,
+        deadline=None,
+    )
+    def test_cache(
+        self,
+        batch_size: int,
+        heads: int,
+        max_uih_len: int,
+        max_targets: int,
+        delta_size: int,
+        attn_dim: int,
+        hidden_dim: int,
+        has_multiple_targets: bool,
+        dtype: torch.dtype,
+        has_max_attn_len: bool,
+        contextual_seq_len: int,
+    ) -> None:
+        set_dev_mode(True)
+        torch.backends.cudnn.allow_tf32 = True
+        torch.backends.cuda.matmul.allow_tf32 = True
+        from generative_recommenders.ops.hstu_attention import delta_hstu_mha, hstu_mha
+
+        alpha = 1.0 / (attn_dim**0.5)
+        lengths = torch.randint(
+            max_uih_len + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+        num_targets = torch.randint(
+            1, delta_size + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+        lengths = lengths + delta_size + contextual_seq_len
+        max_seq_len = max_uih_len + delta_size + contextual_seq_len
+        if has_max_attn_len:
+            max_attn_len = random.randint(1, max_uih_len // 5)
+        else:
+            max_attn_len = 0
+        seq_offsets = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        seq_offsets[1:] = torch.cumsum(lengths, dim=0)
+
+        L = int(seq_offsets[-1].item())
+        q = torch.empty(
+            (L, heads, attn_dim),
+            dtype=dtype,
+            device=torch.device("cuda"),
+        ).uniform_(-0.1, 0.1)
+        _, delta_q = split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=q.view(-1, heads * attn_dim),
+            max_len_left=None,
+            max_len_right=delta_size,
+            offsets_left=torch.ops.fbgemm.asynchronous_complete_cumsum(
+                lengths - delta_size
+            ),
+            offsets_right=None,
+            kernel=HammerKernel.TRITON,
+        )
+        delta_q = delta_q.view(-1, heads, attn_dim)
+        k = torch.empty(
+            (L, heads, attn_dim), dtype=dtype, device=torch.device("cuda")
+        ).uniform_(-0.1, 0.1)
+        v = torch.empty(
+            (L, heads, hidden_dim), dtype=dtype, device=torch.device("cuda")
+        ).uniform_(-0.1, 0.1)
+        prime_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(
+            lengths - delta_size
+        )
+
+        # ref implementation
+        ref_out = hstu_mha(
+            max_seq_len=max_seq_len,
+            alpha=alpha,
+            q=q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            causal=True,
+            num_targets=num_targets if has_multiple_targets else None,
+            dropout_pr=0.0,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            kernel=HammerKernel.TRITON,
+        )
+        _, delta_out = split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=ref_out.view(-1, heads * hidden_dim),
+            max_len_left=None,
+            max_len_right=delta_size,
+            offsets_left=prime_offsets,
+            offsets_right=None,
+            kernel=HammerKernel.TRITON,
+        )
+        delta_out = delta_out.view(-1, heads, hidden_dim)
+
+        # real implementation
+        real_delta_out = delta_hstu_mha(
+            max_seq_len=max_seq_len,
+            alpha=alpha,
+            delta_q=delta_q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets if has_multiple_targets else None,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+        )
+        torch.testing.assert_close(
+            delta_out,
+            real_delta_out,
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/tests/hstu_attention_tma_test.py b/recommendation_v4/generative_recommenders/ops/tests/hstu_attention_tma_test.py
new file mode 100644
index 000000000..8bb264af6
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/tests/hstu_attention_tma_test.py
@@ -0,0 +1,270 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import random
+import unittest
+
+import torch
+from generative_recommenders.common import (
+    HammerKernel,
+    nv_gpu_unavailable,
+    set_dev_mode,
+)
+from generative_recommenders.ops.jagged_tensors import split_2D_jagged
+from generative_recommenders.ops.tests.hstu_attention_test import (
+    test_attn,
+    test_delta_attn,
+)
+from hypothesis import given, settings, strategies as st, Verbosity
+
+
+class HSTUAttentionTmaTest(unittest.TestCase):
+    @unittest.skipIf(*nv_gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(4, 8),
+        heads=st.integers(1, 4),
+        max_uih_len=st.sampled_from([20, 100, 128, 256]),
+        max_targets=st.sampled_from([20, 512]),
+        attn_dim=st.sampled_from([16, 32, 64, 128]),
+        hidden_dim=st.sampled_from([16, 32, 64, 128]),
+        causal=st.sampled_from([True]),
+        has_multiple_targets=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+        has_max_attn_len=st.sampled_from([True, False]),
+        contextual_seq_len=st.sampled_from([0, 10]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=200,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_attn_triton_tma(self, *args, **kwargs) -> None:
+        test_attn(
+            *args,
+            **kwargs,
+            test_backward=True,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+            enable_tma=True,
+        )
+
+    @unittest.skipIf(*nv_gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.just(64),
+        heads=st.just(4),
+        max_uih_len=st.sampled_from([32768]),
+        max_targets=st.sampled_from([32]),
+        attn_dim=st.just(128),
+        hidden_dim=st.just(128),
+        causal=st.sampled_from([True]),
+        has_multiple_targets=st.sampled_from([True, False]),
+        dtype=st.sampled_from([torch.bfloat16]),
+        has_max_attn_len=st.sampled_from([True, False]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=5,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_attn_triton_long_seqs_tma(self, *args, **kwargs) -> None:
+        test_attn(
+            *args,
+            **kwargs,
+            test_backward=True,
+            ref_kernel=HammerKernel.TRITON,
+            real_kernel=HammerKernel.TRITON,
+            skip_comparisons=True,
+            sparsity=1.0,
+            enable_tma=True,
+        )
+
+    @unittest.skipIf(*nv_gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(4, 8),
+        heads=st.integers(1, 4),
+        max_uih_len=st.sampled_from([100, 128, 256]),
+        max_targets=st.sampled_from([20, 512]),
+        delta_size=st.sampled_from([20, 512]),
+        attn_dim=st.sampled_from([16, 32, 64, 128]),
+        hidden_dim=st.sampled_from([16, 32, 64, 128]),
+        has_multiple_targets=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+        has_max_attn_len=st.sampled_from([False, True]),
+        contextual_seq_len=st.sampled_from([0, 10]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=200,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_delta_attn_triton_tma(self, *args, **kwargs) -> None:
+        test_delta_attn(
+            *args,
+            **kwargs,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+            enable_tma=True,
+        )
+
+    @unittest.skipIf(*nv_gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(4, 8),
+        heads=st.integers(1, 4),
+        max_uih_len=st.sampled_from([20, 100, 128]),
+        max_targets=st.sampled_from([20, 512]),
+        delta_size=st.sampled_from([20, 512]),
+        attn_dim=st.sampled_from([16, 32, 64]),
+        hidden_dim=st.sampled_from([16, 32, 64]),
+        has_multiple_targets=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+        has_max_attn_len=st.sampled_from([False, True]),
+        contextual_seq_len=st.sampled_from([0, 10]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=200,
+        deadline=None,
+    )
+    def test_cache_tma(
+        self,
+        batch_size: int,
+        heads: int,
+        max_uih_len: int,
+        max_targets: int,
+        delta_size: int,
+        attn_dim: int,
+        hidden_dim: int,
+        has_multiple_targets: bool,
+        dtype: torch.dtype,
+        has_max_attn_len: bool,
+        contextual_seq_len: int,
+    ) -> None:
+        set_dev_mode(True)
+        torch.backends.cudnn.allow_tf32 = True
+        torch.backends.cuda.matmul.allow_tf32 = True
+        from generative_recommenders.ops.hstu_attention import delta_hstu_mha, hstu_mha
+
+        alpha = 1.0 / (attn_dim**0.5)
+        lengths = torch.randint(
+            max_uih_len + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+        num_targets = torch.randint(
+            1, delta_size + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+        lengths = lengths + delta_size + contextual_seq_len
+        max_seq_len = max_uih_len + delta_size + contextual_seq_len
+        if has_max_attn_len:
+            max_attn_len = random.randint(1, max_uih_len // 5)
+        else:
+            max_attn_len = 0
+        seq_offsets = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        seq_offsets[1:] = torch.cumsum(lengths, dim=0)
+
+        L = int(seq_offsets[-1].item())
+        q = torch.empty(
+            (L, heads, attn_dim),
+            dtype=dtype,
+            device=torch.device("cuda"),
+        ).uniform_(-0.1, 0.1)
+        _, delta_q = split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=q.view(-1, heads * attn_dim),
+            max_len_left=None,
+            max_len_right=delta_size,
+            offsets_left=torch.ops.fbgemm.asynchronous_complete_cumsum(
+                lengths - delta_size
+            ),
+            offsets_right=None,
+            kernel=HammerKernel.TRITON,
+        )
+        delta_q = delta_q.view(-1, heads, attn_dim)
+        k = torch.empty(
+            (L, heads, attn_dim), dtype=dtype, device=torch.device("cuda")
+        ).uniform_(-0.1, 0.1)
+        v = torch.empty(
+            (L, heads, hidden_dim), dtype=dtype, device=torch.device("cuda")
+        ).uniform_(-0.1, 0.1)
+        prime_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(
+            lengths - delta_size
+        )
+
+        # ref implementation
+        ref_out = hstu_mha(
+            max_seq_len=max_seq_len,
+            alpha=alpha,
+            q=q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            causal=True,
+            num_targets=num_targets if has_multiple_targets else None,
+            dropout_pr=0.0,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            kernel=HammerKernel.TRITON,
+            enable_tma=True,
+        )
+        _, delta_out = split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=ref_out.view(-1, heads * hidden_dim),
+            max_len_left=None,
+            max_len_right=delta_size,
+            offsets_left=prime_offsets,
+            offsets_right=None,
+            kernel=HammerKernel.TRITON,
+        )
+        delta_out = delta_out.view(-1, heads, hidden_dim)
+
+        # real implementation
+        real_delta_out = delta_hstu_mha(
+            max_seq_len=max_seq_len,
+            alpha=alpha,
+            delta_q=delta_q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets if has_multiple_targets else None,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            enable_tma=True,
+        )
+        torch.testing.assert_close(
+            delta_out,
+            real_delta_out,
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/tests/hstu_compute_test.py b/recommendation_v4/generative_recommenders/ops/tests/hstu_compute_test.py
new file mode 100644
index 000000000..57f217895
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/tests/hstu_compute_test.py
@@ -0,0 +1,503 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import random
+import unittest
+from typing import Optional
+
+import torch
+from generative_recommenders.common import (
+    generate_sparse_seq_len,
+    gpu_unavailable,
+    HammerKernel,
+    set_dev_mode,
+)
+from hypothesis import given, settings, strategies as st, Verbosity
+
+
+class HSTUComputeTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore[56]
+    @given(
+        N=st.integers(min_value=1000, max_value=1000),
+        D=st.integers(min_value=128, max_value=128),
+        L=st.integers(min_value=512, max_value=512),
+        concat_u=st.booleans(),
+        concat_x=st.booleans(),
+        mul_u_activation_type=st.sampled_from(["silu", "sigmoid", "none"]),
+        group_norm=st.booleans(),
+        num_heads=st.sampled_from([4]),
+        training=st.just(False),
+        recompute_y_in_backward=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.float32, torch.bfloat16]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        deadline=None,
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+    )
+    # pyre-ignore[2]
+    def test_compute_output(self, *args, **kwargs) -> None:
+        self._test_compute_output(
+            *args,
+            **kwargs,
+            test_backward=True,
+            ref_kernel=HammerKernel.PYTORCH,
+            opt_kernel=HammerKernel.TRITON,
+        )
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore[56]
+    @given(
+        N=st.just(1500000),
+        D=st.just(512),
+        L=st.just(512),
+        concat_u=st.sampled_from([True]),
+        concat_x=st.sampled_from([True]),
+        mul_u_activation_type=st.sampled_from(["none"]),
+        group_norm=st.sampled_from([False]),
+        num_heads=st.sampled_from([4]),
+        training=st.just(False),
+        recompute_y_in_backward=st.sampled_from([False]),
+        dtype=st.just(torch.bfloat16),
+    )
+    @settings(
+        deadline=None,
+        verbosity=Verbosity.verbose,
+        max_examples=1,
+    )
+    # pyre-ignore[2]
+    def test_long_sequences_compute_output(self, *args, **kwargs) -> None:
+        self._test_compute_output(
+            *args,
+            **kwargs,
+            test_backward=False,
+            ref_kernel=HammerKernel.TRITON,
+            opt_kernel=HammerKernel.TRITON,
+            skip_comparisons=True,
+        )
+
+    def _test_compute_output(
+        self,
+        N: int,
+        D: int,
+        L: int,
+        concat_u: bool,
+        concat_x: bool,
+        mul_u_activation_type: str,
+        group_norm: bool,
+        num_heads: int,
+        training: bool,
+        recompute_y_in_backward: bool,
+        dtype: torch.dtype,
+        test_backward: bool,
+        ref_kernel: HammerKernel,
+        opt_kernel: HammerKernel,
+        skip_comparisons: bool = False,
+        atol: Optional[float] = None,
+        rtol: Optional[float] = None,
+    ) -> None:
+        from generative_recommenders.ops.hstu_compute import hstu_compute_output
+
+        torch.manual_seed(0)
+        torch.backends.cudnn.allow_tf32 = False
+        torch.backends.cuda.matmul.allow_tf32 = False
+        dropout_ratio = 0.3 if training else 0.0
+        attn = (
+            torch.empty((N, L), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        u = (
+            torch.empty((N, L), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        norm_weight = (
+            torch.empty(
+                (L if not group_norm else num_heads,),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        norm_bias = (
+            torch.empty(
+                (L if not group_norm else num_heads,),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        norm_eps = 1e-6
+        # When group_norm=True, only concat_ux = concat_u and concat_x is supported
+        if group_norm:
+            L_mult = 3 if (concat_u and concat_x) else 1
+        else:
+            L_mult = 1
+            if concat_u:
+                L_mult += 1
+            if concat_x:
+                L_mult += 1
+        output_weight = (
+            torch.empty((L * L_mult, D), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        x = (
+            torch.empty((N, D), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+
+        # ref
+        ref_out = hstu_compute_output(
+            attn=attn,
+            u=u,
+            x=x,
+            norm_weight=norm_weight,
+            norm_bias=norm_bias,
+            norm_eps=norm_eps,
+            dropout_ratio=dropout_ratio,
+            output_weight=output_weight,
+            concat_u=concat_u,
+            concat_x=concat_x,
+            mul_u_activation_type=mul_u_activation_type,
+            group_norm=group_norm,
+            num_heads=num_heads,
+            linear_dim=L // num_heads,
+            training=training,
+            recompute_y_in_backward=recompute_y_in_backward,
+            kernel=ref_kernel,
+        )
+        dout = torch.randn_like(ref_out) * 0.1
+        ref_out.backward(dout)
+        if skip_comparisons:
+            return
+        # pyre-ignore[16]
+        ref_dattn, attn.grad = attn.grad.detach().clone(), None
+        ref_du, u.grad = u.grad.detach().clone(), None
+        ref_d_norm_w, norm_weight.grad = norm_weight.grad.detach().clone(), None
+        ref_d_norm_b, norm_bias.grad = norm_bias.grad.detach().clone(), None
+        ref_dx, x.grad = x.grad.detach().clone(), None
+        ref_d_output_w, output_weight.grad = output_weight.grad.detach().clone(), None
+
+        # opt
+        attn = attn.detach().clone().requires_grad_()
+        u = u.detach().clone().requires_grad_()
+        norm_weight = norm_weight.detach().clone().requires_grad_()
+        norm_bias = norm_bias.detach().clone().requires_grad_()
+        output_weight = output_weight.detach().clone().requires_grad_()
+        x = x.detach().clone().requires_grad_()
+        opt_out = hstu_compute_output(
+            attn=attn,
+            u=u,
+            x=x,
+            norm_weight=norm_weight,
+            norm_bias=norm_bias,
+            norm_eps=norm_eps,
+            dropout_ratio=dropout_ratio,
+            output_weight=output_weight,
+            concat_u=concat_u,
+            concat_x=concat_x,
+            mul_u_activation_type=mul_u_activation_type,
+            group_norm=group_norm,
+            num_heads=num_heads,
+            linear_dim=L // num_heads,
+            training=training,
+            recompute_y_in_backward=recompute_y_in_backward,
+            kernel=opt_kernel,
+        )
+        torch.testing.assert_close(
+            ref_out,
+            opt_out,
+            atol=atol,
+            rtol=rtol,
+        )
+
+        if test_backward:
+            dout = dout.detach().clone()
+            opt_out.backward(dout)
+            opt_dattn, attn.grad = attn.grad.detach().clone(), None
+            opt_du, u.grad = u.grad.detach().clone(), None
+            opt_d_norm_w, norm_weight.grad = norm_weight.grad.detach().clone(), None
+            opt_d_norm_b, norm_bias.grad = norm_bias.grad.detach().clone(), None
+            opt_dx, x.grad = x.grad.detach().clone(), None
+            opt_d_output_w, output_weight.grad = (
+                output_weight.grad.detach().clone(),
+                None,
+            )
+            torch.testing.assert_close(ref_du, opt_du)
+            torch.testing.assert_close(ref_dattn, opt_dattn)
+            torch.testing.assert_close(ref_d_norm_w, opt_d_norm_w)
+            torch.testing.assert_close(ref_d_norm_b, opt_d_norm_b)
+            torch.testing.assert_close(ref_dx, opt_dx)
+            torch.testing.assert_close(ref_d_output_w, opt_d_output_w)
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(4, 8),
+        heads=st.integers(1, 4),
+        max_uih_len=st.sampled_from([100, 128, 256, 1300]),
+        max_targets=st.sampled_from([20, 512]),
+        embedding_dim=st.sampled_from([16, 32, 64]),
+        attn_dim=st.sampled_from([16, 32, 64, 128]),
+        hidden_dim=st.sampled_from([16, 32, 64, 128]),
+        causal=st.sampled_from([True]),
+        has_multiple_targets=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+        contextual_seq_len=st.sampled_from([0]),
+        has_max_attn_len=st.sampled_from([False, True]),
+        sort_by_length=st.sampled_from([True, False]),
+        recompute_uvqk_in_backward=st.sampled_from([True, False]),
+        recompute_normed_x_in_backward=st.sampled_from([True, False]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=150,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_preprocess_and_attention(self, *args, **kwargs) -> None:
+        self._test_hstu_preprocess_and_attention(
+            *args,
+            **kwargs,
+            test_backward=True,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    def _test_hstu_preprocess_and_attention(
+        self,
+        batch_size: int,
+        heads: int,
+        max_uih_len: int,
+        max_targets: int,
+        embedding_dim: int,
+        attn_dim: int,
+        hidden_dim: int,
+        causal: bool,
+        has_multiple_targets: bool,
+        has_max_attn_len: bool,
+        dtype: torch.dtype,
+        ref_kernel: HammerKernel,
+        real_kernel: HammerKernel,
+        test_backward: bool,
+        contextual_seq_len: int,
+        sort_by_length: bool,
+        recompute_uvqk_in_backward: bool,
+        recompute_normed_x_in_backward: bool,
+        sparsity: float = -1.0,
+        atol: Optional[float] = None,
+        rtol: Optional[float] = None,
+    ) -> None:
+        set_dev_mode(True)
+        torch.backends.cudnn.allow_tf32 = False
+        torch.backends.cuda.matmul.allow_tf32 = False
+
+        from generative_recommenders.ops.hstu_compute import (
+            hstu_preprocess_and_attention,
+        )
+
+        alpha = 1.0 / (attn_dim**0.5)
+        if sparsity > 0.0:
+            lengths = generate_sparse_seq_len(
+                size=batch_size,
+                max_seq_len=max_uih_len,
+                sparsity=sparsity,
+                device=torch.device("cuda"),
+            )
+        else:
+            lengths = torch.randint(
+                max_uih_len + 1, size=(batch_size,), device=torch.device("cuda")
+            )
+
+        num_targets = torch.randint(
+            max_targets + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+        lengths = lengths + num_targets
+        max_seq_len = max_uih_len + max_targets
+        if has_max_attn_len:
+            max_attn_len = random.randint(1, max_uih_len // 5)
+        else:
+            max_attn_len = 0
+        seq_offsets = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        seq_offsets[1:] = torch.cumsum(lengths, dim=0)
+
+        L = int(seq_offsets[-1].item())
+
+        x = (
+            torch.empty((L, embedding_dim), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        norm_weight = (
+            torch.empty((embedding_dim,), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        norm_bias = (
+            torch.empty(
+                (embedding_dim,),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        norm_eps = 1e-6
+        uvqk_weight = (
+            torch.empty(
+                (
+                    embedding_dim,
+                    (hidden_dim * 2 + attn_dim * 2) * heads,
+                ),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        uvqk_bias = (
+            torch.empty(
+                (hidden_dim * 2 + attn_dim * 2) * heads,
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+
+        # ref implementation
+        ref_u, ref_attn_output, _, _ = hstu_preprocess_and_attention(
+            x=x,
+            norm_weight=norm_weight,
+            norm_bias=norm_bias,
+            norm_eps=norm_eps,
+            num_heads=heads,
+            attn_dim=attn_dim,
+            hidden_dim=hidden_dim,
+            uvqk_weight=uvqk_weight,
+            uvqk_bias=uvqk_bias,
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            attn_alpha=alpha,
+            causal=causal,
+            num_targets=num_targets,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            recompute_uvqk_in_backward=recompute_uvqk_in_backward,
+            recompute_normed_x_in_backward=recompute_normed_x_in_backward,
+            sort_by_length=sort_by_length,
+            kernel=ref_kernel,
+        )
+        ref_out = ref_u + ref_attn_output
+        dout = torch.randn_like(ref_out) * 0.01
+        ref_out.backward(dout)
+
+        # pyre-ignore
+        ref_dx, x.grad = x.grad.clone(), None
+        ref_d_norm_weight, norm_weight.grad = norm_weight.grad.clone(), None
+        ref_d_norm_bias, norm_bias.grad = norm_bias.grad.clone(), None
+        ref_d_uvqk_weight, uvqk_weight.grad = uvqk_weight.grad.clone(), None
+        ref_d_uvqk_bias, uvqk_bias.grad = uvqk_bias.grad.clone(), None
+
+        # real implementation
+        x = x.detach().clone().requires_grad_()
+        norm_weight = norm_weight.detach().clone().requires_grad_()
+        norm_bias = norm_bias.detach().clone().requires_grad_()
+        uvqk_weight = uvqk_weight.detach().clone().requires_grad_()
+        uvqk_bias = uvqk_bias.detach().clone().requires_grad_()
+        real_u, real_attn_output, _, _ = hstu_preprocess_and_attention(
+            x=x,
+            norm_weight=norm_weight,
+            norm_bias=norm_bias,
+            norm_eps=norm_eps,
+            num_heads=heads,
+            attn_dim=attn_dim,
+            hidden_dim=hidden_dim,
+            uvqk_weight=uvqk_weight,
+            uvqk_bias=uvqk_bias,
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            attn_alpha=alpha,
+            causal=causal,
+            num_targets=num_targets,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            recompute_uvqk_in_backward=recompute_uvqk_in_backward,
+            recompute_normed_x_in_backward=recompute_normed_x_in_backward,
+            sort_by_length=sort_by_length,
+            kernel=real_kernel,
+        )
+        real_out = real_u + real_attn_output
+        torch.testing.assert_close(
+            ref_u,
+            real_u,
+            atol=atol,
+            rtol=rtol,
+        )
+        torch.testing.assert_close(
+            ref_attn_output,
+            real_attn_output,
+            atol=atol,
+            rtol=rtol,
+        )
+        if test_backward:
+            # real implementation
+            dout = dout.detach().clone()
+            real_out.backward(dout)
+            (
+                real_dx,
+                real_d_norm_weight,
+                real_d_norm_bias,
+                real_d_uvqk_weight,
+                real_d_uvqk_bias,
+            ) = (
+                x.grad.clone(),
+                norm_weight.grad.clone(),
+                norm_bias.grad.clone(),
+                uvqk_weight.grad.clone(),
+                uvqk_bias.grad.clone(),
+            )
+            torch.testing.assert_close(ref_dx, real_dx, atol=atol, rtol=rtol)
+            torch.testing.assert_close(
+                ref_d_norm_weight, real_d_norm_weight, atol=atol, rtol=rtol
+            )
+            torch.testing.assert_close(
+                ref_d_norm_bias, real_d_norm_bias, atol=atol, rtol=rtol
+            )
+            torch.testing.assert_close(
+                ref_d_uvqk_weight, real_d_uvqk_weight, atol=atol, rtol=rtol
+            )
+            torch.testing.assert_close(
+                ref_d_uvqk_bias, real_d_uvqk_bias, atol=atol, rtol=rtol
+            )
diff --git a/recommendation_v4/generative_recommenders/ops/tests/jagged_tensors_test.py b/recommendation_v4/generative_recommenders/ops/tests/jagged_tensors_test.py
new file mode 100644
index 000000000..e03d68d0b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/tests/jagged_tensors_test.py
@@ -0,0 +1,963 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import unittest
+from typing import Optional
+
+import torch
+from generative_recommenders.common import (
+    generate_sparse_seq_len,
+    gpu_unavailable,
+    HammerKernel,
+    set_dev_mode,
+)
+from generative_recommenders.ops.jagged_tensors import (
+    concat_2D_jagged,
+    concat_2D_jagged_multirow,
+    split_2D_jagged,
+    split_2D_jagged_multirow,
+)
+from hypothesis import given, settings, strategies as st, Verbosity
+
+
+class JaggedTensorsTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(2, 8),
+        max_len_a=st.integers(20, 100),
+        max_len_b=st.integers(20, 100),
+        D=st.integers(10, 30),
+        is_dense_a=st.sampled_from([True, False]),
+        is_dense_b=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_split_2D_jagged_triton(self, *args, **kwargs) -> None:
+        self._test_split_2D_jagged(
+            *args,
+            **kwargs,
+            test_backward=True,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    def _test_split_2D_jagged(
+        self,
+        batch_size: int,
+        max_len_a: int,
+        max_len_b: int,
+        D: int,
+        is_dense_a: bool,
+        is_dense_b: bool,
+        test_backward: bool,
+        ref_kernel: HammerKernel,
+        real_kernel: HammerKernel,
+        dtype: torch.dtype = torch.float32,
+        skip_comparisons: bool = False,
+    ) -> None:
+        set_dev_mode(True)
+        from generative_recommenders.ops.jagged_tensors import split_2D_jagged
+
+        max_seq_len = max_len_a + max_len_b
+        if not is_dense_a:
+            lengths_a = torch.randint(
+                1, max_len_a + 1, size=(batch_size,), device=torch.device("cuda")
+            )
+            offsets_a = torch.zeros(
+                (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+            )
+            offsets_a[1:] = torch.cumsum(lengths_a, dim=0)
+            total_len_a = int(offsets_a[-1].item())
+        else:
+            offsets_a = None
+            total_len_a = batch_size * max_len_a
+            is_dense_b = False
+        if not is_dense_b:
+            lengths_b = torch.randint(
+                1, max_len_b + 1, size=(batch_size,), device=torch.device("cuda")
+            )
+            offsets_b = torch.zeros(
+                (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+            )
+            offsets_b[1:] = torch.cumsum(lengths_b, dim=0)
+            total_len_b = int(offsets_b[-1].item())
+        else:
+            offsets_b = None
+            total_len_b = batch_size * max_len_b
+        values = (
+            torch.empty(
+                (total_len_a + total_len_b, D),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+
+        ref_values_a, ref_values_b = split_2D_jagged(
+            max_seq_len=max_len_a + max_len_b,
+            values=values,
+            max_len_left=max_len_a if is_dense_a else None,
+            max_len_right=max_len_b if is_dense_b else None,
+            offsets_left=offsets_a,
+            offsets_right=offsets_b,
+            kernel=ref_kernel,
+        )
+        d_values_a = torch.randn_like(ref_values_a)
+        d_values_b = torch.randn_like(ref_values_b)
+        ref_values_a.backward(d_values_a, retain_graph=True)
+        ref_values_b.backward(d_values_b)
+        if skip_comparisons:
+            return
+
+        assert values.grad is not None
+        ref_d_values, values.grad = values.grad.clone(), None
+
+        values = values.detach().clone().requires_grad_()
+        real_values_a, real_values_b = split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=values,
+            max_len_left=max_len_a if is_dense_a else None,
+            max_len_right=max_len_b if is_dense_b else None,
+            offsets_left=offsets_a,
+            offsets_right=offsets_b,
+            kernel=real_kernel,
+        )
+        torch.testing.assert_close(ref_values_a, real_values_a)
+        torch.testing.assert_close(ref_values_b, real_values_b)
+
+        if test_backward:
+            d_values_a = d_values_a.detach().clone()
+            d_values_b = d_values_b.detach().clone()
+            real_values_a.backward(d_values_a, retain_graph=True)
+            real_values_b.backward(d_values_b)
+            real_d_values = values.grad.clone()
+            torch.testing.assert_close(ref_d_values, real_d_values)
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(2, 8),
+        max_len_a=st.integers(20, 100),
+        max_len_b=st.integers(20, 100),
+        D=st.integers(10, 30),
+        is_dense_a=st.sampled_from([True, False]),
+        is_dense_b=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_concat_2D_jagged_triton(self, *args, **kwargs) -> None:
+        self._test_concat_2D_jagged(
+            *args,
+            **kwargs,
+            test_backward=True,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.sampled_from([130]),
+        max_len_a=st.sampled_from([32768]),
+        max_len_b=st.sampled_from([10]),
+        D=st.sampled_from([512]),
+        is_dense_a=st.sampled_from([True, False]),
+        is_dense_b=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_concat_2D_jagged_large_tensor(self, *args, **kwargs) -> None:
+        self._test_concat_2D_jagged(
+            *args,
+            **kwargs,
+            test_backward=True,
+            skip_comparisons=True,
+            ref_kernel=HammerKernel.TRITON,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    def _test_concat_2D_jagged(
+        self,
+        batch_size: int,
+        max_len_a: int,
+        max_len_b: int,
+        D: int,
+        is_dense_a: bool,
+        is_dense_b: bool,
+        test_backward: bool,
+        ref_kernel: HammerKernel,
+        real_kernel: HammerKernel,
+        dtype: torch.dtype = torch.float32,
+        skip_comparisons: bool = False,
+    ) -> None:
+        set_dev_mode(True)
+        from generative_recommenders.ops.jagged_tensors import concat_2D_jagged
+
+        if not is_dense_a:
+            lengths_a = torch.randint(
+                1, max_len_a + 1, size=(batch_size,), device=torch.device("cuda")
+            )
+            offsets_a = torch.zeros(
+                (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+            )
+            offsets_a[1:] = torch.cumsum(lengths_a, dim=0)
+            total_len_a = int(offsets_a[-1].item())
+        else:
+            offsets_a = None
+            total_len_a = batch_size * max_len_a
+        values_a = (
+            torch.empty(
+                (total_len_a, D),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+
+        if not is_dense_b:
+            lengths_b = torch.randint(
+                1, max_len_b + 1, size=(batch_size,), device=torch.device("cuda")
+            )
+            offsets_b = torch.zeros(
+                (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+            )
+            offsets_b[1:] = torch.cumsum(lengths_b, dim=0)
+            total_len_b = int(offsets_b[-1].item())
+        else:
+            offsets_b = None
+            total_len_b = batch_size * max_len_b
+        values_b = (
+            torch.empty(
+                (total_len_b, D),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+
+        ref_values = concat_2D_jagged(
+            max_seq_len=max_len_a + max_len_b,
+            values_left=values_a,
+            values_right=values_b,
+            max_len_left=max_len_a,
+            max_len_right=max_len_b,
+            offsets_left=offsets_a,
+            offsets_right=offsets_b,
+            kernel=ref_kernel,
+        )
+        dout = torch.randn_like(ref_values)
+        ref_values.backward(dout)
+        if skip_comparisons:
+            return
+
+        assert values_a.grad is not None
+        ref_d_a, values_a.grad = values_a.grad.clone(), None
+        assert values_b.grad is not None
+        ref_d_b, values_b.grad = values_b.grad.clone(), None
+
+        values_a = values_a.detach().clone().requires_grad_()
+        values_b = values_b.detach().clone().requires_grad_()
+        dout = dout.detach().clone()
+        real_values = concat_2D_jagged(
+            max_seq_len=max_len_a + max_len_b,
+            values_left=values_a,
+            values_right=values_b,
+            max_len_left=max_len_a,
+            max_len_right=max_len_b,
+            offsets_left=offsets_a,
+            offsets_right=offsets_b,
+            kernel=real_kernel,
+        )
+        torch.testing.assert_close(ref_values, real_values)
+
+        if test_backward:
+            real_values.backward(dout)
+            real_d_a = values_a.grad.clone()
+            real_d_b = values_b.grad.clone()
+            torch.testing.assert_close(ref_d_a, real_d_a)
+            torch.testing.assert_close(ref_d_b, real_d_b)
+
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(2, 8),
+        max_uih_len=st.integers(20, 100),
+        max_l2_len=st.integers(10, 30),
+        contextual_seq_len=st.sampled_from([0, 10]),
+        max_targets=st.sampled_from([10, 20]),
+        D=st.integers(10, 30),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+        deadline=None,
+    )
+    def test_hstu_split_l2_embeddings(
+        self,
+        batch_size: int,
+        max_uih_len: int,
+        max_l2_len: int,
+        contextual_seq_len: int,
+        max_targets: int,
+        D: int,
+        dtype: torch.dtype,
+    ) -> None:
+        set_dev_mode(True)
+        from generative_recommenders.ops.jagged_tensors import hstu_split_l2_embeddings
+
+        max_seq_len = max_uih_len + max_targets + contextual_seq_len
+        num_targets = torch.randint(
+            1, max_targets + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+        x_lengths = torch.randint(
+            0,
+            max_uih_len + 1,
+            size=(batch_size,),
+            device=torch.device("cuda"),
+        )
+        x_lengths = num_targets + x_lengths + contextual_seq_len
+        x_offsets = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        x_offsets[1:] = torch.cumsum(x_lengths, dim=0)
+        total_len = int(x_offsets[-1].item())
+        x = (
+            torch.empty(
+                (total_len, D),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        prefix_lengths = x_lengths - max_l2_len - num_targets - contextual_seq_len
+        prefix_lengths = torch.clamp(prefix_lengths, min=0)
+        prefix_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(prefix_lengths)
+        l2_offsets = x_offsets - prefix_offsets
+        ref_prefix_x, ref_l2_x = hstu_split_l2_embeddings(
+            max_seq_len=max_seq_len,
+            x=x,
+            prefix_offsets=prefix_offsets,
+            l2_offsets=l2_offsets,
+            contextual_seq_len=contextual_seq_len,
+            kernel=HammerKernel.PYTORCH,
+        )
+        d_prefix_x = torch.randn_like(ref_prefix_x)
+        d_l2_x = torch.randn_like(ref_l2_x)
+        ref_prefix_x.backward(d_prefix_x, retain_graph=True)
+        ref_l2_x.backward(d_l2_x)
+        assert x.grad is not None
+        ref_d_x, x.grad = x.grad.clone(), None
+        x = x.detach().clone().requires_grad_()
+        real_prefix_x, real_l2_x = hstu_split_l2_embeddings(
+            max_seq_len=max_seq_len,
+            x=x,
+            prefix_offsets=prefix_offsets,
+            l2_offsets=l2_offsets,
+            contextual_seq_len=contextual_seq_len,
+            kernel=HammerKernel.TRITON,
+        )
+        print(ref_prefix_x.shape, real_prefix_x.shape)
+        torch.testing.assert_close(ref_prefix_x, real_prefix_x)
+        torch.testing.assert_close(ref_l2_x, real_l2_x)
+        d_prefix_x = d_prefix_x.detach().clone()
+        d_l2_x = d_l2_x.detach().clone()
+        real_prefix_x.backward(d_prefix_x, retain_graph=True)
+        real_l2_x.backward(d_l2_x)
+        real_d_x = x.grad.clone()
+        torch.testing.assert_close(ref_d_x, real_d_x)
+
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(1, 1),
+        max_prefix_len=st.integers(10, 10),
+        max_l2_len=st.integers(5, 5),
+        contextual_seq_len=st.sampled_from([3]),
+        max_targets=st.sampled_from([2]),
+        D=st.integers(10, 10),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+        deadline=None,
+    )
+    def test_hstu_concat_l2_embeddings(
+        self,
+        batch_size: int,
+        max_prefix_len: int,
+        max_l2_len: int,
+        contextual_seq_len: int,
+        max_targets: int,
+        D: int,
+        dtype: torch.dtype,
+    ) -> None:
+        set_dev_mode(True)
+        from generative_recommenders.ops.jagged_tensors import hstu_concat_l2_embeddings
+
+        num_targets = torch.randint(
+            1, max_targets + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+        l2_lengths = torch.randint(
+            0,
+            max_l2_len + 1,
+            size=(batch_size,),
+            device=torch.device("cuda"),
+        )
+        l2_lengths = num_targets + l2_lengths + contextual_seq_len
+        max_l2_len = max_l2_len + contextual_seq_len + max_targets
+        l2_offsets = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        l2_offsets[1:] = torch.cumsum(l2_lengths, dim=0)
+        total_l2_len = int(l2_offsets[-1].item())
+        l2_x = (
+            torch.empty(
+                (total_l2_len, D),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        prefix_lengths = torch.randint(
+            0,
+            max_prefix_len + 1,
+            size=(batch_size,),
+            device=torch.device("cuda"),
+        )
+        prefix_lengths = torch.randint(
+            0,
+            max_prefix_len + 1,
+            size=(batch_size,),
+            device=torch.device("cuda"),
+        )
+        prefix_offsets = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        prefix_offsets[1:] = torch.cumsum(prefix_lengths, dim=0)
+        total_prefix_len = int(prefix_offsets[-1].item())
+        prefix_x = (
+            torch.empty(
+                (total_prefix_len, D),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        ref_x = hstu_concat_l2_embeddings(
+            max_prefix_len=max_prefix_len,
+            prefix_x=prefix_x,
+            prefix_offsets=prefix_offsets,
+            max_l2_len=max_l2_len,
+            l2_x=l2_x,
+            l2_offsets=l2_offsets,
+            contextual_seq_len=contextual_seq_len,
+            kernel=HammerKernel.PYTORCH,
+        )
+        dout = torch.randn_like(ref_x)
+        ref_x.backward(dout)
+
+        assert prefix_x.grad is not None
+        ref_d_prefix_x, prefix_x.grad = prefix_x.grad.clone(), None
+        assert l2_x.grad is not None
+        ref_d_l2_x, l2_x.grad = l2_x.grad.clone(), None
+
+        prefix_x = prefix_x.detach().clone().requires_grad_()
+        l2_x = l2_x.detach().clone().requires_grad_()
+        real_x = hstu_concat_l2_embeddings(
+            max_prefix_len=max_prefix_len,
+            prefix_x=prefix_x,
+            prefix_offsets=prefix_offsets,
+            max_l2_len=max_l2_len,
+            l2_x=l2_x,
+            l2_offsets=l2_offsets,
+            contextual_seq_len=contextual_seq_len,
+            kernel=HammerKernel.TRITON,
+        )
+        torch.testing.assert_close(ref_x, real_x)
+        dout = dout.detach().clone()
+        real_x.backward(dout)
+        real_d_prefix_x = prefix_x.grad.clone()
+        real_d_l2_x = l2_x.grad.clone()
+        torch.testing.assert_close(ref_d_prefix_x, real_d_prefix_x)
+        torch.testing.assert_close(ref_d_l2_x, real_d_l2_x)
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(4, 8),
+        max_seq_len=st.integers(50, 500),
+        D=st.integers(20, 200),
+        K=st.integers(30, 200),
+        dtype=st.sampled_from(
+            [torch.float32, torch.bfloat16, torch.float16]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+        contiguous=st.booleans(),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_jagged_dense_bmm_broadcast_add_triton(self, *args, **kwargs) -> None:
+        self._test_jagged_dense_bmm_broadcast_add(
+            *args,
+            **kwargs,
+            test_backward=True,
+            atol=None,
+            rtol=None,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.sampled_from([130]),
+        max_seq_len=st.sampled_from([32768]),
+        D=st.sampled_from([512]),
+        K=st.sampled_from([512]),
+        dtype=st.sampled_from([torch.float32, torch.bfloat16]),
+        contiguous=st.booleans(),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=1,
+        deadline=None,
+    )
+    def test_jagged_dense_bmm_broadcast_add_triton_large_tensor(
+        self,
+        # pyre-fixme[2]: Parameter must be annotated.
+        *args,
+        **kwargs,  # pyre-ignore[2]
+    ) -> None:
+        self._test_jagged_dense_bmm_broadcast_add(
+            *args,
+            **kwargs,
+            test_backward=True,
+            atol=None,
+            rtol=None,
+            ref_kernel=HammerKernel.TRITON,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    def _test_jagged_dense_bmm_broadcast_add(
+        self,
+        batch_size: int,
+        max_seq_len: int,
+        D: int,
+        K: int,
+        dtype: torch.dtype,
+        ref_kernel: HammerKernel,
+        real_kernel: HammerKernel,
+        test_backward: bool,
+        contiguous: bool = True,
+        atol: Optional[float] = None,
+        rtol: Optional[float] = None,
+        sparsity: float = -1,
+    ) -> None:
+        set_dev_mode(True)
+        torch.backends.cudnn.allow_tf32 = False
+        torch.backends.cuda.matmul.allow_tf32 = False
+        from generative_recommenders.ops.jagged_tensors import (
+            jagged_dense_bmm_broadcast_add,
+        )
+
+        if sparsity > 0.0:
+            lengths = generate_sparse_seq_len(
+                size=batch_size,
+                max_seq_len=max_seq_len,
+                sparsity=sparsity,
+                device=torch.device("cuda"),
+            ).to(torch.int64)
+        else:
+            lengths = torch.randint(
+                max_seq_len + 1, size=(batch_size,), device=torch.device("cuda")
+            )
+        # Test the edge case with an empty row
+        seq_offsets = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        seq_offsets[1:] = torch.cumsum(lengths, dim=0)
+        jagged_size = int(seq_offsets[-1].item())
+        jagged = (
+            torch.empty((jagged_size, D), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        dense = (
+            torch.empty((batch_size, D, K), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        bias = (
+            torch.empty((batch_size, K), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+
+        if not contiguous:
+            dense = (
+                dense.transpose(1, 2)
+                .contiguous()
+                .transpose(1, 2)
+                .detach()
+                .clone()
+                .requires_grad_()
+            )
+
+        ref_out = jagged_dense_bmm_broadcast_add(
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            jagged=jagged,
+            dense=dense,
+            bias=bias,
+            kernel=ref_kernel,
+        ).to(jagged.dtype)
+        if test_backward:
+            dout = torch.randn_like(ref_out) * 0.01
+            ref_out.backward(dout)
+            # pyre-ignore
+            ref_d_jagged, jagged.grad = jagged.grad.clone(), None
+            ref_d_dense, dense.grad = dense.grad.clone(), None
+            ref_d_bias, bias.grad = bias.grad.clone(), None
+
+        jagged = jagged.detach().clone().requires_grad_()
+        dense = dense.detach().clone().requires_grad_()
+        bias = bias.detach().clone().requires_grad_()
+        real_out = jagged_dense_bmm_broadcast_add(
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            jagged=jagged,
+            dense=dense,
+            bias=bias,
+            kernel=real_kernel,
+        )
+        torch.testing.assert_close(
+            ref_out,
+            real_out,
+            atol=atol,
+            rtol=rtol,
+        )
+        if test_backward:
+            real_out.backward(dout)  # pyre-ignore
+            real_d_jagged = jagged.grad.clone()
+            real_d_dense = dense.grad.clone()
+            real_d_bias = bias.grad.clone()
+            torch.testing.assert_close(
+                ref_d_jagged,  # pyre-ignore
+                real_d_jagged,
+                atol=atol,
+                rtol=rtol,
+            )
+            torch.testing.assert_close(
+                ref_d_dense,  # pyre-ignore
+                real_d_dense,
+                atol=atol,
+                rtol=rtol,
+            )
+            torch.testing.assert_close(
+                ref_d_bias,  # pyre-ignore
+                real_d_bias,
+                atol=atol,
+                rtol=rtol,
+            )
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(2, 8),
+        max_len_a=st.integers(20, 100),
+        max_len_b=st.integers(20, 100),
+        D=st.integers(10, 30),
+        is_dense_a=st.sampled_from([True, False]),
+        is_dense_b=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_concat_2D_jagged_multirow_triton(self, *args, **kwargs) -> None:
+        self._test_concat_2D_jagged_multirow(
+            *args,
+            **kwargs,
+            test_backward=True,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    def _test_concat_2D_jagged_multirow(
+        self,
+        batch_size: int,
+        max_len_a: int,
+        max_len_b: int,
+        D: int,
+        is_dense_a: bool,
+        is_dense_b: bool,
+        test_backward: bool,
+        ref_kernel: HammerKernel,
+        real_kernel: HammerKernel,
+        dtype: torch.dtype = torch.float32,
+    ) -> None:
+        set_dev_mode(True)
+
+        if not is_dense_a:
+            lengths_a = torch.randint(
+                1, max_len_a + 1, size=(batch_size,), device=torch.device("cuda")
+            )
+            offsets_a = torch.zeros(
+                (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+            )
+            offsets_a[1:] = torch.cumsum(lengths_a, dim=0)
+            total_len_a = int(offsets_a[-1].item())
+        else:
+            offsets_a = None
+            total_len_a = batch_size * max_len_a
+        values_a = (
+            torch.empty(
+                (total_len_a, D),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+
+        if not is_dense_b:
+            lengths_b = torch.randint(
+                1, max_len_b + 1, size=(batch_size,), device=torch.device("cuda")
+            )
+            offsets_b = torch.zeros(
+                (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+            )
+            offsets_b[1:] = torch.cumsum(lengths_b, dim=0)
+            total_len_b = int(offsets_b[-1].item())
+        else:
+            offsets_b = None
+            total_len_b = batch_size * max_len_b
+        values_b = (
+            torch.empty(
+                (total_len_b, D),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+
+        ref_values = concat_2D_jagged(
+            max_seq_len=max_len_a + max_len_b,
+            values_left=values_a,
+            values_right=values_b,
+            max_len_left=max_len_a,
+            max_len_right=max_len_b,
+            offsets_left=offsets_a,
+            offsets_right=offsets_b,
+            kernel=ref_kernel,
+        )
+        dout = torch.randn_like(ref_values) * 0.1
+        ref_values.backward(dout)
+        assert values_a.grad is not None
+        ref_d_a, values_a.grad = values_a.grad.clone(), None
+        assert values_b.grad is not None
+        ref_d_b, values_b.grad = values_b.grad.clone(), None
+
+        values_a = values_a.detach().clone().requires_grad_()
+        values_b = values_b.detach().clone().requires_grad_()
+        dout = dout.detach().clone()
+
+        real_values = concat_2D_jagged_multirow(
+            max_seq_len=max_len_a + max_len_b,
+            values_left=values_a,
+            values_right=values_b,
+            offsets_left=offsets_a,
+            offsets_right=offsets_b,
+            max_len_left=max_len_a,
+            max_len_right=max_len_b,
+            kernel=real_kernel,
+        )
+        torch.testing.assert_close(ref_values, real_values)
+        if test_backward:
+            real_values.backward(dout)
+            real_d_a = values_a.grad.clone()
+            real_d_b = values_b.grad.clone()
+            torch.testing.assert_close(ref_d_a, real_d_a)
+            torch.testing.assert_close(ref_d_b, real_d_b)
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(2, 8),
+        max_len_a=st.integers(20, 100),
+        max_len_b=st.integers(20, 100),
+        D=st.integers(10, 30),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_split_2D_jagged_multirow_triton(self, *args, **kwargs) -> None:
+        self._test_split_2D_jagged_multirow(
+            *args,
+            **kwargs,
+            test_backward=True,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    def _test_split_2D_jagged_multirow(
+        self,
+        batch_size: int,
+        max_len_a: int,
+        max_len_b: int,
+        D: int,
+        test_backward: bool,
+        ref_kernel: HammerKernel,
+        real_kernel: HammerKernel,
+        dtype: torch.dtype = torch.float32,
+    ) -> None:
+        set_dev_mode(True)
+
+        lengths_a = torch.randint(
+            1, max_len_a + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+        offsets_a = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        offsets_a[1:] = torch.cumsum(lengths_a, dim=0)
+
+        lengths_b = torch.randint(
+            1, max_len_b + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+        offsets_b = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        offsets_b[1:] = torch.cumsum(lengths_b, dim=0)
+
+        total_len = int(offsets_a[-1].item()) + int(offsets_b[-1].item())
+        values = (
+            torch.empty(
+                (total_len, D),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+
+        ref_values_a, ref_values_b = split_2D_jagged(
+            max_seq_len=max_len_a + max_len_b,
+            values=values,
+            total_len_left=int(offsets_a[-1].item()),
+            total_len_right=int(offsets_b[-1].item()),
+            max_len_left=max_len_a,
+            max_len_right=max_len_b,
+            offsets_left=offsets_a,
+            offsets_right=offsets_b,
+            kernel=ref_kernel,
+        )
+        d_values_a = torch.randn_like(ref_values_a) * 0.1
+        d_values_b = torch.randn_like(ref_values_b) * 0.1
+        ref_values_a.backward(d_values_a, retain_graph=True)
+        ref_values_b.backward(d_values_b)
+        assert values.grad is not None
+        ref_d_values, values.grad = values.grad.clone(), None
+
+        values = values.detach().clone().requires_grad_()
+        d_values_a = d_values_a.detach().clone()
+        d_values_b = d_values_b.detach().clone()
+
+        max_len_a_actual = int((offsets_a[1:] - offsets_a[:-1]).max().item())
+        max_len_b_actual = int((offsets_b[1:] - offsets_b[:-1]).max().item())
+
+        real_values_a, real_values_b = split_2D_jagged_multirow(
+            max_seq_len=max_len_a + max_len_b,
+            values=values,
+            total_len_left=int(offsets_a[-1].item()),
+            total_len_right=int(offsets_b[-1].item()),
+            max_len_left=max_len_a_actual,
+            max_len_right=max_len_b_actual,
+            offsets_left=offsets_a,
+            offsets_right=offsets_b,
+            kernel=real_kernel,
+        )
+        torch.testing.assert_close(ref_values_a, real_values_a)
+        torch.testing.assert_close(ref_values_b, real_values_b)
+        if test_backward:
+            real_values_a.backward(d_values_a, retain_graph=True)
+            real_values_b.backward(d_values_b)
+            real_d_values = values.grad.clone()
+            torch.testing.assert_close(ref_d_values, real_d_values)
diff --git a/recommendation_v4/generative_recommenders/ops/tests/layer_norm_test.py b/recommendation_v4/generative_recommenders/ops/tests/layer_norm_test.py
new file mode 100644
index 000000000..62540967a
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/tests/layer_norm_test.py
@@ -0,0 +1,231 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import copy
+import unittest
+
+import torch
+from generative_recommenders.common import gpu_unavailable, HammerKernel, set_dev_mode
+from generative_recommenders.ops.layer_norm import (
+    layer_norm,
+    LayerNorm,
+    swish_layer_norm,
+    SwishLayerNorm,
+)
+from hypothesis import given, settings, strategies as st, Verbosity
+
+
+class LayerNormTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore[56]
+    @given(
+        N=st.sampled_from([4200000]),
+        D=st.sampled_from([512]),
+        is_swish=st.sampled_from([False]),
+        dtype=st.sampled_from(
+            [torch.bfloat16]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        deadline=None,
+        verbosity=Verbosity.verbose,
+        max_examples=1,
+    )
+    # pyre-ignore[2]
+    def test_large_tensors(self, *args, **kwargs) -> None:
+        self._test_layernorm(
+            *args,
+            **kwargs,
+            ref_kernel=HammerKernel.TRITON,
+            real_kernel=HammerKernel.TRITON,
+            skip_comparisons=True,
+        )
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore[56]
+    @given(
+        N=st.integers(min_value=0, max_value=10000),
+        D=st.integers(min_value=32, max_value=512),
+        is_swish=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        deadline=None,
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+    )
+    # pyre-ignore[2]
+    def test_ln(self, *args, **kwargs) -> None:
+        self._test_layernorm(
+            *args,
+            **kwargs,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    def _test_layernorm(
+        self,
+        N: int,
+        D: int,
+        is_swish: bool,
+        dtype: torch.dtype,
+        ref_kernel: HammerKernel,
+        real_kernel: HammerKernel,
+        skip_comparisons: bool = False,
+    ) -> None:
+        N = N // 4 * 4
+        # enable auto-tuning to verify correctness of multi-row kernel
+        set_dev_mode(False)
+        x = (
+            torch.empty((N, D), dtype=dtype, device=torch.device("cuda"))
+            .normal_(0.0, 1.0)
+            .requires_grad_()
+        )
+        weight = (
+            torch.empty((D,), device=torch.device("cuda"))
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        bias = (
+            torch.empty((D,), device=torch.device("cuda"))
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        if is_swish:
+            layer_norm_func = swish_layer_norm
+        else:
+            layer_norm_func = layer_norm
+        # ref
+        ref_out = layer_norm_func(x, weight, bias, eps=1e-6, kernel=ref_kernel)
+        dout = torch.randn_like(ref_out) * 0.05
+        ref_out.backward(dout)
+        if skip_comparisons:
+            return
+        # pyre-ignore[16]
+        ref_dx, x.grad = x.grad.detach().clone(), None
+        ref_dw, weight.grad = weight.grad.detach().clone(), None
+        ref_db, bias.grad = bias.grad.detach().clone(), None
+        # opt
+        x = x.detach().clone().requires_grad_()
+        weight = weight.detach().clone().requires_grad_()
+        bias = bias.detach().clone().requires_grad_()
+        opt_out = layer_norm_func(x, weight, bias, eps=1e-6, kernel=real_kernel)
+        dout = dout.detach().clone()
+        opt_out.backward(dout)
+        opt_dx, x.grad = x.grad.detach().clone(), None
+        opt_dw, weight.grad = weight.grad.detach().clone(), None
+        opt_db, bias.grad = bias.grad.detach().clone(), None
+        torch.testing.assert_close(ref_out, opt_out)
+        torch.testing.assert_close(ref_dx, opt_dx)
+        torch.testing.assert_close(ref_dw, opt_dw)
+        torch.testing.assert_close(ref_db, opt_db)
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore[56]
+    @given(
+        N=st.integers(min_value=32, max_value=10000),
+        D=st.integers(min_value=32, max_value=512),
+        is_swish=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        deadline=None,
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+    )
+    # pyre-ignore[2]
+    def test_modules(self, *args, **kwargs) -> None:
+        self._test_layer_norm_module(
+            *args,
+            **kwargs,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    def _test_layer_norm_module(
+        self,
+        N: int,
+        D: int,
+        is_swish: bool,
+        dtype: torch.dtype,
+        ref_kernel: HammerKernel,
+        real_kernel: HammerKernel,
+        skip_comparisons: bool = False,
+    ) -> None:
+        set_dev_mode(True)
+        x = (
+            torch.empty((N, D), dtype=dtype, device=torch.device("cuda"))
+            .normal_(0.0, 1.0)
+            .requires_grad_()
+        )
+        # ref
+        if is_swish:
+            ref_layer = SwishLayerNorm(
+                dim=D,
+                eps=1e-6,
+            ).to(device="cuda")
+            ref_layer._hammer_kernel = ref_kernel
+        else:
+            ref_layer = LayerNorm(
+                dim=D,
+                eps=1e-6,
+            ).to(device="cuda")
+            ref_layer._hammer_kernel = ref_kernel
+        opt_layer = copy.deepcopy(ref_layer)
+        opt_layer._hammer_kernel = real_kernel
+
+        ref_out = ref_layer(x)
+        dout = torch.randn_like(ref_out) * 0.05
+        ref_out.backward(dout)
+        if skip_comparisons:
+            return
+        # pyre-ignore[16]
+        ref_dx, x.grad = x.grad.detach().clone(), None
+        ref_dw = ref_layer.weight.grad.detach().clone()
+        ref_db = ref_layer.bias.grad.detach().clone()
+        # opt
+        x = x.detach().clone().requires_grad_()
+        opt_out = opt_layer(x)
+        dout = dout.detach().clone()
+        opt_out.backward(dout)
+        opt_dx, x.grad = x.grad.detach().clone(), None
+        opt_dw = opt_layer.weight.grad.detach().clone()
+        opt_db = opt_layer.bias.grad.detach().clone()
+        torch.testing.assert_close(ref_out, opt_out)
+        torch.testing.assert_close(
+            ref_dx,
+            opt_dx,
+        )
+        torch.testing.assert_close(
+            ref_dw,
+            opt_dw,
+        )
+        torch.testing.assert_close(
+            ref_db,
+            opt_db,
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/tests/mm_test.py b/recommendation_v4/generative_recommenders/ops/tests/mm_test.py
new file mode 100644
index 000000000..0695275e3
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/tests/mm_test.py
@@ -0,0 +1,156 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import unittest
+from typing import Optional
+
+import torch
+from generative_recommenders.common import gpu_unavailable, HammerKernel
+from generative_recommenders.ops.mm import addmm
+from hypothesis import given, settings, strategies as st, Verbosity
+
+
+class MMlTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore[56]
+    @given(
+        M=st.integers(min_value=100, max_value=300),
+        N=st.integers(min_value=100, max_value=300),
+        K=st.sampled_from([128, 256]),
+        broadcast=st.booleans(),
+        dtype=st.sampled_from(
+            [torch.float32, torch.bfloat16, torch.float16]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+        deadline=None,
+    )
+    def test_addmm(
+        self,
+        M: int,
+        N: int,
+        K: int,
+        broadcast: bool,
+        dtype: torch.dtype,
+    ) -> None:
+        self._test_addmm(
+            M=M,
+            N=N,
+            K=K,
+            broadcast=broadcast,
+            dtype=dtype,
+            kernel_type=HammerKernel.TRITON,
+        )
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore[56]
+    @given(
+        M=st.integers(min_value=100, max_value=300),
+        N=st.sampled_from([16, 48, 128, 144, 256]),
+        K=st.sampled_from([16, 48, 128, 144, 256]),
+        broadcast=st.booleans(),
+        dtype=st.sampled_from(
+            [torch.float32, torch.bfloat16, torch.float16]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+        deadline=None,
+    )
+    def test_addmm_tma(
+        self,
+        M: int,
+        N: int,
+        K: int,
+        broadcast: bool,
+        dtype: torch.dtype,
+    ) -> None:
+        self._test_addmm(
+            M=M,
+            N=N,
+            K=K,
+            broadcast=broadcast,
+            dtype=dtype,
+            kernel_type=HammerKernel.TRITON,
+        )
+
+    def _test_addmm(
+        self,
+        M: int,
+        N: int,
+        K: int,
+        broadcast: bool,
+        dtype: torch.dtype,
+        kernel_type: HammerKernel,
+        atol: Optional[float] = None,
+        rtol: Optional[float] = None,
+    ) -> None:
+        # to enable more deterministic results.
+        torch.manual_seed(0)
+
+        torch.backends.cudnn.allow_tf32 = False
+        torch.backends.cuda.matmul.allow_tf32 = False
+
+        x: torch.Tensor = torch.rand((M, K), dtype=dtype, device="cuda").requires_grad_(
+            True
+        )
+        w: torch.Tensor = torch.rand((K, N), dtype=dtype, device="cuda").requires_grad_(
+            True
+        )
+
+        if broadcast:
+            y: torch.Tensor = torch.rand(
+                (N), dtype=dtype, device="cuda"
+            ).requires_grad_(True)
+        else:
+            y: torch.Tensor = torch.rand(
+                (M, N), dtype=dtype, device="cuda"
+            ).requires_grad_(True)
+
+        ref_z = addmm(y, x, w, kernel=HammerKernel.PYTORCH)
+        dz = torch.randn_like(ref_z) * 0.1
+        ref_z.backward(dz)
+        # pyre-ignore[16]
+        ref_dx, x.grad = x.grad.detach().clone(), None
+        ref_dw, w.grad = w.grad.detach().clone(), None
+        ref_dy, y.grad = y.grad.detach().clone(), None
+
+        x = x.detach().clone().requires_grad_(True)
+        w = w.detach().clone().requires_grad_(True)
+        y = y.detach().clone().requires_grad_(True)
+        real_z = addmm(y, x, w, kernel=kernel_type)
+
+        torch.testing.assert_close(ref_z, real_z, atol=atol, rtol=rtol)
+
+        # triton cc doesn't support backward
+        if kernel_type != HammerKernel.TRITON_CC:
+            real_z.backward(dz)
+            real_dx, x.grad = x.grad.detach().clone(), None
+            real_dw, w.grad = w.grad.detach().clone(), None
+            real_dy, y.grad = y.grad.detach().clone(), None
+
+            torch.testing.assert_close(ref_dx, real_dx, atol=atol, rtol=rtol)
+            torch.testing.assert_close(ref_dw, real_dw, atol=atol, rtol=rtol)
+            torch.testing.assert_close(ref_dy, real_dy, atol=atol, rtol=rtol)
diff --git a/recommendation_v4/generative_recommenders/ops/tests/position_test.py b/recommendation_v4/generative_recommenders/ops/tests/position_test.py
new file mode 100644
index 000000000..ab9e1b415
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/tests/position_test.py
@@ -0,0 +1,234 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import unittest
+
+import torch
+from generative_recommenders.common import (
+    generate_sparse_seq_len,
+    gpu_unavailable,
+    HammerKernel,
+    set_dev_mode,
+)
+from hypothesis import given, settings, strategies as st, Verbosity
+
+
+class PositionEmbeddingsTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        alpha=st.sampled_from([0.5]),
+        max_uih_len=st.integers(50, 500),
+        max_contextual_seq_len=st.sampled_from([10]),
+        interleave_targets=st.sampled_from([True, False]),
+        batch_size=st.integers(16, 32),
+        D=st.integers(20, 200),
+        max_targets=st.sampled_from([10, 20]),
+        time_bucket_fn=st.sampled_from(["log"]),
+        dtype=st.sampled_from([torch.float32, torch.bfloat16, torch.float16]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_add_timestamp_positional_embeddings_triton(self, *args, **kwargs) -> None:
+        self._test_add_timestamp_positional_embeddings(
+            *args,
+            **kwargs,
+            test_backward=True,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        alpha=st.sampled_from([0.5]),
+        max_uih_len=st.sampled_from([32768]),
+        max_contextual_seq_len=st.sampled_from([10]),
+        interleave_targets=st.sampled_from([False]),
+        batch_size=st.sampled_from([130]),
+        D=st.sampled_from([512]),
+        max_targets=st.sampled_from([10]),
+        time_bucket_fn=st.sampled_from(["log"]),
+        dtype=st.sampled_from([torch.bfloat16]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=1,
+        deadline=None,
+    )
+    def test_add_timestamp_positional_embeddings_triton_large_tensor(
+        self,
+        # pyre-fixme[2]: Parameter must be annotated.
+        *args,
+        # pyre-ignore[2]
+        **kwargs,
+    ) -> None:
+        self._test_add_timestamp_positional_embeddings(
+            *args,
+            **kwargs,
+            test_backward=False,
+            ref_kernel=HammerKernel.TRITON,
+            real_kernel=HammerKernel.TRITON,
+            sparsity=1.0,
+        )
+
+    def _test_add_timestamp_positional_embeddings(
+        self,
+        alpha: float,
+        max_uih_len: int,
+        max_contextual_seq_len: int,
+        interleave_targets: bool,
+        batch_size: int,
+        D: int,
+        max_targets: int,
+        time_bucket_fn: str,
+        dtype: torch.dtype,
+        ref_kernel: HammerKernel,
+        real_kernel: HammerKernel,
+        test_backward: bool,
+        sparsity: float = -1,
+    ) -> None:
+        set_dev_mode(True)
+        from generative_recommenders.ops.position import (
+            add_timestamp_positional_embeddings,
+        )
+
+        num_targets = torch.randint(
+            max_targets + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+        if sparsity > 0.0:
+            lengths = generate_sparse_seq_len(
+                size=batch_size,
+                max_seq_len=max_uih_len,
+                sparsity=sparsity,
+                device=torch.device("cuda"),
+            ).to(torch.int64)
+        else:
+            lengths = torch.randint(
+                max_uih_len + 1, size=(batch_size,), device=torch.device("cuda")
+            )
+        seq_offsets = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        seq_offsets[1:] = torch.cumsum(lengths, dim=0)
+        max_seq_len = max_uih_len + max_targets
+
+        position_embeddings_weight = (
+            torch.empty(
+                (max_seq_len, D), dtype=torch.float32, device=torch.device("cuda")
+            )
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        num_time_buckets = 1000
+        timestamp_embeddings_weight = (
+            torch.empty(
+                (num_time_buckets, D), dtype=torch.float32, device=torch.device("cuda")
+            )
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        seq_embeddings = (
+            torch.empty(
+                (int(seq_offsets[-1].item()), D),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        timestamp_deltas: torch.Tensor = torch.randint(
+            86400,
+            size=(batch_size, max_seq_len),
+            device="cuda",
+        )
+        timestamps = timestamp_deltas.cumsum(dim=1)
+        mask = torch.arange(max_seq_len, device=timestamps.device) < lengths.unsqueeze(
+            1
+        )
+        timestamps = timestamps[mask.view(batch_size, -1)]
+
+        ref_out = add_timestamp_positional_embeddings(
+            alpha=alpha,
+            max_seq_len=max_seq_len,
+            max_contextual_seq_len=max_contextual_seq_len,
+            position_embeddings_weight=position_embeddings_weight,
+            timestamp_embeddings_weight=timestamp_embeddings_weight,
+            seq_offsets=seq_offsets,
+            seq_lengths=lengths,
+            seq_embeddings=seq_embeddings,
+            timestamps=timestamps,
+            num_targets=num_targets,
+            interleave_targets=interleave_targets,
+            time_bucket_fn=time_bucket_fn,
+            kernel=ref_kernel,
+        )
+        dout = torch.randn_like(ref_out) * 0.01
+        ref_out.backward(dout)
+        # pyre-ignore
+        ref_d_seq_embeddings, seq_embeddings.grad = seq_embeddings.grad.clone(), None
+        ref_d_position_embeddings_weight, position_embeddings_weight.grad = (
+            position_embeddings_weight.grad.clone(),
+            None,
+        )
+        ref_d_timestamp_embeddings_weight, timestamp_embeddings_weight.grad = (
+            timestamp_embeddings_weight.grad.clone(),
+            None,
+        )
+
+        real_out = add_timestamp_positional_embeddings(
+            alpha=alpha,
+            max_seq_len=max_seq_len,
+            max_contextual_seq_len=max_contextual_seq_len,
+            position_embeddings_weight=position_embeddings_weight,
+            timestamp_embeddings_weight=timestamp_embeddings_weight,
+            seq_offsets=seq_offsets,
+            seq_lengths=lengths,
+            seq_embeddings=seq_embeddings,
+            timestamps=timestamps,
+            num_targets=num_targets,
+            interleave_targets=interleave_targets,
+            time_bucket_fn=time_bucket_fn,
+            kernel=real_kernel,
+        )
+
+        torch.testing.assert_close(ref_out, real_out)
+        if test_backward:
+            real_out.backward(dout)
+            real_d_seq_embeddings = seq_embeddings.grad.clone()
+            real_d_position_embeddings_weight = position_embeddings_weight.grad.clone()
+            real_d_timestamp_embeddings_weight = (
+                timestamp_embeddings_weight.grad.clone()
+            )
+            torch.testing.assert_close(ref_d_seq_embeddings, real_d_seq_embeddings)
+            torch.testing.assert_close(
+                ref_d_position_embeddings_weight,
+                real_d_position_embeddings_weight,
+                atol=5e-2 if dtype != torch.float32 else None,
+                rtol=2e-2 if dtype != torch.float32 else None,
+            )
+            torch.testing.assert_close(
+                ref_d_timestamp_embeddings_weight,
+                real_d_timestamp_embeddings_weight,
+                atol=5e-2 if dtype != torch.float32 else None,
+                rtol=2e-2 if dtype != torch.float32 else None,
+            )
diff --git a/recommendation_v4/generative_recommenders/ops/tests/rms_norm_test.py b/recommendation_v4/generative_recommenders/ops/tests/rms_norm_test.py
new file mode 100644
index 000000000..4e5c1a871
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/tests/rms_norm_test.py
@@ -0,0 +1,229 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import copy
+import unittest
+
+import torch
+from generative_recommenders.common import gpu_unavailable, HammerKernel, set_dev_mode
+from generative_recommenders.ops.layer_norm import rms_norm, RMSNorm
+from hammer.ops.triton.cc.utils import set_triton_cc_version
+from hypothesis import given, settings, strategies as st, Verbosity
+
+
+class LayerNormTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore[56]
+    @given(
+        N=st.sampled_from([2000000]),
+        D=st.sampled_from([512]),
+        dtype=st.sampled_from(
+            [torch.bfloat16]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+        silu=st.booleans(),
+    )
+    @settings(
+        deadline=None,
+        verbosity=Verbosity.verbose,
+        max_examples=1,
+    )
+    # pyre-ignore[2]
+    def test_large_tensors(self, *args, **kwargs) -> None:
+        self._test_rms_norm(
+            *args,
+            **kwargs,
+            ref_kernel=HammerKernel.TRITON,
+            real_kernel=HammerKernel.TRITON,
+            skip_comparisons=True,
+        )
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore[56]
+    @given(
+        N=st.integers(min_value=0, max_value=10000),
+        D=st.integers(min_value=32, max_value=512),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+        silu=st.booleans(),
+    )
+    @settings(
+        deadline=None,
+        verbosity=Verbosity.verbose,
+        max_examples=50,
+    )
+    # pyre-ignore[2]
+    def test_rms_norm(self, *args, **kwargs) -> None:
+        self._test_rms_norm(
+            *args,
+            **kwargs,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore[56]
+    @given(
+        N=st.integers(min_value=4, max_value=10000),
+        D=st.sampled_from([256, 512]),
+        dtype=st.sampled_from([torch.bfloat16, torch.float16]),
+        triton_cc_version=st.sampled_from(["", "repkg"]),
+        silu=st.just(False),
+    )
+    @settings(
+        deadline=None,
+        verbosity=Verbosity.verbose,
+        max_examples=10,
+    )
+    # pyre-ignore[2]
+    def test_rms_norm_triton_cc(self, triton_cc_version: str, *args, **kwargs) -> None:
+        set_triton_cc_version(triton_cc_version)
+        self._test_rms_norm(
+            *args,
+            **kwargs,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON_CC,
+            test_backward=False,
+        )
+
+    def _test_rms_norm(
+        self,
+        N: int,
+        D: int,
+        dtype: torch.dtype,
+        silu: bool,
+        ref_kernel: HammerKernel,
+        real_kernel: HammerKernel,
+        skip_comparisons: bool = False,
+        test_backward: bool = True,
+    ) -> None:
+        N = N // 4 * 4
+        # enable auto-tuning to verify correctness of multi-row kernel
+        set_dev_mode(False)
+        x = (
+            torch.empty((N, D), dtype=dtype, device=torch.device("cuda"))
+            .normal_(0.0, 1.0)
+            .requires_grad_()
+        )
+        weight = (
+            torch.empty((D,), device=torch.device("cuda"))
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        # ref
+        ref_out = rms_norm(x, weight, eps=1e-6, silu=silu, kernel=ref_kernel)
+        opt_x = x.detach().clone().requires_grad_()
+        opt_weight = weight.detach().clone().requires_grad_()
+        opt_out = rms_norm(opt_x, opt_weight, eps=1e-6, silu=silu, kernel=real_kernel)
+        torch.testing.assert_close(ref_out, opt_out)
+
+        if not test_backward:
+            return
+
+        dout = torch.randn_like(ref_out) * 0.05
+        ref_out.backward(dout)
+        if skip_comparisons:
+            return
+        # pyre-ignore[16]
+        ref_dx, x.grad = x.grad.detach().clone(), None
+        ref_dw, weight.grad = weight.grad.detach().clone(), None
+        # opt
+        dout = dout.detach().clone()
+        opt_out.backward(dout)
+        opt_dx, x.grad = opt_x.grad.detach().clone(), None
+        opt_dw, weight.grad = opt_weight.grad.detach().clone(), None
+        torch.testing.assert_close(ref_dx, opt_dx)
+        torch.testing.assert_close(ref_dw, opt_dw)
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore[56]
+    @given(
+        N=st.integers(min_value=32, max_value=10000),
+        D=st.integers(min_value=32, max_value=512),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        deadline=None,
+        verbosity=Verbosity.verbose,
+        max_examples=50,
+    )
+    # pyre-ignore[2]
+    def test_modules(self, *args, **kwargs) -> None:
+        self._test_rms_norm_module(
+            *args,
+            **kwargs,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    def _test_rms_norm_module(
+        self,
+        N: int,
+        D: int,
+        dtype: torch.dtype,
+        ref_kernel: HammerKernel,
+        real_kernel: HammerKernel,
+        skip_comparisons: bool = False,
+    ) -> None:
+        set_dev_mode(True)
+        x = (
+            torch.empty((N, D), dtype=dtype, device=torch.device("cuda"))
+            .normal_(0.0, 1.0)
+            .requires_grad_()
+        )
+        # ref
+        ref_layer = RMSNorm(
+            dim=D,
+            eps=1e-6,
+        ).to(device="cuda")
+        ref_layer._hammer_kernel = ref_kernel
+        opt_layer = copy.deepcopy(ref_layer)
+        opt_layer._hammer_kernel = real_kernel
+
+        ref_out = ref_layer(x)
+        dout = torch.randn_like(ref_out) * 0.05
+        ref_out.backward(dout)
+        if skip_comparisons:
+            return
+        # pyre-ignore[16]
+        ref_dx, x.grad = x.grad.detach().clone(), None
+        ref_dw = ref_layer.weight.grad.detach().clone()
+        # opt
+        x = x.detach().clone().requires_grad_()
+        opt_out = opt_layer(x)
+        dout = dout.detach().clone()
+        opt_out.backward(dout)
+        opt_dx, x.grad = x.grad.detach().clone(), None
+        opt_dw = opt_layer.weight.grad.detach().clone()
+        torch.testing.assert_close(ref_out.to(dtype), opt_out)
+        torch.testing.assert_close(
+            ref_dx,
+            opt_dx,
+        )
+        torch.testing.assert_close(
+            ref_dw,
+            opt_dw,
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/triton/_autotune_pinning.py b/recommendation_v4/generative_recommenders/ops/triton/_autotune_pinning.py
new file mode 100644
index 000000000..5aa24eb85
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton/_autotune_pinning.py
@@ -0,0 +1,27 @@
+"""Triton autotune pinning helper.
+
+A handful of Triton kernels in this directory have two stable autotune
+equilibria on MI350X gfx950 at our yambda bs=32 L=2039 shape: a fast one
+(~52 ms/step) and a slow one (~71 ms/step). The autotuner's measurement
+noise puts the choice on a coin flip per cold start. We pin the winning
+config for these kernels so every cold start lands at the fast equilibrium
+deterministically.
+
+Set `TRITON_FULL_AUTOTUNE=1` to bypass the pin and re-enable the full
+autotune search (useful when validating a new shape, GPU, or Triton version
+before re-capturing winners).
+"""
+
+import os
+from typing import Callable, List
+
+import triton
+
+
+def pinned_or_full(
+    pinned: List[triton.Config],
+    full_configs_fn: Callable[[], List[triton.Config]],
+) -> List[triton.Config]:
+    if os.environ.get("TRITON_FULL_AUTOTUNE", "0") == "1":
+        return full_configs_fn()
+    return pinned
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_addmm.py b/recommendation_v4/generative_recommenders/ops/triton/triton_addmm.py
new file mode 100644
index 000000000..487aae189
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_addmm.py
@@ -0,0 +1,1708 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+#!/usr/bin/env python3
+
+
+import math
+from typing import List, Optional, Tuple
+
+import torch
+
+# @manual=//triton:triton
+import triton
+
+# @manual=//triton:triton
+import triton.language as tl
+from generative_recommenders.ops.utils import is_sm100_plus, maybe_register_custom_op
+
+try:
+    # @manual=//triton:triton
+    from triton.language.extra.subtile_ops import _split_n_2D
+except ImportError:
+    _split_n_2D = None
+
+try:
+    # @manual=//triton:triton
+    import triton.language.extra.tlx as tlx  # type: ignore
+
+    HAS_TLX = True
+except ImportError:
+    tlx = None
+    HAS_TLX = False
+
+from generative_recommenders.common import triton_autotune, triton_cc
+
+try:
+    # @manual=//triton:triton
+    from triton.tools.tensor_descriptor import TensorDescriptor
+
+    TMA_AVAILABLE = True
+except ImportError:
+    TMA_AVAILABLE = False
+    pass
+
+
+ENABLE_FULL_TURNING_SPACE = False
+
+
+def _use_meta_ws() -> bool:
+    """Check if Meta's warp specialization is available, enabled, and on SM100+."""
+    return (
+        is_sm100_plus()
+        and hasattr(triton, "knobs")
+        and hasattr(triton.knobs, "nvidia")
+        # `use_meta_ws` is absent in some Triton builds (e.g. nvcr.io/nvidia/pytorch:26.01-py3);
+        # use getattr so import doesn't crash on AttributeError before any step runs.
+        and getattr(triton.knobs.nvidia, "use_meta_ws", False)
+    )
+
+
+def _check_tma_alignment(
+    x: torch.Tensor, w: torch.Tensor, y: torch.Tensor, min_alignment: int = 16
+) -> bool:
+    """Check if tensors meet TMA alignment requirements.
+
+    TMA (Tensor Memory Accelerator) on H100 requires:
+    1. Base addresses to be 64-byte aligned
+    2. Dimensions to be multiples of 64 for optimal performance
+    3. Contiguous inner dimensions (stride=1)
+
+    Args:
+        x: Input tensor [M, K]
+        w: Weight tensor [K, N]
+        y: Bias tensor [N] or [M, N]
+        min_alignment: Minimum alignment requirement (default: 64)
+
+    Returns:
+        True if all tensors meet TMA alignment requirements
+    """
+    _, K = x.shape
+    KB, N = w.shape
+    assert K == KB, f"incompatible dimensions {K}, {KB}"
+
+    is_y_1d = y.dim() == 1
+    NY = y.shape[0] if is_y_1d else y.shape[1]
+    assert N == NY, f"incompatible dimensions {N}, {NY}"
+
+    return (K % min_alignment == 0) and (N % min_alignment == 0)
+
+
+def _prune_persistent_autows_configs(configs, named_args, **kwargs):  # noqa
+    if not _use_meta_ws():
+        return configs
+    BROADCAST_Y = kwargs.get("BROADCAST_Y", False)
+    pruned = []
+    for c in configs:
+        BLOCK_M = c.kwargs.get("BLOCK_M", 0)
+        BLOCK_N = c.kwargs.get("BLOCK_N", 0)
+        EPILOGUE_SUBTILE = c.kwargs.get("EPILOGUE_SUBTILE", 1)
+        DP = c.kwargs.get("DATA_PARTITION_FACTOR", 1)
+        # DATA_PARTITION_FACTOR=2 is only supported with BLOCK_M=256
+        if DP == 2 and BLOCK_M != 256:
+            continue
+        if (BLOCK_N // EPILOGUE_SUBTILE) < 32:
+            continue
+        if BROADCAST_Y and (BLOCK_N // EPILOGUE_SUBTILE) < 64:
+            continue
+        pruned.append(c)
+    return pruned
+
+
+def _prune_configs_for_tlx_persistent_addmm(configs, named_args, **kwargs):  # noqa
+    M = named_args.get("M", 0)
+    N = named_args.get("N", 0)
+    BROADCAST_Y = kwargs.get("BROADCAST_Y", False)
+
+    pruned = []
+    for c in configs:
+        BLOCK_M = c.kwargs.get("BLOCK_M", 0)
+        BLOCK_N = c.kwargs.get("BLOCK_N", 0)
+        EPILOGUE_SUBTILE = c.kwargs.get("EPILOGUE_SUBTILE", 1)
+        NUM_MMA_GROUPS = c.kwargs.get("NUM_MMA_GROUPS", 1)
+        BLOCK_M_SPLIT = BLOCK_M // NUM_MMA_GROUPS
+        NUM_SMEM_BUFFERS = c.kwargs.get("NUM_SMEM_BUFFERS", 1)
+
+        # Hardware constraint: Always make MMA tile 128.
+        if BLOCK_M_SPLIT != 128:
+            continue
+
+        # BLOCK_N >= 64 required for PAIR_CTA
+        if BLOCK_N < 64:
+            continue
+
+        # Subslice loads cannot be smaller than 32
+        if (BLOCK_N // EPILOGUE_SUBTILE) < 32:
+            continue
+
+        # TMA loads must be at least 128 bytes. With BROADCAST_Y
+        # this may not be met.
+        if BROADCAST_Y and (BLOCK_N // EPILOGUE_SUBTILE) < 64:
+            continue
+
+        # Prune the support SMEM_BUFFER configurations.
+        if BROADCAST_Y:
+            if NUM_MMA_GROUPS == 1 and NUM_SMEM_BUFFERS != 5:
+                continue
+            elif NUM_MMA_GROUPS == 2 and NUM_SMEM_BUFFERS != 4:
+                continue
+        else:
+            if NUM_MMA_GROUPS == 1 and NUM_SMEM_BUFFERS != 4:
+                continue
+            elif NUM_MMA_GROUPS == 2 and NUM_SMEM_BUFFERS != 3:
+                continue
+
+        # PAIR_CTA requires even number of M tiles and even total tiles
+        num_tiles_m = math.ceil(M / BLOCK_M) if BLOCK_M > 0 else 0
+        num_tiles_n = math.ceil(N / BLOCK_N) if BLOCK_N > 0 else 0
+        total_tiles = num_tiles_m * num_tiles_n
+
+        # PAIR_CTA incompatible with MMA M=64
+        pair_cta_compatible = (
+            (num_tiles_m % 2 == 0)
+            and (total_tiles % 2 == 0)
+            and BLOCK_M == 128
+            and NUM_MMA_GROUPS == 1
+        )
+
+        c.kwargs["PAIR_CTA"] = pair_cta_compatible
+        # Set ctas_per_cga for CUDA-native cluster launch semantics (TLX way)
+        c.ctas_per_cga = (2, 1, 1) if pair_cta_compatible else None
+
+        pruned.append(c)
+    return pruned
+
+
+def get_mm_configs(pre_hook=None) -> List[triton.Config]:
+    if torch.version.hip:
+        if ENABLE_FULL_TURNING_SPACE:
+            block_m_range = [32, 64, 128, 256]
+            block_n_range = [32, 64, 128, 256]
+            block_k_range = [32, 64]
+            group_m_range = [4, 8]
+            matrix_instr_nonkdim_range = [16]
+            waves_per_eu_range = [0]
+            kpack_range = [1, 2]
+            num_warps_range = [4, 8]
+            num_stage_range = [2] if triton.__version__ >= "3.2.0" else [0]
+        else:
+            block_m_range = [256]
+            block_n_range = [256]
+            block_k_range = [32]
+            group_m_range = [8]
+            matrix_instr_nonkdim_range = [16]
+            waves_per_eu_range = [0]
+            kpack_range = [2]
+            num_warps_range = [8]
+            num_stage_range = [2] if triton.__version__ >= "3.2.0" else [0]
+
+        return [
+            triton.Config(
+                {
+                    "BLOCK_M": block_m,
+                    "BLOCK_N": block_n,
+                    "BLOCK_K": block_k,
+                    "GROUP_M": group_m,
+                    "matrix_instr_nonkdim": matrix_instr_nonkdim,
+                    "waves_per_eu": waves_per_eu,
+                    "kpack": kpack,
+                },
+                num_stages=num_stages,
+                num_warps=num_warps,
+                pre_hook=pre_hook,
+            )
+            for block_m in block_m_range
+            for block_n in block_n_range
+            for block_k in block_k_range
+            for group_m in group_m_range
+            for matrix_instr_nonkdim in matrix_instr_nonkdim_range
+            for waves_per_eu in waves_per_eu_range
+            for kpack in kpack_range
+            for num_stages in num_stage_range
+            for num_warps in num_warps_range
+        ]
+    else:
+        block_m_range = [32, 64, 128, 256]
+        block_n_range = [32, 64, 128, 256]
+        block_k_range = [32, 64]
+        group_m_range = [4, 8]
+        # WARP_SPECIALIZE only works with num_warps >=4
+        num_warps_range = [4, 8] if is_sm100_plus() else [2, 4, 8]
+        num_stage_range = [2, 3, 4, 5]
+        if ENABLE_FULL_TURNING_SPACE:
+            return [
+                triton.Config(
+                    {
+                        "BLOCK_M": block_m,
+                        "BLOCK_N": block_n,
+                        "BLOCK_K": block_k,
+                        "GROUP_M": group_m,
+                    },
+                    num_stages=num_stages,
+                    num_warps=num_warps,
+                    pre_hook=pre_hook,
+                )
+                for block_m in block_m_range
+                for block_n in block_n_range
+                for block_k in block_k_range
+                for group_m in group_m_range
+                for num_stages in num_stage_range
+                for num_warps in num_warps_range
+            ]
+        else:
+            configs = [
+                triton.Config(
+                    {
+                        "BLOCK_M": 32,
+                        "BLOCK_N": 64,
+                        "BLOCK_K": 32,
+                        "GROUP_M": 8,
+                    },
+                    num_stages=5,
+                    num_warps=2,
+                    pre_hook=pre_hook,
+                ),
+                triton.Config(
+                    {
+                        "BLOCK_M": 128,
+                        "BLOCK_N": 256,
+                        "BLOCK_K": 64,
+                        "GROUP_M": 8,
+                    },
+                    num_stages=3,
+                    num_warps=8,
+                    pre_hook=pre_hook,
+                ),
+                triton.Config(
+                    {
+                        "BLOCK_M": 64,
+                        "BLOCK_N": 256,
+                        "BLOCK_K": 32,
+                        "GROUP_M": 8,
+                    },
+                    num_stages=4,
+                    num_warps=4,
+                    pre_hook=pre_hook,
+                ),
+                triton.Config(
+                    {
+                        "BLOCK_M": 128,
+                        "BLOCK_N": 128,
+                        "BLOCK_K": 32,
+                        "GROUP_M": 8,
+                    },
+                    num_stages=4,
+                    num_warps=4,
+                    pre_hook=pre_hook,
+                ),
+                triton.Config(
+                    {
+                        "BLOCK_M": 128,
+                        "BLOCK_N": 64,
+                        "BLOCK_K": 32,
+                        "GROUP_M": 8,
+                    },
+                    num_stages=4,
+                    num_warps=4,
+                    pre_hook=pre_hook,
+                ),
+                triton.Config(
+                    {
+                        "BLOCK_M": 64,
+                        "BLOCK_N": 128,
+                        "BLOCK_K": 32,
+                        "GROUP_M": 8,
+                    },
+                    num_stages=4,
+                    num_warps=4,
+                    pre_hook=pre_hook,
+                ),
+                triton.Config(
+                    {
+                        "BLOCK_M": 128,
+                        "BLOCK_N": 32,
+                        "BLOCK_K": 32,
+                        "GROUP_M": 8,
+                    },
+                    num_stages=4,
+                    num_warps=4,
+                    pre_hook=pre_hook,
+                ),
+                triton.Config(
+                    {
+                        "BLOCK_M": 64,
+                        "BLOCK_N": 32,
+                        "BLOCK_K": 32,
+                        "GROUP_M": 8,
+                    },
+                    num_stages=5,
+                    num_warps=2,
+                    pre_hook=pre_hook,
+                ),
+            ]
+            if is_sm100_plus():
+                configs += [
+                    triton.Config(
+                        {
+                            "BLOCK_M": 128,
+                            "BLOCK_N": 256,
+                            "BLOCK_K": 64,
+                            "GROUP_M": 8,
+                        },
+                        num_stages=3,
+                        num_warps=4,
+                        pre_hook=pre_hook,
+                    ),
+                ]
+                return [c for c in configs if c.num_warps >= 4]
+
+            return configs
+
+
+def _get_addmm_tma_ws_persistent_configs(pre_hook=None) -> List[triton.Config]:
+    """Get configs for _addmm_fwd_tma_ws_persistent (sm100+ TLX kernel).
+
+    This kernel has unique requirements (warp specialization, PAIR_CTA,
+    EPILOGUE_SUBTILE) that don't apply to the other addmm kernels.
+    """
+    if ENABLE_FULL_TURNING_SPACE:
+        block_m_range = [64, 128, 256]
+        block_n_range = [64, 128, 256]
+        block_k_range = [64, 128, 256]
+        group_m_range = [8]
+        num_warps_range = [4]
+        num_stage_range = [1]
+        epilogue_subtile_range = [1, 2, 4]
+        num_mma_groups_range = [1, 2]
+        return [
+            triton.Config(
+                {
+                    "BLOCK_M": block_m,
+                    "BLOCK_N": block_n,
+                    "BLOCK_K": block_k,
+                    "GROUP_M": group_m,
+                    "EPILOGUE_SUBTILE": epilogue_subtile,
+                    "NUM_MMA_GROUPS": num_mma_groups,
+                    "NUM_TMEM_BUFFERS": 1 if num_mma_groups == 2 else 2,
+                    "NUM_SMEM_BUFFERS": num_smem_buffers,
+                },
+                num_stages=num_stages,
+                num_warps=num_warps,
+                pre_hook=pre_hook,
+            )
+            for block_m in block_m_range
+            for block_n in block_n_range
+            for block_k in block_k_range
+            for group_m in group_m_range
+            for num_stages in num_stage_range
+            for num_warps in num_warps_range
+            for epilogue_subtile in epilogue_subtile_range
+            for num_mma_groups in num_mma_groups_range
+            for num_smem_buffers in [3, 4, 5]
+        ]
+    else:
+        configs = []
+        for block_m, block_n, block_k in [
+            (128, 256, 64),
+            (128, 128, 64),
+            (64, 128, 64),
+            (64, 256, 64),
+            (128, 64, 128),
+        ]:
+            # Note: num_smem_buffers is pruned to 1 in
+            # the pruning function.
+            for num_smem_buffers in [3, 4, 5]:
+                configs.append(
+                    triton.Config(
+                        {
+                            "BLOCK_M": block_m,
+                            "BLOCK_N": block_n,
+                            "BLOCK_K": block_k,
+                            "GROUP_M": 8,
+                            "EPILOGUE_SUBTILE": 1,
+                            "NUM_MMA_GROUPS": 1,
+                            "NUM_TMEM_BUFFERS": 2,
+                            "NUM_SMEM_BUFFERS": num_smem_buffers,
+                        },
+                        num_stages=1,
+                        num_warps=4,
+                        pre_hook=pre_hook,
+                    ),
+                )
+        return configs
+
+
+def get_triton_persistent_configs(pre_hook=None) -> List[triton.Config]:
+    if not _use_meta_ws():
+        configs = get_mm_configs(pre_hook=pre_hook)
+        for c in configs:
+            c.kwargs["DATA_PARTITION_FACTOR"] = 1
+            c.kwargs["EPILOGUE_SUBTILE"] = 1
+        return configs
+    # TODO: Prune configs to best configs.
+    return [
+        triton.Config(  # pyre-ignore[28]
+            {
+                "BLOCK_M": block_m,
+                "BLOCK_N": block_n,
+                "BLOCK_K": block_k,
+                "GROUP_M": 8,
+                "EPILOGUE_SUBTILE": subtile,
+                "DATA_PARTITION_FACTOR": DP,
+            },
+            num_stages=num_stages,
+            num_warps=4,
+            pre_hook=pre_hook,
+            early_tma_store_lowering=1,
+            maxRegAutoWS=255,
+        )
+        for block_m in [64, 128, 256]
+        for block_n in [64, 128, 256]
+        for block_k in [64, 128, 256]
+        for num_stages in [2, 3, 4]
+        for subtile in [1, 2, 4, 8]
+        for DP in [1, 2]
+    ]
+
+
+@triton_cc(
+    annotations={
+        "M": "i32",
+        "N": ("i32", 16),
+        "K": ("i32", 16),
+        "stride_xm": ("i32", 16),
+        "stride_xk": ("i32", 1),
+        "stride_wk": ("i32", 16),
+        "stride_wn": ("i32", 1),
+        "stride_ym": ("i32", 16),
+        "stride_yn": ("i32", 1),
+        "stride_zm": ("i32", 16),
+        "stride_zn": ("i32", 1),
+    },
+)
+@triton_autotune(
+    configs=get_mm_configs(),
+    key=["N", "K"],
+)
+@triton.jit
+def _addmm_fwd(
+    x_ptr,
+    w_ptr,
+    y_ptr,
+    z_ptr,
+    M,
+    N,
+    K,
+    stride_xm,
+    stride_xk,
+    stride_wk,
+    stride_wn,
+    stride_ym,
+    stride_yn,
+    stride_zm,
+    stride_zn,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_M: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BROADCAST_Y: tl.constexpr,
+):
+    pid_0, pid_1 = tl.program_id(axis=0), tl.program_id(axis=1)
+    pid = pid_0 * tl.num_programs(axis=1) + pid_1
+    num_pid_m = tl.cdiv(M, BLOCK_M)
+    num_pid_n = tl.cdiv(N, BLOCK_N)
+    num_pid_in_group = GROUP_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_M)
+    pid_m = first_pid_m + (pid % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    offs_m = tl.arange(0, BLOCK_M)
+    offs_k = tl.arange(0, BLOCK_K)
+    offs_n = tl.arange(0, BLOCK_N)
+    mask_m = (pid_m * BLOCK_M + offs_m)[:, None] < M
+    mask_n = (pid_n * BLOCK_N + offs_n)[None, :] < N
+    x_ptr += pid_m.to(tl.int64) * BLOCK_M * stride_xm
+    x_ptrs = x_ptr + (offs_m[:, None] * stride_xm + offs_k[None, :] * stride_xk)
+    w_ptr += pid_n.to(tl.int64) * BLOCK_N * stride_wn
+    w_ptrs = w_ptr + (offs_k[:, None] * stride_wk + offs_n[None, :] * stride_wn)
+    accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_K)):
+        mask_k = offs_k[None, :] < K - k * BLOCK_K
+        x = tl.load(x_ptrs, mask=mask_k & mask_m, other=0.0)
+        mask_k = offs_k[:, None] < K - k * BLOCK_K
+        w = tl.load(w_ptrs, mask=mask_k & mask_n, other=0.0)
+        accumulator += tl.dot(x, w, allow_tf32=ALLOW_TF32)
+        x_ptrs += BLOCK_K * stride_xk
+        w_ptrs += BLOCK_K * stride_wk
+
+    z_mask = mask_m & mask_n
+    if BROADCAST_Y:
+        # y is a vector, broadcast to add to z
+        y_ptr += pid_n.to(tl.int64) * BLOCK_N * stride_yn
+        y_ptrs = y_ptr + stride_yn * offs_n[None, :]
+        y = tl.load(y_ptrs, mask=mask_n)
+    else:
+        y_ptr += pid_m.to(tl.int64) * BLOCK_M * stride_ym
+        y_ptr += pid_n.to(tl.int64) * BLOCK_N * stride_yn
+        y_ptrs = y_ptr + stride_ym * offs_m[:, None] + stride_yn * offs_n[None, :]
+        y = tl.load(y_ptrs, mask=z_mask)
+    z = (accumulator + y.to(tl.float32)).to(z_ptr.dtype.element_ty)
+    z_ptr += pid_m.to(tl.int64) * BLOCK_M * stride_zm
+    z_ptr += pid_n.to(tl.int64) * BLOCK_N * stride_zn
+    z_ptrs = z_ptr + stride_zm * offs_m[:, None] + stride_zn * offs_n[None, :]
+    tl.store(z_ptrs, z, mask=z_mask)
+
+
+def _addmm_tma_set_block_size_hook(nargs):
+    BLOCK_M = nargs["BLOCK_M"]
+    BLOCK_N = nargs["BLOCK_N"]
+    BLOCK_K = nargs["BLOCK_K"]
+    NUM_MMA_GROUPS = nargs.get("NUM_MMA_GROUPS", 1)
+    BLOCK_M_SPLIT = BLOCK_M // NUM_MMA_GROUPS
+    PAIR_CTA = nargs.get("PAIR_CTA", False)
+    nargs["x_desc"].block_shape = [BLOCK_M_SPLIT, BLOCK_K]
+    # In PAIR_CTA mode, each CTA loads BLOCK_N // 2 of W
+    if PAIR_CTA:
+        nargs["w_desc"].block_shape = [BLOCK_K, BLOCK_N // 2]
+    else:
+        nargs["w_desc"].block_shape = [BLOCK_K, BLOCK_N]
+    EPILOGUE_SUBTILE = nargs.get("EPILOGUE_SUBTILE", 1)
+    nargs["z_desc"].block_shape = [BLOCK_M_SPLIT, BLOCK_N // EPILOGUE_SUBTILE]
+    if nargs["BROADCAST_Y"]:
+        nargs["y_desc"].block_shape = [1, BLOCK_N // EPILOGUE_SUBTILE]
+    else:
+        nargs["y_desc"].block_shape = [BLOCK_M_SPLIT, BLOCK_N // EPILOGUE_SUBTILE]
+
+
+@triton.jit
+def _compute_pid(tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_SMS):
+    group_id = tile_id // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + (tile_id % group_size_m)
+    pid_n = (tile_id % num_pid_in_group) // group_size_m
+    return pid_m, pid_n
+
+
+@triton.jit
+def _addmm_persistent_tile_body(
+    x_desc,
+    w_desc,
+    y_desc,
+    z_desc,
+    tile_id,
+    num_pid_in_group,
+    num_pid_m,
+    k_tiles,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_M: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BROADCAST_Y: tl.constexpr,
+    NUM_SMS: tl.constexpr,
+    EPILOGUE_SUBTILE: tl.constexpr,
+    INNER_WARP_SPECIALIZE: tl.constexpr,
+):
+    pid_m, pid_n = _compute_pid(tile_id, num_pid_in_group, num_pid_m, GROUP_M, NUM_SMS)
+    offs_xm = pid_m * BLOCK_M
+    offs_wn = pid_n * BLOCK_N
+
+    accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    for k in tl.range(0, k_tiles, warp_specialize=INNER_WARP_SPECIALIZE):
+        offs_k = k * BLOCK_K
+        x = x_desc.load([offs_xm, offs_k])
+        w = w_desc.load([offs_k, offs_wn])
+        accumulator = tl.dot(x, w, accumulator, allow_tf32=ALLOW_TF32)
+
+    # Epilogue subtiling breaks the store into multiple pieces to reduce
+    # shared memory consumption and allow higher stage counts.
+    tl.static_assert(
+        EPILOGUE_SUBTILE <= 8,
+        "EPILOGUE_SUBTILE > 8 is not supported",
+    )
+    acc_subtiles = _split_n_2D(accumulator, EPILOGUE_SUBTILE)  # pyre-ignore[16]
+    slice_size: tl.constexpr = BLOCK_N // EPILOGUE_SUBTILE
+    for i in tl.static_range(EPILOGUE_SUBTILE):
+        if BROADCAST_Y:
+            y_i = y_desc.load([0, offs_wn + i * slice_size])
+        else:
+            y_i = y_desc.load([offs_xm, offs_wn + i * slice_size])
+        z_i = (acc_subtiles[i] + y_i.to(tl.float32)).to(z_desc.dtype)
+        z_desc.store([offs_xm, offs_wn + i * slice_size], z_i)
+
+
+@triton_autotune(
+    configs=get_triton_persistent_configs(pre_hook=_addmm_tma_set_block_size_hook),
+    key=["M", "N", "K", "WARP_SPECIALIZE"],
+    prune_configs_by={"early_config_prune": _prune_persistent_autows_configs},
+)
+@triton.jit
+def _addmm_fwd_tma_persistent(
+    x_desc,
+    w_desc,
+    y_desc,
+    z_desc,
+    M,
+    N,
+    K,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_M: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BROADCAST_Y: tl.constexpr,
+    WARP_SPECIALIZE: tl.constexpr,
+    NUM_SMS: tl.constexpr,
+    EPILOGUE_SUBTILE: tl.constexpr,
+    DATA_PARTITION_FACTOR: tl.constexpr,
+    USE_META_WS: tl.constexpr,
+):
+    start_pid = tl.program_id(axis=0)
+    num_pid_m = tl.cdiv(M, BLOCK_M)
+    num_pid_n = tl.cdiv(N, BLOCK_N)
+    k_tiles = tl.cdiv(K, BLOCK_K)
+    num_tiles = num_pid_m * num_pid_n
+
+    num_pid_in_group = GROUP_M * num_pid_n
+
+    if USE_META_WS:
+        # Some arguments are only available in FBexperimental.
+        # pyre-ignore[28]: smem_alloc_algo is FBexperimental
+        for tile_id in tl.range(
+            start_pid,
+            num_tiles,
+            NUM_SMS,
+            flatten=False,
+            warp_specialize=WARP_SPECIALIZE,
+            data_partition_factor=DATA_PARTITION_FACTOR,
+            smem_alloc_algo=1,
+        ):
+            _addmm_persistent_tile_body(
+                x_desc,
+                w_desc,
+                y_desc,
+                z_desc,
+                tile_id,
+                num_pid_in_group,
+                num_pid_m,
+                k_tiles,
+                BLOCK_M=BLOCK_M,
+                BLOCK_N=BLOCK_N,
+                BLOCK_K=BLOCK_K,
+                GROUP_M=GROUP_M,
+                ALLOW_TF32=ALLOW_TF32,
+                BROADCAST_Y=BROADCAST_Y,
+                NUM_SMS=NUM_SMS,
+                EPILOGUE_SUBTILE=EPILOGUE_SUBTILE,
+                INNER_WARP_SPECIALIZE=tl.constexpr(False),
+            )
+    else:
+        # Pure OAI Triton version.
+        for tile_id in tl.range(
+            start_pid, num_tiles, NUM_SMS, flatten=True, warp_specialize=WARP_SPECIALIZE
+        ):
+            _addmm_persistent_tile_body(
+                x_desc,
+                w_desc,
+                y_desc,
+                z_desc,
+                tile_id,
+                num_pid_in_group,
+                num_pid_m,
+                k_tiles,
+                BLOCK_M=BLOCK_M,
+                BLOCK_N=BLOCK_N,
+                BLOCK_K=BLOCK_K,
+                GROUP_M=GROUP_M,
+                ALLOW_TF32=ALLOW_TF32,
+                BROADCAST_Y=BROADCAST_Y,
+                NUM_SMS=NUM_SMS,
+                EPILOGUE_SUBTILE=EPILOGUE_SUBTILE,
+                INNER_WARP_SPECIALIZE=WARP_SPECIALIZE,
+            )
+
+
+@triton_autotune(
+    configs=get_mm_configs(pre_hook=_addmm_tma_set_block_size_hook),
+    key=["N", "K"],
+)
+@triton.jit
+def _addmm_fwd_tma_ws(
+    x_desc,
+    w_desc,
+    y_desc,
+    z_desc,
+    M,
+    N,
+    K,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_M: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BROADCAST_Y: tl.constexpr,
+    NUM_SMEM_BUFFERS: tl.constexpr,
+):
+    x_buffers = tlx.local_alloc((BLOCK_M, BLOCK_K), x_desc.dtype, NUM_SMEM_BUFFERS)
+    w_buffers = tlx.local_alloc((BLOCK_K, BLOCK_N), w_desc.dtype, NUM_SMEM_BUFFERS)
+    acc_tmem_buffer = tlx.local_alloc(
+        (BLOCK_M, BLOCK_N), tl.float32, tl.constexpr(1), tlx.storage_kind.tmem
+    )
+
+    if BROADCAST_Y:
+        y_buffer = tlx.local_alloc((1, BLOCK_N), y_desc.dtype, tl.constexpr(1))
+    else:
+        y_buffer = tlx.local_alloc((BLOCK_M, BLOCK_N), y_desc.dtype, tl.constexpr(1))
+    z_buffer = tlx.local_alloc((BLOCK_M, BLOCK_N), z_desc.dtype, tl.constexpr(1))
+
+    smem_full_bars = tlx.alloc_barriers(num_barriers=NUM_SMEM_BUFFERS, arrive_count=1)
+    smem_empty_bars = tlx.alloc_barriers(num_barriers=NUM_SMEM_BUFFERS, arrive_count=1)
+    y_load_barrier = tlx.alloc_barriers(num_barriers=1, arrive_count=1)
+
+    with tlx.async_tasks():
+        # Producer task: TMA loads
+        with tlx.async_task("default"):
+            pid_0, pid_1 = tl.program_id(axis=0), tl.program_id(axis=1)
+            pid = pid_0 * tl.num_programs(axis=1) + pid_1
+            num_pid_m = tl.cdiv(M, BLOCK_M)
+            num_pid_n = tl.cdiv(N, BLOCK_N)
+            num_pid_in_group = GROUP_M * num_pid_n
+            group_id = pid // num_pid_in_group
+            first_pid_m = group_id * GROUP_M
+            group_size_m = min(num_pid_m - first_pid_m, GROUP_M)
+            pid_m = first_pid_m + (pid % group_size_m)
+            pid_n = (pid % num_pid_in_group) // group_size_m
+
+            offs_xm = pid_m * BLOCK_M
+            offs_wn = pid_n * BLOCK_N
+            k_tiles = tl.cdiv(K, BLOCK_K)
+
+            load_phase = 0
+            for k in range(0, k_tiles):
+                buf = k % int(NUM_SMEM_BUFFERS)
+
+                # Wait for buffer to be free
+                if k >= NUM_SMEM_BUFFERS:
+                    tlx.barrier_wait(smem_empty_bars[buf], load_phase ^ 1)
+
+                offs_k = k * BLOCK_K
+                tlx.barrier_expect_bytes(
+                    smem_full_bars[buf],
+                    2 * (BLOCK_M * BLOCK_K + BLOCK_K * BLOCK_N),
+                )
+                tlx.async_descriptor_load(
+                    x_desc, x_buffers[buf], [offs_xm, offs_k], smem_full_bars[buf]
+                )
+                tlx.async_descriptor_load(
+                    w_desc, w_buffers[buf], [offs_k, offs_wn], smem_full_bars[buf]
+                )
+
+                load_phase = load_phase ^ (buf == NUM_SMEM_BUFFERS - 1)
+
+        # Consumer task: async_dot MMA
+        with tlx.async_task(num_warps=4, num_regs=232):
+            pid_0, pid_1 = tl.program_id(axis=0), tl.program_id(axis=1)
+            pid = pid_0 * tl.num_programs(axis=1) + pid_1
+            num_pid_m = tl.cdiv(M, BLOCK_M)
+            num_pid_n = tl.cdiv(N, BLOCK_N)
+            num_pid_in_group = GROUP_M * num_pid_n
+            group_id = pid // num_pid_in_group
+            first_pid_m = group_id * GROUP_M
+            group_size_m = min(num_pid_m - first_pid_m, GROUP_M)
+            pid_m = first_pid_m + (pid % group_size_m)
+            pid_n = (pid % num_pid_in_group) // group_size_m
+
+            offs_xm = pid_m * BLOCK_M
+            offs_wn = pid_n * BLOCK_N
+            k_tiles = tl.cdiv(K, BLOCK_K)
+
+            # Start async load of y early
+            y_buf_view = tlx.local_view(y_buffer, 0)
+            y_load_bar = tlx.local_view(y_load_barrier, 0)
+            if BROADCAST_Y:
+                tlx.barrier_expect_bytes(y_load_bar, 1 * BLOCK_N * 2)
+                tlx.async_descriptor_load(y_desc, y_buf_view, [0, offs_wn], y_load_bar)
+            else:
+                tlx.barrier_expect_bytes(y_load_bar, BLOCK_M * BLOCK_N * 2)
+                tlx.async_descriptor_load(
+                    y_desc, y_buf_view, [offs_xm, offs_wn], y_load_bar
+                )
+
+            dot_phase = 0
+            for k in range(0, k_tiles):
+                buf = k % int(NUM_SMEM_BUFFERS)
+                tlx.barrier_wait(smem_full_bars[buf], dot_phase)
+
+                tlx.async_dot(
+                    x_buffers[buf],
+                    w_buffers[buf],
+                    acc_tmem_buffer[0],
+                    use_acc=k > 0,
+                    mBarriers=[smem_empty_bars[buf]],
+                    out_dtype=tl.float32,
+                )
+
+                dot_phase = dot_phase ^ (buf == NUM_SMEM_BUFFERS - 1)
+
+            last_buf = (k_tiles - 1) % NUM_SMEM_BUFFERS
+            last_dot_phase = dot_phase ^ (last_buf == NUM_SMEM_BUFFERS - 1)
+            tlx.barrier_wait(smem_empty_bars[last_buf], last_dot_phase)
+
+            tmem_result = tlx.local_load(acc_tmem_buffer[0])
+
+            tlx.barrier_wait(y_load_bar, 0)
+            y = tlx.local_load(y_buf_view)
+
+            z = (tmem_result + y.to(tl.float32)).to(z_desc.dtype)
+            z_buf_view = tlx.local_view(z_buffer, 0)
+            tlx.local_store(z_buf_view, z)
+            tlx.async_descriptor_store(z_desc, z_buf_view, [offs_xm, offs_wn])
+            tlx.async_descriptor_store_wait(0)
+
+
+@triton_autotune(
+    configs=_get_addmm_tma_ws_persistent_configs(
+        pre_hook=_addmm_tma_set_block_size_hook
+    ),
+    key=["M", "N", "K"],
+    prune_configs_by={"early_config_prune": _prune_configs_for_tlx_persistent_addmm},
+)
+@triton.jit
+def _addmm_fwd_tma_ws_persistent(
+    x_desc,
+    w_desc,
+    y_desc,
+    z_desc,
+    M,
+    N,
+    K,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_M: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BROADCAST_Y: tl.constexpr,
+    NUM_SMEM_BUFFERS: tl.constexpr,
+    NUM_TMEM_BUFFERS: tl.constexpr,
+    NUM_SMS: tl.constexpr,
+    EPILOGUE_SUBTILE: tl.constexpr,
+    PAIR_CTA: tl.constexpr,
+    NUM_MMA_GROUPS: tl.constexpr,
+):
+    BLOCK_M_SPLIT: tl.constexpr = BLOCK_M // NUM_MMA_GROUPS
+
+    # Allocate buffers once for all tiles
+    x_buffers = tlx.local_alloc(
+        (BLOCK_M_SPLIT, BLOCK_K), x_desc.dtype, NUM_SMEM_BUFFERS * NUM_MMA_GROUPS
+    )
+    # In pair CTA mode, each CTA only needs to load half of W
+    if PAIR_CTA:
+        w_buffers = tlx.local_alloc(
+            (BLOCK_K, BLOCK_N // 2), w_desc.dtype, NUM_SMEM_BUFFERS
+        )
+    else:
+        w_buffers = tlx.local_alloc((BLOCK_K, BLOCK_N), w_desc.dtype, NUM_SMEM_BUFFERS)
+    tmem_buffers = tlx.local_alloc(
+        (BLOCK_M_SPLIT, BLOCK_N),
+        tl.float32,
+        NUM_TMEM_BUFFERS * NUM_MMA_GROUPS,
+        tlx.storage_kind.tmem,
+    )
+    slice_size: tl.constexpr = BLOCK_N // EPILOGUE_SUBTILE
+
+    Y_Z_SHARED: tl.constexpr = NUM_MMA_GROUPS == 2 and not BROADCAST_Y
+    if Y_Z_SHARED:
+        NUM_Z_BUFFERS: tl.constexpr = EPILOGUE_SUBTILE * NUM_MMA_GROUPS
+    else:
+        NUM_Z_BUFFERS: tl.constexpr = NUM_MMA_GROUPS
+
+    if Y_Z_SHARED:
+        bias_storage_alias = tlx.storage_alias_spec(storage=tlx.storage_kind.smem)
+        y_buffers = tlx.local_alloc(
+            (BLOCK_M_SPLIT, slice_size),
+            y_desc.dtype,
+            NUM_Z_BUFFERS,
+            reuse=bias_storage_alias,
+        )
+        z_buffers = tlx.local_alloc(
+            (BLOCK_M_SPLIT, slice_size),
+            z_desc.dtype,
+            NUM_Z_BUFFERS,
+            reuse=bias_storage_alias,
+        )
+        # Define y and z to share a single buffer
+        bias_storage_alias.set_buffer_overlap(
+            tlx.reuse_group(
+                y_buffers,
+                z_buffers,
+                group_type=tlx.reuse_group_type.shared,
+            )
+        )
+    else:
+        if BROADCAST_Y:
+            y_buffers = tlx.local_alloc(
+                (1, slice_size), y_desc.dtype, EPILOGUE_SUBTILE * NUM_MMA_GROUPS
+            )
+        else:
+            y_buffers = tlx.local_alloc(
+                (BLOCK_M_SPLIT, slice_size),
+                y_desc.dtype,
+                EPILOGUE_SUBTILE * NUM_MMA_GROUPS,
+            )
+        z_buffers = tlx.local_alloc(
+            (BLOCK_M_SPLIT, slice_size), z_desc.dtype, NUM_Z_BUFFERS
+        )
+
+    cluster_cta_rank = tlx.cluster_cta_rank()
+    pred_cta0 = cluster_cta_rank == 0
+    if PAIR_CTA:
+        cta_bars = tlx.alloc_barriers(
+            num_barriers=NUM_SMEM_BUFFERS * NUM_MMA_GROUPS, arrive_count=2
+        )
+
+    # Barriers for producer <-> MMA (separate X and W barriers)
+    x_smem_full_bars = tlx.alloc_barriers(
+        num_barriers=NUM_SMEM_BUFFERS * NUM_MMA_GROUPS, arrive_count=1
+    )
+    x_smem_empty_bars = tlx.alloc_barriers(
+        num_barriers=NUM_SMEM_BUFFERS * NUM_MMA_GROUPS, arrive_count=1
+    )
+    w_smem_full_bars = tlx.alloc_barriers(num_barriers=NUM_SMEM_BUFFERS, arrive_count=1)
+    # Barriers for MMA <-> Epilogue
+    tmem_full_bars = tlx.alloc_barriers(
+        num_barriers=NUM_TMEM_BUFFERS * NUM_MMA_GROUPS, arrive_count=1
+    )
+    tmem_empty_bars = tlx.alloc_barriers(
+        num_barriers=NUM_TMEM_BUFFERS * NUM_MMA_GROUPS, arrive_count=1
+    )
+    # Barriers for producer <-> Epilogue
+    # y_load_bar: producer signals when y data is ready
+    # y_empty_bar: epilogue signals when done using y buffer
+    y_load_bars = tlx.alloc_barriers(
+        num_barriers=EPILOGUE_SUBTILE * NUM_MMA_GROUPS, arrive_count=1
+    )
+    y_empty_bars = tlx.alloc_barriers(
+        num_barriers=EPILOGUE_SUBTILE * NUM_MMA_GROUPS, arrive_count=1
+    )
+    z_load_bars = tlx.alloc_barriers(num_barriers=NUM_Z_BUFFERS, arrive_count=1)
+    z_empty_bars = tlx.alloc_barriers(num_barriers=NUM_Z_BUFFERS, arrive_count=1)
+
+    with tlx.async_tasks():
+        # Epilogue consumer: waits for Y from producer, adds bias, stores to SMEM.
+        with tlx.async_task("default"):
+            start_pid = tl.program_id(axis=0)
+            num_pid_m = tl.cdiv(M, BLOCK_M)
+            if PAIR_CTA:
+                # Round up to even for proper CTA pairing
+                num_pid_m = (num_pid_m + 1) // 2 * 2
+            num_pid_n = tl.cdiv(N, BLOCK_N)
+            num_tiles = num_pid_m * num_pid_n
+
+            tmem_read_phase = 0
+            cur_tmem_buf = 0
+            y_load_phase = 0
+            z_load_phase = 0
+
+            z_idx = 0
+            for _ in range(start_pid, num_tiles, NUM_SMS):
+                for slice_id in tl.static_range(EPILOGUE_SUBTILE):
+                    for group_id in tl.static_range(NUM_MMA_GROUPS):
+                        buf_idx = group_id * NUM_TMEM_BUFFERS + cur_tmem_buf
+                        acc_tmem = tmem_buffers[buf_idx]
+                        if slice_id == 0:
+                            # Wait for MMA to finish computing this group
+                            tlx.barrier_wait(tmem_full_bars[buf_idx], tmem_read_phase)
+
+                        # Load result from TMEM and add bias
+                        acc_subslice = tlx.subslice(
+                            acc_tmem, slice_id * slice_size, slice_size
+                        )
+                        result = tlx.local_load(acc_subslice)
+                        if slice_id == EPILOGUE_SUBTILE - 1:
+                            # Signal MMA that this TMEM buffer is now free
+                            tlx.barrier_arrive(tmem_empty_bars[buf_idx], 1)
+
+                        y_idx = slice_id * NUM_MMA_GROUPS + group_id
+                        y_buf_view = tlx.local_view(y_buffers, y_idx)
+                        y_full = tlx.local_view(y_load_bars, y_idx)
+                        tlx.barrier_wait(y_full, y_load_phase)
+                        y = tlx.local_load(y_buf_view)
+                        # If Y and Z are not shared signal we can load the next bias.
+                        if not Y_Z_SHARED:
+                            y_empty = tlx.local_view(y_empty_bars, y_idx)
+                            tlx.barrier_arrive(y_empty, 1)
+                        z = (result + y.to(tl.float32)).to(z_desc.dtype)
+                        z_buf_view = tlx.local_view(z_buffers, z_idx)
+                        # If Y and Z are not shared wait for Z to be empty.
+                        # If there are shared this already guaranteed.
+                        if not Y_Z_SHARED:
+                            z_empty = tlx.local_view(z_empty_bars, z_idx)
+                            tlx.barrier_wait(z_empty, z_load_phase ^ 1)
+                        tlx.local_store(z_buf_view, z)
+                        z_full = tlx.local_view(z_load_bars, z_idx)
+                        tlx.barrier_arrive(z_full, 1)
+                        z_load_phase = z_load_phase ^ (z_idx == (NUM_Z_BUFFERS - 1))
+                        # pyre-ignore[58]
+                        z_idx = (z_idx + 1) % NUM_Z_BUFFERS
+
+                tmem_read_phase = tmem_read_phase ^ (
+                    cur_tmem_buf == int(NUM_TMEM_BUFFERS) - 1
+                )
+                y_load_phase = y_load_phase ^ 1
+
+                cur_tmem_buf = (cur_tmem_buf + 1) % int(NUM_TMEM_BUFFERS)
+
+        # MMA consumer: performs matrix multiplication
+        with tlx.async_task(num_warps=1, num_regs=24):
+            start_pid = tl.program_id(axis=0)
+            num_pid_m = tl.cdiv(M, BLOCK_M)
+            if PAIR_CTA:
+                # Round up to even for proper CTA pairing
+                num_pid_m = (num_pid_m + 1) // 2 * 2
+            num_pid_n = tl.cdiv(N, BLOCK_N)
+            num_pid_in_group = GROUP_M * num_pid_n
+            num_tiles = num_pid_m * num_pid_n
+            k_tiles = tl.cdiv(K, BLOCK_K)
+
+            dot_phase = 0
+            tmem_write_phase = 1
+            cur_tmem_buf = 0
+            processed_k_iters = 0
+
+            for tile_id in range(start_pid, num_tiles, NUM_SMS):
+                pid_m, pid_n = _compute_pid(
+                    tile_id, num_pid_in_group, num_pid_m, GROUP_M, NUM_SMS
+                )
+
+                # First K iteration (peeled): use_acc=False
+                buf = processed_k_iters % int(NUM_SMEM_BUFFERS)
+                tlx.barrier_wait(w_smem_full_bars[buf], dot_phase)
+
+                for group_id in tl.static_range(NUM_MMA_GROUPS):
+                    a_buf = group_id * NUM_SMEM_BUFFERS + buf
+                    acc_buf = group_id * NUM_TMEM_BUFFERS + cur_tmem_buf
+
+                    tlx.barrier_wait(x_smem_full_bars[a_buf], dot_phase)
+
+                    # Wait for epilogue to finish with this TMEM buffer
+                    tlx.barrier_wait(tmem_empty_bars[acc_buf], tmem_write_phase)
+
+                    if PAIR_CTA:
+                        # pyre-ignore[61]
+                        tlx.barrier_arrive(cta_bars[a_buf], 1, remote_cta_rank=0)
+                        # pyre-ignore[61]
+                        tlx.barrier_wait(
+                            # pyre-ignore[61]
+                            cta_bars[a_buf],
+                            phase=dot_phase,
+                            pred=pred_cta0,
+                        )
+
+                    tlx.async_dot(
+                        x_buffers[a_buf],
+                        w_buffers[buf],
+                        tmem_buffers[acc_buf],
+                        use_acc=False,
+                        mBarriers=[x_smem_empty_bars[a_buf]],
+                        two_ctas=PAIR_CTA,
+                        out_dtype=tl.float32,
+                    )
+
+                dot_phase = dot_phase ^ (buf == int(NUM_SMEM_BUFFERS) - 1)
+
+                # Remaining K iterations: use_acc=True
+                for k in range(1, k_tiles):
+                    buf = (processed_k_iters + k) % int(NUM_SMEM_BUFFERS)
+                    tlx.barrier_wait(w_smem_full_bars[buf], dot_phase)
+
+                    for group_id in tl.static_range(NUM_MMA_GROUPS):
+                        a_buf = group_id * NUM_SMEM_BUFFERS + buf
+                        acc_buf = group_id * NUM_TMEM_BUFFERS + cur_tmem_buf
+
+                        tlx.barrier_wait(x_smem_full_bars[a_buf], dot_phase)
+
+                        if PAIR_CTA:
+                            # pyre-ignore[61]
+                            tlx.barrier_arrive(cta_bars[a_buf], 1, remote_cta_rank=0)
+                            # pyre-ignore[61]
+                            tlx.barrier_wait(
+                                # pyre-ignore[61]
+                                cta_bars[a_buf],
+                                phase=dot_phase,
+                                # pyre-ignore[61]
+                                pred=pred_cta0,
+                            )
+
+                        tlx.async_dot(
+                            x_buffers[a_buf],
+                            w_buffers[buf],
+                            tmem_buffers[acc_buf],
+                            use_acc=True,
+                            mBarriers=[x_smem_empty_bars[a_buf]],
+                            two_ctas=PAIR_CTA,
+                            out_dtype=tl.float32,
+                        )
+
+                    dot_phase = dot_phase ^ (buf == int(NUM_SMEM_BUFFERS) - 1)
+
+                # Wait for last MMA to complete and signal epilogue
+                last_buf = (processed_k_iters + k_tiles - 1) % int(NUM_SMEM_BUFFERS)
+                last_dot_phase = dot_phase ^ (last_buf == int(NUM_SMEM_BUFFERS) - 1)
+                for group_id in tl.static_range(NUM_MMA_GROUPS):
+                    a_buf = group_id * NUM_SMEM_BUFFERS + last_buf
+                    tlx.barrier_wait(x_smem_empty_bars[a_buf], last_dot_phase)
+                    acc_buf = group_id * NUM_TMEM_BUFFERS + cur_tmem_buf
+                    # Signal epilogue that result is ready
+                    tlx.barrier_arrive(tmem_full_bars[acc_buf], 1)
+
+                tmem_write_phase = tmem_write_phase ^ (
+                    cur_tmem_buf == int(NUM_TMEM_BUFFERS) - 1
+                )
+                cur_tmem_buf = (cur_tmem_buf + 1) % int(NUM_TMEM_BUFFERS)
+                processed_k_iters += k_tiles
+
+        # Producer: TMA loads for X, W, and Y
+        with tlx.async_task(num_warps=1, num_regs=24):
+            start_pid = tl.program_id(axis=0)
+            num_pid_m = tl.cdiv(M, BLOCK_M)
+            if PAIR_CTA:
+                # Round up to even for proper CTA pairing
+                num_pid_m = (num_pid_m + 1) // 2 * 2
+            num_pid_n = tl.cdiv(N, BLOCK_N)
+            num_pid_in_group = GROUP_M * num_pid_n
+            num_tiles = num_pid_m * num_pid_n
+            k_tiles = tl.cdiv(K, BLOCK_K)
+
+            load_phase = 0
+            y_load_phase = 0
+            processed_k_iters = 0
+
+            for tile_id in range(start_pid, num_tiles, NUM_SMS):
+                pid_m, pid_n = _compute_pid(
+                    tile_id, num_pid_in_group, num_pid_m, GROUP_M, NUM_SMS
+                )
+                offs_xm = pid_m * BLOCK_M
+                # Full tile offset for y loading (both CTAs use same y)
+                offs_wn_full = pid_n * BLOCK_N
+                # Split W into two parts so each CTA has different offset
+                if PAIR_CTA:
+                    # pyre-ignore[61]
+                    offs_wn = pid_n * BLOCK_N + cluster_cta_rank * (BLOCK_N // 2)
+                else:
+                    offs_wn = pid_n * BLOCK_N
+
+                for k in range(0, k_tiles):
+                    buf = (processed_k_iters + k) % int(NUM_SMEM_BUFFERS)
+                    offs_k = k * BLOCK_K
+
+                    # Load X for group 0
+                    a_buf = buf  # 0 * NUM_SMEM_BUFFERS + buf
+                    tlx.barrier_wait(x_smem_empty_bars[a_buf], load_phase ^ 1)
+                    tlx.barrier_expect_bytes(
+                        x_smem_full_bars[a_buf],
+                        2 * BLOCK_M_SPLIT * BLOCK_K,
+                    )
+                    tlx.async_descriptor_load(
+                        x_desc,
+                        x_buffers[a_buf],
+                        [offs_xm, offs_k],
+                        x_smem_full_bars[a_buf],
+                    )
+
+                    # Load W (wait for last group's x_empty to know W is free)
+                    last_a_buf = (NUM_MMA_GROUPS - 1) * NUM_SMEM_BUFFERS + buf
+                    tlx.barrier_wait(x_smem_empty_bars[last_a_buf], load_phase ^ 1)
+                    if PAIR_CTA:
+                        tlx.barrier_expect_bytes(
+                            w_smem_full_bars[buf],
+                            2 * BLOCK_K * (BLOCK_N // 2),
+                        )
+                    else:
+                        tlx.barrier_expect_bytes(
+                            w_smem_full_bars[buf],
+                            2 * BLOCK_K * BLOCK_N,
+                        )
+                    tlx.async_descriptor_load(
+                        w_desc,
+                        w_buffers[buf],
+                        [offs_k, offs_wn],
+                        w_smem_full_bars[buf],
+                    )
+
+                    # Load X for remaining groups
+                    for group_id in tl.static_range(1, NUM_MMA_GROUPS):
+                        a_buf = group_id * NUM_SMEM_BUFFERS + buf
+                        tlx.barrier_wait(x_smem_empty_bars[a_buf], load_phase ^ 1)
+                        offs_xm2 = offs_xm + group_id * BLOCK_M_SPLIT
+                        tlx.barrier_expect_bytes(
+                            x_smem_full_bars[a_buf],
+                            2 * BLOCK_M_SPLIT * BLOCK_K,
+                        )
+                        tlx.async_descriptor_load(
+                            x_desc,
+                            x_buffers[a_buf],
+                            [offs_xm2, offs_k],
+                            x_smem_full_bars[a_buf],
+                        )
+
+                    load_phase = load_phase ^ (buf == int(NUM_SMEM_BUFFERS) - 1)
+
+                for slice_id in tl.static_range(EPILOGUE_SUBTILE):
+                    for group_id in tl.static_range(NUM_MMA_GROUPS):
+                        y_idx = slice_id * NUM_MMA_GROUPS + group_id
+                        y_buf_view = tlx.local_view(y_buffers, y_idx)
+                        y_bar = tlx.local_view(y_load_bars, y_idx)
+                        # If Y and Z are shared we need to wait for Z to be empty.
+                        if Y_Z_SHARED:
+                            y_empty = tlx.local_view(z_empty_bars, y_idx)
+                        else:
+                            y_empty = tlx.local_view(y_empty_bars, y_idx)
+                        tlx.barrier_wait(y_empty, y_load_phase ^ 1)
+                        if BROADCAST_Y:
+                            tlx.barrier_expect_bytes(y_bar, 1 * slice_size * 2)
+                            tlx.async_descriptor_load(
+                                y_desc,
+                                y_buf_view,
+                                [0, offs_wn_full + slice_id * slice_size],
+                                y_bar,
+                            )
+                        else:
+                            tlx.barrier_expect_bytes(
+                                y_bar, BLOCK_M_SPLIT * slice_size * 2
+                            )
+                            tlx.async_descriptor_load(
+                                y_desc,
+                                y_buf_view,
+                                [
+                                    offs_xm + group_id * BLOCK_M_SPLIT,
+                                    offs_wn_full + slice_id * slice_size,
+                                ],
+                                y_bar,
+                            )
+
+                y_load_phase = y_load_phase ^ 1
+
+                processed_k_iters += k_tiles
+
+        # TMA Store consumer. Added to simplify the barrier
+        # logic.
+        with tlx.async_task(num_warps=1, num_regs=24):
+            start_pid = tl.program_id(axis=0)
+            num_pid_m = tl.cdiv(M, BLOCK_M)
+            if PAIR_CTA:
+                # Round up to even for proper CTA pairing
+                num_pid_m = (num_pid_m + 1) // 2 * 2
+            num_pid_n = tl.cdiv(N, BLOCK_N)
+            num_pid_in_group = GROUP_M * num_pid_n
+            num_tiles = num_pid_m * num_pid_n
+            z_load_phase = 0
+
+            # Unroll the first iteration.
+            # This guraranteed safe from our grid size.
+            pid_m, pid_n = _compute_pid(
+                start_pid, num_pid_in_group, num_pid_m, GROUP_M, NUM_SMS
+            )
+            offs_xm = pid_m * BLOCK_M
+            offs_wn = pid_n * BLOCK_N
+            z_idx = 0
+            for slice_id in tl.static_range(EPILOGUE_SUBTILE):
+                for group_id in tl.static_range(NUM_MMA_GROUPS):
+                    # Determine the base "index" to decide if we need to wait on TMA.
+                    z_idx_unrolled = slice_id * NUM_MMA_GROUPS + group_id
+                    if z_idx_unrolled >= NUM_Z_BUFFERS:
+                        tlx.async_descriptor_store_wait(NUM_Z_BUFFERS - 1)
+                        z_empty = tlx.local_view(z_empty_bars, z_idx)
+                        tlx.barrier_arrive(z_empty, 1)
+
+                    z_full = tlx.local_view(z_load_bars, z_idx)
+                    tlx.barrier_wait(z_full, z_load_phase)
+                    z_buf_view = tlx.local_view(z_buffers, z_idx)
+                    tlx.fence_async_shared()
+                    tlx.async_descriptor_store(
+                        z_desc,
+                        z_buf_view,
+                        [
+                            offs_xm + group_id * BLOCK_M_SPLIT,
+                            offs_wn + slice_id * slice_size,
+                        ],
+                    )
+
+                    z_load_phase = z_load_phase ^ (z_idx == (NUM_Z_BUFFERS - 1))
+                    # pyre-ignore[58]
+                    z_idx = (z_idx + 1) % NUM_Z_BUFFERS
+
+            for tile_id in range(start_pid + NUM_SMS, num_tiles, NUM_SMS):
+                pid_m, pid_n = _compute_pid(
+                    tile_id, num_pid_in_group, num_pid_m, GROUP_M, NUM_SMS
+                )
+                offs_xm = pid_m * BLOCK_M
+                offs_wn = pid_n * BLOCK_N
+                for slice_id in tl.static_range(EPILOGUE_SUBTILE):
+                    for group_id in tl.static_range(NUM_MMA_GROUPS):
+                        # Wait on prior store to finish.
+                        tlx.async_descriptor_store_wait(NUM_Z_BUFFERS - 1)
+                        z_empty = tlx.local_view(z_empty_bars, z_idx)
+                        tlx.barrier_arrive(z_empty, 1)
+                        # Wait for the next load to be ready
+                        z_full = tlx.local_view(z_load_bars, z_idx)
+                        tlx.barrier_wait(z_full, z_load_phase)
+                        z_buf_view = tlx.local_view(z_buffers, z_idx)
+                        tlx.async_descriptor_store(
+                            z_desc,
+                            z_buf_view,
+                            [
+                                offs_xm + group_id * BLOCK_M_SPLIT,
+                                offs_wn + slice_id * slice_size,
+                            ],
+                        )
+                        z_load_phase = z_load_phase ^ (z_idx == (NUM_Z_BUFFERS - 1))
+                        # pyre-ignore[58]
+                        z_idx = (z_idx + 1) % NUM_Z_BUFFERS
+
+            # Wait for the last store.
+            tlx.async_descriptor_store_wait(0)
+
+
+@torch.fx.wrap
+def triton_addmm_fwd_tma_persistent(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    y: torch.Tensor,
+    warp_specialize: bool | None = None,
+) -> torch.Tensor:
+    _meta_ws = _use_meta_ws()
+    if warp_specialize is None:
+        warp_specialize = _meta_ws
+
+    M, K = x.shape
+    _, N = w.shape
+
+    is_y_1d = y.dim() == 1
+
+    # Allocate output
+    z = torch.empty((M, N), device=x.device, dtype=x.dtype)
+    if M == 0 or N == 0:
+        return z
+
+    # A dummy block value that will be overwritten when we have the real block size
+    dummy_block = [1, 1]
+    # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional
+    # argument, expected `List[int]` but got `Size`
+    x_desc = TensorDescriptor(x, x.shape, x.stride(), dummy_block)
+    # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional
+    # argument, expected `List[int]` but got `Size`
+    w_desc = TensorDescriptor(w, w.shape, w.stride(), dummy_block)
+    y = y.reshape(1, -1) if is_y_1d else y
+    # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional
+    # argument, expected `List[int]` but got `Size`
+    y_desc = TensorDescriptor(y, y.shape, y.stride(), dummy_block)
+    # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional
+    # argument, expected `List[int]` but got `Size`
+    z_desc = TensorDescriptor(z, z.shape, z.stride(), dummy_block)
+    NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count
+
+    def grid(meta):
+        BLOCK_M = meta["BLOCK_M"]
+        BLOCK_N = meta["BLOCK_N"]
+        return (
+            min(
+                NUM_SMS,
+                triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N),
+            ),
+        )
+
+    _addmm_fwd_tma_persistent[grid](
+        x_desc,
+        w_desc,
+        y_desc,
+        z_desc,
+        M,
+        N,
+        K,
+        ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+        BROADCAST_Y=is_y_1d,
+        WARP_SPECIALIZE=warp_specialize,
+        NUM_SMS=NUM_SMS,
+        USE_META_WS=_meta_ws,
+    )
+    return z
+
+
+@torch.fx.wrap
+def triton_addmm_fwd_tma_ws_tlx(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    y: torch.Tensor,
+) -> torch.Tensor:
+    M, K = x.shape
+    _, N = w.shape
+
+    is_y_1d = y.dim() == 1
+
+    # Allocate output
+    z = torch.empty((M, N), device=x.device, dtype=x.dtype)
+    if M == 0 or N == 0:
+        return z
+
+    # A dummy block value that will be overwritten when we have the real block size
+    dummy_block = [1, 1]
+    # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional
+    # argument, expected `List[int]` but got `Size`
+    x_desc = TensorDescriptor(x, x.shape, x.stride(), dummy_block)
+    # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional
+    # argument, expected `List[int]` but got `Size`
+    w_desc = TensorDescriptor(w, w.shape, w.stride(), dummy_block)
+    y = y.reshape(1, -1) if is_y_1d else y
+    # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional
+    # argument, expected `List[int]` but got `Size`
+    y_desc = TensorDescriptor(y, y.shape, y.stride(), dummy_block)
+    # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional
+    # argument, expected `List[int]` but got `Size`
+    z_desc = TensorDescriptor(z, z.shape, z.stride(), dummy_block)
+
+    def grid(meta):
+        BLOCK_M = meta["BLOCK_M"]
+        BLOCK_N = meta["BLOCK_N"]
+        return (
+            triton.cdiv(M, BLOCK_M),
+            triton.cdiv(N, BLOCK_N),
+        )
+
+    _addmm_fwd_tma_ws[grid](
+        x_desc,
+        w_desc,
+        y_desc,
+        z_desc,
+        M,
+        N,
+        K,
+        ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+        BROADCAST_Y=is_y_1d,
+        NUM_SMEM_BUFFERS=2,  # Double buffering
+    )
+    return z
+
+
+@torch.fx.wrap
+def triton_addmm_fwd_tma_ws_persistent_tlx(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    y: torch.Tensor,
+) -> torch.Tensor:
+    M, K = x.shape
+    _, N = w.shape
+
+    is_y_1d = y.dim() == 1
+
+    # Allocate output
+    z = torch.empty((M, N), device=x.device, dtype=x.dtype)
+    if M == 0 or N == 0:
+        return z
+
+    NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count
+
+    # A dummy block value that will be overwritten by the hook
+    dummy_block = [1, 1]
+    # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional
+    # argument, expected `List[int]` but got `Size`
+    x_desc = TensorDescriptor(x, x.shape, x.stride(), dummy_block)
+    # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional
+    # argument, expected `List[int]` but got `Size`
+    w_desc = TensorDescriptor(w, w.shape, w.stride(), dummy_block)
+    y = y.reshape(1, -1) if is_y_1d else y
+    # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional
+    # argument, expected `List[int]` but got `Size`
+    y_desc = TensorDescriptor(y, y.shape, y.stride(), dummy_block)
+    # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional
+    # argument, expected `List[int]` but got `Size`
+    z_desc = TensorDescriptor(z, z.shape, z.stride(), dummy_block)
+
+    def grid(meta):
+        BLOCK_M = meta["BLOCK_M"]
+        BLOCK_N = meta["BLOCK_N"]
+        num_pid_m = triton.cdiv(M, BLOCK_M)
+        num_pid_n = triton.cdiv(N, BLOCK_N)
+        # Round up num_pid_m to even for PAIR_CTA cluster compatibility
+        num_pid_m = (num_pid_m + 1) // 2 * 2
+        total_tiles = num_pid_m * num_pid_n
+        grid_size = min(NUM_SMS, total_tiles)
+        # Ensure grid is even for cluster compatibility
+        if grid_size % 2 == 1:
+            grid_size = min(grid_size + 1, NUM_SMS)
+            # If rounding up exceeds NUM_SMS and NUM_SMS is odd, round down instead
+            if grid_size % 2 == 1:
+                grid_size = grid_size - 1
+        return (grid_size,)
+
+    _addmm_fwd_tma_ws_persistent[grid](
+        x_desc,
+        w_desc,
+        y_desc,
+        z_desc,
+        M,
+        N,
+        K,
+        ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+        BROADCAST_Y=is_y_1d,
+        NUM_SMS=NUM_SMS,
+    )
+    return z
+
+
+@maybe_register_custom_op("generative_recommenders::triton_addmm_fwd", mutates_args=())
+def triton_addmm_fwd(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    y: torch.Tensor,
+) -> torch.Tensor:
+    M, K = x.shape
+    KB, N = w.shape
+    assert K == KB, f"incompatible dimensions {K}, {KB}"
+
+    is_y_1d = y.dim() == 1
+    NY = y.shape[0] if is_y_1d else y.shape[1]
+    assert N == NY, f"incompatible dimensions {N}, {NY}"
+
+    # Allocate output
+    z = torch.empty((M, N), device=x.device, dtype=x.dtype)
+    if M == 0 or N == 0:
+        return z
+
+    grid = lambda meta: (  # noqa E731
+        triton.cdiv(M, meta["BLOCK_M"]),
+        triton.cdiv(N, meta["BLOCK_N"]),
+    )
+
+    _addmm_fwd[grid](
+        x,
+        w,
+        y,
+        z,
+        M,
+        N,
+        K,
+        x.stride(0),
+        x.stride(1),
+        w.stride(0),
+        w.stride(1),
+        y.stride(0) if not is_y_1d else 0,
+        y.stride(1) if not is_y_1d else y.stride(0),
+        z.stride(0),
+        z.stride(1),
+        ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+        BROADCAST_Y=is_y_1d,
+    )
+    return z
+
+
+@triton_addmm_fwd.register_fake
+def triton_addmm_fwd_fake(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    y: torch.Tensor,
+) -> torch.Tensor:
+    """Fake implementation for FakeTensor tracing."""
+    M, _ = x.shape
+    _, N = w.shape
+    return torch.empty((M, N), device=x.device, dtype=x.dtype)
+
+
+def triton_addmm_bwd(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    dz: torch.Tensor,
+    is_y_1d: bool,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    if is_y_1d:
+        dy = torch.sum(dz, dim=0)
+    else:
+        dy = dz
+    dw = torch.mm(x.t(), dz)
+    dx = torch.mm(dz, w.t())
+
+    return dx, dw, dy
+
+
+@maybe_register_custom_op(
+    "generative_recommenders::maybe_triton_addmm_fwd", mutates_args=()
+)
+def maybe_triton_addmm_fwd(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    y: Optional[torch.Tensor],
+) -> torch.Tensor:
+    # triton addmm is slower than torch (cublas) on AMD/Blackwell.
+    # Default to pytorch addmm on AMD/Blackwell for now.
+    if y is None:
+        return torch.mm(x, w)
+    if is_sm100_plus() or torch.version.hip is not None:
+        return torch.addmm(y, x, w)
+    else:
+        return triton_addmm_fwd(x=x, w=w, y=y)
+
+
+@maybe_triton_addmm_fwd.register_fake
+def maybe_triton_addmm_fwd_fake(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    y: torch.Tensor,
+) -> torch.Tensor:
+    """Fake implementation for FakeTensor tracing."""
+    M, _ = x.shape
+    _, N = w.shape
+    return torch.empty((M, N), device=x.device, dtype=x.dtype)
+
+
+class _AddMmFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        x: torch.Tensor,
+        w: torch.Tensor,
+        y: torch.Tensor,
+    ) -> torch.Tensor:
+        ctx.save_for_backward(x, w)
+        ctx.is_y_1d = y.dim() == 1
+        if is_sm100_plus() and TMA_AVAILABLE and _check_tma_alignment(x, w, y):
+            if x.dtype == torch.float32 or HAS_TLX == False:
+                return triton_addmm_fwd_tma_persistent(x, w, y, warp_specialize=True)
+            else:
+                return triton_addmm_fwd_tma_ws_persistent_tlx(
+                    x, w, y
+                )  # tlx.async_dot doesn't support fp32 inputs because of WGMMA requirements
+        else:
+            return triton_addmm_fwd(x, w, y)
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, dz: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        (x, w) = ctx.saved_tensors
+        return triton_addmm_bwd(x, w, dz, ctx.is_y_1d)
+
+
+def triton_addmm(
+    input: torch.Tensor,
+    mat1: torch.Tensor,
+    mat2: torch.Tensor,
+) -> torch.Tensor:
+    return _AddMmFunction.apply(mat1, mat2, input)
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_attention_utils.py b/recommendation_v4/generative_recommenders/ops/triton/triton_attention_utils.py
new file mode 100644
index 000000000..61fd614f3
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_attention_utils.py
@@ -0,0 +1,64 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+#!/usr/bin/env python3
+
+
+# @manual=//triton:triton
+import triton
+
+# @manual=//triton:triton
+import triton.language as tl
+
+
+@triton.jit
+def acc_dq(
+    dq_ptrs_trans,
+    start_m,
+    stride_dqm,
+    k,
+    dqk_trans,
+    alpha,
+    mask_m,
+    MAX_SEQ_LEN,
+    LOCK,
+    BLOCK_M: tl.constexpr,
+    ATOMIC_ADD: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+):
+    if ATOMIC_ADD:
+        lock_id = start_m // BLOCK_M
+        stride_lock = tl.cdiv(MAX_SEQ_LEN, BLOCK_M)
+        lock = LOCK + tl.program_id(0) * stride_lock + lock_id
+        tl.debug_barrier()  # add a barrier to force sync
+        while tl.atomic_cas(lock, 0, 1) == 1:
+            pass
+    dq_trans = tl.load(
+        dq_ptrs_trans + start_m * stride_dqm,
+        mask=mask_m[None, :],
+        other=0.0,
+        eviction_policy="evict_last",
+    )
+    dq_trans += tl.dot(tl.trans(k), dqk_trans, allow_tf32=ALLOW_TF32) * alpha
+    dq_trans = dq_trans.to(k.dtype)
+    tl.store(
+        dq_ptrs_trans + start_m * stride_dqm,
+        dq_trans,
+        mask=mask_m[None, :],
+        eviction_policy="evict_last",
+    )
+    if ATOMIC_ADD:
+        tl.atomic_xchg(lock, 0)  # pyre-ignore [61]
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_attention.py b/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_attention.py
new file mode 100644
index 000000000..768ef0013
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_attention.py
@@ -0,0 +1,3242 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#!/usr/bin/env python3
+
+# pyre-unsafe
+
+from typing import List, Optional, Tuple
+
+import torch
+
+# @manual=//triton:triton
+import triton
+
+# @manual=//triton:triton
+import triton.language as tl
+from generative_recommenders.ops.utils import (
+    copy_if_different_ptr,
+    maybe_register_custom_op,
+)
+
+try:
+    # @manual=//triton:triton
+    import triton.language.extra.tlx as tlx  # type: ignore
+
+    HAS_TLX = True
+except ImportError:
+    # suppress type checking errors
+    tlx = None
+
+    HAS_TLX = False
+
+from generative_recommenders.common import (
+    autotune_max_seq_len,
+    prev_power_of_2,
+    switch_to_contiguous_if_needed,
+    triton_autotune,
+)
+from generative_recommenders.ops.triton._autotune_pinning import pinned_or_full
+from triton.language.extra.libdevice import (  # @manual=//triton:triton
+    fast_dividef,
+    fast_expf,
+)
+
+try:
+    # @manual=//triton:triton
+    from triton.tools.tensor_descriptor import TensorDescriptor
+
+    tensor_descriptor_tma = True
+except ImportError:
+    tensor_descriptor_tma = False
+
+try:
+    from generative_recommenders.ops.triton.fb.triton_attention_utils import acc_dq
+except ImportError:
+    from generative_recommenders.ops.triton.triton_attention_utils import acc_dq
+
+
+def _host_descriptor_pre_hook(nargs):
+    if not tensor_descriptor_tma:
+        return
+
+    if not isinstance(nargs["Q"], TensorDescriptor):
+        return
+    BLOCK_M = nargs["BLOCK_M"]
+    BLOCK_N = nargs["BLOCK_N"]
+    BLOCK_D_Q = nargs["BLOCK_D_Q"]
+    BLOCK_D_V = nargs["BLOCK_D_V"]
+    if "USE_TLX" in nargs and nargs["USE_TLX"]:
+        BLOCK_M = BLOCK_M // nargs["NUM_MMA_GROUPS"]
+    nargs["Q"].block_shape = [BLOCK_M, BLOCK_D_Q]
+    nargs["V"].block_shape = [BLOCK_N, BLOCK_D_V]
+    nargs["K"].block_shape = [BLOCK_N, BLOCK_D_Q]
+
+
+# pyre-ignore[2]
+def _early_config_prune(
+    configs: List[triton.Config],
+    named_args,
+    **kwargs,
+) -> List[triton.Config]:
+    """Filter autotune configs that are incompatible with the current call.
+
+    The TLX (warp-specialized) variant of ``_hstu_attn_fwd`` calls
+    ``tlx.async_descriptor_load(Q, ...)`` which requires Q/K/V to be real TMA
+    tensor descriptors (``tl.tensor_descriptor_base``). They are only
+    constructed by the host wrapper when ``ENABLE_TMA=True`` AND the host
+    ``TensorDescriptor`` API is importable. If the kernel is invoked without
+    those preconditions, raw tensors flow into the TLX path and the
+    ``isinstance(desc, tl.tensor_descriptor_base)`` assert in
+    ``triton/language/extra/tlx/mem_ops.py`` fires at compile time.
+
+    We make autotuning robust to that mismatch by dropping any config with
+    ``USE_TLX=True`` whenever ENABLE_TMA is not set or TMA host descriptors
+    are unavailable. This is purely defensive: if the caller threads
+    ``enable_tma=True`` (see ``_should_enable_tma`` below) the TLX configs
+    remain eligible.
+    """
+    enable_tma = kwargs.get("ENABLE_TMA", None)
+    if enable_tma is None:
+        enable_tma = named_args.get("ENABLE_TMA", False)
+    if enable_tma and tensor_descriptor_tma:
+        return configs
+    pruned = [c for c in configs if not c.kwargs.get("USE_TLX", False)]
+    # Safety: never return an empty config list.
+    return pruned if pruned else configs
+
+
+def _should_enable_tma() -> bool:
+    """Return True iff the TMA / TLX fast path can be safely enabled.
+
+    Conditions:
+      * The host ``triton.tools.tensor_descriptor.TensorDescriptor`` API is
+        importable (``tensor_descriptor_tma``).
+      * CUDA is available and the device is Hopper (compute capability 9),
+        which is the only architecture for which TLX configs are emitted in
+        ``_get_fw_configs``.
+    """
+    if not tensor_descriptor_tma:
+        return False
+    if not torch.cuda.is_available():
+        return False
+    # NVIDIA-only gate: TMA (Tensor Memory Accelerator) is Hopper-specific
+    # hardware. On ROCm/HIP, `torch.cuda.get_device_capability()` mirrors the
+    # gfx name into a major.minor tuple — gfx950 (MI350X) returns (9, 5), which
+    # would otherwise pass the `device_capability == 9` check below and trick
+    # the kernel into taking the TMA path. The TMA path uses
+    # `triton.tools.tensor_descriptor.TensorDescriptor` and `TensorDescriptor.load`
+    # which lower to PTX `cp.async.bulk.tensor.*`; on AMD this either fails to
+    # compile or produces a kernel with mismatched reduction-dim shapes for
+    # `tl.dot(silu, v)` in `_hstu_attn_fwd_one_block` (see WARNING in
+    # `_hstu_attn_fwd` for the cascade). Bail out early on HIP so the
+    # non-TMA path is selected and AMD gets a working kernel.
+    if torch.version.hip:
+        return False
+    try:
+        device_capability = torch.cuda.get_device_capability()[0]
+    except (RuntimeError, AssertionError):
+        return False
+    return device_capability == 9
+
+
+def _get_fw_configs() -> List[triton.Config]:  # noqa: C901
+    configs = []
+    if torch.version.hip:
+        for BLOCK_M in [32, 64, 128]:
+            for BLOCK_N in [32, 64]:
+                for num_stages in [1, 2]:
+                    for num_warps in [4, 8]:
+                        for matrix_instr_nonkdim in [16, 32]:
+                            configs.append(
+                                triton.Config(
+                                    {
+                                        "BLOCK_M": BLOCK_M,
+                                        "BLOCK_N": BLOCK_N,
+                                        "matrix_instr_nonkdim": matrix_instr_nonkdim,
+                                        "waves_per_eu": 0,
+                                        "kpack": 2,
+                                    },
+                                    num_stages=num_stages,
+                                    num_warps=num_warps,
+                                )
+                            )
+    else:
+        configs = [
+            triton.Config(
+                {"BLOCK_M": 16, "BLOCK_N": 32},
+                num_stages=2,
+                num_warps=2,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 32},
+                num_stages=2,
+                num_warps=2,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 32},
+                num_stages=4,
+                num_warps=2,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 32},
+                num_stages=2,
+                num_warps=4,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 32},
+                num_stages=4,
+                num_warps=4,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 64},
+                num_stages=2,
+                num_warps=4,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 64},
+                num_stages=4,
+                num_warps=4,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 64},
+                num_stages=4,
+                num_warps=8,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 128},
+                num_stages=2,
+                num_warps=4,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 128},
+                num_stages=2,
+                num_warps=8,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 64, "BLOCK_N": 32},
+                num_stages=4,
+                num_warps=2,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 64, "BLOCK_N": 32},
+                num_stages=2,
+                num_warps=4,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 64, "BLOCK_N": 32},
+                num_stages=4,
+                num_warps=4,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 64, "BLOCK_N": 32},
+                num_stages=2,
+                num_warps=8,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 64, "BLOCK_N": 64},
+                num_stages=2,
+                num_warps=2,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 64, "BLOCK_N": 64},
+                num_stages=2,
+                num_warps=4,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 64, "BLOCK_N": 64},
+                num_stages=4,
+                num_warps=4,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 64, "BLOCK_N": 64},
+                num_stages=4,
+                num_warps=8,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 128, "BLOCK_N": 32},
+                num_stages=2,
+                num_warps=2,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 128, "BLOCK_N": 32},
+                num_stages=4,
+                num_warps=2,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 128, "BLOCK_N": 32},
+                num_stages=2,
+                num_warps=4,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 128, "BLOCK_N": 32},
+                num_stages=4,
+                num_warps=4,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 128, "BLOCK_N": 32},
+                num_stages=2,
+                num_warps=8,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 128, "BLOCK_N": 32},
+                num_stages=4,
+                num_warps=8,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 128, "BLOCK_N": 64},
+                num_stages=2,
+                num_warps=4,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 128, "BLOCK_N": 64},
+                num_stages=2,
+                num_warps=8,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 128, "BLOCK_N": 64},
+                num_stages=4,
+                num_warps=8,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 128, "BLOCK_N": 128},
+                num_stages=4,
+                num_warps=4,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 128, "BLOCK_N": 128},
+                num_stages=2,
+                num_warps=8,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+        ]
+
+    # The `_hstu_attn_fwd` kernel signature unconditionally declares the four
+    # constexprs `USE_TLX`, `NUM_BUFFERS`, `NUM_MMA_WARPS_PER_GROUP`,
+    # `NUM_MMA_GROUPS` (introduced for the Hopper TLX warp-specialized variant).
+    # Triton requires every constexpr be bound at autotune time; missing any one
+    # of them triggers `TypeError: dynamic_func() missing N required positional
+    # arguments` during kernel dispatch. This loop populates the non-TLX defaults
+    # so the kernel call site doesn't have to know about TLX at all.
+    #
+    # IMPORTANT: this loop must apply to BOTH the HIP branch and the CUDA branch
+    # above. It used to live inside the CUDA `else:` block which meant HIP
+    # configs reached `_hstu_attn_fwd[grid](...)` without these defaults and
+    # crashed at dispatch. Keep this hoisted (outside the if/else) when
+    # editing — see commit message for the symptom.
+    for config in configs:
+        if not config.kwargs.get("USE_TLX", False):
+            config.kwargs["USE_TLX"] = False
+            config.kwargs["NUM_BUFFERS"] = 1
+            config.kwargs["NUM_MMA_WARPS_PER_GROUP"] = 1
+            config.kwargs["NUM_MMA_GROUPS"] = 1
+
+    # TLX (Triton Language Extension) warp-specialized configs are Hopper-only.
+    # Guard with `not torch.version.hip` so AMD never sees them — the TLX code
+    # path inside `_hstu_attn_fwd` calls `tlx.async_descriptor_load(...)` which
+    # requires real TMA tensor descriptors and only compiles on CUDA.
+    if not torch.version.hip:
+        if HAS_TLX:
+            try:
+                device_capability = torch.cuda.get_device_capability()[0]
+            except (RuntimeError, AssertionError):
+                # No CUDA device available
+                device_capability = None
+
+            if device_capability == 9:
+                # H100 configs
+                configs.append(
+                    triton.Config(
+                        {
+                            "BLOCK_M": 128,
+                            "BLOCK_N": 64,
+                            "USE_TLX": True,
+                            "NUM_BUFFERS": 2,
+                            "NUM_MMA_WARPS_PER_GROUP": 4,
+                            "NUM_MMA_GROUPS": 2,
+                        },
+                        num_stages=0,
+                        num_warps=4,
+                        pre_hook=_host_descriptor_pre_hook,
+                    ),
+                )
+
+    return configs
+
+
+@triton.jit
+def _hstu_attn_fwd_one_block(  # noqa: C901
+    start_n,
+    seq_len,
+    offs_m,
+    offs_n,
+    q,
+    K,
+    V,
+    K_block_ptr,
+    V_block_ptr,
+    offset_kh,
+    offset_vh,
+    seq_start,
+    n_targets,
+    alpha,
+    MAX_SEQ_LEN,
+    contextual_seq_len,
+    max_attn_len,
+    HAS_MULTIPLE_TARGETS: tl.constexpr,
+    HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr,
+    HAS_MAX_ATTN_LEN: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BLOCK_D_Q: tl.constexpr,
+    BLOCK_D_V: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    ENABLE_TMA: tl.constexpr,
+):
+    start_n = tl.multiple_of(start_n, BLOCK_N)
+    # -- compute qk ----
+    k = None
+    qk = None
+    if ENABLE_TMA:
+        k = K.load(
+            [(seq_start + start_n).to(tl.int32), offset_kh.to(tl.int32)],
+        )
+        # tma can only be loaded in one order, use trans afterwards
+        qk = tl.dot(q, tl.trans(k), allow_tf32=ALLOW_TF32) * alpha
+    else:
+        k = tl.load(K_block_ptr, boundary_check=(1,), padding_option="zero")
+        qk = tl.dot(q, k, allow_tf32=ALLOW_TF32) * alpha
+    invalid_mask = offs_m[:, None] == offs_n[None, :]
+    max_ids = seq_len
+    if HAS_CONTEXTUAL_SEQ_LEN:
+        offs_m = offs_m - contextual_seq_len + 1
+        offs_m = tl.where(
+            offs_m > 0,
+            offs_m,
+            0,
+        )
+        offs_n = offs_n - contextual_seq_len + 1
+        offs_n = tl.where(
+            offs_n > 0,
+            offs_n,
+            0,
+        )
+        max_ids = max_ids - contextual_seq_len + 1
+    if HAS_MULTIPLE_TARGETS:
+        max_ids = max_ids - n_targets
+        offs_m = tl.where(
+            offs_m < max_ids,
+            offs_m,
+            max_ids,
+        )
+        offs_n = tl.where(
+            offs_n < max_ids,
+            offs_n,
+            max_ids,
+        )
+    offs_m_minus_n = offs_m[:, None] - offs_n[None, :]
+    invalid_mask = invalid_mask or (offs_m_minus_n > 0)
+    if HAS_MAX_ATTN_LEN:
+        invalid_mask = invalid_mask and offs_m_minus_n <= max_attn_len
+    if HAS_CONTEXTUAL_SEQ_LEN:
+        invalid_mask = invalid_mask or (
+            offs_m[:, None] == 0 and offs_n[None, :] < max_ids
+        )
+    scale = tl.where(invalid_mask, (1.0 / MAX_SEQ_LEN), 0.0)
+    silu = fast_dividef(qk, 1.0 + fast_expf(-qk)) * scale
+    v = None
+    if ENABLE_TMA:
+        v = V.load(
+            [(seq_start + start_n).to(tl.int32), offset_vh.to(tl.int32)],
+        )
+    else:
+        v = tl.load(V_block_ptr, boundary_check=(0,), padding_option="zero")
+    silu = silu.to(v.dtype)
+    return tl.dot(silu, v, allow_tf32=ALLOW_TF32)
+
+
+@triton.jit
+def _hstu_attn_fwd_compute(  # noqa C901
+    Q,
+    K,
+    V,
+    H,
+    DimQ,
+    DimV,
+    workspace_ptr,
+    seq_offsets,
+    num_targets,
+    Out,
+    stride_qm,
+    stride_qh,
+    stride_kn,
+    stride_kh,
+    stride_vn,
+    stride_vh,
+    stride_om,
+    stride_oh,
+    alpha,
+    MAX_SEQ_LEN,
+    DeltaSize,
+    contextual_seq_len,
+    max_attn_len,
+    off_z,
+    off_h,
+    pid,
+    HAS_MULTIPLE_TARGETS: tl.constexpr,
+    IS_DELTA_Q: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BLOCK_D_Q: tl.constexpr,
+    BLOCK_D_V: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr,
+    HAS_MAX_ATTN_LEN: tl.constexpr,
+    ENABLE_TMA: tl.constexpr,
+    TMA_DESC_SIZE: tl.constexpr,
+):
+    seq_start = tl.load(seq_offsets + off_z).to(tl.int64)
+    off_h = off_h.to(tl.int64)
+    off_z = off_z.to(tl.int64)
+    seq_end = tl.load(seq_offsets + off_z + 1)
+    seq_len = (seq_end - seq_start).to(tl.int32)
+
+    if IS_DELTA_Q:
+        start_m_delta = pid * BLOCK_M
+        start_m = (start_m_delta + seq_len - DeltaSize).to(tl.int32)
+    else:
+        start_m_delta = 0
+        start_m = pid * BLOCK_M
+    if start_m < seq_len:
+        if HAS_MULTIPLE_TARGETS:
+            n_targets = tl.load(num_targets + off_z).to(tl.int32)
+        else:
+            n_targets = None
+
+        # initialize offsets
+        offs_m = start_m + tl.arange(0, BLOCK_M)
+        offs_n = tl.arange(0, BLOCK_N)
+        Q_block_ptr = None
+        K_block_ptr = None
+        V_block_ptr = None
+        if not ENABLE_TMA:
+            if IS_DELTA_Q:
+                Q_block_ptr = tl.make_block_ptr(
+                    base=Q + off_h * stride_qh + off_z * DeltaSize * stride_qm,
+                    shape=(DeltaSize, BLOCK_D_Q),
+                    strides=(stride_qm, 1),
+                    offsets=(start_m_delta, 0),
+                    block_shape=(BLOCK_M, BLOCK_D_Q),
+                    order=(1, 0),
+                )
+            else:
+                Q_block_ptr = tl.make_block_ptr(
+                    base=Q + off_h * stride_qh + seq_start * stride_qm,
+                    shape=(seq_len, BLOCK_D_Q),
+                    strides=(stride_qm, 1),
+                    offsets=(start_m, 0),
+                    block_shape=(BLOCK_M, BLOCK_D_Q),
+                    order=(1, 0),
+                )
+            q = tl.load(Q_block_ptr, boundary_check=(0,), padding_option="zero")
+
+            K_block_ptr = tl.make_block_ptr(
+                base=K + off_h * stride_kh + seq_start * stride_kn,
+                shape=(BLOCK_D_Q, seq_len),
+                strides=(1, stride_kn),
+                offsets=(0, 0),
+                block_shape=(BLOCK_D_Q, BLOCK_N),
+                order=(0, 1),
+            )
+            V_block_ptr = tl.make_block_ptr(
+                base=V + off_h * stride_vh + seq_start * stride_vn,
+                shape=(seq_len, BLOCK_D_V),
+                strides=(stride_vn, 1),
+                offsets=(0, 0),
+                block_shape=(BLOCK_N, BLOCK_D_V),
+                order=(1, 0),
+            )
+        else:
+            if IS_DELTA_Q:
+                q = Q.load(
+                    [
+                        (off_z * DeltaSize + start_m_delta).to(tl.int32),
+                        (off_h * stride_qh).to(tl.int32),
+                    ]
+                )
+            else:
+                q = Q.load(
+                    [
+                        (seq_start + start_m).to(tl.int32),
+                        (off_h * stride_qh).to(tl.int32),
+                    ]
+                )
+
+        acc = tl.zeros([BLOCK_M, BLOCK_D_V], dtype=tl.float32)
+        if HAS_MULTIPLE_TARGETS:
+            uih_end = seq_len - n_targets
+        else:
+            uih_end = seq_len
+        if HAS_CONTEXTUAL_SEQ_LEN is True and start_m < contextual_seq_len:
+            # uih_end must be larger than start_m
+            low = 0
+            high = seq_len
+        else:
+            low = 0
+            high = start_m + BLOCK_M
+            if HAS_MAX_ATTN_LEN:
+                if start_m > uih_end:
+                    low = uih_end - max_attn_len
+                else:
+                    low = start_m - max_attn_len
+                if HAS_CONTEXTUAL_SEQ_LEN:
+                    low = low if low > contextual_seq_len else 0
+                else:
+                    low = low if low > 0 else 0
+            if HAS_MULTIPLE_TARGETS:
+                uih_end = (uih_end + BLOCK_N - 1) // BLOCK_N * BLOCK_N
+                if uih_end < start_m:
+                    high = seq_len - n_targets
+
+        if low > 0:
+            if not ENABLE_TMA:
+                K_block_ptr = tl.advance(K_block_ptr, (0, low))
+                V_block_ptr = tl.advance(V_block_ptr, (low, 0))
+        end_n = low
+        for start_n in range(low, high, BLOCK_N):
+            acc += _hstu_attn_fwd_one_block(
+                start_n=start_n,
+                seq_len=seq_len,
+                offs_m=offs_m,
+                offs_n=offs_n + start_n,
+                q=q,
+                K=K,
+                V=V,
+                K_block_ptr=K_block_ptr,
+                V_block_ptr=V_block_ptr,
+                offset_kh=off_h * stride_kh,
+                offset_vh=off_h * stride_vh,
+                seq_start=seq_start,
+                n_targets=n_targets if HAS_MULTIPLE_TARGETS else None,
+                alpha=alpha,
+                MAX_SEQ_LEN=MAX_SEQ_LEN,
+                contextual_seq_len=contextual_seq_len,
+                max_attn_len=max_attn_len,
+                HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
+                HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
+                HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
+                ALLOW_TF32=ALLOW_TF32,
+                BLOCK_D_Q=BLOCK_D_Q,
+                BLOCK_D_V=BLOCK_D_V,
+                BLOCK_N=BLOCK_N,
+                ENABLE_TMA=ENABLE_TMA,
+            )
+            if not ENABLE_TMA:
+                K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N))
+                V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0))
+            end_n += BLOCK_N
+
+        if HAS_MULTIPLE_TARGETS:
+            # pyre-ignore[61]
+            if uih_end < start_m:
+                low_delta = start_m
+                high_delta = start_m + BLOCK_M
+                offset = (low_delta - end_n).to(tl.int32)
+                if not ENABLE_TMA:
+                    K_block_ptr = tl.advance(K_block_ptr, (0, offset))
+                    V_block_ptr = tl.advance(V_block_ptr, (offset, 0))
+                for start_delta in tl.range(
+                    low_delta, high_delta, BLOCK_N, num_stages=0
+                ):
+                    acc += _hstu_attn_fwd_one_block(
+                        start_n=start_delta,
+                        seq_len=seq_len,
+                        offs_m=offs_m,
+                        offs_n=offs_n + start_delta,
+                        q=q,
+                        K=K,
+                        V=V,
+                        K_block_ptr=K_block_ptr,
+                        V_block_ptr=V_block_ptr,
+                        offset_kh=off_h * stride_kh,
+                        offset_vh=off_h * stride_vh,
+                        seq_start=seq_start,
+                        n_targets=n_targets if HAS_MULTIPLE_TARGETS else None,
+                        alpha=alpha,
+                        MAX_SEQ_LEN=MAX_SEQ_LEN,
+                        contextual_seq_len=contextual_seq_len,
+                        max_attn_len=max_attn_len,
+                        HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
+                        HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
+                        HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
+                        ALLOW_TF32=ALLOW_TF32,
+                        BLOCK_D_Q=BLOCK_D_Q,
+                        BLOCK_D_V=BLOCK_D_V,
+                        BLOCK_N=BLOCK_N,
+                        ENABLE_TMA=ENABLE_TMA,
+                    )
+                    if not ENABLE_TMA:
+                        K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N))
+                        V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0))
+        # Don't use TMA in Jagged case since we don't want to overwrite
+        # the output of another sequence
+        if IS_DELTA_Q:
+            start_m_delta = pid * BLOCK_M
+            offs_m_delta = start_m_delta + tl.arange(0, BLOCK_M)
+            offs_v_d = tl.arange(0, BLOCK_D_V)
+            off_o = Out + off_z * DeltaSize * stride_om + off_h * stride_oh
+            out_ptrs = off_o + offs_m_delta[:, None] * stride_om + offs_v_d[None, :]
+            tl.store(out_ptrs, acc, mask=(offs_m_delta < DeltaSize)[:, None])
+        else:
+            # rematerialize offsets to save registers
+            start_m = pid * BLOCK_M
+            offs_m = start_m + tl.arange(0, BLOCK_M)
+            offs_v_d = tl.arange(0, BLOCK_D_V)
+            off_o = Out + seq_start * stride_om + off_h * stride_oh
+            out_ptrs = off_o + offs_m[:, None] * stride_om + offs_v_d[None, :]
+            tl.store(out_ptrs, acc, mask=(offs_m < seq_len)[:, None])
+
+
+@triton.jit
+def _hstu_attn_fwd_compute_main_loop_tlx(  # noqa C901
+    low,
+    high,
+    seq_len,
+    offs_m,
+    offs_n,
+    acc,
+    q_tiles,
+    k_tiles,
+    v_tiles,
+    q_fulls,
+    k_fulls,
+    v_fulls,
+    k_empties,
+    v_empties,
+    v_dtype,
+    n_targets,
+    alpha,
+    end_n,
+    loop_trip_cnt,
+    max_attn_len,
+    HAS_MULTIPLE_TARGETS: tl.constexpr,
+    HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr,
+    HAS_MAX_ATTN_LEN: tl.constexpr,
+    cid: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    NUM_BUFFERS: tl.constexpr,
+    MAX_SEQ_LEN: tl.constexpr,
+    WAIT_FOR_Q: tl.constexpr,
+):
+    if WAIT_FOR_Q:
+        # wait for the Q buffer to be populated by the producer
+        q_full = tlx.local_view(q_fulls, cid)
+        tlx.barrier_wait(q_full, 0)
+
+    q_tile = tlx.local_view(q_tiles, cid)
+
+    for start in tl.range(low + BLOCK_N, high, BLOCK_N, num_stages=0):
+        buf_id = loop_trip_cnt % NUM_BUFFERS
+        # buffers in a row share the same phase
+        kv_phase = (loop_trip_cnt // NUM_BUFFERS) % 2
+
+        start_n = tl.multiple_of(start, BLOCK_N)
+        offs_n_start = offs_n
+        offs_n = offs_n_start + start_n
+
+        # wait for the K buffer to be populated by the producer
+        k_full = tlx.local_view(k_fulls, buf_id)
+        tlx.barrier_wait(k_full, kv_phase)
+        k_tile = tlx.local_view(k_tiles, buf_id)
+
+        # tma can only be loaded in one order, use trans afterwards
+        k_tile = tlx.local_trans(k_tile)
+        # second
+        qk = tlx.async_dot(q_tile, k_tile)
+        # wait for the MMA using to complete
+        qk = tlx.async_dot_wait(0, qk)
+        # release the K buffer
+        k_empty = tlx.local_view(k_empties, buf_id)
+        tlx.barrier_arrive(k_empty, 1)
+
+        qk = qk * alpha
+
+        invalid_mask = offs_m[:, None] == offs_n[None, :]
+        max_ids = seq_len
+        if HAS_MULTIPLE_TARGETS:
+            max_ids = max_ids - n_targets
+            offs_m = tl.where(
+                offs_m < max_ids,
+                offs_m,
+                max_ids,
+            )
+            offs_n = tl.where(
+                offs_n < max_ids,
+                offs_n,
+                max_ids,
+            )
+        offs_m_minus_n = offs_m[:, None] - offs_n[None, :]
+        invalid_mask = invalid_mask or (offs_m_minus_n > 0)
+        if HAS_MAX_ATTN_LEN:
+            invalid_mask = invalid_mask and offs_m_minus_n <= max_attn_len
+        if HAS_CONTEXTUAL_SEQ_LEN:
+            invalid_mask = invalid_mask or (
+                offs_m[:, None] == 0 and offs_n[None, :] < max_ids
+            )
+        scale = tl.where(invalid_mask, (1.0 / MAX_SEQ_LEN), 0.0)
+        silu = fast_dividef(qk, 1.0 + fast_expf(-qk)) * scale
+        silu = silu.to(v_dtype)
+
+        # wait for the V buffer to be populated by the producer
+        v_full = tlx.local_view(v_fulls, buf_id)
+        tlx.barrier_wait(v_full, kv_phase)
+        v_tile = tlx.local_view(v_tiles, buf_id)
+        acc = tlx.async_dot(silu, v_tile, acc)
+        # wait for the MMA using to complete
+        acc = tlx.async_dot_wait(0, acc)
+        # release the V buffer
+        v_empty = tlx.local_view(v_empties, buf_id)
+        tlx.barrier_arrive(v_empty, 1)
+
+        end_n += BLOCK_N
+
+        # increment loop trip counts
+        loop_trip_cnt += 1
+
+    return acc, end_n, loop_trip_cnt
+
+
+@triton.jit
+def _hstu_attn_fwd_compute_main_loop_tlx_pipelined(  # noqa C901
+    low,
+    high,
+    seq_len,
+    offs_m,
+    offs_n,
+    acc,
+    q_tiles,
+    k_tiles,
+    v_tiles,
+    q_fulls,
+    k_fulls,
+    v_fulls,
+    k_empties,
+    v_empties,
+    v_dtype,
+    n_targets,
+    alpha,
+    end_n,
+    loop_trip_cnt,
+    max_attn_len,
+    HAS_MULTIPLE_TARGETS: tl.constexpr,
+    HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr,
+    HAS_MAX_ATTN_LEN: tl.constexpr,
+    cid: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    NUM_BUFFERS: tl.constexpr,
+    MAX_SEQ_LEN: tl.constexpr,
+    WAIT_FOR_Q: tl.constexpr,
+):
+    if WAIT_FOR_Q:
+        # wait for the Q buffer to be populated by the producer
+        q_full = tlx.local_view(q_fulls, cid)
+        tlx.barrier_wait(q_full, 0)
+    q_tile = tlx.local_view(q_tiles, cid)
+
+    # wait for the K buffer to be populated by the producer
+    k_buf_id = loop_trip_cnt % NUM_BUFFERS
+    # buffers in a row share the same phase
+    k_phase = (loop_trip_cnt // NUM_BUFFERS) % 2
+
+    k_full = tlx.local_view(k_fulls, k_buf_id)
+    tlx.barrier_wait(k_full, k_phase)
+    k_tile = tlx.local_view(k_tiles, k_buf_id)
+
+    # tma can only be loaded in one order, use trans afterwards
+    k_tile = tlx.local_trans(k_tile)
+
+    # Pingpong
+    if cid == 0:
+        # Consumer 0 waits for Consumer 1 to reach synchronization point at barrier 9.
+        tlx.named_barrier_wait(9, 256)
+    else:
+        # Consumer 1 signals its arrival at barrier 9.
+        tlx.named_barrier_arrive(9, 256)
+        # Then waits at barrier 10 until Consumer 0 finishes issuing its async_dot.
+        tlx.named_barrier_wait(10, 256)
+
+    qk = tlx.async_dot(q_tile, k_tile)
+
+    if cid == 0:
+        # After issuing async_dot, Consumer 0 signals barrier 10 to unblock Consumer 1.
+        tlx.named_barrier_arrive(10, 256)
+
+    # wait for the MMA using to complete
+    qk = tlx.async_dot_wait(0, qk)
+    # release the K buffer
+    k_empty = tlx.local_view(k_empties, k_buf_id)
+    tlx.barrier_arrive(k_empty, 1)
+
+    qk = qk * alpha
+
+    start_n = tl.multiple_of(low, BLOCK_N)
+    offs_n_start = offs_n
+    offs_n = offs_n_start + start_n
+
+    invalid_mask = offs_m[:, None] == offs_n[None, :]
+    max_ids = seq_len
+    if HAS_MULTIPLE_TARGETS:
+        max_ids = max_ids - n_targets
+        offs_m = tl.where(
+            offs_m < max_ids,
+            offs_m,
+            max_ids,
+        )
+        offs_n = tl.where(
+            offs_n < max_ids,
+            offs_n,
+            max_ids,
+        )
+    offs_m_minus_n = offs_m[:, None] - offs_n[None, :]
+    invalid_mask = invalid_mask or (offs_m_minus_n > 0)
+    if HAS_MAX_ATTN_LEN:
+        invalid_mask = invalid_mask and offs_m_minus_n <= max_attn_len
+    if HAS_CONTEXTUAL_SEQ_LEN:
+        invalid_mask = invalid_mask or (
+            offs_m[:, None] == 0 and offs_n[None, :] < max_ids
+        )
+    scale = tl.where(invalid_mask, (1.0 / MAX_SEQ_LEN), 0.0)
+    silu = fast_dividef(qk, 1.0 + fast_expf(-qk)) * scale
+    silu = silu.to(v_dtype)
+
+    loop_trip_cnt += 1
+
+    for start in tl.range(low + BLOCK_N, high, BLOCK_N, num_stages=0):
+        start_n = tl.multiple_of(start, BLOCK_N)
+        offs_n = offs_n_start + start_n
+
+        k_buf_id = loop_trip_cnt % NUM_BUFFERS
+        # buffers in a row share the same phase
+        k_phase = k_phase ^ (k_buf_id == 0)
+
+        # wait for the K buffer to be populated by the producer
+        k_full = tlx.local_view(k_fulls, k_buf_id)
+        tlx.barrier_wait(k_full, k_phase)
+        k_tile = tlx.local_view(k_tiles, k_buf_id)
+
+        # tma can only be loaded in one order, use trans afterwards
+        k_tile = tlx.local_trans(k_tile)
+
+        qk = tlx.async_dot(q_tile, k_tile)
+        # wait for the MMA using to complete
+        prev_silu = silu
+
+        v_buf_id = (loop_trip_cnt - 1) % NUM_BUFFERS
+        # v_phase = v_phase ^ (v_buf_id == 0)
+        v_phase = ((loop_trip_cnt - 1) // NUM_BUFFERS) % 2
+        v_full = tlx.local_view(v_fulls, v_buf_id)
+        tlx.barrier_wait(v_full, v_phase)
+        v_tile = tlx.local_view(v_tiles, v_buf_id)
+        acc = tlx.async_dot(prev_silu, v_tile, acc)
+        qk = tlx.async_dot_wait(1, qk)
+
+        # release the K buffer
+        k_empty = tlx.local_view(k_empties, k_buf_id)
+        tlx.barrier_arrive(k_empty, 1)
+
+        qk = qk * alpha
+        invalid_mask = offs_m[:, None] == offs_n[None, :]
+        max_ids = seq_len
+        if HAS_MULTIPLE_TARGETS:
+            max_ids = max_ids - n_targets
+            offs_m = tl.where(
+                offs_m < max_ids,
+                offs_m,
+                max_ids,
+            )
+            offs_n = tl.where(
+                offs_n < max_ids,
+                offs_n,
+                max_ids,
+            )
+        offs_m_minus_n = offs_m[:, None] - offs_n[None, :]
+        invalid_mask = invalid_mask or (offs_m_minus_n > 0)
+        if HAS_MAX_ATTN_LEN:
+            invalid_mask = invalid_mask and offs_m_minus_n <= max_attn_len
+        if HAS_CONTEXTUAL_SEQ_LEN:
+            invalid_mask = invalid_mask or (
+                offs_m[:, None] == 0 and offs_n[None, :] < max_ids
+            )
+        scale = tl.where(invalid_mask, (1.0 / MAX_SEQ_LEN), 0.0)
+        silu = fast_dividef(qk, 1.0 + fast_expf(-qk)) * scale
+        silu = silu.to(v_dtype)
+
+        acc = tlx.async_dot_wait(0, acc)
+        # release the V buffer
+        v_empty = tlx.local_view(v_empties, v_buf_id)
+        tlx.barrier_arrive(v_empty, 1)
+
+        end_n += BLOCK_N
+
+        # increment loop trip counts
+        loop_trip_cnt += 1
+        # v_buf_id = loop_trip_cnt % NUM_BUFFERS
+        # v_phase = (loop_trip_cnt // NUM_BUFFERS) % 2
+
+    # wait for the V buffer to be populated by the producer
+    v_buf_id = (loop_trip_cnt - 1) % NUM_BUFFERS
+    v_phase = ((loop_trip_cnt - 1) // NUM_BUFFERS) % 2
+    v_full = tlx.local_view(v_fulls, v_buf_id)
+    # tlx.barrier_wait(v_full, v_buf_id)
+    v_tile = tlx.local_view(v_tiles, v_buf_id)
+    tlx.barrier_wait(v_full, v_phase)
+    acc = tlx.async_dot(silu, v_tile, acc)
+    acc = tlx.async_dot_wait(0, acc)
+    # release the V buffer
+    v_empty = tlx.local_view(v_empties, v_buf_id)
+    tlx.barrier_arrive(v_empty, 1)
+
+    return acc, end_n, loop_trip_cnt
+
+
+@triton.jit
+def _hstu_attn_fwd_load_K_or_V(
+    K,
+    k_tiles,
+    k_empties,
+    k_fulls,
+    buf_id,
+    k_phase,
+    start_n,
+    seq_start,
+    offset_kh,
+    BLOCK_D_Q: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    # wait for the K buffer to be released by the consumer
+    k_empty = tlx.local_view(k_empties, buf_id)
+    tlx.barrier_wait(k_empty, k_phase)
+    # load K
+    k_full = tlx.local_view(k_fulls, buf_id)
+    k_tile = tlx.local_view(k_tiles, buf_id)
+    tlx.barrier_expect_bytes(k_full, 2 * BLOCK_N * BLOCK_D_Q)  # float16
+    tlx.async_descriptor_load(
+        K,
+        k_tile,
+        [(seq_start + start_n).to(tl.int32), offset_kh.to(tl.int32)],
+        k_full,
+    )
+
+
+@triton.jit
+def _hstu_attn_fwd_load_Q(
+    Q,
+    q_tiles,
+    q_fulls,
+    cid,
+    off_z,
+    off_h,
+    stride_qh,
+    start_m,
+    seq_start,
+    DeltaSize,
+    IS_DELTA_Q: tl.constexpr,
+    BLOCK_D_Q: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+):
+    q_full = tlx.local_view(q_fulls, cid)
+    tlx.barrier_expect_bytes(q_full, 2 * BLOCK_M * BLOCK_D_Q)  # float16
+    q_tile = tlx.local_view(q_tiles, cid)
+    seq_offset = start_m + cid * BLOCK_M
+    if IS_DELTA_Q:
+        tlx.async_descriptor_load(
+            Q,
+            q_tile,
+            [
+                (off_z * DeltaSize + start_m).to(tl.int32),
+                (off_h * stride_qh).to(tl.int32),
+            ],
+            q_full,
+        )
+    else:
+        tlx.async_descriptor_load(
+            Q,
+            q_tile,
+            [
+                (seq_start + seq_offset).to(tl.int32),
+                (off_h * stride_qh).to(tl.int32),
+            ],
+            q_full,
+        )
+
+
+@triton.jit
+def _hstu_attn_fwd_caculate_range(
+    seq_len,
+    start_m,
+    n_targets,
+    contextual_seq_len,
+    max_attn_len,
+    HAS_MULTIPLE_TARGETS: tl.constexpr,
+    HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr,
+    HAS_MAX_ATTN_LEN: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    if HAS_MULTIPLE_TARGETS:
+        uih_end = seq_len - n_targets
+    else:
+        uih_end = seq_len
+
+    if HAS_CONTEXTUAL_SEQ_LEN is True and start_m < contextual_seq_len:
+        # uih_end must be larger than start_m
+        low = 0
+        high = seq_len
+    else:
+        low = 0
+        high = start_m + BLOCK_M
+        if HAS_MAX_ATTN_LEN:
+            if start_m > uih_end:
+                low = uih_end - max_attn_len
+            else:
+                low = start_m - max_attn_len
+            if HAS_CONTEXTUAL_SEQ_LEN:
+                low = low if low > contextual_seq_len else 0
+            else:
+                low = low if low > 0 else 0
+        if HAS_MULTIPLE_TARGETS:
+            uih_end = (uih_end + BLOCK_N - 1) // BLOCK_N * BLOCK_N
+            if uih_end < start_m:
+                high = seq_len - n_targets
+
+    return low, high, uih_end
+
+
+@triton.jit
+def _hstu_attn_fwd_load_Q_K_V(
+    Q,
+    K,
+    V,
+    q_tiles,
+    k_tiles,
+    v_tiles,
+    q_fulls,
+    k_fulls,
+    v_fulls,
+    k_empties,
+    v_empties,
+    stride_qh,
+    stride_kh,
+    stride_vh,
+    contextual_seq_len,
+    max_attn_len,
+    DeltaSize,
+    off_z,
+    off_h,
+    start_m,
+    seq_start,
+    seq_len,
+    n_targets,
+    HAS_MULTIPLE_TARGETS: tl.constexpr,
+    IS_DELTA_Q: tl.constexpr,
+    BLOCK_D_Q: tl.constexpr,
+    BLOCK_D_V: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    NUM_BUFFERS: tl.constexpr,
+    NUM_MMA_GROUPS: tl.constexpr,
+    HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr,
+    HAS_MAX_ATTN_LEN: tl.constexpr,
+):
+    # load q: it will stay in SRAM throughout
+    BLOCK_M_SPLIT: tl.constexpr = BLOCK_M // NUM_MMA_GROUPS
+
+    _hstu_attn_fwd_load_Q(
+        Q=Q,
+        q_tiles=q_tiles,
+        q_fulls=q_fulls,
+        cid=0,
+        off_z=off_z,
+        off_h=off_h,
+        stride_qh=stride_qh,
+        start_m=start_m,
+        seq_start=seq_start,
+        DeltaSize=DeltaSize,
+        IS_DELTA_Q=IS_DELTA_Q,
+        BLOCK_D_Q=BLOCK_D_Q,
+        BLOCK_M=BLOCK_M_SPLIT,
+    )
+
+    off_h = off_h.to(tl.int64)
+    off_z = off_z.to(tl.int64)
+    offset_kh = off_h * stride_kh
+    offset_vh = off_h * stride_vh
+
+    low, high, uih_end = _hstu_attn_fwd_caculate_range(
+        seq_len,
+        start_m,
+        n_targets,
+        contextual_seq_len,
+        max_attn_len,
+        HAS_MULTIPLE_TARGETS,
+        HAS_CONTEXTUAL_SEQ_LEN,
+        HAS_MAX_ATTN_LEN,
+        BLOCK_M,
+        BLOCK_N,
+    )
+
+    kv_phase = 0
+    loop_trip_cnt = 0
+
+    # pyre-ignore[58]
+    buf_id = loop_trip_cnt % NUM_BUFFERS
+    # buffers in a row share the same phase
+    kv_phase = kv_phase ^ (buf_id == 0)
+
+    start_n = tl.multiple_of(low, BLOCK_N)
+
+    _hstu_attn_fwd_load_K_or_V(
+        K,
+        k_tiles,
+        k_empties,
+        k_fulls,
+        buf_id,
+        kv_phase,
+        start_n,
+        seq_start,
+        offset_kh,
+        BLOCK_D_Q,
+        BLOCK_N,
+    )
+
+    for cid in tl.range(1, NUM_MMA_GROUPS, loop_unroll_factor=NUM_MMA_GROUPS - 1):
+        _hstu_attn_fwd_load_Q(
+            Q,
+            q_tiles,
+            q_fulls,
+            cid,
+            off_z,
+            off_h,
+            stride_qh,
+            start_m,
+            seq_start,
+            DeltaSize,
+            IS_DELTA_Q,
+            BLOCK_D_Q,
+            BLOCK_M_SPLIT,
+        )
+
+    _hstu_attn_fwd_load_K_or_V(
+        V,
+        v_tiles,
+        v_empties,
+        v_fulls,
+        buf_id,
+        kv_phase,
+        start_n,
+        seq_start,
+        offset_vh,
+        BLOCK_D_V,
+        BLOCK_N,
+    )
+
+    loop_trip_cnt += 1
+
+    for start in range(low + BLOCK_N, high, BLOCK_N):
+        # pyre-ignore[58]
+        buf_id = loop_trip_cnt % NUM_BUFFERS
+        # buffers in a row share the same phase
+        kv_phase = kv_phase ^ (buf_id == 0)
+
+        start_n = tl.multiple_of(start, BLOCK_N)
+
+        _hstu_attn_fwd_load_K_or_V(
+            K,
+            k_tiles,
+            k_empties,
+            k_fulls,
+            buf_id,
+            kv_phase,
+            start_n,
+            seq_start,
+            offset_kh,
+            BLOCK_D_Q,
+            BLOCK_N,
+        )
+
+        _hstu_attn_fwd_load_K_or_V(
+            V,
+            v_tiles,
+            v_empties,
+            v_fulls,
+            buf_id,
+            kv_phase,
+            start_n,
+            seq_start,
+            offset_vh,
+            BLOCK_D_V,
+            BLOCK_N,
+        )
+
+        # increment loop trip counts
+        loop_trip_cnt += 1
+
+    # pyre-ignore[61]
+    if uih_end < start_m:
+        low_delta = start_m
+        high_delta = start_m + BLOCK_M
+        for start_delta in tl.range(low_delta, high_delta, BLOCK_N, num_stages=0):
+            # pyre-ignore[58]
+            buf_id = loop_trip_cnt % NUM_BUFFERS
+            # buffers in a row share the same phase
+            kv_phase = kv_phase ^ (buf_id == 0)
+
+            start_n = tl.multiple_of(start_delta, BLOCK_N)
+
+            _hstu_attn_fwd_load_K_or_V(
+                K,
+                k_tiles,
+                k_empties,
+                k_fulls,
+                buf_id,
+                kv_phase,
+                start_n,
+                seq_start,
+                offset_kh,
+                BLOCK_D_Q,
+                BLOCK_N,
+            )
+
+            _hstu_attn_fwd_load_K_or_V(
+                V,
+                v_tiles,
+                v_empties,
+                v_fulls,
+                buf_id,
+                kv_phase,
+                start_n,
+                seq_start,
+                offset_vh,
+                BLOCK_D_V,
+                BLOCK_N,
+            )
+
+            # increment loop trip counts
+            loop_trip_cnt += 1
+
+
+@triton.jit
+def _hstu_attn_fwd_compute_tlx(  # noqa C901
+    Q,
+    K,
+    V,
+    H,
+    DimQ,
+    DimV,
+    seq_offsets,
+    num_targets,
+    Out,
+    stride_qh,
+    stride_kh,
+    stride_vh,
+    stride_om,
+    stride_oh,
+    alpha,
+    MAX_SEQ_LEN,
+    DeltaSize,
+    contextual_seq_len,
+    max_attn_len,
+    off_z,
+    off_h,
+    pid,
+    HAS_MULTIPLE_TARGETS: tl.constexpr,
+    IS_DELTA_Q: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BLOCK_D_Q: tl.constexpr,
+    BLOCK_D_V: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    NUM_BUFFERS: tl.constexpr,  #
+    NUM_MMA_WARPS_PER_GROUP: tl.constexpr,  #
+    NUM_MMA_GROUPS: tl.constexpr,  #
+    HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr,
+    HAS_MAX_ATTN_LEN: tl.constexpr,
+):
+    seq_start = tl.load(seq_offsets + off_z).to(tl.int64)
+    seq_end = tl.load(seq_offsets + off_z + 1)
+    seq_len = (seq_end - seq_start).to(tl.int32)
+
+    if IS_DELTA_Q:
+        start_m = pid * BLOCK_M
+        start_m = (start_m + seq_len - DeltaSize).to(tl.int32)
+    else:
+        start_m = pid * BLOCK_M
+
+    if start_m >= seq_len:
+        return
+
+    if HAS_MULTIPLE_TARGETS:
+        n_targets = tl.load(num_targets + off_z).to(tl.int32)
+    else:
+        n_targets = None
+
+    BLOCK_M_SPLIT: tl.constexpr = BLOCK_M // NUM_MMA_GROUPS
+    # allocate buffers
+    q_tiles = tlx.local_alloc(
+        (BLOCK_M_SPLIT, BLOCK_D_Q), tlx.dtype_of(Q), NUM_MMA_GROUPS
+    )
+    k_tiles = tlx.local_alloc((BLOCK_N, BLOCK_D_Q), tlx.dtype_of(K), NUM_BUFFERS)
+    v_tiles = tlx.local_alloc((BLOCK_N, BLOCK_D_V), tlx.dtype_of(V), NUM_BUFFERS)
+
+    # allocate barriers
+    q_fulls = tlx.alloc_barriers(num_barriers=NUM_MMA_GROUPS, arrive_count=1)
+    k_empties = tlx.alloc_barriers(
+        num_barriers=NUM_BUFFERS, arrive_count=NUM_MMA_GROUPS
+    )
+    k_fulls = tlx.alloc_barriers(num_barriers=NUM_BUFFERS, arrive_count=1)
+    v_empties = tlx.alloc_barriers(
+        num_barriers=NUM_BUFFERS, arrive_count=NUM_MMA_GROUPS
+    )
+    v_fulls = tlx.alloc_barriers(num_barriers=NUM_BUFFERS, arrive_count=1)
+
+    with tlx.async_tasks():
+        # producer group
+        with tlx.async_task("default"):
+            _hstu_attn_fwd_load_Q_K_V(
+                Q=Q,
+                K=K,
+                V=V,
+                q_tiles=q_tiles,
+                k_tiles=k_tiles,
+                v_tiles=v_tiles,
+                q_fulls=q_fulls,
+                k_fulls=k_fulls,
+                v_fulls=v_fulls,
+                k_empties=k_empties,
+                v_empties=v_empties,
+                stride_qh=stride_qh,
+                stride_kh=stride_kh,
+                stride_vh=stride_vh,
+                contextual_seq_len=contextual_seq_len,
+                max_attn_len=max_attn_len,
+                DeltaSize=DeltaSize,
+                off_z=off_z,
+                off_h=off_h,
+                start_m=start_m,
+                seq_start=seq_start,
+                seq_len=seq_len,
+                n_targets=n_targets,
+                HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
+                IS_DELTA_Q=IS_DELTA_Q,
+                BLOCK_D_Q=BLOCK_D_Q,
+                BLOCK_D_V=BLOCK_D_V,
+                BLOCK_M=BLOCK_M,
+                BLOCK_N=BLOCK_N,
+                NUM_BUFFERS=NUM_BUFFERS,
+                NUM_MMA_GROUPS=NUM_MMA_GROUPS,
+                HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
+                HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
+            )
+
+        # consumer groups
+        with tlx.async_task(
+            num_warps=NUM_MMA_WARPS_PER_GROUP, registers=232, replicate=NUM_MMA_GROUPS
+        ):
+            cid = tlx.async_task_replica_id()
+            acc = tl.zeros([BLOCK_M_SPLIT, BLOCK_D_V], dtype=tl.float32)
+            # initialize offsets
+            offs_m = start_m + tl.arange(0, BLOCK_M_SPLIT) + cid * BLOCK_M_SPLIT
+            offs_n = tl.arange(0, BLOCK_N)
+
+            low, high, uih_end = _hstu_attn_fwd_caculate_range(
+                seq_len,
+                start_m,
+                n_targets,
+                contextual_seq_len,
+                max_attn_len,
+                HAS_MULTIPLE_TARGETS,
+                HAS_CONTEXTUAL_SEQ_LEN,
+                HAS_MAX_ATTN_LEN,
+                BLOCK_M,
+                BLOCK_N,
+            )
+
+            end_n = low
+            loop_trip_cnt = 0
+
+            acc, end_n, loop_trip_cnt = _hstu_attn_fwd_compute_main_loop_tlx_pipelined(
+                low=low,
+                high=high,
+                seq_len=seq_len,
+                offs_m=offs_m,
+                offs_n=offs_n,
+                acc=acc,
+                q_tiles=q_tiles,
+                k_tiles=k_tiles,
+                v_tiles=v_tiles,
+                q_fulls=q_fulls,
+                k_fulls=k_fulls,
+                v_fulls=v_fulls,
+                k_empties=k_empties,
+                v_empties=v_empties,
+                v_dtype=tlx.dtype_of(V),
+                n_targets=n_targets,
+                alpha=alpha,
+                end_n=end_n,
+                loop_trip_cnt=loop_trip_cnt,
+                max_attn_len=max_attn_len,
+                HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
+                HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
+                HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
+                cid=cid,
+                BLOCK_N=BLOCK_N,
+                NUM_BUFFERS=NUM_BUFFERS,
+                MAX_SEQ_LEN=MAX_SEQ_LEN,
+                WAIT_FOR_Q=1,
+            )
+
+            # pyre-ignore[61]
+            if uih_end < start_m:
+                low_delta = start_m
+                high_delta = start_m + BLOCK_M
+                acc, end_n, loop_trip_cnt = _hstu_attn_fwd_compute_main_loop_tlx(
+                    low=low_delta,
+                    high=high_delta,
+                    seq_len=seq_len,
+                    offs_m=offs_m,
+                    offs_n=offs_n,
+                    acc=acc,
+                    q_tiles=q_tiles,
+                    k_tiles=k_tiles,
+                    v_tiles=v_tiles,
+                    q_fulls=q_fulls,
+                    k_fulls=k_fulls,
+                    v_fulls=v_fulls,
+                    k_empties=k_empties,
+                    v_empties=v_empties,
+                    v_dtype=tlx.dtype_of(V),
+                    n_targets=n_targets,
+                    alpha=alpha,
+                    end_n=end_n,
+                    loop_trip_cnt=loop_trip_cnt,
+                    max_attn_len=max_attn_len,
+                    HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
+                    HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
+                    HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
+                    cid=cid,
+                    BLOCK_N=BLOCK_N,
+                    NUM_BUFFERS=NUM_BUFFERS,
+                    MAX_SEQ_LEN=MAX_SEQ_LEN,
+                    WAIT_FOR_Q=0,
+                )
+
+            # Don't use TMA in Jagged case since we don't want to overwrite
+            # the output of another sequence
+            if IS_DELTA_Q:
+                start_m_delta = pid * BLOCK_M + cid * BLOCK_M_SPLIT
+                offs_m_delta = start_m_delta + tl.arange(0, BLOCK_M_SPLIT)
+                offs_v_d = tl.arange(0, BLOCK_D_V)
+                off_o = Out + off_z * DeltaSize * stride_om + off_h * stride_oh
+                out_ptrs = off_o + offs_m_delta[:, None] * stride_om + offs_v_d[None, :]
+                tl.store(out_ptrs, acc, mask=(offs_m_delta < DeltaSize)[:, None])
+            else:
+                # rematerialize offsets to save registers
+                start_m = pid * BLOCK_M + cid * BLOCK_M_SPLIT
+                offs_m = start_m + tl.arange(0, BLOCK_M_SPLIT)
+                offs_v_d = tl.arange(0, BLOCK_D_V)
+                off_o = Out + seq_start * stride_om + off_h * stride_oh
+                out_ptrs = off_o + offs_m[:, None] * stride_om + offs_v_d[None, :]
+                tl.store(out_ptrs, acc, mask=(offs_m < seq_len)[:, None])
+
+
+def _get_fw_pinned_configs() -> List[triton.Config]:
+    # Pinned forward-attention configs for MI350X gfx950. The full search is a
+    # coin flip between matrix_instr_nonkdim 16 (fast) and 32 (slow); pinning the
+    # known winners makes cold starts deterministic. See _autotune_pinning.
+    #
+    # This is a LIST, one entry per training shape we've tuned (currently just
+    # bs=1024). With >1 entry the autotuner still runs a tiny benchmark over only
+    # these candidates and caches the winner per `key` (AUTOTUNE_Z / H /
+    # AUTOTUNE_MAX_SEQ_LEN / DimQ / DimV / ...), so each batch size automatically
+    # picks its own config — same pattern as the layer-norm pins.
+    #
+    # TO ADD A NEW BATCH SIZE / SHAPE:
+    #   1. Run once with TRITON_FULL_AUTOTUNE=1 TRITON_PRINT_AUTOTUNING=1.
+    #   2. Grep the log for "best config selected:" under "_hstu_attn_fwd".
+    #   3. Append that config below (copy BLOCK_M/BLOCK_N/matrix_instr_nonkdim/
+    #      waves_per_eu/kpack/num_stages/num_warps verbatim).
+    # The four USE_TLX/NUM_* defaults below are required by the kernel signature
+    # (see the USE_TLX-default loop in _get_fw_configs); the pinned path bypasses
+    # that loop, so every pinned entry must set them explicitly.
+    if torch.version.hip:
+        return [
+            # --- yambda bs=1024, L=2048 winner (from capture log) ---
+            triton.Config(
+                {
+                    "BLOCK_M": 128,
+                    "BLOCK_N": 32,
+                    "matrix_instr_nonkdim": 16,
+                    "waves_per_eu": 0,
+                    "kpack": 2,
+                    "USE_TLX": False,
+                    "NUM_BUFFERS": 1,
+                    "NUM_MMA_WARPS_PER_GROUP": 1,
+                    "NUM_MMA_GROUPS": 1,
+                },
+                num_stages=2,
+                num_warps=8,
+            ),
+            # --- add more (bs, L) winners here; see "TO ADD A NEW BATCH SIZE" ---
+        ]
+    return _get_fw_configs()
+
+
+@triton_autotune(
+    configs=pinned_or_full(_get_fw_pinned_configs(), _get_fw_configs),
+    key=[
+        "AUTOTUNE_Z",
+        "H",
+        "AUTOTUNE_MAX_SEQ_LEN",
+        "DimQ",
+        "DimV",
+        "DeltaSize",
+        "IS_DELTA_Q",
+    ],
+    prune_configs_by={"early_config_prune": _early_config_prune},
+)
+@triton.jit
+def _hstu_attn_fwd(  # noqa C901
+    Q,
+    K,
+    V,
+    workspace_ptr,
+    sort_by_length_indices,
+    seq_offsets,
+    num_targets,
+    Out,
+    stride_qm,
+    stride_qh,
+    stride_kn,
+    stride_kh,
+    stride_vn,
+    stride_vh,
+    stride_om,
+    stride_oh,
+    alpha,
+    Z,
+    AUTOTUNE_Z,
+    H,
+    MAX_SEQ_LEN,
+    AUTOTUNE_MAX_SEQ_LEN,  # Quantized MAX_SEQ_LEN used as an autotuning key
+    DimQ,
+    DimV,
+    DeltaSize,
+    contextual_seq_len,
+    max_attn_len,
+    HAS_MULTIPLE_TARGETS: tl.constexpr,
+    IS_DELTA_Q: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BLOCK_D_Q: tl.constexpr,
+    BLOCK_D_V: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    USE_TLX: tl.constexpr,
+    NUM_BUFFERS: tl.constexpr,  #
+    NUM_MMA_WARPS_PER_GROUP: tl.constexpr,  #
+    NUM_MMA_GROUPS: tl.constexpr,  #
+    HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr,
+    HAS_MAX_ATTN_LEN: tl.constexpr,
+    HAS_SORT_BY_LENGTH_INDICES: tl.constexpr,
+    ENABLE_TMA: tl.constexpr,
+    TMA_DESC_SIZE: tl.constexpr,
+):
+    off_hz = tl.program_id(1)
+    off_z = off_hz // H
+    if HAS_SORT_BY_LENGTH_INDICES:
+        off_z = tl.load(sort_by_length_indices + off_z)
+    off_h = off_hz % H
+    pid = tl.program_id(0)
+    if USE_TLX:
+        _hstu_attn_fwd_compute_tlx(
+            Q=Q,
+            K=K,
+            V=V,
+            H=H,
+            DimQ=DimQ,
+            DimV=DimV,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            Out=Out,
+            stride_qh=stride_qh,
+            stride_kh=stride_kh,
+            stride_vh=stride_vh,
+            stride_om=stride_om,
+            stride_oh=stride_oh,
+            alpha=alpha,
+            MAX_SEQ_LEN=MAX_SEQ_LEN,
+            DeltaSize=DeltaSize,
+            contextual_seq_len=contextual_seq_len,
+            max_attn_len=max_attn_len,
+            off_z=off_z,
+            off_h=off_h,
+            pid=pid,
+            HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
+            IS_DELTA_Q=IS_DELTA_Q,
+            ALLOW_TF32=ALLOW_TF32,
+            BLOCK_D_Q=BLOCK_D_Q,
+            BLOCK_D_V=BLOCK_D_V,
+            HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
+            HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
+            BLOCK_M=BLOCK_M,
+            BLOCK_N=BLOCK_N,
+            NUM_BUFFERS=NUM_BUFFERS,
+            NUM_MMA_WARPS_PER_GROUP=NUM_MMA_WARPS_PER_GROUP,
+            NUM_MMA_GROUPS=NUM_MMA_GROUPS,
+        )
+    else:
+        _hstu_attn_fwd_compute(
+            Q=Q,
+            K=K,
+            V=V,
+            H=H,
+            DimQ=DimQ,
+            DimV=DimV,
+            workspace_ptr=workspace_ptr,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            Out=Out,
+            stride_qm=stride_qm,
+            stride_qh=stride_qh,
+            stride_kn=stride_kn,
+            stride_kh=stride_kh,
+            stride_vn=stride_vn,
+            stride_vh=stride_vh,
+            stride_om=stride_om,
+            stride_oh=stride_oh,
+            alpha=alpha,
+            MAX_SEQ_LEN=MAX_SEQ_LEN,
+            DeltaSize=DeltaSize,
+            contextual_seq_len=contextual_seq_len,
+            max_attn_len=max_attn_len,
+            off_z=off_z,
+            off_h=off_h,
+            pid=pid,
+            HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
+            IS_DELTA_Q=IS_DELTA_Q,
+            ALLOW_TF32=ALLOW_TF32,
+            BLOCK_D_Q=BLOCK_D_Q,
+            BLOCK_D_V=BLOCK_D_V,
+            HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
+            HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
+            BLOCK_M=BLOCK_M,
+            BLOCK_N=BLOCK_N,
+            ENABLE_TMA=ENABLE_TMA,
+            TMA_DESC_SIZE=TMA_DESC_SIZE,
+        )
+
+
+@triton_autotune(
+    configs=pinned_or_full(_get_fw_pinned_configs(), _get_fw_configs),
+    key=[
+        "AUTOTUNE_Z",
+        "H",
+        "AUTOTUNE_MAX_SEQ_LEN",
+        "DimQ",
+        "DimV",
+        "DeltaSize",
+        "IS_DELTA_Q",
+    ],
+    prune_configs_by={"early_config_prune": _early_config_prune},
+)
+@triton.jit
+def _hstu_attn_fwd_persistent(  # noqa C901
+    Q,
+    K,
+    V,
+    workspace_ptr,
+    sort_by_length_indices,
+    seq_offsets,
+    num_targets,
+    Out,
+    stride_qm,
+    stride_qh,
+    stride_kn,
+    stride_kh,
+    stride_vn,
+    stride_vh,
+    stride_om,
+    stride_oh,
+    alpha,
+    Z,
+    AUTOTUNE_Z,
+    H,
+    MAX_SEQ_LEN,
+    AUTOTUNE_MAX_SEQ_LEN,  # Quantized MAX_SEQ_LEN used as an autotuning key
+    DimQ,
+    DimV,
+    DeltaSize,
+    contextual_seq_len,
+    max_attn_len,
+    HAS_MULTIPLE_TARGETS: tl.constexpr,
+    IS_DELTA_Q: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BLOCK_D_Q: tl.constexpr,
+    BLOCK_D_V: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    USE_TLX: tl.constexpr,
+    NUM_BUFFERS: tl.constexpr,  #
+    NUM_MMA_WARPS_PER_GROUP: tl.constexpr,  #
+    NUM_MMA_GROUPS: tl.constexpr,  #
+    HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr,
+    HAS_MAX_ATTN_LEN: tl.constexpr,
+    HAS_SORT_BY_LENGTH_INDICES: tl.constexpr,
+    ENABLE_TMA: tl.constexpr,
+    TMA_DESC_SIZE: tl.constexpr,
+):
+    n_tile_num = tl.cdiv(MAX_SEQ_LEN, BLOCK_M)
+    prog_id = tl.program_id(0)
+    num_progs = tl.num_programs(0)
+
+    total_tiles = n_tile_num * Z * H
+
+    tiles_per_sm = total_tiles // num_progs
+    if prog_id < total_tiles % num_progs:
+        tiles_per_sm += 1
+
+    tile_idx = prog_id
+    for _ in range(0, tiles_per_sm):
+        pid = (total_tiles - tile_idx - 1) // (Z * H)
+        off_hz = (total_tiles - tile_idx - 1) % (Z * H)
+        off_z = off_hz // H
+        off_h = off_hz % H
+        _hstu_attn_fwd_compute(
+            Q=Q,
+            K=K,
+            V=V,
+            H=H,
+            DimQ=DimQ,
+            DimV=DimV,
+            workspace_ptr=workspace_ptr,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            Out=Out,
+            stride_qm=stride_qm,
+            stride_qh=stride_qh,
+            stride_kn=stride_kn,
+            stride_kh=stride_kh,
+            stride_vn=stride_vn,
+            stride_vh=stride_vh,
+            stride_om=stride_om,
+            stride_oh=stride_oh,
+            alpha=alpha,
+            MAX_SEQ_LEN=MAX_SEQ_LEN,
+            DeltaSize=DeltaSize,
+            contextual_seq_len=contextual_seq_len,
+            max_attn_len=max_attn_len,
+            off_z=off_z,
+            off_h=off_h,
+            pid=pid,
+            HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
+            IS_DELTA_Q=IS_DELTA_Q,
+            ALLOW_TF32=ALLOW_TF32,
+            BLOCK_D_Q=BLOCK_D_Q,
+            BLOCK_D_V=BLOCK_D_V,
+            HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
+            HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
+            BLOCK_M=BLOCK_M,
+            BLOCK_N=BLOCK_N,
+            ENABLE_TMA=ENABLE_TMA,
+            TMA_DESC_SIZE=TMA_DESC_SIZE,
+        )
+        tile_idx += num_progs
+
+
+@triton.jit
+def _hstu_attn_bwd_one_block(  # noqa C901
+    start_m,
+    offs_n,
+    offs_m,
+    q_ptrs_trans,
+    dq_ptrs_trans,
+    do_ptrs,
+    device_desc_q,
+    device_desc_do,
+    dk,
+    dv,
+    k,
+    v,
+    pos_offs_n,
+    seq_len,
+    max_ids,
+    contextual_seq_len,
+    max_attn_len,
+    LOCK,
+    off_h,
+    stride_qh,
+    stride_doh,
+    stride_qm,
+    stride_dom,
+    stride_dqm,
+    alpha,
+    MAX_SEQ_LEN,
+    HAS_MULTIPLE_TARGETS: tl.constexpr,
+    HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr,
+    HAS_MAX_ATTN_LEN: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    ATOMIC_ADD: tl.constexpr,
+    ENABLE_TMA: tl.constexpr,
+    BLOCK_D_Q: tl.constexpr,
+    BLOCK_D_V: tl.constexpr,
+):
+    pos_offs_m = offs_m + start_m
+    mask_m = pos_offs_m < seq_len
+    invalid_mask_trans = pos_offs_m[None, :] == offs_n[:, None]
+    # recompute qk and silu
+    if HAS_CONTEXTUAL_SEQ_LEN:
+        pos_offs_m = pos_offs_m - contextual_seq_len + 1
+        pos_offs_m = tl.where(
+            pos_offs_m > 0,
+            pos_offs_m,
+            0,
+        )
+    if HAS_MULTIPLE_TARGETS:
+        pos_offs_m = tl.where(
+            pos_offs_m < max_ids,
+            pos_offs_m,
+            max_ids,
+        )
+    if ENABLE_TMA:
+        q = device_desc_q.load(
+            [start_m, (off_h * stride_qh).to(tl.int32)],
+        )
+        q_trans = tl.trans(q)
+    else:
+        q_trans = tl.load(
+            q_ptrs_trans + start_m * stride_qm,
+            mask=mask_m[None, :],
+            other=0.0,
+        )
+    qk_trans = tl.dot(k, q_trans, allow_tf32=ALLOW_TF32) * alpha
+    sig_trans = fast_dividef(1.0, 1.0 + tl.exp(-qk_trans))
+    silu_trans = qk_trans * sig_trans * (1.0 / MAX_SEQ_LEN)
+    pos_offs_m_minus_n = pos_offs_m[None, :] - pos_offs_n[:, None]
+    invalid_mask_trans = invalid_mask_trans or (pos_offs_m_minus_n > 0)
+    if HAS_MAX_ATTN_LEN:
+        invalid_mask_trans = invalid_mask_trans and pos_offs_m_minus_n <= max_attn_len
+    if HAS_CONTEXTUAL_SEQ_LEN:
+        invalid_mask_trans = invalid_mask_trans or (
+            pos_offs_m[None, :] == 0 and pos_offs_n[:, None] < max_ids
+        )
+    silu_trans = tl.where(invalid_mask_trans, silu_trans, 0)
+    silu_trans = silu_trans.to(k.dtype)
+    # compute dv
+    if ENABLE_TMA:
+        do = device_desc_do.load(
+            [start_m, (off_h * stride_doh).to(tl.int32)],
+        )
+    else:
+        do = tl.load(
+            do_ptrs + start_m * stride_dom,
+            mask=mask_m[:, None],
+            other=0.0,
+        )
+    dv += tl.dot(silu_trans, do, allow_tf32=ALLOW_TF32)
+
+    # compute dk and dq
+    dqk_trans = tl.dot(v, tl.trans(do), allow_tf32=ALLOW_TF32)
+    dqk_trans = (
+        dqk_trans * sig_trans * (1 + qk_trans * (1 - sig_trans)) * (1.0 / MAX_SEQ_LEN)
+    )
+    dqk_trans = tl.where(invalid_mask_trans, dqk_trans, 0)
+    dqk_trans = dqk_trans.to(k.dtype)
+
+    # Note: the factor `alpha` is delayed until the end of the function to reduce the cost
+    dk += tl.dot(dqk_trans, tl.trans(q_trans), allow_tf32=ALLOW_TF32)
+    acc_dq(
+        dq_ptrs_trans=dq_ptrs_trans,
+        start_m=start_m,
+        stride_dqm=stride_dqm,
+        k=k,
+        dqk_trans=dqk_trans,
+        alpha=alpha,
+        mask_m=mask_m,
+        MAX_SEQ_LEN=MAX_SEQ_LEN,
+        LOCK=LOCK,
+        BLOCK_M=BLOCK_M,
+        ATOMIC_ADD=ATOMIC_ADD,
+        ALLOW_TF32=ALLOW_TF32,
+    )
+    return dk, dv
+
+
+@triton.jit
+def _hstu_attn_bwd_one_col_block(  # noqa C901
+    start_n,
+    seq_len,
+    n_targets,
+    contextual_seq_len,
+    max_attn_len,
+    Q,
+    K,
+    V,
+    DOut,
+    DQ,
+    DK,
+    DV,
+    device_desc_q,
+    device_desc_k,
+    device_desc_v,
+    device_desc_do,
+    device_desc_dk,
+    device_desc_dv,
+    LOCK,
+    off_h,
+    stride_qh,
+    stride_kh,
+    stride_vh,
+    stride_doh,
+    stride_dkh,
+    stride_dvh,
+    stride_qm,
+    stride_kn,
+    stride_vn,
+    stride_dom,
+    stride_dqm,
+    stride_dkn,
+    stride_dvn,
+    alpha,
+    MAX_SEQ_LEN,
+    HAS_MULTIPLE_TARGETS: tl.constexpr,
+    HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr,
+    HAS_MAX_ATTN_LEN: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BLOCK_D_Q: tl.constexpr,
+    BLOCK_D_V: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    UNROLL: tl.constexpr,
+    ATOMIC_ADD: tl.constexpr,
+    ENABLE_TMA: tl.constexpr,
+):
+    if HAS_MULTIPLE_TARGETS:
+        low = start_n
+        if HAS_MAX_ATTN_LEN:
+            high = start_n + max_attn_len + BLOCK_N
+            high = high if high + n_targets < seq_len else seq_len
+        else:
+            high = seq_len
+    else:
+        low = start_n
+        if HAS_MAX_ATTN_LEN:
+            high = start_n + max_attn_len + BLOCK_N
+            high = high if high < seq_len else seq_len
+        else:
+            high = seq_len
+    if HAS_CONTEXTUAL_SEQ_LEN:
+        contextual_block_end = tl.cdiv(contextual_seq_len, BLOCK_M) * BLOCK_M
+        if low < contextual_block_end:
+            low = contextual_block_end
+
+    offs_m = tl.arange(0, BLOCK_M)
+    offs_qk_d = tl.arange(0, BLOCK_D_Q)
+    offs_v_d = tl.arange(0, BLOCK_D_V)
+    offs_n = start_n + tl.arange(0, BLOCK_N)
+
+    dq_ptrs_trans = DQ + (offs_m[None, :] * stride_dqm + offs_qk_d[:, None])
+    dv = tl.zeros([BLOCK_N, BLOCK_D_V], dtype=tl.float32)
+    dk = tl.zeros([BLOCK_N, BLOCK_D_Q], dtype=tl.float32)
+    if ENABLE_TMA:
+        q_ptrs_trans = None
+        do_ptrs = None
+        k = device_desc_k.load(
+            [start_n, (off_h * stride_kh).to(tl.int32)],
+        )
+        v = device_desc_v.load(
+            [start_n, (off_h * stride_vh).to(tl.int32)],
+        )
+    else:
+        mask_n = offs_n < seq_len
+        q_ptrs_trans = Q + (offs_m[None, :] * stride_qm + offs_qk_d[:, None])
+        do_ptrs = DOut + (offs_m[:, None] * stride_dom + offs_v_d[None, :])
+        k_ptrs = K + (offs_n[:, None] * stride_kn + offs_qk_d[None, :])
+        v_ptrs = V + (offs_n[:, None] * stride_vn + offs_v_d[None, :])
+        k = tl.load(k_ptrs, mask=mask_n[:, None], other=0.0)
+        v = tl.load(v_ptrs, mask=mask_n[:, None], other=0.0)
+    max_ids = seq_len
+    if HAS_CONTEXTUAL_SEQ_LEN:
+        pos_offs_n = offs_n - contextual_seq_len + 1
+        pos_offs_n = tl.where(
+            pos_offs_n > 0,
+            pos_offs_n,
+            0,
+        )
+        max_ids = max_ids - contextual_seq_len + 1
+    else:
+        pos_offs_n = offs_n
+    if HAS_MULTIPLE_TARGETS:
+        max_ids = max_ids - n_targets
+        pos_offs_n = tl.where(
+            pos_offs_n < max_ids,
+            pos_offs_n,
+            max_ids,
+        )
+    # loop over rows
+    if HAS_CONTEXTUAL_SEQ_LEN:
+        for start_m in range(0, contextual_seq_len, BLOCK_M):
+            start_m = tl.multiple_of(start_m, BLOCK_M)
+            dk, dv = _hstu_attn_bwd_one_block(
+                start_m=start_m,
+                offs_n=offs_n,
+                offs_m=offs_m,
+                q_ptrs_trans=q_ptrs_trans,
+                dq_ptrs_trans=dq_ptrs_trans,
+                do_ptrs=do_ptrs,
+                device_desc_q=device_desc_q,
+                device_desc_do=device_desc_do,
+                dk=dk,
+                dv=dv,
+                k=k,
+                v=v,
+                pos_offs_n=pos_offs_n,
+                seq_len=seq_len,
+                max_ids=max_ids,
+                contextual_seq_len=contextual_seq_len,
+                max_attn_len=max_attn_len,
+                LOCK=LOCK,
+                off_h=off_h,
+                stride_qh=stride_qh,
+                stride_doh=stride_doh,
+                stride_qm=stride_qm,
+                stride_dom=stride_dom,
+                stride_dqm=stride_dqm,
+                alpha=alpha,
+                MAX_SEQ_LEN=MAX_SEQ_LEN,
+                HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
+                HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
+                HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
+                ALLOW_TF32=ALLOW_TF32,
+                BLOCK_M=BLOCK_M,
+                ATOMIC_ADD=ATOMIC_ADD,
+                ENABLE_TMA=ENABLE_TMA,
+                BLOCK_D_Q=BLOCK_D_Q,
+                BLOCK_D_V=BLOCK_D_V,
+            )
+    for start_m in tl.range(low, high, BLOCK_M, loop_unroll_factor=UNROLL):
+        start_m = tl.multiple_of(start_m, BLOCK_M)
+        dk, dv = _hstu_attn_bwd_one_block(
+            start_m=start_m,
+            offs_n=offs_n,
+            offs_m=offs_m,
+            q_ptrs_trans=q_ptrs_trans,
+            dq_ptrs_trans=dq_ptrs_trans,
+            do_ptrs=do_ptrs,
+            device_desc_q=device_desc_q,
+            device_desc_do=device_desc_do,
+            dk=dk,
+            dv=dv,
+            k=k,
+            v=v,
+            pos_offs_n=pos_offs_n,
+            seq_len=seq_len,
+            max_ids=max_ids,
+            contextual_seq_len=contextual_seq_len,
+            max_attn_len=max_attn_len,
+            LOCK=LOCK,
+            off_h=off_h,
+            stride_qh=stride_qh,
+            stride_doh=stride_doh,
+            stride_qm=stride_qm,
+            stride_dom=stride_dom,
+            stride_dqm=stride_dqm,
+            alpha=alpha,
+            MAX_SEQ_LEN=MAX_SEQ_LEN,
+            HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
+            HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
+            HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
+            ALLOW_TF32=ALLOW_TF32,
+            BLOCK_M=BLOCK_M,
+            ATOMIC_ADD=ATOMIC_ADD,
+            ENABLE_TMA=ENABLE_TMA,
+            BLOCK_D_Q=BLOCK_D_Q,
+            BLOCK_D_V=BLOCK_D_V,
+        )
+    # write-back
+    dk = dk * alpha
+    if ENABLE_TMA:
+        device_desc_dv.store(
+            [start_n, (off_h * stride_dvh).to(tl.int32)],
+            dv.to(k.dtype),
+        )
+        device_desc_dk.store(
+            [start_n, (off_h * stride_dkh).to(tl.int32)],
+            dk.to(k.dtype),
+        )
+    else:
+        dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_v_d[None, :])
+        dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_qk_d[None, :])
+        tl.store(dv_ptrs, dv.to(k.dtype), mask=mask_n[:, None])  # pyre-ignore[61]
+        tl.store(dk_ptrs, dk.to(k.dtype), mask=mask_n[:, None])  # pyre-ignore[61]
+
+
+def _bwd_pre_hook(nargs):
+    nargs["DQ"].zero_()
+    if nargs["SEQUENCE_PARALLEL"] is True:
+        nargs["LOCK"].zero_()
+
+
+def _get_bw_configs() -> List[triton.Config]:
+    if torch.version.hip:
+        configs = []
+        for BLOCK_M in [32, 64]:
+            for BLOCK_N in [32, 64, 128]:
+                for num_stages in [1, 2]:
+                    for num_warps in [4, 8]:
+                        for matrix_instr_nonkdim in [16, 32]:
+                            for waves_per_eu in [0, 2, 4]:
+                                for sp in [True, False]:
+                                    configs.append(
+                                        triton.Config(
+                                            {
+                                                "BLOCK_M": BLOCK_M,
+                                                "BLOCK_N": BLOCK_N,
+                                                "matrix_instr_nonkdim": matrix_instr_nonkdim,
+                                                "waves_per_eu": waves_per_eu,
+                                                "SEQUENCE_PARALLEL": sp,
+                                                "UNROLL": 1,
+                                            },
+                                            num_stages=num_stages,
+                                            num_warps=num_warps,
+                                            pre_hook=_bwd_pre_hook,
+                                        )
+                                    )
+        return configs
+
+    configs = [
+        triton.Config(
+            {"BLOCK_M": 16, "BLOCK_N": 32, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=2,
+            num_warps=2,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 16, "BLOCK_N": 16, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=2,
+            num_warps=2,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 16, "BLOCK_N": 32, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=2,
+            num_warps=4,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 16, "BLOCK_N": 32, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=1,
+            num_warps=8,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 32, "BLOCK_N": 32, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=1,
+            num_warps=4,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 32, "BLOCK_N": 32, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=2,
+            num_warps=4,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 32, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=2,
+            num_warps=4,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 32, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=2,
+            num_warps=8,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=1,
+            num_warps=4,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=2,
+            num_warps=4,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=1,
+            num_warps=8,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=2,
+            num_warps=8,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 32, "BLOCK_N": 128, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=2,
+            num_warps=8,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 32, "BLOCK_N": 128, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=3,
+            num_warps=8,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 32, "BLOCK_N": 128, "SEQUENCE_PARALLEL": False, "UNROLL": 4},
+            num_stages=2,
+            num_warps=8,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 16, "BLOCK_N": 32, "SEQUENCE_PARALLEL": True, "UNROLL": 1},
+            num_stages=2,
+            num_warps=2,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 32, "BLOCK_N": 32, "SEQUENCE_PARALLEL": True, "UNROLL": 1},
+            num_stages=1,
+            num_warps=4,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 32, "BLOCK_N": 32, "SEQUENCE_PARALLEL": True, "UNROLL": 1},
+            num_stages=2,
+            num_warps=4,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True, "UNROLL": 1},
+            num_stages=1,
+            num_warps=4,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True, "UNROLL": 1},
+            num_stages=2,
+            num_warps=4,
+            pre_hook=_bwd_pre_hook,
+        ),
+    ]
+    if torch.cuda.is_available() and torch.version.cuda < "12.8":
+        configs += [
+            triton.Config(
+                {"BLOCK_M": 16, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+                num_stages=1,
+                num_warps=4,
+                pre_hook=_bwd_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+                num_stages=1,
+                num_warps=4,
+                pre_hook=_bwd_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+                num_stages=1,
+                num_warps=8,
+                pre_hook=_bwd_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True, "UNROLL": 1},
+                num_stages=1,
+                num_warps=8,
+                pre_hook=_bwd_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 128, "SEQUENCE_PARALLEL": True, "UNROLL": 1},
+                num_stages=3,
+                num_warps=8,
+                pre_hook=_bwd_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True, "UNROLL": 1},
+                num_stages=1,
+                num_warps=4,
+                pre_hook=_bwd_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True, "UNROLL": 1},
+                num_stages=2,
+                num_warps=4,
+                pre_hook=_bwd_pre_hook,
+            ),
+            triton.Config(
+                {
+                    "BLOCK_M": 32,
+                    "BLOCK_N": 128,
+                    "SEQUENCE_PARALLEL": False,
+                    "UNROLL": 2,
+                },
+                num_stages=2,
+                num_warps=8,
+                pre_hook=_bwd_pre_hook,
+            ),
+        ]
+    else:
+        print("WARNING: temporarily disabled some autotune configs for CUDA 12.8+")
+    return configs
+
+
+def _get_bw_pinned_configs() -> List[triton.Config]:
+    # Pinned backward-attention configs for MI350X gfx950. Pins the fast
+    # matrix_instr_nonkdim=16 winner(s) to avoid the 16-vs-32 autotune lottery.
+    #
+    # LIST, one entry per tuned shape (currently just bs=1024). With >1 entry the
+    # autotuner benchmarks only these candidates and caches the winner per `key`
+    # (AUTOTUNE_Z / H / AUTOTUNE_MAX_SEQ_LEN / DimQ / DimV), so each batch size
+    # picks its own config automatically — same pattern as the layer-norm pins.
+    #
+    # TO ADD A NEW BATCH SIZE / SHAPE:
+    #   1. Run once with TRITON_FULL_AUTOTUNE=1 TRITON_PRINT_AUTOTUNING=1.
+    #   2. Grep the log for "best config selected:" under "_hstu_attn_bwd".
+    #   3. Append that config below (verbatim BLOCK_M/BLOCK_N/matrix_instr_nonkdim/
+    #      waves_per_eu/SEQUENCE_PARALLEL/UNROLL/num_stages/num_warps).
+    # Keep pre_hook=_bwd_pre_hook on every entry (the bwd configs require it).
+    if torch.version.hip:
+        return [
+            # --- yambda bs=1024, L=2048 winner (from capture log) ---
+            triton.Config(
+                {
+                    "BLOCK_M": 32,
+                    "BLOCK_N": 128,
+                    "matrix_instr_nonkdim": 16,
+                    "waves_per_eu": 0,
+                    "SEQUENCE_PARALLEL": False,
+                    "UNROLL": 1,
+                },
+                num_stages=1,
+                num_warps=4,
+                pre_hook=_bwd_pre_hook,
+            ),
+            # --- add more (bs, L) winners here; see "TO ADD A NEW BATCH SIZE" ---
+        ]
+    return _get_bw_configs()
+
+
+@triton_autotune(
+    configs=pinned_or_full(_get_bw_pinned_configs(), _get_bw_configs),
+    key=[
+        "AUTOTUNE_Z",
+        "H",
+        "AUTOTUNE_MAX_SEQ_LEN",
+        "DimQ",
+        "DimV",
+    ],
+)
+@triton.jit
+def _hstu_attn_bwd(  # noqa C901
+    Q,
+    K,
+    V,
+    tma_workspace_ptr,
+    sort_by_length_indices,
+    seq_offsets,
+    num_targets,
+    DOut,
+    DQ,
+    DK,
+    DV,
+    LOCK,
+    stride_qm,
+    stride_qh,
+    stride_kn,
+    stride_kh,
+    stride_vn,
+    stride_vh,
+    stride_dom,
+    stride_doh,
+    stride_dqm,
+    stride_dqh,
+    stride_dkn,
+    stride_dkh,
+    stride_dvn,
+    stride_dvh,
+    alpha,
+    contextual_seq_len,
+    max_attn_len,
+    Z,
+    AUTOTUNE_Z,
+    H,
+    MAX_SEQ_LEN,
+    AUTOTUNE_MAX_SEQ_LEN,  # Quantized MAX_SEQ_LEN used as an autotuning key
+    DimQ,
+    DimV,
+    HAS_MULTIPLE_TARGETS: tl.constexpr,
+    HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr,
+    HAS_MAX_ATTN_LEN: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BLOCK_D_Q: tl.constexpr,
+    BLOCK_D_V: tl.constexpr,
+    SEQUENCE_PARALLEL: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    UNROLL: tl.constexpr,
+    HAS_SORT_BY_LENGTH_INDICES: tl.constexpr,
+    ENABLE_TMA: tl.constexpr,
+    TMA_DESC_SIZE: tl.constexpr,
+    ENABLE_BUFFER_OPS_ASSUMES: tl.constexpr,
+):
+    off_hz = tl.program_id(0)
+    off_z = off_hz // H
+    if HAS_SORT_BY_LENGTH_INDICES:
+        off_z = tl.load(sort_by_length_indices + off_z)
+    off_h = off_hz % H
+    off_h = off_h.to(tl.int64)
+    seq_start = tl.load(seq_offsets + off_z).to(tl.int64)
+    seq_end = tl.load(seq_offsets + off_z + 1)
+    seq_len = (seq_end - seq_start).to(tl.int32)
+    if HAS_MULTIPLE_TARGETS:
+        n_targets = tl.load(num_targets + off_z).to(tl.int32)
+    else:
+        n_targets = None
+    if ENABLE_BUFFER_OPS_ASSUMES:
+        tl.assume(off_hz >= 0)
+        tl.assume(off_z >= 0)
+        tl.assume(off_h >= 0)
+        tl.assume(seq_start >= 0)
+        tl.assume(stride_qm >= 0)
+        tl.assume(stride_qh >= 0)
+        tl.assume(stride_kn >= 0)
+        tl.assume(stride_kh >= 0)
+        tl.assume(stride_vn >= 0)
+        tl.assume(stride_vh >= 0)
+        tl.assume(stride_dom >= 0)
+        tl.assume(stride_doh >= 0)
+        tl.assume(stride_dqm >= 0)
+        tl.assume(stride_dqh >= 0)
+        tl.assume(stride_dkn >= 0)
+        tl.assume(stride_dkh >= 0)
+        tl.assume(stride_dvn >= 0)
+        tl.assume(stride_dvh >= 0)
+
+    # offset pointers for batch/head
+    Q = Q + seq_start * stride_qm
+    K = K + seq_start * stride_kn
+    V = V + seq_start * stride_vn
+    DOut = DOut + seq_start * stride_dom
+    DQ = DQ + seq_start * stride_dqm + off_h * stride_dqh
+    DK = DK + seq_start * stride_dkn
+    DV = DV + seq_start * stride_dvn
+    device_desc_q = None
+    device_desc_k = None
+    device_desc_v = None
+    device_desc_do = None
+    device_desc_dk = None
+    device_desc_dv = None
+    if ENABLE_TMA:
+        device_desc_q = tl.make_tensor_descriptor(
+            Q,
+            shape=[seq_len, H * DimQ],
+            strides=[H * DimQ, 1],
+            block_shape=[BLOCK_M, BLOCK_D_Q],
+        )
+        device_desc_do = tl.make_tensor_descriptor(
+            DOut,
+            shape=[seq_len, H * DimV],
+            strides=[H * DimV, 1],
+            block_shape=[BLOCK_M, BLOCK_D_V],
+        )
+        device_desc_k = tl.make_tensor_descriptor(
+            K,
+            shape=[seq_len, H * DimQ],
+            strides=[H * DimQ, 1],
+            block_shape=[BLOCK_N, BLOCK_D_Q],
+        )
+        device_desc_dk = tl.make_tensor_descriptor(
+            DK,
+            shape=[seq_len, H * DimQ],
+            strides=[H * DimQ, 1],
+            block_shape=[BLOCK_N, BLOCK_D_Q],
+        )
+        device_desc_v = tl.make_tensor_descriptor(
+            V,
+            shape=[seq_len, H * DimV],
+            strides=[H * DimV, 1],
+            block_shape=[BLOCK_N, BLOCK_D_V],
+        )
+        device_desc_dv = tl.make_tensor_descriptor(
+            DV,
+            shape=[seq_len, H * DimV],
+            strides=[H * DimV, 1],
+            block_shape=[BLOCK_N, BLOCK_D_V],
+        )
+    else:
+        Q += off_h * stride_qh
+        K += off_h * stride_kh
+        V += off_h * stride_vh
+        DOut += off_h * stride_doh
+        DK += off_h * stride_dkh
+        DV += off_h * stride_dvh
+    if SEQUENCE_PARALLEL:
+        start_n = tl.program_id(1) * BLOCK_N
+        if start_n >= seq_len:
+            return
+        _hstu_attn_bwd_one_col_block(
+            start_n=start_n,
+            seq_len=seq_len,
+            n_targets=n_targets,
+            contextual_seq_len=contextual_seq_len,
+            max_attn_len=max_attn_len,
+            Q=Q,
+            K=K,
+            V=V,
+            DOut=DOut,
+            DQ=DQ,
+            DK=DK,
+            DV=DV,
+            device_desc_q=device_desc_q,
+            device_desc_k=device_desc_k,
+            device_desc_v=device_desc_v,
+            device_desc_do=device_desc_do,
+            device_desc_dk=device_desc_dk,
+            device_desc_dv=device_desc_dv,
+            LOCK=LOCK,
+            off_h=off_h,
+            stride_qh=stride_qh,
+            stride_kh=stride_kh,
+            stride_vh=stride_vh,
+            stride_doh=stride_doh,
+            stride_dkh=stride_dkh,
+            stride_dvh=stride_dvh,
+            stride_qm=stride_qm,
+            stride_kn=stride_kn,
+            stride_vn=stride_vn,
+            stride_dom=stride_dom,
+            stride_dqm=stride_dqm,
+            stride_dkn=stride_dkn,
+            stride_dvn=stride_dvn,
+            alpha=alpha,
+            MAX_SEQ_LEN=MAX_SEQ_LEN,
+            HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
+            HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
+            HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
+            ALLOW_TF32=ALLOW_TF32,
+            BLOCK_D_Q=BLOCK_D_Q,
+            BLOCK_D_V=BLOCK_D_V,
+            BLOCK_M=BLOCK_M,
+            BLOCK_N=BLOCK_N,
+            UNROLL=UNROLL,
+            ATOMIC_ADD=True,
+            ENABLE_TMA=ENABLE_TMA,
+        )
+    else:
+        for start_n in range(0, seq_len, BLOCK_N):
+            _hstu_attn_bwd_one_col_block(
+                start_n=start_n,
+                seq_len=seq_len,
+                n_targets=n_targets,
+                contextual_seq_len=contextual_seq_len,
+                max_attn_len=max_attn_len,
+                Q=Q,
+                K=K,
+                V=V,
+                DOut=DOut,
+                DQ=DQ,
+                DK=DK,
+                DV=DV,
+                device_desc_q=device_desc_q,
+                device_desc_k=device_desc_k,
+                device_desc_v=device_desc_v,
+                device_desc_do=device_desc_do,
+                device_desc_dk=device_desc_dk,
+                device_desc_dv=device_desc_dv,
+                LOCK=LOCK,
+                off_h=off_h,
+                stride_qh=stride_qh,
+                stride_kh=stride_kh,
+                stride_vh=stride_vh,
+                stride_doh=stride_doh,
+                stride_dkh=stride_dkh,
+                stride_dvh=stride_dvh,
+                stride_qm=stride_qm,
+                stride_kn=stride_kn,
+                stride_vn=stride_vn,
+                stride_dom=stride_dom,
+                stride_dqm=stride_dqm,
+                stride_dkn=stride_dkn,
+                stride_dvn=stride_dvn,
+                alpha=alpha,
+                MAX_SEQ_LEN=MAX_SEQ_LEN,
+                HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
+                HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
+                HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
+                ALLOW_TF32=ALLOW_TF32,
+                BLOCK_D_Q=BLOCK_D_Q,
+                BLOCK_D_V=BLOCK_D_V,
+                BLOCK_M=BLOCK_M,
+                BLOCK_N=BLOCK_N,
+                UNROLL=UNROLL,
+                ATOMIC_ADD=False,
+                ENABLE_TMA=ENABLE_TMA,
+            )
+
+
+@maybe_register_custom_op(
+    "generative_recommenders::triton_hstu_attention_fwd", mutates_args=()
+)
+def triton_hstu_attention_fwd(
+    N: int,
+    alpha: float,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    num_targets: Optional[torch.Tensor],
+    max_attn_len: int,
+    contextual_seq_len: int,
+    sort_by_length_indices: Optional[torch.Tensor],
+    enable_tma: bool,
+    num_softmax_heads: int,
+) -> torch.Tensor:
+    Z = seq_offsets.numel() - 1
+    AUTOTUNE_Z = prev_power_of_2(Z)
+    L, H, DimQ = q.shape
+    _, _, DimV = v.shape
+    out = torch.empty_like(v)
+    has_multiple_targets = num_targets is not None
+    has_contextual_seq_len = contextual_seq_len > 0
+    has_max_attn_len = max_attn_len > 0
+    has_sort_by_length_indices = sort_by_length_indices is not None
+    if L == 0:
+        return out
+
+    TMA_DESC_SIZE = 128
+    workspace = None
+    desc_q = q
+    desc_k = k
+    desc_v = v
+
+    if enable_tma and tensor_descriptor_tma:
+        dummy_block = [1, 1]
+        desc_q = TensorDescriptor(
+            q,
+            shape=[L, H * DimQ],
+            strides=[H * DimQ, 1],
+            block_shape=dummy_block,
+        )
+        desc_v = TensorDescriptor(
+            v,
+            shape=[L, H * DimV],
+            strides=[H * DimV, 1],
+            block_shape=dummy_block,
+        )
+        desc_k = TensorDescriptor(
+            k,
+            shape=[L, H * DimQ],
+            strides=[H * DimQ, 1],
+            block_shape=dummy_block,
+        )
+
+    def alloc_fn(size: int, align: int, stream: Optional[int]):
+        assert align == TMA_DESC_SIZE
+        return torch.empty(size, dtype=torch.int8, device="cuda")
+
+    # pyre-ignore [6]
+    triton.set_allocator(alloc_fn)
+    grid = lambda meta: (  # noqa E731
+        triton.cdiv(N, meta["BLOCK_M"]),
+        Z * H,
+    )
+
+    _hstu_attn_fwd[grid](
+        Q=desc_q,
+        K=desc_k,
+        V=desc_v,
+        workspace_ptr=workspace,
+        sort_by_length_indices=sort_by_length_indices,
+        seq_offsets=seq_offsets,
+        num_targets=num_targets,
+        Out=out,
+        stride_qm=q.stride(0),
+        stride_qh=q.stride(1),
+        stride_kn=k.stride(0),
+        stride_kh=k.stride(1),
+        stride_vn=v.stride(0),
+        stride_vh=v.stride(1),
+        stride_om=out.stride(0),
+        stride_oh=out.stride(1),
+        alpha=alpha,
+        Z=Z,
+        AUTOTUNE_Z=AUTOTUNE_Z,
+        H=H,
+        MAX_SEQ_LEN=N,
+        AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(N),
+        DimQ=DimQ,
+        DimV=DimV,
+        DeltaSize=0,
+        contextual_seq_len=contextual_seq_len,
+        max_attn_len=max_attn_len,
+        HAS_MULTIPLE_TARGETS=has_multiple_targets,
+        IS_DELTA_Q=False,
+        ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+        BLOCK_D_Q=DimQ,
+        BLOCK_D_V=DimV,
+        HAS_CONTEXTUAL_SEQ_LEN=has_contextual_seq_len,
+        HAS_MAX_ATTN_LEN=has_max_attn_len,
+        HAS_SORT_BY_LENGTH_INDICES=has_sort_by_length_indices,
+        ENABLE_TMA=enable_tma,
+        TMA_DESC_SIZE=TMA_DESC_SIZE,
+    )
+    return out
+
+
+@maybe_register_custom_op(
+    "generative_recommenders::triton_hstu_attention_bwd",
+    mutates_args=("dq", "dk", "dv"),
+)
+def triton_hstu_attention_bwd(
+    dout: torch.Tensor,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    dq: torch.Tensor,
+    dk: torch.Tensor,
+    dv: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    num_targets: Optional[torch.Tensor],
+    N: int,
+    alpha: float,
+    max_attn_len: int,
+    contextual_seq_len: int,
+    sort_by_length_indices: Optional[torch.Tensor],
+    enable_tma: bool,
+    num_softmax_heads: int,
+) -> None:
+    orig_dq, orig_dk, orig_dv = dq, dk, dv
+    dout = switch_to_contiguous_if_needed(dout)
+    dq = switch_to_contiguous_if_needed(dq)
+    dk = switch_to_contiguous_if_needed(dk)
+    dv = switch_to_contiguous_if_needed(dv)
+    if dout.shape[0] == 0:
+        orig_dq.zero_()
+        orig_dk.zero_()
+        orig_dv.zero_()
+        return
+    Z = seq_offsets.numel() - 1
+    _, H, DimQ = q.shape
+    _, _, DimV = v.shape
+    grid = lambda meta: (  # noqa E731
+        Z * H,
+        (triton.cdiv(N, meta["BLOCK_N"]) if meta["SEQUENCE_PARALLEL"] else 1),
+    )
+    # The minimum size of BLOCK_M used in `_get_bw_configs`.
+    # TODO (linjianma): avoid hardcoding the value.
+    MIN_BLOCK_M = 16
+    lock = torch.empty(
+        (Z * H, triton.cdiv(N, MIN_BLOCK_M)),
+        dtype=torch.int32,
+        device=q.device,
+    )
+    AUTOTUNE_Z = prev_power_of_2(Z)
+    TMA_DESC_SIZE = 128
+    tma_workspace = None
+
+    def alloc_fn(size: int, align: int, stream: Optional[int]):
+        assert align == TMA_DESC_SIZE
+        return torch.empty(size, dtype=torch.int8, device="cuda")
+
+    # pyre-ignore [6]
+    triton.set_allocator(alloc_fn)
+
+    # Enable BufferOps on AMD
+    ENABLE_BUFFER_OPS_ASSUMES = torch.version.hip is not None
+    _hstu_attn_bwd[grid](
+        Q=q,
+        K=k,
+        V=v,
+        tma_workspace_ptr=tma_workspace,
+        sort_by_length_indices=sort_by_length_indices,
+        seq_offsets=seq_offsets,
+        num_targets=num_targets,
+        DOut=dout,
+        DQ=dq,
+        DK=dk,
+        DV=dv,
+        LOCK=lock,
+        stride_qm=q.stride(0),
+        stride_qh=q.stride(1),
+        stride_kn=k.stride(0),
+        stride_kh=k.stride(1),
+        stride_vn=v.stride(0),
+        stride_vh=v.stride(1),
+        stride_dom=dout.stride(0),
+        stride_doh=dout.stride(1),
+        stride_dqm=dq.stride(0),
+        stride_dqh=dq.stride(1),
+        stride_dkn=dk.stride(0),
+        stride_dkh=dk.stride(1),
+        stride_dvn=dv.stride(0),
+        stride_dvh=dv.stride(1),
+        alpha=alpha,
+        contextual_seq_len=contextual_seq_len,
+        max_attn_len=max_attn_len,
+        Z=Z,
+        AUTOTUNE_Z=AUTOTUNE_Z,
+        H=H,
+        MAX_SEQ_LEN=N,
+        AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(N),
+        DimQ=DimQ,
+        DimV=DimV,
+        HAS_MULTIPLE_TARGETS=num_targets is not None,
+        HAS_CONTEXTUAL_SEQ_LEN=contextual_seq_len > 0,
+        HAS_MAX_ATTN_LEN=max_attn_len > 0,
+        ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+        BLOCK_D_Q=DimQ,
+        BLOCK_D_V=DimV,
+        HAS_SORT_BY_LENGTH_INDICES=sort_by_length_indices is not None,
+        ENABLE_TMA=enable_tma,
+        TMA_DESC_SIZE=TMA_DESC_SIZE,
+        ENABLE_BUFFER_OPS_ASSUMES=ENABLE_BUFFER_OPS_ASSUMES,
+    )
+
+    copy_if_different_ptr(orig_dq, dq)
+    copy_if_different_ptr(orig_dk, dk)
+    copy_if_different_ptr(orig_dv, dv)
+
+
+@triton_hstu_attention_fwd.register_fake
+def _triton_hstu_attention_fwd_fake(
+    N: int,
+    alpha: float,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    num_targets: Optional[torch.Tensor],
+    max_attn_len: int,
+    contextual_seq_len: int,
+    sort_by_length_indices: Optional[torch.Tensor],
+    enable_tma: bool,
+    num_softmax_heads: int,
+) -> torch.Tensor:
+    L, H, _ = q.shape
+    _, _, DimV = v.shape
+    out = torch.empty((L, H, DimV), dtype=v.dtype, device=v.device)
+    return out
+
+
+@triton_hstu_attention_bwd.register_fake
+def _triton_hstu_attention_bwd_fake(
+    dout: torch.Tensor,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    dq: torch.Tensor,
+    dk: torch.Tensor,
+    dv: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    num_targets: Optional[torch.Tensor],
+    N: int,
+    alpha: float,
+    max_attn_len: int,
+    contextual_seq_len: int,
+    sort_by_length_indices: Optional[torch.Tensor],
+    enable_tma: bool,
+    num_softmax_heads: int,
+) -> None:
+    return None
+
+
+class _AttentionFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        N: int,
+        alpha: float,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        seq_offsets: torch.Tensor,
+        num_targets: Optional[torch.Tensor],
+        max_attn_len: int,
+        contextual_seq_len: int,
+        sort_by_length: bool,
+        enable_tma: bool,
+    ) -> torch.Tensor:
+        sort_by_length_indices = None
+        if sort_by_length:
+            seq_lengths = seq_offsets[1:] - seq_offsets[:-1]
+            _, sort_by_length_indices = torch.sort(
+                seq_lengths, descending=True, stable=False
+            )
+        saved_tensors = [q, k, v, seq_offsets]
+        if num_targets is not None:
+            saved_tensors.append(num_targets)
+        if sort_by_length_indices is not None:
+            saved_tensors.append(sort_by_length_indices)
+        ctx.save_for_backward(*saved_tensors)
+        ctx.alpha = alpha
+        ctx.has_multiple_targets = num_targets is not None
+        ctx.max_attn_len = max_attn_len
+        ctx.N = N
+        ctx.contextual_seq_len = contextual_seq_len
+        ctx.sort_by_length = sort_by_length
+        ctx.enable_tma = enable_tma
+        return triton_hstu_attention_fwd(
+            N=N,
+            alpha=alpha,
+            q=q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            sort_by_length_indices=sort_by_length_indices,
+            enable_tma=enable_tma,
+            num_softmax_heads=0,
+        )
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, dout: torch.Tensor
+    ) -> Tuple[
+        None,
+        None,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+    ]:
+        with torch.inference_mode():
+            q, k, v, seq_offsets = ctx.saved_tensors[:4]
+            idx = 4
+            if ctx.has_multiple_targets:
+                num_targets = ctx.saved_tensors[idx]
+                idx += 1
+            else:
+                num_targets = None
+            if ctx.sort_by_length:
+                sort_by_length_indices = ctx.saved_tensors[idx]
+            else:
+                sort_by_length_indices = None
+
+            dq = torch.empty_like(q)
+            dk = torch.empty_like(k)
+            dv = torch.empty_like(v)
+            triton_hstu_attention_bwd(
+                dout=dout,
+                q=q,
+                k=k,
+                v=v,
+                dq=dq,
+                dk=dk,
+                dv=dv,
+                seq_offsets=seq_offsets,
+                num_targets=num_targets,
+                N=ctx.N,
+                alpha=ctx.alpha,
+                max_attn_len=ctx.max_attn_len,
+                contextual_seq_len=ctx.contextual_seq_len,
+                sort_by_length_indices=sort_by_length_indices,
+                enable_tma=ctx.enable_tma,
+                num_softmax_heads=0,
+            )
+            return (
+                None,
+                None,
+                dq,
+                dk,
+                dv,
+                None,
+                None,
+                None,
+                None,
+                None,
+                None,
+            )
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def triton_hstu_mha(
+    N: int,
+    alpha: float,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    num_targets: Optional[torch.Tensor] = None,
+    max_attn_len: int = 0,
+    contextual_seq_len: int = 0,
+    sort_by_length: bool = False,
+    enable_tma: bool = False,
+) -> torch.Tensor:
+    return _AttentionFunction.apply(
+        N,
+        alpha,
+        q,
+        k,
+        v,
+        seq_offsets,
+        num_targets,
+        max_attn_len,
+        contextual_seq_len,
+        sort_by_length,
+        enable_tma,
+    )
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def triton_cached_hstu_mha(
+    N: int,
+    alpha: float,
+    delta_q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    num_targets: Optional[torch.Tensor] = None,
+    max_attn_len: int = 0,
+    contextual_seq_len: int = 0,
+    enable_tma: bool = False,
+) -> torch.Tensor:
+    Z = seq_offsets.size(0) - 1
+    AUTOTUNE_Z = prev_power_of_2(Z)
+    DELTA_L, H, DimQ = delta_q.shape
+    DeltaSize = DELTA_L // Z
+    L, _, DimV = v.shape
+    out = torch.empty((DELTA_L, H, DimV), dtype=delta_q.dtype, device=delta_q.device)
+
+    TMA_DESC_SIZE = 128
+    desc_q = delta_q
+    desc_k = k
+    desc_v = v
+
+    if enable_tma and tensor_descriptor_tma:
+        dummy_block = [1, 1]
+        desc_q = TensorDescriptor(
+            delta_q,
+            shape=[DELTA_L, H * DimQ],
+            strides=[H * DimQ, 1],
+            block_shape=dummy_block,
+        )
+        desc_v = TensorDescriptor(
+            v,
+            shape=[L, H * DimV],
+            strides=[H * DimV, 1],
+            block_shape=dummy_block,
+        )
+        desc_k = TensorDescriptor(
+            k,
+            shape=[L, H * DimQ],
+            strides=[H * DimQ, 1],
+            block_shape=dummy_block,
+        )
+
+    def alloc_fn(size: int, align: int, stream: Optional[int]):
+        assert align == TMA_DESC_SIZE
+        return torch.empty(size, dtype=torch.int8, device="cuda")
+
+    # pyre-ignore [6]
+    triton.set_allocator(alloc_fn)
+    grid = lambda meta: (  # noqa E731
+        triton.cdiv(DeltaSize, meta["BLOCK_M"]),
+        Z * H,
+    )
+
+    has_contextual_seq_len = contextual_seq_len > 0
+    has_max_attn_len = max_attn_len > 0
+    _hstu_attn_fwd[grid](
+        Q=desc_q,
+        K=desc_k,
+        V=desc_v,
+        workspace_ptr=None,
+        sort_by_length_indices=None,
+        seq_offsets=seq_offsets,
+        num_targets=num_targets,
+        Out=out,
+        stride_qm=delta_q.stride(0),
+        stride_qh=delta_q.stride(1),
+        stride_kn=k.stride(0),
+        stride_kh=k.stride(1),
+        stride_vn=v.stride(0),
+        stride_vh=v.stride(1),
+        stride_om=out.stride(0),
+        stride_oh=out.stride(1),
+        alpha=alpha,
+        contextual_seq_len=contextual_seq_len,
+        max_attn_len=max_attn_len,
+        Z=Z,
+        AUTOTUNE_Z=AUTOTUNE_Z,
+        H=H,
+        MAX_SEQ_LEN=N,
+        AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(N),
+        DimQ=DimQ,
+        DimV=DimV,
+        DeltaSize=DeltaSize,
+        HAS_MULTIPLE_TARGETS=num_targets is not None,
+        IS_DELTA_Q=True,
+        ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+        BLOCK_D_Q=DimQ,
+        BLOCK_D_V=DimV,
+        HAS_CONTEXTUAL_SEQ_LEN=has_contextual_seq_len,
+        HAS_MAX_ATTN_LEN=has_max_attn_len,
+        HAS_SORT_BY_LENGTH_INDICES=False,
+        ENABLE_TMA=enable_tma,
+        TMA_DESC_SIZE=TMA_DESC_SIZE,
+    )
+    return out
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_linear.py b/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_linear.py
new file mode 100644
index 000000000..516a15664
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_linear.py
@@ -0,0 +1,3047 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+#!/usr/bin/env python3
+
+
+from typing import List, Optional, Tuple
+
+import torch
+
+# @manual=//triton:triton
+import triton
+
+# @manual=//triton:triton
+import triton.language as tl
+from generative_recommenders.common import (
+    switch_to_contiguous_if_needed,
+    triton_autotune,
+)
+from generative_recommenders.ops.triton.triton_addmm import maybe_triton_addmm_fwd
+from generative_recommenders.ops.utils import maybe_register_custom_op
+
+
+def _get_layer_norm_mul_dropout_fwd_multirow_configs() -> List[triton.Config]:
+    """Generate autotune configs for multi-row LayerNorm multiplication with dropout kernels."""
+    configs = []
+    for BLOCK_N in [1, 2, 4, 8, 16]:
+        for num_warps in [1, 2, 4]:
+            configs.append(
+                triton.Config(
+                    {"BLOCK_N": BLOCK_N},
+                    num_warps=num_warps,
+                )
+            )
+    return configs
+
+
+from generative_recommenders.ops.utils import use_separated_rng_ln_mul_dropout
+
+# @manual=//triton:triton
+from triton.language.extra import libdevice
+
+try:
+    # @manual=//triton:triton
+    from triton.language.extra.libdevice import fast_dividef
+except ImportError:
+    try:
+        # @manual=//triton:triton
+        from triton.language.extra.cuda.libdevice import fast_dividef
+    except ImportError:
+        # pyre-ignore: Undefined import [21]
+        # @manual=//triton:triton
+        from triton.language.math import fast_dividef
+
+
+COMPUTE_OUTPUT_LN_FAST_DROPOUT = False
+
+
+def set_compute_output_ln_fast_dropout(value: bool) -> None:
+    global COMPUTE_OUTPUT_LN_FAST_DROPOUT
+    COMPUTE_OUTPUT_LN_FAST_DROPOUT = value
+
+
+FUSE_OUTPUT_LN_RNG_BLACKWELL = False
+
+
+# Only impact B200 training when CONCAT_UX is False
+def set_fuse_output_ln_rng_blackwell(value: bool) -> None:
+    global FUSE_OUTPUT_LN_RNG_BLACKWELL
+    FUSE_OUTPUT_LN_RNG_BLACKWELL = value
+
+
+@triton.jit
+def rand3x(seed, offsets, n_rounds: tl.constexpr = 10):  # pyre-ignore [9]
+    i1, i2, i3, _ = tl.randint4x(seed, offsets, n_rounds)
+    u1 = tl.uint_to_uniform_float(i1)
+    u2 = tl.uint_to_uniform_float(i2)
+    u3 = tl.uint_to_uniform_float(i3)
+    return u1, u2, u3
+
+
+@triton.jit
+def _generate_random_mask(
+    MASK_BUFFER,
+    N,
+    dropout_ratio,
+    seed,
+    D: tl.constexpr,
+    STRIDE: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    NUM_MASKS: tl.constexpr,
+):
+    """Generate bit-packed dropout masks for (N, D) tensors. Outputs int8.
+
+    Processes 4 rows per program using rand4x. Mask j occupies bit j.
+    Extraction: y = val & 1, x = val & 2, u = val & 4.
+    """
+    pid = tl.program_id(0)
+    cols = tl.arange(0, BLOCK_D)
+    col_mask = cols < D
+    start_row = pid.to(tl.int64) * 4
+
+    base_ptr = MASK_BUFFER + start_row * STRIDE + cols
+    row0_mask = (start_row < N) & col_mask
+    row1_mask = ((start_row + 1) < N) & col_mask
+    row2_mask = ((start_row + 2) < N) & col_mask
+    row3_mask = ((start_row + 3) < N) & col_mask
+
+    # Each pid uses NUM_MASKS consecutive BLOCK_D chunks for Philox offsets
+    rand_offset = pid * (NUM_MASKS * BLOCK_D) + cols
+
+    packed0 = tl.zeros([BLOCK_D], dtype=tl.int8)
+    packed1 = tl.zeros([BLOCK_D], dtype=tl.int8)
+    packed2 = tl.zeros([BLOCK_D], dtype=tl.int8)
+    packed3 = tl.zeros([BLOCK_D], dtype=tl.int8)
+
+    for j in tl.static_range(NUM_MASKS):
+        r0, r1, r2, r3 = tl.rand4x(seed, rand_offset)
+        packed0 |= (r0 > dropout_ratio).to(tl.int8) << j
+        packed1 |= (r1 > dropout_ratio).to(tl.int8) << j
+        packed2 |= (r2 > dropout_ratio).to(tl.int8) << j
+        packed3 |= (r3 > dropout_ratio).to(tl.int8) << j
+        rand_offset += BLOCK_D
+
+    tl.store(base_ptr, packed0, mask=row0_mask)
+    tl.store(base_ptr + STRIDE, packed1, mask=row1_mask)
+    tl.store(base_ptr + 2 * STRIDE, packed2, mask=row2_mask)
+    tl.store(base_ptr + 3 * STRIDE, packed3, mask=row3_mask)
+
+
+@triton_autotune(
+    configs=_get_layer_norm_mul_dropout_fwd_multirow_configs(),
+    key=["BLOCK_D"],
+)
+@triton.jit
+def _ln_mul_dropout_fwd_rng(
+    X,
+    U,
+    Y,
+    W,
+    B,
+    Mean,
+    Rstd,
+    RANDOM_MASK,
+    N,
+    D,
+    eps,
+    dropout_ratio,
+    stride_x,
+    stride_u,
+    stride_y,
+    stride_mask,
+    SILU_U: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    TRAINING: tl.constexpr,
+    CONCAT_U: tl.constexpr,
+    CONCAT_X: tl.constexpr,
+    MUL_U_ACTIVATION_TYPE: tl.constexpr,
+):
+    block_id = tl.program_id(0)
+    start_row = block_id * BLOCK_N
+
+    # Create block pointers for X, U, and Y
+    X_block_ptr = tl.make_block_ptr(
+        base=X,
+        shape=(N, D),
+        strides=(stride_x, 1),
+        offsets=(start_row, 0),
+        block_shape=(BLOCK_N, BLOCK_D),
+        order=(1, 0),
+    )
+
+    U_block_ptr = tl.make_block_ptr(
+        base=U,
+        shape=(N, D),
+        strides=(stride_u, 1),
+        offsets=(start_row, 0),
+        block_shape=(BLOCK_N, BLOCK_D),
+        order=(1, 0),
+    )
+
+    # Load data blocks
+    x_block = tl.load(X_block_ptr, boundary_check=(0, 1), padding_option="zero").to(
+        tl.float32
+    )
+    u_block = tl.load(U_block_ptr, boundary_check=(0, 1), padding_option="zero").to(
+        tl.float32
+    )
+
+    cols = tl.arange(0, BLOCK_D)
+    col_mask = cols < D
+    rows = start_row + tl.arange(0, BLOCK_N)
+    row_mask = rows < N
+    # Pre-compute 2D mask for reuse in dropout and masked operations
+    mask_2d = row_mask[:, None] & col_mask[None, :]
+
+    # Pre-compute inv_D to replace divisions with multiplications (optimization)
+    inv_D = 1.0 / D
+
+    mean = tl.sum(x_block, axis=1) * inv_D
+    tl.store(Mean + rows, mean, mask=row_mask)
+    mean = tl.expand_dims(mean, 1)
+
+    x_mean = x_block - mean
+    x_mean = tl.where(mask_2d, x_mean, 0.0)
+    _var = x_mean * x_mean
+    var = tl.sum(_var, axis=1) * inv_D
+    rstd = 1 / tl.sqrt(var + eps)
+    tl.store(Rstd + rows, rstd, mask=row_mask)
+    rstd = tl.expand_dims(rstd, 1)
+
+    y = x_mean * rstd
+    w = tl.load(W + cols, mask=col_mask).to(tl.float32)
+    b = tl.load(B + cols, mask=col_mask).to(tl.float32)
+    y = y * w[None, :] + b[None, :]
+
+    # Pre-compute sigmoid once to avoid redundant computation
+    sigmoid_u_block = tl.sigmoid(u_block)
+    silu_u_block = u_block * sigmoid_u_block
+
+    if MUL_U_ACTIVATION_TYPE == "silu":
+        y = y * silu_u_block
+    elif MUL_U_ACTIVATION_TYPE == "sigmoid":
+        y = y * sigmoid_u_block
+    else:
+        y = y * u_block
+
+    if CONCAT_U and SILU_U:
+        # pyre-fixme[16]
+        u_block = silu_u_block
+
+    if TRAINING:
+        # Reuse rows (as int64 for pointer arithmetic) and pre-computed mask_2d
+        row_offsets_i64 = rows.to(tl.int64)
+        # Pre-compute loop-invariant values
+        dropout_scale = 1.0 / (1.0 - dropout_ratio)
+        offsets = row_offsets_i64[:, None] * stride_mask + cols[None, :]
+
+        if CONCAT_U or CONCAT_X:
+            # All 2+ mask cases use compressed int8 format - load once
+            compressed = tl.load(RANDOM_MASK + offsets, mask=mask_2d, other=0).to(
+                tl.int32
+            )
+            # Bit 0 is always y_mask
+            y_keep = (compressed & 1) != 0
+
+            if CONCAT_U and CONCAT_X:
+                # 3-mask: (u_mask << 2) | (x_mask << 1) | y_mask
+                x_keep = (compressed & 2) != 0
+                u_keep = (compressed & 4) != 0
+                u_block = tl.where(u_keep, u_block * dropout_scale, 0.0)
+                x_block = tl.where(x_keep, x_block * dropout_scale, 0.0)
+            elif CONCAT_U:
+                # 2-mask: (u_mask << 1) | y_mask
+                u_keep = (compressed & 2) != 0
+                u_block = tl.where(u_keep, u_block * dropout_scale, 0.0)
+            else:  # CONCAT_X
+                # 2-mask: (x_mask << 1) | y_mask
+                x_keep = (compressed & 2) != 0
+                x_block = tl.where(x_keep, x_block * dropout_scale, 0.0)
+
+            y = tl.where(y_keep, y * dropout_scale, 0.0)
+        else:
+            # 1-mask: y_mask at bit 0
+            y_keep = tl.load(RANDOM_MASK + offsets, mask=mask_2d, other=True)
+            y = tl.where(y_keep, y * dropout_scale, 0.0)
+
+    if CONCAT_U and CONCAT_X:
+        Y_block_ptr_u = tl.make_block_ptr(
+            base=Y,
+            shape=(N, 3 * D),
+            strides=(stride_y, 1),
+            offsets=(start_row, 0),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+
+        Y_block_ptr_x = tl.make_block_ptr(
+            base=Y,
+            shape=(N, 3 * D),
+            strides=(stride_y, 1),
+            offsets=(start_row, D),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+
+        Y_block_ptr_y = tl.make_block_ptr(
+            base=Y,
+            shape=(N, 3 * D),
+            strides=(stride_y, 1),
+            offsets=(start_row, 2 * D),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+
+        tl.store(Y_block_ptr_u, u_block.to(Y.dtype.element_ty), boundary_check=(0, 1))
+        tl.store(Y_block_ptr_x, x_block.to(Y.dtype.element_ty), boundary_check=(0, 1))
+        tl.store(Y_block_ptr_y, y.to(Y.dtype.element_ty), boundary_check=(0, 1))
+    elif CONCAT_U:
+        Y_block_ptr_u = tl.make_block_ptr(
+            base=Y,
+            shape=(N, 2 * D),
+            strides=(stride_y, 1),
+            offsets=(start_row, 0),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+        Y_block_ptr_y = tl.make_block_ptr(
+            base=Y,
+            shape=(N, 2 * D),
+            strides=(stride_y, 1),
+            offsets=(start_row, D),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+        tl.store(Y_block_ptr_u, u_block.to(Y.dtype.element_ty), boundary_check=(0, 1))
+        tl.store(Y_block_ptr_y, y.to(Y.dtype.element_ty), boundary_check=(0, 1))
+    elif CONCAT_X:
+        Y_block_ptr_x = tl.make_block_ptr(
+            base=Y,
+            shape=(N, 2 * D),
+            strides=(stride_y, 1),
+            offsets=(start_row, 0),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+        Y_block_ptr_y = tl.make_block_ptr(
+            base=Y,
+            shape=(N, 2 * D),
+            strides=(stride_y, 1),
+            offsets=(start_row, D),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+        tl.store(Y_block_ptr_x, x_block.to(Y.dtype.element_ty), boundary_check=(0, 1))
+        tl.store(Y_block_ptr_y, y.to(Y.dtype.element_ty), boundary_check=(0, 1))
+    else:
+        Y_block_ptr = tl.make_block_ptr(
+            base=Y,
+            shape=(N, D),
+            strides=(stride_y, 1),
+            offsets=(start_row, 0),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+
+        tl.store(Y_block_ptr, y.to(Y.dtype.element_ty), boundary_check=(0, 1))
+
+
+@triton.jit
+def _ln_mul_dropout_fwd(
+    X,
+    U,
+    Y,
+    W,
+    B,
+    Mean,
+    Rstd,
+    D,
+    eps,
+    seed,
+    dropout_ratio,
+    stride_x,
+    stride_u,
+    stride_y,
+    SILU_U: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    TRAINING: tl.constexpr,
+    CONCAT_U: tl.constexpr,
+    CONCAT_X: tl.constexpr,
+    MUL_U_ACTIVATION_TYPE: tl.constexpr,
+    FAST_DROPOUT: tl.constexpr,
+):
+    row = tl.program_id(0)
+    X += row.to(tl.int64) * stride_x
+    U += row.to(tl.int64) * stride_u
+    Y += row.to(tl.int64) * stride_y
+    cols = tl.arange(0, BLOCK_D)
+
+    # Compute mean
+    mean = 0.0
+    x = tl.load(X + cols, mask=cols < D, other=0.0).to(tl.float32)
+    mean = tl.sum(x, axis=0) / D
+
+    # Compute variance
+    _var = tl.zeros([BLOCK_D], dtype=tl.float32)
+    x_mean = tl.where(cols < D, x - mean, 0.0)
+    _var += x_mean * x_mean
+    var = tl.sum(_var, axis=0) / D
+    rstd = 1 / tl.sqrt(var + eps)
+    tl.store(Mean + row, mean)
+    tl.store(Rstd + row, rstd)
+
+    # Normalize and apply linear transformation
+    mask = cols < D
+    y = x_mean * rstd
+    w = tl.load(W + cols, mask=mask).to(tl.float32)
+    b = tl.load(B + cols, mask=mask).to(tl.float32)
+    y = y * w + b
+    u = tl.load(U + cols, mask=cols < D, other=0.0).to(tl.float32)
+    sigmoid_u = tl.sigmoid(u)
+    silu_u = u * sigmoid_u
+
+    if MUL_U_ACTIVATION_TYPE == "silu":
+        y = y * silu_u
+    elif MUL_U_ACTIVATION_TYPE == "sigmoid":
+        y = y * sigmoid_u
+    else:
+        y = y * u
+
+    if CONCAT_U and SILU_U:
+        u = silu_u
+
+    if TRAINING:
+        random_offsets = 3 * row * BLOCK_D + cols
+        if CONCAT_U and CONCAT_X:
+            # apply dropout on u
+            if FAST_DROPOUT:
+                random_u, random_x, random_y = rand3x(seed, random_offsets)
+            else:
+                random_u = tl.rand(seed, random_offsets)
+            u_keep = random_u > dropout_ratio
+            u = tl.where(u_keep, u / (1.0 - dropout_ratio), 0.0)
+            # apply dropout on x
+            if not FAST_DROPOUT:
+                random_x = tl.rand(seed, random_offsets + D)
+            x_keep = random_x > dropout_ratio  # pyre-ignore [61]
+            x = tl.where(x_keep, x / (1.0 - dropout_ratio), 0.0)
+            # apply dropout on y
+            if not FAST_DROPOUT:
+                random_y = tl.rand(seed, random_offsets + 2 * D)
+            y_keep = random_y > dropout_ratio  # pyre-ignore [61]
+            y = tl.where(y_keep, y / (1.0 - dropout_ratio), 0.0)
+        elif CONCAT_U:
+            # apply dropout on u
+            if FAST_DROPOUT:
+                random_u, random_y, _ = rand3x(seed, random_offsets)
+            else:
+                random_u = tl.rand(seed, random_offsets)
+            u_keep = random_u > dropout_ratio
+            u = tl.where(u_keep, u / (1.0 - dropout_ratio), 0.0)
+            # apply dropout on y
+            if not FAST_DROPOUT:
+                random_y = tl.rand(seed, random_offsets + D)
+            y_keep = random_y > dropout_ratio  # pyre-ignore [61]
+            y = tl.where(y_keep, y / (1.0 - dropout_ratio), 0.0)
+        elif CONCAT_X:
+            # apply dropout on x
+            if FAST_DROPOUT:
+                random_x, random_y, _ = rand3x(seed, random_offsets)
+            else:
+                random_x = tl.rand(seed, random_offsets)
+            x_keep = random_x > dropout_ratio
+            x = tl.where(x_keep, x / (1.0 - dropout_ratio), 0.0)
+            # apply dropout on y
+            if not FAST_DROPOUT:
+                random_y = tl.rand(seed, random_offsets + D)
+            y_keep = random_y > dropout_ratio  # pyre-ignore [61]
+            y = tl.where(y_keep, y / (1.0 - dropout_ratio), 0.0)
+        else:
+            random = tl.rand(seed, random_offsets)
+            y_keep = random > dropout_ratio
+            # write-back
+            y = tl.where(y_keep, y / (1.0 - dropout_ratio), 0.0)
+
+    # Write output
+    if CONCAT_U and CONCAT_X:
+        tl.store(Y + cols, u.to(Y.dtype.element_ty), mask=mask)
+        tl.store(Y + D + cols, x.to(Y.dtype.element_ty), mask=mask)
+        tl.store(Y + 2 * D + cols, y.to(Y.dtype.element_ty), mask=mask)
+    elif CONCAT_U:
+        tl.store(Y + cols, u.to(Y.dtype.element_ty), mask=mask)
+        tl.store(Y + D + cols, y.to(Y.dtype.element_ty), mask=mask)
+    elif CONCAT_X:
+        tl.store(Y + cols, x.to(Y.dtype.element_ty), mask=mask)
+        tl.store(Y + D + cols, y.to(Y.dtype.element_ty), mask=mask)
+    else:
+        tl.store(Y + cols, y.to(Y.dtype.element_ty), mask=mask)
+
+
+@triton.jit
+def _ln_mul_dropout_bwd_dx_du_rng(
+    DX,
+    DU,
+    DY,
+    DW,
+    DB,
+    X,
+    U,
+    Y,
+    W,
+    B,
+    Mean,
+    Rstd,
+    RANDOM_MASK,
+    stride_dx,
+    stride_du,
+    stride_dy,
+    stride_x,
+    stride_u,
+    stride_y,
+    stride_mask,
+    D,
+    eps,
+    dropout_ratio,
+    N,
+    SILU_U: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    TRAINING: tl.constexpr,
+    CONCAT_U: tl.constexpr,
+    CONCAT_X: tl.constexpr,
+    MUL_U_ACTIVATION_TYPE: tl.constexpr,
+    COMPUTE_Y: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    tile_num = tl.num_programs(0)
+    rows_per_tile = N // tile_num
+    if pid < N % tile_num:
+        rows_per_tile += 1
+
+    if rows_per_tile == 0:
+        return
+
+    cols = tl.arange(0, BLOCK_D)
+    mask = cols < D
+
+    row = pid
+    # Pre-compute row and pid as int64 once for initial pointer setup
+    row_i64 = row.to(tl.int64)
+    pid_i64 = pid.to(tl.int64)
+    X += row_i64 * stride_x
+    U += row_i64 * stride_u
+    if COMPUTE_Y:
+        Y += row_i64 * stride_y
+    DY += row_i64 * stride_dy
+    DX += row_i64 * stride_dx
+    DU += row_i64 * stride_du
+    DW = DW + pid_i64 * D + cols
+    DB = DB + pid_i64 * D + cols
+
+    # Pre-compute mask pointer offset (all cases use stride_mask for (N, D) shape)
+    RANDOM_MASK += row_i64 * stride_mask
+
+    partial_dw = tl.zeros((BLOCK_D,), dtype=tl.float32)
+    partial_db = tl.zeros((BLOCK_D,), dtype=tl.float32)
+    w = tl.load(W + cols, mask=mask).to(tl.float32)
+    b = tl.load(B + cols, mask=mask).to(tl.float32)
+
+    dropout_scale = 0.0
+    if TRAINING:
+        dropout_scale = 1.0 / (1.0 - dropout_ratio)
+
+    # Pre-compute inv_D to replace divisions with multiplications (optimization)
+    inv_D = 1.0 / D
+
+    # Pre-compute tile_num as int64 to avoid repeated conversion in the loop
+    tile_num_i64 = tile_num.to(tl.int64)
+    for _ in range(0, rows_per_tile):
+        # Load data to SRAM
+        x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
+        if CONCAT_U and CONCAT_X:
+            du = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
+            dx = tl.load(DY + D + cols, mask=mask, other=0).to(tl.float32)
+            dy = tl.load(DY + 2 * D + cols, mask=mask, other=0).to(tl.float32)
+        elif CONCAT_U:
+            du = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
+            dx = tl.zeros([BLOCK_D], dtype=tl.float32)
+            dy = tl.load(DY + D + cols, mask=mask, other=0).to(tl.float32)
+        elif CONCAT_X:
+            du = tl.zeros([BLOCK_D], dtype=tl.float32)
+            dx = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
+            dy = tl.load(DY + D + cols, mask=mask, other=0).to(tl.float32)
+        else:
+            du = tl.zeros([BLOCK_D], dtype=tl.float32)
+            dx = tl.zeros([BLOCK_D], dtype=tl.float32)
+            dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
+        if TRAINING:
+            if CONCAT_U or CONCAT_X:
+                # All 2+ mask cases use compressed int8 format - load once
+                compressed = tl.load(RANDOM_MASK + cols, mask=mask, other=0).to(
+                    tl.int32
+                )
+                dy_keep = (compressed & 1) != 0  # Bit 0 always y_mask
+
+                if CONCAT_U and CONCAT_X:
+                    # Format: (u_mask << 2) | (x_mask << 1) | y_mask
+                    dx_keep = (compressed & 2) != 0
+                    du_keep = (compressed & 4) != 0
+                    du = tl.where(du_keep, du * dropout_scale, 0.0)
+                    dx = tl.where(dx_keep, dx * dropout_scale, 0.0)
+                elif CONCAT_U:
+                    # Format: (u_mask << 1) | y_mask
+                    du_keep = (compressed & 2) != 0
+                    du = tl.where(du_keep, du * dropout_scale, 0.0)
+                else:  # CONCAT_X
+                    # Format: (x_mask << 1) | y_mask
+                    dx_keep = (compressed & 2) != 0
+                    dx = tl.where(dx_keep, dx * dropout_scale, 0.0)
+                dy = tl.where(dy_keep, dy * dropout_scale, 0.0)
+            else:
+                # 1-mask: y_mask at bit 0
+                dy_keep = tl.load(RANDOM_MASK + cols, mask=mask, other=True)
+                dy = tl.where(dy_keep, dy * dropout_scale, 0.0)
+
+        mean = tl.load(Mean + row)
+        rstd = tl.load(Rstd + row)
+
+        # Compute dx
+        xhat = (x - mean) * rstd
+        u = tl.load(U + cols, mask=mask, other=0).to(tl.float32)
+        ln = xhat * w + b
+        du_y = dy * ln
+        mul_u = u
+        sig_u = tl.sigmoid(u)
+
+        # Pre-compute commonly used expressions to avoid redundant computation
+        silu_u = u * sig_u  # silu(u) - used multiple times
+        dsig_u = sig_u * (1.0 - sig_u)  # sigmoid derivative - used multiple times
+        dsilu_u = sig_u + silu_u * (
+            1.0 - sig_u
+        )  # silu derivative - used multiple times
+
+        if MUL_U_ACTIVATION_TYPE == "silu":
+            mul_u = silu_u
+            du_y = dy * ln * dsilu_u
+            dy = dy * silu_u
+        elif MUL_U_ACTIVATION_TYPE == "sigmoid":
+            mul_u = sig_u
+            du_y = dy * ln * dsig_u
+            dy = dy * sig_u
+        else:
+            dy = dy * u
+
+        du_u = du
+        if CONCAT_U and SILU_U:
+            du_u *= dsilu_u
+            u = silu_u
+
+        du = du_y + du_u
+
+        tl.store(DU + cols, du.to(DU.dtype.element_ty), mask=mask)
+
+        wdy = w * dy
+        if COMPUTE_Y:
+            y = ln * mul_u
+            if TRAINING:
+                if CONCAT_U:
+                    u = tl.where(
+                        du_keep,  # pyre-ignore [61]
+                        u * dropout_scale,
+                        0.0,
+                    )
+                if CONCAT_X:
+                    x = tl.where(
+                        dx_keep,  # pyre-ignore [61]
+                        x * dropout_scale,
+                        0.0,
+                    )
+                y = tl.where(
+                    dy_keep,  # pyre-ignore [61]
+                    y * dropout_scale,
+                    0.0,
+                )
+            if CONCAT_U and CONCAT_X:
+                tl.store(Y + cols, u.to(Y.dtype.element_ty), mask=mask)
+                tl.store(Y + D + cols, x.to(Y.dtype.element_ty), mask=mask)
+                tl.store(Y + 2 * D + cols, y.to(Y.dtype.element_ty), mask=mask)
+            elif CONCAT_U:
+                tl.store(Y + cols, u.to(Y.dtype.element_ty), mask=mask)
+                tl.store(Y + D + cols, y.to(Y.dtype.element_ty), mask=mask)
+            elif CONCAT_X:
+                tl.store(Y + cols, x.to(Y.dtype.element_ty), mask=mask)
+                tl.store(Y + D + cols, y.to(Y.dtype.element_ty), mask=mask)
+            else:
+                tl.store(Y + cols, y.to(Y.dtype.element_ty), mask=mask)
+            Y += tile_num_i64 * stride_y
+
+        # Note: xhat and wdy are already 0 outside valid range due to masked loads,
+        # so no additional tl.where masking is needed before reduction
+        c1 = tl.sum(xhat * wdy, axis=0) * inv_D
+        c2 = tl.sum(wdy, axis=0) * inv_D
+        dx += (wdy - (xhat * c1 + c2)) * rstd
+        # Write dx
+        tl.store(DX + cols, dx, mask=mask)
+
+        # Accumulate partial sums for dw/db
+        partial_dw += dy * xhat
+        partial_db += dy
+        X += tile_num_i64 * stride_x
+        U += tile_num_i64 * stride_u
+        DY += tile_num_i64 * stride_dy
+        DX += tile_num_i64 * stride_dx
+        DU += tile_num_i64 * stride_du
+        # Increment mask pointer
+        RANDOM_MASK += tile_num_i64 * stride_mask
+        row += tile_num
+    tl.store(DW, partial_dw, mask=mask)
+    tl.store(DB, partial_db, mask=mask)
+
+
+@triton.jit
+def _ln_mul_dropout_bwd_dx_du(
+    DX,
+    DU,
+    DY,
+    DW,
+    DB,
+    X,
+    U,
+    Y,
+    W,
+    B,
+    Mean,
+    Rstd,
+    stride_dx,
+    stride_du,
+    stride_dy,
+    stride_x,
+    stride_u,
+    stride_y,
+    D,
+    eps,
+    seed,
+    dropout_ratio,
+    N,
+    SILU_U: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    TRAINING: tl.constexpr,
+    CONCAT_U: tl.constexpr,
+    CONCAT_X: tl.constexpr,
+    MUL_U_ACTIVATION_TYPE: tl.constexpr,
+    COMPUTE_Y: tl.constexpr,
+    FAST_DROPOUT: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    tile_num = tl.num_programs(0)
+    rows_per_tile = N // tile_num
+    if pid < N % tile_num:
+        rows_per_tile += 1
+
+    if rows_per_tile == 0:
+        return
+
+    cols = tl.arange(0, BLOCK_D)
+    mask = cols < D
+
+    row = pid
+    X += row.to(tl.int64) * stride_x
+    U += row.to(tl.int64) * stride_u
+    if COMPUTE_Y:
+        Y += row.to(tl.int64) * stride_y
+    DY += row.to(tl.int64) * stride_dy
+    DX += row.to(tl.int64) * stride_dx
+    DU += row.to(tl.int64) * stride_du
+    DW = DW + pid * D + cols
+    DB = DB + pid * D + cols
+
+    partial_dw = tl.zeros((BLOCK_D,), dtype=tl.float32)
+    partial_db = tl.zeros((BLOCK_D,), dtype=tl.float32)
+    w = tl.load(W + cols, mask=mask).to(tl.float32)
+    b = tl.load(B + cols, mask=mask).to(tl.float32)
+    for _idx in range(0, rows_per_tile):
+        # Load data to SRAM
+        x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
+        if CONCAT_U and CONCAT_X:
+            du = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
+            dx = tl.load(DY + D + cols, mask=mask, other=0).to(tl.float32)
+            dy = tl.load(DY + 2 * D + cols, mask=mask, other=0).to(tl.float32)
+        elif CONCAT_U:
+            du = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
+            dx = tl.zeros([BLOCK_D], dtype=tl.float32)
+            dy = tl.load(DY + D + cols, mask=mask, other=0).to(tl.float32)
+        elif CONCAT_X:
+            du = tl.zeros([BLOCK_D], dtype=tl.float32)
+            dx = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
+            dy = tl.load(DY + D + cols, mask=mask, other=0).to(tl.float32)
+        else:
+            du = tl.zeros([BLOCK_D], dtype=tl.float32)
+            dx = tl.zeros([BLOCK_D], dtype=tl.float32)
+            dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
+        if TRAINING:
+            random_offsets = 3 * row * BLOCK_D + cols
+            if CONCAT_U and CONCAT_X:
+                # apply dropout on du
+                if FAST_DROPOUT:
+                    random_du, random_dx, random_dy = rand3x(seed, random_offsets)
+                else:
+                    random_du = tl.rand(seed, random_offsets)
+                du_keep = random_du > dropout_ratio
+                du = tl.where(du_keep, du / (1.0 - dropout_ratio), 0.0)
+                # apply dropout on dx
+                if not FAST_DROPOUT:
+                    random_dx = tl.rand(seed, random_offsets + D)
+                dx_keep = random_dx > dropout_ratio  # pyre-ignore [61]
+                dx = tl.where(dx_keep, dx / (1.0 - dropout_ratio), 0.0)
+                # apply dropout on dy
+                if not FAST_DROPOUT:
+                    random_dy = tl.rand(seed, random_offsets + 2 * D)
+                dy_keep = random_dy > dropout_ratio  # pyre-ignore [61]
+                dy = tl.where(dy_keep, dy / (1.0 - dropout_ratio), 0.0)
+            elif CONCAT_U:
+                # apply dropout on du
+                if FAST_DROPOUT:
+                    random_du, _, random_dy = rand3x(seed, random_offsets)
+                else:
+                    random_du = tl.rand(seed, random_offsets)
+                du_keep = random_du > dropout_ratio
+                du = tl.where(du_keep, du / (1.0 - dropout_ratio), 0.0)
+                # apply dropout on dy
+                if not FAST_DROPOUT:
+                    random_dy = tl.rand(seed, random_offsets + D)
+                dy_keep = random_dy > dropout_ratio  # pyre-ignore [61]
+                dy = tl.where(dy_keep, dy / (1.0 - dropout_ratio), 0.0)
+            elif CONCAT_X:
+                # apply dropout on dx
+                if FAST_DROPOUT:
+                    _, random_dx, random_dy = rand3x(seed, random_offsets)
+                else:
+                    random_dx = tl.rand(seed, random_offsets)
+                dx_keep = random_dx > dropout_ratio  # pyre-ignore [61]
+                dx = tl.where(dx_keep, dx / (1.0 - dropout_ratio), 0.0)
+                # apply dropout on dy
+                if not FAST_DROPOUT:
+                    random_dy = tl.rand(seed, random_offsets + D)
+                dy_keep = random_dy > dropout_ratio  # pyre-ignore [61]
+                dy = tl.where(dy_keep, dy / (1.0 - dropout_ratio), 0.0)
+            else:
+                random = tl.rand(seed, random_offsets)
+                dy_keep = random > dropout_ratio
+                # write-back
+                dy = tl.where(dy_keep, dy / (1.0 - dropout_ratio), 0.0)
+
+        mean = tl.load(Mean + row)
+        rstd = tl.load(Rstd + row)
+
+        # Compute dx
+        xhat = (x - mean) * rstd
+        u = tl.load(U + cols, mask=mask, other=0).to(tl.float32)
+        ln = xhat * w + b
+        du_y = dy * ln
+        mul_u = u
+        sig_u = tl.sigmoid(u)
+
+        if MUL_U_ACTIVATION_TYPE == "silu":
+            mul_u = u * sig_u
+            du_y = dy * ln * (sig_u + u * sig_u * (1.0 - sig_u))
+            dy = dy * u * sig_u
+        elif MUL_U_ACTIVATION_TYPE == "sigmoid":
+            mul_u = sig_u
+            du_y = dy * ln * (sig_u * (1.0 - sig_u))
+            dy = dy * sig_u
+        else:
+            dy = dy * u
+
+        du_u = du
+        if CONCAT_U:
+            if SILU_U:
+                du_u *= sig_u + u * sig_u * (1.0 - sig_u)
+                u = u * sig_u
+
+        du = du_y + du_u
+
+        tl.store(DU + cols, du.to(DU.dtype.element_ty), mask=mask)
+        wdy = w * dy
+        if COMPUTE_Y:
+            y = ln * mul_u
+            if TRAINING:
+                if CONCAT_U:
+                    u = tl.where(
+                        du_keep,  # pyre-ignore [61]
+                        u / (1.0 - dropout_ratio),
+                        0.0,
+                    )
+                if CONCAT_X:
+                    x = tl.where(
+                        dx_keep,  # pyre-ignore [61]
+                        x / (1.0 - dropout_ratio),
+                        0.0,
+                    )
+                y = tl.where(
+                    dy_keep,  # pyre-ignore [61]
+                    y / (1.0 - dropout_ratio),
+                    0.0,
+                )
+            if CONCAT_U and CONCAT_X:
+                tl.store(Y + cols, u.to(Y.dtype.element_ty), mask=mask)
+                tl.store(Y + D + cols, x.to(Y.dtype.element_ty), mask=mask)
+                tl.store(Y + 2 * D + cols, y.to(Y.dtype.element_ty), mask=mask)
+            elif CONCAT_U:
+                tl.store(Y + cols, u.to(Y.dtype.element_ty), mask=mask)
+                tl.store(Y + D + cols, y.to(Y.dtype.element_ty), mask=mask)
+            elif CONCAT_X:
+                tl.store(Y + cols, x.to(Y.dtype.element_ty), mask=mask)
+                tl.store(Y + D + cols, y.to(Y.dtype.element_ty), mask=mask)
+            else:
+                tl.store(Y + cols, y.to(Y.dtype.element_ty), mask=mask)
+            Y += tile_num.to(tl.int64) * stride_y
+
+        xhat = tl.where(mask, xhat, 0.0)
+        wdy = tl.where(mask, wdy, 0.0)
+        c1 = tl.sum(xhat * wdy, axis=0) / D
+        c2 = tl.sum(wdy, axis=0) / D
+        dx += (wdy - (xhat * c1 + c2)) * rstd
+        # Write dx
+        tl.store(DX + cols, dx, mask=mask)
+
+        # Accumulate partial sums for dw/db
+        partial_dw += dy * xhat
+        partial_db += dy
+        X += tile_num.to(tl.int64) * stride_x
+        U += tile_num.to(tl.int64) * stride_u
+        DY += tile_num.to(tl.int64) * stride_dy
+        DX += tile_num.to(tl.int64) * stride_dx
+        DU += tile_num.to(tl.int64) * stride_du
+        row += tile_num
+    tl.store(DW, partial_dw, mask=mask)
+    tl.store(DB, partial_db, mask=mask)
+
+
+def _get_bwd_dwdb_configs() -> List[triton.Config]:
+    configs = []
+    for BLOCK_N in [32, 64, 128, 256]:
+        for num_warps in [8, 16] + ([] if torch.ops.hip else [32]):
+            configs.append(
+                triton.Config(
+                    {"BLOCK_N": BLOCK_N},
+                    num_warps=num_warps,
+                )
+            )
+    return configs
+
+
+@triton_autotune(
+    configs=_get_bwd_dwdb_configs(),
+    key=["D"],
+)
+@triton.jit
+def _ln_mul_dropout_bwd_dwdb(
+    DW,
+    DB,
+    FINAL_DW,
+    FINAL_DB,
+    N,
+    D,
+    BLOCK_N: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+):
+    pid = tl.program_id(0).to(tl.int64)
+    cols = pid * BLOCK_D + tl.arange(0, BLOCK_D)
+    dw = tl.zeros((BLOCK_N, BLOCK_D), dtype=tl.float32)
+    db = tl.zeros((BLOCK_N, BLOCK_D), dtype=tl.float32)
+
+    for i in range(0, N, BLOCK_N):
+        rows = i + tl.arange(0, BLOCK_N)
+        # pyre-fixme[16]: `int` has no attribute `__getitem__`.
+        mask = (rows[:, None] < N) & (cols[None, :] < D)
+        offs = rows[:, None] * D + cols[None, :]
+        dw += tl.load(DW + offs, mask=mask, other=0.0)
+        db += tl.load(DB + offs, mask=mask, other=0.0)
+
+    sum_dw = tl.sum(dw, axis=0)
+    sum_db = tl.sum(db, axis=0)
+    tl.store(FINAL_DW + cols, sum_dw.to(FINAL_DW.dtype.element_ty), mask=cols < D)
+    tl.store(FINAL_DB + cols, sum_db.to(FINAL_DB.dtype.element_ty), mask=cols < D)
+
+
+def _create_dropout_mask(
+    N: int,
+    D: int,
+    BLOCK_D: int,
+    concat_u: bool,
+    concat_x: bool,
+    dropout_ratio: float,
+    seed: int,
+    device: torch.device,
+) -> torch.Tensor:
+    """Create dropout mask tensor for layer norm mul dropout.
+
+    Args:
+        N: Number of rows
+        D: Feature dimension
+        BLOCK_D: Block size for D dimension
+        concat_u: Whether to concatenate u
+        concat_x: Whether to concatenate x
+        dropout_ratio: Dropout ratio
+        seed: Random seed
+        device: Device to create tensor on
+
+    Returns:
+        random_mask: (N, D) int8 tensor. Mask j at bit j.
+
+    Bit layout: y = val & 1, x = val & 2, u = val & 4.
+    """
+    num_masks = 1 + int(concat_u) + int(concat_x)
+    # Torch uses 1 byte for bool internally, same as int8, so always use int8.
+    random_mask = torch.empty([N, D], dtype=torch.int8, device=device)
+    _generate_random_mask[(triton.cdiv(N, 4),)](
+        random_mask,
+        N,
+        dropout_ratio,
+        seed,
+        D,  # pyre-ignore[6]
+        random_mask.stride(0),  # pyre-ignore[6]
+        BLOCK_D,  # pyre-fixme[6]: Triton constexpr param
+        num_masks,  # pyre-ignore[6]: NUM_MASKS constexpr
+    )
+    return random_mask
+
+
+@maybe_register_custom_op(
+    "generative_recommenders::_triton_layer_norm_mul_dropout_fwd_impl", mutates_args=()
+)
+def _triton_layer_norm_mul_dropout_fwd_impl(
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    dropout_ratio: float,
+    training: bool,
+    silu_u: bool,
+    concat_u: bool,
+    concat_x: bool,
+    mul_u_activation_type: str,
+    seed: int,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Internal implementation that returns only tensors for custom_op compatibility.
+
+    Returns (y, mean, rstd, random_mask) where random_mask is empty when not used.
+    """
+    N, D = x.shape
+
+    if concat_u and concat_x:
+        y = torch.empty((N, 3 * D), dtype=x.dtype, device=x.device)
+    elif concat_u:
+        y = torch.empty((N, 2 * D), dtype=x.dtype, device=x.device)
+    elif concat_x:
+        y = torch.empty((N, 2 * D), dtype=x.dtype, device=x.device)
+    else:
+        y = torch.empty_like(x)
+    mean = torch.empty((N,), dtype=torch.float32, device=x.device)
+    rstd = torch.empty((N,), dtype=torch.float32, device=x.device)
+    if N == 0:
+        return y, mean, rstd, torch.empty(0, dtype=x.dtype, device=x.device)
+    # Less than 64KB per feature: enqueue fused kernel
+    MAX_FUSED_SIZE = 65536 // x.element_size()
+    BLOCK_D: int = min(MAX_FUSED_SIZE, triton.next_power_of_2(D))
+    if D > BLOCK_D:
+        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
+
+    num_warps: int = min(max(BLOCK_D // 256, 1), 8)
+    random_mask: torch.Tensor = torch.empty(0, dtype=x.dtype, device=x.device)
+    # Separating RNG from the ln_mul_dropout kernel lets us batch multiple rows per
+    # program (autotuned _ln_mul_dropout_fwd_rng) and reuse the precomputed mask in the
+    # backward, instead of launching one program per row with fused RNG. This is a large
+    # win on Blackwell (sm_100) and AMD MI350 (gfx950); other GPUs keep the fused path.
+    # Extended to support concat_u + concat_x for mask reuse optimization
+    if (
+        not FUSE_OUTPUT_LN_RNG_BLACKWELL
+        and use_separated_rng_ln_mul_dropout()
+        and training
+    ):
+        random_mask = _create_dropout_mask(
+            N=N,
+            D=D,
+            BLOCK_D=BLOCK_D,
+            concat_u=concat_u,
+            concat_x=concat_x,
+            dropout_ratio=dropout_ratio,
+            seed=seed,
+            device=x.device,
+        )
+
+        def grid(META):
+            return (triton.cdiv(N, META["BLOCK_N"]),)
+
+        # pyre-ignore[28]
+        _ln_mul_dropout_fwd_rng[grid](
+            x,
+            u,
+            y,
+            weight,
+            bias,
+            mean,
+            rstd,
+            random_mask,
+            N,
+            D,
+            eps,
+            dropout_ratio,
+            x.stride(0),
+            u.stride(0),
+            y.stride(0),
+            random_mask.stride(0),
+            SILU_U=silu_u,
+            BLOCK_D=BLOCK_D,
+            TRAINING=training,
+            CONCAT_U=concat_u,
+            CONCAT_X=concat_x,
+            MUL_U_ACTIVATION_TYPE=mul_u_activation_type,
+        )
+
+    else:
+        # Default path: fused RNG generation
+        # Mask cannot be saved with fused RNG - it's generated inline in the kernel
+        # pyre-ignore[28]
+        _ln_mul_dropout_fwd[(N,)](
+            x,
+            u,
+            y,
+            weight,
+            bias,
+            mean,
+            rstd,
+            D,
+            eps,
+            seed,
+            dropout_ratio,
+            x.stride(0),
+            u.stride(0),
+            y.stride(0),
+            SILU_U=silu_u,
+            BLOCK_D=BLOCK_D,
+            TRAINING=training,
+            CONCAT_U=concat_u,
+            CONCAT_X=concat_x,
+            MUL_U_ACTIVATION_TYPE=mul_u_activation_type,
+            FAST_DROPOUT=COMPUTE_OUTPUT_LN_FAST_DROPOUT,
+            num_warps=num_warps,
+        )
+    return y, mean, rstd, random_mask
+
+
+@_triton_layer_norm_mul_dropout_fwd_impl.register_fake
+def _triton_layer_norm_mul_dropout_fwd_impl_fake(
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    dropout_ratio: float,
+    training: bool,
+    silu_u: bool,
+    concat_u: bool,
+    concat_x: bool,
+    mul_u_activation_type: str,
+    seed: int,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Fake implementation for FakeTensor tracing."""
+    N, D = x.shape
+    if concat_u and concat_x:
+        y = torch.empty((N, 3 * D), dtype=x.dtype, device=x.device)
+    elif concat_u:
+        y = torch.empty((N, 2 * D), dtype=x.dtype, device=x.device)
+    elif concat_x:
+        y = torch.empty((N, 2 * D), dtype=x.dtype, device=x.device)
+    else:
+        y = torch.empty_like(x)
+    mean = torch.empty((N,), dtype=torch.float32, device=x.device)
+    rstd = torch.empty((N,), dtype=torch.float32, device=x.device)
+    random_mask = torch.empty(0, dtype=x.dtype, device=x.device)
+    return y, mean, rstd, random_mask
+
+
+def triton_layer_norm_mul_dropout_fwd(
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    dropout_ratio: float,
+    training: bool,
+    silu_u: bool = False,
+    concat_u: bool = False,
+    concat_x: bool = False,
+    mul_u_activation_type: str = "none",
+    seed: Optional[int] = None,
+) -> Tuple[
+    torch.Tensor, torch.Tensor, torch.Tensor, int, int, int, Optional[torch.Tensor]
+]:  # y, mean, rstd, BLOCK_D, num_warps, seed, random_mask
+    """Forward pass for layer norm + mul + dropout.
+
+    Args:
+        x: Input tensor of shape (N, D)
+        u: Second input tensor of shape (N, D)
+        weight: Layer norm weight of shape (D,)
+        bias: Layer norm bias of shape (D,)
+        eps: Layer norm epsilon
+        dropout_ratio: Dropout probability
+        training: Whether in training mode
+        silu_u: Whether to apply SiLU to u before concatenation
+        concat_u: Whether to concatenate u to output
+        concat_x: Whether to concatenate x to output
+        mul_u_activation_type: Activation type for u multiplication
+        seed: Random seed for dropout
+
+    Returns:
+        Tuple of (y, mean, rstd, BLOCK_D, num_warps, seed, random_mask)
+        - random_mask is None when using fused RNG path (non-SM100+)
+        - random_mask is always returned when using separate RNG path (SM100+)
+          for reuse in backward pass (avoids redundant mask generation)
+    """
+    assert x.dim() == 2
+    x = switch_to_contiguous_if_needed(x)
+    N, D = x.shape
+    assert weight.dim() == 1
+    assert bias.dim() == 1
+    assert weight.numel() == D
+    assert bias.numel() == D
+
+    if N == 0:
+        D = x.shape[1]
+        if concat_u and concat_x:
+            y = torch.empty((0, 3 * D), dtype=x.dtype, device=x.device)
+        elif concat_u or concat_x:
+            y = torch.empty((0, 2 * D), dtype=x.dtype, device=x.device)
+        else:
+            y = torch.empty_like(x)
+        return (
+            y,
+            torch.empty((N,), dtype=torch.float32, device=x.device),
+            torch.empty((N,), dtype=torch.float32, device=x.device),
+            0,
+            0,
+            0,
+            None,
+        )
+
+    # Less than 64KB per feature: enqueue fused kernel
+    MAX_FUSED_SIZE = 65536 // x.element_size()
+    BLOCK_D: int = min(MAX_FUSED_SIZE, triton.next_power_of_2(D))
+    if D > BLOCK_D:
+        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
+
+    if seed is None and training:
+        # pyre-ignore[9]: torch.randint with dtype=int64 always returns int
+        seed = torch.randint(low=0, high=2**62, size=(1,), dtype=torch.int64).item()
+    num_warps: int = min(max(BLOCK_D // 256, 1), 8)
+
+    # Call internal implementation
+    y, mean, rstd, random_mask_tensor = _triton_layer_norm_mul_dropout_fwd_impl(
+        x,
+        u,
+        weight,
+        bias,
+        eps,
+        dropout_ratio,
+        training,
+        silu_u,
+        concat_u,
+        concat_x,
+        mul_u_activation_type,
+        seed if seed is not None else 0,
+    )
+
+    # Convert empty tensor back to None
+    random_mask: Optional[torch.Tensor] = (
+        random_mask_tensor if random_mask_tensor.numel() > 0 else None
+    )
+
+    return y, mean, rstd, BLOCK_D, num_warps, seed, random_mask  # pyre-ignore[7]
+
+
+@maybe_register_custom_op(
+    "generative_recommenders::_triton_layer_norm_mul_dropout_bwd_impl", mutates_args=()
+)
+def _triton_layer_norm_mul_dropout_bwd_impl(
+    dy: torch.Tensor,
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    mean: torch.Tensor,
+    rstd: torch.Tensor,
+    BLOCK_D: int,
+    num_warps: int,
+    eps: float,
+    training: bool,
+    dropout_ratio: float,
+    seed: int,
+    silu_u: bool,
+    concat_u: bool,
+    concat_x: bool,
+    mul_u_activation_type: str,
+    compute_y: bool,
+    random_mask: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Internal implementation that returns only tensors for custom_op compatibility.
+
+    When compute_y is False, y is returned as an empty tensor.
+    random_mask with numel() == 0 means no mask (fused RNG path).
+    """
+    N, D = x.shape
+    if compute_y:
+        if concat_u and concat_x:
+            y = torch.empty((N, 3 * D), dtype=x.dtype, device=x.device)
+        elif concat_u:
+            y = torch.empty((N, 2 * D), dtype=x.dtype, device=x.device)
+        elif concat_x:
+            y = torch.empty((N, 2 * D), dtype=x.dtype, device=x.device)
+        else:
+            y = torch.empty_like(x)
+    else:
+        y = torch.empty(0, dtype=x.dtype, device=x.device)
+
+    if N == 0:
+        return (
+            torch.zeros_like(x),
+            torch.zeros_like(u),
+            torch.zeros((D,), dtype=weight.dtype, device=x.device),
+            torch.zeros((D,), dtype=weight.dtype, device=x.device),
+            y,
+        )
+    dx = torch.empty_like(x)
+    du = torch.empty_like(u)
+    sms = torch.cuda.get_device_properties(x.device).multi_processor_count
+    tile_num = max(1, min(sms * 64, N // 4))
+    _dweight = torch.empty((tile_num, D), dtype=torch.float32, device=x.device)
+    _dbias = torch.empty((tile_num, D), dtype=torch.float32, device=x.device)
+    dweight = torch.empty((D,), dtype=weight.dtype, device=x.device)
+    dbias = torch.empty((D,), dtype=weight.dtype, device=x.device)
+
+    # Use separated RNG when random_mask is provided (from forward pass on SM100+ path)
+    has_random_mask = random_mask.numel() > 0
+    if has_random_mask:
+        # pyre-ignore[28]
+        _ln_mul_dropout_bwd_dx_du_rng[(tile_num,)](
+            dx,
+            du,
+            dy,
+            _dweight,
+            _dbias,
+            x,
+            u,
+            y if compute_y else None,
+            weight,
+            bias,
+            mean,
+            rstd,
+            random_mask,
+            dx.stride(0),
+            du.stride(0),
+            dy.stride(0),
+            x.stride(0),
+            u.stride(0),
+            y.stride(0) if compute_y else 0,
+            random_mask.stride(0),
+            D,
+            eps,
+            dropout_ratio,
+            N=N,
+            SILU_U=silu_u,
+            BLOCK_D=BLOCK_D,
+            TRAINING=training,
+            CONCAT_U=concat_u,
+            CONCAT_X=concat_x,
+            MUL_U_ACTIVATION_TYPE=mul_u_activation_type,
+            COMPUTE_Y=compute_y,
+            num_warps=num_warps,
+        )
+
+    else:
+        # pyre-ignore[28]
+        _ln_mul_dropout_bwd_dx_du[(tile_num,)](
+            dx,
+            du,
+            dy,
+            _dweight,
+            _dbias,
+            x,
+            u,
+            y if compute_y else None,
+            weight,
+            bias,
+            mean,
+            rstd,
+            dx.stride(0),
+            du.stride(0),
+            dy.stride(0),
+            x.stride(0),
+            u.stride(0),
+            y.stride(0) if compute_y else 0,
+            D,
+            eps,
+            seed,
+            dropout_ratio,
+            N=N,
+            SILU_U=silu_u,
+            BLOCK_D=BLOCK_D,
+            TRAINING=training,
+            CONCAT_U=concat_u,
+            CONCAT_X=concat_x,
+            MUL_U_ACTIVATION_TYPE=mul_u_activation_type,
+            COMPUTE_Y=compute_y,
+            FAST_DROPOUT=COMPUTE_OUTPUT_LN_FAST_DROPOUT,
+            num_warps=num_warps,
+        )
+
+    def grid(META):
+        return (triton.cdiv(D, META["BLOCK_D"]),)
+
+    blocks = triton.next_power_of_2(sms * 4)
+    BLOCK_D_bwd = triton.next_power_of_2(triton.cdiv(D, blocks))
+    BLOCK_D_bwd = min(max(BLOCK_D_bwd, 4), 128)
+    _ln_mul_dropout_bwd_dwdb[grid](
+        _dweight,
+        _dbias,
+        dweight,
+        dbias,
+        tile_num,
+        D,
+        BLOCK_D=BLOCK_D_bwd,
+    )
+    return dx, du, dweight, dbias, y
+
+
+@_triton_layer_norm_mul_dropout_bwd_impl.register_fake
+def _triton_layer_norm_mul_dropout_bwd_impl_fake(
+    dy: torch.Tensor,
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    mean: torch.Tensor,
+    rstd: torch.Tensor,
+    BLOCK_D: int,
+    num_warps: int,
+    eps: float,
+    training: bool,
+    dropout_ratio: float,
+    seed: int,
+    silu_u: bool,
+    concat_u: bool,
+    concat_x: bool,
+    mul_u_activation_type: str,
+    compute_y: bool,
+    random_mask: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Fake implementation for FakeTensor tracing."""
+    N, D = x.shape
+    dx = torch.empty_like(x)
+    du = torch.empty_like(u)
+    dweight = torch.empty((D,), dtype=weight.dtype, device=x.device)
+    dbias = torch.empty((D,), dtype=weight.dtype, device=x.device)
+    if compute_y:
+        if concat_u and concat_x:
+            y = torch.empty((N, 3 * D), dtype=x.dtype, device=x.device)
+        elif concat_u:
+            y = torch.empty((N, 2 * D), dtype=x.dtype, device=x.device)
+        elif concat_x:
+            y = torch.empty((N, 2 * D), dtype=x.dtype, device=x.device)
+        else:
+            y = torch.empty_like(x)
+    else:
+        y = torch.empty(0, dtype=x.dtype, device=x.device)
+    return dx, du, dweight, dbias, y
+
+
+def triton_layer_norm_mul_dropout_bwd(
+    dy: torch.Tensor,
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    mean: torch.Tensor,
+    rstd: torch.Tensor,
+    BLOCK_D: int,
+    num_warps: int,
+    eps: float,
+    training: bool,
+    dropout_ratio: float,
+    seed: Optional[int] = None,
+    silu_u: bool = False,
+    concat_u: bool = False,
+    concat_x: bool = False,
+    mul_u_activation_type: str = "none",
+    compute_y: bool = False,
+    random_mask: Optional[torch.Tensor] = None,
+) -> Tuple[
+    torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]
+]:
+    N, D = x.shape
+
+    # Use empty tensor as sentinel for no random_mask
+    random_mask_tensor = (
+        random_mask
+        if random_mask is not None
+        else torch.empty(0, dtype=x.dtype, device=x.device)
+    )
+
+    dx, du, dweight, dbias, y_tensor = _triton_layer_norm_mul_dropout_bwd_impl(
+        dy,
+        x,
+        u,
+        weight,
+        bias,
+        mean,
+        rstd,
+        BLOCK_D,
+        num_warps,
+        eps,
+        training,
+        dropout_ratio,
+        seed if seed is not None else 0,
+        silu_u,
+        concat_u,
+        concat_x,
+        mul_u_activation_type,
+        compute_y,
+        random_mask_tensor,
+    )
+
+    # Convert empty tensor back to None
+    y: Optional[torch.Tensor] = y_tensor if compute_y else None
+    return dx, du, dweight, dbias, y
+
+
+class LayerNormMulDropoutFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        x: torch.Tensor,
+        u: torch.Tensor,
+        weight: torch.Tensor,
+        bias: torch.Tensor,
+        eps: float,
+        dropout_ratio: float,
+        training: bool,
+        silu_u: bool = False,
+        concat_ux: bool = False,
+        seed: Optional[int] = None,
+    ) -> torch.Tensor:
+        if dropout_ratio == 0.0:
+            # skip dropout computation if dropout ratio is 0
+            training = False
+        # skipping supporting concat_u and concat_x separately here because seems like this code path is only used in v1 of hstu_linear
+        concat_u, concat_x = concat_ux, concat_ux
+
+        # Call forward function which generates and returns random_mask
+        # On SM100+ path, random_mask is always returned for backward reuse
+        # On fused RNG path, random_mask is None (mask generated inline in kernel)
+        y, mean, rstd, BLOCK_D, num_warps, returned_seed, random_mask = (
+            triton_layer_norm_mul_dropout_fwd(
+                x=x,
+                u=u,
+                weight=weight,
+                bias=bias,
+                eps=eps,
+                dropout_ratio=dropout_ratio,
+                training=training,
+                silu_u=silu_u,
+                concat_u=concat_u,
+                concat_x=concat_x,
+                seed=seed,
+            )
+        )
+
+        # Save tensors for backward pass
+        # When random_mask is generated (SM100+ path), always save it for reuse
+        # in backward pass. This avoids redundant _generate_random_mask execution.
+        if random_mask is not None:
+            ctx.save_for_backward(x, u, weight, bias, mean, rstd, random_mask)
+            ctx.has_random_mask = True
+        else:
+            ctx.save_for_backward(x, u, weight, bias, mean, rstd)
+            ctx.has_random_mask = False
+
+        ctx.BLOCK_D = BLOCK_D
+        ctx.num_warps = num_warps
+        ctx.eps = eps
+        ctx.seed = returned_seed
+        ctx.training = training
+        ctx.concat_ux = concat_ux
+        ctx.silu_u = silu_u
+        ctx.dropout_ratio = dropout_ratio
+        return y
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, dy: torch.Tensor
+    ) -> Tuple[
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+    ]:
+        # Extract saved tensors including optional random mask
+        if ctx.has_random_mask:
+            x, u, weight, bias, mean, rstd, random_mask = ctx.saved_tensors
+        else:
+            x, u, weight, bias, mean, rstd = ctx.saved_tensors
+            random_mask = None
+
+        dx, du, dweight, dbias, _ = triton_layer_norm_mul_dropout_bwd(
+            dy=dy,
+            x=x,
+            u=u,
+            weight=weight,
+            bias=bias,
+            mean=mean,
+            rstd=rstd,
+            BLOCK_D=ctx.BLOCK_D,
+            num_warps=ctx.num_warps,
+            eps=ctx.eps,
+            training=ctx.training,
+            dropout_ratio=ctx.dropout_ratio,
+            seed=ctx.seed,
+            silu_u=ctx.silu_u,
+            concat_u=ctx.concat_ux,
+            concat_x=ctx.concat_ux,
+            compute_y=False,
+            random_mask=random_mask,  # Pass saved mask to backward
+        )
+        return dx, du, dweight, dbias, None, None, None, None, None, None
+
+
+@triton.jit
+def _group_norm_mul_dropout_fwd(
+    X,
+    U,
+    Y,
+    W,
+    B,
+    Mean,
+    Rstd,
+    D,
+    Heads,
+    eps,
+    seed,
+    dropout_ratio,
+    stride_x,
+    stride_u,
+    stride_y,
+    SILU_U: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_H: tl.constexpr,
+    TRAINING: tl.constexpr,
+    CONCAT_UX: tl.constexpr,
+):
+    row = tl.program_id(0)
+    X += row.to(tl.int64) * stride_x
+    U += row.to(tl.int64) * stride_u
+    Y += row.to(tl.int64) * stride_y
+    cols = tl.arange(0, BLOCK_D)
+    heads = tl.arange(0, BLOCK_H)
+    offsets = heads[:, None] * D + cols[None, :]
+    mask_h = heads < Heads
+    mask_c = cols < D
+    mask = mask_c[None, :] & mask_h[:, None]
+
+    # Compute mean
+    mean = 0.0
+    x = tl.load(X + offsets, mask=mask, other=0.0).to(tl.float32)
+    mean = tl.sum(x, axis=1) / D
+    mean = tl.ravel(mean)
+
+    # Compute variance
+    _var = tl.zeros([BLOCK_H, BLOCK_D], dtype=tl.float32)
+    x_mean = tl.where(mask, x - mean[:, None], 0.0)
+    _var += x_mean * x_mean
+    var = tl.sum(_var, axis=1) / D
+    var = tl.ravel(var)
+    rstd = 1 / tl.sqrt(var + eps)
+    tl.store(Mean + row * Heads + heads, mean, mask=mask_h)
+    tl.store(Rstd + row * Heads + heads, rstd, mask=mask_h)
+
+    # Normalize and apply linear transformation
+    y = x_mean * rstd[:, None]  # pyre-ignore [16]
+    w = tl.load(W + heads, mask=mask_h).to(tl.float32)
+    b = tl.load(B + heads, mask=mask_h).to(tl.float32)
+    y = y * w[:, None] + b[:, None]
+    u = tl.load(U + offsets, mask=mask, other=0.0).to(tl.float32)
+    if SILU_U:
+        u = u * tl.sigmoid(u)
+    y = y * u
+
+    if TRAINING:
+        if CONCAT_UX:
+            random_offsets = row * 3 * D * Heads + offsets
+            # apply dropout on u
+            random_u = tl.rand(seed, random_offsets)
+            u_keep = random_u > dropout_ratio
+            u = tl.where(u_keep, u / (1.0 - dropout_ratio), 0.0)
+            # apply dropout on x
+            random_x = tl.rand(seed, random_offsets + Heads * D)
+            x_keep = random_x > dropout_ratio
+            x = tl.where(x_keep, x / (1.0 - dropout_ratio), 0.0)
+            # apply dropout on y
+            random_y = tl.rand(seed, random_offsets + 2 * Heads * D)
+            y_keep = random_y > dropout_ratio
+            y = tl.where(y_keep, y / (1.0 - dropout_ratio), 0.0)
+        else:
+            random_offsets = row * D * Heads + offsets
+            random = tl.rand(seed, random_offsets)
+            y_keep = random > dropout_ratio
+            # write-back
+            y = tl.where(y_keep, y / (1.0 - dropout_ratio), 0.0)
+
+    # Write output
+    if CONCAT_UX:
+        tl.store(Y + offsets, u.to(Y.dtype.element_ty), mask=mask)
+        tl.store(Y + Heads * D + offsets, x.to(Y.dtype.element_ty), mask=mask)
+        tl.store(Y + 2 * Heads * D + offsets, y.to(Y.dtype.element_ty), mask=mask)
+    else:
+        tl.store(Y + offsets, y.to(Y.dtype.element_ty), mask=mask)
+
+
+@triton.jit
+def _group_norm_mul_dropout_bwd_dx_du(
+    DX,
+    DU,
+    DY,
+    DW,
+    DB,
+    X,
+    U,
+    Y,
+    W,
+    B,
+    Mean,
+    Rstd,
+    stride_dx,
+    stride_du,
+    stride_dy,
+    stride_x,
+    stride_u,
+    stride_y,
+    D,
+    Heads,
+    eps,
+    seed,
+    dropout_ratio,
+    SILU_U: tl.constexpr,
+    GROUP_N: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_H: tl.constexpr,
+    TRAINING: tl.constexpr,
+    CONCAT_UX: tl.constexpr,
+    COMPUTE_Y: tl.constexpr,
+):
+    row = tl.program_id(0)
+    cols = tl.arange(0, BLOCK_D)
+    off_heads = tl.arange(0, BLOCK_H)
+    mask_c = cols < D
+    mask_h = off_heads < Heads
+    mask = mask_c[None, :] & mask_h[:, None]
+    X += row.to(tl.int64) * stride_x
+    U += row.to(tl.int64) * stride_u
+    DY += row.to(tl.int64) * stride_dy
+    DX += row.to(tl.int64) * stride_dx
+    DU += row.to(tl.int64) * stride_du
+    offsets = off_heads[:, None] * D + cols[None, :]
+
+    # Load data to SRAM
+    x = tl.load(X + offsets, mask=mask, other=0).to(tl.float32)
+    if CONCAT_UX:
+        du = tl.load(DY + offsets, mask=mask, other=0).to(tl.float32)
+        dx = tl.load(DY + Heads * D + offsets, mask=mask, other=0).to(tl.float32)
+        dy = tl.load(DY + 2 * Heads * D + offsets, mask=mask, other=0).to(tl.float32)
+    else:
+        du = tl.zeros([BLOCK_H, BLOCK_D], dtype=tl.float32)
+        dx = tl.zeros([BLOCK_H, BLOCK_D], dtype=tl.float32)
+        dy = tl.load(DY + offsets, mask=mask, other=0).to(tl.float32)
+    if TRAINING:
+        if CONCAT_UX:
+            random_offsets = row * 3 * D * Heads + offsets
+            # apply dropout on du
+            random_du = tl.rand(seed, random_offsets)
+            du_keep = random_du > dropout_ratio
+            du = tl.where(du_keep, du / (1.0 - dropout_ratio), 0.0)
+            # apply dropout on dx
+            random_dx = tl.rand(seed, random_offsets + Heads * D)
+            dx_keep = random_dx > dropout_ratio
+            dx = tl.where(dx_keep, dx / (1.0 - dropout_ratio), 0.0)
+            # apply dropout on dy
+            random_dy = tl.rand(seed, random_offsets + 2 * Heads * D)
+            dy_keep = random_dy > dropout_ratio
+            dy = tl.where(dy_keep, dy / (1.0 - dropout_ratio), 0.0)
+        else:
+            random_offsets = row * D * Heads + offsets
+            random = tl.rand(seed, random_offsets)
+            dy_keep = random > dropout_ratio
+            # write-back
+            dy = tl.where(dy_keep, dy / (1.0 - dropout_ratio), 0.0)
+
+    mean = tl.load(Mean + row * Heads + off_heads)
+    rstd = tl.load(Rstd + row * Heads + off_heads)
+
+    # Compute dx
+    xhat = (x - mean[:, None]) * rstd[:, None]
+    w = tl.load(W + off_heads, mask=mask_h).to(tl.float32)
+    b = tl.load(B + off_heads, mask=mask_h).to(tl.float32)
+    u = tl.load(U + offsets, mask=mask, other=0).to(tl.float32)
+    ln = xhat * w[:, None] + b[:, None]
+    du += dy * ln
+    if SILU_U:
+        sig_u = tl.sigmoid(u)
+        silu_u = u * sig_u
+        du = du * sig_u * (1 + u - silu_u)
+        u = silu_u
+    tl.store(DU + offsets, du.to(DU.dtype.element_ty), mask=mask)
+    dy = dy * u
+    wdy = w[:, None] * dy
+    if COMPUTE_Y:
+        Y += row.to(tl.int64) * stride_y
+        y = ln * u
+        if TRAINING:
+            if CONCAT_UX:
+                u = tl.where(
+                    du_keep,  # pyre-ignore [61]
+                    u / (1.0 - dropout_ratio),
+                    0.0,
+                )
+                x = tl.where(
+                    dx_keep,  # pyre-ignore [61]
+                    x / (1.0 - dropout_ratio),
+                    0.0,
+                )
+                y = tl.where(
+                    dy_keep,  # pyre-ignore [61]
+                    y / (1.0 - dropout_ratio),
+                    0.0,
+                )
+            else:
+                y = tl.where(
+                    dy_keep,  # pyre-ignore [61]
+                    y / (1.0 - dropout_ratio),
+                    0.0,
+                )
+        if CONCAT_UX:
+            tl.store(Y + offsets, u.to(Y.dtype.element_ty), mask=mask)
+            tl.store(Y + Heads * D + offsets, x.to(Y.dtype.element_ty), mask=mask)
+            tl.store(Y + 2 * Heads * D + offsets, y.to(Y.dtype.element_ty), mask=mask)
+        else:
+            tl.store(Y + offsets, y.to(Y.dtype.element_ty), mask=mask)
+
+    xhat = tl.where(mask, xhat, 0.0)
+    wdy = tl.where(mask, wdy, 0.0)
+    c1 = tl.sum(xhat * wdy, axis=1) / D
+    c2 = tl.sum(wdy, axis=1) / D
+    dx += (wdy - (xhat * c1[:, None] + c2[:, None])) * rstd[:, None]
+    # Write dx
+    tl.store(DX + offsets, dx, mask=mask)
+
+    # Offset locks and weights/biases gradient pointer for parallel reduction
+    lock_id = row % GROUP_N
+    DW = DW + lock_id * Heads + off_heads
+    DB = DB + lock_id * Heads + off_heads
+    # Accumulate partial sums for dw/db
+    partial_dw = tl.sum(dy * xhat, axis=1)
+    partial_dw = tl.ravel(partial_dw)
+    partial_db = tl.sum(dy, axis=1)
+    partial_db = tl.ravel(partial_db)
+    tl.atomic_add(
+        DW,
+        partial_dw,
+        mask=mask_h,
+        sem="relaxed",
+    )
+    tl.atomic_add(
+        DB,
+        partial_db,
+        mask=mask_h,
+        sem="relaxed",
+    )
+
+
+def triton_group_norm_mul_dropout_fwd(
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    dropout_ratio: float,
+    training: bool,
+    silu_u: bool = False,
+    concat_ux: bool = False,
+    num_heads: int = 1,
+    linear_dim: int = -1,
+    seed: Optional[int] = None,
+) -> Tuple[
+    torch.Tensor, torch.Tensor, torch.Tensor, int, int, int, int
+]:  # y, mean, rstd, BLOCK_D, BLOCK_H, num_warps, seed
+    assert x.dim() == 2
+    assert x.shape == u.shape
+    assert x.shape[1] == num_heads * linear_dim
+    x = switch_to_contiguous_if_needed(x)
+    u = switch_to_contiguous_if_needed(u)
+    N, _ = x.shape
+    assert weight.dim() == 1
+    assert bias.dim() == 1
+    assert weight.numel() == num_heads
+    assert bias.numel() == num_heads
+
+    if concat_ux:
+        y = torch.empty((N, 3 * num_heads * linear_dim), dtype=x.dtype, device=x.device)
+    else:
+        y = torch.empty((N, num_heads * linear_dim), dtype=x.dtype, device=x.device)
+    mean = torch.empty((N * num_heads,), dtype=torch.float32, device=x.device)
+    rstd = torch.empty((N * num_heads,), dtype=torch.float32, device=x.device)
+    if N == 0:
+        return y, mean, rstd, 0, 0, 0, 0
+    # Less than 64KB per feature: enqueue fused kernel
+    MAX_FUSED_SIZE = 65536 // x.element_size()
+    BLOCK_D: int = triton.next_power_of_2(linear_dim)
+    BLOCK_H: int = triton.next_power_of_2(num_heads)
+    if BLOCK_D * BLOCK_H > MAX_FUSED_SIZE:
+        raise RuntimeError(
+            "This group norm doesn't support num_heads * linear_dim >= 64KB."
+        )
+
+    if seed is None:
+        seed = torch.randint(low=0, high=2**62, size=(1,), dtype=torch.int64).item()
+    num_warps: int = min(max(BLOCK_D * BLOCK_H // 256, 1), 8)
+    # pyre-ignore[28]
+    _group_norm_mul_dropout_fwd[(N,)](
+        x,
+        u,
+        y,
+        weight,
+        bias,
+        mean,
+        rstd,
+        linear_dim,
+        num_heads,
+        eps,
+        seed,
+        dropout_ratio,
+        x.stride(0),
+        u.stride(0),
+        y.stride(0),
+        SILU_U=silu_u,
+        BLOCK_D=BLOCK_D,
+        BLOCK_H=BLOCK_H,
+        TRAINING=training,
+        CONCAT_UX=concat_ux,
+        num_warps=num_warps,
+    )
+    return y, mean, rstd, BLOCK_D, BLOCK_H, num_warps, seed  # pyre-ignore [7]
+
+
+def triton_group_norm_mul_dropout_bwd(
+    dy: torch.Tensor,
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    mean: torch.Tensor,
+    rstd: torch.Tensor,
+    BLOCK_D: int,
+    BLOCK_H: int,
+    num_warps: int,
+    eps: float,
+    training: bool,
+    dropout_ratio: float,
+    seed: Optional[int] = None,
+    silu_u: bool = False,
+    concat_ux: bool = False,
+    num_heads: int = 1,
+    linear_dim: int = -1,
+    compute_y: bool = False,
+) -> Tuple[
+    torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]
+]:
+    y = None
+    N, dim = x.shape
+    if compute_y:
+        if concat_ux:
+            y = torch.empty(
+                (N, 3 * num_heads * linear_dim), dtype=x.dtype, device=x.device
+            )
+        else:
+            y = torch.empty((N, num_heads * linear_dim), dtype=x.dtype, device=x.device)
+    if N == 0:
+        return (
+            torch.zeros_like(x),
+            torch.zeros_like(u),
+            torch.zeros_like(weight),
+            torch.zeros_like(bias),
+            y,
+        )
+    dx = torch.empty_like(x)
+    du = torch.empty_like(u)
+    if dim <= 1024:
+        GROUP_N = 256 * 8
+    elif dim <= 4096:
+        GROUP_N = 128 * 8
+    elif dim <= 8192:
+        GROUP_N = 96 * 8
+    else:
+        GROUP_N = 64 * 8
+    GROUP_N = N if GROUP_N > N else GROUP_N
+    _dweight = torch.zeros((GROUP_N, num_heads), dtype=torch.float32, device=x.device)
+    _dbias = torch.zeros((GROUP_N, num_heads), dtype=torch.float32, device=x.device)
+    dweight = torch.empty((num_heads,), dtype=weight.dtype, device=x.device)
+    dbias = torch.empty((num_heads,), dtype=weight.dtype, device=x.device)
+    # pyre-ignore[28]
+    _group_norm_mul_dropout_bwd_dx_du[(N,)](
+        dx,
+        du,
+        dy,
+        _dweight,
+        _dbias,
+        x,
+        u,
+        y,
+        weight,
+        bias,
+        mean,
+        rstd,
+        dx.stride(0),
+        du.stride(0),
+        dy.stride(0),
+        x.stride(0),
+        u.stride(0),
+        y.stride(0) if compute_y else 0,  # pyre-ignore [16]
+        linear_dim,
+        num_heads,
+        eps,
+        seed,
+        dropout_ratio,
+        SILU_U=silu_u,
+        GROUP_N=GROUP_N,
+        BLOCK_D=BLOCK_D,
+        BLOCK_H=BLOCK_H,
+        TRAINING=training,
+        CONCAT_UX=concat_ux,
+        COMPUTE_Y=compute_y,
+        num_warps=num_warps,
+    )
+    _group_norm_bwd_dwdb[(num_heads,)](
+        _dweight,
+        _dbias,
+        dweight,
+        dbias,
+        GROUP_N,
+    )
+    return dx, du, dweight, dbias, y
+
+
+def _get_bwd_dwdb_configs() -> List[triton.Config]:
+    configs = []
+    for BLOCK_N in [32, 64, 128, 256]:
+        for num_warps in [8, 16] + ([] if torch.ops.hip else [32]):
+            configs.append(
+                triton.Config(
+                    {"BLOCK_N": BLOCK_N},
+                    num_warps=num_warps,
+                )
+            )
+    return configs
+
+
+@triton_autotune(
+    configs=_get_bwd_dwdb_configs(),
+    key=[],
+)
+@triton.jit
+def _group_norm_bwd_dwdb(
+    DW,
+    DB,
+    FINAL_DW,
+    FINAL_DB,
+    N,
+    BLOCK_N: tl.constexpr,
+):
+    col = tl.program_id(0)
+    num_heads = tl.num_programs(0)
+    dw = tl.zeros((BLOCK_N,), dtype=tl.float32)
+    db = tl.zeros((BLOCK_N,), dtype=tl.float32)
+
+    for i in range(0, N, BLOCK_N):
+        rows = i + tl.arange(0, BLOCK_N)
+        mask = rows < N
+        offs = rows * num_heads + col
+        dw += tl.load(DW + offs, mask=mask, other=0.0)
+        db += tl.load(DB + offs, mask=mask, other=0.0)
+
+    sum_dw = tl.sum(dw, axis=0)
+    sum_db = tl.sum(db, axis=0)
+    tl.store(FINAL_DW + col, sum_dw.to(FINAL_DW.dtype.element_ty))
+    tl.store(FINAL_DB + col, sum_db.to(FINAL_DB.dtype.element_ty))
+
+
+class GroupNormMulDropoutFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        x: torch.Tensor,
+        u: torch.Tensor,
+        weight: torch.Tensor,
+        bias: torch.Tensor,
+        eps: float,
+        dropout_ratio: float,
+        training: bool,
+        silu_u: bool = False,
+        concat_ux: bool = False,
+        num_heads: int = 1,
+        linear_dim: int = -1,
+        seed: Optional[int] = None,
+    ) -> torch.Tensor:
+        y, mean, rstd, BLOCK_D, BLOCK_H, num_warps, seed = (
+            triton_group_norm_mul_dropout_fwd(
+                x=x,
+                u=u,
+                weight=weight,
+                bias=bias,
+                eps=eps,
+                dropout_ratio=dropout_ratio,
+                training=training,
+                silu_u=silu_u,
+                concat_ux=concat_ux,
+                num_heads=num_heads,
+                linear_dim=linear_dim,
+                seed=seed,
+            )
+        )
+        ctx.save_for_backward(x, u, weight, bias, mean, rstd)
+        ctx.BLOCK_D = BLOCK_D
+        ctx.BLOCK_H = BLOCK_H
+        ctx.num_warps = num_warps
+        ctx.eps = eps
+        ctx.seed = seed
+        ctx.training = training
+        ctx.silu_u = silu_u
+        ctx.concat_ux = concat_ux
+        ctx.dropout_ratio = dropout_ratio
+        ctx.num_heads = num_heads
+        ctx.linear_dim = linear_dim
+        return y
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, dy: torch.Tensor
+    ) -> Tuple[
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+    ]:
+        x, u, weight, bias, mean, rstd = ctx.saved_tensors
+        dx, du, dweight, dbias, _ = triton_group_norm_mul_dropout_bwd(
+            dy=dy,
+            x=x,
+            u=u,
+            weight=weight,
+            bias=bias,
+            mean=mean,
+            rstd=rstd,
+            BLOCK_D=ctx.BLOCK_D,
+            BLOCK_H=ctx.BLOCK_H,
+            num_warps=ctx.num_warps,
+            eps=ctx.eps,
+            training=ctx.training,
+            dropout_ratio=ctx.dropout_ratio,
+            seed=ctx.seed,
+            silu_u=ctx.silu_u,
+            concat_ux=ctx.concat_ux,
+            num_heads=ctx.num_heads,
+            linear_dim=ctx.linear_dim,
+            compute_y=False,
+        )
+        return (
+            dx,
+            du,
+            dweight,
+            dbias,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
+
+
+class HSTUComputeOutputFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        attn: torch.Tensor,
+        u: torch.Tensor,
+        x: torch.Tensor,
+        norm_weight: torch.Tensor,
+        norm_bias: torch.Tensor,
+        output_weight: torch.Tensor,
+        eps: float,
+        dropout_ratio: float,
+        training: bool,
+        silu_u: bool = False,
+        concat_u: bool = False,
+        concat_x: bool = False,
+        mul_u_activation_type: str = "none",
+        group_norm: bool = False,
+        num_heads: int = 1,
+        linear_dim: int = -1,
+        seed: Optional[int] = None,
+        recompute_y_in_backward: bool = False,
+    ) -> torch.Tensor:
+        if dropout_ratio == 0.0:
+            training = False
+
+        if group_norm:
+            y, mean, rstd, BLOCK_D, BLOCK_H, num_warps, seed = (
+                triton_group_norm_mul_dropout_fwd(
+                    x=attn,
+                    u=u,
+                    weight=norm_weight,
+                    bias=norm_bias,
+                    eps=eps,
+                    dropout_ratio=dropout_ratio,
+                    training=training,
+                    silu_u=silu_u,
+                    concat_ux=concat_u and concat_x,
+                    num_heads=num_heads,
+                    linear_dim=linear_dim,
+                    seed=seed,
+                )
+            )
+            ctx.BLOCK_H = BLOCK_H
+            random_mask = None
+        else:
+            y, mean, rstd, BLOCK_D, num_warps, seed, random_mask = (
+                triton_layer_norm_mul_dropout_fwd(
+                    x=attn,
+                    u=u,
+                    weight=norm_weight,
+                    bias=norm_bias,
+                    eps=eps,
+                    dropout_ratio=dropout_ratio,
+                    training=training,
+                    silu_u=silu_u,
+                    concat_u=concat_u,
+                    concat_x=concat_x,
+                    seed=seed,
+                )
+            )
+
+        out = maybe_triton_addmm_fwd(x=y, w=output_weight, y=x)
+
+        saved_tensors = [attn, u, norm_weight, norm_bias, mean, rstd, output_weight]
+        if not recompute_y_in_backward:
+            saved_tensors.append(y)
+        # Save random_mask for reuse in backward pass (avoids regenerating mask)
+        # When random_mask is available (SM100+ path), always save it.
+        if random_mask is not None:
+            saved_tensors.append(random_mask)
+            ctx.has_random_mask = True
+        else:
+            ctx.has_random_mask = False
+        ctx.save_for_backward(*saved_tensors)
+        ctx.BLOCK_D = BLOCK_D
+        ctx.num_warps = num_warps
+        ctx.eps = eps
+        ctx.seed = seed
+        ctx.training = training
+        ctx.concat_u = concat_u
+        ctx.concat_x = concat_x
+        ctx.dropout_ratio = dropout_ratio
+        ctx.num_heads = num_heads
+        ctx.linear_dim = linear_dim
+        ctx.group_norm = group_norm
+        ctx.recompute_y_in_backward = recompute_y_in_backward
+        ctx.silu_u = silu_u
+        ctx.mul_u_activation_type = mul_u_activation_type
+        return out
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, dout: torch.Tensor
+    ) -> Tuple[
+        torch.Tensor,  # dattn
+        torch.Tensor,  # du
+        torch.Tensor,  # dx
+        torch.Tensor,  # d_norm_weight
+        torch.Tensor,  # d_norm_bias
+        torch.Tensor,  # d_output_weight
+        None,  # eps
+        None,  # dropout_ratio
+        None,  # training
+        None,  # silu_u
+        None,  # concat_u
+        None,  # concat_x
+        None,  # mul_u_activation_type
+        None,  # group_norm
+        None,  # num_heads
+        None,  # linear_dim
+        None,  # seed
+        None,  # recompute_y_in_backward
+    ]:
+        attn, u, norm_weight, norm_bias, mean, rstd, output_weight = ctx.saved_tensors[
+            :7
+        ]
+        # Extract optional saved tensors based on flags
+        next_idx = 7
+        if not ctx.recompute_y_in_backward:
+            saved_y = ctx.saved_tensors[next_idx]
+            next_idx += 1
+        else:
+            saved_y = None
+        if ctx.has_random_mask:
+            random_mask = ctx.saved_tensors[next_idx]
+        else:
+            random_mask = None
+        dy = torch.mm(dout, output_weight.t())
+
+        if ctx.group_norm:
+            dattn, du, d_norm_weight, d_norm_bias, y = (
+                triton_group_norm_mul_dropout_bwd(
+                    dy=dy,
+                    x=attn,
+                    u=u,
+                    weight=norm_weight,
+                    bias=norm_bias,
+                    mean=mean,
+                    rstd=rstd,
+                    BLOCK_D=ctx.BLOCK_D,
+                    BLOCK_H=ctx.BLOCK_H,
+                    num_warps=ctx.num_warps,
+                    eps=ctx.eps,
+                    training=ctx.training,
+                    dropout_ratio=ctx.dropout_ratio,
+                    seed=ctx.seed,
+                    silu_u=ctx.silu_u,
+                    concat_ux=ctx.concat_u and ctx.concat_x,
+                    num_heads=ctx.num_heads,
+                    linear_dim=ctx.linear_dim,
+                    compute_y=ctx.recompute_y_in_backward,
+                )
+            )
+        else:
+            dattn, du, d_norm_weight, d_norm_bias, y = (
+                triton_layer_norm_mul_dropout_bwd(
+                    dy=dy,
+                    x=attn,
+                    u=u,
+                    weight=norm_weight,
+                    bias=norm_bias,
+                    mean=mean,
+                    rstd=rstd,
+                    BLOCK_D=ctx.BLOCK_D,
+                    num_warps=ctx.num_warps,
+                    eps=ctx.eps,
+                    training=ctx.training,
+                    dropout_ratio=ctx.dropout_ratio,
+                    seed=ctx.seed,
+                    silu_u=ctx.silu_u,
+                    concat_u=ctx.concat_u,
+                    concat_x=ctx.concat_x,
+                    mul_u_activation_type=ctx.mul_u_activation_type,
+                    compute_y=ctx.recompute_y_in_backward,
+                    random_mask=random_mask,
+                )
+            )
+        if not ctx.recompute_y_in_backward:
+            y = saved_y
+        d_output_weight = torch.mm(y.t(), dout)
+        return (
+            dattn,
+            du,
+            dout,
+            d_norm_weight,
+            d_norm_bias,
+            d_output_weight,
+            None,  # eps
+            None,  # dropout_ratio
+            None,  # training
+            None,  # silu_u
+            None,  # concat_u
+            None,  # concat_x
+            None,  # mul_u_activation_type
+            None,  # group_norm
+            None,  # num_heads
+            None,  # linear_dim
+            None,  # seed
+            None,  # recompute_y_in_backward
+        )
+
+
+@triton.jit
+def _helion_ln_mul_dropout_fwd(
+    x,
+    weight,
+    bias,
+    u,
+    y,
+    mean,
+    rstd,
+    eps,
+    seed,
+    dropout_ratio,
+    D: tl.constexpr,
+    stride_x: tl.constexpr,
+    stride_u: tl.constexpr,
+    stride_y: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    CONCAT_UX: tl.constexpr,
+    SILU_U: tl.constexpr,
+    TRAINING: tl.constexpr,
+):
+    row = tl.program_id(0)
+    x += row.to(tl.int64) * stride_x
+    u += row.to(tl.int64) * stride_u
+    y += row.to(tl.int64) * stride_y
+    cols = tl.arange(0, BLOCK_D)
+    mask = cols < D
+
+    # Load input
+    x_val = tl.load(x + cols, mask=mask, other=0.0).to(tl.float32)
+
+    # Precompute inverse of D for faster computation
+    inv_D = 1.0 / D
+
+    # Compute mean
+    mean_val = tl.sum(x_val, axis=0) * inv_D
+
+    # Center the data
+    x_mean = tl.where(mask, x_val - mean_val, 0.0)
+
+    # Compute variance
+    var = tl.sum(x_mean * x_mean, axis=0) * inv_D
+
+    # Compute reciprocal standard deviation
+    # pyre-fixme[16]
+    rstd_val = libdevice.rsqrt(var + eps)
+
+    # Normalize
+    y_norm = x_mean * rstd_val
+
+    # Apply weight and bias
+    w = tl.load(weight + cols, mask=mask, other=0.0).to(tl.float32)
+    b = tl.load(bias + cols, mask=mask, other=0.0).to(tl.float32)
+    y_ln = y_norm * w + b
+
+    # Load u and optionally apply SiLU activation
+    u_val = tl.load(u + cols, mask=mask, other=0.0).to(tl.float32)
+    if SILU_U:
+        u_processed = u_val * tl.sigmoid(u_val)
+    else:
+        u_processed = u_val
+
+    y_out = y_ln * u_processed
+
+    if TRAINING:
+        # Compute dropout scale
+        # pyre-fixme[16]
+        dropout_scale = fast_dividef(1.0, 1.0 - dropout_ratio)
+
+        if CONCAT_UX:
+            # Generate dropout masks
+            random_offsets = 3 * row * BLOCK_D + cols
+            random_u, random_x, random_y = rand3x(seed, random_offsets)
+
+            u_keep = random_u > dropout_ratio
+            x_keep = random_x > dropout_ratio
+            y_keep = random_y > dropout_ratio
+
+            # Apply dropout to u, x, y
+            u_output = tl.where(u_keep, u_processed * dropout_scale, 0.0)
+            x_output = tl.where(x_keep, x_val * dropout_scale, 0.0)
+            y_output = tl.where(y_keep, y_out * dropout_scale, 0.0)
+        else:
+            # Generate dropout mask for y
+            random_offsets = row * BLOCK_D + cols
+            random_y = tl.rand(seed, random_offsets)
+            y_keep = random_y > dropout_ratio
+
+            # Apply dropout to y
+            y_output = tl.where(y_keep, y_out * dropout_scale, 0.0)
+    else:
+        if CONCAT_UX:
+            u_output = u_processed
+            x_output = x_val
+        y_output = y_out
+
+    # Store outputs
+    if CONCAT_UX:
+        tl.store(y + cols, u_output.to(y.dtype.element_ty), mask=mask)
+        tl.store(y + D + cols, x_output.to(y.dtype.element_ty), mask=mask)
+        tl.store(y + 2 * D + cols, y_output.to(y.dtype.element_ty), mask=mask)
+    else:
+        tl.store(y + cols, y_output.to(y.dtype.element_ty), mask=mask)
+
+    # Store mean and rstd
+    tl.store(mean + row, mean_val)
+    tl.store(rstd + row, rstd_val)
+
+
+def helion_layer_norm_mul_dropout_fwd(
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    dropout_ratio: float,
+    training: bool,
+    silu_u: bool = False,
+    concat_ux: bool = False,
+    seed: Optional[int] = None,
+) -> Tuple[
+    torch.Tensor, torch.Tensor, torch.Tensor, int, int, int
+]:  # y, mean, rstd, BLOCK_D, num_warps, seed
+    N, D = x.shape
+
+    if seed is None:
+        seed = torch.randint(low=0, high=2**62, size=(1,), dtype=torch.int64).item()
+
+    if concat_ux:
+        y = torch.empty([N, 3 * D], dtype=x.dtype, device=x.device)
+    else:
+        y = torch.empty([N, D], dtype=x.dtype, device=x.device)
+    mean = torch.empty([N], dtype=torch.float32, device=x.device)
+    rstd = torch.empty([N], dtype=torch.float32, device=x.device)
+
+    BLOCK_D = triton.next_power_of_2(D)
+    # pyre-ignore[28]
+    _helion_ln_mul_dropout_fwd[(N,)](
+        x,
+        weight,
+        bias,
+        u,
+        y,
+        mean,
+        rstd,
+        eps,
+        seed,
+        dropout_ratio,
+        D,
+        x.stride(0),
+        u.stride(0),
+        y.stride(0),
+        BLOCK_D,
+        CONCAT_UX=concat_ux,
+        SILU_U=silu_u,
+        TRAINING=training,
+        num_warps=1,
+    )
+
+    return y, mean, rstd, BLOCK_D, 1, seed  # pyre-ignore [7]
+
+
+@triton.jit
+def _helion_ln_mul_dropout_bwd_dx_du(
+    DX,
+    DU,
+    DY,
+    DW,
+    DB,
+    X,
+    U,
+    Y,
+    W,
+    B,
+    Mean,
+    Rstd,
+    stride_dx,
+    stride_du,
+    stride_dy,
+    stride_x,
+    stride_u,
+    stride_y,
+    D: tl.constexpr,
+    eps,
+    seed,
+    dropout_ratio,
+    N,
+    SILU_U: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    TRAINING: tl.constexpr,
+    CONCAT_UX: tl.constexpr,
+    COMPUTE_Y: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    tile_num = tl.num_programs(0)
+    rows_per_tile = N // tile_num
+    if pid < N % tile_num:
+        rows_per_tile += 1
+
+    if rows_per_tile == 0:
+        return
+
+    cols = tl.arange(0, BLOCK_D)
+    mask = cols < D
+
+    # precompute inverse of D
+    inv_D: tl.constexpr = 1.0 / D
+
+    row = pid
+    X += row.to(tl.int64) * stride_x
+    U += row.to(tl.int64) * stride_u
+    if COMPUTE_Y:
+        Y += row.to(tl.int64) * stride_y
+    DY += row.to(tl.int64) * stride_dy
+    DX += row.to(tl.int64) * stride_dx
+    DU += row.to(tl.int64) * stride_du
+    DW = DW + pid * D + cols
+    DB = DB + pid * D + cols
+
+    partial_dw = tl.zeros((BLOCK_D,), dtype=tl.float32)
+    partial_db = tl.zeros((BLOCK_D,), dtype=tl.float32)
+    w = tl.load(W + cols, mask=mask).to(tl.float32)
+    b = tl.load(B + cols, mask=mask).to(tl.float32)
+
+    for _idx in range(0, rows_per_tile):
+        x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
+        if CONCAT_UX:
+            du = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
+            dx = tl.load(DY + D + cols, mask=mask, other=0).to(tl.float32)
+            dy = tl.load(DY + 2 * D + cols, mask=mask, other=0).to(tl.float32)
+        else:
+            du = tl.zeros([BLOCK_D], dtype=tl.float32)
+            dx = tl.zeros([BLOCK_D], dtype=tl.float32)
+            dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
+
+        if TRAINING:
+            # pyre-fixme[16]
+            dropout_scale = fast_dividef(1.0, 1.0 - dropout_ratio)
+            if CONCAT_UX:
+                random_offsets = 3 * row * BLOCK_D + cols
+                # apply dropout on du
+                random_du, random_dx, random_dy = rand3x(seed, random_offsets)
+                du_keep = random_du > dropout_ratio
+                du = tl.where(du_keep, du * dropout_scale, 0.0)
+                # apply dropout on dx
+                dx_keep = random_dx > dropout_ratio
+                dx = tl.where(dx_keep, dx * dropout_scale, 0.0)
+                # apply dropout on dy
+                dy_keep = random_dy > dropout_ratio
+                dy = tl.where(dy_keep, dy * dropout_scale, 0.0)
+            else:
+                random_offsets = row * BLOCK_D + cols
+                random = tl.rand(seed, random_offsets)
+                dy_keep = random > dropout_ratio
+                dy = tl.where(dy_keep, dy * dropout_scale, 0.0)
+
+        mean = tl.load(Mean + row)
+        rstd = tl.load(Rstd + row)
+
+        # Compute dx
+        xhat = (x - mean) * rstd
+        u = tl.load(U + cols, mask=mask, other=0).to(tl.float32)
+        ln = xhat * w + b
+        du += dy * ln
+
+        if SILU_U:
+            sig_u = tl.sigmoid(u)
+            silu_u = u * sig_u
+            du = du * sig_u * (1 + u - silu_u)
+            u = silu_u
+
+        tl.store(DU + cols, du.to(DU.dtype.element_ty), mask=mask)
+        dy = dy * u
+        wdy = w * dy
+
+        if COMPUTE_Y:
+            y = ln * u
+            if TRAINING:
+                # pyre-fixme[16]
+                dropout_scale_y = fast_dividef(1.0, 1.0 - dropout_ratio)
+                if CONCAT_UX:
+                    u = tl.where(du_keep, u * dropout_scale_y, 0.0)  # pyre-ignore [61]
+                    x = tl.where(dx_keep, x * dropout_scale_y, 0.0)  # pyre-ignore [61]
+                    y = tl.where(dy_keep, y * dropout_scale_y, 0.0)  # pyre-ignore [61]
+                else:
+                    y = tl.where(dy_keep, y * dropout_scale_y, 0.0)  # pyre-ignore [61]
+            if CONCAT_UX:
+                tl.store(Y + cols, u.to(Y.dtype.element_ty), mask=mask)
+                tl.store(Y + D + cols, x.to(Y.dtype.element_ty), mask=mask)
+                tl.store(Y + 2 * D + cols, y.to(Y.dtype.element_ty), mask=mask)
+            else:
+                tl.store(Y + cols, y.to(Y.dtype.element_ty), mask=mask)
+            Y += tile_num.to(tl.int64) * stride_y
+
+        xhat = tl.where(mask, xhat, 0.0)
+        wdy = tl.where(mask, wdy, 0.0)
+        # multiply by inv_D
+        c1 = tl.sum(xhat * wdy, axis=0) * inv_D
+        c2 = tl.sum(wdy, axis=0) * inv_D
+        dx += (wdy - (xhat * c1 + c2)) * rstd
+
+        # Write dx
+        tl.store(DX + cols, dx, mask=mask)
+
+        # Accumulate partial sums for dw/db
+        partial_dw += dy * xhat
+        partial_db += dy
+
+        X += tile_num.to(tl.int64) * stride_x
+        U += tile_num.to(tl.int64) * stride_u
+        DY += tile_num.to(tl.int64) * stride_dy
+        DX += tile_num.to(tl.int64) * stride_dx
+        DU += tile_num.to(tl.int64) * stride_du
+        row += tile_num
+
+    tl.store(DW, partial_dw, mask=mask)
+    tl.store(DB, partial_db, mask=mask)
+
+
+@triton_autotune(
+    configs=_get_bwd_dwdb_configs(),
+    key=["D"],
+)
+@triton.jit
+def _helion_ln_mul_dropout_bwd_dwdb(
+    DW,
+    DB,
+    FINAL_DW,
+    FINAL_DB,
+    N,
+    D,
+    BLOCK_N: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    cols = pid * BLOCK_D + tl.arange(0, BLOCK_D)
+    dw = tl.zeros((BLOCK_N, BLOCK_D), dtype=tl.float32)
+    db = tl.zeros((BLOCK_N, BLOCK_D), dtype=tl.float32)
+
+    for i in range(0, N, BLOCK_N):
+        rows = i + tl.arange(0, BLOCK_N)
+        # pyre-fixme[16]: `int` has no attribute `__getitem__`.
+        off_mask = (rows[:, None] < N) & (cols[None, :] < D)
+        offs = rows[:, None] * D + cols[None, :]
+        dw += tl.load(DW + offs, mask=off_mask, other=0.0)
+        db += tl.load(DB + offs, mask=off_mask, other=0.0)
+
+    sum_dw = tl.sum(dw, axis=0)
+    sum_db = tl.sum(db, axis=0)
+    tl.store(FINAL_DW + cols, sum_dw.to(FINAL_DW.dtype.element_ty), mask=cols < D)
+    tl.store(FINAL_DB + cols, sum_db.to(FINAL_DB.dtype.element_ty), mask=cols < D)
+
+
+def helion_layer_norm_mul_dropout_bwd(
+    dy: torch.Tensor,
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    mean: torch.Tensor,
+    rstd: torch.Tensor,
+    BLOCK_D: int,
+    num_warps: int,
+    eps: float,
+    training: bool,
+    dropout_ratio: float,
+    seed: Optional[int] = None,
+    silu_u: bool = False,
+    concat_ux: bool = False,
+    compute_y: bool = False,
+) -> Tuple[
+    torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]
+]:
+    y = None
+    N, D = x.shape
+    if compute_y:
+        if concat_ux:
+            y = torch.empty((N, 3 * D), dtype=x.dtype, device=x.device)
+        else:
+            y = torch.empty_like(x)
+    if N == 0:
+        return (
+            torch.zeros_like(x),
+            torch.zeros_like(u),
+            torch.zeros((D,), dtype=weight.dtype, device=x.device),
+            torch.zeros((D,), dtype=weight.dtype, device=x.device),
+            y,
+        )
+    dx = torch.empty_like(x)
+    du = torch.empty_like(u)
+    sms = torch.cuda.get_device_properties(x.device).multi_processor_count
+    tile_num = max(1, min(sms * 64, N // 4))
+    _dweight = torch.empty((tile_num, D), dtype=torch.float32, device=x.device)
+    _dbias = torch.empty((tile_num, D), dtype=torch.float32, device=x.device)
+    dweight = torch.empty((D,), dtype=weight.dtype, device=x.device)
+    dbias = torch.empty((D,), dtype=weight.dtype, device=x.device)
+
+    # pyre-ignore[28]
+    _helion_ln_mul_dropout_bwd_dx_du[(tile_num,)](
+        dx,
+        du,
+        dy,
+        _dweight,
+        _dbias,
+        x,
+        u,
+        y,
+        weight,
+        bias,
+        mean,
+        rstd,
+        dx.stride(0),
+        du.stride(0),
+        dy.stride(0),
+        x.stride(0),
+        u.stride(0),
+        y.stride(0) if compute_y else 0,  # pyre-ignore [16]
+        D,
+        eps,
+        seed,
+        dropout_ratio,
+        N=N,
+        SILU_U=silu_u,
+        BLOCK_D=BLOCK_D,
+        TRAINING=training,
+        CONCAT_UX=concat_ux,
+        COMPUTE_Y=compute_y,
+        num_warps=num_warps,
+    )
+
+    blocks = triton.next_power_of_2(sms * 4)
+    BLOCK_D_DWDB = triton.next_power_of_2(triton.cdiv(D, blocks))
+    BLOCK_D_DWDB = min(max(BLOCK_D_DWDB, 4), 128)
+    _helion_ln_mul_dropout_bwd_dwdb[(triton.cdiv(D, BLOCK_D_DWDB),)](
+        _dweight,
+        _dbias,
+        dweight,
+        dbias,
+        tile_num,
+        D,
+        BLOCK_D=BLOCK_D_DWDB,
+    )
+    return dx, du, dweight, dbias, y
+
+
+class HelionLayerNormMulDropoutFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        x: torch.Tensor,
+        u: torch.Tensor,
+        weight: torch.Tensor,
+        bias: torch.Tensor,
+        eps: float,
+        dropout_ratio: float,
+        training: bool,
+        silu_u: bool = False,
+        concat_ux: bool = False,
+        seed: Optional[int] = None,
+    ) -> torch.Tensor:
+        if dropout_ratio == 0.0:
+            # skip dropout computation if dropout ratio is 0
+            training = False
+        y, mean, rstd, BLOCK_D, num_warps, seed = helion_layer_norm_mul_dropout_fwd(
+            x=x,
+            u=u,
+            weight=weight,
+            bias=bias,
+            eps=eps,
+            dropout_ratio=dropout_ratio,
+            training=training,
+            silu_u=silu_u,
+            concat_ux=concat_ux,
+            seed=seed,
+        )
+        ctx.save_for_backward(x, u, weight, bias, mean, rstd)
+        ctx.BLOCK_D = BLOCK_D
+        ctx.num_warps = num_warps
+        ctx.eps = eps
+        ctx.seed = seed
+        ctx.training = training
+        ctx.silu_u = silu_u
+        ctx.concat_ux = concat_ux
+        ctx.dropout_ratio = dropout_ratio
+        return y
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, dy: torch.Tensor
+    ) -> Tuple[
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+    ]:
+        x, u, weight, bias, mean, rstd = ctx.saved_tensors
+        dx, du, dweight, dbias, _ = helion_layer_norm_mul_dropout_bwd(
+            dy=dy,
+            x=x,
+            u=u,
+            weight=weight,
+            bias=bias,
+            mean=mean,
+            rstd=rstd,
+            BLOCK_D=ctx.BLOCK_D,
+            num_warps=ctx.num_warps,
+            eps=ctx.eps,
+            training=ctx.training,
+            dropout_ratio=ctx.dropout_ratio,
+            seed=ctx.seed,
+            silu_u=ctx.silu_u,
+            concat_ux=ctx.concat_ux,
+            compute_y=False,
+        )
+        return dx, du, dweight, dbias, None, None, None, None, None, None
+
+
+@torch.fx.wrap
+def helion_norm_mul_dropout(
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    dropout_ratio: float,
+    training: bool,
+    silu_u: bool = False,
+    concat_ux: bool = False,
+    seed: Optional[int] = None,
+) -> torch.Tensor:
+    return HelionLayerNormMulDropoutFunction.apply(
+        x, u, weight, bias, eps, dropout_ratio, training, silu_u, concat_ux, seed
+    )
+
+
+@torch.fx.wrap
+def triton_norm_mul_dropout(
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    dropout_ratio: float,
+    training: bool,
+    silu_u: bool = False,
+    concat_u: bool = False,
+    concat_x: bool = False,
+    group_norm: bool = False,
+    num_heads: int = 1,
+    linear_dim: int = -1,
+    seed: Optional[int] = None,
+) -> torch.Tensor:
+    if group_norm:
+        return GroupNormMulDropoutFunction.apply(
+            x,
+            u,
+            weight,
+            bias,
+            eps,
+            dropout_ratio,
+            training,
+            silu_u,
+            concat_u and concat_x,
+            num_heads,
+            linear_dim,
+            seed,
+        )
+    else:
+        return LayerNormMulDropoutFunction.apply(
+            x,
+            u,
+            weight,
+            bias,
+            eps,
+            dropout_ratio,
+            training,
+            silu_u,
+            concat_u and concat_x,
+            seed,
+        )
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def triton_hstu_compute_output(
+    attn: torch.Tensor,
+    u: torch.Tensor,
+    x: torch.Tensor,
+    norm_weight: torch.Tensor,
+    norm_bias: torch.Tensor,
+    output_weight: torch.Tensor,
+    eps: float,
+    dropout_ratio: float,
+    training: bool,
+    silu_u: bool = False,
+    concat_u: bool = False,
+    concat_x: bool = False,
+    mul_u_activation_type: str = "none",
+    group_norm: bool = False,
+    num_heads: int = 1,
+    linear_dim: int = -1,
+    seed: Optional[int] = None,
+    recompute_y_in_backward: bool = False,
+) -> torch.Tensor:
+    return HSTUComputeOutputFunction.apply(
+        attn,
+        u,
+        x,
+        norm_weight,
+        norm_bias,
+        output_weight,
+        eps,
+        dropout_ratio,
+        training,
+        silu_u,
+        concat_u,
+        concat_x,
+        mul_u_activation_type,
+        group_norm,
+        num_heads,
+        linear_dim,
+        seed,
+        recompute_y_in_backward,
+    )
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_preprocess_and_attention.py b/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_preprocess_and_attention.py
new file mode 100644
index 000000000..bda97ff96
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_preprocess_and_attention.py
@@ -0,0 +1,342 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from typing import Optional, Tuple
+
+import torch
+from generative_recommenders.ops.triton.triton_addmm import (
+    maybe_triton_addmm_fwd,
+    triton_addmm_bwd,
+    triton_addmm_fwd,
+)
+from generative_recommenders.ops.triton.triton_hstu_attention import (
+    _should_enable_tma,
+    triton_hstu_attention_bwd,
+    triton_hstu_attention_fwd,
+)
+from generative_recommenders.ops.triton.triton_layer_norm import (
+    compute_BLOCK_D,
+    triton_weighted_layer_norm_bwd,
+    triton_weighted_layer_norm_fwd,
+)
+from torch.nn import functional as F
+
+
+class _HSTUPreprocessAndAttentionFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore [14]
+    def forward(
+        ctx,  # pyre-ignore [2]
+        x: torch.Tensor,
+        norm_weight: torch.Tensor,
+        norm_bias: torch.Tensor,
+        norm_eps: float,
+        num_heads: int,
+        attn_dim: int,
+        hidden_dim: int,
+        uvqk_weight: torch.Tensor,
+        uvqk_bias: torch.Tensor,
+        max_seq_len: int,
+        seq_offsets: torch.Tensor,
+        attn_alpha: float,
+        num_targets: Optional[torch.Tensor],
+        max_attn_len: int,
+        contextual_seq_len: int,
+        recompute_uvqk_in_backward: bool,
+        recompute_normed_x_in_backward: bool,
+        sort_by_length: bool,
+        enable_tma: bool,
+        num_softmax_heads: int = 0,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        assert num_softmax_heads == 0, "Softmax attention is not supported"
+        normed_x, x_mean, x_rstd = triton_weighted_layer_norm_fwd(
+            x=x,
+            weight=norm_weight,
+            bias=norm_bias,
+            eps=norm_eps,
+        )
+        BLOCK_D = compute_BLOCK_D(x)
+        uvqk = maybe_triton_addmm_fwd(
+            x=normed_x, w=uvqk_weight, y=uvqk_bias
+        ).contiguous()
+        u, v, q, k = uvqk.split(
+            [
+                hidden_dim * num_heads,
+                hidden_dim * num_heads,
+                attn_dim * num_heads,
+                attn_dim * num_heads,
+            ],
+            dim=1,
+        )
+        q = q.view(-1, num_heads, attn_dim)
+        k = k.view(-1, num_heads, attn_dim)
+        v = v.view(-1, num_heads, hidden_dim)
+        silu_u = F.silu(u)
+        sort_by_length_indices = None
+        if sort_by_length:
+            seq_lengths = seq_offsets[1:] - seq_offsets[:-1]
+            _, sort_by_length_indices = torch.sort(
+                seq_lengths, descending=True, stable=False
+            )
+        out = triton_hstu_attention_fwd(
+            N=max_seq_len,
+            alpha=attn_alpha,
+            q=q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            sort_by_length_indices=sort_by_length_indices,
+            enable_tma=enable_tma,
+            num_softmax_heads=num_softmax_heads,
+        )
+        # update ctx
+        saved_tensors = [
+            x,
+            norm_weight,
+            norm_bias,
+            x_mean,
+            x_rstd,
+            uvqk_weight,
+            seq_offsets,
+        ]
+        if num_targets is not None:
+            saved_tensors.append(num_targets)
+        if not recompute_normed_x_in_backward:
+            saved_tensors.append(normed_x)
+        if recompute_uvqk_in_backward:
+            saved_tensors.append(uvqk_bias)
+        else:
+            saved_tensors.append(uvqk)
+        if sort_by_length:
+            saved_tensors.append(sort_by_length_indices)
+        ctx.save_for_backward(*saved_tensors)
+        ctx.attn_alpha = attn_alpha
+        ctx.has_multiple_targets = num_targets is not None
+        ctx.max_seq_len = max_seq_len
+        ctx.max_attn_len = max_attn_len
+        ctx.recompute_normed_x_in_backward = recompute_normed_x_in_backward
+        ctx.recompute_uvqk_in_backward = recompute_uvqk_in_backward
+        ctx.hidden_dim = hidden_dim
+        ctx.attn_dim = attn_dim
+        ctx.num_heads = num_heads
+        ctx.uvqk_bias_1d = uvqk_bias.dim() == 1
+        ctx.norm_eps = norm_eps
+        ctx.norm_BLOCK_D = BLOCK_D
+        ctx.contextual_seq_len = contextual_seq_len
+        ctx.sort_by_length = sort_by_length
+        ctx.enable_tma = enable_tma
+        ctx.num_softmax_heads = num_softmax_heads
+        return silu_u, out
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx,  # pyre-ignore[2]
+        dsilu_u: torch.Tensor,
+        dout: torch.Tensor,
+    ) -> Tuple[
+        torch.Tensor,  # d_x
+        torch.Tensor,  # d_norm_weight
+        torch.Tensor,  # d_norm_bias
+        None,
+        None,
+        None,
+        None,
+        torch.Tensor,  # d_uvqk_weight
+        torch.Tensor,  # d_uvqk_bias
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+    ]:
+        x, norm_weight, norm_bias, x_mean, x_rstd, uvqk_weight, seq_offsets = (
+            ctx.saved_tensors[:7]
+        )
+        idx = 7
+        if ctx.has_multiple_targets:
+            num_targets = ctx.saved_tensors[idx]
+            idx += 1
+        else:
+            num_targets = None
+        if ctx.recompute_normed_x_in_backward:
+            normed_x, _, _ = triton_weighted_layer_norm_fwd(
+                x=x,
+                weight=norm_weight,
+                bias=norm_bias,
+                eps=ctx.norm_eps,
+                mean=x_mean,
+                rstd=x_rstd,
+            )
+        else:
+            normed_x = ctx.saved_tensors[idx]
+            idx += 1
+        if ctx.recompute_uvqk_in_backward:
+            uvqk_bias = ctx.saved_tensors[idx]
+            uvqk = maybe_triton_addmm_fwd(x=normed_x, w=uvqk_weight, y=uvqk_bias)
+            idx += 1
+        else:
+            uvqk = ctx.saved_tensors[idx]
+            idx += 1
+        if ctx.sort_by_length:
+            sort_by_length_indices = ctx.saved_tensors[idx]
+        else:
+            sort_by_length_indices = None
+
+        duvqk = torch.empty_like(uvqk)
+        du, dv, dq, dk = duvqk.split(
+            [
+                ctx.hidden_dim * ctx.num_heads,
+                ctx.hidden_dim * ctx.num_heads,
+                ctx.attn_dim * ctx.num_heads,
+                ctx.attn_dim * ctx.num_heads,
+            ],
+            dim=1,
+        )
+        u, v, q, k = uvqk.split(
+            [
+                ctx.hidden_dim * ctx.num_heads,
+                ctx.hidden_dim * ctx.num_heads,
+                ctx.attn_dim * ctx.num_heads,
+                ctx.attn_dim * ctx.num_heads,
+            ],
+            dim=1,
+        )
+        q = q.view(-1, ctx.num_heads, ctx.attn_dim)
+        k = k.view(-1, ctx.num_heads, ctx.attn_dim)
+        v = v.view(-1, ctx.num_heads, ctx.hidden_dim)
+        dq = dq.view(-1, ctx.num_heads, ctx.attn_dim)
+        dk = dk.view(-1, ctx.num_heads, ctx.attn_dim)
+        dv = dv.view(-1, ctx.num_heads, ctx.hidden_dim)
+        # Note: the operation below updates duvqk in place
+        triton_hstu_attention_bwd(
+            dout=dout,
+            q=q,
+            k=k,
+            v=v,
+            dq=dq,
+            dk=dk,
+            dv=dv,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            N=ctx.max_seq_len,
+            max_attn_len=ctx.max_attn_len,
+            alpha=ctx.attn_alpha,
+            contextual_seq_len=ctx.contextual_seq_len,
+            sort_by_length_indices=sort_by_length_indices,
+            enable_tma=ctx.enable_tma,
+            num_softmax_heads=ctx.num_softmax_heads,
+        )
+        torch.ops.aten.silu_backward(dsilu_u, u, grad_input=du)
+        d_normed_x, d_uvqk_weight, d_uvqk_bias = triton_addmm_bwd(
+            x=normed_x,
+            w=uvqk_weight,
+            dz=duvqk,
+            is_y_1d=ctx.uvqk_bias_1d,
+        )
+        d_x, d_norm_weight, d_norm_bias = triton_weighted_layer_norm_bwd(
+            dy=d_normed_x,
+            x=x,
+            weight=norm_weight,
+            bias=norm_bias,
+            mean=x_mean,
+            rstd=x_rstd,
+            learnable=True,
+            eps=ctx.norm_eps,
+            BLOCK_D=ctx.norm_BLOCK_D,
+        )
+        # pyre-ignore[7]
+        return (
+            d_x,
+            d_norm_weight,
+            d_norm_bias,
+            None,
+            None,
+            None,
+            None,
+            d_uvqk_weight,
+            d_uvqk_bias,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
+
+
+def triton_hstu_preprocess_and_attention(
+    x: torch.Tensor,
+    norm_weight: torch.Tensor,
+    norm_bias: torch.Tensor,
+    norm_eps: float,
+    num_heads: int,
+    attn_dim: int,
+    hidden_dim: int,
+    uvqk_weight: torch.Tensor,
+    uvqk_bias: torch.Tensor,
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    attn_alpha: float,
+    num_targets: Optional[torch.Tensor],
+    max_attn_len: int = 0,
+    contextual_seq_len: int = 0,
+    recompute_uvqk_in_backward: bool = False,
+    recompute_normed_x_in_backward: bool = False,
+    sort_by_length: bool = False,
+    enable_tma: Optional[bool] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    # When the caller does not specify enable_tma, auto-detect whether the
+    # TMA / TLX fast path is safe on this device. Resolving here (vs inside
+    # the autograd Function.forward) keeps a concrete bool flowing through
+    # ctx.save_for_backward / ctx attributes.
+    if enable_tma is None:
+        enable_tma = _should_enable_tma()
+    return _HSTUPreprocessAndAttentionFunction.apply(
+        x,
+        norm_weight,
+        norm_bias,
+        norm_eps,
+        num_heads,
+        attn_dim,
+        hidden_dim,
+        uvqk_weight,
+        uvqk_bias,
+        max_seq_len,
+        seq_offsets,
+        attn_alpha,
+        num_targets,
+        max_attn_len,
+        contextual_seq_len,
+        recompute_uvqk_in_backward,
+        recompute_normed_x_in_backward,
+        sort_by_length,
+        enable_tma,
+    )
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_jagged.py b/recommendation_v4/generative_recommenders/ops/triton/triton_jagged.py
new file mode 100644
index 000000000..3f5609d75
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_jagged.py
@@ -0,0 +1,2537 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+#!/usr/bin/env python3
+
+
+import os
+from typing import List, Optional, Tuple
+
+import torch
+
+# @manual=//triton:triton
+import triton
+
+# @manual=//triton:triton
+import triton.language as tl
+from generative_recommenders.common import (
+    autotune_max_seq_len,
+    fine_grained_autotune_max_seq_len,
+    switch_to_contiguous_if_needed,
+    triton_autotune,
+)
+from generative_recommenders.ops.triton._autotune_pinning import pinned_or_full
+from generative_recommenders.ops.utils import is_sm100_plus, is_sm90
+from torch._inductor.runtime import triton_helpers
+
+try:
+    torch.ops.load_library(
+        "//generative_recommenders/fb/ultra/ops/hopper/jagged_dense_bmm_add:jagged_dense_bmm_add"
+    )
+except OSError:
+    pass
+
+CUDA_JAGGED_DENSE_BMM_FWD = False
+CUDA_JAGGED_DENSE_BMM_BWD = False
+
+SPLIT_2D_JAGGED_KERNEL = None
+GLN_MUL_DROPOUT_KERNEL = None
+CONCAT_2D_JAGGED_KERNEL = None
+
+
+def set_cuda_jagged_dense_bmm_fwd(value: bool) -> None:
+    global CUDA_JAGGED_DENSE_BMM_FWD
+    CUDA_JAGGED_DENSE_BMM_FWD = value
+
+
+def get_cuda_jagged_dense_bmm_fwd() -> bool:
+    # currently only supports H100
+    return CUDA_JAGGED_DENSE_BMM_FWD and is_sm90()
+
+
+def set_cuda_jagged_dense_bmm_bwd(value: bool) -> None:
+    global CUDA_JAGGED_DENSE_BMM_BWD
+    CUDA_JAGGED_DENSE_BMM_BWD = value
+
+
+def get_cuda_jagged_dense_bmm_bwd() -> bool:
+    # currently only supports H100
+    return CUDA_JAGGED_DENSE_BMM_BWD and is_sm90()
+
+
+def set_split_2d_jagged_kernel(value: Optional[str]) -> None:
+    global SPLIT_2D_JAGGED_KERNEL
+    SPLIT_2D_JAGGED_KERNEL = value
+
+
+def get_split_2d_jagged_kernel() -> Optional[str]:
+    # only override during training
+    if torch.is_grad_enabled():
+        return SPLIT_2D_JAGGED_KERNEL
+    return None
+
+
+def set_concat_2d_jagged_kernel(value: Optional[str]) -> None:
+    global CONCAT_2D_JAGGED_KERNEL
+    CONCAT_2D_JAGGED_KERNEL = value
+
+
+def get_concat_2d_jagged_kernel() -> Optional[str]:
+    # only override during training
+    if torch.is_grad_enabled():
+        return CONCAT_2D_JAGGED_KERNEL
+    return None
+
+
+def _should_use_multirow() -> bool:
+    """Check if multirow kernel should be used based on current hardware.
+
+    Can be overridden via the JAGGED_USE_MULTIROW_MI350 environment variable:
+      JAGGED_USE_MULTIROW_MI350=1  -> force multirow on
+      JAGGED_USE_MULTIROW_MI350=0  -> force multirow off
+      unset                  -> auto-detect based on hardware (SM100+ or MI350)
+    """
+    env = os.environ.get("JAGGED_USE_MULTIROW_MI350")
+    if env is not None:
+        return env == "1"
+    return is_sm100_plus()
+
+
+def set_gln_mul_dropout_kernel(value: Optional[str]) -> None:
+    global GLN_MUL_DROPOUT_KERNEL
+    GLN_MUL_DROPOUT_KERNEL = value
+
+
+def get_gln_mul_dropout_kernel() -> Optional[str]:
+    # only override during training
+    return GLN_MUL_DROPOUT_KERNEL
+
+
+def _triton_concat_2D_jagged_internal(
+    values_a: torch.Tensor,
+    values_b: torch.Tensor,
+    values_out: torch.Tensor,
+    max_seq_len: int,
+    B: int,
+    offsets_a: Optional[torch.Tensor],
+    offsets_b: Optional[torch.Tensor],
+    D: int,
+    dense_size: int,
+    stride_dense_batch: int,
+    n_prefix: int,
+    is_dense_a: bool,
+    is_dense_b: bool,
+    is_replace: bool,
+    BLOCK_D: int,
+) -> None:
+    use_multirow = _should_use_multirow()
+    if n_prefix != 0:
+        if use_multirow:
+
+            def grid(meta):
+                return (triton.cdiv(max_seq_len, meta["BLOCK_N"]), B)
+
+            concat_2D_jagged_jagged_w_prefix_multirow[grid](
+                OffsetsA=offsets_a,
+                ValuesA=values_a,
+                OffsetsB=offsets_b,
+                ValuesB=values_b,
+                Out=values_out,
+                D=D,
+                stride_ad=values_a.stride(-2),
+                stride_bd=values_b.stride(-2),
+                stride_od=values_out.stride(0),
+                n_prefix_from_B=n_prefix,
+                BLOCK_D=BLOCK_D,
+            )
+        else:
+            concat_2D_jagged_jagged_w_prefix[(max_seq_len, B)](
+                OffsetsA=offsets_a,
+                ValuesA=values_a,
+                OffsetsB=offsets_b,
+                ValuesB=values_b,
+                Out=values_out,
+                D=D,
+                stride_ad=values_a.stride(-2),
+                stride_bd=values_b.stride(-2),
+                stride_od=values_out.stride(0),
+                n_prefix_from_B=n_prefix,
+                BLOCK_D=BLOCK_D,  # pyre-ignore[6]
+            )
+    else:
+        if use_multirow:
+
+            def grid(meta):
+                return (triton.cdiv(max_seq_len, meta["BLOCK_N"]), B)
+
+            concat_2D_jagged_multirow[grid](
+                OffsetsA=offsets_a,
+                ValuesA=values_a,
+                OffsetsB=offsets_b,
+                ValuesB=values_b,
+                DenseSize=dense_size,
+                Out=values_out,
+                D=D,
+                stride_ad=values_a.stride(-2),
+                stride_bd=values_b.stride(-2),
+                stride_dense_batch=stride_dense_batch,
+                stride_od=values_out.stride(0),
+                IS_DENSE_A=is_dense_a,  # pyre-ignore[6]
+                IS_DENSE_B=is_dense_b,  # pyre-ignore[6]
+                BLOCK_D=BLOCK_D,  # pyre-ignore[6]
+                IS_REPLACE=is_replace,  # pyre-ignore[6]
+            )
+        else:
+            concat_2D_jagged[(max_seq_len, B)](
+                OffsetsA=offsets_a,
+                ValuesA=values_a,
+                OffsetsB=offsets_b,
+                ValuesB=values_b,
+                DenseSize=dense_size,
+                Out=values_out,
+                D=D,
+                stride_ad=values_a.stride(-2),
+                stride_bd=values_b.stride(-2),
+                stride_dense_batch=stride_dense_batch,
+                stride_od=values_out.stride(0),
+                IS_DENSE_A=is_dense_a,  # pyre-ignore[6]
+                IS_DENSE_B=is_dense_b,  # pyre-ignore[6]
+                BLOCK_D=BLOCK_D,  # pyre-ignore[6]
+                IS_REPLACE=is_replace,  # pyre-ignore[6]
+            )
+
+
+def _get_split_concat_2d_jagged_multirow_configs() -> List[triton.Config]:
+    configs = []
+    for BLOCK_N in [1, 2, 4, 8]:
+        for num_warps in [1, 2, 4]:
+            configs.append(
+                triton.Config(
+                    {"BLOCK_N": BLOCK_N},
+                    num_warps=num_warps,
+                )
+            )
+    return configs
+
+
+def _get_split_concat_2d_jagged_multirow_configs_wrapper() -> List[triton.Config]:
+    # Use extended config space only when JAGGED_USE_MULTIROW_MI350 is explicitly set,
+    # otherwise fall back to the existing configs to avoid breaking autotune.
+    if os.environ.get("JAGGED_USE_MULTIROW_MI350") is not None:
+        configs = []
+        # Extended config space for MI350 tuning
+        # - BLOCK_N: number of rows processed per block
+        # - num_warps: number of warps (AMD wavefront = 64 threads)
+        # - num_stages: software pipeline depth for memory latency hiding
+        #   NOTE: num_stages=0 is invalid for AMD GPUs, start from 1
+        # - waves_per_eu: AMD-specific, controls occupancy (waves per execution unit)
+        for BLOCK_N in [1, 2, 4, 8, 16, 32]:
+            for num_warps in [1, 2, 4, 8, 16, 32]:
+                for num_stages in [1, 2, 3, 4]:
+                    for waves_per_eu in [0, 1, 2, 4]:
+                        configs.append(
+                            triton.Config(
+                                {"BLOCK_N": BLOCK_N, "waves_per_eu": waves_per_eu},
+                                num_warps=num_warps,
+                                num_stages=num_stages,
+                            )
+                        )
+        return configs
+    return _get_split_concat_2d_jagged_multirow_configs()
+
+
+def _get_bmm_configs() -> List[triton.Config]:
+    configs = []
+    for BLOCK_M in [64, 128]:
+        for BLOCK_N in [64, 128, 256]:
+            for BLOCK_K in [32, 64]:
+                for num_stages in [3, 5]:
+                    for num_warps in [4, 8]:
+                        configs.append(
+                            triton.Config(
+                                {
+                                    "BLOCK_M": BLOCK_M,
+                                    "BLOCK_N": BLOCK_N,
+                                    "BLOCK_K": BLOCK_K,
+                                },
+                                num_stages=num_stages,
+                                num_warps=num_warps,
+                            )
+                        )
+    return configs
+
+
+@triton_autotune(
+    configs=_get_bmm_configs(),
+    key=["AUTOTUNE_MAX_SEQ_LEN", "N", "K", "ELEMENTWISE", "HAS_BIAS"],
+)
+@triton.jit
+def jagged_dense_bmm_broadcast_add_kernel(
+    seq_offsets,
+    Jagged,
+    Dense,
+    Bias,
+    Out,
+    AUTOTUNE_MAX_SEQ_LEN,
+    N,
+    K,
+    stride_jm,
+    stride_db,
+    stride_dk,
+    stride_dn,
+    stride_bias_b,
+    stride_om,
+    HAS_BIAS: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    ELEMENTWISE: tl.constexpr,
+):
+    """
+    Computing bmm Out = Jagged x Dense + Bias
+    M is the jagged dimension
+    Jagged has shape (sum_B(M_i), K), Dense has shape (B, K, N), Bias has shape (B, N), and Out has shape (sum_B(M_i), N)
+    """
+
+    off_n = tl.program_id(0)
+    off_m = tl.program_id(1).to(tl.int64)
+    off_b = tl.program_id(2)
+
+    seq_start = tl.load(seq_offsets + off_b).to(tl.int64)
+    seq_end = tl.load(seq_offsets + off_b + 1)
+    seq_len = seq_end - seq_start
+    start_m = off_m * BLOCK_M
+    start_n = off_n * BLOCK_N
+    if start_m >= seq_len:
+        return
+
+    Jagged += (seq_start + start_m) * stride_jm
+    Dense += off_b.to(tl.int64) * stride_db
+    Out += seq_start * stride_om
+
+    offs_m = tl.arange(0, BLOCK_M)
+    offs_n = start_n + tl.arange(0, BLOCK_N)
+    offs_k = tl.arange(0, BLOCK_K)
+    jg_ptrs = Jagged + offs_m[:, None] * stride_jm + offs_k[None, :]
+    dn_ptrs = Dense + offs_k[:, None] * stride_dk + offs_n[None, :] * stride_dn
+
+    accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    for k in range(0, K, BLOCK_K):
+        jg = tl.load(
+            jg_ptrs,
+            # pyre-fixme[16]: `int` has no attribute `__getitem__`.
+            mask=(offs_m[:, None] < (seq_len - start_m)) & ((k + offs_k)[None, :] < K),
+            other=0.0,
+        )
+        dn = tl.load(
+            dn_ptrs,
+            mask=((k + offs_k)[:, None] < K) & (offs_n[None, :] < N),
+            other=0.0,
+        )
+        accumulator += tl.dot(jg, dn, allow_tf32=ALLOW_TF32)
+        jg_ptrs += BLOCK_K
+        dn_ptrs += BLOCK_K * stride_dk
+
+    if HAS_BIAS:
+        if ELEMENTWISE:
+            Bias += (seq_start + start_m) * stride_bias_b
+            bias_ptrs = Bias + offs_m[:, None] * stride_bias_b + offs_n[None, :]
+            bias = tl.load(
+                bias_ptrs,
+                mask=(offs_m[:, None] < (seq_len - start_m)) & (offs_n[None, :] < N),
+                other=0.0,
+            )
+            accumulator += bias.to(tl.float32)
+        else:
+            bias_ptrs = Bias + off_b.to(tl.int64) * stride_bias_b + offs_n
+            bias = tl.load(bias_ptrs, mask=offs_n < N)
+            accumulator += bias[None, :].to(tl.float32)
+
+    out = accumulator.to(Out.dtype.element_ty)
+
+    offs_m = tl.arange(0, BLOCK_M)
+    offs_n = start_n + tl.arange(0, BLOCK_N)
+    Out += start_m * stride_om
+    out_ptrs = Out + offs_m[:, None] * stride_om + offs_n[None, :]
+    tl.store(
+        out_ptrs,
+        out,
+        mask=(offs_m[:, None] < (seq_len - start_m)) & (offs_n[None, :] < N),
+    )
+
+
+def _get_bmm_reduce_sum_configs() -> List[triton.Config]:
+    configs = []
+    for BLOCK_M in [64, 128]:
+        for BLOCK_N in [64, 128]:
+            for BLOCK_K in [64, 128]:
+                for num_stages in [3, 4]:
+                    for num_warps in [4, 8]:
+                        configs.append(
+                            triton.Config(
+                                {
+                                    "BLOCK_M": BLOCK_M,
+                                    "BLOCK_N": BLOCK_N,
+                                    "BLOCK_K": BLOCK_K,
+                                },
+                                num_stages=num_stages,
+                                num_warps=num_warps,
+                            )
+                        )
+    return configs
+
+
+@triton_autotune(
+    configs=_get_bmm_reduce_sum_configs(),
+    key=["M", "N", "AUTOTUNE_MAX_SEQ_LEN"],
+)
+@triton.jit
+def _jagged_jagged_bmm_reduce_sum(
+    seq_offsets,
+    JaggedA,
+    JaggedB,
+    Out,
+    ReduceOut,
+    M,
+    N,
+    AUTOTUNE_MAX_SEQ_LEN,
+    stride_ak,
+    stride_bk,
+    stride_ob,
+    stride_om,
+    stride_on,
+    stride_orb,
+    stride_orn,
+    REDUCE_JAGGEDB: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+):
+    """
+    Computing bmm Out = Jagged x Jagged
+    K is the jagged dimension
+    JaggedA has shape (sum_B(K_i), M), JaggedB has shape (sum_B(K_i), N), and Out has shape (B, M, N)
+    """
+
+    off_m = tl.program_id(0).to(tl.int64)
+    off_n = tl.program_id(1)
+    off_b = tl.program_id(2)
+
+    seq_start = tl.load(seq_offsets + off_b).to(tl.int64)
+    seq_end = tl.load(seq_offsets + off_b + 1)
+    seq_len = seq_end - seq_start
+
+    start_m = off_m * BLOCK_M
+    start_n = off_n * BLOCK_N
+
+    accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    Out += off_b.to(tl.int64) * stride_ob
+    offs_m = tl.arange(0, BLOCK_M)
+    offs_n = start_n + tl.arange(0, BLOCK_N)
+    Out += start_m * stride_om
+    out_ptrs = Out + offs_m[:, None] * stride_om + offs_n[None, :] * stride_on
+    if REDUCE_JAGGEDB:
+        out_reduce_ptrs = (
+            ReduceOut + off_b.to(tl.int64) * stride_orb + offs_n * stride_orn
+        )
+        acc_reduce = tl.zeros((BLOCK_N,), dtype=tl.float32)
+    if seq_len == 0:
+        out = accumulator.to(Out.dtype.element_ty)
+        tl.store(
+            out_ptrs,
+            out,
+            mask=(offs_m[:, None] < (M - start_m)) & (offs_n[None, :] < N),
+        )
+        if REDUCE_JAGGEDB:
+            if off_m == 0:
+                tl.store(
+                    out_reduce_ptrs,  # pyre-ignore [61]
+                    acc_reduce.to(ReduceOut.dtype.element_ty),
+                    mask=(offs_n < N),
+                )
+        return
+
+    JaggedA += seq_start * stride_ak
+    JaggedB += seq_start * stride_bk
+    offs_k = tl.arange(0, BLOCK_K)
+    jg_a_ptrs = JaggedA + offs_k[None, :] * stride_ak + (start_m + offs_m)[:, None]
+    jg_b_ptrs = JaggedB + offs_k[:, None] * stride_bk + offs_n[None, :]
+
+    for k in range(0, seq_len, BLOCK_K):
+        jg_a = tl.load(
+            jg_a_ptrs,
+            # pyre-fixme[16]: `int` has no attribute `__getitem__`.
+            mask=(offs_m[:, None] < (M - start_m)) & ((k + offs_k)[None, :] < seq_len),
+            other=0.0,
+        )
+        jg_b = tl.load(
+            jg_b_ptrs,
+            mask=(offs_n[None, :] < N) & ((k + offs_k)[:, None] < seq_len),
+            other=0.0,
+        )
+
+        accumulator += tl.dot(jg_a, jg_b, allow_tf32=ALLOW_TF32)
+        if REDUCE_JAGGEDB:
+            if off_m == 0:
+                acc_reduce += tl.sum(jg_b.to(tl.float32), axis=0)
+
+        jg_a_ptrs += BLOCK_K * stride_ak
+        jg_b_ptrs += BLOCK_K * stride_bk
+
+    out = accumulator.to(Out.dtype.element_ty)
+    tl.store(
+        out_ptrs,
+        out,
+        mask=(offs_m[:, None] < (M - start_m)) & (offs_n[None, :] < N),
+    )
+    if REDUCE_JAGGEDB:
+        if off_m == 0:
+            tl.store(
+                out_reduce_ptrs,  # pyre-ignore [61]
+                acc_reduce.to(ReduceOut.dtype.element_ty),
+                mask=(offs_n < N),
+            )
+
+
+class _JaggedDenseBmmFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        max_seq_len: int,
+        seq_offsets: torch.Tensor,
+        jagged: torch.Tensor,
+        dense: torch.Tensor,
+    ):
+        jagged = switch_to_contiguous_if_needed(jagged)
+        L, D = jagged.shape
+        B, _, K = dense.shape
+        bmm_out = torch.empty((L, K), dtype=jagged.dtype, device=jagged.device)
+
+        grid = lambda meta: (  # noqa E731
+            triton.cdiv(K, meta["BLOCK_N"]),
+            triton.cdiv(max_seq_len, meta["BLOCK_M"]),
+            B,
+        )
+
+        jagged_dense_bmm_broadcast_add_kernel[grid](
+            seq_offsets=seq_offsets,
+            Jagged=jagged,
+            Dense=dense,
+            Bias=0,
+            Out=bmm_out,
+            AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(max_seq_len),
+            N=K,
+            K=D,
+            stride_jm=jagged.stride(0),
+            stride_db=dense.stride(0),
+            stride_dk=dense.stride(1),
+            stride_dn=dense.stride(2),
+            stride_bias_b=0,
+            stride_om=bmm_out.stride(0),
+            HAS_BIAS=False,
+            ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+            ELEMENTWISE=False,
+        )
+
+        ctx.save_for_backward(seq_offsets, jagged, dense)
+        ctx.B = B
+        ctx.max_seq_len = max_seq_len
+        ctx.K = K
+        ctx.D = D
+        return bmm_out
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, d_bmm_out: torch.Tensor
+    ) -> Tuple[None, None, torch.Tensor, torch.Tensor]:
+        seq_offsets, jagged, dense = ctx.saved_tensors
+        d_jagged = torch.empty_like(jagged)
+        d_dense = torch.empty_like(dense)
+
+        grid = lambda meta: (  # noqa E731
+            triton.cdiv(ctx.D, meta["BLOCK_N"]),
+            triton.cdiv(ctx.max_seq_len, meta["BLOCK_M"]),
+            ctx.B,
+        )
+        jagged_dense_bmm_broadcast_add_kernel[grid](
+            seq_offsets=seq_offsets,
+            Jagged=d_bmm_out,
+            Dense=dense,
+            Bias=None,
+            Out=d_jagged,
+            AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(ctx.max_seq_len),
+            N=ctx.D,
+            K=ctx.K,
+            stride_jm=d_bmm_out.stride(0),
+            stride_db=dense.stride(0),
+            stride_dk=dense.stride(2),
+            stride_dn=dense.stride(1),
+            stride_bias_b=0,
+            stride_om=d_jagged.stride(0),
+            HAS_BIAS=False,
+            ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+            ELEMENTWISE=False,
+        )
+
+        grid = lambda meta: (  # noqa E731
+            triton.cdiv(ctx.D, meta["BLOCK_M"]),
+            triton.cdiv(ctx.K, meta["BLOCK_N"]),
+            ctx.B,
+        )
+        _jagged_jagged_bmm_reduce_sum[grid](
+            seq_offsets=seq_offsets,
+            JaggedA=jagged,
+            JaggedB=d_bmm_out,
+            Out=d_dense,
+            ReduceOut=None,
+            M=ctx.D,
+            N=ctx.K,
+            AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(ctx.max_seq_len),
+            stride_ak=jagged.stride(0),
+            stride_bk=d_bmm_out.stride(0),
+            stride_ob=d_dense.stride(0),
+            stride_om=d_dense.stride(1),
+            stride_on=d_dense.stride(2),
+            stride_orb=0,
+            stride_orn=0,
+            REDUCE_JAGGEDB=False,
+            ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+        )
+
+        return None, None, d_jagged, d_dense
+
+
+def _get_jagged_dense_broadcast_add_configs() -> List[triton.Config]:
+    configs = []
+    for BLOCK_N in [16, 32, 64]:
+        for num_stages in [1, 2]:
+            for num_warps in [2, 4, 8]:
+                configs.append(
+                    triton.Config(
+                        {
+                            "BLOCK_N": BLOCK_N,
+                        },
+                        num_stages=num_stages,
+                        num_warps=num_warps,
+                    )
+                )
+    return configs
+
+
+@triton_autotune(
+    configs=_get_jagged_dense_broadcast_add_configs(),
+    key=["AUTOTUNE_MAX_SEQ_LEN"],
+)
+@triton.jit
+def jagged_dense_broadcast_add_kernel(
+    seq_offsets,
+    Jagged,
+    Dense,
+    Out,
+    AUTOTUNE_MAX_SEQ_LEN,
+    D,
+    stride_jn,
+    stride_db,
+    stride_on,
+    BLOCK_N: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+):
+    """
+    Computing Out = Jagged + Dense
+    JaggedA has shape (sum_B(N_i), D), Dense has shape (B, D), and Out has shape (sum_B(N_i), D)
+    """
+
+    off_b = tl.program_id(0)
+    off_n = tl.program_id(1)
+    seq_start = tl.load(seq_offsets + off_b)
+    seq_end = tl.load(seq_offsets + off_b + 1)
+    seq_len = seq_end - seq_start
+    start_n = off_n * BLOCK_N
+    if start_n >= seq_len:
+        return
+    Jagged += seq_start * stride_jn
+    Dense += off_b * stride_db
+    Out += seq_start * stride_on
+    offs_n = start_n + tl.arange(0, BLOCK_N)
+    offs_d = tl.arange(0, BLOCK_D)
+    jagged_ptrs = Jagged + offs_n[:, None] * stride_jn + offs_d[None, :]
+    dense_ptrs = Dense + offs_d
+    out_ptrs = Out + offs_n[:, None] * stride_jn + offs_d[None, :]
+    for d in range(0, D, BLOCK_D):
+        jg = tl.load(
+            jagged_ptrs,
+            # pyre-fixme[16]: `int` has no attribute `__getitem__`.
+            mask=(offs_n[:, None] < seq_len) & ((d + offs_d)[None, :] < D),
+        )
+        dn = tl.load(dense_ptrs, mask=d + offs_d < D)
+        out = jg + dn[None, :]
+        tl.store(
+            out_ptrs,
+            out,
+            mask=(offs_n[:, None] < seq_len) & ((d + offs_d)[None, :] < D),
+        )
+        dense_ptrs += BLOCK_D
+        jagged_ptrs += BLOCK_D
+        out_ptrs += BLOCK_D
+
+
+@triton.jit
+def jagged_reduce_sum(
+    seq_offsets,
+    Jagged,
+    Out,
+    D,
+    stride_jn,
+    stride_ob,
+    BLOCK_D: tl.constexpr,
+):
+    """
+    Computing Out = Jagged + Dense
+    JaggedA has shape (sum_B(N_i), D), Dense has shape (B, D), and Out has shape (sum_B(N_i), D)
+    """
+    off_b = tl.program_id(0)
+    off_d = tl.program_id(1) * BLOCK_D
+    seq_start = tl.load(seq_offsets + off_b)
+    seq_end = tl.load(seq_offsets + off_b + 1)
+    seq_len = seq_end - seq_start
+    Jagged += seq_start * stride_jn
+    Out += off_b * stride_ob
+    offs_d = off_d + tl.arange(0, BLOCK_D)
+    jagged_ptrs = Jagged + offs_d
+    out_ptrs = Out + offs_d
+    accumulator = tl.zeros((BLOCK_D,), dtype=tl.float32)
+    for _ in range(0, seq_len):
+        jg = tl.load(
+            jagged_ptrs,
+            mask=offs_d < D,
+        )
+        accumulator += jg
+        jagged_ptrs += stride_jn
+    out = accumulator.to(Out.dtype.element_ty)
+    tl.store(
+        out_ptrs,
+        out,
+        mask=offs_d < D,
+    )
+
+
+class _JaggedDenseBroadcastAddFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        max_seq_len: int,
+        seq_offsets: torch.Tensor,
+        jagged: torch.Tensor,
+        dense: torch.Tensor,
+    ):
+        jagged = switch_to_contiguous_if_needed(jagged)
+        dense = switch_to_contiguous_if_needed(dense)
+        L, D = jagged.shape
+        B, _ = dense.shape
+        out = torch.empty_like(jagged)
+
+        grid = lambda meta: (  # noqa E731
+            B,
+            triton.cdiv(max_seq_len, meta["BLOCK_N"]),
+        )
+        BLOCK_D = triton.next_power_of_2(D) if D < 64 else 64
+        jagged_dense_broadcast_add_kernel[grid](
+            seq_offsets=seq_offsets,
+            Jagged=jagged,
+            Dense=dense,
+            Out=out,
+            AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(max_seq_len),
+            D=D,
+            stride_jn=jagged.stride(0),
+            stride_db=dense.stride(0),
+            stride_on=out.stride(0),
+            BLOCK_D=BLOCK_D,
+        )
+
+        ctx.save_for_backward(seq_offsets)
+        ctx.max_seq_len = max_seq_len
+        ctx.B = B
+        ctx.D = D
+        return out
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, d_out: torch.Tensor
+    ) -> Tuple[None, None, torch.Tensor, torch.Tensor]:
+        seq_offsets = ctx.saved_tensors[0]
+        d_dense = torch.empty((ctx.B, ctx.D), device=d_out.device, dtype=d_out.dtype)
+        BLOCK_D = triton.next_power_of_2(ctx.D) if ctx.D < 64 else 64
+        jagged_reduce_sum[(ctx.B, triton.cdiv(ctx.D, BLOCK_D))](
+            seq_offsets=seq_offsets,
+            Jagged=d_out,
+            Out=d_dense,
+            D=ctx.D,
+            stride_jn=d_out.stride(0),
+            stride_ob=d_dense.stride(0),
+            BLOCK_D=BLOCK_D,
+        )
+        return None, None, d_out, d_dense
+
+
+def triton_jagged_dense_bmm_add_fwd(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    jagged: torch.Tensor,
+    dense: torch.Tensor,
+    bias: torch.Tensor,
+    elementwise: bool = False,
+) -> Tuple[torch.Tensor, int, int, int]:
+    jagged = switch_to_contiguous_if_needed(jagged)
+    bias = switch_to_contiguous_if_needed(bias)
+    L, K = jagged.shape
+    B, _, N = dense.shape
+    out = torch.empty((L, N), dtype=jagged.dtype, device=jagged.device)
+
+    grid = lambda meta: (  # noqa E731
+        triton.cdiv(N, meta["BLOCK_N"]),
+        triton.cdiv(max_seq_len, meta["BLOCK_M"]),
+        B,
+    )
+
+    jagged_dense_bmm_broadcast_add_kernel[grid](
+        seq_offsets=seq_offsets,
+        Jagged=jagged,
+        Dense=dense,
+        Bias=bias,
+        Out=out,
+        AUTOTUNE_MAX_SEQ_LEN=fine_grained_autotune_max_seq_len(max_seq_len),
+        N=N,
+        K=K,
+        stride_jm=jagged.stride(0),
+        stride_db=dense.stride(0),
+        stride_dk=dense.stride(1),
+        stride_dn=dense.stride(2),
+        stride_bias_b=bias.stride(0),
+        stride_om=out.stride(0),
+        HAS_BIAS=True,
+        ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+        ELEMENTWISE=elementwise,
+    )
+
+    return out, B, K, N
+
+
+def triton_jagged_dense_bmm_add_bwd_jagged(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    d_jagged: torch.Tensor,
+    dense: torch.Tensor,
+    d_out: torch.Tensor,
+    K: int,
+    B: int,
+    N: int,
+) -> torch.Tensor:
+    grid = lambda meta: (  # noqa E731
+        triton.cdiv(K, meta["BLOCK_N"]),
+        triton.cdiv(max_seq_len, meta["BLOCK_M"]),
+        B,
+    )
+    jagged_dense_bmm_broadcast_add_kernel[grid](
+        seq_offsets=seq_offsets,
+        Jagged=d_out,
+        Dense=dense,
+        Bias=None,
+        Out=d_jagged,
+        AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(max_seq_len),
+        N=K,
+        K=N,
+        stride_jm=d_out.stride(0),
+        stride_db=dense.stride(0),
+        stride_dk=dense.stride(2),
+        stride_dn=dense.stride(1),
+        stride_bias_b=0,
+        stride_om=d_jagged.stride(0),
+        HAS_BIAS=False,
+        ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+        ELEMENTWISE=False,
+    )
+
+    return d_jagged
+
+
+def triton_jagged_dense_bmm_add_bwd_dense_bias(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    jagged: torch.Tensor,
+    d_dense: torch.Tensor,
+    B: int,
+    K: int,
+    N: int,
+    d_out: torch.Tensor,
+    elementwise: bool,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    d_bias = torch.empty((B, N), device=d_out.device, dtype=d_out.dtype)
+
+    grid = lambda meta: (  # noqa E731
+        triton.cdiv(K, meta["BLOCK_M"]),
+        triton.cdiv(N, meta["BLOCK_N"]),
+        B,
+    )
+
+    if elementwise:
+        d_bias = d_out
+        reduce_out = None
+        stride_orb = 0
+        stride_orn = 0
+        reduce_jaggedb = False
+    else:
+        reduce_out = d_bias
+        stride_orb = d_bias.stride(0)
+        stride_orn = d_bias.stride(1)
+        reduce_jaggedb = True
+
+    _jagged_jagged_bmm_reduce_sum[grid](
+        seq_offsets=seq_offsets,
+        JaggedA=jagged,
+        JaggedB=d_out,
+        Out=d_dense,
+        ReduceOut=reduce_out,
+        M=K,
+        N=N,
+        AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(max_seq_len),
+        stride_ak=jagged.stride(0),
+        stride_bk=d_out.stride(0),
+        stride_ob=d_dense.stride(0),
+        stride_om=d_dense.stride(1),
+        stride_on=d_dense.stride(2),
+        stride_orb=stride_orb,
+        stride_orn=stride_orn,
+        REDUCE_JAGGEDB=reduce_jaggedb,
+        ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+    )
+
+    return d_dense, d_bias
+
+
+class _JaggedDenseBmmAddFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        max_seq_len: int,
+        seq_offsets: torch.Tensor,
+        jagged: torch.Tensor,
+        dense: torch.Tensor,
+        bias: torch.Tensor,
+        elementwise: bool = False,
+    ):
+        if get_cuda_jagged_dense_bmm_fwd():
+            jagged = switch_to_contiguous_if_needed(jagged)
+            bias = switch_to_contiguous_if_needed(bias)
+            # Ensure bias has same dtype as jagged (required by CUDA kernel)
+            bias = bias.to(jagged.dtype)
+            # Ensure seq_offsets is int64 (required by CUDA kernel)
+            seq_offsets = seq_offsets.to(torch.int64)
+            _, K = jagged.shape
+            B, _, N = dense.shape
+            out = torch.ops.jagged_dense_bmm_broadcast_add.jagged_dense_bmm_broadcast_add_fwd(
+                max_seq_len, seq_offsets, jagged, dense, bias, elementwise
+            )
+        else:
+            out, B, K, N = triton_jagged_dense_bmm_add_fwd(
+                max_seq_len, seq_offsets, jagged, dense, bias, elementwise
+            )
+
+        ctx.save_for_backward(seq_offsets, jagged, dense)
+        ctx.B = B
+        ctx.max_seq_len = max_seq_len
+        ctx.K = K
+        ctx.N = N
+        ctx.elementwise = elementwise
+        return out
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, d_out: torch.Tensor
+    ) -> Tuple[None, None, torch.Tensor, torch.Tensor, torch.Tensor, None]:
+        seq_offsets, jagged, dense = ctx.saved_tensors
+        if get_cuda_jagged_dense_bmm_bwd():
+            d_jagged, d_dense, d_bias = (
+                torch.ops.jagged_dense_bmm_broadcast_add.jagged_dense_bmm_broadcast_add_bwd(
+                    ctx.max_seq_len,
+                    d_out,
+                    seq_offsets.to(torch.int64),
+                    jagged,
+                    dense,
+                    ctx.elementwise,
+                )
+            )
+        else:
+            d_jagged = triton_jagged_dense_bmm_add_bwd_jagged(
+                ctx.max_seq_len,
+                seq_offsets,
+                torch.empty_like(jagged),
+                dense,
+                d_out,
+                ctx.K,
+                ctx.B,
+                ctx.N,
+            )
+            d_dense, d_bias = triton_jagged_dense_bmm_add_bwd_dense_bias(
+                ctx.max_seq_len,
+                seq_offsets,
+                jagged,
+                torch.empty_like(dense),
+                ctx.B,
+                ctx.K,
+                ctx.N,
+                d_out,
+                ctx.elementwise,
+            )
+
+        return None, None, d_jagged, d_dense, d_bias, None
+
+
+@triton.jit
+def concat_2D_jagged_w_prefix(
+    OffsetsA,
+    ValuesA,
+    OffsetsB,
+    ValuesB,
+    DenseSize,
+    Out,
+    D,
+    stride_ad,
+    stride_bd,
+    stride_dense_batch,
+    stride_od,
+    n_prefix_from_B,  # nonzero is not supported when IS_REPLACE=True
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    IS_REPLACE: tl.constexpr,
+):
+    off_z = tl.program_id(1)
+    off_n = tl.program_id(0)
+    if IS_DENSE_A:
+        seq_start_a = off_z * DenseSize
+        seq_len_a = DenseSize
+        seq_start_b = tl.load(OffsetsB + off_z)
+        seq_end_b = tl.load(OffsetsB + off_z + 1)
+        seq_len_b = seq_end_b - seq_start_b
+    elif IS_DENSE_B:
+        seq_start_a = tl.load(OffsetsA + off_z)
+        seq_end_a = tl.load(OffsetsA + off_z + 1)
+        seq_len_a = seq_end_a - seq_start_a
+        seq_start_b = off_z * DenseSize
+        seq_len_b = DenseSize
+    else:
+        seq_start_a = tl.load(OffsetsA + off_z)
+        seq_end_a = tl.load(OffsetsA + off_z + 1)
+        seq_len_a = seq_end_a - seq_start_a
+        seq_start_b = tl.load(OffsetsB + off_z)
+        seq_end_b = tl.load(OffsetsB + off_z + 1)
+        seq_len_b = seq_end_b - seq_start_b
+
+    if IS_REPLACE:
+        seq_len = seq_len_a
+    else:
+        seq_len = seq_len_a + seq_len_b
+    if off_n >= seq_len:
+        return
+
+    offs_d = tl.arange(0, BLOCK_D)
+    if IS_REPLACE:
+        out_seq_start = seq_start_a + off_n
+        out_seq_b_start = seq_len_a - seq_len_b
+    else:
+        out_seq_start = seq_start_a + seq_start_b + off_n
+        out_seq_b_start = seq_len_a + n_prefix_from_B
+
+    out_ptrs = Out + out_seq_start.to(tl.int64) * stride_od + offs_d
+    if off_n < out_seq_b_start and off_n >= n_prefix_from_B:
+        off_a = off_n - n_prefix_from_B
+        if IS_DENSE_A:
+            in_ptrs = (
+                ValuesA
+                + off_a.to(tl.int64) * stride_ad
+                + off_z.to(tl.int64) * stride_dense_batch
+                + offs_d
+            )
+        else:
+            in_ptrs = ValuesA + (off_a + seq_start_a).to(tl.int64) * stride_ad + offs_d
+    else:
+        off_b = off_n - out_seq_b_start + n_prefix_from_B
+        if off_n < n_prefix_from_B:
+            off_b += out_seq_b_start - n_prefix_from_B
+        if IS_DENSE_B:
+            in_ptrs = (
+                ValuesB
+                + off_b.to(tl.int64) * stride_bd
+                + off_z.to(tl.int64) * stride_dense_batch
+                + offs_d
+            )
+        else:
+            in_ptrs = ValuesB + (off_b + seq_start_b).to(tl.int64) * stride_bd + offs_d
+    v = tl.load(in_ptrs, mask=offs_d < D)
+    tl.store(out_ptrs, v, mask=offs_d < D)
+
+
+@triton.jit
+def concat_2D_jagged(
+    OffsetsA,
+    ValuesA,
+    OffsetsB,
+    ValuesB,
+    DenseSize,
+    Out,
+    D,
+    stride_ad,
+    stride_bd,
+    stride_dense_batch,
+    stride_od,
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    IS_REPLACE: tl.constexpr,
+):
+    concat_2D_jagged_w_prefix(
+        OffsetsA,
+        ValuesA,
+        OffsetsB,
+        ValuesB,
+        DenseSize,
+        Out,
+        D,
+        stride_ad,
+        stride_bd,
+        stride_dense_batch,
+        stride_od,
+        0,
+        IS_DENSE_A,
+        IS_DENSE_B,
+        BLOCK_D,
+        IS_REPLACE,
+    )
+
+
+@triton.jit
+def concat_2D_jagged_jagged_w_prefix(
+    OffsetsA,
+    ValuesA,
+    OffsetsB,
+    ValuesB,
+    Out,
+    D,
+    stride_ad,
+    stride_bd,
+    stride_od,
+    n_prefix_from_B,
+    BLOCK_D: tl.constexpr,
+):
+    concat_2D_jagged_w_prefix(
+        OffsetsA,
+        ValuesA,
+        OffsetsB,
+        ValuesB,
+        0,
+        Out,
+        D,
+        stride_ad,
+        stride_bd,
+        0,
+        stride_od,
+        n_prefix_from_B,
+        IS_DENSE_A=False,
+        IS_DENSE_B=False,
+        BLOCK_D=BLOCK_D,
+        IS_REPLACE=False,
+    )
+
+
+@triton.jit
+def split_2D_jagged_w_prefix(
+    JaggedIn,
+    DenseSize,
+    OffsetsA,
+    OffsetsB,
+    OutA,
+    OutB,
+    D,
+    stride_id,
+    stride_ad,
+    stride_bd,
+    n_prefix_to_B,
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    IS_REPLACE: tl.constexpr,
+):
+    off_z = tl.program_id(1)
+    off_n = tl.program_id(0)
+    if IS_DENSE_A:
+        seq_start_b = tl.load(OffsetsB + off_z)
+        seq_end_b = tl.load(OffsetsB + off_z + 1)
+        seq_start_a = off_z * DenseSize
+        seq_len_a = DenseSize
+        seq_len_b = seq_end_b - seq_start_b
+    elif IS_DENSE_B:
+        seq_start_a = tl.load(OffsetsA + off_z)
+        seq_end_a = tl.load(OffsetsA + off_z + 1)
+        seq_len_a = seq_end_a - seq_start_a
+        seq_start_b = off_z * DenseSize
+        seq_len_b = DenseSize
+    else:
+        seq_start_a = tl.load(OffsetsA + off_z)
+        seq_end_a = tl.load(OffsetsA + off_z + 1)
+        seq_len_a = seq_end_a - seq_start_a
+        seq_start_b = tl.load(OffsetsB + off_z)
+        seq_end_b = tl.load(OffsetsB + off_z + 1)
+        seq_len_b = seq_end_b - seq_start_b
+    if IS_REPLACE:
+        seq_len = seq_len_a
+    else:
+        seq_len = seq_len_a + seq_len_b
+    if off_n >= seq_len:
+        return
+
+    if IS_REPLACE:
+        seq_start = seq_start_a
+        out_seq_b_start = seq_len_a - seq_len_b
+    else:
+        seq_start = seq_start_a + seq_start_b
+        out_seq_b_start = seq_len_a + n_prefix_to_B
+
+    offs_d = tl.arange(0, BLOCK_D)
+    in_ptrs = JaggedIn + (seq_start + off_n).to(tl.int64) * stride_id + offs_d
+    if off_n < out_seq_b_start and off_n >= n_prefix_to_B:
+        off_a = off_n - n_prefix_to_B
+        out_ptrs = OutA + (off_a + seq_start_a).to(tl.int64) * stride_ad + offs_d
+    else:
+        off_b = off_n - out_seq_b_start + n_prefix_to_B
+        if off_n < n_prefix_to_B:
+            off_b += out_seq_b_start - n_prefix_to_B
+        out_ptrs = OutB + (off_b + seq_start_b).to(tl.int64) * stride_bd + offs_d
+    v = tl.load(in_ptrs, mask=offs_d < D)
+    tl.store(out_ptrs, v, mask=offs_d < D)
+
+
+@triton.jit
+def split_2D_jagged(
+    JaggedIn,
+    DenseSize,
+    OffsetsA,
+    OffsetsB,
+    OutA,
+    OutB,
+    D,
+    stride_id,
+    stride_ad,
+    stride_bd,
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    IS_REPLACE: tl.constexpr,
+):
+    split_2D_jagged_w_prefix(
+        JaggedIn,
+        DenseSize,
+        OffsetsA,
+        OffsetsB,
+        OutA,
+        OutB,
+        D,
+        stride_id,
+        stride_ad,
+        stride_bd,
+        0,
+        IS_DENSE_A,
+        IS_DENSE_B,
+        BLOCK_D,
+        IS_REPLACE,
+    )
+
+
+@triton.jit
+def split_2D_jagged_jagged_w_prefix(
+    JaggedIn,
+    OffsetsA,
+    OffsetsB,
+    OutA,
+    OutB,
+    D,
+    stride_id,
+    stride_ad,
+    stride_bd,
+    n_prefix_to_B,
+    BLOCK_D: tl.constexpr,
+):
+    split_2D_jagged_w_prefix(
+        JaggedIn,
+        0,
+        OffsetsA,
+        OffsetsB,
+        OutA,
+        OutB,
+        D,
+        stride_id,
+        stride_ad,
+        stride_bd,
+        n_prefix_to_B,
+        IS_DENSE_A=False,
+        IS_DENSE_B=False,
+        BLOCK_D=BLOCK_D,
+        IS_REPLACE=False,
+    )
+
+
+def _triton_split_2D_jagged_internal(
+    jagged_in: torch.Tensor,
+    max_seq_len: int,
+    B: int,
+    offsets_a: Optional[torch.Tensor],
+    offsets_b: Optional[torch.Tensor],
+    out_a: torch.Tensor,
+    out_b: torch.Tensor,
+    D: int,
+    dense_size: int,
+    n_prefix: int,
+    is_dense_a: bool,
+    is_dense_b: bool,
+    is_replace: bool,
+    BLOCK_D: int,
+) -> None:
+    use_multirow = _should_use_multirow()
+    if n_prefix != 0:
+        if use_multirow:
+
+            def grid(meta):
+                return (triton.cdiv(max_seq_len, meta["BLOCK_N"]), B)
+
+            split_2D_jagged_jagged_w_prefix_multirow[grid](
+                JaggedIn=jagged_in,
+                OffsetsA=offsets_a,
+                OffsetsB=offsets_b,
+                OutA=out_a,
+                OutB=out_b,
+                D=D,
+                stride_id=jagged_in.stride(0),
+                stride_ad=out_a.stride(0),
+                stride_bd=out_b.stride(0),
+                n_prefix_to_B=n_prefix,
+                BLOCK_D=BLOCK_D,
+            )
+        else:
+            split_2D_jagged_jagged_w_prefix[(max_seq_len, B)](
+                JaggedIn=jagged_in,
+                OffsetsA=offsets_a,
+                OffsetsB=offsets_b,
+                OutA=out_a,
+                OutB=out_b,
+                D=D,
+                stride_id=jagged_in.stride(0),
+                stride_ad=out_a.stride(0),
+                stride_bd=out_b.stride(0),
+                n_prefix_to_B=n_prefix,
+                BLOCK_D=BLOCK_D,  # pyre-ignore[6]
+            )
+    else:
+        if use_multirow:
+
+            def grid(meta):
+                return (triton.cdiv(max_seq_len, meta["BLOCK_N"]), B)
+
+            split_2D_jagged_multirow[grid](
+                JaggedIn=jagged_in,
+                DenseSize=dense_size,
+                OffsetsA=offsets_a,
+                OffsetsB=offsets_b,
+                OutA=out_a,
+                OutB=out_b,
+                D=D,
+                stride_id=jagged_in.stride(0),
+                stride_ad=out_a.stride(0),
+                stride_bd=out_b.stride(0),
+                IS_DENSE_A=is_dense_a,  # pyre-ignore[6]
+                IS_DENSE_B=is_dense_b,  # pyre-ignore[6]
+                BLOCK_D=BLOCK_D,  # pyre-ignore[6]
+                IS_REPLACE=is_replace,  # pyre-ignore[6]
+            )
+        else:
+            split_2D_jagged[(max_seq_len, B)](
+                JaggedIn=jagged_in,
+                DenseSize=dense_size,
+                OffsetsA=offsets_a,
+                OffsetsB=offsets_b,
+                OutA=out_a,
+                OutB=out_b,
+                D=D,
+                stride_id=jagged_in.stride(0),
+                stride_ad=out_a.stride(0),
+                stride_bd=out_b.stride(0),
+                IS_DENSE_A=is_dense_a,  # pyre-ignore[6]
+                IS_DENSE_B=is_dense_b,  # pyre-ignore[6]
+                BLOCK_D=BLOCK_D,  # pyre-ignore[6]
+                IS_REPLACE=is_replace,  # pyre-ignore[6]
+            )
+
+
+class _Concat2DJaggedFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        max_seq_len: int,
+        values_a: torch.Tensor,
+        values_b: torch.Tensor,
+        offsets_a: Optional[torch.Tensor] = None,
+        offsets_b: Optional[torch.Tensor] = None,
+        is_replace: bool = False,
+        n_prefix_from_right: int = 0,
+    ):
+        values_a = switch_to_contiguous_if_needed(values_a)
+        values_b = switch_to_contiguous_if_needed(values_b)
+        is_dense_a = offsets_a is None
+        is_dense_b = offsets_b is None
+        dense_size: int = 0
+        if is_dense_a:
+            assert offsets_b is not None
+            B, dense_size, D = values_a.shape
+            seq_len_a = dense_size * B
+            seq_len_b, _ = values_b.shape
+            device = values_b.device
+            dtype = values_b.dtype
+            stride_dense_batch = values_a.stride(0)
+        elif is_dense_b:
+            assert offsets_a is not None
+            B, dense_size, D = values_b.shape
+            seq_len_a, _ = values_a.shape
+            seq_len_b = dense_size * B
+            device = values_a.device
+            dtype = values_a.dtype
+            stride_dense_batch = values_b.stride(0)
+        else:
+            assert offsets_a is not None and offsets_b is not None
+            B = offsets_a.shape[0] - 1
+            seq_len_a, D = values_a.shape
+            seq_len_b, _ = values_b.shape
+            device = values_a.device
+            dtype = values_a.dtype
+            stride_dense_batch = 0
+
+        BLOCK_D = triton.next_power_of_2(D)
+        if is_replace:
+            values_out = torch.empty_like(values_a)
+        else:
+            values_out = torch.empty(
+                (seq_len_a + seq_len_b, D), device=device, dtype=dtype
+            )
+        _triton_concat_2D_jagged_internal(
+            values_a=values_a,
+            values_b=values_b,
+            values_out=values_out,
+            max_seq_len=max_seq_len,
+            B=B,
+            offsets_a=offsets_a,
+            offsets_b=offsets_b,
+            D=D,
+            dense_size=dense_size,
+            stride_dense_batch=stride_dense_batch,
+            n_prefix=n_prefix_from_right,
+            is_dense_a=is_dense_a,
+            is_dense_b=is_dense_b,
+            is_replace=is_replace,
+            BLOCK_D=BLOCK_D,
+        )
+        ctx.save_for_backward(offsets_a, offsets_b)
+        ctx.max_seq_len = max_seq_len
+        ctx.seq_len_a = seq_len_a
+        ctx.seq_len_b = seq_len_b
+        ctx.is_dense_a = is_dense_a
+        ctx.is_dense_b = is_dense_b
+        ctx.dense_size = dense_size
+        ctx.is_replace = is_replace
+        ctx.n_prefix_from_right = n_prefix_from_right
+        return values_out
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, d_out: torch.Tensor
+    ) -> Tuple[None, torch.Tensor, torch.Tensor, None, None, None, None]:
+        offsets_a, offsets_b = ctx.saved_tensors
+        is_dense_a, is_dense_b, is_replace = (
+            ctx.is_dense_a,
+            ctx.is_dense_b,
+            ctx.is_replace,
+        )
+        dense_size = ctx.dense_size
+        if is_dense_a:
+            B = offsets_b.shape[0] - 1
+        else:
+            B = offsets_a.shape[0] - 1
+        _, D = d_out.shape
+        BLOCK_D = triton.next_power_of_2(D)
+        values_a = torch.zeros(
+            (ctx.seq_len_a, D), device=d_out.device, dtype=d_out.dtype
+        )
+        values_b = torch.empty(
+            (ctx.seq_len_b, D), device=d_out.device, dtype=d_out.dtype
+        )
+        _triton_split_2D_jagged_internal(
+            jagged_in=d_out,
+            max_seq_len=ctx.max_seq_len,
+            B=B,
+            offsets_a=offsets_a,
+            offsets_b=offsets_b,
+            out_a=values_a,
+            out_b=values_b,
+            D=D,
+            dense_size=dense_size,
+            n_prefix=ctx.n_prefix_from_right,
+            is_dense_a=is_dense_a,
+            is_dense_b=is_dense_b,
+            is_replace=is_replace,
+            BLOCK_D=BLOCK_D,
+        )
+
+        if is_dense_a:
+            values_a = values_a.reshape((B, dense_size, D))
+        elif is_dense_b:
+            values_b = values_b.reshape((B, dense_size, D))
+        return None, values_a, values_b, None, None, None, None
+
+
+class _HelionConcat2DJaggedFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        max_seq_len: int,
+        values_a: torch.Tensor,
+        values_b: torch.Tensor,
+        offsets_a: Optional[torch.Tensor] = None,
+        offsets_b: Optional[torch.Tensor] = None,
+    ):
+        values_a = switch_to_contiguous_if_needed(values_a)
+        values_b = switch_to_contiguous_if_needed(values_b)
+
+        assert offsets_a is not None and offsets_b is not None
+        B = offsets_a.shape[0] - 1
+        seq_len_a, D = values_a.shape
+        seq_len_b, _ = values_b.shape
+        device = values_a.device
+        dtype = values_a.dtype
+
+        BLOCK_D = triton.next_power_of_2(D)
+        values_out = torch.empty((seq_len_a + seq_len_b, D), device=device, dtype=dtype)
+        _triton_concat_2D_jagged_internal(
+            values_a=values_a,
+            values_b=values_b,
+            values_out=values_out,
+            max_seq_len=max_seq_len,
+            B=B,
+            offsets_a=offsets_a,
+            offsets_b=offsets_b,
+            D=D,
+            dense_size=0,
+            stride_dense_batch=0,
+            n_prefix=0,
+            is_dense_a=False,
+            is_dense_b=False,
+            is_replace=False,
+            BLOCK_D=BLOCK_D,
+        )
+        ctx.save_for_backward(offsets_a, offsets_b)
+        ctx.max_seq_len = max_seq_len
+        ctx.seq_len_a = seq_len_a
+        ctx.seq_len_b = seq_len_b
+        return values_out
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, d_out: torch.Tensor
+    ) -> Tuple[None, torch.Tensor, torch.Tensor, None, None, None, None]:
+        offsets_a, offsets_b = ctx.saved_tensors
+        d_out = switch_to_contiguous_if_needed(d_out)
+        values_a, values_b = _helion_split_2D_jagged_impl(
+            values=d_out,
+            max_seq_len=ctx.max_seq_len,
+            offsets_a=offsets_a,
+            offsets_b=offsets_b,
+            dense_size=0,
+            total_len_a=ctx.seq_len_a,
+            total_len_b=ctx.seq_len_b,
+        )
+
+        return None, values_a, values_b, None, None, None, None
+
+
+class _Split2DJaggedFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        values: torch.Tensor,
+        max_seq_len: int,
+        offsets_a: Optional[torch.Tensor] = None,
+        offsets_b: Optional[torch.Tensor] = None,
+        dense_size: int = 0,
+        n_prefix_to_right: int = 0,
+        seq_len_a: Optional[int] = None,
+        seq_len_b: Optional[int] = None,
+        total_seq_len: Optional[int] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        values = switch_to_contiguous_if_needed(values)
+        is_dense_a: bool = offsets_a is None
+        is_dense_b: bool = offsets_b is None
+        if is_dense_a:
+            L, _ = values.shape
+            assert offsets_b is not None
+            B = offsets_b.shape[0] - 1
+            seq_len_a = dense_size * B
+            seq_len_b = L - seq_len_a
+            offsets_a = offsets_b.new_empty(0)
+        elif is_dense_b:
+            L, _ = values.shape
+            assert offsets_a is not None
+            B = offsets_a.shape[0] - 1
+            seq_len_b = dense_size * B
+            seq_len_a = L - seq_len_b
+            offsets_b = offsets_a.new_empty(0)
+        else:
+            assert offsets_a is not None and offsets_b is not None
+            B = offsets_a.shape[0] - 1
+
+            # Select the last offset item using torch.index_select instead of
+            # "int(offsets_a[-1].item())" so that it won't cause "Cannot cast
+            # FakeTensor to python number" error for AOTI.
+            if torch.compiler.is_compiling():
+                offsets_b_last_idx = torch.tensor(offsets_b.size(0) - 1).to(
+                    offsets_b.device, non_blocking=True
+                )
+                if seq_len_b is None:
+                    seq_len_b = offsets_b.index_select(dim=0, index=offsets_b_last_idx)
+                if seq_len_a is None and total_seq_len is None:
+                    offsets_a_last_idx = torch.tensor(offsets_a.size(0) - 1).to(
+                        offsets_a.device, non_blocking=True
+                    )
+                    seq_len_a = offsets_a.index_select(dim=0, index=offsets_a_last_idx)
+            else:
+                if seq_len_b is None:
+                    seq_len_b = int(offsets_b[-1].item())
+                if seq_len_a is None and total_seq_len is None:
+                    seq_len_a = int(offsets_a[-1].item())
+        _, D = values.shape
+        BLOCK_D = triton.next_power_of_2(D)
+        # pyre-ignore[6] Incompatible parameter type
+        values_b = torch.empty((seq_len_b, D), device=values.device, dtype=values.dtype)
+        if seq_len_a is None:
+            # Derive seq_len_a from total_seq_len and values_b.size(0).
+            # values_b.size(0) is a SymInt (from the torch.empty above),
+            # so this is SymInt arithmetic — no new unbacked SymInt.
+            assert total_seq_len is not None
+            seq_len_a = total_seq_len - values_b.size(0)
+        # pyre-ignore[6] Incompatible parameter type
+        values_a = torch.empty((seq_len_a, D), device=values.device, dtype=values.dtype)
+        _triton_split_2D_jagged_internal(
+            jagged_in=values,
+            max_seq_len=max_seq_len,
+            B=B,
+            offsets_a=offsets_a,
+            offsets_b=offsets_b,
+            out_a=values_a,
+            out_b=values_b,
+            D=D,
+            dense_size=dense_size,
+            n_prefix=n_prefix_to_right,
+            is_dense_a=is_dense_a,
+            is_dense_b=is_dense_b,
+            is_replace=False,
+            BLOCK_D=BLOCK_D,
+        )
+        if is_dense_a:
+            values_a = values_a.reshape(B, dense_size, D)
+        if is_dense_b:
+            values_b = values_b.reshape(B, dense_size, D)
+        ctx.save_for_backward(offsets_a, offsets_b)
+        ctx.max_seq_len = max_seq_len
+        ctx.seq_len_a = seq_len_a
+        ctx.seq_len_b = seq_len_b
+        ctx.is_dense_a = is_dense_a
+        ctx.is_dense_b = is_dense_b
+        ctx.dense_size = dense_size
+        ctx.B = B
+        ctx.D = D
+        ctx.n_prefix_to_right = n_prefix_to_right
+        return values_a, values_b
+
+    @staticmethod
+    def backward(
+        ctx, *d_values
+    ) -> Tuple[torch.Tensor, None, None, None, None, None, None, None, None]:
+        offsets_a, offsets_b = ctx.saved_tensors
+        is_dense_a, is_dense_b = ctx.is_dense_a, ctx.is_dense_b
+        values_a, values_b = d_values
+        if is_dense_a:
+            stride_dense_batch = values_a.stride(0)
+        elif is_dense_b:
+            stride_dense_batch = values_b.stride(0)
+        else:
+            stride_dense_batch = 0
+
+        BLOCK_D = triton.next_power_of_2(ctx.D)
+        dvalues = torch.empty(
+            (ctx.seq_len_a + ctx.seq_len_b, ctx.D),
+            device=values_a.device,
+            dtype=values_b.dtype,
+        )
+        _triton_concat_2D_jagged_internal(
+            values_a=values_a,
+            values_b=values_b,
+            values_out=dvalues,
+            max_seq_len=ctx.max_seq_len,
+            B=ctx.B,
+            offsets_a=offsets_a,
+            offsets_b=offsets_b,
+            D=ctx.D,
+            dense_size=ctx.dense_size,
+            stride_dense_batch=stride_dense_batch,
+            n_prefix=ctx.n_prefix_to_right,
+            is_dense_a=is_dense_a,
+            is_dense_b=is_dense_b,
+            is_replace=False,
+            BLOCK_D=BLOCK_D,
+        )
+
+        return dvalues, None, None, None, None, None, None, None, None
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def triton_jagged_dense_bmm_add(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    jagged: torch.Tensor,
+    dense: torch.Tensor,
+    bias: torch.Tensor,
+    elementwise: bool = False,
+) -> torch.Tensor:
+    """
+    Computing bmm Out = Jagged x Dense + Bias
+    M is the jagged dimension
+    Jagged has shape (sum_B(M_i), K), Dense has shape (B, K, N), Bias has shape (B, N) or (sum_B(M_i), N) depending on Elementwise, and Out has shape (sum_B(M_i), N)
+    """
+    return _JaggedDenseBmmAddFunction.apply(
+        max_seq_len, seq_offsets, jagged, dense, bias, elementwise
+    )
+
+
+@torch.fx.wrap
+def triton_concat_2D_jagged(
+    max_seq_len: int,
+    values_a: torch.Tensor,
+    values_b: torch.Tensor,
+    offsets_a: Optional[torch.Tensor] = None,
+    offsets_b: Optional[torch.Tensor] = None,
+    is_replace: bool = False,
+    n_prefix_from_right: int = 0,
+) -> torch.Tensor:
+    return _Concat2DJaggedFunction.apply(
+        max_seq_len,
+        values_a,
+        values_b,
+        offsets_a,
+        offsets_b,
+        is_replace,
+        n_prefix_from_right,
+    )
+
+
+@torch.fx.wrap
+def triton_concat_2D_jagged_jagged(
+    max_seq_len_left: int,
+    offsets_left: torch.Tensor,
+    values_left: torch.Tensor,
+    max_seq_len_right: int,
+    offsets_right: torch.Tensor,
+    values_right: torch.Tensor,
+    is_replace: bool,
+    n_prefix_from_right: int,
+) -> torch.Tensor:
+    return triton_concat_2D_jagged(
+        max_seq_len=max_seq_len_left + max_seq_len_right,
+        values_a=values_left,
+        values_b=values_right,
+        offsets_a=offsets_left,
+        offsets_b=offsets_right,
+        is_replace=is_replace,
+        n_prefix_from_right=n_prefix_from_right,
+    )
+
+
+@torch.fx.wrap
+def helion_concat_2D_jagged(
+    max_seq_len: int,
+    values_a: torch.Tensor,
+    values_b: torch.Tensor,
+    offsets_a: Optional[torch.Tensor] = None,
+    offsets_b: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    return _HelionConcat2DJaggedFunction.apply(
+        max_seq_len,
+        values_a,
+        values_b,
+        offsets_a,
+        offsets_b,
+    )
+
+
+@torch.fx.wrap
+def triton_concat_2D_dense_jagged(
+    jagged_max_seq_len: int,
+    jagged_offsets: torch.Tensor,
+    jagged_values: torch.Tensor,
+    dense_values: torch.Tensor,
+) -> torch.Tensor:
+    B, dense_size, D = dense_values.size()
+    max_seq_len = jagged_max_seq_len + dense_size
+    return triton_concat_2D_jagged(
+        max_seq_len=max_seq_len,
+        values_a=dense_values,
+        values_b=jagged_values,
+        offsets_a=None,
+        offsets_b=jagged_offsets,
+    )
+
+
+def triton_jagged_dense_bmm(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    jagged: torch.Tensor,
+    dense: torch.Tensor,
+) -> torch.Tensor:
+    return _JaggedDenseBmmFunction.apply(max_seq_len, seq_offsets, jagged, dense)
+
+
+@torch.jit.unused
+def triton_split_2D_jagged(
+    values: torch.Tensor,
+    max_seq_len: int,
+    offsets_a: Optional[torch.Tensor] = None,
+    offsets_b: Optional[torch.Tensor] = None,
+    dense_size: int = 0,
+    n_prefix_to_right: int = 0,
+    seq_len_a: Optional[int] = None,
+    seq_len_b: Optional[int] = None,
+    total_seq_len: Optional[int] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    return _Split2DJaggedFunction.apply(
+        values,
+        max_seq_len,
+        offsets_a,
+        offsets_b,
+        dense_size,
+        n_prefix_to_right,
+        seq_len_a,
+        seq_len_b,
+        total_seq_len,
+    )
+
+
+@torch.jit.unused
+def helion_split_2D_jagged(
+    values: torch.Tensor,
+    max_seq_len: int,
+    offsets_a: Optional[torch.Tensor] = None,
+    offsets_b: Optional[torch.Tensor] = None,
+    dense_size: int = 0,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    return _HelionSplit2DJaggedFunction.apply(
+        values,
+        max_seq_len,
+        offsets_a,
+        offsets_b,
+        dense_size,
+    )
+
+
+@triton.jit
+def concat_2D_jagged_w_prefix_multirow(
+    OffsetsA,
+    ValuesA,
+    OffsetsB,
+    ValuesB,
+    DenseSize,
+    Out,
+    D,
+    stride_ad,
+    stride_bd,
+    stride_dense_batch,
+    stride_od,
+    n_prefix_from_B,
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    IS_REPLACE: tl.constexpr,
+):
+    off_z = tl.program_id(1)
+    off_block_n = tl.program_id(0)
+
+    if IS_DENSE_A:
+        seq_start_a = off_z * DenseSize
+        seq_len_a = DenseSize
+        seq_start_b = tl.load(OffsetsB + off_z)
+        seq_end_b = tl.load(OffsetsB + off_z + 1)
+        seq_len_b = seq_end_b - seq_start_b
+    elif IS_DENSE_B:
+        seq_start_a = tl.load(OffsetsA + off_z)
+        seq_end_a = tl.load(OffsetsA + off_z + 1)
+        seq_len_a = seq_end_a - seq_start_a
+        seq_start_b = off_z * DenseSize
+        seq_len_b = DenseSize
+    else:
+        seq_start_a = tl.load(OffsetsA + off_z)
+        seq_end_a = tl.load(OffsetsA + off_z + 1)
+        seq_len_a = seq_end_a - seq_start_a
+        seq_start_b = tl.load(OffsetsB + off_z)
+        seq_end_b = tl.load(OffsetsB + off_z + 1)
+        seq_len_b = seq_end_b - seq_start_b
+
+    if IS_REPLACE:
+        seq_len = seq_len_a
+        out_seq_start = seq_start_a
+        out_seq_b_start = seq_len_a - seq_len_b
+    else:
+        seq_len = seq_len_a + seq_len_b
+        out_seq_start = seq_start_a + seq_start_b
+        out_seq_b_start = seq_len_a + n_prefix_from_B
+
+    start_n = off_block_n * BLOCK_N
+    offs_n = start_n + tl.arange(0, BLOCK_N)
+    offs_d = tl.arange(0, BLOCK_D)
+    if start_n >= seq_len:
+        return
+    valid_mask = offs_n < seq_len
+
+    out_ptrs = (
+        Out
+        + (out_seq_start + offs_n[:, None]).to(tl.int64) * stride_od
+        + offs_d[None, :]
+    )
+
+    to_a_mask = (offs_n < out_seq_b_start) & (offs_n >= n_prefix_from_B) & valid_mask
+    to_b_mask = ~to_a_mask & valid_mask
+
+    off_a = offs_n - n_prefix_from_B
+    if IS_DENSE_A:
+        in_a_ptrs = (
+            ValuesA
+            + off_a[:, None].to(tl.int64) * stride_ad
+            + off_z.to(tl.int64) * stride_dense_batch
+            + offs_d[None, :]
+        )
+    else:
+        in_a_ptrs = (
+            ValuesA
+            + (off_a[:, None] + seq_start_a).to(tl.int64) * stride_ad
+            + offs_d[None, :]
+        )
+
+    v_a = tl.load(in_a_ptrs, mask=to_a_mask[:, None] & (offs_d[None, :] < D), other=0.0)
+    tl.store(out_ptrs, v_a, mask=to_a_mask[:, None] & (offs_d[None, :] < D))
+
+    prefix_mask = offs_n < n_prefix_from_B
+
+    off_b = tl.where(prefix_mask, offs_n, offs_n - out_seq_b_start + n_prefix_from_B)
+    if IS_DENSE_B:
+        in_b_ptrs = (
+            ValuesB
+            + off_b[:, None].to(tl.int64) * stride_bd
+            + off_z.to(tl.int64) * stride_dense_batch
+            + offs_d[None, :]
+        )
+    else:
+        in_b_ptrs = (
+            ValuesB
+            + (off_b[:, None] + seq_start_b).to(tl.int64) * stride_bd
+            + offs_d[None, :]
+        )
+
+    v_b = tl.load(in_b_ptrs, mask=to_b_mask[:, None] & (offs_d[None, :] < D), other=0.0)
+    tl.store(out_ptrs, v_b, mask=to_b_mask[:, None] & (offs_d[None, :] < D))
+
+
+@triton_autotune(
+    configs=_get_split_concat_2d_jagged_multirow_configs_wrapper(),
+    key=["BLOCK_D"],
+)
+@triton.jit
+def concat_2D_jagged_multirow(
+    OffsetsA,
+    ValuesA,
+    OffsetsB,
+    ValuesB,
+    DenseSize,
+    Out,
+    D,
+    stride_ad,
+    stride_bd,
+    stride_dense_batch,
+    stride_od,
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    IS_REPLACE: tl.constexpr,
+):
+    concat_2D_jagged_w_prefix_multirow(
+        OffsetsA,
+        ValuesA,
+        OffsetsB,
+        ValuesB,
+        DenseSize,
+        Out,
+        D,
+        stride_ad,
+        stride_bd,
+        stride_dense_batch,
+        stride_od,
+        0,
+        IS_DENSE_A,
+        IS_DENSE_B,
+        BLOCK_D,
+        BLOCK_N,
+        IS_REPLACE,
+    )
+
+
+@triton_autotune(
+    configs=_get_split_concat_2d_jagged_multirow_configs(),
+    key=["BLOCK_D"],
+)
+@triton.jit
+def concat_2D_jagged_jagged_w_prefix_multirow(
+    OffsetsA,
+    ValuesA,
+    OffsetsB,
+    ValuesB,
+    Out,
+    D,
+    stride_ad,
+    stride_bd,
+    stride_od,
+    n_prefix_from_B,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    concat_2D_jagged_w_prefix_multirow(
+        OffsetsA,
+        ValuesA,
+        OffsetsB,
+        ValuesB,
+        0,
+        Out,
+        D,
+        stride_ad,
+        stride_bd,
+        0,
+        stride_od,
+        n_prefix_from_B,
+        IS_DENSE_A=False,
+        IS_DENSE_B=False,
+        BLOCK_D=BLOCK_D,
+        BLOCK_N=BLOCK_N,
+        IS_REPLACE=False,
+    )
+
+
+@triton.jit
+def split_2D_jagged_w_prefix_multirow(
+    JaggedIn,
+    DenseSize,
+    OffsetsA,
+    OffsetsB,
+    OutA,
+    OutB,
+    D,
+    stride_id,
+    stride_ad,
+    stride_bd,
+    n_prefix_to_B,
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    IS_REPLACE: tl.constexpr,
+):
+    off_z = tl.program_id(1)
+    off_block_n = tl.program_id(0)
+
+    if IS_DENSE_A:
+        seq_start_b = tl.load(OffsetsB + off_z)
+        seq_end_b = tl.load(OffsetsB + off_z + 1)
+        seq_start_a = off_z * DenseSize
+        seq_len_a = DenseSize
+        seq_len_b = seq_end_b - seq_start_b
+    elif IS_DENSE_B:
+        seq_start_a = tl.load(OffsetsA + off_z)
+        seq_end_a = tl.load(OffsetsA + off_z + 1)
+        seq_len_a = seq_end_a - seq_start_a
+        seq_start_b = off_z * DenseSize
+        seq_len_b = DenseSize
+    else:
+        seq_start_a = tl.load(OffsetsA + off_z)
+        seq_end_a = tl.load(OffsetsA + off_z + 1)
+        seq_len_a = seq_end_a - seq_start_a
+        seq_start_b = tl.load(OffsetsB + off_z)
+        seq_end_b = tl.load(OffsetsB + off_z + 1)
+        seq_len_b = seq_end_b - seq_start_b
+
+    if IS_REPLACE:
+        seq_len = seq_len_a
+    else:
+        seq_len = seq_len_a + seq_len_b
+
+    if IS_REPLACE:
+        seq_start = seq_start_a
+        out_seq_b_start = seq_len_a - seq_len_b
+    else:
+        seq_start = seq_start_a + seq_start_b
+        out_seq_b_start = seq_len_a + n_prefix_to_B
+
+    start_n = off_block_n * BLOCK_N
+    offs_n = start_n + tl.arange(0, BLOCK_N)
+    offs_d = tl.arange(0, BLOCK_D)
+    if start_n >= seq_len:
+        return
+    valid_mask = offs_n < seq_len
+
+    in_ptrs = (
+        JaggedIn
+        + (seq_start + offs_n[:, None]).to(tl.int64) * stride_id
+        + offs_d[None, :]
+    )
+
+    v = tl.load(in_ptrs, mask=valid_mask[:, None] & (offs_d[None, :] < D), other=0.0)
+
+    to_a_mask = (offs_n < out_seq_b_start) & (offs_n >= n_prefix_to_B) & valid_mask
+    to_b_mask = ~to_a_mask & valid_mask
+
+    off_a = offs_n - n_prefix_to_B
+    out_a_ptrs = (
+        OutA + (off_a[:, None] + seq_start_a).to(tl.int64) * stride_ad + offs_d[None, :]
+    )
+    tl.store(out_a_ptrs, v, mask=to_a_mask[:, None] & (offs_d[None, :] < D))
+
+    prefix_mask = offs_n < n_prefix_to_B
+
+    off_b = tl.where(prefix_mask, offs_n, offs_n - out_seq_b_start + n_prefix_to_B)
+    out_b_ptrs = (
+        OutB + (off_b[:, None] + seq_start_b).to(tl.int64) * stride_bd + offs_d[None, :]
+    )
+    tl.store(out_b_ptrs, v, mask=to_b_mask[:, None] & (offs_d[None, :] < D))
+
+
+@triton_autotune(
+    configs=pinned_or_full(
+        [triton.Config({"BLOCK_N": 8}, num_warps=1)],
+        _get_split_concat_2d_jagged_multirow_configs_wrapper,
+    ),
+    key=["BLOCK_D"],
+)
+@triton.jit
+def split_2D_jagged_multirow(
+    JaggedIn,
+    DenseSize,
+    OffsetsA,
+    OffsetsB,
+    OutA,
+    OutB,
+    D,
+    stride_id,
+    stride_ad,
+    stride_bd,
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    IS_REPLACE: tl.constexpr,
+):
+    split_2D_jagged_w_prefix_multirow(
+        JaggedIn,
+        DenseSize,
+        OffsetsA,
+        OffsetsB,
+        OutA,
+        OutB,
+        D,
+        stride_id,
+        stride_ad,
+        stride_bd,
+        0,
+        IS_DENSE_A,
+        IS_DENSE_B,
+        BLOCK_D,
+        BLOCK_N,
+        IS_REPLACE,
+    )
+
+
+@triton_autotune(
+    configs=_get_split_concat_2d_jagged_multirow_configs(),
+    key=["BLOCK_D"],
+)
+@triton.jit
+def split_2D_jagged_jagged_w_prefix_multirow(
+    JaggedIn,
+    OffsetsA,
+    OffsetsB,
+    OutA,
+    OutB,
+    D,
+    stride_id,
+    stride_ad,
+    stride_bd,
+    n_prefix_to_B,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    split_2D_jagged_w_prefix_multirow(
+        JaggedIn,
+        0,
+        OffsetsA,
+        OffsetsB,
+        OutA,
+        OutB,
+        D,
+        stride_id,
+        stride_ad,
+        stride_bd,
+        n_prefix_to_B,
+        IS_DENSE_A=False,
+        IS_DENSE_B=False,
+        BLOCK_D=BLOCK_D,
+        BLOCK_N=BLOCK_N,
+        IS_REPLACE=False,
+    )
+
+
+def triton_jagged_dense_broadcast_add(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    jagged: torch.Tensor,
+    dense: torch.Tensor,
+) -> torch.Tensor:
+    return _JaggedDenseBroadcastAddFunction.apply(
+        max_seq_len, seq_offsets, jagged, dense
+    )
+
+
+@triton.jit
+def _helion_split_2d_jagged_kernel(
+    offsets_a,
+    offsets_b,
+    values_flat,
+    out_a_flat,
+    out_b_flat,
+    max_seq_len,
+    D: tl.constexpr,
+    _BLOCK_SIZE_0: tl.constexpr,
+    _BLOCK_SIZE_1: tl.constexpr,
+) -> None:
+    # Get program ID and decompose to batch and sequence block coordinates
+    program_id = tl.program_id(0)
+    flat_program_id = program_id
+    batch_id = triton_helpers.div_floor_integer(
+        flat_program_id,
+        triton_helpers.div_floor_integer(
+            -1 + _BLOCK_SIZE_0 + max_seq_len, _BLOCK_SIZE_0
+        ),
+    )
+    seq_block_id = triton_helpers.remainder_integer(  # noqa: F841
+        flat_program_id,
+        triton_helpers.div_floor_integer(
+            -1 + _BLOCK_SIZE_0 + max_seq_len, _BLOCK_SIZE_0
+        ),
+    )
+    # Load output boundaries for part A
+    out_a_start = tl.load(offsets_a + batch_id * 1, None, eviction_policy="evict_last")
+    batch_id_plus_1 = 1 + triton_helpers.div_floor_integer(
+        flat_program_id,
+        triton_helpers.div_floor_integer(
+            -1 + _BLOCK_SIZE_0 + max_seq_len, _BLOCK_SIZE_0
+        ),
+    )
+    out_a_end = tl.load(
+        offsets_a + batch_id_plus_1 * 1, None, eviction_policy="evict_last"
+    )
+    len_a = out_a_end - out_a_start
+    # Load output boundaries for part B
+    out_b_start = tl.load(offsets_b + batch_id * 1, None)
+    out_b_end = tl.load(
+        offsets_b + batch_id_plus_1 * 1, None, eviction_policy="evict_last"
+    )
+    len_b = out_b_end - out_b_start
+    # Compute input start and total length for this batch
+    input_start = out_a_start + out_b_start
+    total_len = len_a + len_b
+    # Calculate sequence offset for this block
+    seq_offset = _BLOCK_SIZE_0 * triton_helpers.remainder_integer(
+        flat_program_id,
+        triton_helpers.div_floor_integer(
+            -1 + _BLOCK_SIZE_0 + max_seq_len, _BLOCK_SIZE_0
+        ),
+    )
+    has_work = total_len > seq_offset
+    if has_work:
+        # Generate row indices for this sequence block
+        seq_range = tl.arange(0, _BLOCK_SIZE_0)
+        seq_offset_i32 = tl.cast(seq_offset, tl.int32)
+        row_indices = seq_range + seq_offset_i32
+
+        # Create masks for valid rows and parts A/B
+        total_len_i32 = tl.cast(total_len[None], tl.int32)
+        len_a_i32 = tl.cast(len_a[None], tl.int32)
+        valid_mask = row_indices < total_len_i32
+        is_part_a = row_indices < len_a_i32
+        is_part_b = (row_indices >= len_a_i32) & valid_mask
+
+        # Extract scalar values once
+        input_start_i32 = tl.cast(input_start[None, None], tl.int32)
+        out_a_start_i32 = tl.cast(out_a_start[None, None], tl.int32)
+        out_b_start_i32 = tl.cast(out_b_start[None, None], tl.int32)
+
+        # Process features in smaller tiles
+        for feature_offset in tl.range(
+            0,
+            D,
+            _BLOCK_SIZE_1,
+            loop_unroll_factor=1,
+            num_stages=4,
+            disallow_acc_multi_buffer=True,
+            flatten=True,
+        ):
+            feature_indices = feature_offset + tl.arange(0, _BLOCK_SIZE_1).to(tl.int32)
+
+            # Compute D constant and feature mask once per feature iteration
+            D_const = tl.full([], tl.cast(D, tl.int32), tl.int32)
+            D_i32 = tl.cast(D, tl.int32)
+            feature_mask = feature_indices < D_i32
+
+            # Compute indices for part A
+            row_subscript = row_indices[:, None]
+            input_row_a = input_start_i32 + row_subscript
+            input_idx_a = (
+                tl.cast(input_row_a * D_const, tl.int32) + feature_indices[None, :]
+            )
+
+            out_a_row = out_a_start_i32 + row_subscript
+            out_a_idx = (
+                tl.cast(out_a_row * D_const, tl.int32) + feature_indices[None, :]
+            )
+
+            mask_a = is_part_a[:, None] & valid_mask[:, None] & feature_mask[None, :]
+
+            # Load and store part A data
+            slice_a = tl.load(
+                values_flat + input_idx_a * 1,
+                mask_a,
+                other=0,
+                eviction_policy="evict_first",
+            )
+            tl.store(out_a_flat + out_a_idx * 1, slice_a, mask_a)
+
+            # Compute indices for part B
+            input_idx_b = (
+                tl.cast((input_start_i32 + row_subscript) * D_const, tl.int32)
+                + feature_indices[None, :]
+            )
+
+            row_minus_len_a = row_subscript - len_a_i32
+            out_b_row = out_b_start_i32 + row_minus_len_a
+            out_b_idx = (
+                tl.cast(out_b_row * D_const, tl.int32) + feature_indices[None, :]
+            )
+
+            mask_b = is_part_b[:, None] & feature_mask[None, :]
+
+            # Load and store part B data
+            slice_b = tl.load(
+                values_flat + input_idx_b * 1,
+                mask_b,
+                other=0,
+                eviction_policy="evict_first",
+            )
+            tl.store(out_b_flat + out_b_idx * 1, slice_b, mask_b)
+
+
+class _HelionSplit2DJaggedFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        values: torch.Tensor,
+        max_seq_len: int,
+        offsets_a: torch.Tensor,
+        offsets_b: torch.Tensor,
+        dense_size: int = 0,  # noqa: F841
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        values = switch_to_contiguous_if_needed(values)
+        B = offsets_a.shape[0] - 1
+        D = values.size(1)
+
+        # TODO: maybe check if torch.compiler.is_compiling() and use index_select instead
+        seq_len_a = int(offsets_a[-1].item())
+        seq_len_b = int(offsets_b[-1].item())
+
+        values_a, values_b = _helion_split_2D_jagged_impl(
+            values=values,
+            max_seq_len=max_seq_len,
+            offsets_a=offsets_a,
+            offsets_b=offsets_b,
+            dense_size=dense_size,
+            total_len_a=seq_len_a,
+            total_len_b=seq_len_b,
+        )
+
+        ctx.save_for_backward(offsets_a, offsets_b)
+        ctx.max_seq_len = max_seq_len
+        ctx.seq_len_a = seq_len_a
+        ctx.seq_len_b = seq_len_b
+        ctx.dense_size = dense_size
+        ctx.B = B
+        ctx.D = D
+        return values_a, values_b
+
+    @staticmethod
+    def backward(ctx, *d_values) -> Tuple[torch.Tensor, None, None, None, None]:
+        offsets_a, offsets_b = ctx.saved_tensors
+        values_a, values_b = d_values
+        BLOCK_D = triton.next_power_of_2(ctx.D)
+
+        dvalues = torch.empty(
+            (ctx.seq_len_a + ctx.seq_len_b, ctx.D),
+            device=values_a.device,
+            dtype=values_a.dtype,
+        )
+        _triton_concat_2D_jagged_internal(
+            values_a=values_a,
+            values_b=values_b,
+            values_out=dvalues,
+            max_seq_len=ctx.max_seq_len,
+            B=ctx.B,
+            offsets_a=offsets_a,
+            offsets_b=offsets_b,
+            D=ctx.D,
+            dense_size=0,
+            stride_dense_batch=0,
+            n_prefix=0,
+            is_dense_a=False,
+            is_dense_b=False,
+            is_replace=False,
+            BLOCK_D=BLOCK_D,
+        )
+        return dvalues, None, None, None, None
+
+
+def _helion_split_2D_jagged_impl(
+    values: torch.Tensor,
+    max_seq_len: int,
+    offsets_a: torch.Tensor,
+    offsets_b: torch.Tensor,
+    dense_size: int = 0,  # noqa: F841
+    total_len_a: Optional[int] = None,
+    total_len_b: Optional[int] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    D = values.size(1)
+
+    # Select dtype-specific optimal parameters
+    if values.dtype == torch.float32:
+        # FP32-optimized parameters
+        block_size_0 = 64
+        block_size_1 = 64
+        num_warps = 4
+        num_stages = 4
+    else:
+        # BF16/FP16-optimized parameters
+        block_size_0 = 128
+        block_size_1 = triton.next_power_of_2(D)
+        num_warps = 32
+        num_stages = 7
+
+    return _helion_split_2d_jagged(
+        values,
+        max_seq_len,
+        offsets_a,
+        offsets_b,
+        dense_size,
+        block_size_0=block_size_0,
+        block_size_1=block_size_1,
+        num_warps=num_warps,
+        num_stages=num_stages,
+        total_len_a=total_len_a,
+        total_len_b=total_len_b,
+    )
+
+
+def _helion_split_2d_jagged(
+    values: torch.Tensor,
+    max_seq_len: int,
+    offsets_a: torch.Tensor,
+    offsets_b: torch.Tensor,
+    dense_size: int,  # noqa: F841
+    block_size_0: int = 64,
+    block_size_1: int = 64,
+    num_warps: int = 4,
+    num_stages: int = 4,
+    total_len_a: Optional[int] = None,
+    total_len_b: Optional[int] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    values = values.contiguous()
+    num_batches = offsets_a.size(0) - 1
+    D = values.size(1)
+    num_seq_blocks = (max_seq_len + block_size_0 - 1) // block_size_0
+    if total_len_a is None:
+        total_len_a = int(offsets_a[-1].item())
+    if total_len_b is None:
+        total_len_b = int(offsets_b[-1].item())
+    out_a = torch.empty([total_len_a, D], dtype=values.dtype, device=values.device)
+    out_b = torch.empty([total_len_b, D], dtype=values.dtype, device=values.device)
+    values_flat = values.view(-1)
+    out_a_flat = out_a.view(-1)
+    out_b_flat = out_b.view(-1)
+    total_programs = num_batches * num_seq_blocks
+
+    # pyre-ignore[28]
+    _helion_split_2d_jagged_kernel[(total_programs,)](
+        offsets_a,
+        offsets_b,
+        values_flat,
+        out_a_flat,
+        out_b_flat,
+        max_seq_len,
+        D,
+        block_size_0,
+        block_size_1,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+    return (out_a, out_b)
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_jagged_tensors.py b/recommendation_v4/generative_recommenders/ops/triton/triton_jagged_tensors.py
new file mode 100644
index 000000000..2c3728c0a
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_jagged_tensors.py
@@ -0,0 +1,1088 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+#!/usr/bin/env python3
+
+
+from typing import Optional, Tuple
+
+import torch
+
+# @manual=//triton:triton
+import triton
+
+# @manual=//triton:triton
+import triton.language as tl
+from generative_recommenders.common import (
+    switch_to_contiguous_if_needed,
+    triton_autotune,
+)
+from generative_recommenders.ops.utils import is_sm100_plus
+
+
+def _triton_concat_2D_jagged_internal(
+    values_a: torch.Tensor,
+    values_b: torch.Tensor,
+    values_out: torch.Tensor,
+    max_seq_len: int,
+    B: int,
+    offsets_a: Optional[torch.Tensor],
+    offsets_b: Optional[torch.Tensor],
+    max_len_a: Optional[int],
+    max_len_b: Optional[int],
+    D: int,
+    n_prefix_from_B: int,
+    is_dense_a: bool,
+    is_dense_b: bool,
+    BLOCK_D: int,
+) -> None:
+    if is_sm100_plus() or (torch.cuda.is_available() and torch.version.hip):
+        # Route AMD/ROCm through the multirow kernel.
+        #
+        # The basic `_concat_2D_jagged` kernel below issues one program per
+        # output row (grid = `(max_seq_len, B)`). On ROCm Triton this fails to
+        # lower in the `TritonAMDGPUCanonicalizePointers` pass with
+        # `RuntimeError: PassManager::run failed` at `make_ttgir`. The
+        # multirow variant tiles rows with a tunable `BLOCK_N` (grid =
+        # `(cdiv(max_seq_len, BLOCK_N), B)`) and compiles cleanly on ROCm.
+        # The original `is_sm100_plus()` gate was conservative — only Blackwell
+        # was opted in. Adding HIP keeps NVIDIA H100/A100 on the basic kernel
+        # they were validated against and unblocks AMD without behavior change
+        # on existing NVIDIA paths.
+
+        def grid(meta):
+            return (triton.cdiv(max_seq_len, meta["BLOCK_N"]), B)
+
+        concat_2D_jagged_multirow[grid](
+            ValuesA=values_a,
+            ValuesB=values_b,
+            OffsetsA=offsets_a,
+            OffsetsB=offsets_b,
+            MaxLenA=max_len_a,
+            MaxLenB=max_len_b,
+            Out=values_out,
+            D=D,
+            stride_ad=values_a.stride(-2),
+            stride_bd=values_b.stride(-2),
+            stride_od=values_out.stride(-2),
+            n_prefix_from_B=n_prefix_from_B,
+            IS_DENSE_A=is_dense_a,  # pyre-ignore[6]
+            IS_DENSE_B=is_dense_b,  # pyre-ignore[6]
+            BLOCK_D=BLOCK_D,  # pyre-ignore[6]
+        )
+    else:
+        _concat_2D_jagged[(max_seq_len, B)](
+            ValuesA=values_a,
+            ValuesB=values_b,
+            OffsetsA=offsets_a,
+            OffsetsB=offsets_b,
+            MaxLenA=max_len_a,
+            MaxLenB=max_len_b,
+            Out=values_out,
+            D=D,
+            stride_ad=values_a.stride(-2),
+            stride_bd=values_b.stride(-2),
+            stride_od=values_out.stride(-2),
+            n_prefix_from_B=n_prefix_from_B,
+            IS_DENSE_A=is_dense_a,  # pyre-ignore[6]
+            IS_DENSE_B=is_dense_b,  # pyre-ignore[6]
+            BLOCK_D=BLOCK_D,  # pyre-ignore[6]
+        )
+
+
+def _triton_split_2D_jagged_internal(
+    jagged_in: torch.Tensor,
+    max_seq_len: int,
+    B: int,
+    offsets_a: Optional[torch.Tensor],
+    offsets_b: Optional[torch.Tensor],
+    max_len_a: Optional[int],
+    max_len_b: Optional[int],
+    out_a: torch.Tensor,
+    out_b: torch.Tensor,
+    D: int,
+    n_prefix_to_B: int,
+    is_dense_a: bool,
+    is_dense_b: bool,
+    BLOCK_D: int,
+) -> None:
+    if is_sm100_plus() or (torch.cuda.is_available() and torch.version.hip):
+        # Route AMD/ROCm through the multirow kernel for the same reason as
+        # `_triton_concat_2D_jagged_internal` above: basic `_split_2D_jagged`
+        # hits `PassManager::run failed` in `TritonAMDGPUCanonicalizePointers`
+        # on ROCm; multirow lowers cleanly.
+
+        def grid(meta):
+            return (triton.cdiv(max_seq_len, meta["BLOCK_N"]), B)
+
+        split_2D_jagged_multirow[grid](
+            JaggedIn=jagged_in,
+            OffsetsA=offsets_a,
+            OffsetsB=offsets_b,
+            MaxLenA=max_len_a,
+            MaxLenB=max_len_b,
+            OutA=out_a,
+            OutB=out_b,
+            D=D,
+            stride_id=jagged_in.stride(0),
+            stride_ad=out_a.stride(0),
+            stride_bd=out_b.stride(0),
+            n_prefix_to_B=n_prefix_to_B,
+            IS_DENSE_A=is_dense_a,  # pyre-ignore[6]
+            IS_DENSE_B=is_dense_b,  # pyre-ignore[6]
+            BLOCK_D=BLOCK_D,  # pyre-ignore[6]
+        )
+    else:
+        _split_2D_jagged[(max_seq_len, B)](
+            JaggedIn=jagged_in,
+            OffsetsA=offsets_a,
+            OffsetsB=offsets_b,
+            MaxLenA=max_len_a,
+            MaxLenB=max_len_b,
+            OutA=out_a,
+            OutB=out_b,
+            D=D,
+            stride_id=jagged_in.stride(0),
+            stride_ad=out_a.stride(0),
+            stride_bd=out_b.stride(0),
+            n_prefix_to_B=n_prefix_to_B,
+            IS_DENSE_A=is_dense_a,  # pyre-ignore[6]
+            IS_DENSE_B=is_dense_b,  # pyre-ignore[6]
+            BLOCK_D=BLOCK_D,  # pyre-ignore[6]
+        )
+
+
+def _get_concat_split_2d_jagged_multirow_configs():
+    configs = []
+    for BLOCK_N in [1, 2, 4, 8]:
+        for num_warps in [1, 2, 4]:
+            configs.append(
+                triton.Config(
+                    {"BLOCK_N": BLOCK_N},
+                    num_warps=num_warps,
+                )
+            )
+    return configs
+
+
+@triton.jit
+def _concat_2D_jagged_multirow(
+    ValuesA,
+    ValuesB,
+    OffsetsA,
+    OffsetsB,
+    MaxLenA,
+    MaxLenB,
+    Out,
+    D,
+    stride_ad,
+    stride_bd,
+    stride_od,
+    n_prefix_from_B,
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    off_z = tl.program_id(1)
+    block_n = tl.program_id(0)
+
+    if IS_DENSE_A:
+        seq_start_a = off_z * MaxLenA
+        seq_len_a = MaxLenA
+    else:
+        seq_start_a = tl.load(OffsetsA + off_z)
+        seq_end_a = tl.load(OffsetsA + off_z + 1)
+        seq_len_a = seq_end_a - seq_start_a
+    if IS_DENSE_B:
+        seq_start_b = off_z * MaxLenB
+        seq_len_b = MaxLenB
+    else:
+        seq_start_b = tl.load(OffsetsB + off_z)
+        seq_end_b = tl.load(OffsetsB + off_z + 1)
+        seq_len_b = seq_end_b - seq_start_b
+    seq_len = seq_len_a + seq_len_b
+
+    start_n = block_n * BLOCK_N
+    offs_n = start_n + tl.arange(0, BLOCK_N)
+    offs_d = tl.arange(0, BLOCK_D)
+
+    valid_mask = offs_n < seq_len
+
+    out_seq_start = seq_start_a + seq_start_b + offs_n
+    out_ptrs = Out + out_seq_start[:, None].to(tl.int64) * stride_od + offs_d[None, :]
+
+    from_prefix_b_mask = (offs_n < n_prefix_from_B) & valid_mask
+    from_a_mask = (
+        (offs_n >= n_prefix_from_B)
+        & (offs_n < seq_len_a + n_prefix_from_B)
+        & valid_mask
+    )
+    from_suffix_b_mask = (offs_n >= seq_len_a + n_prefix_from_B) & valid_mask
+
+    in_b1_ptrs = (
+        ValuesB
+        + (offs_n[:, None] + seq_start_b).to(tl.int64) * stride_bd
+        + offs_d[None, :]
+    )
+    v_b1 = tl.load(
+        in_b1_ptrs, mask=from_prefix_b_mask[:, None] & (offs_d[None, :] < D), other=0.0
+    )
+    tl.store(out_ptrs, v_b1, mask=from_prefix_b_mask[:, None] & (offs_d[None, :] < D))
+
+    off_a = offs_n - n_prefix_from_B
+    in_a_ptrs = (
+        ValuesA
+        + (off_a[:, None] + seq_start_a).to(tl.int64) * stride_ad
+        + offs_d[None, :]
+    )
+    v_a = tl.load(
+        in_a_ptrs, mask=from_a_mask[:, None] & (offs_d[None, :] < D), other=0.0
+    )
+    tl.store(out_ptrs, v_a, mask=from_a_mask[:, None] & (offs_d[None, :] < D))
+
+    off_b = offs_n - seq_len_a
+    in_b2_ptrs = (
+        ValuesB
+        + (off_b[:, None] + seq_start_b).to(tl.int64) * stride_bd
+        + offs_d[None, :]
+    )
+    v_b2 = tl.load(
+        in_b2_ptrs, mask=from_suffix_b_mask[:, None] & (offs_d[None, :] < D), other=0.0
+    )
+    tl.store(out_ptrs, v_b2, mask=from_suffix_b_mask[:, None] & (offs_d[None, :] < D))
+
+
+@triton_autotune(
+    configs=_get_concat_split_2d_jagged_multirow_configs(),
+    key=["BLOCK_D"],
+)
+@triton.jit
+def concat_2D_jagged_multirow(
+    ValuesA,
+    ValuesB,
+    OffsetsA,
+    OffsetsB,
+    MaxLenA,
+    MaxLenB,
+    Out,
+    D,
+    stride_ad,
+    stride_bd,
+    stride_od,
+    n_prefix_from_B,
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    _concat_2D_jagged_multirow(
+        ValuesA,
+        ValuesB,
+        OffsetsA,
+        OffsetsB,
+        MaxLenA,
+        MaxLenB,
+        Out,
+        D,
+        stride_ad,
+        stride_bd,
+        stride_od,
+        n_prefix_from_B,
+        IS_DENSE_A,
+        IS_DENSE_B,
+        BLOCK_D,
+        BLOCK_N,
+    )
+
+
+@triton.jit
+def _split_2D_jagged_multirow(
+    JaggedIn,
+    OffsetsA,
+    OffsetsB,
+    MaxLenA,
+    MaxLenB,
+    OutA,
+    OutB,
+    D,
+    stride_id,
+    stride_ad,
+    stride_bd,
+    n_prefix_to_B,
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    off_z = tl.program_id(1)
+    block_n = tl.program_id(0)
+
+    if IS_DENSE_A:
+        seq_start_a = off_z * MaxLenA
+        seq_len_a = MaxLenA
+    else:
+        seq_start_a = tl.load(OffsetsA + off_z)
+        seq_end_a = tl.load(OffsetsA + off_z + 1)
+        seq_len_a = seq_end_a - seq_start_a
+    if IS_DENSE_B:
+        seq_start_b = off_z * MaxLenB
+        seq_len_b = MaxLenB
+    else:
+        seq_start_b = tl.load(OffsetsB + off_z)
+        seq_end_b = tl.load(OffsetsB + off_z + 1)
+        seq_len_b = seq_end_b - seq_start_b
+    seq_len = seq_len_a + seq_len_b
+    seq_start = seq_start_a + seq_start_b
+
+    start_n = block_n * BLOCK_N
+    offs_n = start_n + tl.arange(0, BLOCK_N)
+    offs_d = tl.arange(0, BLOCK_D)
+
+    valid_mask = offs_n < seq_len
+
+    in_ptrs = (
+        JaggedIn
+        + (seq_start + offs_n[:, None]).to(tl.int64) * stride_id
+        + offs_d[None, :]
+    )
+
+    v = tl.load(in_ptrs, mask=valid_mask[:, None] & (offs_d[None, :] < D), other=0.0)
+
+    to_prefix_b_mask = (offs_n < n_prefix_to_B) & valid_mask
+    to_a_mask = (
+        (offs_n >= n_prefix_to_B) & (offs_n < seq_len_a + n_prefix_to_B) & valid_mask
+    )
+    to_suffix_b_mask = (offs_n >= seq_len_a + n_prefix_to_B) & valid_mask
+
+    out_b1_ptrs = (
+        OutB
+        + (offs_n[:, None] + seq_start_b).to(tl.int64) * stride_bd
+        + offs_d[None, :]
+    )
+    tl.store(out_b1_ptrs, v, mask=to_prefix_b_mask[:, None] & (offs_d[None, :] < D))
+
+    off_a = offs_n - n_prefix_to_B
+    out_a_ptrs = (
+        OutA + (off_a[:, None] + seq_start_a).to(tl.int64) * stride_ad + offs_d[None, :]
+    )
+    tl.store(out_a_ptrs, v, mask=to_a_mask[:, None] & (offs_d[None, :] < D))
+
+    off_b = offs_n - seq_len_a
+    out_b2_ptrs = (
+        OutB + (off_b[:, None] + seq_start_b).to(tl.int64) * stride_bd + offs_d[None, :]
+    )
+    tl.store(out_b2_ptrs, v, mask=to_suffix_b_mask[:, None] & (offs_d[None, :] < D))
+
+
+@triton_autotune(
+    configs=_get_concat_split_2d_jagged_multirow_configs(),
+    key=["BLOCK_D"],
+)
+@triton.jit
+def split_2D_jagged_multirow(
+    JaggedIn,
+    OffsetsA,
+    OffsetsB,
+    MaxLenA,
+    MaxLenB,
+    OutA,
+    OutB,
+    D,
+    stride_id,
+    stride_ad,
+    stride_bd,
+    n_prefix_to_B,
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    _split_2D_jagged_multirow(
+        JaggedIn,
+        OffsetsA,
+        OffsetsB,
+        MaxLenA,
+        MaxLenB,
+        OutA,
+        OutB,
+        D,
+        stride_id,
+        stride_ad,
+        stride_bd,
+        n_prefix_to_B,
+        IS_DENSE_A,
+        IS_DENSE_B,
+        BLOCK_D,
+        BLOCK_N,
+    )
+
+
+@triton.jit
+def _concat_2D_jagged(
+    ValuesA,
+    ValuesB,
+    OffsetsA,
+    OffsetsB,
+    MaxLenA,
+    MaxLenB,
+    Out,
+    D,
+    stride_ad,
+    stride_bd,
+    stride_od,
+    n_prefix_from_B,
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+):
+    off_z = tl.program_id(1)
+    off_n = tl.program_id(0)
+    if IS_DENSE_A:
+        seq_start_a = off_z * MaxLenA
+        seq_len_a = MaxLenA
+    else:
+        seq_start_a = tl.load(OffsetsA + off_z)
+        seq_end_a = tl.load(OffsetsA + off_z + 1)
+        seq_len_a = seq_end_a - seq_start_a
+    if IS_DENSE_B:
+        seq_start_b = off_z * MaxLenB
+        seq_len_b = MaxLenB
+    else:
+        seq_start_b = tl.load(OffsetsB + off_z)
+        seq_end_b = tl.load(OffsetsB + off_z + 1)
+        seq_len_b = seq_end_b - seq_start_b
+    seq_len = seq_len_a + seq_len_b
+    if off_n >= seq_len:
+        return
+    offs_d = tl.arange(0, BLOCK_D)
+    out_seq_start = seq_start_a + seq_start_b + off_n
+    out_ptrs = Out + out_seq_start.to(tl.int64) * stride_od + offs_d
+    if off_n < n_prefix_from_B:
+        in_ptrs = ValuesB + (off_n + seq_start_b).to(tl.int64) * stride_bd + offs_d
+    elif off_n < seq_len_a + n_prefix_from_B:
+        in_ptrs = (
+            ValuesA
+            + (off_n - n_prefix_from_B + seq_start_a).to(tl.int64) * stride_ad
+            + offs_d
+        )
+    else:
+        in_ptrs = (
+            ValuesB
+            + (off_n - seq_len_a + seq_start_b).to(tl.int64) * stride_bd
+            + offs_d
+        )
+    v = tl.load(in_ptrs, mask=offs_d < D)
+    tl.store(out_ptrs, v, mask=offs_d < D)
+
+
+@triton.jit
+def _split_2D_jagged(
+    JaggedIn,
+    OffsetsA,
+    OffsetsB,
+    MaxLenA,
+    MaxLenB,
+    OutA,
+    OutB,
+    D,
+    stride_id,
+    stride_ad,
+    stride_bd,
+    n_prefix_to_B,
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+):
+    off_z = tl.program_id(1)
+    off_n = tl.program_id(0)
+    if IS_DENSE_A:
+        seq_start_a = off_z * MaxLenA
+        seq_len_a = MaxLenA
+    else:
+        seq_start_a = tl.load(OffsetsA + off_z)
+        seq_end_a = tl.load(OffsetsA + off_z + 1)
+        seq_len_a = seq_end_a - seq_start_a
+    if IS_DENSE_B:
+        seq_start_b = off_z * MaxLenB
+        seq_len_b = MaxLenB
+    else:
+        seq_start_b = tl.load(OffsetsB + off_z)
+        seq_end_b = tl.load(OffsetsB + off_z + 1)
+        seq_len_b = seq_end_b - seq_start_b
+    seq_len = seq_len_a + seq_len_b
+    if off_n >= seq_len:
+        return
+    seq_start = seq_start_a + seq_start_b
+    offs_d = tl.arange(0, BLOCK_D)
+    in_ptrs = JaggedIn + (seq_start + off_n).to(tl.int64) * stride_id + offs_d
+    if off_n < n_prefix_to_B:
+        out_ptrs = OutB + (off_n + seq_start_b).to(tl.int64) * stride_bd + offs_d
+    elif off_n < seq_len_a + n_prefix_to_B:
+        out_ptrs = (
+            OutA
+            + (off_n - n_prefix_to_B + seq_start_a).to(tl.int64) * stride_ad
+            + offs_d
+        )
+    else:
+        out_ptrs = (
+            OutB + (off_n - seq_len_a + seq_start_b).to(tl.int64) * stride_bd + offs_d
+        )
+    v = tl.load(in_ptrs, mask=offs_d < D)
+    tl.store(out_ptrs, v, mask=offs_d < D)
+
+
+class _Concat2DJaggedFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        max_seq_len: int,
+        values_a: torch.Tensor,
+        values_b: torch.Tensor,
+        max_len_a: Optional[int],
+        max_len_b: Optional[int],
+        offsets_a: Optional[torch.Tensor],
+        offsets_b: Optional[torch.Tensor],
+        n_prefix_from_B: int,
+    ):
+        values_a = switch_to_contiguous_if_needed(values_a)
+        values_b = switch_to_contiguous_if_needed(values_b)
+        is_dense_a = offsets_a is None
+        is_dense_b = offsets_b is None
+        total_len_a, D = values_a.shape
+        total_len_b, _ = values_b.shape
+        if is_dense_a:
+            assert max_len_a is not None
+            B = total_len_a // max_len_a
+        else:
+            assert offsets_a is not None
+            B = offsets_a.shape[0] - 1
+        if is_dense_b:
+            assert max_len_b is not None
+            B = total_len_b // max_len_b
+        else:
+            assert offsets_b is not None
+            B = offsets_b.shape[0] - 1
+        total_seq_len = total_len_a + total_len_b
+        BLOCK_D = triton.next_power_of_2(D)
+        values_out = torch.empty(
+            (total_seq_len, D), device=values_a.device, dtype=values_a.dtype
+        )
+        _triton_concat_2D_jagged_internal(
+            values_a=values_a,
+            values_b=values_b,
+            values_out=values_out,
+            max_seq_len=max_seq_len,
+            B=B,
+            offsets_a=offsets_a,
+            offsets_b=offsets_b,
+            max_len_a=max_len_a,
+            max_len_b=max_len_b,
+            D=D,
+            n_prefix_from_B=n_prefix_from_B,
+            is_dense_a=is_dense_a,
+            is_dense_b=is_dense_b,
+            BLOCK_D=BLOCK_D,
+        )
+        ctx.save_for_backward(offsets_a, offsets_b)
+        ctx.max_seq_len = max_seq_len
+        ctx.total_len_a = total_len_a
+        ctx.total_len_b = total_len_b
+        ctx.is_dense_a = is_dense_a
+        ctx.is_dense_b = is_dense_b
+        ctx.max_len_a = max_len_a
+        ctx.max_len_b = max_len_b
+        ctx.B = B
+        ctx.n_prefix_from_B = n_prefix_from_B
+        return values_out
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, d_out: torch.Tensor
+    ) -> Tuple[None, torch.Tensor, torch.Tensor, None, None, None, None, None]:
+        offsets_a, offsets_b = ctx.saved_tensors
+        _, D = d_out.shape
+        BLOCK_D = triton.next_power_of_2(D)
+        d_values_a = torch.zeros(
+            (ctx.total_len_a, D), device=d_out.device, dtype=d_out.dtype
+        )
+        d_values_b = torch.empty(
+            (ctx.total_len_b, D), device=d_out.device, dtype=d_out.dtype
+        )
+        # Go through `_triton_split_2D_jagged_internal` (not raw
+        # `_split_2D_jagged[grid]`) so this backward pass benefits from the same
+        # AMD-routing-through-multirow workaround as the forward. Calling the
+        # raw kernel directly would hit `PassManager::run failed` on ROCm at
+        # `TritonAMDGPUCanonicalizePointers`. If you refactor this, do not
+        # collapse it back to `_split_2D_jagged[(ctx.max_seq_len, ctx.B)](...)`.
+        _triton_split_2D_jagged_internal(
+            jagged_in=d_out,
+            max_seq_len=ctx.max_seq_len,
+            B=ctx.B,
+            offsets_a=offsets_a,
+            offsets_b=offsets_b,
+            max_len_a=ctx.max_len_a,
+            max_len_b=ctx.max_len_b,
+            out_a=d_values_a,
+            out_b=d_values_b,
+            D=D,
+            n_prefix_to_B=ctx.n_prefix_from_B,
+            is_dense_a=ctx.is_dense_a,
+            is_dense_b=ctx.is_dense_b,
+            BLOCK_D=BLOCK_D,
+        )
+        return None, d_values_a, d_values_b, None, None, None, None, None
+
+
+class _Split2DJaggedFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        max_seq_len: int,
+        values: torch.Tensor,
+        total_len_left: Optional[int],
+        total_len_right: Optional[int],
+        max_len_a: Optional[int],
+        max_len_b: Optional[int],
+        offsets_a: Optional[torch.Tensor],
+        offsets_b: Optional[torch.Tensor],
+        n_prefix_to_B: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        values = switch_to_contiguous_if_needed(values)
+        is_dense_a: bool = offsets_a is None
+        is_dense_b: bool = offsets_b is None
+        total_seq_len, D = values.shape
+        if is_dense_a:
+            assert is_dense_b is False
+            assert offsets_b is not None
+            assert max_len_a is not None
+            B = offsets_b.shape[0] - 1
+            total_len_a = max_len_a * B
+            total_len_b = total_seq_len - total_len_a
+        elif is_dense_b:
+            assert is_dense_a is False
+            assert offsets_a is not None
+            assert max_len_b is not None
+            B = offsets_a.shape[0] - 1
+            total_len_b = max_len_b * B
+            total_len_a = total_seq_len - total_len_b
+        else:
+            assert offsets_a is not None and offsets_b is not None
+            B = offsets_a.shape[0] - 1
+            if total_len_left is not None and total_len_right is not None:
+                assert total_len_left + total_len_right == total_seq_len
+                total_len_a = total_len_left
+                total_len_b = total_len_right
+            else:
+                total_len_a = int(offsets_a[-1].item())
+                total_len_b = values.size(0) - total_len_a
+        _, D = values.shape
+        BLOCK_D = triton.next_power_of_2(D)
+        values_a = torch.empty(
+            (total_len_a, D), device=values.device, dtype=values.dtype
+        )
+        values_b = torch.empty(
+            (total_len_b, D), device=values.device, dtype=values.dtype
+        )
+        _triton_split_2D_jagged_internal(
+            jagged_in=values,
+            max_seq_len=max_seq_len,
+            B=B,
+            offsets_a=offsets_a,
+            offsets_b=offsets_b,
+            max_len_a=max_len_a,
+            max_len_b=max_len_b,
+            out_a=values_a,
+            out_b=values_b,
+            D=D,
+            n_prefix_to_B=n_prefix_to_B,
+            is_dense_a=is_dense_a,
+            is_dense_b=is_dense_b,
+            BLOCK_D=BLOCK_D,
+        )
+        ctx.save_for_backward(offsets_a, offsets_b)
+        ctx.max_seq_len = max_seq_len
+        ctx.total_seq_len = total_seq_len
+        ctx.max_len_a = max_len_a
+        ctx.max_len_b = max_len_b
+        ctx.is_dense_a = is_dense_a
+        ctx.is_dense_b = is_dense_b
+        ctx.B = B
+        ctx.D = D
+        ctx.n_prefix_to_B = n_prefix_to_B
+        return values_a, values_b
+
+    @staticmethod
+    def backward(
+        ctx, *d_values
+    ) -> Tuple[None, torch.Tensor, None, None, None, None, None, None, None]:
+        offsets_a, offsets_b = ctx.saved_tensors
+        d_values_a, d_values_b = d_values
+        BLOCK_D = triton.next_power_of_2(ctx.D)
+        d_jagged_in = torch.empty(
+            (ctx.total_seq_len, ctx.D),
+            device=d_values_a.device,
+            dtype=d_values_a.dtype,
+        )
+        _triton_concat_2D_jagged_internal(
+            values_a=d_values_a,
+            values_b=d_values_b,
+            values_out=d_jagged_in,
+            max_seq_len=ctx.max_seq_len,
+            B=ctx.B,
+            offsets_a=offsets_a,
+            offsets_b=offsets_b,
+            max_len_a=ctx.max_len_a,
+            max_len_b=ctx.max_len_b,
+            D=ctx.D,
+            n_prefix_from_B=ctx.n_prefix_to_B,
+            is_dense_a=ctx.is_dense_a,
+            is_dense_b=ctx.is_dense_b,
+            BLOCK_D=BLOCK_D,
+        )
+
+        return None, d_jagged_in, None, None, None, None, None, None, None
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def triton_concat_2D_jagged(
+    max_seq_len: int,
+    values_left: torch.Tensor,
+    values_right: torch.Tensor,
+    max_len_left: Optional[int],
+    max_len_right: Optional[int],
+    offsets_left: Optional[torch.Tensor],
+    offsets_right: Optional[torch.Tensor],
+    n_prefix_from_right: int = 0,
+) -> torch.Tensor:
+    return _Concat2DJaggedFunction.apply(
+        max_seq_len,
+        values_left,
+        values_right,
+        max_len_left,
+        max_len_right,
+        offsets_left,
+        offsets_right,
+        n_prefix_from_right,
+    )
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def triton_split_2D_jagged(
+    max_seq_len: int,
+    values: torch.Tensor,
+    total_len_left: Optional[int],
+    total_len_right: Optional[int],
+    max_len_left: Optional[int],
+    max_len_right: Optional[int],
+    offsets_left: Optional[torch.Tensor],
+    offsets_right: Optional[torch.Tensor],
+    n_prefix_to_right: int = 0,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    return _Split2DJaggedFunction.apply(
+        max_seq_len,
+        values,
+        total_len_left,
+        total_len_right,
+        max_len_left,
+        max_len_right,
+        offsets_left,
+        offsets_right,
+        n_prefix_to_right,
+    )
+
+
+class _Concat2DJaggedMultirowFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        max_seq_len: int,
+        values_left: torch.Tensor,
+        values_right: torch.Tensor,
+        max_len_left: Optional[int],
+        max_len_right: Optional[int],
+        offsets_left: Optional[torch.Tensor],
+        offsets_right: Optional[torch.Tensor],
+        n_prefix_from_right: int,
+    ) -> torch.Tensor:
+        values_left = switch_to_contiguous_if_needed(values_left)
+        values_right = switch_to_contiguous_if_needed(values_right)
+        is_dense_left = offsets_left is None
+        is_dense_right = offsets_right is None
+        total_len_left, D = values_left.shape
+        total_len_right, _ = values_right.shape
+        if is_dense_left:
+            assert max_len_left is not None
+            B = total_len_left // max_len_left
+        else:
+            assert offsets_left is not None
+            B = offsets_left.shape[0] - 1
+        if is_dense_right:
+            assert max_len_right is not None
+            B = total_len_right // max_len_right
+        else:
+            assert offsets_right is not None
+            B = offsets_right.shape[0] - 1
+        total_seq_len = total_len_left + total_len_right
+        BLOCK_D = triton.next_power_of_2(D)
+        values_out = torch.empty(
+            (total_seq_len, D), device=values_left.device, dtype=values_left.dtype
+        )
+
+        def grid(meta):
+            return (triton.cdiv(max_seq_len, meta["BLOCK_N"]), B)
+
+        concat_2D_jagged_multirow[grid](
+            ValuesA=values_left,
+            ValuesB=values_right,
+            OffsetsA=offsets_left,
+            OffsetsB=offsets_right,
+            MaxLenA=max_len_left,
+            MaxLenB=max_len_right,
+            Out=values_out,
+            D=D,
+            stride_ad=values_left.stride(-2),
+            stride_bd=values_right.stride(-2),
+            stride_od=values_out.stride(-2),
+            n_prefix_from_B=n_prefix_from_right,
+            IS_DENSE_A=is_dense_left,
+            IS_DENSE_B=is_dense_right,
+            BLOCK_D=BLOCK_D,
+        )
+        ctx.save_for_backward(offsets_left, offsets_right)
+        ctx.max_seq_len = max_seq_len
+        ctx.total_len_left = total_len_left
+        ctx.total_len_right = total_len_right
+        ctx.is_dense_left = is_dense_left
+        ctx.is_dense_right = is_dense_right
+        ctx.max_len_left = max_len_left
+        ctx.max_len_right = max_len_right
+        ctx.B = B
+        ctx.n_prefix_from_right = n_prefix_from_right
+        return values_out
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, d_out: torch.Tensor
+    ) -> Tuple[None, torch.Tensor, torch.Tensor, None, None, None, None, None]:
+        offsets_left, offsets_right = ctx.saved_tensors
+        _, D = d_out.shape
+        BLOCK_D = triton.next_power_of_2(D)
+        d_values_left = torch.zeros(
+            (ctx.total_len_left, D), device=d_out.device, dtype=d_out.dtype
+        )
+        d_values_right = torch.empty(
+            (ctx.total_len_right, D), device=d_out.device, dtype=d_out.dtype
+        )
+
+        def grid(meta):
+            return (triton.cdiv(ctx.max_seq_len, meta["BLOCK_N"]), ctx.B)
+
+        split_2D_jagged_multirow[grid](
+            JaggedIn=d_out,
+            OffsetsA=offsets_left,
+            OffsetsB=offsets_right,
+            MaxLenA=ctx.max_len_left,
+            MaxLenB=ctx.max_len_right,
+            OutA=d_values_left,
+            OutB=d_values_right,
+            D=D,
+            stride_id=d_out.stride(-2),
+            stride_ad=d_values_left.stride(-2),
+            stride_bd=d_values_right.stride(-2),
+            n_prefix_to_B=ctx.n_prefix_from_right,
+            IS_DENSE_A=ctx.is_dense_left,
+            IS_DENSE_B=ctx.is_dense_right,
+            BLOCK_D=BLOCK_D,
+        )
+        return None, d_values_left, d_values_right, None, None, None, None, None
+
+
+class _Split2DJaggedMultirowFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        max_seq_len: int,
+        values: torch.Tensor,
+        total_len_left: Optional[int],
+        total_len_right: Optional[int],
+        max_len_left: Optional[int],
+        max_len_right: Optional[int],
+        offsets_left: Optional[torch.Tensor],
+        offsets_right: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        values = switch_to_contiguous_if_needed(values)
+        is_dense_left: bool = offsets_left is None
+        is_dense_right: bool = offsets_right is None
+        total_seq_len, D = values.shape
+
+        if is_dense_left:
+            assert is_dense_right is False
+            assert offsets_right is not None
+            assert max_len_left is not None
+            B = offsets_right.shape[0] - 1
+            total_len_a = max_len_left * B
+            total_len_b = total_seq_len - total_len_a
+        elif is_dense_right:
+            assert is_dense_left is False
+            assert offsets_left is not None
+            assert max_len_right is not None
+            B = offsets_left.shape[0] - 1
+            total_len_b = max_len_right * B
+            total_len_a = total_seq_len - total_len_b
+        else:
+            assert offsets_left is not None and offsets_right is not None
+            B = offsets_left.shape[0] - 1
+            if total_len_left is not None and total_len_right is not None:
+                assert total_len_left + total_len_right == total_seq_len
+                total_len_a = total_len_left
+                total_len_b = total_len_right
+            else:
+                total_len_a = int(offsets_left[-1].item())
+                total_len_b = values.size(0) - total_len_a
+
+        BLOCK_D = triton.next_power_of_2(D)
+        values_a = torch.empty(
+            (total_len_a, D), device=values.device, dtype=values.dtype
+        )
+        values_b = torch.empty(
+            (total_len_b, D), device=values.device, dtype=values.dtype
+        )
+
+        def grid(meta):
+            return (triton.cdiv(max_seq_len, meta["BLOCK_N"]), B)
+
+        split_2D_jagged_multirow[grid](
+            JaggedIn=values,
+            OffsetsA=offsets_left,
+            OffsetsB=offsets_right,
+            MaxLenA=max_len_left,
+            MaxLenB=max_len_right,
+            OutA=values_a,
+            OutB=values_b,
+            D=D,
+            stride_id=values.stride(-2),
+            stride_ad=values_a.stride(-2),
+            stride_bd=values_b.stride(-2),
+            n_prefix_to_B=0,
+            IS_DENSE_A=is_dense_left,
+            IS_DENSE_B=is_dense_right,
+            BLOCK_D=BLOCK_D,
+        )
+
+        ctx.save_for_backward(offsets_left, offsets_right)
+        ctx.max_seq_len = max_seq_len
+        ctx.total_seq_len = total_seq_len
+        ctx.max_len_left = max_len_left
+        ctx.max_len_right = max_len_right
+        ctx.is_dense_left = is_dense_left
+        ctx.is_dense_right = is_dense_right
+        ctx.B = B
+        ctx.D = D
+
+        return values_a, values_b
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, *d_values
+    ) -> Tuple[None, torch.Tensor, None, None, None, None, None, None]:
+        offsets_left, offsets_right = ctx.saved_tensors
+        d_values_a, d_values_b = d_values
+        BLOCK_D = triton.next_power_of_2(ctx.D)
+        d_jagged_in = torch.empty(
+            (ctx.total_seq_len, ctx.D),
+            device=d_values_a.device,
+            dtype=d_values_a.dtype,
+        )
+
+        def grid(meta):
+            return (triton.cdiv(ctx.max_seq_len, meta["BLOCK_N"]), ctx.B)
+
+        concat_2D_jagged_multirow[grid](
+            ValuesA=d_values_a,
+            ValuesB=d_values_b,
+            OffsetsA=offsets_left,
+            OffsetsB=offsets_right,
+            MaxLenA=ctx.max_len_left,
+            MaxLenB=ctx.max_len_right,
+            Out=d_jagged_in,
+            D=ctx.D,
+            stride_ad=d_values_a.stride(-2),
+            stride_bd=d_values_b.stride(-2),
+            stride_od=d_jagged_in.stride(-2),
+            n_prefix_from_B=0,
+            IS_DENSE_A=ctx.is_dense_left,
+            IS_DENSE_B=ctx.is_dense_right,
+            BLOCK_D=BLOCK_D,
+        )
+
+        return None, d_jagged_in, None, None, None, None, None, None
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def triton_concat_2D_jagged_multirow(
+    max_seq_len: int,
+    values_a: torch.Tensor,
+    values_b: torch.Tensor,
+    offsets_a: Optional[torch.Tensor],
+    offsets_b: Optional[torch.Tensor],
+    max_len_a: int,
+    max_len_b: int,
+) -> torch.Tensor:
+    return _Concat2DJaggedMultirowFunction.apply(
+        max_seq_len,
+        values_a,
+        values_b,
+        max_len_a,
+        max_len_b,
+        offsets_a,
+        offsets_b,
+        0,
+    )
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def triton_split_2D_jagged_multirow(
+    max_seq_len: int,
+    values: torch.Tensor,
+    total_len_left: Optional[int] = None,
+    total_len_right: Optional[int] = None,
+    max_len_left: Optional[int] = None,
+    max_len_right: Optional[int] = None,
+    offsets_left: Optional[torch.Tensor] = None,
+    offsets_right: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    return _Split2DJaggedMultirowFunction.apply(
+        max_seq_len,
+        values,
+        total_len_left,
+        total_len_right,
+        max_len_left,
+        max_len_right,
+        offsets_left,
+        offsets_right,
+    )
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_layer_norm.py b/recommendation_v4/generative_recommenders/ops/triton/triton_layer_norm.py
new file mode 100644
index 000000000..cc513433b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_layer_norm.py
@@ -0,0 +1,1337 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+#!/usr/bin/env python3
+
+
+from typing import List, Optional, Tuple
+
+import torch
+
+# @manual=//triton:triton
+import triton
+
+# @manual=//triton:triton
+import triton.language as tl
+from generative_recommenders.common import (
+    switch_to_contiguous_if_needed,
+    triton_autotune,
+)
+from generative_recommenders.ops.triton._autotune_pinning import pinned_or_full
+from generative_recommenders.ops.utils import (
+    is_sm100_plus,
+    is_sm90,
+    maybe_register_custom_op,
+)
+
+try:
+    # @manual=//triton:triton
+    from triton.language.extra.libdevice import fast_dividef, rsqrt as libdevice_rsqrt
+except ImportError:
+    try:
+        # @manual=//triton:triton
+        from triton.language.extra.cuda.libdevice import (
+            fast_dividef,
+            rsqrt as libdevice_rsqrt,
+        )
+    except ImportError:
+        # pyre-ignore: Undefined import [21]
+        # @manual=//triton:triton
+        from triton.language.math import fast_dividef, rsqrt as libdevice_rsqrt
+
+
+def _get_layer_norm_fwd_configs() -> List[triton.Config]:
+    """Generate autotune configs for multi-row LayerNorm kernels."""
+    configs = []
+    block_ns = [4, 8, 16] if is_sm100_plus() else [1, 2, 4, 8]
+    for BLOCK_N in block_ns:
+        for num_warps in [1, 2, 4, 8]:
+            configs.append(
+                triton.Config(
+                    {"BLOCK_N": BLOCK_N},
+                    num_warps=num_warps,
+                )
+            )
+    return configs
+
+
+def _bwd_pre_hook(nargs):
+    nargs["DW"].zero_()
+    if "DB" in nargs:
+        nargs["DB"].zero_()
+
+
+def _get_norm_bwd_configs() -> List[triton.Config]:
+    """Generate autotune configs for multi-row LayerNorm kernels."""
+    configs = []
+    if is_sm100_plus():
+        block_ns = [8, 16]
+        num_shards_list = [8, 16]
+        num_warps_list = [2, 4]
+    elif is_sm90():
+        block_ns = [2, 4]
+        num_shards_list = [8]
+        num_warps_list = [2, 4]
+    else:
+        block_ns = [1, 2]
+        num_shards_list = [8]
+        num_warps_list = [2, 4]
+    for BLOCK_N in block_ns:
+        for num_warps in num_warps_list:
+            for num_shards in num_shards_list:
+                configs.append(
+                    triton.Config(
+                        {"BLOCK_N": BLOCK_N, "SHARDS_PER_SM": num_shards},
+                        num_warps=num_warps,
+                    )
+                )
+    return configs
+
+
+@triton_autotune(
+    configs=_get_layer_norm_fwd_configs(),
+    key=["BLOCK_D"],
+)
+@triton.jit
+def _layer_norm_fwd(
+    X,
+    Y,
+    Mean,
+    Rstd,
+    N,
+    D,
+    eps,
+    stride_x,
+    stride_y,
+    TRAINING: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    COMPUTE_MEAN_AND_RSTD: tl.constexpr,
+):
+    block_id = tl.program_id(0)
+    start_row = block_id * BLOCK_N
+
+    X_block_ptr = tl.make_block_ptr(
+        base=X,
+        shape=(N, D),
+        strides=(stride_x, 1),
+        offsets=(start_row, 0),
+        block_shape=(BLOCK_N, BLOCK_D),
+        order=(1, 0),
+    )
+
+    Y_block_ptr = tl.make_block_ptr(
+        base=Y,
+        shape=(N, D),
+        strides=(stride_y, 1),
+        offsets=(start_row, 0),
+        block_shape=(BLOCK_N, BLOCK_D),
+        order=(1, 0),
+    )
+
+    x_block = tl.load(X_block_ptr, boundary_check=(0, 1), padding_option="zero").to(
+        tl.float32
+    )
+
+    cols = tl.arange(0, BLOCK_D)
+    col_mask = cols < D
+    rows = start_row + tl.arange(0, BLOCK_N)
+    row_mask = rows < N
+
+    if COMPUTE_MEAN_AND_RSTD:
+        mean = tl.sum(x_block, axis=1) / D
+        if TRAINING:
+            tl.store(Mean + rows, mean, row_mask)
+        mean = tl.expand_dims(mean, 1)
+    else:
+        mean = tl.load(Mean + rows, row_mask, other=0.0)
+        mean = tl.expand_dims(mean, 1)
+
+    x_mean = x_block - mean
+    x_mean = tl.where(row_mask[:, None] & col_mask[None, :], x_mean, 0.0)
+
+    if COMPUTE_MEAN_AND_RSTD:
+        _var = x_mean * x_mean
+        var = tl.sum(_var, axis=1) / D
+        rstd = 1 / tl.sqrt(var + eps)
+        if TRAINING:
+            tl.store(Rstd + rows, rstd, row_mask)
+    else:
+        rstd = tl.load(Rstd + rows, row_mask, other=0.0)
+
+    rstd = tl.expand_dims(rstd, 1)
+    y = x_mean * rstd
+
+    tl.store(Y_block_ptr, y.to(Y.dtype.element_ty), boundary_check=(0, 1))
+
+
+@triton_autotune(
+    configs=_get_layer_norm_fwd_configs(),
+    key=["BLOCK_D"],
+)
+@triton.jit
+def _weighted_layer_norm_fwd(
+    X,
+    Y,
+    W,
+    B,
+    Mean,
+    Rstd,
+    N,
+    D,
+    eps,
+    stride_x,
+    stride_y,
+    IS_SWISH: tl.constexpr,
+    TRAINING: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    COMPUTE_MEAN_AND_RSTD: tl.constexpr,
+):
+    # Get the block ID and calculate starting row
+    block_id = tl.program_id(0)
+    start_row = block_id * BLOCK_N
+
+    # Load weight and bias once (shared across all rows in this block)
+    cols = tl.arange(0, BLOCK_D)
+    col_mask = cols < D
+    w = tl.load(W + cols, mask=col_mask, other=0.0).to(tl.float32)
+    b = tl.load(B + cols, mask=col_mask, other=0.0).to(tl.float32)
+
+    # Create block pointers for X and Y
+    X_block_ptr = tl.make_block_ptr(
+        base=X,
+        shape=(N, D),
+        strides=(stride_x, 1),
+        offsets=(start_row, 0),
+        block_shape=(BLOCK_N, BLOCK_D),
+        order=(1, 0),
+    )
+
+    Y_block_ptr = tl.make_block_ptr(
+        base=Y,
+        shape=(N, D),
+        strides=(stride_y, 1),
+        offsets=(start_row, 0),
+        block_shape=(BLOCK_N, BLOCK_D),
+        order=(1, 0),
+    )
+
+    x_block = tl.load(X_block_ptr, boundary_check=(0, 1), padding_option="zero").to(
+        tl.float32
+    )
+
+    rows = start_row + tl.arange(0, BLOCK_N)
+    row_mask = rows < N
+
+    if COMPUTE_MEAN_AND_RSTD:
+        mean = tl.sum(x_block, axis=1) / D
+        if TRAINING:
+            tl.store(Mean + rows, mean, row_mask)
+        mean = tl.expand_dims(mean, 1)
+    else:
+        mean = tl.load(Mean + rows, row_mask, other=0.0)
+        mean = tl.expand_dims(mean, 1)
+
+    x_mean = x_block - mean
+    x_mean = tl.where(row_mask[:, None] & col_mask[None, :], x_mean, 0.0)
+
+    if COMPUTE_MEAN_AND_RSTD:
+        _var = x_mean * x_mean
+        var = tl.sum(_var, axis=1) / D
+        rstd = libdevice_rsqrt(var + eps)
+        if TRAINING:
+            tl.store(Rstd + rows, rstd, row_mask)
+    else:
+        rstd = tl.load(Rstd + rows, row_mask, other=0.0)
+
+    rstd = tl.expand_dims(rstd, 1)
+    y = x_mean * rstd
+    y = y * w[None, :] + b[None, :]
+
+    if IS_SWISH:
+        y = tl.sigmoid(y) * x_block
+
+    tl.store(Y_block_ptr, y.to(Y.dtype.element_ty), boundary_check=(0, 1))
+
+
+@triton.jit
+def _layer_norm_bwd_dx(
+    DX,
+    DY,
+    X,
+    Mean,
+    Rstd,
+    stride_dx,
+    stride_dy,
+    stride_x,
+    D,
+    eps,
+    BLOCK_D: tl.constexpr,
+):
+    row = tl.program_id(0)
+    cols = tl.arange(0, BLOCK_D)
+    mask = cols < D
+    X += row.to(tl.int64) * stride_x
+    DY += row.to(tl.int64) * stride_dy
+    DX += row.to(tl.int64) * stride_dx
+
+    # Load data to SRAM
+    x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
+    dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
+    mean = tl.load(Mean + row)
+    rstd = tl.load(Rstd + row)
+
+    # Compute dx
+    xhat = (x - mean) * rstd
+    xhat = tl.where(mask, xhat, 0.0)
+    dy = tl.where(mask, dy, 0.0)
+    c1 = tl.sum(xhat * dy, axis=0) / D
+    c2 = tl.sum(dy, axis=0) / D
+    dx = (dy - (xhat * c1 + c2)) * rstd
+    # Write dx
+    tl.store(DX + cols, dx, mask=mask)
+
+
+@triton_autotune(
+    configs=pinned_or_full(
+        [
+            triton.Config({"BLOCK_N": 1}, num_warps=1),  # bs=32 winner
+            triton.Config({"BLOCK_N": 8}, num_warps=1),  # bs=1024 winner
+        ],
+        _get_layer_norm_fwd_configs,
+    ),
+    key=["BLOCK_D"],
+)
+@triton.jit
+def _weighted_layer_norm_bwd_dx(
+    DX,
+    DY,
+    DW,
+    DB,
+    X,
+    W,
+    B,
+    Mean,
+    Rstd,
+    stride_dx,
+    stride_dy,
+    stride_x,
+    D,
+    eps,
+    IS_SWISH: tl.constexpr,
+    N,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    tile_num = tl.num_programs(0)
+    num_blocks = tl.cdiv(N, BLOCK_N)
+    blocks_per_tile = num_blocks // tile_num
+    if pid < num_blocks % tile_num:
+        blocks_per_tile += 1
+
+    cols = tl.arange(0, BLOCK_D)
+    col_mask = cols < D
+    w = tl.load(W + cols, mask=col_mask, other=0.0).to(tl.float32)
+
+    acc_dw = tl.zeros([BLOCK_D], dtype=tl.float32)
+    acc_db = tl.zeros([BLOCK_D], dtype=tl.float32)
+
+    start_block = pid
+
+    for idx in range(blocks_per_tile):
+        current_block = start_block + idx * tile_num
+        start_row = current_block * BLOCK_N
+
+        X_block_ptr = tl.make_block_ptr(
+            base=X,
+            shape=(N, D),
+            strides=(stride_x, 1),
+            offsets=(start_row, 0),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+
+        DX_block_ptr = tl.make_block_ptr(
+            base=DX,
+            shape=(N, D),
+            strides=(stride_dx, 1),
+            offsets=(start_row, 0),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+
+        DY_block_ptr = tl.make_block_ptr(
+            base=DY,
+            shape=(N, D),
+            strides=(stride_dy, 1),
+            offsets=(start_row, 0),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+
+        # Load data blocks
+        x_block = tl.load(X_block_ptr, boundary_check=(0, 1), padding_option="zero").to(
+            tl.float32
+        )
+        dy_block = tl.load(
+            DY_block_ptr, boundary_check=(0, 1), padding_option="zero"
+        ).to(tl.float32)
+
+        # Load mean and rstd for all rows in this block
+        rows = start_row + tl.arange(0, BLOCK_N)
+        row_mask = rows < N
+        mean = tl.load(Mean + rows, row_mask, other=0.0)
+        rstd = tl.load(Rstd + rows, row_mask, other=0.0)
+
+        # Expand dimensions for broadcasting
+        mean = tl.expand_dims(mean, 1)
+        rstd = tl.expand_dims(rstd, 1)
+
+        xhat = (x_block - mean) * rstd
+
+        xhat = tl.where(row_mask[:, None] & col_mask[None, :], xhat, 0.0)
+        wdy = w[None, :] * dy_block
+        wdy = tl.where(row_mask[:, None] & col_mask[None, :], wdy, 0.0)
+
+        # Compute dx
+        if IS_SWISH:
+            b = tl.load(B + cols, mask=col_mask, other=0.0).to(tl.float32)
+            sigmoid_layer_norm = tl.sigmoid(xhat * w[None, :] + b[None, :])
+            sigmoid_layer_norm = tl.where(
+                row_mask[:, None] & col_mask[None, :], sigmoid_layer_norm, 0.0
+            )
+
+            sigmoid_deriv = sigmoid_layer_norm * (1 - sigmoid_layer_norm)
+            x_ = wdy * x_block * sigmoid_deriv
+            x_ = tl.where(row_mask[:, None] & col_mask[None, :], x_, 0.0)
+
+            c1 = tl.sum(xhat * x_, axis=1) / D
+            c2 = tl.sum(x_, axis=1) / D
+            c1 = tl.expand_dims(c1, 1)
+            c2 = tl.expand_dims(c2, 1)
+            dx = (x_ - (xhat * c1 + c2)) * rstd
+
+            dx = dy_block * sigmoid_layer_norm + dx
+            # Write dx
+            tl.store(DX_block_ptr, dx.to(DX.dtype.element_ty), boundary_check=(0, 1))
+            partial_dw = tl.sum(dy_block * x_block * xhat * sigmoid_deriv, axis=0)
+            partial_db = tl.sum(dy_block * x_block * sigmoid_deriv, axis=0)
+        else:
+            c1 = tl.sum(xhat * wdy, axis=1) / D
+            c2 = tl.sum(wdy, axis=1) / D
+            c1 = tl.expand_dims(c1, 1)
+            c2 = tl.expand_dims(c2, 1)
+            dx = (wdy - (xhat * c1 + c2)) * rstd
+            # Write dx
+            tl.store(DX_block_ptr, dx.to(DX.dtype.element_ty), boundary_check=(0, 1))
+            partial_dw = tl.sum(dy_block * xhat, axis=0)
+            partial_db = tl.sum(dy_block, axis=0)
+
+        # Accumulate partial sums in shared memory
+        acc_dw += partial_dw
+        acc_db += partial_db
+
+    # Store accumulated sums back to global memory
+    dw_ptrs = DW + pid.to(tl.int64) * D + cols
+    db_ptrs = DB + pid.to(tl.int64) * D + cols
+    tl.store(dw_ptrs, acc_dw, mask=col_mask)
+    tl.store(db_ptrs, acc_db, mask=col_mask)
+
+
+def _get_bwd_dwdb_configs() -> List[triton.Config]:
+    configs = []
+    BLOCK_N_CHOICES = [32, 64, 128, 256]
+    if is_sm100_plus():
+        BLOCK_N_CHOICES = [128, 256, 512, 1024]
+    for BLOCK_N in BLOCK_N_CHOICES:
+        for num_warps in [8, 16] + ([] if torch.ops.hip else [32]):
+            configs.append(
+                triton.Config(
+                    {"BLOCK_N": BLOCK_N},
+                    num_warps=num_warps,
+                )
+            )
+    return configs
+
+
+@triton_autotune(
+    configs=pinned_or_full(
+        [triton.Config({"BLOCK_N": 128}, num_warps=8)],
+        _get_bwd_dwdb_configs,
+    ),
+    key=["D"],
+)
+@triton.jit
+def _layer_norm_bwd_dwdb(
+    DW,
+    DB,
+    FINAL_DW,
+    FINAL_DB,
+    N,
+    D,
+    BLOCK_N: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    cols = pid * BLOCK_D + tl.arange(0, BLOCK_D)
+    dw = tl.zeros((BLOCK_N, BLOCK_D), dtype=tl.float32)
+    db = tl.zeros((BLOCK_N, BLOCK_D), dtype=tl.float32)
+
+    for i in range(0, N, BLOCK_N):
+        rows = i + tl.arange(0, BLOCK_N)
+        # pyre-fixme[16]: `int` has no attribute `__getitem__`.
+        mask = (rows[:, None] < N) & (cols[None, :] < D)
+        offs = rows[:, None] * D + cols[None, :]
+        dw += tl.load(DW + offs, mask=mask, other=0.0)
+        db += tl.load(DB + offs, mask=mask, other=0.0)
+
+    sum_dw = tl.sum(dw, axis=0)
+    sum_db = tl.sum(db, axis=0)
+    tl.store(FINAL_DW + cols, sum_dw.to(FINAL_DW.dtype.element_ty), mask=cols < D)
+    tl.store(FINAL_DB + cols, sum_db.to(FINAL_DB.dtype.element_ty), mask=cols < D)
+
+
+def compute_BLOCK_D(x: torch.Tensor) -> int:
+    """Compute the BLOCK_D parameter for layer norm kernels."""
+    D = x.shape[-1]
+    MAX_FUSED_SIZE = 65536 // x.element_size()
+    return min(MAX_FUSED_SIZE, triton.next_power_of_2(D))
+
+
+@maybe_register_custom_op(
+    "generative_recommenders::triton_weighted_layer_norm_fwd", mutates_args=()
+)
+def triton_weighted_layer_norm_fwd(
+    x: torch.Tensor,
+    weight: Optional[torch.Tensor],
+    bias: Optional[torch.Tensor],
+    eps: float,
+    mean: Optional[torch.Tensor] = None,
+    rstd: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2, f"x.dim() == {x.dim()}, expected 2"
+    x = switch_to_contiguous_if_needed(x)
+    N, D = x.shape
+    learnable = weight is not None
+    if learnable:
+        assert bias is not None and weight is not None
+        assert weight.dim() == 1
+        assert bias.dim() == 1
+        assert weight.numel() == D
+        assert bias.numel() == D
+
+    y = torch.empty_like(x)
+    compute_mean_and_rstd = mean is None or rstd is None
+    # Always allocate new tensors to avoid aliasing inputs with outputs
+    out_mean = torch.empty((N,), dtype=torch.float32, device=x.device)
+    out_rstd = torch.empty((N,), dtype=torch.float32, device=x.device)
+    if not compute_mean_and_rstd:
+        assert mean is not None and rstd is not None
+        out_mean.copy_(mean)
+        out_rstd.copy_(rstd)
+
+    # Less than 64KB per feature: enqueue fused kernel
+    MAX_FUSED_SIZE = 65536 // x.element_size()
+    BLOCK_D: int = min(MAX_FUSED_SIZE, triton.next_power_of_2(D))
+    if D > BLOCK_D:
+        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
+
+    if N == 0:
+        return y, out_mean, out_rstd
+
+    # pyre-ignore[28]
+    grid = lambda meta: (triton.cdiv(N, meta["BLOCK_N"]),)  # noqa E731
+    if learnable:
+        _weighted_layer_norm_fwd[grid](
+            x,
+            y,
+            weight,
+            bias,
+            out_mean,
+            out_rstd,
+            N,
+            D,
+            eps,
+            x.stride(0),
+            y.stride(0),
+            IS_SWISH=False,
+            TRAINING=True,
+            BLOCK_D=BLOCK_D,
+            COMPUTE_MEAN_AND_RSTD=compute_mean_and_rstd,
+        )
+    else:
+        _layer_norm_fwd[grid](
+            x,
+            y,
+            out_mean,
+            out_rstd,
+            N,
+            D,
+            eps,
+            x.stride(0),
+            y.stride(0),
+            TRAINING=True,
+            BLOCK_D=BLOCK_D,
+            COMPUTE_MEAN_AND_RSTD=compute_mean_and_rstd,
+        )
+
+    return y, out_mean, out_rstd
+
+
+@triton_weighted_layer_norm_fwd.register_fake
+def _(
+    x: torch.Tensor,
+    weight: Optional[torch.Tensor],
+    bias: Optional[torch.Tensor],
+    eps: float,
+    mean: Optional[torch.Tensor] = None,
+    rstd: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    N = x.shape[0]
+    y = torch.empty_like(x)
+    # Always allocate new tensors to avoid aliasing inputs with outputs
+    out_mean = torch.empty((N,), dtype=torch.float32, device=x.device)
+    out_rstd = torch.empty((N,), dtype=torch.float32, device=x.device)
+    return y, out_mean, out_rstd
+
+
+@maybe_register_custom_op(
+    "generative_recommenders::triton_weighted_layer_norm_bwd", mutates_args=()
+)
+def _triton_weighted_layer_norm_bwd_impl(
+    dy: torch.Tensor,
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    mean: torch.Tensor,
+    rstd: torch.Tensor,
+    learnable: bool,
+    eps: float,
+    BLOCK_D: int,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    num_warps: int = min(max(BLOCK_D // 256, 1), 8)
+    if learnable:
+        N, D = x.shape
+        dx = torch.empty_like(x)
+        sms = torch.cuda.get_device_properties(x.device).multi_processor_count
+        tile_num = max(1, min(sms * 8, N // 4))
+        _dweight = torch.empty((tile_num, D), dtype=torch.float32, device=x.device)
+        _dbias = torch.empty((tile_num, D), dtype=torch.float32, device=x.device)
+        dweight = torch.empty((D,), dtype=weight.dtype, device=x.device)
+        dbias = torch.empty((D,), dtype=weight.dtype, device=x.device)
+        if N == 0:
+            dweight.zero_()
+            dbias.zero_()
+            return dx, dweight, dbias
+        # pyre-ignore[28]
+        _weighted_layer_norm_bwd_dx[(tile_num,)](
+            dx,
+            dy,
+            _dweight,
+            _dbias,
+            x,
+            weight,
+            bias,
+            mean,
+            rstd,
+            dx.stride(0),
+            dy.stride(0),
+            x.stride(0),
+            D,
+            eps,
+            IS_SWISH=False,
+            N=N,
+            BLOCK_D=BLOCK_D,
+        )
+
+        def grid(META):
+            return (triton.cdiv(D, META["BLOCK_D"]),)
+
+        blocks = triton.next_power_of_2(sms * 4)
+        BLOCK_D = triton.next_power_of_2(triton.cdiv(D, blocks))
+        BLOCK_D = min(max(BLOCK_D, 4), 128)
+        _layer_norm_bwd_dwdb[grid](
+            _dweight,
+            _dbias,
+            dweight,
+            dbias,
+            tile_num,
+            D,
+            BLOCK_D=BLOCK_D,
+        )
+
+        return dx, dweight, dbias
+    else:
+        N, D = x.shape
+        dx = torch.empty_like(x)
+        # Return empty tensors as sentinels for None
+        dweight = torch.empty(0, dtype=x.dtype, device=x.device)
+        dbias = torch.empty(0, dtype=x.dtype, device=x.device)
+        if N == 0:
+            return dx, dweight, dbias
+        # pyre-ignore[28]
+        _layer_norm_bwd_dx[(N,)](
+            dx,
+            dy,
+            x,
+            mean,
+            rstd,
+            dx.stride(0),
+            dy.stride(0),
+            x.stride(0),
+            D,
+            eps,
+            BLOCK_D=BLOCK_D,
+            num_warps=num_warps,
+        )
+        return dx, dweight, dbias
+
+
+@_triton_weighted_layer_norm_bwd_impl.register_fake
+def _(
+    dy: torch.Tensor,
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    mean: torch.Tensor,
+    rstd: torch.Tensor,
+    learnable: bool,
+    eps: float,
+    BLOCK_D: int,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    dx = torch.empty_like(x)
+    if learnable:
+        D = x.shape[-1]
+        dweight = torch.empty((D,), dtype=weight.dtype, device=x.device)
+        dbias = torch.empty((D,), dtype=weight.dtype, device=x.device)
+    else:
+        dweight = torch.empty(0, dtype=x.dtype, device=x.device)
+        dbias = torch.empty(0, dtype=x.dtype, device=x.device)
+    return dx, dweight, dbias
+
+
+def triton_weighted_layer_norm_bwd(
+    dy: torch.Tensor,
+    x: torch.Tensor,
+    weight: Optional[torch.Tensor],
+    bias: Optional[torch.Tensor],
+    mean: torch.Tensor,
+    rstd: torch.Tensor,
+    learnable: bool,
+    eps: float,
+    BLOCK_D: int,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+    # Use sentinel tensors for custom_op compatibility (can't return Optional[Tensor])
+    _weight = (
+        weight if weight is not None else torch.empty(0, dtype=x.dtype, device=x.device)
+    )
+    _bias = bias if bias is not None else torch.empty(0, dtype=x.dtype, device=x.device)
+    dx, dweight, dbias = _triton_weighted_layer_norm_bwd_impl(
+        dy=dy,
+        x=x,
+        weight=_weight,
+        bias=_bias,
+        mean=mean,
+        rstd=rstd,
+        learnable=learnable,
+        eps=eps,
+        BLOCK_D=BLOCK_D,
+    )
+    if not learnable:
+        return dx, None, None
+    return dx, dweight, dbias
+
+
+class LayerNormFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        x: torch.Tensor,
+        weight: Optional[torch.Tensor],
+        bias: Optional[torch.Tensor],
+        eps: float,
+    ) -> torch.Tensor:
+        y, mean, rstd = triton_weighted_layer_norm_fwd(
+            x=x,
+            weight=weight,
+            bias=bias,
+            eps=eps,
+        )
+        BLOCK_D = compute_BLOCK_D(x)
+        learnable = weight is not None
+        if learnable:
+            ctx.save_for_backward(x, weight, bias, mean, rstd)
+        else:
+            ctx.save_for_backward(x, mean, rstd)
+        ctx.BLOCK_D = BLOCK_D
+        ctx.eps = eps
+        ctx.learnable = learnable
+        return y
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, dy: torch.Tensor
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], None]:
+        if ctx.learnable:
+            x, weight, bias, mean, rstd = ctx.saved_tensors
+        else:
+            x, mean, rstd = ctx.saved_tensors
+            weight, bias = None, None
+        dx, dweight, dbias = triton_weighted_layer_norm_bwd(
+            dy=dy,
+            x=x,
+            weight=weight,
+            bias=bias,
+            mean=mean,
+            rstd=rstd,
+            learnable=ctx.learnable,
+            eps=ctx.eps,
+            BLOCK_D=ctx.BLOCK_D,
+        )
+        return dx, dweight, dbias, None
+
+
+def _get_rms_norm_fwd_configs() -> List[triton.Config]:
+    """Generate autotune configs for multi-row RMSNorm kernels."""
+    configs = []
+    for BLOCK_N in [1, 4, 16]:
+        for num_warps in [2, 4]:
+            configs.append(
+                triton.Config(
+                    {"BLOCK_N": BLOCK_N},
+                    num_warps=num_warps,
+                )
+            )
+    return configs
+
+
+@triton.autotune(
+    configs=_get_rms_norm_fwd_configs(),
+    key=["BLOCK_D", "SILU"],
+)
+@triton.jit
+def _weighted_rms_norm_fwd(
+    X,
+    Y,
+    W,
+    Rstd,
+    N,
+    D: tl.constexpr,
+    eps,
+    stride_x,
+    stride_y,
+    SILU: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    block_id = tl.program_id(0)
+    start_row = block_id * BLOCK_N
+
+    # Load weight once (shared across all rows in this block)
+    cols = tl.arange(0, BLOCK_D)
+    col_mask = cols < D
+    w = tl.load(W + cols, mask=col_mask, other=0.0).to(tl.float32)
+
+    # Create block pointers for X and Y
+    X_block_ptr = tl.make_block_ptr(
+        base=X,
+        shape=(N, D),
+        strides=(stride_x, 1),
+        offsets=(start_row, 0),
+        block_shape=(BLOCK_N, BLOCK_D),
+        order=(1, 0),
+    )
+
+    Y_block_ptr = tl.make_block_ptr(
+        base=Y,
+        shape=(N, D),
+        strides=(stride_y, 1),
+        offsets=(start_row, 0),
+        block_shape=(BLOCK_N, BLOCK_D),
+        order=(1, 0),
+    )
+
+    x_block = tl.load(X_block_ptr, boundary_check=(0, 1), padding_option="zero").to(
+        tl.float32
+    )
+
+    rows = start_row + tl.arange(0, BLOCK_N)
+    row_mask = rows < N
+
+    # Compute variance (RMS norm uses x directly, not x - mean)
+    x_masked = tl.where(row_mask[:, None] & col_mask[None, :], x_block, 0.0)
+    _var = x_masked * x_masked
+    var = tl.sum(_var, axis=1) / D
+    rstd = 1 / tl.sqrt(var + eps)
+    tl.store(Rstd + rows, rstd, row_mask)
+
+    # Normalize and apply linear transformation
+    rstd = tl.expand_dims(rstd, 1)
+    y = x_block * rstd
+    y = y * w[None, :]
+
+    if SILU:
+        # pyre-ignore[16]: Module `triton.language.math` has no attribute `fast_dividef`
+        y = fast_dividef(y, 1.0 + tl.exp(-y))
+
+    tl.store(Y_block_ptr, y.to(Y.dtype.element_ty), boundary_check=(0, 1))
+
+
+@triton.jit
+def _weighted_rms_norm_bwd_dx(
+    DX,
+    DY,
+    DW,
+    X,
+    W,
+    Rstd,
+    Lock,
+    stride_dx,
+    stride_dy,
+    stride_x,
+    D: tl.constexpr,
+    eps,
+    GROUP_N,
+    BLOCK_D: tl.constexpr,
+):
+    row = tl.program_id(0)
+    cols = tl.arange(0, BLOCK_D)
+    mask = cols < D
+    X += row.to(tl.int64) * stride_x
+    DY += row.to(tl.int64) * stride_dy
+    DX += row.to(tl.int64) * stride_dx
+
+    # Load data to SRAM
+    x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
+    dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
+    rstd = tl.load(Rstd + row)
+
+    # Compute dx
+    xhat = x * rstd
+    w = tl.load(W + cols, mask=mask).to(tl.float32)
+    wdy = w * dy
+
+    xhat = tl.where(mask, xhat, 0.0)
+    wdy = tl.where(mask, wdy, 0.0)
+    c1 = tl.sum(xhat * wdy, axis=0) / D
+    dx = (wdy - (xhat * c1)) * rstd
+    # Write dx
+    tl.store(DX + cols, dx, mask=mask)
+
+    # Offset locks and weights/biases gradient pointer for parallel reduction
+    lock_id = row % GROUP_N
+    Lock += lock_id
+    Count = Lock + GROUP_N
+    DW = DW + lock_id * D + cols
+    # Accumulate partial sums for dw/db
+    partial_dw = dy * xhat
+    while tl.atomic_cas(Lock, 0, 1) == 1:
+        pass
+    count = tl.load(Count)
+    # First store doesn't accumulate
+    if count == 0:
+        tl.atomic_xchg(Count, 1)
+    else:
+        partial_dw += tl.load(DW, mask=mask)
+    tl.store(DW, partial_dw, mask=mask)
+    # Release the lock
+    tl.atomic_xchg(Lock, 0)
+
+
+@triton_autotune(
+    configs=_get_norm_bwd_configs(),
+    key=["BLOCK_D", "SILU"],
+    reset_to_zero=["DW"],
+)
+@triton.jit
+def _weighted_rms_norm_bwd(
+    DX,
+    DY,
+    DW,
+    X,
+    W,
+    Rstd,
+    stride_dx,
+    stride_dy,
+    stride_x,
+    D,
+    eps,
+    N,
+    SILU: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    SHARDS_PER_SM: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    tile_num = tl.num_programs(0)
+    num_blocks = tl.cdiv(N, BLOCK_N)
+    blocks_per_tile = num_blocks // tile_num
+    if pid < num_blocks % tile_num:
+        blocks_per_tile += 1
+
+    cols = tl.arange(0, BLOCK_D)
+    col_mask = cols < D
+    w = tl.load(W + cols, mask=col_mask, other=0.0).to(tl.float32)
+
+    start_block = pid
+
+    acc_dw = tl.zeros([BLOCK_D], dtype=tl.float32)
+
+    for idx in range(blocks_per_tile):
+        current_block = start_block + idx * tile_num
+        start_row = current_block * BLOCK_N
+
+        X_block_ptr = tl.make_block_ptr(
+            base=X,
+            shape=(N, D),
+            strides=(stride_x, 1),
+            offsets=(start_row, 0),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+
+        DX_block_ptr = tl.make_block_ptr(
+            base=DX,
+            shape=(N, D),
+            strides=(stride_dx, 1),
+            offsets=(start_row, 0),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+
+        DY_block_ptr = tl.make_block_ptr(
+            base=DY,
+            shape=(N, D),
+            strides=(stride_dy, 1),
+            offsets=(start_row, 0),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+
+        # Load data blocks
+        x_block = tl.load(X_block_ptr, boundary_check=(0, 1), padding_option="zero").to(
+            tl.float32
+        )
+        dy_block = tl.load(
+            DY_block_ptr, boundary_check=(0, 1), padding_option="zero"
+        ).to(tl.float32)
+
+        # Load rstd for all rows in this block
+        rows = start_row + tl.arange(0, BLOCK_N)
+        row_mask = rows < N
+        rstd = tl.load(Rstd + rows, row_mask, other=0.0)
+
+        # Expand dimensions for broadcasting
+        rstd = tl.expand_dims(rstd, 1)
+
+        # Compute dx
+        xhat = x_block * rstd
+
+        # Apply SILU backward if enabled
+        if SILU:
+            y_before_silu = xhat * w[None, :]
+            # pyre-fixme[16]
+            sig_y = fast_dividef(1.0, 1.0 + tl.exp(-y_before_silu))
+            # SILU derivative: sigmoid(y) + y * sigmoid(y) * (1 - sigmoid(y))
+            dy_block = dy_block * (sig_y + y_before_silu * sig_y * (1.0 - sig_y))
+
+        wdy = w[None, :] * dy_block
+
+        c1 = tl.sum(xhat * wdy, axis=1) / D
+        c1 = tl.expand_dims(c1, 1)
+        dx = (wdy - (xhat * c1)) * rstd
+
+        # Write dx
+        tl.store(DX_block_ptr, dx.to(DX.dtype.element_ty), boundary_check=(0, 1))
+
+        # Accumulate partial sums for dw
+        # Compute dw for all rows, then sum locally before atomic operation
+        partial_dw_block = dy_block * xhat
+        # Local reduction: sum across all rows in this block
+        partial_dw = tl.sum(partial_dw_block, axis=0)
+        acc_dw += partial_dw
+
+    DW_ptr = DW + cols
+    tl.atomic_add(DW_ptr, acc_dw, col_mask)
+
+
+@triton_autotune(
+    configs=_get_bwd_dwdb_configs(),
+    key=["D"],
+)
+@triton.jit
+def _rms_norm_bwd_dwdb(
+    DW,
+    FINAL_DW,
+    N,
+    D,
+    BLOCK_N: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    cols = pid * BLOCK_D + tl.arange(0, BLOCK_D)
+    dw = tl.zeros((BLOCK_N, BLOCK_D), dtype=tl.float32)
+
+    for i in range(0, N, BLOCK_N):
+        rows = i + tl.arange(0, BLOCK_N)
+        # pyre-fixme[16]: `int` has no attribute `__getitem__`.
+        mask = (rows[:, None] < N) & (cols[None, :] < D)
+        offs = rows[:, None] * D + cols[None, :]
+        dw += tl.load(DW + offs, mask=mask, other=0.0)
+
+    sum_dw = tl.sum(dw, axis=0)
+    tl.store(FINAL_DW + cols, sum_dw.to(FINAL_DW.dtype.element_ty), mask=cols < D)
+
+
+class RMSNormFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        x: torch.Tensor,
+        weight: torch.Tensor,
+        eps: float,
+        silu: bool,
+    ) -> torch.Tensor:
+        assert x.dim() == 2
+        x = switch_to_contiguous_if_needed(x)
+        N, D = x.shape
+        assert weight.dim() == 1
+        assert weight.numel() == D
+
+        y = torch.empty_like(x)
+        rstd = torch.empty((N,), dtype=torch.float32, device=x.device)
+
+        # Less than 64KB per feature: enqueue fused kernel
+        MAX_FUSED_SIZE = 65536 // x.element_size()
+        BLOCK_D = min(MAX_FUSED_SIZE, triton.next_power_of_2(D))
+        if D > BLOCK_D:
+            raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
+
+        ctx.save_for_backward(x, weight, rstd)
+        ctx.silu = silu
+        if N == 0:
+            return y
+
+        # pyre-ignore[28]
+        grid = lambda meta: (triton.cdiv(N, meta["BLOCK_N"]),)  # noqa E731
+        _weighted_rms_norm_fwd[grid](
+            x,
+            y,
+            weight,
+            rstd,
+            N,
+            D,
+            eps,
+            x.stride(0),
+            y.stride(0),
+            SILU=silu,
+            BLOCK_D=BLOCK_D,
+        )
+
+        ctx.BLOCK_D = BLOCK_D
+        ctx.eps = eps
+        return y
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, dy: torch.Tensor
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], None, None]:
+        x, weight, rstd = ctx.saved_tensors
+        N, D = x.shape
+        dx = torch.empty_like(x)
+        dweight = torch.zeros((D,), dtype=weight.dtype, device=x.device)
+        if N == 0:
+            dweight.zero_()
+            return dx, dweight, None, None
+
+        sms = torch.cuda.get_device_properties(x.device).multi_processor_count
+
+        # pyre-ignore[28]
+        grid = lambda meta: (  # noqa E731
+            max(1, min(sms * meta["SHARDS_PER_SM"], N // 4)),
+        )
+        _weighted_rms_norm_bwd[grid](
+            dx,
+            dy,
+            dweight,
+            x,
+            weight,
+            rstd,
+            dx.stride(0),
+            dy.stride(0),
+            x.stride(0),
+            D,
+            ctx.eps,
+            N=N,
+            SILU=ctx.silu,
+            BLOCK_D=ctx.BLOCK_D,
+        )
+
+        return dx, dweight, None, None
+
+
+class SwishLayerNormFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        x: torch.Tensor,
+        weight: torch.Tensor,
+        bias: torch.Tensor,
+        eps: float,
+    ) -> torch.Tensor:
+        assert x.dim() == 2, f"x.dim() == {x.dim()}, expected 2"
+        x = switch_to_contiguous_if_needed(x)
+        N, D = x.shape
+
+        assert bias is not None and weight is not None
+        assert weight.dim() == 1
+        assert bias.dim() == 1
+        assert weight.numel() == D
+        assert bias.numel() == D
+
+        y = torch.empty_like(x)
+        mean = torch.empty((N,), dtype=torch.float32, device=x.device)
+        rstd = torch.empty((N,), dtype=torch.float32, device=x.device)
+
+        BLOCK_D = triton.next_power_of_2(D)
+        num_warps = min(max(BLOCK_D // 256, 1), 8)
+
+        ctx.save_for_backward(x, weight, bias, mean, rstd)
+        ctx.BLOCK_D = BLOCK_D
+        ctx.num_warps = num_warps
+        ctx.eps = eps
+        if N == 0:
+            return y
+
+        # pyre-ignore[28]
+        grid = lambda meta: (triton.cdiv(N, meta["BLOCK_N"]),)  # noqa E731
+        _weighted_layer_norm_fwd[grid](
+            x,
+            y,
+            weight,
+            bias,
+            mean,
+            rstd,
+            N,
+            D,
+            eps,
+            x.stride(0),
+            y.stride(0),
+            IS_SWISH=True,
+            TRAINING=True,
+            BLOCK_D=BLOCK_D,
+            COMPUTE_MEAN_AND_RSTD=True,
+        )
+
+        return y
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, dy: torch.Tensor
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], None]:
+        x, weight, bias, mean, rstd = ctx.saved_tensors
+        N, D = x.shape
+        dx = torch.empty_like(x)
+        sms = torch.cuda.get_device_properties(x.device).multi_processor_count
+        tile_num = max(1, min(sms * 8, N // 4))
+        _dweight = torch.empty((tile_num, D), dtype=torch.float32, device=x.device)
+        _dbias = torch.empty((tile_num, D), dtype=torch.float32, device=x.device)
+        dweight = torch.empty((D,), dtype=weight.dtype, device=x.device)
+        dbias = torch.empty((D,), dtype=weight.dtype, device=x.device)
+        if N == 0:
+            dweight.zero_()
+            dbias.zero_()
+            return dx, dweight, dbias, None
+        # pyre-ignore[28]
+        _weighted_layer_norm_bwd_dx[(tile_num,)](
+            dx,
+            dy,
+            _dweight,
+            _dbias,
+            x,
+            weight,
+            bias,
+            mean,
+            rstd,
+            dx.stride(0),
+            dy.stride(0),
+            x.stride(0),
+            D,
+            ctx.eps,
+            IS_SWISH=True,
+            N=N,
+            BLOCK_D=ctx.BLOCK_D,
+        )
+
+        def grid(META):
+            return (triton.cdiv(D, META["BLOCK_D"]),)
+
+        blocks = triton.next_power_of_2(sms * 4)
+        BLOCK_D = triton.next_power_of_2(triton.cdiv(D, blocks))
+        BLOCK_D = min(max(BLOCK_D, 4), 128)
+        _layer_norm_bwd_dwdb[grid](
+            _dweight,
+            _dbias,
+            dweight,
+            dbias,
+            tile_num,
+            D,
+            BLOCK_D=BLOCK_D,
+        )
+
+        return dx, dweight, dbias, None
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def triton_layer_norm(
+    x: torch.Tensor,
+    weight: Optional[torch.Tensor],
+    bias: Optional[torch.Tensor],
+    eps: float,
+) -> torch.Tensor:
+    return LayerNormFunction.apply(x, weight, bias, eps)
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def triton_rms_norm(
+    x: torch.Tensor,
+    weight: Optional[torch.Tensor],
+    eps: float,
+    silu: bool = False,
+) -> torch.Tensor:
+    return RMSNormFunction.apply(x, weight, eps, silu)
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def triton_swish_layer_norm(
+    x: torch.Tensor,
+    normalized_shape: List[int],
+    weight: Optional[torch.Tensor],
+    bias: Optional[torch.Tensor],
+    eps: float,
+) -> torch.Tensor:
+    return SwishLayerNormFunction.apply(x, weight, bias, eps)
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_position.py b/recommendation_v4/generative_recommenders/ops/triton/triton_position.py
new file mode 100644
index 000000000..72c43f9ac
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_position.py
@@ -0,0 +1,438 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+#!/usr/bin/env python3
+
+
+from typing import List, Optional, Tuple
+
+import torch
+
+# @manual=//triton:triton
+import triton
+
+# @manual=//triton:triton
+import triton.language as tl
+
+try:
+    torch.ops.load_library("//hammer/ops/cuda:cuda_ops")
+except OSError:
+    pass
+
+from generative_recommenders.common import (
+    autotune_max_seq_len,
+    prev_power_of_2,
+    switch_to_contiguous_if_needed,
+    triton_autotune,
+)
+
+
+def _autotune_configs() -> List[triton.Config]:
+    configs = []
+    for BLOCK_N in [16, 32, 64]:
+        for num_stages in [1, 2]:
+            for num_warps in [2, 4, 8]:
+                configs.append(
+                    triton.Config(
+                        {
+                            "BLOCK_N": BLOCK_N,
+                        },
+                        num_stages=num_stages,
+                        num_warps=num_warps,
+                    )
+                )
+    return configs
+
+
+@triton_autotune(
+    configs=_autotune_configs(),
+    key=["AUTOTUNE_MAX_SEQ_LEN"],
+)
+@triton.jit
+def _add_timestamp_position_embeddings_kernel(
+    SeqEmb,
+    Offsets,
+    Lengths,
+    PosEmb,
+    TsEmb,
+    Out,
+    TS,
+    PosInds,
+    TsInds,
+    NumTargets,
+    AUTOTUNE_MAX_SEQ_LEN,
+    D,
+    num_time_buckets,
+    time_bucket_increments,
+    time_bucket_scale,
+    time_delta,
+    max_contextual_seq_len,
+    max_pos_ind,
+    stride_sn,
+    stride_pn,
+    stride_tn,
+    stride_on,
+    TRAINING: tl.constexpr,
+    HAS_MULTIPLE_TARGETS: tl.constexpr,
+    INTERLEAVE_TARGETS: tl.constexpr,
+    TIME_BUCKET_FN: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    """
+    SeqEmb has shape (sum_B(N_i), D),
+    PosEmb has shape (N_p, D),
+    TsEmb has shape (N_t, D),
+    Out has shape (sum_B(N_i), D)
+    """
+
+    off_b = tl.program_id(0)
+    off_n = tl.program_id(1)
+    seq_start = tl.load(Offsets + off_b)
+    seq_end = tl.load(Offsets + off_b + 1)
+    seq_len = seq_end - seq_start
+    start_n = off_n * BLOCK_N
+    if start_n >= seq_len:
+        return
+    offs_n = start_n + tl.arange(0, BLOCK_N)
+    offs_d = tl.arange(0, BLOCK_D)
+    seq_emb_offsets = offs_n[:, None] * stride_sn + offs_d[None, :]
+    SeqEmb += seq_start.to(tl.int64) * stride_sn
+    mask_n = offs_n < seq_len
+    # position encoding
+    seq_len = tl.load(Lengths + off_b)
+    if HAS_MULTIPLE_TARGETS:
+        num_targets = tl.load(NumTargets + off_b)
+        if INTERLEAVE_TARGETS:
+            high_ind = seq_len - num_targets * 2
+        else:
+            high_ind = seq_len - num_targets
+    else:
+        high_ind = seq_len
+    pos_inds = tl.where(offs_n < high_ind, offs_n, high_ind)
+    pos_inds = high_ind - pos_inds + max_contextual_seq_len
+    pos_inds = tl.where(pos_inds < max_pos_ind - 1, pos_inds, max_pos_ind - 1)
+    pos_inds = tl.where(offs_n < max_contextual_seq_len, offs_n, pos_inds)
+    if TRAINING:
+        tl.store(PosInds + seq_start + offs_n, pos_inds, mask=mask_n)
+    pos_emb_offsets = pos_inds[:, None] * stride_pn + offs_d[None, :]
+    # timestamp encoding
+    ts = tl.load(TS + seq_start + offs_n, mask=mask_n)
+    query_time = tl.load(TS + seq_end - 1)
+    ts = query_time - ts + time_delta
+    ts = tl.where(ts > 1e-6, ts, 1e-6) / time_bucket_increments
+    if TIME_BUCKET_FN == "log":
+        ts = tl.log(ts)
+    else:
+        ts = tl.sqrt(ts)
+    ts = ts * time_bucket_scale
+    ts = ts.to(tl.int32)
+    ts = tl.where(ts > 0, ts, 0)
+    ts = tl.where(ts < num_time_buckets, ts, num_time_buckets)
+    if TRAINING:
+        tl.store(TsInds + seq_start + offs_n, ts, mask=mask_n)
+    ts_emb_offsets = ts[:, None] * stride_tn + offs_d[None, :]
+    Out += seq_start.to(tl.int64) * stride_on
+    out_offsets = Out + offs_n[:, None] * stride_on + offs_d[None, :]
+    for _d in range(0, D, BLOCK_D):
+        mask = (offs_n[:, None] < seq_len) and offs_d[None, :] < D
+        seq_emb = tl.load(SeqEmb + seq_emb_offsets, mask=mask)
+        pos_emb = tl.load(PosEmb + pos_emb_offsets, mask=mask)
+        ts_emb = tl.load(TsEmb + ts_emb_offsets, mask=mask)
+        tl.store(out_offsets, seq_emb + (pos_emb + ts_emb).to(seq_emb.dtype), mask=mask)
+        seq_emb_offsets += BLOCK_D
+        pos_emb_offsets += BLOCK_D
+        ts_emb_offsets += BLOCK_D
+        out_offsets += BLOCK_D
+        offs_d += BLOCK_D
+
+
+def bwd_pre_hook(nargs):
+    nargs["Out"].zero_()
+
+
+def _add_embeddings_bwd_configs() -> List[triton.Config]:
+    configs = []
+    for BLOCK in [32, 64, 128]:
+        for num_stages in [2, 3, 4]:
+            for num_warps in [2, 4, 8]:
+                configs.append(
+                    triton.Config(
+                        {
+                            "BLOCK": BLOCK,
+                        },
+                        num_stages=num_stages,
+                        num_warps=num_warps,
+                        pre_hook=bwd_pre_hook,
+                    )
+                )
+    return configs
+
+
+@triton_autotune(
+    configs=_add_embeddings_bwd_configs(),
+    key=["AUTOTUNE_MAX_SEQ_LEN", "AUTOTUNE_B", "D"],
+)
+@triton.jit
+def _add_embeddings_bwd_kernel(
+    In,
+    KeyInds,
+    ValueInds,
+    Out,
+    AUTOTUNE_MAX_SEQ_LEN,
+    AUTOTUNE_B,
+    D,
+    jagged_size,
+    stride_in,
+    stride_on,
+    BLOCK_D: tl.constexpr,
+    BLOCK: tl.constexpr,
+):
+    off_block = tl.program_id(0)
+    offs_d = tl.arange(0, BLOCK_D)
+    mask_d = offs_d < D
+    key_ind = -1
+    key_ind = key_ind.to(KeyInds.dtype.element_ty)  # pyre-ignore[16]
+    accumulator = tl.zeros((BLOCK_D,), dtype=In.dtype.element_ty)
+    for off_i in range(0, BLOCK):
+        off = off_block * BLOCK + off_i
+        if off < jagged_size:
+            value_ind = tl.load(ValueInds + off)
+            in_offset = In + value_ind.to(tl.int64) * stride_in
+            jagged_in = tl.load(in_offset + offs_d, mask=mask_d)
+            key_ind_new = tl.load(KeyInds + off)
+            if key_ind == key_ind_new:
+                accumulator += jagged_in
+            else:
+                if key_ind >= 0:
+                    out_offset = Out + key_ind.to(tl.int64) * stride_on
+                    tl.atomic_add(
+                        out_offset + offs_d,
+                        accumulator.to(Out.dtype.element_ty),
+                        mask=mask_d,
+                        sem="relaxed",
+                    )
+                key_ind = key_ind_new
+                accumulator = jagged_in
+    if key_ind >= 0:
+        out_offset = Out + key_ind.to(tl.int64) * stride_on
+        tl.atomic_add(
+            out_offset + offs_d,
+            accumulator.to(Out.dtype.element_ty),
+            mask=mask_d,
+            sem="relaxed",
+        )
+
+
+class _AddTimestampPositionEmbeddingsFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        seq_embeddings: torch.Tensor,
+        seq_offsets: torch.Tensor,
+        pos_embeddings: torch.Tensor,
+        ts_embeddings: torch.Tensor,
+        timestamps: torch.Tensor,
+        max_seq_len: int,
+        max_contextual_seq_len: int,
+        seq_lengths: torch.Tensor,
+        num_targets: Optional[torch.Tensor],
+        interleave_targets: bool,
+        time_bucket_fn: str,
+    ):
+        seq_embeddings = switch_to_contiguous_if_needed(seq_embeddings)
+        pos_embeddings = switch_to_contiguous_if_needed(pos_embeddings)
+        ts_embeddings = switch_to_contiguous_if_needed(ts_embeddings)
+
+        max_pos_ind = pos_embeddings.shape[0]
+        B = seq_lengths.shape[0]
+        N, D = seq_embeddings.shape
+        assert len(pos_embeddings.shape) == 2
+        assert len(ts_embeddings.shape) == 2
+        assert pos_embeddings.shape[1] == D, (
+            "shape[1] of pos_embeddings much match seq_embeddings"
+        )
+        assert ts_embeddings.shape[1] == D, (
+            "shape[1] of ts_embeddings much match seq_embeddings"
+        )
+        out = torch.empty_like(seq_embeddings)
+
+        timestamps = switch_to_contiguous_if_needed(timestamps)
+        ts_inds = torch.empty_like(seq_embeddings[:, 0], dtype=torch.int32)
+        pos_inds = torch.empty_like(seq_embeddings[:, 0], dtype=torch.int32)
+        ts_emb_size = ts_embeddings.shape[0]
+
+        grid = lambda meta: (  # noqa E731
+            B,
+            triton.cdiv(max_seq_len, meta["BLOCK_N"]),
+        )
+        BLOCK_D = triton.next_power_of_2(D) if D < 64 else 64
+        _add_timestamp_position_embeddings_kernel[grid](
+            SeqEmb=seq_embeddings,
+            Offsets=seq_offsets,
+            Lengths=seq_lengths,
+            PosEmb=pos_embeddings,
+            TsEmb=ts_embeddings,
+            Out=out,
+            TS=timestamps,
+            PosInds=pos_inds,
+            TsInds=ts_inds,
+            NumTargets=num_targets,
+            AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(max_seq_len),
+            D=D,
+            num_time_buckets=ts_emb_size - 1,
+            time_bucket_increments=60.0,
+            time_bucket_scale=1.0,
+            time_delta=0,
+            max_contextual_seq_len=max_contextual_seq_len,
+            max_pos_ind=max_pos_ind,
+            stride_sn=seq_embeddings.stride(0),
+            stride_pn=pos_embeddings.stride(0),
+            stride_tn=ts_embeddings.stride(0),
+            stride_on=out.stride(0),
+            TRAINING=True,
+            HAS_MULTIPLE_TARGETS=num_targets is not None,
+            INTERLEAVE_TARGETS=interleave_targets,
+            TIME_BUCKET_FN=time_bucket_fn,
+            BLOCK_D=BLOCK_D,
+        )
+        try:
+            values = torch.arange(0, N, dtype=torch.int32, device=timestamps.device)
+            sorted_ts_key_inds, sorted_ts_value_inds = torch.ops.hammer.sort_kv_pairs(
+                ts_inds, values
+            )
+            sorted_pos_key_inds, sorted_pos_value_inds = torch.ops.hammer.sort_kv_pairs(
+                pos_inds, values
+            )
+        except Exception:
+            sorted_ts_key_inds, sorted_ts_value_inds = torch.sort(ts_inds)
+            sorted_pos_key_inds, sorted_pos_value_inds = torch.sort(pos_inds)
+        ctx.save_for_backward(
+            sorted_pos_key_inds,
+            sorted_pos_value_inds,
+            sorted_ts_key_inds,
+            sorted_ts_value_inds,
+        )
+        ctx.B = B
+        ctx.D = D
+        ctx.max_seq_len = max_seq_len
+        ctx.pos_emb_size = pos_embeddings.shape[0]
+        ctx.ts_emb_size = ts_emb_size
+        ctx.pos_dtype = pos_embeddings.dtype
+        ctx.ts_dtype = ts_embeddings.dtype
+        return out
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, d_out: torch.Tensor
+    ) -> Tuple[
+        torch.Tensor,
+        None,
+        torch.Tensor,
+        torch.Tensor,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+    ]:
+        (
+            sorted_pos_key_inds,
+            sorted_pos_value_inds,
+            sorted_ts_key_inds,
+            sorted_ts_value_inds,
+        ) = ctx.saved_tensors
+        d_pos_embeddings = torch.empty(
+            (ctx.pos_emb_size, ctx.D), device=d_out.device, dtype=torch.float32
+        )
+        d_ts_embeddings = torch.empty(
+            (ctx.ts_emb_size, ctx.D), device=d_out.device, dtype=torch.float32
+        )
+        grid = lambda meta: (triton.cdiv(d_out.shape[0], meta["BLOCK"]),)  # noqa E731
+        AUTOTUNE_B = prev_power_of_2(ctx.B)
+        _add_embeddings_bwd_kernel[grid](
+            In=d_out,
+            KeyInds=sorted_pos_key_inds,
+            ValueInds=sorted_pos_value_inds,
+            Out=d_pos_embeddings,
+            AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(ctx.max_seq_len),
+            AUTOTUNE_B=AUTOTUNE_B,
+            D=ctx.D,
+            jagged_size=d_out.shape[0],
+            stride_in=d_out.stride(0),
+            stride_on=d_pos_embeddings.stride(0),
+            BLOCK_D=triton.next_power_of_2(ctx.D),
+        )
+        _add_embeddings_bwd_kernel[grid](
+            In=d_out,
+            KeyInds=sorted_ts_key_inds,
+            ValueInds=sorted_ts_value_inds,
+            Out=d_ts_embeddings,
+            AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(ctx.max_seq_len),
+            AUTOTUNE_B=AUTOTUNE_B,
+            D=ctx.D,
+            jagged_size=d_out.shape[0],
+            stride_in=d_out.stride(0),
+            stride_on=d_ts_embeddings.stride(0),
+            BLOCK_D=triton.next_power_of_2(ctx.D),
+        )
+        return (
+            d_out,
+            None,
+            d_pos_embeddings.to(ctx.pos_dtype),
+            d_ts_embeddings.to(ctx.ts_dtype),
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def triton_add_timestamp_positional_embeddings(
+    seq_embeddings: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    pos_embeddings: torch.Tensor,
+    ts_embeddings: torch.Tensor,
+    timestamps: torch.Tensor,
+    max_seq_len: int,
+    max_contextual_seq_len: int,
+    seq_lengths: torch.Tensor,
+    num_targets: Optional[torch.Tensor],
+    interleave_targets: bool,
+    time_bucket_fn: str,
+) -> torch.Tensor:
+    return _AddTimestampPositionEmbeddingsFunction.apply(
+        seq_embeddings,
+        seq_offsets,
+        pos_embeddings,
+        ts_embeddings,
+        timestamps,
+        max_seq_len,
+        max_contextual_seq_len,
+        seq_lengths,
+        num_targets,
+        interleave_targets,
+        time_bucket_fn,
+    )
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_swiglu.py b/recommendation_v4/generative_recommenders/ops/triton/triton_swiglu.py
new file mode 100644
index 000000000..5f30da53b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_swiglu.py
@@ -0,0 +1,753 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-unsafe
+
+from typing import List
+
+import torch
+
+# @manual=//triton:triton
+import triton
+
+# @manual=//triton:triton
+import triton.language as tl
+from generative_recommenders.common import triton_autotune
+from generative_recommenders.ops.utils import is_sm100_plus
+
+TMA_AVAILABLE = False
+try:
+    # @manual=//triton:triton
+    from triton.tools.tensor_descriptor import TensorDescriptor
+
+    TMA_AVAILABLE = True
+except ImportError:
+    pass
+
+HAS_TLX = False
+try:
+    # @manual=//triton:triton
+    import triton.language.extra.tlx as tlx  # type: ignore
+
+    HAS_TLX = True
+except ImportError:
+    pass
+
+
+def is_blackwell_triton_swiglu_supported() -> bool:
+    return is_sm100_plus() and TMA_AVAILABLE and HAS_TLX
+
+
+def _swiglu_tma_set_block_size_hook(nargs) -> None:
+    BLOCK_M = nargs["BLOCK_M"]
+    BLOCK_N = nargs["BLOCK_N"]
+    BLOCK_K = nargs["BLOCK_K"]
+    EPILOGUE_SUBTILE = nargs.get("EPILOGUE_SUBTILE", 1)
+
+    nargs["x_desc"].block_shape = [BLOCK_M, BLOCK_K]
+    nargs["w_gate_desc"].block_shape = [BLOCK_N, BLOCK_K]
+    nargs["w_up_desc"].block_shape = [BLOCK_N, BLOCK_K]
+    nargs["out_desc"].block_shape = [BLOCK_M, BLOCK_N // EPILOGUE_SUBTILE]
+
+
+def get_swiglu_configs(pre_hook) -> List[triton.Config]:
+    return [
+        triton.Config(
+            {
+                "BLOCK_M": 128,
+                "BLOCK_N": 128,
+                "BLOCK_K": 64,
+                "GROUP_M": 8,
+            },
+            num_stages=1,
+            num_warps=4,
+            pre_hook=pre_hook,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 64,
+                "BLOCK_N": 128,
+                "BLOCK_K": 64,
+                "GROUP_M": 8,
+            },
+            num_stages=1,
+            num_warps=4,
+            pre_hook=pre_hook,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 128,
+                "BLOCK_N": 64,
+                "BLOCK_K": 64,
+                "GROUP_M": 8,
+            },
+            num_stages=1,
+            num_warps=4,
+            pre_hook=pre_hook,
+        ),
+    ]
+
+
+@triton.jit
+def _compute_pid_swiglu(
+    tile_id,
+    num_pid_in_group,
+    num_pid_m,
+    GROUP_M: tl.constexpr,
+    NUM_SMS: tl.constexpr,
+):
+    group_id = tile_id // num_pid_in_group
+    first_pid_m = group_id * GROUP_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_M)
+    pid_m = first_pid_m + (tile_id % group_size_m)
+    pid_n = (tile_id % num_pid_in_group) // group_size_m
+    return pid_m, pid_n
+
+
+@triton_autotune(
+    configs=get_swiglu_configs(pre_hook=_swiglu_tma_set_block_size_hook),
+    key=["M_BLOCK", "N", "K"],
+)
+@triton.jit
+def _swiglu_fwd_tma_ws_persistent(
+    x_desc,
+    w_gate_desc,
+    w_up_desc,
+    out_desc,
+    M,
+    N,
+    K,
+    M_BLOCK,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_M: tl.constexpr,
+    NUM_SMEM_BUFFERS: tl.constexpr,
+    NUM_TMEM_BUFFERS: tl.constexpr,
+    NUM_SMS: tl.constexpr,
+    EPILOGUE_SUBTILE: tl.constexpr,
+):
+    # Allocate SMEM buffers
+    x_buffers = tlx.local_alloc((BLOCK_M, BLOCK_K), x_desc.dtype, NUM_SMEM_BUFFERS)
+
+    # Allocate SMEM buffers for W_gate and W_up
+    w_gate_buffers = tlx.local_alloc(
+        (BLOCK_N, BLOCK_K), w_gate_desc.dtype, NUM_SMEM_BUFFERS
+    )
+    w_up_buffers = tlx.local_alloc(
+        (BLOCK_N, BLOCK_K), w_up_desc.dtype, NUM_SMEM_BUFFERS
+    )
+
+    # Allocate TMEM for accumulators
+    tmem_gate_buffers = tlx.local_alloc(
+        (BLOCK_M, BLOCK_N), tl.float32, NUM_TMEM_BUFFERS, tlx.storage_kind.tmem
+    )
+    tmem_up_buffers = tlx.local_alloc(
+        (BLOCK_M, BLOCK_N), tl.float32, NUM_TMEM_BUFFERS, tlx.storage_kind.tmem
+    )
+
+    # Barriers for Producer <-> MMA synchronization
+    smem_full_bars_x_gate = tlx.alloc_barriers(
+        num_barriers=NUM_SMEM_BUFFERS,
+        arrive_count=1,  # pyre-ignore[6]
+    )
+    smem_full_bars_up = tlx.alloc_barriers(
+        num_barriers=NUM_SMEM_BUFFERS,
+        arrive_count=1,  # pyre-ignore[6]
+    )
+    # Empty barriers: arrive_count=2 because both GEMM1 and GEMM2 signal completion
+    smem_empty_bars = tlx.alloc_barriers(
+        num_barriers=NUM_SMEM_BUFFERS,
+        arrive_count=2,  # pyre-ignore[6]
+    )
+
+    # Barriers for MMA <-> Epilogue synchronization
+    # pyre-ignore[6]
+    tmem_full_bars = tlx.alloc_barriers(num_barriers=NUM_TMEM_BUFFERS, arrive_count=1)
+    # pyre-ignore[6]
+    tmem_empty_bars = tlx.alloc_barriers(num_barriers=NUM_TMEM_BUFFERS, arrive_count=1)
+
+    with tlx.async_tasks():
+        # Epilogue Consumer: Reads from TMEM, applies SwiGLU, and stores to output
+        with tlx.async_task("default"):
+            start_pid = tl.program_id(axis=0)
+            num_pid_m = tl.cdiv(M, BLOCK_M)
+            num_pid_n = tl.cdiv(N, BLOCK_N)
+            num_pid_in_group = GROUP_M * num_pid_n
+            num_tiles = num_pid_m * num_pid_n
+
+            # Initialize buffer tracking
+            processed_tiles = 0
+
+            for tile_id in range(start_pid, num_tiles, NUM_SMS):
+                pid_m, pid_n = _compute_pid_swiglu(
+                    tile_id, num_pid_in_group, num_pid_m, GROUP_M, NUM_SMS
+                )
+                offs_m = pid_m * BLOCK_M
+                offs_n = pid_n * BLOCK_N
+
+                cur_tmem_buf = processed_tiles % int(NUM_TMEM_BUFFERS)
+                tmem_read_phase = (processed_tiles // int(NUM_TMEM_BUFFERS)) & 1
+
+                # Wait for MMA to finish writing to TMEM
+                # pyre-ignore[16]
+                tlx.barrier_wait(tmem_full_bars[cur_tmem_buf], tmem_read_phase)
+
+                # Load gate and up results from TMEM
+                # pyre-ignore[16]
+                gate_tmem = tmem_gate_buffers[cur_tmem_buf]
+                up_tmem = tmem_up_buffers[cur_tmem_buf]
+
+                if EPILOGUE_SUBTILE > 1:
+                    # Process tile in subtiles
+                    slice_size: tl.constexpr = BLOCK_N // EPILOGUE_SUBTILE
+                    for slice_id in tl.static_range(EPILOGUE_SUBTILE):
+                        gate_subslice = tlx.local_slice(
+                            gate_tmem,
+                            [0, slice_id * slice_size],
+                            # pyre-ignore[6]
+                            [BLOCK_M, slice_size],
+                        )
+                        up_subslice = tlx.local_slice(
+                            up_tmem,
+                            [0, slice_id * slice_size],
+                            # pyre-ignore[6]
+                            [BLOCK_M, slice_size],
+                        )
+
+                        gate = tlx.local_load(gate_subslice).to(out_desc.dtype)
+                        up = tlx.local_load(up_subslice).to(out_desc.dtype)
+
+                        gate_fp32 = gate.to(tl.float32)
+                        silu_gate = (gate_fp32 * tl.sigmoid(gate_fp32)).to(
+                            out_desc.dtype
+                        )
+                        result = silu_gate * up
+
+                        out_desc.store([offs_m, offs_n + slice_id * slice_size], result)
+                else:
+                    # Process full tile
+                    gate = tlx.local_load(gate_tmem).to(out_desc.dtype)
+                    up = tlx.local_load(up_tmem).to(out_desc.dtype)
+
+                    gate_fp32 = gate.to(tl.float32)
+                    silu_gate = (gate_fp32 * tl.sigmoid(gate_fp32)).to(out_desc.dtype)
+                    result = silu_gate * up
+
+                    out_desc.store([offs_m, offs_n], result)
+
+                # Signal MMA that TMEM buffer is free
+                # pyre-ignore[6]
+                tlx.barrier_arrive(tmem_empty_bars[cur_tmem_buf], 1)
+
+                processed_tiles += 1
+
+        # MMA Consumer: Computes both GEMMs: gate = X @ W_gate, up = X @ W_up
+        with tlx.async_task(num_warps=4, num_regs=232):
+            start_pid = tl.program_id(axis=0)
+            num_pid_m = tl.cdiv(M, BLOCK_M)
+            num_pid_n = tl.cdiv(N, BLOCK_N)
+            num_pid_in_group = GROUP_M * num_pid_n
+            num_tiles = num_pid_m * num_pid_n
+            k_tiles = tl.cdiv(K, BLOCK_K)
+
+            processed_k_iters = 0
+            processed_tiles = 0
+
+            for tile_id in range(start_pid, num_tiles, NUM_SMS):
+                pid_m, pid_n = _compute_pid_swiglu(
+                    tile_id, num_pid_in_group, num_pid_m, GROUP_M, NUM_SMS
+                )
+
+                cur_tmem_buf = processed_tiles % int(NUM_TMEM_BUFFERS)
+                tmem_write_phase = (processed_tiles // int(NUM_TMEM_BUFFERS)) & 1
+
+                # Wait for epilogue to finish
+                tlx.barrier_wait(tmem_empty_bars[cur_tmem_buf], tmem_write_phase ^ 1)
+
+                # Perform K-dimension reduction for both GEMMs
+                for k in range(0, k_tiles):
+                    buf = (processed_k_iters + k) % int(NUM_SMEM_BUFFERS)
+
+                    total_iters = processed_k_iters + k
+                    dot_phase = (total_iters // int(NUM_SMEM_BUFFERS)) & 1
+
+                    # Wait for x and w_gate to be loaded, then start GEMM1
+                    tlx.barrier_wait(smem_full_bars_x_gate[buf], dot_phase)
+
+                    # Transpose weight buffer for MMA
+                    w_gate_trans = tlx.local_trans(w_gate_buffers[buf])
+
+                    # GEMM 1: gate = X @ W_gate.T
+                    tlx.async_dot(
+                        x_buffers[buf],
+                        w_gate_trans,
+                        tmem_gate_buffers[cur_tmem_buf],
+                        # pyre-ignore[6]
+                        use_acc=(k > 0),
+                        mBarriers=[smem_empty_bars[buf]],
+                        out_dtype=tl.float32,
+                    )
+
+                    # Wait for w_up to be loaded before starting GEMM2
+                    tlx.barrier_wait(smem_full_bars_up[buf], dot_phase)
+
+                    w_up_trans = tlx.local_trans(w_up_buffers[buf])
+
+                    # GEMM 2: up = X @ W_up.T
+                    tlx.async_dot(
+                        x_buffers[buf],
+                        w_up_trans,
+                        tmem_up_buffers[cur_tmem_buf],
+                        # pyre-ignore[6]
+                        use_acc=(k > 0),
+                        mBarriers=[smem_empty_bars[buf]],
+                        out_dtype=tl.float32,
+                    )
+
+                # Wait for last MMA to complete
+                last_buf = (processed_k_iters + k_tiles - 1) % int(NUM_SMEM_BUFFERS)
+                last_total_iters = processed_k_iters + k_tiles - 1
+                last_dot_phase = (last_total_iters // int(NUM_SMEM_BUFFERS)) & 1
+                tlx.barrier_wait(smem_empty_bars[last_buf], last_dot_phase)
+
+                # Signal epilogue that results are ready
+                # pyre-ignore[6]
+                tlx.barrier_arrive(tmem_full_bars[cur_tmem_buf], 1)
+
+                processed_tiles += 1
+                processed_k_iters += k_tiles
+
+        # Producer: TMA loads for X, W_gate, W_up
+        with tlx.async_task(num_warps=1, num_regs=24):
+            start_pid = tl.program_id(axis=0)
+            num_pid_m = tl.cdiv(M, BLOCK_M)
+            num_pid_n = tl.cdiv(N, BLOCK_N)
+            num_pid_in_group = GROUP_M * num_pid_n
+            num_tiles = num_pid_m * num_pid_n
+            k_tiles = tl.cdiv(K, BLOCK_K)
+
+            # Initialize phase tracking
+            processed_k_iters = 0
+
+            for tile_id in range(start_pid, num_tiles, NUM_SMS):
+                pid_m, pid_n = _compute_pid_swiglu(
+                    tile_id, num_pid_in_group, num_pid_m, GROUP_M, NUM_SMS
+                )
+                offs_m = pid_m * BLOCK_M
+                offs_n = pid_n * BLOCK_N
+
+                for k in range(0, k_tiles):
+                    buf = (processed_k_iters + k) % int(NUM_SMEM_BUFFERS)
+
+                    total_iters = processed_k_iters + k
+                    load_phase = (total_iters // int(NUM_SMEM_BUFFERS)) & 1
+
+                    # Wait for buffer to be free
+                    tlx.barrier_wait(smem_empty_bars[buf], load_phase ^ 1)
+
+                    offs_k = k * BLOCK_K
+
+                    # Set expected bytes for x+w_gate barrier
+                    tlx.barrier_expect_bytes(
+                        smem_full_bars_x_gate[buf],
+                        # pyre-ignore[6]
+                        2 * (BLOCK_M * BLOCK_K + BLOCK_K * BLOCK_N),
+                    )
+
+                    # Set expected bytes for w_up barrier
+                    tlx.barrier_expect_bytes(
+                        smem_full_bars_up[buf],
+                        # pyre-ignore[6]
+                        2 * (BLOCK_K * BLOCK_N),
+                    )
+
+                    # Load x and w_gate first, signal smem_full_bars_x_gate
+                    tlx.async_descriptor_load(
+                        x_desc,
+                        x_buffers[buf],
+                        [offs_m, offs_k],
+                        smem_full_bars_x_gate[buf],
+                    )
+
+                    # Weights are in [N, K] layout, load with [offs_n, offs_k]
+                    tlx.async_descriptor_load(
+                        w_gate_desc,
+                        w_gate_buffers[buf],
+                        [offs_n, offs_k],
+                        smem_full_bars_x_gate[buf],
+                    )
+
+                    # Load w_up separately, signal smem_full_bars_up
+                    tlx.async_descriptor_load(
+                        w_up_desc,
+                        w_up_buffers[buf],
+                        [offs_n, offs_k],
+                        smem_full_bars_up[buf],
+                    )
+
+                processed_k_iters += k_tiles
+
+
+@torch.fx.wrap
+def triton_swiglu_fwd_tma_ws_persistent_tlx(
+    x: torch.Tensor,
+    w_gate: torch.Tensor,
+    w_up: torch.Tensor,
+) -> torch.Tensor:
+    M, K = x.shape
+    N, K_gate = w_gate.shape
+    N_up, K_up = w_up.shape
+
+    # Only bf16/fp16 supported by the kernel
+    supported_dtypes = (torch.bfloat16, torch.float16)
+    assert x.dtype in supported_dtypes, (
+        f"x.dtype must be bfloat16 or float16, got {x.dtype}"
+    )
+    assert w_gate.dtype in supported_dtypes, (
+        f"w_gate.dtype must be bfloat16 or float16, got {w_gate.dtype}"
+    )
+    assert w_up.dtype in supported_dtypes, (
+        f"w_up.dtype must be bfloat16 or float16, got {w_up.dtype}"
+    )
+
+    assert K == K_gate, f"Incompatible dimensions: x.K={K}, w_gate.K={K_gate}"
+    assert K == K_up, f"Incompatible dimensions: x.K={K}, w_up.K={K_up}"
+    assert N == N_up, f"Incompatible dimensions: w_gate.N={N}, w_up.N={N_up}"
+
+    # Allocate output
+    out = torch.empty((M, N), device=x.device, dtype=x.dtype)
+    if M == 0 or N == 0:
+        return out
+
+    M_BLOCK = triton.next_power_of_2(M)
+
+    NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count
+
+    # A dummy block value that will be overwritten by the hook
+    dummy_block = [1, 1]
+
+    # pyre-ignore[6]
+    x_desc = TensorDescriptor(x, x.shape, x.stride(), dummy_block)
+    # pyre-ignore[6]
+    w_gate_desc = TensorDescriptor(w_gate, w_gate.shape, w_gate.stride(), dummy_block)
+    # pyre-ignore[6]
+    w_up_desc = TensorDescriptor(w_up, w_up.shape, w_up.stride(), dummy_block)
+    # pyre-ignore[6]
+    out_desc = TensorDescriptor(out, out.shape, out.stride(), dummy_block)
+
+    def grid(meta):
+        BLOCK_M = meta["BLOCK_M"]
+        BLOCK_N = meta["BLOCK_N"]
+        return (
+            min(
+                NUM_SMS,
+                triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N),
+            ),
+        )
+
+    _swiglu_fwd_tma_ws_persistent[grid](
+        x_desc,
+        w_gate_desc,
+        w_up_desc,
+        out_desc,
+        M,
+        N,
+        K,
+        M_BLOCK,
+        NUM_SMS=NUM_SMS,
+        NUM_SMEM_BUFFERS=4,
+        NUM_TMEM_BUFFERS=2,
+        EPILOGUE_SUBTILE=2,
+    )
+    return out
+
+
+# =============================================================================
+# Standard fused SwiGLU kernel for A100/H100 (non-TLX path).
+#
+# Fuses silu(x @ W_gate^T) * (x @ W_up^T) into a single kernel launch.
+# Uses standard Triton pointer arithmetic (no TMA), works on SM80+.
+#
+# Key optimization: x is loaded from HBM ONCE and reused for both GEMMs.
+# Activation (silu * up) is computed in float32 registers, no HBM round-trip.
+#
+# Weight layout: expects [N, K] (nn.Linear native format).
+# The wrapper transposes to [K, N] for the GEMM internally.
+# =============================================================================
+
+
+def _get_swiglu_fwd_configs() -> List[triton.Config]:
+    """
+    Autotune configs for the standard (non-TLX) fused SwiGLU kernel.
+
+    Two float32 accumulators (gate + up) double register pressure vs single
+    GEMM, so smaller block sizes are included.
+    """
+    configs = [
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8},
+            num_stages=3,
+            num_warps=4,
+        ),
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8},
+            num_stages=4,
+            num_warps=4,
+        ),
+        triton.Config(
+            {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8},
+            num_stages=3,
+            num_warps=4,
+        ),
+        triton.Config(
+            {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8},
+            num_stages=3,
+            num_warps=8,
+        ),
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_M": 8},
+            num_stages=3,
+            num_warps=8,
+        ),
+        triton.Config(
+            {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8},
+            num_stages=4,
+            num_warps=4,
+        ),
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_M": 8},
+            num_stages=4,
+            num_warps=4,
+        ),
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8},
+            num_stages=3,
+            num_warps=8,
+        ),
+    ]
+    if torch.version.hip:
+        hip_num_stages = 2 if triton.__version__ >= "3.2.0" else 0
+        configs.extend(
+            [
+                triton.Config(
+                    {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_M": 8},
+                    num_stages=hip_num_stages,
+                    num_warps=4,
+                ),
+                triton.Config(
+                    {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_M": 8},
+                    num_stages=hip_num_stages,
+                    num_warps=4,
+                ),
+                triton.Config(
+                    {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8},
+                    num_stages=hip_num_stages,
+                    num_warps=4,
+                ),
+                triton.Config(
+                    {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8},
+                    num_stages=hip_num_stages,
+                    num_warps=4,
+                ),
+            ]
+        )
+    return configs
+
+
+@triton_autotune(
+    configs=_get_swiglu_fwd_configs(),
+    key=["M_BLOCK", "N", "K"],
+)
+@triton.jit
+def _swiglu_fwd_kernel(
+    # Pointers to input/output tensors
+    x_ptr,  # [M, K] input activation
+    w_gate_ptr,  # [K, N] gate weight (already transposed from [N, K])
+    w_up_ptr,  # [K, N] up weight (already transposed from [N, K])
+    out_ptr,  # [M, N] output = silu(x @ w_gate) * (x @ w_up)
+    # Matrix dimensions
+    M,  # rows in x (batch_size * seq_len)
+    N,  # output dimension (hidden_dim)
+    K,  # input/reduction dimension (input_dim)
+    M_BLOCK,  # next_power_of_2(M) for stable autotuning
+    # Strides
+    stride_xm,
+    stride_xk,
+    stride_wgk,
+    stride_wgn,
+    stride_wuk,
+    stride_wun,
+    stride_om,
+    stride_on,
+    # Compile-time constants
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_M: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+):
+    """
+    Fused SwiGLU forward: out = silu(x @ W_gate) * (x @ W_up).
+
+    Each thread block computes one [BLOCK_M, BLOCK_N] output tile.
+    Two accumulators share the same x tile loads (the fusion benefit).
+    """
+    # -- Step 1: Compute tile coordinates with grouped ordering (L2 reuse) --
+    pid = tl.program_id(axis=0)
+    num_pid_m = tl.cdiv(M, BLOCK_M)
+    num_pid_n = tl.cdiv(N, BLOCK_N)
+    num_pid_in_group = GROUP_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_M
+
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_M)
+    pid_in_group = pid % num_pid_in_group
+    pid_m = first_pid_m + (pid_in_group % group_size_m)
+    pid_n = pid_in_group // group_size_m
+
+    # -- Step 2: Set up pointers for x, w_gate, w_up tiles --
+    offs_m = tl.arange(0, BLOCK_M)
+    offs_n = tl.arange(0, BLOCK_N)
+    offs_k = tl.arange(0, BLOCK_K)
+    mask_m = (pid_m * BLOCK_M + offs_m)[:, None] < M
+    mask_n = (pid_n * BLOCK_N + offs_n)[None, :] < N
+    # [BLOCK_M, BLOCK_K]
+    x_ptrs = (
+        x_ptr
+        + (pid_m.to(tl.int64) * BLOCK_M + offs_m)[:, None] * stride_xm
+        + offs_k[None, :] * stride_xk
+    )
+    # [BLOCK_K, BLOCK_N]
+    wg_ptrs = (
+        w_gate_ptr
+        + offs_k[:, None] * stride_wgk
+        + (pid_n.to(tl.int64) * BLOCK_N + offs_n)[None, :] * stride_wgn
+    )
+
+    # [BLOCK_K, BLOCK_N]
+    wu_ptrs = (
+        w_up_ptr
+        + offs_k[:, None] * stride_wuk
+        + (pid_n.to(tl.int64) * BLOCK_N + offs_n)[None, :] * stride_wun
+    )
+
+    # -- Step 3: K-loop - two GEMMs sharing the same x tile --
+    acc_gate = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    acc_up = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+
+    for k in range(0, tl.cdiv(K, BLOCK_K)):
+        mask_k = offs_k[None, :] < K - k * BLOCK_K
+        x = tl.load(x_ptrs, mask=mask_m & mask_k, other=0.0)
+        mask_k = offs_k[:, None] < K - k * BLOCK_K
+        wg = tl.load(wg_ptrs, mask=mask_k & mask_n, other=0.0)
+        wu = tl.load(wu_ptrs, mask=mask_k & mask_n, other=0.0)
+
+        acc_gate += tl.dot(x, wg, allow_tf32=ALLOW_TF32)
+        acc_up += tl.dot(x, wu, allow_tf32=ALLOW_TF32)
+
+        x_ptrs += BLOCK_K * stride_xk
+        wg_ptrs += BLOCK_K * stride_wgk
+        wu_ptrs += BLOCK_K * stride_wuk
+
+    # -- Step 4: Apply SwiGLU activation in registers (no HBM round-trip) --
+    gate_activated = acc_gate * tl.sigmoid(acc_gate)  # silu
+    result = (gate_activated * acc_up).to(out_ptr.dtype.element_ty)
+
+    # -- Step 5: Store result --
+    offs_m = pid_m * BLOCK_M + offs_m
+    offs_n = pid_n * BLOCK_N + offs_n
+    out_ptrs = out_ptr + offs_m[:, None] * stride_om + offs_n[None, :] * stride_on
+    tl.store(out_ptrs, result, mask=mask_m & mask_n)
+
+
+def triton_swiglu_fwd(
+    x: torch.Tensor,
+    w_gate: torch.Tensor,
+    w_up: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Forward pass of fused SwiGLU (non-TLX path). Works on A100/H100/MI300X.
+
+    Computes: silu(x @ w_gate^T) * (x @ w_up^T)
+
+    Args:
+        x: [M, K] input tensor
+        w_gate: [N, K] gate weight (nn.Linear format)
+        w_up: [N, K] up weight (nn.Linear format)
+
+    Returns:
+        [M, N] output tensor
+    """
+    M, K = x.shape
+    N, K_gate = w_gate.shape
+    N_up, K_up = w_up.shape
+    assert K == K_gate, f"x.K={K} != w_gate.K={K_gate}"
+    assert K == K_up, f"x.K={K} != w_up.K={K_up}"
+    assert N == N_up, f"w_gate.N={N} != w_up.N={N_up}"
+
+    out = torch.empty((M, N), device=x.device, dtype=x.dtype)
+    if M == 0 or N == 0:
+        return out
+
+    M_BLOCK = triton.next_power_of_2(M)
+
+    # Transpose weights from [N, K] to [K, N] for the GEMM kernel
+    w_gate_t = w_gate.t().contiguous()
+    w_up_t = w_up.t().contiguous()
+
+    grid = lambda meta: (  # noqa E731
+        triton.cdiv(M, meta["BLOCK_M"]) * triton.cdiv(N, meta["BLOCK_N"]),
+    )
+
+    _swiglu_fwd_kernel[grid](
+        x,
+        w_gate_t,
+        w_up_t,
+        out,
+        M,
+        N,
+        K,
+        M_BLOCK,
+        x.stride(0),
+        x.stride(1),
+        w_gate_t.stride(0),
+        w_gate_t.stride(1),
+        w_up_t.stride(0),
+        w_up_t.stride(1),
+        out.stride(0),
+        out.stride(1),
+        ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+    )
+    return out
+
+
+def triton_swiglu(
+    x: torch.Tensor,
+    w_gate: torch.Tensor,
+    w_up: torch.Tensor,
+) -> torch.Tensor:
+    if is_sm100_plus() and TMA_AVAILABLE and HAS_TLX:
+        # Blackwell: use the fast TLX persistent kernel with TMA
+        _, K = x.shape
+        N, _ = w_gate.shape
+        assert K % 16 == 0 and N % 16 == 0, (
+            f"K ({K}) and N ({N}) must be divisible by 16 for TMA alignment"
+        )
+        return triton_swiglu_fwd_tma_ws_persistent_tlx(x, w_gate, w_up)
+    else:
+        # A100/H100: use the standard fused kernel
+        return triton_swiglu_fwd(x, w_gate, w_up)
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/README.md b/recommendation_v4/generative_recommenders/ops/triton_aot/README.md
new file mode 100644
index 000000000..2b0b1a834
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/README.md
@@ -0,0 +1,54 @@
+# Local Triton AOT Support
+
+This package is a minimal local copy of the Triton AOT pieces needed by the
+DLRM v3 HSTU inference end-to-end test. It avoids depending on the standalone
+`fbcode/triton_aot` package while preserving the compile, transform, and
+runtime-loading flow used by `generative_recommenders`.
+
+This is not intended to be a full fork of `fbcode/triton_aot`. Keep changes
+scoped to the GR inference use case unless a broader migration plan exists.
+
+## Code Structure
+
+- `types.py`: local `TritonAOT` registration object and `triton_aot` helper used
+  by GR AOT wrapper modules.
+- `preprocess.py`: FX graph preprocessing helpers, including wrapper-node
+  unwrapping before compile/transform.
+- `triton_*.py`: GR kernel-specific AOT wrapper modules for addmm, jagged
+  concat/split, layer norm variants, HSTU attention, and timestamp position
+  embeddings.
+- `compile/`: compile-time state, Triton signature/spec processing, generated
+  C++ codegen, and the `TritonAOTCompile` context manager.
+- `transform/`: FX graph transformation and generated Python wrapper code that
+  swaps Python AOT wrappers for `torch.ops.triton_aot.*` calls backed by built
+  shared libraries.
+- `build/`: extension builders and CUBIN embedding utilities used to create
+  loadable kernel libraries from compiled Triton artifacts.
+- `templates/`: C++ template files used by the compile/codegen path for kernel
+  entry points, embedded CUBIN data, and Torch operator registration.
+- `shared/`: compatibility helpers and type/spec conversion utilities shared by
+  compile and transform code.
+
+## Runtime Flow
+
+1. GR `triton_*.py` wrappers expose Triton kernels through local `triton_aot`
+   descriptors.
+2. `TritonAOTCompile` runs representative CUDA inputs, records kernel specs, and
+   compiles the collected Triton kernels into shared libraries.
+3. `transform_kernels` rewrites the FX graph so wrapper calls dispatch through
+   `torch.ops.triton_aot.*`.
+4. The e2e test copies the generated libraries into its workdir and passes them
+   to the C++ runner before executing the scripted sparse/dense modules.
+
+
+## Authors
+
+- Chang Pan <changpan@meta.com>
+- Zhiyong Wang (MRS) <zhiywang@meta.com>
+- Chenzhi Yu <chenzhi@meta.com>
+- Runming Lu <runming@meta.com>
+- Chun-Wei Chen <jcwchen@meta.com>
+- Michael He <michaelhe@meta.com>
+- Linjian Ma <linjianma@meta.com>
+- Xing Liu <xingl@meta.com>
+- Zhuoran Zhao <zhuoran@meta.com>
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/arg_descriptor.py b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/arg_descriptor.py
new file mode 100644
index 000000000..bc5963674
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/arg_descriptor.py
@@ -0,0 +1,146 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+
+# pyre-strict
+
+"""ArgDescriptor — per-arg codegen descriptor for AOT-T.
+
+Centralises arg classification (pointer / scalar / constant) so every
+``gen_*`` function in ``codegen.py`` iterates descriptors instead of
+doing its own dict lookups into ``OpsUnit`` fields.
+
+Also provides type-mapping helpers that convert ``ArgDescriptor``
+metadata into context-specific C++ / TorchScript type strings.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+
+from generative_recommenders.ops.triton_aot.compile.spec_processing import OpsUnit
+from triton.runtime.jit import JITFunction
+
+
+# ---------------------------------------------------------------------------
+# Type-mapping helpers
+# ---------------------------------------------------------------------------
+
+CONSTANT_SELECTOR_CTYPE: dict[type[Any], str] = {
+    bool: "bool",
+    int: "int",
+    str: "const std::string&",
+}
+
+CONSTANT_CPP_OP_CTYPE: dict[type[Any], str] = {
+    bool: "bool",
+    int: "int64_t",
+    str: "const std::string&",
+}
+
+CONSTANT_TORCH_SCHEMA: dict[type[Any], str] = {
+    bool: "bool",
+    int: "int",
+    str: "str",
+}
+
+
+def scalar_cpp_op_ctype(triton_dtype: str) -> str:
+    """Triton scalar dtype → widened C++ type for cpp_op / torch_op params."""
+    if triton_dtype.startswith("i"):
+        return "int64_t"
+    if triton_dtype.startswith("f"):
+        return "double"
+    if triton_dtype == "bool":
+        return "bool"
+    raise ValueError(f"Unsupported scalar dtype for cpp_op: {triton_dtype}")
+
+
+def scalar_torch_schema(triton_dtype: str) -> str:
+    """Triton scalar dtype → TorchScript schema type string."""
+    if triton_dtype.startswith("i"):
+        return "int"
+    if triton_dtype.startswith("f"):
+        return "float"
+    if triton_dtype == "bool":
+        return "bool"
+    raise ValueError(f"Unsupported scalar dtype for torch schema: {triton_dtype}")
+
+
+# ---------------------------------------------------------------------------
+# ArgDescriptor hierarchy
+# ---------------------------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class ArgDescriptor:
+    """Base class for per-arg codegen descriptors.
+
+    Built once by ``build_arg_descriptors`` and consumed by all ``gen_*``
+    functions.  Use ``isinstance`` to dispatch on arg kind:
+
+    - ``PointerArg`` — tensor pointer (required or optional)
+    - ``ScalarArg`` — non-pointer signature arg with a Triton dtype
+    - ``ConstantArg`` — compile-time constant with a Python type
+    """
+
+    name: str
+    index: int
+
+
+@dataclass(frozen=True)
+class PointerArg(ArgDescriptor):
+    """Tensor pointer arg (required or optional)."""
+
+    is_optional: bool
+
+
+@dataclass(frozen=True)
+class ScalarArg(ArgDescriptor):
+    """Non-pointer signature arg with a Triton dtype (e.g., ``"i32"``, ``"fp32"``).
+
+    ``triton_dtype`` is the **widest** type across all specs for this
+    position, computed by ``_compute_invariants`` via ``_wider_type``.
+    Individual specs may use a narrower type (e.g., ``"i32"`` when
+    ``triton_dtype`` is ``"i64"``); codegen adds ``fits_i32`` guards
+    and ``static_cast`` for narrowing.
+    """
+
+    triton_dtype: str
+
+
+@dataclass(frozen=True)
+class ConstantArg(ArgDescriptor):
+    """Compile-time constant arg with a Python type (``int``, ``str``, ``bool``)."""
+
+    python_type: type[Any]
+
+
+def build_arg_descriptors(
+    func: JITFunction[list[Any]],
+    unit: OpsUnit,
+) -> list[ArgDescriptor]:
+    """Build ordered arg descriptors from func arg names + OpsUnit invariants.
+
+    Single source of truth for arg classification.  Called once in
+    ``compile_to_cpp`` and passed to all downstream codegen functions.
+    """
+    result: list[ArgDescriptor] = []
+    for i, name in enumerate(func.arg_names):
+        if i in unit.pointer_args:
+            result.append(
+                PointerArg(name=name, index=i, is_optional=i in unit.optional)
+            )
+        elif i in unit.scalar_dtypes:
+            result.append(
+                ScalarArg(name=name, index=i, triton_dtype=unit.scalar_dtypes[i])
+            )
+        elif i in unit.constant_types:
+            result.append(
+                ConstantArg(name=name, index=i, python_type=unit.constant_types[i])
+            )
+        else:
+            raise ValueError(
+                f"Arg {name} (index {i}) not classified as pointer, scalar, "
+                f"or constant in OpsUnit"
+            )
+    return result
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/codegen.py b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/codegen.py
new file mode 100644
index 000000000..f1b03848f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/codegen.py
@@ -0,0 +1,780 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+
+# pyre-strict
+
+"""C++ and Python code generation for AOT-T compiled kernels.
+
+Generates:
+  - kernel.h      (header with gridDims, tuner meta, selector proto)
+  - kernel.cpp    (cubin externs, loaders, launchers, selector)
+  - _torch_op.cpp (torch op registration)
+  - _meta.py      (Python autotuner meta function)
+"""
+
+import textwrap
+from collections import Counter
+from typing import Any
+
+# @manual=//triton:triton
+import triton
+from generative_recommenders.ops.triton_aot.compile.arg_descriptor import (
+    ArgDescriptor,
+    CONSTANT_CPP_OP_CTYPE,
+    CONSTANT_SELECTOR_CTYPE,
+    CONSTANT_TORCH_SCHEMA,
+    ConstantArg,
+    PointerArg,
+    scalar_cpp_op_ctype,
+    scalar_torch_schema,
+    ScalarArg,
+)
+from generative_recommenders.ops.triton_aot.compile.spec_processing import (
+    KernelSpec,
+    OpsUnit,
+)
+from generative_recommenders.ops.triton_aot.compile.stable_types import (
+    PY_TYPES_TO_CPP_TYPES,
+    SCALAR_TYPES,
+)
+from generative_recommenders.ops.triton_aot.compile.utils import (
+    hash_kernel_name,
+    unwrap_heuristic,
+)
+from generative_recommenders.ops.triton_aot.shared.compat import get_scratch_parameters
+from generative_recommenders.ops.triton_aot.shared.types import AUTOTUNE_ATTRs, CTYPES
+from generative_recommenders.ops.triton_aot.templates.template_utils import (
+    load_template,
+    render_template,
+)
+from triton.runtime.jit import JITFunction
+
+
+# ---------------------------------------------------------------------------
+# Kernel naming and binary generation
+# ---------------------------------------------------------------------------
+
+
+def gen_kernel_name(
+    fn: Any,
+    spec: KernelSpec,
+    cc: int | str,
+) -> str:
+    name = fn.__name__
+    sig = "_".join([p.replace("*", "p") for p in spec.signature.values()])
+    const = "_".join(map(str, spec.constants.values()))
+    cc_str = f"sm{cc}"
+    autotune_configs = []
+    autotune_configs.append(f"w{spec.num_warps}")
+    autotune_configs.append(f"s{spec.num_stages}")
+    # AMD only
+    autotune_configs.append(f"matrix{spec.matrix_instr_nonkdim}")
+    autotune_configs.append(f"wave{spec.waves_per_eu}")
+    autotune_configs.append(f"kpack{spec.kpack}")
+    # See kernel_suffix in triton/compiler/code_generator.py
+    suffix = ""
+    for i, _ in enumerate(spec.signature):
+        suffix += str(i)
+        if i in spec.divisible_by_16:
+            suffix += "d"
+        if i in spec.divisible_by_8:
+            suffix += "e"
+    return "_".join([name, cc_str, sig, const] + autotune_configs + [suffix])
+
+
+def gen_cubin(kernel_name: str, kernel: Any, install_dir: str, backend: str) -> str:
+    """Generate kernel binary file (.cubin or .hsaco) and return extern declaration.
+
+    Args:
+        kernel_name: Full kernel name including specialization suffix.
+        kernel: Compiled Triton kernel object containing binary in kernel.asm.
+        install_dir: Directory to write binary file.
+        backend: GPU backend ("cuda" or "hip").
+
+    Returns:
+        C++ extern declaration for the kernel binary array.
+    """
+    hashed = hash_kernel_name(kernel_name)
+    if backend == "hip":
+        binary_file = f"{install_dir}/{hashed}.hsaco"
+        with open(binary_file, "wb") as hsaco:
+            hsaco.write(kernel.asm["hsaco"])
+        target_symbol_name = f"{kernel_name}_cubin"
+    else:
+        binary_file = f"{install_dir}/{hashed}.cubin"
+        with open(binary_file, "wb") as cubin:
+            cubin.write(kernel.asm["cubin"])
+        target_symbol_name = f"{kernel_name}_cubin"
+
+    # We return extern declarations for both the array and its pointer.
+    # The pointer is used by gen_loader() to generate R_X86_64_64 relocations
+    # instead of R_X86_64_32, which allows the .triton section to be placed
+    # beyond the 4GB address limit in large binaries.
+    # Note: The pointer is volatile to prevent optimizer constant-propagation.
+    return f'extern "C" {{ extern unsigned char {target_symbol_name}[]; extern const void* volatile {target_symbol_name}_ptr; }}'
+
+
+def gen_loader(kernel_name: str, cubin_name: str, shared: int) -> str:
+    # TODO(changpan): Extract inline cuModuleLoadData/cuModuleGetFunction error
+    # handling into a shared helper to reduce generated code size.
+    return textwrap.dedent(
+        f"""
+        CUfunction load_{kernel_name}(void)
+        {{
+            thread_local std::unordered_map<int32_t, CUfunction> cache;
+            auto idx = torch::stable::accelerator::getCurrentDeviceIndex();
+            auto res = cache.find(idx);
+            if (res != cache.end()) {{
+                return res->second;
+            }}
+            CUfunction func;
+            CUmodule mod_ptr;
+            CUresult err;
+            // Use pointer to cubin data to generate R_X86_64_64 relocation
+            // instead of R_X86_64_32, allowing cubin data to be placed beyond 4GB
+            const void *image = {kernel_name}_cubin_ptr;
+
+            err = cuModuleLoadData(&mod_ptr, image);
+            if (err != 0) {{
+                const char* errStr;
+                cuGetErrorString(err, &errStr);
+                throw std::runtime_error("cuModuleLoadData failed for {kernel_name}: error " + std::to_string(err) + " (" + (errStr ? errStr : "unknown") + ")");
+            }}
+
+            err = cuModuleGetFunction(&func, mod_ptr, "{cubin_name}");
+            if (err != 0) {{
+                const char* errStr;
+                cuGetErrorString(err, &errStr);
+                throw std::runtime_error("cuModuleGetFunction failed for {kernel_name}: error " + std::to_string(err) + " (" + (errStr ? errStr : "unknown") + ")");
+            }}
+
+            check_errors({shared}, func);
+            cache.emplace(idx, func);
+            return func;
+        }}
+    """
+    )
+
+
+# ---------------------------------------------------------------------------
+# Launcher codegen (per-spec)
+# ---------------------------------------------------------------------------
+
+
+def gen_launcher_params(
+    descriptors: list[ArgDescriptor],
+    signature: dict[int, str],
+) -> str:
+    args = ["gridDims grid"]
+    for d in descriptors:
+        if d.index in signature:
+            if isinstance(d, PointerArg):
+                ctype = "void*"
+            else:
+                ctype = CTYPES[signature[d.index]]
+            args.append(f"{ctype} {d.name}")
+    return ", ".join(args)
+
+
+def gen_launch_args(
+    func: JITFunction[list[Any]],
+    spec: KernelSpec,
+) -> list[str]:
+    """Generate kernel launch argument list (pointers to non-constant arguments)."""
+    args = []
+    for i, arg in enumerate(func.arg_names):
+        if i in spec.constants:
+            continue
+        assert i in spec.signature, f"Argument {i} ({arg}) does not appear in signature"
+        args.append(f"&{arg}")
+    return args
+
+
+def gen_launcher(
+    kernel_name: str,
+    func: JITFunction[list[Any]],
+    kernel: Any,
+    shared: int,
+    warp_size: int,
+    spec: KernelSpec,
+    descriptors: list[ArgDescriptor],
+) -> str:
+    params = gen_launcher_params(descriptors, spec.signature)
+    args = gen_launch_args(func, spec)
+
+    scratch_declarations, scratch_args = get_scratch_parameters(kernel)
+    args.extend(scratch_args)
+
+    args_str = ", ".join(args)
+
+    return textwrap.dedent(
+        f"""
+        void {kernel_name}({params}) {{
+            CUfunction func = load_{kernel_name}();
+            cudaStream_t stream = grid.stream ? grid.stream : triton_aot_get_current_stream();
+            {scratch_declarations}
+            void *args[] = {{ {args_str} }};
+            auto res = cuLaunchKernel(func, grid.x, grid.y, grid.z, {warp_size} * {spec.num_warps}, 1, 1, {shared}, stream, args, NULL);
+            TRITON_AOT_CU_CHECK(res);
+        }}
+    """
+    )
+
+
+# ---------------------------------------------------------------------------
+# Selector codegen (invariant)
+# ---------------------------------------------------------------------------
+
+
+def gen_selector_params(
+    descriptors: list[ArgDescriptor],
+) -> str:
+    """Generate C++ selector function parameter list."""
+    args = ["gridDims grid"]
+    for d in descriptors:
+        if isinstance(d, PointerArg):
+            args.append(f"const std::optional<torch::stable::Tensor>& {d.name}")
+        elif isinstance(d, ScalarArg):
+            args.append(f"{CTYPES[d.triton_dtype]} {d.name}")
+        elif isinstance(d, ConstantArg):
+            args.append(f"{CONSTANT_SELECTOR_CTYPE[d.python_type]} {d.name}")
+
+    for name, value in AUTOTUNE_ATTRs.items():
+        args.append(f"{type(value).__name__} {name}")
+    return ", ".join(args)
+
+
+def gen_launcher_call_args(
+    descriptors: list[ArgDescriptor],
+    signature: dict[int, str],
+) -> str:
+    args = ["grid"]
+    for d in descriptors:
+        if d.index in signature:
+            if isinstance(d, PointerArg):
+                args.append(f"{d.name}.value().data_ptr()")
+            elif isinstance(d, ScalarArg) and signature[d.index] != d.triton_dtype:
+                args.append(f"static_cast<{CTYPES[signature[d.index]]}>({d.name})")
+            else:
+                args.append(d.name)
+    return ", ".join(args)
+
+
+def gen_guarded_calls(  # noqa: C901
+    func: JITFunction[list[Any]],
+    unit: OpsUnit,
+    descriptors: list[ArgDescriptor],
+) -> str:
+    desc_by_idx: dict[int, ArgDescriptor] = {d.index: d for d in descriptors}
+    calls = []
+    for spec in unit.specs:
+        kernel_name = gen_kernel_name(func, spec, unit.cc)
+        args = gen_launcher_call_args(descriptors, spec.signature)
+        guards = ""
+
+        # Guard on tensor dtypes (per-spec: different specs may have different dtypes)
+        for i, ttype in spec.signature.items():
+            d = desc_by_idx[i]
+            if not isinstance(d, PointerArg):
+                continue
+            arg = d.name
+            atype = SCALAR_TYPES[ttype]
+            guards += f"if ({arg}.has_value()) "
+            guards += f"if ({arg}.value().scalar_type() == {atype}) "
+
+        # Guard on int range (spec uses narrower type than selector)
+        for i, dtype in spec.signature.items():
+            d = desc_by_idx[i]
+            if isinstance(d, ScalarArg) and dtype != d.triton_dtype:
+                if dtype == "i32":
+                    guards += f"if (fits_i32({d.name})) "
+
+        # Guard on constant values.
+        for i, val in spec.constants.items():
+            arg = desc_by_idx[i].name
+            if isinstance(val, bool):
+                guards += f"if ({arg}) " if val else f"if (!({arg})) "
+            elif isinstance(val, str):
+                guards += f'if ({arg} == "{val}") '
+            elif val is None:
+                guards += f"if (!{arg}.has_value()) "
+            else:
+                guards += f"if ({arg} == {val}) "
+
+        # Guard on special constants
+        for name in AUTOTUNE_ATTRs.keys():
+            guards += f"if ({name} == {getattr(spec, name)}) "
+
+        # Guard on divisible_by_16
+        for i in spec.divisible_by_16:
+            arg = desc_by_idx[i].name
+            if i in spec.signature:
+                ttype = spec.signature[i]
+                if ttype.startswith("*"):
+                    guards += f"if ((((uintptr_t){arg}.value().data_ptr()) % 16) == 0) "
+                else:
+                    guards += f"if (({arg} % 16) == 0) "
+            elif i in spec.constants:
+                assert (spec.constants[i] % 16) == 0
+
+        # Guard on divisible_by_8
+        for i in spec.divisible_by_8:
+            arg = desc_by_idx[i].name
+            if i in spec.signature:
+                ttype = spec.signature[i]
+                # divisible_by_8 is only applied to int
+                if not ttype.startswith("*"):
+                    guards += f"if (({arg} % 8) == 0) "
+            elif i in spec.constants:
+                assert (spec.constants[i] % 8) == 0
+
+        # Call the specialization.
+        calls.append(f"{guards}return {kernel_name}({args});\n")
+    return "".join(calls)
+
+
+def gen_selector_proto(
+    descriptors: list[ArgDescriptor],
+    func_name: str,
+) -> str:
+    params = gen_selector_params(descriptors)
+    # Add Triton's default values for num warps/stages, etc
+    for name, value in AUTOTUNE_ATTRs.items():
+        params = params.replace(name, f"{name}={value}")
+    return f"void {func_name}({params});"
+
+
+def gen_failure_msg(
+    descriptors: list[ArgDescriptor],
+) -> str:
+    """Generate C++ ``<<``-chain for the dispatch-failure error message.
+
+    Groups parameters by category (Tensors / Scalars / Constants /
+    Autotune / Device).  Tensor entries include aligned16 status.
+    """
+    tensors: list[str] = []
+    scalars: list[str] = []
+    constants: list[str] = []
+
+    for d in descriptors:
+        if isinstance(d, PointerArg):
+            dtype_expr = (
+                f"({d.name}.has_value()"
+                f" ? c10::toString({d.name}.value().scalar_type())"
+                f' : "nullptr")'
+            )
+            align_expr = (
+                f"(({d.name}.has_value()"
+                f" && (((uintptr_t){d.name}.value().data_ptr()) % 16) == 0)"
+                f' ? "true" : "false")'
+            )
+            tensors.append(
+                f'" {d.name}=" << {dtype_expr} << "(aligned16=" << {align_expr} << ")"'
+            )
+        elif isinstance(d, ScalarArg):
+            scalars.append(f'" {d.name}=" << {d.name}')
+        elif isinstance(d, ConstantArg):
+            constants.append(f'" {d.name}=" << {d.name}')
+
+    autotune: list[str] = [f'" {n}=" << {n}' for n in AUTOTUNE_ATTRs]
+
+    sections: list[str] = []
+    if tensors:
+        sections.append('"\\n  Tensors:" << ' + " << ".join(tensors))
+    if scalars:
+        sections.append('"\\n  Scalars:" << ' + " << ".join(scalars))
+    if constants:
+        sections.append('"\\n  Constants:" << ' + " << ".join(constants))
+    sections.append('"\\n  Autotune:" << ' + " << ".join(autotune))
+    sections.append('"\\n  Device: cc=" << cc')
+
+    return " << ".join(sections)
+
+
+def gen_selector(
+    func: JITFunction[list[Any]],
+    unit: OpsUnit,
+    descriptors: list[ArgDescriptor],
+) -> str:
+    params = gen_selector_params(descriptors)
+    guarded_calls = gen_guarded_calls(func, unit, descriptors)
+    failure_msg = gen_failure_msg(descriptors)
+    return f"""
+        void {func.__name__}({params}) {{
+            auto cc = compute_capability();
+            if (grid.x * grid.y * grid.z > 0) {{
+                {guarded_calls}
+                std::stringstream ss;
+                ss << "[TritonAOT] No implementation found for {func.__name__}" << {failure_msg};
+                throw std::runtime_error(ss.str());
+            }}
+        }}
+    """
+
+
+# ---------------------------------------------------------------------------
+# Torch op codegen (invariant)
+# ---------------------------------------------------------------------------
+
+
+def gen_cpp_op_params(
+    descriptors: list[ArgDescriptor],
+) -> str:
+    args = []
+    for d in descriptors:
+        if isinstance(d, PointerArg):
+            args.append(f"std::optional<torch::stable::Tensor> {d.name}")
+        elif isinstance(d, ScalarArg):
+            args.append(f"{scalar_cpp_op_ctype(d.triton_dtype)} {d.name}")
+        elif isinstance(d, ConstantArg):
+            args.append(f"{CONSTANT_CPP_OP_CTYPE[d.python_type]} {d.name}")
+    for name, value in AUTOTUNE_ATTRs.items():
+        args.append(f"{PY_TYPES_TO_CPP_TYPES[type(value)]} {name}")
+    return ", ".join(args)
+
+
+def gen_torch_op_params(
+    descriptors: list[ArgDescriptor],
+    default_values: dict[str, Any],
+) -> str:
+    args = []
+
+    def gen_str_wrap(value: Any) -> Any:
+        return f'\\"{value}\\"' if isinstance(value, str) else value
+
+    def gen_default_str(arg: str) -> str:
+        return (
+            f" = {gen_str_wrap(default_values[arg])}" if arg in default_values else ""
+        )
+
+    for d in descriptors:
+        df_str = gen_default_str(d.name)
+        if isinstance(d, PointerArg):
+            t = chr(ord("a") + d.index)
+            args.append(f"Tensor({t}!)? {d.name}")
+        elif isinstance(d, ScalarArg):
+            args.append(f"{scalar_torch_schema(d.triton_dtype)} {d.name}{df_str}")
+        elif isinstance(d, ConstantArg):
+            args.append(f"{CONSTANT_TORCH_SCHEMA[d.python_type]} {d.name}{df_str}")
+    for name, value in AUTOTUNE_ATTRs.items():
+        args.append(f"{type(value).__name__} {name}={value}")
+    return ", ".join(args)
+
+
+def gen_torch_op(
+    func: JITFunction[list[Any]],
+    descriptors: list[ArgDescriptor],
+    default_values: dict[str, Any],
+) -> str:
+    cpp_params = gen_cpp_op_params(descriptors)
+    torch_params = gen_torch_op_params(descriptors, default_values)
+    arg_names = list(func.arg_names) + list(AUTOTUNE_ATTRs.keys())
+    args = ", ".join(arg_names)
+
+    # Generate a comment noting which tensor params are non-optional but
+    # promoted to Tensor? for TorchScript compatibility.
+    promoted = [
+        d.name for d in descriptors if isinstance(d, PointerArg) and not d.is_optional
+    ]
+    type_comment = ""
+    if promoted:
+        type_comment = (
+            f"// Note: {', '.join(promoted)} are non-optional but use Tensor? "
+            "for TorchScript compatibility.\n"
+            "// Dispatch uses HAS_XXX constexpr ints, not tensor presence.\n"
+        )
+    return textwrap.dedent(
+        f"""
+        namespace {{
+        triton::aot::gridDims dims_from_vec(
+            const std::vector<int64_t>& grid
+        ) {{
+          return triton::aot::gridDims(
+              grid.size() > 0 ? grid[0] : 1,
+              grid.size() > 1 ? grid[1] : 1,
+              grid.size() > 2 ? grid[2] : 1
+          );
+        }}
+
+        {type_comment}void {func.__name__}_op(
+            std::vector<int64_t> grid,
+            {cpp_params}
+        ) {{
+            triton::aot::{func.__name__}(
+                dims_from_vec(grid),
+                {args}
+            );
+        }}
+
+        void {func.__name__}_dummy_op(
+            std::vector<int64_t> grid,
+            {cpp_params}
+        ) {{
+            // Do nothing.  The op is a dummy for model transform,
+            // processing, and splitting services.
+        }}
+        }}
+
+        STABLE_TORCH_LIBRARY_FRAGMENT(triton_aot, m) {{
+          m.def("{func.__name__}(int[] grid, {torch_params}) -> ()");
+        }}
+        STABLE_TORCH_LIBRARY_IMPL(triton_aot, CUDA, m) {{
+          m.impl("{func.__name__}", TORCH_BOX(&{func.__name__}_op));
+        }}
+
+        STABLE_TORCH_LIBRARY_IMPL(triton_aot, CPU, m) {{
+          m.impl("{func.__name__}", TORCH_BOX(&{func.__name__}_dummy_op));
+        }}
+
+        STABLE_TORCH_LIBRARY_IMPL(triton_aot, Meta, m) {{
+          m.impl("{func.__name__}", TORCH_BOX(&{func.__name__}_dummy_op));
+        }}
+        """
+    )
+
+
+# ---------------------------------------------------------------------------
+# Tuner meta codegen
+# ---------------------------------------------------------------------------
+
+
+def key_names_and_idx(func: Any) -> tuple[list[str], list[int]]:
+    if hasattr(func, "key_idx"):
+        arg_names = [func.arg_names[idx] for idx in func.key_idx]
+        key_idx = func.key_idx
+    else:
+        arg_names = func.keys
+        key_idx = [func.arg_names.index(arg) for arg in arg_names]
+    return arg_names, key_idx
+
+
+def is_non_empty_mapping_of_type(obj: object, value_type: type[Any]) -> bool:
+    """Check if object is a non-empty dict with all values of specific type"""
+    if not obj or not isinstance(obj, dict):
+        return False
+
+    return all(isinstance(value, value_type) for value in obj.values())
+
+
+_LAUNCH_PARAM_NAMES: list[str] = ["num_warps", "num_stages"]
+
+
+def gen_tuner_meta_py(
+    func: Any,
+    tuner_fallback: bool,
+    unit: OpsUnit,
+) -> str:
+    vals = []
+
+    guard_list = []
+
+    # Use custom meta generation function if available
+    if hasattr(func, "gen_autotune_select_meta_src"):
+        return func.gen_autotune_select_meta_src(unit.constant_types)
+
+    if hasattr(func, "cache") and is_non_empty_mapping_of_type(
+        func.cache, triton.runtime.autotuner.Config
+    ):
+        # auto tuned configs
+        arg_names, key_idx = key_names_and_idx(func)
+
+        in_args = ", ".join(
+            [
+                f"{name}: {unit.constant_types[idx].__name__ if idx in unit.constant_types else 'int'}"
+                for idx, name in zip(key_idx, arg_names)
+            ]
+        )
+
+        cfg_first = next(iter(func.cache.values()))
+        return_names = list(cfg_first.kwargs.keys()) + _LAUNCH_PARAM_NAMES
+
+        for key, cfg in func.cache.items():
+            val = list(cfg.kwargs.values()) + [cfg.num_warps, cfg.num_stages]
+            val = tuple(val)
+            vals.append(val)
+            equations = []
+            for arg, value in zip(arg_names, key):
+                if isinstance(value, str):
+                    equations.append(f"{arg} == '{value}'")
+                elif isinstance(value, bool):
+                    equations.append(f"{arg} == {int(value)}")
+                else:
+                    equations.append(f"{arg} == {value}")
+            guard_list.append(f"if {' and '.join(equations)}: return {val}")
+
+    else:
+        # default configs — single spec, use specs[0]
+        in_args = ""
+        arg_names = list(_LAUNCH_PARAM_NAMES)
+        return_names = list(_LAUNCH_PARAM_NAMES)
+        val = unit.specs[0].num_warps, unit.specs[0].num_stages
+        vals.append(val)
+
+    name = unwrap_heuristic(func, JITFunction).__name__
+    meta = name + "_meta"
+
+    guards = "\n        ".join(guard_list)
+
+    fmt_args = ", ".join([f"{{{arg_name}}}" for arg_name in arg_names])
+
+    raise_runtime_error_str = (
+        f"""raise RuntimeError(f"No autotuning config found for {name}({fmt_args})")"""
+    )
+    fallback_str = f"""return {Counter(vals).most_common(1)[0][0]}"""
+
+    returns_comment = f"# Returns: ({', '.join(return_names)})"
+
+    return textwrap.dedent(
+        f"""
+    def {meta}({in_args}):
+        {returns_comment}
+        {guards}
+        {fallback_str if tuner_fallback else raise_runtime_error_str}
+    """
+    )
+
+
+def gen_tuner_meta_cpp(
+    func: Any,
+    tuner_fallback: bool,
+    constant_types: dict[int, type[Any]],
+) -> str:
+    # TODO(changpan): This C++ inline _meta is currently dead code — no C++ caller
+    # invokes it.  The Python _meta.py (gen_tuner_meta_py) is the only consumer.
+    # Double check, try remove this and the TUNER_META_CPP template region.
+    def infer_arg_type(idx: int) -> str:
+        if idx in constant_types:
+            return PY_TYPES_TO_CPP_TYPES[constant_types[idx]]
+        else:
+            return "int64_t"
+
+    arg_names, key_idx = key_names_and_idx(func)
+
+    in_args = ", ".join(
+        [f"{infer_arg_type(idx)} {name}" for idx, name in zip(key_idx, arg_names)]
+    )
+
+    vals = []
+    guard_list = []
+    for key, cfg in func.cache.items():
+        val = list(cfg.kwargs.values()) + [cfg.num_warps, cfg.num_stages]
+        val = tuple(val)
+        vals.append(val)
+        equations = []
+        for arg, value in zip(arg_names, key):
+            if isinstance(value, str):
+                equations.append(f'{arg} == "{value}"')
+            elif isinstance(value, bool):
+                equations.append(f"{arg} == {int(value)}")
+            else:
+                equations.append(f"{arg} == {value}")
+        guard_list.append(f"if ({' && '.join(equations)}) return std::make_tuple{val};")
+    guards = "\n        ".join(guard_list)
+    name = unwrap_heuristic(func, JITFunction).__name__
+    meta = name + "_meta"
+    fmt_args = ", ".join([f"{arg_name}" for arg_name in arg_names])
+    raise_runtime_error_str = f"""throw std::runtime_error("No autotuning config found for {name}({fmt_args})");"""
+    fallback_str = f"""return std::make_tuple{Counter(vals).most_common(1)[0][0]};"""
+    # Infer the return type from the actual values
+    return_type = _infer_return_type(vals[0])
+    return textwrap.dedent(
+        f"""
+    inline std::tuple<{return_type}> {meta}({in_args}) {{
+        {guards}
+        {fallback_str if tuner_fallback else raise_runtime_error_str}
+    }}
+    """
+    )
+
+
+def _infer_return_type(vals: tuple[Any, ...]) -> str:
+    types = [PY_TYPES_TO_CPP_TYPES.get(type(val)) for val in vals]
+    try:
+        # pyre-fixme[6]: For 1st argument expected
+        #  `Iterable[typing_extensions.LiteralString]` but got `List[Optional[str]]`.
+        return ", ".join(types)
+    except TypeError:  # one of the types cannot be inferred, e.g. `None`
+        raise ValueError("Cannot infer return type from `vals`")
+
+
+# ---------------------------------------------------------------------------
+# Top-level codegen entry points
+# ---------------------------------------------------------------------------
+
+
+def generate_header_content(
+    tuned_func: triton.runtime.autotuner.Autotuner | None,
+    func: JITFunction[list[Any]],
+    unit: OpsUnit,
+    descriptors: list[ArgDescriptor],
+    tuner_fallback: bool,
+) -> str:
+    """Generate the content of the .h header file."""
+    h_template = load_template("kernel.h")
+    tuner_meta_cpp = (
+        gen_tuner_meta_cpp(tuned_func, tuner_fallback, unit.constant_types)
+        if tuned_func
+        else ""
+    )
+    selector_proto = gen_selector_proto(descriptors, func.__name__)
+    return render_template(
+        h_template,
+        {
+            "TUNER_META_CPP": tuner_meta_cpp,
+            "SELECTOR_PROTO": selector_proto,
+        },
+    )
+
+
+def generate_kernel_cpp_content(
+    func: JITFunction[list[Any]],
+    unit: OpsUnit,
+    descriptors: list[ArgDescriptor],
+    prefix: str,
+    generated_specs: list[str],
+    backend: str,
+) -> str:
+    """Generate the content of the kernel .cpp file.
+
+    All tensor params use Tensor? for TorchScript compatibility.
+    Dispatch relies on HAS_XXX constexpr ints, not tensor presence.
+    """
+    cpp_template = load_template("kernel.cpp")
+    kernel_specs = "\n".join(generated_specs)
+    selector = gen_selector(func, unit, descriptors)
+
+    # On AMD, apply hipification to generated code (KERNEL_SPECS, SELECTOR)
+    # Templates are already hipified at load time (from hip/ subdirectory)
+    if backend == "hip":
+        from torch._inductor.codegen.aoti_hipify_utils import maybe_hipify_code_wrapper
+
+        kernel_specs = maybe_hipify_code_wrapper(kernel_specs, force_hipify=True)
+        selector = maybe_hipify_code_wrapper(selector, force_hipify=True)
+
+    cpp_content = render_template(
+        cpp_template,
+        {
+            "HEADER_INCLUDE": f'#include "{prefix}.h"\n',
+            "KERNEL_SPECS": kernel_specs,
+            "SELECTOR": selector,
+        },
+    )
+    return cpp_content
+
+
+def generate_torch_op_content(
+    func: JITFunction[list[Any]],
+    descriptors: list[ArgDescriptor],
+    prefix: str,
+    default_values: dict[str, Any],
+) -> str:
+    """Generate the content of the torch_op .cpp file."""
+    torch_template = load_template("torch_op.cpp")
+    torch_op_content = gen_torch_op(func, descriptors, default_values)
+    torch_content = render_template(
+        torch_template,
+        {
+            "HEADER_INCLUDE": f'#include "{prefix}.h"\n',
+            "TORCH_OP": torch_op_content,
+        },
+    )
+    return torch_content
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/compile_state.py b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/compile_state.py
new file mode 100644
index 000000000..231cb52f1
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/compile_state.py
@@ -0,0 +1,409 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# pyre-strict
+
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import hashlib
+import json
+import os
+import tempfile
+from inspect import getcallargs, Parameter, signature
+from typing import Any, Callable, Dict, List, Optional, Set
+
+import torch
+
+# @manual=//triton:triton
+import triton.language as tl
+from generative_recommenders.ops.triton_aot.compile.stable_types import SCALAR_TYPES
+from generative_recommenders.ops.triton_aot.compile.utils import is_autotuner
+from generative_recommenders.ops.triton_aot.types import (
+    Annotation,
+    AnnotationHint,
+    TritonAOT,
+)
+
+# @manual=//triton:triton
+from triton.runtime.jit import KernelInterface, mangle_type
+
+
+class CustomEncoder(json.JSONEncoder):
+    # pyre-ignore[14]: Inconsistent override
+    def default(self, obj: object) -> Any:
+        if isinstance(obj, set):
+            return {"__set__": True, "items": sorted(obj)}
+        # Handle other non-serializable types
+        return super().default(obj)
+
+
+def hash_spec(spec: Dict[str, Any]) -> str:
+    serialized_dict = json.dumps(spec, cls=CustomEncoder, sort_keys=True)
+    return hashlib.sha256(serialized_dict.encode("utf-8")).hexdigest()
+
+
+class AOTTCompileState:
+    """
+    Singleton state container for Triton AOT compilation.
+
+    Description:
+    This singleton pattern enables state sharing between code loaded via
+    torch.package (which creates isolated module namespaces) and the regular
+    Python import system. Without this pattern, the packaged module would have
+    its own copy of global state, leading to inconsistencies.
+
+    Usage:
+        # Normal usage - get the singleton instance
+        state = AOTTCompileState.get_instance()
+
+        # For torch.package integration - inject shared instance into packaged module
+        packaged_module = package_importer.import_module("triton_aot.compile.compile_state")
+        packaged_module.AOTTCompileState.set_instance(AOTTCompileState.get_instance())
+    """
+
+    _instance: Optional["AOTTCompileState"] = None
+
+    kernel_specs: Dict[KernelInterface[List[Any]], List[Dict[str, List[Any]]]] = {}
+    specs_hashset: Dict[KernelInterface[List[Any]], Set[str]] = {}
+    enable_aott_compile: bool = False
+    compile_base_dir: str = ""
+    compile_path: str = ""
+
+    def __new__(cls) -> "AOTTCompileState":
+        if cls._instance is None:
+            instance = super().__new__(cls)
+            instance._initialize()
+            cls._instance = instance
+        return cls._instance
+
+    def _initialize(self) -> None:
+        """Initialize the singleton state. Called only once."""
+        self.kernel_specs: Dict[
+            KernelInterface[List[Any]], List[Dict[str, List[Any]]]
+        ] = {}
+        self.specs_hashset: Dict[KernelInterface[List[Any]], Set[str]] = {}
+        self.enable_aott_compile: bool = False
+        self.compile_base_dir: str = os.getenv("TRITON_AOT_PATH_PREFIX", "/var/tmp")
+        self.compile_path: str = tempfile.mkdtemp(
+            dir=self.compile_base_dir, prefix="triton_aot_compile_"
+        )
+
+    @classmethod
+    def get_instance(cls) -> "AOTTCompileState":
+        """Get the singleton instance, creating it if necessary."""
+        if cls._instance is None:
+            cls._instance = cls()
+        return cls._instance
+
+    @classmethod
+    def set_instance(cls, instance: "AOTTCompileState") -> None:
+        """
+        Set the singleton instance. Used for torch.package integration.
+
+        When code is loaded via torch.package, it creates a separate module
+        namespace with its own class objects. This method allows injecting
+        a shared instance from the main module into the packaged module.
+        """
+        cls._instance = instance
+
+    def reset(self) -> None:
+        """Reset all state to initial values."""
+        self.kernel_specs = {}
+        self.specs_hashset = {}
+        self.disable()
+        self.compile_base_dir = os.getenv("TRITON_AOT_PATH_PREFIX", "/var/tmp")
+        self.compile_path = tempfile.mkdtemp(
+            dir=self.compile_base_dir, prefix="triton_aot_compile_"
+        )
+
+    def add_kernel_spec(
+        self,
+        fn: KernelInterface[List[Any]],
+        spec: Dict[str, List[Any]],
+        hashed_spec: str,
+    ) -> None:
+        """Add a kernel spec if not already present (based on hash).
+        If the same Triton kernel is used at multiple locations in a model:
+            - All calls share one spec list under the same kernel function key
+            - Specs with identical signatures (same dtypes, shapes) are deduplicated via hash
+            - Specs with different signatures (e.g., fp32 vs bf16) are recorded separately
+
+        Example:
+            # Two call sites using the same kernel:
+            my_kernel[grid](tensor_fp32, ...)  # Records spec with "*fp32"
+            my_kernel[grid](tensor_bf16, ...)  # Records spec with "*bf16"
+            my_kernel[grid](tensor_fp32, ...)  # Deduplicated, same hash as first call
+
+            # Result: kernel_specs[my_kernel] = [fp32_spec, bf16_spec]
+        """
+        if fn not in self.kernel_specs:
+            self.kernel_specs[fn] = []
+            self.specs_hashset[fn] = set()
+        if hashed_spec not in self.specs_hashset[fn]:
+            self.kernel_specs[fn].append(spec)
+            self.specs_hashset[fn].add(hashed_spec)
+
+    def _collect_spec(
+        self,
+        fn: KernelInterface[List[Any]],
+        annotations: Dict[str, Annotation],
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
+        """Spec collection callback registered on TritonAOT during compile.
+
+        Always collects the annotated spec (which equals the inferred spec
+        when no annotations are present).  Also collects the inferred spec
+        when it differs and either:
+        - annotations conflict with sample (fallback for safety), or
+        - inferred has perf hints the annotation lacks (perf variant).
+        """
+        spec = infer_spec(fn, annotations, *args, **kwargs)
+        annotated_hash = hash_spec(spec)
+        self.add_kernel_spec(fn, spec, annotated_hash)
+
+        if annotations:
+            inferred = infer_spec(fn, {}, *args, **kwargs)
+            inferred_hash = hash_spec(inferred)
+            if inferred_hash == annotated_hash:
+                return
+            if _annotation_conflicts_with_sample(
+                fn, annotations, *args, **kwargs
+            ) or _inferred_has_perf_advantage(spec, inferred):
+                self.add_kernel_spec(fn, inferred, inferred_hash)
+
+    def enable(self) -> None:
+        """Enable AOT compile and register the spec collection hook."""
+        self.enable_aott_compile = True
+        TritonAOT.set_spec_collector(self._collect_spec)
+
+    def disable(self) -> None:
+        """Disable AOT compile and unregister the spec collection hook."""
+        self.enable_aott_compile = False
+        TritonAOT.set_spec_collector(None)
+
+
+def get_aott_compile_state() -> AOTTCompileState:
+    """Get the current AOTTCompileState singleton.
+
+    Uses get_instance() so injected instances (via set_instance() for
+    torch.package integration) are respected.
+    """
+    return AOTTCompileState.get_instance()
+
+
+########
+# Module-level global accessors that delegate to singleton
+########
+
+
+def get_triton_aot_kernel_specs() -> Dict[
+    KernelInterface[List[Any]], List[Dict[str, List[Any]]]
+]:
+    return get_aott_compile_state().kernel_specs
+
+
+def get_triton_aot_specs_hashset() -> Dict[KernelInterface[List[Any]], Set[str]]:
+    return get_aott_compile_state().specs_hashset
+
+
+def get_aott_compile_path() -> str:
+    return get_aott_compile_state().compile_path
+
+
+def add_kernel_spec(
+    fn: KernelInterface[List[Any]], spec: Dict[str, List[Any]], hashed_spec: str
+) -> None:
+    get_aott_compile_state().add_kernel_spec(fn, spec, hashed_spec)
+
+
+def _unwrap_triton_fn(
+    fn: KernelInterface[List[Any]],
+) -> Callable[..., Any]:
+    while isinstance(fn, KernelInterface):
+        # pyre-ignore[16]: KernelInterface has `fn` attribute at runtime
+        fn = fn.fn
+    return fn
+
+
+def _inferred_has_perf_advantage(
+    annotated_spec: Dict[str, List[Any]],
+    inferred_spec: Dict[str, List[Any]],
+) -> bool:
+    """True if inferred spec has alignment/divisibility hints the annotated lacks.
+
+    A tuple element ``(type, N)`` carries alignment or divisibility info
+    that a bare string does not.  When inference adds such hints (e.g.,
+    tensor alignment from ``data_ptr() % 16 == 0``), the inferred spec
+    produces a more optimized cubin worth keeping as a perf variant.
+    """
+    for ann_elem, inf_elem in zip(
+        annotated_spec["signature"], inferred_spec["signature"]
+    ):
+        if isinstance(inf_elem, tuple) and not isinstance(ann_elem, tuple):
+            return True
+    return False
+
+
+# Triton-internal kwargs injected by KernelInterface.__getitem__
+# (triton/runtime/jit.py).  These are not kernel parameters and must
+# be stripped before getcallargs.
+_TRITON_INTERNAL_KWARGS: frozenset[str] = frozenset({"warmup", "grid"})
+
+
+def _resolve_call_args(
+    fn: KernelInterface[List[Any]],
+    *args: Any,
+    **kwargs: Any,
+) -> tuple[Callable[..., Any], dict[str, Any]]:
+    """Unwrap kernel and resolve call args with autotune placeholder fill."""
+    triton_fn = _unwrap_triton_fn(fn)
+    # Filter Triton-internal kwargs injected by KernelInterface.__getitem__
+    # (triton/runtime/jit.py) — not part of the kernel signature.
+    clean_kwargs = {k: v for k, v in kwargs.items() if k not in _TRITON_INTERNAL_KWARGS}
+    if is_autotuner(fn):
+        # pyre-ignore[16]: Attributes checked by is_autotuner
+        for arg_name in fn.configs[0].kwargs.keys():
+            if arg_name not in clean_kwargs:
+                clean_kwargs[arg_name] = -1
+    return triton_fn, getcallargs(triton_fn, *args, **clean_kwargs)
+
+
+_I32_MIN: int = -(2**31)
+_I32_MAX: int = 2**31 - 1
+
+
+def _sample_satisfies_int_type(sample: int, ann_type: str) -> bool:
+    """True if sample int fits the annotated type range."""
+    if ann_type == "i32":
+        return _I32_MIN <= sample <= _I32_MAX
+    return True
+
+
+def _sample_satisfies_annotation(sample: Any, ann: Annotation) -> bool:
+    """True if a single sample value satisfies its annotation constraint."""
+    if isinstance(ann, AnnotationHint):
+        if isinstance(sample, torch.Tensor):
+            return sample.data_ptr() % ann.hint == 0
+        if isinstance(sample, int):
+            if ann.hint == 1:
+                return sample == 1
+            if not _sample_satisfies_int_type(sample, ann.dtype):
+                return False
+            if ann.hint > 1:
+                return sample % ann.hint == 0
+        return True
+    if isinstance(ann, str) and not ann.startswith("*") and isinstance(sample, int):
+        return _sample_satisfies_int_type(sample, ann)
+    return True
+
+
+def _annotation_conflicts_with_sample(
+    fn: KernelInterface[List[Any]],
+    annotations: Dict[str, Annotation],
+    *args: Any,
+    **kwargs: Any,
+) -> bool:
+    """True if any annotated param's sample value doesn't satisfy the annotation.
+
+    Used by ``_collect_spec`` to decide whether to generate an inferred
+    fallback spec.  When the sample satisfies all annotations, only the
+    annotated spec is needed (the user's constraints hold for this input).
+    """
+    _, sample_args = _resolve_call_args(fn, *args, **kwargs)
+
+    for param_name, ann in annotations.items():
+        sample = sample_args.get(param_name)
+        if sample is None:
+            continue
+        if not _sample_satisfies_annotation(sample, ann):
+            return True
+
+    return False
+
+
+def _infer_spec_entry(
+    arg_name: str,
+    arg: Any,
+    arg_annotation: Any,
+    annotations: Dict[str, Annotation],
+) -> Any:
+    if arg_annotation != Parameter.empty:
+        if arg_annotation == tl.constexpr:
+            return arg
+        raise RuntimeError(
+            f"TritonAOT: unsupported scalar annotation {arg_annotation}."
+        )
+
+    if arg_name in annotations:
+        ann = annotations[arg_name]
+        # Convert to tuple for raw spec format (shared/spec_conversion
+        # processes plain tuples).
+        return ann.to_tuple() if isinstance(ann, AnnotationHint) else ann
+
+    if arg is None:
+        return None
+
+    if isinstance(arg, torch.Tensor):
+        # Reject dtypes SCALAR_TYPES can't render (e.g. *u1, *u16, *fp8e5)
+        # so codegen doesn't KeyError downstream.
+        type_str = mangle_type(arg)
+        if type_str not in SCALAR_TYPES:
+            raise RuntimeError(
+                f"TritonAOT: unsupported tensor type for {arg_name}: "
+                f"{arg.dtype} (Triton mangled to {type_str!r}). "
+                f"Supported tensor dtypes: {sorted(SCALAR_TYPES.keys())}."
+            )
+        return (type_str, 16) if arg.data_ptr() % 16 == 0 else type_str
+
+    if isinstance(arg, bool):
+        # bool is subclass of int; must check before int.
+        # Non-constexpr bools have no CTYPES entry for codegen.
+        raise RuntimeError(
+            f"TritonAOT: parameter {arg_name} is a bool without "
+            f"tl.constexpr annotation.  Add `{arg_name}: tl.constexpr` "
+            f"to the kernel signature."
+        )
+
+    if isinstance(arg, int):
+        # Always i64 for safety; users annotate "i32" for narrower variant via
+        # annotation-as-variant.
+        if not -(2**63) <= arg <= 2**63 - 1:
+            raise RuntimeError(
+                f"TritonAOT: unsupported int value for {arg_name}: "
+                f"value exceeds i64 range. Use a smaller value or tl.constexpr."
+            )
+        return "i64"
+
+    if isinstance(arg, float):
+        return "fp32"
+
+    raise RuntimeError(f"TritonAOT: parameter {arg_name} needs annotation.")
+
+
+def infer_spec(
+    fn: KernelInterface[List[Any]],
+    annotations: Dict[str, Annotation],
+    *args: Any,
+    **kwargs: Any,
+) -> Dict[str, List[Any]]:
+    """Infer kernel spec from sample args.
+
+    Tensor dtype: ``mangle_type``, alignment: ``data_ptr() % 16``.
+    Scalar int: always ``"i64"`` (safe default; user can annotate ``"i32"``
+    to get a narrower variant via annotation-as-variant).
+    Float: ``mangle_type`` → fp32.
+    """
+    triton_fn, call_args = _resolve_call_args(fn, *args, **kwargs)
+    fn_sig = signature(triton_fn)
+    arg_annotations = {
+        name: param.annotation for name, param in fn_sig.parameters.items()
+    }
+    spec = []
+
+    for arg_name in fn_sig.parameters.keys():
+        arg = call_args[arg_name]
+        spec.append(
+            _infer_spec_entry(arg_name, arg, arg_annotations[arg_name], annotations)
+        )
+    return {"signature": spec}
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/pipeline.py b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/pipeline.py
new file mode 100644
index 000000000..cdf4d1821
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/pipeline.py
@@ -0,0 +1,300 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+
+# pyre-strict
+
+"""AOT-T compilation pipeline.
+
+Orchestrates: spec processing → Triton native compile → C++ / Python codegen.
+"""
+
+from __future__ import annotations
+
+import logging
+import multiprocessing as mp
+import os
+import signal
+import threading
+from concurrent.futures import ThreadPoolExecutor
+from types import FrameType, ModuleType
+from typing import Any, Callable
+
+# @manual=//triton:triton
+import triton
+import triton.compiler
+from generative_recommenders.ops.triton_aot.compile.arg_descriptor import (
+    ArgDescriptor,
+    build_arg_descriptors,
+)
+from generative_recommenders.ops.triton_aot.compile.codegen import (
+    gen_cubin,
+    gen_kernel_name,
+    gen_launcher,
+    gen_loader,
+    gen_tuner_meta_py,
+    generate_header_content,
+    generate_kernel_cpp_content,
+    generate_torch_op_content,
+)
+from generative_recommenders.ops.triton_aot.compile.spec_processing import (
+    gen_compile_arg,
+    KernelSpec,
+    OpsUnit,
+    RawKernelSpec,
+)
+from generative_recommenders.ops.triton_aot.compile.utils import (
+    is_autotuner,
+    unwrap_heuristic,
+)
+from generative_recommenders.ops.triton_aot.shared.types import AUTOTUNE_ATTRs
+from triton.backends.compiler import GPUTarget
+from triton.runtime.jit import JITFunction, KernelInterface
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+def compile_specs_parallel(
+    specs: list[KernelSpec],
+    install_dir: str,
+    module: str,
+    name: str,
+    gpu_target: GPUTarget,
+    import_module: Callable[[str], ModuleType],
+    descriptors: list[ArgDescriptor],
+) -> list[str]:
+    """Compile kernel specs in parallel using multiprocessing.
+
+    When TRITON_AOT_DEBUG=1 is set, compiles sequentially for easier debugging.
+
+    Args:
+        specs: List of kernel specifications to compile
+        install_dir: Directory to install generated files
+        module: The module name of the function
+        name: The function name
+        gpu_target: GPU target for compilation
+        import_module: Function to import modules (e.g., importlib.import_module or PackageImporter.import_module)
+
+    Returns:
+        List of generated code strings for each spec (cubin, loader, launcher)
+    """
+
+    debug = os.environ.get("TRITON_AOT_DEBUG", "0") == "1"
+    if debug:
+        outputs = [
+            spec_gen(
+                install_dir,
+                spec,
+                module,
+                name,
+                gpu_target,
+                import_module,
+                descriptors,
+            )
+            for spec in specs
+        ]
+    else:
+        max_workers = mp.cpu_count() // 2 + 1
+        with ThreadPoolExecutor(max_workers=min(len(specs), max_workers)) as executor:
+            outputs = list(
+                executor.map(
+                    lambda spec: spec_gen(
+                        install_dir,
+                        spec,
+                        module,
+                        name,
+                        gpu_target,
+                        import_module,
+                        descriptors,
+                    ),
+                    specs,
+                )
+            )
+    return outputs
+
+
+# For each spec, generate a kernel:
+# - cubin
+# - loader
+# - launcher
+def spec_gen(
+    install_dir: str,
+    spec: KernelSpec,
+    module: str,
+    name: str,
+    gpu_target: GPUTarget,
+    import_module: Callable[[str], ModuleType],
+    descriptors: list[ArgDescriptor],
+) -> str:
+    # To run this function with multiprocessing, we need to import the function by name,
+    # since JITFunction cannot be pickled.
+    # we have the case where the func name is injected with a suffix, like "_cuda" or "_amd",
+    # we should use the original name to import the func in such case
+    original_name = name
+    splits = name.split("_")
+    end_idx = len(splits)
+
+    while end_idx > 0:
+        original_name = "_".join(splits[:end_idx])
+        if hasattr(import_module(module), original_name):
+            break
+        end_idx -= 1
+    func = unwrap_heuristic(getattr(import_module(module), original_name), JITFunction)
+    func.__name__ = name
+
+    # Generate cubin.
+    kernel_name = gen_kernel_name(func, spec, gpu_target.arch)
+
+    compile_arg = gen_compile_arg(spec, func)
+    options = {name: getattr(spec, name) for name in AUTOTUNE_ATTRs.keys()}
+    compile_kwargs = {
+        "target": gpu_target,
+        "options": options,
+    }
+    kernel = triton.compiler.compile(*compile_arg, **compile_kwargs)
+    if getattr(kernel.metadata, "global_scratch_size", 0) > 0:
+        raise RuntimeError(f"{kernel_name=} with global scratch is not supported.")
+
+    metadata_name = kernel.metadata.name
+    metadata_shared = kernel.metadata.shared
+
+    cubin = gen_cubin(kernel_name, kernel, install_dir, gpu_target.backend)
+    out = [
+        cubin,
+        # Generate loader.
+        gen_loader(kernel_name, metadata_name, metadata_shared),
+        # Generate launcher.
+        gen_launcher(
+            kernel_name,
+            func,
+            kernel,
+            metadata_shared,
+            gpu_target.warp_size,
+            spec,
+            descriptors,
+        ),
+    ]
+    return "".join(out)
+
+
+def sigchld_handler(signum: int, frame: FrameType | None) -> None:
+    sketchy_signals = map(int, [signal.SIGSEGV, signal.SIGABRT, signal.SIGBUS])
+    try:
+        # Consume all pending SIGCHLDs, looking for unexpected failures
+        while True:
+            pid, status = os.waitpid(-1, os.WNOHANG)
+            if pid == 0:
+                break
+            if os.WIFSIGNALED(status) and os.WTERMSIG(status) in sketchy_signals:
+                logger.error(
+                    f"Child process {pid} exited catastrophically with signal {os.WTERMSIG(status)}, terminating!"
+                )
+
+                # Avoid triggering atexit etc which can get stuck and behave improperly
+                # because multiprocessing sets up an atexit handler to join workers
+                # (sigh).  We want to exit, now, so use os._exit instead of sys.exit.
+                os._exit(1)
+    except ChildProcessError:
+        pass
+
+
+def compile_to_cpp(
+    func: KernelInterface[list[Any]] | triton.runtime.autotuner.Autotuner,
+    base_specs: list[RawKernelSpec],
+    install_dir: str,
+    prefix: str,
+    *,
+    gpu_target: GPUTarget,
+    import_module: Callable[[str], ModuleType],
+    default_values: dict[str, Any] | None = None,
+    tuner_fallback: bool = False,
+) -> None:
+    """Compile a Triton kernel into .cpp, .h, _torch_op.cpp, _meta.py files.
+
+    Args:
+        func: Triton JITFunction or Autotuner to compile.
+        base_specs: List of kernel specialization specs.
+        install_dir: Directory to output generated files.
+        prefix: Kernel name prefix, e.g., "_addmm_fwd".
+        gpu_target: GPU target for compilation.
+        import_module: torch.package importer for loading kernels source code.
+        default_values: Default values for kernel arguments.
+        tuner_fallback: If True, generate fallback tuner code.
+    """
+    tuned_func = func if is_autotuner(func) else None
+    # pyre-ignore[6]: Attributes verified by is_autotuner
+    unit = OpsUnit.from_raw_specs(base_specs, gpu_target, tuned_func)
+    default_values = {} if default_values is None else default_values
+
+    func_unwrapped = unwrap_heuristic(func, JITFunction)
+    descriptors = build_arg_descriptors(func_unwrapped, unit)
+
+    # Python's multiprocessing.Pool class is not great at handling unexpected child
+    # failures such as segfaults.  Account for this by temporarily installing a signal
+    # handler that considers such signals a catastrophic compilation failure.  If not
+    # for this, the Pool will deadlock.
+    if threading.current_thread() is threading.main_thread():
+        previous_child_handler = signal.signal(signal.SIGCHLD, sigchld_handler)
+    else:
+        previous_child_handler = None
+
+    func = func_unwrapped
+
+    # sanity check to make sure args with default values are always at the end
+    has_default_value_arg = False
+    for name in func.arg_names:
+        if name in default_values:
+            has_default_value_arg = True
+        elif has_default_value_arg:
+            raise RuntimeError(
+                f"default values must be at the end of the argument list. {func.arg_names=} {default_values=}"
+            )
+
+    h_out = f"{install_dir}/{prefix}.h"
+    cu_out = f"{install_dir}/{prefix}.cpp"
+    torch_out = f"{install_dir}/{prefix}_torch_op.cpp"
+    py_out = f"{install_dir}/{prefix}_meta.py"
+
+    # Generate kernel.h file
+    h_content = generate_header_content(
+        tuned_func,  # pyre-ignore[6]: Autotuner when set (verified by is_autotuner)
+        func,
+        unit,
+        descriptors,
+        tuner_fallback,
+    )
+
+    with open(h_out, "w") as fp:
+        fp.write(h_content)
+
+    generated_specs = compile_specs_parallel(
+        unit.specs,
+        install_dir,
+        func.__module__,
+        func.__name__,
+        gpu_target,
+        import_module,
+        descriptors,
+    )
+    # Generate kernel.cpp file
+    cu_content = generate_kernel_cpp_content(
+        func, unit, descriptors, prefix, generated_specs, gpu_target.backend
+    )
+    with open(cu_out, "w") as fp:
+        fp.write(cu_content)
+
+    # Generate torch_op.cpp file
+    torch_op_content = generate_torch_op_content(
+        func, descriptors, prefix, default_values
+    )
+
+    with open(torch_out, "w") as fp:
+        fp.write(torch_op_content)
+
+    if tuned_func:
+        with open(py_out, "w") as fp:
+            fp.write(gen_tuner_meta_py(tuned_func, tuner_fallback, unit))
+    else:
+        with open(py_out, "w") as fp:
+            fp.write(gen_tuner_meta_py(func, tuner_fallback, unit))
+
+    if previous_child_handler is not None:
+        signal.signal(signal.SIGCHLD, previous_child_handler)
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/spec_processing.py b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/spec_processing.py
new file mode 100644
index 000000000..e8c181121
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/spec_processing.py
@@ -0,0 +1,593 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+
+# pyre-strict
+
+"""Kernel spec processing for AOT-T compilation.
+
+Transforms raw kernel specs (from infer_spec) into compiled specs ready
+for Triton native compile and C++ codegen.
+"""
+
+from __future__ import annotations
+
+import copy
+import dataclasses
+import logging
+from dataclasses import dataclass
+from typing import Any, cast
+
+# @manual=//triton:triton
+import triton
+from generative_recommenders.ops.triton_aot.compile.compile_state import hash_spec
+from generative_recommenders.ops.triton_aot.shared.spec_conversion import (
+    collect_constraints,
+    extract_constants,
+    get_fp8_replacement_signature_for_amd,
+    get_fp8_replacement_signature_for_sm80,
+    signature_list_to_dict,
+    SignatureElement,
+)
+from generative_recommenders.ops.triton_aot.shared.types import AUTOTUNE_ATTRs
+from triton.backends.compiler import BaseBackend, GPUTarget
+from triton.compiler.compiler import ASTSource
+from triton.runtime.jit import JITFunction
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+TRITON_VERSION: str = triton.__version__
+
+# A raw kernel spec produced by infer_spec.  The only key is "signature".
+RawKernelSpec = dict[str, list[SignatureElement]]
+
+
+@dataclass
+class KernelSpec:
+    """A single compilation variant for a kernel.
+
+    Each variant represents one combination of dtypes, constant values,
+    alignment constraints, and autotune configuration.  Multiple variants
+    are grouped together in an ``OpsUnit``.
+
+    Attributes:
+        signature: Non-constant arg index → dtype string (e.g., ``{0: "*fp32", 4: "i32"}``).
+        constants: Arg index → compile-time constant value.  Includes bare
+            literals (128, ``"leaky_relu"``), absent optional tensors (None),
+            and equal-to-1 specializations (stride=1 → constexpr folding).
+        divisible_by_16: Indices of args whose values are divisible by 16.
+            For pointers this means the address is 16-byte aligned;
+            for scalars it means the value itself is a multiple of 16.
+        divisible_by_8: Indices of args whose values are divisible by 8.
+            Only meaningful for scalars (pointer alignment is always ≥16).
+        num_warps: Number of warps per block.
+        num_stages: Number of pipeline stages.
+        matrix_instr_nonkdim: AMD matrix instruction non-K dimension.
+        waves_per_eu: AMD waves per execution unit.
+        kpack: AMD kpack factor.
+    """
+
+    signature: dict[int, str]
+    constants: dict[int, Any]
+    divisible_by_16: set[int]
+    divisible_by_8: set[int]
+    num_warps: int = 4
+    num_stages: int = 3
+    matrix_instr_nonkdim: int = 0
+    waves_per_eu: int = 1
+    kpack: int = 1
+
+
+@dataclass
+class OpsUnit:
+    """All compilation variants for a single kernel op.
+
+    Groups per-kernel invariants with the list of ``KernelSpec`` variants.
+    Use ``OpsUnit.from_raw_specs()`` to build — it performs the complete
+    spec processing pipeline (convert → detect optional → validate →
+    autotune → dedup → compute invariants).
+
+    Attributes:
+        cc: Compute capability (int for NVIDIA, str for AMD).
+        optional: Indices of optional tensor args (unified across all call sites).
+        pointer_args: Indices of all tensor pointer args (required + optional).
+            Invariant across specs — a pointer arg never becomes a non-pointer.
+        scalar_dtypes: Non-pointer signature arg index → widest dtype string
+            across all specs (e.g., ``"i32"``, ``"i64"``, ``"fp32"``).
+            Computed by ``_wider_type`` — individual specs may use narrower types.
+        constant_types: Python type per constant arg position (e.g., ``{15: int, 19: bool}``).
+            Excludes optional tensor args (None constants).
+            Invariant across specs — same Python type for each position.
+        specs: Per-variant compilation specs.
+    """
+
+    cc: int | str
+    optional: set[int]
+    pointer_args: set[int]
+    scalar_dtypes: dict[int, str]
+    constant_types: dict[int, type[Any]]
+    specs: list[KernelSpec]
+
+    @classmethod
+    def from_raw_specs(
+        cls,
+        base_specs: list[RawKernelSpec],
+        gpu_target: GPUTarget,
+        tuned_func: triton.runtime.autotuner.Autotuner | None = None,
+    ) -> OpsUnit:
+        """Build an OpsUnit from raw kernel specs.
+
+        Performs the complete spec processing pipeline:
+        1. Convert raw specs to KernelSpecs
+        2. Detect optional tensor args (cross-spec + 3-tuple)
+        3. Validate consistency across converted specs
+        4. Apply autotuning (if tuned_func provided)
+        5. Deduplicate specs
+        6. Compute shared invariants (pointer_args, scalar_dtypes, constant_types)
+        """
+        # Validate raw specs upfront, before any rewriting.
+        num_params = _check_uniform_signature_length(base_specs)
+        specs, three_tuple_optional = _convert_raw_specs(base_specs, gpu_target)
+        optional = _detect_optional_args(specs) | three_tuple_optional
+
+        _validate_converted_specs(specs, optional, num_params)
+
+        # Plain @triton.jit kernels (no @triton.autotune) skip config expansion.
+        if tuned_func is not None:
+            specs = _autotune_specs(tuned_func, gpu_target, specs)
+
+        specs = _dedup_specs(specs)
+
+        pointer_args, scalar_dtypes, constant_types = _compute_invariants(
+            specs, optional
+        )
+
+        return cls(
+            cc=gpu_target.arch,
+            optional=optional,
+            pointer_args=pointer_args,
+            scalar_dtypes=scalar_dtypes,
+            constant_types=constant_types,
+            specs=specs,
+        )
+
+
+# ---------------------------------------------------------------------------
+# Public helpers (used outside spec processing)
+# ---------------------------------------------------------------------------
+
+
+def gen_compile_arg(
+    spec: KernelSpec,
+    func: JITFunction[list[Any]],
+) -> tuple[ASTSource]:
+    # ASTSource expects tuple-keyed dicts: {(idx,): value} for constants,
+    # {(idx,): [[attr_name, attr_val], ...]} for attrs.  Tuple keys support
+    # nested paths into structured types (asserted by ASTSource.__init__).
+    new_signature = {}
+    new_constants = {}
+    param_names = list(func.signature.parameters.keys())
+    for idx, param in enumerate(param_names):
+        if idx in spec.signature:
+            new_signature[param] = spec.signature[idx]
+        if idx in spec.constants:
+            new_constants[(idx,)] = spec.constants[idx]
+            new_signature[param] = "constexpr"
+
+    # parse_attr("D") returns a fresh [["tt.divisibility", 16]] each call.
+    new_attrs = {(idx,): BaseBackend.parse_attr("D") for idx in spec.divisible_by_16}
+
+    return (
+        ASTSource(
+            func,
+            new_signature,
+            constexprs=new_constants,
+            attrs=new_attrs,
+        ),
+    )
+
+
+# ---------------------------------------------------------------------------
+# Int width helpers
+# ---------------------------------------------------------------------------
+
+_INT_WIDTH_RANK: dict[str, int] = {"i32": 0, "i64": 1}
+
+
+def _wider_type(t1: str, t2: str) -> str:
+    """Return the wider of two scalar dtypes.
+
+    Only i32/i64 widening is supported.  All other types must match exactly.
+    """
+    if t1 == t2:
+        return t1
+    r1 = _INT_WIDTH_RANK.get(t1)
+    r2 = _INT_WIDTH_RANK.get(t2)
+    if r1 is not None and r2 is not None:
+        return t1 if r1 >= r2 else t2
+    raise ValueError(f"Cannot widen incompatible types: {t1!r} vs {t2!r}")
+
+
+# ---------------------------------------------------------------------------
+# Private helpers — called by OpsUnit.from_raw_specs
+# ---------------------------------------------------------------------------
+
+
+def _detect_optional_args(specs: list[KernelSpec]) -> set[int]:
+    """Detect optional tensor args by cross-spec comparison.
+
+    An arg at index ``i`` is optional if:
+    - Some specs have ``i`` in ``signature`` as a pointer type (``*...``)
+    - Other specs have ``constants[i] = None``
+
+    Single-spec None args (always-absent tensors) are NOT detected here
+    but are handled by ``_compute_invariants`` which adds any
+    ``constants[i] = None`` to ``pointer_args``.
+    """
+    if len(specs) <= 1:
+        return set()
+    optional: set[int] = set()
+    all_indices: set[int] = set()
+    for spec in specs:
+        all_indices |= spec.signature.keys()
+        all_indices |= spec.constants.keys()
+    for i in all_indices:
+        has_pointer = any(
+            i in s.signature and s.signature[i].startswith("*") for s in specs
+        )
+        has_none_const = any(i in s.constants and s.constants[i] is None for s in specs)
+        if has_pointer and has_none_const:
+            optional.add(i)
+    return optional
+
+
+def _check_uniform_signature_length(base_specs: list[RawKernelSpec]) -> int:
+    """All raw specs must declare the same param count; return that count.
+
+    Each raw spec is one ``infer_spec`` call site for the same kernel,
+    so all should have ``len(fn.signature.parameters)`` entries.  Differing
+    lengths means upstream bug (mixed kernels, truncated spec, etc.) and
+    would surface later as silent IndexError or wrong bound checks.
+    """
+    if not base_specs:
+        return 0
+    sig_lens = {len(spec["signature"]) for spec in base_specs}
+    if len(sig_lens) != 1:
+        raise ValueError(
+            f"Raw specs declare inconsistent signature lengths: "
+            f"{sorted(sig_lens)}.  All specs for the same kernel must have "
+            f"one entry per declared param."
+        )
+    return sig_lens.pop()
+
+
+def _check_arg_indices_in_range(
+    specs: list[KernelSpec],
+    num_params: int,
+) -> None:
+    """Every spec arg index must be in ``[0, num_params)``.
+
+    Out-of-range indices would silently drop in ``gen_compile_arg``'s
+    ``enumerate(param_names)`` loop.  ``num_params <= 0`` disables the check.
+    """
+    if num_params <= 0:
+        return
+    for idx, spec in enumerate(specs):
+        all_indices = (
+            spec.signature.keys()
+            | spec.constants.keys()
+            | spec.divisible_by_16
+            | spec.divisible_by_8
+        )
+        for i in all_indices:
+            if not 0 <= i < num_params:
+                raise ValueError(
+                    f"Spec {idx}: arg index {i} out of range "
+                    f"[0, {num_params}) — kernel has {num_params} declared params"
+                )
+
+
+def _collect_pointer_args(
+    specs: list[KernelSpec],
+    optional: set[int],
+) -> set[int]:
+    """Collect all tensor pointer indices across all specs.
+
+    Includes optional args (from _detect_optional_args) AND any arg
+    whose constant value is None (single-spec optional tensor case
+    where _detect_optional_args didn't fire).
+    """
+    pointer_args: set[int] = set(optional)
+    for spec in specs:
+        for i, dtype in spec.signature.items():
+            if dtype.startswith("*"):
+                pointer_args.add(i)
+        for i, val in spec.constants.items():
+            if val is None:
+                pointer_args.add(i)
+    return pointer_args
+
+
+def _collect_scalar_dtypes(
+    specs: list[KernelSpec],
+    pointer_args: set[int],
+) -> dict[int, str]:
+    """Collect non-pointer signature arg dtypes, widening compatible int types.
+
+    Invariant across specs (validated by _validate_converted_specs).
+    """
+    scalar_dtypes: dict[int, str] = {}
+    for spec in specs:
+        for i, dtype in spec.signature.items():
+            if i not in pointer_args:
+                if i in scalar_dtypes:
+                    scalar_dtypes[i] = _wider_type(scalar_dtypes[i], dtype)
+                else:
+                    scalar_dtypes[i] = dtype
+    return scalar_dtypes
+
+
+def _collect_constant_types(
+    specs: list[KernelSpec],
+) -> dict[int, type[Any]]:
+    """Collect Python type per constant position.
+
+    Excludes None constants (optional tensor args — already in pointer_args).
+    """
+    constant_types: dict[int, type[Any]] = {}
+    for spec in specs:
+        for i, val in spec.constants.items():
+            if val is not None and i not in constant_types:
+                constant_types[i] = type(val)
+    return constant_types
+
+
+def _compute_invariants(
+    specs: list[KernelSpec],
+    optional: set[int],
+) -> tuple[set[int], dict[int, str], dict[int, type[Any]]]:
+    """Compute shared invariants from processed specs.
+
+    Returns (pointer_args, scalar_dtypes, constant_types).
+
+    When annotation-as-variant produces mixed partitions (arg in
+    ``signature`` in some specs, ``constants`` in others), the arg
+    appears in both ``scalar_dtypes`` and ``constant_types``.  The
+    selector must receive it as a runtime parameter for dispatch,
+    so ``scalar_dtypes`` wins and the arg is removed from
+    ``constant_types``.
+    """
+    pointer_args = _collect_pointer_args(specs, optional)
+    scalar_dtypes = _collect_scalar_dtypes(specs, pointer_args)
+    constant_types = _collect_constant_types(specs)
+
+    # Resolve overlap: if any spec has the arg in signature (scalar),
+    # the selector needs it as a runtime parameter → not a constant.
+    for i in scalar_dtypes:
+        constant_types.pop(i, None)
+
+    return pointer_args, scalar_dtypes, constant_types
+
+
+def _validate_converted_specs(
+    specs: list[KernelSpec],
+    optional: set[int],
+    num_params: int = 0,
+) -> None:
+    """Validate that converted specs are consistent before further processing.
+
+    Checks that all specs produce identical C++ function signatures:
+    - All arg indices are in ``[0, num_params)`` (when ``num_params > 0``)
+    - Optional args: each spec has either a pointer in signature or None in constants
+    - Non-optional scalar args: same dtype (or compatible int widths)
+    - Non-optional constant args: same Python type
+
+    Called after _convert_raw_specs + _detect_optional_args, before autotuning.
+    """
+    _check_arg_indices_in_range(specs, num_params)
+    if len(specs) <= 1:
+        return
+    ref = specs[0]
+    for idx, spec in enumerate(specs[1:], 1):
+        _check_optional_consistency(ref, spec, idx, optional)
+        _check_signature_consistency(ref, spec, idx, optional)
+        _check_constants_consistency(ref, spec, idx, optional)
+
+
+def _check_optional_consistency(
+    ref: KernelSpec,
+    spec: KernelSpec,
+    idx: int,
+    optional: set[int],
+) -> None:
+    """Optional positions must be pointer-in-signature or None-in-constants.
+
+    Validates that optional tensor args are not misclassified as scalars
+    or non-None constants, which would produce incompatible C++ types.
+    """
+    for i in optional:
+        for label, s in [("spec 0", ref), (f"spec {idx}", spec)]:
+            if i in s.signature:
+                if not s.signature[i].startswith("*"):
+                    raise ValueError(
+                        f"Arg {i}: optional position has non-pointer type "
+                        f"'{s.signature[i]}' in {label}"
+                    )
+            elif i in s.constants:
+                if s.constants[i] is not None:
+                    raise ValueError(
+                        f"Arg {i}: optional position has non-None constant "
+                        f"{s.constants[i]!r} in {label}"
+                    )
+
+
+def _check_signature_consistency(
+    ref: KernelSpec,
+    spec: KernelSpec,
+    idx: int,
+    optional: set[int],
+) -> None:
+    """Non-optional, non-pointer scalar args must have compatible dtypes.
+
+    Pointer args are skipped (different tensor dtypes are dispatched by
+    the dtype guard in ``gen_guarded_calls``).  Compatible int widths
+    (i32/i64) are allowed — handled by ``_wider_type`` and int range guards.
+    Optional positions are validated by ``_check_optional_consistency``.
+
+    Partition differences are allowed: an arg may be in ``signature`` in
+    one spec and in ``constants`` in another (e.g., annotation-as-variant
+    where stride=1 is constexpr in one spec but a runtime parameter in
+    another).  The per-spec codegen handles this correctly.
+    """
+    for i in ref.signature.keys() | spec.signature.keys():
+        if i in optional:
+            continue
+        if (i in ref.signature and ref.signature[i].startswith("*")) or (
+            i in spec.signature and spec.signature[i].startswith("*")
+        ):
+            continue
+        # Allow partition differences: arg in signature in one spec,
+        # in constants in another (annotation-as-variant pattern).
+        if i not in ref.signature or i not in spec.signature:
+            continue
+        if ref.signature[i] != spec.signature[i]:
+            r1 = _INT_WIDTH_RANK.get(ref.signature[i])
+            r2 = _INT_WIDTH_RANK.get(spec.signature[i])
+            if r1 is not None and r2 is not None:
+                continue
+            raise ValueError(
+                f"Arg {i}: dtype mismatch '{ref.signature[i]}' vs "
+                f"'{spec.signature[i]}' (spec 0 vs spec {idx})"
+            )
+
+
+def _check_constants_consistency(
+    ref: KernelSpec,
+    spec: KernelSpec,
+    idx: int,
+    optional: set[int],
+) -> None:
+    """Non-optional constant args must have the same Python type across specs.
+
+    C++ codegen uses one type per constant arg position (``PY_TYPES_TO_CPP_TYPES``),
+    so ``BLOCK_M=64`` (int) and ``BLOCK_M=64.0`` (float) would produce
+    incompatible launchers.  Optional positions are validated separately
+    by ``_check_optional_consistency``.
+    """
+    for i in ref.constants.keys() | spec.constants.keys():
+        if i in optional:
+            continue
+        if ref.constants.get(i) is None or spec.constants.get(i) is None:
+            continue
+        if type(ref.constants[i]) is not type(spec.constants[i]):
+            raise ValueError(
+                f"Arg {i}: constant type mismatch "
+                f"{type(ref.constants[i]).__name__} vs "
+                f"{type(spec.constants[i]).__name__} (spec 0 vs spec {idx})"
+            )
+
+
+def _convert_raw_specs(
+    base_specs: list[RawKernelSpec],
+    gpu_target: GPUTarget,
+) -> tuple[list[KernelSpec], set[int]]:
+    """Convert raw specs to KernelSpecs.
+
+    Returns (specs, three_tuple_optional) where three_tuple_optional is the
+    union of optional_args detected from 3-tuple signature elements across
+    all specs (backward compat with ``collect_constraints``).
+    """
+    raw_specs = cast(list[dict[str, Any]], copy.deepcopy(base_specs))
+    is_amd = gpu_target.backend == "hip"
+
+    result: list[KernelSpec] = []
+    three_tuple_optional: set[int] = set()
+    for raw_spec in raw_specs:
+        constraints = collect_constraints(raw_spec["signature"])
+        constants = extract_constants(raw_spec["signature"], constraints)
+        signature: dict[int, str] = signature_list_to_dict(
+            raw_spec["signature"], constants
+        )
+        three_tuple_optional |= constraints.optional_args
+
+        spec = KernelSpec(
+            signature=signature,
+            constants=constants,
+            divisible_by_16=constraints.divisible_by_16,
+            divisible_by_8=constraints.divisible_by_8,
+        )
+
+        if constraints.has_fp8:
+            if is_amd:
+                spec.signature = get_fp8_replacement_signature_for_amd(
+                    {"signature": spec.signature}, {str(gpu_target.arch)}
+                )
+            elif gpu_target.arch == 80:
+                spec.signature = get_fp8_replacement_signature_for_sm80(
+                    {"signature": spec.signature}
+                )
+
+        result.append(spec)
+
+    return result, three_tuple_optional
+
+
+def _autotune_specs(
+    func: triton.runtime.autotuner.Autotuner,
+    target: GPUTarget,
+    specs: list[KernelSpec],
+) -> list[KernelSpec]:
+    tuned_specs: list[KernelSpec] = []
+    for spec in specs:
+        for cfg in func.cache.values():
+            constants = spec.constants.copy()
+            for arg_name, arg_val in cfg.kwargs.items():
+                if arg_name in AUTOTUNE_ATTRs:
+                    continue
+                arg_idx = func.arg_names.index(arg_name)
+                if constants.get(arg_idx, -1) == -1:
+                    constants[arg_idx] = arg_val
+
+            autotune_values: dict[str, int] = {}
+            for name, default in AUTOTUNE_ATTRs.items():
+                if name in cfg.kwargs:
+                    autotune_values[name] = cfg.kwargs[name]
+                else:
+                    autotune_values[name] = getattr(cfg, name, default)
+                # AMD has changed their software pipeliner in Triton
+                # It now expects num_stages == 2 instead of 0
+                # see: https://github.com/pytorch/pytorch/pull/139881
+                # if we see someone try to set num_stages == 0, set it to the default (2) instead
+                # We can't use the Triton hook to get the default value because it requires the AMD runtime to be loaded
+                if (
+                    target.backend == "hip"
+                    and name == "num_stages"
+                    and autotune_values[name] == 0
+                    and TRITON_VERSION >= "3.2.0"
+                ):
+                    autotune_values[name] = 2
+
+            tuned_spec = dataclasses.replace(
+                spec,
+                constants=constants,
+                # pyrefly: ignore [bad-argument-type]
+                **autotune_values,
+            )
+            tuned_specs.append(tuned_spec)
+    return tuned_specs
+
+
+def _dedup_specs(specs: list[KernelSpec]) -> list[KernelSpec]:
+    deduped_specs: list[KernelSpec] = []
+    duplicated_specs: list[KernelSpec] = []
+    hash_spec_ids: set[str] = set()
+    for spec in specs:
+        id = hash_spec(dataclasses.asdict(spec))
+        if id in hash_spec_ids:
+            duplicated_specs.append(spec)
+        else:
+            hash_spec_ids.add(id)
+            deduped_specs.append(spec)
+
+    logger.debug(
+        f"[TritonAOT Dedup] {len(specs)=} {len(deduped_specs)=} {len(duplicated_specs)=}"
+    )
+    return deduped_specs
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/stable_types.py b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/stable_types.py
new file mode 100644
index 000000000..33038a534
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/stable_types.py
@@ -0,0 +1,35 @@
+# pyre-strict
+
+"""AOTT-local type mappings for stable ABI codegen.
+
+These replace ``shared.types.ATYPES`` and ``shared.types.PY_TYPES_TO_CPP_TYPES``
+with versions that have zero link dependency on ATen.  The shared dicts are kept
+unchanged so TritonCC is not affected.
+"""
+
+from typing import Any
+
+# Stable ABI scalar type mapping: Triton pointer dtype → c10::ScalarType enum.
+# Uses c10::ScalarType:: (from torch/headeronly/core/ScalarType.h) instead of
+# at::kFloat aliases (which require ATen headers).
+SCALAR_TYPES: dict[str, str] = {
+    "*i1": "c10::ScalarType::Bool",
+    "*u8": "c10::ScalarType::Byte",
+    "*i8": "c10::ScalarType::Char",
+    "*i16": "c10::ScalarType::Short",
+    "*i32": "c10::ScalarType::Int",
+    "*i64": "c10::ScalarType::Long",
+    "*fp16": "c10::ScalarType::Half",
+    "*fp32": "c10::ScalarType::Float",
+    "*fp64": "c10::ScalarType::Double",
+    "*bf16": "c10::ScalarType::BFloat16",
+    "*fp8e4nv": "c10::ScalarType::Float8_e4m3fn",
+    "*fp8e4b8": "c10::ScalarType::Float8_e4m3fnuz",
+}
+
+# Stable ABI override: str → "std::string" instead of "at::string".
+PY_TYPES_TO_CPP_TYPES: dict[type[Any], str] = {
+    int: "int64_t",
+    str: "std::string",
+    float: "double",
+}
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/triton_aot_compile.py b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/triton_aot_compile.py
new file mode 100644
index 000000000..74179435a
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/triton_aot_compile.py
@@ -0,0 +1,149 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# pyre-strict
+
+import importlib
+import logging
+import os
+import pickle
+from types import ModuleType, TracebackType
+from typing import Any, Callable, Optional, Type
+
+from generative_recommenders.ops.triton_aot.build.extension_builder import (
+    build_triton_aot_extension,
+)
+from generative_recommenders.ops.triton_aot.compile.codegen import (
+    is_non_empty_mapping_of_type,
+)
+from generative_recommenders.ops.triton_aot.compile.compile_state import (
+    get_aott_compile_path,
+    get_aott_compile_state,
+    get_triton_aot_kernel_specs,
+)
+from generative_recommenders.ops.triton_aot.compile.pipeline import compile_to_cpp
+from generative_recommenders.ops.triton_aot.compile.utils import unwrap_heuristic
+from torch import package
+from triton.backends.compiler import GPUTarget
+from triton.runtime import driver, JITFunction
+
+# @manual=//triton:triton
+from triton.runtime.autotuner import Config
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class TritonAOTCompile:
+    """
+    Context manager to compile Triton kernels to C++ and build a shared library.
+    The compiled kernels are cached in a temporary directory.
+
+    - package_importer:
+        torch.package importer for loading kernels source code (aott/ops).
+        If not provided, the default importlib is used (for local use cases)
+    - gpu_target:
+        GPU target to compile for (default: active GPU target, determined by Triton driver)
+    This local copy intentionally omits Manifold autotune-cache overrides. The
+    HSTU e2e path only needs representative-input autotuning captured during
+    the compile context.
+    """
+
+    def __init__(
+        self,
+        package_importer: Optional[package.PackageImporter] = None,
+        gpu_target: Optional[GPUTarget] = None,
+        auto_tune_cache_override_path: Optional[str] = None,
+    ) -> None:
+        self._import_module: Callable[[str], ModuleType] = (
+            package_importer.import_module
+            if package_importer is not None
+            else importlib.import_module
+        )
+        self.gpu_target: GPUTarget = gpu_target or driver.active.get_current_target()
+        self.auto_tune_cache_override_path: Optional[str] = (
+            auto_tune_cache_override_path
+        )
+
+    def _load_autotune_cache_overrides(
+        self,
+    ) -> dict[str, Any]:
+        if self.auto_tune_cache_override_path is None:
+            return {}
+        raise NotImplementedError(
+            "Local generative_recommenders AOT-T compile does not support "
+            "auto_tune_cache_override_path."
+        )
+
+    def __enter__(self) -> None:
+        state = get_aott_compile_state()
+        state.reset()
+        state.enable()
+        logger.info(
+            f"Start AOTT compile, output dir: {get_aott_compile_path()}, gpu_target: {self.gpu_target}"
+        )
+
+    def _resolve_autotune_cache(
+        self,
+        fn: Any,
+        fn_name: str,
+        fn_dir: str,
+        overrides: dict[str, Any],
+    ) -> None:
+        """Apply override (if matched) and dump the autotune cache to fn_dir."""
+        override = overrides.get(fn_name)
+        if override is not None:
+            logger.info(
+                f"[AOTT]: Overriding autotune cache for {fn_name} "
+                f"from {self.auto_tune_cache_override_path}"
+            )
+            fn.cache = override
+
+        # cache are dumped just for testing
+        if hasattr(fn, "cache") and is_non_empty_mapping_of_type(fn.cache, Config):
+            with open(f"{fn_dir}/{fn_name}_autotune_cache", "wb") as data:
+                # @lint-ignore PYTHONPICKLEISBAD
+                pickle.dump(fn.cache, data)
+
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_value: Optional[BaseException],
+        traceback: Optional[TracebackType],
+    ) -> None:
+        compile_path = get_aott_compile_path()
+        if not os.path.exists(compile_path):
+            os.makedirs(compile_path)
+
+        kernel_specs = get_triton_aot_kernel_specs()
+        auto_tune_overrides = self._load_autotune_cache_overrides()
+
+        logger.info(f"[AOTT]: compiling {len(kernel_specs)} kernels")
+
+        for fn, specs in kernel_specs.items():
+            jit_fn = unwrap_heuristic(fn, JITFunction)
+            fn_name = jit_fn.__name__
+
+            logger.info(f"[AOTT]: compiling {fn_name} with specs: {specs}")
+
+            module_suffix = jit_fn.__module__.rsplit(".", 1)[-1]
+            fn_dir = f"{compile_path}/{module_suffix}_{fn_name}"
+            if not os.path.exists(fn_dir):
+                os.makedirs(fn_dir)
+
+            self._resolve_autotune_cache(fn, fn_name, fn_dir, auto_tune_overrides)
+
+            compile_to_cpp(
+                func=fn,
+                base_specs=specs,
+                install_dir=f"{fn_dir}",
+                prefix=f"{fn_name}",
+                gpu_target=self.gpu_target,
+                tuner_fallback=True,
+                import_module=self._import_module,
+            )
+
+            build_triton_aot_extension(
+                source_dir=fn_dir,
+                kernel_name=fn_name,
+                output_dir=fn_dir,
+            )
+
+        get_aott_compile_state().disable()
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/utils.py b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/utils.py
new file mode 100644
index 000000000..e3848fd2e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/utils.py
@@ -0,0 +1,47 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+
+# pyre-strict
+
+import hashlib
+from typing import Any, Type, TypeVar
+
+T = TypeVar("T")
+
+
+def unwrap_heuristic(func: Any, return_type: Type[T]) -> T:
+    while not isinstance(func, return_type):
+        func = func.fn
+        if not hasattr(func, "fn"):
+            # pyre-fixme[7]: Incompatible return type [7]: Expected `Variable[T]` but got `None`.
+            return None
+    return func
+
+
+def is_autotuner(obj: Any) -> bool:
+    """Check whether *obj* is a Triton Autotuner using duck typing.
+
+    In Buck builds the ``Autotuner`` class can be loaded from multiple module
+    paths (e.g. via ``torch.package`` re-imports), causing ``isinstance`` to
+    return ``False`` for genuine Autotuner instances.  We combine a class-name
+    check with duck-typing on the attributes that callers actually need
+    (``cache``, ``configs``, ``arg_names``), making detection robust against
+    module-path aliasing.
+    """
+    return "Autotuner" in type(obj).__name__ and all(
+        hasattr(obj, attr) for attr in ("cache", "configs", "arg_names")
+    )
+
+
+def hash_kernel_name(kernel_name: str) -> str:
+    """Hash kernel name to create shorter, filesystem-safe names.
+
+    Args:
+        kernel_name: Full kernel name (can be very long with specialization suffixes).
+            e.g., "_addmm_fwd_sm80_pfp32_pfp32_pfp32_pfp32_i32_..."
+
+    Returns:
+        Hashed name in format "kernel_<sha256_hex>".
+            e.g., "kernel_a1b2c3d4e5f6..."
+
+    """
+    return "kernel_" + hashlib.sha256(kernel_name.encode("utf-8")).hexdigest()
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/preprocess.py b/recommendation_v4/generative_recommenders/ops/triton_aot/preprocess.py
new file mode 100644
index 000000000..ce2e63d43
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/preprocess.py
@@ -0,0 +1,76 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+# pyre-strict
+
+"""
+Preprocessing utilities for triton_aot models before AOT compilation.
+"""
+
+import logging
+
+from tgif.fx.tgif_tracer import TGIFTracer
+from torch.fx import GraphModule
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+# "aot_triton_kernel_wrapper_" is a pre-defined prefix for
+# AOT-T triton kernel wrapper functions. This is required for
+# AOT-T backend to recognize and trace correctly for ops transformation.
+AOTT_WRAPPER_PREFIX: str = "aot_triton_kernel_wrapper_"
+
+
+def unwrap_aott_wrapper_nodes(fx_m: GraphModule, tracer: TGIFTracer) -> GraphModule:
+    """Mark ``aot_triton_kernel_wrapper_*`` FX nodes as unwrapped and re-trace.
+
+    In the traced FX graph, outer wrapper functions (prefixed with
+    ``aot_triton_kernel_wrapper_``) are ``@torch.fx.wrap`` leaves.
+    Setting ``node.meta["is_wrapped"] = False`` causes a subsequent
+    ``symbolic_trace`` to trace *through* them, exposing the inner
+    ``@torch.fx.wrap`` functions (e.g., ``_triton_aot_grouped_gemm``)
+    that contain the actual kernel calls.
+
+    Any ``_body_transformer`` hook (e.g. one registered by
+    ``early_return_fx_code_transform``) is temporarily removed before
+    re-tracing to avoid injecting un-traceable control flow
+    (``if Proxy: …``) into the generated ``forward``.  After re-trace
+    the hook is restored on the new module.  See P2266562545.
+
+    Args:
+        fx_m: The FX GraphModule to modify **in-place** before re-trace.
+        tracer: Tracer instance used for the re-trace step.
+
+    Returns:
+        The re-traced ``GraphModule`` with AOTT wrappers expanded.
+    """
+    logger.info("Re-trace to get the AOTT node exposed.")
+
+    # Save and clear the body transformer so that re-trace does not hit
+    # ``if Proxy:`` from code-level hooks like early_return_fx_code_transform.
+    saved_body_transformer = fx_m.graph._codegen._body_transformer
+    fx_m.graph._codegen._body_transformer = None
+
+    unwrap_count = 0
+    for node in fx_m.graph.nodes:
+        if node.op == "call_function":
+            target = node.target
+            if hasattr(target, "__name__") and target.__name__.startswith(
+                AOTT_WRAPPER_PREFIX
+            ):
+                logger.info(f"[AOTT] Found inference wrapper node: {node=}")
+                node.meta["is_wrapped"] = False
+                unwrap_count += 1
+
+    if unwrap_count > 0:
+        logger.info(f"[AOTT] Found {unwrap_count} inference wrapper nodes.")
+        fx_m.recompile()
+    else:
+        logger.warning("[AOTT] No inference wrapper node found. Skip re-compile.")
+
+    result = tracer.symbolic_trace(fx_m)
+
+    # Restore the body transformer on the new module.
+    if saved_body_transformer is not None:
+        result.graph._codegen._body_transformer = saved_body_transformer
+        result.recompile()
+
+    return result
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/shared/compat.py b/recommendation_v4/generative_recommenders/ops/triton_aot/shared/compat.py
new file mode 100644
index 000000000..be6235701
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/shared/compat.py
@@ -0,0 +1,91 @@
+# pyre-strict
+"""
+This module provides shared utilities that handle differences between
+Triton versions.
+"""
+
+from typing import Any
+
+# @manual=//triton:triton
+import triton
+from packaging.version import Version
+from triton.runtime.jit import JITFunction
+
+TRITON_VERSION: str = triton.__version__
+
+
+def version_gte(version: str, target: str) -> bool:
+    """
+    Check if version >= target using semantic version comparison.
+    Simple string comparison fails for versions like "3.10" vs "3.5"
+    """
+    return Version(version) >= Version(target)
+
+
+def get_kernel_name(jit_fn: JITFunction[Any]) -> str:
+    """
+    Get the simple kernel name from a JITFunction.
+
+    In Triton 3.5+, JITFunction._fn_name returns the full qualified name
+    (e.g., "generative_recommenders.ops.triton_aot.triton_addmm._addmm_fwd").
+    In older versions, it returns just the simple name (e.g., "_addmm_fwd").
+
+    This function normalizes the behavior to always return the simple name.
+
+    Args:
+        jit_fn: A Triton JITFunction
+
+    Returns:
+        The simple kernel name (e.g., "_addmm_fwd")
+    """
+    fn_name = jit_fn._fn_name
+    if version_gte(TRITON_VERSION, "3.5"):
+        # Triton 3.5+ uses get_full_name(fn) which returns qualified name
+        return fn_name.rsplit(".", 1)[-1]
+    else:
+        # Older versions use fn.__name__ which is already simple
+        return fn_name
+
+
+def get_scratch_parameters(kernel: Any) -> tuple[str, list[str]]:
+    """
+    Get scratch parameter declarations and argument pointers for the kernel launcher.
+
+    Scratch parameters are backend and version-specific features for profiling
+    and global memory management.
+
+    Detection Strategy:
+        1. Check metadata first for each parameter
+        2. Fall back to version-based detection if metadata unavailable
+
+    Version Requirements (fallback):
+        - v3.4+: both global_scratch and profile_scratch
+        - v3.3: only global_scratch
+        - v3.2 and earlier: no scratch parameters
+
+    Args:
+        kernel: Compiled Triton kernel with metadata attribute
+
+    Returns:
+        Tuple of (declarations, arg_pointers):
+        - declarations: C++ variable declarations for scratch parameters
+        - arg_pointers: List of argument pointers to append to kernel args
+    """
+    declarations = []
+    arg_pointers = []
+
+    if hasattr(kernel.metadata, "global_scratch_size"):
+        declarations.append("CUdeviceptr global_scratch = 0;")
+        arg_pointers.append("&global_scratch")
+    elif version_gte(TRITON_VERSION, "3.3"):
+        declarations.append("CUdeviceptr global_scratch = 0;")
+        arg_pointers.append("&global_scratch")
+
+    if hasattr(kernel.metadata, "profile_scratch_size"):
+        declarations.append("CUdeviceptr profile_scratch = 0;")
+        arg_pointers.append("&profile_scratch")
+    elif version_gte(TRITON_VERSION, "3.4"):
+        declarations.append("CUdeviceptr profile_scratch = 0;")
+        arg_pointers.append("&profile_scratch")
+
+    return ("\n            ".join(declarations), arg_pointers)
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/shared/spec_conversion.py b/recommendation_v4/generative_recommenders/ops/triton_aot/shared/spec_conversion.py
new file mode 100644
index 000000000..4a1ebb133
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/shared/spec_conversion.py
@@ -0,0 +1,389 @@
+# pyre-strict
+
+"""Functions for converting kernel specs to architecture-specific formats.
+
+A "spec" (specification) describes how to compile a Triton kernel for a specific
+set of input shapes and types. Users provide "base specs" in a human-friendly
+format that describes kernel arguments:
+
+    {"signature": [("*fp32", 16), ("*bf16", 16), ("i32", None), 128]}
+
+This format encodes dtypes, alignment hints, and constant values together.
+Before compilation, base specs must be converted to "compiled specs" that
+separate this information into distinct fields the compiler understands:
+
+    {"signature": {0: "*fp32", 1: "*bf16"},
+     "constants": {2: None, 3: 128},
+     "configs": (instance_descriptor(...),),
+     "cc": 80}
+
+This module provides the functionality to perform this transformation,
+extracting constraints, identifying constants, and preparing specs for each
+target GPU architecture.
+"""
+
+from collections import namedtuple
+from dataclasses import dataclass
+from typing import Any, TypeAlias
+
+from generative_recommenders.ops.triton_aot.shared.types import CTYPES
+
+# Compile-time constant values that can appear in signatures or be returned
+# by constexpr(). These are values the compiler can fold into generated code.
+ConstantValue: TypeAlias = str | int | float | bool | None
+
+# A single element in a kernel signature list.
+# Can be: dtype string, (dtype, alignment) tuple, (dtype, alignment, has_value)
+# triple for optional args, or a bare literal constant.
+SignatureElement: TypeAlias = (
+    ConstantValue | tuple[str, int | None] | tuple[str, int | None, bool]
+)
+
+
+instance_descriptor = namedtuple(
+    "instance_descriptor",
+    [
+        "divisible_by_16",
+        "equal_to_1",
+        "ids_of_folded_args",
+        "divisible_by_8",
+    ],
+)
+
+
+def constexpr(s: SignatureElement) -> ConstantValue:
+    """Identify compile-time constant expressions in signature elements.
+
+    Args:
+        s: A signature element.
+
+    Returns:
+        The constant value if s is a compile-time constant, None otherwise.
+        Constants are: int, float, bool, or strings that aren't dtype names.
+    """
+    expr = s[0] if isinstance(s, tuple) and len(s) > 1 else s
+
+    if expr is None:
+        return expr
+
+    try:
+        ret = int(expr)
+        return ret
+    except (ValueError, TypeError):
+        pass
+    try:
+        ret = float(expr)
+        return ret
+    except (ValueError, TypeError):
+        pass
+
+    if isinstance(expr, bool):
+        return expr
+    if isinstance(expr, str) and expr not in CTYPES and not expr.startswith("*"):
+        return expr
+    return None
+
+
+@dataclass
+class SignatureConstraints:
+    """Constraints extracted from parsing a kernel signature.
+
+    When compiling a Triton kernel, the compiler can generate more efficient
+    code if it knows certain properties about the arguments:
+
+    - Pointer alignment: If a pointer is always 16-byte aligned, the compiler
+      can use faster aligned memory operations.
+    - Constant values: Arguments known at compile time can be folded into the
+      generated code, eliminating runtime checks.
+    - FP8 dtypes: Some GPU architectures require dtype substitutions for FP8
+      types (e.g., gfx942 needs fp8e4b8 instead of fp8e4nv).
+
+    This dataclass collects all these constraints from a single pass over the
+    signature, so downstream code can use them without re-parsing.
+
+    Attributes:
+        divisible_by_16: Indices of args with values divisible by 16.
+        divisible_by_8: Indices of args with values divisible by 8.
+        equal_to_1: Indices of args with value equal to 1.
+        none_args: Indices of args that are None (not provided).
+        optional_args: Indices of optional arguments.
+        has_fp8: Whether any argument has an FP8 dtype.
+    """
+
+    divisible_by_16: set[int]
+    divisible_by_8: set[int]
+    equal_to_1: set[int]
+    none_args: set[int]
+    optional_args: set[int]
+    has_fp8: bool
+
+
+def collect_constraints(signature: list[SignatureElement]) -> SignatureConstraints:
+    """Collect divisibility and type constraints from a signature list.
+
+    Iterates through signature elements and identifies:
+    - Arguments divisible by 16 or 8 (for memory alignment)
+    - Arguments equal to 1 (for optimization)
+    - Optional arguments and those not provided (None)
+    - Whether any FP8 dtypes are present
+
+    Args:
+        signature: List of signature elements. The input format is unfortunately
+            variable; each element can be one of several types:
+
+            1. Plain string (dtype only, no alignment info):
+               "*fp32"           - A float32 pointer
+               "i32"             - A 32-bit integer scalar
+
+            2. Tuple of (dtype, value) where value indicates alignment or constness:
+               ("*fp32", 16)     - Float32 pointer, 16-byte aligned
+               ("i32", None)     - Integer arg not provided (becomes constant None)
+               ("*bf16", 1)      - Pointer with value=1 (folded as constant)
+
+            3. Triple of (dtype, value, has_value) for optional arguments:
+               ("*fp32", 16, True)  - Optional arg that IS provided, 16-byte aligned
+               ("*fp32", 16, False) - Optional arg NOT provided (becomes None)
+
+            4. Bare literals (become compile-time constants):
+               128               - Integer constant
+               "leaky_relu"      - String constant (e.g., activation name)
+
+    Returns:
+        SignatureConstraints with all constraint sets populated.
+
+    Example:
+        >>> sig = [("*fp32", 16), ("i32", None), ("*fp8e4nv", 8)]
+        >>> c = collect_constraints(sig)
+        >>> 0 in c.divisible_by_16
+        True
+        >>> c.has_fp8
+        True
+    """
+    divisible_by_16: set[int] = set()
+    divisible_by_8: set[int] = set()
+    equal_to_1: set[int] = set()
+    none_args: set[int] = set()
+    optional_args: set[int] = set()
+    has_fp8: bool = False
+
+    for i, s in enumerate(signature):
+        # Handle optional tensor case: tuple with 3 elements where s[2] indicates
+        # whether the optional arg has a value
+        if isinstance(s, tuple) and len(s) > 2:
+            optional_args.add(i)
+            # pyrefly: ignore [bad-index]
+            if not s[2]:  # has_value is False
+                none_args.add(i)
+                continue
+
+        # Extract dtype
+        dtype = s[0] if isinstance(s, tuple) else s
+
+        # Check for FP8 types
+        if isinstance(dtype, str) and ("fp8e4nv" in dtype or "fp8e4b8" in dtype):
+            has_fp8 = True
+
+        # Extract value (alignment or constant)
+        value = s[1] if isinstance(s, tuple) else s
+
+        # Check divisibility and equality constraints
+        if isinstance(value, int):
+            if value % 16 == 0:
+                divisible_by_16.add(i)
+            if value % 8 == 0:
+                divisible_by_8.add(i)
+            if value == 1:
+                equal_to_1.add(i)
+
+        if value is None:
+            none_args.add(i)
+
+    return SignatureConstraints(
+        divisible_by_16=divisible_by_16,
+        divisible_by_8=divisible_by_8,
+        equal_to_1=equal_to_1,
+        none_args=none_args,
+        optional_args=optional_args,
+        has_fp8=has_fp8,
+    )
+
+
+def make_instance_descriptor(
+    constraints: SignatureConstraints,
+) -> tuple[instance_descriptor]:
+    """Create an instance_descriptor tuple from constraints.
+
+    Args:
+        constraints: The collected signature constraints.
+
+    Returns:
+        A tuple containing a single instance_descriptor namedtuple with
+        divisible_by_16, equal_to_1, ids_of_folded_args, and divisible_by_8.
+    """
+    ids_of_folded_args = constraints.equal_to_1 | constraints.none_args
+    return (
+        instance_descriptor(
+            divisible_by_16=constraints.divisible_by_16,
+            equal_to_1=constraints.equal_to_1,
+            ids_of_folded_args=ids_of_folded_args,
+            divisible_by_8=constraints.divisible_by_8,
+        ),
+    )
+
+
+def extract_constants(
+    signature: list[SignatureElement],
+    constraints: SignatureConstraints,
+) -> dict[int, ConstantValue]:
+    """Extract compile-time constant values from signature elements.
+
+    Identifies arguments that can be folded into generated code at compile time.
+    Constants come from three sources:
+
+    1. Bare literals in the signature (e.g., 128 for block size, "leaky_relu"
+       for activation type, True for a boolean flag)
+    2. Arguments with value=1 (tracked in constraints.equal_to_1)
+    3. Arguments not provided (tracked in constraints.none_args)
+
+    Args:
+        signature: List of signature elements in input format.
+        constraints: The collected signature constraints.
+
+    Returns:
+        Dict mapping argument indices to their constant values.
+    """
+    # Use constexpr to identify constant expressions
+    constexprs = {i: constexpr(s) for i, s in enumerate(signature)}
+    constants: dict[int, ConstantValue] = {
+        k: v for k, v in constexprs.items() if v is not None
+    }
+
+    # Add equal_to_1 args with value 1
+    for k in constraints.equal_to_1:
+        constants[k] = 1
+
+    # Add none_args with value None
+    for k in constraints.none_args:
+        constants[k] = None
+
+    return constants
+
+
+def signature_list_to_dict(
+    signature: list[SignatureElement],
+    constants: dict[int, ConstantValue],
+) -> dict[int, str]:
+    """Convert signature from list format to dict format.
+
+    Transforms the input signature list into a dict mapping argument
+    indices to dtype strings. Arguments that are constants are excluded
+    since they don't need runtime type information.
+
+    Args:
+        signature: List of signature elements in input format.
+        constants: Dict of constant argument indices to exclude.
+
+    Returns:
+        Dict mapping non-constant argument indices to their dtype strings.
+    """
+    result: dict[int, str] = {}
+    for i, s in enumerate(signature):
+        if i in constants:
+            continue
+        # After filtering out constants, remaining elements are dtype declarations.
+        # For tuples like ("*fp32", 16), s[0] is the dtype string.
+        # For plain strings like "*fp32", the element itself is the dtype.
+        if isinstance(s, tuple) and len(s) > 1:
+            dtype = s[0]
+        else:
+            dtype = s
+        assert isinstance(dtype, str)
+        result[i] = dtype
+    return result
+
+
+# CC (compute capability) to AMD GPU architecture mapping
+# CC is a 2-digit shorthand: 94 -> gfx942, 95 -> gfx950
+HIP_CC_TO_ARCH_INFO: dict[int, str] = {
+    90: "gfx90a",
+    94: "gfx942",
+    95: "gfx950",
+}
+
+# Reverse mapping: architecture string -> CC string
+HIP_ARCH_TO_CC: dict[str, str] = {v: str(k) for k, v in HIP_CC_TO_ARCH_INFO.items()}
+
+HIP_CC_MI350X: str = "95"  # CC string for gfx950 (MI350X/MI355X)
+
+
+def _normalize_cc(cc: set[str]) -> set[str]:
+    """Normalize CC values to 2-digit format for internal comparison.
+
+    Accepts both tritoncc format ("94", "95") and Triton driver format
+    ("gfx942", "gfx950"). Returns 2-digit CC strings.
+    """
+    return {HIP_ARCH_TO_CC.get(c, c) for c in cc}
+
+
+def get_fp8_replacement_signature_for_amd(
+    spec: dict[str, Any], cc: set[str]
+) -> dict[int, str]:
+    """Replace FP8 dtypes in signature for AMD architectures.
+
+    Args:
+        spec: Compiled spec dict with 'signature' in dict format.
+        cc: Set of CC strings in either format:
+            - 2-digit tritoncc format: {"94"} for gfx942
+            - Triton driver format: {"gfx942"}
+            See HIP_CC_TO_ARCH_INFO.
+
+    Returns:
+        Dict mapping argument indices to dtype strings with FP8 types replaced.
+    """
+    normalized_cc: set[str] = _normalize_cc(cc)
+
+    def replace_fp8_type(dtype_str: str) -> str:
+        if "fp8e4nv" in dtype_str:
+            if HIP_CC_MI350X not in normalized_cc:
+                return dtype_str.replace("fp8e4nv", "fp8e4b8")
+        elif "fp8e4b8" in dtype_str and HIP_CC_MI350X in normalized_cc:
+            return dtype_str.replace("fp8e4b8", "fp8e4nv")
+        return dtype_str
+
+    replace_fp8_signatures: dict[int, str] = {}
+    for key, value in spec["signature"].items():
+        if isinstance(value, str):
+            replace_fp8_signatures[key] = replace_fp8_type(value)
+        else:
+            replace_fp8_signatures[key] = value
+
+    return replace_fp8_signatures
+
+
+def get_fp8_replacement_signature_for_sm80(
+    spec: dict[str, Any],
+) -> dict[int, Any]:
+    """Replace FP8 dtypes with bf16 for SM80 (A100) which lacks native FP8 support.
+
+    Args:
+        spec: Compiled spec dict with 'signature' in dict format.
+
+    Returns:
+        Dict mapping argument indices to dtype strings with FP8 types replaced by bf16.
+    """
+
+    def replace_fp8_type(dtype_str: str) -> str:
+        if "fp8e4nv" in dtype_str:
+            return dtype_str.replace("fp8e4nv", "bf16")
+        return dtype_str
+
+    replace_fp8_signatures: dict[int, Any] = {}
+    for key, value in spec["signature"].items():
+        if isinstance(value, tuple) and isinstance(value[0], str):
+            replace_fp8_signatures[key] = (replace_fp8_type(value[0]), value[1])
+        elif isinstance(value, str):
+            replace_fp8_signatures[key] = replace_fp8_type(value)
+        else:
+            replace_fp8_signatures[key] = value
+
+    return replace_fp8_signatures
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/shared/types.py b/recommendation_v4/generative_recommenders/ops/triton_aot/shared/types.py
new file mode 100644
index 000000000..6a870fc4d
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/shared/types.py
@@ -0,0 +1,58 @@
+# pyre-strict
+
+"""Shared type definitions for AOTT and Triton CC.
+
+This module contains fundamental type mappings used across the compiler.
+"""
+
+from typing import Any
+
+# Mapping from Triton dtype names to C type names
+CTYPES: dict[str, str] = {
+    "i1": "bool",
+    "u8": "uint8_t",
+    "i8": "int8_t",
+    "i16": "int16_t",
+    "i32": "int32_t",
+    "i64": "int64_t",
+    "fp16": "half",
+    "fp32": "float",
+    "fp64": "double",
+    "bf16": "__nv_bfloat16",
+    "fp8e4nv": "__nv_fp8_e4m3",
+    "fp8e4b8": "__hip_fp8_e4m3_fnuz",
+}
+
+# Mapping from Triton pointer dtype names to ATen scalar types
+ATYPES: dict[str, str] = {
+    "*i1": "at::kBool",
+    "*u8": "at::kByte",
+    "*i8": "at::kChar",
+    "*i16": "at::kShort",
+    "*i32": "at::kInt",
+    "*i64": "at::kLong",
+    "*fp16": "at::kHalf",
+    "*fp32": "at::kFloat",
+    "*fp64": "at::kDouble",
+    "*bf16": "at::kBFloat16",
+    "*fp8e4nv": "at::kFloat8_e4m3fn",
+    "*fp8e4b8": "at::kFloat8_e4m3fnuz",
+}
+
+# Mapping from Python types to C++ type names
+PY_TYPES_TO_CPP_TYPES: dict[type[Any], str] = {
+    int: "int64_t",
+    str: "at::string",
+    float: "double",
+}
+
+# Default values for autotuning attributes.
+# These are used as default kernel launch parameters.
+AUTOTUNE_ATTRs: dict[str, int] = {
+    "num_warps": 4,
+    "num_stages": 3,
+    # AMD only
+    "matrix_instr_nonkdim": 0,
+    "waves_per_eu": 1,
+    "kpack": 1,
+}
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/templates/embedded_cubins.cpp b/recommendation_v4/generative_recommenders/ops/triton_aot/templates/embedded_cubins.cpp
new file mode 100644
index 000000000..d269c3555
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/templates/embedded_cubins.cpp
@@ -0,0 +1,7 @@
+#include <cstddef>
+
+extern "C" {
+// __TRITON_AOT_GENERATE_BEGIN__ CUBIN_ARRAYS
+// placeholder
+// __TRITON_AOT_GENERATE_END__ CUBIN_ARRAYS
+}
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/templates/kernel.cpp b/recommendation_v4/generative_recommenders/ops/triton_aot/templates/kernel.cpp
new file mode 100644
index 000000000..7f882f11f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/templates/kernel.cpp
@@ -0,0 +1,104 @@
+// __TRITON_AOT_GENERATE_BEGIN__ HEADER_INCLUDE
+#include "kernel.h"
+// __TRITON_AOT_GENERATE_END__ HEADER_INCLUDE
+// These headers are used by code generated at runtime in KERNEL_SPECS blocks
+#include <torch/csrc/stable/accelerator.h>
+#include <torch/headeronly/core/ScalarType.h> // NOLINT(facebook-unused-include-check)
+
+inline void triton_aot_cu_check(CUresult err, const char* file, int line) {
+  if (err != CUDA_SUCCESS) {
+    const char* err_str;
+    cuGetErrorString(err, &err_str);
+    throw std::runtime_error(
+        std::string(file) + ":" + std::to_string(line) +
+        " CUDA driver error: " + (err_str ? err_str : "unknown"));
+  }
+}
+#define TRITON_AOT_CU_CHECK(EXPR) triton_aot_cu_check(EXPR, __FILE__, __LINE__)
+
+// NOLINTNEXTLINE(facebook-hte-NullableReturn): error path throws
+inline cudaStream_t triton_aot_get_current_stream() {
+  auto device_idx = torch::stable::accelerator::getCurrentDeviceIndex();
+  void* stream_ptr = nullptr;
+  // TODO: No torch::stable op provides the same functionality
+  // today. Revisit if torch exposes a proper stable::accelerator stream API.
+  if (aoti_torch_get_current_cuda_stream(device_idx, &stream_ptr) != 0) {
+    throw std::runtime_error("Failed to get current CUDA stream");
+  }
+  return reinterpret_cast<cudaStream_t>(stream_ptr);
+}
+
+namespace triton {
+namespace aot {
+
+namespace {
+[[maybe_unused]] int compute_capability() {
+  // Cached: AOTT hosts use homogeneous GPUs.
+  static int cc = 0;
+  if (cc == 0) {
+    CUdevice device;
+    TRITON_AOT_CU_CHECK(cuCtxGetDevice(&device));
+    int major, minor;
+    TRITON_AOT_CU_CHECK(cuDeviceGetAttribute(
+        &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
+    TRITON_AOT_CU_CHECK(cuDeviceGetAttribute(
+        &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
+    cc = major * 10 + minor;
+  }
+  return cc;
+}
+} // namespace
+
+namespace {
+#ifdef USE_ROCM
+[[maybe_unused]] void check_errors(int shared, hipFunction_t func) {
+  // HIP doesn't need the same shared memory configuration as CUDA
+  return;
+}
+#else
+[[maybe_unused]] void check_errors(int shared, CUfunction func) {
+  int shared_optin;
+  int device = 0;
+  TRITON_AOT_CU_CHECK(cuDeviceGetAttribute(
+      &shared_optin,
+      CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
+      device));
+  if (shared > 49152 && shared_optin > 49152) {
+    // If requested/shared_optin exceed 48 KB, it switches cache to prefer
+    // shared memory and sets the max dynamic shared memory so the kernel can
+    // allocate the larger amount needed.
+    TRITON_AOT_CU_CHECK(
+        cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_SHARED));
+    int shared_total, shared_static;
+    TRITON_AOT_CU_CHECK(cuDeviceGetAttribute(
+        &shared_total,
+        CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
+        device));
+    TRITON_AOT_CU_CHECK(cuFuncGetAttribute(
+        &shared_static, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, func));
+    TRITON_AOT_CU_CHECK(cuFuncSetAttribute(
+        func,
+        CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
+        shared_optin - shared_static));
+  }
+}
+#endif
+} // namespace
+
+// __TRITON_AOT_GENERATE_BEGIN__ KERNEL_SPECS
+// __TRITON_AOT_GENERATE_END__ KERNEL_SPECS
+
+// __TRITON_AOT_GENERATE_BEGIN__ SELECTOR
+// __TRITON_AOT_GENERATE_END__ SELECTOR
+
+} // namespace aot
+} // namespace triton
+
+// Anchor: keeps the inline `triton_aot_get_current_stream` (and its reference
+// to `aoti_torch_get_current_cuda_stream`) from being dead-stripped at
+// buck-build time, where KERNEL_SPECS is empty. `weak` dedups the symbol
+// across the per-op .so files generated by the runtime template substitution.
+extern "C" __attribute__((weak, visibility("default"))) cudaStream_t
+__triton_aot_anchor_get_stream() {
+  return triton_aot_get_current_stream();
+}
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/templates/kernel.h b/recommendation_v4/generative_recommenders/ops/triton_aot/templates/kernel.h
new file mode 100644
index 000000000..6b6f76e17
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/templates/kernel.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <torch/csrc/stable/tensor.h>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+namespace triton {
+namespace aot {
+
+#ifndef GRID_DIM_DEFINED_MACRO
+struct gridDims {
+  int x = 1;
+  int y = 1;
+  int z = 1;
+  cudaStream_t stream = nullptr;
+  gridDims(int _x = 1, int _y = 1, int _z = 1, cudaStream_t _stream = nullptr)
+      : x(_x), y(_y), z(_z), stream(_stream) {}
+};
+#define GRID_DIM_DEFINED_MACRO
+#endif
+
+#ifndef FITS_I32_DEFINED_MACRO
+constexpr bool fits_i32(int64_t v) {
+  return v >= INT32_MIN && v <= INT32_MAX;
+}
+#define FITS_I32_DEFINED_MACRO
+#endif
+
+// __TRITON_AOT_GENERATE_BEGIN__ TUNER_META_CPP
+// __TRITON_AOT_GENERATE_END__ TUNER_META_CPP
+// __TRITON_AOT_GENERATE_BEGIN__ SELECTOR_PROTO
+// __TRITON_AOT_GENERATE_END__ SELECTOR_PROTO
+
+} // namespace aot
+} // namespace triton
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/templates/template_utils.py b/recommendation_v4/generative_recommenders/ops/triton_aot/templates/template_utils.py
new file mode 100644
index 000000000..d89bbe8b6
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/templates/template_utils.py
@@ -0,0 +1,96 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# pyre-strict
+
+"""
+Common utilities for template loading and rendering.
+
+This module provides functions to load template files from the templates
+directory and render them by replacing marker blocks with actual values.
+"""
+
+import re
+from collections import Counter
+from importlib import resources
+
+
+def load_template(name: str) -> str:
+    """Load template file content from Buck resources.
+
+    Templates are loaded from the resources bundled with this package via Buck's
+    select_accelerator mechanism:
+    - AMD builds: hipified templates with HIP APIs
+    - NVIDIA builds: original templates with CUDA APIs
+
+    Args:
+        name: Template filename (e.g., 'kernel.cpp', 'embedded_cubins.cpp').
+
+    Returns:
+        The template file content as a string.
+    """
+    return resources.files(__package__).joinpath(name).read_text()
+
+
+def render_template(template: str, replacements: dict[str, str]) -> str:
+    """Replace block markers in template with actual values.
+
+    Replaces content between "// __TRITON_AOT_GENERATE_BEGIN__ NAME"
+    and "// __TRITON_AOT_GENERATE_END__ NAME" with the value for key "NAME".
+    Each key must have exactly one BEGIN/END pair in the template.
+    The markers are preserved for easier debugging.
+
+    Args:
+        template: Template string containing marker blocks.
+        replacements: Dict mapping marker names to replacement values.
+
+    Returns:
+        Rendered template with all marker blocks replaced.
+
+    Raises:
+        AssertionError: If markers are duplicated, mismatched, or keys don't match.
+    """
+    BEGIN_PREFIX = "// __TRITON_AOT_GENERATE_BEGIN__ "
+    END_PREFIX = "// __TRITON_AOT_GENERATE_END__ "
+
+    begin_keys = re.findall(r"// __TRITON_AOT_GENERATE_BEGIN__ (\w+)", template)
+    end_keys = re.findall(r"// __TRITON_AOT_GENERATE_END__ (\w+)", template)
+
+    # Check for duplicate keys
+    begin_key_counts = Counter(begin_keys)
+    end_key_counts = Counter(end_keys)
+    for key, count in begin_key_counts.items():
+        assert count == 1, f"Duplicate BEGIN marker for key: {key}"
+    for key, count in end_key_counts.items():
+        assert count == 1, f"Duplicate END marker for key: {key}"
+
+    # Check BEGIN and END keys match
+    template_keys = set(begin_keys)
+    assert template_keys == set(end_keys), (
+        f"Mismatched BEGIN/END markers: BEGIN={template_keys}, END={set(end_keys)}"
+    )
+
+    # Validate keys match between template and replacements
+    replacement_keys = set(replacements.keys())
+    assert template_keys == replacement_keys, (
+        f"Keys mismatch: in template but not in replacements: {template_keys - replacement_keys}, "
+        f"in replacements but not in template: {replacement_keys - template_keys}"
+    )
+
+    # Do the replacements
+    result = template
+    for key, value in replacements.items():
+        begin_marker = f"{BEGIN_PREFIX}{key}"
+        end_marker = f"{END_PREFIX}{key}"
+
+        begin_idx = result.find(begin_marker)
+        newline_idx = result.find("\n", begin_idx)
+        assert newline_idx != -1, (
+            f"BEGIN marker for key '{key}' must be followed by newline"
+        )
+        content_start = newline_idx + 1
+        end_idx = result.find(end_marker, begin_idx)
+
+        result = result[:content_start] + value + result[end_idx:]
+
+    return result
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/templates/torch_op.cpp b/recommendation_v4/generative_recommenders/ops/triton_aot/templates/torch_op.cpp
new file mode 100644
index 000000000..c2e5a4063
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/templates/torch_op.cpp
@@ -0,0 +1,22 @@
+// __TRITON_AOT_GENERATE_BEGIN__ HEADER_INCLUDE
+#include "kernel.h"
+// __TRITON_AOT_GENERATE_END__ HEADER_INCLUDE
+#include <torch/csrc/stable/library.h>
+#include <torch/csrc/stable/tensor.h> // NOLINT(facebook-unused-include-check)
+
+// __TRITON_AOT_GENERATE_BEGIN__ TORCH_OP
+namespace {
+// no-op, force link StableLibrary
+torch::stable::Tensor _triton_aot_placeholder_noop(
+    torch::stable::Tensor input) {
+  return input;
+}
+} // namespace
+
+STABLE_TORCH_LIBRARY_FRAGMENT(triton_aot, m) {
+  m.def("_placeholder_noop(Tensor input) -> Tensor");
+}
+STABLE_TORCH_LIBRARY_IMPL(triton_aot, CPU, m) {
+  m.impl("_placeholder_noop", TORCH_BOX(&_triton_aot_placeholder_noop));
+}
+// __TRITON_AOT_GENERATE_END__ TORCH_OP
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/transform/import_utils.py b/recommendation_v4/generative_recommenders/ops/triton_aot/transform/import_utils.py
new file mode 100644
index 000000000..e3c4f1955
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/transform/import_utils.py
@@ -0,0 +1,89 @@
+# pyre-strict
+
+"""
+Import-header utilities for triton_aot codegen.
+"""
+
+import ast
+
+from torch import package
+
+
+def get_original_import_header(source_code: str) -> str:
+    """Extract all import statements from *source_code* as a single string."""
+    tree = ast.parse(source_code)
+    import_header = ""
+    for node in ast.walk(tree):
+        if isinstance(node, ast.ImportFrom):
+            import_header += ast.unparse(node) + "\n"
+        elif isinstance(node, ast.Import):
+            import_header += ast.unparse(node) + "\n"
+    return import_header
+
+
+def _is_extern_module(module_name: str, extern_modules: set[str]) -> bool:
+    """Return True if *module_name* (or a parent) is in the extern set."""
+    if module_name in extern_modules:
+        return True
+    parts = module_name.split(".")
+    for i in range(1, len(parts)):
+        if ".".join(parts[:i]) in extern_modules:
+            return True
+    return False
+
+
+def rewrite_package_imports(
+    import_header: str,
+    package_importer: package.PackageImporter,
+) -> str:
+    """Rewrite interned imports to use ``_package_importer``.
+
+    Extern modules (``torch``, ``typing``, …) keep regular ``import``
+    statements.  Interned modules (for example, local
+    ``generative_recommenders.*`` modules) are rewritten to::
+
+        _pkg_mod = _package_importer.import_module(
+            'generative_recommenders.ops.triton.triton_utils'
+        )
+        helper = _pkg_mod.helper
+
+    The ``_package_importer`` object is injected into the wrapper module's
+    namespace by ``replace_kernels`` before ``exec_module`` is called.
+    """
+    extern_modules = set(package_importer.extern_modules)
+    header_tree = ast.parse(import_header)
+
+    regular: list[str] = []
+    from_package: list[str] = []
+
+    for node in header_tree.body:
+        if isinstance(node, ast.Import):
+            for alias in node.names:
+                if _is_extern_module(alias.name, extern_modules):
+                    regular.append(ast.unparse(node))
+                else:
+                    local = alias.asname or alias.name
+                    from_package.append(
+                        f"{local} = _package_importer.import_module('{alias.name}')"
+                    )
+        elif isinstance(node, ast.ImportFrom):
+            mod = node.module or ""
+            if _is_extern_module(mod, extern_modules):
+                regular.append(ast.unparse(node))
+            else:
+                var = f"_pkg_{mod.replace('.', '_')}"
+                from_package.append(f"{var} = _package_importer.import_module('{mod}')")
+                for alias in node.names:
+                    local = alias.asname or alias.name
+                    from_package.append(f"{local} = {var}.{alias.name}")
+        else:
+            # Non-import statement (should not appear, but preserve if it does)
+            regular.append(ast.unparse(node))
+
+    parts: list[str] = []
+    if regular:
+        parts.append("\n".join(regular))
+    if from_package:
+        parts.append("# Imports resolved from torch package via _package_importer")
+        parts.append("\n".join(from_package))
+    return "\n".join(parts) + "\n" if parts else ""
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/transform/kernel_wrapper_codegen.py b/recommendation_v4/generative_recommenders/ops/triton_aot/transform/kernel_wrapper_codegen.py
new file mode 100644
index 000000000..51d144f1b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/transform/kernel_wrapper_codegen.py
@@ -0,0 +1,500 @@
+# pyre-strict
+import ast
+import inspect
+import os
+from typing import Any, Callable, Dict, List, Optional
+
+from generative_recommenders.ops.triton_aot.compile.compile_state import (
+    get_aott_compile_path,
+    get_triton_aot_kernel_specs,
+)
+from generative_recommenders.ops.triton_aot.compile.utils import unwrap_heuristic
+from generative_recommenders.ops.triton_aot.shared.compat import get_kernel_name
+from generative_recommenders.ops.triton_aot.transform.import_utils import (
+    get_original_import_header,
+    rewrite_package_imports,
+)
+from generative_recommenders.ops.triton_aot.types import TritonAOT
+from pyre_extensions import none_throws
+from torch import package
+from torch.fx import GraphModule
+
+# @manual=//triton:triton
+from triton.runtime.autotuner import Autotuner
+from triton.runtime.jit import JITFunction, KernelInterface
+
+
+def _is_torch_package_module(module_name: str) -> bool:
+    """Check if a module name is from torch.package namespace."""
+    return module_name.startswith("<torch_package")
+
+
+def _strip_torch_package_prefix(module_name: str) -> str:
+    """Strip the torch.package namespace prefix from a module name.
+
+    Example:
+        '<torch_package_0>.generative_recommenders.ops.triton_aot.triton_layer_norm'
+        -> 'generative_recommenders.ops.triton_aot.triton_layer_norm'
+    """
+    if _is_torch_package_module(module_name):
+        # Remove '<torch_package_N>.' prefix
+        return module_name.split(".", 1)[1]
+    return module_name
+
+
+def _get_clean_module_basename(module_name: str) -> str:
+    """Get the basename of a module, stripping torch.package prefix if present.
+
+    Example:
+        '<torch_package_0>.generative_recommenders.ops.triton_aot.triton_layer_norm'
+        -> 'triton_layer_norm'
+        'generative_recommenders.ops.triton_aot.triton_layer_norm'
+        -> 'triton_layer_norm'
+    """
+    clean_name = _strip_torch_package_prefix(module_name)
+    return clean_name.rsplit(".", 1)[-1]
+
+
+def _extract_function_source(module_source: str, fn_name: str) -> str:
+    """Extract a function's source code from module source.
+
+    Parses the module source and extracts just the function definition.
+    """
+    tree = ast.parse(module_source)
+    for node in ast.walk(tree):
+        if isinstance(node, ast.FunctionDef) and node.name == fn_name:
+            return ast.unparse(node)
+    raise ValueError(f"Function '{fn_name}' not found in module source")
+
+
+def _get_module_and_source(
+    target: Callable[..., Any],
+    package_importer: Optional[package.PackageImporter],
+) -> tuple[Any, str, str]:
+    """Get module, module source, and function source for a callable.
+
+    Handles both regular modules and torch.package loaded modules.
+
+    Args:
+        target: The callable (function) to get source for
+        package_importer: Optional PackageImporter for torch.package modules
+
+    Returns:
+        Tuple of (module, module_source, function_source)
+    """
+    module_name = target.__module__
+    fn_name = target.__name__
+
+    if _is_torch_package_module(module_name) and package_importer is not None:
+        # Handle torch.package namespace
+        real_module_name = _strip_torch_package_prefix(module_name)
+        assert real_module_name.startswith(
+            "generative_recommenders.ops.triton_aot"
+        ) or real_module_name.startswith("prime_perf_optimizer"), (
+            f"Expected module under 'generative_recommenders.ops.triton_aot' or 'prime_perf_optimizer', got: {real_module_name}"
+        )
+
+        # Get module source from package
+        module_source = package_importer.get_source(real_module_name)
+
+        # Import the module through the package importer
+        fn_module = package_importer.import_module(real_module_name)
+
+        # Extract function source from module source
+        fn_source = _extract_function_source(module_source, fn_name)
+
+        return fn_module, module_source, fn_source
+    else:
+        # Standard module handling
+        fn_module = inspect.getmodule(target)
+        module_source = inspect.getsource(none_throws(fn_module))
+        fn_source = inspect.getsource(target)
+
+        return fn_module, module_source, fn_source
+
+
+def _calls_triton_aot_kernel(node: ast.FunctionDef, kernel_name: str) -> bool:
+    """
+    kernel_name is the JIT function name (e.g. "_weighted_layer_norm_fwd"),
+    which may differ from the wrapper function name (e.g.
+    "_triton_aot_swish_layer_norm").  We match by looking for a
+    Subscript-call ``kernel_name[grid](...)`` inside the function body.
+    """
+    for child in ast.walk(node):
+        if (
+            isinstance(child, ast.Call)
+            and isinstance(child.func, ast.Subscript)
+            and isinstance(child.func.value, ast.Name)
+            and child.func.value.id == kernel_name
+        ):
+            return True
+    return False
+
+
+def _is_torch_jit_unused(d: ast.expr) -> bool:
+    """Check if a decorator AST node represents @torch.jit.unused."""
+    return (
+        isinstance(d, ast.Attribute)
+        and d.attr == "unused"
+        and isinstance(d.value, ast.Attribute)
+        and d.value.attr == "jit"
+        and isinstance(d.value.value, ast.Name)
+        and d.value.value.id == "torch"
+    )
+
+
+def strip_jit_unused_decorator(
+    node: ast.FunctionDef, kernel_name: str
+) -> ast.FunctionDef:
+    """Strip @torch.jit.unused if the function body calls ``kernel_name[grid](...)``.
+
+    kernel_name is the TritonAOT kernel's JIT function name (e.g.
+    ``_weighted_layer_norm_fwd``), not the wrapper function name.  This avoids
+    relying on a naming convention on the wrapper function itself.
+    """
+    if _calls_triton_aot_kernel(node, kernel_name):
+        node.decorator_list = [
+            d for d in node.decorator_list if not _is_torch_jit_unused(d)
+        ]
+    return node
+
+
+class TritonAOTOperatorTransform(ast.NodeTransformer):
+    def __init__(self, kernel: Any) -> None:
+        super().__init__()
+        self._kernel: Any = kernel
+        self._kernel_jit_fn: JITFunction[List[Any]] = unwrap_heuristic(
+            kernel, return_type=JITFunction
+        )
+        self._kernel_autotuner: Optional[Autotuner] = unwrap_heuristic(
+            kernel, return_type=Autotuner
+        )
+        self._kernel_name: str = get_kernel_name(self._kernel_jit_fn)
+        # Only transform the function body
+        self._autotune_params: List[str] = (
+            list(list(self._kernel_autotuner.cache.values())[0].kwargs.keys())
+            if self._kernel_autotuner is not None
+            else []
+        )
+        self._autotune_params += ["num_warps", "num_stages"]
+
+        self._lambda_arg_name: Optional[str] = None
+        self._grid_name: Optional[str] = None
+        self._autotune_key_id: Optional[Dict[str, int]] = None
+        self._autotune_key_map: Optional[Dict[str, ast.expr]] = None
+        self._kernel_meta: Optional[ast.Assign] = None
+
+        if self._kernel_autotuner is not None:
+            autotune_key_id: Dict[str, int] = {}
+            self._autotune_key_id = autotune_key_id
+            # pyre-ignore[16]: JITFunction has arg_names at runtime
+            for key in self._kernel_autotuner.keys:
+                autotune_key_id[key] = self._kernel_jit_fn.arg_names.index(key)
+
+    def generate_function_meta(self) -> None:
+        targets = [
+            ast.Name(id=param, ctx=ast.Store()) for param in self._autotune_params
+        ]
+        autotune_key_map = self._autotune_key_map
+        kernel_autotuner = self._kernel_autotuner
+        call = ast.Call(
+            func=ast.Name(id=f"{self._kernel_name}_meta", ctx=ast.Load()),
+            args=[
+                none_throws(autotune_key_map)[key]
+                for key in none_throws(kernel_autotuner).keys
+            ]
+            if kernel_autotuner is not None
+            else [],
+            keywords=[],
+        )
+        self._kernel_meta = ast.Assign(
+            # pyre-ignore[6]: ast.Assign targets type
+            targets=[ast.Tuple(elts=targets, ctx=ast.Store())],
+            value=call,
+        )
+
+    def visit_FunctionDef(self, node: ast.FunctionDef) -> ast.FunctionDef:
+        strip_jit_unused_decorator(node, self._kernel_name)
+
+        new_body: List[ast.stmt] = []
+        stmts = node.body
+        for stmt in stmts:
+            if isinstance(stmt, ast.Assign):
+                for target in stmt.targets:
+                    if isinstance(target, ast.Name) and target.id == self._grid_name:
+                        assert self._kernel_meta is not None
+                        self._kernel_meta.lineno = stmt.lineno
+                        new_body.append(self._kernel_meta)
+            new_body.append(self.visit(stmt))
+        node.body = new_body
+        return node
+
+    def visit_Assign(self, node: ast.Assign) -> ast.Assign:
+        for target in node.targets:
+            if isinstance(target, ast.Name) and isinstance(node.value, ast.Lambda):
+                lambda_node = node.value
+                self._lambda_arg_name = lambda_node.args.args[0].arg
+                lambda_body = lambda_node.body
+                assert isinstance(lambda_body, ast.Tuple)
+                new_elts: List[ast.expr] = []
+                for elt in lambda_body.elts:
+                    new_elts.append(self.visit(elt))
+                node.value = ast.Tuple(elts=new_elts, ctx=ast.Load())
+                self._lambda_arg_name = None
+        return node
+
+    def visit_Subscript(self, node: ast.Subscript) -> ast.expr:
+        if isinstance(node.value, ast.Name) and node.value.id == self._lambda_arg_name:
+            assert isinstance(node.slice, ast.Constant)
+            assert isinstance(node.slice.value, str)
+            var_name = node.slice.value
+            # pyre-ignore
+            node = ast.Name(id=var_name, ctx=ast.Load())
+        return node
+
+    def visit_Expr(self, node: ast.Expr) -> ast.Expr:
+        if isinstance(node.value, ast.Call):
+            call = node.value
+            if (
+                isinstance(call.func, ast.Subscript)
+                and isinstance(call.func.value, ast.Name)
+                and call.func.value.id == self._kernel_name
+            ):
+                grid_arg = call.func.slice
+                new_func = ast.Attribute(
+                    value=ast.Attribute(
+                        value=ast.Attribute(
+                            value=ast.Name(id="torch", ctx=ast.Load()),
+                            attr="ops",
+                            ctx=ast.Load(),
+                        ),
+                        attr="triton_aot",
+                        ctx=ast.Load(),
+                    ),
+                    attr=self._kernel_name,
+                    ctx=ast.Load(),
+                )
+                new_args = [grid_arg] + call.args
+                new_keywords = call.keywords + [
+                    ast.keyword(arg=param, value=ast.Name(id=param, ctx=ast.Load()))
+                    for param in self._autotune_params
+                ]
+                node.value = ast.Call(
+                    func=new_func,
+                    args=new_args,
+                    keywords=new_keywords,
+                )
+        return node
+
+    def contains_triton_call(self, node: ast.AST) -> bool:
+        for child in ast.walk(node):
+            if (
+                isinstance(child, ast.Call)
+                and isinstance(child.func, ast.Subscript)
+                # pyre-ignore[16]: ast.expr may have `id` attribute at runtime
+                and child.func.value.id == self._kernel_name
+            ):
+                # pyrefly: ignore [missing-attribute]
+                self._grid_name = child.func.slice.id
+
+                if self._kernel_autotuner is not None:
+                    autotune_key_map: Dict[str, ast.expr] = {}
+                    self._autotune_key_map = autotune_key_map
+                    # pyre-ignore[16]: Autotuner has keys at runtime
+                    for key in self._kernel_autotuner.keys:
+                        found_key = False
+                        for keyword in child.keywords:
+                            if keyword.arg == key:
+                                autotune_key_map[key] = keyword.value
+                                found_key = True
+                                break
+
+                        if not found_key:
+                            autotune_key_id = self._autotune_key_id
+                            assert autotune_key_id is not None
+                            assert key in autotune_key_id
+                            key_id = autotune_key_id[key]
+                            autotune_key_map[key] = child.args[key_id]
+
+                self.generate_function_meta()
+                return True
+        return False
+
+    def contains_lambda(self, node: ast.AST) -> bool:
+        for child in ast.walk(node):
+            if isinstance(child, ast.Lambda):
+                return True
+        return False
+
+    def _get_grid_name(self, node: ast.AST) -> Optional[str]:
+        for child in ast.walk(node):
+            if (
+                isinstance(child, ast.Call)
+                and isinstance(child.func, ast.Subscript)
+                # pyre-ignore[16]: ast.expr may have `id` attribute at runtime
+                and child.func.value.id == self._kernel_name
+            ):
+                # pyrefly: ignore [missing-attribute]
+                return child.func.slice.id
+        return None
+
+    def generate_so_loading_code(
+        self,
+        node: ast.AST,
+        abs_triton_aot_path: str,
+    ) -> str:
+        """Return auto-generated code to load the compiled kernel at runtime.
+
+        If *node* contains a call to this transformer's kernel, returns
+        ``import importlib.util`` + meta-module loading + ``torch.ops.load_library``
+        code.  Otherwise returns an empty string.
+
+        This method also sets up internal transformer state (grid name,
+        autotune key map, etc.) via ``contains_triton_call`` as a side effect.
+
+        Example for _addmm_fwd kernel:
+            kernel_dir = "triton_addmm__addmm_fwd"
+            meta_module_path = "/path/to/triton_aot_compile/triton_addmm__addmm_fwd/_addmm_fwd_meta.py"
+            so_path = "/path/to/triton_aot_compile/triton_addmm__addmm_fwd/addmm_fwd.so"
+        """
+        if not self.contains_triton_call(node):
+            return ""
+
+        kernel_dir = f"{_get_clean_module_basename(self._kernel_jit_fn.__module__)}_{self._kernel_name}"
+
+        meta_module_path = os.path.join(
+            abs_triton_aot_path, kernel_dir, f"{self._kernel_name}_meta.py"
+        )
+
+        so_path = os.path.join(
+            abs_triton_aot_path,
+            kernel_dir,
+            f"{self._kernel_name.lstrip('_')}.so",
+        )
+
+        return f"""
+# Auto-generated by triton_aot.kernel_wrapper_codegen
+import importlib.util
+_meta_spec = importlib.util.spec_from_file_location("{self._kernel_name}_meta", "{meta_module_path}")
+_meta_module = importlib.util.module_from_spec(_meta_spec)
+_meta_spec.loader.exec_module(_meta_module)
+{self._kernel_name}_meta = _meta_module.{self._kernel_name}_meta
+
+torch.ops.load_library("{so_path}")
+"""
+
+
+def _find_triton_aot_kernel(
+    node_target: Any,
+    kernel_specs: Dict[KernelInterface[List[Any]], List[Dict[str, List[Any]]]],
+) -> Optional[TritonAOT]:
+    """Find the single TritonAOT kernel referenced in a node target's globals.
+
+    Scans ``node_target.__globals__`` for ``TritonAOT`` instances, validates
+    that every instance appears in *kernel_specs*, and asserts at most one
+    kernel is present (per the one-kernel-per-wrapper invariant).
+
+    Returns the kernel, or ``None`` if the function references no kernels.
+    """
+    kernels: set[TritonAOT] = set()
+    for _, var in node_target.__globals__.items():
+        if isinstance(var, TritonAOT):
+            if var.fn in kernel_specs:
+                kernels.add(var)
+            else:
+                raise RuntimeError(
+                    f"Cannot find TritonAOT kernel {var.fn} in TRITON_AOT_KERNEL_SPECS"
+                )
+
+    if len(kernels) == 0:
+        return None
+
+    fn_name = node_target.__name__
+    assert len(kernels) == 1, (
+        f"Expected exactly 1 kernel per wrapper function '{fn_name}', "
+        f"got {len(kernels)}"
+    )
+    (kernel_obj,) = kernels
+    return kernel_obj
+
+
+def _generate_wrapper_files(
+    node_target: Any,
+    kernel: TritonAOT,
+    compile_path: str,
+    package_importer: Optional[package.PackageImporter],
+) -> None:
+    """Generate ``_original.py`` and ``_wrapper.py`` for a single kernel.
+
+    Creates a per-kernel subdirectory under *compile_path*, writes the
+    original function source, then AST-transforms the wrapper to replace
+    ``kernel[grid](...)`` with ``torch.ops.triton_aot.*`` calls.
+    """
+    fn_name = node_target.__name__
+
+    jit_fn = none_throws(
+        unwrap_heuristic(kernel, return_type=JITFunction),
+        f"Failed to unwrap kernel to JITFunction: {kernel}",
+    )
+    kernel_dir = (
+        f"{_get_clean_module_basename(jit_fn.__module__)}_{get_kernel_name(jit_fn)}"
+    )
+    output_dir = os.path.join(compile_path, kernel_dir)
+    os.makedirs(output_dir, exist_ok=True)
+
+    _, module_code, wrapper_code = _get_module_and_source(node_target, package_importer)
+    import_header = get_original_import_header(module_code)
+
+    with open(os.path.join(output_dir, f"{fn_name}_original.py"), "w") as f:
+        f.write(import_header)
+        f.write(wrapper_code)
+
+    # When source comes from a torch package, rewrite interned
+    # imports to use _package_importer, which
+    # is injected by replace_kernels at load time.  Must happen
+    # before auto-generated code is appended (so stdlib imports
+    # like ``import importlib.util`` are not touched).
+    if package_importer is not None:
+        import_header = rewrite_package_imports(import_header, package_importer)
+
+    tree = ast.parse(wrapper_code)
+    transformer = TritonAOTOperatorTransform(kernel=kernel)
+    import_header += transformer.generate_so_loading_code(tree, compile_path)
+    tree = transformer.visit(tree)
+
+    new_source_code = ast.unparse(tree)
+
+    with open(os.path.join(output_dir, f"{fn_name}_wrapper.py"), "w") as f:
+        f.write(import_header)
+        f.write(new_source_code)
+
+
+def kernel_wrapper_codegen(
+    module: GraphModule, packageImporter: package.PackageImporter | None = None
+) -> None:
+    """
+    Generate wrapper files for TritonAOT kernels.
+    Requirement: under wrapper.py, @triton.jit kernel/func is imported without 'as' alias.
+
+    For each function containing TritonAOT kernels, generates:
+    - {fn_name}_original.py: Original source code with imports
+    - {fn_name}_wrapper.py: Transformed wrapper that uses torch.ops.triton_aot
+    """
+    compile_path = get_aott_compile_path()
+    if not os.path.exists(compile_path):
+        os.makedirs(compile_path)
+
+    transformed_ops: set[Callable[..., Any]] = set()
+    kernel_specs = get_triton_aot_kernel_specs()
+    for node in module.graph.nodes:
+        if node.op == "call_function" and hasattr(node.target, "__globals__"):
+            if node.target not in transformed_ops:
+                transformed_ops.add(node.target)
+            else:
+                continue
+
+            kernel = _find_triton_aot_kernel(node.target, kernel_specs)
+            if kernel is not None:
+                _generate_wrapper_files(
+                    node.target, kernel, compile_path, packageImporter
+                )
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/transform/replace_kernels.py b/recommendation_v4/generative_recommenders/ops/triton_aot/transform/replace_kernels.py
new file mode 100644
index 000000000..53e5d83e9
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/transform/replace_kernels.py
@@ -0,0 +1,137 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files
+# (the "Software"), to deal in the Software without restriction,
+# including without limitation the rights to use, copy, modify, merge,
+# publish, distribute, sublicense, and/or sell copies of the Software,
+# and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import importlib.util
+import logging
+import os
+import sys
+from typing import Any, Dict, Optional
+
+from generative_recommenders.ops.triton_aot.compile.compile_state import (
+    get_aott_compile_path,
+)
+from torch import package
+from torch.fx import GraphModule
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+def _find_wrapper_files(
+    compile_path: str,
+) -> list[tuple[str, str, str]]:
+    """Find all ``*_wrapper.py`` files under *compile_path*.
+
+    Walks one level deep into kernel subdirectories and returns a list of
+    ``(wrapper_name, fn_name, wrapper_path)`` tuples.
+    """
+    results: list[tuple[str, str, str]] = []
+    for dirpath, dirnames, filenames in os.walk(compile_path):
+        if dirpath != compile_path:
+            dirnames.clear()  # only recurse one level into kernel subdirs
+        for item in filenames:
+            if item.endswith("_wrapper.py"):
+                wrapper_name = item.removesuffix(".py")
+                fn_name = wrapper_name.removesuffix("_wrapper")
+                wrapper_path = os.path.join(dirpath, item)
+                results.append((wrapper_name, fn_name, wrapper_path))
+    return results
+
+
+def _load_wrapper_module(
+    wrapper_name: str,
+    fn_name: str,
+    wrapper_path: str,
+    package_importer: Optional[package.PackageImporter],
+) -> Optional[Any]:
+    """Dynamically import a single ``*_wrapper.py`` and return its wrapper callable.
+
+    Returns ``None`` if the module does not expose a function named *fn_name*.
+    """
+    spec = importlib.util.spec_from_file_location(wrapper_name, wrapper_path)
+    assert spec is not None, f"Failed to create spec for {wrapper_path}"
+    assert spec.loader is not None, f"Spec has no loader for {wrapper_path}"
+
+    loader = spec.loader
+    wrapper_module = importlib.util.module_from_spec(spec)
+
+    sys.modules[wrapper_name] = wrapper_module
+
+    if package_importer is not None:
+        wrapper_module._package_importer = package_importer  # type: ignore[attr-defined]
+
+    loader.exec_module(wrapper_module)
+
+    if hasattr(wrapper_module, fn_name):
+        return getattr(wrapper_module, fn_name)
+    return None
+
+
+def replace_kernels(
+    fx_m: GraphModule,
+    eager: bool = False,
+    package_importer: Optional[package.PackageImporter] = None,
+) -> GraphModule:
+    if eager:
+        raise NotImplementedError(
+            "Local generative_recommenders AOT-T transform does not support "
+            "eager replacement."
+        )
+
+    compile_path = get_aott_compile_path()
+    assert os.path.exists(compile_path), "triton_aot_compile dir does not exist"
+
+    wrapper_dict: Dict[str, Any] = {}
+    for wrapper_name, fn_name, wrapper_path in _find_wrapper_files(compile_path):
+        wrapper_fn = _load_wrapper_module(
+            wrapper_name, fn_name, wrapper_path, package_importer
+        )
+        if wrapper_fn is not None:
+            wrapper_dict[fn_name] = wrapper_fn
+
+    logger.info(f"replace_kernels: {wrapper_dict=}")
+
+    # Phase 2: Replace FX graph nodes
+    #   Walk the FX graph, find call_function nodes whose target name
+    #   matches a loaded wrapper, and swap the target so that
+    #   kernel[grid](...) calls become torch.ops.triton_aot.* calls.
+    replaced_count = 0
+    for nodes in fx_m.graph.nodes:
+        if nodes.op == "call_function" and nodes.target.__name__ in wrapper_dict.keys():
+            logger.info(
+                f"Replaced node: {nodes.op} {nodes.target} -> {wrapper_dict[nodes.target.__name__]} {nodes.meta}"
+            )
+            nodes.target = wrapper_dict[nodes.target.__name__]
+            replaced_count += 1
+
+    assert replaced_count > 0, (
+        f"No ops were replaced with triton_aot wrappers. "
+        f"wrapper_dict={wrapper_dict}, compile_path={compile_path}"
+    )
+    logger.info(
+        f"Successfully replaced {replaced_count} op(s) with triton_aot wrappers."
+    )
+
+    fx_m.recompile()
+    return fx_m
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/transform/transform_kernels.py b/recommendation_v4/generative_recommenders/ops/triton_aot/transform/transform_kernels.py
new file mode 100644
index 000000000..c2be78989
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/transform/transform_kernels.py
@@ -0,0 +1,29 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# pyre-strict
+
+from typing import Optional
+
+from generative_recommenders.ops.triton_aot.transform.kernel_wrapper_codegen import (
+    kernel_wrapper_codegen,
+)
+from generative_recommenders.ops.triton_aot.transform.replace_kernels import (
+    replace_kernels,
+)
+from torch import package
+from torch.fx import GraphModule
+
+
+def transform_kernels(
+    fx_m: GraphModule,
+    eager: bool = False,
+    package_importer: Optional[package.PackageImporter] = None,
+) -> GraphModule:
+    """Generate AOT wrappers and replace FX graph nodes in one step.
+
+    1. kernel_wrapper_codegen: AST-transforms wrapper functions,
+       rewrites kernel[grid](...) -> torch.ops.triton_aot.kernel(...),
+       writes {fn}_wrapper.py
+    2. replace_kernels: loads wrappers and replaces graph node targets
+    """
+    kernel_wrapper_codegen(fx_m, package_importer)
+    return replace_kernels(fx_m, eager=eager, package_importer=package_importer)
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_addmm.py b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_addmm.py
new file mode 100644
index 000000000..b71ec8144
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_addmm.py
@@ -0,0 +1,347 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files
+# (the "Software"), to deal in the Software without restriction,
+# including without limitation the rights to use, copy, modify, merge,
+# publish, distribute, sublicense, and/or sell copies of the Software,
+# and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+# pyre-strict
+# pyre-ignore-all-errors[2]: Triton has its own type system on func's input
+
+#!/usr/bin/env python3
+
+
+from typing import Any, List, Tuple
+
+import torch
+
+# @manual=//triton:triton
+import triton
+
+# @manual=//triton:triton
+import triton.language as tl
+from generative_recommenders.common import (
+    BACKEND_ALLOW_TF32,
+    cdiv,
+    should_trigger_eager_impl,
+)
+from generative_recommenders.ops.triton_aot.types import triton_aot
+
+
+def get_mm_configs() -> List[triton.Config]:
+    return [
+        triton.Config(
+            {
+                "BLOCK_M": 32,
+                "BLOCK_N": 64,
+                "BLOCK_K": 32,
+                "GROUP_M": 8,
+            },
+            num_stages=5,
+            num_warps=2,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 128,
+                "BLOCK_N": 256,
+                "BLOCK_K": 64,
+                "GROUP_M": 8,
+            },
+            num_stages=3,
+            num_warps=8,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 64,
+                "BLOCK_N": 256,
+                "BLOCK_K": 32,
+                "GROUP_M": 8,
+            },
+            num_stages=4,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 128,
+                "BLOCK_N": 128,
+                "BLOCK_K": 32,
+                "GROUP_M": 8,
+            },
+            num_stages=4,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 128,
+                "BLOCK_N": 64,
+                "BLOCK_K": 32,
+                "GROUP_M": 8,
+            },
+            num_stages=4,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 64,
+                "BLOCK_N": 128,
+                "BLOCK_K": 32,
+                "GROUP_M": 8,
+            },
+            num_stages=4,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 128,
+                "BLOCK_N": 32,
+                "BLOCK_K": 32,
+                "GROUP_M": 8,
+            },
+            num_stages=4,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 64,
+                "BLOCK_N": 32,
+                "BLOCK_K": 32,
+                "GROUP_M": 8,
+            },
+            num_stages=5,
+            num_warps=2,
+        ),
+    ]
+
+
+@triton_aot(
+    annotations={
+        "M": "i32",
+        "N": ("i32", 16),
+        "K": ("i32", 16),
+        "stride_xm": ("i32", 16),
+        "stride_xk": ("i32", 1),
+        "stride_wk": ("i32", 16),
+        "stride_wn": ("i32", 1),
+        "stride_ym": ("i32", 16),
+        "stride_yn": ("i32", 1),
+        "stride_zm": ("i32", 16),
+        "stride_zn": ("i32", 1),
+    },
+)
+# pyre-ignore[56]: Pyre cannot infer triton.autotune decorator type
+@triton.autotune(
+    configs=get_mm_configs(),
+    key=["N", "K"],
+)
+@triton.jit
+def _addmm_fwd(
+    x_ptr,
+    w_ptr,
+    y_ptr,
+    z_ptr,
+    M,
+    N,
+    K,
+    stride_xm,
+    stride_xk,
+    stride_wk,
+    stride_wn,
+    stride_ym,
+    stride_yn,
+    stride_zm,
+    stride_zn,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_M: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BROADCAST_Y: tl.constexpr,
+) -> None:
+    pid_0, pid_1 = tl.program_id(axis=0), tl.program_id(axis=1)
+    pid = pid_0 * tl.num_programs(axis=1) + pid_1
+    num_pid_m = tl.cdiv(M, BLOCK_M)
+    num_pid_n = tl.cdiv(N, BLOCK_N)
+    num_pid_in_group = GROUP_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_M)
+    pid_m = first_pid_m + (pid % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    offs_m = tl.arange(0, BLOCK_M)
+    offs_k = tl.arange(0, BLOCK_K)
+    offs_n = tl.arange(0, BLOCK_N)
+    mask_m = (pid_m * BLOCK_M + offs_m)[:, None] < M
+    mask_n = (pid_n * BLOCK_N + offs_n)[None, :] < N
+    x_ptr += pid_m.to(tl.int64) * BLOCK_M * stride_xm
+    x_ptrs = x_ptr + (offs_m[:, None] * stride_xm + offs_k[None, :] * stride_xk)
+    w_ptr += pid_n.to(tl.int64) * BLOCK_N * stride_wn
+    w_ptrs = w_ptr + (offs_k[:, None] * stride_wk + offs_n[None, :] * stride_wn)
+    accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_K)):
+        mask_k = offs_k[None, :] < K - k * BLOCK_K
+        x = tl.load(x_ptrs, mask=mask_k & mask_m, other=0.0)
+        mask_k = offs_k[:, None] < K - k * BLOCK_K
+        w = tl.load(w_ptrs, mask=mask_k & mask_n, other=0.0)
+        accumulator += tl.dot(x, w, allow_tf32=ALLOW_TF32)
+        x_ptrs += BLOCK_K * stride_xk
+        w_ptrs += BLOCK_K * stride_wk
+
+    z_mask = mask_m & mask_n
+    if BROADCAST_Y:
+        # y is a vector, broadcast to add to z
+        y_ptr += pid_n.to(tl.int64) * BLOCK_N * stride_yn
+        y_ptrs = y_ptr + stride_yn * offs_n[None, :]
+        y = tl.load(y_ptrs, mask=mask_n)
+    else:
+        y_ptr += pid_m.to(tl.int64) * BLOCK_M * stride_ym
+        y_ptr += pid_n.to(tl.int64) * BLOCK_N * stride_yn
+        y_ptrs = y_ptr + stride_ym * offs_m[:, None] + stride_yn * offs_n[None, :]
+        y = tl.load(y_ptrs, mask=z_mask)
+    z = (accumulator + y.to(tl.float32)).to(z_ptr.dtype.element_ty)
+    z_ptr += pid_m.to(tl.int64) * BLOCK_M * stride_zm
+    z_ptr += pid_n.to(tl.int64) * BLOCK_N * stride_zn
+    z_ptrs = z_ptr + stride_zm * offs_m[:, None] + stride_zn * offs_n[None, :]
+    tl.store(z_ptrs, z, mask=z_mask)
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def _triton_aot_addmm_fwd(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    y: torch.Tensor,
+    allow_tf32: bool = BACKEND_ALLOW_TF32,
+) -> torch.Tensor:
+    M, K = x.shape
+    KB, N = w.shape
+    assert K == KB, f"incompatible dimensions {K}, {KB}"
+
+    is_y_1d = y.dim() == 1
+    NY = y.shape[0] if is_y_1d else y.shape[1]
+    assert N == NY, f"incompatible dimensions {N}, {NY}"
+
+    # Allocate output
+    z = torch.empty((M, N), device=x.device, dtype=x.dtype)
+    if M == 0 or N == 0:
+        return z
+
+    grid = lambda meta: (  # noqa E731
+        cdiv(M, meta["BLOCK_M"]),
+        cdiv(N, meta["BLOCK_N"]),
+    )
+
+    _addmm_fwd[grid](
+        x,
+        w,
+        y,
+        z,
+        M,
+        N,
+        K,
+        x.stride(0),
+        x.stride(1),
+        w.stride(0),
+        w.stride(1),
+        y.stride(0) if not is_y_1d else 0,
+        y.stride(1) if not is_y_1d else y.stride(0),
+        z.stride(0),
+        z.stride(1),
+        ALLOW_TF32=allow_tf32,
+        BROADCAST_Y=is_y_1d,
+    )
+    return z
+
+
+def _triton_aot_addmm_fwd_eager(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    y: torch.Tensor,
+) -> torch.Tensor:
+    return torch.addmm(y, x, w)
+
+
+@torch.fx.wrap
+def _triton_aot_addmm_fwd_maybe_eager(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    y: torch.Tensor,
+) -> torch.Tensor:
+    if torch.jit.is_scripting():
+        # call eager
+        return torch.addmm(y, x, w)
+    else:
+        return _triton_aot_addmm_fwd(x, w, y)
+
+
+def triton_addmm_bwd(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    dz: torch.Tensor,
+    is_y_1d: bool,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    if is_y_1d:
+        dy = torch.sum(dz, dim=0)
+    else:
+        dy = dz
+    dw = torch.mm(x.t(), dz)
+    dx = torch.mm(dz, w.t())
+
+    return dx, dw, dy
+
+
+class _AddMmFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]: autograd.Function signature override
+    def forward(
+        ctx: Any,
+        x: torch.Tensor,
+        w: torch.Tensor,
+        y: torch.Tensor,
+    ) -> torch.Tensor:
+        ctx.save_for_backward(x, w)
+        ctx.is_y_1d = y.dim() == 1
+        return _triton_aot_addmm_fwd(x, w, y)
+
+    @staticmethod
+    # pyre-ignore[14]: autograd.Function signature override
+    def backward(
+        ctx: Any, dz: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        (x, w) = ctx.saved_tensors
+        return triton_addmm_bwd(x, w, dz, ctx.is_y_1d)
+
+
+def triton_addmm(
+    input: torch.Tensor,
+    mat1: torch.Tensor,
+    mat2: torch.Tensor,
+) -> torch.Tensor:
+    return _AddMmFunction.apply(mat1, mat2, input)
+
+
+@torch.fx.wrap
+def aot_triton_kernel_wrapper_addmm(
+    input: torch.Tensor,
+    mat1: torch.Tensor,
+    mat2: torch.Tensor,
+    allow_tf32: bool = BACKEND_ALLOW_TF32,
+) -> torch.Tensor:
+    if should_trigger_eager_impl():
+        return torch.addmm(input, mat1, mat2)
+    else:
+        return _triton_aot_addmm_fwd(mat1, mat2, input, allow_tf32=allow_tf32)
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_concat_2d_jagged.py b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_concat_2d_jagged.py
new file mode 100644
index 000000000..8afc3c18e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_concat_2d_jagged.py
@@ -0,0 +1,183 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files
+# (the "Software"), to deal in the Software without restriction,
+# including without limitation the rights to use, copy, modify, merge,
+# publish, distribute, sublicense, and/or sell copies of the Software,
+# and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+# pyre-strict
+
+from typing import Optional
+
+import torch
+from generative_recommenders.common import (
+    fx_unwrap_optional_tensor,
+    next_power_of_2,
+    should_trigger_eager_impl,
+)
+from generative_recommenders.ops.pytorch.pt_jagged import (
+    pytorch_replace_last_n_with_jagged,
+)
+from generative_recommenders.ops.pytorch.pt_jagged_tensors import (
+    pytorch_concat_2D_jagged,
+)
+from generative_recommenders.ops.triton.triton_jagged import concat_2D_jagged
+from generative_recommenders.ops.triton_aot.types import triton_aot
+
+
+concat_2D_jagged = triton_aot(
+    annotations={
+        "DenseSize": "i32",
+        "D": "i32",
+        "stride_ad": "i32",
+        "stride_bd": "i32",
+        "stride_dense_batch": "i32",
+        "stride_od": "i32",
+    },
+    # pyrefly: ignore [bad-argument-type]
+)(concat_2D_jagged)
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def _triton_aot_concat_2D_jagged(
+    max_seq_len: int,
+    values_a: torch.Tensor,
+    values_b: torch.Tensor,
+    offsets_a: Optional[torch.Tensor] = None,
+    offsets_b: Optional[torch.Tensor] = None,
+    is_replace: bool = False,
+) -> torch.Tensor:
+    is_dense_a = offsets_a is None
+    is_dense_b = offsets_b is None
+
+    dense_size: int = 0
+    if is_dense_a:
+        B, dense_size, D = values_a.size()
+        offsets_b = fx_unwrap_optional_tensor(offsets_b)
+        jagged_seq_len, _ = values_b.shape
+        values_out = torch.empty(
+            (dense_size * B + jagged_seq_len, D),
+            device=values_b.device,
+            dtype=values_b.dtype,
+        )
+        offsets_a = offsets_b.new_empty(0)
+        stride_dense_batch = values_a.stride(0)
+    elif is_dense_b:
+        B, dense_size, D = values_b.size()
+        offsets_a = fx_unwrap_optional_tensor(offsets_a)
+        jagged_seq_len, _ = values_a.shape
+        values_out = torch.empty(
+            (jagged_seq_len + dense_size * B, D),
+            device=values_a.device,
+            dtype=values_a.dtype,
+        )
+        offsets_b = offsets_a.new_empty(0)
+        stride_dense_batch = values_b.stride(0)
+    else:
+        offsets_a = fx_unwrap_optional_tensor(offsets_a)
+        offsets_b = fx_unwrap_optional_tensor(offsets_b)
+        B = offsets_a.size(0) - 1
+        seq_len_a, D = values_a.shape
+        seq_len_b, _ = values_b.shape
+        if is_replace:
+            values_out = torch.empty_like(values_a)
+        else:
+            values_out = torch.empty(
+                (seq_len_a + seq_len_b, D), device=values_a.device, dtype=values_a.dtype
+            )
+        stride_dense_batch = 0
+
+    # Make sure offsets are alignted on 16-byte to match AOTT spec
+    if (
+        offsets_a is not None
+        and (offsets_a.storage_offset() * offsets_a.element_size()) % 16 != 0
+    ):
+        offsets_a = offsets_a.clone()
+    if (
+        offsets_b is not None
+        and (offsets_b.storage_offset() * offsets_b.element_size()) % 16 != 0
+    ):
+        offsets_b = offsets_b.clone()
+
+    BLOCK_D = next_power_of_2(D)
+
+    grid = (max_seq_len, B)
+    # pyrefly: ignore [not-callable]
+    concat_2D_jagged[grid](
+        OffsetsA=offsets_a,
+        ValuesA=values_a,
+        OffsetsB=offsets_b,
+        ValuesB=values_b,
+        DenseSize=dense_size,
+        Out=values_out,
+        D=D,
+        stride_ad=(values_a.stride(1) if is_dense_a else values_a.stride(0)),
+        stride_bd=(values_b.stride(1) if is_dense_b else values_b.stride(0)),
+        stride_dense_batch=stride_dense_batch,
+        stride_od=values_out.stride(0),
+        # pyrefly: ignore [bad-argument-type]
+        IS_DENSE_A=is_dense_a,
+        # pyrefly: ignore [bad-argument-type]
+        IS_DENSE_B=is_dense_b,
+        # pyrefly: ignore [bad-argument-type]
+        BLOCK_D=BLOCK_D,
+        # pyrefly: ignore [bad-argument-type]
+        IS_REPLACE=is_replace,
+    )
+    return values_out
+
+
+@torch.fx.wrap
+# "aot_triton_kernel_wrapper_" is a pre-defined prefix for
+# AOT-T triton kernel wrapper functions. This is required for
+# AOT-T backend to recognize and trace correctly for ops transformation.
+def aot_triton_kernel_wrapper_concat_2D_jagged(
+    max_seq_len: int,
+    values_a: torch.Tensor,
+    values_b: torch.Tensor,
+    offsets_a: Optional[torch.Tensor] = None,
+    offsets_b: Optional[torch.Tensor] = None,
+    is_replace: bool = False,
+) -> torch.Tensor:
+    if should_trigger_eager_impl():
+        if is_replace:
+            assert offsets_a is not None and offsets_b is not None
+            return pytorch_replace_last_n_with_jagged(
+                max_seq_len_left=max_seq_len,
+                offsets_left=offsets_a,
+                values_left=values_a,
+                offsets_right=offsets_b,
+                values_right=values_b,
+            )
+        return pytorch_concat_2D_jagged(
+            values_left=values_a,
+            values_right=values_b,
+            max_len_left=max_seq_len if offsets_a is None else None,
+            max_len_right=max_seq_len if offsets_b is None else None,
+            offsets_left=offsets_a,
+            offsets_right=offsets_b,
+        )
+    else:
+        return _triton_aot_concat_2D_jagged(
+            max_seq_len=max_seq_len,
+            values_a=values_a,
+            values_b=values_b,
+            offsets_a=offsets_a,
+            offsets_b=offsets_b,
+            is_replace=is_replace,
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_group_norm_mul_dropout.py b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_group_norm_mul_dropout.py
new file mode 100644
index 000000000..15c609a3c
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_group_norm_mul_dropout.py
@@ -0,0 +1,124 @@
+# pyre-strict
+
+import torch
+from generative_recommenders.common import next_power_of_2, should_trigger_eager_impl
+from generative_recommenders.ops.pytorch.pt_hstu_linear import pytorch_norm_mul_dropout
+from generative_recommenders.ops.triton.triton_hstu_linear import (
+    _group_norm_mul_dropout_fwd,
+)
+from generative_recommenders.ops.triton_aot.types import triton_aot
+
+_group_norm_mul_dropout_fwd = triton_aot(
+    annotations={
+        "D": ("i32", 16),
+        "eps": "fp32",
+        "seed": "i64",
+        "dropout_ratio": "fp32",
+        "stride_x": ("i32", 16),
+        "stride_u": ("i32", 16),
+        "stride_y": ("i32", 16),
+    },
+    # pyrefly: ignore [bad-argument-type]
+)(_group_norm_mul_dropout_fwd)
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def _triton_aot_group_norm_mul_dropout(
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    silu_u: bool,
+    concat_ux: bool,
+    num_heads: int,
+    linear_dim: int,
+) -> torch.Tensor:
+    x = x.contiguous()
+    u = u.contiguous()
+    N, _ = x.shape
+    if concat_ux:
+        y = torch.empty((N, 3 * num_heads * linear_dim), dtype=x.dtype, device=x.device)
+    else:
+        y = torch.empty((N, num_heads * linear_dim), dtype=x.dtype, device=x.device)
+    mean = torch.empty((N * num_heads,), dtype=x.dtype, device=x.device)
+    rstd = torch.empty((N * num_heads,), dtype=x.dtype, device=x.device)
+
+    BLOCK_D = next_power_of_2(linear_dim)
+    BLOCK_H = next_power_of_2(num_heads)
+
+    seed = 0
+    dropout_ratio = 0.0
+
+    grid = (N,)
+    # pyrefly: ignore [not-callable]
+    _group_norm_mul_dropout_fwd[grid](
+        x,  # X
+        u,  # U
+        y,  # Y
+        weight,  # W
+        bias,  # B
+        mean,  # Mean
+        rstd,  # Rstd
+        linear_dim,  # D
+        num_heads,  # Heads
+        eps,  # eps
+        seed,  # seed
+        dropout_ratio,  # dropout_ratio
+        x.stride(0),  # stride_x
+        u.stride(0),  # stride_u
+        y.stride(0),  # stride_y
+        # pyrefly: ignore [bad-argument-type]
+        SILU_U=silu_u,
+        # pyrefly: ignore [bad-argument-type]
+        BLOCK_D=BLOCK_D,
+        # pyrefly: ignore [bad-argument-type]
+        BLOCK_H=BLOCK_H,
+        # pyrefly: ignore [bad-argument-type]
+        TRAINING=False,
+        # pyrefly: ignore [bad-argument-type]
+        CONCAT_UX=concat_ux,
+    )
+    return y
+
+
+@torch.fx.wrap
+def aot_triton_kernel_wrapper_group_norm_mul_dropout(
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    silu_u: bool,
+    concat_ux: bool,
+    num_heads: int,
+    linear_dim: int,
+) -> torch.Tensor:
+    if should_trigger_eager_impl():
+        return pytorch_norm_mul_dropout(
+            x=x,
+            u=u,
+            weight=weight,
+            bias=bias,
+            eps=eps,
+            dropout_ratio=0.0,
+            training=False,
+            silu_u=silu_u,
+            concat_u=concat_ux,
+            concat_x=concat_ux,
+            group_norm=True,
+            num_heads=num_heads,
+            linear_dim=linear_dim,
+        )
+    return _triton_aot_group_norm_mul_dropout(
+        x=x,
+        u=u,
+        weight=weight,
+        bias=bias,
+        eps=eps,
+        silu_u=silu_u,
+        concat_ux=concat_ux,
+        num_heads=num_heads,
+        linear_dim=linear_dim,
+    )
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_layer_norm.py b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_layer_norm.py
new file mode 100644
index 000000000..c5339033f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_layer_norm.py
@@ -0,0 +1,119 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files
+# (the "Software"), to deal in the Software without restriction,
+# including without limitation the rights to use, copy, modify, merge,
+# publish, distribute, sublicense, and/or sell copies of the Software,
+# and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+# pyre-strict
+
+#!/usr/bin/env python3
+
+import torch
+from generative_recommenders.common import (
+    cdiv,
+    next_power_of_2,
+    should_trigger_eager_impl,
+    switch_to_contiguous_if_needed,
+)
+from generative_recommenders.ops.pytorch.pt_layer_norm import (
+    pytorch_layer_norm,
+    pytorch_swish_layer_norm,
+)
+from generative_recommenders.ops.triton.triton_layer_norm import (
+    _weighted_layer_norm_fwd,
+)
+from generative_recommenders.ops.triton_aot.types import triton_aot
+
+
+_weighted_layer_norm_fwd = triton_aot(
+    annotations={
+        "N": "i32",
+        "D": ("i32", 16),
+        "stride_x": ("i32", 16),
+        "stride_y": ("i32", 16),
+    },
+)(_weighted_layer_norm_fwd)
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def _triton_aot_swish_layer_norm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    is_swish: bool,
+) -> torch.Tensor:
+    assert x.dim() == 2, f"x.dim() == {x.dim()}, expected 2"
+    x = switch_to_contiguous_if_needed(x)
+    N, D = x.shape
+
+    assert weight.dim() == 1
+    assert bias.dim() == 1
+    assert weight.numel() == D
+    assert bias.numel() == D
+
+    y = torch.empty_like(x)
+
+    BLOCK_D = next_power_of_2(D)
+
+    grid = lambda meta: (  # noqa E731
+        cdiv(N, meta["BLOCK_N"]),
+    )
+    # pyrefly: ignore [not-callable]
+    _weighted_layer_norm_fwd[grid](
+        x,
+        y,
+        weight,
+        bias,
+        torch.empty(0, dtype=torch.float32),
+        torch.empty(0, dtype=torch.float32),
+        N,
+        D,
+        eps,
+        stride_x=x.stride(0),
+        stride_y=y.stride(0),
+        IS_SWISH=is_swish,
+        TRAINING=False,
+        BLOCK_D=BLOCK_D,
+        COMPUTE_MEAN_AND_RSTD=True,
+    )
+
+    return y
+
+
+@torch.fx.wrap
+# "aot_triton_kernel_wrapper_" is a pre-defined prefix for
+# AOT-T triton kernel wrapper functions. This is required for
+# AOT-T backend to recognize and trace correctly for ops transformation.
+def aot_triton_kernel_wrapper_swish_layer_norm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    is_swish: bool,
+) -> torch.Tensor:
+    if should_trigger_eager_impl():
+        if is_swish:
+            return pytorch_swish_layer_norm(x, [x.shape[1]], weight, bias, eps).to(
+                x.dtype
+            )
+        else:
+            return pytorch_layer_norm(x, [x.shape[1]], weight, bias, eps).to(x.dtype)
+    else:
+        return _triton_aot_swish_layer_norm(x, weight, bias, eps, is_swish)
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_layer_norm_mul_dropout.py b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_layer_norm_mul_dropout.py
new file mode 100644
index 000000000..7f7a6c743
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_layer_norm_mul_dropout.py
@@ -0,0 +1,162 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files
+# (the "Software"), to deal in the Software without restriction,
+# including without limitation the rights to use, copy, modify, merge,
+# publish, distribute, sublicense, and/or sell copies of the Software,
+# and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+# pyre-strict
+
+#!/usr/bin/env python3
+
+
+import torch
+from generative_recommenders.common import next_power_of_2, should_trigger_eager_impl
+from generative_recommenders.ops.pytorch.pt_hstu_linear import pytorch_norm_mul_dropout
+from generative_recommenders.ops.triton.triton_hstu_linear import _ln_mul_dropout_fwd
+from generative_recommenders.ops.triton_aot.types import triton_aot
+
+_ln_mul_dropout_fwd = triton_aot(
+    annotations={
+        "D": ("i32", 16),
+        "stride_x": ("i32", 16),
+        "stride_u": ("i32", 16),
+        "stride_y": ("i32", 16),
+    },
+    # pyrefly: ignore [bad-argument-type]
+)(_ln_mul_dropout_fwd)
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def _triton_aot_layer_norm_mul_dropout(
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    dropout_ratio: float,
+    training: bool,
+    silu_u: bool,
+    concat_ux: bool,
+    mul_u_activation_type: str,
+) -> torch.Tensor:
+    assert x.dim() == 2
+    if x.stride(1) != 1:
+        x = x.contiguous()
+    N, D = x.shape
+    assert weight.dim() == 1
+    assert bias.dim() == 1
+    assert weight.numel() == D
+    assert bias.numel() == D
+
+    if concat_ux:
+        y = torch.empty((N, 3 * D), dtype=x.dtype, device=x.device)
+    else:
+        y = torch.empty_like(x)
+    if N == 0:
+        return y
+    mean = x.new_empty((N,))
+    rstd = x.new_empty((N,))
+
+    # Less than 64KB per feature: enqueue fused kernel
+    MAX_FUSED_SIZE = 65536 // x.element_size()
+    BLOCK_D = min(MAX_FUSED_SIZE, next_power_of_2(D))
+    if D > BLOCK_D:
+        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
+
+    seed = 0
+    # num_warps = min(max(BLOCK_D // 256, 1), 8)
+    grid = (N,)
+    # pyrefly: ignore [not-callable]
+    _ln_mul_dropout_fwd[grid](
+        x,
+        u,
+        y,
+        weight,
+        bias,
+        mean,
+        rstd,
+        D,
+        eps,
+        seed,
+        dropout_ratio,
+        x.stride(0),
+        u.stride(0),
+        y.stride(0),
+        # pyrefly: ignore [bad-argument-type]
+        SILU_U=silu_u,
+        # pyrefly: ignore [bad-argument-type]
+        BLOCK_D=BLOCK_D,
+        # pyrefly: ignore [bad-argument-type]
+        TRAINING=training,
+        # pyrefly: ignore [bad-argument-type]
+        CONCAT_U=concat_ux,
+        # pyrefly: ignore [bad-argument-type]
+        CONCAT_X=concat_ux,
+        # pyrefly: ignore [bad-argument-type]
+        MUL_U_ACTIVATION_TYPE=mul_u_activation_type,
+        # pyrefly: ignore [bad-argument-type]
+        FAST_DROPOUT=False,
+    )
+    return y
+
+
+@torch.fx.wrap
+# "aot_triton_kernel_wrapper_" is a pre-defined prefix for
+# AOT-T triton kernel wrapper functions. This is required for
+# AOT-T backend to recognize and trace correctly for ops transformation.
+def aot_triton_kernel_wrapper_layer_norm_mul_dropout(
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    dropout_ratio: float,
+    training: bool,
+    silu_u: bool,
+    concat_ux: bool,
+    mul_u_activation_type: str,
+) -> torch.Tensor:
+    if should_trigger_eager_impl():
+        return pytorch_norm_mul_dropout(
+            x=x,
+            u=u,
+            weight=weight,
+            bias=bias,
+            eps=eps,
+            dropout_ratio=dropout_ratio,
+            training=training,
+            silu_u=silu_u,
+            concat_u=concat_ux,
+            concat_x=concat_ux,
+            mul_u_activation_type=mul_u_activation_type,
+            group_norm=False,
+        )
+    else:
+        return _triton_aot_layer_norm_mul_dropout(
+            x=x,
+            u=u,
+            weight=weight,
+            bias=bias,
+            eps=eps,
+            dropout_ratio=dropout_ratio,
+            training=training,
+            silu_u=silu_u,
+            concat_ux=concat_ux,
+            mul_u_activation_type=mul_u_activation_type,
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_position.py b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_position.py
new file mode 100644
index 000000000..828a4f4d4
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_position.py
@@ -0,0 +1,176 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+# pyre-strict
+
+from typing import Optional
+
+import torch
+from generative_recommenders.common import (
+    cdiv,
+    fx_unwrap_optional_tensor,
+    next_power_of_2,
+    prev_power_of_2,
+    should_trigger_eager_impl,
+)
+from generative_recommenders.ops.pytorch.pt_position import (
+    pytorch_add_timestamp_positional_embeddings,
+)
+from generative_recommenders.ops.triton.triton_position import (
+    _add_timestamp_position_embeddings_kernel,
+)
+from generative_recommenders.ops.triton_aot.types import triton_aot
+
+
+_add_timestamp_position_embeddings_kernel = triton_aot(
+    annotations={
+        "SeqEmb": ("*bf16", 16),
+        "Offsets": ("*i64", 16),
+        "Lengths": ("*i64", 16),
+        "PosEmb": ("*fp32", 16),
+        "TsEmb": ("*fp32", 16),
+        "Out": ("*bf16", 16),
+        "TS": ("*i64", 16),
+        "PosInds": ("*i32", 16),
+        "TsInds": ("*i32", 16),
+        "NumTargets": ("*i64", 16),
+        "AUTOTUNE_MAX_SEQ_LEN": "i32",
+        "D": "i32",
+        "num_time_buckets": "i32",
+        "time_bucket_increments": "fp32",
+        "time_bucket_scale": "fp32",
+        "time_delta": "i32",
+        "max_contextual_seq_len": "i32",
+        "max_pos_ind": "i32",
+        "stride_sn": ("i32", 16),
+        "stride_pn": ("i32", 16),
+        "stride_tn": ("i32", 16),
+        "stride_on": ("i32", 16),
+    },
+)(_add_timestamp_position_embeddings_kernel)
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def _triton_aot_position(
+    seq_embeddings: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    pos_embeddings: torch.Tensor,
+    ts_embeddings: torch.Tensor,
+    timestamps: torch.Tensor,
+    max_seq_len: int,
+    max_contextual_seq_len: int,
+    seq_lengths: torch.Tensor,
+    num_targets: Optional[torch.Tensor],
+    interleave_targets: bool,
+    time_bucket_fn: str,
+) -> torch.Tensor:
+    has_multiple_targets = num_targets is not None
+    if not has_multiple_targets:
+        num_targets_resolved = torch.empty(
+            0, dtype=torch.int64, device=seq_embeddings.device
+        )
+    else:
+        num_targets_resolved = fx_unwrap_optional_tensor(num_targets).to(torch.int64)
+
+    seq_embeddings = seq_embeddings.contiguous()
+    pos_embeddings = pos_embeddings.contiguous()
+    ts_embeddings = ts_embeddings.contiguous()
+
+    max_pos_ind = pos_embeddings.shape[0]
+    B = seq_lengths.shape[0]
+
+    N, D = seq_embeddings.shape
+    out = torch.empty_like(seq_embeddings)
+
+    timestamps = timestamps.contiguous()
+    ts_inds = torch.empty((N,), device=timestamps.device, dtype=torch.int32)
+    pos_inds = torch.empty((N,), device=timestamps.device, dtype=torch.int32)
+
+    autotune_max_seq_len = prev_power_of_2(max_seq_len)
+    BLOCK_D = next_power_of_2(D) if D < 64 else 64
+
+    grid = lambda meta: (  # noqa E731
+        B,
+        cdiv(max_seq_len, meta["BLOCK_N"]),
+    )
+    # pyrefly: ignore [not-callable]
+    _add_timestamp_position_embeddings_kernel[grid](
+        SeqEmb=seq_embeddings,
+        Offsets=seq_offsets,
+        Lengths=seq_lengths,
+        PosEmb=pos_embeddings,
+        TsEmb=ts_embeddings,
+        Out=out,
+        TS=timestamps,
+        PosInds=pos_inds,
+        TsInds=ts_inds,
+        NumTargets=num_targets_resolved,
+        AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len,
+        D=D,
+        num_time_buckets=2048,
+        time_bucket_increments=60.0,
+        time_bucket_scale=1.0,
+        time_delta=0,
+        max_contextual_seq_len=max_contextual_seq_len,
+        max_pos_ind=max_pos_ind,
+        stride_sn=seq_embeddings.stride(0),
+        stride_pn=pos_embeddings.stride(0),
+        stride_tn=ts_embeddings.stride(0),
+        stride_on=out.stride(0),
+        TRAINING=False,
+        HAS_MULTIPLE_TARGETS=has_multiple_targets,
+        INTERLEAVE_TARGETS=interleave_targets,
+        TIME_BUCKET_FN=time_bucket_fn,
+        BLOCK_D=BLOCK_D,
+    )
+
+    return out
+
+
+@torch.fx.wrap
+# "aot_triton_kernel_wrapper_" is a pre-defined prefix for
+# AOT-T triton kernel wrapper functions. This is required for
+# AOT-T backend to recognize and trace correctly for ops transformation.
+def aot_triton_kernel_wrapper_position(
+    alpha: float,
+    max_seq_len: int,
+    max_contextual_seq_len: int,
+    position_embeddings_weight: torch.Tensor,
+    timestamp_embeddings_weight: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    seq_lengths: torch.Tensor,
+    seq_embeddings: torch.Tensor,
+    timestamps: torch.Tensor,
+    num_targets: Optional[torch.Tensor],
+    interleave_targets: bool,
+    time_bucket_fn: str,
+) -> torch.Tensor:
+    seq_embeddings = seq_embeddings * alpha
+    if should_trigger_eager_impl():
+        return pytorch_add_timestamp_positional_embeddings(
+            seq_embeddings=seq_embeddings,
+            seq_offsets=seq_offsets,
+            pos_embeddings=position_embeddings_weight,
+            ts_embeddings=timestamp_embeddings_weight,
+            timestamps=timestamps,
+            max_seq_len=max_seq_len,
+            max_contextual_seq_len=max_contextual_seq_len,
+            seq_lengths=seq_lengths,
+            num_targets=num_targets,
+            interleave_targets=interleave_targets,
+            time_bucket_fn=time_bucket_fn,
+        )
+    else:
+        return _triton_aot_position(
+            seq_embeddings=seq_embeddings,
+            seq_offsets=seq_offsets,
+            pos_embeddings=position_embeddings_weight,
+            ts_embeddings=timestamp_embeddings_weight,
+            timestamps=timestamps,
+            max_seq_len=max_seq_len,
+            max_contextual_seq_len=max_contextual_seq_len,
+            seq_lengths=seq_lengths,
+            num_targets=num_targets,
+            interleave_targets=interleave_targets,
+            time_bucket_fn=time_bucket_fn,
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_ragged_hstu_attention.py b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_ragged_hstu_attention.py
new file mode 100644
index 000000000..4fa11dddc
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_ragged_hstu_attention.py
@@ -0,0 +1,366 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files
+# (the "Software"), to deal in the Software without restriction,
+# including without limitation the rights to use, copy, modify, merge,
+# publish, distribute, sublicense, and/or sell copies of the Software,
+# and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+# pyre-strict
+
+#!/usr/bin/env python3
+
+from typing import Optional
+
+import torch
+from generative_recommenders.common import (
+    autotune_max_seq_len,
+    BACKEND_ALLOW_TF32,
+    cdiv,
+    prev_power_of_2,
+    should_trigger_eager_impl,
+)
+from generative_recommenders.ops.pytorch.pt_hstu_attention import (
+    pytorch_cached_hstu_mha,
+    pytorch_hstu_mha,
+)
+from generative_recommenders.ops.triton.triton_hstu_attention import _hstu_attn_fwd
+from generative_recommenders.ops.triton_aot.types import triton_aot
+
+
+for _config in _hstu_attn_fwd.configs:
+    if isinstance(_config.kwargs.get("USE_TLX"), bool):
+        _config.kwargs["USE_TLX"] = int(_config.kwargs["USE_TLX"])
+
+
+_hstu_attn_fwd = triton_aot(
+    annotations={
+        "stride_qm": ("i32", 16),
+        "stride_qh": ("i32", 16),
+        "stride_kn": ("i32", 16),
+        "stride_kh": ("i32", 16),
+        "stride_vn": ("i32", 16),
+        "stride_vh": ("i32", 16),
+        "stride_om": ("i32", 16),
+        "stride_oh": ("i32", 16),
+        "contextual_seq_len": "i32",
+        "max_attn_len": "i32",
+        "Z": "i32",
+        "AUTOTUNE_Z": "i32",
+        "H": "i32",
+        "MAX_SEQ_LEN": "i32",
+        "AUTOTUNE_MAX_SEQ_LEN": "i32",
+        "DimQ": "i32",
+        "DimV": "i32",
+        "DeltaSize": "i32",
+        "workspace_ptr": "*i8",
+        "sort_by_length_indices": "*i64",
+    }
+)(_hstu_attn_fwd)
+
+
+def _check_common_args(
+    invalid_attn_mask_type: str,
+    attn_scale: Optional[torch.Tensor],
+    full_attn_size: int,
+    num_softmax_heads: int,
+) -> None:
+    assert invalid_attn_mask_type in ("causal", "lower_triangular"), (
+        f"unsupported invalid_attn_mask_type: {invalid_attn_mask_type}"
+    )
+    assert attn_scale is None, "attn_scale is not implemented for AOT-T HSTU MHA"
+    assert full_attn_size == 0, "full_attn_size is not implemented for AOT-T HSTU MHA"
+    assert num_softmax_heads == 0, (
+        "num_softmax_heads is not implemented for AOT-T HSTU MHA"
+    )
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def _triton_aot_ragged_hstu_mha(
+    N: int,
+    alpha: float,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    invalid_attn_mask_type: str,
+    num_targets: Optional[torch.Tensor],
+    attn_scale: Optional[torch.Tensor],
+    max_attn_len: int,
+    contextual_seq_len: int,
+    full_attn_size: int,
+    num_softmax_heads: int = 0,
+    allow_tf32: bool = BACKEND_ALLOW_TF32,
+) -> torch.Tensor:
+    assert invalid_attn_mask_type in ("causal", "lower_triangular"), (
+        f"unsupported invalid_attn_mask_type: {invalid_attn_mask_type}"
+    )
+    assert attn_scale is None, "attn_scale is not implemented for AOT-T HSTU MHA"
+    assert full_attn_size == 0, "full_attn_size is not implemented for AOT-T HSTU MHA"
+    assert num_softmax_heads == 0, (
+        "num_softmax_heads is not implemented for AOT-T HSTU MHA"
+    )
+    Z = seq_offsets.numel() - 1
+    L, H, DimQ = q.shape
+    DimV = v.shape[2]
+
+    q = q.contiguous()
+    k = k.contiguous()
+    v = v.contiguous()
+    seq_offsets = seq_offsets.contiguous()
+
+    out = torch.empty_like(v)
+    if L == 0:
+        return out
+    workspace = torch.empty(0, dtype=torch.int8, device=q.device)
+    sort_by_length_indices = torch.empty(
+        0, dtype=torch.int64, device=seq_offsets.device
+    )
+
+    grid = lambda meta: (  # noqa E731
+        cdiv(N, meta["BLOCK_M"]),
+        Z * H,
+    )
+    # pyrefly: ignore [not-callable]
+    _hstu_attn_fwd[grid](
+        Q=q,
+        K=k,
+        V=v,
+        workspace_ptr=workspace,
+        sort_by_length_indices=sort_by_length_indices,
+        seq_offsets=seq_offsets,
+        num_targets=num_targets,
+        Out=out,
+        stride_qm=q.stride(0),
+        stride_qh=q.stride(1),
+        stride_kn=k.stride(0),
+        stride_kh=k.stride(1),
+        stride_vn=v.stride(0),
+        stride_vh=v.stride(1),
+        stride_om=out.stride(0),
+        stride_oh=out.stride(1),
+        alpha=alpha,
+        contextual_seq_len=contextual_seq_len,
+        max_attn_len=max_attn_len,
+        Z=Z,
+        AUTOTUNE_Z=prev_power_of_2(Z),
+        H=H,
+        MAX_SEQ_LEN=N,
+        AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(N),
+        DimQ=DimQ,
+        DimV=DimV,
+        DeltaSize=0,
+        HAS_MULTIPLE_TARGETS=num_targets is not None,
+        IS_DELTA_Q=False,
+        ALLOW_TF32=allow_tf32,
+        BLOCK_D_Q=DimQ,
+        BLOCK_D_V=DimV,
+        HAS_CONTEXTUAL_SEQ_LEN=contextual_seq_len > 0,
+        HAS_MAX_ATTN_LEN=max_attn_len > 0,
+        HAS_SORT_BY_LENGTH_INDICES=False,
+        ENABLE_TMA=False,
+        TMA_DESC_SIZE=128,
+    )
+    return out
+
+
+@torch.fx.wrap
+def aot_triton_kernel_wrapper_ragged_hstu_mha(
+    N: int,
+    alpha: float,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    invalid_attn_mask_type: str,
+    num_targets: Optional[torch.Tensor],
+    attn_scale: Optional[torch.Tensor],
+    max_attn_len: int,
+    contextual_seq_len: int,
+    full_attn_size: int,
+    num_softmax_heads: int,
+    allow_tf32: bool = BACKEND_ALLOW_TF32,
+) -> torch.Tensor:
+    _check_common_args(
+        invalid_attn_mask_type=invalid_attn_mask_type,
+        attn_scale=attn_scale,
+        full_attn_size=full_attn_size,
+        num_softmax_heads=num_softmax_heads,
+    )
+    if should_trigger_eager_impl():
+        return pytorch_hstu_mha(
+            max_seq_len=N,
+            alpha=alpha,
+            q=q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            causal=True,
+            dropout_pr=0.0,
+            training=False,
+            num_targets=num_targets,
+            attn_scale=attn_scale,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            min_full_attn_seq_len=full_attn_size,
+        )
+    return _triton_aot_ragged_hstu_mha(
+        N=N,
+        alpha=alpha,
+        q=q,
+        k=k,
+        v=v,
+        seq_offsets=seq_offsets,
+        invalid_attn_mask_type=invalid_attn_mask_type,
+        num_targets=num_targets,
+        attn_scale=attn_scale,
+        max_attn_len=max_attn_len,
+        contextual_seq_len=contextual_seq_len,
+        full_attn_size=full_attn_size,
+        num_softmax_heads=num_softmax_heads,
+        allow_tf32=allow_tf32,
+    )
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def _triton_aot_cached_hstu_mha(
+    N: int,
+    alpha: float,
+    delta_q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    delta_x_offsets: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    num_targets: Optional[torch.Tensor],
+    attn_scale: Optional[torch.Tensor],
+    max_attn_len: int,
+    full_attn_size: int,
+    allow_tf32: bool = BACKEND_ALLOW_TF32,
+) -> torch.Tensor:
+    assert attn_scale is None, "attn_scale is not implemented for AOT-T HSTU MHA"
+    assert full_attn_size == 0, "full_attn_size is not implemented for AOT-T HSTU MHA"
+    Z = seq_offsets.size(0) - 1
+    DELTA_L, H, DimQ = delta_q.shape
+    DeltaSize = DELTA_L // Z
+    DimV = v.shape[2]
+
+    delta_q = delta_q.contiguous()
+    k = k.contiguous()
+    v = v.contiguous()
+    seq_offsets = seq_offsets.contiguous()
+
+    out = torch.empty((DELTA_L, H, DimV), dtype=delta_q.dtype, device=delta_q.device)
+    if DELTA_L == 0:
+        return out
+    workspace = torch.empty(0, dtype=torch.int8, device=delta_q.device)
+    sort_by_length_indices = torch.empty(
+        0, dtype=torch.int64, device=seq_offsets.device
+    )
+
+    grid = lambda meta: (  # noqa E731
+        cdiv(DeltaSize, meta["BLOCK_M"]),
+        Z * H,
+    )
+    # pyrefly: ignore [not-callable]
+    _hstu_attn_fwd[grid](
+        Q=delta_q,
+        K=k,
+        V=v,
+        workspace_ptr=workspace,
+        sort_by_length_indices=sort_by_length_indices,
+        seq_offsets=seq_offsets,
+        num_targets=num_targets,
+        Out=out,
+        stride_qm=delta_q.stride(0),
+        stride_qh=delta_q.stride(1),
+        stride_kn=k.stride(0),
+        stride_kh=k.stride(1),
+        stride_vn=v.stride(0),
+        stride_vh=v.stride(1),
+        stride_om=out.stride(0),
+        stride_oh=out.stride(1),
+        alpha=alpha,
+        contextual_seq_len=0,
+        max_attn_len=max_attn_len,
+        Z=Z,
+        AUTOTUNE_Z=prev_power_of_2(Z),
+        H=H,
+        MAX_SEQ_LEN=N,
+        AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(N),
+        DimQ=DimQ,
+        DimV=DimV,
+        DeltaSize=DeltaSize,
+        HAS_MULTIPLE_TARGETS=num_targets is not None,
+        IS_DELTA_Q=True,
+        ALLOW_TF32=allow_tf32,
+        BLOCK_D_Q=DimQ,
+        BLOCK_D_V=DimV,
+        HAS_CONTEXTUAL_SEQ_LEN=False,
+        HAS_MAX_ATTN_LEN=max_attn_len > 0,
+        HAS_SORT_BY_LENGTH_INDICES=False,
+        ENABLE_TMA=False,
+        TMA_DESC_SIZE=128,
+    )
+    return out
+
+
+@torch.fx.wrap
+def aot_triton_kernel_wrapper_cached_hstu_mha(
+    N: int,
+    alpha: float,
+    delta_q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    delta_x_offsets: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    num_targets: Optional[torch.Tensor],
+    attn_scale: Optional[torch.Tensor],
+    max_attn_len: int,
+    full_attn_size: int,
+) -> torch.Tensor:
+    _check_common_args(
+        invalid_attn_mask_type="causal",
+        attn_scale=attn_scale,
+        full_attn_size=full_attn_size,
+        num_softmax_heads=0,
+    )
+    if should_trigger_eager_impl():
+        return pytorch_cached_hstu_mha(
+            max_seq_len=N,
+            alpha=alpha,
+            delta_q=delta_q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=0,
+        )
+    return _triton_aot_cached_hstu_mha(
+        N=N,
+        alpha=alpha,
+        delta_q=delta_q,
+        k=k,
+        v=v,
+        delta_x_offsets=delta_x_offsets,
+        seq_offsets=seq_offsets,
+        num_targets=num_targets,
+        attn_scale=attn_scale,
+        max_attn_len=max_attn_len,
+        full_attn_size=full_attn_size,
+    )
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_rms_norm.py b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_rms_norm.py
new file mode 100644
index 000000000..e5d9e093e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_rms_norm.py
@@ -0,0 +1,114 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files
+# (the "Software"), to deal in the Software without restriction,
+# including without limitation the rights to use, copy, modify, merge,
+# publish, distribute, sublicense, and/or sell copies of the Software,
+# and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+# pyre-strict
+
+#!/usr/bin/env python3
+
+import torch
+from generative_recommenders.common import (
+    cdiv,
+    next_power_of_2,
+    should_trigger_eager_impl,
+    switch_to_contiguous_if_needed,
+)
+from generative_recommenders.ops.pytorch.pt_layer_norm import pytorch_rms_norm
+from generative_recommenders.ops.triton.triton_layer_norm import _weighted_rms_norm_fwd
+from generative_recommenders.ops.triton_aot.types import triton_aot
+
+_weighted_rms_norm_fwd = triton_aot(
+    annotations={
+        "N": "i32",
+        "D": ("i32", 16),
+        "stride_x": ("i32", 16),
+        "stride_y": ("i32", 16),
+    },
+)(_weighted_rms_norm_fwd)
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def _triton_aot_rms_norm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float,
+    silu: bool,
+) -> torch.Tensor:
+    """Internal AOTT kernel function for RMS norm."""
+    assert x.dim() == 2, f"x.dim() == {x.dim()}, expected 2"
+    x = switch_to_contiguous_if_needed(x)
+    N, D = x.shape
+
+    assert weight.dim() == 1
+    assert weight.numel() == D
+
+    y = torch.empty_like(x)
+    rstd = torch.empty(N, dtype=torch.float32, device=x.device)
+
+    BLOCK_D = next_power_of_2(D)
+
+    grid = lambda meta: (  # noqa E731
+        cdiv(N, meta["BLOCK_N"]),
+    )
+    # pyrefly: ignore [not-callable]
+    _weighted_rms_norm_fwd[grid](
+        x,
+        y,
+        weight,
+        rstd,
+        N,
+        D,
+        eps,
+        stride_x=x.stride(0),
+        stride_y=y.stride(0),
+        SILU=silu,
+        BLOCK_D=BLOCK_D,
+    )
+
+    return y
+
+
+def _pytorch_rms_norm_fallback(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float,
+    silu: bool,
+) -> torch.Tensor:
+    """PyTorch fallback for RMS norm in eager mode."""
+
+    return pytorch_rms_norm(x, [x.shape[-1]], weight, eps, silu)
+
+
+@torch.fx.wrap
+def aot_triton_kernel_wrapper_rms_norm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float,
+    silu: bool,
+) -> torch.Tensor:
+    """AOT-T wrapper for RMS norm.
+
+    Routes between PyTorch fallback (for tracing/serialization) and AOTT kernel path.
+    """
+    if should_trigger_eager_impl():
+        return _pytorch_rms_norm_fallback(x, weight, eps, silu)
+    else:
+        return _triton_aot_rms_norm(x, weight, eps, silu)
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_split_2d_jagged.py b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_split_2d_jagged.py
new file mode 100644
index 000000000..9aa0c655b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_split_2d_jagged.py
@@ -0,0 +1,138 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+# pyre-strict
+
+from typing import Optional, Tuple
+
+import torch
+from generative_recommenders.common import (
+    fx_unwrap_optional_tensor,
+    next_power_of_2,
+    should_trigger_eager_impl,
+)
+from generative_recommenders.ops.pytorch.pt_jagged_tensors import (
+    pytorch_split_2D_jagged,
+)
+from generative_recommenders.ops.triton.triton_jagged import split_2D_jagged
+from generative_recommenders.ops.triton_aot.types import triton_aot
+
+
+split_2D_jagged = triton_aot(
+    annotations={
+        "DenseSize": "i32",
+        "D": ("i32", 16),
+        "stride_id": ("i32", 16),
+        "stride_ad": ("i32", 16),
+        "stride_bd": ("i32", 16),
+    },
+    # pyrefly: ignore [bad-argument-type]
+)(split_2D_jagged)
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def _triton_aot_split_2D_jagged(
+    values: torch.Tensor,
+    max_seq_len: int,
+    offsets_a: torch.Tensor,
+    offsets_b: torch.Tensor,
+    dense_size: int = 0,
+    is_dense_a: bool = False,
+    is_dense_b: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    _, D = values.shape
+    BLOCK_D = next_power_of_2(D)
+
+    if is_dense_a:
+        L, _ = values.shape
+        B = offsets_b.size(0) - 1
+        seq_len_a = dense_size * B
+        seq_len_b = L - seq_len_a
+    elif is_dense_b:
+        L, _ = values.shape
+        B = offsets_a.size(0) - 1
+        seq_len_b = dense_size * B
+        seq_len_a = L - seq_len_b
+    else:
+        B = offsets_a.size(0) - 1
+        seq_len_a = int(offsets_a[-1].item())
+        seq_len_b = int(offsets_b[-1].item())
+
+    values_a = torch.empty((seq_len_a, D), device=values.device, dtype=values.dtype)
+    values_b = torch.empty((seq_len_b, D), device=values.device, dtype=values.dtype)
+
+    grid = (max_seq_len, B)
+    # pyre-ignore[29]: TritonAOT.__getitem__ is callable at runtime
+    split_2D_jagged[grid](
+        JaggedIn=values,
+        DenseSize=dense_size,
+        OffsetsA=offsets_a,
+        OffsetsB=offsets_b,
+        OutA=values_a,
+        OutB=values_b,
+        D=D,
+        stride_id=values.stride(0),
+        stride_ad=values_a.stride(0),
+        stride_bd=values_b.stride(0),
+        # pyrefly: ignore [bad-argument-type]
+        IS_DENSE_A=is_dense_a,
+        # pyrefly: ignore [bad-argument-type]
+        IS_DENSE_B=is_dense_b,
+        # pyrefly: ignore [bad-argument-type]
+        BLOCK_D=BLOCK_D,
+        # pyrefly: ignore [bad-argument-type]
+        IS_REPLACE=False,
+    )
+
+    if is_dense_a:
+        values_a = values_a.reshape(B, dense_size, D)
+    if is_dense_b:
+        values_b = values_b.reshape(B, dense_size, D)
+
+    return values_a, values_b
+
+
+@torch.fx.wrap
+def aot_triton_kernel_wrapper_split_2D_jagged(
+    values: torch.Tensor,
+    max_seq_len: int,
+    offsets_a: Optional[torch.Tensor] = None,
+    offsets_b: Optional[torch.Tensor] = None,
+    dense_size: int = 0,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    if should_trigger_eager_impl():
+        assert offsets_a is not None and offsets_b is not None, (
+            "Eager fallback requires both offsets_a and offsets_b"
+        )
+        return pytorch_split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=values,
+            max_len_left=None,
+            max_len_right=None,
+            offsets_left=offsets_a,
+            offsets_right=offsets_b,
+        )
+    else:
+        is_dense_a: bool = offsets_a is None
+        is_dense_b: bool = offsets_b is None
+        resolved_offsets_a: torch.Tensor = values.new_empty(0)
+        resolved_offsets_b: torch.Tensor = values.new_empty(0)
+        if is_dense_a:
+            resolved_offsets_b = fx_unwrap_optional_tensor(offsets_b)
+            resolved_offsets_a = resolved_offsets_b.new_empty(0)
+        elif is_dense_b:
+            resolved_offsets_a = fx_unwrap_optional_tensor(offsets_a)
+            resolved_offsets_b = resolved_offsets_a.new_empty(0)
+        else:
+            resolved_offsets_a = fx_unwrap_optional_tensor(offsets_a)
+            resolved_offsets_b = fx_unwrap_optional_tensor(offsets_b)
+
+        return _triton_aot_split_2D_jagged(
+            values=values,
+            max_seq_len=max_seq_len,
+            offsets_a=resolved_offsets_a,
+            offsets_b=resolved_offsets_b,
+            dense_size=dense_size,
+            is_dense_a=is_dense_a,
+            is_dense_b=is_dense_b,
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/types.py b/recommendation_v4/generative_recommenders/ops/triton_aot/types.py
new file mode 100644
index 000000000..bd7b2a1ed
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/types.py
@@ -0,0 +1,181 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+
+# pyre-strict
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from typing import Any, Callable, ClassVar, Dict, List, Optional, Protocol, Union
+
+from generative_recommenders.ops.triton_aot.compile.utils import is_autotuner
+
+# @manual=//triton:triton
+from triton.runtime.jit import KernelInterface
+# triton.fb.triton_util depends on torch
+# @dep=//caffe2:_torch
+
+
+_VALID_HINTS: frozenset[int] = frozenset({1, 8, 16})
+_VALID_POINTER_HINTS: frozenset[int] = frozenset({16})
+
+
+@dataclass(frozen=True)
+class AnnotationHint:
+    """Annotation with a value hint (dtype + divisibility/alignment).
+
+    Valid hints: 16 (divisible_by_16), 8 (divisible_by_8), 1 (equal_to_1).
+    For pointers (dtype starts with ``*``), only 16 is valid — other values
+    would cause incorrect codegen (e.g. alignment=1 folds the pointer as a
+    constexpr constant, causing a segfault at launch).
+    """
+
+    dtype: str
+    hint: int
+
+    def __post_init__(self) -> None:
+        if self.hint not in _VALID_HINTS:
+            raise RuntimeError(
+                f"TritonAOT: invalid annotation hint {self.hint!r} for "
+                f"dtype {self.dtype!r}. Valid hints: {sorted(_VALID_HINTS)}."
+            )
+        if self.dtype.startswith("*") and self.hint not in _VALID_POINTER_HINTS:
+            raise RuntimeError(
+                f"TritonAOT: invalid pointer alignment {self.hint!r} for "
+                f"dtype {self.dtype!r}. Pointer annotations only support "
+                f"alignment={sorted(_VALID_POINTER_HINTS)}."
+            )
+
+    def to_tuple(self) -> tuple[str, int]:
+        """Convert to plain tuple for raw spec format."""
+        return (self.dtype, self.hint)
+
+
+# Internal annotation type (after normalization).
+Annotation = Union[str, AnnotationHint]
+
+# User-facing input type (also accepts raw tuples).
+AnnotationInput = Union[str, tuple[str, int], AnnotationHint]
+
+
+def _normalize_annotation(ann: AnnotationInput) -> Annotation:
+    """Convert a raw tuple to AnnotationHint (triggers validation)."""
+    if isinstance(ann, AnnotationHint):
+        return ann
+    if isinstance(ann, tuple):
+        return AnnotationHint(ann[0], ann[1])
+    return ann
+
+
+class SpecCollector(Protocol):
+    """Callback invoked by TritonAOT.run() to collect kernel specs during AOT compile."""
+
+    def __call__(
+        self,
+        fn: KernelInterface[List[Any]],
+        annotations: Dict[str, Annotation],
+        *args: Any,
+        **kwargs: Any,
+    ) -> None: ...
+
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class TritonAOTMeta(type):
+    # TODO consider merge with AOTTCompileState
+    def __init__(cls, name, bases, attrs):  # pyre-ignore [2,3]
+        super().__init__(name, bases, attrs)
+        # Initialize an empty list for each new class created
+        cls._instances: List["TritonAOT"] = []
+
+    def __call__(cls, *args, **kwargs):  # pyre-ignore [2,3]
+        # Create the instance using the default behavior
+        instance = super().__call__(*args, **kwargs)
+        # Store the instance in the class-specific list
+        cls._instances.append(instance)
+        return instance
+
+    def get_instances(cls) -> List["TritonAOT"]:
+        return cls._instances
+
+
+class TritonAOT(KernelInterface[List[Any]], metaclass=TritonAOTMeta):
+    """Wraps a Triton kernel for ahead-of-time compilation.
+
+    Annotations specify dtype and optional value hints for kernel parameters:
+
+    - Scalar:  ``"i32"``, ``"fp32"``, or ``AnnotationHint("i32", 16)``
+      where 16 means the runtime value is divisible by 16.
+    - Pointer: ``AnnotationHint("*fp32", 16)`` for 16-byte aligned tensors.
+      Only alignment=16 is valid for pointers.
+    - Tensor:  typically inferred from runtime ``torch.Tensor.dtype``.
+    - Optional tensor:  auto-detected when the same kernel is called
+      with a tensor at one site and ``None`` at another.
+    """
+
+    _spec_collector: ClassVar[Optional[SpecCollector]] = None
+
+    def __init__(
+        self,
+        fn: KernelInterface[List[Any]],
+        annotations: Dict[str, AnnotationInput],
+    ) -> None:
+        self.fn: KernelInterface[List[Any]] = fn
+        self.annotations: Dict[str, Annotation] = {
+            k: _normalize_annotation(v) for k, v in annotations.items()
+        }
+
+    @classmethod
+    def set_spec_collector(cls, collector: Optional[SpecCollector]) -> None:
+        """Register or unregister the spec collection callback.
+
+        When a collector is registered (not None), TritonAOT.run() will call
+        it to collect kernel specs for AOT compilation.  When None, run()
+        simply delegates to the underlying Triton kernel (normal JIT path).
+        """
+        cls._spec_collector = collector
+
+    # pyrefly: ignore [bad-override]
+    def run(self, *args: Any, **kwargs: Any) -> Any:
+        if self._spec_collector is not None:
+            self._spec_collector(self.fn, self.annotations, *args, **kwargs)
+        # pyre-ignore[29]: KernelInterface.run is callable at runtime
+        return self.fn.run(*args, **kwargs)
+
+
+def triton_aot(
+    annotations: Dict[str, AnnotationInput],
+) -> Callable[[KernelInterface[List[Any]]], TritonAOT]:
+    def decorator(fn: KernelInterface[List[Any]]) -> TritonAOT:
+        return TritonAOT(fn, annotations)
+
+    return decorator
+
+
+def get_all_triton_aot_instances() -> List[TritonAOT]:
+    """Return all triton aot function instances (e.g. decorated with @triton_aot)."""
+    return TritonAOT.get_instances()
+
+
+def reset_all_triton_aot_autotune_cache() -> bool:
+    """Reset triton autotune cache for all triton aot kernels.
+
+    If triton aot compile is not enabled, this function is no op. Return True if any
+    kernel's autotune cache is reset. Else return False.
+
+    """
+    if TritonAOT._spec_collector is None:
+        return False
+
+    reset = False
+    for triton_aot_kernel in get_all_triton_aot_instances():
+        if is_autotuner(triton_aot_kernel.fn):
+            autotune_fn = triton_aot_kernel.fn
+            autotune_fn.cache.clear()  # pyre-ignore [16]
+            logger.info(
+                f"Reset autotune cache for triton kernel {autotune_fn.fn.__name__}"  # pyre-ignore [16]
+            )
+            reset = True
+
+    return reset
diff --git a/recommendation_v4/generative_recommenders/ops/utils.py b/recommendation_v4/generative_recommenders/ops/utils.py
new file mode 100644
index 000000000..16edd99a9
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/utils.py
@@ -0,0 +1,124 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-ignore-all-errors
+
+import functools
+import os
+
+import torch
+
+
+class _PlainFuncWrapper:
+    """Thin wrapper around a plain function that provides no-op register_fake
+    and register_kernel methods, mirroring the CustomOpDef API so that
+    downstream @func.register_fake / func.register_kernel("cpu") calls
+    don't break when the function is not wrapped as a custom op."""
+
+    def __init__(self, func):
+        self._func = func
+        functools.update_wrapper(self, func)
+
+    def __call__(self, *args, **kwargs):
+        return self._func(*args, **kwargs)
+
+    def register_fake(self, fake_func):
+        return fake_func
+
+    def register_kernel(self, device):
+        def inner(func):
+            return func
+
+        return inner
+
+
+def maybe_register_custom_op(op_name, mutates_args):
+    """
+    Conditionally registers a function as a torch custom op.
+
+    When AOTI_LOWER is set in the environment, the function is returned
+    unwrapped so that torch.export / Dynamo can trace through the plain
+    Python implementation instead of treating the custom op as opaque.
+    """
+
+    def decorator(func):
+        if os.environ.get("AOTI_LOWER"):
+            return _PlainFuncWrapper(func)
+        return torch.library.custom_op(op_name, func, mutates_args=mutates_args)
+
+    return decorator
+
+
+def is_sm100_plus() -> bool:
+    """
+    Check if this is a Blackwell Datacenter GPU.
+    These are between 100 and 103 for B200-GB300.
+    """
+    if not torch.cuda.is_available():
+        return False
+    props = torch.cuda.get_device_properties(0)
+    return props.major == 10 and (props.minor >= 0 and props.minor <= 3)
+
+
+def is_sm90() -> bool:
+    if not torch.cuda.is_available():
+        return False
+    props = torch.cuda.get_device_properties(0)
+    return props.major == 9 and props.minor == 0
+
+
+def is_sm90_plus() -> bool:
+    return is_sm100_plus() or is_sm90()
+
+
+@functools.lru_cache(maxsize=None)
+def is_amd_mi350() -> bool:
+    """Detect an AMD Instinct MI350-class GPU (gfx950) running under ROCm.
+
+    MI350 benefits from the same multi-row, separated-RNG layer-norm-mul-dropout
+    path as Blackwell datacenter parts (sm_100), so it is gated together with
+    is_sm100_plus() at the kernel dispatch sites.
+    """
+    if not torch.cuda.is_available():
+        return False
+    if getattr(torch.version, "hip", None) is None:
+        return False
+    try:
+        arch = torch.cuda.get_device_properties(0).gcnArchName or ""
+    except (AssertionError, RuntimeError, AttributeError):
+        return False
+    return "gfx950" in arch
+
+
+def use_separated_rng_ln_mul_dropout() -> bool:
+    """Hardware that should use the autotuned, multi-row ``_ln_mul_dropout_fwd_rng``
+    kernel with a precomputed dropout mask instead of the legacy single-row,
+    fused-RNG ``_ln_mul_dropout_fwd`` kernel.
+
+    Blackwell datacenter GPUs (sm_100-103) and AMD MI350 (gfx950) both prefer the
+    separated-RNG path: it batches rows per program and lets the backward reuse the
+    same mask, which is a large win over launching one program per row.
+    """
+    return is_sm100_plus() or is_amd_mi350()
+
+
+def copy_if_different_ptr(dst: torch.Tensor, src: torch.Tensor) -> None:
+    if torch.compiler.is_compiling():
+        # .data_ptr() will break PT2
+        dst.copy_(src)
+    else:
+        if dst.data_ptr() != src.data_ptr():
+            dst.copy_(src)
diff --git a/recommendation_v4/generative_recommenders/research/data/dataset.py b/recommendation_v4/generative_recommenders/research/data/dataset.py
new file mode 100644
index 000000000..09a18ae01
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/data/dataset.py
@@ -0,0 +1,248 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import csv
+import linecache
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import pandas as pd
+import torch
+
+
+class DatasetV2(torch.utils.data.Dataset):
+    """In reverse chronological order."""
+
+    def __init__(
+        self,
+        ratings_file: str,
+        padding_length: int,
+        ignore_last_n: int,  # used for creating train/valid/test sets
+        shift_id_by: int = 0,
+        chronological: bool = False,
+        sample_ratio: float = 1.0,
+    ) -> None:
+        """
+        Args:
+            csv_file (string): Path to the csv file.
+        """
+        super().__init__()
+
+        self.ratings_frame: pd.DataFrame = pd.read_csv(
+            ratings_file,
+            delimiter=",",
+            # iterator=True,
+        )
+        self._padding_length: int = padding_length
+        self._ignore_last_n: int = ignore_last_n
+        self._cache: Dict[int, Dict[str, torch.Tensor]] = dict()
+        self._shift_id_by: int = shift_id_by
+        self._chronological: bool = chronological
+        self._sample_ratio: float = sample_ratio
+
+    def __len__(self) -> int:
+        return len(self.ratings_frame)
+
+    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
+        if idx in self._cache.keys():
+            return self._cache[idx]
+        data = self.ratings_frame.iloc[idx]
+        sample = self.load_item(data)
+        self._cache[idx] = sample
+        return sample
+
+    def load_item(self, data) -> Dict[str, torch.Tensor]:
+        user_id = data.user_id
+
+        def eval_as_list(x: str, ignore_last_n: int) -> List[int]:
+            y = eval(x)
+            y_list = [y] if type(y) == int else list(y)
+            if ignore_last_n > 0:
+                # for training data creation
+                y_list = y_list[:-ignore_last_n]
+            return y_list
+
+        def eval_int_list(
+            x: str,
+            target_len: int,
+            ignore_last_n: int,
+            shift_id_by: int,
+            sampling_kept_mask: Optional[List[bool]],
+        ) -> Tuple[List[int], int]:
+            y = eval_as_list(x, ignore_last_n=ignore_last_n)
+            if sampling_kept_mask is not None:
+                y = [x for x, kept in zip(y, sampling_kept_mask) if kept]
+            y_len = len(y)
+            y.reverse()
+            if shift_id_by > 0:
+                y = [x + shift_id_by for x in y]
+            return y, y_len
+
+        if self._sample_ratio < 1.0:
+            raw_length = len(eval_as_list(data.sequence_item_ids, self._ignore_last_n))
+            sampling_kept_mask = (
+                torch.rand((raw_length,), dtype=torch.float32) < self._sample_ratio
+            ).tolist()
+        else:
+            sampling_kept_mask = None
+
+        movie_history, movie_history_len = eval_int_list(
+            data.sequence_item_ids,
+            self._padding_length,
+            self._ignore_last_n,
+            shift_id_by=self._shift_id_by,
+            sampling_kept_mask=sampling_kept_mask,
+        )
+        movie_history_ratings, ratings_len = eval_int_list(
+            data.sequence_ratings,
+            self._padding_length,
+            self._ignore_last_n,
+            0,
+            sampling_kept_mask=sampling_kept_mask,
+        )
+        movie_timestamps, timestamps_len = eval_int_list(
+            data.sequence_timestamps,
+            self._padding_length,
+            self._ignore_last_n,
+            0,
+            sampling_kept_mask=sampling_kept_mask,
+        )
+        assert movie_history_len == timestamps_len, (
+            f"history len {movie_history_len} differs from timestamp len {timestamps_len}."
+        )
+        assert movie_history_len == ratings_len, (
+            f"history len {movie_history_len} differs from ratings len {ratings_len}."
+        )
+
+        def _truncate_or_pad_seq(
+            y: List[int], target_len: int, chronological: bool
+        ) -> List[int]:
+            y_len = len(y)
+            if y_len < target_len:
+                y = y + [0] * (target_len - y_len)
+            else:
+                if not chronological:
+                    y = y[:target_len]
+                else:
+                    y = y[-target_len:]
+            assert len(y) == target_len
+            return y
+
+        historical_ids = movie_history[1:]
+        historical_ratings = movie_history_ratings[1:]
+        historical_timestamps = movie_timestamps[1:]
+        target_ids = movie_history[0]
+        target_ratings = movie_history_ratings[0]
+        target_timestamps = movie_timestamps[0]
+        if self._chronological:
+            historical_ids.reverse()
+            historical_ratings.reverse()
+            historical_timestamps.reverse()
+
+        max_seq_len = self._padding_length - 1
+        history_length = min(len(historical_ids), max_seq_len)
+        historical_ids = _truncate_or_pad_seq(
+            historical_ids,
+            max_seq_len,
+            self._chronological,
+        )
+        historical_ratings = _truncate_or_pad_seq(
+            historical_ratings,
+            max_seq_len,
+            self._chronological,
+        )
+        historical_timestamps = _truncate_or_pad_seq(
+            historical_timestamps,
+            max_seq_len,
+            self._chronological,
+        )
+        # moved to features.py
+        # if self._chronological:
+        #     historical_ids.append(0)
+        #     historical_ratings.append(0)
+        #     historical_timestamps.append(0)
+        # print(historical_ids, historical_ratings, historical_timestamps, target_ids, target_ratings, target_timestamps)
+        ret = {
+            "user_id": user_id,
+            "historical_ids": torch.tensor(historical_ids, dtype=torch.int64),
+            "historical_ratings": torch.tensor(historical_ratings, dtype=torch.int64),
+            "historical_timestamps": torch.tensor(
+                historical_timestamps, dtype=torch.int64
+            ),
+            "history_lengths": history_length,
+            "target_ids": target_ids,
+            "target_ratings": target_ratings,
+            "target_timestamps": target_timestamps,
+        }
+        return ret
+
+
+class MultiFileDatasetV2(DatasetV2, torch.utils.data.Dataset):
+    def __init__(
+        self,
+        file_prefix: str,
+        num_files: int,
+        padding_length: int,
+        ignore_last_n: int,  # used for creating train/valid/test sets
+        shift_id_by: int = 0,
+        chronological: bool = False,
+        sample_ratio: float = 1.0,
+    ) -> None:
+        torch.utils.data.Dataset().__init__()
+        self._file_prefix: str = file_prefix
+        self._num_files: int = num_files
+        with open(f"{file_prefix}_users.csv", "r") as file:
+            reader = csv.reader(file)
+            self.users_cumsum: List[int] = np.cumsum(
+                [int(row[1]) for row in reader]
+            ).tolist()
+        self._padding_length: int = padding_length
+        self._ignore_last_n: int = ignore_last_n
+        self._shift_id_by: int = shift_id_by
+        self._chronological: bool = chronological
+        self._sample_ratio: float = sample_ratio
+
+    def __len__(self) -> int:
+        return self.users_cumsum[-1]
+
+    def _process_line(self, line: str) -> pd.Series:
+        reader = csv.reader([line])
+        parsed_line = next(reader)
+        user_id = int(parsed_line[0])
+        sequence_item_ids = parsed_line[1]
+        sequence_ratings = parsed_line[2]
+        return pd.Series(
+            data={
+                "user_id": user_id,
+                "sequence_item_ids": sequence_item_ids,
+                "sequence_ratings": sequence_ratings,
+                "sequence_timestamps": sequence_item_ids,  # placeholder
+            }
+        )
+
+    def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
+        assert idx < self.users_cumsum[-1]
+        file_idx: int = 0
+        while self.users_cumsum[file_idx] <= idx:
+            file_idx += 1
+        if file_idx == 0:
+            local_idx = idx
+        else:
+            local_idx = idx - self.users_cumsum[file_idx - 1]
+        line = linecache.getline(f"{self._file_prefix}_{file_idx}.csv", local_idx + 1)
+        data = self._process_line(line)
+        sample = self.load_item(data)
+        return sample
diff --git a/recommendation_v4/generative_recommenders/research/data/eval.py b/recommendation_v4/generative_recommenders/research/data/eval.py
new file mode 100644
index 000000000..16026e5c7
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/data/eval.py
@@ -0,0 +1,263 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import logging
+import sys
+from dataclasses import dataclass
+from typing import Callable, Dict, List, Optional, Set, Union
+
+import torch
+import torch.distributed as dist
+from generative_recommenders.research.indexing.candidate_index import (
+    CandidateIndex,
+    TopKModule,
+)
+from generative_recommenders.research.modeling.sequential.features import (
+    SequentialFeatures,
+)
+from generative_recommenders.research.rails.similarities.module import SimilarityModule
+from torch.utils.tensorboard import SummaryWriter
+
+
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+
+@dataclass
+class EvalState:
+    all_item_ids: Set[int]
+    candidate_index: CandidateIndex
+    top_k_module: TopKModule
+
+
+def get_eval_state(
+    model: SimilarityModule,
+    all_item_ids: List[int],  # [X]
+    negatives_sampler: torch.nn.Module,
+    top_k_module_fn: Callable[[torch.Tensor, torch.Tensor], TopKModule],
+    device: int,
+    float_dtype: Optional[torch.dtype] = None,
+) -> EvalState:
+    # Exhaustively eval all items (incl. seen ids).
+    eval_negatives_ids = torch.as_tensor(all_item_ids).to(device).unsqueeze(0)  # [1, X]
+    # pyre-fixme[29]: `Union[Tensor, Module]` is not a function.
+    eval_negative_embeddings = negatives_sampler.normalize_embeddings(
+        # pyre-fixme[29]: `Union[Tensor, Module]` is not a function.
+        model.get_item_embeddings(eval_negatives_ids)
+    )
+    if float_dtype is not None:
+        eval_negative_embeddings = eval_negative_embeddings.to(float_dtype)
+    candidates = CandidateIndex(
+        ids=eval_negatives_ids,
+        embeddings=eval_negative_embeddings,
+    )
+    return EvalState(
+        all_item_ids=set(all_item_ids),
+        candidate_index=candidates,
+        top_k_module=top_k_module_fn(eval_negative_embeddings, eval_negatives_ids),
+    )
+
+
+@torch.inference_mode  # pyre-ignore [56]
+def eval_metrics_v2_from_tensors(
+    eval_state: EvalState,
+    model: SimilarityModule,
+    seq_features: SequentialFeatures,
+    target_ids: torch.Tensor,  # [B, 1]
+    min_positive_rating: int = 4,
+    target_ratings: Optional[torch.Tensor] = None,  # [B, 1]
+    epoch: Optional[str] = None,
+    filter_invalid_ids: bool = True,
+    user_max_batch_size: Optional[int] = None,
+    dtype: Optional[torch.dtype] = None,
+) -> Dict[str, Union[float, torch.Tensor]]:
+    """
+    Args:
+        eval_negatives_ids: Optional[Tensor]. If not present, defaults to eval over
+            the entire corpus (`num_items`) excluding all the items that users have
+            seen in the past (historical_ids, target_ids). This is consistent with
+            papers like SASRec and TDM but may not be fair in practice as retrieval
+            modules don't have access to read state during the initial fetch stage.
+        filter_invalid_ids: bool. If true, filters seen ids by default.
+    Returns:
+        keyed metric -> list of values for each example.
+    """
+    B, _ = target_ids.shape
+    device = target_ids.device
+
+    for target_id in target_ids:
+        target_id = int(target_id)
+        if target_id not in eval_state.all_item_ids:
+            print(f"missing target_id {target_id}")
+
+    # computes ro- part exactly once.
+    # pyre-fixme[29]: `Union[Tensor, Module]` is not a function.
+    shared_input_embeddings = model.encode(
+        past_lengths=seq_features.past_lengths,
+        past_ids=seq_features.past_ids,
+        # pyre-fixme[29]: `Union[Tensor, Module]` is not a function.
+        past_embeddings=model.get_item_embeddings(seq_features.past_ids),
+        past_payloads=seq_features.past_payloads,
+    )
+    if dtype is not None:
+        shared_input_embeddings = shared_input_embeddings.to(dtype)
+
+    MAX_K = 2500
+    k = min(MAX_K, eval_state.candidate_index.ids.size(1))
+    user_max_batch_size = user_max_batch_size or shared_input_embeddings.size(0)
+    num_batches = (
+        shared_input_embeddings.size(0) + user_max_batch_size - 1
+    ) // user_max_batch_size
+    eval_top_k_ids_all = []
+    eval_top_k_prs_all = []
+    for mb in range(num_batches):
+        eval_top_k_ids, eval_top_k_prs, _ = (
+            eval_state.candidate_index.get_top_k_outputs(
+                query_embeddings=shared_input_embeddings[
+                    mb * user_max_batch_size : (mb + 1) * user_max_batch_size, ...
+                ],
+                top_k_module=eval_state.top_k_module,
+                k=k,
+                invalid_ids=(
+                    seq_features.past_ids[
+                        mb * user_max_batch_size : (mb + 1) * user_max_batch_size, :
+                    ]
+                    if filter_invalid_ids
+                    else None
+                ),
+                return_embeddings=False,
+            )
+        )
+        eval_top_k_ids_all.append(eval_top_k_ids)
+        eval_top_k_prs_all.append(eval_top_k_prs)
+
+    if num_batches == 1:
+        eval_top_k_ids = eval_top_k_ids_all[0]
+        eval_top_k_prs = eval_top_k_prs_all[0]
+    else:
+        eval_top_k_ids = torch.cat(eval_top_k_ids_all, dim=0)
+        eval_top_k_prs = torch.cat(eval_top_k_prs_all, dim=0)
+
+    assert eval_top_k_ids.size(1) == k
+    _, eval_rank_indices = torch.max(
+        torch.cat(
+            [eval_top_k_ids, target_ids],
+            dim=1,
+        )
+        == target_ids,
+        dim=1,
+    )
+    eval_ranks = torch.where(eval_rank_indices == k, MAX_K + 1, eval_rank_indices + 1)
+
+    output = {
+        "ndcg@1": torch.where(
+            eval_ranks <= 1,
+            torch.div(1.0, torch.log2(eval_ranks + 1)),
+            torch.zeros(1, dtype=torch.float32, device=device),
+        ),
+        "ndcg@10": torch.where(
+            eval_ranks <= 10,
+            torch.div(1.0, torch.log2(eval_ranks + 1)),
+            torch.zeros(1, dtype=torch.float32, device=device),
+        ),
+        "ndcg@50": torch.where(
+            eval_ranks <= 50,
+            torch.div(1.0, torch.log2(eval_ranks + 1)),
+            torch.zeros(1, dtype=torch.float32, device=device),
+        ),
+        "ndcg@100": torch.where(
+            eval_ranks <= 100,
+            torch.div(1.0, torch.log2(eval_ranks + 1)),
+            torch.zeros(1, dtype=torch.float32, device=device),
+        ),
+        "ndcg@200": torch.where(
+            eval_ranks <= 200,
+            torch.div(1.0, torch.log2(eval_ranks + 1)),
+            torch.zeros(1, dtype=torch.float32, device=device),
+        ),
+        "hr@1": (eval_ranks <= 1),
+        "hr@10": (eval_ranks <= 10),
+        "hr@50": (eval_ranks <= 50),
+        "hr@100": (eval_ranks <= 100),
+        "hr@200": (eval_ranks <= 200),
+        "hr@500": (eval_ranks <= 500),
+        "hr@1000": (eval_ranks <= 1000),
+        "mrr": torch.div(1.0, eval_ranks),
+    }
+    if target_ratings is not None:
+        target_ratings = target_ratings.squeeze(1)  # [B]
+        output["ndcg@10_>=4"] = torch.where(
+            eval_ranks[target_ratings >= 4] <= 10,
+            torch.div(1.0, torch.log2(eval_ranks[target_ratings >= 4] + 1)),
+            torch.zeros(1, dtype=torch.float32, device=device),
+        )
+        output[f"hr@10_>={min_positive_rating}"] = (
+            eval_ranks[target_ratings >= min_positive_rating] <= 10
+        )
+        output[f"hr@50_>={min_positive_rating}"] = (
+            eval_ranks[target_ratings >= min_positive_rating] <= 50
+        )
+        output[f"mrr_>={min_positive_rating}"] = torch.div(
+            1.0, eval_ranks[target_ratings >= min_positive_rating]
+        )
+
+    return output  # pyre-ignore [7]
+
+
+def eval_recall_metrics_from_tensors(
+    eval_state: EvalState,
+    model: SimilarityModule,
+    seq_features: SequentialFeatures,
+    user_max_batch_size: Optional[int] = None,
+    dtype: Optional[torch.dtype] = None,
+) -> Dict[str, torch.Tensor]:
+    target_ids = seq_features.past_ids[:, -1].unsqueeze(1)
+    filtered_past_ids = seq_features.past_ids.detach().clone()
+    filtered_past_ids[:, -1] = torch.zeros_like(target_ids.squeeze(1))
+    return eval_metrics_v2_from_tensors(
+        eval_state=eval_state,
+        model=model,
+        seq_features=SequentialFeatures(
+            past_lengths=seq_features.past_lengths - 1,
+            past_ids=filtered_past_ids,
+            past_embeddings=seq_features.past_embeddings,
+            past_payloads=seq_features.past_payloads,
+        ),
+        target_ids=target_ids,
+        user_max_batch_size=user_max_batch_size,
+        dtype=dtype,
+    )
+
+
+def _avg(x: torch.Tensor, world_size: int) -> torch.Tensor:
+    _sum_and_numel = torch.tensor(
+        [x.sum(), x.numel()], dtype=torch.float32, device=x.device
+    )
+    if world_size > 1:
+        dist.all_reduce(_sum_and_numel, op=dist.ReduceOp.SUM)
+    return _sum_and_numel[0] / _sum_and_numel[1]
+
+
+def add_to_summary_writer(
+    writer: Optional[SummaryWriter],
+    batch_id: int,
+    metrics: Dict[str, torch.Tensor],
+    prefix: str,
+    world_size: int,
+) -> None:
+    for key, values in metrics.items():
+        avg_value = _avg(values, world_size)
+        if writer is not None:
+            writer.add_scalar(f"{prefix}/{key}", avg_value, batch_id)
diff --git a/recommendation_v4/generative_recommenders/research/data/item_features.py b/recommendation_v4/generative_recommenders/research/data/item_features.py
new file mode 100644
index 000000000..8ecb6ea6a
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/data/item_features.py
@@ -0,0 +1,29 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+from dataclasses import dataclass
+from typing import List
+
+import torch
+
+
+@dataclass
+class ItemFeatures:
+    num_items: int
+    max_jagged_dimension: int
+    max_ind_range: List[int]  # [(,)] x num_features
+    lengths: List[torch.Tensor]  # [(num_items,)] x num_features
+    values: List[torch.Tensor]  # [(num_items, max_jagged_dimension)] x num_features
diff --git a/recommendation_v4/generative_recommenders/research/data/preprocessor.py b/recommendation_v4/generative_recommenders/research/data/preprocessor.py
new file mode 100644
index 000000000..bf52f41da
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/data/preprocessor.py
@@ -0,0 +1,474 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import abc
+import logging
+import os
+import sys
+import tarfile
+from typing import Dict, Optional, Union
+from urllib.request import urlretrieve
+from zipfile import ZipFile
+
+import numpy as np
+import pandas as pd
+
+
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+
+class DataProcessor:
+    """
+    This preprocessor does not remap item_ids. This is intended so that we can easily join other
+    side-information based on item_ids later.
+    """
+
+    def __init__(
+        self,
+        prefix: str,
+        expected_num_unique_items: Optional[int],
+        expected_max_item_id: Optional[int],
+    ) -> None:
+        self._prefix: str = prefix
+        self._expected_num_unique_items = expected_num_unique_items
+        self._expected_max_item_id = expected_max_item_id
+
+    @abc.abstractmethod
+    def expected_num_unique_items(self) -> Optional[int]:
+        return self._expected_num_unique_items
+
+    @abc.abstractmethod
+    def expected_max_item_id(self) -> Optional[int]:
+        return self._expected_max_item_id
+
+    @abc.abstractmethod
+    def processed_item_csv(self) -> str:
+        pass
+
+    def output_format_csv(self) -> str:
+        return f"tmp/{self._prefix}/sasrec_format.csv"
+
+    def to_seq_data(
+        self,
+        ratings_data: pd.DataFrame,
+        user_data: Optional[pd.DataFrame] = None,
+    ) -> pd.DataFrame:
+        if user_data is not None:
+            ratings_data_transformed = ratings_data.join(
+                user_data.set_index("user_id"), on="user_id"
+            )
+        else:
+            ratings_data_transformed = ratings_data
+        ratings_data_transformed.item_ids = ratings_data_transformed.item_ids.apply(
+            lambda x: ",".join([str(v) for v in x])
+        )
+        ratings_data_transformed.ratings = ratings_data_transformed.ratings.apply(
+            lambda x: ",".join([str(v) for v in x])
+        )
+        ratings_data_transformed.timestamps = ratings_data_transformed.timestamps.apply(
+            lambda x: ",".join([str(v) for v in x])
+        )
+        ratings_data_transformed.rename(
+            columns={
+                "item_ids": "sequence_item_ids",
+                "ratings": "sequence_ratings",
+                "timestamps": "sequence_timestamps",
+            },
+            inplace=True,
+        )
+        return ratings_data_transformed
+
+    def file_exists(self, name: str) -> bool:
+        return os.path.isfile("%s/%s" % (os.getcwd(), name))
+
+
+class MovielensSyntheticDataProcessor(DataProcessor):
+    def __init__(
+        self,
+        prefix: str,
+        expected_num_unique_items: Optional[int] = None,
+        expected_max_item_id: Optional[int] = None,
+    ) -> None:
+        super().__init__(prefix, expected_num_unique_items, expected_max_item_id)
+
+    def preprocess_rating(self) -> None:
+        return
+
+
+class MovielensDataProcessor(DataProcessor):
+    def __init__(
+        self,
+        download_path: str,
+        saved_name: str,
+        prefix: str,
+        convert_timestamp: bool,
+        expected_num_unique_items: Optional[int] = None,
+        expected_max_item_id: Optional[int] = None,
+    ) -> None:
+        super().__init__(prefix, expected_num_unique_items, expected_max_item_id)
+        self._download_path = download_path
+        self._saved_name = saved_name
+        self._convert_timestamp: bool = convert_timestamp
+
+    def download(self) -> None:
+        if not self.file_exists(self._saved_name):
+            urlretrieve(self._download_path, self._saved_name)
+        if self._saved_name[-4:] == ".zip":
+            ZipFile(self._saved_name, "r").extractall(path="tmp/")
+        else:
+            with tarfile.open(self._saved_name, "r:*") as tar_ref:
+                tar_ref.extractall("tmp/")
+
+    def processed_item_csv(self) -> str:
+        return f"tmp/processed/{self._prefix}/movies.csv"
+
+    def sasrec_format_csv_by_user_train(self) -> str:
+        return f"tmp/{self._prefix}/sasrec_format_by_user_train.csv"
+
+    def sasrec_format_csv_by_user_test(self) -> str:
+        return f"tmp/{self._prefix}/sasrec_format_by_user_test.csv"
+
+    def preprocess_rating(self) -> int:
+        self.download()
+
+        if self._prefix == "ml-1m":
+            users = pd.read_csv(
+                f"tmp/{self._prefix}/users.dat",
+                sep="::",
+                names=["user_id", "sex", "age_group", "occupation", "zip_code"],
+            )
+            ratings = pd.read_csv(
+                f"tmp/{self._prefix}/ratings.dat",
+                sep="::",
+                names=["user_id", "movie_id", "rating", "unix_timestamp"],
+            )
+            movies = pd.read_csv(
+                f"tmp/{self._prefix}/movies.dat",
+                sep="::",
+                names=["movie_id", "title", "genres"],
+                encoding="iso-8859-1",
+            )
+        elif self._prefix == "ml-20m":
+            # ml-20m
+            # ml-20m doesn't have user data.
+            users = None
+            # ratings: userId,movieId,rating,timestamp
+            ratings = pd.read_csv(
+                f"tmp/{self._prefix}/ratings.csv",
+                sep=",",
+            )
+            ratings.rename(
+                columns={
+                    "userId": "user_id",
+                    "movieId": "movie_id",
+                    "timestamp": "unix_timestamp",
+                },
+                inplace=True,
+            )
+            # movieId,title,genres
+            # 1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
+            # 2,Jumanji (1995),Adventure|Children|Fantasy
+            movies = pd.read_csv(
+                f"tmp/{self._prefix}/movies.csv",
+                sep=",",
+                encoding="iso-8859-1",
+            )
+            movies.rename(columns={"movieId": "movie_id"}, inplace=True)
+        else:
+            assert self._prefix == "ml-20mx16x32"
+            # ml-1b
+            user_ids = []
+            movie_ids = []
+            for i in range(16):
+                train_file = f"tmp/{self._prefix}/trainx16x32_{i}.npz"
+                with np.load(train_file) as data:
+                    user_ids.extend([x[0] for x in data["arr_0"]])
+                    movie_ids.extend([x[1] for x in data["arr_0"]])
+            ratings = pd.DataFrame(
+                data={
+                    "user_id": user_ids,
+                    "movie_id": movie_ids,
+                    "rating": user_ids,  # placeholder
+                    "unix_timestamp": movie_ids,  # placeholder
+                }
+            )
+            users = None
+            movies = None
+
+        if movies is not None:
+            # ML-1M and ML-20M only
+            movies["year"] = movies["title"].apply(lambda x: x[-5:-1])
+            movies["cleaned_title"] = movies["title"].apply(lambda x: x[:-7])
+            # movies.year = pd.Categorical(movies.year)
+            # movies["year"] = movies.year.cat.codes
+
+        if users is not None:
+            ## Users (ml-1m only)
+            users.sex = pd.Categorical(users.sex)
+            users["sex"] = users.sex.cat.codes
+
+            users.age_group = pd.Categorical(users.age_group)
+            users["age_group"] = users.age_group.cat.codes
+
+            users.occupation = pd.Categorical(users.occupation)
+            users["occupation"] = users.occupation.cat.codes
+
+            users.zip_code = pd.Categorical(users.zip_code)
+            users["zip_code"] = users.zip_code.cat.codes
+
+        # Normalize movie ids to speed up training
+        print(
+            f"{self._prefix} #item before normalize: {len(set(ratings['movie_id'].values))}"
+        )
+        print(
+            f"{self._prefix} max item id before normalize: {max(set(ratings['movie_id'].values))}"
+        )
+        # print(f"ratings.movie_id.cat.categories={ratings.movie_id.cat.categories}; {type(ratings.movie_id.cat.categories)}")
+        # print(f"ratings.movie_id.cat.codes={ratings.movie_id.cat.codes}; {type(ratings.movie_id.cat.codes)}")
+        # print(movie_id_to_cat)
+        # ratings["movie_id"] = ratings.movie_id.cat.codes
+        # print(f"{self._prefix} #item after normalize: {len(set(ratings['movie_id'].values))}")
+        # print(f"{self._prefix} max item id after normalize: {max(set(ratings['movie_id'].values))}")
+        # movies["remapped_id"] = movies["movie_id"].apply(lambda x: movie_id_to_cat[x])
+
+        if self._convert_timestamp:
+            ratings["unix_timestamp"] = pd.to_datetime(
+                ratings["unix_timestamp"], unit="s"
+            )
+
+        # Save primary csv's
+        if not os.path.exists(f"tmp/processed/{self._prefix}"):
+            os.makedirs(f"tmp/processed/{self._prefix}")
+        if users is not None:
+            users.to_csv(f"tmp/processed/{self._prefix}/users.csv", index=False)
+        if movies is not None:
+            movies.to_csv(f"tmp/processed/{self._prefix}/movies.csv", index=False)
+        ratings.to_csv(f"tmp/processed/{self._prefix}/ratings.csv", index=False)
+
+        num_unique_users = len(set(ratings["user_id"].values))
+        num_unique_items = len(set(ratings["movie_id"].values))
+
+        # SASRec version
+        ratings_group = ratings.sort_values(by=["unix_timestamp"]).groupby("user_id")
+        seq_ratings_data = pd.DataFrame(
+            data={
+                "user_id": list(ratings_group.groups.keys()),
+                "item_ids": list(ratings_group.movie_id.apply(list)),
+                "ratings": list(ratings_group.rating.apply(list)),
+                "timestamps": list(ratings_group.unix_timestamp.apply(list)),
+            }
+        )
+
+        result = pd.DataFrame([[]])
+        for col in ["item_ids"]:
+            result[col + "_mean"] = seq_ratings_data[col].apply(len).mean()
+            result[col + "_min"] = seq_ratings_data[col].apply(len).min()
+            result[col + "_max"] = seq_ratings_data[col].apply(len).max()
+        print(self._prefix)
+        print(result)
+
+        seq_ratings_data = self.to_seq_data(seq_ratings_data, users)
+        seq_ratings_data.sample(frac=1).reset_index().to_csv(
+            self.output_format_csv(), index=False, sep=","
+        )
+
+        # Split by user ids (not tested yet)
+        user_id_split = int(num_unique_users * 0.9)
+        seq_ratings_data_train = seq_ratings_data[
+            seq_ratings_data["user_id"] <= user_id_split
+        ]
+        seq_ratings_data_train.sample(frac=1).reset_index().to_csv(
+            self.sasrec_format_csv_by_user_train(),
+            index=False,
+            sep=",",
+        )
+        seq_ratings_data_test = seq_ratings_data[
+            seq_ratings_data["user_id"] > user_id_split
+        ]
+        seq_ratings_data_test.sample(frac=1).reset_index().to_csv(
+            self.sasrec_format_csv_by_user_test(), index=False, sep=","
+        )
+        print(
+            f"{self._prefix}: train num user: {len(set(seq_ratings_data_train['user_id'].values))}"
+        )
+        print(
+            f"{self._prefix}: test num user: {len(set(seq_ratings_data_test['user_id'].values))}"
+        )
+
+        # print(seq_ratings_data)
+        if self.expected_num_unique_items() is not None:
+            assert self.expected_num_unique_items() == num_unique_items, (
+                f"Expected items: {self.expected_num_unique_items()}, got: {num_unique_items}"
+            )
+
+        return num_unique_items
+
+
+class AmazonDataProcessor(DataProcessor):
+    def __init__(
+        self,
+        download_path: str,
+        saved_name: str,
+        prefix: str,
+        expected_num_unique_items: Optional[int],
+    ) -> None:
+        super().__init__(
+            prefix,
+            expected_num_unique_items=expected_num_unique_items,
+            expected_max_item_id=None,
+        )
+        self._download_path = download_path
+        self._saved_name = saved_name
+        self._prefix = prefix
+
+    def download(self) -> None:
+        if not self.file_exists(self._saved_name):
+            urlretrieve(self._download_path, self._saved_name)
+
+    def preprocess_rating(self) -> int:
+        self.download()
+
+        ratings = pd.read_csv(
+            self._saved_name,
+            sep=",",
+            names=["user_id", "item_id", "rating", "timestamp"],
+        )
+        print(f"{self._prefix} #data points before filter: {ratings.shape[0]}")
+        print(
+            f"{self._prefix} #user before filter: {len(set(ratings['user_id'].values))}"
+        )
+        print(
+            f"{self._prefix} #item before filter: {len(set(ratings['item_id'].values))}"
+        )
+
+        # filter users and items with presence < 5
+        item_id_count = (
+            ratings["item_id"]
+            .value_counts()
+            .rename_axis("unique_values")
+            .reset_index(name="item_count")
+        )
+        user_id_count = (
+            ratings["user_id"]
+            .value_counts()
+            .rename_axis("unique_values")
+            .reset_index(name="user_count")
+        )
+        ratings = ratings.join(item_id_count.set_index("unique_values"), on="item_id")
+        ratings = ratings.join(user_id_count.set_index("unique_values"), on="user_id")
+        ratings = ratings[ratings["item_count"] >= 5]
+        ratings = ratings[ratings["user_count"] >= 5]
+        print(f"{self._prefix} #data points after filter: {ratings.shape[0]}")
+
+        # categorize user id and item id
+        ratings["item_id"] = pd.Categorical(ratings["item_id"])
+        ratings["item_id"] = ratings["item_id"].cat.codes
+        ratings["user_id"] = pd.Categorical(ratings["user_id"])
+        ratings["user_id"] = ratings["user_id"].cat.codes
+        print(
+            f"{self._prefix} #user after filter: {len(set(ratings['user_id'].values))}"
+        )
+        print(
+            f"{self._prefix} #item ater filter: {len(set(ratings['item_id'].values))}"
+        )
+
+        num_unique_items = len(set(ratings["item_id"].values))
+
+        # SASRec version
+        ratings_group = ratings.sort_values(by=["timestamp"]).groupby("user_id")
+
+        seq_ratings_data = pd.DataFrame(
+            data={
+                "user_id": list(ratings_group.groups.keys()),
+                "item_ids": list(ratings_group.item_id.apply(list)),
+                "ratings": list(ratings_group.rating.apply(list)),
+                "timestamps": list(ratings_group.timestamp.apply(list)),
+            }
+        )
+
+        seq_ratings_data = seq_ratings_data[
+            seq_ratings_data["item_ids"].apply(len) >= 5
+        ]
+
+        result = pd.DataFrame([[]])
+        for col in ["item_ids"]:
+            result[col + "_mean"] = seq_ratings_data[col].apply(len).mean()
+            result[col + "_min"] = seq_ratings_data[col].apply(len).min()
+            result[col + "_max"] = seq_ratings_data[col].apply(len).max()
+        print(self._prefix)
+        print(result)
+
+        if not os.path.exists(f"tmp/{self._prefix}"):
+            os.makedirs(f"tmp/{self._prefix}")
+
+        seq_ratings_data = self.to_seq_data(seq_ratings_data)
+        seq_ratings_data.sample(frac=1).reset_index().to_csv(
+            self.output_format_csv(), index=False, sep=","
+        )
+
+        if self.expected_num_unique_items() is not None:
+            assert self.expected_num_unique_items() == num_unique_items, (
+                f"expected: {self.expected_num_unique_items()}, actual: {num_unique_items}"
+            )
+            logging.info(f"{self.expected_num_unique_items()} unique items.")
+
+        return num_unique_items
+
+
+def get_common_preprocessors() -> Dict[
+    str,
+    Union[AmazonDataProcessor, MovielensDataProcessor, MovielensSyntheticDataProcessor],
+]:
+    ml_1m_dp = MovielensDataProcessor(  # pyre-ignore [45]
+        "http://files.grouplens.org/datasets/movielens/ml-1m.zip",
+        "tmp/movielens1m.zip",
+        prefix="ml-1m",
+        convert_timestamp=False,
+        expected_num_unique_items=3706,
+        expected_max_item_id=3952,
+    )
+    ml_20m_dp = MovielensDataProcessor(  # pyre-ignore [45]
+        "http://files.grouplens.org/datasets/movielens/ml-20m.zip",
+        "tmp/movielens20m.zip",
+        prefix="ml-20m",
+        convert_timestamp=False,
+        expected_num_unique_items=26744,
+        expected_max_item_id=131262,
+    )
+    ml_1b_dp = MovielensDataProcessor(  # pyre-ignore [45]
+        "https://files.grouplens.org/datasets/movielens/ml-20mx16x32.tar",
+        "tmp/movielens1b.tar",
+        prefix="ml-20mx16x32",
+        convert_timestamp=False,
+    )
+    ml_3b_dp = MovielensSyntheticDataProcessor(  # pyre-ignore [45]
+        prefix="ml-3b",
+        expected_num_unique_items=26743 * 32,
+        expected_max_item_id=26743 * 32,
+    )
+    amzn_books_dp = AmazonDataProcessor(  # pyre-ignore [45]
+        "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/ratings_Books.csv",
+        "tmp/ratings_Books.csv",
+        prefix="amzn_books",
+        expected_num_unique_items=695762,
+    )
+    return {
+        "ml-1m": ml_1m_dp,
+        "ml-20m": ml_20m_dp,
+        "ml-1b": ml_1b_dp,
+        "ml-3b": ml_3b_dp,
+        "amzn-books": amzn_books_dp,
+    }
diff --git a/recommendation_v4/generative_recommenders/research/data/reco_dataset.py b/recommendation_v4/generative_recommenders/research/data/reco_dataset.py
new file mode 100644
index 000000000..eedcdc08a
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/data/reco_dataset.py
@@ -0,0 +1,176 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+from dataclasses import dataclass
+from typing import List
+
+import pandas as pd
+import torch
+from generative_recommenders.research.data.dataset import DatasetV2, MultiFileDatasetV2
+from generative_recommenders.research.data.item_features import ItemFeatures
+from generative_recommenders.research.data.preprocessor import get_common_preprocessors
+
+
+@dataclass
+class RecoDataset:
+    max_sequence_length: int
+    num_unique_items: int
+    max_item_id: int
+    all_item_ids: List[int]
+    train_dataset: torch.utils.data.Dataset
+    eval_dataset: torch.utils.data.Dataset
+
+
+def get_reco_dataset(
+    dataset_name: str,
+    max_sequence_length: int,
+    chronological: bool,
+    positional_sampling_ratio: float = 1.0,
+) -> RecoDataset:
+    if dataset_name == "ml-1m":
+        dp = get_common_preprocessors()[dataset_name]
+        train_dataset = DatasetV2(
+            ratings_file=dp.output_format_csv(),
+            padding_length=max_sequence_length + 1,  # target
+            ignore_last_n=1,
+            chronological=chronological,
+            sample_ratio=positional_sampling_ratio,
+        )
+        eval_dataset = DatasetV2(
+            ratings_file=dp.output_format_csv(),
+            padding_length=max_sequence_length + 1,  # target
+            ignore_last_n=0,
+            chronological=chronological,
+            sample_ratio=1.0,  # do not sample
+        )
+    elif dataset_name == "ml-20m":
+        dp = get_common_preprocessors()[dataset_name]
+        train_dataset = DatasetV2(
+            ratings_file=dp.output_format_csv(),
+            padding_length=max_sequence_length + 1,  # target
+            ignore_last_n=1,
+            chronological=chronological,
+        )
+        eval_dataset = DatasetV2(
+            ratings_file=dp.output_format_csv(),
+            padding_length=max_sequence_length + 1,  # target
+            ignore_last_n=0,
+            chronological=chronological,
+        )
+    elif dataset_name == "ml-3b":
+        dp = get_common_preprocessors()[dataset_name]
+        train_dataset = MultiFileDatasetV2(
+            file_prefix="tmp/ml-3b/16x32",
+            num_files=16,
+            padding_length=max_sequence_length + 1,  # target
+            ignore_last_n=1,
+            chronological=chronological,
+        )
+        eval_dataset = MultiFileDatasetV2(
+            file_prefix="tmp/ml-3b/16x32",
+            num_files=16,
+            padding_length=max_sequence_length + 1,  # target
+            ignore_last_n=0,
+            chronological=chronological,
+        )
+    elif dataset_name == "amzn-books":
+        dp = get_common_preprocessors()[dataset_name]
+        train_dataset = DatasetV2(
+            ratings_file=dp.output_format_csv(),
+            padding_length=max_sequence_length + 1,  # target
+            ignore_last_n=1,
+            shift_id_by=1,  # [0..n-1] -> [1..n]
+            chronological=chronological,
+        )
+        eval_dataset = DatasetV2(
+            ratings_file=dp.output_format_csv(),
+            padding_length=max_sequence_length + 1,  # target
+            ignore_last_n=0,
+            shift_id_by=1,  # [0..n-1] -> [1..n]
+            chronological=chronological,
+        )
+    else:
+        raise ValueError(f"Unknown dataset {dataset_name}")
+
+    if dataset_name == "ml-1m" or dataset_name == "ml-20m":
+        items = pd.read_csv(dp.processed_item_csv(), delimiter=",")
+        max_jagged_dimension = 16
+        expected_max_item_id = dp.expected_max_item_id()
+        assert expected_max_item_id is not None
+        item_features: ItemFeatures = ItemFeatures(
+            max_ind_range=[63, 16383, 511],
+            num_items=expected_max_item_id + 1,
+            max_jagged_dimension=max_jagged_dimension,
+            lengths=[
+                torch.zeros((expected_max_item_id + 1,), dtype=torch.int64),
+                torch.zeros((expected_max_item_id + 1,), dtype=torch.int64),
+                torch.zeros((expected_max_item_id + 1,), dtype=torch.int64),
+            ],
+            values=[
+                torch.zeros(
+                    (expected_max_item_id + 1, max_jagged_dimension),
+                    dtype=torch.int64,
+                ),
+                torch.zeros(
+                    (expected_max_item_id + 1, max_jagged_dimension),
+                    dtype=torch.int64,
+                ),
+                torch.zeros(
+                    (expected_max_item_id + 1, max_jagged_dimension),
+                    dtype=torch.int64,
+                ),
+            ],
+        )
+        all_item_ids = []
+        for df_index, row in items.iterrows():
+            # print(f"index {df_index}: {row}")
+            movie_id = int(row["movie_id"])
+            genres = row["genres"].split("|")
+            titles = row["cleaned_title"].split(" ")
+            # print(f"{index}: genres{genres}, title{titles}")
+            genres_vector = [hash(x) % item_features.max_ind_range[0] for x in genres]
+            titles_vector = [hash(x) % item_features.max_ind_range[1] for x in titles]
+            years_vector = [hash(row["year"]) % item_features.max_ind_range[2]]
+            item_features.lengths[0][movie_id] = min(
+                len(genres_vector), max_jagged_dimension
+            )
+            item_features.lengths[1][movie_id] = min(
+                len(titles_vector), max_jagged_dimension
+            )
+            item_features.lengths[2][movie_id] = min(
+                len(years_vector), max_jagged_dimension
+            )
+            for f, f_values in enumerate([genres_vector, titles_vector, years_vector]):
+                for j in range(min(len(f_values), max_jagged_dimension)):
+                    item_features.values[f][movie_id][j] = f_values[j]
+            all_item_ids.append(movie_id)
+        max_item_id = dp.expected_max_item_id()
+        for x in all_item_ids:
+            assert x > 0, "x in all_item_ids should be positive"
+    else:
+        # expected_max_item_id and item_features are not set for Amazon datasets.
+        item_features = None
+        max_item_id = dp.expected_num_unique_items()
+        all_item_ids = [x + 1 for x in range(max_item_id)]  # pyre-ignore [6]
+
+    return RecoDataset(
+        max_sequence_length=max_sequence_length,
+        num_unique_items=dp.expected_num_unique_items(),  # pyre-ignore [6]
+        max_item_id=max_item_id,  # pyre-ignore [6]
+        all_item_ids=all_item_ids,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+    )
diff --git a/recommendation_v4/generative_recommenders/research/indexing/candidate_index.py b/recommendation_v4/generative_recommenders/research/indexing/candidate_index.py
new file mode 100644
index 000000000..fee763eaa
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/indexing/candidate_index.py
@@ -0,0 +1,179 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+from typing import Optional, Tuple
+
+import torch
+from generative_recommenders.research.rails.indexing.candidate_index import TopKModule
+
+
+class CandidateIndex(object):
+    def __init__(
+        self,
+        ids: torch.Tensor,
+        embeddings: torch.Tensor,
+        invalid_ids: Optional[torch.Tensor] = None,
+        debug_path: Optional[str] = None,
+    ) -> None:
+        super().__init__()
+
+        self._ids: torch.Tensor = ids
+        self._embeddings: torch.Tensor = embeddings
+        self._invalid_ids: Optional[torch.Tensor] = invalid_ids
+        self._debug_path: Optional[str] = debug_path
+
+    @property
+    def ids(self) -> torch.Tensor:
+        """
+        Returns:
+            (1, X) or (B, X), where valid ids are positive integers.
+        """
+        return self._ids
+
+    @property
+    def num_objects(self) -> int:
+        return self._ids.size(1)
+
+    @property
+    def embeddings(self) -> torch.Tensor:
+        """
+        Returns:
+            (1, X, D) or (B, X, D) with the same shape as `ids'.
+        """
+        return self._embeddings
+
+    def filter_invalid_ids(
+        self,
+        invalid_ids: torch.Tensor,
+    ) -> "CandidateIndex":
+        """
+        Filters invalid_ids (batch dimension dependent) from the current index.
+
+        Args:
+            invalid_ids: (B, N) x int64.
+
+        Returns:
+            CandidateIndex with invalid_ids filtered.
+        """
+        X = self._ids.size(1)
+        if self._ids.size(0) == 1:
+            # ((1, X, 1) == (B, 1, N)) -> (B, X)
+            invalid_mask, _ = (self._ids.unsqueeze(2) == invalid_ids.unsqueeze(1)).max(
+                dim=2
+            )
+            lengths = (~invalid_mask).int().sum(-1)  # (B,)
+            valid_1d_mask = (~invalid_mask).view(-1)
+            B: int = lengths.size(0)
+            D: int = self._embeddings.size(-1)
+            jagged_ids = self._ids.expand(B, -1).reshape(-1)[valid_1d_mask]
+            jagged_embeddings = self._embeddings.expand(B, -1, -1).reshape(-1, D)[
+                valid_1d_mask
+            ]
+            X_prime: int = lengths.max(-1)[0].item()
+            jagged_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)
+            return CandidateIndex(
+                ids=torch.ops.fbgemm.jagged_to_padded_dense(
+                    values=jagged_ids.unsqueeze(-1),
+                    offsets=[jagged_offsets],
+                    max_lengths=[X_prime],
+                    padding_value=0,
+                ).squeeze(-1),
+                embeddings=torch.ops.fbgemm.jagged_to_padded_dense(
+                    values=jagged_embeddings,
+                    offsets=[jagged_offsets],
+                    max_lengths=[X_prime],
+                    padding_value=0.0,
+                ),
+                debug_path=self._debug_path,
+            )
+        else:
+            assert self._invalid_ids == None
+            return CandidateIndex(
+                ids=self.ids,
+                embeddings=self.embeddings,
+                invalid_ids=invalid_ids,
+                debug_path=self._debug_path,
+            )
+
+    def get_top_k_outputs(
+        self,
+        query_embeddings: torch.Tensor,
+        k: int,
+        top_k_module: TopKModule,
+        invalid_ids: Optional[torch.Tensor],
+        r: int = 1,
+        return_embeddings: bool = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Gets top-k outputs specified by `policy_fn', while filtering out
+        invalid ids per row as specified by `invalid_ids'.
+
+        Args:
+            k: int. top k to return.
+            policy_fn: lambda that takes in item-side embeddings (B, X, D,) and user-side
+                embeddings (B * r, ...), and returns predictions (unnormalized logits)
+                of shape (B * r, X,).
+            invalid_ids: (B * r, N_0) x int64. The list of ids (if > 0) to filter from
+                results if present. Expect N_0 to be a small constant.
+            return_embeddings: bool if we should additionally return embeddings for the
+                top k results.
+
+        Returns:
+            A tuple of (top_k_ids, top_k_prs, top_k_embeddings) of shape (B * r, k, ...).
+        """
+        B: int = query_embeddings.size(0)
+        max_num_invalid_ids = 0
+        if invalid_ids is not None:
+            max_num_invalid_ids = invalid_ids.size(1)
+
+        k_prime = min(k + max_num_invalid_ids, self.num_objects)
+        top_k_prime_scores, top_k_prime_ids = top_k_module(
+            query_embeddings=query_embeddings, k=k_prime
+        )
+        # Masks out invalid items rowwise.
+        if invalid_ids is not None:
+            id_is_valid = ~(
+                (top_k_prime_ids.unsqueeze(2) == invalid_ids.unsqueeze(1)).max(2)[0]
+            )  # [B, K + N_0]
+            id_is_valid = torch.logical_and(
+                id_is_valid, torch.cumsum(id_is_valid.int(), dim=1) <= k
+            )
+            # [[1, 0, 1, 0], [0, 1, 1, 1]], k=2 -> [[0, 2], [1, 2]]
+            top_k_rowwise_offsets = torch.nonzero(id_is_valid, as_tuple=True)[1].view(
+                -1, k
+            )
+            top_k_scores = torch.gather(
+                top_k_prime_scores, dim=1, index=top_k_rowwise_offsets
+            )
+            top_k_ids = torch.gather(
+                top_k_prime_ids, dim=1, index=top_k_rowwise_offsets
+            )
+        else:
+            top_k_scores = top_k_prime_scores
+            top_k_ids = top_k_prime_ids
+
+        # TODO: this should be decoupled from candidate_index.
+        if return_embeddings:
+            raise ValueError("return_embeddings not supported yet.")
+        else:
+            top_k_embeddings = None
+        return top_k_ids, top_k_scores, top_k_embeddings
+
+    def apply_object_filter(self) -> "CandidateIndex":
+        """
+        Applies general per batch filters.
+        """
+        raise NotImplementedError("not implemented.")
diff --git a/recommendation_v4/generative_recommenders/research/indexing/utils.py b/recommendation_v4/generative_recommenders/research/indexing/utils.py
new file mode 100644
index 000000000..972d3c2e7
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/indexing/utils.py
@@ -0,0 +1,43 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import torch
+from generative_recommenders.research.rails.indexing.candidate_index import TopKModule
+from generative_recommenders.research.rails.indexing.mips_top_k import (
+    MIPSBruteForceTopK,
+)
+from generative_recommenders.research.rails.indexing.mol_top_k import MoLBruteForceTopK
+
+
+def get_top_k_module(
+    top_k_method: str,
+    model: torch.nn.Module,
+    item_embeddings: torch.Tensor,
+    item_ids: torch.Tensor,
+) -> TopKModule:
+    if top_k_method == "MIPSBruteForceTopK":
+        top_k_module = MIPSBruteForceTopK(
+            item_embeddings=item_embeddings,
+            item_ids=item_ids,
+        )
+    elif top_k_method == "MoLBruteForceTopK":
+        top_k_module = MoLBruteForceTopK(  # pyre-ignore [20]
+            item_embeddings=item_embeddings,
+            item_ids=item_ids,
+        )
+    else:
+        raise ValueError(f"Invalid top-k method {top_k_method}")
+    return top_k_module
diff --git a/recommendation_v4/generative_recommenders/research/modeling/initialization.py b/recommendation_v4/generative_recommenders/research/modeling/initialization.py
new file mode 100644
index 000000000..c80d60075
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/modeling/initialization.py
@@ -0,0 +1,35 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import torch
+
+
+def truncated_normal(x: torch.Tensor, mean: float, std: float) -> torch.Tensor:
+    with torch.no_grad():
+        size = x.shape
+        tmp = x.new_empty(size + (4,)).normal_()
+        valid = (tmp < 2) & (tmp > -2)
+        ind = valid.max(-1, keepdim=True)[1]
+        x.data.copy_(tmp.gather(-1, ind).squeeze(-1))
+        x.data.mul_(std).add_(mean)
+        return x
+
+
+def init_mlp_xavier_weights_zero_bias(m: torch.nn.Module) -> None:
+    if isinstance(m, torch.nn.Linear):
+        torch.nn.init.xavier_uniform(m.weight)
+        if getattr(m, "bias", None) is not None:
+            m.bias.data.fill_(0.0)
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/autoregressive_losses.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/autoregressive_losses.py
new file mode 100644
index 000000000..c32bedf0e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/modeling/sequential/autoregressive_losses.py
@@ -0,0 +1,477 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import abc
+from collections import OrderedDict
+from typing import List, Tuple
+
+import torch
+import torch.nn.functional as F
+from generative_recommenders.research.rails.similarities.module import SimilarityModule
+from torch.utils.checkpoint import checkpoint
+
+
+class NegativesSampler(torch.nn.Module):
+    def __init__(self, l2_norm: bool, l2_norm_eps: float) -> None:
+        super().__init__()
+
+        self._l2_norm: bool = l2_norm
+        self._l2_norm_eps: float = l2_norm_eps
+
+    def normalize_embeddings(self, x: torch.Tensor) -> torch.Tensor:
+        return self._maybe_l2_norm(x)
+
+    def _maybe_l2_norm(self, x: torch.Tensor) -> torch.Tensor:
+        if self._l2_norm:
+            x = x / torch.clamp(
+                torch.linalg.norm(x, ord=2, dim=-1, keepdim=True),
+                min=self._l2_norm_eps,
+            )
+        return x
+
+    @abc.abstractmethod
+    def debug_str(self) -> str:
+        pass
+
+    @abc.abstractmethod
+    def process_batch(
+        self,
+        ids: torch.Tensor,
+        presences: torch.Tensor,
+        embeddings: torch.Tensor,
+    ) -> None:
+        pass
+
+    @abc.abstractmethod
+    def forward(
+        self,
+        positive_ids: torch.Tensor,
+        num_to_sample: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Returns:
+            A tuple of (sampled_ids, sampled_negative_embeddings).
+        """
+        pass
+
+
+class LocalNegativesSampler(NegativesSampler):
+    def __init__(
+        self,
+        num_items: int,
+        item_emb: torch.nn.Embedding,
+        all_item_ids: List[int],
+        l2_norm: bool,
+        l2_norm_eps: float,
+    ) -> None:
+        super().__init__(l2_norm=l2_norm, l2_norm_eps=l2_norm_eps)
+
+        self._num_items: int = len(all_item_ids)
+        self._item_emb: torch.nn.Embedding = item_emb
+        self.register_buffer("_all_item_ids", torch.tensor(all_item_ids))
+
+    def debug_str(self) -> str:
+        sampling_debug_str = (
+            f"local{f'-l2-eps{self._l2_norm_eps}' if self._l2_norm else ''}"
+        )
+        return sampling_debug_str
+
+    def process_batch(
+        self,
+        ids: torch.Tensor,
+        presences: torch.Tensor,
+        embeddings: torch.Tensor,
+    ) -> None:
+        pass
+
+    def forward(
+        self,
+        positive_ids: torch.Tensor,
+        num_to_sample: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Returns:
+            A tuple of (sampled_ids, sampled_negative_embeddings).
+        """
+        # assert torch.max(torch.abs(self._item_emb(positive_ids) - positive_embeddings)) < 1e-4
+        output_shape = positive_ids.size() + (num_to_sample,)
+        sampled_offsets = torch.randint(
+            low=0,
+            high=self._num_items,
+            size=output_shape,
+            dtype=positive_ids.dtype,
+            device=positive_ids.device,
+        )
+        sampled_ids = self._all_item_ids[sampled_offsets.view(-1)].reshape(output_shape)
+        return sampled_ids, self.normalize_embeddings(self._item_emb(sampled_ids))
+
+
+class InBatchNegativesSampler(NegativesSampler):
+    def __init__(
+        self,
+        l2_norm: bool,
+        l2_norm_eps: float,
+        dedup_embeddings: bool,
+    ) -> None:
+        super().__init__(l2_norm=l2_norm, l2_norm_eps=l2_norm_eps)
+
+        self._dedup_embeddings: bool = dedup_embeddings
+
+    def debug_str(self) -> str:
+        sampling_debug_str = (
+            f"in-batch{f'-l2-eps{self._l2_norm_eps}' if self._l2_norm else ''}"
+        )
+        if self._dedup_embeddings:
+            sampling_debug_str += "-dedup"
+        return sampling_debug_str
+
+    def process_batch(
+        self,
+        ids: torch.Tensor,
+        presences: torch.Tensor,
+        embeddings: torch.Tensor,
+    ) -> None:
+        """
+        Args:
+           ids: (N') or (B, N) x int64
+           presences: (N') or (B, N) x bool
+           embeddings: (N', D) or (B, N, D) x float
+        """
+        assert ids.size() == presences.size()
+        assert ids.size() == embeddings.size()[:-1]
+        if self._dedup_embeddings:
+            valid_ids = ids[presences]
+            unique_ids, unique_ids_inverse_indices = torch.unique(
+                input=valid_ids, sorted=False, return_inverse=True
+            )
+            device = unique_ids.device
+            unique_embedding_offsets = torch.empty(
+                (unique_ids.numel(),),
+                dtype=torch.int64,
+                device=device,
+            )
+            unique_embedding_offsets[unique_ids_inverse_indices] = torch.arange(
+                valid_ids.numel(), dtype=torch.int64, device=device
+            )
+            unique_embeddings = embeddings[presences][unique_embedding_offsets, :]
+            self._cached_embeddings = self._maybe_l2_norm(  # pyre-ignore [16]
+                unique_embeddings
+            )
+            self._cached_ids = unique_ids  # pyre-ignore [16]
+        else:
+            self._cached_embeddings = self._maybe_l2_norm(embeddings[presences])
+            self._cached_ids = ids[presences]
+
+    def get_all_ids_and_embeddings(self) -> Tuple[torch.Tensor, torch.Tensor]:
+        return self._cached_ids, self._cached_embeddings  # pyre-ignore [7]
+
+    def forward(
+        self,
+        positive_ids: torch.Tensor,
+        num_to_sample: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Returns:
+            A tuple of (sampled_ids, sampled_negative_embeddings,).
+        """
+        X = self._cached_ids.size(0)
+        sampled_offsets = torch.randint(
+            low=0,
+            high=X,
+            size=positive_ids.size() + (num_to_sample,),
+            dtype=positive_ids.dtype,
+            device=positive_ids.device,
+        )
+        return (
+            self._cached_ids[sampled_offsets],  # pyre-ignore [29]
+            self._cached_embeddings[sampled_offsets],  # pyre-ignore [29]
+        )
+
+
+class AutoregressiveLoss(torch.nn.Module):
+    @abc.abstractmethod
+    def jagged_forward(
+        self,
+        output_embeddings: torch.Tensor,
+        supervision_ids: torch.Tensor,
+        supervision_embeddings: torch.Tensor,
+        supervision_weights: torch.Tensor,
+        negatives_sampler: NegativesSampler,
+    ) -> torch.Tensor:
+        """
+        Variant of forward() when the tensors are already in jagged format.
+
+        Args:
+            output_embeddings: [N', D] x float, embeddings for the current
+                input sequence.
+            supervision_ids: [N'] x int64, (positive) supervision ids.
+            supervision_embeddings: [N', D] x float.
+            supervision_weights: Optional [N'] x float. Optional weights for
+                masking out invalid positions, or reweighting supervision labels.
+            negatives_sampler: sampler used to obtain negative examples paired with
+                positives.
+
+        Returns:
+            (1), loss for the current engaged sequence.
+        """
+        pass
+
+    @abc.abstractmethod
+    def forward(
+        self,
+        lengths: torch.Tensor,
+        output_embeddings: torch.Tensor,
+        supervision_ids: torch.Tensor,
+        supervision_embeddings: torch.Tensor,
+        supervision_weights: torch.Tensor,
+        negatives_sampler: NegativesSampler,
+    ) -> torch.Tensor:
+        """
+        Args:
+            lengths: [B] x int32 representing number of non-zero elements per row.
+            output_embeddings: [B, N, D] x float, embeddings for the current
+                input sequence.
+            supervision_ids: [B, N] x int64, (positive) supervision ids.
+            supervision_embeddings: [B, N, D] x float.
+            supervision_weights: Optional [B, N] x float. Optional weights for
+                masking out invalid positions, or reweighting supervision labels.
+            negatives_sampler: sampler used to obtain negative examples paired with
+                positives.
+
+        Returns:
+            (1), loss for the current engaged sequence.
+        """
+        pass
+
+
+class BCELoss(AutoregressiveLoss):
+    def __init__(
+        self,
+        temperature: float,
+        model: SimilarityModule,
+    ) -> None:
+        super().__init__()
+        self._temperature: float = temperature
+        self._model = model
+
+    def jagged_forward(
+        self,
+        output_embeddings: torch.Tensor,
+        supervision_ids: torch.Tensor,
+        supervision_embeddings: torch.Tensor,
+        supervision_weights: torch.Tensor,
+        negatives_sampler: NegativesSampler,
+    ) -> torch.Tensor:
+        assert output_embeddings.size() == supervision_embeddings.size()
+        assert supervision_ids.size() == supervision_embeddings.size()[:-1]
+        assert supervision_ids.size() == supervision_weights.size()
+
+        sampled_ids, sampled_negative_embeddings = negatives_sampler(
+            positive_ids=supervision_ids,
+            num_to_sample=1,
+        )
+
+        positive_logits = (
+            self._model.interaction(  # pyre-ignore [29]
+                input_embeddings=output_embeddings,  # [B, D] = [N', D]
+                target_ids=supervision_ids.unsqueeze(1),  # [N', 1]
+                target_embeddings=supervision_embeddings.unsqueeze(
+                    1
+                ),  # [N', D] -> [N', 1, D]
+            )[0].squeeze(1)
+            / self._temperature
+        )  # [N']
+
+        sampled_negatives_logits = (
+            self._model.interaction(  # pyre-ignore [29]
+                input_embeddings=output_embeddings,  # [N', D]
+                target_ids=sampled_ids,  # [N', 1]
+                target_embeddings=sampled_negative_embeddings,  # [N', 1, D]
+            )[0].squeeze(1)
+            / self._temperature
+        )  # [N']
+        sampled_negatives_valid_mask = (
+            supervision_ids != sampled_ids.squeeze(1)
+        ).float()  # [N']
+        loss_weights = supervision_weights * sampled_negatives_valid_mask
+        weighted_losses = (
+            (
+                F.binary_cross_entropy_with_logits(
+                    input=positive_logits,
+                    target=torch.ones_like(positive_logits),
+                    reduction="none",
+                )
+                + F.binary_cross_entropy_with_logits(
+                    input=sampled_negatives_logits,
+                    target=torch.zeros_like(sampled_negatives_logits),
+                    reduction="none",
+                )
+            )
+            * loss_weights
+            * 0.5
+        )
+        return weighted_losses.sum() / loss_weights.sum()
+
+    def forward(
+        self,
+        lengths: torch.Tensor,
+        output_embeddings: torch.Tensor,
+        supervision_ids: torch.Tensor,
+        supervision_embeddings: torch.Tensor,
+        supervision_weights: torch.Tensor,
+        negatives_sampler: NegativesSampler,
+    ) -> torch.Tensor:
+        """
+        Args:
+          lengths: [B] x int32 representing number of non-zero elements per row.
+          output_embeddings: [B, N, D] x float, embeddings for the current
+              input sequence.
+          supervision_ids: [B, N] x int64, (positive) supervision ids.
+          supervision_embeddings: [B, N, D] x float.
+          supervision_weights: Optional [B, N] x float. Optional weights for
+              masking out invalid positions, or reweighting supervision labels.
+          negatives_sampler: sampler used to obtain negative examples paired with
+              positives.
+        Returns:
+          (1), loss for the current engaged sequence.
+        """
+        assert output_embeddings.size() == supervision_embeddings.size()
+        assert supervision_ids.size() == supervision_embeddings.size()[:-1]
+        jagged_id_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)
+        jagged_supervision_ids = (
+            torch.ops.fbgemm.dense_to_jagged(
+                supervision_ids.unsqueeze(-1).float(), [jagged_id_offsets]
+            )[0]
+            .squeeze(1)
+            .long()
+        )
+        jagged_supervision_weights = torch.ops.fbgemm.dense_to_jagged(
+            supervision_weights.unsqueeze(-1),
+            [jagged_id_offsets],
+        )[0].squeeze(1)
+        return self.jagged_forward(
+            output_embeddings=torch.ops.fbgemm.dense_to_jagged(
+                output_embeddings,
+                [jagged_id_offsets],
+            )[0],
+            supervision_ids=jagged_supervision_ids,
+            supervision_embeddings=torch.ops.fbgemm.dense_to_jagged(
+                supervision_embeddings,
+                [jagged_id_offsets],
+            )[0],
+            supervision_weights=jagged_supervision_weights,
+            negatives_sampler=negatives_sampler,
+        )
+
+
+class BCELossWithRatings(AutoregressiveLoss):
+    def __init__(
+        self,
+        temperature: float,
+        model: SimilarityModule,
+    ) -> None:
+        super().__init__()
+        self._temperature: float = temperature
+        self._model = model
+
+    def jagged_forward(
+        self,
+        output_embeddings: torch.Tensor,
+        supervision_ids: torch.Tensor,
+        supervision_embeddings: torch.Tensor,
+        supervision_weights: torch.Tensor,
+        supervision_ratings: torch.Tensor,
+        negatives_sampler: NegativesSampler,
+    ) -> torch.Tensor:
+        assert output_embeddings.size() == supervision_embeddings.size()
+        assert supervision_ids.size() == supervision_embeddings.size()[:-1]
+        assert supervision_ids.size() == supervision_weights.size()
+
+        target_logits = (
+            self._model.interaction(  # pyre-ignore [29]
+                input_embeddings=output_embeddings,  # [B, D] = [N', D]
+                target_ids=supervision_ids.unsqueeze(1),  # [N', 1]
+                target_embeddings=supervision_embeddings.unsqueeze(
+                    1
+                ),  # [N', D] -> [N', 1, D]
+            )[0].squeeze(1)
+            / self._temperature
+        )  # [N', 1]
+
+        weighted_losses = (
+            F.binary_cross_entropy_with_logits(
+                input=target_logits,
+                target=supervision_ratings.to(dtype=target_logits.dtype),
+                reduction="none",
+            )
+        ) * supervision_weights
+        return weighted_losses.sum() / supervision_weights.sum()
+
+    def forward(
+        self,
+        lengths: torch.Tensor,
+        output_embeddings: torch.Tensor,
+        supervision_ids: torch.Tensor,
+        supervision_embeddings: torch.Tensor,
+        supervision_weights: torch.Tensor,
+        supervision_ratings: torch.Tensor,
+        negatives_sampler: NegativesSampler,
+    ) -> torch.Tensor:
+        """
+        Args:
+          lengths: [B] x int32 representing number of non-zero elements per row.
+          output_embeddings: [B, N, D] x float, embeddings for the current
+              input sequence.
+          supervision_ids: [B, N] x int64, (positive) supervision ids.
+          supervision_embeddings: [B, N, D] x float.
+          supervision_weights: Optional [B, N] x float. Optional weights for
+              masking out invalid positions, or reweighting supervision labels.
+          negatives_sampler: sampler used to obtain negative examples paired with
+              positives.
+        Returns:
+          (1), loss for the current engaged sequence.
+        """
+        assert output_embeddings.size() == supervision_embeddings.size()
+        assert supervision_ids.size() == supervision_embeddings.size()[:-1]
+        jagged_id_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)
+        jagged_supervision_ids = (
+            torch.ops.fbgemm.dense_to_jagged(
+                supervision_ids.unsqueeze(-1).float(), [jagged_id_offsets]
+            )[0]
+            .squeeze(1)
+            .long()
+        )
+        jagged_supervision_weights = torch.ops.fbgemm.dense_to_jagged(
+            supervision_weights.unsqueeze(-1),
+            [jagged_id_offsets],
+        )[0].squeeze(1)
+        return self.jagged_forward(
+            output_embeddings=torch.ops.fbgemm.dense_to_jagged(
+                output_embeddings,
+                [jagged_id_offsets],
+            )[0],
+            supervision_ids=jagged_supervision_ids,
+            supervision_embeddings=torch.ops.fbgemm.dense_to_jagged(
+                supervision_embeddings,
+                [jagged_id_offsets],
+            )[0],
+            supervision_weights=jagged_supervision_weights,
+            supervision_ratings=torch.ops.fbgemm.dense_to_jagged(
+                supervision_ratings.unsqueeze(-1),
+                [jagged_id_offsets],
+            )[0].squeeze(1),
+            negatives_sampler=negatives_sampler,
+        )
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/embedding_modules.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/embedding_modules.py
new file mode 100644
index 000000000..6e85a62dd
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/modeling/sequential/embedding_modules.py
@@ -0,0 +1,108 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import abc
+
+import torch
+from generative_recommenders.research.modeling.initialization import truncated_normal
+
+
+class EmbeddingModule(torch.nn.Module):
+    @abc.abstractmethod
+    def debug_str(self) -> str:
+        pass
+
+    @abc.abstractmethod
+    def get_item_embeddings(self, item_ids: torch.Tensor) -> torch.Tensor:
+        pass
+
+    @property
+    @abc.abstractmethod
+    def item_embedding_dim(self) -> int:
+        pass
+
+
+class LocalEmbeddingModule(EmbeddingModule):
+    def __init__(
+        self,
+        num_items: int,
+        item_embedding_dim: int,
+    ) -> None:
+        super().__init__()
+
+        self._item_embedding_dim: int = item_embedding_dim
+        self._item_emb = torch.nn.Embedding(
+            num_items + 1, item_embedding_dim, padding_idx=0
+        )
+        self.reset_params()
+
+    def debug_str(self) -> str:
+        return f"local_emb_d{self._item_embedding_dim}"
+
+    def reset_params(self) -> None:
+        for name, params in self.named_parameters():
+            if "_item_emb" in name:
+                print(
+                    f"Initialize {name} as truncated normal: {params.data.size()} params"
+                )
+                truncated_normal(params, mean=0.0, std=0.02)
+            else:
+                print(f"Skipping initializing params {name} - not configured")
+
+    def get_item_embeddings(self, item_ids: torch.Tensor) -> torch.Tensor:
+        return self._item_emb(item_ids)
+
+    @property
+    def item_embedding_dim(self) -> int:
+        return self._item_embedding_dim
+
+
+class CategoricalEmbeddingModule(EmbeddingModule):
+    def __init__(
+        self,
+        num_items: int,
+        item_embedding_dim: int,
+        item_id_to_category_id: torch.Tensor,
+    ) -> None:
+        super().__init__()
+
+        self._item_embedding_dim: int = item_embedding_dim
+        self._item_emb: torch.nn.Embedding = torch.nn.Embedding(
+            num_items + 1, item_embedding_dim, padding_idx=0
+        )
+        self.register_buffer("_item_id_to_category_id", item_id_to_category_id)
+        self.reset_params()
+
+    def debug_str(self) -> str:
+        return f"cat_emb_d{self._item_embedding_dim}"
+
+    def reset_params(self) -> None:
+        for name, params in self.named_parameters():
+            if "_item_emb" in name:
+                print(
+                    f"Initialize {name} as truncated normal: {params.data.size()} params"
+                )
+                truncated_normal(params, mean=0.0, std=0.02)
+            else:
+                print(f"Skipping initializing params {name} - not configured")
+
+    def get_item_embeddings(self, item_ids: torch.Tensor) -> torch.Tensor:
+        item_ids = self._item_id_to_category_id[(item_ids - 1).clamp(min=0)] + 1
+        return self._item_emb(item_ids)
+
+    @property
+    def item_embedding_dim(self) -> int:
+        return self._item_embedding_dim
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/encoder_utils.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/encoder_utils.py
new file mode 100644
index 000000000..dc64aa2cf
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/modeling/sequential/encoder_utils.py
@@ -0,0 +1,150 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import gin
+from generative_recommenders.research.modeling.sequential.embedding_modules import (
+    EmbeddingModule,
+)
+from generative_recommenders.research.modeling.sequential.hstu import HSTU
+from generative_recommenders.research.modeling.sequential.input_features_preprocessors import (
+    InputFeaturesPreprocessorModule,
+)
+from generative_recommenders.research.modeling.sequential.output_postprocessors import (
+    OutputPostprocessorModule,
+)
+from generative_recommenders.research.modeling.sequential.sasrec import SASRec
+from generative_recommenders.research.modeling.similarity_module import (
+    SequentialEncoderWithLearnedSimilarityModule,
+)
+from generative_recommenders.research.rails.similarities.module import SimilarityModule
+
+
+@gin.configurable
+def sasrec_encoder(
+    max_sequence_length: int,
+    max_output_length: int,
+    embedding_module: EmbeddingModule,
+    similarity_module: SimilarityModule,
+    input_preproc_module: InputFeaturesPreprocessorModule,
+    output_postproc_module: OutputPostprocessorModule,
+    activation_checkpoint: bool,
+    verbose: bool,
+    ffn_hidden_dim: int = 64,
+    ffn_activation_fn: str = "relu",
+    ffn_dropout_rate: float = 0.2,
+    num_blocks: int = 2,
+    num_heads: int = 1,
+) -> SequentialEncoderWithLearnedSimilarityModule:
+    return SASRec(
+        embedding_module=embedding_module,
+        max_sequence_len=max_sequence_length,
+        max_output_len=max_output_length,
+        embedding_dim=embedding_module.item_embedding_dim,
+        ffn_hidden_dim=ffn_hidden_dim,
+        ffn_activation_fn=ffn_activation_fn,
+        ffn_dropout_rate=ffn_dropout_rate,
+        num_blocks=num_blocks,
+        num_heads=num_heads,
+        similarity_module=similarity_module,  # pyre-ignore [6]
+        input_features_preproc_module=input_preproc_module,
+        output_postproc_module=output_postproc_module,
+        activation_checkpoint=activation_checkpoint,
+        verbose=verbose,
+    )
+
+
+@gin.configurable
+def hstu_encoder(
+    max_sequence_length: int,
+    max_output_length: int,
+    embedding_module: EmbeddingModule,
+    similarity_module: SimilarityModule,
+    input_preproc_module: InputFeaturesPreprocessorModule,
+    output_postproc_module: OutputPostprocessorModule,
+    activation_checkpoint: bool,
+    verbose: bool,
+    num_blocks: int = 2,
+    num_heads: int = 1,
+    dqk: int = 64,
+    dv: int = 64,
+    linear_dropout_rate: float = 0.0,
+    attn_dropout_rate: float = 0.0,
+    normalization: str = "rel_bias",
+    linear_config: str = "uvqk",
+    linear_activation: str = "silu",
+    concat_ua: bool = False,
+    enable_relative_attention_bias: bool = True,
+) -> SequentialEncoderWithLearnedSimilarityModule:
+    return HSTU(
+        embedding_module=embedding_module,
+        similarity_module=similarity_module,  # pyre-ignore [6]
+        input_features_preproc_module=input_preproc_module,
+        output_postproc_module=output_postproc_module,
+        max_sequence_len=max_sequence_length,
+        max_output_len=max_output_length,
+        embedding_dim=embedding_module.item_embedding_dim,
+        num_blocks=num_blocks,
+        num_heads=num_heads,
+        attention_dim=dqk,
+        linear_dim=dv,
+        linear_dropout_rate=linear_dropout_rate,
+        attn_dropout_rate=attn_dropout_rate,
+        linear_config=linear_config,
+        linear_activation=linear_activation,
+        normalization=normalization,
+        concat_ua=concat_ua,
+        enable_relative_attention_bias=enable_relative_attention_bias,
+        verbose=verbose,
+    )
+
+
+@gin.configurable
+def get_sequential_encoder(
+    module_type: str,
+    max_sequence_length: int,
+    max_output_length: int,
+    embedding_module: EmbeddingModule,
+    interaction_module: SimilarityModule,
+    input_preproc_module: InputFeaturesPreprocessorModule,
+    output_postproc_module: OutputPostprocessorModule,
+    verbose: bool,
+    activation_checkpoint: bool = False,
+) -> SequentialEncoderWithLearnedSimilarityModule:
+    if module_type == "SASRec":
+        model = sasrec_encoder(
+            max_sequence_length=max_sequence_length,
+            max_output_length=max_output_length,
+            embedding_module=embedding_module,
+            similarity_module=interaction_module,
+            input_preproc_module=input_preproc_module,
+            output_postproc_module=output_postproc_module,
+            activation_checkpoint=activation_checkpoint,
+            verbose=verbose,
+        )
+    elif module_type == "HSTU":
+        model = hstu_encoder(
+            max_sequence_length=max_sequence_length,
+            max_output_length=max_output_length,
+            embedding_module=embedding_module,
+            similarity_module=interaction_module,
+            input_preproc_module=input_preproc_module,
+            output_postproc_module=output_postproc_module,
+            activation_checkpoint=activation_checkpoint,
+            verbose=verbose,
+        )
+    else:
+        raise ValueError(f"Unsupported module_type {module_type}")
+    return model
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/features.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/features.py
new file mode 100644
index 000000000..70bf80cc0
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/modeling/sequential/features.py
@@ -0,0 +1,94 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+from typing import Dict, NamedTuple, Optional, Tuple
+
+import torch
+
+
+class SequentialFeatures(NamedTuple):
+    # (B,) x int64. Requires past_lengths[i] > 0 \forall i.
+    past_lengths: torch.Tensor
+    # (B, N,) x int64. 0 denotes valid ids.
+    past_ids: torch.Tensor
+    # (B, N, D) x float.
+    past_embeddings: Optional[torch.Tensor]
+    # Implementation-specific payloads.
+    # e.g., past timestamps, past event_types (e.g., clicks, likes), etc.
+    past_payloads: Dict[str, torch.Tensor]
+
+
+def movielens_seq_features_from_row(
+    row: Dict[str, torch.Tensor],
+    device: int,
+    max_output_length: int,
+) -> Tuple[SequentialFeatures, torch.Tensor, torch.Tensor]:
+    historical_lengths = row["history_lengths"].to(device)  # [B]
+    historical_ids = row["historical_ids"].to(device)  # [B, N]
+    historical_ratings = row["historical_ratings"].to(device)
+    historical_timestamps = row["historical_timestamps"].to(device)
+    target_ids = row["target_ids"].to(device).unsqueeze(1)  # [B, 1]
+    target_ratings = row["target_ratings"].to(device).unsqueeze(1)
+    target_timestamps = row["target_timestamps"].to(device).unsqueeze(1)
+    if max_output_length > 0:
+        B = historical_lengths.size(0)
+        historical_ids = torch.cat(
+            [
+                historical_ids,
+                torch.zeros(
+                    (B, max_output_length), dtype=historical_ids.dtype, device=device
+                ),
+            ],
+            dim=1,
+        )
+        historical_ratings = torch.cat(
+            [
+                historical_ratings,
+                torch.zeros(
+                    (B, max_output_length),
+                    dtype=historical_ratings.dtype,
+                    device=device,
+                ),
+            ],
+            dim=1,
+        )
+        historical_timestamps = torch.cat(
+            [
+                historical_timestamps,
+                torch.zeros(
+                    (B, max_output_length),
+                    dtype=historical_timestamps.dtype,
+                    device=device,
+                ),
+            ],
+            dim=1,
+        )
+        historical_timestamps.scatter_(
+            dim=1,
+            index=historical_lengths.view(-1, 1),
+            src=target_timestamps.view(-1, 1),
+        )
+        # print(f"historical_ids.size()={historical_ids.size()}, historical_timestamps.size()={historical_timestamps.size()}")
+    features = SequentialFeatures(
+        past_lengths=historical_lengths,
+        past_ids=historical_ids,
+        past_embeddings=None,
+        past_payloads={
+            "timestamps": historical_timestamps,
+            "ratings": historical_ratings,
+        },
+    )
+    return features, target_ids, target_ratings
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/hstu.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/hstu.py
new file mode 100644
index 000000000..3c89245a2
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/modeling/sequential/hstu.py
@@ -0,0 +1,808 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+"""
+Implements HSTU (Hierarchical Sequential Transduction Unit) in
+Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations
+(https://arxiv.org/abs/2402.17152, ICML'24).
+"""
+
+import abc
+import math
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from generative_recommenders.research.modeling.sequential.embedding_modules import (
+    EmbeddingModule,
+)
+from generative_recommenders.research.modeling.sequential.input_features_preprocessors import (
+    InputFeaturesPreprocessorModule,
+)
+from generative_recommenders.research.modeling.sequential.output_postprocessors import (
+    OutputPostprocessorModule,
+)
+from generative_recommenders.research.modeling.sequential.utils import (
+    get_current_embeddings,
+)
+from generative_recommenders.research.modeling.similarity_module import (
+    SequentialEncoderWithLearnedSimilarityModule,
+)
+from generative_recommenders.research.rails.similarities.module import SimilarityModule
+
+
+TIMESTAMPS_KEY = "timestamps"
+
+
+class RelativeAttentionBiasModule(torch.nn.Module):
+    @abc.abstractmethod
+    def forward(
+        self,
+        all_timestamps: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Args:
+            all_timestamps: [B, N] x int64
+        Returns:
+            torch.float tensor broadcastable to [B, N, N]
+        """
+        pass
+
+
+class RelativePositionalBias(RelativeAttentionBiasModule):
+    def __init__(self, max_seq_len: int) -> None:
+        super().__init__()
+
+        self._max_seq_len: int = max_seq_len
+        self._w = torch.nn.Parameter(
+            torch.empty(2 * max_seq_len - 1).normal_(mean=0, std=0.02),
+        )
+
+    def forward(
+        self,
+        all_timestamps: torch.Tensor,
+    ) -> torch.Tensor:
+        del all_timestamps
+        n: int = self._max_seq_len
+        t = F.pad(self._w[: 2 * n - 1], [0, n]).repeat(n)
+        t = t[..., :-n].reshape(1, n, 3 * n - 2)
+        r = (2 * n - 1) // 2
+        return t[..., r:-r]
+
+
+class RelativeBucketedTimeAndPositionBasedBias(RelativeAttentionBiasModule):
+    """
+    Bucketizes timespans based on ts(next-item) - ts(current-item).
+    """
+
+    def __init__(
+        self,
+        max_seq_len: int,
+        num_buckets: int,
+        bucketization_fn: Callable[[torch.Tensor], torch.Tensor],
+    ) -> None:
+        super().__init__()
+
+        self._max_seq_len: int = max_seq_len
+        self._ts_w = torch.nn.Parameter(
+            torch.empty(num_buckets + 1).normal_(mean=0, std=0.02),
+        )
+        self._pos_w = torch.nn.Parameter(
+            torch.empty(2 * max_seq_len - 1).normal_(mean=0, std=0.02),
+        )
+        self._num_buckets: int = num_buckets
+        self._bucketization_fn: Callable[[torch.Tensor], torch.Tensor] = (
+            bucketization_fn
+        )
+
+    def forward(
+        self,
+        all_timestamps: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Args:
+            all_timestamps: (B, N).
+        Returns:
+            (B, N, N).
+        """
+        B = all_timestamps.size(0)
+        N = self._max_seq_len
+        t = F.pad(self._pos_w[: 2 * N - 1], [0, N]).repeat(N)
+        t = t[..., :-N].reshape(1, N, 3 * N - 2)
+        r = (2 * N - 1) // 2
+
+        # [B, N + 1] to simplify tensor manipulations.
+        ext_timestamps = torch.cat(
+            [all_timestamps, all_timestamps[:, N - 1 : N]], dim=1
+        )
+        # causal masking. Otherwise [:, :-1] - [:, 1:] works
+        bucketed_timestamps = torch.clamp(
+            self._bucketization_fn(
+                ext_timestamps[:, 1:].unsqueeze(2) - ext_timestamps[:, :-1].unsqueeze(1)
+            ),
+            min=0,
+            max=self._num_buckets,
+        ).detach()
+        rel_pos_bias = t[:, :, r:-r]
+        rel_ts_bias = torch.index_select(
+            self._ts_w, dim=0, index=bucketed_timestamps.view(-1)
+        ).view(B, N, N)
+        return rel_pos_bias + rel_ts_bias
+
+
+HSTUCacheState = Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
+
+
+def _hstu_attention_maybe_from_cache(
+    num_heads: int,
+    attention_dim: int,
+    linear_dim: int,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    cached_q: Optional[torch.Tensor],
+    cached_k: Optional[torch.Tensor],
+    delta_x_offsets: Optional[Tuple[torch.Tensor, torch.Tensor]],
+    x_offsets: torch.Tensor,
+    all_timestamps: Optional[torch.Tensor],
+    invalid_attn_mask: torch.Tensor,
+    rel_attn_bias: RelativeAttentionBiasModule,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    B: int = x_offsets.size(0) - 1
+    n: int = invalid_attn_mask.size(-1)
+    if delta_x_offsets is not None:
+        padded_q, padded_k = cached_q, cached_k
+        flattened_offsets = delta_x_offsets[1] + torch.arange(
+            start=0,
+            end=B * n,
+            step=n,
+            device=delta_x_offsets[1].device,
+            dtype=delta_x_offsets[1].dtype,
+        )
+        assert isinstance(padded_q, torch.Tensor)
+        assert isinstance(padded_k, torch.Tensor)
+        padded_q = (
+            padded_q.view(B * n, -1)
+            .index_copy_(
+                dim=0,
+                index=flattened_offsets,
+                source=q,
+            )
+            .view(B, n, -1)
+        )
+        padded_k = (
+            padded_k.view(B * n, -1)
+            .index_copy_(
+                dim=0,
+                index=flattened_offsets,
+                source=k,
+            )
+            .view(B, n, -1)
+        )
+    else:
+        padded_q = torch.ops.fbgemm.jagged_to_padded_dense(
+            values=q, offsets=[x_offsets], max_lengths=[n], padding_value=0.0
+        )
+        padded_k = torch.ops.fbgemm.jagged_to_padded_dense(
+            values=k, offsets=[x_offsets], max_lengths=[n], padding_value=0.0
+        )
+
+    qk_attn = torch.einsum(
+        "bnhd,bmhd->bhnm",
+        padded_q.view(B, n, num_heads, attention_dim),
+        padded_k.view(B, n, num_heads, attention_dim),
+    )
+    if all_timestamps is not None:
+        qk_attn = qk_attn + rel_attn_bias(all_timestamps).unsqueeze(1)
+    qk_attn = F.silu(qk_attn) / n
+    qk_attn = qk_attn * invalid_attn_mask.unsqueeze(0).unsqueeze(0)
+    attn_output = torch.ops.fbgemm.dense_to_jagged(
+        torch.einsum(
+            "bhnm,bmhd->bnhd",
+            qk_attn,
+            torch.ops.fbgemm.jagged_to_padded_dense(v, [x_offsets], [n]).reshape(
+                B, n, num_heads, linear_dim
+            ),
+        ).reshape(B, n, num_heads * linear_dim),
+        [x_offsets],
+    )[0]
+    return attn_output, padded_q, padded_k
+
+
+class SequentialTransductionUnitJagged(torch.nn.Module):
+    def __init__(
+        self,
+        embedding_dim: int,
+        linear_hidden_dim: int,
+        attention_dim: int,
+        dropout_ratio: float,
+        attn_dropout_ratio: float,
+        num_heads: int,
+        linear_activation: str,
+        relative_attention_bias_module: Optional[RelativeAttentionBiasModule] = None,
+        normalization: str = "rel_bias",
+        linear_config: str = "uvqk",
+        concat_ua: bool = False,
+        epsilon: float = 1e-6,
+        max_length: Optional[int] = None,
+    ) -> None:
+        super().__init__()
+        self._embedding_dim: int = embedding_dim
+        self._linear_dim: int = linear_hidden_dim
+        self._attention_dim: int = attention_dim
+        self._dropout_ratio: float = dropout_ratio
+        self._attn_dropout_ratio: float = attn_dropout_ratio
+        self._num_heads: int = num_heads
+        self._rel_attn_bias: Optional[RelativeAttentionBiasModule] = (
+            relative_attention_bias_module
+        )
+        self._normalization: str = normalization
+        self._linear_config: str = linear_config
+        if self._linear_config == "uvqk":
+            self._uvqk: torch.nn.Parameter = torch.nn.Parameter(
+                torch.empty(
+                    (
+                        embedding_dim,
+                        linear_hidden_dim * 2 * num_heads
+                        + attention_dim * num_heads * 2,
+                    )
+                ).normal_(mean=0, std=0.02),
+            )
+        else:
+            raise ValueError(f"Unknown linear_config {self._linear_config}")
+        self._linear_activation: str = linear_activation
+        self._concat_ua: bool = concat_ua
+        self._o = torch.nn.Linear(
+            in_features=linear_hidden_dim * num_heads * (3 if concat_ua else 1),
+            out_features=embedding_dim,
+        )
+        torch.nn.init.xavier_uniform_(self._o.weight)
+        self._eps: float = epsilon
+
+    def _norm_input(self, x: torch.Tensor) -> torch.Tensor:
+        return F.layer_norm(x, normalized_shape=[self._embedding_dim], eps=self._eps)
+
+    def _norm_attn_output(self, x: torch.Tensor) -> torch.Tensor:
+        return F.layer_norm(
+            x, normalized_shape=[self._linear_dim * self._num_heads], eps=self._eps
+        )
+
+    def forward(  # pyre-ignore [3]
+        self,
+        x: torch.Tensor,
+        x_offsets: torch.Tensor,
+        all_timestamps: Optional[torch.Tensor],
+        invalid_attn_mask: torch.Tensor,
+        delta_x_offsets: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        cache: Optional[HSTUCacheState] = None,
+        return_cache_states: bool = False,
+    ):
+        """
+        Args:
+            x: (\sum_i N_i, D) x float.
+            x_offsets: (B + 1) x int32.
+            all_timestamps: optional (B, N) x int64.
+            invalid_attn_mask: (B, N, N) x float, each element in {0, 1}.
+            delta_x_offsets: optional 2-tuple ((B,) x int32, (B,) x int32).
+                For the 1st element in the tuple, each element is in [0, x_offsets[-1]). For the
+                2nd element in the tuple, each element is in [0, N).
+            cache: Optional 4-tuple of (v, padded_q, padded_k, output) from prior runs,
+                where all except padded_q, padded_k are jagged.
+        Returns:
+            x' = f(x), (\sum_i N_i, D) x float.
+        """
+        n: int = invalid_attn_mask.size(-1)
+        cached_q = None
+        cached_k = None
+        if delta_x_offsets is not None:
+            # In this case, for all the following code, x, u, v, q, k become restricted to
+            # [delta_x_offsets[0], :].
+            assert cache is not None
+            x = x[delta_x_offsets[0], :]
+            cached_v, cached_q, cached_k, cached_outputs = cache
+
+        normed_x = self._norm_input(x)
+
+        if self._linear_config == "uvqk":
+            batched_mm_output = torch.mm(normed_x, self._uvqk)
+            if self._linear_activation == "silu":
+                batched_mm_output = F.silu(batched_mm_output)
+            elif self._linear_activation == "none":
+                batched_mm_output = batched_mm_output
+            u, v, q, k = torch.split(
+                batched_mm_output,
+                [
+                    self._linear_dim * self._num_heads,
+                    self._linear_dim * self._num_heads,
+                    self._attention_dim * self._num_heads,
+                    self._attention_dim * self._num_heads,
+                ],
+                dim=1,
+            )
+        else:
+            raise ValueError(f"Unknown self._linear_config {self._linear_config}")
+
+        if delta_x_offsets is not None:
+            v = cached_v.index_copy_(dim=0, index=delta_x_offsets[0], source=v)
+
+        B: int = x_offsets.size(0) - 1
+        if self._normalization == "rel_bias" or self._normalization == "hstu_rel_bias":
+            assert self._rel_attn_bias is not None
+            attn_output, padded_q, padded_k = _hstu_attention_maybe_from_cache(
+                num_heads=self._num_heads,
+                attention_dim=self._attention_dim,
+                linear_dim=self._linear_dim,
+                q=q,
+                k=k,
+                v=v,
+                cached_q=cached_q,
+                cached_k=cached_k,
+                delta_x_offsets=delta_x_offsets,
+                x_offsets=x_offsets,
+                all_timestamps=all_timestamps,
+                invalid_attn_mask=invalid_attn_mask,
+                rel_attn_bias=self._rel_attn_bias,
+            )
+        elif self._normalization == "softmax_rel_bias":
+            if delta_x_offsets is not None:
+                B = x_offsets.size(0) - 1
+                padded_q, padded_k = cached_q, cached_k
+                flattened_offsets = delta_x_offsets[1] + torch.arange(
+                    start=0,
+                    end=B * n,
+                    step=n,
+                    device=delta_x_offsets[1].device,
+                    dtype=delta_x_offsets[1].dtype,
+                )
+                assert padded_q is not None
+                assert padded_k is not None
+                padded_q = (
+                    padded_q.view(B * n, -1)
+                    .index_copy_(
+                        dim=0,
+                        index=flattened_offsets,
+                        source=q,
+                    )
+                    .view(B, n, -1)
+                )
+                padded_k = (
+                    padded_k.view(B * n, -1)
+                    .index_copy_(
+                        dim=0,
+                        index=flattened_offsets,
+                        source=k,
+                    )
+                    .view(B, n, -1)
+                )
+            else:
+                padded_q = torch.ops.fbgemm.jagged_to_padded_dense(
+                    values=q, offsets=[x_offsets], max_lengths=[n], padding_value=0.0
+                )
+                padded_k = torch.ops.fbgemm.jagged_to_padded_dense(
+                    values=k, offsets=[x_offsets], max_lengths=[n], padding_value=0.0
+                )
+
+            qk_attn = torch.einsum("bnd,bmd->bnm", padded_q, padded_k)
+            if self._rel_attn_bias is not None:
+                qk_attn = qk_attn + self._rel_attn_bias(all_timestamps)
+            qk_attn = F.softmax(qk_attn / math.sqrt(self._attention_dim), dim=-1)
+            qk_attn = qk_attn * invalid_attn_mask
+            attn_output = torch.ops.fbgemm.dense_to_jagged(
+                torch.bmm(
+                    qk_attn,
+                    torch.ops.fbgemm.jagged_to_padded_dense(v, [x_offsets], [n]),
+                ),
+                [x_offsets],
+            )[0]
+        else:
+            raise ValueError(f"Unknown normalization method {self._normalization}")
+
+        attn_output = (
+            attn_output
+            if delta_x_offsets is None
+            else attn_output[delta_x_offsets[0], :]
+        )
+        if self._concat_ua:
+            a = self._norm_attn_output(attn_output)
+            o_input = torch.cat([u, a, u * a], dim=-1)
+        else:
+            o_input = u * self._norm_attn_output(attn_output)
+
+        new_outputs = (
+            self._o(
+                F.dropout(
+                    o_input,
+                    p=self._dropout_ratio,
+                    training=self.training,
+                )
+            )
+            + x
+        )
+
+        if delta_x_offsets is not None:
+            new_outputs = cached_outputs.index_copy_(
+                dim=0, index=delta_x_offsets[0], source=new_outputs
+            )
+
+        if return_cache_states and delta_x_offsets is None:
+            v = v.contiguous()
+
+        return new_outputs, (v, padded_q, padded_k, new_outputs)
+
+
+class HSTUJagged(torch.nn.Module):
+    def __init__(
+        self,
+        modules: List[SequentialTransductionUnitJagged],
+        autocast_dtype: Optional[torch.dtype],
+    ) -> None:
+        super().__init__()
+
+        self._attention_layers: torch.nn.ModuleList = torch.nn.ModuleList(
+            modules=modules
+        )
+        self._autocast_dtype: Optional[torch.dtype] = autocast_dtype
+
+    def jagged_forward(
+        self,
+        x: torch.Tensor,
+        x_offsets: torch.Tensor,
+        all_timestamps: Optional[torch.Tensor],
+        invalid_attn_mask: torch.Tensor,
+        delta_x_offsets: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        cache: Optional[List[HSTUCacheState]] = None,
+        return_cache_states: bool = False,
+    ) -> Tuple[torch.Tensor, List[HSTUCacheState]]:
+        """
+        Args:
+            x: (\sum_i N_i, D) x float
+            x_offsets: (B + 1) x int32
+            all_timestamps: (B, 1 + N) x int64
+            invalid_attn_mask: (B, N, N) x float, each element in {0, 1}
+            return_cache_states: bool. True if we should return cache states.
+
+        Returns:
+            x' = f(x), (\sum_i N_i, D) x float
+        """
+        cache_states: List[HSTUCacheState] = []
+
+        with torch.autocast(
+            "cuda",
+            enabled=self._autocast_dtype is not None,
+            dtype=self._autocast_dtype or torch.float16,
+        ):
+            for i, layer in enumerate(self._attention_layers):
+                x, cache_states_i = layer(
+                    x=x,
+                    x_offsets=x_offsets,
+                    all_timestamps=all_timestamps,
+                    invalid_attn_mask=invalid_attn_mask,
+                    delta_x_offsets=delta_x_offsets,
+                    cache=cache[i] if cache is not None else None,
+                    return_cache_states=return_cache_states,
+                )
+                if return_cache_states:
+                    cache_states.append(cache_states_i)
+
+        return x, cache_states
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_offsets: torch.Tensor,
+        all_timestamps: Optional[torch.Tensor],
+        invalid_attn_mask: torch.Tensor,
+        delta_x_offsets: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        cache: Optional[List[HSTUCacheState]] = None,
+        return_cache_states: bool = False,
+    ) -> Tuple[torch.Tensor, List[HSTUCacheState]]:
+        """
+        Args:
+            x: (B, N, D) x float.
+            x_offsets: (B + 1) x int32.
+            all_timestamps: (B, 1 + N) x int64
+            invalid_attn_mask: (B, N, N) x float, each element in {0, 1}.
+        Returns:
+            x' = f(x), (B, N, D) x float
+        """
+        if len(x.size()) == 3:
+            x = torch.ops.fbgemm.dense_to_jagged(x, [x_offsets])[0]
+
+        jagged_x, cache_states = self.jagged_forward(
+            x=x,
+            x_offsets=x_offsets,
+            all_timestamps=all_timestamps,
+            invalid_attn_mask=invalid_attn_mask,
+            delta_x_offsets=delta_x_offsets,
+            cache=cache,
+            return_cache_states=return_cache_states,
+        )
+        y = torch.ops.fbgemm.jagged_to_padded_dense(
+            values=jagged_x,
+            offsets=[x_offsets],
+            max_lengths=[invalid_attn_mask.size(1)],
+            padding_value=0.0,
+        )
+        return y, cache_states
+
+
+class HSTU(SequentialEncoderWithLearnedSimilarityModule):
+    """
+    Implements HSTU (Hierarchical Sequential Transduction Unit) in
+    Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations,
+    https://arxiv.org/abs/2402.17152.
+
+    Note that this implementation is intended for reproducing experiments in
+    the traditional sequential recommender setting (Section 4.1.1), and does
+    not yet use optimized kernels discussed in the paper.
+    """
+
+    def __init__(
+        self,
+        max_sequence_len: int,
+        max_output_len: int,
+        embedding_dim: int,
+        num_blocks: int,
+        num_heads: int,
+        linear_dim: int,
+        attention_dim: int,
+        normalization: str,
+        linear_config: str,
+        linear_activation: str,
+        linear_dropout_rate: float,
+        attn_dropout_rate: float,
+        embedding_module: EmbeddingModule,
+        similarity_module: SimilarityModule,
+        input_features_preproc_module: InputFeaturesPreprocessorModule,
+        output_postproc_module: OutputPostprocessorModule,
+        enable_relative_attention_bias: bool = True,
+        concat_ua: bool = False,
+        verbose: bool = True,
+    ) -> None:
+        super().__init__(ndp_module=similarity_module)
+
+        self._embedding_dim: int = embedding_dim
+        self._item_embedding_dim: int = embedding_module.item_embedding_dim
+        self._max_sequence_length: int = max_sequence_len
+        self._embedding_module: EmbeddingModule = embedding_module
+        self._input_features_preproc: InputFeaturesPreprocessorModule = (
+            input_features_preproc_module
+        )
+        self._output_postproc: OutputPostprocessorModule = output_postproc_module
+        self._num_blocks: int = num_blocks
+        self._num_heads: int = num_heads
+        self._dqk: int = attention_dim
+        self._dv: int = linear_dim
+        self._linear_activation: str = linear_activation
+        self._linear_dropout_rate: float = linear_dropout_rate
+        self._attn_dropout_rate: float = attn_dropout_rate
+        self._enable_relative_attention_bias: bool = enable_relative_attention_bias
+        self._hstu = HSTUJagged(
+            modules=[
+                SequentialTransductionUnitJagged(
+                    embedding_dim=self._embedding_dim,
+                    linear_hidden_dim=linear_dim,
+                    attention_dim=attention_dim,
+                    normalization=normalization,
+                    linear_config=linear_config,
+                    linear_activation=linear_activation,
+                    num_heads=num_heads,
+                    # TODO: change to lambda x.
+                    relative_attention_bias_module=(
+                        RelativeBucketedTimeAndPositionBasedBias(
+                            max_seq_len=max_sequence_len
+                            + max_output_len,  # accounts for next item.
+                            num_buckets=128,
+                            bucketization_fn=lambda x: (
+                                torch.log(torch.abs(x).clamp(min=1)) / 0.301
+                            ).long(),
+                        )
+                        if enable_relative_attention_bias
+                        else None
+                    ),
+                    dropout_ratio=linear_dropout_rate,
+                    attn_dropout_ratio=attn_dropout_rate,
+                    concat_ua=concat_ua,
+                )
+                for _ in range(num_blocks)
+            ],
+            autocast_dtype=None,
+        )
+        # causal forward, w/ +1 for padding.
+        self.register_buffer(
+            "_attn_mask",
+            torch.triu(
+                torch.ones(
+                    (
+                        self._max_sequence_length + max_output_len,
+                        self._max_sequence_length + max_output_len,
+                    ),
+                    dtype=torch.bool,
+                ),
+                diagonal=1,
+            ),
+        )
+        self._verbose: bool = verbose
+        self.reset_params()
+
+    def reset_params(self) -> None:
+        for name, params in self.named_parameters():
+            if ("_hstu" in name) or ("_embedding_module" in name):
+                if self._verbose:
+                    print(f"Skipping init for {name}")
+                continue
+            try:
+                torch.nn.init.xavier_normal_(params.data)
+                if self._verbose:
+                    print(
+                        f"Initialize {name} as xavier normal: {params.data.size()} params"
+                    )
+            except:
+                if self._verbose:
+                    print(f"Failed to initialize {name}: {params.data.size()} params")
+
+    def get_item_embeddings(self, item_ids: torch.Tensor) -> torch.Tensor:
+        return self._embedding_module.get_item_embeddings(item_ids)
+
+    def debug_str(self) -> str:
+        debug_str = (
+            f"HSTU-b{self._num_blocks}-h{self._num_heads}-dqk{self._dqk}-dv{self._dv}"
+            + f"-l{self._linear_activation}d{self._linear_dropout_rate}"
+            + f"-ad{self._attn_dropout_rate}"
+        )
+        if not self._enable_relative_attention_bias:
+            debug_str += "-norab"
+        return debug_str
+
+    def generate_user_embeddings(
+        self,
+        past_lengths: torch.Tensor,
+        past_ids: torch.Tensor,
+        past_embeddings: torch.Tensor,
+        past_payloads: Dict[str, torch.Tensor],
+        delta_x_offsets: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        cache: Optional[List[HSTUCacheState]] = None,
+        return_cache_states: bool = False,
+    ) -> Tuple[torch.Tensor, List[HSTUCacheState]]:
+        """
+        [B, N] -> [B, N, D].
+        """
+        device = past_lengths.device
+        float_dtype = past_embeddings.dtype
+        B, N, _ = past_embeddings.size()
+
+        past_lengths, user_embeddings, _ = self._input_features_preproc(
+            past_lengths=past_lengths,
+            past_ids=past_ids,
+            past_embeddings=past_embeddings,
+            past_payloads=past_payloads,
+        )
+
+        float_dtype = user_embeddings.dtype
+        user_embeddings, cached_states = self._hstu(
+            x=user_embeddings,
+            x_offsets=torch.ops.fbgemm.asynchronous_complete_cumsum(past_lengths),
+            all_timestamps=(
+                past_payloads[TIMESTAMPS_KEY]
+                if TIMESTAMPS_KEY in past_payloads
+                else None
+            ),
+            invalid_attn_mask=1.0 - self._attn_mask.to(float_dtype),
+            delta_x_offsets=delta_x_offsets,
+            cache=cache,
+            return_cache_states=return_cache_states,
+        )
+        return self._output_postproc(user_embeddings), cached_states
+
+    def forward(
+        self,
+        past_lengths: torch.Tensor,
+        past_ids: torch.Tensor,
+        past_embeddings: torch.Tensor,
+        past_payloads: Dict[str, torch.Tensor],
+        batch_id: Optional[int] = None,
+    ) -> torch.Tensor:
+        """
+        Runs the main encoder.
+
+        Args:
+            past_lengths: (B,) x int64
+            past_ids: (B, N,) x int64 where the latest engaged ids come first. In
+                particular, past_ids[i, past_lengths[i] - 1] should correspond to
+                the latest engaged values.
+            past_embeddings: (B, N, D) x float or (\sum_b N_b, D) x float.
+            past_payloads: implementation-specific keyed tensors of shape (B, N, ...).
+
+        Returns:
+            encoded_embeddings of [B, N, D].
+        """
+        encoded_embeddings, _ = self.generate_user_embeddings(
+            past_lengths=past_lengths,
+            past_ids=past_ids,
+            past_embeddings=past_embeddings,
+            past_payloads=past_payloads,
+        )
+        return encoded_embeddings
+
+    def _encode(
+        self,
+        past_lengths: torch.Tensor,
+        past_ids: torch.Tensor,
+        past_embeddings: torch.Tensor,
+        past_payloads: Dict[str, torch.Tensor],
+        delta_x_offsets: Optional[Tuple[torch.Tensor, torch.Tensor]],
+        cache: Optional[List[HSTUCacheState]],
+        return_cache_states: bool,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[HSTUCacheState]]]:
+        """
+        Args:
+            past_lengths: (B,) x int64.
+            past_ids: (B, N,) x int64.
+            past_embeddings: (B, N, D,) x float.
+            past_payloads: implementation-specific keyed tensors of shape (B, N, ...).
+            return_cache_states: bool.
+
+        Returns:
+            (B, D) x float, representing embeddings for the current state.
+        """
+        encoded_seq_embeddings, cache_states = self.generate_user_embeddings(
+            past_lengths=past_lengths,
+            past_ids=past_ids,
+            past_embeddings=past_embeddings,
+            past_payloads=past_payloads,
+            delta_x_offsets=delta_x_offsets,
+            cache=cache,
+            return_cache_states=return_cache_states,
+        )  # [B, N, D]
+        current_embeddings = get_current_embeddings(
+            lengths=past_lengths, encoded_embeddings=encoded_seq_embeddings
+        )
+        if return_cache_states:
+            return current_embeddings, cache_states
+        else:
+            return current_embeddings
+
+    def encode(
+        self,
+        past_lengths: torch.Tensor,
+        past_ids: torch.Tensor,
+        past_embeddings: torch.Tensor,
+        past_payloads: Dict[str, torch.Tensor],
+        delta_x_offsets: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        cache: Optional[List[HSTUCacheState]] = None,
+        return_cache_states: bool = False,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[HSTUCacheState]]]:
+        """
+        Runs encoder to obtain the current hidden states.
+
+        Args:
+            past_lengths: (B,) x int.
+            past_ids: (B, N,) x int.
+            past_embeddings: (B, N, D) x float.
+            past_payloads: implementation-specific keyed tensors of shape (B, N, ...).
+
+        Returns:
+            (B, D,) x float, representing encoded states at the most recent time step.
+        """
+        return self._encode(
+            past_lengths=past_lengths,
+            past_ids=past_ids,
+            past_embeddings=past_embeddings,
+            past_payloads=past_payloads,
+            delta_x_offsets=delta_x_offsets,
+            cache=cache,
+            return_cache_states=return_cache_states,
+        )
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/input_features_preprocessors.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/input_features_preprocessors.py
new file mode 100644
index 000000000..a461ab879
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/modeling/sequential/input_features_preprocessors.py
@@ -0,0 +1,259 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import abc
+import math
+from typing import Dict, Tuple
+
+import torch
+from generative_recommenders.research.modeling.initialization import truncated_normal
+
+
+class InputFeaturesPreprocessorModule(torch.nn.Module):
+    @abc.abstractmethod
+    def debug_str(self) -> str:
+        pass
+
+    @abc.abstractmethod
+    def forward(
+        self,
+        past_lengths: torch.Tensor,
+        past_ids: torch.Tensor,
+        past_embeddings: torch.Tensor,
+        past_payloads: Dict[str, torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        pass
+
+
+class LearnablePositionalEmbeddingInputFeaturesPreprocessor(
+    InputFeaturesPreprocessorModule
+):
+    def __init__(
+        self,
+        max_sequence_len: int,
+        embedding_dim: int,
+        dropout_rate: float,
+    ) -> None:
+        super().__init__()
+
+        self._embedding_dim: int = embedding_dim
+        self._pos_emb: torch.nn.Embedding = torch.nn.Embedding(
+            max_sequence_len,
+            self._embedding_dim,
+        )
+        self._dropout_rate: float = dropout_rate
+        self._emb_dropout = torch.nn.Dropout(p=dropout_rate)
+        self.reset_state()
+
+    def debug_str(self) -> str:
+        return f"posi_d{self._dropout_rate}"
+
+    def reset_state(self) -> None:
+        truncated_normal(
+            self._pos_emb.weight.data,
+            mean=0.0,
+            std=math.sqrt(1.0 / self._embedding_dim),
+        )
+
+    def forward(
+        self,
+        past_lengths: torch.Tensor,
+        past_ids: torch.Tensor,
+        past_embeddings: torch.Tensor,
+        past_payloads: Dict[str, torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        B, N = past_ids.size()
+        D = past_embeddings.size(-1)
+
+        user_embeddings = past_embeddings * (self._embedding_dim**0.5) + self._pos_emb(
+            torch.arange(N, device=past_ids.device).unsqueeze(0).repeat(B, 1)
+        )
+        user_embeddings = self._emb_dropout(user_embeddings)
+
+        valid_mask = (past_ids != 0).unsqueeze(-1).float()  # [B, N, 1]
+        user_embeddings *= valid_mask
+        return past_lengths, user_embeddings, valid_mask
+
+
+class LearnablePositionalEmbeddingRatedInputFeaturesPreprocessor(
+    InputFeaturesPreprocessorModule
+):
+    def __init__(
+        self,
+        max_sequence_len: int,
+        item_embedding_dim: int,
+        dropout_rate: float,
+        rating_embedding_dim: int,
+        num_ratings: int,
+    ) -> None:
+        super().__init__()
+
+        self._embedding_dim: int = item_embedding_dim + rating_embedding_dim
+        self._pos_emb: torch.nn.Embedding = torch.nn.Embedding(
+            max_sequence_len,
+            self._embedding_dim,
+        )
+        self._dropout_rate: float = dropout_rate
+        self._emb_dropout = torch.nn.Dropout(p=dropout_rate)
+        self._rating_emb: torch.nn.Embedding = torch.nn.Embedding(
+            num_ratings,
+            rating_embedding_dim,
+        )
+        self.reset_state()
+
+    def debug_str(self) -> str:
+        return f"posir_d{self._dropout_rate}"
+
+    def reset_state(self) -> None:
+        truncated_normal(
+            self._pos_emb.weight.data,
+            mean=0.0,
+            std=math.sqrt(1.0 / self._embedding_dim),
+        )
+        truncated_normal(
+            self._rating_emb.weight.data,
+            mean=0.0,
+            std=math.sqrt(1.0 / self._embedding_dim),
+        )
+
+    def forward(
+        self,
+        past_lengths: torch.Tensor,
+        past_ids: torch.Tensor,
+        past_embeddings: torch.Tensor,
+        past_payloads: Dict[str, torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        B, N = past_ids.size()
+
+        user_embeddings = torch.cat(
+            [past_embeddings, self._rating_emb(past_payloads["ratings"].int())],
+            dim=-1,
+        ) * (self._embedding_dim**0.5) + self._pos_emb(
+            torch.arange(N, device=past_ids.device).unsqueeze(0).repeat(B, 1)
+        )
+        user_embeddings = self._emb_dropout(user_embeddings)
+
+        valid_mask = (past_ids != 0).unsqueeze(-1).float()  # [B, N, 1]
+        user_embeddings *= valid_mask
+        return past_lengths, user_embeddings, valid_mask
+
+
+class CombinedItemAndRatingInputFeaturesPreprocessor(InputFeaturesPreprocessorModule):
+    def __init__(
+        self,
+        max_sequence_len: int,
+        item_embedding_dim: int,
+        dropout_rate: float,
+        num_ratings: int,
+    ) -> None:
+        super().__init__()
+
+        self._embedding_dim: int = item_embedding_dim
+        # Due to [item_0, rating_0, item_1, rating_1, ...]
+        self._pos_emb: torch.nn.Embedding = torch.nn.Embedding(
+            max_sequence_len * 2,
+            self._embedding_dim,
+        )
+        self._dropout_rate: float = dropout_rate
+        self._emb_dropout = torch.nn.Dropout(p=dropout_rate)
+        self._rating_emb: torch.nn.Embedding = torch.nn.Embedding(
+            num_ratings,
+            item_embedding_dim,
+        )
+        self.reset_state()
+
+    def debug_str(self) -> str:
+        return f"combir_d{self._dropout_rate}"
+
+    def reset_state(self) -> None:
+        truncated_normal(
+            self._pos_emb.weight.data,
+            mean=0.0,
+            std=math.sqrt(1.0 / self._embedding_dim),
+        )
+        truncated_normal(
+            self._rating_emb.weight.data,
+            mean=0.0,
+            std=math.sqrt(1.0 / self._embedding_dim),
+        )
+
+    def get_preprocessed_ids(
+        self,
+        past_lengths: torch.Tensor,
+        past_ids: torch.Tensor,
+        past_embeddings: torch.Tensor,
+        past_payloads: Dict[str, torch.Tensor],
+    ) -> torch.Tensor:
+        """
+        Returns (B, N * 2,) x int64.
+        """
+        B, N = past_ids.size()
+        return torch.cat(
+            [
+                past_ids.unsqueeze(2),  # (B, N, 1)
+                past_payloads["ratings"].to(past_ids.dtype).unsqueeze(2),
+            ],
+            dim=2,
+        ).reshape(B, N * 2)
+
+    def get_preprocessed_masks(
+        self,
+        past_lengths: torch.Tensor,
+        past_ids: torch.Tensor,
+        past_embeddings: torch.Tensor,
+        past_payloads: Dict[str, torch.Tensor],
+    ) -> torch.Tensor:
+        """
+        Returns (B, N * 2,) x bool.
+        """
+        B, N = past_ids.size()
+        return (past_ids != 0).unsqueeze(2).expand(-1, -1, 2).reshape(B, N * 2)
+
+    def forward(
+        self,
+        past_lengths: torch.Tensor,
+        past_ids: torch.Tensor,
+        past_embeddings: torch.Tensor,
+        past_payloads: Dict[str, torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        B, N = past_ids.size()
+        D = past_embeddings.size(-1)
+
+        user_embeddings = torch.cat(
+            [
+                past_embeddings,  # (B, N, D)
+                self._rating_emb(past_payloads["ratings"].int()),
+            ],
+            dim=2,
+        ) * (self._embedding_dim**0.5)
+        user_embeddings = user_embeddings.view(B, N * 2, D)
+        user_embeddings = user_embeddings + self._pos_emb(
+            torch.arange(N * 2, device=past_ids.device).unsqueeze(0).repeat(B, 1)
+        )
+        user_embeddings = self._emb_dropout(user_embeddings)
+
+        valid_mask = (
+            self.get_preprocessed_masks(
+                past_lengths,
+                past_ids,
+                past_embeddings,
+                past_payloads,
+            )
+            .unsqueeze(2)
+            .float()
+        )  # (B, N * 2, 1,)
+        user_embeddings *= valid_mask
+        return past_lengths * 2, user_embeddings, valid_mask
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/losses/sampled_softmax.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/losses/sampled_softmax.py
new file mode 100644
index 000000000..8e2195783
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/modeling/sequential/losses/sampled_softmax.py
@@ -0,0 +1,193 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+from collections import OrderedDict
+from typing import Dict, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from generative_recommenders.research.modeling.sequential.autoregressive_losses import (
+    AutoregressiveLoss,
+    NegativesSampler,
+)
+from torch.utils.checkpoint import checkpoint
+
+
+class SampledSoftmaxLoss(AutoregressiveLoss):
+    def __init__(
+        self,
+        num_to_sample: int,
+        softmax_temperature: float,
+        model,
+        activation_checkpoint: bool = False,
+    ) -> None:
+        super().__init__()
+
+        self._num_to_sample: int = num_to_sample
+        self._softmax_temperature: float = softmax_temperature
+        self._model = model
+        self._activation_checkpoint: bool = activation_checkpoint
+
+    def jagged_forward(  # pyre-ignore [15]
+        self,
+        output_embeddings: torch.Tensor,
+        supervision_ids: torch.Tensor,
+        supervision_embeddings: torch.Tensor,
+        supervision_weights: torch.Tensor,
+        negatives_sampler: NegativesSampler,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        assert output_embeddings.size() == supervision_embeddings.size()
+        assert supervision_ids.size() == supervision_embeddings.size()[:-1]
+        assert supervision_ids.size() == supervision_weights.size()
+
+        sampled_ids, sampled_negative_embeddings = negatives_sampler(
+            positive_ids=supervision_ids,
+            num_to_sample=self._num_to_sample,
+        )
+        positive_embeddings = negatives_sampler.normalize_embeddings(
+            supervision_embeddings
+        )
+        positive_logits, aux_losses = self._model.similarity_fn(
+            query_embeddings=output_embeddings,  # [B, D] = [N', D]
+            item_ids=supervision_ids.unsqueeze(1),  # [N', 1]
+            item_embeddings=positive_embeddings.unsqueeze(1),  # [N', D] -> [N', 1, D]
+            **kwargs,
+        )
+        positive_logits = positive_logits / self._softmax_temperature  # [0]
+        sampled_negatives_logits, _ = self._model.similarity_fn(
+            query_embeddings=output_embeddings,  # [N', D]
+            item_ids=sampled_ids,  # [N', R]
+            item_embeddings=sampled_negative_embeddings,  # [N', R, D]
+            **kwargs,
+        )  # [N', R]  # [0]
+        sampled_negatives_logits = torch.where(
+            supervision_ids.unsqueeze(1) == sampled_ids,  # [N', R]
+            -5e4,
+            sampled_negatives_logits / self._softmax_temperature,
+        )
+        jagged_loss = -F.log_softmax(
+            torch.cat([positive_logits, sampled_negatives_logits], dim=1), dim=1
+        )[:, 0]
+        return (
+            jagged_loss * supervision_weights
+        ).sum() / supervision_weights.sum(), aux_losses
+
+    def forward(  # pyre-ignore [15]
+        self,
+        lengths: torch.Tensor,
+        output_embeddings: torch.Tensor,
+        supervision_ids: torch.Tensor,
+        supervision_embeddings: torch.Tensor,
+        supervision_weights: torch.Tensor,
+        negatives_sampler: NegativesSampler,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """
+        Args:
+            lengths: [B] x int32 representing number of non-zero elements per row.
+            output_embeddings: [B, N, D] x float, embeddings for the current
+                input sequence.
+            supervision_ids: [B, N] x int64, (positive) supervision ids.
+            supervision_embeddings: [B, N, D] x float.
+            supervision_weights: Optional [B, N] x float. Optional weights for
+                masking out invalid positions, or reweighting supervision labels.
+            negatives_sampler: sampler used to obtain negative examples paired with
+                positives.
+
+        Returns:
+            Tuple of (loss for the current engaged sequence, str-keyed aux_losses).
+        """
+        torch._assert(
+            output_embeddings.size() == supervision_embeddings.size(),
+            "Invalid supervision embeddings size.",
+        )
+        torch._assert(
+            supervision_ids.size() == supervision_embeddings.size()[:-1],
+            "Invalid supervision ids size.",
+        )
+
+        jagged_id_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)
+        jagged_supervision_ids = (
+            torch.ops.fbgemm.dense_to_jagged(
+                supervision_ids.unsqueeze(-1).float(), [jagged_id_offsets]
+            )[0]
+            .squeeze(1)
+            .long()
+        )
+        if "user_ids" in kwargs:
+            # expand to jagged.
+            max_length: int = int(lengths.max())
+            kwargs["user_ids"] = torch.ops.fbgemm.dense_to_jagged(
+                kwargs["user_ids"]
+                .unsqueeze(1)
+                .expand(-1, max_length)
+                .unsqueeze(2),  # (B, max_length, 1)
+                [jagged_id_offsets],
+            )[0].squeeze(1)
+
+        args = OrderedDict(
+            [
+                (
+                    "output_embeddings",
+                    torch.ops.fbgemm.dense_to_jagged(
+                        output_embeddings,
+                        [jagged_id_offsets],
+                    )[0],
+                ),
+                ("supervision_ids", jagged_supervision_ids),
+                (
+                    "supervision_embeddings",
+                    torch.ops.fbgemm.dense_to_jagged(
+                        supervision_embeddings,
+                        [jagged_id_offsets],
+                    )[0],
+                ),
+                (
+                    "supervision_weights",
+                    torch.ops.fbgemm.dense_to_jagged(
+                        supervision_weights.unsqueeze(-1),
+                        [jagged_id_offsets],
+                    )[0].squeeze(1),
+                ),
+                ("negatives_sampler", negatives_sampler),
+            ]
+        )
+        args.update(kwargs)
+        if self._activation_checkpoint:
+            return checkpoint(
+                self.jagged_forward,
+                *args.values(),
+                use_reentrant=False,
+            )
+        else:
+            return self.jagged_forward(
+                output_embeddings=torch.ops.fbgemm.dense_to_jagged(
+                    output_embeddings,
+                    [jagged_id_offsets],
+                )[0],
+                supervision_ids=jagged_supervision_ids,
+                supervision_embeddings=torch.ops.fbgemm.dense_to_jagged(
+                    supervision_embeddings,
+                    [jagged_id_offsets],
+                )[0],
+                supervision_weights=torch.ops.fbgemm.dense_to_jagged(
+                    supervision_weights.unsqueeze(-1),
+                    [jagged_id_offsets],
+                )[0].squeeze(1),
+                negatives_sampler=negatives_sampler,
+                **kwargs,
+            )
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/output_postprocessors.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/output_postprocessors.py
new file mode 100644
index 000000000..3319dfd93
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/modeling/sequential/output_postprocessors.py
@@ -0,0 +1,82 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import abc
+
+import torch
+import torch.nn.functional as F
+
+
+class OutputPostprocessorModule(torch.nn.Module):
+    @abc.abstractmethod
+    def debug_str(self) -> str:
+        pass
+
+    @abc.abstractmethod
+    def forward(
+        self,
+        output_embeddings: torch.Tensor,
+    ) -> torch.Tensor:
+        pass
+
+
+class L2NormEmbeddingPostprocessor(OutputPostprocessorModule):
+    def __init__(
+        self,
+        embedding_dim: int,
+        eps: float = 1e-6,
+    ) -> None:
+        super().__init__()
+        self._embedding_dim: int = embedding_dim
+        self._eps: float = eps
+
+    def debug_str(self) -> str:
+        return "l2"
+
+    def forward(
+        self,
+        output_embeddings: torch.Tensor,
+    ) -> torch.Tensor:
+        output_embeddings = output_embeddings[..., : self._embedding_dim]
+        return output_embeddings / torch.clamp(
+            torch.linalg.norm(output_embeddings, ord=None, dim=-1, keepdim=True),
+            min=self._eps,
+        )
+
+
+class LayerNormEmbeddingPostprocessor(OutputPostprocessorModule):
+    def __init__(
+        self,
+        embedding_dim: int,
+        eps: float = 1e-6,
+    ) -> None:
+        super().__init__()
+        self._embedding_dim: int = embedding_dim
+        self._eps: float = eps
+
+    def debug_str(self) -> str:
+        return "ln"
+
+    def forward(
+        self,
+        output_embeddings: torch.Tensor,
+    ) -> torch.Tensor:
+        output_embeddings = output_embeddings[..., : self._embedding_dim]
+        return F.layer_norm(
+            output_embeddings,
+            normalized_shape=(self._embedding_dim,),
+            eps=self._eps,
+        )
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/sasrec.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/sasrec.py
new file mode 100644
index 000000000..2709ddb08
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/modeling/sequential/sasrec.py
@@ -0,0 +1,316 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+"""
+Implements SASRec (Self-Attentive Sequential Recommendation, https://arxiv.org/abs/1808.09781, ICDM'18).
+
+Compared with the original paper which used BCE loss, this implementation is modified so that
+we can utilize a Sampled Softmax loss proposed in Revisiting Neural Retrieval on Accelerators
+(https://arxiv.org/abs/2306.04039, KDD'23) and Turning Dross Into Gold Loss: is BERT4Rec really
+better than SASRec? (https://arxiv.org/abs/2309.07602, RecSys'23), where the authors showed
+sampled softmax loss to significantly improved SASRec model quality.
+"""
+
+from typing import Dict, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from generative_recommenders.research.modeling.sequential.embedding_modules import (
+    EmbeddingModule,
+)
+from generative_recommenders.research.modeling.sequential.input_features_preprocessors import (
+    InputFeaturesPreprocessorModule,
+)
+from generative_recommenders.research.modeling.sequential.output_postprocessors import (
+    OutputPostprocessorModule,
+)
+from generative_recommenders.research.modeling.sequential.utils import (
+    get_current_embeddings,
+)
+from generative_recommenders.research.modeling.similarity_module import (
+    SequentialEncoderWithLearnedSimilarityModule,
+)
+from generative_recommenders.research.rails.similarities.module import SimilarityModule
+
+
+class StandardAttentionFF(torch.nn.Module):
+    def __init__(
+        self,
+        embedding_dim: int,
+        hidden_dim: int,
+        activation_fn: str,
+        dropout_rate: float,
+    ) -> None:
+        super().__init__()
+
+        assert activation_fn == "relu" or activation_fn == "gelu", (
+            f"Invalid activation_fn {activation_fn}"
+        )
+
+        self._conv1d = torch.nn.Sequential(
+            torch.nn.Conv1d(
+                in_channels=embedding_dim,
+                out_channels=hidden_dim,
+                kernel_size=1,
+            ),
+            torch.nn.GELU() if activation_fn == "gelu" else torch.nn.ReLU(),
+            torch.nn.Dropout(p=dropout_rate),
+            torch.nn.Conv1d(
+                in_channels=hidden_dim,
+                out_channels=embedding_dim,
+                kernel_size=1,
+            ),
+            torch.nn.Dropout(p=dropout_rate),
+        )
+
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+        # Conv1D requires (B, D, N)
+        return self._conv1d(inputs.transpose(-1, -2)).transpose(-1, -2) + inputs
+
+
+class SASRec(SequentialEncoderWithLearnedSimilarityModule):
+    """
+    Implements SASRec (Self-Attentive Sequential Recommendation, https://arxiv.org/abs/1808.09781, ICDM'18).
+
+    Compared with the original paper which used BCE loss, this implementation is modified so that
+    we can utilize a Sampled Softmax loss proposed in Revisiting Neural Retrieval on Accelerators
+    (https://arxiv.org/abs/2306.04039, KDD'23) and Turning Dross Into Gold Loss: is BERT4Rec really
+    better than SASRec? (https://arxiv.org/abs/2309.07602, RecSys'23), where the authors showed
+    sampled softmax loss to significantly improved SASRec model quality.
+    """
+
+    def __init__(
+        self,
+        max_sequence_len: int,
+        max_output_len: int,
+        embedding_dim: int,
+        num_blocks: int,
+        num_heads: int,
+        ffn_hidden_dim: int,
+        ffn_activation_fn: str,
+        ffn_dropout_rate: float,
+        embedding_module: EmbeddingModule,
+        similarity_module: SimilarityModule,
+        input_features_preproc_module: InputFeaturesPreprocessorModule,
+        output_postproc_module: OutputPostprocessorModule,
+        activation_checkpoint: bool = False,
+        verbose: bool = False,
+    ) -> None:
+        super().__init__(ndp_module=similarity_module)
+
+        self._embedding_module: EmbeddingModule = embedding_module
+        self._embedding_dim: int = embedding_dim
+        self._item_embedding_dim: int = embedding_module.item_embedding_dim
+        self._max_sequence_length: int = max_sequence_len + max_output_len
+        self._input_features_preproc: InputFeaturesPreprocessorModule = (
+            input_features_preproc_module
+        )
+        self._output_postproc: OutputPostprocessorModule = output_postproc_module
+        self._activation_checkpoint: bool = activation_checkpoint
+        self._verbose: bool = verbose
+
+        self.attention_layers = torch.nn.ModuleList()
+        self.forward_layers = torch.nn.ModuleList()
+        self._num_blocks: int = num_blocks
+        self._num_heads: int = num_heads
+        self._ffn_hidden_dim: int = ffn_hidden_dim
+        self._ffn_activation_fn: str = ffn_activation_fn
+        self._ffn_dropout_rate: float = ffn_dropout_rate
+
+        for _ in range(num_blocks):
+            self.attention_layers.append(
+                torch.nn.MultiheadAttention(
+                    embed_dim=self._embedding_dim,
+                    num_heads=num_heads,
+                    dropout=ffn_dropout_rate,
+                    batch_first=True,
+                )
+            )
+            self.forward_layers.append(
+                StandardAttentionFF(
+                    embedding_dim=self._embedding_dim,
+                    hidden_dim=ffn_hidden_dim,
+                    activation_fn=ffn_activation_fn,
+                    dropout_rate=self._ffn_dropout_rate,
+                )
+            )
+
+        self.register_buffer(
+            "_attn_mask",
+            torch.triu(
+                torch.ones(
+                    (self._max_sequence_length, self._max_sequence_length),
+                    dtype=torch.bool,
+                ),
+                diagonal=1,
+            ),
+        )
+        self.reset_state()
+
+    def reset_state(self) -> None:
+        for name, params in self.named_parameters():
+            if (
+                "_input_features_preproc" in name
+                or "_embedding_module" in name
+                or "_output_postproc" in name
+            ):
+                if self._verbose:
+                    print(f"Skipping initialization for {name}")
+                continue
+            try:
+                torch.nn.init.xavier_normal_(params.data)
+                if self._verbose:
+                    print(
+                        f"Initialize {name} as xavier normal: {params.data.size()} params"
+                    )
+            except:
+                if self._verbose:
+                    print(f"Failed to initialize {name}: {params.data.size()} params")
+
+    def get_item_embeddings(self, item_ids: torch.Tensor) -> torch.Tensor:
+        return self._embedding_module.get_item_embeddings(item_ids)
+
+    def debug_str(self) -> str:
+        return (
+            f"SASRec-d{self._item_embedding_dim}-b{self._num_blocks}-h{self._num_heads}"
+            + "-"
+            + self._input_features_preproc.debug_str()
+            + "-"
+            + self._output_postproc.debug_str()
+            + f"-ffn{self._ffn_hidden_dim}-{self._ffn_activation_fn}-d{self._ffn_dropout_rate}"
+            + f"{'-ac' if self._activation_checkpoint else ''}"
+        )
+
+    def _run_one_layer(
+        self,
+        i: int,
+        user_embeddings: torch.Tensor,
+        valid_mask: torch.Tensor,
+    ) -> torch.Tensor:
+        Q = F.layer_norm(
+            user_embeddings,
+            normalized_shape=(self._embedding_dim,),
+            eps=1e-8,
+        )
+        mha_outputs, _ = self.attention_layers[i](
+            query=Q,
+            key=user_embeddings,
+            value=user_embeddings,
+            attn_mask=self._attn_mask,
+        )
+        user_embeddings = self.forward_layers[i](
+            F.layer_norm(
+                Q + mha_outputs,
+                normalized_shape=(self._embedding_dim,),
+                eps=1e-8,
+            )
+        )
+        user_embeddings *= valid_mask
+        return user_embeddings
+
+    def generate_user_embeddings(
+        self,
+        past_lengths: torch.Tensor,
+        past_ids: torch.Tensor,
+        past_embeddings: torch.Tensor,
+        past_payloads: Dict[str, torch.Tensor],
+    ) -> torch.Tensor:
+        """
+        Args:
+            past_ids: (B, N,) x int
+
+        Returns:
+            (B, N, D,) x float
+        """
+        past_lengths, user_embeddings, valid_mask = self._input_features_preproc(
+            past_lengths=past_lengths,
+            past_ids=past_ids,
+            past_embeddings=past_embeddings,
+            past_payloads=past_payloads,
+        )
+
+        for i in range(len(self.attention_layers)):
+            if self._activation_checkpoint:
+                user_embeddings = torch.utils.checkpoint.checkpoint(
+                    self._run_one_layer,
+                    i,
+                    user_embeddings,
+                    valid_mask,
+                    use_reentrant=False,
+                )
+            else:
+                user_embeddings = self._run_one_layer(i, user_embeddings, valid_mask)
+
+        return self._output_postproc(user_embeddings)
+
+    def forward(
+        self,
+        past_lengths: torch.Tensor,
+        past_ids: torch.Tensor,
+        past_embeddings: torch.Tensor,
+        past_payloads: Dict[str, torch.Tensor],
+        batch_id: Optional[int] = None,
+    ) -> torch.Tensor:
+        """
+        Args:
+            past_ids: [B, N] x int64 where the latest engaged ids come first. In
+                particular, [:, 0] should correspond to the last engaged values.
+            past_ratings: [B, N] x int64.
+            past_timestamps: [B, N] x int64.
+
+        Returns:
+            encoded_embeddings of [B, N, D].
+        """
+        encoded_embeddings = self.generate_user_embeddings(
+            past_lengths,
+            past_ids,
+            past_embeddings,
+            past_payloads,
+        )
+        return encoded_embeddings
+
+    def encode(
+        self,
+        past_lengths: torch.Tensor,
+        past_ids: torch.Tensor,  # [B, N] x int64
+        past_embeddings: torch.Tensor,
+        past_payloads: Dict[str, torch.Tensor],
+    ) -> torch.Tensor:
+        encoded_seq_embeddings = self.generate_user_embeddings(
+            past_lengths, past_ids, past_embeddings, past_payloads
+        )  # [B, N, D]
+        return get_current_embeddings(
+            lengths=past_lengths, encoded_embeddings=encoded_seq_embeddings
+        )
+
+    def predict(
+        self,
+        past_ids: torch.Tensor,
+        past_ratings: torch.Tensor,
+        past_timestamps: torch.Tensor,
+        next_timestamps: torch.Tensor,
+        target_ids: torch.Tensor,
+        batch_id: Optional[int] = None,
+    ) -> torch.Tensor:
+        return self.interaction(  # pyre-ignore [29]
+            self.encode(
+                past_ids,
+                past_ratings,
+                past_timestamps,
+                next_timestamps,  # pyre-ignore [6]
+            ),
+            target_ids,
+        )  # [B, X]
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/utils.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/utils.py
new file mode 100644
index 000000000..60dfb8e44
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/modeling/sequential/utils.py
@@ -0,0 +1,129 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import torch
+
+
+def batch_gather_embeddings(
+    rowwise_indices: torch.Tensor,
+    embeddings: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Args:
+        rowwise_indices: (B, N) x int, where each entry is in [0, X).
+        embeddings: (B, X, D,) x float.
+
+    Returns:
+        (B, N, D,) x float, embeddings corresponding to rowwise_indices.
+    """
+    _, N = rowwise_indices.size()
+    B, X, D = embeddings.size()
+    flattened_indices = (
+        rowwise_indices
+        + torch.arange(
+            start=0,
+            end=B,
+            step=1,
+            dtype=rowwise_indices.dtype,
+            device=rowwise_indices.device,
+        )
+        .unsqueeze(1)
+        .expand(-1, N)
+        * X
+    )
+    return embeddings.view(-1, D)[flattened_indices, :].reshape(
+        rowwise_indices.size() + (D,)
+    )
+
+
+def batch_scatter_embeddings(
+    dst_embeddings: torch.Tensor,
+    rowwise_indices: torch.Tensor,
+    src_embeddings: torch.Tensor,
+) -> None:
+    """
+    Args:
+        dst_embeddings: (B, N, D,) x float.
+        rowwise_indices: (B,) x int, where each entry is in [0, N - 1).
+        source_embeddings: (B, D,) x float.
+    """
+    B, N, D = dst_embeddings.size()
+    flattened_indices = rowwise_indices + torch.arange(
+        start=0,
+        end=B * N,
+        step=N,
+        dtype=rowwise_indices.dtype,
+        device=rowwise_indices.device,
+    )
+    dst_embeddings.view(B * N, D)[flattened_indices, :] = src_embeddings
+
+
+def get_current_embeddings(
+    lengths: torch.Tensor,
+    encoded_embeddings: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Args:
+        lengths: (B,) x int
+        seq_embeddings: (B, N, D,) x float
+
+    Returns:
+        (B, D,) x float, where [i, :] == encoded_embeddings[i, lengths[i] - 1, :]
+    """
+    B, N, D = encoded_embeddings.size()
+    flattened_offsets = (lengths - 1) + torch.arange(
+        start=0, end=B, step=1, dtype=lengths.dtype, device=lengths.device
+    ) * N
+    return encoded_embeddings.reshape(-1, D)[flattened_offsets, :].reshape(B, D)
+
+
+def jagged_or_dense_repeat_interleave_dim0(
+    x: torch.Tensor, lengths: torch.Tensor, repeats: int
+) -> torch.Tensor:
+    if len(x.size()) == 3:
+        return x.repeat_interleave(repeats, dim=0)
+    else:
+        assert len(x.size()) == 2, f"x.size() = {x.size()}"
+        padded_x = torch.ops.fbgemm.jagged_to_padded_dense(
+            values=x,
+            offsets=[torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)],
+            max_lengths=[lengths.max()],
+            padding_value=0.0,
+        )
+        lengths = lengths.repeat_interleave(repeats, dim=0)
+        return torch.ops.fbgemm.dense_to_jagged(
+            padded_x.repeat_interleave(repeats, dim=0),
+            [torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)],
+        )[0]
+
+
+def jagged_or_dense_index_select_dim0(
+    x: torch.Tensor, lengths: torch.Tensor, indices: torch.Tensor
+) -> torch.Tensor:
+    if len(x.size()) == 3:
+        return x[indices, :, :]
+    else:
+        assert len(x.size()) == 2, f"x.size() = {x.size()}"
+        padded_x = torch.ops.fbgemm.jagged_to_padded_dense(
+            values=x,
+            offsets=[torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)],
+            max_lengths=[lengths.max()],
+            padding_value=0.0,
+        )
+        return torch.ops.fbgemm.dense_to_jagged(
+            padded_x[indices, :],
+            [torch.ops.fbgemm.asynchronous_complete_cumsum(lengths[indices])],
+        )[0]
diff --git a/recommendation_v4/generative_recommenders/research/modeling/similarity_module.py b/recommendation_v4/generative_recommenders/research/modeling/similarity_module.py
new file mode 100644
index 000000000..3ba32d239
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/modeling/similarity_module.py
@@ -0,0 +1,68 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import abc
+from typing import Optional
+
+import torch
+from generative_recommenders.research.rails.similarities.module import SimilarityModule
+
+
+class SequentialEncoderWithLearnedSimilarityModule(torch.nn.Module):
+    """
+    Interface enabling using various similarity functions (besides inner products)
+    as part of a sequential encoder/decoder.
+
+    See rails/ for more details.
+    """
+
+    def __init__(
+        self,
+        ndp_module: SimilarityModule,
+    ) -> None:
+        super().__init__()
+
+        self._ndp_module: SimilarityModule = ndp_module
+
+    @abc.abstractmethod
+    def debug_str(
+        self,
+    ) -> str:
+        pass
+
+    def similarity_fn(
+        self,
+        query_embeddings: torch.Tensor,
+        item_ids: torch.Tensor,
+        item_embeddings: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        torch._assert(
+            len(query_embeddings.size()) == 2, "len(query_embeddings.size()) must be 2"
+        )
+        torch._assert(len(item_ids.size()) == 2, "len(item_ids.size()) must be 2")
+        if item_embeddings is None:
+            item_embeddings = self.get_item_embeddings(item_ids)  # pyre-ignore [29]
+        torch._assert(
+            len(item_embeddings.size()) == 3, "len(item_embeddings.size()) must be 3"
+        )
+
+        return self._ndp_module(
+            query_embeddings=query_embeddings,  # (B, query_embedding_dim)
+            item_embeddings=item_embeddings,  # (1/B, X, item_embedding_dim)
+            item_ids=item_ids,
+            **kwargs,
+        )
diff --git a/recommendation_v4/generative_recommenders/research/modeling/similarity_utils.py b/recommendation_v4/generative_recommenders/research/modeling/similarity_utils.py
new file mode 100644
index 000000000..7fd870b4b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/modeling/similarity_utils.py
@@ -0,0 +1,222 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+from typing import List, Optional, Tuple
+
+import gin
+import torch
+from generative_recommenders.research.rails.similarities.dot_product_similarity_fn import (
+    DotProductSimilarity,
+)
+from generative_recommenders.research.rails.similarities.layers import SwiGLU
+from generative_recommenders.research.rails.similarities.mol.item_embeddings_fn import (
+    RecoMoLItemEmbeddingsFn,
+)
+from generative_recommenders.research.rails.similarities.mol.query_embeddings_fn import (
+    RecoMoLQueryEmbeddingsFn,
+)
+from generative_recommenders.research.rails.similarities.mol.similarity_fn import (
+    MoLSimilarity,
+    SoftmaxDropoutCombiner,
+)
+
+
+def init_mlp_xavier_weights_zero_bias(m) -> None:
+    if isinstance(m, torch.nn.Linear):
+        torch.nn.init.xavier_uniform(m.weight)
+        if getattr(m, "bias", None) is not None:
+            m.bias.data.fill_(0.0)
+
+
+@gin.configurable
+def create_mol_interaction_module(
+    query_embedding_dim: int,
+    item_embedding_dim: int,
+    dot_product_dimension: int,
+    query_dot_product_groups: int,
+    item_dot_product_groups: int,
+    temperature: float,
+    query_dropout_rate: float,
+    query_hidden_dim: int,
+    item_dropout_rate: float,
+    item_hidden_dim: int,
+    gating_query_hidden_dim: int,
+    gating_qi_hidden_dim: int,
+    gating_item_hidden_dim: int,
+    softmax_dropout_rate: float,
+    bf16_training: bool,
+    gating_query_fn: bool = True,
+    gating_item_fn: bool = True,
+    dot_product_l2_norm: bool = True,
+    query_nonlinearity: str = "geglu",
+    item_nonlinearity: str = "geglu",
+    uid_dropout_rate: float = 0.5,
+    uid_embedding_hash_sizes: Optional[List[int]] = None,
+    uid_embedding_level_dropout: bool = False,
+    gating_combination_type: str = "glu_silu",
+    gating_item_dropout_rate: float = 0.0,
+    gating_qi_dropout_rate: float = 0.0,
+    eps: float = 1e-6,
+) -> Tuple[MoLSimilarity, str]:
+    """
+    Gin wrapper for creating MoL learned similarity.
+    """
+    mol_module = MoLSimilarity(
+        query_embedding_dim=query_embedding_dim,
+        item_embedding_dim=item_embedding_dim,
+        dot_product_dimension=dot_product_dimension,
+        query_dot_product_groups=query_dot_product_groups,
+        item_dot_product_groups=item_dot_product_groups,
+        temperature=temperature,
+        dot_product_l2_norm=dot_product_l2_norm,
+        query_embeddings_fn=RecoMoLQueryEmbeddingsFn(
+            query_embedding_dim=query_embedding_dim,
+            query_dot_product_groups=query_dot_product_groups,
+            dot_product_dimension=dot_product_dimension,
+            dot_product_l2_norm=dot_product_l2_norm,
+            proj_fn=lambda input_dim, output_dim: (
+                torch.nn.Sequential(
+                    torch.nn.Dropout(p=query_dropout_rate),
+                    SwiGLU(
+                        in_features=input_dim,
+                        out_features=query_hidden_dim,
+                    ),
+                    torch.nn.Linear(
+                        in_features=query_hidden_dim,
+                        out_features=output_dim,
+                    ),
+                ).apply(init_mlp_xavier_weights_zero_bias)
+            ),
+            eps=eps,
+        ),
+        item_embeddings_fn=RecoMoLItemEmbeddingsFn(
+            item_embedding_dim=item_embedding_dim,
+            item_dot_product_groups=item_dot_product_groups,
+            dot_product_dimension=dot_product_dimension,
+            dot_product_l2_norm=dot_product_l2_norm,
+            proj_fn=lambda input_dim, output_dim: (
+                torch.nn.Sequential(
+                    torch.nn.Dropout(p=item_dropout_rate),
+                    SwiGLU(in_features=input_dim, out_features=item_hidden_dim),
+                    torch.nn.Linear(
+                        in_features=item_hidden_dim,
+                        out_features=output_dim,
+                    ),
+                ).apply(init_mlp_xavier_weights_zero_bias)
+            ),
+            eps=eps,
+        ),
+        gating_query_only_partial_fn=lambda input_dim, output_dim: (  # pyre-ignore [6]
+            torch.nn.Sequential(
+                torch.nn.Linear(
+                    in_features=input_dim,
+                    out_features=gating_query_hidden_dim,
+                ),
+                torch.nn.SiLU(),
+                torch.nn.Linear(
+                    in_features=gating_query_hidden_dim,
+                    out_features=output_dim,
+                    bias=False,
+                ),
+            ).apply(init_mlp_xavier_weights_zero_bias)
+            if gating_query_fn
+            else None
+        ),
+        gating_item_only_partial_fn=lambda input_dim, output_dim: (  # pyre-ignore [6]
+            torch.nn.Sequential(
+                torch.nn.Dropout(p=gating_item_dropout_rate),
+                torch.nn.Linear(
+                    in_features=input_dim,
+                    out_features=gating_item_hidden_dim,
+                ),
+                torch.nn.SiLU(),
+                torch.nn.Linear(
+                    in_features=gating_item_hidden_dim,
+                    out_features=output_dim,
+                    bias=False,
+                ),
+            ).apply(init_mlp_xavier_weights_zero_bias)
+            if gating_item_fn
+            else None
+        ),
+        gating_qi_partial_fn=lambda input_dim, output_dim: (  # pyre-ignore [6]
+            torch.nn.Sequential(
+                torch.nn.Dropout(p=gating_qi_dropout_rate),
+                torch.nn.Linear(
+                    in_features=input_dim,
+                    out_features=gating_qi_hidden_dim,
+                ),
+                torch.nn.SiLU(),
+                torch.nn.Linear(
+                    in_features=gating_qi_hidden_dim,
+                    out_features=output_dim,
+                ),
+            ).apply(init_mlp_xavier_weights_zero_bias)
+            if gating_qi_hidden_dim > 0
+            else torch.nn.Sequential(
+                torch.nn.Dropout(p=gating_qi_dropout_rate),
+                torch.nn.Linear(
+                    in_features=input_dim,
+                    out_features=output_dim,
+                ),
+            ).apply(init_mlp_xavier_weights_zero_bias)
+        ),
+        gating_combination_type=gating_combination_type,
+        gating_normalization_fn=lambda _: SoftmaxDropoutCombiner(
+            dropout_rate=softmax_dropout_rate, eps=1e-6
+        ),
+        eps=eps,
+        autocast_bf16=bf16_training,
+    )
+    interaction_module_debug_str = (
+        f"MoL-{query_dot_product_groups}x{item_dot_product_groups}x{dot_product_dimension}"
+        + f"-t{temperature}-d{softmax_dropout_rate}"
+        + f"{'-l2' if dot_product_l2_norm else ''}"
+        + f"-q{query_hidden_dim}d{query_dropout_rate}{query_nonlinearity}"
+        + f"-i{item_hidden_dim}d{item_dropout_rate}{item_nonlinearity}"
+        + (f"-gq{gating_query_hidden_dim}" if gating_query_fn else "")
+        + (
+            f"-gi{gating_item_hidden_dim}d{gating_item_dropout_rate}"
+            if gating_item_fn
+            else ""
+        )
+        + f"-gqi{gating_qi_hidden_dim}d{gating_qi_dropout_rate}-x-{gating_combination_type}"
+    )
+    return mol_module, interaction_module_debug_str
+
+
+@gin.configurable
+def get_similarity_function(
+    module_type: str,
+    query_embedding_dim: int,
+    item_embedding_dim: int,
+    bf16_training: bool = False,
+    activation_checkpoint: bool = False,
+) -> Tuple[torch.nn.Module, str]:
+    if module_type == "DotProduct":
+        interaction_module = DotProductSimilarity()
+        interaction_module_debug_str = "DotProduct"
+    elif module_type == "MoL":
+        interaction_module, interaction_module_debug_str = (
+            create_mol_interaction_module(
+                query_embedding_dim=query_embedding_dim,
+                item_embedding_dim=item_embedding_dim,
+                bf16_training=bf16_training,
+            )
+        )
+    else:
+        raise ValueError(f"Unknown interaction_module_type {module_type}")
+    return interaction_module, interaction_module_debug_str
diff --git a/recommendation_v4/generative_recommenders/research/rails/indexing/candidate_index.py b/recommendation_v4/generative_recommenders/research/rails/indexing/candidate_index.py
new file mode 100644
index 000000000..f628468ce
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/rails/indexing/candidate_index.py
@@ -0,0 +1,41 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import abc
+from typing import Tuple
+
+import torch
+
+
+class TopKModule(torch.nn.Module):
+    @abc.abstractmethod
+    def forward(
+        self,
+        query_embeddings: torch.Tensor,
+        k: int,
+        sorted: bool = True,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Args:
+            query_embeddings: (B, X, ...). Implementation-specific.
+            k: int. top k to return.
+            sorted: bool.
+
+        Returns:
+            Tuple of (top_k_scores, top_k_ids), both of shape (B, K,)
+        """
+        pass
diff --git a/recommendation_v4/generative_recommenders/research/rails/indexing/mips_top_k.py b/recommendation_v4/generative_recommenders/research/rails/indexing/mips_top_k.py
new file mode 100644
index 000000000..810b24c42
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/rails/indexing/mips_top_k.py
@@ -0,0 +1,80 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+from typing import Tuple
+
+import torch
+from generative_recommenders.research.rails.indexing.candidate_index import TopKModule
+
+
+class MIPSTopKModule(TopKModule):
+    def __init__(
+        self,
+        item_embeddings: torch.Tensor,
+        item_ids: torch.Tensor,
+    ) -> None:
+        """
+        Args:
+            item_embeddings: (1, X, D)
+            item_ids: (1, X,)
+        """
+        super().__init__()
+
+        self._item_embeddings: torch.Tensor = item_embeddings
+        self._item_ids: torch.Tensor = item_ids
+
+
+class MIPSBruteForceTopK(MIPSTopKModule):
+    def __init__(
+        self,
+        item_embeddings: torch.Tensor,
+        item_ids: torch.Tensor,
+    ) -> None:
+        super().__init__(
+            item_embeddings=item_embeddings,
+            item_ids=item_ids,
+        )
+        del self._item_embeddings
+        self._item_embeddings_t: torch.Tensor = item_embeddings.permute(
+            2, 1, 0
+        ).squeeze(2)
+
+    def forward(
+        self,
+        query_embeddings: torch.Tensor,
+        k: int,
+        sorted: bool = True,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Args:
+            query_embeddings: (B, ...). Implementation-specific.
+            k: int. final top-k to return.
+            sorted: bool. whether to sort final top-k results or not.
+
+        Returns:
+            Tuple of (top_k_scores x float, top_k_ids x int), both of shape (B, K,)
+        """
+        # (B, X,)
+        all_logits = torch.mm(query_embeddings, self._item_embeddings_t)
+        top_k_logits, top_k_indices = torch.topk(
+            all_logits,
+            dim=1,
+            k=k,
+            sorted=sorted,
+            largest=True,
+        )  # (B, k,)
+        return top_k_logits, self._item_ids.squeeze(0)[top_k_indices]
diff --git a/recommendation_v4/generative_recommenders/research/rails/indexing/mol_top_k.py b/recommendation_v4/generative_recommenders/research/rails/indexing/mol_top_k.py
new file mode 100644
index 000000000..fe88ca919
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/rails/indexing/mol_top_k.py
@@ -0,0 +1,132 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+"""
+Defines exact- and approximate- Top-K modules for Mixture-of-Logits (MoL),
+discussed in Retrieval with Learned Similarities (https://arxiv.org/abs/2407.15462).
+
+Forked from bailuding/rails @ 664fdb9.
+"""
+
+from typing import Tuple
+
+import torch
+from generative_recommenders.research.rails.indexing.candidate_index import TopKModule
+from generative_recommenders.research.rails.similarities.mol.similarity_fn import (
+    MoLSimilarity,
+)
+
+
+class MoLTopKModule(TopKModule):
+    def __init__(
+        self,
+        mol_module: MoLSimilarity,
+        item_embeddings: torch.Tensor,
+        item_ids: torch.Tensor,
+        flatten_item_ids_and_embeddings: bool,
+        keep_component_level_item_embeddings: bool,
+        component_level_item_embeddings_dtype: torch.dtype = torch.bfloat16,
+    ) -> None:
+        """
+        Args:
+            mol_module: MoLSimilarity.
+            item_embeddings: (1, X, D) if mol_module._apply_item_embeddings_fn is True,
+                (1, X, P_X, D_P) otherwise.
+            item_ids: (1, X,) representing the item ids.
+            flatten_item_ids_and_embeddings: bool. If true, do not keep the extra (1,)
+                dimension at size(0).
+            keep_component_level_item_embeddings: bool. If true, keep P_x component-level
+                embeddings in `self._mol_item_embeddings` for downstream applications.
+            component_level_item_embeddings_dtype: torch.dtype. If set, the dtype
+                to keep component-level item embeddings in. By default we use bfloat16.
+        """
+        super().__init__()
+
+        self._mol_module: MoLSimilarity = mol_module
+        self._item_embeddings: torch.Tensor = (
+            item_embeddings
+            if not flatten_item_ids_and_embeddings
+            else item_embeddings.squeeze(0)
+        )
+
+        if keep_component_level_item_embeddings:
+            self._mol_item_embeddings: torch.Tensor = (
+                mol_module.get_item_component_embeddings(
+                    (
+                        self._item_embeddings.squeeze(0)
+                        if not flatten_item_ids_and_embeddings
+                        else self._item_embeddings
+                    ),
+                    decoupled_inference=True,
+                )[0]  # (X, D) -> (X, P_X, D_P)
+            ).to(component_level_item_embeddings_dtype)
+
+        self._item_ids: torch.Tensor = (
+            item_ids if not flatten_item_ids_and_embeddings else item_ids.squeeze(0)
+        )
+
+    @property
+    def mol_module(self) -> MoLSimilarity:
+        return self._mol_module
+
+
+class MoLBruteForceTopK(MoLTopKModule):
+    def __init__(
+        self,
+        mol_module: MoLSimilarity,
+        item_embeddings: torch.Tensor,
+        item_ids: torch.Tensor,
+    ) -> None:
+        super().__init__(
+            mol_module=mol_module,
+            item_embeddings=item_embeddings,
+            item_ids=item_ids,
+            flatten_item_ids_and_embeddings=False,
+            keep_component_level_item_embeddings=False,
+        )
+
+    def forward(
+        self,
+        query_embeddings: torch.Tensor,
+        k: int,
+        sorted: bool = True,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Args:
+            query_embeddings: (B, X, D) if mol_module._apply_query_embeddings_fn is True,
+                (B, X, P_Q, D_P) otherwise.
+            k: int. final top-k to return.
+            sorted: bool. whether to sort final top-k results or not.
+            **kwargs: Implementation-specific keys/values.
+
+        Returns:
+            Tuple of (top_k_scores x float, top_k_ids x int), both of shape (B, K,)
+        """
+        # (B, X,)
+        all_logits, _ = self.mol_module(
+            query_embeddings,
+            self._item_embeddings,
+            **kwargs,
+        )
+        top_k_logits, top_k_indices = torch.topk(
+            all_logits,
+            dim=1,
+            k=k,
+            sorted=sorted,
+            largest=True,
+        )  # (B, k,)
+        return top_k_logits, self._item_ids.squeeze(0)[top_k_indices]
diff --git a/recommendation_v4/generative_recommenders/research/rails/similarities/dot_product_similarity_fn.py b/recommendation_v4/generative_recommenders/research/rails/similarities/dot_product_similarity_fn.py
new file mode 100644
index 000000000..9357fd0e4
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/rails/similarities/dot_product_similarity_fn.py
@@ -0,0 +1,68 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+from typing import Dict, Tuple
+
+import torch
+from generative_recommenders.research.rails.similarities.module import SimilarityModule
+
+
+class DotProductSimilarity(SimilarityModule):
+    def __init__(
+        self,
+    ) -> None:
+        super().__init__()
+
+    def debug_str(self) -> str:
+        return "dp"
+
+    def forward(
+        self,
+        query_embeddings: torch.Tensor,
+        item_embeddings: torch.Tensor,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """
+        Args:
+            query_embeddings: (B, D,) or (B * r, D) x float.
+            item_embeddings: (1, X, D) or (B, X, D) x float.
+
+        Returns:
+            (B, X) x float.
+        """
+
+        B_I, X, D = item_embeddings.size()
+        if B_I == 1:
+            # [B, D] x ([1, X, D] -> [D, X]) => [B, X]
+            return (
+                torch.mm(query_embeddings, item_embeddings.squeeze(0).t()),
+                {},
+            )  # [B, X]
+        elif query_embeddings.size(0) != B_I:
+            # (B * r, D) x (B, X, D).
+            return (
+                torch.bmm(
+                    query_embeddings.view(B_I, -1, D),
+                    item_embeddings.permute(0, 2, 1),
+                ).view(-1, X),
+                {},
+            )
+        else:
+            # [B, X, D] x ([B, D] -> [B, D, 1]) => [B, X, 1] -> [B, X]
+            return (
+                torch.bmm(item_embeddings, query_embeddings.unsqueeze(2)).squeeze(2),
+                {},
+            )
diff --git a/recommendation_v4/generative_recommenders/research/rails/similarities/layers.py b/recommendation_v4/generative_recommenders/research/rails/similarities/layers.py
new file mode 100644
index 000000000..3f838bc48
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/rails/similarities/layers.py
@@ -0,0 +1,82 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+"""
+Defines network architectures used in constructing various learned similarities.
+
+Forked from bailuding/rails @ 664fdb9.
+"""
+
+import torch
+import torch.nn.functional as F
+
+
+class GeGLU(torch.nn.Module):
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+    ) -> None:
+        super().__init__()
+
+        self._in_features = in_features
+        self._out_features = out_features
+        self._w = torch.nn.Parameter(
+            torch.empty((in_features, out_features * 2)).normal_(mean=0, std=0.02),
+        )
+        self._b = torch.nn.Parameter(
+            torch.zeros((1, out_features * 2)),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        bs = x.size()[:-1]
+        lhs, rhs = torch.split(
+            torch.mm(x.reshape(-1, self._in_features), self._w) + self._b,
+            [self._out_features, self._out_features],
+            dim=-1,
+        )
+        return (F.gelu(lhs) * rhs).reshape(bs + (self._out_features,))
+
+
+class SwiGLU(torch.nn.Module):
+    """
+    SwiGLU from https://arxiv.org/abs/2002.05202.
+    """
+
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+    ) -> None:
+        super().__init__()
+
+        self._in_features = in_features
+        self._out_features = out_features
+        self._w = torch.nn.Parameter(
+            torch.empty((in_features, out_features * 2)).normal_(mean=0, std=0.02),
+        )
+        self._b = torch.nn.Parameter(
+            torch.zeros((1, out_features * 2)),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        bs = x.size()[:-1]
+        lhs, rhs = torch.split(
+            torch.mm(x.reshape(-1, self._in_features), self._w) + self._b,
+            [self._out_features, self._out_features],
+            dim=-1,
+        )
+        return (F.silu(lhs) * rhs).reshape(bs + (self._out_features,))
diff --git a/recommendation_v4/generative_recommenders/research/rails/similarities/module.py b/recommendation_v4/generative_recommenders/research/rails/similarities/module.py
new file mode 100644
index 000000000..e4061fa74
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/rails/similarities/module.py
@@ -0,0 +1,55 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import abc
+from typing import Dict, Tuple
+
+import torch
+
+
+class SimilarityModule(torch.nn.Module):
+    """
+    Interface enabling interfacing with various similarity functions.
+
+    While the discussions in our initial ICML'24 paper are based on inner products
+    for simplicity, we provide this interface (SimilarityModule) to support various
+    learned similarities at the retrieval stage, such as MLPs, Factorization Machines
+    (FMs), and Mixture-of-Logits (MoL), which we discussed in
+    - Revisiting Neural Retrieval on Accelerators (KDD'23), and
+    - Retrieval with Learned Similarities (https://arxiv.org/abs/2407.15462).
+    """
+
+    @abc.abstractmethod
+    def forward(
+        self,
+        query_embeddings: torch.Tensor,
+        item_embeddings: torch.Tensor,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """
+        Args:
+            query_embeddings: (B, input_embedding_dim) x float.
+            item_embeddings: (1/B, X, item_embedding_dim) x float.
+            **kwargs: Implementation-specific keys/values (e.g.,
+                item ids / sideinfo, etc.)
+
+        Returns:
+            A tuple of (
+                (B, X,) similarity values,
+                keyed outputs representing auxiliary losses at training time.
+            ).
+        """
+        pass
diff --git a/recommendation_v4/generative_recommenders/research/rails/similarities/mol/embeddings_fn.py b/recommendation_v4/generative_recommenders/research/rails/similarities/mol/embeddings_fn.py
new file mode 100644
index 000000000..fd94e6f22
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/rails/similarities/mol/embeddings_fn.py
@@ -0,0 +1,52 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+"""
+Defines interface for generating query- and item-side embeddings for MoL.
+
+Forked from bailuding/rails @ 664fdb9.
+"""
+
+import abc
+from typing import Dict, Tuple
+
+import torch
+
+
+class MoLEmbeddingsFn(torch.nn.Module):
+    """
+    Generates K_Q query-side (K_I item-side) embeddings for MoL based on
+    input embeddings and other optional implementation-specific tensors.
+    """
+
+    @abc.abstractmethod
+    def forward(
+        self,
+        input_embeddings: torch.Tensor,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """
+        Args:
+            input_embeddings: (B, ...) x float where B is the batch size.
+            kwargs: implementation-specific.
+
+        Returns:
+            Tuple of (
+                (B, query_dot_product_groups/item_dot_product_groups, dot_product_embedding_dim) x float,
+                str-keyed auxiliary losses.
+            ).
+        """
+        pass
diff --git a/recommendation_v4/generative_recommenders/research/rails/similarities/mol/item_embeddings_fn.py b/recommendation_v4/generative_recommenders/research/rails/similarities/mol/item_embeddings_fn.py
new file mode 100644
index 000000000..237cd8942
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/rails/similarities/mol/item_embeddings_fn.py
@@ -0,0 +1,99 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+"""
+Defines functions to generate item-side embeddings for MoL.
+
+Forked from bailuding/rails @ 664fdb9.
+"""
+
+from typing import Callable, Dict, Tuple
+
+import torch
+from generative_recommenders.research.rails.similarities.mol.embeddings_fn import (
+    MoLEmbeddingsFn,
+)
+
+
+def init_mlp_xavier_weights_zero_bias(m) -> None:
+    if isinstance(m, torch.nn.Linear):
+        torch.nn.init.xavier_uniform_(m.weight)
+        if getattr(m, "bias", None) is not None:
+            m.bias.data.fill_(0.0)
+
+
+class RecoMoLItemEmbeddingsFn(MoLEmbeddingsFn):
+    """
+    Generates P_X query-side embeddings for MoL based on input embeddings and other
+    optional tensors for recommendation models. Tested for sequential retrieval
+    scenarios.
+    """
+
+    def __init__(
+        self,
+        item_embedding_dim: int,
+        item_dot_product_groups: int,
+        dot_product_dimension: int,
+        dot_product_l2_norm: bool,
+        proj_fn: Callable[[int, int], torch.nn.Module],
+        eps: float,
+    ) -> None:
+        super().__init__()
+
+        self._item_emb_based_dot_product_groups: int = item_dot_product_groups
+        self._item_emb_proj_module: torch.nn.Module = proj_fn(
+            item_embedding_dim,
+            dot_product_dimension * self._item_emb_based_dot_product_groups,
+        )
+        self._dot_product_dimension: int = dot_product_dimension
+        self._dot_product_l2_norm: bool = dot_product_l2_norm
+        self._eps: float = eps
+
+    def forward(
+        self,
+        input_embeddings: torch.Tensor,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """
+        Args:
+            input_embeddings: (B, item_embedding_dim,) x float where B is the batch size.
+            kwargs: str-keyed tensors. Implementation-specific.
+
+        Returns:
+            Tuple of (
+                (B, item_dot_product_groups, dot_product_embedding_dim) x float,
+                str-keyed aux_losses,
+            ).
+        """
+        split_item_embeddings = self._item_emb_proj_module(input_embeddings).reshape(
+            input_embeddings.size()[:-1]
+            + (
+                self._item_emb_based_dot_product_groups,
+                self._dot_product_dimension,
+            )
+        )
+
+        if self._dot_product_l2_norm:
+            split_item_embeddings = split_item_embeddings / torch.clamp(
+                torch.linalg.norm(
+                    split_item_embeddings,
+                    ord=None,
+                    dim=-1,
+                    keepdim=True,
+                ),
+                min=self._eps,
+            )
+        return split_item_embeddings, {}
diff --git a/recommendation_v4/generative_recommenders/research/rails/similarities/mol/query_embeddings_fn.py b/recommendation_v4/generative_recommenders/research/rails/similarities/mol/query_embeddings_fn.py
new file mode 100644
index 000000000..8fe28ee11
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/rails/similarities/mol/query_embeddings_fn.py
@@ -0,0 +1,164 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+"""
+Defines functions to generate query-side embeddings for MoL.
+
+Forked from bailuding/rails @ 664fdb9.
+"""
+
+from typing import Callable, Dict, List, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from generative_recommenders.research.rails.similarities.mol.embeddings_fn import (
+    MoLEmbeddingsFn,
+)
+
+
+def init_mlp_xavier_weights_zero_bias(m) -> None:
+    if isinstance(m, torch.nn.Linear):
+        torch.nn.init.xavier_uniform_(m.weight)
+        if getattr(m, "bias", None) is not None:
+            m.bias.data.fill_(0.0)
+
+
+class RecoMoLQueryEmbeddingsFn(MoLEmbeddingsFn):
+    """
+    Generates P_Q query-side embeddings for MoL based on input embeddings and other
+    optional tensors for recommendation models. Tested for sequential retrieval
+    scenarios.
+
+    The current implementation accesses user_ids associated with the query from
+    `user_ids' in kwargs.
+    """
+
+    def __init__(
+        self,
+        query_embedding_dim: int,
+        query_dot_product_groups: int,
+        dot_product_dimension: int,
+        dot_product_l2_norm: bool,
+        proj_fn: Callable[[int, int], torch.nn.Module],
+        eps: float,
+        uid_embedding_hash_sizes: Optional[List[int]] = None,
+        uid_dropout_rate: float = 0.0,
+        uid_embedding_level_dropout: bool = False,
+    ) -> None:
+        super().__init__()
+        self._uid_embedding_hash_sizes: List[int] = uid_embedding_hash_sizes or []
+        self._query_emb_based_dot_product_groups: int = query_dot_product_groups - len(
+            self._uid_embedding_hash_sizes
+        )
+        self._query_emb_proj_module: torch.nn.Module = proj_fn(
+            query_embedding_dim,
+            dot_product_dimension * self._query_emb_based_dot_product_groups,
+        )
+        self._dot_product_dimension: int = dot_product_dimension
+        self._dot_product_l2_norm: bool = dot_product_l2_norm
+        if len(self._uid_embedding_hash_sizes) > 0:
+            for i, hash_size in enumerate(self._uid_embedding_hash_sizes):
+                setattr(
+                    self,
+                    f"_uid_embeddings_{i}",
+                    torch.nn.Embedding(
+                        hash_size + 1, dot_product_dimension, padding_idx=0
+                    ),
+                )
+        self._uid_dropout_rate: float = uid_dropout_rate
+        self._uid_embedding_level_dropout: bool = uid_embedding_level_dropout
+        self._eps: float = eps
+
+    def forward(
+        self,
+        input_embeddings: torch.Tensor,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """
+        Args:
+            input_embeddings: (B, query_embedding_dim,) x float where B is the batch size.
+            kwargs: str-keyed tensors. Implementation-specific.
+
+        Returns:
+            Tuple of (
+                (B, query_dot_product_groups, dot_product_embedding_dim) x float,
+                str-keyed aux_losses,
+            ).
+        """
+        split_query_embeddings = self._query_emb_proj_module(input_embeddings).reshape(
+            (
+                input_embeddings.size(0),
+                self._query_emb_based_dot_product_groups,
+                self._dot_product_dimension,
+            )
+        )
+
+        aux_losses: Dict[str, torch.Tensor] = {}
+
+        if len(self._uid_embedding_hash_sizes) > 0:
+            all_uid_embeddings = []
+            for i, hash_size in enumerate(self._uid_embedding_hash_sizes):
+                # TODO: decouple this from MoLQueryEmbeddingFn.
+                uid_embeddings = getattr(self, f"_uid_embeddings_{i}")(
+                    (kwargs["user_ids"] % hash_size) + 1
+                )
+                if self.training:
+                    l2_norm = (uid_embeddings * uid_embeddings).sum(-1).mean()
+                    if i == 0:
+                        aux_losses["uid_embedding_l2_norm"] = l2_norm
+                    else:
+                        aux_losses["uid_embedding_l2_norm"] = (
+                            aux_losses["uid_embedding_l2_norm"] + l2_norm
+                        )
+
+                if self._uid_dropout_rate > 0.0:
+                    if self._uid_embedding_level_dropout:
+                        # conditionally dropout the entire embedding.
+                        if self.training:
+                            uid_dropout_mask = (
+                                torch.rand(
+                                    uid_embeddings.size()[:-1],
+                                    device=uid_embeddings.device,
+                                )
+                                > self._uid_dropout_rate
+                            )
+                            uid_embeddings = (
+                                uid_embeddings
+                                * uid_dropout_mask.unsqueeze(-1)
+                                / (1.0 - self._uid_dropout_rate)
+                            )
+                    else:
+                        uid_embeddings = F.dropout(
+                            uid_embeddings,
+                            p=self._uid_dropout_rate,
+                            training=self.training,
+                        )
+                all_uid_embeddings.append(uid_embeddings.unsqueeze(1))
+            split_query_embeddings = torch.cat(
+                [split_query_embeddings] + all_uid_embeddings, dim=1
+            )
+
+        if self._dot_product_l2_norm:
+            split_query_embeddings = split_query_embeddings / torch.clamp(
+                torch.linalg.norm(
+                    split_query_embeddings,
+                    ord=None,
+                    dim=-1,
+                    keepdim=True,
+                ),
+                min=self._eps,
+            )
+        return split_query_embeddings, aux_losses
diff --git a/recommendation_v4/generative_recommenders/research/rails/similarities/mol/similarity_fn.py b/recommendation_v4/generative_recommenders/research/rails/similarities/mol/similarity_fn.py
new file mode 100644
index 000000000..34e4c4a23
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/rails/similarities/mol/similarity_fn.py
@@ -0,0 +1,388 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+"""
+Implements MoL (Mixture-of-Logits) with load balancing regularization loss, as discussed in:
+- Revisiting Neural Retrieval on Accelerators (https://arxiv.org/abs/2306.04039, KDD'23).
+- Retrieval with Learned Similarities (https://arxiv.org/abs/2407.15462).
+
+Forked from bailuding/rails @ 664fdb9.
+"""
+
+from typing import Callable, Dict, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from generative_recommenders.research.rails.similarities.module import SimilarityModule
+from generative_recommenders.research.rails.similarities.mol.embeddings_fn import (
+    MoLEmbeddingsFn,
+)
+
+
+@torch.compile(dynamic=True)
+def _softmax_dropout_combiner_fn(
+    x: torch.Tensor,
+    y: torch.Tensor,
+    dropout_pr: float,
+    eps: float,
+    training: bool,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Computes (_softmax_dropout_fn(x) * y).sum(-1).
+    """
+    x = F.softmax(x, dim=-1)
+    if dropout_pr > 0.0:
+        x = F.dropout(x, p=dropout_pr, training=training)
+        x = x / torch.clamp(x.sum(-1, keepdims=True), min=eps)  # pyre-ignore [19]
+    return x, (x * y).sum(-1)
+
+
+@torch.compile
+def _load_balancing_mi_loss_fn(
+    gating_prs: torch.Tensor,
+    eps: float,
+) -> torch.Tensor:
+    """
+    See Retrieval with Learned Similarities (RAILS, https://arxiv.org/abs/2407.15462) for discussions.
+    """
+    B, X, E = gating_prs.size()
+    expert_util_prs = gating_prs.view(B * X, E).sum(0, keepdim=False) / (1.0 * B * X)
+    expert_util_entropy = -(expert_util_prs * torch.log(expert_util_prs + eps)).sum()
+    per_example_expert_entropy = -(gating_prs * torch.log(gating_prs + eps)).sum() / (
+        1.0 * B * X
+    )
+    return -expert_util_entropy + per_example_expert_entropy
+
+
+class SoftmaxDropoutCombiner(torch.nn.Module):
+    def __init__(
+        self,
+        dropout_rate: float,
+        eps: float,
+    ) -> None:
+        super().__init__()
+
+        self._dropout_rate: float = dropout_rate
+        self._eps: float = eps
+
+    def forward(
+        self,
+        gating_weights: torch.Tensor,
+        x: torch.Tensor,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        gating_prs, combined_logits = _softmax_dropout_combiner_fn(
+            x=gating_weights,
+            y=x,
+            dropout_pr=self._dropout_rate,
+            eps=self._eps,
+            training=self.training,
+        )
+
+        aux_losses = {}
+        if self.training:
+            aux_losses["mi_loss"] = _load_balancing_mi_loss_fn(
+                gating_prs, eps=self._eps
+            )
+
+        return combined_logits, aux_losses
+
+
+class MoLGatingFn(torch.nn.Module):
+    """
+    Implements the gating function for MoL, used to compute pi_p(q, x) for a given (p, x) pair.
+    """
+
+    def __init__(
+        self,
+        num_logits: int,
+        query_embedding_dim: int,
+        item_embedding_dim: int,
+        query_only_partial_fn: Optional[Callable[[int, int], torch.nn.Module]],
+        item_only_partial_fn: Optional[Callable[[int, int], torch.nn.Module]],
+        qi_partial_fn: Optional[Callable[[int, int], torch.nn.Module]],
+        combination_type: str,
+        normalization_fn: Callable[[int], torch.nn.Module],
+    ) -> None:
+        super().__init__()
+
+        self._query_only_partial_module: Optional[torch.nn.Module] = (
+            query_only_partial_fn(query_embedding_dim, num_logits)
+            if query_only_partial_fn
+            else None
+        )
+        self._item_only_partial_module: Optional[torch.nn.Module] = (
+            item_only_partial_fn(item_embedding_dim, num_logits)
+            if item_only_partial_fn
+            else None
+        )
+        self._qi_partial_module: Optional[torch.nn.Module] = (
+            qi_partial_fn(
+                num_logits,
+                num_logits,
+            )
+            if qi_partial_fn is not None
+            else None
+        )
+        if (
+            self._query_only_partial_module is None
+            and self._item_only_partial_module is None
+            and self._qi_partial_module is None
+        ):
+            raise ValueError(
+                "At least one of query_only_partial_fn, item_only_partial_fn, "
+                "and qi_partial_fn must not be None."
+            )
+        self._num_logits: int = num_logits
+        self._combination_type: str = combination_type
+        self._normalization_fn: torch.nn.Module = normalization_fn(num_logits)
+
+    def forward(
+        self,
+        logits: torch.Tensor,
+        query_embeddings: torch.Tensor,
+        item_embeddings: torch.Tensor,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """
+        Args:
+            logits: (B, X, P_Q * P_X) x float;
+            query_embeddings: (B, D) x float;
+            item_embeddings: (1/B, X, D') x float;
+
+        Returns:
+            (B, X) x float, Dict[str, Tensor] representing auxiliary losses.
+        """
+        B, X, _ = logits.size()
+        # [B, 1, F], [1/B, X, F], [B, X, F]
+        query_partial_inputs, item_partial_inputs, qi_partial_inputs = None, None, None
+        if self._query_only_partial_module is not None:
+            query_partial_inputs = self._query_only_partial_module(
+                query_embeddings
+            ).unsqueeze(1)
+        if self._item_only_partial_module is not None:
+            item_partial_inputs = self._item_only_partial_module(item_embeddings)
+        if self._qi_partial_module is not None:
+            qi_partial_inputs = self._qi_partial_module(logits)
+
+        if self._combination_type == "glu_silu":
+            gating_inputs = (
+                query_partial_inputs * item_partial_inputs + qi_partial_inputs
+            )
+            gating_weights = gating_inputs * F.sigmoid(gating_inputs)
+        elif self._combination_type == "glu_silu_ln":
+            gating_inputs = (
+                query_partial_inputs * item_partial_inputs + qi_partial_inputs
+            )
+            gating_weights = gating_inputs * F.sigmoid(
+                F.layer_norm(gating_inputs, normalized_shape=[self._num_logits])
+            )
+        elif self._combination_type == "none":
+            gating_inputs = query_partial_inputs
+            if gating_inputs is None:
+                gating_inputs = item_partial_inputs
+            elif item_partial_inputs is not None:
+                gating_inputs += item_partial_inputs
+            if gating_inputs is None:
+                gating_inputs = qi_partial_inputs
+            elif qi_partial_inputs is not None:
+                gating_inputs += qi_partial_inputs
+            gating_weights = gating_inputs
+        else:
+            raise ValueError(f"Unknown combination_type {self._combination_type}")
+
+        return self._normalization_fn(gating_weights, logits)
+
+
+class MoLSimilarity(SimilarityModule):
+    def __init__(
+        self,
+        query_embedding_dim: int,
+        item_embedding_dim: int,
+        dot_product_dimension: int,
+        query_dot_product_groups: int,
+        item_dot_product_groups: int,
+        temperature: float,
+        dot_product_l2_norm: bool,
+        query_embeddings_fn: MoLEmbeddingsFn,
+        item_embeddings_fn: Optional[MoLEmbeddingsFn],
+        gating_query_only_partial_fn: Optional[Callable[[int, int], torch.nn.Module]],
+        gating_item_only_partial_fn: Optional[Callable[[int, int], torch.nn.Module]],
+        gating_qi_partial_fn: Optional[Callable[[int], torch.nn.Module]],
+        gating_combination_type: str,
+        gating_normalization_fn: Callable[[int], torch.nn.Module],
+        eps: float,
+        apply_query_embeddings_fn: bool = True,
+        apply_item_embeddings_fn: bool = True,
+        autocast_bf16: bool = False,
+    ) -> None:
+        """
+        Args:
+            apply_query_embeddings_fn: bool. If true, compute query_embeddings_fn
+                to input during forward(). Otherwise, we assume the caller will
+                invoke get_query_component_embeddings() separately before
+                calling forward().
+            apply_item_embeddings_fn: bool. If true, compute item_embeddings_fn
+                to input during forward(). Otherwise, we assume the caller will
+                invoke get_item_component_embeddings() separately before
+                calling forward().
+        """
+        super().__init__()
+
+        self._gating_fn: MoLGatingFn = MoLGatingFn(
+            num_logits=query_dot_product_groups * item_dot_product_groups,
+            query_embedding_dim=query_embedding_dim,
+            item_embedding_dim=item_embedding_dim,
+            query_only_partial_fn=gating_query_only_partial_fn,
+            item_only_partial_fn=gating_item_only_partial_fn,
+            qi_partial_fn=gating_qi_partial_fn,  # pyre-ignore [6]
+            combination_type=gating_combination_type,
+            normalization_fn=gating_normalization_fn,
+        )
+        self._query_embeddings_fn: MoLEmbeddingsFn = query_embeddings_fn
+        self._item_embeddings_fn: MoLEmbeddingsFn = (  # pyre-ignore [8]
+            item_embeddings_fn
+        )
+        self._apply_query_embeddings_fn: bool = apply_query_embeddings_fn
+        self._apply_item_embeddings_fn: bool = apply_item_embeddings_fn
+        self._dot_product_l2_norm: bool = dot_product_l2_norm
+        self._query_dot_product_groups: int = query_dot_product_groups
+        self._item_dot_product_groups: int = item_dot_product_groups
+        self._dot_product_dimension: int = dot_product_dimension
+        self._temperature: float = temperature
+        self._eps: float = eps
+        self._autocast_bf16: bool = autocast_bf16
+
+    def get_query_component_embeddings(
+        self,
+        input_embeddings: torch.Tensor,
+        decoupled_inference: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """
+        Args:
+            input_embeddings: (B, self._input_embedding_dim,) x float
+                or (B, P_Q, self._dot_product_dimension) x float.
+            decoupled_inference: bool. If true, the call represents an attempt to run
+                forward() in decoupled mode at inference time (e.g., to pre-compute
+                component-level query embeddings for filtering, etc.). We simulate
+                the logic in forward() in this case (e.g., if forward() doesn't apply
+                query_embeddings_fn, then this call won't either).
+            kwargs: additional implementation-specific arguments.
+
+        Returns:
+            (B, query_dot_product_groups, dot_product_embedding_dim) x float.
+        """
+        if decoupled_inference and not self._apply_query_embeddings_fn:
+            return input_embeddings, {}
+        return self._query_embeddings_fn(input_embeddings, **kwargs)
+
+    def get_item_component_embeddings(
+        self,
+        input_embeddings: torch.Tensor,
+        decoupled_inference: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """
+        Args:
+            input_embeddings: (..., self._input_embedding_dim,) x float
+                or (..., P_X, self._dot_product_dimension) x float.
+            decoupled_inference: bool. If true, the call represents an attempt to run
+                forward() in decoupled mode at inference time (e.g., to pre-compute
+                component-level item embeddings for filtering, etc.). We simulate
+                the logic in forward() in this case (e.g., if forward() doesn't apply
+                item_embeddings_fn, then this call won't either).
+            kwargs: additional implementation-specific arguments.
+
+        Returns:
+            (..., item_dot_product_groups, dot_product_embedding_dim) x float.
+        """
+        if decoupled_inference and not self._apply_item_embeddings_fn:
+            return input_embeddings, {}
+
+        return self._item_embeddings_fn(input_embeddings, **kwargs)
+
+    def forward(
+        self,
+        query_embeddings: torch.Tensor,
+        item_embeddings: torch.Tensor,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """
+        Args:
+            query_embeddings: (B, self._input_embedding_dim) x float or
+                (B, P_Q, self._dot_product_dimension) x float (when query_embeddings_fn
+                is applied externally).
+            item_embeddings: (1/B, X, self._item_embedding_dim) x float or
+                (1/B, X, P_X, self._dot_product_dimension) x float (when item_embeddings_fn
+                is applied externally).
+            kwargs: additional implementation-specific arguments.
+
+        Returns:
+            (B, X) x float, Dict[str, Tensor] representing auxiliary losses.
+        """
+        with torch.autocast(
+            enabled=self._autocast_bf16, dtype=torch.bfloat16, device_type="cuda"
+        ):
+            B = query_embeddings.size(0)
+            B_prime = item_embeddings.shape[0]  # 1 or B
+            X = item_embeddings.shape[1]
+
+            if self._apply_query_embeddings_fn:
+                (
+                    split_query_embeddings,
+                    query_aux_losses,
+                ) = self.get_query_component_embeddings(
+                    query_embeddings,
+                    **kwargs,
+                )
+            else:
+                split_query_embeddings, query_aux_losses = query_embeddings, {}
+
+            if self._apply_item_embeddings_fn:
+                (
+                    split_item_embeddings,
+                    item_aux_losses,
+                ) = self.get_item_component_embeddings(
+                    input_embeddings=item_embeddings,
+                    **kwargs,
+                )
+            else:
+                split_item_embeddings, item_aux_losses = item_embeddings, {}
+
+            if B_prime == 1:
+                logits = torch.einsum(
+                    "bnd,xmd->bxnm",
+                    split_query_embeddings,
+                    split_item_embeddings.squeeze(0),
+                ).reshape(
+                    B, X, self._query_dot_product_groups * self._item_dot_product_groups
+                )
+            else:
+                logits = torch.einsum(
+                    "bnd,bxmd->bxnm", split_query_embeddings, split_item_embeddings
+                ).reshape(
+                    B, X, self._query_dot_product_groups * self._item_dot_product_groups
+                )
+
+            gated_outputs, gating_aux_losses = self._gating_fn(
+                logits=logits / self._temperature,  # [B, X, L]
+                query_embeddings=query_embeddings,  # [B, D]
+                item_embeddings=item_embeddings,  # [1/B, X, D']
+            )
+            return gated_outputs, {
+                **gating_aux_losses,
+                **query_aux_losses,
+                **item_aux_losses,
+            }
diff --git a/recommendation_v4/generative_recommenders/research/trainer/data_loader.py b/recommendation_v4/generative_recommenders/research/trainer/data_loader.py
new file mode 100644
index 000000000..390b04bdb
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/trainer/data_loader.py
@@ -0,0 +1,57 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import os
+from typing import Optional, Tuple
+
+import gin
+import torch
+
+
+@gin.configurable
+def create_data_loader(
+    dataset: torch.utils.data.Dataset,
+    batch_size: int,
+    world_size: int,
+    rank: int,
+    shuffle: bool,
+    prefetch_factor: int = 128,
+    num_workers: Optional[int] = os.cpu_count(),
+    drop_last: bool = False,
+) -> Tuple[
+    Optional[torch.utils.data.distributed.DistributedSampler[torch.utils.data.Dataset]],
+    torch.utils.data.DataLoader,
+]:
+    if shuffle:
+        sampler = torch.utils.data.distributed.DistributedSampler(
+            dataset,
+            num_replicas=world_size,
+            rank=rank,
+            shuffle=True,
+            seed=0,
+            drop_last=drop_last,
+        )
+    else:
+        sampler = None
+    data_loader = torch.utils.data.DataLoader(
+        dataset,
+        batch_size=batch_size,
+        # shuffle=True, cannot use with sampler
+        num_workers=num_workers or 0,
+        sampler=sampler,
+        prefetch_factor=prefetch_factor,
+    )
+    return sampler, data_loader
diff --git a/recommendation_v4/generative_recommenders/research/trainer/train.py b/recommendation_v4/generative_recommenders/research/trainer/train.py
new file mode 100644
index 000000000..6d2da5be7
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/trainer/train.py
@@ -0,0 +1,532 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import logging
+import os
+import random
+import time
+from datetime import date
+from typing import Dict, Optional
+
+import gin
+import torch
+import torch.distributed as dist
+from generative_recommenders.research.data.eval import (
+    _avg,
+    add_to_summary_writer,
+    eval_metrics_v2_from_tensors,
+    get_eval_state,
+)
+from generative_recommenders.research.data.reco_dataset import get_reco_dataset
+from generative_recommenders.research.indexing.utils import get_top_k_module
+from generative_recommenders.research.modeling.sequential.autoregressive_losses import (
+    BCELoss,
+    InBatchNegativesSampler,
+    LocalNegativesSampler,
+)
+from generative_recommenders.research.modeling.sequential.embedding_modules import (
+    EmbeddingModule,
+    LocalEmbeddingModule,
+)
+from generative_recommenders.research.modeling.sequential.encoder_utils import (
+    get_sequential_encoder,
+)
+from generative_recommenders.research.modeling.sequential.features import (
+    movielens_seq_features_from_row,
+)
+from generative_recommenders.research.modeling.sequential.input_features_preprocessors import (
+    LearnablePositionalEmbeddingInputFeaturesPreprocessor,
+)
+from generative_recommenders.research.modeling.sequential.losses.sampled_softmax import (
+    SampledSoftmaxLoss,
+)
+from generative_recommenders.research.modeling.sequential.output_postprocessors import (
+    L2NormEmbeddingPostprocessor,
+    LayerNormEmbeddingPostprocessor,
+)
+from generative_recommenders.research.modeling.similarity_utils import (
+    get_similarity_function,
+)
+from generative_recommenders.research.trainer.data_loader import create_data_loader
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.utils.tensorboard import SummaryWriter
+
+
+def setup(rank: int, world_size: int, master_port: int) -> None:
+    os.environ["MASTER_ADDR"] = "localhost"
+    os.environ["MASTER_PORT"] = str(master_port)
+
+    # initialize the process group
+    dist.init_process_group("nccl", rank=rank, world_size=world_size)
+
+
+def cleanup() -> None:
+    dist.destroy_process_group()
+
+
+@gin.configurable
+def get_weighted_loss(
+    main_loss: torch.Tensor,
+    aux_losses: Dict[str, torch.Tensor],
+    weights: Dict[str, float],
+) -> torch.Tensor:
+    weighted_loss = main_loss
+    for key, weight in weights.items():
+        cur_weighted_loss = aux_losses[key] * weight
+        weighted_loss = weighted_loss + cur_weighted_loss
+    return weighted_loss
+
+
+@gin.configurable
+def train_fn(
+    rank: int,
+    world_size: int,
+    master_port: int,
+    dataset_name: str = "ml-20m",
+    max_sequence_length: int = 200,
+    positional_sampling_ratio: float = 1.0,
+    local_batch_size: int = 128,
+    eval_batch_size: int = 128,
+    eval_user_max_batch_size: Optional[int] = None,
+    main_module: str = "SASRec",
+    main_module_bf16: bool = False,
+    dropout_rate: float = 0.2,
+    user_embedding_norm: str = "l2_norm",
+    sampling_strategy: str = "in-batch",
+    loss_module: str = "SampledSoftmaxLoss",
+    loss_weights: Optional[Dict[str, float]] = {},
+    num_negatives: int = 1,
+    loss_activation_checkpoint: bool = False,
+    item_l2_norm: bool = False,
+    temperature: float = 0.05,
+    num_epochs: int = 101,
+    learning_rate: float = 1e-3,
+    num_warmup_steps: int = 0,
+    weight_decay: float = 1e-3,
+    top_k_method: str = "MIPSBruteForceTopK",
+    eval_interval: int = 100,
+    full_eval_every_n: int = 1,
+    save_ckpt_every_n: int = 1000,
+    partial_eval_num_iters: int = 32,
+    embedding_module_type: str = "local",
+    item_embedding_dim: int = 240,
+    interaction_module_type: str = "",
+    gr_output_length: int = 10,
+    l2_norm_eps: float = 1e-6,
+    enable_tf32: bool = False,
+    random_seed: int = 42,
+) -> None:
+    # to enable more deterministic results.
+    random.seed(random_seed)
+    torch.backends.cuda.matmul.allow_tf32 = enable_tf32
+    torch.backends.cudnn.allow_tf32 = enable_tf32
+    logging.info(f"cuda.matmul.allow_tf32: {enable_tf32}")
+    logging.info(f"cudnn.allow_tf32: {enable_tf32}")
+    logging.info(f"Training model on rank {rank}.")
+    setup(rank, world_size, master_port)
+
+    dataset = get_reco_dataset(
+        dataset_name=dataset_name,
+        max_sequence_length=max_sequence_length,
+        chronological=True,
+        positional_sampling_ratio=positional_sampling_ratio,
+    )
+
+    train_data_sampler, train_data_loader = create_data_loader(
+        dataset.train_dataset,
+        batch_size=local_batch_size,
+        world_size=world_size,
+        rank=rank,
+        shuffle=True,
+        drop_last=world_size > 1,
+    )
+    eval_data_sampler, eval_data_loader = create_data_loader(
+        dataset.eval_dataset,
+        batch_size=eval_batch_size,
+        world_size=world_size,
+        rank=rank,
+        shuffle=True,  # needed for partial eval
+        drop_last=world_size > 1,
+    )
+
+    model_debug_str = main_module
+    if embedding_module_type == "local":
+        embedding_module: EmbeddingModule = LocalEmbeddingModule(
+            num_items=dataset.max_item_id,
+            item_embedding_dim=item_embedding_dim,
+        )
+    else:
+        raise ValueError(f"Unknown embedding_module_type {embedding_module_type}")
+    model_debug_str += f"-{embedding_module.debug_str()}"
+
+    interaction_module, interaction_module_debug_str = get_similarity_function(
+        module_type=interaction_module_type,
+        query_embedding_dim=item_embedding_dim,
+        item_embedding_dim=item_embedding_dim,
+    )
+
+    assert user_embedding_norm == "l2_norm" or user_embedding_norm == "layer_norm", (
+        f"Not implemented for {user_embedding_norm}"
+    )
+    output_postproc_module = (
+        L2NormEmbeddingPostprocessor(
+            embedding_dim=item_embedding_dim,
+            eps=1e-6,
+        )
+        if user_embedding_norm == "l2_norm"
+        else LayerNormEmbeddingPostprocessor(
+            embedding_dim=item_embedding_dim,
+            eps=1e-6,
+        )
+    )
+    input_preproc_module = LearnablePositionalEmbeddingInputFeaturesPreprocessor(
+        max_sequence_len=dataset.max_sequence_length + gr_output_length + 1,
+        embedding_dim=item_embedding_dim,
+        dropout_rate=dropout_rate,
+    )
+
+    model = get_sequential_encoder(
+        module_type=main_module,
+        max_sequence_length=dataset.max_sequence_length,
+        max_output_length=gr_output_length + 1,
+        embedding_module=embedding_module,
+        interaction_module=interaction_module,
+        input_preproc_module=input_preproc_module,
+        output_postproc_module=output_postproc_module,
+        verbose=True,
+    )
+    model_debug_str = model.debug_str()
+
+    # loss
+    loss_debug_str = loss_module
+    if loss_module == "BCELoss":
+        loss_debug_str = loss_debug_str[:-4]
+        assert temperature == 1.0
+        ar_loss = BCELoss(temperature=temperature, model=model)
+    elif loss_module == "SampledSoftmaxLoss":
+        loss_debug_str = "ssl"
+        if temperature != 1.0:
+            loss_debug_str += f"-t{temperature}"
+        ar_loss = SampledSoftmaxLoss(
+            num_to_sample=num_negatives,
+            softmax_temperature=temperature,
+            model=model,
+            activation_checkpoint=loss_activation_checkpoint,
+        )
+        loss_debug_str += (
+            f"-n{num_negatives}{'-ac' if loss_activation_checkpoint else ''}"
+        )
+    else:
+        raise ValueError(f"Unrecognized loss module {loss_module}.")
+
+    # sampling
+    if sampling_strategy == "in-batch":
+        negatives_sampler = InBatchNegativesSampler(
+            l2_norm=item_l2_norm,
+            l2_norm_eps=l2_norm_eps,
+            dedup_embeddings=True,
+        )
+        sampling_debug_str = (
+            f"in-batch{f'-l2-eps{l2_norm_eps}' if item_l2_norm else ''}-dedup"
+        )
+    elif sampling_strategy == "local":
+        negatives_sampler = LocalNegativesSampler(
+            num_items=dataset.max_item_id,
+            item_emb=model._embedding_module._item_emb,
+            all_item_ids=dataset.all_item_ids,
+            l2_norm=item_l2_norm,
+            l2_norm_eps=l2_norm_eps,
+        )
+    else:
+        raise ValueError(f"Unrecognized sampling strategy {sampling_strategy}.")
+    sampling_debug_str = negatives_sampler.debug_str()
+
+    # Creates model and moves it to GPU with id rank
+    device = rank
+    if main_module_bf16:
+        model = model.to(torch.bfloat16)
+    model = model.to(device)
+    ar_loss = ar_loss.to(device)
+    negatives_sampler = negatives_sampler.to(device)
+    model = DDP(model, device_ids=[rank], broadcast_buffers=False)
+
+    # TODO: wrap in create_optimizer.
+    opt = torch.optim.AdamW(
+        model.parameters(),
+        lr=learning_rate,
+        betas=(0.9, 0.98),
+        weight_decay=weight_decay,
+    )
+
+    date_str = date.today().strftime("%Y-%m-%d")
+    model_subfolder = f"{dataset_name}-l{max_sequence_length}"
+    model_desc = (
+        f"{model_subfolder}"
+        + f"/{model_debug_str}_{interaction_module_debug_str}_{sampling_debug_str}_{loss_debug_str}"
+        + f"{f'-ddp{world_size}' if world_size > 1 else ''}-b{local_batch_size}-lr{learning_rate}-wu{num_warmup_steps}-wd{weight_decay}{'' if enable_tf32 else '-notf32'}-{date_str}"
+    )
+    if full_eval_every_n > 1:
+        model_desc += f"-fe{full_eval_every_n}"
+    if positional_sampling_ratio is not None and positional_sampling_ratio < 1:
+        model_desc += f"-d{positional_sampling_ratio}"
+    # creates subfolders.
+    os.makedirs(f"./exps/{model_subfolder}", exist_ok=True)
+    os.makedirs(f"./ckpts/{model_subfolder}", exist_ok=True)
+    log_dir = f"./exps/{model_desc}"
+    if rank == 0:
+        writer = SummaryWriter(log_dir=log_dir)
+        logging.info(f"Rank {rank}: writing logs to {log_dir}")
+    else:
+        writer = None
+        logging.info(f"Rank {rank}: disabling summary writer")
+
+    last_training_time = time.time()
+    torch.autograd.set_detect_anomaly(True)
+
+    batch_id = 0
+    epoch = 0
+    for epoch in range(num_epochs):
+        if train_data_sampler is not None:
+            train_data_sampler.set_epoch(epoch)
+        if eval_data_sampler is not None:
+            eval_data_sampler.set_epoch(epoch)
+        model.train()
+        for row in iter(train_data_loader):
+            seq_features, target_ids, target_ratings = movielens_seq_features_from_row(
+                row,
+                device=device,
+                max_output_length=gr_output_length + 1,
+            )
+
+            if (batch_id % eval_interval) == 0:
+                model.eval()
+
+                eval_state = get_eval_state(
+                    model=model.module,
+                    all_item_ids=dataset.all_item_ids,
+                    negatives_sampler=negatives_sampler,
+                    top_k_module_fn=lambda item_embeddings, item_ids: get_top_k_module(
+                        top_k_method=top_k_method,
+                        model=model.module,
+                        item_embeddings=item_embeddings,
+                        item_ids=item_ids,
+                    ),
+                    device=device,
+                    float_dtype=torch.bfloat16 if main_module_bf16 else None,
+                )
+                eval_dict = eval_metrics_v2_from_tensors(
+                    eval_state,
+                    model.module,
+                    seq_features,
+                    target_ids=target_ids,
+                    target_ratings=target_ratings,
+                    user_max_batch_size=eval_user_max_batch_size,
+                    dtype=torch.bfloat16 if main_module_bf16 else None,
+                )
+                add_to_summary_writer(
+                    writer, batch_id, eval_dict, prefix="eval", world_size=world_size
+                )
+                logging.info(
+                    f"rank {rank}:  batch-stat (eval): iter {batch_id} (epoch {epoch}): "
+                    + f"NDCG@10 {_avg(eval_dict['ndcg@10'], world_size):.4f}, "
+                    f"HR@10 {_avg(eval_dict['hr@10'], world_size):.4f}, "
+                    f"HR@50 {_avg(eval_dict['hr@50'], world_size):.4f}, "
+                    + f"MRR {_avg(eval_dict['mrr'], world_size):.4f} "
+                )
+                model.train()
+
+            # TODO: consider separating this out?
+            B, N = seq_features.past_ids.shape
+            seq_features.past_ids.scatter_(
+                dim=1,
+                index=seq_features.past_lengths.view(-1, 1),
+                src=target_ids.view(-1, 1),
+            )
+
+            opt.zero_grad()
+            input_embeddings = model.module.get_item_embeddings(seq_features.past_ids)
+            seq_embeddings = model(
+                past_lengths=seq_features.past_lengths,
+                past_ids=seq_features.past_ids,
+                past_embeddings=input_embeddings,
+                past_payloads=seq_features.past_payloads,
+            )  # [B, X]
+
+            supervision_ids = seq_features.past_ids
+
+            if sampling_strategy == "in-batch":
+                # get_item_embeddings currently assume 1-d tensor.
+                in_batch_ids = supervision_ids.view(-1)
+                negatives_sampler.process_batch(
+                    ids=in_batch_ids,
+                    presences=(in_batch_ids != 0),
+                    embeddings=model.module.get_item_embeddings(in_batch_ids),
+                )
+            else:
+                # pyre-fixme[16]: `InBatchNegativesSampler` has no attribute
+                #  `_item_emb`.
+                negatives_sampler._item_emb = model.module._embedding_module._item_emb
+
+            ar_mask = supervision_ids[:, 1:] != 0
+            loss, aux_losses = ar_loss(
+                lengths=seq_features.past_lengths,  # [B],
+                output_embeddings=seq_embeddings[:, :-1, :],  # [B, N-1, D]
+                supervision_ids=supervision_ids[:, 1:],  # [B, N-1]
+                supervision_embeddings=input_embeddings[:, 1:, :],  # [B, N - 1, D]
+                supervision_weights=ar_mask.float(),
+                negatives_sampler=negatives_sampler,
+                **seq_features.past_payloads,
+            )  # [B, N]
+
+            main_loss = loss.detach().clone()
+            loss = get_weighted_loss(loss, aux_losses, weights=loss_weights or {})
+
+            if rank == 0:
+                assert writer is not None
+                writer.add_scalar("losses/ar_loss", loss, batch_id)
+                writer.add_scalar("losses/main_loss", main_loss, batch_id)
+
+            loss.backward()
+
+            # Optional linear warmup.
+            if batch_id < num_warmup_steps:
+                lr_scalar = min(1.0, float(batch_id + 1) / num_warmup_steps)
+                for pg in opt.param_groups:
+                    pg["lr"] = lr_scalar * learning_rate
+                lr = lr_scalar * learning_rate
+            else:
+                lr = learning_rate
+
+            if (batch_id % eval_interval) == 0:
+                logging.info(
+                    f" rank: {rank}, batch-stat (train): step {batch_id} "
+                    f"(epoch {epoch} in {time.time() - last_training_time:.2f}s): {loss:.6f}"
+                )
+                last_training_time = time.time()
+                if rank == 0:
+                    assert writer is not None
+                    writer.add_scalar("loss/train", loss, batch_id)
+                    writer.add_scalar("lr", lr, batch_id)
+
+            opt.step()
+
+            batch_id += 1
+
+        def is_full_eval(epoch: int) -> bool:
+            return (epoch % full_eval_every_n) == 0
+
+        # eval per epoch
+        eval_dict_all = None
+        eval_start_time = time.time()
+        model.eval()
+        eval_state = get_eval_state(
+            model=model.module,
+            all_item_ids=dataset.all_item_ids,
+            negatives_sampler=negatives_sampler,
+            top_k_module_fn=lambda item_embeddings, item_ids: get_top_k_module(
+                top_k_method=top_k_method,
+                model=model.module,
+                item_embeddings=item_embeddings,
+                item_ids=item_ids,
+            ),
+            device=device,
+            float_dtype=torch.bfloat16 if main_module_bf16 else None,
+        )
+        for eval_iter, row in enumerate(iter(eval_data_loader)):
+            seq_features, target_ids, target_ratings = movielens_seq_features_from_row(
+                row, device=device, max_output_length=gr_output_length + 1
+            )
+            eval_dict = eval_metrics_v2_from_tensors(
+                eval_state,
+                model.module,
+                seq_features,
+                target_ids=target_ids,
+                target_ratings=target_ratings,
+                user_max_batch_size=eval_user_max_batch_size,
+                dtype=torch.bfloat16 if main_module_bf16 else None,
+            )
+
+            if eval_dict_all is None:
+                eval_dict_all = {}
+                for k, v in eval_dict.items():
+                    eval_dict_all[k] = []
+
+            for k, v in eval_dict.items():
+                eval_dict_all[k] = eval_dict_all[k] + [v]
+            del eval_dict
+
+            if (eval_iter + 1 >= partial_eval_num_iters) and (not is_full_eval(epoch)):
+                logging.info(
+                    f"Truncating epoch {epoch} eval to {eval_iter + 1} iters to save cost.."
+                )
+                break
+
+        assert eval_dict_all is not None
+        for k, v in eval_dict_all.items():
+            eval_dict_all[k] = torch.cat(v, dim=-1)
+
+        ndcg_10 = _avg(eval_dict_all["ndcg@10"], world_size=world_size)
+        ndcg_50 = _avg(eval_dict_all["ndcg@50"], world_size=world_size)
+        hr_10 = _avg(eval_dict_all["hr@10"], world_size=world_size)
+        hr_50 = _avg(eval_dict_all["hr@50"], world_size=world_size)
+        mrr = _avg(eval_dict_all["mrr"], world_size=world_size)
+
+        add_to_summary_writer(
+            writer,
+            batch_id=epoch,
+            metrics=eval_dict_all,
+            prefix="eval_epoch",
+            world_size=world_size,
+        )
+        if full_eval_every_n > 1 and is_full_eval(epoch):
+            add_to_summary_writer(
+                writer,
+                batch_id=epoch,
+                metrics=eval_dict_all,
+                prefix="eval_epoch_full",
+                world_size=world_size,
+            )
+        if rank == 0 and epoch > 0 and (epoch % save_ckpt_every_n) == 0:
+            torch.save(
+                {
+                    "epoch": epoch,
+                    "model_state_dict": model.state_dict(),
+                    "optimizer_state_dict": opt.state_dict(),
+                },
+                f"./ckpts/{model_desc}_ep{epoch}",
+            )
+
+        logging.info(
+            f"rank {rank}: eval @ epoch {epoch} in {time.time() - eval_start_time:.2f}s: "
+            f"NDCG@10 {ndcg_10:.4f}, NDCG@50 {ndcg_50:.4f}, HR@10 {hr_10:.4f}, HR@50 {hr_50:.4f}, MRR {mrr:.4f}"
+        )
+        last_training_time = time.time()
+
+    if rank == 0:
+        if writer is not None:
+            writer.flush()
+            writer.close()
+
+        torch.save(
+            {
+                "epoch": epoch,
+                "model_state_dict": model.state_dict(),
+                "optimizer_state_dict": opt.state_dict(),
+            },
+            f"./ckpts/{model_desc}_ep{epoch}",
+        )
+
+    cleanup()
diff --git a/recommendation_v4/generative_recommenders/tests/test_common.py b/recommendation_v4/generative_recommenders/tests/test_common.py
new file mode 100644
index 000000000..be3823d67
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/tests/test_common.py
@@ -0,0 +1,41 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import unittest
+
+import torch
+from generative_recommenders.common import switch_to_contiguous_if_needed
+
+
+class SwitchToContiguousIfNeededTest(unittest.TestCase):
+    def test_torchscript_does_not_compile_fx_tracing_helper(self) -> None:
+        class ContiguousModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return switch_to_contiguous_if_needed(x)
+
+        scripted = torch.jit.script(ContiguousModule())
+        x = torch.arange(12).reshape(3, 4).transpose(0, 1)
+
+        out = scripted(x)
+
+        self.assertTrue(torch.equal(out, x))
+        self.assertTrue(out.is_contiguous())
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/recommendation_v4/main.py b/recommendation_v4/main.py
new file mode 100644
index 000000000..445f25820
--- /dev/null
+++ b/recommendation_v4/main.py
@@ -0,0 +1,82 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+"""
+Main entry point for model training. Please refer to README.md for usage instructions.
+"""
+
+import logging
+import os
+from typing import List, Optional
+
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"  # Hide excessive tensorflow debug messages
+import sys
+
+import fbgemm_gpu  # noqa: F401, E402
+import gin
+import torch
+import torch.multiprocessing as mp
+from absl import app, flags
+from generative_recommenders.research.trainer.train import train_fn
+
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+
+def delete_flags(FLAGS, keys_to_delete: List[str]) -> None:  # pyre-ignore [2]
+    keys = [key for key in FLAGS._flags()]
+    for key in keys:
+        if key in keys_to_delete:
+            delattr(FLAGS, key)
+
+
+delete_flags(flags.FLAGS, ["gin_config_file", "master_port"])
+flags.DEFINE_string("gin_config_file", None, "Path to the config file.")
+flags.DEFINE_integer("master_port", 12355, "Master port.")
+FLAGS = flags.FLAGS  # pyre-ignore [5]
+
+
+def mp_train_fn(
+    rank: int,
+    world_size: int,
+    master_port: int,
+    gin_config_file: Optional[str],
+) -> None:
+    if gin_config_file is not None:
+        # Hack as absl doesn't support flag parsing inside multiprocessing.
+        logging.info(f"Rank {rank}: loading gin config from {gin_config_file}")
+        gin.parse_config_file(gin_config_file)
+
+    train_fn(rank, world_size, master_port)
+
+
+def _main(argv) -> None:  # pyre-ignore [2]
+    world_size = torch.cuda.device_count()
+
+    mp.set_start_method("forkserver")
+    mp.spawn(
+        mp_train_fn,
+        args=(world_size, FLAGS.master_port, FLAGS.gin_config_file),
+        nprocs=world_size,
+        join=True,
+    )
+
+
+def main() -> None:
+    app.run(_main)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/md5sums_yambda_5b_processed.txt b/recommendation_v4/md5sums_yambda_5b_processed.txt
new file mode 100644
index 000000000..82998cca3
--- /dev/null
+++ b/recommendation_v4/md5sums_yambda_5b_processed.txt
@@ -0,0 +1,22 @@
+# MD5 checksums for the preprocessed yambda-5b dataset (processed_5b/).
+#
+# Format: standard `md5sum` output -> "<md5hash>  <relative_path>".
+# Paths are relative to ${DLRM_DATA_PATH}/processed_5b/.
+#
+# These hashes are PLACEHOLDERS (TODO). They must be generated from a canonical
+# preprocessing run before this benchmark is submitted, e.g.:
+#
+#   cd "${DLRM_DATA_PATH}/processed_5b"
+#   md5sum train_sessions.parquet test_events.parquet session_index.parquet \
+#          item_popularity.npy split_meta.json \
+#     > <repo>/md5sums_yambda_5b_processed.txt
+#
+# Until then `verify_dataset.sh` falls back to an existence/layout check and
+# warns that checksums are not yet pinned.
+#
+# TODO(rcp/data): replace the lines below with real md5 hashes.
+TODO_GENERATE_HASH  train_sessions.parquet
+TODO_GENERATE_HASH  test_events.parquet
+TODO_GENERATE_HASH  session_index.parquet
+TODO_GENERATE_HASH  item_popularity.npy
+TODO_GENERATE_HASH  split_meta.json
diff --git a/recommendation_v4/preprocess_public_data.py b/recommendation_v4/preprocess_public_data.py
new file mode 100644
index 000000000..927ccf4c6
--- /dev/null
+++ b/recommendation_v4/preprocess_public_data.py
@@ -0,0 +1,32 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+"""
+Usage: mkdir -p tmp/ && python3 preprocess_public_data.py
+"""
+
+from generative_recommenders.research.data.preprocessor import get_common_preprocessors
+
+
+def main() -> None:
+    get_common_preprocessors()["ml-1m"].preprocess_rating()
+    get_common_preprocessors()["ml-20m"].preprocess_rating()
+    # get_common_preprocessors()["ml-1b"].preprocess_rating()
+    get_common_preprocessors()["amzn-books"].preprocess_rating()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/rcp/README.md b/recommendation_v4/rcp/README.md
new file mode 100644
index 000000000..02a977862
--- /dev/null
+++ b/recommendation_v4/rcp/README.md
@@ -0,0 +1,28 @@
+# Reference Convergence Points (RCP)
+
+**Status: placeholder — RCPs not yet generated. Intentionally left blank.**
+
+This directory will hold the Reference Convergence Points for the
+recommendation_v4 (HSTU / yambda-5b) benchmark once convergence runs are
+complete.
+
+Per the MLPerf Training
+[CONTRIBUTING guidance](https://github.com/mlcommons/training_policies/blob/master/CONTRIBUTING.md)
+("Some things to note while generating reference convergence points"):
+
+- Use FP32 or BF16 precision and record the exact precision used in the RCP JSON.
+- Generate RCPs for at least **3 reasonable batch sizes**.
+- Run RCPs with an eval frequency **higher** than the chosen benchmark eval
+  frequency (more data points for picking the target accuracy).
+- Run at least **2N seeds**, where N = number of submission runs.
+
+The convergence target for this benchmark is **eval AUC >= 0.80275** (see
+[../README.MD](../README.MD) §9). The RCP JSON files and convergence-curve plots
+(samples-to-converge vs. batch size / seed) will be committed here.
+
+## TODO
+
+- [ ] Run >= 2N-seed convergence sweeps at >= 3 batch sizes.
+- [ ] Record precision (FP32/BF16) per the rules.
+- [ ] Add `rcp_<batchsize>.json` files in the mlperf_logging RCP format.
+- [ ] Add convergence-curve plots and the chosen target-accuracy justification.
diff --git a/recommendation_v4/requirements.txt b/recommendation_v4/requirements.txt
new file mode 100644
index 000000000..852aa149b
--- /dev/null
+++ b/recommendation_v4/requirements.txt
@@ -0,0 +1,43 @@
+# Frozen dependency versions for the recommendation_v4 (HSTU / yambda-5b) MLPerf
+# reference. The CANONICAL, fully-reproducible environment is the Dockerfile
+# (built on rocm/primus:v26.3); see docs/training_recipe.md for the per-platform
+# (MI350X / B200) install commands and rationale. The pins below mirror that
+# stack. torch / torchvision / torchaudio / fbgemm_gpu / torchrec are
+# accelerator-specific and MUST be installed with --no-deps from the matching
+# index (see Dockerfile) so pip does not clobber the +rocm wheels.
+
+# --- accelerator stack (install via Dockerfile; --no-deps, matching index) ---
+# torch==2.12.0+rocm7.2          # --index-url https://download.pytorch.org/whl/rocm7.2
+# torchvision==0.27.0+rocm7.2
+# torchaudio==2.11.0+rocm7.2
+# fbgemm_gpu                     # built from FBGEMM commit 10b775730212923f65f7b78f79b6a01d80cf3c29 for gfx950
+torch==2.12.0
+fbgemm_gpu==1.7.0
+torchrec @ git+https://github.com/pytorch/torchrec.git@v2026.06.01.00
+
+# --- data / config / logging -------------------------------------------------
+polars-u64-idx==1.33.1
+gin-config==0.5.0
+absl-py==2.1.0
+pandas==2.2.3
+pyarrow==17.0.0
+numpy==1.26.4
+xxhash==3.5.0
+datasets==3.2.0
+huggingface_hub==0.27.0
+
+# --- metrics / training utils ------------------------------------------------
+torchmetrics==1.0.3
+tensordict==0.6.2
+tensorboard==2.19.0
+pyre-extensions==0.0.32
+iopath==0.1.10
+typing-inspect==0.9.0
+psutil==6.1.0
+tqdm==4.67.1
+pyyaml==6.0.2
+pybind11==2.13.6
+lightning-utilities==0.11.9
+
+# --- MLPerf compliance logging (pinned to the Training 6.0 tag) --------------
+mlperf-logging @ git+https://github.com/mlcommons/logging.git@6.0.0-rc6
diff --git a/recommendation_v4/run_and_time.sh b/recommendation_v4/run_and_time.sh
new file mode 100755
index 000000000..a0207e71c
--- /dev/null
+++ b/recommendation_v4/run_and_time.sh
@@ -0,0 +1,65 @@
+#!/usr/bin/env bash
+# MLPerf Training reference script: run the benchmark and report wall-clock time.
+#
+# Runs the full-reference HSTU / yambda-5b streaming train+eval sweep to the
+# MLPerf quality target (eval AUC >= 0.80275) and prints the elapsed time of the
+# timed region. This is the canonical single-host (8-GPU) entry point; for
+# multi-node SLURM launches use scripts/launch_slurm.sh (which calls into the
+# same trainer).
+#
+# Usage:
+#   DLRM_DATA_PATH=/path/to/dlrm_data ./run_and_time.sh
+#
+# Env (run shape / cadence -- defaults are the FULL reference sweep):
+#   DLRM_DATA_PATH        data root (required).
+#   SEED                  RNG seed (default 1).
+#   START_TS / NUM_TRAIN_TS   window range (default 0 / 299 = full sweep).
+#   EVAL_EVERY_DATA_PCT   eval cadence as a fraction of train data (default 0.005).
+#   AUC_THRESHOLD         convergence target (default 0.80275).
+#   GPUS_PER_NODE         GPUs on this host (default 8).
+#   RUN_NAME              results dir name under results/ (default reference_run).
+set -euo pipefail
+
+: "${DLRM_DATA_PATH:?Set DLRM_DATA_PATH to the data root}"
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "${REPO_ROOT}"
+
+# ---- Reference run shape (full sweep to the quality target) -----------------
+export SEED="${SEED:-1}"
+export START_TS="${START_TS:-0}"
+export NUM_TRAIN_TS="${NUM_TRAIN_TS:-299}"
+export NUM_TRAIN_BATCHES="${NUM_TRAIN_BATCHES:-0}"
+export NUM_EVAL_BATCHES="${NUM_EVAL_BATCHES:-0}"
+export EVAL_EVERY_N_WINDOWS="${EVAL_EVERY_N_WINDOWS:-0}"
+export EVAL_EVERY_DATA_PCT="${EVAL_EVERY_DATA_PCT:-0.005}"
+export AUC_THRESHOLD="${AUC_THRESHOLD:-0.80275}"
+export RUN_NAME="${RUN_NAME:-reference_run}"
+
+# ---- Single-host distributed topology (override for multi-node) -------------
+export GPUS_PER_NODE="${GPUS_PER_NODE:-8}"
+export NNODES="${NNODES:-1}"
+export NODE_RANK="${NODE_RANK:-0}"
+export WORLD_SIZE="${WORLD_SIZE:-$((NNODES * GPUS_PER_NODE))}"
+export MASTER_ADDR="${MASTER_ADDR:-127.0.0.1}"
+export MASTER_PORT="${MASTER_PORT:-29500}"
+
+# ---- MLPerf compliance logging ----------------------------------------------
+export MLPERF_LOGGING="${MLPERF_LOGGING:-1}"
+export MLPERF_LOG_PATH="${MLPERF_LOG_PATH:-${REPO_ROOT}/results/${RUN_NAME}/mlperf/yambda_5b_mlperf.log}"
+export MLPERF_SUBMISSION_PLATFORM="${MLPERF_SUBMISSION_PLATFORM:-MI355X}"
+mkdir -p "$(dirname "${MLPERF_LOG_PATH}")"
+
+# ---- Timed region -----------------------------------------------------------
+# Pull the start timestamp into a clear region per the MLPerf run_and_time.sh idiom.
+start=$(date +%s)
+echo "STARTING TIMING RUN AT $(date -u '+%Y-%m-%d %r')"
+
+python -m generative_recommenders.dlrm_v3.train.train_ranker \
+    --dataset yambda-5b \
+    --mode streaming-train-eval
+
+end=$(date +%s)
+result=$(( end - start ))
+echo "ENDING TIMING RUN AT $(date -u '+%Y-%m-%d %r')"
+echo "RESULT,recommendation_v4_hstu_yambda_5b,${SEED},${result},$(whoami),$(date -u '+%Y-%m-%d %r')"
diff --git a/recommendation_v4/run_fractal_expansion.py b/recommendation_v4/run_fractal_expansion.py
new file mode 100644
index 000000000..308eadea2
--- /dev/null
+++ b/recommendation_v4/run_fractal_expansion.py
@@ -0,0 +1,588 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+"""
+Run fractal expansion introduced in https://arxiv.org/abs/1901.08910.
+Implementation adapted from the scripts used to generate MovieLens-1B
+(https://grouplens.org/datasets/movielens/movielens-1b/).
+"""
+
+# Generate a 3B dataset (takes around 50 minutes):
+# python run_fractal_expansion.py --input-csv-file ~/data/ml-20m/ratings.csv --write-dataset True --output-prefix ~/data/ml-3b/
+# Generate a 13B dataset with 440M item size:
+# python run_fractal_expansion.py --input-csv-file ~/data/ml-20m/ratings.csv --write-dataset True --output-prefix ~/data/ml-13b/ --num-row-multiplier 16 --num-col-multiplier 16384 --element-sample-rate 0.2 --block-sample-rate 0.05
+# Generate a 18B dataset with 1B item size:
+# python run_fractal_expansion.py --input-csv-file ~/data/ml-20m/ratings.csv --write-dataset True --output-prefix ~/data/ml-18b/ --num-row-multiplier 20 --num-col-multiplier 36864 --element-sample-rate 0.08 --block-sample-rate 0.05
+
+import csv
+import linecache
+import logging
+import os
+import pickle
+from dataclasses import dataclass
+
+import click
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import scipy.linalg
+import skimage.transform as transform
+from scipy import sparse
+from scipy.sparse import linalg
+from sklearn.utils import shuffle
+from tqdm import tqdm
+
+
+logging.basicConfig()
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+@dataclass
+class SparseMatrixMetadata:
+    num_interactions: int = 0
+    num_rows: int = 0
+    num_cols: int = 0
+
+
+def _dropout_sparse_coo_matrix(
+    sparse_matrix, rate, min_dropout_rate=0.005, max_dropout_rate=0.999
+):
+    assert min_dropout_rate <= max_dropout_rate
+    sampling_rate = 1.0 - rate
+
+    sampled_fraction = min(
+        max(sampling_rate, 1.0 - max_dropout_rate), 1.0 - min_dropout_rate
+    )
+    if sampled_fraction != sampling_rate:
+        logger.warning(
+            f"Desired sampling rate {sampling_rate} clipped to {sampled_fraction}."
+        )
+    num_sampled = min(
+        max(int(sparse_matrix.nnz * sampled_fraction), 1), sparse_matrix.nnz
+    )
+    sampled_indices = np.random.choice(
+        sparse_matrix.nnz, size=num_sampled, replace=False
+    )
+    return sparse.coo_matrix(
+        (
+            sparse_matrix.data[sampled_indices],
+            (sparse_matrix.row[sampled_indices], sparse_matrix.col[sampled_indices]),
+        ),
+        shape=sparse_matrix.shape,
+    )
+
+
+def shuffle_sparse_matrix(
+    sparse_matrix, dropout_rate=0.0, min_dropout_rate=0.005, max_dropout_rate=0.999
+):
+    """
+    Shuffle sparse matrix encoded as a SciPy csr matrix.
+    """
+
+    assert dropout_rate >= 0.0 and dropout_rate <= 1.0
+    (num_rows, num_cols) = sparse_matrix.shape
+    shuffled_rows = shuffle(np.arange(num_rows))
+    shuffled_cols = shuffle(np.arange(num_cols))
+    sparse_matrix = _dropout_sparse_coo_matrix(
+        sparse_matrix, dropout_rate, min_dropout_rate, max_dropout_rate
+    )
+    new_row = np.take(shuffled_rows, sparse_matrix.row)
+    new_col = np.take(shuffled_cols, sparse_matrix.col)
+    return sparse.csr_matrix(
+        (sparse_matrix.data, (new_row, new_col)), shape=(num_rows, num_cols)
+    )
+
+
+def graph_reduce(usv, num_rows, num_cols):
+    """Apply algorithm 2 in https://arxiv.org/pdf/1901.08910.pdf."""
+
+    def _closest_column_orthogonal_matrix(matrix):
+        return np.matmul(
+            matrix, np.linalg.inv(scipy.linalg.sqrtm(np.matmul(matrix.T, matrix)))
+        )
+
+    u, s, v = usv
+    k = min(num_rows, num_cols)
+    u_random_proj = transform.resize(u[:, :k], (num_rows, k))
+    v_random_proj = transform.resize(v[:k, :], (k, num_cols))
+    u_random_proj_orth = _closest_column_orthogonal_matrix(u_random_proj)
+    v_random_proj_orth = _closest_column_orthogonal_matrix(v_random_proj.T).T
+    return np.matmul(u_random_proj_orth, np.matmul(np.diag(s[:k]), v_random_proj_orth))
+
+
+def rescale(matrix, rescale_w_abs=False, element_sample_rate=1.0):
+    """Rescale all values of the matrix into [0, 1]."""
+    if rescale_w_abs:
+        abs_matrix = np.abs(matrix.copy())
+        out = abs_matrix / abs_matrix.max()
+    else:
+        out = (matrix - matrix.min()) / (matrix.max() - matrix.min())
+        assert out.min() >= 0 and out.max() <= 1
+    return out * element_sample_rate
+
+
+def _compute_row_block(
+    i, left_matrix, right_matrix, block_sample_rate, indices_out_path, remove_empty_rows
+):
+    """Compute row block of expansion for row i of the left_matrix."""
+
+    kron_blocks = []
+    num_rows = 0
+    num_removed_rows = 0
+    num_interactions = 0
+
+    for j in range(left_matrix.shape[1]):
+        if np.random.random() <= block_sample_rate:
+            dropout_rate = 1.0 - left_matrix[i, j]
+            kron_block = shuffle_sparse_matrix(right_matrix, dropout_rate).tocsr()
+            num_interactions += kron_block.nnz
+            kron_blocks.append(kron_block)
+            logger.info(f"Kronecker block ({i}, {j}) processed.")
+        else:
+            kron_blocks.append(sparse.csr_matrix(right_matrix.shape))
+            logger.info(f"Kronecker block ({i}, {j}) skipped.")
+
+    rows_to_write = sparse.hstack(kron_blocks).tocsr()
+    logger.info("Writing dataset row by row.")
+
+    # Write Kronecker product line per line.
+    filepath = f"{indices_out_path}_{i}.csv"
+    os.makedirs(os.path.dirname(filepath), exist_ok=True)
+    with open(filepath, "w", newline="") as file:
+        writer = csv.writer(file)
+        for k in range(right_matrix.shape[0]):
+            items_to_write = rows_to_write.getrow(k).indices
+            ratings_to_write = rows_to_write.getrow(k).data
+            num = items_to_write.shape[0]
+            if remove_empty_rows and (not num):
+                logger.info(f"Removed empty output row {i * left_matrix.shape[0] + k}.")
+                num_removed_rows += 1
+                continue
+            num_rows += 1
+            writer.writerow(
+                [
+                    i * right_matrix.shape[0] + k,
+                    ",".join([str(x) for x in items_to_write]),
+                    ",".join([str(x) for x in ratings_to_write]),
+                ]
+            )
+            if k % 100000 == 0:
+                logger.info(f"Done producing data set row {k}.")
+
+    num_cols = rows_to_write.shape[1]
+    metadata = SparseMatrixMetadata(
+        num_interactions=num_interactions, num_rows=num_rows, num_cols=num_cols
+    )
+    logger.info(
+        f"Done with left matrix row {i}, {num_interactions} interactions written in shard, {num_removed_rows} rows removed in shard."
+    )
+    return (num_removed_rows, metadata)
+
+
+def visualize_samples(
+    right_matrix,
+    visualize_num_samples,
+    expanded_file_name,
+    output_prefix,
+):
+    # Note: only the rows of the first Kronecker block are visualized.
+    logger.info("visualize dataset row by row.")
+    fig, axs = plt.subplots(1, 2, figsize=(12, 5))
+    axs[0].set_title("Original data Histogram")
+    axs[0].set_xlabel("Value")
+    axs[0].set_ylabel("Frequency")
+    axs[1].set_title("Expended Row Histogram")
+    axs[1].set_xlabel("Value")
+    axs[1].set_ylabel("Frequency")
+    for k in range(visualize_num_samples):
+        original_row = right_matrix.getrow(k).data
+        line = linecache.getline(expanded_file_name, k + 1)
+        reader = csv.reader([line])
+        parsed_line = next(reader)
+        expended_row = eval(parsed_line[2])
+        original_hist_counts, original_bin_edges = np.histogram(original_row, bins=9)
+        expended_hist_counts, expended_bin_edges = np.histogram(expended_row, bins=9)
+        axs[0].plot(original_bin_edges[:-1], original_hist_counts, alpha=0.2)
+        axs[1].plot(expended_bin_edges[:-1], expended_hist_counts, alpha=0.2)
+        axs[0].fill_between(original_bin_edges[:-1], original_hist_counts, alpha=0.2)
+        axs[1].fill_between(expended_bin_edges[:-1], expended_hist_counts, alpha=0.2)
+    plt.tight_layout()
+    plt.savefig(f"{output_prefix}_sample_distribution.png")
+    logger.info("Sample visualization finished.")
+
+
+def build_randomized_kronecker(
+    left_matrix,
+    right_matrix,
+    block_sample_rate,
+    indices_out_path,
+    metadata_out_path=None,
+    remove_empty_rows=True,
+):
+    """Compute randomized Kronecker product and dump it on the fly based on https://arxiv.org/pdf/1901.08910.pdf."""
+    logger.info(f"Writing item sequences to pickle files {metadata_out_path}.")
+
+    num_rows = 0
+    num_removed_rows = 0
+    num_cols = left_matrix.shape[1] * right_matrix.shape[1]
+    num_interactions = 0
+
+    filepath = f"{indices_out_path}_users.csv"
+    os.makedirs(os.path.dirname(filepath), exist_ok=True)
+    with open(filepath, "w", newline="") as file:
+        writer = csv.writer(file)
+        for i in tqdm(range(left_matrix.shape[0])):
+            (shard_num_removed_rows, shard_metadata) = _compute_row_block(
+                i,
+                left_matrix,
+                right_matrix,
+                block_sample_rate,
+                indices_out_path,
+                remove_empty_rows,
+            )
+            writer.writerow([i, shard_metadata.num_rows])
+            file.flush()
+            num_rows += shard_metadata.num_rows
+            num_removed_rows += shard_num_removed_rows
+            num_interactions += shard_metadata.num_interactions
+
+    logger.info(f"{num_interactions / num_rows} average sequence length")
+    logger.info(f"{num_interactions} total interactions written.")
+    logger.info(f"{num_removed_rows} total rows removed.")
+
+    metadata = SparseMatrixMetadata(
+        num_interactions=num_interactions, num_rows=num_rows, num_cols=num_cols
+    )
+    if metadata_out_path is not None:
+        logger.info(f"Writing metadata file to {metadata_out_path}")
+        with open(metadata_out_path, "wb") as output_file:
+            pickle.dump(metadata, output_file)
+    return metadata
+
+
+def _preprocess_movie_lens(ratings_df, binary=False):
+    """
+    Filters out users with less than three distinct timestamps.
+    """
+
+    def _create_index(df, colname):
+        value_set = sorted(set(df[colname].values))
+        num_unique = len(value_set)
+        return dict(zip(value_set, range(num_unique)))
+
+    if not binary:
+        ratings_df["data"] = ratings_df["rating"]
+    else:
+        ratings_df["data"] = 1.0
+    ratings_df["binary_data"] = 1.0
+    num_timestamps = ratings_df[["userId", "timestamp"]].groupby("userId").nunique()
+    ratings_df["numberOfTimestamps"] = ratings_df["userId"].apply(
+        lambda x: num_timestamps["timestamp"][x]
+    )
+    ratings_df = ratings_df[ratings_df["numberOfTimestamps"] > 2]
+    user_id_to_user_idx = _create_index(ratings_df, "userId")
+    item_id_to_item_idx = _create_index(ratings_df, "movieId")
+    ratings_df["row"] = ratings_df["userId"].apply(lambda x: user_id_to_user_idx[x])
+    ratings_df["col"] = ratings_df["movieId"].apply(lambda x: item_id_to_item_idx[x])
+    return ratings_df
+
+
+def normalize(matrix):
+    norm_matrix = matrix.copy()
+    if isinstance(norm_matrix, np.ndarray):
+        norm_matrix -= norm_matrix.mean()
+    else:
+        norm_matrix.data -= norm_matrix.mean()
+    max_val = norm_matrix.max()
+    min_val = norm_matrix.min()
+    if isinstance(norm_matrix, np.ndarray):
+        norm_matrix /= max(abs(max_val), abs(min_val))
+    else:
+        norm_matrix.data /= max(abs(max_val), abs(min_val))
+    return norm_matrix
+
+
+def plot_distribution(user_wise_sum, item_wise_sum, s, title_prefix, normalized=False):
+    y_label = "rating sums" if normalized else "number of ratings"
+    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))
+    ax1.loglog(
+        np.arange(len(user_wise_sum)) + 1,
+        np.sort(user_wise_sum)[::-1],
+        linestyle="-",
+        color="blue",
+        marker="",
+    )
+    ax1.set_title(f"{title_prefix} matrix user-wise rating sums")
+    ax1.set_xlabel("User rank")
+    ax1.set_ylabel(y_label)
+    ax1.grid(True)
+    ax2.loglog(
+        np.arange(len(item_wise_sum)) + 1,
+        np.sort(item_wise_sum)[::-1],
+        linestyle="-",
+        color="green",
+        marker="",
+    )
+    ax2.set_title(f"{title_prefix} matrix item-wise rating sums")
+    ax2.set_xlabel("Item rank")
+    ax2.set_ylabel(y_label)
+    ax2.grid(True)
+    ax3.loglog(
+        np.arange(len(s)) + 1, np.sort(s)[::-1], linestyle="-", color="red", marker=""
+    )
+    ax3.set_title(f"{title_prefix} matrix singular values")
+    ax3.set_xlabel("Singular value Rank")
+    ax3.set_ylabel("Magnitude")
+    ax3.grid(True)
+    plt.tight_layout()
+    plt.savefig(f"{title_prefix}_distribution.png")
+
+
+def visualize_distribution(mat, reduced_mat, s, reduced_s, normalized=False, title=""):
+    user_wise_sum = np.asarray(mat.sum(axis=1)).flatten()
+    item_wise_sum = np.asarray(mat.sum(axis=0)).flatten()
+    assert len(user_wise_sum) == mat.shape[0]
+    assert len(item_wise_sum) == mat.shape[1]
+    plot_distribution(
+        user_wise_sum,
+        item_wise_sum,
+        s,
+        title_prefix=f"{title}_Original",
+        normalized=normalized,
+    )
+
+    reduced_user_wise_sum = np.asarray(reduced_mat.sum(axis=1)).flatten()
+    reduced_item_wise_sum = np.asarray(reduced_mat.sum(axis=0)).flatten()
+    assert len(reduced_user_wise_sum) == reduced_mat.shape[0]
+    assert len(reduced_item_wise_sum) == reduced_mat.shape[1]
+    plot_distribution(
+        reduced_user_wise_sum,
+        reduced_item_wise_sum,
+        reduced_s,
+        title_prefix=f"{title}_Reduced",
+        normalized=normalized,
+    )
+
+    expanded_s = np.einsum("i,j->ij", reduced_s, s).flatten()
+    expanded_user_wise_sum = np.einsum("ij,k->ik", reduced_mat, user_wise_sum).flatten()
+    expanded_item_wise_sum = np.einsum("ij,k->jk", reduced_mat, item_wise_sum).flatten()
+    assert len(expanded_user_wise_sum) == reduced_mat.shape[0] * mat.shape[0]
+    assert len(expanded_item_wise_sum) == reduced_mat.shape[1] * mat.shape[1]
+    plot_distribution(
+        expanded_user_wise_sum,
+        expanded_item_wise_sum,
+        expanded_s,
+        title_prefix=f"{title}_Expanded",
+        normalized=normalized,
+    )
+
+
+def expand_dataset(
+    ratings_matrix,
+    binary_ratings_matrix,
+    num_users,
+    num_items,
+    reduced_num_rows,
+    reduced_num_cols,
+    rescale_w_abs,
+    element_sample_rate,
+    block_sample_rate,
+    visualize,
+    write_dataset,
+    output_prefix,
+):
+    k = min(reduced_num_rows, reduced_num_cols)
+    norm_rating_matrix = normalize(ratings_matrix)
+    (u, s, v) = linalg.svds(
+        norm_rating_matrix, k=k, maxiter=None, return_singular_vectors=True
+    )
+
+    logger.info(
+        f"Creating reduced rating matrix (size {reduced_num_rows}, {reduced_num_cols})"
+    )
+    reduced_matrix = graph_reduce((u, s, v), reduced_num_rows, reduced_num_cols)
+    norm_reduced_matrix = normalize(reduced_matrix)
+    (_, s_reduce, _) = linalg.svds(
+        norm_reduced_matrix, k=k - 1, maxiter=None, return_singular_vectors=True
+    )
+    reduced_matrix = rescale(
+        reduced_matrix,
+        rescale_w_abs=rescale_w_abs,
+        element_sample_rate=element_sample_rate,
+    )
+    logger.info(f"largest singular value of the reduced matrix is {s_reduce[-1]}")
+    logger.info(
+        f"Sampling rate mean is {reduced_matrix.mean()}, var is {reduced_matrix.var()}, min is {reduced_matrix.min()}, max is {reduced_matrix.max()}"
+    )
+    samples = reduced_matrix.sum() * ratings_matrix.nnz * block_sample_rate
+    logger.info(
+        f"Expected number of synthetic samples: {samples}, sparsity is {samples / (num_users * num_items * reduced_num_rows * reduced_num_cols)}, average seqlen is {samples / (num_users * reduced_num_rows)}"
+    )
+
+    if visualize:
+        s = linalg.svds(
+            norm_rating_matrix, k=20 * k, maxiter=None, return_singular_vectors=False
+        )
+        visualize_distribution(
+            norm_rating_matrix,
+            norm_reduced_matrix,
+            s,
+            s_reduce,
+            normalized=True,
+            title="Normalized",
+        )
+        visualize_distribution(
+            binary_ratings_matrix,
+            reduced_matrix,
+            s,
+            s_reduce,
+            normalized=False,
+            title="Binary",
+        )
+    if write_dataset:
+        output_file = (
+            output_prefix + str(reduced_num_rows) + "x" + str(reduced_num_cols)
+        )
+        output_file_metadata = None
+
+        logger.info(f"Creating synthetic dataset and dumping to {output_file}.")
+        build_randomized_kronecker(
+            left_matrix=reduced_matrix,
+            right_matrix=ratings_matrix.tocoo(),
+            block_sample_rate=block_sample_rate,
+            indices_out_path=output_file,
+            metadata_out_path=output_file_metadata,
+        )
+
+
+@click.command()
+@click.option(
+    "--random-seed",
+    type=int,
+    default=0,
+)
+@click.option(
+    "--input-csv-file",
+    type=str,
+    default="ratings.csv",
+)
+@click.option(
+    "--output-prefix",
+    type=str,
+    default="",
+)
+@click.option(
+    "--num-row-multiplier",
+    type=int,
+    default=16,
+)
+@click.option(
+    "--num-col-multiplier",
+    type=int,
+    default=32,
+)
+@click.option(
+    "--element-sample-rate",
+    type=float,
+    default=1.0,
+)
+@click.option(
+    "--block-sample-rate",
+    type=float,
+    default=1.0,
+)
+@click.option(
+    "--visualize",
+    type=bool,
+    default=False,
+)
+@click.option(
+    "--write-dataset",
+    type=bool,
+    default=False,
+)
+@click.option(
+    "--visualize-num-samples",
+    type=int,
+    default=0,
+)
+def main(
+    random_seed: int,
+    input_csv_file: str,
+    output_prefix: str,
+    num_row_multiplier: int,
+    num_col_multiplier: int,
+    element_sample_rate: float,
+    block_sample_rate: float,
+    visualize: bool,
+    write_dataset: bool,
+    visualize_num_samples: int,
+):
+    np.random.seed(random_seed)
+
+    logger.info(f"Loading and preprocessing MovieLens-20m from {input_csv_file}")
+    with open(input_csv_file, "r") as infile:
+        ratings_df = pd.read_csv(infile, sep=",", header=0)
+    ratings_df = _preprocess_movie_lens(ratings_df, binary=False)
+    num_ratings = len(ratings_df)
+    num_users = len(set(ratings_df["row"].values))
+    num_items = len(set(ratings_df["col"].values))
+    logger.info(
+        f"number of ratings of input dataset is {num_ratings}, number of users is {num_users}, number of items is {num_items}, sparsity is {num_ratings / (num_users * num_items)}, average seqlen is {num_ratings / num_users}"
+    )
+
+    ratings_matrix = sparse.csr_matrix(
+        (
+            ratings_df["data"].values,
+            (ratings_df["row"].values, ratings_df["col"].values),
+        ),
+        shape=(num_users, num_items),
+    )
+    binary_ratings_matrix = sparse.csr_matrix(
+        (
+            ratings_df["binary_data"].values,
+            (ratings_df["row"].values, ratings_df["col"].values),
+        ),
+        shape=(num_users, num_items),
+    )
+    if write_dataset or visualize:
+        expand_dataset(
+            ratings_matrix=ratings_matrix,
+            binary_ratings_matrix=binary_ratings_matrix,
+            num_users=num_users,
+            num_items=num_items,
+            reduced_num_rows=num_row_multiplier,
+            reduced_num_cols=num_col_multiplier,
+            rescale_w_abs=False,
+            element_sample_rate=element_sample_rate,
+            block_sample_rate=block_sample_rate,
+            visualize=visualize,
+            write_dataset=write_dataset,
+            output_prefix=output_prefix,
+        )
+    if visualize_num_samples > 0:
+        logger.info(f"Visualizing {visualize_num_samples} samples.")
+        visualize_samples(
+            right_matrix=ratings_matrix.tocoo(),
+            visualize_num_samples=visualize_num_samples,
+            expanded_file_name=f"{output_prefix}{num_row_multiplier}x{num_col_multiplier}_0.csv",
+            output_prefix="Sample_Histogram",
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/scripts/build_ne_auc_trajectory.py b/recommendation_v4/scripts/build_ne_auc_trajectory.py
new file mode 100644
index 000000000..edc57cfbf
--- /dev/null
+++ b/recommendation_v4/scripts/build_ne_auc_trajectory.py
@@ -0,0 +1,287 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Build a combined train+eval NE/AUC trajectory from a streaming-train-eval log.
+
+The streaming loop (generative_recommenders/dlrm_v3/train/utils.py) emits, via
+MetricsLogger.compute(), one line per logged step of the form:
+
+  INFO:utils:train - Step 201 metrics: {'metric/lifetime_ne/listen_plus':
+    tensor(1.0182, dtype=torch.float64), 'metric/window_ne/listen_plus':
+    tensor(0.9846, ...), ..., 'metric/window_auc/listen_plus': tensor(0.5912),
+    'metric/lifetime_auc/listen_plus': tensor(0.5480)}
+
+and the analogous `eval - Step N metrics:` lines during each (full-holdout) eval
+window, plus throughput lines:
+
+  INFO:utils:train - Step 201 perf: local_sps=97.0 global_sps=776.2
+    step_ms=10553.89 elapsed_sec=680.6 total_samples=205824
+
+This script parses all three, for a chosen task (default listen_plus), and writes:
+  * <out>/trajectory.json   — {"train": {step: {...}}, "eval": {...}, "perf": [...]}
+  * <out>/trajectory.csv    — long-form rows (mode, step, metric, value)
+  * <out>/trajectory_ne_auc.png — NE and AUC vs train step, train + eval overlaid
+                                  (skipped gracefully if matplotlib is absent)
+
+It is dependency-light (stdlib + optional matplotlib) so it runs anywhere the
+log is readable, including the head node.
+
+Usage:
+  python3 scripts/build_ne_auc_trajectory.py LOG [--out DIR] [--task listen_plus]
+"""
+
+import argparse
+import csv
+import json
+import os
+import re
+import sys
+from typing import Dict, List, Optional, Tuple
+
+# `train - Step 201 metrics: {...}` / `eval - Step 17 metrics: {...}`
+_STEP_RE = re.compile(r"(train|eval) - Step (\d+) metrics: \{(.*)\}")
+# `metric/<prefix><name>/<task>': tensor(<value>` — value may be int/float/sci.
+_METRIC_RE = re.compile(
+    r"metric/([A-Za-z0-9_]+)/([A-Za-z0-9_+]+)'?\s*:\s*tensor\(\s*([-0-9.eE+]+)"
+)
+# `train - Step 201 perf: local_sps=97.0 global_sps=776.2 step_ms=10553.89 `
+# `elapsed_sec=680.6 total_samples=205824`
+_PERF_RE = re.compile(
+    r"train - Step (\d+) perf: local_sps=([-0-9.eE+]+) global_sps=([-0-9.eE+]+) "
+    r"step_ms=([-0-9.eE+]+) elapsed_sec=([-0-9.eE+]+) total_samples=(\d+)"
+)
+# `[boundary] eval_ts=181 eval first-batch ...` — marks the start of a full-holdout
+# eval block; the eval runs at whatever the latest train global step was, so we use
+# it to anchor each eval's metrics onto the shared train-global-step x-axis.
+_EVAL_BOUNDARY_RE = re.compile(r"\[boundary\] eval_ts=(\d+) eval first-batch")
+
+# Metrics we surface in the trajectory (others are still captured if present).
+_KEEP = ("window_ne", "lifetime_ne", "window_auc", "lifetime_auc",
+         "window_accuracy", "lifetime_accuracy", "window_gauc", "lifetime_gauc")
+
+
+def _parse_metrics(body: str, task: str) -> Dict[str, float]:
+    row: Dict[str, float] = {}
+    for name, tname, val in _METRIC_RE.findall(body):
+        if tname != task:
+            continue
+        try:
+            row[name] = float(val)
+        except ValueError:
+            continue
+    return row
+
+
+def parse_log(
+    log_path: str, task: str
+) -> Tuple[Dict[str, Dict[int, Dict[str, float]]], List[Dict[str, float]]]:
+    """Return ({'train': {step: {metric: val}}, 'eval': {...}}, perf_rows).
+
+    Train is keyed by train global step (last write wins — duplicate per-rank
+    prints are identical). Eval uses a per-rank-resetting internal step counter
+    that restarts every eval window, so we instead anchor each eval window onto
+    the *train global step at which it ran* (the loop trains window T then evals
+    window T+1, so the eval's anchor is the last train step before it). Each eval
+    window collapses to a single point carrying its final, most-aggregated
+    full-holdout metrics, plus `eval_window` (the eval_ts) for reference.
+    """
+    out: Dict[str, Dict[int, Dict[str, float]]] = {"train": {}, "eval": {}}
+    perf: List[Dict[str, float]] = []
+
+    last_train_step = 0
+    cur_anchor: Optional[int] = None   # train global step this eval block runs at
+    cur_ts: Optional[int] = None       # eval window id (eval_ts)
+    cur_row: Optional[Dict[str, float]] = None  # final row of the current block
+    cur_internal: Optional[int] = None  # last eval internal step (reset detection)
+
+    def flush_eval() -> None:
+        nonlocal cur_anchor, cur_ts, cur_row, cur_internal
+        if cur_row:
+            anchor = cur_anchor if cur_anchor is not None else last_train_step
+            row = dict(cur_row)
+            if cur_ts is not None:
+                row["eval_window"] = float(cur_ts)
+            key = anchor
+            while key in out["eval"]:  # keep distinct evals from colliding
+                key += 1
+            out["eval"][key] = row
+        cur_anchor = cur_ts = cur_row = cur_internal = None
+
+    with open(log_path, "r", errors="replace") as f:
+        for line in f:
+            pm = _PERF_RE.search(line)
+            if pm:
+                perf.append({
+                    "step": int(pm.group(1)),
+                    "local_sps": float(pm.group(2)),
+                    "global_sps": float(pm.group(3)),
+                    "step_ms": float(pm.group(4)),
+                    "elapsed_sec": float(pm.group(5)),
+                    "total_samples": int(pm.group(6)),
+                })
+                continue
+            bm = _EVAL_BOUNDARY_RE.search(line)
+            if bm:
+                # The boundary line (a different logger) can interleave before OR
+                # after this eval's metric lines, so don't use it to delimit the
+                # block — just tag the current block with its eval_ts. Block
+                # boundaries come from eval-step resets / training resuming.
+                if cur_anchor is None:
+                    cur_anchor = last_train_step
+                cur_ts = int(bm.group(1))
+                continue
+            m = _STEP_RE.search(line)
+            if not m:
+                continue
+            mode, step_s, body = m.group(1), m.group(2), m.group(3)
+            step = int(step_s)
+            row = _parse_metrics(body, task)
+            if mode == "train":
+                last_train_step = step
+                if cur_anchor is not None or cur_row is not None:
+                    flush_eval()  # an eval block ends when training resumes
+                if row:
+                    out["train"][step] = row  # last write wins
+            else:  # eval — accumulate into the current block (last = most aggregated)
+                # Fallback for logs without a boundary marker: a drop in the eval
+                # internal step counter signals a fresh eval window.
+                if (cur_internal is not None and step < cur_internal
+                        and cur_anchor is None):
+                    flush_eval()
+                if cur_anchor is None:
+                    cur_anchor = last_train_step
+                cur_internal = step
+                if row:
+                    cur_row = row
+    flush_eval()
+    return out, perf
+
+
+def write_outputs(
+    traj: Dict[str, Dict[int, Dict[str, float]]],
+    perf: List[Dict[str, float]],
+    out_dir: str,
+    task: str,
+) -> None:
+    os.makedirs(out_dir, exist_ok=True)
+
+    json_path = os.path.join(out_dir, "trajectory.json")
+    with open(json_path, "w") as f:
+        json.dump(
+            {
+                "task": task,
+                "train": {str(k): v for k, v in sorted(traj["train"].items())},
+                "eval": {str(k): v for k, v in sorted(traj["eval"].items())},
+                "perf": perf,
+            },
+            f,
+            indent=2,
+        )
+
+    csv_path = os.path.join(out_dir, "trajectory.csv")
+    with open(csv_path, "w", newline="") as f:
+        w = csv.writer(f)
+        w.writerow(["mode", "step", "metric", "value"])
+        for mode in ("train", "eval"):
+            for step in sorted(traj[mode]):
+                for metric, val in traj[mode][step].items():
+                    w.writerow([mode, step, metric, val])
+
+    n_train = len(traj["train"])
+    n_eval = len(traj["eval"])
+    print(f"Parsed {n_train} train points, {n_eval} eval points, "
+          f"{len(perf)} perf points (task={task}).", file=sys.stderr)
+    print(f"Wrote {json_path}", file=sys.stderr)
+    print(f"Wrote {csv_path}", file=sys.stderr)
+
+    _maybe_plot(traj, out_dir, task)
+
+
+def _maybe_plot(
+    traj: Dict[str, Dict[int, Dict[str, float]]], out_dir: str, task: str
+) -> None:
+    try:
+        import matplotlib
+        matplotlib.use("Agg")
+        import matplotlib.pyplot as plt
+    except Exception as e:  # noqa: BLE001
+        print(f"matplotlib unavailable ({e}); skipping plot.", file=sys.stderr)
+        return
+
+    def series(mode: str, metric: str) -> Tuple[List[int], List[float]]:
+        steps = sorted(s for s in traj[mode] if metric in traj[mode][s])
+        return steps, [traj[mode][s][metric] for s in steps]
+
+    fig, (ax_ne, ax_auc) = plt.subplots(2, 1, figsize=(11, 9), sharex=True)
+
+    for metric, style in (("window_ne", "-"), ("lifetime_ne", "--")):
+        xs, ys = series("train", metric)
+        if xs:
+            ax_ne.plot(xs, ys, style, label=f"train/{metric}", alpha=0.85)
+    for metric, marker in (("window_ne", "o"), ("lifetime_ne", "s")):
+        xs, ys = series("eval", metric)
+        if xs:
+            ax_ne.plot(xs, ys, marker=marker, ms=5, ls="-", lw=1.0, alpha=0.9,
+                       label=f"eval/{metric}")
+    ax_ne.set_ylabel("NE (normalized entropy)")
+    ax_ne.set_title(f"yambda-5b streaming train+eval trajectory — task={task}")
+    ax_ne.grid(True, alpha=0.3)
+    ax_ne.legend(fontsize=8, ncol=2)
+
+    for metric, style in (("window_auc", "-"), ("lifetime_auc", "--")):
+        xs, ys = series("train", metric)
+        if xs:
+            ax_auc.plot(xs, ys, style, label=f"train/{metric}", alpha=0.85)
+    for metric, marker in (("window_auc", "o"), ("lifetime_auc", "s")):
+        xs, ys = series("eval", metric)
+        if xs:
+            ax_auc.plot(xs, ys, marker=marker, ms=5, ls="-", lw=1.0, alpha=0.9,
+                        label=f"eval/{metric}")
+    ax_auc.set_ylabel("AUC")
+    ax_auc.set_xlabel("train global step (eval points anchored to the step they ran at)")
+    ax_auc.grid(True, alpha=0.3)
+    ax_auc.legend(fontsize=8, ncol=2)
+
+    png_path = os.path.join(out_dir, "trajectory_ne_auc.png")
+    fig.tight_layout()
+    fig.savefig(png_path, dpi=120)
+    print(f"Wrote {png_path}", file=sys.stderr)
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("log", help="Path to the streaming train.log")
+    ap.add_argument("--out", default=None,
+                    help="Output dir (default: <log_dir>/<log_stem>_trajectory)")
+    ap.add_argument("--task", default="listen_plus",
+                    help="Task name to extract (default: listen_plus)")
+    args = ap.parse_args()
+
+    if not os.path.exists(args.log):
+        print(f"Log not found: {args.log}", file=sys.stderr)
+        return 2
+    out_dir = args.out
+    if out_dir is None:
+        stem = os.path.splitext(os.path.basename(args.log))[0]
+        out_dir = os.path.join(os.path.dirname(os.path.abspath(args.log)),
+                               f"{stem}_trajectory")
+
+    traj, perf = parse_log(args.log, args.task)
+    write_outputs(traj, perf, out_dir, args.task)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/recommendation_v4/scripts/launch_local.sh b/recommendation_v4/scripts/launch_local.sh
new file mode 100755
index 000000000..9a69a36e5
--- /dev/null
+++ b/recommendation_v4/scripts/launch_local.sh
@@ -0,0 +1,154 @@
+#!/bin/bash
+# =============================================================================
+# launch_local.sh — single-host, NON-SLURM launcher for the yambda-5b trainer.
+#
+# This is the SLURM-free analog of scripts/launch_slurm.sh's `worker` phase:
+# it sets the single-node distributed topology + sane env and invokes the SAME
+# entry point (`train_ranker.py --dataset yambda-5b`) reading the SAME
+# train/gin/yambda_5b.gin config. No scheduler, no docker, no RDMA overlay —
+# everything runs directly on this host against an already-prepared dataset.
+#
+# Use it to:
+#   * Smoke-test the launch path on a single GPU box (SMOKE=1, the default —
+#     a few train/eval batches of one streaming window), or
+#   * Run the full gin-default workload (SMOKE=0 — consumes whole windows).
+#
+# PREREQUISITES
+#   1) Data prepared (run once, CPU-only — no GPU needed):
+#        python generative_recommenders/dlrm_v3/preprocess_public_data.py \
+#            --dataset yambda-5b --data-path "$DLRM_DATA_PATH"
+#      producing  $DLRM_DATA_PATH/processed_5b/{train_sessions.parquet,...}
+#             and $DLRM_DATA_PATH/shared_metadata/{artist,album}_item_mapping.parquet
+#   2) The train_recipe GPU stack importable by $PYTHON (see docs/training_recipe.md):
+#        torch (rocm or cuda build), fbgemm_gpu, torchrec, polars-u64-idx,
+#        gin-config, xxhash, pandas, tensorboard, ...
+#      This box must have visible GPUs (the trainer shards embeddings onto HBM).
+#
+# USAGE
+#   # smoke (default): one window, 20 train + 10 eval batches
+#   DLRM_DATA_PATH=/home/chcai/dlrm_data bash scripts/launch_local.sh
+#
+#   # full gin-default run (whole windows; long)
+#   SMOKE=0 DLRM_DATA_PATH=/home/chcai/dlrm_data bash scripts/launch_local.sh
+#
+#   # restrict to 2 GPUs, custom log, plain (non-streaming) train-eval
+#   GPUS_PER_NODE=2 MODE=train-eval LOG=/tmp/y.log bash scripts/launch_local.sh
+#
+# Every knob below is env-overridable; defaults reproduce launch_slurm.sh's
+# single-node smoke path so a local run matches the known-good cluster path.
+# =============================================================================
+set -uo pipefail
+
+REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)
+cd "$REPO_ROOT"
+
+# ---- interpreter ------------------------------------------------------------
+# Default to the venv created for data prep if present, else system python3.
+# Override with PYTHON=/path/to/python (e.g. the in-container recipe python).
+DEFAULT_PY=/home/chcai/dlrmv4_venv/bin/python
+PYTHON=${PYTHON:-$([ -x "$DEFAULT_PY" ] && echo "$DEFAULT_PY" || echo python3)}
+
+# ---- dataset / data path ----------------------------------------------------
+DATASET=${DATASET:-yambda-5b}
+MODE=${MODE:-streaming-train-eval}
+# Mirrors the yambda_5b.gin default ("/apps/chcai/dlrm_data"); point at wherever
+# preprocess_public_data.py wrote processed_5b/ + shared_metadata/.
+export DLRM_DATA_PATH=${DLRM_DATA_PATH:-/home/chcai/dlrm_data}
+
+LOG=${LOG:-$REPO_ROOT/yambda_local.$(date +%Y%m%d_%H%M%S).log}
+
+# ---- single-node distributed topology --------------------------------------
+# train_ranker reads these from the env (see train_ranker.main): it spawns
+# GPUS_PER_NODE ranks via torch.multiprocessing on THIS host. localhost
+# rendezvous; empty MASTER_PORT => train_ranker picks a free port.
+export NNODES=${NNODES:-1}
+export NODE_RANK=${NODE_RANK:-0}
+export MASTER_ADDR=${MASTER_ADDR:-localhost}
+export MASTER_PORT=${MASTER_PORT:-}
+# GPUS_PER_NODE: 0/unset => train_ranker auto-detects torch.cuda.device_count().
+export GPUS_PER_NODE=${GPUS_PER_NODE:-0}
+
+# ---- runtime env (matches launch_slurm.sh worker defaults) ------------------
+export HSTU_HAMMER_KERNEL=${HSTU_HAMMER_KERNEL:-TRITON}
+export PYTORCH_CUDA_ALLOC_CONF=${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}
+export PYTHONPATH="$REPO_ROOT:${PYTHONPATH:-}"
+# Single-node RCCL bootstrap: all ranks rendezvous over localhost, so pin the
+# loopback NIC. Left to auto-detect, RCCL can grab a non-routable per-GPU RoCE
+# NIC and hang/"No route to host" at init (same failure launch_slurm.sh pins
+# fenic0 to avoid). Override NCCL_SOCKET_IFNAME for a routable multi-host setup.
+export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-lo}
+export NCCL_DEBUG=${NCCL_DEBUG:-WARN}
+
+# ---- smoke caps -------------------------------------------------------------
+# SMOKE=1 (default): apply small per-window batch caps so a launch finishes in
+# minutes (validates the path end-to-end). SMOKE=0: leave the gin defaults
+# untouched (consume full windows — the real workload).
+SMOKE=${SMOKE:-1}
+if [ "$SMOKE" = "1" ]; then
+  export START_TS=${START_TS:-150}
+  export NUM_TRAIN_TS=${NUM_TRAIN_TS:-1}
+  export NUM_TRAIN_BATCHES=${NUM_TRAIN_BATCHES:-20}
+  export NUM_EVAL_BATCHES=${NUM_EVAL_BATCHES:-10}
+  export EVAL_EVERY_N_WINDOWS=${EVAL_EVERY_N_WINDOWS:-1}
+  export METRIC_LOG_FREQ=${METRIC_LOG_FREQ:-5}
+  # Smaller per-sample shape keeps the smoke run light; drop these to use the
+  # gin defaults (4086/4096). Reuse an existing hstu_cache_L<N>/ if present.
+  export BATCH_SIZE=${BATCH_SIZE:-32}
+fi
+
+mkdir -p "$(dirname "$LOG")"
+{
+  echo "[$(date)] launch_local: dataset=$DATASET mode=$MODE smoke=$SMOKE"
+  echo "[$(date)] PYTHON=$PYTHON"
+  echo "[$(date)] DLRM_DATA_PATH=$DLRM_DATA_PATH"
+  echo "[$(date)] topology: nnodes=$NNODES node_rank=$NODE_RANK gpus_per_node(req)=$GPUS_PER_NODE master=$MASTER_ADDR:${MASTER_PORT:-<auto>}"
+} | tee -a "$LOG"
+
+# ---- preflight: data present? ----------------------------------------------
+SUFFIX=${DATASET#yambda-}
+PROCESSED="$DLRM_DATA_PATH/processed_${SUFFIX}/train_sessions.parquet"
+META="$DLRM_DATA_PATH/shared_metadata/artist_item_mapping.parquet"
+if [ "$DATASET" = "yambda-5b" ] && { [ ! -f "$PROCESSED" ] || [ ! -f "$META" ]; }; then
+  echo "[$(date)] ERROR: prepared data not found." | tee -a "$LOG"
+  echo "  expected: $PROCESSED" | tee -a "$LOG"
+  echo "        and: $META" | tee -a "$LOG"
+  echo "  run preprocessing first:" | tee -a "$LOG"
+  echo "    $PYTHON generative_recommenders/dlrm_v3/preprocess_public_data.py --dataset $DATASET --data-path $DLRM_DATA_PATH" | tee -a "$LOG"
+  exit 1
+fi
+
+# ---- preflight: GPU stack importable + GPUs visible? ------------------------
+echo "[$(date)] preflight: checking torch / fbgemm_gpu / torchrec + GPU count" | tee -a "$LOG"
+"$PYTHON" - <<'PY' 2>&1 | tee -a "$LOG"
+import sys
+missing = []
+for m in ("torch", "fbgemm_gpu", "torchrec", "polars", "gin", "xxhash"):
+    try:
+        __import__(m)
+    except Exception as e:
+        missing.append(f"{m} ({e.__class__.__name__})")
+if missing:
+    print("PREFLIGHT FAIL: missing/broken imports: " + ", ".join(missing))
+    print("Install the train_recipe GPU stack (see docs/training_recipe.md).")
+    sys.exit(3)
+import torch
+n = torch.cuda.device_count()
+print(f"imports OK, torch {torch.__version__}, cuda/hip available={torch.cuda.is_available()}, {n} GPU(s)")
+if n == 0:
+    print("PREFLIGHT FAIL: no GPUs visible — the HSTU trainer shards embeddings "
+          "onto GPU HBM and cannot run CPU-only. Launch on a GPU host.")
+    sys.exit(4)
+PY
+pf=${PIPESTATUS[0]}
+if [ "$pf" -ne 0 ]; then
+  echo "[$(date)] preflight failed (rc=$pf) — not launching trainer." | tee -a "$LOG"
+  exit "$pf"
+fi
+
+# ---- launch -----------------------------------------------------------------
+echo "[$(date)] launching train_ranker ($DATASET, mode=$MODE)" | tee -a "$LOG"
+"$PYTHON" -m generative_recommenders.dlrm_v3.train.train_ranker \
+    --dataset "$DATASET" --mode "$MODE" 2>&1 | tee -a "$LOG"
+rc=${PIPESTATUS[0]}
+echo "[$(date)] launch_local finished rc=$rc" | tee -a "$LOG"
+exit "$rc"
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
new file mode 100755
index 000000000..a1b6334e3
--- /dev/null
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -0,0 +1,650 @@
+#!/bin/bash
+#SBATCH --job-name=yambda_slurm
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --exclusive
+#SBATCH --partition=meta64          # [CLUSTER-SPECIFIC] partition name
+#SBATCH --output=/apps/chcai/yambda_slurm.%j.out
+# =============================================================================
+# launch_slurm.sh — single entry point for the yambda-5b trainer on N>=1 nodes.
+#
+# Consolidates what used to be separate scripts so multi-node enablement is
+# ONE committable script (plus the train_ranker.py / utils.py python changes):
+#   * orchestrate phase (host SLURM glue) — formerly sbatch_smoke_multinode.sh
+#   * provision   phase (container + RDMA) — formerly _provision_yambda_primus.sh
+#   * worker      phase (in-container train) — now inlined below
+#
+# PHASES (auto-detected from context; force with LAUNCH_SLURM_PHASE=<phase>):
+#   orchestrate  Runs on the SLURM batch host (no /.dockerenv). Resolves the
+#                rendezvous (MASTER_ADDR/PORT), ensures the container on every
+#                node (provision phase), then `docker exec`s the worker phase on
+#                every node, one task per node.
+#   provision    Runs on a compute-node host. Ensures the `yambda_primus`
+#                container is up (loads the pre-baked image if present — no
+#                internet/pip — else builds from the base image) and stages the
+#                host RDMA userspace overlay on shared NFS.
+#   worker       Runs INSIDE the container. Sets the distributed topology +
+#                NCCL/RDMA env and spawns this node's GPU ranks via train_ranker.
+#                N==1 transparently uses the legacy single-node path (localhost,
+#                node_rank 0), byte-for-byte as before, so the streaming-e2e
+#                supervisor's direct `bash scripts/launch_slurm.sh` is unchanged.
+#
+# USAGE
+#   Multi-node (N>=1):  sbatch --nodes=2 scripts/launch_slurm.sh
+#   Single-node direct: bash scripts/launch_slurm.sh   (already inside container;
+#                       what run_streaming_e2e.sh invokes per relaunch)
+#   Perf pair:
+#     LOG=/apps/chcai/perf_1node.log NUM_TRAIN_BATCHES=200 NUM_EVAL_BATCHES=0 \
+#       EVAL_EACH_WINDOW=0 METRIC_LOG_FREQ=20 \
+#       sbatch --nodes=1 --job-name=y1 scripts/launch_slurm.sh
+#     LOG=/apps/chcai/perf_2node.log NUM_TRAIN_BATCHES=200 NUM_EVAL_BATCHES=0 \
+#       EVAL_EACH_WINDOW=0 METRIC_LOG_FREQ=20 \
+#       sbatch --nodes=2 --job-name=y2 scripts/launch_slurm.sh
+#     # then: bash scripts/compare_node_perf.sh /apps/chcai/perf_1node.log /apps/chcai/perf_2node.log
+#
+# ONE-TIME IMAGE BAKE (so fresh nodes skip the multi-GB torch download + pip):
+#   BAKE_IMAGE=1 LAUNCH_SLURM_PHASE=provision bash scripts/launch_slurm.sh
+#   (commits the deps-installed container to $BAKED_IMAGE and `docker save`s it to
+#    $BAKED_TAR on NFS; subsequent provisions `docker load` it offline.)
+#
+# -----------------------------------------------------------------------------
+# PORTABILITY — what to change for a DIFFERENT cluster / network / hardware.
+# Every such knob is also tagged inline with "[CLUSTER-SPECIFIC]" (grep for it).
+# All are env-overridable, so you can adapt without editing this file.
+#
+#  A) SLURM / scheduler
+#     - #SBATCH --partition=meta64  : partition name. CHANGE per cluster.
+#     - #SBATCH --time / --exclusive : policy; adjust to taste.
+#
+#  B) Filesystems (must be shared/NFS across ALL nodes — this script re-invokes
+#     itself and reads the overlay + data from these paths cluster-wide)
+#     - REPO_MOUNT (repo + this script, e.g. /home/<user>) is bind-mounted rw;
+#       DATA_MOUNT (e.g. /apps/chcai) holds the read-only dataset + overlay +
+#       baked tar + pip tarball; SCRATCH (e.g. /home/<user>/yambda_runs) is the
+#       writable log/output root. Override any via env — nothing is user-hardwired.
+#
+#  C) Container image / GPU software stack (tied to the GPU arch + ROCm version)
+#     - IMAGE=rocm/primus:v26.3        : base image. ROCm/AMD-specific.
+#     - docker run --device=/dev/kfd --device=/dev/dri --group-add video : AMD ROCm
+#       device passthrough. For NVIDIA this is --gpus all / nvidia runtime instead.
+#     - --ulimit memlock=-1 : REQUIRED for RDMA QP registration (do not drop).
+#     - TORCH_IDX (rocm7.2), torch/vision/audio ==*+rocm7.2, FBGEMM_WHL (a gfx950
+#       wheel), torchrec pin : the whole deps set is arch/ROCm-version-specific.
+#
+#  D) Network fabric — THE trickiest part; defaults are PROVEN on meta64 cv350
+#     (Broadcom bnxt_re RoCEv2). On a different fabric these almost certainly change
+#     (see the worker-phase block for the full rationale):
+#     - NCCL_SOCKET_IFNAME=fenic0 : the ONE routable host NIC for TCP bootstrap.
+#       Find yours with `ip -br addr`; the per-GPU RDMA NICs are usually NOT
+#       routable for plain TCP, so auto-detect hangs init — you MUST pin this.
+#     - NCCL_IB_HCA=bnxt_re0..7 : the RDMA HCA device names. List with `ibv_devices`.
+#       Different NIC vendor (e.g. mlx5_*, ionic_*) => different names AND a
+#       different userspace provider, which changes the RDMA overlay below.
+#     - NCCL_IB_GID_INDEX=3 : RoCEv2 IPv4 GID index. Check `show_gids`; v1/v2 and
+#       IPv4/IPv6 live at different indices per port.
+#     - NCCL_IB_TC=104 : RoCE lossless (PFC) traffic class. Fabric/switch-specific.
+#     - RDMA overlay (provision phase): only needed when the CONTAINER's rdma-core
+#       is older than the HOST kernel driver's uapi (our bnxt_re v34-vs-v59 case).
+#       Different NIC/host => different /usr/lib64 provider .so to stage, or the
+#       overlay may be unnecessary entirely (set RDMA_OVERLAY= to disable). If RDMA
+#       can't be made to work, NCCL_NET_TRANSPORT=socket falls back to TCP.
+#
+#  E) Not cluster-specific (auto-derived): GPUS_PER_NODE (torch.cuda.device_count),
+#     NNODES/NODE_RANK/MASTER_ADDR (from SLURM), WORLD_SIZE.
+# =============================================================================
+set -uo pipefail
+
+# Absolute path to THIS script so the orchestrate phase can re-invoke it on every
+# node (home is shared NFS, so the same path resolves cluster-wide).
+SELF=$(cd "$(dirname "$0")" && pwd)/$(basename "$0")
+REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)
+
+# ---- phase detection --------------------------------------------------------
+PHASE="${LAUNCH_SLURM_PHASE:-}"
+if [ -z "$PHASE" ]; then
+  if [ -f /.dockerenv ]; then PHASE=worker; else PHASE=orchestrate; fi
+fi
+
+# ---- shared config (env-overridable) ----------------------------------------
+CONTAINER=${CONTAINER:-yambda_primus}
+REPO=${REPO:-$REPO_ROOT}                       # repo path inside the container
+IMAGE=${IMAGE:-rocm/primus:v26.3}              # [CLUSTER-SPECIFIC] ROCm/arch base image
+BAKED_IMAGE=${BAKED_IMAGE:-yambda_primus_baked:latest}
+BAKED_TAR=${BAKED_TAR:-/apps/chcai/yambda_primus_baked.tar}   # [CLUSTER-SPECIFIC] shared-NFS path
+USE_BAKED=${USE_BAKED:-1}
+OVERLAY=${RDMA_OVERLAY:-/apps/chcai/rdma_host_el9_new}        # [CLUSTER-SPECIFIC] shared-NFS RDMA overlay
+
+REPO_MOUNT=${REPO_MOUNT:-$HOME}              # NFS home holding the repo (must contain $REPO); override if your repo lives elsewhere
+DATA_MOUNT=${DATA_MOUNT:-/apps/chcai}        # shared dataset + RDMA overlay + pip/fbgemm assets (read-only)
+SCRATCH=${SCRATCH:-$HOME/yambda_runs}        # writable output root (logs / tb / traces)
+
+# =============================================================================
+# PHASE: orchestrate  (SLURM batch host)
+# =============================================================================
+orchestrate() {
+  # When run as the SLURM batch script, $0 is the node-local staged copy
+  # (/var/spool/slurmd/job<ID>/slurm_script), so $SELF / $REPO_ROOT are WRONG
+  # here (they don't exist on other nodes). Resolve the REAL shared-NFS script
+  # path + repo root from SLURM so we can re-invoke this script on every node and
+  # `cd` to the right repo inside the container.
+  SCRIPT_PATH=$(scontrol show job "${SLURM_JOB_ID:-0}" 2>/dev/null | grep -oP 'Command=\K\S+')
+  [ -f "${SCRIPT_PATH:-}" ] || SCRIPT_PATH="${SLURM_SUBMIT_DIR:-$REPO_ROOT}/scripts/launch_slurm.sh"
+  [ -f "$SCRIPT_PATH" ] || SCRIPT_PATH="$SELF"
+  REPO=$(cd "$(dirname "$SCRIPT_PATH")/.." && pwd)
+
+  mkdir -p "$SCRATCH" 2>/dev/null || true
+  LOG=${LOG:-$SCRATCH/yambda_slurm.${SLURM_JOB_ID:-manual}.log}
+
+  # Smoke defaults — override via env for a perf run (see header USAGE).
+  MODE=${MODE:-streaming-train-eval}
+  START_TS=${START_TS:-150}
+  NUM_TRAIN_TS=${NUM_TRAIN_TS:-1}
+  NUM_TRAIN_BATCHES=${NUM_TRAIN_BATCHES:-20}
+  NUM_EVAL_BATCHES=${NUM_EVAL_BATCHES:-10}
+  EVAL_EACH_WINDOW=${EVAL_EACH_WINDOW:-1}
+  EVAL_EVERY_N_WINDOWS=${EVAL_EVERY_N_WINDOWS:-1}
+  METRIC_LOG_FREQ=${METRIC_LOG_FREQ:-5}
+  FORCE_PROVISION=${FORCE_PROVISION:-0}
+
+  # Truncate the metrics log on a FRESH run; APPEND on a supervised relaunch
+  # (APPEND_LOG=1) so the full-run NE/AUC history survives crash/node-failover
+  # resubmits instead of being wiped on every attempt (mirrors the single-node
+  # supervisor's init-once/append model).
+  if [ "${APPEND_LOG:-0}" = "1" ]; then
+    echo "[$(date)] === resume: appending to existing $LOG (APPEND_LOG=1) ===" >> "$LOG"
+  else
+    : > "$LOG"
+  fi
+  echo "[$(date)] launch_slurm/orchestrate: job=${SLURM_JOB_ID:-?} nodes=${SLURM_JOB_NODELIST:-?} nnodes=${SLURM_NNODES:-1}" | tee -a "$LOG"
+  echo "[$(date)] resolved SCRIPT_PATH=$SCRIPT_PATH REPO=$REPO" | tee -a "$LOG"
+  echo "[$(date)] config: MODE=$MODE START_TS=$START_TS NUM_TRAIN_TS=$NUM_TRAIN_TS NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES METRIC_LOG_FREQ=$METRIC_LOG_FREQ" | tee -a "$LOG"
+
+  # Rendezvous resolved on the HOST (the container image has no SLURM client).
+  MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" 2>/dev/null | head -1)
+  MASTER_ADDR=${MASTER_ADDR:-localhost}
+  MASTER_PORT=$(( 20000 + ${SLURM_JOB_ID:-0} % 20000 ))
+  echo "[$(date)] rendezvous: MASTER_ADDR=$MASTER_ADDR MASTER_PORT=$MASTER_PORT" | tee -a "$LOG"
+
+  # Optional NCCL/RCCL fabric overrides — forwarded into the container only when
+  # set at submit time (docker exec does NOT inherit the srun task env). The
+  # worker phase applies its own validated multi-node bnxt_re defaults when these
+  # are unset. Common: NCCL_NET_TRANSPORT=socket (TCP fallback), NCCL_DEBUG=INFO.
+  NCCL_ENV_ARGS=""
+  for v in NCCL_NET_TRANSPORT NCCL_DEBUG NCCL_SOCKET_IFNAME NCCL_IB_HCA NCCL_IB_GID_INDEX \
+           NCCL_IB_TC NCCL_IB_TIMEOUT NCCL_IGNORE_CPU_AFFINITY RCCL_MSCCL_ENABLE NCCL_NET_GDR_LEVEL \
+           NCCL_IB_PCI_RELAXED_ORDERING NCCL_IB_USE_INLINE NCCL_IB_QPS_PER_CONNECTION \
+           NCCL_IB_ECE_ENABLE NCCL_DMABUF_ENABLE NCCL_GDRCOPY_ENABLE NCCL_GDR_FLUSH_DISABLE \
+           NCCL_PXN_DISABLE NCCL_CHECKS_DISABLE NCCL_CROSS_NIC RDMA_OVERLAY; do
+    eval "val=\${$v:-}"
+    if [ -n "$val" ]; then NCCL_ENV_ARGS="$NCCL_ENV_ARGS -e $v=$val"; fi
+  done
+
+  # TRICKY — variable expansion inside the `srun ... bash -c "..."` blocks below:
+  # the string is double-quoted, so PLAIN $VAR expands NOW on the batch host (e.g.
+  # $MASTER_ADDR, $CONTAINER, $SCRIPT_PATH — values computed above), while
+  # BACKSLASH-escaped \$VAR is passed through literally and expands LATER on each
+  # compute node inside the srun task (e.g. \$SLURM_NODEID, \$(hostname)) where the
+  # per-node SLURM_* env actually lives. Mixing these up sends every rank the
+  # wrong node id or breaks the docker exec — keep the \$ on per-node values.
+
+  # --- step 1: ensure the container is up on every node ----------------------
+  echo "[$(date)] ensuring container '$CONTAINER' on all nodes (force=$FORCE_PROVISION)" | tee -a "$LOG"
+  srun --ntasks-per-node=1 bash -c "
+    # Reap stale/foreign GPU containers from prior jobs BEFORE (re)provisioning.
+    # The node is allocated --exclusive, so any GPU container other than
+    # '$CONTAINER' is an orphan left by a previous job (its container outlives the
+    # SLURM allocation). We remove every such container that has GPU access
+    # (/dev/kfd or /dev/dri) — running OR stopped, whether or not it currently
+    # pins VRAM ('docker ps -aq' includes stopped ones) — since idle orphans can
+    # still hold device handles or wake up; leaked HBM from these has caused both
+    # OOMs and RCCL collective hangs. We deliberately SKIP non-GPU containers
+    # (e.g. 'k8s-node-services-*' and other cluster system services) so we don't
+    # disrupt node infrastructure. docker teardown lets the driver reclaim HBM.
+    for _c in \$(docker ps -aq 2>/dev/null); do
+      _nm=\$(docker inspect -f '{{.Name}}' \"\$_c\" 2>/dev/null | sed 's#^/##')
+      [ \"\$_nm\" = \"$CONTAINER\" ] && continue
+      _dev=\$(docker inspect -f '{{range .HostConfig.Devices}}{{.PathOnHost}} {{end}}' \"\$_c\" 2>/dev/null)
+      case \"\$_dev\" in
+        *kfd*|*dri*)
+          echo \"[\$(hostname)] reaping stale GPU container \$_nm (\$_c)\"
+          docker rm -f \"\$_c\" >/dev/null 2>&1 || true ;;
+        *)
+          echo \"[\$(hostname)] keeping non-GPU/system container \$_nm (\$_c)\" ;;
+      esac
+    done
+    # Reuse a STOPPED '$CONTAINER' (its installed deps persist in the container
+    # fs) instead of destructively re-provisioning from the base image + pip.
+    # Harmless no-op on a fresh node (no such container) -> falls through to
+    # provision below. Repo code is bind-mounted, so live edits are still picked up.
+    docker start $CONTAINER >/dev/null 2>&1 || true
+    if [ \"$FORCE_PROVISION\" = \"1\" ] || ! docker exec $CONTAINER true >/dev/null 2>&1; then
+      echo \"[\$(hostname)] (re)provisioning container\"
+      LAUNCH_SLURM_PHASE=provision CONTAINER=$CONTAINER IMAGE=$IMAGE \
+        BAKED_IMAGE=$BAKED_IMAGE BAKED_TAR=$BAKED_TAR USE_BAKED=$USE_BAKED \
+        BAKE_IMAGE=${BAKE_IMAGE:-0} RDMA_OVERLAY=$OVERLAY REPO=$REPO \
+        REPO_MOUNT=$REPO_MOUNT DATA_MOUNT=$DATA_MOUNT SCRATCH=$SCRATCH bash $SCRIPT_PATH
+    else
+      # Container persists across jobs; the reap above only removes FOREIGN GPU
+      # containers, so our own '$CONTAINER' can still pin HBM via stray trainer
+      # ranks left by a prior OOM/crash (this caused repeated 'CUDA out of memory'
+      # on relaunch onto the same node). Restart it to kill every exec'd proc and
+      # let the driver reclaim HBM — cheap (keeps the installed deps in the
+      # container fs; NFS RDMA overlay also persists), no full re-provision.
+      echo \"[\$(hostname)] container already up — restarting to free any leaked HBM before launch\"
+      docker restart $CONTAINER >/dev/null 2>&1 || true
+      # Readiness gate: a bare 'docker exec true' can pass while the runtime is
+      # still settling, so the SUBSEQUENT (heavier) worker exec races the restart
+      # and dies with 'container is not running' / OCI 'setns' errors (observed on
+      # c07-08 and e08-08 -> the peer never joins rendezvous -> master 600s
+      # TCPStore timeout). Require State.Running=true AND a successful probe, then
+      # a short settle, before considering the container ready.
+      for _w in \$(seq 1 30); do
+        [ \"\$(docker inspect -f '{{.State.Running}}' $CONTAINER 2>/dev/null)\" = \"true\" ] \
+          && docker exec $CONTAINER true >/dev/null 2>&1 && break
+        sleep 2
+      done
+      sleep 2
+      echo \"[\$(hostname)] container restarted (HBM reclaimed; running=\$(docker inspect -f '{{.State.Running}}' $CONTAINER 2>/dev/null))\"
+    fi
+  " 2>&1 | tee -a "$LOG"
+
+  # --- step 2: launch the worker (trainer) inside the container on every node -
+  echo "[$(date)] launching trainer (worker phase) on all nodes" | tee -a "$LOG"
+  srun --ntasks-per-node=1 bash -c "
+    # Pre-flight readiness gate (per node): step 1 ran in a SEPARATE srun, so the
+    # container can still be settling here. Wait for State.Running=true + a probe
+    # before the worker exec so we don't race a just-restarted container.
+    for _w in \$(seq 1 30); do
+      [ \"\$(docker inspect -f '{{.State.Running}}' $CONTAINER 2>/dev/null)\" = \"true\" ] \
+        && docker exec $CONTAINER true >/dev/null 2>&1 && break
+      [ \$_w -eq 1 ] && echo \"[\$(hostname)] worker pre-flight: waiting for container to be ready...\"
+      sleep 2
+    done
+    # Retry wrapper: docker exec startup failures (rc 125 daemon 'container is not
+    # running', 126/127 OCI/setns 'exec failed') mean the container wasn't ready,
+    # NOT that the trainer ran and failed. Restart + re-gate + retry a few times.
+    # Any OTHER rc (the trainer actually started and exited) is propagated so the
+    # supervisor's resume-from-checkpoint logic owns real failures.
+    _wattempt=0
+    while : ; do
+    _wattempt=\$((_wattempt+1))
+    docker exec \
+      -e LAUNCH_SLURM_PHASE=worker \
+      -e SCRATCH=$SCRATCH \
+      -e SLURM_NNODES=\$SLURM_NNODES \
+      -e SLURM_NODEID=\$SLURM_NODEID \
+      -e SLURM_PROCID=\$SLURM_PROCID \
+      -e SLURM_JOB_NODELIST=\"\$SLURM_JOB_NODELIST\" \
+      -e SLURM_JOB_ID=\$SLURM_JOB_ID \
+      -e MASTER_ADDR=$MASTER_ADDR \
+      -e MASTER_PORT=$MASTER_PORT \
+      -e HSTU_HAMMER_KERNEL=${HSTU_HAMMER_KERNEL:-TRITON} \
+      -e MODE=$MODE \
+      -e START_TS=$START_TS \
+      -e NUM_TRAIN_TS=$NUM_TRAIN_TS \
+      -e EVAL_EACH_WINDOW=$EVAL_EACH_WINDOW \
+      -e EVAL_EVERY_N_WINDOWS=$EVAL_EVERY_N_WINDOWS \
+      ${EVAL_EVERY_DATA_PCT:+-e EVAL_EVERY_DATA_PCT=$EVAL_EVERY_DATA_PCT} \
+      -e NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES \
+      -e NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES \
+      -e METRIC_LOG_FREQ=$METRIC_LOG_FREQ \
+      ${MLPERF_LOGGING:+-e MLPERF_LOGGING=$MLPERF_LOGGING} \
+      ${MLPERF_TRAIN_LOSS_LOG_FREQ:+-e MLPERF_TRAIN_LOSS_LOG_FREQ=$MLPERF_TRAIN_LOSS_LOG_FREQ} \
+      ${STREAMING_SHUFFLE_FRACTION:+-e STREAMING_SHUFFLE_FRACTION=$STREAMING_SHUFFLE_FRACTION} \
+      ${STREAMING_SHUFFLE_SEED:+-e STREAMING_SHUFFLE_SEED=$STREAMING_SHUFFLE_SEED} \
+      ${NUM_WORKERS:+-e NUM_WORKERS=$NUM_WORKERS} \
+      ${PREFETCH_FACTOR:+-e PREFETCH_FACTOR=$PREFETCH_FACTOR} \
+      ${DIAG_UNIQUE_EMB:+-e DIAG_UNIQUE_EMB=$DIAG_UNIQUE_EMB} \
+      ${DIAG_EMB_STEPS:+-e DIAG_EMB_STEPS=$DIAG_EMB_STEPS} \
+      ${OUTPUT_TRACE:+-e OUTPUT_TRACE=$OUTPUT_TRACE} \
+      ${MIN_HISTORY:+-e MIN_HISTORY=$MIN_HISTORY} \
+      ${SEED:+-e SEED=$SEED} \
+      ${DENSE_LR:+-e DENSE_LR=$DENSE_LR} \
+      ${SPARSE_LR:+-e SPARSE_LR=$SPARSE_LR} \
+      ${GRAD_CLIP_NORM:+-e GRAD_CLIP_NORM=$GRAD_CLIP_NORM} \
+      ${HSTU_NUM_LAYERS:+-e HSTU_NUM_LAYERS=$HSTU_NUM_LAYERS} \
+      ${MAX_SEQ_LEN:+-e MAX_SEQ_LEN=$MAX_SEQ_LEN} \
+      ${HISTORY_LENGTH:+-e HISTORY_LENGTH=$HISTORY_LENGTH} \
+      ${BATCH_SIZE:+-e BATCH_SIZE=$BATCH_SIZE} \
+      ${CKPT_TIME_INTERVAL_S:+-e CKPT_TIME_INTERVAL_S=$CKPT_TIME_INTERVAL_S} \
+      ${KEEP_LAST_N:+-e KEEP_LAST_N=$KEEP_LAST_N} \
+      ${IN_WINDOW_CKPT_FREQ:+-e IN_WINDOW_CKPT_FREQ=$IN_WINDOW_CKPT_FREQ} \
+      ${CKPT_STEP_FREQ:+-e CKPT_STEP_FREQ=$CKPT_STEP_FREQ} \
+      -e TRAIN_SPLIT_PERCENTAGE=${TRAIN_SPLIT_PERCENTAGE:-1.0} \
+      -e AUC_THRESHOLD=${AUC_THRESHOLD:-1.0} \
+      ${MLPERF_SUBMISSION_PLATFORM:+-e MLPERF_SUBMISSION_PLATFORM=$MLPERF_SUBMISSION_PLATFORM} \
+      -e SPLIT_SALT=${SPLIT_SALT:-0} \
+      -e EVAL_HOLDOUT_TS=${EVAL_HOLDOUT_TS:--1} \
+      -e EVAL_HOLDOUT_NUM_WINDOWS=${EVAL_HOLDOUT_NUM_WINDOWS:-1} \
+      ${WORKER_CMD:+-e WORKER_CMD=\"$WORKER_CMD\"} \
+      ${RUN_NAME:+-e RUN_NAME=$RUN_NAME} \
+      ${TENSORBOARD_LOG_PATH:+-e TENSORBOARD_LOG_PATH=$TENSORBOARD_LOG_PATH} \
+      ${MLPERF_LOG_PATH:+-e MLPERF_LOG_PATH=$MLPERF_LOG_PATH} \
+      ${CKPT_PATH:+-e CKPT_PATH=$CKPT_PATH} \
+      ${SPARSE_A2A_FWD:+-e SPARSE_A2A_FWD=$SPARSE_A2A_FWD} \
+      ${SPARSE_A2A_BWD:+-e SPARSE_A2A_BWD=$SPARSE_A2A_BWD} \
+      -e LOG=$LOG \
+      $NCCL_ENV_ARGS \
+      $CONTAINER bash -lc 'cd $REPO && LAUNCH_SLURM_PHASE=worker bash scripts/launch_slurm.sh'
+    _wrc=\$?
+    if { [ \$_wrc -eq 125 ] || [ \$_wrc -eq 126 ] || [ \$_wrc -eq 127 ]; } && [ \$_wattempt -lt 5 ]; then
+      echo \"[\$(hostname)] worker exec failed to START (rc=\$_wrc, attempt \$_wattempt/5) — container not ready; restarting + retrying\"
+      docker restart $CONTAINER >/dev/null 2>&1 || true
+      for _w in \$(seq 1 30); do
+        [ \"\$(docker inspect -f '{{.State.Running}}' $CONTAINER 2>/dev/null)\" = \"true\" ] \
+          && docker exec $CONTAINER true >/dev/null 2>&1 && break
+        sleep 2
+      done
+      sleep 3
+      continue
+    fi
+    exit \$_wrc
+    done
+  " 2>&1 | tee -a "$LOG"
+  rc=${PIPESTATUS[0]}
+  echo "[$(date)] launch_slurm/orchestrate finished rc=$rc" | tee -a "$LOG"
+  exit $rc
+}
+
+# =============================================================================
+# PHASE: provision  (compute-node host)
+# =============================================================================
+provision() {
+  export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:${PATH:-}"
+  DOCKER=$(command -v docker 2>/dev/null || true); DOCKER=${DOCKER:-/usr/bin/docker}
+  FBGEMM_WHL=${FBGEMM_WHL:-/apps/chcai/FBGEMM/fbgemm_gpu/dist/fbgemm_gpu_nightly_rocm-2026.6.2-cp312-cp312-linux_x86_64.whl}  # [CLUSTER-SPECIFIC] gfx950/ROCm wheel
+  TORCH_IDX=${TORCH_IDX:-https://download.pytorch.org/whl/rocm7.2}  # [CLUSTER-SPECIFIC] ROCm version index
+  echo "[provision] host=$(hostname) container=$CONTAINER docker=$DOCKER"
+
+  # Resolve which image to run + whether deps must be installed. Prefer a pre-baked
+  # image (deps already installed) to skip the multi-GB torch download + pip /
+  # torchrec-from-git build on every fresh node:
+  #   1) baked image in this node's docker -> use it, skip deps
+  #   2) baked image tar on NFS            -> docker load (local, no internet)
+  #   3) neither                           -> base image + pip (slow path, which
+  #                                           can then be baked via BAKE_IMAGE=1)
+  NEED_DEPS=1
+  RUN_IMAGE="$IMAGE"
+  if [ "$USE_BAKED" = "1" ]; then
+    if "$DOCKER" image inspect "$BAKED_IMAGE" >/dev/null 2>&1; then
+      echo "[provision] using baked image $BAKED_IMAGE (deps preinstalled, no download)"
+      RUN_IMAGE="$BAKED_IMAGE"; NEED_DEPS=0
+    elif [ -f "$BAKED_TAR" ]; then
+      echo "[provision] loading baked image from $BAKED_TAR (local, no internet)..."
+      if "$DOCKER" load -i "$BAKED_TAR" >/dev/null 2>&1 && "$DOCKER" image inspect "$BAKED_IMAGE" >/dev/null 2>&1; then
+        RUN_IMAGE="$BAKED_IMAGE"; NEED_DEPS=0; echo "[provision] baked image loaded"
+      else
+        echo "[provision] WARNING: docker load failed; falling back to base-image + pip"
+      fi
+    fi
+  fi
+  if ! "$DOCKER" image inspect "$RUN_IMAGE" >/dev/null 2>&1; then
+    echo "[provision] pulling $RUN_IMAGE (this can take a while)..."; "$DOCKER" pull "$RUN_IMAGE"
+  fi
+
+  echo "[provision] (re)starting container $CONTAINER from $RUN_IMAGE"
+  "$DOCKER" rm -f "$CONTAINER" >/dev/null 2>&1 || true
+  "$DOCKER" run -d --name "$CONTAINER" \
+    --network=host --ipc=host --shm-size=64g \
+    --device=/dev/kfd --device=/dev/dri --group-add video \
+    `# [CLUSTER-SPECIFIC] AMD ROCm device passthrough; NVIDIA uses --gpus all / nvidia runtime` \
+    --cap-add=SYS_PTRACE --cap-add=CAP_SYS_ADMIN --cap-add=IPC_LOCK \
+    --ulimit memlock=-1:-1 --ulimit stack=67108864:67108864 \
+    `# memlock=-1 is REQUIRED for RDMA QP memory registration — do not drop` \
+    --security-opt seccomp=unconfined --privileged \
+    -v "$REPO_MOUNT:$REPO_MOUNT" \
+    -v "$DATA_MOUNT:$DATA_MOUNT" \
+    `# shared-NFS bind mounts: repo home (REPO_MOUNT, rw) + dataset/build assets (DATA_MOUNT)` \
+    -w "$REPO" \
+    "$RUN_IMAGE" sleep infinity
+
+  # --- RDMA userspace overlay for in-container RCCL (bnxt_re) -----------------
+  # The image (rocm/primus, rdma-core 50/libbnxt_re-rdmav34) ships an OLDER RDMA
+  # userspace than the host kernel bnxt_re driver. The stock v34 provider faults
+  # RCCL's deep-queue create_qp (max_send_wr=256) against the newer kernel uapi
+  # -> "ibv_create_qp ... Bad address". Fix: stage the host's matched rdma-core
+  # (libibverbs v61 + libbnxt_re-rdmav59 + libnl) on NFS so the worker phase makes
+  # RCCL load it via LD_PRELOAD + LD_LIBRARY_PATH. The UNVERSIONED libibverbs.so
+  # symlink is essential (import torch pulls the unversioned soname; without it
+  # the lookup falls through to the container v34 lib and the fix regresses).
+  if [ "${FORCE_OVERLAY:-0}" != "1" ] && ls "$OVERLAY/lib/libibverbs/"libbnxt_re-rdmav*.so >/dev/null 2>&1 && [ -L "$OVERLAY/lib/libibverbs.so" ]; then
+    echo "[provision] host RDMA overlay already staged at $OVERLAY (shared NFS) — skipping"
+  else
+    echo "[provision] staging host RDMA userspace overlay -> $OVERLAY"
+    rm -rf "${OVERLAY}.tmp" 2>/dev/null
+    mkdir -p "${OVERLAY}.tmp/lib/libibverbs" "${OVERLAY}.tmp/libibverbs.d"
+    cp -L /usr/lib64/libibverbs.so.1 /usr/lib64/libnl-3.so.200 /usr/lib64/libnl-route-3.so.200 "${OVERLAY}.tmp/lib/" 2>/dev/null || true
+    ln -sf libibverbs.so.1 "${OVERLAY}.tmp/lib/libibverbs.so"
+    cp -L /usr/lib64/libibverbs/*.so "${OVERLAY}.tmp/lib/libibverbs/" 2>/dev/null || true
+    cp /etc/libibverbs.d/*.driver "${OVERLAY}.tmp/libibverbs.d/" 2>/dev/null || true
+    if ls "${OVERLAY}.tmp/lib/libibverbs/"libbnxt_re-rdmav*.so >/dev/null 2>&1; then
+      rm -rf "$OVERLAY" 2>/dev/null
+      mv "${OVERLAY}.tmp" "$OVERLAY" 2>/dev/null || { mkdir -p "$OVERLAY"; cp -a "${OVERLAY}.tmp/." "$OVERLAY/"; }
+      echo "[provision] host RDMA overlay staged: $(ls "$OVERLAY/lib/libibverbs" | wc -l) providers + libibverbs.so symlink"
+    else
+      echo "[provision] WARNING: host bnxt_re provider not found at /usr/lib64/libibverbs — multi-node RDMA will fail 'Bad address'; use NCCL_NET_TRANSPORT=socket"
+    fi
+  fi
+
+  if [ "$NEED_DEPS" = "0" ]; then
+    echo "[provision] baked image — deps preinstalled; verifying imports only"
+    "$DOCKER" exec "$CONTAINER" bash -lc '
+python -c "import torch, fbgemm_gpu, torchrec, polars, xxhash, gin; print(\"imports OK,\", torch.__version__, torch.version.hip, torch.cuda.device_count(), \"gpus\")"
+' || echo "[provision] WARNING: baked-image import smoke failed"
+  else
+    echo "[provision] installing recipe deps (base image, slow path)"
+    # Install misc deps FIRST, then pin the rocm torch stack + fbgemm + torchrec
+    # LAST with --no-deps so nothing pulls a CUDA torch over the rocm build.
+    "$DOCKER" exec "$CONTAINER" bash -lc '
+set -e
+echo "=== native torch ==="; python -c "import torch;print(torch.__version__)" || true
+echo "=== misc python deps ==="
+pip install --no-cache-dir polars-u64-idx pyarrow pyyaml tqdm psutil numba xxhash gin-config \
+  absl-py pandas tensorboard torchmetrics tensordict pyre-extensions iopath typing-inspect 2>&1 | tail -3 || true
+echo "=== rocm torch stack (force, no-deps, LAST) ==="
+pip install --force-reinstall --no-deps --index-url '"$TORCH_IDX"' \
+  torch==2.12.0+rocm7.2 torchvision==0.27.0+rocm7.2 torchaudio==2.11.0+rocm7.2
+echo "=== fbgemm (local gfx950 wheel) ==="
+pip install --force-reinstall --no-deps '"$FBGEMM_WHL"'
+echo "=== torchrec v2026.06.01.00 (force, no-deps) ==="
+pip install --force-reinstall --no-deps "git+https://github.com/pytorch/torchrec.git@v2026.06.01.00"
+echo "=== import smoke ==="
+python -c "import torch, fbgemm_gpu, torchrec, polars, xxhash, gin; print(\"imports OK,\", torch.__version__, torch.version.hip, torch.cuda.device_count(), \"gpus\")"
+'
+  fi
+
+  # --- one-time bake: snapshot the deps-installed container into a reusable image
+  # and save it to NFS so future nodes skip the download/pip path entirely.
+  if [ "${BAKE_IMAGE:-0}" = "1" ]; then
+    echo "[provision] baking: docker commit $CONTAINER -> $BAKED_IMAGE"
+    if "$DOCKER" commit "$CONTAINER" "$BAKED_IMAGE" >/dev/null; then
+      echo "[provision] saving $BAKED_IMAGE -> $BAKED_TAR (one-time, tens of GB)"
+      if "$DOCKER" save "$BAKED_IMAGE" -o "${BAKED_TAR}.tmp.$$" && mv -f "${BAKED_TAR}.tmp.$$" "$BAKED_TAR"; then
+        echo "[provision] bake done: $(ls -lh "$BAKED_TAR" 2>/dev/null | awk '{print $5}')"
+      else
+        echo "[provision] WARNING: docker save failed"; rm -f "${BAKED_TAR}.tmp.$$" 2>/dev/null
+      fi
+    else
+      echo "[provision] WARNING: docker commit failed"
+    fi
+  fi
+  echo "[provision] DONE"
+}
+
+# =============================================================================
+# PHASE: worker  (inside the container)
+# =============================================================================
+worker() {
+  cd "$REPO_ROOT"
+  mkdir -p "$SCRATCH" 2>/dev/null || true
+  LOG=${LOG:-$SCRATCH/yambda_5b_8gpu.log}
+  # Append (not truncate): under the streaming-e2e supervisor a run may relaunch
+  # many times into the SAME $LOG; the supervisor initializes it once at run start.
+  # MLPerf compliance log (rank 0 writes it). Per-job filename so each standalone
+  # sbatch gets a clean log; the e2e supervisor pins MLPERF_LOG_PATH itself.
+  export MLPERF_LOG_PATH=${MLPERF_LOG_PATH:-$SCRATCH/mlperf/yambda_5b_mlperf.${SLURM_JOB_ID:-manual}.log}
+  echo "[$(date)] REPO_ROOT=$REPO_ROOT" | tee -a "$LOG"
+
+  # polars-u64-idx (NOT stock polars) — yambda parquet's flat-explode overruns
+  # 32-bit row index. Reserved node has no outbound DNS, so install from a
+  # pre-staged tarball under /apps/chcai/. Override PIP_LOCAL_TGZ for other hosts.
+  PIP_LOCAL_TGZ=${PIP_LOCAL_TGZ:-/apps/chcai/pip_local_yambda.tgz}   # [CLUSTER-SPECIFIC] shared-NFS path
+  PIP_LOCAL_DIR=${PIP_LOCAL_DIR:-/tmp/pip_local}
+  if [ ! -f "$PIP_LOCAL_DIR/lib/python3.12/site-packages/polars/__init__.py" ]; then
+    rm -rf "$PIP_LOCAL_DIR"
+    mkdir -p "$PIP_LOCAL_DIR" && tar xzf "$PIP_LOCAL_TGZ" -C "$(dirname "$PIP_LOCAL_DIR")" 2>&1 | tail -3 | tee -a "$LOG"
+  fi
+
+  export PYTHONPATH="$PIP_LOCAL_DIR/lib/python3.12/site-packages:$REPO_ROOT:${PYTHONPATH:-}"
+  export HOME=${HOME:-/tmp}
+  echo "[$(date)] PYTHONPATH=$PYTHONPATH" | tee -a "$LOG"
+  python -c "import torch, fbgemm_gpu, torchrec, polars, xxhash, gin; print('imports OK,', torch.__version__, torch.cuda.device_count(),'gpus')" 2>&1 | tee -a "$LOG"
+
+  export HIP_VISIBLE_DEVICES=${HIP_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}
+  export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}
+
+  # --- distributed topology ---------------------------------------------------
+  GPUS_PER_NODE=$(python -c "import torch; print(torch.cuda.device_count())")
+  # Multi-node when launched one-task-per-node under SLURM (SLURM_NNODES>1);
+  # otherwise fall through to legacy single-node defaults (localhost, node_rank 0).
+  if [ "${SLURM_NNODES:-1}" -gt 1 ] && [ -n "${SLURM_JOB_NODELIST:-}" ]; then
+    NNODES=${SLURM_NNODES}
+    NODE_RANK=${SLURM_NODEID:-${SLURM_PROCID:-0}}
+    # PREFER a MASTER_ADDR/PORT forwarded from the orchestrate phase (resolved on
+    # the host, which has scontrol); the container image carries no SLURM client.
+    if [ -z "${MASTER_ADDR:-}" ]; then
+      MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" 2>/dev/null | head -1)
+      MASTER_ADDR=${MASTER_ADDR:-localhost}
+    fi
+    MASTER_PORT=${MASTER_PORT:-$(( 20000 + ${SLURM_JOB_ID:-0} % 20000 ))}
+  else
+    NNODES=${NNODES:-1}
+    NODE_RANK=${NODE_RANK:-0}
+    # Single-node: all ranks live on THIS host, so rendezvous over loopback and
+    # do NOT use the SLURM hostname. On some nodes the hostname resolves to a
+    # non-routable per-GPU RoCE /31 (benic 192.168.x) address; using it makes the
+    # NCCL bootstrap fail with "No route to host". localhost is node-independent.
+    MASTER_ADDR=localhost
+    MASTER_PORT=${MASTER_PORT:-}     # empty => train_ranker picks a free port
+  fi
+  export NNODES NODE_RANK GPUS_PER_NODE MASTER_ADDR MASTER_PORT
+  export WORLD_SIZE=$(( NNODES * GPUS_PER_NODE ))
+  echo "[$(date)] topology: nnodes=$NNODES node_rank=$NODE_RANK gpus_per_node=$GPUS_PER_NODE world_size=$WORLD_SIZE master=$MASTER_ADDR:${MASTER_PORT:-<auto>}" | tee -a "$LOG"
+
+  # NCCL bootstrap NIC — pin for BOTH single- and multi-node. The container is
+  # --network=host so RCCL sees ALL host interfaces; if left to auto-detect, NCCL
+  # can pick a non-routable per-GPU RoCE /31 (benic* 192.168.x) link and fail
+  # bootstrap with "No route to host" (this is node-dependent: it happened to
+  # work on some nodes and not others, causing repetitive single-node init
+  # failures). Pinning the routable host NIC fixes it everywhere.
+  # [CLUSTER-SPECIFIC] routable host NIC for TCP bootstrap (find via `ip -br addr`).
+  export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-fenic0}
+
+  # Multi-node additionally needs the RDMA data-plane (bnxt_re HCAs) configured;
+  # single-node uses intra-node P2P (XGMI/PCIe) so only the bootstrap NIC matters.
+  if [ "$NNODES" -gt 1 ]; then
+    NCCL_NET_TRANSPORT=${NCCL_NET_TRANSPORT:-ib}
+    if [ "$NCCL_NET_TRANSPORT" = "socket" ]; then
+      export NCCL_IB_DISABLE=1
+      echo "[$(date)] NCCL: IB disabled — allreduce over TCP (fenic0). Functional, not RDMA-fast." | tee -a "$LOG"
+    else
+      # bnxt_re userspace provider ABI overlay (REQUIRED for RCCL). The stock v34
+      # provider faults RCCL's create_qp (256 WRs) against the host kernel uapi
+      # ("Bad address"); the host v61/v59 set staged by the provision phase works.
+      # The libibverbs.so (UNVERSIONED) symlink + LD_PRELOAD are both required so
+      # the torch process maps ONLY the host lib (see provision phase comment).
+      if [ -e "$OVERLAY/lib/libibverbs.so.1" ]; then
+        [ -e "$OVERLAY/lib/libibverbs.so" ] || ln -sf libibverbs.so.1 "$OVERLAY/lib/libibverbs.so" 2>/dev/null || true
+        export LD_LIBRARY_PATH="$OVERLAY/lib:$OVERLAY/lib/libibverbs:${LD_LIBRARY_PATH:-}"
+        export LD_PRELOAD="$OVERLAY/lib/libibverbs.so.1${LD_PRELOAD:+:$LD_PRELOAD}"
+        echo "[$(date)] NCCL: bnxt_re provider overlay -> $OVERLAY (host rdma-core v61/v59; symlink+LD_PRELOAD so RCCL binds the host lib for QP creation)" | tee -a "$LOG"
+      else
+        echo "[$(date)] WARNING: RDMA overlay $OVERLAY missing — RCCL QP creation will fail 'Bad address' on stock v34 provider; set RDMA_OVERLAY or use NCCL_NET_TRANSPORT=socket" | tee -a "$LOG"
+      fi
+      # MINIMAL bnxt_re set PROVEN on these meta64 cv350 nodes (cmcknigh RCCL
+      # benchmarks + confirmed e2e here). NCCL_IB_TC=104 (RoCE lossless PFC class)
+      # is required; do NOT add the ionic-AINIC QPS/ECE/DMABUF block.
+      # [CLUSTER-SPECIFIC] RDMA HCA names (`ibv_devices`); other vendors => mlx5_*/ionic_*
+      export NCCL_IB_HCA=${NCCL_IB_HCA:-bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7}
+      export NCCL_IB_GID_INDEX=${NCCL_IB_GID_INDEX:-3}    # [CLUSTER-SPECIFIC] RoCEv2 IPv4 GID idx (`show_gids`)
+      export NCCL_IB_TC=${NCCL_IB_TC:-104}                # [CLUSTER-SPECIFIC] RoCE lossless/PFC traffic class
+      export NCCL_IB_TIMEOUT=${NCCL_IB_TIMEOUT:-14}
+      export NCCL_IGNORE_CPU_AFFINITY=${NCCL_IGNORE_CPU_AFFINITY:-1}
+      export RCCL_MSCCL_ENABLE=${RCCL_MSCCL_ENABLE:-0}
+      # GPU-Direct RDMA: ENABLED by default. The brcmrdma host kernel ships the
+      # inbox peer-memory client (`ib_register_peer_memory_client` in
+      # /proc/kallsyms), so RCCL does true GPU<->NIC DMA over bnxt_re instead of
+      # bouncing through host memory. Measured ~+22% throughput at 2 nodes
+      # (65.7%->79.8% weak-scaling efficiency) vs the old host-staged path.
+      # GDR_LEVEL=5 (most permissive) is required so GDR is used even when the GPU
+      # and NIC cross the CPU root complex. NCCL_DMABUF_ENABLE=1 is a harmless
+      # no-op here (kernel lacks CONFIG_DMABUF_MOVE_NOTIFY/CONFIG_PCI_P2PDMA, so
+      # peermem carries it). Enabling is non-fatal: if peermem is ever absent RCCL
+      # just logs "GDR 0" and falls back to host staging. Override with
+      # NCCL_NET_GDR_LEVEL=0 to force the legacy host-staged path.
+      export NCCL_NET_GDR_LEVEL=${NCCL_NET_GDR_LEVEL:-5}
+      export NCCL_DMABUF_ENABLE=${NCCL_DMABUF_ENABLE:-1}
+      echo "[$(date)] NCCL: RDMA over bnxt_re (GID idx ${NCCL_IB_GID_INDEX}, TC ${NCCL_IB_TC}, GDR_LEVEL=${NCCL_NET_GDR_LEVEL}, DMABUF=${NCCL_DMABUF_ENABLE}; meta64 bnxt_re config, validated)" | tee -a "$LOG"
+    fi
+  fi
+  export NCCL_DEBUG=${NCCL_DEBUG:-WARN}
+  export HSTU_HAMMER_KERNEL=${HSTU_HAMMER_KERNEL:-}
+  export PYTORCH_CUDA_ALLOC_CONF=${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}
+
+  # --- GPU clock sanity guard -------------------------------------------------
+  # A leftover perf_determinism cap (half clock) silently slows every kernel ~1.9x.
+  # Log the perf level + a live sclk sample and try to restore boost (non-fatal).
+  if command -v rocm-smi >/dev/null 2>&1; then
+    echo "[$(date)] GPU perf-level check:" | tee -a "$LOG"
+    rocm-smi --showperflevel 2>/dev/null | grep -iE "GPU\[[0-9]+\]" | tee -a "$LOG" || true
+    if rocm-smi --showperflevel 2>/dev/null | grep -iqE "Performance Level: *(perf_determinism|manual|low)"; then
+      echo "[$(date)] WARNING: GPUs not in 'auto' perf level — attempting --setperflevel auto" | tee -a "$LOG"
+      rocm-smi --setperflevel auto 2>/dev/null | grep -iE "set to auto" | tee -a "$LOG" \
+        || echo "[$(date)] WARNING: could not set perf level (no permission?). Run 'rocm-smi --setperflevel auto' on the HOST before benchmarking — clocks may be capped." | tee -a "$LOG"
+    fi
+    echo "[$(date)] sclk sample (GPU0):$(rocm-smi -d 0 --showclocks 2>/dev/null | grep -i 'sclk clock level' | sed -E 's/.*sclk clock level//')" | tee -a "$LOG" || true
+  fi
+
+  # --- stray-trainer / leaked-VRAM guard -------------------------------------
+  # The trainer runs via `docker exec` into a long-lived container, so its procs
+  # live in the container PID namespace, NOT the SLURM job cgroup. If a prior job
+  # OOM'd/crashed, a rank can leak and keep holding ~half of every GPU's VRAM,
+  # which persists across jobs (container survives) and guarantees the next
+  # attempt OOMs. Before launching, reap any pre-existing trainer procs (there
+  # should be none at this point) and wait for VRAM to drain. [g]-guard avoids
+  # self-match. Non-fatal.
+  if pgrep -f '[g]enerative_recommenders' >/dev/null 2>&1; then
+    echo "[$(date)] WARNING: leaked trainer procs found pre-launch — killing." | tee -a "$LOG"
+    pkill -9 -f '[g]enerative_recommenders' 2>/dev/null || true
+    for _i in $(seq 1 15); do
+      pgrep -f '[g]enerative_recommenders' >/dev/null 2>&1 || break
+      sleep 2
+    done
+    sleep 5  # let the driver release VRAM after process exit
+    if command -v rocm-smi >/dev/null 2>&1; then
+      echo "[$(date)] post-cleanup GPU0 used GiB:$(rocm-smi --showmeminfo vram 2>/dev/null | awk -F: '/Used/{printf " %.0f", $3/1073741824; exit}')" | tee -a "$LOG"
+    fi
+  fi
+
+  # WORKER_CMD override: run an arbitrary in-container command (e.g. an a2a/RCCL
+  # micro-benchmark) instead of the trainer, REUSING all the NCCL/RDMA/topology
+  # setup above so it exercises the exact transport the trainer uses. The
+  # supervisor never sets WORKER_CMD, so the training path is unchanged.
+  if [ -n "${WORKER_CMD:-}" ]; then
+    echo "[$(date)] WORKER_CMD override (WORLD_SIZE=$WORLD_SIZE): $WORKER_CMD" | tee -a "$LOG"
+    bash -lc "cd $REPO_ROOT && $WORKER_CMD" 2>&1 | tee -a "$LOG"
+    return
+  fi
+
+  echo "[$(date)] launching train_ranker with WORLD_SIZE=$WORLD_SIZE" | tee -a "$LOG"
+  python -m generative_recommenders.dlrm_v3.train.train_ranker \
+      --dataset yambda-5b --mode "${MODE:-streaming-train-eval}" 2>&1 | tee -a "$LOG"
+}
+
+# ---- dispatch ---------------------------------------------------------------
+case "$PHASE" in
+  orchestrate) orchestrate ;;
+  provision)   provision ;;
+  worker)      worker ;;
+  *) echo "launch_slurm.sh: unknown LAUNCH_SLURM_PHASE='$PHASE'" >&2; exit 2 ;;
+esac
diff --git a/recommendation_v4/scripts/run_streaming_e2e.sh b/recommendation_v4/scripts/run_streaming_e2e.sh
new file mode 100755
index 000000000..c913bac7b
--- /dev/null
+++ b/recommendation_v4/scripts/run_streaming_e2e.sh
@@ -0,0 +1,295 @@
+#!/bin/bash
+# =============================================================================
+# run_streaming_e2e.sh — self-healing supervisor for a yambda-5b streaming
+#   train+eval run (sbatch-job level). Works for 1..N nodes.
+# =============================================================================
+#
+# WHAT IT SUPERVISES
+#   The run is an `sbatch [--nodes=N] scripts/launch_slurm.sh` BATCH job. That
+#   batch script is fully self-contained: it runs orchestrate -> provision
+#   (container + RDMA) -> worker (in-container trainer) on EVERY node, so it
+#   handles single-node (--nodes=1) and multi-node (world_size=8N) identically.
+#   This supervisor wraps THAT job: it monitors it and, on crash / node-failure
+#   / hang, RESUBMITS it (which resumes from the latest checkpoint via load
+#   auto-latest), bounded by --max-relaunch. There is no docker-exec lifecycle
+#   or in-place node failover here — node replacement is SLURM's job on resubmit.
+#
+# RESUME MODEL (why a resubmit "just works")
+#   The trainer checkpoints to $CKPT_PATH and on startup load_dmp_checkpoint
+#   auto-resolves to the highest-numbered subdir, restoring model+optimizer+RNG
+#   and skipping already-trained batches of a partial window. So resubmitting the
+#   SAME submit-script (same CKPT_PATH/LOG) continues from where it died.
+#   Resubmits set APPEND_LOG=1 so the metrics log is preserved across attempts.
+#
+# WHAT IT DETECTS (poll every --poll-s)
+#   * job left the queue  -> read sacct State/ExitCode:
+#       COMPLETED+0   => run finished (success, exit 0)
+#       CANCELLED     => user intent (stop, exit 0 — NOT our place to resubmit)
+#       FAILED/NODE_FAIL/TIMEOUT/OUT_OF_MEMORY/BOOT_FAIL/PREEMPTED => relaunch
+#   * hang watchdog: job RUNNING but LOG frozen >= --stall-s AND no trainer
+#       process alive on ANY node (cross-node pgrep) => scancel + relaunch.
+#   * disk guard before each (re)submit: require --min-free-gib on the ckpt vol.
+#
+# WHERE IT RUNS
+#   On the SLURM head node (NFS-mounted /home/chcai code + /apps/chcai
+#   ckpts/logs are visible here for squeue/sacct/df and the cross-node pgrep).
+#
+# USAGE
+#   # Submit a fresh job from the launch script, then supervise it:
+#   nohup bash scripts/run_streaming_e2e.sh \
+#       --submit-script /apps/chcai/yambda_5b_e2e/<run>/launch_1node.sh \
+#       --log         /apps/chcai/yambda_5b_e2e/<run>/<run>.log \
+#       --ckpt-path   /apps/chcai/yambda_5b_e2e/<run>/ckpts \
+#       --run-name    <run> \
+#       > /apps/chcai/yambda_5b_e2e/<run>/<run>.supervisor.console.log 2>&1 &
+#
+#   # Adopt an already-submitted job instead of submitting a new one:
+#   nohup bash scripts/run_streaming_e2e.sh --jobid 13235 \
+#       --submit-script .../launch_2node.sh --log .../run.log \
+#       --ckpt-path .../ckpts --run-name <run> > .../console.log 2>&1 &
+#
+#   The node count, partition, and reservation all live in the --submit-script's
+#   sbatch line (launch_1node.sh / launch_2node.sh / ...), not here.
+#
+# EXIT CODES
+#   0  run completed (COMPLETED+0) or user-cancelled
+#   1  exhausted --max-relaunch without completion (or submit failed)
+#   3  disk guard tripped
+# =============================================================================
+set -uo pipefail
+
+JOBID=""                       # adopt this job; empty => submit fresh
+SUBMIT_SCRIPT=""
+LOG=""
+CKPT_PATH=""
+RUN_NAME="yambda_5b_e2e"
+CONTAINER=yambda_primus
+MAX_RELAUNCH=50
+MIN_FREE_GIB=1200
+STALL_S=2400                   # 40 min: comfortably exceeds a full-holdout eval
+                               # window + a blocking ckpt save; only trips when
+                               # the log is frozen AND no trainer proc is alive.
+POLL_S=30
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --jobid) JOBID="$2"; shift 2;;
+        --submit-script) SUBMIT_SCRIPT="$2"; shift 2;;
+        --log) LOG="$2"; shift 2;;
+        --ckpt-path) CKPT_PATH="$2"; shift 2;;
+        --run-name) RUN_NAME="$2"; shift 2;;
+        --container) CONTAINER="$2"; shift 2;;
+        --max-relaunch) MAX_RELAUNCH="$2"; shift 2;;
+        --min-free-gib) MIN_FREE_GIB="$2"; shift 2;;
+        --stall-s) STALL_S="$2"; shift 2;;
+        --poll-s) POLL_S="$2"; shift 2;;
+        *) echo "Unknown arg: $1"; exit 1;;
+    esac
+done
+
+[[ -n "$SUBMIT_SCRIPT" && -f "$SUBMIT_SCRIPT" ]] || { echo "FATAL: --submit-script required and must exist"; exit 1; }
+[[ -n "$LOG" ]] || { echo "FATAL: --log required"; exit 1; }
+
+SUP_LOG="${LOG%.log}.supervisor.log"
+sup() { echo "[$(date '+%F %T')] [supervisor] $*" | tee -a "$SUP_LOG"; }
+
+# Is the job in the queue right now (single read)?
+job_in_queue() { [[ -n "$(squeue -h -j "$1" -o '%T' 2>/dev/null | head -1)" ]]; }
+job_state()  { squeue -h -j "$1" -o '%T' 2>/dev/null | head -1; }
+
+# Is the job still active? squeue/the SLURM control plane can transiently return
+# empty during an NFS/controller blip even though the job is alive (this once
+# killed all supervisors at once: empty squeue -> sacct said RUNNING -> a bogus
+# "relaunch"). So a SINGLE empty read is not trusted: re-check a few times before
+# believing the job is really gone.
+job_active() {
+    job_in_queue "$1" && return 0
+    local k
+    for k in 1 2 3; do
+        sleep 10
+        job_in_queue "$1" && return 0
+    done
+    return 1
+}
+
+# Terminal State + ExitCode from accounting once the job has left the queue.
+job_final() { sacct -j "$1" -X -n -o State,ExitCode 2>/dev/null | head -1 | tr -s ' '; }
+
+# sacct/SLURM states that mean the job is STILL ALIVE (not terminal). If we see
+# one of these after the monitor loop exits, squeue lied (transient) — resume
+# monitoring instead of relaunching (which could spawn a DUPLICATE job).
+is_active_state() {
+    case "$1" in
+        RUNNING|PENDING|CONFIGURING|COMPLETING|REQUEUED|RESIZING|SUSPENDED|REQUEUE_HOLD|REQUEUE_FED|SIGNALING|STAGE_OUT) return 0;;
+        *) return 1;;
+    esac
+}
+
+# Any trainer process alive on ANY node of the allocation? (cross-node pgrep via
+# overlap srun into each node's container). [g]enerative self-match guard avoids
+# pgrep matching its own command line.
+trainer_alive() {
+    local jid="$1" n
+    n=$(timeout 70 srun --jobid="$jid" --overlap --ntasks-per-node=1 bash -c \
+        "docker exec $CONTAINER bash -lc 'set -f; pgrep -f [g]enerative_recommenders | wc -l' 2>/dev/null" 2>/dev/null \
+        | awk '{s+=$1} END{print s+0}')
+    [[ "${n:-0}" -gt 0 ]]
+}
+
+# Free GiB on the ckpt volume (NFS is mounted on this head node, so df locally).
+disk_free_gib() {
+    df -BG --output=avail "$CKPT_PATH" 2>/dev/null | tail -1 | tr -dc '0-9'
+}
+
+disk_guard() {
+    [[ -z "$CKPT_PATH" ]] && return 0
+    local free; free=$(disk_free_gib); free=${free:-0}
+    sup "disk guard: ${free} GiB free on $CKPT_PATH (min ${MIN_FREE_GIB})"
+    if (( free < MIN_FREE_GIB )); then
+        sup "FATAL: insufficient free space (${free} < ${MIN_FREE_GIB} GiB). Aborting."
+        return 1
+    fi
+    return 0
+}
+
+# Resubmit the run; resumes from latest checkpoint. APPEND_LOG=1 preserves the
+# metrics log. Echoes the new jobid.
+resubmit() {
+    local out newjid
+    out=$(APPEND_LOG=1 bash "$SUBMIT_SCRIPT" 2>&1)
+    newjid=$(echo "$out" | grep -oE 'Submitted batch job [0-9]+' | grep -oE '[0-9]+' | head -1)
+    echo "$out" | sed 's/^/    /' >> "$SUP_LOG"
+    echo "$newjid"
+}
+
+# Submit with retries+backoff. A transient NFS / control-plane error (e.g.
+# "sbatch: error: ... I/O error writing script/environment to file") must NOT
+# kill the supervisor — it leaves runs unsupervised / unlaunched. Echoes a jobid
+# on success, or empty after all retries.
+submit_retry() {
+    local cand sub_try
+    for sub_try in $(seq 1 12); do
+        cand=$(resubmit)
+        if [[ "$cand" =~ ^[0-9]+$ ]]; then echo "$cand"; return 0; fi
+        sup "submit attempt $sub_try/12 failed (transient sbatch/NFS error) — backing off."
+        sleep $(( sub_try < 5 ? 30 : 120 ))
+    done
+    return 1
+}
+
+sup "=== streaming e2e supervisor start ==="
+sup "run=$RUN_NAME submit=$SUBMIT_SCRIPT log=$LOG ckpt=$CKPT_PATH"
+sup "max_relaunch=$MAX_RELAUNCH min_free_gib=$MIN_FREE_GIB stall_s=$STALL_S poll_s=$POLL_S"
+
+attempt=0
+if [[ -z "$JOBID" ]]; then
+    if ! disk_guard; then exit 3; fi
+    sup "no --jobid given; submitting a fresh job"
+    JOBID=$(submit_retry)
+    [[ "$JOBID" =~ ^[0-9]+$ ]] || { sup "FATAL: could not submit after 12 retries — aborting."; exit 1; }
+fi
+attempt=1
+sup "supervising jobid=$JOBID (attempt $attempt/$MAX_RELAUNCH)"
+
+while (( attempt <= MAX_RELAUNCH )); do
+    # --- wait for the job to be schedulable / running ---
+    wait_pend=0
+    while job_active "$JOBID" && [[ "$(job_state "$JOBID")" != "RUNNING" ]]; do
+        (( wait_pend % 10 == 0 )) && sup "job $JOBID state=$(job_state "$JOBID") — waiting to run…"
+        sleep "$POLL_S"; wait_pend=$((wait_pend + 1))
+    done
+    [[ "$(job_state "$JOBID")" == "RUNNING" ]] && sup "job $JOBID RUNNING on $(squeue -h -j "$JOBID" -o '%N' 2>/dev/null | head -1)"
+
+    # --- monitor loop ---
+    last_size=0; stall_accum=0; hb=0; self_cancelled=0
+    while job_active "$JOBID"; do
+        st=$(job_state "$JOBID")
+        if [[ "$st" == "RUNNING" ]]; then
+            cur_size=$(stat -c %s "$LOG" 2>/dev/null || echo 0)
+            if [[ "$cur_size" == "$last_size" ]]; then
+                # frozen log: only count as a stall if no trainer proc is alive
+                # (a long eval / blocking save keeps the process up -> not a stall)
+                hb=$((hb + 1))
+                if (( hb % 4 == 0 )); then
+                    if trainer_alive "$JOBID"; then
+                        stall_accum=0
+                    else
+                        stall_accum=$((stall_accum + POLL_S * 4))
+                        sup "log frozen + no trainer alive (${stall_accum}s/${STALL_S}s)"
+                        if (( stall_accum >= STALL_S )); then
+                            sup "STALL: hung run — scancel $JOBID and relaunch."
+                            self_cancelled=1
+                            scancel "$JOBID" 2>/dev/null || true
+                            sleep 20
+                            break
+                        fi
+                    fi
+                fi
+            else
+                stall_accum=0; last_size=$cur_size
+            fi
+        fi
+        sleep "$POLL_S"
+    done
+
+    # --- job has left the queue (or we scancel'd it): decide ---
+    sleep 5
+    final=$(job_final "$JOBID")
+    state=$(echo "$final" | awk '{print $1}')
+    code=$(echo "$final" | awk '{print $2}')
+    sup "job $JOBID ended: state='${state:-?}' exit='${code:-?}'"
+
+    # The monitor loop only exits when squeue has been empty across several
+    # confirming reads. If accounting STILL reports an active state, the job is
+    # actually alive (squeue/control-plane blip) — resume monitoring rather than
+    # relaunching, which would create a duplicate job.
+    if is_active_state "$state"; then
+        sup "sacct reports still-active state '$state' — transient squeue blip; resuming monitoring (NOT relaunching)."
+        sleep "$POLL_S"
+        continue
+    fi
+
+    case "$state" in
+        COMPLETED)
+            if [[ "$code" == "0:0" ]]; then
+                sup "RUN COMPLETED CLEANLY on attempt $attempt."
+                sup "=== supervisor done (success) ==="
+                exit 0
+            fi
+            sup "COMPLETED but nonzero exit ($code) — relaunching."
+            ;;
+        CANCELLED*)
+            if (( self_cancelled )); then
+                sup "job CANCELLED by our own stall recovery — relaunching from latest checkpoint."
+            else
+                sup "job CANCELLED (user/admin intent) — NOT resubmitting. Stopping supervisor."
+                sup "=== supervisor done (cancelled) ==="
+                exit 0
+            fi
+            ;;
+        FAILED|NODE_FAIL|TIMEOUT|OUT_OF_MEMORY|BOOT_FAIL|PREEMPTED|"")
+            sup "failure state '${state:-unknown}' — will relaunch from latest checkpoint."
+            ;;
+        *)
+            sup "unrecognized terminal state '${state}' — relaunching to be safe."
+            ;;
+    esac
+
+    if (( attempt >= MAX_RELAUNCH )); then break; fi
+    if ! disk_guard; then exit 3; fi
+    sleep $(( attempt < 5 ? 20 : 60 ))   # small backoff
+    # Resubmit with retries. A transient NFS / control-plane error (e.g.
+    # "sbatch: error: Batch job submission failed: I/O error writing
+    # script/environment to file") must NOT kill the supervisor — that once
+    # left a live run permanently unsupervised. Retry with backoff first.
+    JOBID=$(submit_retry)
+    if ! [[ "$JOBID" =~ ^[0-9]+$ ]]; then
+        sup "FATAL: resubmit failed after 12 retries — aborting."; exit 1
+    fi
+    attempt=$((attempt + 1))
+    sup "relaunched as jobid=$JOBID (attempt $attempt/$MAX_RELAUNCH)"
+done
+
+sup "FATAL: exhausted MAX_RELAUNCH=$MAX_RELAUNCH without completion."
+sup "=== supervisor done (failure) ==="
+exit 1
diff --git a/recommendation_v4/scripts/run_streaming_e2e_local.sh b/recommendation_v4/scripts/run_streaming_e2e_local.sh
new file mode 100755
index 000000000..85a817c8c
--- /dev/null
+++ b/recommendation_v4/scripts/run_streaming_e2e_local.sh
@@ -0,0 +1,200 @@
+#!/bin/bash
+# =============================================================================
+# run_streaming_e2e_local.sh — self-healing supervisor for a SINGLE-HOST
+#   (NON-SLURM) yambda-5b streaming train+eval run. Local analog of
+#   scripts/run_streaming_e2e.sh (the SLURM/sbatch supervisor).
+#
+# WHAT IT SUPERVISES
+#   The "job" is one foreground run of --submit-script (default
+#   scripts/launch_e2e_local.sh), which `docker exec`s the trainer in the
+#   container. The supervisor runs that submit-script in the BACKGROUND so:
+#     * its host PID == liveness  (kill -0 / hang watchdog), and
+#     * `wait $PID` == the trainer's EXIT CODE (success vs. failure).
+#   On crash / nonzero-exit / hang it RELAUNCHES the same submit-script (same
+#   $CKPT_PATH/$LOG → resumes from the latest checkpoint), bounded by
+#   --max-relaunch. This is the SLURM supervisor's sacct/squeue/scancel control
+#   plane re-expressed with a local process + `docker exec` lifecycle.
+#
+# WHAT IT DETECTS (poll every --poll-s)
+#   * submit-script process exits -> read its exit code:
+#       0      => run finished cleanly (success)
+#       != 0   => crash/OOM/die_at_step(42)/etc. => relaunch from latest ckpt
+#   * hang watchdog: process alive but $LOG frozen >= --stall-s AND no trainer
+#       process alive in the container (pgrep via docker exec) => kill + pkill in
+#       container + relaunch. A long eval / blocking ckpt save keeps the trainer
+#       process up, so it is NOT counted as a stall.
+#   * disk guard before each (re)launch: require --min-free-gib on the ckpt vol.
+#
+# USAGE
+#   nohup bash scripts/run_streaming_e2e_local.sh \
+#       --submit-script scripts/launch_e2e_local.sh \
+#       --log       /home/chcai/yambda_5b_e2e/<run>/<run>.log \
+#       --ckpt-path /home/chcai/yambda_5b_e2e/<run>/ckpts \
+#       --run-name  <run> \
+#       > /home/chcai/yambda_5b_e2e/<run>/<run>.supervisor.console.log 2>&1 &
+#
+#   Per-run hyperparameters live in the --submit-script's env defaults (or are
+#   exported before invoking this supervisor), not here.
+#
+# EXIT CODES
+#   0  run completed cleanly
+#   1  exhausted --max-relaunch without completion (or launch failed)
+#   3  disk guard tripped
+# =============================================================================
+set -uo pipefail
+
+SUBMIT_SCRIPT="scripts/launch_e2e_local.sh"
+LOG=""
+CKPT_PATH=""
+RUN_NAME="yambda_5b_e2e_local"
+CONTAINER=${CONTAINER:-yambda_local}
+DOCKER=${DOCKER:-sudo docker}
+MAX_RELAUNCH=50
+MIN_FREE_GIB=700        # one full DMP ckpt (~600 GB) + headroom for the atomic
+                        # .tmp written beside the retained one during a save.
+STALL_S=2400            # 40 min frozen-log + no-trainer-proc => hung.
+POLL_S=30
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --submit-script) SUBMIT_SCRIPT="$2"; shift 2;;
+        --log) LOG="$2"; shift 2;;
+        --ckpt-path) CKPT_PATH="$2"; shift 2;;
+        --run-name) RUN_NAME="$2"; shift 2;;
+        --container) CONTAINER="$2"; shift 2;;
+        --docker) DOCKER="$2"; shift 2;;
+        --max-relaunch) MAX_RELAUNCH="$2"; shift 2;;
+        --min-free-gib) MIN_FREE_GIB="$2"; shift 2;;
+        --stall-s) STALL_S="$2"; shift 2;;
+        --poll-s) POLL_S="$2"; shift 2;;
+        *) echo "Unknown arg: $1"; exit 1;;
+    esac
+done
+
+[[ -n "$SUBMIT_SCRIPT" && -f "$SUBMIT_SCRIPT" ]] || { echo "FATAL: --submit-script required and must exist ($SUBMIT_SCRIPT)"; exit 1; }
+[[ -n "$LOG" ]] || { echo "FATAL: --log required"; exit 1; }
+
+SUP_LOG="${LOG%.log}.supervisor.log"
+# Create the log + ckpt dirs up front so the disk guard's df has a real path to
+# stat (df on a nonexistent dir returns 0 avail -> false "disk full" abort).
+mkdir -p "$(dirname "$SUP_LOG")"
+[[ -n "$CKPT_PATH" ]] && mkdir -p "$CKPT_PATH"
+sup() { echo "[$(date '+%F %T')] [supervisor] $*" | tee -a "$SUP_LOG"; }
+
+# Any trainer process alive in the container? [g]enerative self-match guard
+# avoids pgrep matching its own command line.
+trainer_alive() {
+    local n
+    n=$($DOCKER exec "$CONTAINER" bash -lc 'set -f; pgrep -f "[g]enerative_recommenders" | wc -l' 2>/dev/null | tr -dc '0-9')
+    [[ "${n:-0}" -gt 0 ]]
+}
+
+# Hard-kill any trainer processes left in the container AND wait for GPU HBM to
+# actually drain before returning. A rank stuck in a HIP/RCCL collective sits in
+# uninterruptible D-state and keeps its multi-hundred-GB embedding shard resident
+# for many seconds after SIGKILL; relaunching before that frees makes the next
+# attempt OOM on dirty GPUs (an OOM-crash -> dirty-GPU -> OOM cascade). So kill,
+# then poll rocm-smi until every GPU is <5 GB (or give up after ~120s).
+cleanup_container() {
+    $DOCKER exec "$CONTAINER" bash -lc \
+        'pkill -9 -f generative_recommenders 2>/dev/null; pkill -9 -f spawn_main 2>/dev/null; pkill -9 -f resource_tracker 2>/dev/null; true' \
+        2>/dev/null || true
+    local k busy
+    for k in $(seq 1 24); do                       # up to ~120s
+        busy=$($DOCKER exec "$CONTAINER" bash -lc \
+            "rocm-smi --showmeminfo vram 2>/dev/null | awk '/Used/{if (\$NF+0 > 5e9) c++} END{print c+0}'" \
+            2>/dev/null | tr -dc '0-9')
+        busy=${busy:-0}
+        [[ "$busy" == "0" ]] && return 0
+        sup "waiting for GPU HBM to drain ($busy GPU(s) still >5GB)…"
+        $DOCKER exec "$CONTAINER" bash -lc 'pkill -9 -f spawn_main 2>/dev/null; true' 2>/dev/null || true
+        sleep 5
+    done
+    sup "WARNING: GPUs still show residual HBM after 120s — launching anyway."
+    return 0
+}
+
+disk_free_gib() { df -BG --output=avail "$CKPT_PATH" 2>/dev/null | tail -1 | tr -dc '0-9'; }
+
+disk_guard() {
+    [[ -z "$CKPT_PATH" ]] && return 0
+    local free; free=$(disk_free_gib); free=${free:-0}
+    sup "disk guard: ${free} GiB free on $CKPT_PATH (min ${MIN_FREE_GIB})"
+    if (( free < MIN_FREE_GIB )); then
+        sup "FATAL: insufficient free space (${free} < ${MIN_FREE_GIB} GiB). Aborting."
+        return 1
+    fi
+    return 0
+}
+
+# Run the submit-script in the FOREGROUND (its exit status == the trainer's).
+# APPEND_LOG=1 preserves the metrics log across relaunches. This is invoked as
+# `launch & PID=$!` from the main loop so the backgrounded copy is a DIRECT child
+# of this shell — otherwise `wait $PID` can't reap it and always returns 127,
+# making every clean completion look like a failure (infinite relaunch loop).
+launch() {
+    APPEND_LOG=1 CONTAINER="$CONTAINER" DOCKER="$DOCKER" \
+        RUN_NAME="$RUN_NAME" LOG="$LOG" CKPT_PATH="$CKPT_PATH" \
+        bash "$SUBMIT_SCRIPT" >>"$SUP_LOG" 2>&1
+}
+
+sup "=== streaming e2e LOCAL supervisor start ==="
+sup "run=$RUN_NAME submit=$SUBMIT_SCRIPT log=$LOG ckpt=$CKPT_PATH container=$CONTAINER"
+sup "max_relaunch=$MAX_RELAUNCH min_free_gib=$MIN_FREE_GIB stall_s=$STALL_S poll_s=$POLL_S"
+
+attempt=1
+while (( attempt <= MAX_RELAUNCH )); do
+    if ! disk_guard; then exit 3; fi
+    sup "launching attempt $attempt/$MAX_RELAUNCH"
+    cleanup_container                    # ensure no stragglers from a prior attempt
+    launch & PID=$!                      # direct child => wait $PID reaps the real rc
+    sup "submit-script running as host pid=$PID"
+
+    # --- monitor loop ---
+    last_size=0; stall_accum=0; hb=0; hung=0
+    while kill -0 "$PID" 2>/dev/null; do
+        cur_size=$(stat -c %s "$LOG" 2>/dev/null || echo 0)
+        if [[ "$cur_size" == "$last_size" ]]; then
+            hb=$((hb + 1))
+            # Re-check liveness only every 4 polls (cheap docker exec amortized).
+            if (( hb % 4 == 0 )); then
+                if trainer_alive; then
+                    stall_accum=0
+                else
+                    stall_accum=$((stall_accum + POLL_S * 4))
+                    sup "log frozen + no trainer alive (${stall_accum}s/${STALL_S}s)"
+                    if (( stall_accum >= STALL_S )); then
+                        sup "STALL: hung run — killing pid=$PID + container trainer procs, will relaunch."
+                        hung=1
+                        kill -9 "$PID" 2>/dev/null || true
+                        cleanup_container
+                        break
+                    fi
+                fi
+            fi
+        else
+            stall_accum=0; last_size=$cur_size
+        fi
+        sleep "$POLL_S"
+    done
+
+    # --- the submit-script has exited (or we killed it): decide ---
+    wait "$PID" 2>/dev/null; rc=$?
+    if (( hung )); then
+        sup "attempt $attempt ended via STALL recovery (rc=$rc) — relaunching from latest checkpoint."
+    elif (( rc == 0 )); then
+        sup "RUN COMPLETED CLEANLY on attempt $attempt."
+        sup "=== supervisor done (success) ==="
+        exit 0
+    else
+        sup "attempt $attempt exited rc=$rc (crash/OOM/die_at_step) — relaunching from latest checkpoint."
+    fi
+
+    if (( attempt >= MAX_RELAUNCH )); then break; fi
+    sleep $(( attempt < 5 ? 20 : 60 ))   # small backoff
+    attempt=$((attempt + 1))
+done
+
+sup "FATAL: exhausted MAX_RELAUNCH=$MAX_RELAUNCH without completion."
+sup "=== supervisor done (failure) ==="
+exit 1
diff --git a/recommendation_v4/scripts/stitch_traces.py b/recommendation_v4/scripts/stitch_traces.py
new file mode 100644
index 000000000..54d7963d6
--- /dev/null
+++ b/recommendation_v4/scripts/stitch_traces.py
@@ -0,0 +1,329 @@
+#!/usr/bin/env python3
+"""Stitch per-rank Chrome traces from a dlrm_v3 run into one merged file.
+
+When ``Profiler`` runs on multiple ranks, each rank writes its own file:
+
+    <trace_dir>/trace_step{step}_rank{rank}.json
+
+Each per-rank trace uses overlapping ``pid`` namespaces (CPU pid = OS pid;
+GPU streams pid = 0..N), so concatenating the raw event lists would collapse
+multiple ranks onto the same Perfetto track. This script:
+
+* Identifies each pid as ``CPU`` / ``GPU`` / ``Spans`` (and other torch.profiler
+  string-pid tracks) using the per-rank ``process_labels`` metadata events.
+* Always drops the ``Spans`` track (low-signal in this codebase, large in
+  visual clutter).
+* Optionally filters to just ``cpu`` or ``gpu`` events via ``--include``.
+* Sorts the surviving tracks into contiguous Perfetto sections:
+  **all CPU tracks (rank 0..N) first, then all GPU tracks (rank 0..N, stream
+  0..K)**.
+* Remaps every event's ``pid`` and flow ``id`` so cross-rank events never
+  collide on the same track or flow arrow.
+
+Because torch.profiler emits ``baseTimeNanoseconds`` from the same node clock,
+timestamps line up directly across ranks — no time-shift needed for single-node
+runs (multi-node would need clock-skew correction, not implemented here).
+
+Examples
+--------
+Stitch step 52, default (CPU + GPU, drop Spans), gzip output::
+
+    python scripts/stitch_traces.py <trace_dir> --step 52 --gzip
+
+GPU-only view (skip CPU thread tree entirely — useful for kernel-level analysis)::
+
+    python scripts/stitch_traces.py <trace_dir> --step 52 --include gpu --gzip
+
+CPU-only view (host-side ops, profiler annotations, comm scheduling)::
+
+    python scripts/stitch_traces.py <trace_dir> --step 52 --include cpu --gzip
+"""
+from __future__ import annotations
+
+import argparse
+import gzip
+import json
+import re
+import sys
+from collections import defaultdict
+from pathlib import Path
+
+# trace_step52_rank3.json or trace_3_rank0.json (legacy filename)
+_RANK_RE = re.compile(r"trace_(?:step)?(\d+)_rank(\d+)\.json$")
+_KEY_RE = re.compile(r"trace_(.+?)_rank\d+\.json$")
+
+# Per-rank pid offset. Picked large enough that no real OS pid collides
+# (Linux pids fit in 22 bits; 1e6 per rank gives ~10 ranks of headroom).
+_PID_STRIDE = 1_000_000
+
+# Per-rank flow-id offset. torch.profiler flow ids are int32/int64 — pack rank
+# into the high bits so cross-rank flows can never link by accident.
+_FLOW_ID_STRIDE = 1 << 40
+
+# Sort-index sections in Perfetto. Lower = appears higher in the timeline UI.
+# Each section reserves a wide range so within-section ordering (rank, stream)
+# fits comfortably without overlapping the next section.
+_SORT_BASE = {
+    "cpu":   0,
+    "gpu":   1_000_000,
+    "other": 10_000_000,   # Traces / "" misc string-pid tracks
+}
+
+# `Spans` carries no useful content in our workloads (one X event per trace)
+# and clutters the timeline — always dropped.
+_ALWAYS_DROP_PIDS_STR = {"Spans"}
+
+
+def _classify_pid(pid_to_label: dict, pid_to_name: dict) -> dict:
+    """Map original pid -> ('cpu'|'gpu'|'spans'|'other', stream_idx_or_0).
+
+    Classification order, first match wins:
+      1. pid (as a string) is in the always-drop set        -> 'spans'
+      2. process_name is in the always-drop set             -> 'spans'
+      3. process_labels == 'CPU'                            -> 'cpu'
+      4. process_labels starts with 'GPU '                  -> 'gpu', stream id
+      5. anything else (including unlabeled pids)           -> 'other'
+    """
+    all_pids = set(pid_to_label) | set(pid_to_name)
+    out: dict = {}
+    for pid in all_pids:
+        label = pid_to_label.get(pid, "")
+        name = pid_to_name.get(pid, "")
+        if isinstance(pid, str) and pid in _ALWAYS_DROP_PIDS_STR:
+            out[pid] = ("spans", 0)
+            continue
+        if name in _ALWAYS_DROP_PIDS_STR:
+            out[pid] = ("spans", 0)
+            continue
+        if label == "CPU":
+            out[pid] = ("cpu", 0)
+        elif label.startswith("GPU"):
+            try:
+                stream_idx = int(label.split()[1])
+            except (IndexError, ValueError):
+                stream_idx = 0
+            out[pid] = ("gpu", stream_idx)
+        else:
+            out[pid] = ("other", 0)
+    return out
+
+
+def _scan_pid_metadata(events: list[dict]) -> tuple[dict, dict]:
+    """First pass: collect per-pid label and name from ``ph='M'`` events."""
+    label: dict = {}
+    name: dict = {}
+    for e in events:
+        if e.get("ph") != "M":
+            continue
+        pid = e.get("pid")
+        if pid is None:
+            continue
+        if e.get("name") == "process_labels":
+            label[pid] = e.get("args", {}).get("labels", "")
+        elif e.get("name") == "process_name":
+            name[pid] = e.get("args", {}).get("name", "")
+    return label, name
+
+
+def _new_sort_index(kind: str, rank: int, stream_idx: int) -> int:
+    """Compute Perfetto sort_index so tracks group as: CPU(rank0..N), GPU(rank0..N, stream0..K), other."""
+    base = _SORT_BASE.get(kind, _SORT_BASE["other"])
+    return base + rank * 100 + stream_idx
+
+
+def _new_pid(orig_pid, rank: int) -> object:
+    """Remap a single pid into a per-rank namespace, preserving int vs str."""
+    if isinstance(orig_pid, int):
+        return orig_pid + rank * _PID_STRIDE
+    if isinstance(orig_pid, str):
+        try:
+            return int(orig_pid) + rank * _PID_STRIDE
+        except ValueError:
+            return f"rank{rank}_{orig_pid}" if orig_pid else f"rank{rank}_misc"
+    return orig_pid
+
+
+def _process_one_rank(
+    events: list[dict],
+    rank: int,
+    include: set[str],
+) -> list[dict]:
+    """Filter + remap one rank's events. ``include`` is a subset of {'cpu','gpu','other'}."""
+    label, name = _scan_pid_metadata(events)
+    classify = _classify_pid(label, name)
+
+    out: list[dict] = []
+    for e in events:
+        pid = e.get("pid")
+        if pid is None:
+            out.append(e)
+            continue
+        # Always-drop check on the raw pid value first - Spans events in our
+        # workloads have NO process_name/process_labels metadata, so the
+        # classifier table doesn't list them. Catch them here directly.
+        if isinstance(pid, str) and pid in _ALWAYS_DROP_PIDS_STR:
+            continue
+        kind, stream_idx = classify.get(pid, ("other", 0))
+        if kind == "spans":          # always dropped
+            continue
+        if kind not in include:      # filtered by --include
+            continue
+
+        # Remap pid + flow id (per-rank namespace).
+        e["pid"] = _new_pid(pid, rank)
+        if "id" in e and e.get("ph") in ("s", "t", "f"):
+            try:
+                e["id"] = int(e["id"]) + rank * _FLOW_ID_STRIDE
+            except (TypeError, ValueError):
+                pass
+
+        # Rewrite metadata: section-aware sort_index + rank-prefixed name.
+        if e.get("ph") == "M":
+            args = e.setdefault("args", {})
+            if e.get("name") == "process_sort_index":
+                args["sort_index"] = _new_sort_index(kind, rank, stream_idx)
+            elif e.get("name") == "process_name":
+                orig = args.get("name", "python")
+                args["name"] = f"[Rank {rank}] {orig}"
+
+        out.append(e)
+
+    return out
+
+
+def _group_by_step(trace_dir: Path) -> dict[str, dict[int, Path]]:
+    """Map step-key (e.g. ``"step52"`` or ``"3"``) -> {rank: path}."""
+    groups: dict[str, dict[int, Path]] = defaultdict(dict)
+    for p in sorted(trace_dir.glob("trace_*_rank*.json")):
+        m = _RANK_RE.search(p.name)
+        if not m:
+            continue
+        prefix_match = _KEY_RE.match(p.name)
+        key = prefix_match.group(1) if prefix_match else m.group(1)
+        groups[key][int(m.group(2))] = p
+    return dict(groups)
+
+
+def stitch_one(rank_to_path: dict[int, Path], out_path: Path, *,
+               include: set[str], gzip_out: bool, verbose: bool) -> None:
+    """Merge one (step, rank->path) group into a single trace file."""
+    merged_events: list[dict] = []
+    base: dict | None = None
+
+    for rank in sorted(rank_to_path):
+        path = rank_to_path[rank]
+        if verbose:
+            sz_mb = path.stat().st_size / (1 << 20)
+            print(f"  rank {rank}: {path.name} ({sz_mb:.1f} MB)", file=sys.stderr)
+        with path.open() as f:
+            trace = json.load(f)
+        if base is None:
+            base = {k: v for k, v in trace.items() if k != "traceEvents"}
+            base["distributedInfo"] = {
+                **trace.get("distributedInfo", {}),
+                "stitched_ranks": sorted(rank_to_path),
+                "stitched_files": [p.name for p in rank_to_path.values()],
+                "stitched_include": sorted(include),
+            }
+        merged_events.extend(
+            _process_one_rank(trace.get("traceEvents", []), rank, include)
+        )
+
+    assert base is not None, "no input traces provided"
+    base["traceEvents"] = merged_events
+
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    if gzip_out:
+        with gzip.open(out_path, "wt") as f:
+            json.dump(base, f)
+    else:
+        with out_path.open("w") as f:
+            json.dump(base, f)
+    if verbose:
+        sz_mb = out_path.stat().st_size / (1 << 20)
+        print(
+            f"  -> {out_path}  ({len(merged_events):,} events, {sz_mb:.1f} MB)",
+            file=sys.stderr,
+        )
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    ap.add_argument("trace_dir", type=Path,
+                    help="Directory containing trace_*_rank*.json files.")
+    ap.add_argument("--step", type=str, default=None,
+                    help="Stitch only the given step key (e.g. '52' or 'step52'). "
+                         "Default: stitch every step group found.")
+    ap.add_argument("--out", type=Path, default=None,
+                    help="Output path. Only valid when --step selects exactly "
+                         "one group. Default: <trace_dir>/trace_<step>.json[.gz] "
+                         "(or trace_<step>_cpu/_gpu when --include filters).")
+    ap.add_argument("--include", choices=("cpu", "gpu", "both"), default="both",
+                    help="Which sections to keep: cpu-only tracks, gpu-only "
+                         "tracks, or both (default). 'Spans' is always dropped.")
+    ap.add_argument("--gzip", action="store_true",
+                    help="Write gzip-compressed JSON (Perfetto auto-detects).")
+    ap.add_argument("-q", "--quiet", action="store_true")
+    args = ap.parse_args()
+
+    if not args.trace_dir.is_dir():
+        print(f"error: {args.trace_dir} is not a directory", file=sys.stderr)
+        return 2
+
+    if args.include == "both":
+        # 'other' covers torch.profiler string-pid tracks (Traces / misc) that
+        # carry low-volume but legitimate annotations. Dropped under cpu/gpu
+        # so each filtered view is clean.
+        include = {"cpu", "gpu", "other"}
+    else:
+        include = {args.include}
+
+    groups = _group_by_step(args.trace_dir)
+    if not groups:
+        print(f"error: no trace_*_rank*.json files under {args.trace_dir}",
+              file=sys.stderr)
+        return 2
+
+    if args.step is not None:
+        wanted = args.step if args.step.startswith("step") else f"step{args.step}"
+        if wanted not in groups and args.step in groups:
+            wanted = args.step
+        if wanted not in groups:
+            print(
+                f"error: step {args.step!r} not found. "
+                f"Available: {sorted(groups)}",
+                file=sys.stderr,
+            )
+            return 2
+        groups = {wanted: groups[wanted]}
+
+    if args.out is not None and len(groups) != 1:
+        print("error: --out requires --step to select exactly one group",
+              file=sys.stderr)
+        return 2
+
+    for key, rank_map in sorted(groups.items()):
+        if not args.quiet:
+            print(
+                f"stitching {key} ({len(rank_map)} ranks, include={args.include}):",
+                file=sys.stderr,
+            )
+        if args.out is not None:
+            out = args.out
+        else:
+            ext = ".json.gz" if args.gzip else ".json"
+            # Default mode ("both") gets the bare filename; explicit cpu/gpu
+            # filters tag the output so they can coexist in one directory.
+            suffix = "" if args.include == "both" else f"_{args.include}"
+            out = args.trace_dir / f"trace_{key}{suffix}{ext}"
+        stitch_one(rank_map, out, include=include,
+                   gzip_out=args.gzip, verbose=not args.quiet)
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/recommendation_v4/setup.py b/recommendation_v4/setup.py
new file mode 100644
index 000000000..bdab528f4
--- /dev/null
+++ b/recommendation_v4/setup.py
@@ -0,0 +1,41 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+from setuptools import find_packages, setup
+
+setup(
+    name="generative_recommenders",
+    version="0.1.0",
+    description="Library for generative recommendation algorithms.",
+    packages=find_packages(exclude=["configs"]),
+    python_requires=">=3.10",
+    install_requires=[
+        "torch>=2.6.0",
+        "fbgemm_gpu>=1.1.0",
+        "torchrec>=1.1.0",
+        "gin_config>=0.5.0",
+        "pandas>=2.2.0",
+        "tensorboard>=2.19.0",
+        "pybind11",
+        "click",
+        "pandas",
+        "matplotlib",
+    ],
+    long_description=open("README.md").read(),
+    long_description_content_type="text/markdown",
+    url="https://github.com/meta-recsys/generative-recommenders",
+    license="Apache-2.0",
+)
diff --git a/recommendation_v4/verify_dataset.sh b/recommendation_v4/verify_dataset.sh
new file mode 100755
index 000000000..839ccb91f
--- /dev/null
+++ b/recommendation_v4/verify_dataset.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+# MLPerf Training reference script: verify the preprocessed dataset.
+#
+# Checks the integrity of the preprocessed dataset under
+#   ${DLRM_DATA_PATH}/${PROCESSED_SUBDIR}
+# against md5sums_yambda_5b_processed.txt (standard `md5sum -c` format).
+#
+# If the checksum file still contains placeholder hashes (TODO_GENERATE_HASH),
+# the script falls back to an existence/layout check and warns that the
+# canonical checksums have not been pinned yet.
+#
+# Usage:
+#   DLRM_DATA_PATH=/path/to/dlrm_data ./verify_dataset.sh
+#
+# Env:
+#   DLRM_DATA_PATH    data root (required).
+#   PROCESSED_SUBDIR  processed subdir under the data root (default: processed_5b).
+set -euo pipefail
+
+: "${DLRM_DATA_PATH:?Set DLRM_DATA_PATH to the data root}"
+PROCESSED_SUBDIR="${PROCESSED_SUBDIR:-processed_5b}"
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CHECKSUM_FILE="${REPO_ROOT}/md5sums_yambda_5b_processed.txt"
+PROCESSED_DIR="${DLRM_DATA_PATH}/${PROCESSED_SUBDIR}"
+
+echo "[verify_dataset] processed dir: ${PROCESSED_DIR}"
+
+if [[ ! -d "${PROCESSED_DIR}" ]]; then
+    echo "[verify_dataset] ERROR: ${PROCESSED_DIR} does not exist. Run ./download_dataset.sh first." >&2
+    exit 1
+fi
+
+EXPECTED_FILES=(
+    train_sessions.parquet
+    test_events.parquet
+    session_index.parquet
+    item_popularity.npy
+    split_meta.json
+)
+
+# Detect whether the checksum file has real (32 hex char) hashes or placeholders.
+if grep -qiE '^[0-9a-f]{32}[[:space:]]' "${CHECKSUM_FILE}"; then
+    echo "[verify_dataset] checking md5 checksums from ${CHECKSUM_FILE}"
+    (cd "${PROCESSED_DIR}" && md5sum -c "${CHECKSUM_FILE}")
+    echo "[verify_dataset] OK: all checksums match."
+else
+    echo "[verify_dataset] WARNING: ${CHECKSUM_FILE} contains placeholder hashes;" >&2
+    echo "[verify_dataset]          falling back to existence/layout check only." >&2
+    missing=0
+    for f in "${EXPECTED_FILES[@]}"; do
+        if [[ -s "${PROCESSED_DIR}/${f}" ]]; then
+            echo "  OK   ${f}"
+        else
+            echo "  MISS ${f}" >&2
+            missing=1
+        fi
+    done
+    if [[ "${missing}" -ne 0 ]]; then
+        echo "[verify_dataset] ERROR: one or more expected files are missing/empty." >&2
+        exit 1
+    fi
+    echo "[verify_dataset] layout OK (checksums NOT yet pinned -- see TODO in ${CHECKSUM_FILE})."
+fi