mlcommons · chriscai-amd · May 29, 2026 · May 29, 2026 · May 29, 2026 · May 30, 2026
@@ -6,3 +6,6 @@
 	path = text_to_image/torchtitan
 	url = https://github.com/pytorch/torchtitan.git
 	branch = mlperf-training-flux.1
+[submodule "recommendation_v4/cutlass"]
+	path = recommendation_v4/generative_recommenders/ops/cpp/cutlass
+	url = https://github.com/NVIDIA/cutlass.git
@@ -0,0 +1,159 @@
+# Don't check in parsed data files and other temporary files
+tmp/
+exps/
+ckpts/
+results/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a Python script from a template
+#  before PyInstaller builds the exe, so as to inject the date and other information into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, when collaborating, if you have platform-specific dependencies or dependencies
+#   without cross-platform support, pipenv may install dependencies that don't work, or fail
+#   to install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
@@ -0,0 +1,86 @@
+# MI350X path — implements docs/training_recipe.md §"MI350X".
+
+# Base image rocm/primus, pinned to the v26.3 tag (a tag, not a digest).
+FROM rocm/primus:v26.3
+
+# PYTHONUNBUFFERED=1              — Python writes stdout/stderr unbuffered.
+# PIP_NO_CACHE_DIR=1              — pip keeps no download/wheel cache in the image.
+# PIP_DISABLE_PIP_VERSION_CHECK=1 — pip skips its "newer pip available" check.
+ENV PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1
+
+# Working directory for every following instruction and for the container's
+# default shell; the project source is copied into this same path later on.
+WORKDIR /workspace/recommendation_v4
+
+# torch / torchvision / torchaudio — training_recipe.md:38-40.
+# Wheels come from the PyTorch ROCm 7.2 index. --no-deps installs only the three
+# named wheels, so pip neither resolves nor checks their dependencies.
+# NOTE(review): torchaudio is pinned to 2.11.0 while torch is 2.12.0; with
+# --no-deps pip will not flag an incompatible pairing — confirm this is the
+# version the recipe intends.
+RUN pip install --upgrade --no-deps \
+        --index-url https://download.pytorch.org/whl/rocm7.2 \
+        torch==2.12.0+rocm7.2 \
+        torchvision==0.27.0+rocm7.2 \
+        torchaudio==2.11.0+rocm7.2
+
+# torchrec — training_recipe.md:43.
+# Installed from source at the v2026.06.01.00 git ref. --force-reinstall replaces
+# any torchrec that is already installed; --no-deps means only torchrec itself
+# is installed, so the torch wheels installed earlier are not touched.
+RUN pip install --force-reinstall --no-deps \
+        "git+https://github.com/pytorch/torchrec.git@v2026.06.01.00"
+
+# fbgemm_gpu — training_recipe.md:42. Build from FBGEMM commit 10b77573 for
+# gfx950 against the replaced torch. ~30-60 min.
+# The clone is deliberately non-recursive: submodules are fetched only after the
+# pinned commit is checked out, so they are initialised once, at the revisions
+# and URLs that commit records, instead of first being cloned for the default
+# branch and then moved.
+RUN apt-get update && apt-get install -y --no-install-recommends git build-essential && \
+    rm -rf /var/lib/apt/lists/* && \
+    git clone https://github.com/pytorch/FBGEMM.git /tmp/FBGEMM && \
+    cd /tmp/FBGEMM && \
+    git checkout 10b775730212923f65f7b78f79b6a01d80cf3c29 && \
+    git submodule update --init --recursive && \
+    cd fbgemm_gpu && \
+    # Filter `fairscale` and the torch family from fbgemm's requirements.txt:
+    # fairscale pulls a CPU torch that would clobber the +rocm7.2 wheel installed
+    # above. fairscale is a distributed-training lib used by fbgemm tests, not
+    # by the build itself.
+    grep -v -E '^(fairscale|torch|torchvision|torchaudio)([<>=!]|$)' requirements.txt > /tmp/req.txt && \
+    pip install -r /tmp/req.txt && \
+    python setup.py -j 32 bdist_wheel \
+        --build-target=default \
+        --build-variant=rocm \
+        -DHIP_ROOT_DIR=/opt/rocm \
+        -DAMDGPU_TARGETS=gfx950 && \
+    pip install --force-reinstall --no-deps dist/fbgemm_gpu_nightly_rocm*.whl && \
+    # Remove the source tree and the filtered requirements file so neither is
+    # kept in this image layer.
+    cd / && rm -rf /tmp/FBGEMM /tmp/req.txt
+
+# polars-u64-idx — training_recipe.md:44 (mandatory; yambda-5b > 4.29 B rows).
+# Remaining packages — training_recipe.md:156-159 ("Additional Python deps") plus
+# `datasets` + `huggingface_hub`, which the recipe does not list but
+# preprocess_public_data.py:278 imports to download yambda from HuggingFace.
+# `xxhash` is listed explicitly because the import smoke test in this file
+# requires it; it should not depend on arriving as a transitive dependency.
+RUN pip install \
+        polars-u64-idx==1.33.1 \
+        gin-config \
+        absl-py \
+        datasets \
+        huggingface_hub \
+        xxhash \
+        pyre-extensions \
+        iopath \
+        typing-inspect \
+        psutil \
+        tqdm \
+        pyyaml \
+        lightning-utilities && \
+    # torchmetrics and tensordict declare `torch` as a dep; without --no-deps
+    # pip pulls torch==2.12.0+cu130 from PyPI which clobbers the +rocm7.2 wheel
+    # we installed above (libtorch_hip.so disappears, fbgemm_gpu fails to load).
+    pip install --no-deps \
+        torchmetrics==1.0.3 \
+        tensordict
+
+# mlperf_logging — required by train/mlperf_logging_utils.py for MLPerf
+# compliance logs. Pinned to the 6.0.0-rc6 tag for reproducibility; --no-deps
+# so pip does not resolve requirements.txt's torch/fbgemm_gpu/torchrec pins and
+# clobber the +rocm7.2 wheels above.
+# NOTE(review): the pin looks like a release-candidate tag — confirm whether a
+# final 6.0.0 tag should be used instead.
+RUN pip install --no-deps "git+https://github.com/mlcommons/logging.git@6.0.0-rc6"
+
+# Smoke-test the 6 imports the launch script checks at
+# scripts/launch_smoke_8gpu.sh:26.
+# This runs during the image build, so a missing or unloadable package fails
+# the build. It then prints the torch version and torch.version.hip (None if
+# that attribute is absent). Only installed packages are imported here; the
+# project source is not yet in the image at this point.
+RUN python -c "import torch, fbgemm_gpu, torchrec, polars, xxhash, gin; \
+print('torch', torch.__version__, '| hip', getattr(torch.version, 'hip', None))"
+
+# Copy the build context into the image. This comes after every dependency
+# layer, so editing project files does not invalidate the cached installs.
+COPY . /workspace/recommendation_v4
+
+# Runtime defaults baked into the image; each can be overridden when the
+# container is started.
+# NOTE(review): recent PyTorch documentation lists PYTORCH_ALLOC_CONF as the
+# preferred variable, with PYTORCH_CUDA_ALLOC_CONF kept as an alias — confirm
+# torch 2.12 on ROCm still honours this spelling.
+# HSTU_HAMMER_KERNEL and DLRM_DATA_PATH are presumably read by the project's
+# training code — verify against their consumers.
+ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
+    HSTU_HAMMER_KERNEL=TRITON \
+    DLRM_DATA_PATH=/data/mlperf_dlrm_v4
+
+# Default command: start a bash shell.
+CMD ["bash"]