diff --git a/.gitmodules b/.gitmodules index 51d8eac03..6ebcea592 100644 --- a/.gitmodules +++ b/.gitmodules @@ -6,3 +6,6 @@ path = text_to_image/torchtitan url = https://github.com/pytorch/torchtitan.git branch = mlperf-training-flux.1 +[submodule "recommendation_v4/cutlass"] + path = recommendation_v4/generative_recommenders/ops/cpp/cutlass + url = https://github.com/NVIDIA/cutlass.git diff --git a/recommendation_v4/.gitignore b/recommendation_v4/.gitignore new file mode 100644 index 000000000..5edddc5b3 --- /dev/null +++ b/recommendation_v4/.gitignore @@ -0,0 +1,159 @@ +# Don't check in parsed data files and other temporary files +tmp/ +exps/ +ckpts/ +results/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ diff --git a/recommendation_v4/Dockerfile b/recommendation_v4/Dockerfile new file mode 100644 index 000000000..450a5ab55 --- /dev/null +++ b/recommendation_v4/Dockerfile @@ -0,0 +1,86 @@ +# MI350X path — implements docs/training_recipe.md §"MI350X". + +FROM rocm/primus:v26.3 + +ENV PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + PIP_DISABLE_PIP_VERSION_CHECK=1 + +WORKDIR /workspace/recommendation_v4 + +# torch / torchvision / torchaudio — training_recipe.md:38-40. NOTE(review): torchaudio is pinned to 2.11.0 while torch is pinned to 2.12.0 — confirm this pairing is what the recipe intends. +RUN pip install --upgrade --no-deps \ + --index-url https://download.pytorch.org/whl/rocm7.2 \ + torch==2.12.0+rocm7.2 \ + torchvision==0.27.0+rocm7.2 \ + torchaudio==2.11.0+rocm7.2 + +# torchrec — training_recipe.md:43. +RUN pip install --force-reinstall --no-deps \ + "git+https://github.com/pytorch/torchrec.git@v2026.06.01.00" + +# fbgemm_gpu — training_recipe.md:42. Build from FBGEMM commit 10b77573 for +# gfx950 against the replaced torch. ~30-60 min. +RUN apt-get update && apt-get install -y --no-install-recommends git build-essential && \ + rm -rf /var/lib/apt/lists/* && \ + git clone --recursive https://github.com/pytorch/FBGEMM.git /tmp/FBGEMM && \ + cd /tmp/FBGEMM && \ + git checkout 10b775730212923f65f7b78f79b6a01d80cf3c29 && \ + git submodule update --init --recursive && \ + cd fbgemm_gpu && \ + # Filter `fairscale` and the torch family from fbgemm's requirements.txt: + # fairscale pulls a CPU torch that would clobber the +rocm7.2 wheel installed + # above. 
fairscale is a distributed-training lib used by fbgemm tests, not + # by the build itself. + grep -v -E '^(fairscale|torch|torchvision|torchaudio)([<>=!]|$)' requirements.txt > /tmp/req.txt && \ + pip install -r /tmp/req.txt && \ + python setup.py -j 32 bdist_wheel \ + --build-target=default \ + --build-variant=rocm \ + -DHIP_ROOT_DIR=/opt/rocm \ + -DAMDGPU_TARGETS=gfx950 && \ + pip install --force-reinstall --no-deps dist/fbgemm_gpu_nightly_rocm*.whl && \ + cd / && rm -rf /tmp/FBGEMM + +# polars-u64-idx — training_recipe.md:44 (mandatory; yambda-5b > 4.29 B rows). +# Remaining packages — training_recipe.md:156-159 ("Additional Python deps") plus +# `datasets` + `huggingface_hub`, which the recipe does not list but +# preprocess_public_data.py:278 imports to download yambda from HuggingFace. +RUN pip install \ + polars-u64-idx==1.33.1 \ + gin-config \ + absl-py \ + datasets \ + huggingface_hub \ + pyre-extensions \ + iopath \ + typing-inspect \ + psutil \ + tqdm \ + pyyaml \ + lightning-utilities && \ + # torchmetrics and tensordict declare `torch` as a dep; without --no-deps + # pip pulls torch==2.12.0+cu130 from PyPI which clobbers the +rocm7.2 wheel + # we installed above (libtorch_hip.so disappears, fbgemm_gpu fails to load). + pip install --no-deps \ + torchmetrics==1.0.3 \ + tensordict + +# mlperf_logging — required by train/mlperf_logging_utils.py for MLPerf +# compliance logs. Pinned to the Training 6.0 tag for reproducibility; --no-deps +# so pip does not resolve requirements.txt's torch/fbgemm_gpu/torchrec pins and +# clobber the +rocm7.2 wheels above. +RUN pip install --no-deps "git+https://github.com/mlcommons/logging.git@6.0.0-rc6" + +# Smoke-test the 6 imports the launch script checks at +# scripts/launch_smoke_8gpu.sh:26. NOTE(review): xxhash is imported below but is not installed by name in any step of this Dockerfile — presumably it comes from the base image or as a transitive dependency; confirm. +RUN python -c "import torch, fbgemm_gpu, torchrec, polars, xxhash, gin; \ +print('torch', torch.__version__, '| hip', getattr(torch.version, 'hip', None))" + +COPY . 
/workspace/recommendation_v4 + +ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \ + HSTU_HAMMER_KERNEL=TRITON \ + DLRM_DATA_PATH=/data/mlperf_dlrm_v4 + +CMD ["bash"] diff --git a/recommendation_v4/LICENSE b/recommendation_v4/LICENSE new file mode 100644 index 000000000..d64569567 --- /dev/null +++ b/recommendation_v4/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/recommendation_v4/README.MD b/recommendation_v4/README.MD new file mode 100644 index 000000000..e078bcf0a --- /dev/null +++ b/recommendation_v4/README.MD @@ -0,0 +1,380 @@ +# Recommendation v4 — HSTU sequential recommendation (Yambda-5b) + +MLPerf Training reference benchmark. This is a fork of +[meta-recsys/generative-recommenders](https://github.com/meta-recsys/generative-recommenders) +extended to train an HSTU (Hierarchical Sequential Transduction Units) ranking +model on the [Yambda-5b](https://huggingface.co/datasets/yandex/yambda) +music-recommendation dataset, sized as an MLPerf-style training benchmark inside +the `mlcommons/training` tree. + +## 1. Summary + +This benchmark trains a model that predicts what a person will listen to next. +Given the history of songs a user has played, liked, or skipped, the model +learns to rank which song the user is most likely to genuinely listen to (rather +than skip) next. This is the same kind of "what should we recommend next?" +problem that powers music and video streaming feeds. The model is trained on a +large public dataset of anonymized music-listening events and is scored on how +well it predicts future listens it has never seen. + +## 2. 
Benchmark overview (technical) + +The model is a **sequential recommender**: instead of treating each interaction +independently (as classic click-through-rate models like DLRM-DCNv2 do), it +consumes a user's chronologically ordered interaction history as a sequence and +applies a Transformer-style attention stack (HSTU) over it. Each training +example is one "anchor" listen event together with that user's prior history +(user interaction history, or UIH) and a set of contextual/cross features. The +supervised target is a binary `listen_plus` label (a real listen: played for at +least 50% of the track) versus a skip. + +Training is **streaming / temporal-order**: the timeline is sliced into +fixed-duration windows and the model trains on window `T` then evaluates on the +strictly-future window `T+1`, so every reported metric is genuine +next-period generalization with no future leakage. The quality metric is +**AUC** on the held-out future window, and the convergence target is +**AUC >= 0.80275** (matching the DLRM-DCNv2-style target). + +The reference runs on 8 GPUs (validated on AMD Instinct MI350X / MI355X and +NVIDIA B200; see [docs/training_recipe.md](docs/training_recipe.md)) and scales +to multi-node via SLURM. + +## 3. Directions — steps to run + +The benchmark follows the standard MLPerf reference script flow: + +```bash +# 0. build/enter the container (canonical frozen environment) +docker build -t recommendation_v4 . +docker run --rm -it --device=/dev/kfd --device=/dev/dri \ + -v /path/to/dlrm_data:/data/mlperf_dlrm_v4 recommendation_v4 + +# 1. download + preprocess the dataset +DLRM_DATA_PATH=/data/mlperf_dlrm_v4 ./download_dataset.sh + +# 2. verify the preprocessed dataset +DLRM_DATA_PATH=/data/mlperf_dlrm_v4 ./verify_dataset.sh + +# 3. 
run the benchmark to the quality target and report wall-clock time +DLRM_DATA_PATH=/data/mlperf_dlrm_v4 ./run_and_time.sh +``` + +- [`download_dataset.sh`](download_dataset.sh) wraps the preprocessing pipeline + in `generative_recommenders.dlrm_v3.preprocess_public_data` (HuggingFace + download + temporal split + session segmentation + item-popularity counts). +- [`verify_dataset.sh`](verify_dataset.sh) checks the preprocessed files against + [`md5sums_yambda_5b_processed.txt`](md5sums_yambda_5b_processed.txt) (falls + back to a layout check until the canonical checksums are pinned). +- [`run_and_time.sh`](run_and_time.sh) runs the full-reference streaming + train+eval sweep on a single 8-GPU host with `AUC_THRESHOLD=0.80275` and MLPerf + compliance logging, printing the elapsed time of the timed region. + +### 3.1 Multi-node (SLURM) + +For N >= 1 nodes use [`scripts/launch_slurm.sh`](scripts/launch_slurm.sh), which +provisions the container on each node and launches the same trainer. A bare +submit runs a small functional smoke run; set the run-shape knobs for the full +sweep: + +```bash +# smoke (fast functional check) +sbatch --nodes=1 scripts/launch_slurm.sh + +# full reference sweep +START_TS=0 NUM_TRAIN_TS=299 \ +NUM_TRAIN_BATCHES=0 NUM_EVAL_BATCHES=0 \ +EVAL_EVERY_N_WINDOWS=0 EVAL_EVERY_DATA_PCT=0.005 \ +AUC_THRESHOLD=0.80275 \ +sbatch --nodes=2 scripts/launch_slurm.sh +``` + +Multi-node uses real RDMA (RoCEv2); the fabric/NCCL setup is documented in +[docs/multi_node_config.md](docs/multi_node_config.md). Keep all run outputs +(log, checkpoints, mllog, TensorBoard) under a writable scratch path you own — +the dataset mount is read-only. + +## 4. Model + +The model is **HSTU** (Hierarchical Sequential Transduction Units), the +generative-recommender architecture from Meta's ICML'24 paper *Actions Speak +Louder than Words: Trillion-Parameter Sequential Transducers for Generative +Recommendations* ([arXiv:2402.17152](https://arxiv.org/abs/2402.17152)). 
+ +HSTU replaces the feature-interaction stack of a classic DLRM with a stack of +pointwise-attention "transducer" layers operating over the user's interaction +sequence. In this benchmark (the `dlrm_v3` path): + +- **Embeddings**: sparse tables for `item_id`, `artist_id`, `album_id`, `uid`, + and 7 cross-feature hashes (e.g. `user_x_artist`, `item_x_hour`), sharded + across GPUs with TorchRec `DistributedModelParallel`. +- **Sequence model**: an HSTU attention stack (`HSTU_NUM_LAYERS`, default 3) + over the interleaved UIH, computed with a fused jagged-attention Triton kernel + in bf16. +- **Supervision**: a single `listen_plus` binary task. The candidate event's + `action_weight` carries the supervision bit, and BCE loss is masked to + listen_plus candidates. + +See `generative_recommenders/dlrm_v3/configs.py` +(`get_hstu_configs`, `get_embedding_table_config`) for the exact architecture +and table specs, and the upstream README for the original modeling code. + +## 5. Dataset + +[Yambda-5b](https://huggingface.co/datasets/yandex/yambda) is a public +anonymized music-recommendation dataset from Yandex. The `5b` variant is used +for the reference. 
Statistics after preprocessing: + +| | | +|---|---| +| Total interaction events | **4.76 B** | +| Unique users | **1.00 M** | +| Max events per user | 27,738 | +| Median events per user | 2,695 | +| Mean events per user | 4,763 | +| Train events (300d) | 4.76 B | +| Test events (1d) | 22.4 M | +| Item catalog size | 9.39 M | + +### 5.1 Per-event-type distribution (across the full 4.76 B corpus) + +| Pool | Definition | Count | Share | +|---|---|---|---| +| **listen_plus (lp)** | `is_listen AND played_ratio >= 50%` | 2.92 B | **61.3%** | +| **skip** | `is_listen AND played_ratio < 50%` | 1.71 B | **35.9%** | +| **like** | explicit thumbs-up action | 89 M | **1.9%** | +| other | dislike / unlike / undislike | 47 M | 1.0% | + +The `like` pool is roughly **30x rarer** than `lp` — important context for the +gather strategy in §6. + +### 5.2 Preprocessing & download + +`./download_dataset.sh` (which calls +`python3 -m generative_recommenders.dlrm_v3.preprocess_public_data --dataset +yambda-5b --data-path $DLRM_DATA_PATH`) downloads the 5b variant from HuggingFace, then: + +1. **Encodes** the raw `event_type` string into a uint8 lookup (listen=0, + like=1, dislike=2, unlike=3, undislike=4). +2. **Splits** events temporally — 300 train days, 30-min gap, 1 test day — by + Global Temporal Split (GTS). +3. **Segments** per-user event timelines into sessions on a 30-min inactivity + gap. +4. **Computes** per-item popularity for downstream metric weighting. +5. 
**Writes** the layout `DLRMv3YambdaDataset` expects: + +``` +$DLRM_DATA_PATH/ +├── raw/5b/multi_event.parquet 50 GB (downloaded) +├── shared_metadata/ +│ ├── artist_item_mapping.parquet 60 MB +│ ├── album_item_mapping.parquet 76 MB +│ └── embeddings.parquet 18 GB (unused by HSTU training) +└── processed_5b/ + ├── train_sessions.parquet 47 GB ← main training input + ├── test_events.parquet 152 MB + ├── session_index.parquet 600 MB + ├── item_popularity.npy 75 MB + └── split_meta.json anchor + boundary stats +``` + +For smaller variants (`yambda-50m` / `yambda-500m`) substitute the dataset name +(`DATASET=yambda-50m ./download_dataset.sh`). Preprocessing takes ~2 min for 50m +and ~53 min for 5b end-to-end. + +Integrity is verified with `./verify_dataset.sh` against +[`md5sums_yambda_5b_processed.txt`](md5sums_yambda_5b_processed.txt). + +## 6. How data is fed to HSTU + +For every training anchor (a LISTEN event with >= `min_history` prior events), +the dataset builds a `(uih_kjt, candidate_kjt)` pair: + +``` +UIH (User Interaction History): + ┌─ Sequence features (chronologically interleaved across 3 pools) + │ item_id, artist_id, album_id ← per-position + │ action_weight ← per-position (LP_BIT/LIKE_BIT/SKIP_BIT) + │ action_timestamp, dummy_watch_time ← per-position + └─ Contextual features (length 1 each) + uid + 7 cross-feature hashes (user_x_artist, item_x_hour, …) + = 8 contextual entries + +CANDIDATE (the LISTEN event at the anchor): + item_id, artist_id, album_id, item_query_time, + item_action_weight (LP_BIT if listen_plus, else 0), + item_dummy_watchtime +``` + +The candidate's `action_weight` is **the supervision label**: HSTU's +`_get_supervision_labels_and_weights` masks BCE training to +`(supervision_bitmask & task_weight) > 0`, with `task_weight = 1` (LP bit) for +the single `listen_plus` task — so only listen_plus candidates supervise. + +### 6.1 Per-pool gather (the cap = L // 3 strategy) + +The UIH is built by `DLRMv3YambdaDataset._gather_interleaved_history`. 
For each +anchor it: + +1. Scans the most recent `scan_window` (default 20,000) events of any type + before the anchor, **clipped to user_start**. +2. From those, takes **the last `L // 3` events** from each of the three pools + (lp, like, skip) independently. +3. Concatenates and **re-sorts chronologically** to produce an interleaved + sequence. +4. Tags each event's pool identity into `action_weight` via OR'd bitmask + (LP=1, LIKE=2, SKIP=4). + +With `history_length = 4086` and `max_seq_len = 4096`: per-pool cap = `4086 // +3 = 1362`, and `3 × 1362 + 8 contextual + 1 candidate = 4095 <= 4096` (no +truncation). Because the `like` pool is rare (1.9%) it under-fills (~105 events +per anchor on average); the Triton jagged-attention backend skips unfilled +slots, so the under-fill costs sequence budget but not GPU compute. + +## 7. Optimizer + +Two optimizers, configured in +[`yambda_5b.gin`](generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin): + +| component | optimizer | gin binding | key settings | +|---|---|---|---| +| Dense params (HSTU blocks, MLPs) | **Adam** | `dense_optimizer_factory_and_class.*` | lr `DENSE_LR`, betas (0.95, 0.999), eps 1e-8, weight_decay 0 | +| Sparse embedding tables | **RowWiseAdagrad** (fused FBGEMM TBE) | `sparse_optimizer_factory_and_class.*` | lr `SPARSE_LR`, eps 1e-8, weight_decay 0 | + +Gradient clipping (`GRAD_CLIP_NORM`, default `max_norm=1.0`) is applied to the +dense parameters on the streaming path; the fused sparse optimizer is +unaffected. Training is bf16 mixed precision (`make_model.bf16_training=True`). + +## 8. Hyperparameters + +All tunable hyperparameters live in +[`yambda_5b.gin`](generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin) +(the config-file source of truth) and are **overridable via environment +variables** (the env value takes precedence over the gin default, per MLPerf +CONTRIBUTING rule 4d). The gin macros (`@env_int`, `@env_float`, `@env_str`) +enforce the correct type for each parameter. 
+ +| hyperparameter | env var | gin binding | type | default | tuning rule | +|---|---|---|---|---|---| +| Per-rank batch size | `BATCH_SIZE` | `batch_size` | int | 1024 | positive integer (global batch = `BATCH_SIZE × world_size`) | +| Dense learning rate | `DENSE_LR` | `dense_optimizer_factory_and_class.learning_rate` | float | 1e-7 | positive float | +| Sparse learning rate | `SPARSE_LR` | `sparse_optimizer_factory_and_class.learning_rate` | float | 1e-7 | positive float | +| Grad clip max-norm | `GRAD_CLIP_NORM` | `streaming_train_eval_loop.grad_clip_norm` | float | 1.0 | float >= 0 (0 disables) | +| RNG seed | `SEED` | `seed_everything.seed` | int | 1 | any integer (-1 = random per run) | +| HSTU attention layers | `HSTU_NUM_LAYERS` | `get_hstu_configs.hstu_attn_num_layers` | int | 3 | positive integer | +| UIH history length | `HISTORY_LENGTH` | `get_dataset.history_length` | int | 4086 | positive integer (per-pool cap = L//3) | +| Max sequence length | `MAX_SEQ_LEN` | `get_hstu_configs.max_seq_len` | int | 4096 | positive integer (>= `history_length + 9`) | +| History strategy | `HISTORY_STRATEGY` | `get_dataset.history_strategy` | str | `interleaved` | one of `interleaved` \| `last_n` | +| Min history (anchor floor) | `MIN_HISTORY` | `get_dataset.min_history` | int | 4086 | integer >= 0 | +| Train user split | `TRAIN_SPLIT_PERCENTAGE` | `*.train_split_percentage` | float | 1.0 | float in (0, 1] | +| Streaming shuffle fraction | `STREAMING_SHUFFLE_FRACTION` | `get_dataset.streaming_shuffle_fraction` | float | 0.0 | float in [0, 1] | +| Streaming shuffle seed | `STREAMING_SHUFFLE_SEED` | `get_dataset.streaming_shuffle_seed` | int | 0 | any integer | +| Split salt | `SPLIT_SALT` | `get_dataset.split_salt` | int | 0 | any integer | +| Start window | `START_TS` | `streaming_train_eval_loop.start_ts` | int | 150 | integer >= 0 | +| Number of train windows | `NUM_TRAIN_TS` | `streaming_train_eval_loop.num_train_ts` | int | 149 | positive integer (clamped to 
available) | +| Sparse A2A fwd precision | `SPARSE_A2A_FWD` | `make_optimizer_and_shard.sparse_a2a_forward_precision` | str | `fp32` | one of `fp32` \| `bf16` \| `fp16` | +| Sparse A2A bwd precision | `SPARSE_A2A_BWD` | `make_optimizer_and_shard.sparse_a2a_backward_precision` | str | `fp32` | one of `fp32` \| `bf16` \| `fp16` | + +Non-tunable / fixed reference values (optimizer betas (0.95, 0.999), eps 1e-8, +weight_decay 0, bf16 training, streaming window = 86400 s) are pinned in the gin +file. Submitters tuning hyperparameters must follow the allowed values above and +the +[MLPerf training rules](https://github.com/mlcommons/training_policies/blob/master/training_rules.adoc#hyperparameters). + +## 9. Quality target & evaluation + +- **Metric**: AUC on the held-out future evaluation window (`window_auc` for the + `listen_plus` task), computed by `MetricsLogger` in + `generative_recommenders/dlrm_v3/utils.py`. +- **Target**: **eval AUC >= 0.80275**. Set via `AUC_THRESHOLD=0.80275` + (`MetricsLogger.auc_threshold`); the run logs `RUN_STOP` with `SUCCESS` and + stops once the target is reached. The gin default of `1.0` is unreachable + (trains all windows with no early stop) and is overridden by the reference + scripts. +- **Evaluation frequency**: the full-reference run uses + `EVAL_EVERY_DATA_PCT=0.005` — evaluate every 0.5% of the training stream + (~200 evenly-data-spaced eval points), independent of node count. The + alternative per-window cadence (`EVAL_EVERY_N_WINDOWS`) is mutually exclusive. +- **Evaluation set**: a fixed held-out future window (`eval_holdout_ts`, default + `start_ts + num_train_ts`); with `TRAIN_SPLIT_PERCENTAGE < 1.0` the held-out + users' anchors over that window form the eval set. The temporal one-window + lead guarantees no future leakage (see §11). + +Evaluation is always one window ahead of training, so reported AUC is genuine +next-period generalization. + +## 10. 
Reference Convergence Points (RCP) + +*Placeholder — to be generated.* + +RCPs have **not yet been generated** for this benchmark. Per the MLPerf +[CONTRIBUTING guidance](https://github.com/mlcommons/training_policies/blob/master/CONTRIBUTING.md), +RCPs must be generated for at least 3 reasonable batch sizes using at least 2N +seeds (N = number of submission runs), in FP32 or BF16, with the exact precision +recorded in the RCP JSON. The convergence curves (steps/samples to reach +AUC >= 0.80275) will be added under [`rcp/`](rcp/) once the convergence runs are +complete. This section is intentionally left blank for now. + +## 11. Streaming (temporal-order) training + +`scripts/launch_slurm.sh` and `run_and_time.sh` default to +`--mode streaming-train-eval`, which trains Yambda in strict wall-clock order +instead of shuffling the whole corpus. The timeline is sliced into +fixed-duration **windows** (default 1 day, +`get_dataset.streaming_window_seconds = 86400`), and the loop walks them forward: + +``` +window T: train window T+1: eval (then train) window T+2: eval (then train) ... + └─ train window T ─┐ + └─ eval window T+1 ─┐ + └─ train window T+1 ─┐ + └─ eval window T+2 ... +``` + +i.e. for each step it **trains window T, then evaluates window T+1** before +advancing — always predicting the immediate future from the past. + +### 11.1 Temporal guarantee + +The streaming path enforces **no future leakage** at two levels: + +1. **Across windows** — a window is the set of anchors whose target/candidate + timestamp falls in `[t_min + T·W, t_min + (T+1)·W)`. Training only ever sees + windows `<= T`; the evaluation window `T+1` is strictly in the future of every + training anchor it is scored against. +2. **Within an anchor** — history is gathered **causally**: the UIH scan is + `scan_start:flat_pos` (events strictly before the anchor), so no event at or + after the anchor's timestamp can enter its features. 
+ +This is a *temporal* split on the training stream — distinct from the +preprocessing GTS split (§5) that carves off the final test day. Windows are +indexed off the per-anchor target timestamp via a lazily-built, mmap'd +`anchor_ts` cache keyed by `(history_length, min_history)`. + +### 11.2 Streaming knobs + +All configurable via +[`yambda_5b.gin`](generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin) with +env overrides: + +| env | gin | default | meaning | +|---|---|---|---| +| `START_TS` | `streaming_train_eval_loop.start_ts` | 150 | first window (early windows are near-empty warm-up) | +| `NUM_TRAIN_TS` | `streaming_train_eval_loop.num_train_ts` | 149 | number of train windows (clamped to available) | +| `PERSISTENT_LOADER` | `streaming_train_eval_loop.persistent_loader` | 1 | reuse one worker pool across windows | +| `DOUBLE_BUFFER` | `streaming_train_eval_loop.double_buffer` | 1 | prepare the next window in a background thread | +| `EVAL_EVERY_N_WINDOWS` | `streaming_train_eval_loop.eval_every_n_windows` | 1 | eval cadence by window count (0 to use data-pct) | +| `EVAL_EVERY_DATA_PCT` | `streaming_train_eval_loop.eval_every_data_pct` | 0.0 | eval cadence by fraction of train data (full ref: 0.005) | +| `MIN_HISTORY` | `get_dataset.min_history` | 4086 | anchor-eligibility floor (0 = ~all users incl. cold-start) | + +### 11.3 Checkpointing & resume + +The streaming loop is resume-aware: set `CKPT_PATH` to enable DMP checkpoint +save/load (auto-resolves to the highest-numbered subdir), with retention via +`KEEP_LAST_N` and cadences `IN_WINDOW_CKPT_FREQ` / `CKPT_STEP_FREQ` / +`CKPT_TIME_INTERVAL_S`. The MLPerf run state (run-started flag, global sample +count) is persisted across resume so compliance logging is continuous. See +`generative_recommenders/dlrm_v3/checkpoint.py`. + +## 12. License + +Apache 2.0 (inherited from upstream). 
diff --git a/recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-final.gin b/recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-final.gin new file mode 100644 index 000000000..8fb8b258c --- /dev/null +++ b/recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-final.gin @@ -0,0 +1,49 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Frozen config, validated on 04/12/2024. +# Based on HSTU results (w/ identical configurations as a SotA Transformer baseline) in +# Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations (https://arxiv.org/abs/2402.17152). +# +# Run this as: +# mkdir -p logs/amzn-books-l50/ +# CUDA_VISIBLE_DEVICES=1 python3 main.py --gin_config_file=configs/amzn-books/hstu-sampled-softmax-n512-final.gin --master_port=12346 2>&1 | tee logs/amzn-books-l50/hstu-sampled-softmax-n512-final.log + +train_fn.dataset_name = "amzn-books" +train_fn.max_sequence_length = 50 +train_fn.local_batch_size = 128 +train_fn.eval_batch_size = 128 + +train_fn.main_module = "HSTU" +train_fn.dropout_rate = 0.5 +train_fn.user_embedding_norm = "l2_norm" +train_fn.item_embedding_dim = 64 + +hstu_encoder.num_blocks = 4 +hstu_encoder.num_heads = 4 +hstu_encoder.dv = 16 +hstu_encoder.dqk = 16 +hstu_encoder.linear_dropout_rate = 0.5 + +train_fn.eval_interval = 4000 +train_fn.num_epochs = 201 +train_fn.learning_rate = 1e-3 +train_fn.weight_decay = 0 +train_fn.num_warmup_steps = 0 + +train_fn.interaction_module_type = "DotProduct" +train_fn.top_k_method = "MIPSBruteForceTopK" + +train_fn.loss_module = "SampledSoftmaxLoss" +train_fn.num_negatives = 512 + +train_fn.sampling_strategy = "local" +train_fn.temperature = 0.05 +train_fn.item_l2_norm = True +train_fn.l2_norm_eps = 1e-6 + +train_fn.enable_tf32 = True +train_fn.full_eval_every_n = 5 +train_fn.partial_eval_num_iters = 64 + +create_data_loader.prefetch_factor = 1024 +create_data_loader.num_workers = 8 diff --git 
a/recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-large-final.gin b/recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-large-final.gin new file mode 100644 index 000000000..097d4cbc7 --- /dev/null +++ b/recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-large-final.gin @@ -0,0 +1,49 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Frozen config, validated on 04/12/2024. +# Based on HSTU-large results in +# Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations (https://arxiv.org/abs/2402.17152). +# +# Run this as: +# mkdir -p logs/amzn-books-l50/ +# CUDA_VISIBLE_DEVICES=1 python3 main.py --gin_config_file=configs/amzn-books/hstu-sampled-softmax-n512-large-final.gin --master_port=12346 2>&1 | tee logs/amzn-books-l50/hstu-sampled-softmax-n512-large-final2.log + +train_fn.dataset_name = "amzn-books" +train_fn.max_sequence_length = 50 +train_fn.local_batch_size = 128 +train_fn.eval_batch_size = 128 + +train_fn.main_module = "HSTU" +train_fn.dropout_rate = 0.5 +train_fn.user_embedding_norm = "l2_norm" +train_fn.item_embedding_dim = 64 + +hstu_encoder.num_blocks = 16 +hstu_encoder.num_heads = 8 +hstu_encoder.dv = 8 +hstu_encoder.dqk = 8 +hstu_encoder.linear_dropout_rate = 0.5 + +train_fn.eval_interval = 4000 +train_fn.num_epochs = 201 +train_fn.learning_rate = 1e-3 +train_fn.weight_decay = 0 +train_fn.num_warmup_steps = 0 + +train_fn.interaction_module_type = "DotProduct" +train_fn.top_k_method = "MIPSBruteForceTopK" + +train_fn.loss_module = "SampledSoftmaxLoss" +train_fn.num_negatives = 512 + +train_fn.sampling_strategy = "local" +train_fn.temperature = 0.05 +train_fn.item_l2_norm = True +train_fn.l2_norm_eps = 1e-6 + +train_fn.enable_tf32 = True +train_fn.full_eval_every_n = 5 +train_fn.partial_eval_num_iters = 64 + +create_data_loader.prefetch_factor = 1024 +create_data_loader.num_workers = 8 diff --git 
a/recommendation_v4/configs/amzn-books/sasrec-sampled-softmax-n512-final.gin b/recommendation_v4/configs/amzn-books/sasrec-sampled-softmax-n512-final.gin new file mode 100644 index 000000000..bc899c9fb --- /dev/null +++ b/recommendation_v4/configs/amzn-books/sasrec-sampled-softmax-n512-final.gin @@ -0,0 +1,50 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Frozen config, validated on 04/12/2024. +# Based on baseline settings in Revisiting Neural Retrieval on Accelerators (https://arxiv.org/abs/2306.04039, KDD'23). +# +# Run this as: +# mkdir -p logs/amzn-books-l50/ +# CUDA_VISIBLE_DEVICES=1 python3 main.py --gin_config_file=configs/amzn-books/sasrec-sampled-softmax-n512-final.gin --master_port=12346 2>&1 | tee logs/amzn-books-l50/sasrec-sampled-softmax-n512-final.log + +train_fn.dataset_name = "amzn-books" +train_fn.max_sequence_length = 50 +train_fn.local_batch_size = 128 +train_fn.eval_batch_size = 128 + +train_fn.main_module = "SASRec" +train_fn.dropout_rate = 0.5 +train_fn.user_embedding_norm = "l2_norm" +train_fn.item_embedding_dim = 64 + +sasrec_encoder.num_blocks = 4 +sasrec_encoder.num_heads = 4 +sasrec_encoder.ffn_dropout_rate = 0.5 +sasrec_encoder.ffn_hidden_dim = 64 +sasrec_encoder.ffn_activation_fn = "relu" + +train_fn.eval_interval = 4000 +train_fn.num_epochs = 201 +train_fn.learning_rate = 1e-3 +train_fn.weight_decay = 0 +train_fn.num_warmup_steps = 0 + +train_fn.save_ckpt_every_n = 10 + +train_fn.interaction_module_type = "DotProduct" +train_fn.top_k_method = "MIPSBruteForceTopK" + +train_fn.loss_module = "SampledSoftmaxLoss" +train_fn.num_negatives = 512 + +train_fn.sampling_strategy = "local" +train_fn.temperature = 0.05 +train_fn.item_l2_norm = True +train_fn.l2_norm_eps = 1e-6 + +train_fn.enable_tf32 = True +train_fn.full_eval_every_n = 5 +train_fn.partial_eval_num_iters = 64 + +create_data_loader.prefetch_factor = 1024 +create_data_loader.num_workers = 8 diff --git 
a/recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-final.gin b/recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-final.gin new file mode 100644 index 000000000..841b1c80a --- /dev/null +++ b/recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-final.gin @@ -0,0 +1,45 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Frozen config, validated on 04/11/2024. +# Based on HSTU results (w/ identical configurations as a SotA Transformer baseline) in +# Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations (https://arxiv.org/abs/2402.17152). +# +# Run this as: +# mkdir -p logs/ml-1m-l200/ +# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-1m/hstu-sampled-softmax-n128-final.gin --master_port=12345 2>&1 | tee logs/ml-1m-l200/hstu-sampled-softmax-n128-final.log + +train_fn.dataset_name = "ml-1m" +train_fn.max_sequence_length = 200 +train_fn.local_batch_size = 128 + +train_fn.main_module = "HSTU" +train_fn.dropout_rate = 0.2 +train_fn.user_embedding_norm = "l2_norm" +train_fn.num_epochs = 101 +train_fn.item_embedding_dim = 50 + +hstu_encoder.num_blocks = 2 +hstu_encoder.num_heads = 1 +hstu_encoder.dqk = 50 +hstu_encoder.dv = 50 +hstu_encoder.linear_dropout_rate = 0.2 + +train_fn.learning_rate = 1e-3 +train_fn.weight_decay = 0 +train_fn.num_warmup_steps = 0 + +train_fn.interaction_module_type = "DotProduct" +train_fn.top_k_method = "MIPSBruteForceTopK" + +train_fn.loss_module = "SampledSoftmaxLoss" +train_fn.num_negatives = 128 + +train_fn.sampling_strategy = "local" +train_fn.temperature = 0.05 +train_fn.item_l2_norm = True +train_fn.l2_norm_eps = 1e-6 + +train_fn.enable_tf32 = True + +create_data_loader.prefetch_factor = 128 +create_data_loader.num_workers = 8 diff --git a/recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-large-final.gin b/recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-large-final.gin new file mode 100644 index 000000000..7ffc7ef64 --- 
/dev/null +++ b/recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-large-final.gin @@ -0,0 +1,45 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Frozen config, validated on 04/11/2024. +# Based on HSTU-large results in +# Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations (https://arxiv.org/abs/2402.17152). +# +# Run this as: +# mkdir -p logs/ml-1m-l200/ +# CUDA_VISIBLE_DEVICES=1 python3 main.py --gin_config_file=configs/ml-1m/hstu-sampled-softmax-n128-large-final.gin --master_port=12346 2>&1 | tee logs/ml-1m-l200/hstu-sampled-softmax-n128-large-final.log + +train_fn.dataset_name = "ml-1m" +train_fn.max_sequence_length = 200 +train_fn.local_batch_size = 128 + +train_fn.main_module = "HSTU" +train_fn.dropout_rate = 0.2 +train_fn.user_embedding_norm = "l2_norm" +train_fn.num_epochs = 101 +train_fn.item_embedding_dim = 50 + +hstu_encoder.num_blocks = 8 +hstu_encoder.num_heads = 2 +hstu_encoder.dqk = 25 +hstu_encoder.dv = 25 +hstu_encoder.linear_dropout_rate = 0.2 + +train_fn.learning_rate = 1e-3 +train_fn.weight_decay = 0 +train_fn.num_warmup_steps = 0 + +train_fn.interaction_module_type = "DotProduct" +train_fn.top_k_method = "MIPSBruteForceTopK" + +train_fn.loss_module = "SampledSoftmaxLoss" +train_fn.num_negatives = 128 + +train_fn.sampling_strategy = "local" +train_fn.temperature = 0.05 +train_fn.item_l2_norm = True +train_fn.l2_norm_eps = 1e-6 + +train_fn.enable_tf32 = True + +create_data_loader.prefetch_factor = 128 +create_data_loader.num_workers = 8 diff --git a/recommendation_v4/configs/ml-1m/sasrec-sampled-softmax-n128-final.gin b/recommendation_v4/configs/ml-1m/sasrec-sampled-softmax-n128-final.gin new file mode 100644 index 000000000..ead7bb21c --- /dev/null +++ b/recommendation_v4/configs/ml-1m/sasrec-sampled-softmax-n128-final.gin @@ -0,0 +1,44 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Frozen config, validated on 04/11/2024. 
+# Based on baseline settings in Revisiting Neural Retrieval on Accelerators (https://arxiv.org/abs/2306.04039, KDD'23). +# +# Run this as: +# mkdir -p logs/ml-1m-l200/ +# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-1m/sasrec-sampled-softmax-n128-final.gin --master_port=12345 2>&1 | tee logs/ml-1m-l200/sasrec-sampled-softmax-n128-final.log + +train_fn.dataset_name = "ml-1m" +train_fn.max_sequence_length = 200 +train_fn.local_batch_size = 128 + +train_fn.main_module = "SASRec" +train_fn.dropout_rate = 0.2 +train_fn.user_embedding_norm = "l2_norm" +train_fn.num_epochs = 101 +train_fn.item_embedding_dim = 50 + +sasrec_encoder.num_blocks = 2 +sasrec_encoder.num_heads = 1 +sasrec_encoder.ffn_dropout_rate = 0.2 +sasrec_encoder.ffn_hidden_dim = 50 +sasrec_encoder.ffn_activation_fn = "relu" + +train_fn.learning_rate = 1e-3 +train_fn.weight_decay = 0 +train_fn.num_warmup_steps = 0 + +train_fn.top_k_method = "MIPSBruteForceTopK" +train_fn.interaction_module_type = "DotProduct" + +train_fn.loss_module = "SampledSoftmaxLoss" +train_fn.num_negatives = 128 + +train_fn.sampling_strategy = "local" +train_fn.temperature = 0.05 +train_fn.item_l2_norm = True +train_fn.l2_norm_eps = 1e-6 + +train_fn.enable_tf32 = True + +create_data_loader.prefetch_factor = 128 +create_data_loader.num_workers = 8 diff --git a/recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-final.gin b/recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-final.gin new file mode 100644 index 000000000..5823ad5b6 --- /dev/null +++ b/recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-final.gin @@ -0,0 +1,45 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Frozen config, validated on 04/12/2024. +# Based on HSTU results (w/ identical configurations as a SotA Transformer baseline) in +# Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations (https://arxiv.org/abs/2402.17152). 
+# +# Run this as: +# mkdir -p logs/ml-20m-l200/ +# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-20m/hstu-sampled-softmax-n128-final.gin --master_port=12345 2>&1 | tee logs/ml-20m-l200/hstu-sampled-softmax-n128-final.log + +train_fn.dataset_name = "ml-20m" +train_fn.max_sequence_length = 200 +train_fn.local_batch_size = 128 + +train_fn.main_module = "HSTU" +train_fn.dropout_rate = 0.2 +train_fn.user_embedding_norm = "l2_norm" +train_fn.num_epochs = 101 +train_fn.item_embedding_dim = 256 + +hstu_encoder.num_blocks = 4 +hstu_encoder.num_heads = 4 +hstu_encoder.dv = 64 +hstu_encoder.dqk = 64 +hstu_encoder.linear_dropout_rate = 0.2 + +train_fn.learning_rate = 1e-3 +train_fn.weight_decay = 0 +train_fn.num_warmup_steps = 0 + +train_fn.interaction_module_type = "DotProduct" +train_fn.top_k_method = "MIPSBruteForceTopK" + +train_fn.loss_module = "SampledSoftmaxLoss" +train_fn.num_negatives = 128 + +train_fn.sampling_strategy = "local" +train_fn.temperature = 0.05 +train_fn.item_l2_norm = True +train_fn.l2_norm_eps = 1e-6 + +train_fn.enable_tf32 = True + +create_data_loader.prefetch_factor = 128 +create_data_loader.num_workers = 8 diff --git a/recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-large-final.gin b/recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-large-final.gin new file mode 100644 index 000000000..0199afa24 --- /dev/null +++ b/recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-large-final.gin @@ -0,0 +1,45 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Frozen config, validated on 04/12/2024. +# Based on HSTU-large results in +# Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations (https://arxiv.org/abs/2402.17152). 
+# +# Run this as: +# mkdir -p logs/ml-20m-l200/ +# CUDA_VISIBLE_DEVICES=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python3 main.py --gin_config_file=configs/ml-20m/hstu-sampled-softmax-n128-large-final.gin --master_port=12345 2>&1 | tee logs/ml-20m-l200/hstu-sampled-softmax-n128-large-final.log + +train_fn.dataset_name = "ml-20m" +train_fn.max_sequence_length = 200 +train_fn.local_batch_size = 128 + +train_fn.main_module = "HSTU" +train_fn.dropout_rate = 0.2 +train_fn.user_embedding_norm = "l2_norm" +train_fn.num_epochs = 101 +train_fn.item_embedding_dim = 256 + +hstu_encoder.num_blocks = 16 +hstu_encoder.num_heads = 8 +hstu_encoder.dv = 32 +hstu_encoder.dqk = 32 +hstu_encoder.linear_dropout_rate = 0.2 + +train_fn.learning_rate = 1e-3 +train_fn.weight_decay = 0 +train_fn.num_warmup_steps = 0 + +train_fn.interaction_module_type = "DotProduct" +train_fn.top_k_method = "MIPSBruteForceTopK" + +train_fn.loss_module = "SampledSoftmaxLoss" +train_fn.num_negatives = 128 + +train_fn.sampling_strategy = "local" +train_fn.temperature = 0.05 +train_fn.item_l2_norm = True +train_fn.l2_norm_eps = 1e-6 + +train_fn.enable_tf32 = True + +create_data_loader.prefetch_factor = 128 +create_data_loader.num_workers = 8 diff --git a/recommendation_v4/configs/ml-20m/sasrec-sampled-softmax-n128-final.gin b/recommendation_v4/configs/ml-20m/sasrec-sampled-softmax-n128-final.gin new file mode 100644 index 000000000..3c666f802 --- /dev/null +++ b/recommendation_v4/configs/ml-20m/sasrec-sampled-softmax-n128-final.gin @@ -0,0 +1,44 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Frozen config, validated on 04/12/2024. +# Based on baseline settings in Revisiting Neural Retrieval on Accelerators (https://arxiv.org/abs/2306.04039, KDD'23). 
+# +# Run this as: +# mkdir -p logs/ml-20m-l200/ +# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-20m/sasrec-sampled-softmax-n128-final.gin --master_port=12345 2>&1 | tee logs/ml-20m-l200/sasrec-sampled-softmax-n128-final.log + +train_fn.dataset_name = "ml-20m" +train_fn.max_sequence_length = 200 +train_fn.local_batch_size = 128 + +train_fn.main_module = "SASRec" +train_fn.dropout_rate = 0.2 +train_fn.user_embedding_norm = "l2_norm" +train_fn.num_epochs = 101 +train_fn.item_embedding_dim = 256 + +sasrec_encoder.num_blocks = 4 +sasrec_encoder.num_heads = 4 +sasrec_encoder.ffn_dropout_rate = 0.2 +sasrec_encoder.ffn_hidden_dim = 256 +sasrec_encoder.ffn_activation_fn = "relu" + +train_fn.learning_rate = 1e-3 +train_fn.weight_decay = 0 +train_fn.num_warmup_steps = 0 + +train_fn.top_k_method = "MIPSBruteForceTopK" +train_fn.interaction_module_type = "DotProduct" + +train_fn.loss_module = "SampledSoftmaxLoss" +train_fn.num_negatives = 128 + +train_fn.sampling_strategy = "local" +train_fn.temperature = 0.05 +train_fn.item_l2_norm = True +train_fn.l2_norm_eps = 1e-6 + +train_fn.enable_tf32 = True + +create_data_loader.prefetch_factor = 128 +create_data_loader.num_workers = 8 diff --git a/recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-final.gin b/recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-final.gin new file mode 100644 index 000000000..ac7a85350 --- /dev/null +++ b/recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-final.gin @@ -0,0 +1,42 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# +# Run this as: +# mkdir -p logs/ml-3b-l500/ +# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-final.gin --master_port=12345 2>&1 | tee logs/ml-3b-l500/hstu-sampled-softmax-n96-seqlen500-final.log + +train_fn.dataset_name = "ml-3b" +train_fn.max_sequence_length = 500 +train_fn.local_batch_size = 96 +train_fn.eval_batch_size = 96 + +train_fn.main_module = "HSTU" +train_fn.dropout_rate = 0.2 +train_fn.user_embedding_norm = "l2_norm" +train_fn.num_epochs = 100 +train_fn.item_embedding_dim = 256 + +hstu_encoder.num_blocks = 4 +hstu_encoder.num_heads = 4 +hstu_encoder.dv = 64 +hstu_encoder.dqk = 64 +hstu_encoder.linear_dropout_rate = 0.2 + +train_fn.learning_rate = 1e-3 +train_fn.weight_decay = 0 +train_fn.num_warmup_steps = 0 + +train_fn.interaction_module_type = "DotProduct" +train_fn.top_k_method = "MIPSBruteForceTopK" + +train_fn.loss_module = "SampledSoftmaxLoss" +train_fn.num_negatives = 128 + +train_fn.sampling_strategy = "local" +train_fn.temperature = 0.05 +train_fn.item_l2_norm = True +train_fn.l2_norm_eps = 1e-6 + +train_fn.enable_tf32 = True + +create_data_loader.prefetch_factor = 128 +create_data_loader.num_workers = 8 diff --git a/recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-large-final.gin b/recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-large-final.gin new file mode 100644 index 000000000..a30ad3657 --- /dev/null +++ b/recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-large-final.gin @@ -0,0 +1,42 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# +# Run this as: +# mkdir -p logs/ml-3b-l500/ +# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-large-final.gin --master_port=12345 2>&1 | tee logs/ml-3b-l500/hstu-sampled-softmax-n96-seqlen500-large-final.log + +train_fn.dataset_name = "ml-3b" +train_fn.max_sequence_length = 500 +train_fn.local_batch_size = 96 +train_fn.eval_batch_size = 96 + +train_fn.main_module = "HSTU" +train_fn.dropout_rate = 0.2 +train_fn.user_embedding_norm = "l2_norm" +train_fn.num_epochs = 100 +train_fn.item_embedding_dim = 256 + +hstu_encoder.num_blocks = 16 +hstu_encoder.num_heads = 8 +hstu_encoder.dv = 32 +hstu_encoder.dqk = 32 +hstu_encoder.linear_dropout_rate = 0.2 + +train_fn.learning_rate = 1e-3 +train_fn.weight_decay = 0 +train_fn.num_warmup_steps = 0 + +train_fn.interaction_module_type = "DotProduct" +train_fn.top_k_method = "MIPSBruteForceTopK" + +train_fn.loss_module = "SampledSoftmaxLoss" +train_fn.num_negatives = 128 + +train_fn.sampling_strategy = "local" +train_fn.temperature = 0.05 +train_fn.item_l2_norm = True +train_fn.l2_norm_eps = 1e-6 + +train_fn.enable_tf32 = True + +create_data_loader.prefetch_factor = 128 +create_data_loader.num_workers = 8 diff --git a/recommendation_v4/configs/ml-3b/sasrec-sampled-softmax-n96-seqlen500-final.gin b/recommendation_v4/configs/ml-3b/sasrec-sampled-softmax-n96-seqlen500-final.gin new file mode 100644 index 000000000..034c478b4 --- /dev/null +++ b/recommendation_v4/configs/ml-3b/sasrec-sampled-softmax-n96-seqlen500-final.gin @@ -0,0 +1,42 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# +# Run this as: +# mkdir -p logs/ml-3b-l500/ +# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-3b/sasrec-sampled-softmax-n96-seqlen500-final.gin --master_port=12345 2>&1 | tee logs/ml-3b-l500/sasrec-sampled-softmax-n96-seqlen500-final.log + +train_fn.dataset_name = "ml-3b" +train_fn.max_sequence_length = 500 +train_fn.local_batch_size = 96 +train_fn.eval_batch_size = 96 + +train_fn.main_module = "SASRec" +train_fn.dropout_rate = 0.2 +train_fn.user_embedding_norm = "l2_norm" +train_fn.num_epochs = 100 +train_fn.item_embedding_dim = 256 + +sasrec_encoder.num_blocks = 4 +sasrec_encoder.num_heads = 4 +sasrec_encoder.ffn_dropout_rate = 0.2 +sasrec_encoder.ffn_hidden_dim = 256 +sasrec_encoder.ffn_activation_fn = "relu" + +train_fn.learning_rate = 1e-3 +train_fn.weight_decay = 0 +train_fn.num_warmup_steps = 0 + +train_fn.top_k_method = "MIPSBruteForceTopK" +train_fn.interaction_module_type = "DotProduct" + +train_fn.loss_module = "SampledSoftmaxLoss" +train_fn.num_negatives = 128 + +train_fn.sampling_strategy = "local" +train_fn.temperature = 0.05 +train_fn.item_l2_norm = True +train_fn.l2_norm_eps = 1e-6 + +train_fn.enable_tf32 = True + +create_data_loader.prefetch_factor = 128 +create_data_loader.num_workers = 8 diff --git a/recommendation_v4/docs/multi_node_config.md b/recommendation_v4/docs/multi_node_config.md new file mode 100644 index 000000000..52fdbbb69 --- /dev/null +++ b/recommendation_v4/docs/multi_node_config.md @@ -0,0 +1,230 @@ +# Multi-Node Training Enablement (yambda-5b, MI350X / Broadcom bnxt_re RoCE) + +How N-node (N×8-GPU) distributed training was brought up for the yambda-5b HSTU +ranker on the `meta64` cv350 cluster, the hard problems solved, and **exactly +which settings are cluster/fabric-specific** so this can be reused or re-tuned +when the underlying network changes. + +Companion to [`perf_opt.md`](./perf_opt.md) and [`training_recipe.md`](./training_recipe.md). 
+The single entry point is [`scripts/launch_slurm.sh`](../scripts/launch_slurm.sh); +the Python side is `generative_recommenders/dlrm_v3/train/{train_ranker,utils}.py`. + +--- + +## TL;DR + +- Multi-node works over **real RDMA** (RoCEv2 on 8× Broadcom bnxt_re HCAs). + 2-node = `world_size=16`, clean `rc=0`, ~7.7–8.0k `global_sps` (≈1.28× of + 1-node 6.2k; weak scaling, per-rank batch fixed). +- The one non-obvious blocker was a **userspace RDMA provider ABI mismatch** + inside the container, fixed with an `LD_PRELOAD`/`LD_LIBRARY_PATH` **overlay** + of the host's matched `rdma-core` (no container lib surgery). +- Everything is one script with three auto-detected phases + (`orchestrate` → `provision` → `worker`) plus small Python changes for global + ranks. All cluster-specific knobs are env-overridable and tagged + `[CLUSTER-SPECIFIC]` in the script. + +--- + +## Architecture: one script, three phases + +`launch_slurm.sh` self-dispatches by context (`LAUNCH_SLURM_PHASE`, else +auto-detected via `/.dockerenv`): + +| Phase | Runs on | Does | +|---|---|---| +| `orchestrate` | SLURM batch host | Resolve rendezvous (`MASTER_ADDR/PORT`), ensure container on every node (calls `provision`), then `docker exec` the `worker` phase on every node (one srun task per node). | +| `provision` | each compute node (host) | Ensure the `yambda_primus` container is up (baked image if present, else base image + pip), stage the host RDMA overlay on NFS. | +| `worker` | inside the container | Derive topology, set NCCL/RDMA env, apply the RDMA overlay, spawn this node's 8 GPU ranks via `train_ranker`. `NNODES==1` => legacy single-node path unchanged. | + +Why one script: multi-node enablement is then a single committable file. The +worker phase is also what the streaming-e2e supervisor invokes directly +(single-node, already inside the container), so the production path is unchanged. 
+ +``` +sbatch --nodes=N launch_slurm.sh + │ (batch host: orchestrate) + ├─ srun: provision ──> docker container up + RDMA overlay staged (×N nodes) + └─ srun: docker exec launch_slurm.sh (worker) (×N nodes) + │ in container: topology + NCCL/RDMA env + LD overlay + └─ python train_ranker ──> 8 local ranks ──> RCCL rendezvous over RDMA +``` + +--- + +## The hard problems (lessons learned) + +### 1. RDMA provider ABI mismatch — the core blocker + +**Symptom:** multi-node RCCL died at init with +`ibv_create_qp ... Bad address`. + +**Root cause:** the container image (`rocm/primus:v26.3`) ships an **older** +userspace `rdma-core` (v34, `libbnxt_re-rdmav34.so`) than the **host kernel** +bnxt_re driver's uapi (host `rdma-core` v61 / `libbnxt_re-rdmav59.so`). The v34 +provider enumerates the HCAs and creates *shallow* QPs fine, but **faults when +creating a deep send queue** — RCCL uses `max_send_wr=256`. Verified with a +parameterized verbs probe: v34 `create_qp` is OK at depth ≤16 and faults at ≥64; +the host v59 provider works at **every** depth. So it is purely the **userspace +provider**, not the kernel or the fabric (a 2-node RoCEv2 RDMA-write test passes +on the stock stack, and bare-metal RCCL benchmarks run fine with the host libs). + +**Fix (no container surgery):** the `provision` phase stages the host's matched +`rdma-core` on shared NFS (`$OVERLAY`): + +``` +$OVERLAY/lib/libibverbs.so.1 # host libibverbs v61 +$OVERLAY/lib/libibverbs.so -> .so.1 # UNVERSIONED symlink (critical, see below) +$OVERLAY/lib/libnl-3.so.200, libnl-route-3.so.200 +$OVERLAY/lib/libibverbs/<provider>.so # incl. libbnxt_re-rdmav59.so +``` + +The `worker` phase makes RCCL load it at runtime: + +```bash +export LD_LIBRARY_PATH="$OVERLAY/lib:$OVERLAY/lib/libibverbs:$LD_LIBRARY_PATH" +export LD_PRELOAD="$OVERLAY/lib/libibverbs.so.1:$LD_PRELOAD" +``` + +We do **not** modify the container's system libs — only this process tree's +`LD_*`. Single-node and other users keep the stock stack. + +### 2.
The UNVERSIONED `libibverbs.so` symlink is mandatory + +An earlier overlay attempt set `LD_LIBRARY_PATH` but still failed with +`Bad address`. Reason: at `import torch` the ROCm stack pulls in the +**unversioned** soname `libibverbs.so` (not `libibverbs.so.1`). If the overlay +only has `libibverbs.so.1`, that unversioned lookup misses the overlay, falls +through to the **container's** old lib, which then occupies the `libibverbs.so.1` +slot — so RCCL's later `dlopen("libibverbs.so.1")` binds the v34 stack and +`create_qp(256)` faults again. The overlay **must** expose +`libibverbs.so -> libibverbs.so.1`. With it (verified via `/proc/<pid>/maps`), +the process maps **only** the host lib. `LD_PRELOAD` is belt-and-braces so the +host lib claims the soname slot first. + +### 3. Two network planes — pin TCP bootstrap, RDMA for data + +The container is `--network=host`, so RCCL sees **all** host interfaces and, left +to auto-detect, picks the wrong one. These nodes expose: +- `benic1p1..benic8p1` — per-GPU point-to-point RoCE links on `192.168.{1..8}.x/31`. + These are **not node-routable** for plain TCP; the very first bring-up **hung** + in `init_process_group` because RCCL tried the TCP bootstrap over a + non-routable `192.168.x` backend addr. +- `fenic0` — the routable front-end (`10.190.x`). + +So we split the planes explicitly: +- `NCCL_SOCKET_IFNAME=fenic0` → TCP bootstrap/rendezvous over the routable NIC. +- `NCCL_IB_HCA=bnxt_re0..7` → RDMA **data** over the 8 RoCE HCAs (the RoCEv2 + fabric *is* reachable rail-to-rail at the RDMA layer even though plain IP is not). + +### 4. Minimal proven bnxt_re NCCL config + +The minimal set proven on these nodes (matches cmcknigh's bare-metal RCCL +benchmarks): `NCCL_IB_GID_INDEX=3` (RoCEv2 IPv4 GID), `NCCL_IB_TC=104` (RoCE +lossless / PFC traffic class). **Do not** add the heavy +`QPS_PER_CONNECTION / ECE / DMABUF` block — that belongs to a different +(ionic AINIC) fabric and is counterproductive on bnxt_re.
GPU-Direct RDMA +(`NCCL_NET_GDR_LEVEL`) is left **off**: it needs DMABUF/peermem, unavailable +in-container here, so RCCL stages through host memory (still real RDMA). + +### 5. Rendezvous must be resolved on the host + +The container image has **no SLURM client** (`scontrol` absent). So the +`orchestrate` phase resolves `MASTER_ADDR` (first host of the allocation) and a +deterministic `MASTER_PORT` (`20000 + job_id % 20000`, same on all nodes) **on +the host** and forwards them into the container via `docker exec -e`. + +### 6. Global rank derivation (Python) + +`mp.start_processes` hands out a node-local `local_rank` (0..7). Every downstream +consumer (data sharding, checkpoint I/O, metrics) needs the **global** rank: + +```python +rank = node_rank * gpus_per_node + local_rank # train_ranker._main_func +device = torch.device(f"cuda:{local_rank}") # CUDA device stays node-local +``` + +Also: `make_optimizer_and_shard(local_world_size=gpus_per_node)` so the TorchRec +planner respects the intra-node GPU count, and `MetricsLogger(world_size=...)` +gets the live world size (the gin default of 8 would mis-normalize multi-node). +`NNODES==1` makes `rank == local_rank` — identical to the old single-node path. + +### 7. `$0` is the staged `slurm_script`, not the repo path + +For an sbatch batch script, `$0` = +`/var/spool/slurmd/job<jobid>/slurm_script` (node-local), so deriving the script / +repo path from `$0` gives a path that **doesn't exist on other nodes** (`bash +$SELF` → "No such file", and the worker's `cd $REPO` → exit 127). The +`orchestrate` phase instead resolves the real shared-NFS path from SLURM: + +```bash +SCRIPT_PATH=$(scontrol show job "$SLURM_JOB_ID" | grep -oP 'Command=\K\S+') +# fallbacks: $SLURM_SUBMIT_DIR/scripts/launch_slurm.sh, then $SELF +REPO=$(cd "$(dirname "$SCRIPT_PATH")/.." && pwd) +``` + +### 8. `srun ...
bash -c "…"` host-vs-remote expansion + +Inside the double-quoted srun command string, **plain `$VAR` expands now on the +batch host** (values computed in orchestrate: `$MASTER_ADDR`, `$SCRIPT_PATH`, …) +while **`\$VAR` is deferred to each compute node** (`\$SLURM_NODEID`, +`\$(hostname)`) where the per-node SLURM env lives. Mixing these up sends every +rank the wrong node id. + +### 9. `memlock` ulimit for QP registration + +`docker run --ulimit memlock=-1:-1` is **required** — RDMA QP memory +registration needs unlimited locked memory. A container started with the default +8 MB memlock fails QP creation regardless of the overlay. + +### 10. Provisioning & the image-bake caveat + +Fresh nodes otherwise re-download a **6.1 GB** ROCm torch wheel + pip + build +torchrec-from-git every time. The script supports a pre-baked image +(`docker commit` → NFS tar → `docker load` offline). **Caveat:** the committed +image is **~127 GB** (ROCm base is huge), so the full-image NFS tar is impractical +(loading it can be slower than re-downloading 6 GB). For true download-avoidance +prefer a **local pip wheelhouse** (`pip install --no-index --find-links` from +~8 GB of NFS wheels) or a **local registry** (ships only the ~35 GB delta layer). +The bake hook is left in (`BAKE_IMAGE=1`) but defaults off; provisioning falls +back to base-image + pip. + +### Debunked theory (do not re-introduce) + +An earlier claim that the container's rdma-core was "too old → 0 devices / +Bad address" and needed an **in-place lib copy** was a red herring: the "0 +devices" came from a *broken in-place copy* of the host EL9 libs (mixing v34 +tooling that links `IBVERBS_PRIVATE_34` with host v61 libs breaks symbol-version +lookup). The stock container enumerates all 8 HCAs fine. The real issue is only +the deep-QP create path; the fix is the **LD overlay**, never in-place surgery. 
+ +--- + +## Cluster-specific settings — change these when the fabric/hardware changes + +All are env-overridable and tagged `[CLUSTER-SPECIFIC]` in `launch_slurm.sh` +(`grep '\[CLUSTER-SPECIFIC\]' scripts/launch_slurm.sh`). + +| Setting | Default (meta64) | What it is | How to find the right value | +|---|---|---|---| +| `#SBATCH --partition` | `meta64` | scheduler partition | `sinfo` | +| bind mounts + default paths | `/home/chcai`, `/apps/chcai` | repo + scratch, **must be shared/NFS on all nodes** | `df -h`, cluster docs | +| `IMAGE` | `rocm/primus:v26.3` | base container (GPU arch + ROCm version) | vendor image registry | +| docker `--device` | `/dev/kfd /dev/dri` (AMD) | GPU passthrough | NVIDIA: `--gpus all` / nvidia runtime | +| `--ulimit memlock` | `-1` | locked mem for RDMA QP | keep `-1` for any RDMA fabric | +| `TORCH_IDX` / torch,vision,audio | `rocm7.2`, `2.12.0+rocm7.2` … | ROCm-version'd wheels | `download.pytorch.org/whl/` | +| `FBGEMM_WHL` | gfx950 wheel on NFS | GPU-arch fbgemm | build/stage per arch | +| `NCCL_SOCKET_IFNAME` | `fenic0` | **routable** host NIC for TCP bootstrap | `ip -br addr` (pick the routable one; NOT the per-GPU RDMA NICs) | +| `NCCL_IB_HCA` | `bnxt_re0..7` | RDMA HCA device names | `ibv_devices` (vendor: `mlx5_*`, `ionic_*`, …) | +| `NCCL_IB_GID_INDEX` | `3` | RoCEv2 IPv4 GID index | `show_gids` (v1/v2 & IPv4/IPv6 differ per port) | +| `NCCL_IB_TC` | `104` | RoCE lossless / PFC traffic class | fabric/switch admin | +| `RDMA_OVERLAY` (+ provider .so) | `/apps/chcai/rdma_host_el9_new` | host rdma-core overlay | only needed if container rdma-core < host kernel uapi; else set `RDMA_OVERLAY=` to disable. Stage the host's matching `/usr/lib64/libibverbs/<provider>.so` | + +**Different NIC vendor (e.g. 
Mellanox `mlx5`)** typically means: change +`NCCL_IB_HCA` names, re-check `NCCL_IB_GID_INDEX`/`NCCL_IB_TC`, and the RDMA +overlay is often **unnecessary** (Mellanox userspace in the image usually matches +the host) — set `RDMA_OVERLAY=` to skip it. + +**Emergency fallback:** `NCCL_NET_TRANSPORT=socket` disables IB and runs +allreduce over TCP (`fenic0`). Functional but ~100–200× slower; use only to +isolate a fabric problem. diff --git a/recommendation_v4/docs/perf_opt.md b/recommendation_v4/docs/perf_opt.md new file mode 100644 index 000000000..7799848ad --- /dev/null +++ b/recommendation_v4/docs/perf_opt.md @@ -0,0 +1,73 @@ +# Performance Optimizations — MI350X HSTU / OneTrans (yambda-5b, bs=1024, TRITON) + +Performance work for the 8× MI350X HSTU ranker on `yambda-5b` at `batch_size=1024` +with the **TRITON** HSTU kernel and bf16 training. Companion to +[`training_recipe.md`](./training_recipe.md) (environment + reproduction). + +Throughput numbers are global samples/sec across 8 GPUs (`global_sps`), measured +at steady state (instantaneous, computed from consecutive logged steps). + +--- + +## LN-dropout: multi-row, separated-RNG path on MI350 + +### What + +`_ln_mul_dropout_*` has two kernel variants: + +- **legacy** — single program per row, RNG fused inline (`_ln_mul_dropout_fwd`). +- **separated-RNG** — multiple rows per program, dropout mask precomputed once + and reused by the backward (`_ln_mul_dropout_fwd_rng` / + `_ln_mul_dropout_bwd_dx_du_rng`). + +The separated path was previously gated to Blackwell only (`is_sm100_plus()`). +MI350X (`gfx950`) benefits from the same structure, so the gate now also enables +it on MI350. 
+ +### Where + +| file | change | +|---|---| +| `ops/utils.py` | `is_amd_mi350()` (gfx950 detect) + `use_separated_rng_ln_mul_dropout()` gate | +| `ops/triton/triton_hstu_linear.py` | dispatch LN-dropout fwd to the separated-RNG path when the gate is true | + +```python +# ops/utils.py +def use_separated_rng_ln_mul_dropout() -> bool: + return is_sm100_plus() or is_amd_mi350() +``` + +### Perf + +**+5.6% end-to-end → 14,222 global sps** (separated-RNG vs legacy fused, identical +config, full boost clocks — see the caveat below). + +--- + +## Caveat — GPU clock lock can mask all perf changes + +A node-level GPU clock lock will silently invalidate any benchmark on this +machine, so check it before trusting numbers. + +During this work all 8 GPUs were stuck in **`perf_determinism`** performance +level at **sclk 1093 MHz** (DPM level 1) while the real max is **2200 MHz** +(level 2) — despite 100% utilization, ~370 W of power headroom (629 / 1000 W), +and low temps (~50 °C). This was **not** thermal/power throttling; it was +leftover node state from a prior job. + +Effect: a **uniform ~1.87× slowdown of every Triton compute kernel** +(`2200 / 1093 ≈ 2.0×`), including kernels unrelated to any code change. It made +the LN-dropout fix above look like a regression until the clock state was found. + +### Detect + fix + +```bash +rocm-smi --showperflevel # expect "auto", not perf_determinism/manual/low +rocm-smi -d 0 --showclocks # expect sclk ~2000+ MHz under load +rocm-smi --setperflevel auto # restore boost +``` + +`scripts/launch_slurm.sh` (worker phase) now logs the perf level + a live `sclk` sample on +every launch, auto-restores `auto` if it finds a `perf_determinism`/`manual`/`low` +lock, and warns (to reset from the host) if it lacks permission inside the +container. 
**Always sanity-check `sclk ≈ 2000+ MHz` before trusting a benchmark.** diff --git a/recommendation_v4/docs/training_recipe.md b/recommendation_v4/docs/training_recipe.md new file mode 100644 index 000000000..cf31a9fff --- /dev/null +++ b/recommendation_v4/docs/training_recipe.md @@ -0,0 +1,203 @@ +# Training Recipe + +Reproducible environment + configuration for training HSTU / DLRM-v3 on the +`yambda-5b` dataset. + +--- + +## MI350X + +Single-node, 8× AMD **Instinct MI350X** (`gfx950`, ~288 GiB HBM3e each), HSTU +ranker on `yambda-5b` with the **TRITON** HSTU kernel and **bf16** +mixed-precision training. + +### Hardware / host + +| item | value | +|---|---| +| GPUs | 8× AMD Instinct MI350X (`gfx950`, ROCm 7.2.1) | +| Host CPU | AMD EPYC 9655 96-Core (192 cores × 2 threads) | + +### Container image + +``` +rocm/primus:v26.3 +``` + +### Dependency versions + +Aligned with the B200 path: same torch major.minor, same torchrec commit, +same fbgemm SHA. The image's native torch / torchvision / torchaudio / +torchrec / fbgemm_gpu are all replaced; only the image's triton stays. + +| package | version | install | +|---|---|---| +| **torch** | `2.12.0+rocm7.2` | `pip install --upgrade --no-deps --index-url https://download.pytorch.org/whl/rocm7.2 torch==2.12.0+rocm7.2` | +| **torchvision** | `0.27.0+rocm7.2` | `pip install --upgrade --no-deps --index-url https://download.pytorch.org/whl/rocm7.2 torchvision` — ABI must match torch 2.12 | +| **torchaudio** | `2.11.0+rocm7.2` | `pip install --upgrade --no-deps --index-url https://download.pytorch.org/whl/rocm7.2 torchaudio` — ABI must match torch 2.12 | +| **triton** | `3.6.0` | image native, unchanged | +| **fbgemm_gpu** | `fbgemm_gpu_nightly_rocm-2026.6.2` (built from FBGEMM commit `10b77573`, same SHA as the B200 path) for `gfx950` | rebuild from source against the replaced torch. 
Build command: `python setup.py -j 32 bdist_wheel --build-target=default --build-variant=rocm -DHIP_ROOT_DIR=/opt/rocm -DAMDGPU_TARGETS=gfx950` | +| **torchrec** | `1.7.0a0+bf55480` (git tag `v2026.06.01.00`) | `pip install --force-reinstall --no-deps "git+https://github.com/pytorch/torchrec.git@v2026.06.01.00"` | +| **polars-u64-idx** | `1.33.1` | 64-bit row index — `yambda-5b` has > 4.29 B rows. Installed from a pre-staged local tarball by `scripts/launch_smoke_8gpu.sh` | + +### Training configuration + +From `generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin`: + +| parameter | value | gin binding | +|---|---|---| +| num_workers (dataloader) | 4 | `make_train_test_dataloaders.num_workers` | +| prefetch_factor | 8 | `make_train_test_dataloaders.prefetch_factor` | +| num_blocks | 1 | `make_train_test_dataloaders.num_blocks` | +| train_split_percentage | 0.90 | `make_train_test_dataloaders.train_split_percentage` | +| history_length (per-sample UIH budget) | 2039 | `get_dataset.history_length` | +| max_seq_len (attention budget) | 2048 | `get_hstu_configs.max_seq_len` | +| bf16 training | True | `make_model.bf16_training` | +| HBM cap (per GPU) | 260 GiB | `make_optimizer_and_shard.hbm_cap_gb` (env `HBM_CAP_GB`) | +| **triton autotune pinning** | **False (pinned)** | `apply_env_bootstrap.TRITON_FULL_AUTOTUNE` | +| dense optimizer | Adam, lr 1e-3, betas (0.95, 0.999), eps 1e-8 | `dense_optimizer_factory_and_class.*` | +| sparse optimizer | RowWiseAdagrad, lr 1e-3, betas (0.95, 0.999), eps 1e-8 | `sparse_optimizer_factory_and_class.*` | +| world_size | 8 | `MetricsLogger.world_size` | + +Effective global batch = `batch_size × world_size = 32 × 8 = 256` samples/step. 
+ +### Environment variables + +| var | value | purpose | +|---|---|---| +| `HSTU_HAMMER_KERNEL` | `TRITON` | fast HSTU kernel (vs `PYTORCH` fallback) | +| `DLRM_DATA_PATH` | dataset root | overrides gin default `/apps/chcai/dlrm_data` | +| `HBM_CAP_GB` | (optional) | embedding planner HBM budget per GPU | +| `RUN_NAME` | run id | results dir → `results/<RUN_NAME>/` | +| `PYTORCH_CUDA_ALLOC_CONF` | `expandable_segments:True` | allocator headroom | +| `HIP_VISIBLE_DEVICES` / `CUDA_VISIBLE_DEVICES` | `0,1,2,3,4,5,6,7` | rank visibility | + +`TRITON_FULL_AUTOTUNE` is set automatically by the gin-driven bootstrap +(`generative_recommenders.dlrm_v3.train._env_bootstrap.apply_env_bootstrap`), +which runs in `train_ranker._main_func` BEFORE the triton kernel modules +import — so the gin file is the source of truth. + +### Measured performance + +| variant | steady-state ms/step | global sps | epoch ETA (3.23B anchors) | +|---|---|---|---| +| nightly + fp32 + PYTORCH attn (baseline) | ~190 | ~1340 | ~28 d | +| nightly + bf16 + TRITON attn | ~93 | ~2787 | ~13.4 d | +| primus + bf16 + TRITON attn | ~67.5 | ~3793 | ~9.9 d | +| primus + fbgemm HEAD + bf16 + TRITON, autotune drift | ~53 fast / ~70 slow | 3700–4860 | 7.7–10.2 d | +| **primus + fbgemm HEAD + bf16 + TRITON + pinning (default)** | **~52** | **~4970** | **~7.6 d** | + +The "pinning" line is the deterministic per-cold-start equilibrium — +three layer-norm / jagged triton kernels have two stable autotune winners +and the pin forces the fast one every run. + +### Known pitfalls + +- The image ships `fbgemm_gpu==2026.5.14`. The wheel built from FBGEMM HEAD + (`2026.6.1`) is required for the 70 → 52 ms step. Build inside the + container so the wheel links against the image's native torch. +- Stock `polars` silently overflows on `yambda-5b` (> 4.29 B rows); always + use `polars-u64-idx`. 
+- When changing shape (batch size, history length), GPU, or triton/torch + version, flip `apply_env_bootstrap.TRITON_FULL_AUTOTUNE = True` and run + with `TRITON_PRINT_AUTOTUNING=1` to re-capture winners, then update the + pinned configs at the `pinned_or_full(...)` call sites in + `generative_recommenders/ops/triton/`. +- Do not run with bf16 on the `PYTORCH` HSTU attention backend at our + sequence length — `pt_hstu_attention`'s QK einsum backward overflows in + bf16 at N > 1k and produces NaN at step 1. bf16 is only safe with TRITON. + +--- + +## B200 + +Single-node, 8× NVIDIA **B200** (Blackwell, `sm_100`, ~183 GiB HBM each), HSTU +ranker on `yambda-5b` with the **TRITON** HSTU kernel and **bf16** mixed-precision +training. + +### Hardware / host + +| item | value | +|---|---| +| GPUs | 8× NVIDIA B200 (`sm_100`, compute capability 10.0) | +| Host driver | 580.159.03 (reports CUDA 13.0) | +| Forward-compat userspace driver | `libcuda.so.595.58.03` (CUDA 13.2.1; engaged automatically by the NGC image) | + +### Container image + +``` +nvcr.io/nvidia/pytorch:26.04-py3 +``` + +Digest: `sha256:192d749b4d773610ec9e01c0443a9df545d196c412b7b8fd33bfa3da362a49e7` + +The image's native PyTorch is kept as-is and must not be reinstalled (so CUPTI +stays matched to the driver and `sm_100` support is preserved). + +`nvcr.io/nvidia/pytorch:26.01-py3` (torch `2.10.0a0` / CUDA 13.1, digest +`sha256:38ed2ecb2c16d10677006d73fb0a150855d6ec81db8fc66e800b5ae92741007e`) is +also validated and performance-equivalent — rebuild `fbgemm_gpu` against +whichever image's torch you run. 
+ +### Dependency versions + +| package | version | notes | +|---|---|---| +| **torch** | `2.12.0a0+0291f960b6.nv26.04.48445190` (CUDA 13.2) | native to the image; not reinstalled | +| **triton** | `3.6.0` | native to the image; provides `triton.language.make_tensor_descriptor` (required by the TRITON HSTU path) | +| **fbgemm_gpu** | FBGEMM commit `10b775730212923f65f7b78f79b6a01d80cf3c29` (2026-06-01 `main`, CUDA 13.2, `sm_100`) | built from source against the native torch; public wheels are ABI-incompatible with the NGC torch. The built wheel is named `fbgemm_gpu_nightly-2026.6.1` — that version is the build date, not the source date, so always identify the build by the commit above. Build command: `TORCH_CUDA_ARCH_LIST=10.0 python setup.py bdist_wheel --build-target default --build-variant cuda --package_channel nightly --nvml_lib_path /usr/lib/x86_64-linux-gnu/libnvidia-ml.so` (~55 min — the `sm_100` TBE-forward kernels dominate via `ptxas`) | +| **torchrec** | `1.7.0.dev20260601+cu130` (nightly, tested) | installed `--no-deps` from `https://download.pytorch.org/whl/nightly/cu130`. Perf-neutral vs stable `1.4.0`; use `1.4.0` (latest stable) if you prefer a non-pre-release | +| **polars-u64-idx** | `1.33.1` | 64-bit row index — `yambda-5b` has > 4.29 B rows (overflows stock polars' 32-bit index) | +| CUPTI (for `torch.profiler`) | 13.2 (native) | matches the driver; the `+cu128` stack's CUPTI 12.8 fails on B200 (`CUPTI_ERROR_INVALID_DEVICE`) | + +Additional Python deps: +`xxhash`, `gin-config`, `absl-py`, `pandas`, `tensorboard`, `pyarrow`, `pyyaml`, +`tqdm`, `psutil`, `torchmetrics==1.0.3`, `tensordict`, `pyre-extensions`, +`iopath`, `typing-inspect`. 
+ +### Training configuration + +From `generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin`: + +| parameter | value | gin binding | +|---|---|---| +| batch_size (train) | 32 | `make_train_test_dataloaders.batch_size` | +| eval_batch_size | 32 | `make_train_test_dataloaders.eval_batch_size` | +| num_workers (dataloader) | 4 | `make_train_test_dataloaders.num_workers` | +| prefetch_factor | 8 | `make_train_test_dataloaders.prefetch_factor` | +| num_blocks | 1 | `make_train_test_dataloaders.num_blocks` | +| train_split_percentage | 0.90 | `make_train_test_dataloaders.train_split_percentage` | +| history_length (per-sample UIH budget) | 2039 | `get_dataset.history_length` | +| max_seq_len (attention budget) | 2048 | `get_hstu_configs.max_seq_len` | +| bf16 training | True | `make_model.bf16_training` | +| HBM cap (per GPU) | 150 GiB | `make_optimizer_and_shard.hbm_cap_gb` (env `HBM_CAP_GB`) | +| **triton autotune pinning** | **True (full autotune)** | `apply_env_bootstrap.TRITON_FULL_AUTOTUNE` — the pinned configs are MI350X-specific, so B200 runs full autotune to find its own `sm_100` winners | +| dense optimizer | Adam, lr 1e-3, betas (0.95, 0.999), eps 1e-8 | `dense_optimizer_factory_and_class.*` | +| sparse optimizer | RowWiseAdagrad, lr 1e-3, betas (0.95, 0.999), eps 1e-8 | `sparse_optimizer_factory_and_class.*` | +| world_size | 8 | `MetricsLogger.world_size` | + +Effective global batch = `batch_size × world_size = 32 × 8 = 256` samples/step. 
+ +### Environment variables + +| var | value | purpose | +|---|---|---| +| `HSTU_HAMMER_KERNEL` | `TRITON` | fast HSTU kernel (vs `PYTORCH` fallback) | +| `TORCH_CUDA_ARCH_LIST` | `10.0` | target `sm_100` for JIT / Triton compilation | +| `DLRM_DATA_PATH` | dataset root | overrides gin default `/apps/chcai/dlrm_data` | +| `HBM_CAP_GB` | `150` | embedding planner HBM budget per GPU | +| `RUN_NAME` | run id | results dir → `results//` | +| `PYTORCH_CUDA_ALLOC_CONF` | `expandable_segments:True` | allocator headroom | +| `TRITON_CACHE_DIR` | cache path | persist compiled Triton kernels across runs | +| `WORLD_SIZE` / `LOCAL_WORLD_SIZE` | `8` | mp.spawn rank count | + +### Known pitfalls + +- Never reinstall torch in this image — a cu12x wheel breaks CUPTI and may drop + `sm_100`. +- The `+cu128` stack (`torch==2.7.1+cu128` + `fbgemm-gpu==1.2.0+cu128` + + `torchrec==1.2.0+cu128`) runs on B200 but cannot profile GPU activity (CUPTI + 12.8 vs the 13.2 driver). +- Stock `polars` silently overflows on `yambda-5b` (> 4.29 B rows); always use + `polars-u64-idx`. +- `EmbeddingBoundsCheck ... Setting idx to zero` warnings are benign data clamps. diff --git a/recommendation_v4/download_dataset.sh b/recommendation_v4/download_dataset.sh new file mode 100755 index 000000000..d02f382dc --- /dev/null +++ b/recommendation_v4/download_dataset.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +# MLPerf Training reference script: download + preprocess the dataset. +# +# Downloads the Yambda dataset from HuggingFace (yandex/yambda) and runs the +# preprocessing pipeline (event-type encoding, temporal GTS split, session +# segmentation, item-popularity counts) into the on-disk layout that +# DLRMv3YambdaDataset consumes. This is a thin wrapper over +# generative_recommenders.dlrm_v3.preprocess_public_data +# so the full reference data pipeline lives in one place. 
+# +# Usage: +# DLRM_DATA_PATH=/path/to/dlrm_data ./download_dataset.sh +# DATASET=yambda-50m DLRM_DATA_PATH=/path/to/dlrm_data ./download_dataset.sh +# +# Env: +# DATASET dataset variant (default: yambda-5b). One of +# kuairand-1k | kuairand-27k | yambda-50m | yambda-500m | yambda-5b +# DLRM_DATA_PATH destination data root (required). +set -euo pipefail + +DATASET="${DATASET:-yambda-5b}" +: "${DLRM_DATA_PATH:?Set DLRM_DATA_PATH to the destination data root}" + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "${REPO_ROOT}" + +echo "[download_dataset] dataset=${DATASET} data-path=${DLRM_DATA_PATH}" +mkdir -p "${DLRM_DATA_PATH}" + +python3 -m generative_recommenders.dlrm_v3.preprocess_public_data \ + --dataset "${DATASET}" \ + --data-path "${DLRM_DATA_PATH}" + +echo "[download_dataset] done. Preprocessed layout under ${DLRM_DATA_PATH}:" +echo " raw/5b/multi_event.parquet" +echo " shared_metadata/{artist,album}_item_mapping.parquet, embeddings.parquet" +echo " processed_5b/{train_sessions,test_events,session_index}.parquet" +echo " processed_5b/item_popularity.npy, processed_5b/split_meta.json" +echo "[download_dataset] verify integrity with: ./verify_dataset.sh" diff --git a/recommendation_v4/generative_recommenders/README.md b/recommendation_v4/generative_recommenders/README.md new file mode 100644 index 000000000..e69de29bb diff --git a/recommendation_v4/generative_recommenders/common.py b/recommendation_v4/generative_recommenders/common.py new file mode 100644 index 000000000..2ff8edf80 --- /dev/null +++ b/recommendation_v4/generative_recommenders/common.py @@ -0,0 +1,513 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/usr/bin/env python3 + +# pyre-strict + +import abc +import copy +import os +from enum import Enum, unique +from typing import Any, Callable, List, Optional, Tuple + +import torch + +# @manual=//triton:triton +import triton +from generative_recommenders.ops.utils import is_sm100_plus, is_sm90_plus +from torch.fx._symbolic_trace import is_fx_tracing +from torch.utils._python_dispatch import _get_current_dispatch_mode_stack + +# @manual=//triton:triton +from triton.runtime.autotuner import Autotuner + +try: + torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops") + torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu") +except OSError: + pass + +try: + # @manual=//triton:triton + import triton.language.extra.tlx # type: ignore + + HAS_TLX = True +except ImportError: + HAS_TLX = False + +try: + from generative_recommenders.fb.triton_cc.utils import triton_cc + from hammer.ops.triton.utils import triton_autotune + from hammer.utils import is_dev_mode, set_dev_mode, set_verbose_level +except ImportError: + # pyre-ignore + def triton_cc(annotations): + # pyre-ignore + def decorator(fn): + return fn + + return decorator + + # pyre-ignore + def triton_autotune( + configs: List[triton.Config], + key: List[str], + # pyre-ignore + prune_configs_by=None, + # pyre-ignore + reset_to_zero=None, + # pyre-ignore + restore_value=None, + warmup: int = 25, + rep: int = 100, + ): + # pyre-ignore + def decorator(fn): + return Autotuner( + fn, + fn.arg_names, + configs, + key, + reset_to_zero, + restore_value, + pre_hook=None, + 
post_hook=None, + prune_configs_by=prune_configs_by, + warmup=warmup, + rep=rep, + ) + + return decorator + + DEV_MODE: bool = False + VERBOSE_LEVEL: int = 0 + + def set_dev_mode(val: bool) -> None: + global DEV_MODE + DEV_MODE = val + + def is_dev_mode() -> bool: + global DEV_MODE # noqa: F824 + return DEV_MODE + + def set_verbose_level(level: int) -> None: + global VERBOSE_LEVEL + VERBOSE_LEVEL = level + + def get_verbose_level() -> int: + global VERBOSE_LEVEL # noqa: F824 + return VERBOSE_LEVEL + + +@unique +class HammerKernel(Enum): + TRITON = "TRITON" + TLX = "TLX" + PYTORCH = "PYTORCH" + CUDA = "CUDA" + TRITON_CC = "TRITON_CC" + TRITON_INFERENCE = "TRITON_INFERENCE" + CUTEDSL = "CUTEDSL" + + +class HammerModule(torch.nn.Module, abc.ABC): + _is_inference: bool = False + _use_triton_cc: bool = True + _training_dtype: torch.dtype = torch.float32 + _hammer_kernel: Optional[HammerKernel] = None + + def __init__( + self, + is_inference: bool, + training_dytpe: torch.dtype = torch.float32, + use_triton_cc: bool = _use_triton_cc, + hammer_kernel: Optional[HammerKernel] = None, + ) -> None: + super().__init__() + self._is_inference = is_inference + self._training_dtype = training_dytpe + self._hammer_kernel = hammer_kernel + self._use_triton_cc = use_triton_cc + + def hammer_kernel(self) -> HammerKernel: + kernel = self._hammer_kernel + if kernel is not None: + return kernel + if self._is_inference and self._use_triton_cc: + return HammerKernel.TRITON_CC + else: + return HammerKernel.TRITON + + # pyre-ignore[2] + def recursive_setattr(self, name: str, value: Any) -> None: + for _, module in self.named_modules(): + if hasattr(module, name): + setattr(module, name, value) + + def set_use_triton_cc(self, use_triton_cc: bool) -> None: + self._use_triton_cc = use_triton_cc + self.recursive_setattr("_use_triton_cc", use_triton_cc) + + def set_is_inference(self, is_inference: bool) -> None: + self._is_inference = is_inference + self.recursive_setattr("_is_inference", 
is_inference) + + def set_training_dtype(self, training_dtype: torch.dtype) -> None: + self._training_dtype = training_dtype + self.recursive_setattr("_training_dtype", training_dtype) + + def set_hammer_kernel(self, hammer_kernel: HammerKernel) -> None: + self._hammer_kernel = hammer_kernel + self.recursive_setattr("_hammer_kernel", hammer_kernel) + + @property + def is_inference(self) -> bool: + return self._is_inference + + @property + def is_eval(self) -> bool: + return (not self._is_inference) and (not self.training) + + @property + def is_train(self) -> bool: + return (not self._is_inference) and self.training + + +def generate_sparse_seq_len( + size: int, + max_seq_len: int, + sparsity: float, + device: torch.device, +) -> torch.Tensor: + if sparsity == 0.0: + return torch.zeros(size=(size,), device=device, dtype=torch.int) + elif sparsity == 1.0: + return torch.ones(size=(size,), device=device, dtype=torch.int) * max_seq_len + elif sparsity >= 0.5: + min_seq_len: int = int((2 * sparsity - 1.0) * max_seq_len) + return torch.randint( + low=min_seq_len, + high=max_seq_len, + size=(size,), + device=device, + dtype=torch.int, + ) + else: + min_seq_len: int = 0 + max_seq_len: int = int(2 * sparsity * max_seq_len) + return torch.randint( + low=min_seq_len, + high=max_seq_len, + size=(size,), + device=device, + dtype=torch.int, + ) + + +def apply_sampling( + lengths: torch.Tensor, + alpha: float, + max_seq_len: int, +) -> torch.Tensor: + threshold = int(max_seq_len ** (alpha / 2)) + no_sample_prob = (max_seq_len**alpha) / torch.pow(lengths, 2) + users_to_sample = torch.logical_and( + lengths > threshold, + torch.rand_like(no_sample_prob) < 1 - no_sample_prob, + ) + lengths = torch.where(users_to_sample, threshold, lengths) + return lengths + + +nv_gpu_unavailable: Tuple[bool, str] = ( + not torch.cuda.is_available() or torch.cuda.device_count() == 0, + "CUDA is not available or no GPUs detected", +) +nv_gpu_available: bool = not nv_gpu_unavailable[0] + + 
+amd_gpu_unavailable: Tuple[bool, str] = ( + not torch.version.hip, + "AMD HIP not available or no GPUs detected", +) +amd_gpu_available: bool = not amd_gpu_unavailable[0] + +gpu_unavailable: Tuple[bool, str] = ( + not nv_gpu_available and not amd_gpu_available, + "CUDA/HIP is not available or no GPUs detected", +) + +gpu_available: bool = not gpu_unavailable[0] + +blackwell_tlx_unavailable: Tuple[bool, str] = ( + not is_sm100_plus() or not HAS_TLX, + "Skip TLX and blackwell only tests", +) + +tma_unavailable: Tuple[bool, str] = ( + not is_sm90_plus(), # noqa + "Skip TMA only tests", +) + + +def switch_to_contiguous_if_needed(x: torch.Tensor) -> torch.Tensor: + if torch.jit.is_scripting(): + if x.stride(-1) == 1: + return x + return x.contiguous() + if torch.compiler.is_compiling(): + # Tell Dynamo this data-dependent value is in the range (0, 10**9) + torch._check(x.size(0) > 0) + torch._check(x.size(0) < 10**9) + # FX cannot trace Python control flow over symbolic stride checks + # (`x.stride(-1) == 1`). For AOT-T lowering, conservatively emit the + # contiguous op instead of branching on a symbolic value. 
+ if is_fx_tracing(): + return x.contiguous() + if x.stride(-1) == 1: + return x + return x.contiguous() + + +def cdiv(x: int, y: int) -> int: + return (x + y - 1) // y + + +def backend_allow_tf32() -> bool: + return True + + +BACKEND_ALLOW_TF32: bool = backend_allow_tf32() + + +def next_power_of_2(n: int) -> int: + """Return the smallest power of 2 greater than or equal to n""" + n -= 1 + n |= n >> 1 + n |= n >> 2 + n |= n >> 4 + n |= n >> 8 + n |= n >> 16 + n |= n >> 32 + n += 1 + return n + + +def _prev_power_of_2_bitwise(x: int) -> int: + """Return the largest power of 2 less than or equal to x.""" + x |= x >> 1 + x |= x >> 2 + x |= x >> 4 + x |= x >> 8 + x |= x >> 16 + x |= x >> 32 + return (x >> 1) + 1 + + +@torch.fx.wrap +def _prev_power_of_2_legacy(x: int) -> int: + if torch.compiler.is_compiling(): + # Re-write to make Dynamo happy + x_tensor = torch.scalar_tensor(x, dtype=torch.int64) # type: ignore[arg-type] + x_tensor_orig = x_tensor.clone() + out_val = next_power_of_2(int(x_tensor.item())) # type: ignore[arg-type] + out = torch.scalar_tensor(out_val, dtype=torch.int64) + return int(torch.where(torch.lt(x_tensor_orig, out), out // 2, out).item()) # type: ignore[return-value] + else: + out = next_power_of_2(x) + return out // 2 if out > x else out + + +prev_power_of_2: Callable[[int], int] = ( + _prev_power_of_2_legacy + if os.environ.get("PREV_POWER_OF_2_IMPL", "legacy") == "legacy" + else _prev_power_of_2_bitwise +) + + +STATIC_MAX_SEQ_LENS: List[int] = [] +USE_RUNTIME_MAX_SEQ_LEN: bool = False + + +def set_static_max_seq_lens(max_seq_lens: List[int]) -> None: + global STATIC_MAX_SEQ_LENS + STATIC_MAX_SEQ_LENS = copy.deepcopy(max_seq_lens) + STATIC_MAX_SEQ_LENS.sort() + + +def set_use_runtime_max_seq_len(use_runtime_max_seq_len: bool) -> None: + global USE_RUNTIME_MAX_SEQ_LEN + USE_RUNTIME_MAX_SEQ_LEN = use_runtime_max_seq_len + + +def autotune_max_seq_len(runtime_max_seq_len: int) -> int: + global USE_RUNTIME_MAX_SEQ_LEN # noqa: F824 + + if 
USE_RUNTIME_MAX_SEQ_LEN: + return prev_power_of_2(runtime_max_seq_len) + else: + if STATIC_MAX_SEQ_LENS == []: + return 1 + for max_len in STATIC_MAX_SEQ_LENS: + if max_len >= runtime_max_seq_len: + return max_len + return STATIC_MAX_SEQ_LENS[-1] + + +def fine_grained_autotune_max_seq_len(runtime_max_seq_len: int) -> int: + global USE_RUNTIME_MAX_SEQ_LEN # noqa: F824 + + if USE_RUNTIME_MAX_SEQ_LEN: + return _fine_grained_bucket_size(runtime_max_seq_len) + else: + if STATIC_MAX_SEQ_LENS == []: + return 1 + for max_len in STATIC_MAX_SEQ_LENS: + if max_len >= runtime_max_seq_len: + return max_len + return STATIC_MAX_SEQ_LENS[-1] + + +def _generate_fine_grained_buckets() -> List[int]: + buckets = [ + 1024, + 2048, + 4096, + 8192, + 12288, + 16384, + 24576, + 32768, + 40960, + 49152, + 65536, + 81920, + 98304, + ] + return buckets + + +@torch.fx.wrap +def _fine_grained_bucket_size(x: int) -> int: + if torch.compiler.is_compiling(): + x_tensor = torch.scalar_tensor(x, dtype=torch.int64) + buckets = torch.tensor(_generate_fine_grained_buckets(), dtype=torch.int64) + + mask = buckets >= x_tensor + valid_buckets = torch.where( + mask, buckets, torch.tensor(2**31 - 1, dtype=torch.int64) + ) + + result = torch.where(mask.any(), valid_buckets.min(), buckets[-1]) + + return int(result.item()) + else: + buckets = _generate_fine_grained_buckets() + + for bucket in buckets: + if x <= bucket: + return bucket + + return buckets[-1] + + +@torch.fx.wrap +def fx_unwrap_optional_tensor(optional: Optional[torch.Tensor]) -> torch.Tensor: + assert optional is not None, "Expected optional to be non-None Tensor" + return optional + + +@torch.fx.wrap +def fx_arange(len: int, device: torch.device) -> torch.Tensor: + return torch.arange(len, device=device) + + +@torch.fx.wrap +def fx_infer_max_len( + lengths: torch.Tensor, +) -> int: + # Do not call ".item()" to avoid unbacked symint problems for lowering + max_len = int(lengths.max()) + if not torch.jit.is_scripting() and 
torch.compiler.is_compiling(): + # Tell Dynamo this data-dependent value is in the range [0, 10**9) + torch._check_is_size(max_len) + torch._check(max_len < 10**9) + torch._check(max_len > 0) + return max_len + + +@torch.fx.wrap +def fx_mark_length_features(tensor: torch.Tensor) -> torch.Tensor: + return tensor + + +@torch.fx.wrap +def fx_torch_ones( + shape: List[int], + device: torch.device, + dtype: torch.dtype, +) -> torch.Tensor: + return torch.ones(shape, device=device, dtype=dtype) + + +@torch.fx.wrap +def fx_torch_zeros(shape: List[int], device: torch.device) -> torch.Tensor: + return torch.zeros(shape, device=device) + + +def _is_in_dispatch_modes(mode_names: List[str]) -> bool: + modes = _get_current_dispatch_mode_stack() + return any(mode.__class__.__name__ in mode_names for mode in modes) + + +def should_trigger_eager_impl() -> bool: + if torch.jit.is_scripting(): + return True + if torch.compiler.is_compiling(): + return False + return _is_in_dispatch_modes(["SplitDispatchMode", "FakeTensorMode"]) + + +@torch.fx.wrap +def jagged_to_padded_dense( + values: torch.Tensor, + offsets: List[torch.Tensor], + max_lengths: List[int], + padding_value: float, +) -> torch.Tensor: + return torch.ops.fbgemm.jagged_to_padded_dense( + values=values, + offsets=offsets, + max_lengths=max_lengths, + padding_value=padding_value, + ) + + +@torch.fx.wrap +def dense_to_jagged( + dense: torch.Tensor, + x_offsets: List[torch.Tensor], +) -> torch.Tensor: + return torch.ops.fbgemm.dense_to_jagged( + dense=dense, + x_offsets=x_offsets, + )[0] + + +def init_mlp_weights_optional_bias(m: torch.nn.Module) -> None: + if isinstance(m, torch.nn.Linear): + torch.nn.init.xavier_uniform_(m.weight) + if m.bias is not None: + m.bias.data.fill_(0.0) diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py b/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py new file mode 100644 index 000000000..46cc10e2e --- /dev/null +++ 
b/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py @@ -0,0 +1,680 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pyre-strict +""" +Checkpoint utilities for saving and loading DLRMv3 model checkpoints. + +This module provides functions for saving and loading distributed model checkpoints, +including both sparse (embedding) and dense (non-embedding) components. +""" + +import gc +import logging +import os +import random +import shutil +import time +from datetime import datetime +from typing import Any, Dict, Optional, Set, Tuple + +import gin +import numpy as np +import torch +from generative_recommenders.dlrm_v3.utils import ( + BinnedCumulativeAUC, + LifetimeAUCMetricComputation, + MetricsLogger, +) +from torch.distributed.checkpoint.stateful import Stateful +from torch.optim.optimizer import Optimizer +from torchrec.distributed.types import ShardedTensor + +logger: logging.Logger = logging.getLogger(__name__) + +# Sentinel meaning "the saved window completed in full" — when the loop reads +# this back it advances start_ts past the saved train_ts. Anything >=0 means the +# saved checkpoint stopped mid-window after K batches; resume continues that +# window at batch K. +WINDOW_COMPLETE: int = -1 + +# Filename (per-rank) holding the lifetime-AUC trailing buffers, mirroring the +# rng_rank{rank}.pt pattern. 
The buffers are per-rank-local, so a single +# rank-0 copy in non_sparse.ckpt would (wrongly) restore 1/world_size of the +# true history to every rank — hence a dedicated per-rank artifact. +METRICBUF_FILE_FMT: str = "metricbuf_rank{rank}.pt" + + +def _metric_blob_state_dict(m: torch.nn.Module) -> Dict[str, Any]: + """State dict for the shared (rank-0) non_sparse.ckpt metric blob. + + Both lifetime-AUC backends carry per-rank-local state that is persisted + authoritatively per-rank in ``metricbuf_rank{rank}.pt``; we must keep it out + of the shared blob so a rank's load doesn't inherit rank-0's counts: + + - ``LifetimeAUCMetricComputation``: drop the explicitly-serialized trailing + buffer keys (the rest of the blob keys are the parent's persistent state). + - ``BinnedCumulativeAUC``: zero the histogram buffers (they are persistent so + the keys must remain for a strict load, but the values are neutralized). + + All other metrics serialize normally. In both cases the per-rank file is + loaded afterward and is authoritative. + """ + sd = m.state_dict() + if isinstance(m, LifetimeAUCMetricComputation): + prefix = LifetimeAUCMetricComputation._LIFETIME_KEY_PREFIX + sd = {k: v for k, v in sd.items() if not k.startswith(prefix)} + elif isinstance(m, BinnedCumulativeAUC): + sd = { + k: (torch.zeros_like(v) if torch.is_tensor(v) else v) + for k, v in sd.items() + } + return sd + + +def _collect_perrank_metric_state( + metric_logger: "MetricsLogger", +) -> Dict[str, Dict[str, Any]]: + """Map "||" -> state_dict for every metric whose + cumulative state is per-rank-local and must be restored per-rank: + + - lifetime-AUC instances (`LifetimeAUCMetricComputation` trailing buffer, or + `BinnedCumulativeAUC` histograms) in class_metrics train/eval. Covers the + train lifetime AUC and, in legacy single-set eval, the eval lifetime AUC, + under either configured backend. 
+ - the ENTIRE cumulative eval set (`eval_cum`, both class + regression) used + by the streaming dual-set eval: the lifetime-AUC backend state plus the + persistent cumulative scalar sums of NE/Accuracy/GAUC/MSE/MAE. + + Selected by structure/isinstance (not a hard index) since metric positions + depend on the configured tasks/mode. + """ + out: Dict[str, Dict[str, Any]] = {} + for mode in ("train", "eval"): + for idx, m in enumerate(metric_logger.class_metrics.get(mode, [])): + if isinstance(m, (LifetimeAUCMetricComputation, BinnedCumulativeAUC)): + out[f"class_metrics|{mode}|{idx}"] = m.state_dict() + for coll in ("class_metrics", "regression_metrics"): + for idx, m in enumerate(getattr(metric_logger, coll).get("eval_cum", [])): + out[f"{coll}|eval_cum|{idx}"] = m.state_dict() + return out + + +def _restore_perrank_metric_state( + metric_logger: "MetricsLogger", state: Dict[str, Dict[str, Any]] +) -> None: + for key, sd in state.items(): + coll, mode, idx_str = key.split("|") + getattr(metric_logger, coll)[mode][int(idx_str)].load_state_dict(sd) + + +def _perrank_sample_counts(metric_logger: "MetricsLogger") -> Dict[str, int]: + out: Dict[str, int] = {} + + def _count(m: torch.nn.Module) -> Optional[int]: + if isinstance(m, LifetimeAUCMetricComputation): + return m.lifetime_sample_count() + if isinstance(m, BinnedCumulativeAUC): + return m.cumulative_sample_count() + return None + + for mode in ("train", "eval", "eval_cum"): + for idx, m in enumerate(metric_logger.class_metrics.get(mode, [])): + n = _count(m) + if n is not None: + out[f"class|{mode}|{idx}"] = n + return out + + +class SparseState(Stateful): + """ + Stateful wrapper for sparse (embedding) tensors in a model. + + This class implements the Stateful interface for distributed checkpointing, + allowing sparse tensors to be saved and loaded separately from dense tensors. + + Args: + model: The PyTorch model containing sparse tensors. 
+ sparse_tensor_keys: Set of keys identifying sparse tensors in the model's state dict. + """ + + def __init__(self, model: torch.nn.Module, sparse_tensor_keys: Set[str]) -> None: + self.model = model + self.sparse_tensor_keys = sparse_tensor_keys + + def state_dict(self) -> Dict[str, torch.Tensor]: + out_dict: Dict[str, torch.Tensor] = {} + is_sharded_tensor: Optional[bool] = None + for k, v in self.model.state_dict().items(): + if k in self.sparse_tensor_keys: + if is_sharded_tensor is None: + is_sharded_tensor = isinstance(v, ShardedTensor) + assert is_sharded_tensor == isinstance(v, ShardedTensor) + out_dict[k] = v + return out_dict + + def load_state_dict(self, state_dict: Dict[str, torch.Tensor]) -> None: + incompatible_keys = self.model.load_state_dict(state_dict, strict=False) + assert not incompatible_keys.unexpected_keys + + +def is_sparse_key(k: str, v: torch.Tensor) -> bool: + return isinstance(v, ShardedTensor) or "embedding_collection" in k + + +def load_dense_state_dict(model: torch.nn.Module, state_dict: Dict[str, Any]) -> None: + own_state = model.state_dict() + own_state_dense_keys = {k for k, v in own_state.items() if not is_sparse_key(k, v)} + state_dict_dense_keys = { + k for k, v in state_dict.items() if not is_sparse_key(k, v) + } + assert own_state_dense_keys == state_dict_dense_keys, ( + f"expects {own_state_dense_keys} but gets {state_dict_dense_keys}" + ) + for name in state_dict_dense_keys: + param = state_dict[name] + if isinstance(param, torch.nn.Parameter): + # backwards compatibility for serialized parameters + param = param.data + own_state[name].copy_(param) + + +def _rng_state(device: torch.device) -> Dict[str, Any]: + """Snapshot every RNG source bit-equal training depends on. + + HSTU has stochastic dropout (input_dropout=0.2, linear_dropout_rate=0.1) + consuming the per-device CUDA RNG cycle each step. 
Without round-tripping + these, a resumed run draws different dropout masks and the resumed AUC + trajectory diverges from the uninterrupted run within a few steps. + """ + return { + "cpu": torch.get_rng_state(), + "cuda": torch.cuda.get_rng_state(device), + "numpy": np.random.get_state(), + "random": random.getstate(), + } + + +def _restore_rng_state(state: Dict[str, Any], device: torch.device) -> None: + torch.set_rng_state(state["cpu"]) + torch.cuda.set_rng_state(state["cuda"], device) + np.random.set_state(state["numpy"]) + random.setstate(state["random"]) + + +def _list_numeric_subdirs(base_path: str) -> list[str]: + """Return subdir names of `base_path` that look like an int, sorted ascending. + + Filters out `*.tmp` (orphaned in-progress saves), `*.sparse/` and any other + non-numeric entries. + """ + if not os.path.isdir(base_path): + return [] + out: list[str] = [] + for name in os.listdir(base_path): + if name.isdigit(): + out.append(name) + return sorted(out, key=int) + + +def _resolve_latest_subdir(path: str) -> str: + """Map a base ckpt dir → its highest-numbered numeric subdir. + + Used so users can set `load_dmp_checkpoint.path = ""` (or + `CKPT_PATH=`) and automatically pick up the most recent save without + needing to know which step number to point at. If `path` already names a leaf save (numeric basename) it's returned + unchanged. If the base dir has no numeric subdirs yet — the cold-start case + where ``CKPT_PATH`` is configured but nothing has been saved (e.g. the + interrupt phase of the resume test starts from a freshly-cleaned dir) — we + return ``""`` so ``load_*_checkpoint`` no-ops instead of asserting on a + missing ``sparse/.metadata``. 
+ """ + if not path: + return path + base = path.rstrip("/") + leaf = os.path.basename(base) + if leaf.isdigit(): + return base # already a leaf, caller knows what it wants + subs = _list_numeric_subdirs(base) + if not subs: + logger.info("No checkpoint subdirs under %s — cold start (no load).", base) + return "" # nothing to load → load_*_checkpoint short-circuits + resolved = os.path.join(base, subs[-1]) + logger.info("Auto-latest checkpoint: %s → %s", base, resolved) + return resolved + + +def _prune_old_checkpoints(base_path: str, keep_last_n: int, just_saved_subdir: str) -> None: + """Delete numeric subdirs older than the keep_last_n most recent. + + Defensive: never prune `just_saved_subdir` even if it would be evicted by + the keep_last_n window (shouldn't happen since we just wrote it, but + catches off-by-one bugs). Skipped entirely when keep_last_n<=0. + """ + if keep_last_n <= 0: + return + subs = _list_numeric_subdirs(base_path) + if len(subs) <= keep_last_n: + return + to_prune = subs[:-keep_last_n] + for name in to_prune: + full = os.path.join(base_path, name) + if os.path.realpath(full) == os.path.realpath(just_saved_subdir): + continue + try: + shutil.rmtree(full) + logger.info("Pruned old checkpoint: %s", full) + except OSError as e: + logger.warning("Failed to prune %s: %s", full, e) + + +def _cleanup_stale_tmps(base_path: str) -> None: + """Remove `*.tmp`/`*.old` subdirs left by a crashed prior save attempt. + + `*.tmp` = an interrupted write; `*.old` = an interrupted atomic-overwrite + swap (see the promotion step in save_dmp_checkpoint). Both are non-numeric + so `_resolve_latest_subdir` already ignores them; this just reclaims disk. 
+ """ + if not os.path.isdir(base_path): + return + for name in os.listdir(base_path): + if name.endswith(".tmp") or name.endswith(".old"): + full = os.path.join(base_path, name) + try: + shutil.rmtree(full) + logger.warning("Removed stale checkpoint dir: %s", full) + except OSError as e: + logger.warning("Failed to remove stale dir %s: %s", full, e) + + +@gin.configurable +def save_dmp_checkpoint( + model: torch.nn.Module, + optimizer: Optimizer, + metric_logger: MetricsLogger, + rank: int, + batch_idx: int, + path: str = "", + keep_last_n: int = 1, + train_ts: Optional[int] = None, + batch_idx_in_window: int = WINDOW_COMPLETE, + device: Optional[torch.device] = None, + split_contract: Optional[Dict[str, Any]] = None, +) -> None: + """ + Save a distributed model checkpoint including sparse and dense components. + + Writes into a per-rank-coordinated atomic layout: + /.tmp/ ← directory written into during save + // ← atomically renamed from .tmp on success + + A crash mid-save leaves the `.tmp/` orphan, which `_cleanup_stale_tmps` + sweeps on the next save attempt and which `_resolve_latest_subdir` ignores + (non-numeric basename). The previous successful `/` remains valid. + + Args: + model: The model to checkpoint. + optimizer: The optimizer whose state should be saved. + metric_logger: The metrics logger containing training/eval metrics. + rank: The current process rank in distributed training. + batch_idx: Subdir name (for streaming we set this == train_ts so the + on-disk layout monotonically increases). + path: Base path for saving the checkpoint. If empty, no checkpoint is saved. + keep_last_n: Number of most-recent numeric subdirs to retain after a + successful save. Set 1 (default) for disk-bounded long runs; + <=0 disables pruning. + train_ts: For streaming-train-eval, the current train timestamp. + Stored in non_sparse.ckpt so resume knows which window to enter. + batch_idx_in_window: For streaming-train-eval, batches completed within + train_ts. 
WINDOW_COMPLETE (-1) means the window finished; resume + advances to train_ts+1. >=0 means crash happened mid-window; resume + re-enters train_ts at batch_idx_in_window. + device: CUDA device for the per-rank RNG snapshot. Required for + bit-equal trajectories across resume (HSTU dropout consumes the + per-device RNG cycle). + """ + if path == "": + return + # Exclude checkpoint wall-time from the train step-time window so step_ms + # reports canonical compute latency; the duration is surfaced separately + # (window_ckpt_time_ms + the per-save log below). pause/resume are no-ops if + # metric_logger is None. Not wrapped in try/finally: a save that raises + # crashes the process (supervisor restarts fresh), so a dangling pause on + # the soon-dead logger is irrelevant. + _t_ckpt_start = time.perf_counter() + if metric_logger is not None: + metric_logger.pause_perf("ckpt") + base_path = path + # Atomic-save layout: write to .tmp, rename to final, prune older. + tmp_subdir = f"{base_path}/{batch_idx}.tmp" + final_subdir = f"{base_path}/{batch_idx}" + + if rank == 0: + _cleanup_stale_tmps(base_path) + # Always (re)write into a fresh .tmp. An existing `final_subdir` with the + # same batch_idx (e.g. a later in-window save for the same train_ts, or a + # deterministic re-run at the same step) is overwritten atomically at the + # promotion step below — NOT skipped here. Skipping would desync ranks: + # the collective barrier/checkpoint.save calls below run on *every* rank, + # so a rank-0-only early return deadlocks ranks 1..N on the next barrier. 
+ shutil.rmtree(tmp_subdir, ignore_errors=True) + os.makedirs(tmp_subdir, exist_ok=True) + os.makedirs(f"{tmp_subdir}/sparse/", exist_ok=True) + torch.distributed.barrier() + sparse_path = f"{tmp_subdir}/sparse/" + non_sparse_ckpt = f"{tmp_subdir}/non_sparse.ckpt" + + sparse_tensor_keys = { + k for k, v in model.state_dict().items() if isinstance(v, ShardedTensor) + } + if rank == 0: + dense_state_dict = { + k: v + for k, v in model.state_dict().items() + if not isinstance(v, ShardedTensor) + } + class_metric_state_dict = { + "train": [ + _metric_blob_state_dict(m) + for m in metric_logger.class_metrics["train"] + ], + "eval": [ + _metric_blob_state_dict(m) + for m in metric_logger.class_metrics["eval"] + ], + } + regression_metric_state_dict = { + "train": [ + m.state_dict() for m in metric_logger.regression_metrics["train"] + ], + "eval": [m.state_dict() for m in metric_logger.regression_metrics["eval"]], + } + torch.save( + { + "dense_dict": dense_state_dict, + "optimizer_dict": optimizer.state_dict(), + "class_metrics": class_metric_state_dict, + "reg_metrics": regression_metric_state_dict, + "global_step": metric_logger.global_step, + # MLPerf progress counter (global trained samples). Defaulted on + # load so pre-existing checkpoints restore as 0 and resume the + # count from there. + "cumulative_train_samples": metric_logger.cumulative_train_samples, + # MLPerf run-marker state: lets a resume relaunch continue the + # SAME run's event stream without re-emitting INIT_START/RUN_START. + "mlperf_run_started": metric_logger.mlperf_run_started, + "sparse_tensor_keys": sparse_tensor_keys, + # Streaming resume fields. Defaulted on load so old checkpoints + # (pre-streaming-resume) still load as a normal restart. + "train_ts": train_ts, + "batch_idx_in_window": batch_idx_in_window, + # Immutable train:eval split + resume-determinism contract + # (train_split_percentage, split_salt, eval holdout window, + # batch_size, world_size). 
Validated on resume so a relaunch + # cannot silently change the split (which would desync the skip + # offset and/or train on held-out eval users). None for + # non-holdout / legacy runs. + "split_contract": split_contract, + }, + non_sparse_ckpt, + ) + + # Per-rank RNG snapshot. Written even on a single rank because dropout's + # randomness comes from the CUDA generator which differs across devices. + if device is not None: + rng_path = f"{tmp_subdir}/rng_rank{rank}.pt" + torch.save(_rng_state(device), rng_path) + + # Per-rank cumulative metric state (lifetime-AUC buffers + cumulative-eval + # histograms/scalar sums). Written by EVERY rank (outside the rank-0 block) + # because this state is per-rank-local; restoring rank-0's copy to all ranks + # would lose (world_size-1)/world_size of the history. + if metric_logger is not None: + perrank_state = _collect_perrank_metric_state(metric_logger) + if perrank_state: + torch.save( + perrank_state, + f"{tmp_subdir}/{METRICBUF_FILE_FMT.format(rank=rank)}", + ) + logger.info( + "checkpoint save: cumulative metric state rank=%d samples=%s", + rank, + _perrank_sample_counts(metric_logger), + ) + + torch.distributed.barrier() + sparse_dict = {"sparse_dict": SparseState(model, sparse_tensor_keys)} + torch.distributed.checkpoint.save( + sparse_dict, + storage_writer=torch.distributed.checkpoint.FileSystemWriter(sparse_path), + ) + torch.distributed.barrier() + # Promote .tmp → final, then prune. Done on rank 0 only since the directory + # operations are global filesystem state. + if rank == 0: + if os.path.exists(final_subdir): + # POSIX rename() refuses to replace a non-empty directory, so we + # can't os.replace(tmp, final) directly. Swap the old snapshot aside + # (instant rename), move the new one into place, then delete the old. + # The `.old` name is non-numeric → ignored by _resolve_latest_subdir + # and swept by _cleanup_stale_tmps on the next save if we crash mid-swap. 
+ old_aside = f"{final_subdir}.old" + shutil.rmtree(old_aside, ignore_errors=True) + os.replace(final_subdir, old_aside) + os.replace(tmp_subdir, final_subdir) + shutil.rmtree(old_aside, ignore_errors=True) + else: + os.replace(tmp_subdir, final_subdir) + _prune_old_checkpoints(base_path, keep_last_n, final_subdir) + logger.info( + "checkpoint successfully saved → %s (wall-time %.2fs)", + final_subdir, + time.perf_counter() - _t_ckpt_start, + ) + torch.distributed.barrier() + if metric_logger is not None: + metric_logger.resume_perf("ckpt") + + +@gin.configurable +def load_sparse_checkpoint( + model: torch.nn.Module, + path: str = "", +) -> None: + if path == "": + return + sparse_path = f"{path}/sparse/" + + sparse_tensor_keys = { + k for k, v in model.state_dict().items() if is_sparse_key(k, v) + } + sparse_dict = {"sparse_dict": SparseState(model, sparse_tensor_keys)} + gc.collect() + torch.distributed.checkpoint.load( + sparse_dict, + storage_reader=torch.distributed.checkpoint.FileSystemReader(sparse_path), + ) + gc.collect() + print("sparse checkpoint successfully loaded") + + +@gin.configurable +def load_nonsparse_checkpoint( + model: torch.nn.Module, + device: torch.device, + optimizer: Optional[Optimizer] = None, + metric_logger: Optional[MetricsLogger] = None, + path: str = "", + rank: int = 0, +) -> Tuple[Optional[int], int, Optional[Dict[str, Any]]]: + """ + Load non-sparse (dense) components from a checkpoint. + + Loads dense model parameters, and optionally optimizer state and metrics. + Also restores per-rank RNG state if a matching `rng_rank{rank}.pt` is found + next to `non_sparse.ckpt`. + + Returns: + (train_ts, batch_idx_in_window, split_contract) — the streaming resume + hint and the saved train:eval split contract (None for legacy / non- + holdout checkpoints). `(None, WINDOW_COMPLETE, None)` if not a streaming + checkpoint or no path supplied. 
+ """ + if path == "": + return None, WINDOW_COMPLETE, None + non_sparse_ckpt = f"{path}/non_sparse.ckpt" + + # weights_only=False: these are our own trusted checkpoints, and they hold + # non-tensor objects (optimizer/metric state dicts, numpy-backed RNG state) + # that PyTorch>=2.6's weights_only=True default refuses to unpickle. + non_sparse_state_dict = torch.load( + non_sparse_ckpt, map_location=device, weights_only=False + ) + load_dense_state_dict(model, non_sparse_state_dict["dense_dict"]) + print("dense checkpoint successfully loaded") + if optimizer is not None: + optimizer.load_state_dict(non_sparse_state_dict["optimizer_dict"]) + print("optimizer checkpoint successfully loaded") + if metric_logger is not None: + metric_logger.global_step = non_sparse_state_dict["global_step"] + # Defaulted for legacy checkpoints written before the counter existed. + metric_logger.cumulative_train_samples = non_sparse_state_dict.get( + "cumulative_train_samples", 0 + ) + # Defaulted False for legacy/cold checkpoints: a resume that loads a + # checkpoint where the run was already open continues without re-emitting + # the run markers. + metric_logger.mlperf_run_started = non_sparse_state_dict.get( + "mlperf_run_started", False + ) + class_metric_state_dict = non_sparse_state_dict["class_metrics"] + regression_metric_state_dict = non_sparse_state_dict["reg_metrics"] + # Length-safe positional restore: if a checkpoint was written with a + # different metric set (e.g. tasks added/removed since), restore the + # overlap instead of crashing with an IndexError at run end. 
+ def _restore_metric_list( + live: list, saved: Optional[list], label: str + ) -> None: + saved = saved or [] + if len(live) != len(saved): + logger.warning( + "metric count mismatch for %s: live=%d saved=%d; " + "restoring overlapping %d", + label, + len(live), + len(saved), + min(len(live), len(saved)), + ) + for i in range(min(len(live), len(saved))): + live[i].load_state_dict(saved[i]) + + _restore_metric_list( + metric_logger.class_metrics["train"], + class_metric_state_dict.get("train"), + "class/train", + ) + _restore_metric_list( + metric_logger.class_metrics["eval"], + class_metric_state_dict.get("eval"), + "class/eval", + ) + _restore_metric_list( + metric_logger.regression_metrics["train"], + regression_metric_state_dict.get("train"), + "reg/train", + ) + _restore_metric_list( + metric_logger.regression_metrics["eval"], + regression_metric_state_dict.get("eval"), + "reg/eval", + ) + + # Per-rank cumulative metric state restore. This runs AFTER the generic + # load above so it is authoritative: the shared blob carries no lifetime + # buffers (stripped at save) nor any eval_cum state, and each rank + # restores its OWN cumulative state here. Missing file = legacy/pre-fix + # checkpoint; cumulative metrics self-heal (lifetime AUC refills; the + # binned-AUC histograms / scalar sums restart from zero). + mb_path = f"{path}/{METRICBUF_FILE_FMT.format(rank=rank)}" + if os.path.exists(mb_path): + perrank_state = torch.load( + mb_path, map_location=device, weights_only=False + ) + _restore_perrank_metric_state(metric_logger, perrank_state) + logger.info( + "checkpoint load: cumulative metric state rank=%d samples=%s", + rank, + _perrank_sample_counts(metric_logger), + ) + else: + logger.info( + "checkpoint load: no per-rank cumulative metric state at %s " + "(legacy/pre-fix checkpoint); cumulative metrics will refill", + mb_path, + ) + + # Per-rank RNG restore. 
Missing file = bit-equal trajectory not requested at + # save time; we silently continue (the test harness checks for both). + rng_path = f"{path}/rng_rank{rank}.pt" + if os.path.exists(rng_path): + # weights_only=False: RNG state is numpy/Python tuples, not tensors. + rng_state = torch.load(rng_path, map_location="cpu", weights_only=False) + _restore_rng_state(rng_state, device) + logger.info("RNG state restored from %s", rng_path) + + train_ts = non_sparse_state_dict.get("train_ts") + batch_idx_in_window = non_sparse_state_dict.get( + "batch_idx_in_window", WINDOW_COMPLETE + ) + split_contract = non_sparse_state_dict.get("split_contract") + return train_ts, batch_idx_in_window, split_contract + + +@gin.configurable +def load_dmp_checkpoint( + model: torch.nn.Module, + optimizer: Optimizer, + metric_logger: MetricsLogger, + device: torch.device, + path: str = "", + rank: int = 0, +) -> Tuple[Optional[int], int, Optional[Dict[str, Any]], bool]: + """ + Load a complete distributed model checkpoint (both sparse and dense components). + + `path` is auto-resolved: if it points at a directory containing numeric + subdirs (e.g. CKPT_PATH=/), the highest-numbered subdir is used. If it + already names a leaf save (e.g. /300), it's used as-is. Empty string = + no load. + + Returns: + (train_ts, batch_idx_in_window, split_contract, cold_start) — streaming + resume hint plus the saved split contract, and `cold_start` which is True + iff there was nothing to load (no checkpoint resolved). `cold_start` + distinguishes a genuine fresh run (no weights loaded) from a resume that + merely lacks a split contract (e.g. a legacy/non-streaming checkpoint), + which the caller's split-contract guard must still reject. 
+ """ + resolved = _resolve_latest_subdir(path) + cold_start = resolved == "" + load_sparse_checkpoint(model=model, path=resolved) + train_ts, batch_idx_in_window, split_contract = load_nonsparse_checkpoint( + model=model, + optimizer=optimizer, + metric_logger=metric_logger, + path=resolved, + device=device, + rank=rank, + ) + return train_ts, batch_idx_in_window, split_contract, cold_start diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/configs.py b/recommendation_v4/generative_recommenders/dlrm_v3/configs.py new file mode 100644 index 000000000..387fb4900 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/configs.py @@ -0,0 +1,830 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pyre-strict +""" +Configuration module for DLRMv3 model. + +This module provides configuration functions for the HSTU model architecture and embedding table configurations. 
+""" + +import hashlib +import math +import os +from typing import Callable, Dict, Optional, Tuple + +import gin +import torch + +from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig +from generative_recommenders.modules.multitask_module import ( + MultitaskTaskType, + TaskConfig, +) +from torchrec.modules.embedding_configs import DataType, EmbeddingConfig + +HSTU_EMBEDDING_DIM = 512 # final DLRMv3 model +HASH_SIZE = 10_000_000 +HASH_SIZE_1B = 1_000_000_000 + +# (name, keys, num_embeddings, salt) — single source of truth for both +# get_embedding_table_config("yambda-5b") and the dataset's cross-hash inputs. +# Sizes mirror Primus-DLRM/configs/bench_onetrans_large_5b_cross_feat_shampoo.yaml. +YAMBDA_5B_CROSS_SPECS = [ + ("user_x_artist", ("uid", "artist_id"), 100_000_000, 0), + ("user_x_album", ("uid", "album_id"), 40_000_000, 0), + ("user_x_hour", ("uid", "hour_of_day"), 24_000_000, 0), + ("item_x_hour", ("item_id", "hour_of_day"), 40_000_000, 0), + ("artist_x_hour", ("artist_id", "hour_of_day"), 32_000_000, 0), + ("user_x_is_organic", ("uid", "is_organic"), 2_000_000, 0), + ("user_x_artist_x_hour", ("uid", "artist_id", "hour_of_day"), 40_000_000, 0), +] + + +@gin.configurable +def get_hstu_configs( + dataset: str = "debug", + max_seq_len: Optional[int] = None, + max_num_candidates: Optional[int] = None, + hstu_embedding_table_dim: Optional[int] = None, + hstu_transducer_embedding_dim: Optional[int] = None, + hstu_num_heads: Optional[int] = None, + hstu_attn_num_layers: Optional[int] = None, + hstu_attn_linear_dim: Optional[int] = None, + hstu_attn_qk_dim: Optional[int] = None, + hstu_input_dropout_ratio: Optional[float] = None, + hstu_linear_dropout_rate: Optional[float] = None, +) -> DlrmHSTUConfig: + """ + Create and return HSTU model configuration. + + Builds a complete DlrmHSTUConfig with default hyperparameters for the HSTU + architecture including attention settings, embedding dimensions, dropout rates, + and feature name mappings. 
+ + Args: + dataset: Dataset identifier (currently unused, reserved for dataset-specific configs). + + Returns: + DlrmHSTUConfig: Complete configuration object for the HSTU model. + """ + hstu_config = DlrmHSTUConfig( + hstu_num_heads=4, + hstu_attn_linear_dim=128, + hstu_attn_qk_dim=128, + hstu_attn_num_layers=5, + hstu_embedding_table_dim=HSTU_EMBEDDING_DIM, + hstu_preprocessor_hidden_dim=256, + hstu_transducer_embedding_dim=512, + hstu_group_norm=False, + hstu_input_dropout_ratio=0.2, + hstu_linear_dropout_rate=0.1, + causal_multitask_weights=0.2, + ) + if "movielens" in dataset: + assert dataset in [ + "movielens-1m", + "movielens-20m", + "movielens-13b", + "movielens-18b", + ] + hstu_config.user_embedding_feature_names = ( + [ + "movie_id", + "user_id", + "sex", + "age_group", + "occupation", + "zip_code", + ] + if dataset == "movielens-1m" + else [ + "movie_id", + "user_id", + ] + ) + hstu_config.item_embedding_feature_names = [ + "item_movie_id", + ] + hstu_config.uih_post_id_feature_name = "movie_id" + hstu_config.uih_action_time_feature_name = "action_timestamp" + hstu_config.candidates_querytime_feature_name = "item_query_time" + hstu_config.candidates_weight_feature_name = "item_action_weights" + hstu_config.uih_weight_feature_name = "item_weights" + hstu_config.candidates_watchtime_feature_name = "item_movie_rating" + hstu_config.action_weights = [1, 2, 4, 8, 16] + hstu_config.contextual_feature_to_max_length = ( + { + "user_id": 1, + "sex": 1, + "age_group": 1, + "occupation": 1, + "zip_code": 1, + } + if dataset == "movielens-1m" + else { + "user_id": 1, + } + ) + hstu_config.contextual_feature_to_min_uih_length = ( + { + "user_id": 20, + "sex": 20, + "age_group": 20, + "occupation": 20, + "zip_code": 20, + } + if dataset == "movielens-1m" + else { + "user_id": 20, + } + ) + hstu_config.merge_uih_candidate_feature_mapping = [ + ("movie_id", "item_movie_id"), + ("movie_rating", "item_movie_rating"), + ("action_timestamp", "item_query_time"), + 
("item_weights", "item_action_weights"), + ("dummy_watch_time", "item_dummy_watchtime"), + ] + hstu_config.hstu_uih_feature_names = ( + [ + "user_id", + "sex", + "age_group", + "occupation", + "zip_code", + "movie_id", + "movie_rating", + "action_timestamp", + "item_weights", + "dummy_watch_time", + ] + if dataset == "movielens-1m" + else [ + "user_id", + "movie_id", + "movie_rating", + "action_timestamp", + "item_weights", + "dummy_watch_time", + ] + ) + hstu_config.hstu_candidate_feature_names = [ + "item_movie_id", + "item_movie_rating", + "item_query_time", + "item_action_weights", + "item_dummy_watchtime", + ] + hstu_config.max_num_candidates = 10 + hstu_config.max_num_candidates_inference = ( + 5 if dataset not in ["movielens-13b", "movielens-18b"] else 2048 + ) + hstu_config.multitask_configs = [ + TaskConfig( + task_name="rating", + task_weight=1, + task_type=MultitaskTaskType.BINARY_CLASSIFICATION, + ) + ] + elif "streaming" in dataset: + hstu_config.user_embedding_feature_names = [ + "item_id", + "user_id", + "item_category_id", + ] + hstu_config.item_embedding_feature_names = [ + "item_candidate_id", + "item_candidate_category_id", + ] + hstu_config.uih_post_id_feature_name = "item_id" + hstu_config.uih_action_time_feature_name = "action_timestamp" + hstu_config.candidates_querytime_feature_name = "item_query_time" + hstu_config.candidates_weight_feature_name = "item_action_weights" + hstu_config.uih_weight_feature_name = "item_weights" + hstu_config.candidates_watchtime_feature_name = "item_rating" + hstu_config.action_weights = [1, 2, 4, 8, 16] + hstu_config.action_embedding_init_std = 5.0 + hstu_config.contextual_feature_to_max_length = {"user_id": 1} + hstu_config.contextual_feature_to_min_uih_length = {"user_id": 20} + hstu_config.merge_uih_candidate_feature_mapping = [ + ("item_id", "item_candidate_id"), + ("item_rating", "item_candidate_rating"), + ("action_timestamp", "item_query_time"), + ("item_weights", "item_action_weights"), + 
("dummy_watch_time", "item_dummy_watchtime"), + ("item_category_id", "item_candidate_category_id"), + ] + hstu_config.hstu_uih_feature_names = [ + "user_id", + "item_id", + "item_rating", + "action_timestamp", + "item_weights", + "dummy_watch_time", + "item_category_id", + ] + hstu_config.hstu_candidate_feature_names = [ + "item_candidate_id", + "item_candidate_rating", + "item_query_time", + "item_action_weights", + "item_dummy_watchtime", + "item_candidate_category_id", + ] + hstu_config.max_num_candidates = 32 + hstu_config.max_num_candidates_inference = 2048 + hstu_config.multitask_configs = [ + TaskConfig( + task_name="rating", + task_weight=1, + task_type=MultitaskTaskType.BINARY_CLASSIFICATION, + ) + ] + elif "kuairand" in dataset: + hstu_config.user_embedding_feature_names = [ + "video_id", + "user_id", + "user_active_degree", + "follow_user_num_range", + "fans_user_num_range", + "friend_user_num_range", + "register_days_range", + ] + hstu_config.item_embedding_feature_names = [ + "item_video_id", + ] + hstu_config.uih_post_id_feature_name = "video_id" + hstu_config.uih_action_time_feature_name = "action_timestamp" + hstu_config.candidates_querytime_feature_name = "item_query_time" + hstu_config.uih_weight_feature_name = "action_weight" + hstu_config.candidates_weight_feature_name = "item_action_weight" + hstu_config.candidates_watchtime_feature_name = "item_target_watchtime" + # There are more contextual features in the dataset, see https://kuairand.com/ for details + hstu_config.contextual_feature_to_max_length = { + "user_id": 1, + "user_active_degree": 1, + "follow_user_num_range": 1, + "fans_user_num_range": 1, + "friend_user_num_range": 1, + "register_days_range": 1, + } + hstu_config.merge_uih_candidate_feature_mapping = [ + ("video_id", "item_video_id"), + ("action_timestamp", "item_query_time"), + ("action_weight", "item_action_weight"), + ("watch_time", "item_target_watchtime"), + ] + hstu_config.hstu_uih_feature_names = [ + "user_id", + 
"user_active_degree", + "follow_user_num_range", + "fans_user_num_range", + "friend_user_num_range", + "register_days_range", + "video_id", + "action_timestamp", + "action_weight", + "watch_time", + ] + hstu_config.hstu_candidate_feature_names = [ + "item_video_id", + "item_action_weight", + "item_target_watchtime", + "item_query_time", + ] + hstu_config.multitask_configs = [ + TaskConfig( + task_name="is_click", + task_weight=1, + task_type=MultitaskTaskType.BINARY_CLASSIFICATION, + ), + TaskConfig( + task_name="is_like", + task_weight=2, + task_type=MultitaskTaskType.BINARY_CLASSIFICATION, + ), + TaskConfig( + task_name="is_follow", + task_weight=4, + task_type=MultitaskTaskType.BINARY_CLASSIFICATION, + ), + TaskConfig( + task_name="is_comment", + task_weight=8, + task_type=MultitaskTaskType.BINARY_CLASSIFICATION, + ), + TaskConfig( + task_name="is_forward", + task_weight=16, + task_type=MultitaskTaskType.BINARY_CLASSIFICATION, + ), + TaskConfig( + task_name="is_hate", + task_weight=32, + task_type=MultitaskTaskType.BINARY_CLASSIFICATION, + ), + TaskConfig( + task_name="long_view", + task_weight=64, + task_type=MultitaskTaskType.BINARY_CLASSIFICATION, + ), + TaskConfig( + task_name="is_profile_enter", + task_weight=128, + task_type=MultitaskTaskType.BINARY_CLASSIFICATION, + ), + ] + hstu_config.action_weights = [1, 2, 4, 8, 16, 32, 64, 128] + elif "yambda" in dataset: + assert dataset in ["yambda-5b"] + cross_names = [name for (name, _k, _n, _s) in YAMBDA_5B_CROSS_SPECS] + # Per-table dim defaults to HSTU_EMBEDDING_DIM (512); override via the + # `get_hstu_configs.hstu_embedding_table_dim = N` gin binding if needed. + # Note: the embedding tables in get_embedding_table_config also use + # HSTU_EMBEDDING_DIM and must stay aligned with this value. 
+ hstu_config.hstu_embedding_table_dim = HSTU_EMBEDDING_DIM + hstu_config.hstu_transducer_embedding_dim = 512 + hstu_config.max_seq_len = 8192 + hstu_config.max_num_candidates = 1 + hstu_config.max_num_candidates_inference = 1 + # Per dlrm_hstu convention (see streaming-100b/movielens): + # - user_embedding_feature_names = UIH-side post-id features + contextual features. + # After main_forward merges UIH + candidate, only these entries hold the merged + # sequence (used by user-side transducer). + # - item_embedding_feature_names = candidate-side names only. _item_forward + # concats these along dim=-1 to feed the item MLP (per-candidate, not per-position). + hstu_config.user_embedding_feature_names = ( + ["uid"] + + cross_names + + ["item_id", "artist_id", "album_id"] + ) + hstu_config.item_embedding_feature_names = [ + "item_candidate_id", + "item_candidate_artist_id", + "item_candidate_album_id", + ] + hstu_config.uih_post_id_feature_name = "item_id" + hstu_config.uih_action_time_feature_name = "action_timestamp" + hstu_config.uih_weight_feature_name = "action_weight" + hstu_config.candidates_querytime_feature_name = "item_query_time" + hstu_config.candidates_weight_feature_name = "item_action_weight" + hstu_config.candidates_watchtime_feature_name = "item_dummy_watchtime" + hstu_config.action_weights = [1, 2, 4] # lp, like, skip bits + hstu_config.contextual_feature_to_max_length = { + "uid": 1, + **{name: 1 for name in cross_names}, + } + hstu_config.contextual_feature_to_min_uih_length = { + "uid": 0, + **{name: 0 for name in cross_names}, + } + # uih names map to candidate names (no name collisions allowed): + # item_id/artist_id/album_id appear with prefix "item_" on candidate side. 
+ hstu_config.merge_uih_candidate_feature_mapping = [ + ("item_id", "item_candidate_id"), + ("artist_id", "item_candidate_artist_id"), + ("album_id", "item_candidate_album_id"), + ("action_weight", "item_action_weight"), + ("action_timestamp", "item_query_time"), + ("dummy_watch_time", "item_dummy_watchtime"), + ] + hstu_config.hstu_uih_feature_names = ( + ["uid"] + + cross_names + + [ + "item_id", + "artist_id", + "album_id", + "action_weight", + "action_timestamp", + "dummy_watch_time", + ] + ) + hstu_config.hstu_candidate_feature_names = [ + "item_candidate_id", + "item_candidate_artist_id", + "item_candidate_album_id", + "item_query_time", + "item_action_weight", + "item_dummy_watchtime", + ] + hstu_config.multitask_configs = [ + TaskConfig( + task_name="listen_plus", + task_weight=1, # matches action_weights[0] (lp bit) + task_type=MultitaskTaskType.BINARY_CLASSIFICATION, + ) + ] + else: + hstu_config.user_embedding_feature_names = [ + "uih_post_id", + "uih_owner_id", + "viewer_id", + "dummy_contexual", + ] + hstu_config.item_embedding_feature_names = [ + "item_post_id", + "item_owner_id", + ] + hstu_config.uih_post_id_feature_name = "uih_post_id" + hstu_config.uih_action_time_feature_name = "uih_action_time" + hstu_config.candidates_querytime_feature_name = "item_query_time" + hstu_config.candidates_weight_feature_name = "item_action_weight" + hstu_config.candidates_watchtime_feature_name = "item_target_watchtime" + hstu_config.contextual_feature_to_max_length = { + "viewer_id": 1, + "dummy_contexual": 1, + } + hstu_config.contextual_feature_to_min_uih_length = { + "viewer_id": 128, + "dummy_contexual": 128, + } + hstu_config.merge_uih_candidate_feature_mapping = [ + ("uih_post_id", "item_post_id"), + ("uih_owner_id", "item_owner_id"), + ("uih_action_time", "item_query_time"), + ("uih_weight", "item_action_weight"), + ("uih_watchtime", "item_target_watchtime"), + ("uih_video_length", "item_video_length"), + ("uih_surface_type", "item_surface_type"), + ] + 
hstu_config.hstu_uih_feature_names = [ + "uih_post_id", + "uih_action_time", + "uih_weight", + "uih_owner_id", + "uih_watchtime", + "uih_surface_type", + "uih_video_length", + "viewer_id", + "dummy_contexual", + ] + hstu_config.hstu_candidate_feature_names = [ + "item_post_id", + "item_owner_id", + "item_surface_type", + "item_video_length", + "item_action_weight", + "item_target_watchtime", + "item_query_time", + ] + hstu_config.multitask_configs = [ + TaskConfig( + task_name="vvp100", + task_weight=1, + task_type=MultitaskTaskType.BINARY_CLASSIFICATION, + ) + ] + + # Apply gin overrides last so a value set in the gin file wins over the + # per-dataset defaults above. Anything left as None inherits the default + # the dataset branch (or DlrmHSTUConfig) chose. Example in a gin file: + # get_hstu_configs.max_seq_len = 4096 + # get_hstu_configs.hstu_embedding_table_dim = 256 + _gin_overrides = { + "max_seq_len": max_seq_len, + "max_num_candidates": max_num_candidates, + "max_num_candidates_inference": max_num_candidates, + "hstu_embedding_table_dim": hstu_embedding_table_dim, + "hstu_transducer_embedding_dim": hstu_transducer_embedding_dim, + "hstu_num_heads": hstu_num_heads, + "hstu_attn_num_layers": hstu_attn_num_layers, + "hstu_attn_linear_dim": hstu_attn_linear_dim, + "hstu_attn_qk_dim": hstu_attn_qk_dim, + "hstu_input_dropout_ratio": hstu_input_dropout_ratio, + "hstu_linear_dropout_rate": hstu_linear_dropout_rate, + } + for _name, _val in _gin_overrides.items(): + if _val is not None: + setattr(hstu_config, _name, _val) + + return hstu_config + + +def _stable_table_seed(init_seed: int, table_name: str) -> int: + """Deterministic 63-bit seed from (init_seed, table_name). + + Uses sha256 (not Python's salted built-in ``hash()``) so the per-table seed + is identical across processes/ranks/runs for a given ``$SEED`` + table name. 
+ """ + digest = hashlib.sha256(f"{init_seed}:{table_name}".encode("utf-8")).digest() + return int.from_bytes(digest[:8], "big") & 0x7FFF_FFFF_FFFF_FFFF + + +def _uniform_init_bounds(cfg: EmbeddingConfig) -> Tuple[float, float]: + """Mirror TorchREC's default per-table init bounds. + + TorchREC falls back to ``uniform_(-1/sqrt(N), +1/sqrt(N))`` when a table does + not set ``weight_init_min/max``; honor any explicit bounds the config carries. + """ + bound = math.sqrt(1.0 / cfg.num_embeddings) + lo = -bound if cfg.weight_init_min is None else cfg.weight_init_min + hi = bound if cfg.weight_init_max is None else cfg.weight_init_max + return lo, hi + + +def _make_seeded_uniform_init( + table_seed: int, lo: float, hi: float +) -> Callable[[torch.Tensor], torch.Tensor]: + """Build a seeded in-place uniform initializer for one table's weight. + + TorchREC/FBGEMM calls ``init_fn`` with the (per-rank) local shard tensor on + its compute device, so we seed a generator on that same device. For a fixed + sharding plan (world size + plan unchanged) this makes embedding init + byte-reproducible run-to-run. + """ + + def _init(weight: torch.Tensor) -> torch.Tensor: + # TorchREC builds the unsharded EmbeddingCollection on the META device + # first (DMP materializes real storage on the compute device later). + # Meta tensors have no storage and torch.Generator(device="meta") is + # invalid ("META device type not an accelerator"), so skip them: the + # seeded init for the sharded/fused TBE path is provided by the RNG + # re-seed right before DMP in make_optimizer_and_shard. On a real + # device (eager/non-meta path) we still apply the per-table seeded fill. 
+ if weight.device.type == "meta": + return weight + gen = torch.Generator(device=weight.device) + gen.manual_seed(table_seed) + with torch.no_grad(): + weight.uniform_(lo, hi, generator=gen) + return weight + + return _init + + +@gin.configurable +def get_embedding_table_config( + dataset: str = "debug", + embedding_dim: Optional[int] = None, + init_seed: Optional[int] = None, +) -> Dict[str, EmbeddingConfig]: + """ + Create and return embedding table configurations. + + Defines the embedding table configurations for item IDs, category IDs, and user IDs + with their respective dimensions and data types. + + Args: + dataset: Dataset identifier (currently unused, reserved for dataset-specific configs). + embedding_dim: Per-table embedding width override. When set via gin + (e.g. `get_embedding_table_config.embedding_dim = 256`), wins over + `HSTU_EMBEDDING_DIM`. Keep in sync with the matching gin override on + `get_hstu_configs.hstu_embedding_table_dim` — the model and the + tables must agree on dim or sharding will reject the plan. + init_seed: Base seed for the per-table seeded `init_fn` (Tier 1 + reproducible embedding init). When None, falls back to `$SEED` + (default 1), matching `seed_everything`. Each table draws from a + generator seeded by `sha256(init_seed, table_name)` so init is + reproducible run-to-run for a fixed sharding plan. + + Returns: + Dict mapping table names to their EmbeddingConfig objects. 
+ """ + tables = _build_embedding_table_config(dataset=dataset, embedding_dim=embedding_dim) + + if init_seed is None: + init_seed = int(os.environ.get("SEED", "1")) + for name, cfg in tables.items(): + lo, hi = _uniform_init_bounds(cfg) + cfg.init_fn = _make_seeded_uniform_init( + _stable_table_seed(init_seed, name), lo, hi + ) + return tables + + +def _build_embedding_table_config( + dataset: str = "debug", + embedding_dim: Optional[int] = None, +) -> Dict[str, EmbeddingConfig]: + DIM = embedding_dim if embedding_dim is not None else HSTU_EMBEDDING_DIM + if "movielens" in dataset: + assert dataset in [ + "movielens-1m", + "movielens-20m", + "movielens-13b", + "movielens-18b", + ] + return ( + { + "movie_id": EmbeddingConfig( + num_embeddings=HASH_SIZE, + embedding_dim=DIM, + name="movie_id", + data_type=DataType.FP16, + feature_names=["movie_id", "item_movie_id"], + ), + "user_id": EmbeddingConfig( + num_embeddings=HASH_SIZE, + embedding_dim=DIM, + name="user_id", + data_type=DataType.FP16, + feature_names=["user_id"], + ), + "sex": EmbeddingConfig( + num_embeddings=HASH_SIZE, + embedding_dim=DIM, + name="sex", + data_type=DataType.FP16, + feature_names=["sex"], + ), + "age_group": EmbeddingConfig( + num_embeddings=HASH_SIZE, + embedding_dim=DIM, + name="age_group", + data_type=DataType.FP16, + feature_names=["age_group"], + ), + "occupation": EmbeddingConfig( + num_embeddings=HASH_SIZE, + embedding_dim=DIM, + name="occupation", + data_type=DataType.FP16, + feature_names=["occupation"], + ), + "zip_code": EmbeddingConfig( + num_embeddings=HASH_SIZE, + embedding_dim=DIM, + name="zip_code", + data_type=DataType.FP16, + feature_names=["zip_code"], + ), + } + if dataset == "movielens-1m" + else { + "movie_id": EmbeddingConfig( + num_embeddings=HASH_SIZE_1B, + embedding_dim=DIM, + name="movie_id", + data_type=DataType.FP16, + feature_names=["movie_id", "item_movie_id"], + ), + "user_id": EmbeddingConfig( + num_embeddings=3_000_000, + embedding_dim=DIM, + 
name="user_id", + data_type=DataType.FP16, + feature_names=["user_id"], + ), + } + ) + elif "streaming" in dataset: + return { + "item_id": EmbeddingConfig( + num_embeddings=HASH_SIZE_1B, + embedding_dim=DIM, + name="item_id", + data_type=DataType.FP16, + feature_names=["item_id", "item_candidate_id"], + ), + "item_category_id": EmbeddingConfig( + num_embeddings=128, + embedding_dim=DIM, + name="item_category_id", + data_type=DataType.FP16, + weight_init_max=1.0, + weight_init_min=-1.0, + feature_names=["item_category_id", "item_candidate_category_id"], + ), + "user_id": EmbeddingConfig( + num_embeddings=10_000_000, + embedding_dim=DIM, + name="user_id", + data_type=DataType.FP16, + feature_names=["user_id"], + ), + } + elif "kuairand" in dataset: + return { + "video_id": EmbeddingConfig( + num_embeddings=HASH_SIZE, + embedding_dim=DIM, + name="video_id", + data_type=DataType.FP16, + feature_names=["video_id", "item_video_id"], + ), + "user_id": EmbeddingConfig( + num_embeddings=HASH_SIZE, + embedding_dim=DIM, + name="user_id", + data_type=DataType.FP16, + feature_names=["user_id"], + ), + "user_active_degree": EmbeddingConfig( + num_embeddings=8, + embedding_dim=DIM, + name="user_active_degree", + data_type=DataType.FP16, + feature_names=["user_active_degree"], + ), + "follow_user_num_range": EmbeddingConfig( + num_embeddings=9, + embedding_dim=DIM, + name="follow_user_num_range", + data_type=DataType.FP16, + feature_names=["follow_user_num_range"], + ), + "fans_user_num_range": EmbeddingConfig( + num_embeddings=9, + embedding_dim=DIM, + name="fans_user_num_range", + data_type=DataType.FP16, + feature_names=["fans_user_num_range"], + ), + "friend_user_num_range": EmbeddingConfig( + num_embeddings=8, + embedding_dim=DIM, + name="friend_user_num_range", + data_type=DataType.FP16, + feature_names=["friend_user_num_range"], + ), + "register_days_range": EmbeddingConfig( + num_embeddings=8, + embedding_dim=DIM, + name="register_days_range", + data_type=DataType.FP16, + 
feature_names=["register_days_range"], + ), + } + elif "yambda" in dataset: + assert dataset in ["yambda-5b"] + tables: Dict[str, EmbeddingConfig] = { + "item_id": EmbeddingConfig( + num_embeddings=9_390_624, + embedding_dim=DIM, + name="item_id", + data_type=DataType.FP32, + feature_names=["item_id", "item_candidate_id"], + ), + "artist_id": EmbeddingConfig( + num_embeddings=1_293_395, + embedding_dim=DIM, + name="artist_id", + data_type=DataType.FP32, + feature_names=["artist_id", "item_candidate_artist_id"], + ), + "album_id": EmbeddingConfig( + num_embeddings=3_367_692, + embedding_dim=DIM, + name="album_id", + data_type=DataType.FP32, + feature_names=["album_id", "item_candidate_album_id"], + ), + "uid": EmbeddingConfig( + num_embeddings=1_000_001, + embedding_dim=DIM, + name="uid", + data_type=DataType.FP32, + feature_names=["uid"], + ), + } + for name, _keys, num_embeddings, _salt in YAMBDA_5B_CROSS_SPECS: + tables[name] = EmbeddingConfig( + num_embeddings=num_embeddings, + embedding_dim=DIM, + name=name, + data_type=DataType.FP32, + feature_names=[name], + ) + return tables + else: + return { + "post_id": EmbeddingConfig( + num_embeddings=HASH_SIZE, + embedding_dim=DIM, + name="post_id", + data_type=DataType.FP16, + feature_names=[ + "uih_post_id", + "item_post_id", + "uih_owner_id", + "item_owner_id", + ], + ), + "viewer_id": EmbeddingConfig( + num_embeddings=HASH_SIZE, + embedding_dim=DIM, + name="viewer_id", + data_type=DataType.FP16, + feature_names=["viewer_id"], + ), + "dummy_contexual": EmbeddingConfig( + num_embeddings=HASH_SIZE, + embedding_dim=DIM, + name="dummy_contexual", + data_type=DataType.FP16, + feature_names=["dummy_contexual"], + ), + } diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/dataset.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/dataset.py new file mode 100644 index 000000000..204c06df1 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/dataset.py @@ -0,0 +1,461 
@@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+"""
+Dataset implementations for DLRMv3.
+
+This module provides dataset classes for loading and processing recommendation
+data, including sample containers, collation functions, and random data generation.
+"""
+
+import logging
+import time
+from dataclasses import dataclass
+from typing import Dict, List, Tuple
+
+import torch
+from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig
+from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
+from torchrec.streamable import Pipelineable
+
+
+logging.basicConfig(level=logging.INFO)
+logger: logging.Logger = logging.getLogger("dlrmv3_dataset")
+
+
+@dataclass
+class Samples(Pipelineable):
+    """
+    Container for batched samples with user interaction history and candidate features.
+
+    Attributes:
+        uih_features_kjt: User interaction history features as KeyedJaggedTensor.
+        candidates_features_kjt: Candidate item features as KeyedJaggedTensor.
+    """
+
+    uih_features_kjt: KeyedJaggedTensor
+    candidates_features_kjt: KeyedJaggedTensor
+    # UIH + candidate features concatenated into the single KJT that the model's
+    # sharded EmbeddingCollection consumes. Pre-built here (dataloader/CPU) rather
+    # than inside DlrmHSTU.forward so the embedding lookup's input is a plain
+    # attribute of the batch — which lets TorchRec's TrainPipelineSparseDist hoist
+    # its input_dist into the prefetch stage (otherwise the runtime cat +
+    # from_lengths_sync counts as an "input modification" and the embedding
+    # collection is left un-pipelined).
+    merged_sparse_features: KeyedJaggedTensor
+
+    def to(self, device: torch.device, non_blocking: bool = False) -> "Samples":
+        """
+        Move all tensors to the specified device (in place) and return self.
+
+        Returning ``self`` (rather than ``None``) and accepting ``non_blocking``
+        makes ``Samples`` conform to TorchRec's ``Pipelineable`` protocol so it
+        can be driven by ``TrainPipelineSparseDist``. Existing call sites that
+        use ``sample.to(device)`` for its side effect continue to work unchanged.
+        """
+        for attr in vars(self):
+            setattr(
+                self,
+                attr,
+                getattr(self, attr).to(device=device, non_blocking=non_blocking),
+            )
+        return self
+
+    def record_stream(self, stream: torch.Stream) -> None:
+        """Record the contained KJTs on ``stream`` (Pipelineable protocol).
+
+        Required by ``TrainPipelineSparseDist`` so the prefetched batch's H2D
+        copy on the side stream is not freed before compute consumes it.
+        """
+        self.uih_features_kjt.record_stream(stream)
+        self.candidates_features_kjt.record_stream(stream)
+        self.merged_sparse_features.record_stream(stream)
+
+    def pin_memory(self) -> "Samples":
+        """Pin the contained KJTs' host memory (Pipelineable protocol)."""
+        self.uih_features_kjt = self.uih_features_kjt.pin_memory()
+        self.candidates_features_kjt = self.candidates_features_kjt.pin_memory()
+        self.merged_sparse_features = self.merged_sparse_features.pin_memory()
+        return self
+
+    def batch_size(self) -> int:
+        """
+        Get the batch size of the samples.
+
+        Returns:
+            Stride of ``uih_features_kjt`` (one entry per sample in the batch).
+        """
+        return self.uih_features_kjt.stride()
+
+
+def merge_uih_candidate_kjts(
+    uih_features: KeyedJaggedTensor,
+    candidates_features: KeyedJaggedTensor,
+) -> KeyedJaggedTensor:
+    """Concatenate the UIH and candidate KJTs into the single KJT consumed by the
+    model's ``EmbeddingCollection``.
+
+    Must mirror ``DlrmHSTU.preprocess`` exactly (key order = uih + candidates,
+    values/lengths concatenated in that order). Built on the dataloader side so
+    the model can read it straight off the batch and TorchRec can pipeline the
+    embedding ``input_dist``.
+    """
+    return KeyedJaggedTensor.from_lengths_sync(
+        keys=uih_features.keys() + candidates_features.keys(),
+        values=torch.cat(
+            [uih_features.values(), candidates_features.values()],
+            dim=0,
+        ),
+        lengths=torch.cat(
+            [uih_features.lengths(), candidates_features.lengths()],
+            dim=0,
+        ),
+    )
+
+
+def collate_fn(
+    samples: List[Tuple[KeyedJaggedTensor, KeyedJaggedTensor]],
+) -> Samples:
+    """
+    Collate multiple samples into a batched Samples object.
+
+    Args:
+        samples: Non-empty list of (uih_features, candidates_features) tuples.
+
+    Returns:
+        Batched Samples object, including the pre-merged UIH + candidate KJT.
+    """
+    (
+        uih_features_kjt_list,
+        candidates_features_kjt_list,
+    ) = list(zip(*samples))
+
+    uih_features_kjt = kjt_batch_func(uih_features_kjt_list)
+    candidates_features_kjt = kjt_batch_func(candidates_features_kjt_list)
+    return Samples(
+        uih_features_kjt=uih_features_kjt,
+        candidates_features_kjt=candidates_features_kjt,
+        merged_sparse_features=merge_uih_candidate_kjts(
+            uih_features_kjt, candidates_features_kjt
+        ),
+    )
+
+
+class Dataset:
+    """
+    Base dataset class for DLRMv3.
+
+    Provides the interface for loading, accessing, and managing samples
+    for recommendation model training and inference.
+
+    Args:
+        hstu_config: HSTU model configuration.
+        **args: Additional arguments (unused in base class).
+    """
+
+    def __init__(self, hstu_config: DlrmHSTUConfig, **args):
+        self.arrival = None
+        self.image_list = []
+        self.label_list = []
+        self.image_list_inmemory = {}
+        self.last_loaded = -1.0
+
+    def preprocess(self, use_cache=True):
+        """
+        Preprocess the dataset.
+
+        Args:
+            use_cache: Whether to use cached preprocessed data.
+
+        Raises:
+            NotImplementedError: Subclasses must implement this method.
+        """
+        raise NotImplementedError("Dataset:preprocess")
+
+    def get_item_count(self):
+        """
+        Get the total number of items in the dataset.
+
+        Returns:
+            Number of items.
+        """
+        return len(self.image_list)
+
+    def load_query_samples(self, sample_list):
+        """
+        Load specified samples into memory.
+
+        Args:
+            sample_list: List of sample indices to load.
+
+        Raises:
+            NotImplementedError: Subclasses must implement this method.
+        """
+        raise NotImplementedError("Dataset:load_query_samples")
+
+    def unload_query_samples(self, sample_list):
+        """
+        Unload specified samples from memory.
+
+        Args:
+            sample_list: List of sample indices to unload.
+
+        Raises:
+            NotImplementedError: Subclasses must implement this method.
+        """
+        raise NotImplementedError("Dataset:unload_query_samples")
+
+    def get_sample(self, id: int):
+        """
+        Get a single sample by ID.
+
+        Args:
+            id: Sample identifier.
+
+        Raises:
+            NotImplementedError: Subclasses must implement this method.
+        """
+        raise NotImplementedError("Dataset:get_sample")
+
+    def get_samples(self, id_list: List[int]) -> Samples:
+        """
+        Get multiple samples and collate them into a batch.
+
+        Args:
+            id_list: List of sample identifiers.
+
+        Returns:
+            Collated Samples object containing the batch.
+        """
+        list_samples = [self.get_sample(ix) for ix in id_list]
+        return collate_fn(list_samples)
+
+
+@torch.jit.script
+def kjt_batch_func(
+    kjt_list: List[KeyedJaggedTensor],
+) -> KeyedJaggedTensor:
+    """
+    Batch multiple KeyedJaggedTensors into a single tensor.
+
+    Uses FBGEMM operations for efficient batching and reordering of
+    jagged tensor data.
+
+    Args:
+        kjt_list: List of KeyedJaggedTensors to batch; keys come from the first.
+
+    Returns:
+        Batched KeyedJaggedTensor with reordered indices and lengths.
+    """
+    bs_list = [kjt.stride() for kjt in kjt_list]
+    bs = sum(bs_list)
+    batched_length = torch.cat([kjt.lengths() for kjt in kjt_list], dim=0)
+    batched_indices = torch.cat([kjt.values() for kjt in kjt_list], dim=0)
+    bs_offset = torch.ops.fbgemm.asynchronous_complete_cumsum(
+        torch.tensor(bs_list)
+    ).int()
+    batched_offset = torch.ops.fbgemm.asynchronous_complete_cumsum(batched_length)
+    reorder_length = torch.ops.fbgemm.reorder_batched_ad_lengths(
+        batched_length, bs_offset, bs
+    )
+    reorder_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(reorder_length)
+    reorder_indices = torch.ops.fbgemm.reorder_batched_ad_indices(
+        batched_offset, batched_indices, reorder_offsets, bs_offset, bs
+    )
+    out = KeyedJaggedTensor(
+        keys=kjt_list[0].keys(),
+        lengths=reorder_length.long(),
+        values=reorder_indices.long(),
+    )
+    return out
+
+
+def get_random_data(
+    contexual_features: List[str],
+    hstu_uih_keys: List[str],
+    hstu_candidates_keys: List[str],
+    uih_max_seq_len: int,
+    max_num_candidates: int,
+    value_bound: int = 1000,
+):
+    """
+    Generate random sample data for testing and debugging.
+
+    Creates synthetic user interaction history and candidate features
+    with random values.
+
+    Args:
+        contexual_features: Contextual feature names; each gets exactly one value.
+        hstu_uih_keys: UIH feature keys; those not contextual become sequences.
+        hstu_candidates_keys: List of candidate feature keys.
+        uih_max_seq_len: Inclusive upper bound of the random UIH length (min 80%).
+        max_num_candidates: Inclusive upper bound of the random candidate count.
+        value_bound: Exclusive upper bound for random values (drawn from 1 up).
+
+    Returns:
+        Tuple of (uih_features_kjt, candidates_features_kjt).
+    """
+    uih_non_seq_feature_keys = contexual_features
+    uih_seq_feature_keys = [
+        k for k in hstu_uih_keys if k not in uih_non_seq_feature_keys
+    ]
+    uih_seq_len = torch.randint(
+        int(uih_max_seq_len * 0.8),
+        uih_max_seq_len + 1,
+        (1,),
+    ).item()
+    uih_lengths = torch.tensor(
+        [1 for _ in uih_non_seq_feature_keys]
+        + [uih_seq_len for _ in uih_seq_feature_keys]
+    )
+    # One length per key: 1 per contextual key, uih_seq_len per sequence key.
+    uih_values = torch.randint(
+        1,
+        value_bound,
+        # pyre-ignore[6]
+        (uih_seq_len * len(uih_seq_feature_keys) + len(uih_non_seq_feature_keys),),
+    )
+    uih_features_kjt = KeyedJaggedTensor(
+        keys=uih_non_seq_feature_keys + uih_seq_feature_keys,
+        lengths=uih_lengths.long(),
+        values=uih_values.long(),
+    )
+    num_candidates = torch.randint(
+        1,
+        max_num_candidates + 1,
+        (1,),
+    ).item()
+    candidates_lengths = num_candidates * torch.ones(len(hstu_candidates_keys))
+    candidates_values = torch.randint(
+        1,
+        value_bound,
+        (num_candidates * len(hstu_candidates_keys),),  # pyre-ignore[6]
+    )
+    candidates_features_kjt = KeyedJaggedTensor(
+        keys=hstu_candidates_keys,
+        lengths=candidates_lengths.long(),
+        values=candidates_values.long(),
+    )
+    return uih_features_kjt, candidates_features_kjt
+
+
+class DLRMv3RandomDataset(Dataset):
+    """
+    Dataset that generates random synthetic data for DLRMv3.
+
+    Useful for testing and benchmarking without real data dependencies.
+
+    Args:
+        hstu_config: HSTU model configuration.
+        num_aggregated_samples: Value reported by ``get_item_count``.
+        is_inference: Selects the inference candidate cap when loading samples.
+        *args: Accepted for signature compatibility; not used here.
+        **kwargs: Accepted for signature compatibility; not used here.
+    """
+
+    def __init__(
+        self,
+        hstu_config: DlrmHSTUConfig,
+        num_aggregated_samples: int = 10000,
+        is_inference: bool = False,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(
+            hstu_config=hstu_config,
+        )
+        self.hstu_config: DlrmHSTUConfig = hstu_config
+        self._max_num_candidates: int = hstu_config.max_num_candidates
+        self._max_num_candidates_inference: int = (
+            hstu_config.max_num_candidates_inference
+        )
+        self._max_seq_len: int = hstu_config.max_seq_len
+        self._uih_keys: List[str] = hstu_config.hstu_uih_feature_names
+        self._candidates_keys: List[str] = hstu_config.hstu_candidate_feature_names
+        self._contextual_feature_to_max_length: Dict[str, int] = (
+            hstu_config.contextual_feature_to_max_length
+        )
+        self._max_uih_len: int = (
+            self._max_seq_len
+            - self._max_num_candidates
+            - (
+                len(self._contextual_feature_to_max_length)
+                if self._contextual_feature_to_max_length
+                else 0
+            )
+        )
+        self._is_inference = is_inference
+
+        # Iterating the mapping yields its keys, i.e. the whole feature names;
+        # indexing a key (`p[0]`) would keep only the name's first character.
+        self.contexual_features = list(
+            hstu_config.contextual_feature_to_max_length or ()
+        )
+
+        self.num_aggregated_samples = num_aggregated_samples
+        self.items_in_memory = {}
+
+    def get_sample(self, id: int) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]:
+        """
+        Get a sample by ID from in-memory storage.
+
+        Args:
+            id: Sample identifier.
+
+        Returns:
+            Tuple of (uih_features_kjt, candidates_features_kjt).
+        """
+        return self.items_in_memory[id]
+
+    def get_item_count(self):
+        """
+        Get the total number of samples in the dataset.
+
+        Returns:
+            Number of aggregated samples.
+        """
+        return self.num_aggregated_samples
+
+    def unload_query_samples(self, sample_list):
+        """
+        Clear all samples from memory.
+
+        Args:
+            sample_list: Ignored; clears all samples.
+        """
+        self.items_in_memory = {}
+
+    def load_query_samples(self, sample_list):
+        """
+        Generate and load random samples into memory.
+
+        Args:
+            sample_list: List of sample IDs to generate.
+        """
+        max_num_candidates = (
+            self._max_num_candidates_inference
+            if self._is_inference
+            else self._max_num_candidates
+        )
+        self.items_in_memory = {}
+        for sample in sample_list:
+            self.items_in_memory[sample] = get_random_data(
+                contexual_features=self.contexual_features,
+                hstu_uih_keys=self.hstu_config.hstu_uih_feature_names,
+                hstu_candidates_keys=self.hstu_config.hstu_candidate_feature_names,
+                uih_max_seq_len=self._max_uih_len,
+                max_num_candidates=max_num_candidates,
+            )
+        self.last_loaded = time.time()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/kuairand.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/kuairand.py
new file mode 100644
index 000000000..f6cd9e672
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/kuairand.py
@@ -0,0 +1,163 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
def process_and_hash_x(x: Any, hash_size: int) -> Any:
    """
    Reduce a feature value (or every element of a list of values) modulo hash_size.

    Args:
        x: A value supporting ``%``, a list of such values, or a JSON string
            that decodes to either of those.
        hash_size: Modulus applied to each value.

    Returns:
        A list of reduced values when the (decoded) input is a list,
        otherwise the single reduced value.
    """
    decoded = json.loads(x) if isinstance(x, str) else x
    if not isinstance(decoded, list):
        return decoded % hash_size
    return [element % hash_size for element in decoded]
+ ) + hash_size = table.num_embeddings + self.seq_logs_frame[key] = self.seq_logs_frame[key].apply( + partial(process_and_hash_x, hash_size=hash_size) + ) + + def get_item_count(self): + return len(self.seq_logs_frame) + + def unload_query_samples(self, sample_list): + self.items_in_memory = {} + + def load_query_samples(self, sample_list): + max_num_candidates = ( + self._max_num_candidates_inference + if self._is_inference + else self._max_num_candidates + ) + self.items_in_memory = {} + for idx in sample_list: + data = self.seq_logs_frame.iloc[idx] + if len(data.video_id) <= max_num_candidates: + continue + sample = self.load_item(data, max_num_candidates) + self.items_in_memory[idx] = sample + + self.last_loaded = time.time() + + def load_item(self, data, max_num_candidates): + with torch.profiler.record_function("load_item"): + video_history_uih, video_history_candidates = separate_uih_candidates( + data.video_id, + candidates_max_seq_len=max_num_candidates, + ) + action_weights_uih, action_weights_candidates = separate_uih_candidates( + data.action_weights, + candidates_max_seq_len=max_num_candidates, + ) + timestamps_uih, _ = separate_uih_candidates( + data.time_ms, + candidates_max_seq_len=max_num_candidates, + ) + watch_time_uih, watch_time_candidates = separate_uih_candidates( + data.play_time_ms, + candidates_max_seq_len=max_num_candidates, + ) + + video_history_uih = maybe_truncate_seq(video_history_uih, self._max_uih_len) + action_weights_uih = maybe_truncate_seq( + action_weights_uih, self._max_uih_len + ) + timestamps_uih = maybe_truncate_seq(timestamps_uih, self._max_uih_len) + watch_time_uih = maybe_truncate_seq(watch_time_uih, self._max_uih_len) + + uih_seq_len = len(video_history_uih) + assert uih_seq_len == len(timestamps_uih), ( + "history len differs from timestamp len." + ) + assert uih_seq_len == len(action_weights_uih), ( + "history len differs from weights len." 
+ ) + assert uih_seq_len == len(watch_time_uih), ( + "history len differs from watch time len." + ) + + uih_kjt_values: List[torch.Tensor] = [] + uih_kjt_lengths: List[torch.Tensor] = [] + for name, length in self._contextual_feature_to_max_length.items(): + uih_kjt_values.append(data[name]) + uih_kjt_lengths.append(length) + + uih_kjt_values.extend( + video_history_uih + timestamps_uih + action_weights_uih + watch_time_uih + ) + + uih_kjt_lengths.extend( + [ + uih_seq_len + for _ in range( + len(self._uih_keys) + - len(self._contextual_feature_to_max_length) + ) + ] + ) + + dummy_query_time = max(timestamps_uih) + uih_features_kjt = KeyedJaggedTensor( + keys=self._uih_keys, + lengths=torch.tensor(uih_kjt_lengths).long(), + values=torch.tensor(uih_kjt_values).long(), + ) + + candidates_kjt_lengths = max_num_candidates * torch.ones( + len(self._candidates_keys) + ) + candidates_kjt_values = ( + video_history_candidates + + action_weights_candidates + + watch_time_candidates + + [dummy_query_time] * max_num_candidates + ) + candidates_features_kjt = KeyedJaggedTensor( + keys=self._candidates_keys, + lengths=torch.tensor(candidates_kjt_lengths).long(), + values=torch.tensor(candidates_kjt_values).long(), + ) + + return uih_features_kjt, candidates_features_kjt diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/movie_lens.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/movie_lens.py new file mode 100644 index 000000000..d74fb575b --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/movie_lens.py @@ -0,0 +1,177 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pyre-unsafe +import logging +import time +from typing import List, Optional + +import pandas as pd +import torch +from generative_recommenders.dlrm_v3.datasets.dataset import DLRMv3RandomDataset +from generative_recommenders.dlrm_v3.datasets.utils import ( + maybe_truncate_seq, + separate_uih_candidates, +) +from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig +from torchrec.sparse.jagged_tensor import KeyedJaggedTensor + +logger = logging.getLogger(__name__) + + +class DLRMv3MovieLensDataset(DLRMv3RandomDataset): + def __init__( + self, + hstu_config: DlrmHSTUConfig, + ratings_file: str, + is_inference: bool, + *args, + **kwargs, + ): + super().__init__(hstu_config=hstu_config, is_inference=is_inference) + self.ratings_frame: Optional[pd.DataFrame] = None + if ratings_file != "": + self.ratings_frame = pd.read_csv( + ratings_file, + delimiter=",", + ) + assert hstu_config.action_weights is not None + self.action_weights: List[int] = hstu_config.action_weights + + def get_item_count(self): + assert self.ratings_frame is not None + return len(self.ratings_frame) + + def unload_query_samples(self, sample_list): + self.items_in_memory = {} + + def iloc(self, idx): + assert self.ratings_frame is not None + return self.ratings_frame.iloc[idx] + + def load_query_samples(self, sample_list): + max_num_candidates = ( + self._max_num_candidates_inference + if self._is_inference + else self._max_num_candidates + ) + self.items_in_memory = {} + for idx in sample_list: + data = self.iloc(idx) + if len(data.sequence_item_ids) <= max_num_candidates: + 
continue + sample = self.load_item(data, max_num_candidates) + self.items_in_memory[idx] = sample + + self.last_loaded = time.time() + + def get_timestamp_uih(self, data, max_num_candidates, size): + movie_timestamps_uih, _ = separate_uih_candidates( + data.sequence_timestamps, + candidates_max_seq_len=max_num_candidates, + ) + return movie_timestamps_uih + + def load_item(self, data, max_num_candidates): + movie_history_uih, movie_history_candidates = separate_uih_candidates( + data.sequence_item_ids, + candidates_max_seq_len=max_num_candidates, + ) + movie_history_ratings_uih, movie_history_ratings_candidates = ( + separate_uih_candidates( + data.sequence_ratings, + candidates_max_seq_len=max_num_candidates, + ) + ) + movie_timestamps_uih = self.get_timestamp_uih( + data=data, + max_num_candidates=max_num_candidates, + size=len(movie_history_uih), + ) + + assert len(movie_history_uih) == len(movie_timestamps_uih), ( + "history len differs from timestamp len." + ) + assert len(movie_history_uih) == len(movie_history_ratings_uih), ( + "history len differs from ratings len." 
+ ) + + movie_history_uih = maybe_truncate_seq(movie_history_uih, self._max_uih_len) + movie_history_ratings_uih = maybe_truncate_seq( + movie_history_ratings_uih, self._max_uih_len + ) + movie_timestamps_uih = maybe_truncate_seq( + movie_timestamps_uih, self._max_uih_len + ) + + uih_kjt_values: List[torch.Tensor] = [] + uih_kjt_lengths: List[torch.Tensor] = [] + for name, length in self._contextual_feature_to_max_length.items(): + uih_kjt_values.append(data[name]) + uih_kjt_lengths.append(length) + + uih_seq_len = len(movie_history_uih) + movie_dummy_watch_times_uih = [0 for _ in range(uih_seq_len)] + action_weights_uih = [ + self.action_weights[int(rating) - 1] for rating in movie_history_ratings_uih + ] + uih_kjt_values.extend( + movie_history_uih + + movie_history_ratings_uih + + movie_timestamps_uih + + action_weights_uih + + movie_dummy_watch_times_uih + ) + uih_kjt_lengths.extend( + [ + uih_seq_len + for _ in range( + len(self._uih_keys) - len(self._contextual_feature_to_max_length) + ) + ] + ) + + dummy_query_time = ( + 0 if movie_timestamps_uih == [] else max(movie_timestamps_uih) + ) + uih_kjt_values.append(dummy_query_time) + uih_kjt_lengths.append(1) + uih_features_kjt = KeyedJaggedTensor( + keys=self._uih_keys + ["dummy_query_time"], + lengths=torch.tensor(uih_kjt_lengths).long(), + values=torch.tensor(uih_kjt_values).long(), + ) + + candidates_kjt_lengths = max_num_candidates * torch.ones( + len(self._candidates_keys) + ) + action_weights_candidates = [ + int(rating >= 3.5) for rating in movie_history_ratings_candidates + ] + candidates_kjt_values = ( + movie_history_candidates + + movie_history_ratings_candidates + + [dummy_query_time] * max_num_candidates # item_query_time + + action_weights_candidates + + [1] * max_num_candidates # item_dummy_watchtime + ) + candidates_features_kjt = KeyedJaggedTensor( + keys=self._candidates_keys, + lengths=candidates_kjt_lengths.detach().clone().long(), + values=torch.tensor(candidates_kjt_values).long(), + ) + 
return ( + uih_features_kjt, + candidates_features_kjt, + ) diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/synthetic_movie_lens.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/synthetic_movie_lens.py new file mode 100644 index 000000000..6cf8a5f56 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/synthetic_movie_lens.py @@ -0,0 +1,83 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class DLRMv3SyntheticMovieLensDataset(DLRMv3MovieLensDataset):
    """
    MovieLens-style dataset backed by sharded synthetic CSV files.

    ``<prefix>_users.csv`` supplies one row per shard; the integer in its
    second column is accumulated into ``users_cumsum`` and used to map a
    global row index to ``<prefix>_<shard>.csv`` and a line within it.
    """

    def __init__(
        self,
        hstu_config: DlrmHSTUConfig,
        ratings_file_prefix: str,
        is_inference: bool,
        *args,
        **kwargs,
    ):
        """
        Read the per-shard row counts and build their running total.

        Args:
            hstu_config: HSTU model configuration, forwarded to the parent.
            ratings_file_prefix: Path prefix shared by the users file and shards.
            is_inference: Forwarded to the parent.
            *args: Accepted for signature compatibility; unused here.
            **kwargs: Accepted for signature compatibility; unused here.
        """
        # An empty ratings_file makes the parent skip its own CSV load.
        super().__init__(
            hstu_config=hstu_config, is_inference=is_inference, ratings_file=""
        )
        self.ratings_file_prefix = ratings_file_prefix
        with open(f"{self.ratings_file_prefix}_users.csv", "r") as file:
            reader = csv.reader(file)
            self.users_cumsum: List[int] = np.cumsum(
                [int(row[1]) for row in reader]
            ).tolist()

    def get_item_count(self):
        """Return the total number of rows across all shards (0 if there are none)."""
        return self.users_cumsum[-1] if self.users_cumsum else 0

    def _process_line(self, line: str) -> pd.Series:
        """
        Parse one shard line into a row.

        Args:
            line: CSV text whose first three fields are the user id, the item
                id sequence and the rating sequence.

        Returns:
            pd.Series with ``user_id``, ``sequence_item_ids`` and
            ``sequence_ratings``; the two sequences are kept as raw strings.
        """
        reader = csv.reader([line])
        parsed_line = next(reader)
        user_id = int(parsed_line[0])
        sequence_item_ids = parsed_line[1]
        sequence_ratings = parsed_line[2]
        return pd.Series(
            data={
                "user_id": user_id,
                "sequence_item_ids": sequence_item_ids,
                "sequence_ratings": sequence_ratings,
            }
        )

    def iloc(self, idx) -> pd.Series:
        """
        Fetch the row at global index ``idx``.

        Args:
            idx: Zero-based index in ``[0, get_item_count())``.

        Returns:
            The parsed row as produced by ``_process_line``.
        """
        import bisect

        assert 0 <= idx < self.get_item_count(), (
            f"index {idx} out of range for {self.get_item_count()} rows"
        )
        # First shard whose cumulative count exceeds idx; same result as the
        # former linear scan `while users_cumsum[i] <= idx`.
        file_idx: int = bisect.bisect_right(self.users_cumsum, idx)
        rows_before = self.users_cumsum[file_idx - 1] if file_idx > 0 else 0
        local_idx = idx - rows_before
        # linecache line numbers are 1-based.
        line = linecache.getline(
            f"{self.ratings_file_prefix}_{file_idx}.csv", local_idx + 1
        )
        return self._process_line(line)

    def get_timestamp_uih(self, data, max_num_candidates, size):
        """Return a constant timestamp of 1 for each of the ``size`` history events."""
        return [1] * size
100644 index 000000000..6e38fe334 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/synthetic_streaming.py @@ -0,0 +1,403 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pyre-strict +""" +Synthetic streaming dataset for DLRMv3 inference benchmarking. + +This module provides a streaming dataset implementation that loads user interaction +data from pre-generated CSV files with temporal (timestamp) organization, suitable +for simulating real-time recommendation scenarios. +""" + +import csv +import logging +import sys +import time +from typing import Any, Dict, List, Set, Tuple + +import pandas as pd +import torch +from generative_recommenders.dlrm_v3.datasets.dataset import ( + collate_fn, + DLRMv3RandomDataset, + Samples, +) +from generative_recommenders.dlrm_v3.datasets.utils import ( + json_loads, + maybe_truncate_seq, +) +from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig +from torchrec.sparse.jagged_tensor import KeyedJaggedTensor + +csv.field_size_limit(sys.maxsize) +logger: logging.Logger = logging.getLogger(__name__) + + +class DLRMv3SyntheticStreamingDataset(DLRMv3RandomDataset): + """ + Streaming dataset that loads pre-generated synthetic recommendation data. + + Supports timestamp-based data organization for simulating streaming scenarios + where user interaction histories evolve over time. + + Args: + hstu_config: HSTU model configuration. 
+ ratings_file_prefix: Path prefix for rating data files. + is_inference: Whether dataset is used for inference. + train_ts: Number of timestamps used for training. + total_ts: Total number of timestamps in the data. + num_files: Number of data files (for parallelization). + num_users: Total number of users in the dataset. + num_items: Total number of items in the catalog. + num_categories: Number of item categories. + *args: Additional positional arguments. + **kwargs: Additional keyword arguments. + """ + + def __init__( + self, + hstu_config: DlrmHSTUConfig, + ratings_file_prefix: str, + is_inference: bool, + train_ts: int, + total_ts: int, + num_files: int, + num_users: int, + num_items: int, + num_categories: int, + *args: Any, + **kwargs: Any, + ) -> None: + super().__init__(hstu_config=hstu_config, is_inference=is_inference) + self.ratings_file_prefix = ratings_file_prefix + self.file_to_offsets: Dict[int, List[int]] = {} + with open(f"{self.ratings_file_prefix}offset.csv", "r") as file: + reader = csv.reader(file) + for size in range(num_files): + row = next(reader) + assert len(row) == 1 + offset = json_loads(row[0]) + assert len(offset) == num_users // num_files + self.file_to_offsets[size] = offset + self.ts_requests_offsets: List[int] = [] + with open(f"{self.ratings_file_prefix}requests_per_ts_offset.csv", "r") as file: + reader = csv.reader(file) + row = next(reader) + assert len(row) == 1 + self.ts_requests_offsets = json_loads(row[0]) + assert len(self.ts_requests_offsets) == total_ts + self.requests: List[int] = [] + self.ts_to_users_cumsum: Dict[int, List[int]] = {} + with open( + f"{self.ratings_file_prefix}users_cumsum_per_ts.csv", "r" + ) as cumsum_file: + reader = csv.reader(cumsum_file) + ts = 0 + for row in reader: + assert len(row) == 1 + cumsum = json_loads(row[0]) + self.ts_to_users_cumsum[ts] = cumsum + ts += 1 + self.train_ts = train_ts + self.total_ts = total_ts + self.num_files = num_files + self.ts: int = -1 + self.is_inference: bool 
= False + self.is_eval: bool = False + self.users_per_file: int = num_users // num_files + self.cached_files: Set[str] = set() + self.items_per_category: int = num_items // num_categories + assert hstu_config.action_weights is not None + self.action_weights: List[int] = hstu_config.action_weights + self.items_in_memory: Dict[ + int, Dict[int, Tuple[KeyedJaggedTensor, KeyedJaggedTensor]] + ] = {} + + def get_item_count(self) -> int: + return len(self.requests) + + def load_query_samples(self, sample_list: List[int]) -> None: + max_num_candidates = ( + self._max_num_candidates_inference + if self._is_inference + else self._max_num_candidates + ) + for idx in sample_list: + data = self.iloc(idx) + sample = self.load_item(data, max_num_candidates) + if self.ts not in self.items_in_memory: + self.items_in_memory[self.ts] = {} + self.items_in_memory[self.ts][idx] = sample + + self.last_loaded = time.time() + + def unload_query_samples(self, sample_list: List[int]) -> None: + self.items_in_memory = {} + + def get_sample(self, id: int) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]: + return self.items_in_memory[self.ts][id] + + def get_sample_with_ts( + self, id: int, ts: int + ) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]: + """ + Get a sample for a specific timestamp. + + Args: + id: Sample identifier. + ts: Timestamp index. + + Returns: + Tuple of (uih_features_kjt, candidates_features_kjt). + """ + return self.items_in_memory[ts][id] + + def get_samples_with_ts(self, id_list: List[int], ts: int) -> Samples: + """ + Get and collate multiple samples for a specific timestamp. + + Args: + id_list: List of sample identifiers. + ts: Timestamp index. + + Returns: + Collated Samples object. + """ + list_samples = [self.get_sample_with_ts(ix, ts) for ix in id_list] + return collate_fn(list_samples) + + def _process_line(self, line: str, user_id: int) -> pd.Series: + """ + Parse a CSV line into a pandas Series with user interaction data. 
+ + Args: + line: CSV line containing user data. + user_id: User identifier. + + Returns: + pd.Series with parsed user interaction history and candidates. + """ + reader = csv.reader([line]) + parsed_line = next(reader) + # total ts + one more eval ts + one base ts so that uih won't be zero + # for each ts, ordered as candidate_ids, candidate_ratings, uih_ids, uih_ratings + assert len(parsed_line) == 4 * (self.total_ts + 2) + uih_item_ids_list = [] + uih_ratings_list = [] + candidate_item_ids = "" + candidate_ratings = "" + if (not self.is_eval) and (not self.is_inference): + assert self.ts < self.train_ts + for i in range(self.ts + 1): + if parsed_line[4 * i]: + uih_item_ids_list.append(parsed_line[2 + 4 * i]) + uih_ratings_list.append(parsed_line[3 + 4 * i]) + candidate_item_ids = parsed_line[4 * (self.ts + 1)] + candidate_ratings = parsed_line[1 + 4 * (self.ts + 1)] + elif self.is_eval: + for i in range(self.ts + 1): + if parsed_line[4 * i]: + uih_item_ids_list.append(parsed_line[2 + 4 * i]) + uih_ratings_list.append(parsed_line[3 + 4 * i]) + candidate_item_ids = parsed_line[4 * (self.ts + 1)] + candidate_ratings = parsed_line[1 + 4 * (self.ts + 1)] + else: + assert self.is_inference is True + assert self.ts >= self.train_ts + for i in range(self.train_ts + 1): + if parsed_line[4 * i]: + uih_item_ids_list.append(parsed_line[2 + 4 * i]) + uih_ratings_list.append(parsed_line[3 + 4 * i]) + for i in range(self.train_ts + 2, self.ts + 2): + if parsed_line[4 * i]: + uih_item_ids_list.append(parsed_line[2 + 4 * i]) + uih_ratings_list.append(parsed_line[3 + 4 * i]) + candidate_item_ids = parsed_line[4 * (self.ts + 2)] + candidate_ratings = parsed_line[1 + 4 * (self.ts + 2)] + uih_item_ids = ",".join(uih_item_ids_list) + uih_ratings = ",".join(uih_ratings_list) + assert candidate_item_ids != "" and candidate_ratings != "" + return pd.Series( + data={ + "user_id": user_id, + "uih_item_ids": uih_item_ids, + "uih_ratings": uih_ratings, + "candidate_item_ids": 
candidate_item_ids, + "candidate_ratings": candidate_ratings, + } + ) + + def iloc(self, idx: int) -> pd.Series: + """ + Get user data by request index using file offsets for efficient access. + + Args: + idx: Request index within the current timestamp. + + Returns: + pd.Series with parsed user interaction data. + """ + cumsum: List[int] = self.ts_to_users_cumsum[self.ts] + assert cumsum != [] + assert idx < cumsum[-1] + file_idx: int = 0 + while cumsum[file_idx] <= idx: + file_idx += 1 + user_idx = self.requests[idx] + filename = f"{self.ratings_file_prefix}{file_idx}.csv" + with open(filename, "r") as file: + idx = user_idx % self.users_per_file + file.seek(self.file_to_offsets[file_idx][idx]) + line = file.readline() + data = self._process_line(line=line, user_id=user_idx) + return data + + def get_timestamp_uih( + self, data: pd.Series, max_num_candidates: int, size: int + ) -> List[int]: + return [1] * size + + def set_ts(self, ts: int, train_only: bool = False) -> None: + """ + Set the current timestamp and load associated request data. + + Args: + ts: Timestamp index to set. + train_only: Accepted for API parity with the yambda dataset (which + supports a user-level train:eval holdout). This synthetic + dataset has no holdout, so the flag is ignored. 
+ """ + logger.warning(f"Streaming dataset ts set to {ts}") + if ts == self.ts: + return + self.ts = ts + with open( + f"{self.ratings_file_prefix}requests_per_ts.csv", "r" + ) as request_file: + request_file.seek(self.ts_requests_offsets[self.ts]) + line = request_file.readline() + reader = csv.reader([line]) + row = next(reader) + assert len(row) == 1 + requests = json_loads(row[0]) + self.requests = requests + logger.warning(f"DLRMv3SyntheticStreamingDataset: ts={ts} requests loaded") + assert self.ts_to_users_cumsum[self.ts][-1] == len(self.requests) + logger.warning( + f"DLRMv3SyntheticStreamingDataset: ts={ts} users_cumsum={self.ts_to_users_cumsum[self.ts]}" + ) + + def load_item( + self, data: pd.Series, max_num_candidates: int + ) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]: + """ + Load and process a single user's data into KeyedJaggedTensors. + + Converts parsed user data into feature tensors suitable for model input, + including truncation to maximum sequence lengths. + + Args: + data: pd.Series with user interaction history and candidates. + max_num_candidates: Maximum number of candidates to include. + + Returns: + Tuple of (uih_features_kjt, candidates_features_kjt). + """ + ids_uih = json_loads(data.uih_item_ids) + ids_candidates = json_loads(data.candidate_item_ids) + ratings_uih = json_loads(data.uih_ratings) + ratings_candidates = json_loads(data.candidate_ratings) + timestamps_uih = self.get_timestamp_uih( + data=data, + max_num_candidates=max_num_candidates, + size=len(ids_uih), + ) + assert len(ids_uih) == len(timestamps_uih), ( + "history len differs from timestamp len." + ) + assert len(ids_uih) == len(ratings_uih), ( + f"history len {len(ids_uih)} differs from ratings len {len(ratings_uih)}." + ) + assert len(ids_candidates) == len(ratings_candidates), ( + f"candidates len {len(ids_candidates)} differs from ratings len {len(ratings_candidates)}." 
+ ) + + ids_uih = maybe_truncate_seq(ids_uih, self._max_uih_len) + ratings_uih = maybe_truncate_seq(ratings_uih, self._max_uih_len) + timestamps_uih = maybe_truncate_seq(timestamps_uih, self._max_uih_len) + ids_candidates = maybe_truncate_seq(ids_candidates, max_num_candidates) + num_candidates = len(ids_candidates) + ratings_candidates = maybe_truncate_seq(ratings_candidates, max_num_candidates) + action_weights_uih = [ + self.action_weights[int(rating) - 1] for rating in ratings_uih + ] + action_weights_candidates = [ + int(rating >= 3.5) for rating in ratings_candidates + ] + + uih_kjt_values: List[int] = [] + uih_kjt_lengths: List[int] = [] + for name, length in self._contextual_feature_to_max_length.items(): + uih_kjt_values.append(data[name]) + uih_kjt_lengths.append(length) + + uih_seq_len = len(ids_uih) + dummy_watch_times_uih = [0 for _ in range(uih_seq_len)] + item_category_ids = [id // self.items_per_category for id in ids_uih] + extend_uih_kjt_values: List[int] = ( + ids_uih + + ratings_uih + + timestamps_uih + + action_weights_uih + + dummy_watch_times_uih + + item_category_ids + ) + uih_kjt_values.extend(extend_uih_kjt_values) + uih_kjt_lengths.extend( + [ + uih_seq_len + for _ in range( + len(self._uih_keys) - len(self._contextual_feature_to_max_length) + ) + ] + ) + + dummy_query_time = 0 if timestamps_uih == [] else max(timestamps_uih) + uih_kjt_values.append(dummy_query_time) + uih_kjt_lengths.append(1) + uih_features_kjt: KeyedJaggedTensor = KeyedJaggedTensor( + keys=self._uih_keys + ["dummy_query_time"], + lengths=torch.tensor(uih_kjt_lengths).long(), + values=torch.tensor(uih_kjt_values).long(), + ) + + candidates_kjt_lengths = num_candidates * torch.ones(len(self._candidates_keys)) + item_candidate_category_ids = [ + id // self.items_per_category for id in ids_candidates + ] + candidates_kjt_values = ( + ids_candidates + + ratings_candidates + + [dummy_query_time] * num_candidates # item_query_time + + action_weights_candidates + + [1] * 
num_candidates # item_dummy_watchtime + + item_candidate_category_ids + ) + candidates_features_kjt: KeyedJaggedTensor = KeyedJaggedTensor( + keys=self._candidates_keys, + lengths=candidates_kjt_lengths.detach().clone().long(), + values=torch.tensor(candidates_kjt_values).long(), + ) + return uih_features_kjt, candidates_features_kjt diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/utils.py new file mode 100644 index 000000000..aeca75d41 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/utils.py @@ -0,0 +1,146 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pyre-unsafe +""" +Utility functions for dataset processing. + +This module provides helper functions for parsing and processing data +in the DLRMv3 dataset pipeline. +""" + +import json +import struct +from typing import Dict, List, Sequence, Tuple + +import numpy as np +import xxhash + + +def json_loads( + x: str | int | List[int], +) -> List[int]: + """ + Parse a JSON-like string into a list of integers. + + Handles multiple input formats including JSON arrays, comma-separated + strings, and single values. + + Args: + x: Input that can be a JSON array string, a single integer, + or already a list of integers. + + Returns: + List of integers parsed from the input. 
+ """ + if isinstance(x, str): + if x[0] != "[" and x[-1] != "]": + x = "[" + x + "]" + y = json.loads(x) + else: + y = x + y_list = [y] if type(y) == int else list(y) + return y_list + + +def separate_uih_candidates( + x: str | int | List[int], + candidates_max_seq_len: int, +) -> Tuple[List[int], List[int]]: + """ + Separate a sequence into user interaction history (UIH) and candidates. + + Splits the input sequence such that the last `candidates_max_seq_len` + elements become candidates and the rest become UIH. + + Args: + x: Input sequence as JSON string, single int, or list of ints. + candidates_max_seq_len: Number of items at the end to use as candidates. + + Returns: + Tuple of (uih, candidates) where both are lists of integers. + """ + if isinstance(x, str): + if x[0] != "[" and x[-1] != "]": + x = "[" + x + "]" + y = json.loads(x) + else: + y = x + y_list = [y] if type(y) == int else list(y) + candidates, uih = ( + y_list[-candidates_max_seq_len:], + y_list[:-candidates_max_seq_len], + ) + return uih, candidates + + +def maybe_truncate_seq( + y: List[int], + max_seq_len: int, +) -> List[int]: + """ + Truncate a sequence if it exceeds the maximum length. + + Args: + y: Input sequence to potentially truncate. + max_seq_len: Maximum allowed sequence length. + + Returns: + The input sequence, truncated to max_seq_len if necessary. + """ + y_len = len(y) + if y_len > max_seq_len: + y = y[:max_seq_len] + return y + + +def xxhash_cross( + anchor: Dict[str, int], + keys: Sequence[str], + table_size: int, + salt: int = 0, +) -> int: + """xxhash64(seed=salt) over little-endian int64 concat(anchor[k] for k in keys), mod table_size. + + Bit-identical to primus_dlrm.data.hashing.cross_hash_nway — embedding rows + are interchangeable with Primus-trained ones. 
+ """ + n = len(keys) + assert n >= 2, f"xxhash_cross needs >=2 keys, got {n}" + digest = xxhash.xxh64(seed=salt) + digest.update(struct.Struct(f"<{n}q").pack(*(int(anchor[k]) for k in keys))) + return digest.intdigest() % table_size + + +def xxhash_cross_batch( + arr_by_key: Dict[str, np.ndarray], + keys: Sequence[str], + table_size: int, + salt: int = 0, +) -> np.ndarray: + """Vectorised xxhash_cross over equal-length int64 arrays (one per key).""" + n = len(keys) + assert n >= 2 + cols = [np.asarray(arr_by_key[k], dtype=np.int64).ravel() for k in keys] + length = cols[0].shape[0] + for c in cols: + assert c.shape[0] == length + pack = struct.Struct(f"<{n}q").pack + digest_cls = xxhash.xxh64 + out = np.empty(length, dtype=np.int64) + for i in range(length): + d = digest_cls(seed=salt) + d.update(pack(*(int(c[i]) for c in cols))) + out[i] = d.intdigest() % table_size + return out diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py new file mode 100644 index 000000000..00b22cff9 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py @@ -0,0 +1,1066 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 + +# pyre-unsafe +""" +Yambda dataset for the DLRMv3 HSTU `modules/` path. + +Reads the parquets produced by `dlrm_v3/preprocess_public_data.py +--dataset yambda-`. 
Each sample is one anchor LISTEN event with: + * label = (played_ratio >= LISTEN_PLUS_THRESHOLD) — the listen_plus bit + * a chronologically interleaved 3-pool history (listen+/like/skip), with + pool identity tagged per-position in `action_weight` (bits 1/2/4) + * 7 pre-hashed cross-feature ids exposed as length-1 contextual entries +""" + +import logging +import mmap as _mmap_mod +import os +import time +from pathlib import Path +from typing import Dict, List, Optional, Sequence, Tuple, Union + +import numpy as np +import polars as pl +import torch +from generative_recommenders.dlrm_v3.datasets.dataset import DLRMv3RandomDataset +from generative_recommenders.dlrm_v3.datasets.utils import xxhash_cross +from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig +from torchrec.sparse.jagged_tensor import KeyedJaggedTensor + +logger = logging.getLogger(__name__) + + +def _load_npy_readonly(path: Union[str, Path]) -> np.ndarray: + # MAP_SHARED + PROT_READ so the kernel does not charge the mapping against + # vm.overcommit_memory=2 limits. numpy's mmap_mode='r' uses MAP_PRIVATE and + # reserves per-process commit; at 8 ranks × ~190 GB store, that OOMs. + path = Path(path) + with open(path, "rb") as f: + version = np.lib.format.read_magic(f) + if version[0] == 1: + shape, _, dtype = np.lib.format.read_array_header_1_0(f) + else: + shape, _, dtype = np.lib.format.read_array_header_2_0(f) + offset = f.tell() + fd = os.open(str(path), os.O_RDONLY) + try: + buf = _mmap_mod.mmap(fd, 0, access=_mmap_mod.ACCESS_READ) + finally: + os.close(fd) + arr = np.ndarray(shape, dtype=dtype, buffer=buf, offset=offset) + arr.flags.writeable = False + return arr + + +def _uid_unit_hash(uids: np.ndarray, salt: int) -> np.ndarray: + """Deterministic uniform-in-[0,1) hash of user ids (splitmix64 finalizer). 
class _FlatEventStore:
    """Per-user flat event index over the preprocessed sessions table.

    The constructor sorts a sessions DataFrame by ``(uid, session_id)`` and
    explodes its per-session list columns into flat numpy columns, then builds
    dense per-uid ``user_start`` / ``user_end`` lookup arrays (``-1`` for uid
    values that have no events). ``save_mmap`` persists every column listed in
    ``_MMAP_COLS`` as ``<name>.npy`` and ``load_mmap`` re-opens them read-only,
    so both construction paths expose the same attributes.
    """

    # On-disk column layout.
    _MMAP_COLS = (
        "flat_uid", "flat_item_ids", "flat_timestamps",
        "flat_event_types", "flat_played_ratio",
        "flat_is_listen_plus", "flat_is_like", "flat_is_skip",
        "flat_is_organic",
        "user_start", "user_end", "unique_uids",
    )

    def __init__(self, sessions_df: pl.DataFrame) -> None:
        """Build all flat columns in memory from ``sessions_df``.

        Args:
            sessions_df: one row per session with scalar ``uid`` /
                ``session_id`` and list columns ``item_ids``, ``timestamps``,
                ``event_types``, ``is_organic``, ``played_ratio_pct``.
        """
        logger.info("Building flat event store from sessions...")
        sorted_sessions = sessions_df.sort(["uid", "session_id"])
        exploded = sorted_sessions.explode(
            ["item_ids", "timestamps", "event_types", "is_organic", "played_ratio_pct"]
        )

        self.flat_uid: np.ndarray = exploded["uid"].to_numpy().astype(np.int64)
        self.flat_item_ids: np.ndarray = exploded["item_ids"].to_numpy().astype(np.int64)
        self.flat_timestamps: np.ndarray = exploded["timestamps"].to_numpy().astype(np.int64)
        self.flat_event_types: np.ndarray = exploded["event_types"].to_numpy().astype(np.int64)
        self.flat_played_ratio: np.ndarray = exploded["played_ratio_pct"].to_numpy().astype(np.float32)
        self.flat_is_organic: np.ndarray = exploded["is_organic"].to_numpy().astype(np.int8)
        # Missing played-ratio values count as 0 (i.e. below the listen+ threshold).
        np.nan_to_num(self.flat_played_ratio, copy=False, nan=0.0)

        # Pool flags: a LISTEN is either listen+ or skip depending on the
        # played ratio; LIKE is its own event type.
        is_listen = self.flat_event_types == LISTEN_TYPE
        self.flat_is_listen_plus: np.ndarray = is_listen & (
            self.flat_played_ratio >= LISTEN_PLUS_THRESHOLD
        )
        self.flat_is_like: np.ndarray = self.flat_event_types == LIKE_TYPE
        self.flat_is_skip: np.ndarray = is_listen & (
            self.flat_played_ratio < LISTEN_PLUS_THRESHOLD
        )

        # Rows are uid-sorted, so each user's events form one contiguous run;
        # run boundaries are where consecutive uids differ.
        n_events = len(self.flat_uid)
        if n_events:
            uid_changes = np.where(np.diff(self.flat_uid) != 0)[0] + 1
            starts = np.concatenate([[0], uid_changes])
            ends = np.concatenate([uid_changes, [n_events]])
        else:
            # Empty input: no runs at all (previously crashed indexing row 0).
            starts = np.empty(0, dtype=np.int64)
            ends = np.empty(0, dtype=np.int64)
        uid_vals = self.flat_uid[starts]
        max_uid = int(uid_vals.max()) + 1 if uid_vals.size else 0
        # Dense uid -> [start, end) lookup; -1 marks uids with no events.
        self.user_start: np.ndarray = np.full(max_uid, -1, dtype=np.int64)
        self.user_end: np.ndarray = np.full(max_uid, -1, dtype=np.int64)
        self.user_start[uid_vals] = starts
        self.user_end[uid_vals] = ends
        self.unique_uids: np.ndarray = uid_vals
        self.num_users: int = len(uid_vals)
        self.total_events: int = len(self.flat_item_ids)
        logger.info(
            f"FlatEventStore: {self.total_events:,} events, {self.num_users:,} users"
        )

    @classmethod
    def load_mmap(cls, cache_dir: Union[str, Path]) -> "_FlatEventStore":
        """Open a prebuilt cache directory without running ``__init__``.

        Every ``_MMAP_COLS`` column is opened through ``_load_npy_readonly``
        (read-only file mapping); ``num_users`` / ``total_events`` come from
        ``store_meta.json``.
        """
        import json as _json
        cache_dir = Path(cache_dir)
        with open(cache_dir / "store_meta.json") as f:
            meta = _json.load(f)
        # Bypass __init__: it would require (and explode) the sessions frame.
        store = object.__new__(cls)
        for name in cls._MMAP_COLS:
            setattr(store, name, _load_npy_readonly(cache_dir / f"{name}.npy"))
        store.num_users = int(meta["num_users"])
        store.total_events = int(meta["total_events"])
        logger.info(
            f"FlatEventStore mmap from {cache_dir}: "
            f"{store.total_events:,} events, {store.num_users:,} users"
        )
        return store

    def save_mmap(self, cache_dir: Union[str, Path]) -> None:
        """Persist every ``_MMAP_COLS`` column as ``.npy``, then write a sentinel.

        ``_READY`` is removed before any column is rewritten and recreated only
        after ``store_meta.json`` is on disk, so a reader gating on the
        sentinel never sees it next to partially written files.
        """
        import json as _json
        cache_dir = Path(cache_dir)
        cache_dir.mkdir(parents=True, exist_ok=True)
        ready = cache_dir / "_READY"
        try:
            # A sentinel left by an earlier save must not vouch for this one.
            ready.unlink()
        except FileNotFoundError:
            pass
        for name in self._MMAP_COLS:
            np.save(cache_dir / f"{name}.npy", getattr(self, name))
        with open(cache_dir / "store_meta.json", "w") as f:
            _json.dump(
                {"num_users": self.num_users, "total_events": self.total_events}, f
            )
        # Sentinel — readers check this before mmap'ing to avoid partial files.
        ready.touch()
        logger.info(f"FlatEventStore saved to {cache_dir}")
Under "interleaved" it is the per-pool cap + (total ≤ 3 * history_length // 3); under "last_n" it is the literal + total number of pooled events kept. + scan_window: how far back to scan when filling each pool. + history_strategy: "interleaved" (equal per-pool L//3 cap, re-interleaved) + or "last_n" (last history_length pooled events, no per-pool split). + cross_specs: list of (name, keys, num_embeddings, salt). Source of truth + in `dlrm_v3/configs.py:YAMBDA_5B_CROSS_SPECS`. + is_inference: passed through to base class. + """ + + def __init__( + self, + hstu_config: DlrmHSTUConfig, + processed_dir: str, + metadata_dir: str, + history_length: int = 2048, + scan_window: int = 20000, + min_history: Optional[int] = None, + history_strategy: str = "interleaved", + cross_specs: Optional[Sequence[Tuple[str, Sequence[str], int, int]]] = None, + cache_dir: Optional[str] = None, + is_inference: bool = False, + streaming_window_seconds: int = 86400, + streaming_sort_within_window: bool = False, + streaming_shuffle_fraction: float = 0.0, + streaming_shuffle_seed: int = 0, + train_split_percentage: float = 1.0, + split_salt: int = 0, + *args, + **kwargs, + ) -> None: + super().__init__(hstu_config=hstu_config, is_inference=is_inference) + self._processed_dir: str = processed_dir + self._metadata_dir: str = metadata_dir + self._history_length: int = history_length + self._scan_window: int = scan_window + # UIH construction strategy: + # "interleaved" (default) — equal history_length//3 cap per behavior + # pool (listen+/like/skip), re-interleaved chronologically. Likes are + # ~1.9% of the corpus so the like pool over-fills relative to its + # natural frequency while the sequence under-fills overall. + # "last_n" — take the last history_length events of ANY pool type with + # no per-pool split. Fills the sequence to ~history_length (higher + # effective length) and lets the like share fall to its natural rate. + # Both exclude dislike/unlike/undislike (no action-weight bit). 
+ if history_strategy not in ("interleaved", "last_n"): + raise ValueError( + f"history_strategy must be 'interleaved' or 'last_n', got " + f"{history_strategy!r}" + ) + self._history_strategy: str = history_strategy + # Minimum prior-event count for a LISTEN event to qualify as an anchor. + # Decoupled from history_length (which is only the gather/truncation cap): + # jagged attention handles short UIH, so we no longer require a full + # history_length of context to include a sample. Default None preserves + # the legacy "need a full history_length of prior events" behavior (which + # dropped ~60% of users); set small (e.g. 1) to include ~all users. + self._min_history: int = ( + history_length if min_history is None else int(min_history) + ) + # Streaming/temporal-order state. Everything here is LAZY: nothing is + # built or read until the first set_ts()/num_windows() call (only the + # streaming-train-eval loop does that), so the default train-eval path + # is byte-for-byte unaffected. + self._streaming_window_seconds: int = streaming_window_seconds + self._streaming_sort_within_window: bool = streaming_sort_within_window + # In-window shuffle dial in [0, 1] to break user-major batching (default + # 0.0 = off, user-major order preserved for page-local mmap scans). Maps to + # a within-segment shuffle with K = round(fraction * per-window train-anchor + # count): 1.0 = full per-element shuffle (max user diversity per batch), + # intermediate = interpolation. Computed from the global anchor count BEFORE + # round-robin striding, so a given fraction yields the same diversity + # regardless of world_size / #nodes / batch_size (config-invariant). The + # permutation is a pure function of (seed, ts) so the per-rank round-robin + # slice + mid-window resume skip stay deterministic across restarts. + self._streaming_shuffle_fraction: float = streaming_shuffle_fraction + self._streaming_shuffle_seed: int = streaming_shuffle_seed + # User-level train:eval split. 
`train_split_percentage >= 1.0` means no + # holdout (legacy behavior: every anchor is trainable). Otherwise the + # top `1 - train_split_percentage` fraction of users (by a deterministic + # hash of `uid + split_salt`) are held out: NEVER trained, used only to + # build the fixed eval set. The split is a pure function of (uid, salt), + # so it is identical across crash/resume (no leakage on failover). + self._train_split_percentage: float = train_split_percentage + self._split_salt: int = split_salt + # Cache only the (small) fixed eval-holdout index list; the per-window + # train filter is computed on the fly to avoid a full-length mask. + self._eval_holdout_cache: Optional[np.ndarray] = None + self._eval_holdout_cache_key: Optional[Tuple[int, int]] = None + self._active: Optional[np.ndarray] = None + self.is_eval: bool = False + self._anchor_ts: Optional[np.ndarray] = None + self._t_min: Optional[int] = None + self._t_max: Optional[int] = None + self._cache_dir: Optional[str] = cache_dir + self._cross_specs: List[Tuple[str, Tuple[str, ...], int, int]] = [ + (name, tuple(keys), n, s) for (name, keys, n, s) in (cross_specs or []) + ] + assert hstu_config.action_weights is not None + self._action_weights: List[int] = hstu_config.action_weights + + self._load_metadata(metadata_dir) + # Build-once-mmap-many: first rank to arrive acquires the build lock + # and explodes the parquet (one ~190 GB in-memory pass), then writes + # flat .npy columns + _READY sentinel. All ranks (including the + # builder, after dropping its in-memory copy) reload via MAP_SHARED+ + # PROT_READ — kernel shares physical pages across ranks so the steady- + # state per-rank RSS for the dataset is ~0. 
+ if cache_dir is None: + cache_dir = os.path.join(processed_dir, f"hstu_cache_L{history_length}") + self._cache_dir = cache_dir + self._ensure_cache_built(cache_dir, processed_dir, history_length) + self.store: _FlatEventStore = _FlatEventStore.load_mmap(cache_dir) + # Anchor positions depend on min_history (the eligibility floor), not + # just history_length (the gather cap), so they live in a + # min_history-versioned file that shares the flat store. Built + # independently of the _READY sentinel so changing the floor rebuilds + # only this (cheap) array, not the whole 150 GB cache. + self._positions_name: str = self._positions_filename( + history_length, self._min_history + ) + self._ensure_positions_built( + cache_dir, self._positions_name, self._min_history + ) + self._positions: np.ndarray = _load_npy_readonly( + os.path.join(cache_dir, self._positions_name) + ) + logger.info( + f"Yambda dataset ready: {self.store.total_events:,} events, " + f"{len(self._positions):,} training positions" + ) + + @staticmethod + def _positions_filename(history_length: int, min_history: int) -> str: + """Anchor-positions filename. Uses the legacy name when the floor equals + the gather cap (the historical "full history required" behavior) so + existing caches are reused as-is; otherwise a min_history-tagged name.""" + if min_history == history_length: + return f"positions_L{history_length}.npy" + return f"positions_L{history_length}_m{min_history}.npy" + + @staticmethod + def _ensure_positions_built( + cache_dir: str, positions_name: str, min_history: int + ) -> None: + """Build the anchor-positions array for ``min_history`` if absent. + + Anchors are LISTEN events whose user-local offset is >= ``min_history`` + (i.e. the user already has that many prior events). This is decoupled + from the _READY-gated flat-store build so a new floor only rebuilds this + (cheap, ~one int64 scan) array rather than the whole 150 GB cache. 
+ Multi-rank safe via an exclusive lock + atomic rename; all ranks then + mmap the result read-only. + """ + import fcntl + + positions_path = os.path.join(cache_dir, positions_name) + if os.path.exists(positions_path): + return + lock_path = os.path.join(cache_dir, "_positions_lock") + with open(lock_path, "w") as lf: + logger.info(f"Acquiring positions build lock for {positions_path}...") + fcntl.flock(lf, fcntl.LOCK_EX) + try: + if os.path.exists(positions_path): + return + flat_uid = _load_npy_readonly( + os.path.join(cache_dir, "flat_uid.npy") + ) + event_types = _load_npy_readonly( + os.path.join(cache_dir, "flat_event_types.npy") + ) + user_start = _load_npy_readonly( + os.path.join(cache_dir, "user_start.npy") + ) + idx = np.arange(len(flat_uid), dtype=np.int64) + keep = (idx - user_start[flat_uid] >= min_history) & ( + event_types == LISTEN_TYPE + ) + positions = np.where(keep)[0].astype(np.int64) + tmp = positions_path + ".tmp.npy" + np.save(tmp, positions) + os.replace(tmp, positions_path) + logger.info( + f"Wrote {positions_name}: {len(positions):,} anchors " + f"(min_history={min_history})" + ) + finally: + fcntl.flock(lf, fcntl.LOCK_UN) + + @staticmethod + def _ensure_cache_built( + cache_dir: str, processed_dir: str, history_length: int + ) -> None: + """File-locked one-shot build with column-at-a-time explode. + + A naive `pl.read_parquet(...).explode([5 list cols])` peaks at ~1.6 TB + on the 5b dataset (polars holds input list-columns + dense output + + parallel-worker scratch all together). Instead we: + 1) Read parquet + sort once (sorted list-column DF, ~80 GB). + 2) For each output column: select that single list, explode, write + .npy, drop. Bounds incremental peak to one column (~38 GB). + 3) Derive bool flags and indices from the on-disk mmaps. + + Peak RAM: ~150 GB. Steady state across all ranks afterward: ~0 + incremental thanks to MAP_SHARED in load_mmap. 
+ """ + import fcntl + import gc + import json as _json + + ready = os.path.join(cache_dir, "_READY") + if os.path.exists(ready): + return + os.makedirs(cache_dir, exist_ok=True) + lock_path = os.path.join(cache_dir, "_lock") + with open(lock_path, "w") as lf: + logger.info(f"Acquiring build lock for {cache_dir}...") + fcntl.flock(lf, fcntl.LOCK_EX) + try: + if os.path.exists(ready): + return + parquet_path = os.path.join(processed_dir, "train_sessions.parquet") + logger.info( + f"Building flat-event cache from {parquet_path} " + f"(column-at-a-time, ~150 GB peak RAM)" + ) + + # Step 1: read + sort. List columns stay nested at this stage. + sessions = pl.read_parquet(parquet_path).sort(["uid", "session_id"]) + logger.info(f"Sessions sorted: {sessions.shape}") + + # Per-session lengths + uids — used to derive flat_uid via + # np.repeat (cheap) without exploding the whole DF at once. + lengths = ( + sessions.select(pl.col("item_ids").list.len()) + .to_numpy() + .reshape(-1) + .astype(np.int64) + ) + session_uids = sessions["uid"].to_numpy().astype(np.int64) + N = int(lengths.sum()) + num_users = int(np.unique(session_uids).shape[0]) + logger.info(f"Total events: {N:,}, users: {num_users:,}") + + # Step 2: column-at-a-time explode → save → drop. + # uid is per-session scalar; expand via np.repeat. + flat_uid = np.repeat(session_uids, lengths).astype(np.int64) + np.save(os.path.join(cache_dir, "flat_uid.npy"), flat_uid) + del flat_uid, session_uids, lengths + gc.collect() + logger.info("Wrote flat_uid.npy") + + # Derived columns flat_is_listen_plus/like/skip depend on + # event_types + played_ratio. Save those two first, then + # derive the bools from the mmaps. 
+ _list_cols = [ + ("item_ids", "flat_item_ids", np.int64), + ("timestamps", "flat_timestamps", np.int64), + ("event_types", "flat_event_types", np.int64), + ("is_organic", "flat_is_organic", np.int8), + ("played_ratio_pct", "flat_played_ratio", np.float32), + ] + for src_col, dst_name, dtype in _list_cols: + exploded = sessions.select(pl.col(src_col).explode()) + arr = exploded[src_col].to_numpy().astype(dtype, copy=False) + if dtype == np.float32: + np.nan_to_num(arr, copy=False, nan=0.0) + np.save(os.path.join(cache_dir, f"{dst_name}.npy"), arr) + del exploded, arr + gc.collect() + logger.info(f"Wrote {dst_name}.npy") + + # Drop the sessions DF now that all source columns are on disk. + del sessions + gc.collect() + + # Step 3: derive bool flags from the just-written mmaps. + event_types = _load_npy_readonly( + os.path.join(cache_dir, "flat_event_types.npy") + ) + played_ratio = _load_npy_readonly( + os.path.join(cache_dir, "flat_played_ratio.npy") + ) + is_listen = event_types == LISTEN_TYPE + np.save( + os.path.join(cache_dir, "flat_is_listen_plus.npy"), + is_listen & (played_ratio >= LISTEN_PLUS_THRESHOLD), + ) + np.save( + os.path.join(cache_dir, "flat_is_like.npy"), + event_types == LIKE_TYPE, + ) + np.save( + os.path.join(cache_dir, "flat_is_skip.npy"), + is_listen & (played_ratio < LISTEN_PLUS_THRESHOLD), + ) + del is_listen, played_ratio + gc.collect() + logger.info("Wrote flat_is_listen_plus/like/skip.npy") + + # user_start / user_end / unique_uids from flat_uid mmap. 
+ flat_uid = _load_npy_readonly( + os.path.join(cache_dir, "flat_uid.npy") + ) + uid_changes = np.where(np.diff(flat_uid) != 0)[0] + 1 + starts = np.concatenate([[0], uid_changes]) + ends = np.concatenate([uid_changes, [len(flat_uid)]]) + uid_vals = flat_uid[starts] + max_uid = int(uid_vals.max()) + 1 + user_start = np.full(max_uid, -1, dtype=np.int64) + user_end = np.full(max_uid, -1, dtype=np.int64) + user_start[uid_vals] = starts + user_end[uid_vals] = ends + np.save(os.path.join(cache_dir, "user_start.npy"), user_start) + np.save(os.path.join(cache_dir, "user_end.npy"), user_end) + np.save(os.path.join(cache_dir, "unique_uids.npy"), uid_vals) + logger.info("Wrote user_start/end/unique_uids.npy") + + # Positions: LISTEN events with ≥history_length prior history. + # Done now (before dropping user_start) so all sibling ranks + # just mmap the result instead of each running a 75 GB build. + user_start_per_event = user_start[flat_uid] + idx = np.arange(len(flat_uid), dtype=np.int64) + keep = (idx - user_start_per_event >= history_length) & ( + event_types == LISTEN_TYPE + ) + positions = np.where(keep)[0].astype(np.int64) + np.save( + os.path.join(cache_dir, f"positions_L{history_length}.npy"), + positions, + ) + logger.info( + f"Wrote positions_L{history_length}.npy: {len(positions):,}" + ) + del ( + flat_uid, event_types, user_start, user_end, uid_vals, + starts, ends, uid_changes, idx, user_start_per_event, + keep, positions, + ) + gc.collect() + + # Meta + sentinel — written last; readers gate on _READY. 
def _load_metadata(self, metadata_dir: str) -> None:
    """Load the item -> artist / item -> album lookup tables.

    Reads ``artist_item_mapping.parquet`` and ``album_item_mapping.parquet``
    from ``metadata_dir`` and, when present, ``item_popularity.npy`` (only its
    length is used). Sets ``item_to_artist``, ``item_to_album`` (int64 arrays
    indexed by item id, 0 where no mapping exists) and ``num_items``.

    Args:
        metadata_dir: directory holding the mapping parquets.
    """
    # NOTE(review): the class docstring lists item_popularity.npy under
    # processed_dir, but this looks in metadata_dir -- confirm which is intended.
    item_pop_path = os.path.join(metadata_dir, "item_popularity.npy")
    if os.path.exists(item_pop_path):
        n_popularity = len(np.load(item_pop_path))
    else:
        # Fallback: derive vocab size from the artist+album maps alone.
        n_popularity = 0

    artist_map = pl.read_parquet(os.path.join(metadata_dir, "artist_item_mapping.parquet"))
    album_map = pl.read_parquet(os.path.join(metadata_dir, "album_item_mapping.parquet"))

    def _vocab_from(mapping) -> int:
        # max() is None for an empty (or all-null) column; int(None) used to raise.
        top = mapping["item_id"].max()
        return 0 if top is None else int(top) + 1

    n_items = int(max(_vocab_from(artist_map), _vocab_from(album_map), n_popularity))

    # Keep only ids that index forward into the table. The lower bound matters:
    # a negative id would otherwise write through numpy's negative indexing.
    in_range = (pl.col("item_id") >= 0) & (pl.col("item_id") < n_items)

    self.item_to_artist: np.ndarray = np.zeros(n_items, dtype=np.int64)
    valid = artist_map.filter(in_range)
    self.item_to_artist[valid["item_id"].to_numpy()] = valid["artist_id"].to_numpy()
    self.item_to_album: np.ndarray = np.zeros(n_items, dtype=np.int64)
    valid = album_map.filter(in_range)
    self.item_to_album[valid["item_id"].to_numpy()] = valid["album_id"].to_numpy()
    self.num_items: int = n_items
def num_windows(self) -> int:
    """Count the fixed-duration windows needed to cover ``[t_min, t_max]``.

    Triggers the lazy streaming-index build, then rounds the inclusive
    timestamp span up to a whole number of ``streaming_window_seconds``.
    """
    self._ensure_streaming_index()
    first_ts, last_ts = self._t_min, self._t_max
    assert first_ts is not None and last_ts is not None
    window = self._streaming_window_seconds
    covered = last_ts - first_ts + 1  # both endpoints are inside the span
    whole_windows, _ = divmod(covered + window - 1, window)
    return int(whole_windows)
Used by + the per-window path (via ``set_ts``) and the persistent path (shipped to + workers through the sampler). ``sort_by_time`` defaults to + ``streaming_sort_within_window``. + + Note: an O(log N) variant using a cached argsort of the timestamps was + evaluated but rejected — it doubles resident mmap (sorted-ts + order + permutation, ~52 GB) and that extra residency evicts the event-array + page cache, stalling dataloader workers (NCCL watchdog timeouts). The + O(N) mask here keeps only one ~26 GB array resident and is robust. + """ + self._ensure_streaming_index() + assert self._anchor_ts is not None and self._t_min is not None + w = self._streaming_window_seconds + lo = self._t_min + ts * w + hi = lo + w + idx = np.where((self._anchor_ts >= lo) & (self._anchor_ts < hi))[0] + do_sort = ( + self._streaming_sort_within_window if sort_by_time is None else sort_by_time + ) + if do_sort and idx.size > 0: + idx = idx[np.argsort(self._anchor_ts[idx], kind="stable")] + logger.warning(f"window_indices({ts}): [{lo}, {hi}) -> {idx.size:,} anchors") + return idx.astype(np.int64) + + def _eval_anchor_mask(self, anchor_idx: np.ndarray) -> np.ndarray: + """Bool mask (aligned to ``anchor_idx``) marking held-out eval users. + + Computed on the fly for just this slice of anchors (a window is ~tens of + millions, not the full ~3B ``_positions``), so we never materialize a + full-length mask. ``uid``-hash >= ``train_split_percentage`` -> eval. + """ + uids = self.store.flat_uid[self._positions[anchor_idx]] + return _uid_unit_hash(uids, self._split_salt) >= self._train_split_percentage + + def _shuffle_window(self, idx: np.ndarray, ts: int) -> np.ndarray: + """Optionally break user-major ordering within a train window. + + ``streaming_shuffle_fraction`` (0..1) is the single diversity dial. 
It + maps to a within-segment shuffle with ``K = round(fraction * N)`` where + ``N`` is this window's train-anchor count: + + - 0.0 -> off: return ``idx`` unchanged (user-major, page-local scans). + - 1.0 -> full per-element shuffle (max user diversity per batch). + - else -> permute WITHIN each contiguous size-K segment (segment order + preserved). A per-rank batch then draws across a bounded user-major + region, so diversity scales with the fraction while the concurrently + touched mmap working set stays within ~one K-segment (page locality). + + Because ``N`` is a property of the dataset/window (not the compute layout) + and the permutation is applied BEFORE the per-rank round-robin striding, a + given fraction yields the same diversity across world_size / #nodes / + batch_size (config-invariant). + + The permutation is a pure function of ``(seed, ts)`` via + ``np.random.default_rng(seed + ts)``, so every (re)run of this window + yields the IDENTICAL order. This keeps the per-rank round-robin slice and + the mid-window resume ``skip_samples`` offset consistent across restarts, + exactly like the unshuffled path. + """ + frac = self._streaming_shuffle_fraction + if idx.size <= 1 or not frac or frac <= 0.0: + return idx + rng = np.random.default_rng(self._streaming_shuffle_seed + ts) + if frac >= 1.0: + return idx[rng.permutation(idx.size)] + # Within-segment shuffle (K = round(fraction * N)): a single vectorized + # lexsort over per-element random keys, stable within each size-K segment + # so elements never cross a segment boundary (bounds the working set). O(N + # log K), run once per window in the background prep thread. + n = idx.size + k = max(1, int(round(frac * n))) + seg = np.arange(n, dtype=np.int64) // k + keys = rng.random(n) + order = np.lexsort((keys, seg)) + return idx[order] + + def train_window_indices(self, ts: int) -> np.ndarray: + """Global anchor indices for TRAIN in window ``ts``: ``window_indices`` + with held-out eval users removed. 
def total_train_anchors(self, start_ts: int, num_ts: int) -> int:
    """Count TRAIN anchors whose timestamp falls in windows
    ``[start_ts, start_ts + num_ts)``.

    One boolean mask over ``_anchor_ts`` selects the time range
    ``[t_min + start_ts*W, t_min + (start_ts + num_ts)*W)``. When
    ``train_split_percentage`` is below 1.0, anchors flagged by
    ``_eval_anchor_mask`` are subtracted so held-out users are not counted.

    Args:
        start_ts: first window number.
        num_ts: number of windows; ``<= 0`` yields 0.

    Returns:
        The anchor count (also logged at WARNING level).
    """
    self._ensure_streaming_index()
    assert self._anchor_ts is not None and self._t_min is not None
    if num_ts <= 0:
        return 0

    window = self._streaming_window_seconds
    range_start = self._t_min + start_ts * window
    range_stop = self._t_min + (start_ts + num_ts) * window
    anchor_ts = self._anchor_ts
    inside = (anchor_ts >= range_start) & (anchor_ts < range_stop)

    holdout_enabled = not (self._train_split_percentage >= 1.0)
    if holdout_enabled:
        # Evaluate the user-holdout mask only on the anchors in range.
        candidates = np.where(inside)[0]
        is_eval = self._eval_anchor_mask(candidates)
        total = int(np.count_nonzero(~is_eval))
    else:
        total = int(np.count_nonzero(inside))

    logger.warning(
        f"total_train_anchors(start_ts={start_ts}, num_ts={num_ts}): "
        f"{total:,} train anchors (tsp={self._train_split_percentage})"
    )
    return total
+ """ + self._active = ( + self.train_window_indices(ts) if train_only else self.window_indices(ts) + ) + + def set_active_indices(self, indices: np.ndarray) -> None: + """Restrict the active sample set to an explicit array of global anchor + indices (into ``_positions``). Used by the non-persistent eval path to + iterate the fixed user-holdout set (which spans a window range, not a + single ``ts``).""" + self._active = np.asarray(indices, dtype=np.int64) + + def load_query_samples(self, sample_list) -> None: + max_num_candidates = ( + self._max_num_candidates_inference + if self._is_inference + else self._max_num_candidates + ) + self.items_in_memory = {} + for idx in sample_list: + flat_pos = self.iloc(idx) + self.items_in_memory[idx] = self._build_sample(flat_pos, max_num_candidates) + self.last_loaded = time.time() + + def get_sample(self, idx: int) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]: + if idx in self.items_in_memory: + return self.items_in_memory[idx] + max_num_candidates = ( + self._max_num_candidates_inference + if self._is_inference + else self._max_num_candidates + ) + flat_pos = self.iloc(idx) + return self._build_sample(flat_pos, max_num_candidates) + + @staticmethod + def _empty_history() -> Tuple[ + np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray + ]: + empty = np.empty(0, dtype=np.int64) + return empty, empty, empty, empty, empty + + def _read_scan_window( + self, flat_pos: int, user_start: int + ) -> Optional[ + Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray] + ]: + """Read the causal scan window [scan_start, flat_pos) for an anchor. 
+ Returns (item_ids, timestamps, is_lp, is_like, is_skip) views, or None + if the window is empty.""" + scan_start = max(int(user_start), int(flat_pos) - self._scan_window) + scan_end = int(flat_pos) + if scan_end <= scan_start: + return None + return ( + self.store.flat_item_ids[scan_start:scan_end], + self.store.flat_timestamps[scan_start:scan_end], + self.store.flat_is_listen_plus[scan_start:scan_end], + self.store.flat_is_like[scan_start:scan_end], + self.store.flat_is_skip[scan_start:scan_end], + ) + + def _materialize_history( + self, + keep_local: np.ndarray, + item_ids: np.ndarray, + timestamps: np.ndarray, + is_lp: np.ndarray, + is_like: np.ndarray, + is_skip: np.ndarray, + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """Gather item/artist/album/ts + pool-bitmask `weight` for the kept + (chronologically-ordered) local indices.""" + items = item_ids[keep_local] + ts = timestamps[keep_local] + artists = self.item_to_artist[np.clip(items, 0, self.item_to_artist.shape[0] - 1)] + albums = self.item_to_album[np.clip(items, 0, self.item_to_album.shape[0] - 1)] + # Pool bitmask per kept event (LP/LIKE/SKIP are mutually exclusive in + # the source data, but OR is safe and forward-compatible). 
+ weight = np.zeros(keep_local.shape[0], dtype=np.int64) + weight[is_lp[keep_local]] |= LP_BIT + weight[is_like[keep_local]] |= LIKE_BIT + weight[is_skip[keep_local]] |= SKIP_BIT + return items, artists, albums, ts, weight + + def _gather_history( + self, flat_pos: int, user_start: int + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """Dispatch UIH construction to the configured strategy.""" + if self._history_strategy == "last_n": + return self._gather_last_n_history(flat_pos, user_start) + return self._gather_interleaved_history(flat_pos, user_start) + + def _gather_interleaved_history( + self, flat_pos: int, user_start: int + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """Build a single chronologically-ordered history sequence from the 3 + behavior pools. Each event's `action_weight` carries the pool bitmask + (LP_BIT/LIKE_BIT/SKIP_BIT). Per-pool cap = history_length // 3.""" + L = self._history_length + per_pool = max(1, L // 3) + scan = self._read_scan_window(flat_pos, user_start) + if scan is None: + return self._empty_history() + item_ids, timestamps, is_lp, is_like, is_skip = scan + + # Local indices into the scan window — preserves chronological order + # within each pool and lets us interleave by re-sorting. 
+ idx_all = np.arange(item_ids.shape[0], dtype=np.int64) + lp_idx = idx_all[is_lp][-per_pool:] + like_idx = idx_all[is_like][-per_pool:] + skip_idx = idx_all[is_skip][-per_pool:] + + keep_local = np.concatenate([lp_idx, like_idx, skip_idx]) + if keep_local.size == 0: + return self._empty_history() + + order = np.argsort(keep_local, kind="stable") + keep_local = keep_local[order] + + return self._materialize_history( + keep_local, item_ids, timestamps, is_lp, is_like, is_skip + ) + + def _gather_last_n_history( + self, flat_pos: int, user_start: int + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """Build the UIH from the last `history_length` events of ANY pool type + (listen+/like/skip) with no per-pool split. Vs the interleaved strategy + this fills the sequence to ~history_length (higher effective length) and + lets the like share fall to its natural corpus rate (~1.9%). Events + outside the 3 pools (dislike/unlike/undislike) are excluded as before.""" + L = self._history_length + scan = self._read_scan_window(flat_pos, user_start) + if scan is None: + return self._empty_history() + item_ids, timestamps, is_lp, is_like, is_skip = scan + + member = is_lp | is_like | is_skip + # Last L pooled events, in chronological order (already position-sorted + # within the scan window, so no re-sort is needed). 
+ keep_local = np.arange(item_ids.shape[0], dtype=np.int64)[member][-L:] + if keep_local.size == 0: + return self._empty_history() + + return self._materialize_history( + keep_local, item_ids, timestamps, is_lp, is_like, is_skip + ) + + def _build_sample( + self, flat_pos: int, max_num_candidates: int + ) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]: + uid = int(self.store.flat_uid[flat_pos]) + user_start = int(self.store.user_start[uid]) + + items, artists, albums, ts, weight = self._gather_history( + flat_pos, user_start + ) + + target_item = int(self.store.flat_item_ids[flat_pos]) + target_artist = int( + self.item_to_artist[target_item] + if target_item < self.item_to_artist.shape[0] + else 0 + ) + target_album = int( + self.item_to_album[target_item] + if target_item < self.item_to_album.shape[0] + else 0 + ) + target_ts = int(self.store.flat_timestamps[flat_pos]) + + played_ratio = float(self.store.flat_played_ratio[flat_pos]) + is_lp = ( + int(self.store.flat_event_types[flat_pos]) == LISTEN_TYPE + and played_ratio >= LISTEN_PLUS_THRESHOLD + ) + # Label encoded into the candidate's action_weight via the LP bit, so + # _get_supervision_labels_and_weights sees the right supervision. 
+ candidate_action_weight = LP_BIT if is_lp else 0 + + cross_id_anchor: Dict[str, int] = { + "uid": uid, + "item_id": target_item, + "artist_id": target_artist, + "album_id": target_album, + "hour_of_day": int((target_ts // 3600) % 24), + "is_organic": int(self.store.flat_is_organic[flat_pos]), + } + cross_ids: Dict[str, int] = { + name: xxhash_cross(cross_id_anchor, list(keys), n, salt) + for (name, keys, n, salt) in self._cross_specs + } + + # ---- Truncate UIH to fit max_seq_len budget ---- + uih_seq_len_budget = ( + self._max_seq_len + - max_num_candidates + - len(self._contextual_feature_to_max_length or {}) + ) + if items.shape[0] > uih_seq_len_budget: + items = items[-uih_seq_len_budget:] + artists = artists[-uih_seq_len_budget:] + albums = albums[-uih_seq_len_budget:] + ts = ts[-uih_seq_len_budget:] + weight = weight[-uih_seq_len_budget:] + uih_seq_len = int(items.shape[0]) + dummy_watch_time = np.zeros(uih_seq_len, dtype=np.int64) + + # ---- Build UIH KJT ---- + # Contextual features (length-1 each) iterated in the same order as + # `_contextual_feature_to_max_length` (matches movielens reference). 
+ uih_kjt_values: List[int] = [] + uih_kjt_lengths: List[int] = [] + for name, length in (self._contextual_feature_to_max_length or {}).items(): + assert length == 1, f"yambda contextuals are length-1, got {name}={length}" + if name == "uid": + uih_kjt_values.append(uid) + else: + uih_kjt_values.append(int(cross_ids[name])) + uih_kjt_lengths.append(1) + + # Sequential features — order must match the trailing entries of + # hstu_uih_feature_names in configs.py: + # item_id, artist_id, album_id, action_weight, action_timestamp, dummy_watch_time + uih_kjt_values.extend(items.tolist()) + uih_kjt_values.extend(artists.tolist()) + uih_kjt_values.extend(albums.tolist()) + uih_kjt_values.extend(weight.tolist()) + uih_kjt_values.extend(ts.tolist()) + uih_kjt_values.extend(dummy_watch_time.tolist()) + n_sequential = len(self._uih_keys) - len(self._contextual_feature_to_max_length or {}) + uih_kjt_lengths.extend([uih_seq_len] * n_sequential) + + dummy_query_time = int(ts[-1]) if uih_seq_len > 0 else target_ts + uih_kjt_values.append(dummy_query_time) + uih_kjt_lengths.append(1) + + uih_features_kjt = KeyedJaggedTensor( + keys=self._uih_keys + ["dummy_query_time"], + lengths=torch.tensor(uih_kjt_lengths, dtype=torch.long), + values=torch.tensor(uih_kjt_values, dtype=torch.long), + ) + + # ---- Build candidates KJT ---- + # Order must match configs.py:hstu_candidate_feature_names exactly: + # item_candidate_id, item_candidate_artist_id, item_candidate_album_id, + # item_query_time, item_action_weight, item_dummy_watchtime + candidates_kjt_lengths = max_num_candidates * torch.ones( + len(self._candidates_keys), dtype=torch.long + ) + candidates_kjt_values: List[int] = ( + [target_item] * max_num_candidates + + [target_artist] * max_num_candidates + + [target_album] * max_num_candidates + + [dummy_query_time] * max_num_candidates + + [candidate_action_weight] * max_num_candidates + + [1] * max_num_candidates # item_dummy_watchtime + ) + candidates_features_kjt = 
KeyedJaggedTensor( + keys=self._candidates_keys, + lengths=candidates_kjt_lengths, + values=torch.tensor(candidates_kjt_values, dtype=torch.long), + ) + return uih_features_kjt, candidates_features_kjt diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/README.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/README.md new file mode 100644 index 000000000..ef1c9686d --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/README.md @@ -0,0 +1,88 @@ +# MLPerf Inference reference implementation for DLRMv3 + +## Install dependencies + +The reference implementation has been tested on a single host, with x86_64 CPUs +and 8 NVIDIA H100/B200 GPUs. Dependencies can be installed below, + +``` +cd generative_recommenders/ +pip install -e . +``` + +## Build loadgen + +``` +cd generative_recommenders/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/ +CFLAGS="-std=c++14 -O3" python -m pip install . +``` + +## Dataset download + +DLRMv3 uses a synthetic dataset specifically designed to match the model and +system characteristics of large-scale sequential recommendation (large item set +and long average sequence length for each request). To generate the dataset used +for both training and inference, run + +``` +cd generative_recommenders/dlrm_v3/ +python streaming_synthetic_data.py +``` + +The generated dataset has 2TB size, and contains 5 million users interacting +with a billion items over 100 timestamps. + +Only 1% of the dataset is used in the inference benchmark. The sampled DLRMv3 +dataset and trained checkpoint are available at +https://inference.mlcommons-storage.org/. 
+ +Script to download the sampled dataset used in the inference benchmark: + +``` +bash <(curl -s https://raw.githubusercontent.com/mlcommons/r2-downloader/refs/heads/main/mlc-r2-downloader.sh) https://inference.mlcommons-storage.org/metadata/dlrm-v3-dataset.uri +``` + +Script to download the 1TB trained checkpoint: + +``` +bash <(curl -s https://raw.githubusercontent.com/mlcommons/r2-downloader/refs/heads/main/mlc-r2-downloader.sh) https://inference.mlcommons-storage.org/metadata/dlrm-v3-checkpoint.uri +``` + +## Inference benchmark + +``` +cd generative_recommenders/generative_recommenders/dlrm_v3/inference/ +WORLD_SIZE=8 python main.py --dataset sampled-streaming-100b +``` + +The config file is listed in `dlrm_v3/inference/gin/streaming_100b.gin`. +`WORLD_SIZE` is the number of GPUs used in the inference benchmark. + +To load a checkpoint from training, modify `run.model_path` inside the inference +gin config file. (We will release the checkpoint soon.) + +To achieve the best performance, tune `run.target_qps` and `run.batch_size` in +the config file. + +## Accuracy test + +Setting `run.compute_eval` runs the accuracy test and dumps prediction outputs in +`mlperf_log_accuracy.json`. To check the accuracy, run + +``` +python accuracy.py --path path/to/mlperf_log_accuracy.json +``` + +We use normalized entropy (NE), accuracy, and AUC as the metrics to evaluate the model quality. For accepted submissions, all three metrics (NE, Accuracy, AUC) must be within 99% of the reference implementation values. 
The accuracy for the reference implementation evaluated on 34,996 requests across 10 inference timestamps are listed below: + +``` +NE: 86.687% +Accuracy: 69.651% +AUC: 78.663% +``` + +## Run unit tests + +``` +python tests/inference_test.py +``` diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/accuracy.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/accuracy.py new file mode 100644 index 000000000..19242f7bd --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/accuracy.py @@ -0,0 +1,86 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pyre-strict +""" +Tool to calculate accuracy for loadgen accuracy output found in mlperf_log_accuracy.json +""" + +import argparse +import json +import logging + +import numpy as np +import torch +from generative_recommenders.dlrm_v3.configs import get_hstu_configs +from generative_recommenders.dlrm_v3.utils import MetricsLogger + +logger: logging.Logger = logging.getLogger("main") + + +def get_args() -> argparse.Namespace: + """Parse commandline.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--path", + required=True, + help="path to mlperf_log_accuracy.json", + ) + args = parser.parse_args() + return args + + +def main() -> None: + """ + Main function to calculate accuracy metrics from loadgen output. 
+ + Reads the mlperf_log_accuracy.json file, parses the results, and computes + accuracy metrics using the MetricsLogger. Each result entry contains + predictions, labels, and weights packed as float32 numpy arrays. + """ + args = get_args() + logger.warning("Parsing loadgen accuracy log...") + with open(args.path, "r") as f: + results = json.load(f) + hstu_config = get_hstu_configs(dataset="sampled-streaming-100b") + metrics = MetricsLogger( + multitask_configs=hstu_config.multitask_configs, + batch_size=1, + window_size=3000, + device=torch.device("cpu"), + rank=0, + ) + logger.warning(f"results have {len(results)} entries") + for result in results: + data = np.frombuffer(bytes.fromhex(result["data"]), np.float32) + num_candidates = data[-1].astype(int) + assert len(data) == 1 + num_candidates * 3 + mt_target_preds = torch.from_numpy(data[0:num_candidates]) + mt_target_labels = torch.from_numpy(data[num_candidates : num_candidates * 2]) + mt_target_weights = torch.from_numpy( + data[num_candidates * 2 : num_candidates * 3] + ) + num_candidates = torch.tensor([num_candidates]) + metrics.update( + predictions=mt_target_preds.view(1, -1), + labels=mt_target_labels.view(1, -1), + weights=mt_target_weights.view(1, -1), + num_candidates=num_candidates, + ) + for k, v in metrics.compute().items(): + logger.warning(f"{k}: {v}") + + +if __name__ == "__main__": + main() diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/cpp/hstu_runner.cpp b/recommendation_v4/generative_recommenders/dlrm_v3/inference/cpp/hstu_runner.cpp new file mode 100644 index 000000000..d4d0d4082 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/cpp/hstu_runner.cpp @@ -0,0 +1,215 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// +// End-to-end runner for the HSTU torch.jit / torch.package artifacts produced +// by generative_recommenders/dlrm_v3/inference/packager.py and exercised by +// :end_to_end_test. 
+// +// CLI: +// hstu_runner [--aott_library ...] +// +// +// Where: +// sparse.pt ScriptModule whose forward(uih, candidates) returns +// Tuple[Dict[str,Tensor], Dict[str,Tensor], +// Dict[str,Tensor], Tensor, Tensor] +// dense.pt ScriptModule (cuda:0, bf16) whose forward(...) returns +// Tuple[Tensor, Optional[Tensor], Optional[Tensor]] +// inputs.pt ScriptModule whose forward() returns +// Tuple[KeyedJaggedTensor, KeyedJaggedTensor] +// output.pt torch::pickle_save destination for the predictions tensor; +// readable from Python as ``torch.load(output.pt)``. + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace { + +struct RunnerArgs { + std::vector aottLibraryPaths; + std::string sparsePath; + std::string densePath; + std::string inputsPath; + std::string outputPath; +}; + +RunnerArgs parseArgs(int argc, char** argv) { + RunnerArgs args; + std::vector positional; + for (int i = 1; i < argc; ++i) { + const std::string arg{argv[i]}; + if (arg == "--aott_library") { + if (++i >= argc) { + throw std::runtime_error("--aott_library requires a path"); + } + args.aottLibraryPaths.emplace_back(argv[i]); + } else { + positional.push_back(arg); + } + } + + if (positional.size() != 4) { + throw std::runtime_error( + "Usage: hstu_runner [--aott_library ...] 
" + " "); + } + args.sparsePath = positional[0]; + args.densePath = positional[1]; + args.inputsPath = positional[2]; + args.outputPath = positional[3]; + return args; +} + +void loadAottLibraries( + const std::vector& libraryPaths, + const std::function& log) { + for (const auto& path : libraryPaths) { + log("[runner] loading AOT-T library " + path); + void* handle = dlopen(path.c_str(), RTLD_GLOBAL | RTLD_NOW); + if (handle == nullptr) { + throw std::runtime_error( + "failed to dlopen AOT-T library " + path + ": " + dlerror()); + } + } +} + +torch::jit::Module loadModule(const std::string& path) { + // @patternlint-disable-next-line no-torch-low-level-api + auto m = torch::jit::load(path); + m.eval(); + return m; +} + +// Walk a Dict and replace every value with .to(device) (and +// optionally .to(bfloat16)). C++ analog of move_sparse_output_to_device. +void moveDictToDevice( + c10::impl::GenericDict& d, + const torch::Device& device, + bool toBfloat16) { + for (auto& kv : d) { + auto t = kv.value().toTensor().to(device); + if (toBfloat16) { + t = t.to(torch::kBFloat16); + } + d.insert_or_assign(kv.key(), t); + } +} + +void writePickle(const torch::Tensor& t, const std::string& path) { + // torch::pickle_save returns a byte buffer in the same wire format as + // ``torch.save(tensor, ...)``, so the Python side can read it with + // ``torch.load(path)``. + const auto data = torch::jit::pickle_save(c10::IValue(t)); + std::ofstream out(path, std::ios::binary); + if (!out) { + throw std::runtime_error("failed to open output: " + path); + } + out.write(data.data(), static_cast(data.size())); +} + +} // namespace + +int main(int argc, char** argv) { + RunnerArgs args; + try { + args = parseArgs(argc, argv); + } catch (const std::exception& e) { + std::cerr << e.what() << '\n'; + return 1; + } + + // Log to a file next to the output so we can inspect even if + // buck2 swallows stderr. 
+ const std::string logPath = args.outputPath + ".log"; + std::ofstream logFile(logPath); + auto log = [&](const std::string& msg) { + logFile << msg << std::endl; + logFile.flush(); + std::cerr << msg << std::endl; + }; + + try { + log("[runner] step 0: loading AOT-T libraries"); + loadAottLibraries(args.aottLibraryPaths, log); + log("[runner] step 0 done: loaded " + + std::to_string(args.aottLibraryPaths.size()) + " AOT-T libraries"); + + log("[runner] step 1: loading sparse module from " + args.sparsePath); + auto sparse = loadModule(args.sparsePath); + + log("[runner] step 2: loading dense module from " + args.densePath); + auto dense = loadModule(args.densePath); + + log("[runner] step 3: loading inputs module from " + args.inputsPath); + auto inputs = loadModule(args.inputsPath); + + log("[runner] step 4: running inputs.forward()"); + auto inputsTuple = inputs.forward({}).toTuple(); + auto uihLengths = inputsTuple->elements()[0]; + auto uihValues = inputsTuple->elements()[1]; + auto candidatesLengths = inputsTuple->elements()[2]; + auto candidatesValues = inputsTuple->elements()[3]; + log("[runner] step 4 done: got 4 input tensors"); + + log("[runner] step 5: running sparse.forward()"); + std::vector sparseInputs{ + uihLengths, uihValues, candidatesLengths, candidatesValues}; + auto sparseOut = sparse.forward(sparseInputs).toTuple(); + log("[runner] step 5 done: sparse forward returned " + + std::to_string(sparseOut->elements().size()) + " elements"); + + log("[runner] step 6: unpacking sparse output dicts"); + auto seqEmbValues = sparseOut->elements()[0].toGenericDict(); + auto seqEmbLengths = sparseOut->elements()[1].toGenericDict(); + auto payloadFeatures = sparseOut->elements()[2].toGenericDict(); + auto uihSeqLengths = sparseOut->elements()[3].toTensor(); + auto numCandidates = sparseOut->elements()[4].toTensor(); + log("[runner] step 6 done: unpacked dicts"); + + log("[runner] step 7: moving dicts to cuda:0"); + const auto device = 
torch::Device(torch::kCUDA, 0); + moveDictToDevice(seqEmbValues, device, /*toBfloat16=*/true); + log("[runner] step 7a: seqEmbValues moved"); + moveDictToDevice(seqEmbLengths, device, /*toBfloat16=*/false); + log("[runner] step 7b: seqEmbLengths moved"); + moveDictToDevice(payloadFeatures, device, /*toBfloat16=*/false); + log("[runner] step 7c: payloadFeatures moved"); + uihSeqLengths = uihSeqLengths.to(device); + numCandidates = numCandidates.to(device); + log("[runner] step 7 done: all on cuda:0"); + + log("[runner] step 8: running dense.forward()"); + std::vector denseInputs{ + seqEmbValues, + seqEmbLengths, + payloadFeatures, + uihSeqLengths, + numCandidates, + }; + auto denseOut = dense.forward(denseInputs); + log("[runner] step 8 done: dense forward returned"); + + auto preds = denseOut.toTensor().detach().cpu(); + log("[runner] step 9: preds on cpu"); + + std::cout << "preds shape: " << preds.sizes() << '\n'; + std::cout << "preds sum: " + << preds.to(torch::kFloat32).sum().item() << '\n'; + + writePickle(preds, args.outputPath); + std::cout << "wrote " << args.outputPath << '\n'; + log("[runner] step 10: done, wrote output"); + return 0; + } catch (const std::exception& e) { + log(std::string("hstu_runner FAILED: ") + e.what()); + return 1; + } +} diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/data_producer.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/data_producer.py new file mode 100644 index 000000000..6a8db77c8 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/data_producer.py @@ -0,0 +1,227 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pyre-strict +""" +Data producer module for DLRMv3 inference. + +This module provides classes for producing and managing query data during inference, +supporting both single-threaded and multi-threaded data production modes. +""" + +import logging +import threading +import time +from queue import Queue +from typing import List, Optional, Tuple, Union + +import torch +from generative_recommenders.dlrm_v3.datasets.dataset import Dataset, Samples + +logging.basicConfig(level=logging.INFO) +logger: logging.Logger = logging.getLogger("data_producer") + + +class QueryItem: + """ + Container for a query item to be processed by the inference thread pool. + + Attributes: + query_ids: List of unique identifiers for the queries in this batch. + samples: The sample data containing features for the queries. + start: Time when the query was first received. + dt_queue: Time spent in the queue before processing. + dt_batching: Time spent on batching the data. + """ + + def __init__( + self, + query_ids: List[int], + samples: Samples, + start: float, + dt_queue: float, + dt_batching: float, + ) -> None: + self.query_ids = query_ids + self.samples = samples + self.start: float = start + self.dt_queue: float = dt_queue + self.dt_batching: float = dt_batching + + +class SingleThreadDataProducer: + """ + Single-threaded data producer for synchronous query processing. + + This producer processes queries on the main thread without any parallelism, + suitable for debugging or low-throughput scenarios. + + Args: + ds: The dataset to fetch samples from. 
+ run_one_item: Callback function to process a single QueryItem. + """ + + def __init__(self, ds: Dataset, run_one_item) -> None: # pyre-ignore [2] + self.ds = ds + self.run_one_item = run_one_item # pyre-ignore [4] + + def enqueue( + self, query_ids: List[int], content_ids: List[int], t0: float, dt_queue: float + ) -> None: + """ + Enqueue queries for immediate synchronous processing. + + Args: + query_ids: List of unique query identifiers. + content_ids: List of content/sample identifiers to fetch. + t0: Timestamp when the query batch was created. + dt_queue: Time spent waiting in the queue. + """ + with torch.profiler.record_function("data batching"): + t0_batching: float = time.time() + samples: Union[Samples, List[Samples]] = self.ds.get_samples(content_ids) + dt_batching: float = time.time() - t0_batching + if isinstance(samples, Samples): + query = QueryItem( + query_ids=query_ids, + samples=samples, + start=t0, + dt_queue=dt_queue, + dt_batching=dt_batching, + ) + self.run_one_item(query) + else: + start_idx = 0 + for sample in samples: + batch_size: int = sample.batch_size() + query = QueryItem( + query_ids=query_ids[start_idx : start_idx + batch_size], + samples=sample, + start=t0, + dt_queue=dt_queue, + dt_batching=dt_batching, + ) + start_idx += batch_size + self.run_one_item(query) + + def finish(self) -> None: + """Finalize the producer. No-op for single-threaded mode.""" + pass + + +class MultiThreadDataProducer: + """ + Multi-threaded data producer for parallel query processing. + + Uses a thread pool to fetch and batch data in parallel with model inference, + improving throughput for high-load scenarios. + + Args: + ds: The dataset to fetch samples from. + threads: Number of worker threads to use. + run_one_item: Callback function to process a single QueryItem. 
+ """ + + def __init__( + self, + ds: Dataset, + threads: int, + run_one_item, # pyre-ignore [2] + ) -> None: + queue_size_multiplier = 4 + self.ds = ds + self.threads = threads + self.run_one_item = run_one_item # pyre-ignore [4] + self.tasks: Queue[Optional[Tuple[List[int], List[int], float, float]]] = Queue( + maxsize=threads * queue_size_multiplier + ) + self.workers: List[threading.Thread] = [] + for _ in range(self.threads): + worker = threading.Thread(target=self.handle_tasks, args=(self.tasks,)) + worker.daemon = True + self.workers.append(worker) + worker.start() + + def handle_tasks( + self, tasks_queue: Queue[Optional[Tuple[List[int], List[int], float, float]]] + ) -> None: + """ + Worker thread main loop to process tasks from the queue. + + Each worker maintains its own CUDA stream for parallel execution. + + Args: + tasks_queue: Queue containing task tuples or None for termination. + """ + stream = torch.cuda.Stream() + while True: + query_and_content_ids = tasks_queue.get() + if query_and_content_ids is None: + tasks_queue.task_done() + break + query_ids, content_ids, t0, dt_queue = query_and_content_ids + t0_batching: float = time.time() + samples: Union[Samples, List[Samples]] = self.ds.get_samples(content_ids) + dt_batching: float = time.time() - t0_batching + if isinstance(samples, Samples): + qitem = QueryItem( + query_ids=query_ids, + samples=samples, + start=t0, + dt_queue=dt_queue, + dt_batching=dt_batching, + ) + with torch.inference_mode(), torch.cuda.stream(stream): + self.run_one_item(qitem) + else: + start_idx = 0 + for sample in samples: + batch_size: int = sample.batch_size() + qitem = QueryItem( + query_ids=query_ids[start_idx : start_idx + batch_size], + samples=sample, + start=t0, + dt_queue=dt_queue, + dt_batching=dt_batching, + ) + start_idx += batch_size + with torch.inference_mode(), torch.cuda.stream(stream): + self.run_one_item(qitem) + tasks_queue.task_done() + + def enqueue( + self, query_ids: List[int], content_ids: 
List[int], t0: float, dt_queue: float + ) -> None: + """ + Enqueue queries for asynchronous processing by worker threads. + + Args: + query_ids: List of unique query identifiers. + content_ids: List of content/sample identifiers to fetch. + t0: Timestamp when the query batch was created. + dt_queue: Time spent waiting in the queue. + """ + with torch.profiler.record_function("data batching"): + self.tasks.put((query_ids, content_ids, t0, dt_queue)) + + def finish(self) -> None: + """ + Signal all worker threads to terminate and wait for completion. + + Sends None to each worker to trigger graceful shutdown. + """ + for _ in self.workers: + self.tasks.put(None) + for worker in self.workers: + worker.join() diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/dense_predict_module.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/dense_predict_module.py new file mode 100644 index 000000000..add2781bc --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/dense_predict_module.py @@ -0,0 +1,96 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pyre-strict + +""" +TorchScript-friendly wrapper for the HSTU dense path (GPU transformer). 
+ +``HSTUDenseScriptModule`` accepts the *flattened* sparse-output dicts produced +by :class:`HSTUSparseScriptModule`, reconstructs ``Dict[str, +SequenceEmbedding]`` for the existing :meth:`DlrmHSTU.main_forward` and +returns a 3-tuple of ``(preds, labels, weights)`` -- the only fields the +predictor actually consumes. +""" + +from typing import Dict + +import torch +from generative_recommenders.dlrm_v3.inference.inference_modules import get_hstu_model +from generative_recommenders.dlrm_v3.inference.ts_types import ( + SeqEmbLengths, + SeqEmbValues, + unflatten_seq_embeddings, +) +from generative_recommenders.modules.dlrm_hstu import DlrmHSTU, DlrmHSTUConfig +from torchrec.modules.embedding_configs import EmbeddingConfig + + +class HSTUDenseScriptModule(torch.nn.Module): + """Script-friendly dense module. + + The wrapper owns a dense-only :class:`DlrmHSTU` (no + ``_embedding_collection``) and delegates to ``main_forward`` after + reconstructing the ``SequenceEmbedding`` NamedTuple form. + """ + + def __init__( + self, + hstu_config: DlrmHSTUConfig, + table_config: Dict[str, EmbeddingConfig], + ) -> None: + super().__init__() + self._hstu_model: DlrmHSTU = get_hstu_model( + table_config=table_config, + hstu_config=hstu_config, + table_device="cpu", + is_dense=True, + ) + + def forward( + self, + seq_emb_values: SeqEmbValues, + seq_emb_lengths: SeqEmbLengths, + payload_features: Dict[str, torch.Tensor], + uih_seq_lengths: torch.Tensor, + num_candidates: torch.Tensor, + ) -> torch.Tensor: + # TorchScript supports ``int(tensor.item())`` on a 0-d tensor. 
+ max_uih_len: int = int(uih_seq_lengths.max().item()) + max_num_candidates: int = int(num_candidates.max().item()) + + seq_embeddings = unflatten_seq_embeddings(seq_emb_values, seq_emb_lengths) + + ( + _, + _, + _, + mt_target_preds, + _mt_target_labels, + _mt_target_weights, + ) = self._hstu_model.main_forward( + seq_embeddings=seq_embeddings, + payload_features=payload_features, + max_uih_len=max_uih_len, + uih_seq_lengths=uih_seq_lengths, + max_num_candidates=max_num_candidates, + num_candidates=num_candidates, + ) + assert mt_target_preds is not None + # Return just the predictions tensor; labels/weights are unused by + # the predictor at inference time and would force ``Optional[Tensor]`` + # in the return type, which torch.jit.trace rejects ("Only tensors, + # lists, tuples of tensors, or dictionary of tensors can be output + # from traced functions"). + return mt_target_preds diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/end_to_end_test.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/end_to_end_test.py new file mode 100644 index 000000000..f1b956d9c --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/end_to_end_test.py @@ -0,0 +1,795 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pyre-strict + +""" +End-to-end smoke test for the HSTU TorchScript + C++ deployment pipeline. + +What this binary does, in order: + +1. 
Build a synthetic batch (uih_kjt, candidates_kjt) via :func:`get_random_data`. +2. Build the eager :class:`HSTUSparseScriptModule` and + :class:`HSTUDenseScriptModule`. +3. Run them eagerly to obtain the reference ``preds_eager``. +4. Trace/lower + save: + - ``sparse.pt`` (CPU; ``torch.jit.trace`` through a raw-tensor shim) + - ``dense.pt`` (``--device``, default cuda:0, bf16; ``torch.jit.trace``, or the + AOT-T lowering when ``--dense_backend=aott``) + - ``inputs.pt`` (a ``torch.jit.script``-ed :class:`InputsBundle` whose + ``forward()`` returns the 4-tuple ``(uih_lengths, uih_values, + candidates_lengths, candidates_values)``) + The saved artifacts are then reloaded in Python and checked against + ``preds_eager`` before the C++ runner is invoked. +5. Run the C++ runner + ``hstu_runner [--aott_library LIB ...] SPARSE_PT DENSE_PT INPUTS_PT OUT_PT``. +6. ``torch.load`` the runner's output and compare against ``preds_eager`` + with :func:`torch.testing.assert_close` (loose tolerance because the + dense artifact may use either the PyTorch fallback trace or AOT-T-loaded + Triton inference kernels). + +Usage (manual override of the runner path): + + buck2 run @mode/opt //generative_recommenders/dlrm_v3/inference:end_to_end_test \\ + -- --cpp_runner /path/to/hstu_runner + +By default the binary locates the runner via ``importlib.resources`` -- it ships +inside the par as a resource (see BUCK) -- and falls back to looking for +``hstu_runner`` next to ``sys.argv[0]``. 
+""" + +import argparse +import logging +import os +import shutil +import sys +import tempfile +from typing import Any, Dict, List, Tuple + +import torch +from generative_recommenders.dlrm_v3.configs import ( + get_embedding_table_config, + get_hstu_configs, +) +from generative_recommenders.dlrm_v3.datasets.dataset import get_random_data +from generative_recommenders.dlrm_v3.inference.dense_predict_module import ( + HSTUDenseScriptModule, +) +from generative_recommenders.dlrm_v3.inference.sparse_predict_module import ( + HSTUSparseScriptModule, +) +from generative_recommenders.dlrm_v3.inference.ts_types import ( + SeqEmbLengths, + SeqEmbValues, + unflatten_seq_embeddings, +) +from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig +from security.frameworks.python.exec.subprocess import TrustedSubprocessWithList +from torchrec.modules.embedding_configs import EmbeddingConfig +from torchrec.sparse.jagged_tensor import KeyedJaggedTensor + + +logger: logging.Logger = logging.getLogger(__name__) + + +_DEFAULT_DATASET = "kuairand-1k" + + +class InputsBundle(torch.nn.Module): + """Scripted holder for the test inputs. + + Returns the constituent tensors of the two KJTs as a 4-tuple + ``(uih_lengths, uih_values, candidates_lengths, candidates_values)`` so + the traced sparse module can rebuild the KJTs inside its forward (KJT + instances themselves are not traceable inputs). 
+ """ + + def __init__( + self, + uih_kjt: KeyedJaggedTensor, + candidates_kjt: KeyedJaggedTensor, + ) -> None: + super().__init__() + self.register_buffer("uih_lengths", uih_kjt.lengths()) + self.register_buffer("uih_values", uih_kjt.values()) + self.register_buffer("candidates_lengths", candidates_kjt.lengths()) + self.register_buffer("candidates_values", candidates_kjt.values()) + + def forward( + self, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + return ( + self.uih_lengths, + self.uih_values, + self.candidates_lengths, + self.candidates_values, + ) + + +class _SparseTraceShim(torch.nn.Module): + """Adapter that takes raw tensors and rebuilds the KJTs inside forward. + + ``torch.jit.trace`` does not accept ``KeyedJaggedTensor`` (or any + non-Tensor / non-collection-of-Tensor type) as a top-level forward + input, so we make the traced boundary tensor-only and bake the + ``List[str]`` of feature keys in as Python constants captured by the + closure / module attribute. 
+ """ + + def __init__( + self, + sparse_module: HSTUSparseScriptModule, + uih_keys: List[str], + candidates_keys: List[str], + ) -> None: + super().__init__() + self._sparse_module: HSTUSparseScriptModule = sparse_module + self._uih_keys: List[str] = uih_keys + self._candidates_keys: List[str] = candidates_keys + + def forward( + self, + uih_lengths: torch.Tensor, + uih_values: torch.Tensor, + candidates_lengths: torch.Tensor, + candidates_values: torch.Tensor, + ) -> Tuple[ + SeqEmbValues, + SeqEmbLengths, + Dict[str, torch.Tensor], + torch.Tensor, + torch.Tensor, + ]: + uih_kjt = KeyedJaggedTensor( + keys=self._uih_keys, + lengths=uih_lengths, + values=uih_values, + ) + candidates_kjt = KeyedJaggedTensor( + keys=self._candidates_keys, + lengths=candidates_lengths, + values=candidates_values, + ) + return self._sparse_module( + uih_features=uih_kjt, candidates_features=candidates_kjt + ) + + +class _DenseAottTraceShim(torch.nn.Module): + """FX-traceable dense adapter for the representative AOT-T shape.""" + + def __init__( + self, + dense_module: HSTUDenseScriptModule, + max_uih_len: int, + max_num_candidates: int, + total_uih_len: int, + total_targets: int, + ) -> None: + super().__init__() + self._dense_module: HSTUDenseScriptModule = dense_module + self._max_uih_len: int = max_uih_len + self._max_num_candidates: int = max_num_candidates + self._total_uih_len: int = total_uih_len + self._total_targets: int = total_targets + + def forward( + self, + seq_emb_values: SeqEmbValues, + seq_emb_lengths: SeqEmbLengths, + payload_features: Dict[str, torch.Tensor], + uih_seq_lengths: torch.Tensor, + num_candidates: torch.Tensor, + ) -> torch.Tensor: + seq_embeddings = unflatten_seq_embeddings(seq_emb_values, seq_emb_lengths) + + ( + _, + _, + _, + mt_target_preds, + _mt_target_labels, + _mt_target_weights, + ) = self._dense_module._hstu_model.main_forward( + seq_embeddings=seq_embeddings, + payload_features=payload_features, + max_uih_len=self._max_uih_len, + 
uih_seq_lengths=uih_seq_lengths, + max_num_candidates=self._max_num_candidates, + num_candidates=num_candidates, + total_uih_len=self._total_uih_len, + total_targets=self._total_targets, + ) + assert mt_target_preds is not None + return mt_target_preds + + +def _dense_aott_concrete_args( + dense_inputs: Tuple[ + Dict[str, torch.Tensor], + Dict[str, torch.Tensor], + Dict[str, torch.Tensor], + torch.Tensor, + torch.Tensor, + ], +) -> Dict[str, Any]: + from torch.fx._symbolic_trace import PH + + seq_emb_values, seq_emb_lengths, payload_features, _, _ = dense_inputs + return { + "seq_emb_values": {key: PH for key in seq_emb_values}, + "seq_emb_lengths": {key: PH for key in seq_emb_lengths}, + "payload_features": {key: PH for key in payload_features}, + } + + +def _find_cpp_runner() -> str: + """Locate the bundled hstu_runner binary. + + Tries ``importlib.resources`` (the canonical fbcode resource resolver, + works whether the binary is in a par or unpacked), and falls back to + looking next to ``sys.argv[0]``. + """ + try: + from importlib.resources import files + + path = files("generative_recommenders.dlrm_v3.inference.cpp").joinpath( + "hstu_runner" + ) + if path.is_file(): + return str(path) + except Exception as exc: + logger.debug("importlib.resources lookup failed: %s", exc) + + candidate = os.path.join( + os.path.dirname(os.path.abspath(sys.argv[0])), "hstu_runner" + ) + if os.path.exists(candidate): + return candidate + + raise RuntimeError( + "Could not find hstu_runner binary. " + "Pass --cpp_runner= or build the cpp_binary target first." 
+ ) + + +def _eager_run( + sparse_module: HSTUSparseScriptModule, + dense_module: HSTUDenseScriptModule, + uih_kjt: KeyedJaggedTensor, + candidates_kjt: KeyedJaggedTensor, + device: torch.device, +) -> torch.Tensor: + """Reference path: sparse → device-move + bf16 → dense, all in Python.""" + with torch.no_grad(): + seq_emb_values, seq_emb_lengths, payload, uih_lens, num_cands = sparse_module( + uih_features=uih_kjt, candidates_features=candidates_kjt + ) + seq_emb_values = { + k: v.to(device).to(torch.bfloat16) for k, v in seq_emb_values.items() + } + seq_emb_lengths = {k: v.to(device) for k, v in seq_emb_lengths.items()} + payload = {k: v.to(device) for k, v in payload.items()} + uih_lens = uih_lens.to(device) + num_cands = num_cands.to(device) + preds = dense_module( + seq_emb_values, seq_emb_lengths, payload, uih_lens, num_cands + ) + return preds.detach().to(torch.float32).cpu() + + +def _find_aott_libraries() -> List[str]: + from generative_recommenders.ops.triton_aot.compile.compile_state import ( + get_aott_compile_path, + ) + + compile_path = get_aott_compile_path() + libraries: List[str] = [] + for root, _, files in os.walk(compile_path): + for filename in files: + if filename.endswith(".so"): + libraries.append(os.path.join(root, filename)) + return sorted(libraries) + + +def _copy_aott_libraries_to_workdir( + library_paths: List[str], workdir: str +) -> List[str]: + copied: List[str] = [] + for index, path in enumerate(library_paths): + dst = os.path.join(workdir, f"aott_{index}_{os.path.basename(path)}") + shutil.copy2(path, dst) + copied.append(dst) + return copied + + +def _load_aott_libraries_for_python(library_paths: List[str]) -> None: + for library_path in library_paths: + logger.info("Python roundtrip: loading AOT-T library %s", library_path) + torch.ops.load_library(library_path) + + +def _save_aott_dense_module( + dense_module: HSTUDenseScriptModule, + dense_inputs: Tuple[ + Dict[str, torch.Tensor], + Dict[str, torch.Tensor], + Dict[str, 
torch.Tensor], + torch.Tensor, + torch.Tensor, + ], + dense_path: str, + workdir: str, + atol: float, + rtol: float, +) -> List[str]: + """Lower the dense module with AOT-T and save a TorchScript artifact. + + This follows the AOT-T example flow: + + 1. FX trace the module. + 2. Unwrap outer `aot_triton_kernel_wrapper_*` nodes. + 3. Run representative CUDA inputs under `TritonAOTCompile`. + 4. `transform_kernels` to replace wrappers with `torch.ops.triton_aot.*`. + 5. Script and save the transformed dense module. + + The full HSTU dense wrapper has historically needed tracing rather than FX, + so failures here are reported with context and the default path remains the + D102 traced TorchScript fallback. + """ + from generative_recommenders.ops.triton_aot.compile.triton_aot_compile import ( + TritonAOTCompile, + ) + from generative_recommenders.ops.triton_aot.preprocess import ( + unwrap_aott_wrapper_nodes, + ) + from generative_recommenders.ops.triton_aot.transform.transform_kernels import ( + transform_kernels, + ) + from tgif.fx.tgif_tracer import TGIFTracer + + max_uih_len = int(dense_inputs[3].max().item()) + max_num_candidates = int(dense_inputs[4].max().item()) + total_uih_len = int(dense_inputs[3].sum().item()) + total_targets = int(dense_inputs[4].sum().item()) + trace_shim = _DenseAottTraceShim( + dense_module=dense_module, + max_uih_len=max_uih_len, + max_num_candidates=max_num_candidates, + total_uih_len=total_uih_len, + total_targets=total_targets, + ).eval() + + logger.info( + "AOT-T dense: FX tracing representative shape " + "(max_uih_len=%d, max_num_candidates=%d, " + "total_uih_len=%d, total_targets=%d)...", + max_uih_len, + max_num_candidates, + total_uih_len, + total_targets, + ) + try: + fx_dense = TGIFTracer().symbolic_trace( + trace_shim, + concrete_args=_dense_aott_concrete_args(dense_inputs), + ) + lowered_dense = unwrap_aott_wrapper_nodes(fx_dense, TGIFTracer()) + except Exception as exc: + raise RuntimeError( + "AOT-T dense lowering 
requires an FX-traceable dense entry point. " + "Use --dense_backend=torchscript to fall back to the D102 traced " + "TorchScript path." + ) from exc + + logger.info("AOT-T dense: compiling Triton kernels from sample inputs...") + with torch.no_grad(): + with TritonAOTCompile(): + ref_output = lowered_dense(*dense_inputs) + + original_code = lowered_dense.code + lowered_dense = transform_kernels(lowered_dense) + if lowered_dense.code == original_code: + logger.warning( + "AOT-T dense: transform_kernels did not change the FX graph. " + "This usually means no aot_triton_kernel_wrapper_* nodes were " + "present in the dense path." + ) + + libraries = _find_aott_libraries() + if not libraries: + raise RuntimeError( + "AOT-T dense lowering produced no .so files. Ensure the dense path " + "uses HammerKernel.TRITON_INFERENCE branches backed by triton_aot ops." + ) + + with torch.no_grad(): + lowered_output = lowered_dense(*dense_inputs) + torch.testing.assert_close(ref_output, lowered_output, atol=atol, rtol=rtol) + + logger.info("AOT-T dense: tracing transformed module...") + torch.jit.trace( + lowered_dense, + example_inputs=dense_inputs, + strict=False, + check_trace=False, + ).save(dense_path) + copied_libraries = _copy_aott_libraries_to_workdir(libraries, workdir) + logger.info("AOT-T dense: copied %d library file(s)", len(copied_libraries)) + return copied_libraries + + +def _build_synthetic_inputs( + hstu_config: DlrmHSTUConfig, + table_config: Dict[str, EmbeddingConfig], + uih_max_seq_len: int, +) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]: + contextual: List[str] = list(hstu_config.contextual_feature_to_max_length.keys()) + # The kuairand-1k dataset has tiny embedding tables for some contextual + # features (e.g. user_active_degree has num_embeddings=8). Clamp the + # random value range so every index stays in range for every table. 
+ min_rows = min(t.num_embeddings for t in table_config.values()) + value_bound = max(2, min_rows) + logger.info( + "synthetic value_bound=%d (min table rows=%d across %d tables)", + value_bound, + min_rows, + len(table_config), + ) + return get_random_data( + contexual_features=contextual, + hstu_uih_keys=hstu_config.hstu_uih_feature_names, + hstu_candidates_keys=hstu_config.hstu_candidate_feature_names, + uih_max_seq_len=uih_max_seq_len, + max_num_candidates=hstu_config.max_num_candidates_inference, + value_bound=value_bound, + ) + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--cpp_runner", + type=str, + default=None, + help="Path to the hstu_runner binary; default: bundled resource.", + ) + parser.add_argument( + "--dataset", + type=str, + default=_DEFAULT_DATASET, + help="Dataset key for HSTU/embedding configs.", + ) + parser.add_argument( + "--device", type=str, default="cuda:0", help="Dense-module device." + ) + parser.add_argument( + "--uih_max_seq_len", + type=int, + default=128, + help="Max UIH length for the synthetic batch.", + ) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--atol", type=float, default=1e-2) + parser.add_argument("--rtol", type=float, default=1e-2) + parser.add_argument( + "--dense_backend", + choices=("torchscript", "aott"), + default="torchscript", + help="Dense artifact backend. aott lowers TRITON_INFERENCE wrappers and passes compiled libraries to the C++ runner.", + ) + parser.add_argument( + "--aott_library", + action="append", + default=[], + help="Additional prebuilt AOT-T shared library to dlopen before loading dense.pt. 
May be repeated.", + ) + parser.add_argument( + "--keep_workdir", + action="store_true", + help="Do not delete the temp dir holding the saved artifacts.", + ) + return parser.parse_args() + + +def main() -> None: # noqa: C901 + logging.basicConfig(level=logging.INFO, format="[e2e] %(message)s", force=True) + logger.setLevel(logging.DEBUG) + args = _parse_args() + + if not torch.cuda.is_available(): + logger.error("CUDA is required; aborting.") + sys.exit(2) + + runner_path = args.cpp_runner or _find_cpp_runner() + logger.info("Using C++ runner: %s", runner_path) + + torch.manual_seed(args.seed) + device = torch.device(args.device) + torch.cuda.set_device(device) + + hstu_config = get_hstu_configs(args.dataset) + table_config = get_embedding_table_config(args.dataset) + + uih_kjt, candidates_kjt = _build_synthetic_inputs( + hstu_config, table_config, args.uih_max_seq_len + ) + + sparse_module = HSTUSparseScriptModule( + table_config=table_config, + hstu_config=hstu_config, + use_no_copy_embedding_collection=True, + ).eval() + dense_module = ( + HSTUDenseScriptModule(hstu_config=hstu_config, table_config=table_config) + .to(torch.bfloat16) + .to(device) + .eval() + ) + + from generative_recommenders.common import HammerKernel + + dense_kernel = ( + HammerKernel.TRITON_INFERENCE + if args.dense_backend == "aott" + else HammerKernel.PYTORCH + ) + sparse_module._sparse._hstu_model.set_hammer_kernel(HammerKernel.PYTORCH) + dense_module._hstu_model.set_hammer_kernel(dense_kernel) + + # Diagnostic: walk every HammerModule submodule and print its effective + # kernel selection, so any submodule that didn't pick up the override + # surfaces immediately. Triton/Triton-CC selections will fail at trace + # time, so this print is critical for triaging the next iteration if + # tracing fails. 
+ from generative_recommenders.common import HammerModule as _HM + + for name, m in list(sparse_module.named_modules()) + list( + dense_module.named_modules() + ): + if isinstance(m, _HM): + logger.info( + "kernel-pin %-60s -> %s (is_inference=%s, use_triton_cc=%s)", + name or "", + m.hammer_kernel().value, + m._is_inference, + m._use_triton_cc, + ) + + # === 1. Eager reference === + logger.info("Running eager reference...") + preds_eager = _eager_run( + sparse_module, dense_module, uih_kjt, candidates_kjt, device + ) + logger.info( + "preds_eager shape=%s sum=%.6f", + tuple(preds_eager.shape), + preds_eager.sum().item(), + ) + + # === 2. Trace/lower + save === + # The default path keeps D102's trace-based TorchScript artifact. The + # AOT-T path follows ModelStore's compile/transform flow and saves a + # scripted FX module whose Triton kernels dispatch through torch.ops. + workdir = tempfile.mkdtemp(prefix="hstu_e2e_") + sparse_path = os.path.join(workdir, "sparse.pt") + dense_path = os.path.join(workdir, "dense.pt") + inputs_path = os.path.join(workdir, "inputs.pt") + cpp_out_path = os.path.join(workdir, "preds_cpp.pt") + eager_out_path = os.path.join(workdir, "preds_eager.pt") + aott_library_paths: List[str] = list(args.aott_library) + python_roundtrip_aott_library_paths: List[str] = list(args.aott_library) + logger.info("workdir: %s", workdir) + + # Re-run sparse eagerly to capture an example output that can drive the + # dense trace. 
+ with torch.no_grad(): + sparse_out = sparse_module( + uih_features=uih_kjt, candidates_features=candidates_kjt + ) + seq_emb_values = { + k: v.to(device).to(torch.bfloat16) for k, v in sparse_out[0].items() + } + seq_emb_lengths = {k: v.to(device) for k, v in sparse_out[1].items()} + payload = {k: v.to(device) for k, v in sparse_out[2].items()} + uih_lens = sparse_out[3].to(device) + num_cands = sparse_out[4].to(device) + + logger.info("Tracing sparse module via raw-tensor shim (CPU)...") + sparse_shim = _SparseTraceShim( + sparse_module=sparse_module, + uih_keys=list(uih_kjt.keys()), + candidates_keys=list(candidates_kjt.keys()), + ) + traced_sparse = torch.jit.trace( + sparse_shim, + example_inputs=( + uih_kjt.lengths(), + uih_kjt.values(), + candidates_kjt.lengths(), + candidates_kjt.values(), + ), + strict=False, + check_trace=False, + ) + traced_sparse.save(sparse_path) + + dense_inputs = ( + seq_emb_values, + seq_emb_lengths, + payload, + uih_lens, + num_cands, + ) + if args.dense_backend == "aott": + logger.info("Lowering dense module with AOT-T...") + generated_aott_library_paths = _save_aott_dense_module( + dense_module, + dense_inputs, + dense_path, + workdir, + args.atol, + args.rtol, + ) + aott_library_paths.extend(generated_aott_library_paths) + else: + logger.info("Tracing dense module (cuda:0, bf16)...") + traced_dense = torch.jit.trace( + dense_module, + example_inputs=dense_inputs, + strict=False, + check_trace=False, + ) + traced_dense.save(dense_path) + + logger.info("Scripting + saving inputs bundle...") + torch.jit.script(InputsBundle(uih_kjt, candidates_kjt)).save(inputs_path) + torch.save(preds_eager, eager_out_path) + + # === 2.5. Python-side roundtrip verification === + # Load the saved traced artifacts back in Python and verify they produce + # the same results as the eager run. This proves the artifacts are correct + # independently of the C++ runner. 
+ logger.info("Python roundtrip: loading traced artifacts back...") + if python_roundtrip_aott_library_paths: + _load_aott_libraries_for_python(python_roundtrip_aott_library_paths) + rt_inputs = torch.jit.load(inputs_path) + rt_sparse = torch.jit.load(sparse_path) + rt_dense = torch.jit.load(dense_path) + + with torch.no_grad(): + rt_uih_l, rt_uih_v, rt_cand_l, rt_cand_v = rt_inputs() + logger.info( + " rt inputs: uih_l=%s uih_v=%s cand_l=%s cand_v=%s", + rt_uih_l.shape, + rt_uih_v.shape, + rt_cand_l.shape, + rt_cand_v.shape, + ) + + rt_sparse_out = rt_sparse(rt_uih_l, rt_uih_v, rt_cand_l, rt_cand_v) + + for i, elem in enumerate(rt_sparse_out): + if isinstance(elem, dict): + for k, v in elem.items(): + has_nan = torch.isnan(v).any().item() + has_inf = torch.isinf(v).any().item() + logger.info( + " sparse_out[%d][%s] shape=%s dtype=%s nan=%s inf=%s", + i, + k, + tuple(v.shape), + v.dtype, + has_nan, + has_inf, + ) + elif isinstance(elem, torch.Tensor): + logger.info( + " sparse_out[%d] shape=%s dtype=%s nan=%s inf=%s", + i, + tuple(elem.shape), + elem.dtype, + torch.isnan(elem).any().item(), + torch.isinf(elem).any().item(), + ) + + rt_sev = { + k: v.to(device).to(torch.bfloat16) for k, v in rt_sparse_out[0].items() + } + rt_sel = {k: v.to(device) for k, v in rt_sparse_out[1].items()} + rt_pay = {k: v.to(device) for k, v in rt_sparse_out[2].items()} + rt_uih = rt_sparse_out[3].to(device) + rt_nc = rt_sparse_out[4].to(device) + + preds_rt = rt_dense(rt_sev, rt_sel, rt_pay, rt_uih, rt_nc) + + preds_rt_cpu = preds_rt.detach().to(torch.float32).cpu() + logger.info( + "preds_roundtrip shape=%s sum=%.6f nan=%s inf=%s", + tuple(preds_rt_cpu.shape), + preds_rt_cpu.sum().item(), + torch.isnan(preds_rt_cpu).any().item(), + torch.isinf(preds_rt_cpu).any().item(), + ) + + try: + torch.testing.assert_close( + preds_eager, preds_rt_cpu, atol=args.atol, rtol=args.rtol + ) + except AssertionError as e: + logger.error("PYTHON ROUNDTRIP PARITY FAILED: %s", e) + if not 
args.keep_workdir: + logger.info("(workdir kept for inspection: %s)", workdir) + sys.exit(1) + logger.info("PYTHON ROUNDTRIP PASSED (atol=%g rtol=%g)", args.atol, args.rtol) + + # === 3. Invoke C++ runner === + runner_args: List[str] = [] + for library_path in aott_library_paths: + runner_args.extend(["--aott_library", library_path]) + runner_args.extend([sparse_path, dense_path, inputs_path, cpp_out_path]) + + logger.info("Running C++: %s %s", runner_path, " ".join(runner_args)) + # pyre-fixme[6]: TrustedSubprocessWithList requires Literal[str] but this + # runner is resolved from a built resource or explicit test argument. + result = TrustedSubprocessWithList.run( + executable=runner_path, + cmd_args=runner_args, + capture_output=True, + text=True, + check=False, + ) + if result.stdout: + logger.info("--- runner stdout ---\n%s", result.stdout.rstrip()) + if result.stderr: + logger.info("--- runner stderr ---\n%s", result.stderr.rstrip()) + if result.returncode != 0: + if result.returncode == -11: + logger.warning( + "C++ runner SIGSEGV (exit -11). This is a known issue with " + "torch-cpp-cuda static initialization on some machines. " + "Python roundtrip verification passed above. " + "Artifacts in: %s", + workdir, + ) + args.keep_workdir = True + else: + logger.error("C++ runner exited with code %d", result.returncode) + if not args.keep_workdir: + shutil.rmtree(workdir, ignore_errors=True) + sys.exit(result.returncode) + + # === 4. 
Compare === + if not os.path.exists(cpp_out_path): + logger.error("C++ runner did not produce %s", cpp_out_path) + sys.exit(1) + preds_cpp = torch.load(cpp_out_path, weights_only=False).to(torch.float32).cpu() + logger.info( + "preds_cpp shape=%s sum=%.6f", + tuple(preds_cpp.shape), + preds_cpp.sum().item(), + ) + + try: + torch.testing.assert_close( + preds_eager, preds_cpp, atol=args.atol, rtol=args.rtol + ) + except AssertionError as e: + logger.error("PARITY FAILED: %s", e) + if not args.keep_workdir: + logger.info("(workdir kept for inspection: %s)", workdir) + sys.exit(1) + + logger.info("PASSED: eager and C++ agree (atol=%g rtol=%g)", args.atol, args.rtol) + if not args.keep_workdir: + shutil.rmtree(workdir, ignore_errors=True) + + +if __name__ == "__main__": + main() diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/debug.gin b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/debug.gin new file mode 100644 index 000000000..e2025dee0 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/debug.gin @@ -0,0 +1,13 @@ +run.model_path = "" +run.scenario_name = "Server" +run.batchsize = 16 +run.output_trace = False +run.data_producer_threads = 4 +run.compute_eval = False +run.find_peak_performance = False +run.train_split_percentage = 0.75 + +# below will override mlperf rules compliant settings - don't use for official submission +run.target_qps = 2000 +run.num_queries = 10000 +run.numpy_rand_seed = 123 diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/kuairand_1k.gin b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/kuairand_1k.gin new file mode 100644 index 000000000..a770aa014 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/kuairand_1k.gin @@ -0,0 +1,14 @@ +# run.model_path = "/home/linjianma/ckpts/kuairand_1k/2025_01_12_17_56_43/" +run.scenario_name = "Server" +run.batchsize = 16 +run.output_trace = False 
+run.data_producer_threads = 4 +run.compute_eval = False +run.find_peak_performance = False +run.train_split_percentage = 0.75 + +# below will override mlperf rules compliant settings - don't use for official submission +run.target_qps = 2000 +run.num_queries = 10000 +run.numpy_rand_seed = 123 +run.dataset_path_prefix = "/home/linjianma" diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/movielens_13b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/movielens_13b.gin new file mode 100644 index 000000000..3121ac0e7 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/movielens_13b.gin @@ -0,0 +1,16 @@ +run.model_path = "" +run.scenario_name = "Server" +run.batchsize = 5 +run.output_trace = False +run.data_producer_threads = 8 +run.compute_eval = False +run.find_peak_performance = False +run.train_split_percentage = 0.75 +run.sparse_quant = False + +# below will override mlperf rules compliant settings - don't use for official submission +run.target_qps = 5000 +run.num_queries = 30000 +run.numpy_rand_seed = 123 +run.dataset_path_prefix = "/home/linjianma" +run.dataset_percentage = 0.0625 diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_100b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_100b.gin new file mode 100644 index 000000000..0655734c2 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_100b.gin @@ -0,0 +1,15 @@ +# run.model_path = "/home/linjianma/ckpts/streaming_100b/89/" +run.scenario_name = "Server" +run.batchsize = 10 +run.output_trace = False +run.data_producer_threads = 16 +run.compute_eval = False +run.find_peak_performance = False +run.sparse_quant = False +run.numpy_rand_seed = 123 +run.dataset_path_prefix = "/home/linjianma" +run.dataset_percentage = 0.001 +run.warmup_ratio = 0.3 +run.num_queries = 20000 +# Needs to be tuned for different implementations to 
balance latency and throughput +run.target_qps = 1000 diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_400m.gin b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_400m.gin new file mode 100644 index 000000000..eed13e0ff --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_400m.gin @@ -0,0 +1,15 @@ +run.model_path = "" +run.scenario_name = "Server" +run.batchsize = 5 +run.output_trace = False +run.data_producer_threads = 8 +run.compute_eval = False +run.find_peak_performance = False +run.train_split_percentage = 0.75 +run.sparse_quant = False + +# below will override mlperf rules compliant settings - don't use for official submission +run.target_qps = 5000 +run.numpy_rand_seed = 123 +run.dataset_path_prefix = "/home/linjianma" +run.dataset_percentage = 0.00625 diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/inference_modules.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/inference_modules.py new file mode 100644 index 000000000..cb567df63 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/inference_modules.py @@ -0,0 +1,253 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pyre-unsafe +""" +Inference modules for DLRMv3. 
+ +This module provides inference-specific components for the HSTU model, +including sparse inference modules and utilities for moving tensors between devices. +""" + +from typing import Dict, Optional, Tuple + +import torch +import torchrec +from generative_recommenders.modules.dlrm_hstu import ( + DlrmHSTU, + DlrmHSTUConfig, + SequenceEmbedding, +) +from torchrec.modules.embedding_modules import ( + EmbeddingBagCollection, + EmbeddingCollection, +) +from torchrec.sparse.jagged_tensor import JaggedTensor, KeyedJaggedTensor +from torchrec.sparse.tensor_dict import maybe_td_to_kjt + + +IS_INFERENCE: bool = True + + +class _NoCopyEmbeddingCollection(torchrec.EmbeddingCollection): + """ + EmbeddingCollection variant that skips the dtype-cast copy in + ``EmbeddingCollection.forward`` and clamps indices into the hash-size + range. This is the script-mode replacement for the + ``functools.partial`` monkey-patch in + :func:`generative_recommenders.dlrm_v3.inference.model_family.ec_patched_forward_wo_embedding_copy`. + + The body mirrors that helper exactly so that the eager and scripted paths + produce the same embeddings. + """ + + def forward( + self, + features: KeyedJaggedTensor, + ) -> Dict[str, JaggedTensor]: + features = maybe_td_to_kjt(features, None) + feature_embeddings: Dict[str, JaggedTensor] = {} + jt_dict: Dict[str, JaggedTensor] = features.to_dict() + # Inline HASH_SIZE_1B - 1 as a literal so TorchScript can see it; the + # imported module-level constant is treated as an opaque "closed-over + # global" by jit.script and would fail with + # "python value of type 'int' cannot be used as a value". 
def get_hstu_model(
    table_config,
    hstu_config: DlrmHSTUConfig,
    table_device: str = "meta",
    max_hash_size: Optional[int] = None,
    is_dense: bool = False,
) -> DlrmHSTU:
    """
    Build a DlrmHSTU model in eval mode and materialize its embedding tables.

    Args:
        table_config: Mapping whose values are embedding table configs exposing
            a ``num_embeddings`` attribute. When ``max_hash_size`` is given the
            configs are updated in place.
        hstu_config: HSTU model configuration object.
        table_device: Device passed to ``to_empty`` for every embedding
            collection module (default ``"meta"``).
        max_hash_size: Optional upper bound applied to each table's
            ``num_embeddings``.
        is_dense: Forwarded unchanged to the ``DlrmHSTU`` constructor.

    Returns:
        The constructed DlrmHSTU model, in eval mode, with ``_use_triton_cc``
        set to False via ``recursive_setattr``.
    """
    if max_hash_size is not None:
        # Cap every table at max_hash_size rows; smaller tables are unchanged.
        for table in table_config.values():
            table.num_embeddings = min(table.num_embeddings, max_hash_size)

    hstu_model = DlrmHSTU(
        hstu_configs=hstu_config,
        embedding_tables=table_config,
        is_inference=IS_INFERENCE,
        is_dense=is_dense,
    )
    hstu_model.eval()
    hstu_model.recursive_setattr("_use_triton_cc", False)

    embedding_types = (EmbeddingBagCollection, EmbeddingCollection)
    for _name, submodule in hstu_model.named_modules():
        if not isinstance(submodule, embedding_types):
            continue
        submodule.to_empty(device=table_device)
        # to_empty leaves parameters uninitialized; fill the materialized
        # (non-meta) ones with small random values so downstream bf16 ops
        # don't produce NaN from uninitialized memory.
        for param in submodule.parameters():
            if param.is_meta:
                continue
            torch.nn.init.uniform_(param, -0.01, 0.01)
    return hstu_model
def move_sparse_output_to_device(
    seq_embeddings: Dict[str, SequenceEmbedding],
    payload_features: Dict[str, torch.Tensor],
    uih_seq_lengths: torch.Tensor,
    num_candidates: torch.Tensor,
    device: torch.device,
) -> Tuple[
    Dict[str, SequenceEmbedding],
    Dict[str, torch.Tensor],
    torch.Tensor,
    torch.Tensor,
]:
    """
    Transfer the sparse module outputs to ``device``.

    Sequence embeddings are rebuilt into a new dictionary whose embedding
    tensors are additionally cast to ``torch.bfloat16``. ``payload_features``
    is updated in place: each value is replaced by its copy on ``device`` and
    the same dictionary object is returned.

    Args:
        seq_embeddings: Dictionary of sequence embeddings to move.
        payload_features: Dictionary of payload features to move (mutated).
        uih_seq_lengths: UIH sequence lengths tensor to move.
        num_candidates: Number of candidates tensor to move.
        device: Target device (e.g., torch.device('cuda:0')).

    Returns:
        Tuple ``(seq_embeddings, payload_features, uih_seq_lengths,
        num_candidates)`` with every tensor on ``device``.
    """
    num_candidates = num_candidates.to(device)
    uih_seq_lengths = uih_seq_lengths.to(device)

    moved_embeddings: Dict[str, SequenceEmbedding] = {}
    for name in seq_embeddings.keys():
        source = seq_embeddings[name]
        moved_embeddings[name] = SequenceEmbedding(
            lengths=source.lengths.to(device),
            # Transfer first, then down-cast to bfloat16 on the target device.
            embedding=source.embedding.to(device).to(torch.bfloat16),
        )

    # Only existing keys are reassigned, so iterating while assigning is safe.
    for name, tensor in payload_features.items():
        payload_features[name] = tensor.to(device)

    return moved_embeddings, payload_features, uih_seq_lengths, num_candidates
def get_args():  # pyre-ignore [3]
    """
    Parse the command line.

    Only ``--dataset`` is recognized (default ``"debug"``, restricted to
    ``SUPPORTED_DATASETS``). Any other arguments are tolerated and reported
    through a warning log line.

    Returns:
        The argparse namespace holding the recognized arguments.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--dataset",
        default="debug",
        choices=SUPPORTED_DATASETS,
        help="dataset",
    )
    known_args, leftover_args = arg_parser.parse_known_args()
    logger.warning(f"unknown_args: {leftover_args}")
    return known_args
def run_one_item(self, qitem: QueryItem) -> None:
    """
    Run inference for one query item and report every query back to LoadGen.

    Calls ``self.model.predict`` on ``qitem.samples``, records the timing of
    the batch, then completes each id in ``qitem.query_ids`` through
    ``lg.QuerySamplesComplete``. The first row of the prediction tensor is
    split evenly across the query ids (``size(1) // len(query_ids)`` entries
    per query).

    Response payload per query (raw float32 bytes):
        - ``compute_eval`` False: the query's predictions.
        - ``compute_eval`` True: predictions, labels and weights, followed by
          the per-query candidate count as a single float32.

    If prediction fails (or, with ``compute_eval``, labels/weights are
    missing), the error is logged and every query is completed with an empty
    payload. Previously the ``finally`` block dereferenced names that were
    never bound in that case, so the worker raised and the queries were never
    completed.

    Args:
        qitem: Query item containing batch of samples to process.
    """
    # Pre-bind the outputs so the ``finally`` block can distinguish "no usable
    # prediction" from success instead of failing on unbound locals.
    mt_target_preds: Optional[torch.Tensor] = None
    mt_target_labels: Optional[torch.Tensor] = None
    mt_target_weights: Optional[torch.Tensor] = None
    try:
        t0_prediction: float = time.time()
        prediction_output = self.model.predict(qitem.samples)
        dt_prediction: float = time.time() - t0_prediction
        assert prediction_output is not None
        (
            mt_target_preds,
            mt_target_labels,
            mt_target_weights,
            dt_sparse,
            dt_dense,
        ) = prediction_output
        if self.compute_eval:
            assert mt_target_labels is not None
            assert mt_target_weights is not None
        self.result_timing.append(
            {
                "total": time.time() - qitem.start,
                "prediction": dt_prediction,
                "queue": qitem.dt_queue,
                "batching": qitem.dt_batching,
                "sparse": dt_sparse,
                "dense": dt_dense,
            }
        )
        self.result_batches.append(len(qitem.query_ids))
    except Exception as ex:  # pylint: disable=broad-except
        logger.error("thread: failed, %s", ex)
    finally:
        num_query_ids: int = len(qitem.query_ids)
        # Eval responses also need labels and weights; treat their absence
        # like a failed prediction.
        has_outputs: bool = mt_target_preds is not None and (
            not self.compute_eval
            or (mt_target_labels is not None and mt_target_weights is not None)
        )
        candidate_size: int = 0
        if has_outputs and num_query_ids > 0:
            candidate_size = (
                mt_target_preds.size(1) // num_query_ids  # pyre-ignore [16]
            )
        for i, query_id in enumerate(qitem.query_ids):
            if not has_outputs:
                # Fake an empty response so LoadGen still sees the query
                # as completed.
                payload: bytes = b""
            else:
                begin = candidate_size * i
                end = candidate_size * (i + 1)
                query_mt_target_preds = (
                    mt_target_preds[0, begin:end]  # pyre-ignore [16]
                    .view(-1)
                    .float()
                    .numpy()
                )
                if not self.compute_eval:
                    payload = query_mt_target_preds.tobytes()
                else:
                    query_mt_target_labels = (
                        mt_target_labels[0, begin:end]  # pyre-ignore [16]
                        .view(-1)
                        .float()
                        .numpy()
                    )
                    query_mt_target_weights = (
                        mt_target_weights[0, begin:end]  # pyre-ignore [16]
                        .view(-1)
                        .float()
                        .numpy()
                    )
                    payload = np.concatenate(
                        [
                            query_mt_target_preds,
                            query_mt_target_labels,
                            query_mt_target_weights,
                            np.array([candidate_size]).astype(np.float32),
                        ]
                    ).tobytes()
            response_array = array.array("B", payload)
            bi = response_array.buffer_info()
            # since we send buffer to loadgen, needs `response_array` in memory during send
            lg.QuerySamplesComplete(
                [lg.QuerySampleResponse(query_id, bi[0], bi[1])]
            )
def get_num_queries(
    input_size: Optional[int],
    one_pass_size: int,
    scenario_name: str,
    offline_target_qps: int,
    target_duration: float,
) -> int:
    """
    Pick how many queries the benchmark run should issue.

    For every scenario other than ``"Offline"`` the user-supplied
    ``input_size`` wins, falling back to ``one_pass_size`` when it is None.
    For ``"Offline"`` the count is derived from the target QPS and duration
    with a 10% margin, mirroring LoadGen's own computation.

    Args:
        input_size: User-specified query count (None to use defaults).
        one_pass_size: Size of one complete pass through the dataset.
        scenario_name: MLPerf scenario name ('Server' or 'Offline').
        offline_target_qps: Target QPS for offline scenario.
        target_duration: Target duration in milliseconds.

    Returns:
        Number of queries to execute in the benchmark run.
    """
    if scenario_name != "Offline":
        return one_pass_size if input_size is None else input_size
    # consistent with https://github.com/mlcommons/inference/blob/8999c4d686f6e4a180da14597c97063fce7c9f33/loadgen/test_settings_internal.cc#L147
    return int(1.1 * target_duration / 1000 * offline_target_qps)
def build_random_exec_order(self) -> List[List[int]]:
    """
    Create a shuffled execution order for every timestamp.

    For each entry ``n`` of ``self.num_unique_requests`` the indices
    ``0..n-1`` are shuffled with the module-level ``random`` generator, in
    timestamp order.

    Returns:
        List of shuffled index lists, one per timestamp.
    """

    def _shuffled_indices(request_count: int) -> List[int]:
        # Fresh index list per timestamp, shuffled in place.
        indices = list(range(request_count))
        random.shuffle(indices)
        return indices

    return [_shuffled_indices(count) for count in self.num_unique_requests]
def get_samples(self, id_list: List[int]) -> List[Samples]:
    """
    Get samples for a batch of queries, handling timestamp boundaries.

    Only the length of ``id_list`` is used: the requests served are chosen by
    the sampler's own cursor (``self.ts_idx`` / ``self.ts_processed_cnt``) and
    ``self.run_order``, not by the ids themselves. The cursor is read and
    advanced while holding ``self._lock``.

    Args:
        id_list: List of query identifiers.

    Returns:
        List of Samples objects, potentially spanning multiple timestamps.
    """
    batch_size: int = len(id_list)
    with self._lock:
        curr_ts_idx: int = self.ts_idx
        curr_ts_unique_requests: int = self.num_unique_requests[curr_ts_idx]
        # Each timestamp serves its unique requests `num_repeats` times; the
        # last inference timestamp also absorbs the leftover queries.
        curr_ts_queries: int = curr_ts_unique_requests * self.num_repeats
        if curr_ts_idx == self.inference_ts - 1:
            curr_ts_queries += self.remaining_queries
        begin_query_idx: int = self.ts_processed_cnt
        end_query_idx: int = min(begin_query_idx + batch_size, curr_ts_queries)
        # Map query positions onto positions within one pass of the unique
        # requests.
        # NOTE(review): raises ZeroDivisionError if this timestamp has zero
        # unique requests -- confirm that cannot happen.
        begin_request_idx: int = begin_query_idx % curr_ts_unique_requests
        end_request_idx: int = end_query_idx % curr_ts_unique_requests
        if begin_query_idx + batch_size >= curr_ts_queries:
            # Batch reaches the end of this timestamp: advance the cursor and
            # carry the overflow count into the next timestamp.
            self.ts_idx += 1
            self.ts_processed_cnt = begin_query_idx + batch_size - curr_ts_queries
        else:
            self.ts_processed_cnt = begin_query_idx + batch_size
    # requests of current ts
    outputs: List[Samples] = []
    if end_request_idx > begin_request_idx:
        # Contiguous slice within a single pass.
        # NOTE(review): a batch longer than one full pass would also land
        # here and yield fewer requests than batch_size -- verify batch sizes
        # stay below the per-timestamp request count.
        output: Samples = self.ds.get_samples_with_ts(
            self.run_order[curr_ts_idx][begin_request_idx:end_request_idx],
            curr_ts_idx + self.start_ts,
        )
        outputs.append(output)
    else:
        # Wrap-around: tail of the current pass, then the head of the next.
        # The first condition always holds because begin_request_idx is a
        # remainder modulo curr_ts_unique_requests.
        if begin_request_idx < curr_ts_unique_requests:
            output: Samples = self.ds.get_samples_with_ts(
                self.run_order[curr_ts_idx][begin_request_idx:],
                curr_ts_idx + self.start_ts,
            )
            outputs.append(output)
        if end_request_idx > 0:
            output = self.ds.get_samples_with_ts(
                self.run_order[curr_ts_idx][0:end_request_idx],
                curr_ts_idx + self.start_ts,
            )
            outputs.append(output)
    # requests of next ts
    if begin_query_idx + batch_size > curr_ts_queries:
        # NOTE(review): indexes run_order[curr_ts_idx + 1]; presumably never
        # reached on the last timestamp -- confirm, otherwise IndexError.
        output: Samples = self.ds.get_samples_with_ts(
            self.run_order[curr_ts_idx + 1][
                : begin_query_idx + batch_size - curr_ts_queries
            ],
            curr_ts_idx + 1 + self.start_ts,
        )
        outputs.append(output)
    return outputs
+ compute_eval: Whether to compute accuracy metrics. + find_peak_performance: Whether to run peak performance finding mode. + dataset_path_prefix: Prefix path for dataset files. + warmup_ratio: Fraction of data to use for warmup. + target_qps: Target queries per second. + num_queries: Number of queries to run (None for automatic). + numpy_rand_seed: Random seed for reproducibility. + sparse_quant: Whether to quantize sparse embeddings. + dataset_percentage: Fraction of dataset to use. + """ + set_dev_mode(False) + if scenario_name not in SCENARIO_MAP: + raise NotImplementedError("valid scanarios:" + str(list(SCENARIO_MAP.keys()))) + scenario = SCENARIO_MAP[scenario_name] + np.random.seed(numpy_rand_seed) + random.seed(numpy_rand_seed) + + hstu_config = get_hstu_configs(dataset) + hstu_config.max_num_candidates = hstu_config.max_num_candidates_inference + table_config = get_embedding_table_config(dataset) + set_is_inference(is_inference=not compute_eval) + + user_conf = os.path.abspath(USER_CONF) + if not os.path.exists(user_conf): + logger.error("{} not found".format(user_conf)) + sys.exit(1) + + settings = lg.TestSettings() + settings.FromConfig(user_conf, model_path, scenario_name) + settings.scenario = scenario + settings.mode = lg.TestMode.PerformanceOnly + if compute_eval: + settings.mode = lg.TestMode.AccuracyOnly + if find_peak_performance: + settings.mode = lg.TestMode.FindPeakPerformance + if target_qps: + settings.server_target_qps = float(target_qps) + settings.offline_expected_qps = float(target_qps) + + model_family = HSTUModelFamily( + hstu_config=hstu_config, + table_config=table_config, + sparse_quant=sparse_quant, + output_trace=output_trace, + compute_eval=compute_eval, + ) + is_streaming: bool = "streaming" in dataset + dataset, kwargs = get_dataset(dataset, dataset_path_prefix) + + ds: Dataset = dataset( + hstu_config=hstu_config, + embedding_config=table_config, + is_inference=not compute_eval, + **kwargs, + ) + if is_streaming: + ds = 
StreamingQuerySampler( # pyre-ignore + ds=ds, # pyre-ignore [6] + dataset_percentage=dataset_percentage, + input_queries=num_queries, + compute_eval=compute_eval, + scenario_name=scenario_name, + offline_target_qps=settings.offline_expected_qps, + target_duration=settings.min_duration_ms, + ) + model_family.load(model_path) + + # warmup + for autotune_bs in range(batchsize, 0, -1): + logger.warning(f"Autotune for batch size {autotune_bs}") + warmup_ids = list(range(autotune_bs)) + ds.load_query_samples(warmup_ids) + for _ in range(4 * int(os.environ.get("WORLD_SIZE", 1))): + if is_streaming: + ds.init_sut() # pyre-ignore [16] + sample: Union[Samples, List[Samples]] = ds.get_samples(warmup_ids) + if isinstance(sample, Samples): + model_family.predict(sample) + else: + for s in sample: + model_family.predict(s) + ds.unload_query_samples(None) + for h in logger.handlers: + h.flush() + logger.info("Model forward warmup done") + + count = int( + ds.get_item_count() * dataset_percentage + if not is_streaming + else ds.get_item_count() + ) + train_size: int = round(train_split_percentage * count) if not is_streaming else 0 + if compute_eval: + count = count - train_size + + runner: Runner = Runner( + model_family, + ds, + data_producer_threads=data_producer_threads, + batchsize=batchsize, + compute_eval=compute_eval, + num_queries=count, + ) + + def issue_queries(query_samples) -> None: # pyre-ignore [2] + if compute_eval: + for sample in query_samples: + sample.index = sample.index + train_size + runner.enqueue(query_samples, time.time()) + + def load_query_samples(query_ids: List[int]) -> None: + if compute_eval: + query_ids = [q + train_size for q in query_ids] + ds.load_query_samples(query_ids) + + def flush_queries() -> None: + pass + + if scenario == lg.TestScenario.Server: + # inference benchmark warmup + if is_streaming: + ds.init_sut() + warmup_count: int = sum( + ds.get_num_unique_requests( # pyre-ignore [16] + warmup_ratio=warmup_ratio + ) + ) + else: + 
warmup_count: int = int(count * warmup_ratio) + runner.reset_states(num_queries=warmup_count) + final_results = { + "runtime": model_family.name(), + "version": model_family.version(), + "time": int(time.time()), + "scenario": str(scenario), + } + settings.min_query_count = warmup_count + settings.max_query_count = warmup_count + sut = lg.ConstructSUT(issue_queries, flush_queries) + qsl = lg.ConstructQSL( + warmup_count, + warmup_count, + load_query_samples, + ds.unload_query_samples, + ) + with profiler_or_nullcontext(enabled=output_trace, with_stack=False): + logger.info(f"starting warmup {scenario} with {warmup_count} queries") + lg.StartTest(sut, qsl, settings) + lg.DestroyQSL(qsl) + lg.DestroySUT(sut) + + # official run + if is_streaming: + ds.init_sut() + final_results = { + "runtime": model_family.name(), + "version": model_family.version(), + "time": int(time.time()), + "scenario": str(scenario), + } + query_size: int = get_num_queries( + input_size=num_queries, + one_pass_size=count, + scenario_name=scenario_name, + offline_target_qps=settings.offline_expected_qps, + target_duration=settings.min_duration_ms, + ) + settings.min_query_count = query_size + settings.max_query_count = query_size + runner.reset_states(num_queries=query_size if not compute_eval else count) + sut = lg.ConstructSUT(issue_queries, flush_queries) + qsl = lg.ConstructQSL( + count, + count, + load_query_samples, + ds.unload_query_samples, + ) + with profiler_or_nullcontext(enabled=output_trace, with_stack=False): + logger.info( + f"starting {scenario} with {query_size} queries and {query_size // count} repeats" + ) + lg.StartTest(sut, qsl, settings) + runner.finish() + final_results["took"] = time.time() - ds.last_loaded + lg.DestroyQSL(qsl) + lg.DestroySUT(sut) + + add_results( + final_results, + runner.result_timing, + runner.result_batches, + ) + # If multiple subprocesses are running the model send a signal to stop them + if int(os.environ.get("WORLD_SIZE", 1)) > 1: + 
def main() -> None:
    """
    Command-line entry point.

    Sets the verbose level, parses the command line, loads the gin config
    file mapped to the selected dataset in ``SUPPORTED_CONFIGS`` (looked up
    under the ``gin`` directory next to this file), and starts the benchmark
    through ``run``.
    """
    set_verbose_level(1)
    cli_args = get_args()
    logger.info(cli_args)
    module_dir = os.path.dirname(__file__)
    config_file = SUPPORTED_CONFIGS[cli_args.dataset]
    gin_path = f"{module_dir}/gin/{config_file}"
    gin.parse_config_file(gin_path)
    run(dataset=cli_args.dataset)
+*.*.test05_qsl_rng_seed = 16799458546791641818 +*.*.test05_sample_index_rng_seed = 5453809927556429288 +*.*.test05_schedule_rng_seed = 5435552105434836064 + + +*.SingleStream.target_latency_percentile = 90 +*.SingleStream.min_duration = 600000 + +*.MultiStream.target_latency_percentile = 99 +*.MultiStream.samples_per_query = 8 +*.MultiStream.min_duration = 600000 +*.MultiStream.min_query_count = 662 +retinanet.MultiStream.target_latency = 528 + +# 3D-UNet uses equal issue mode because it has non-uniform inputs +3d-unet.*.sample_concatenate_permutation = 1 + +# LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenario +gptj.*.sample_concatenate_permutation = 1 +llama2-70b.*.sample_concatenate_permutation = 1 +mixtral-8x7b.*.sample_concatenate_permutation = 1 + +*.Server.target_latency = 10 +*.Server.target_latency_percentile = 99 +*.Server.target_duration = 0 +*.Server.min_duration = 600000 +resnet50.Server.target_latency = 15 +retinanet.Server.target_latency = 100 +bert.Server.target_latency = 130 +dlrm.Server.target_latency = 60 +dlrm-v2.Server.target_latency = 60 +rnnt.Server.target_latency = 1000 +gptj.Server.target_latency = 20000 +stable-diffusion-xl.Server.target_latency = 20000 +# Llama2-70b benchmarks measures token latencies +llama2-70b.*.use_token_latencies = 1 +mixtral-8x7b.*.use_token_latencies = 1 +# gptj benchmark infers token latencies +gptj.*.infer_token_latencies = 1 +gptj.*.token_latency_scaling_factor = 69 +# Only ttft and tpot are tracked for the llama2-70b & mixtral-8x7B benchmark therefore target_latency = 0 +llama2-70b.Server.target_latency = 0 +llama2-70b.Server.ttft_latency = 2000 +llama2-70b.Server.tpot_latency = 200 + +mixtral-8x7b.Server.target_latency = 0 +mixtral-8x7b.Server.ttft_latency = 2000 +mixtral-8x7b.Server.tpot_latency = 200 + +*.Offline.target_latency_percentile = 90 +*.Offline.min_duration = 600000 + +# In Offline scenario, we always have one query. 
But LoadGen maps this to +# min_sample_count internally in Offline scenario. If the dataset size is larger +# than 24576 we limit the min_query_count to 24576 and otherwise we use +# the dataset size as the limit + +resnet50.Offline.min_query_count = 24576 +retinanet.Offline.min_query_count = 24576 +dlrm-v2.Offline.min_query_count = 24576 +bert.Offline.min_query_count = 10833 +gptj.Offline.min_query_count = 13368 +rnnt.Offline.min_query_count = 2513 +3d-unet.Offline.min_query_count = 43 +stable-diffusion-xl.Offline.min_query_count = 5000 +llama2-70b.Offline.min_query_count = 24576 +mixtral-8x7b.Offline.min_query_count = 15000 + +# These fields should be defined and overridden by user.conf. +*.SingleStream.target_latency = 10 +*.MultiStream.target_latency = 80 +*.Server.target_qps = 1.0 +*.Offline.target_qps = 1.0 diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/model_family.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/model_family.py new file mode 100644 index 000000000..1c8bcd237 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/model_family.py @@ -0,0 +1,705 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pyre-strict +""" +model_family for dlrm_v3. 
+""" + +import copy +import functools +import logging +import os +import time +import uuid +from threading import Event +from typing import Dict, List, Optional, Tuple, Union + +import torch +import torch.multiprocessing as mp +import torchrec +from generative_recommenders.dlrm_v3.checkpoint import ( + load_nonsparse_checkpoint, + load_sparse_checkpoint, +) +from generative_recommenders.dlrm_v3.configs import HASH_SIZE_1B +from generative_recommenders.dlrm_v3.datasets.dataset import Samples +from generative_recommenders.dlrm_v3.inference.inference_modules import ( + get_hstu_model, + HSTUSparseInferenceModule, + move_sparse_output_to_device, + set_is_inference, +) +from generative_recommenders.dlrm_v3.utils import Profiler +from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig, SequenceEmbedding +from pyre_extensions import none_throws +from torch import quantization as quant +from torchrec.distributed.quant_embedding import QuantEmbeddingCollection +from torchrec.modules.embedding_configs import EmbeddingConfig, QuantConfig +from torchrec.sparse.jagged_tensor import JaggedTensor, KeyedJaggedTensor +from torchrec.sparse.tensor_dict import maybe_td_to_kjt +from torchrec.test_utils import get_free_port + +logger: logging.Logger = logging.getLogger(__name__) + + +class HSTUModelFamily: + """ + High-level interface for the HSTU model family. + + Manages both sparse (embedding) and dense (transformer) components of the + HSTU model, supporting distributed inference across multiple GPUs. + + Args: + hstu_config: Configuration object for the HSTU model. + table_config: Dictionary of embedding table configurations. + output_trace: Whether to enable profiling trace output. + sparse_quant: Whether to quantize sparse embeddings. + compute_eval: Whether to compute evaluation metrics (includes labels). 
+ """ + + def __init__( + self, + hstu_config: DlrmHSTUConfig, + table_config: Dict[str, EmbeddingConfig], + output_trace: bool = False, + sparse_quant: bool = False, + compute_eval: bool = False, + ) -> None: + self.hstu_config = hstu_config + self.table_config = table_config + self.sparse: ModelFamilySparseDist = ModelFamilySparseDist( + hstu_config=hstu_config, + table_config=table_config, + quant=sparse_quant, + ) + + assert torch.cuda.is_available(), "CUDA is required for this benchmark." + ngpus = torch.cuda.device_count() + self.world_size = int(os.environ.get("WORLD_SIZE", str(ngpus))) + logger.warning(f"Using {self.world_size} GPU(s)...") + dense_model_family_clazz = ( + ModelFamilyDenseDist + if self.world_size > 1 + else ModelFamilyDenseSingleWorker + ) + self.dense: Union[ModelFamilyDenseDist, ModelFamilyDenseSingleWorker] = ( + dense_model_family_clazz( + hstu_config=hstu_config, + table_config=table_config, + output_trace=output_trace, + compute_eval=compute_eval, + ) + ) + + def version(self) -> str: + """Return the PyTorch version string.""" + return torch.__version__ + + def name(self) -> str: + """Return the model family name identifier.""" + return "model-family-hstu" + + def load(self, model_path: str) -> None: + """ + Load model checkpoints from disk. + + Args: + model_path: Base path to the model checkpoint directory. + """ + self.sparse.load(model_path=model_path) + self.dense.load(model_path=model_path) + + def predict( + self, samples: Optional[Samples] + ) -> Optional[ + Tuple[ + torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], float, float + ] + ]: + """ + Run inference on a batch of samples. + + Processes samples through sparse embeddings, then dense forward pass. + + Args: + samples: Input samples containing features. If None, signals shutdown. + + Returns: + Tuple of (predictions, labels, weights, sparse_time, dense_time) or None. 
+ """ + with torch.no_grad(): + # A None batch is the shutdown signal: forward it to the dense side and stop. + if samples is None: + self.dense.predict(None, None, 0, None, 0, None) + return None + # Sparse stage: returns the dense-stage inputs plus its own elapsed time (dt_sparse). + ( + seq_embeddings, + payload_features, + max_uih_len, + uih_seq_lengths, + max_num_candidates, + num_candidates, + dt_sparse, + ) = self.sparse.predict(samples) + # Dense stage: consumes the sparse outputs; its result tuple ends with dt_dense. + out = self.dense.predict( + seq_embeddings, + payload_features, + max_uih_len, + uih_seq_lengths, + max_num_candidates, + num_candidates, + ) + ( # pyre-ignore [23] + mt_target_preds, + mt_target_labels, + mt_target_weights, + dt_dense, + ) = out + return ( + mt_target_preds, + mt_target_labels, + mt_target_weights, + dt_sparse, + dt_dense, + ) + + +def ec_patched_forward_wo_embedding_copy( + ec_module: torchrec.EmbeddingCollection, + features: KeyedJaggedTensor, # can also take TensorDict as input +) -> Dict[str, JaggedTensor]: + """ + Run the EmbeddingCollection forward pass. This method takes in a `KeyedJaggedTensor` + and returns a `Dict[str, JaggedTensor]`, which is the result of the individual embeddings for each feature. + + Args: + ec_module (torchrec.EmbeddingCollection): collection whose embedding tables are looked up. + features (KeyedJaggedTensor): KJT of form [F X B X L]. 
+ + Returns: + Dict[str, JaggedTensor] + """ + features = maybe_td_to_kjt(features, None) + feature_embeddings: Dict[str, JaggedTensor] = {} + jt_dict: Dict[str, JaggedTensor] = features.to_dict() + for i, emb_module in enumerate(ec_module.embeddings.values()): + feature_names = ec_module._feature_names[i] + embedding_names = ec_module._embedding_names_by_table[i] + for j, embedding_name in enumerate(embedding_names): + feature_name = feature_names[j] + f = jt_dict[feature_name] + indices = torch.clamp(f.values(), min=0, max=HASH_SIZE_1B - 1) + lookup = emb_module( + input=indices + ) # remove the dtype cast at https://github.com/meta-pytorch/torchrec/blob/0a2cebd5472a7edc5072b3c912ad8aaa4179b9d9/torchrec/modules/embedding_modules.py#L486 + feature_embeddings[embedding_name] = JaggedTensor( + values=lookup, + lengths=f.lengths(), + weights=f.values() if ec_module._need_indices else None, + ) + return feature_embeddings + + +class ModelFamilySparseDist: + """ + Sparse Arch module manager. + + Handles loading and inference of sparse embedding lookups, optionally + with quantization for memory efficiency. + + Args: + hstu_config: HSTU model configuration. + table_config: Embedding table configurations. + quant: Whether to apply dynamic quantization to embeddings. + """ + + def __init__( + self, + hstu_config: DlrmHSTUConfig, + table_config: Dict[str, EmbeddingConfig], + quant: bool = False, + ) -> None: + super(ModelFamilySparseDist, self).__init__() + self.hstu_config = hstu_config + self.table_config = table_config + self.module: Optional[torch.nn.Module] = None + self.quant: bool = quant + + def load(self, model_path: str) -> None: + """ + Load sparse model checkpoint and optionally apply quantization. + + Args: + model_path: Path to the model checkpoint directory. 
+ """ + logger.warning(f"Loading sparse module from {model_path}") + + sparse_arch: HSTUSparseInferenceModule = HSTUSparseInferenceModule( + table_config=self.table_config, + hstu_config=self.hstu_config, + ) + load_sparse_checkpoint(model=sparse_arch._hstu_model, path=model_path) + sparse_arch.eval() + if self.quant: + self.module = quant.quantize_dynamic( + sparse_arch, + qconfig_spec={ + torchrec.EmbeddingCollection: QuantConfig( + activation=quant.PlaceholderObserver.with_args( + dtype=torch.float + ), + weight=quant.PlaceholderObserver.with_args(dtype=torch.int8), + ), + }, + mapping={ + torchrec.EmbeddingCollection: QuantEmbeddingCollection, + }, + inplace=False, + ) + else: + sparse_arch._hstu_model._embedding_collection.forward = ( # pyre-ignore[8] + functools.partial( + ec_patched_forward_wo_embedding_copy, + sparse_arch._hstu_model._embedding_collection, + ) + ) + self.module = sparse_arch + logger.warning(f"sparse module is {self.module}") + + def predict( + self, samples: Samples + ) -> Tuple[ + Dict[str, SequenceEmbedding], + Dict[str, torch.Tensor], + int, + torch.Tensor, + int, + torch.Tensor, + float, + ]: + """ + Run sparse forward pass (embedding lookups). + + Args: + samples: Input samples with feature tensors. + + Returns: + Tuple of (seq_embeddings, payload_features, max_uih_len, uih_seq_lengths, + max_num_candidates, num_candidates, elapsed_time). 
+ """ + with torch.profiler.record_function("sparse forward"): + module: torch.nn.Module = none_throws(self.module) + assert self.module is not None + uih_features = samples.uih_features_kjt + candidates_features = samples.candidates_features_kjt + t0: float = time.time() + ( + seq_embeddings, + payload_features, + max_uih_len, + uih_seq_lengths, + max_num_candidates, + num_candidates, + ) = module( + uih_features=uih_features, + candidates_features=candidates_features, + ) + dt_sparse: float = time.time() - t0 + return ( + seq_embeddings, + payload_features, + max_uih_len, + uih_seq_lengths, + max_num_candidates, + num_candidates, + dt_sparse, + ) + + +class ModelFamilyDenseDist: + """ + Distributed dense module manager for multi-GPU inference. + + Spawns worker processes for each GPU to run dense forward passes in parallel, + with samples distributed via inter-process queues. + + Args: + hstu_config: HSTU model configuration. + table_config: Embedding table configurations. + output_trace: Whether to enable profiling traces. + compute_eval: Whether to compute evaluation metrics. 
+ """ + + def __init__( + self, + hstu_config: DlrmHSTUConfig, + table_config: Dict[str, EmbeddingConfig], + output_trace: bool = False, + compute_eval: bool = False, + ) -> None: + super(ModelFamilyDenseDist, self).__init__() + self.hstu_config = hstu_config + self.table_config = table_config + self.output_trace = output_trace + self.compute_eval = compute_eval + + ngpus = torch.cuda.device_count() + self.world_size = int(os.environ.get("WORLD_SIZE", str(ngpus))) + self.rank = 0 + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = str(get_free_port()) + self.dist_backend = "nccl" + + ctx = mp.get_context("spawn") + self.samples_q: List[mp.Queue] = [ctx.Queue() for _ in range(self.world_size)] + self.result_q: List[mp.Queue] = [ctx.Queue() for _ in range(self.world_size)] + + def load(self, model_path: str) -> None: + """ + Load dense model and spawn worker processes for distributed inference. + + Args: + model_path: Path to the model checkpoint directory. + """ + logger.warning(f"Loading dense module from {model_path}") + + ctx = mp.get_context("spawn") + processes = [] + for rank in range(self.world_size): + p = ctx.Process( + target=self.distributed_setup, + args=( + rank, + self.world_size, + model_path, + ), + ) + p.start() + processes.append(p) + + def distributed_setup(self, rank: int, world_size: int, model_path: str) -> None: + """ + Initialize and run a dense worker process. + + Each worker loads the model, processes samples from its queue, and + returns results. + + Args: + rank: Process rank (GPU index). + world_size: Total number of worker processes. + model_path: Path to model checkpoint. 
+ """ + nprocs_per_rank = 16 + start_core: int = nprocs_per_rank * rank + cores: set[int] = set([start_core + i for i in range(nprocs_per_rank)]) + os.sched_setaffinity(0, cores) + set_is_inference(is_inference=not self.compute_eval) + model = get_hstu_model( + table_config=self.table_config, + hstu_config=self.hstu_config, + table_device="cpu", + max_hash_size=100, + is_dense=True, + ).to(torch.bfloat16) + model.set_training_dtype(torch.bfloat16) + device = torch.device(f"cuda:{rank}") + torch.cuda.set_device(f"cuda:{rank}") + load_nonsparse_checkpoint( + model=model, device=device, optimizer=None, path=model_path + ) + model = model.to(device) + model.eval() + profiler = Profiler(rank) if self.output_trace else None + + with torch.no_grad(): + while True: + item = self.samples_q[rank].get() + # If -1 is received terminate all subprocesses + if item == -1: + break + if self.output_trace: + assert profiler is not None + profiler.step() + with torch.profiler.record_function("get_item_from_queue"): + # Copy here to release data in the producer to avoid invalid cuda caching allocator release. 
+ item = copy.deepcopy(item) + ( + id, + seq_embeddings, + payload_features, + max_uih_len, + uih_seq_lengths, + max_num_candidates, + num_candidates, + ) = item + assert seq_embeddings is not None + with torch.profiler.record_function("dense forward"): + ( + _, + _, + _, + mt_target_preds, + mt_target_labels, + mt_target_weights, + ) = model.main_forward( + seq_embeddings=seq_embeddings, + payload_features=payload_features, + max_uih_len=max_uih_len, + uih_seq_lengths=uih_seq_lengths, + max_num_candidates=max_num_candidates, + num_candidates=num_candidates, + ) + # mt_target_preds = torch.empty(1, 2048 * 20).to(device="cpu") + # mt_target_labels = None + # mt_target_weights = None + assert mt_target_preds is not None + mt_target_preds = mt_target_preds.detach().to(device="cpu") + if mt_target_labels is not None: + mt_target_labels = mt_target_labels.detach().to(device="cpu") + if mt_target_weights is not None: + mt_target_weights = mt_target_weights.detach().to(device="cpu") + self.result_q[rank].put( + (id, mt_target_preds, mt_target_labels, mt_target_weights) + ) + + def capture_output( + self, id: uuid.UUID, rank: int + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Retrieve inference results from a worker process. + + Blocks on the result queue of the given rank until one result tuple arrives. + + Args: + id: Unique identifier for the request. + rank: Worker rank to retrieve from. + + Returns: + Tuple of (predictions, labels, weights). + + Raises: + AssertionError: If the received id does not match the requested id. + """ + # Exactly one result is read per request, so no retry loop is needed. + recv_id, preds, labels, weights = self.result_q[rank].get() + assert recv_id == id + return preds, labels, weights + + def get_rank(self) -> int: + """ + Get the next worker rank for load balancing. + + Returns: + Rank index, cycling through available workers. 
+ """ + rank = self.rank + self.rank = (self.rank + 1) % self.world_size + return rank + + def predict( + self, + seq_embeddings: Optional[Dict[str, SequenceEmbedding]], + payload_features: Optional[Dict[str, torch.Tensor]], + max_uih_len: int, + uih_seq_lengths: Optional[torch.Tensor], + max_num_candidates: int, + num_candidates: Optional[torch.Tensor], + ) -> Optional[ + Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], float] + ]: + """ + Run distributed dense forward pass. + + Dispatches work to a worker process and collects results. + + Args: + seq_embeddings: Sequence embeddings from sparse module. + payload_features: Additional feature tensors. + max_uih_len: Maximum UIH sequence length. + uih_seq_lengths: Per-sample UIH lengths. + max_num_candidates: Maximum candidates per sample. + num_candidates: Per-sample candidate counts. + + Returns: + Tuple of (predictions, labels, weights, elapsed_time) or None if shutdown. + """ + id = uuid.uuid4() + # If none is received terminate all subprocesses + if seq_embeddings is None: + for rank in range(self.world_size): + self.samples_q[rank].put(-1) + return None + rank = self.get_rank() + device = torch.device(f"cuda:{rank}") + assert ( + payload_features is not None + and num_candidates is not None + and uih_seq_lengths is not None + ) + t0: float = time.time() + seq_embeddings, payload_features, uih_seq_lengths, num_candidates = ( + move_sparse_output_to_device( + seq_embeddings=seq_embeddings, + payload_features=payload_features, + uih_seq_lengths=uih_seq_lengths, + num_candidates=num_candidates, + device=device, + ) + ) + self.samples_q[rank].put( + ( + id, + seq_embeddings, + payload_features, + max_uih_len, + uih_seq_lengths, + max_num_candidates, + num_candidates, + ) + ) + (mt_target_preds, mt_target_labels, mt_target_weights) = self.capture_output( + id, rank + ) + dt_dense = time.time() - t0 + return ( + mt_target_preds, + mt_target_labels, + mt_target_weights, + dt_dense, + ) + + +class 
ModelFamilyDenseSingleWorker: + """ + Single-worker dense module manager for single-GPU inference. + + Simpler alternative to ModelFamilyDenseDist for single-GPU setups. The model + runs in the calling process on device "cuda:0"; this class creates no worker + processes or queues. + + Args: + hstu_config: HSTU model configuration. + table_config: Embedding table configurations. + output_trace: Whether to enable profiling traces. + compute_eval: Whether to compute evaluation metrics. + NOTE(review): accepted but never stored or read in this class, whereas + ModelFamilyDenseDist passes it to set_is_inference -- confirm this is intended. + """ + + def __init__( + self, + hstu_config: DlrmHSTUConfig, + table_config: Dict[str, EmbeddingConfig], + output_trace: bool = False, + compute_eval: bool = False, + ) -> None: + # The model is only built by load(); until then it stays None. + self.model: Optional[torch.nn.Module] = None + self.hstu_config = hstu_config + self.table_config = table_config + self.output_trace = output_trace + # Always binds to the first GPU and makes it the current CUDA device. + self.device: torch.device = torch.device("cuda:0") + torch.cuda.set_device(self.device) + # A Profiler is only constructed when trace output was requested. + self.profiler: Optional[Profiler] = ( + Profiler(rank=0) if self.output_trace else None + ) + + def load(self, model_path: str) -> None: + """ + Load dense model for single-GPU inference. + + Builds the dense HSTU model, moves it to self.device, casts it to bfloat16, + restores the non-sparse checkpoint from model_path and switches to eval mode. + + Args: + model_path: Path to the model checkpoint directory. + """ + logger.warning(f"Loading dense module from {model_path}") + self.model = ( + get_hstu_model( + table_config=self.table_config, + hstu_config=self.hstu_config, + table_device="cpu", + is_dense=True, + ) + .to(self.device) + .to(torch.bfloat16) + ) + self.model.set_training_dtype(torch.bfloat16) + load_nonsparse_checkpoint( + model=self.model, device=self.device, optimizer=None, path=model_path + ) + # Narrows the Optional type of self.model for the type checker. + assert self.model is not None + self.model.eval() + + def predict( + self, + seq_embeddings: Optional[Dict[str, SequenceEmbedding]], + payload_features: Optional[Dict[str, torch.Tensor]], + max_uih_len: int, + uih_seq_lengths: Optional[torch.Tensor], + max_num_candidates: int, + num_candidates: Optional[torch.Tensor], + ) -> Optional[ + Tuple[ + torch.Tensor, + Optional[torch.Tensor], + Optional[torch.Tensor], + float, + ] + ]: + """ + Run dense forward pass on single GPU. 
+ + Args: + seq_embeddings: Sequence embeddings from sparse module. None is the shutdown signal. + payload_features: Additional feature tensors. + max_uih_len: Maximum UIH sequence length. + uih_seq_lengths: Per-sample UIH lengths. + max_num_candidates: Maximum candidates per sample. + num_candidates: Per-sample candidate counts. + + Returns: + Tuple of (predictions, labels, weights, elapsed_time), or None when + seq_embeddings is None (shutdown signal). + """ + # HSTUModelFamily.predict(None) forwards an all-None call as the shutdown signal. + # There are no worker processes to stop here, so return instead of tripping the assert below. + if seq_embeddings is None: + return None + if self.output_trace: + assert self.profiler is not None + self.profiler.step() + assert ( + payload_features is not None + and uih_seq_lengths is not None + and num_candidates is not None + and seq_embeddings is not None + ) + t0: float = time.time() + with torch.profiler.record_function("dense forward"): + seq_embeddings, payload_features, uih_seq_lengths, num_candidates = ( + move_sparse_output_to_device( + seq_embeddings=seq_embeddings, + payload_features=payload_features, + uih_seq_lengths=uih_seq_lengths, + num_candidates=num_candidates, + device=self.device, + ) + ) + assert self.model is not None + ( + _, + _, + _, + mt_target_preds, + mt_target_labels, + mt_target_weights, + ) = self.model.main_forward( # pyre-ignore [29] + seq_embeddings=seq_embeddings, + payload_features=payload_features, + max_uih_len=max_uih_len, + uih_seq_lengths=uih_seq_lengths, + max_num_candidates=max_num_candidates, + num_candidates=num_candidates, + ) + assert mt_target_preds is not None + # Results are detached and copied to host memory; the timer below includes these copies. + mt_target_preds = mt_target_preds.detach().to(device="cpu") + if mt_target_labels is not None: + mt_target_labels = mt_target_labels.detach().to(device="cpu") + if mt_target_weights is not None: + mt_target_weights = mt_target_weights.detach().to(device="cpu") + dt_dense: float = time.time() - t0 + return mt_target_preds, mt_target_labels, mt_target_weights, dt_dense diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/sparse_predict_module.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/sparse_predict_module.py new file mode 100644 index 000000000..e3ec10415 --- /dev/null +++ 
b/recommendation_v4/generative_recommenders/dlrm_v3/inference/sparse_predict_module.py @@ -0,0 +1,106 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pyre-strict + +""" +TorchScript-friendly wrapper for the HSTU sparse path (CPU embedding lookup). + +``HSTUSparseScriptModule`` wraps :class:`HSTUSparseInferenceModule` and +flattens the ``Dict[str, SequenceEmbedding]`` output into the parallel +value/length dicts defined in :mod:`ts_types` so the boundary is composed +entirely of TorchScript-supported types. +""" + +from typing import Dict, Tuple + +import torch +from generative_recommenders.dlrm_v3.inference.inference_modules import ( + _NoCopyEmbeddingCollection, + HSTUSparseInferenceModule, +) +from generative_recommenders.dlrm_v3.inference.ts_types import ( + flatten_seq_embeddings, + SeqEmbLengths, + SeqEmbValues, +) +from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig +from torchrec.modules.embedding_configs import EmbeddingConfig +from torchrec.sparse.jagged_tensor import KeyedJaggedTensor + + +class HSTUSparseScriptModule(torch.nn.Module): + """Script-friendly sparse module. + + ``forward`` returns 5 tensors / dicts (no Python ``int`` scalars): + + 1. ``seq_emb_values`` ``Dict[str, Tensor]`` -- jagged embedding values. + 2. ``seq_emb_lengths`` ``Dict[str, Tensor]`` -- per-feature lengths. + 3. ``payload_features`` ``Dict[str, Tensor]`` -- side features. + 4. 
``uih_seq_lengths`` ``Tensor[B]`` -- UIH lengths. + 5. ``num_candidates`` ``Tensor[B]`` -- candidate counts. + + The dense module (or the C++ glue) recovers the ``int`` ``max_uih_len`` / + ``max_num_candidates`` values from these tensors via ``.max().item()``. + """ + + def __init__( + self, + table_config: Dict[str, EmbeddingConfig], + hstu_config: DlrmHSTUConfig, + use_no_copy_embedding_collection: bool = True, + ) -> None: + super().__init__() + self._sparse: HSTUSparseInferenceModule = HSTUSparseInferenceModule( + table_config=table_config, + hstu_config=hstu_config, + ) + if use_no_copy_embedding_collection: + # Re-class the existing EmbeddingCollection so TorchScript picks up + # the no-copy ``forward`` override (matches the eager-only + # ``ec_patched_forward_wo_embedding_copy`` monkey-patch). + self._sparse._hstu_model._embedding_collection.__class__ = ( + _NoCopyEmbeddingCollection + ) + + def forward( + self, + uih_features: KeyedJaggedTensor, + candidates_features: KeyedJaggedTensor, + ) -> Tuple[ + SeqEmbValues, + SeqEmbLengths, + Dict[str, torch.Tensor], + torch.Tensor, + torch.Tensor, + ]: + ( + seq_embeddings, + payload_features, + _max_uih_len, + uih_seq_lengths, + _max_num_candidates, + num_candidates, + ) = self._sparse( + uih_features=uih_features, + candidates_features=candidates_features, + ) + seq_emb_values, seq_emb_lengths = flatten_seq_embeddings(seq_embeddings) + return ( + seq_emb_values, + seq_emb_lengths, + payload_features, + uih_seq_lengths, + num_candidates, + ) diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/inference_test.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/inference_test.py new file mode 100644 index 000000000..948f10618 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/inference_test.py @@ -0,0 +1,39 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest + +from generative_recommenders.common import gpu_unavailable +from generative_recommenders.dlrm_v3.inference.main import main +from hypothesis import given, settings, strategies as st, Verbosity + + +class DLRMV3InferenceTest(unittest.TestCase): + @unittest.skipIf(*gpu_unavailable) + @given( + world_size=st.sampled_from([1]), + ) + @settings( + verbosity=Verbosity.verbose, + max_examples=1, + deadline=None, + ) + def test_e2e(self, world_size: int) -> None: + os.environ["WORLD_SIZE"] = str(world_size) + main() + + +if __name__ == "__main__": + unittest.main() diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/test_scripted_parity.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/test_scripted_parity.py new file mode 100644 index 000000000..34d0388ea --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/test_scripted_parity.py @@ -0,0 +1,236 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# pyre-strict + +""" +Numerical parity test: eager HSTU vs traced (sparse + dense) on a synthetic +batch. + +The production deployment path (see ``end_to_end_test.py``) uses +``torch.jit.trace``, not ``torch.jit.script``, for the HSTU sparse/dense +wrappers. Tracing records the actual tensor ops executed during a forward +pass and ignores source-level dispatch logic (HammerKernel enum, +``is_fx_tracing()``, ``torch.autocast``, IntEnum branches) that scripting +cannot compile. This unit test mirrors that path. + +Tolerances are deliberately loose because the traced path replaces the +Triton fused kernels with PyTorch fallbacks and skips ``torch.autocast`` in +the user-forward block; both can perturb low-order bits in bf16. 
+""" + +import unittest +from typing import Dict, List, Tuple + +import torch +from generative_recommenders.common import gpu_unavailable, HammerKernel +from generative_recommenders.dlrm_v3.configs import ( + get_embedding_table_config, + get_hstu_configs, +) +from generative_recommenders.dlrm_v3.datasets.dataset import get_random_data +from generative_recommenders.dlrm_v3.inference.dense_predict_module import ( + HSTUDenseScriptModule, +) +from generative_recommenders.dlrm_v3.inference.sparse_predict_module import ( + HSTUSparseScriptModule, +) +from generative_recommenders.dlrm_v3.inference.ts_types import ( + SeqEmbLengths, + SeqEmbValues, +) +from torchrec.sparse.jagged_tensor import KeyedJaggedTensor + + +_DATASET = "kuairand-1k" + + +def _move_dense_inputs( + seq_emb_values: Dict[str, torch.Tensor], + seq_emb_lengths: Dict[str, torch.Tensor], + payload_features: Dict[str, torch.Tensor], + uih_seq_lengths: torch.Tensor, + num_candidates: torch.Tensor, + device: torch.device, +) -> Tuple[ + Dict[str, torch.Tensor], + Dict[str, torch.Tensor], + Dict[str, torch.Tensor], + torch.Tensor, + torch.Tensor, +]: + """C++-side ``move_sparse_output_to_device`` analog for the test.""" + return ( + {k: v.to(device).to(torch.bfloat16) for k, v in seq_emb_values.items()}, + {k: v.to(device) for k, v in seq_emb_lengths.items()}, + {k: v.to(device) for k, v in payload_features.items()}, + uih_seq_lengths.to(device), + num_candidates.to(device), + ) + + +class _SparseTraceShim(torch.nn.Module): + """Adapter that takes raw tensors and rebuilds the KJTs inside forward. + + ``torch.jit.trace`` does not accept ``KeyedJaggedTensor`` (or any + non-Tensor / non-collection-of-Tensor type) as a top-level forward + input, so we make the traced boundary tensor-only and bake the + ``List[str]`` of feature keys in as module attributes. 
+ """ + + def __init__( + self, + sparse_module: HSTUSparseScriptModule, + uih_keys: List[str], + candidates_keys: List[str], + ) -> None: + super().__init__() + self._sparse_module: HSTUSparseScriptModule = sparse_module + self._uih_keys: List[str] = uih_keys + self._candidates_keys: List[str] = candidates_keys + + def forward( + self, + uih_lengths: torch.Tensor, + uih_values: torch.Tensor, + candidates_lengths: torch.Tensor, + candidates_values: torch.Tensor, + ) -> Tuple[ + SeqEmbValues, + SeqEmbLengths, + Dict[str, torch.Tensor], + torch.Tensor, + torch.Tensor, + ]: + uih_kjt = KeyedJaggedTensor( + keys=self._uih_keys, + lengths=uih_lengths, + values=uih_values, + ) + candidates_kjt = KeyedJaggedTensor( + keys=self._candidates_keys, + lengths=candidates_lengths, + values=candidates_values, + ) + return self._sparse_module( + uih_features=uih_kjt, candidates_features=candidates_kjt + ) + + +class HSTUScriptedParityTest(unittest.TestCase): + @unittest.skipIf(*gpu_unavailable) + def test_scripted_matches_eager(self) -> None: + torch.manual_seed(0) + device = torch.device("cuda:0") + torch.cuda.set_device(device) + + hstu_config = get_hstu_configs(_DATASET) + table_config = get_embedding_table_config(_DATASET) + + # Some embedding tables in kuairand-1k are tiny (e.g. + # user_active_degree has num_embeddings=8). Clamp the random value + # range so every index stays in range for every table; otherwise the + # default value_bound=1000 triggers an out-of-range embedding lookup. 
+ min_rows = min(t.num_embeddings for t in table_config.values()) + value_bound = max(2, min_rows) + + uih_kjt, candidates_kjt = get_random_data( + contexual_features=list( + hstu_config.contextual_feature_to_max_length.keys() + ), + hstu_uih_keys=hstu_config.hstu_uih_feature_names, + hstu_candidates_keys=hstu_config.hstu_candidate_feature_names, + uih_max_seq_len=128, + max_num_candidates=hstu_config.max_num_candidates_inference, + value_bound=value_bound, + ) + + sparse_module = HSTUSparseScriptModule( + table_config=table_config, + hstu_config=hstu_config, + use_no_copy_embedding_collection=True, + ).eval() + dense_module = ( + HSTUDenseScriptModule( + hstu_config=hstu_config, + table_config=table_config, + ) + .to(torch.bfloat16) + .to(device) + .eval() + ) + + # Pin the HammerKernel to PyTorch on both wrappers. The Triton + # kernels use Python-level dispatch (autotune, constexpr arguments) + # that interacts badly with torch.jit.trace's recording pass. The + # eager reference run uses the same setting so the comparison is + # apples-to-apples. + sparse_module._sparse._hstu_model.set_hammer_kernel(HammerKernel.PYTORCH) + dense_module._hstu_model.set_hammer_kernel(HammerKernel.PYTORCH) + + # === Eager reference path === + with torch.no_grad(): + sparse_out_e = sparse_module( + uih_features=uih_kjt, candidates_features=candidates_kjt + ) + dense_inputs_e = _move_dense_inputs(*sparse_out_e, device=device) + preds_eager = dense_module(*dense_inputs_e) + + # === Traced path === + # Sparse is traced via a raw-tensor shim because KJT is not a valid + # traced input. Dense is traced directly with the eager sparse + # output as the example. 
+ sparse_shim = _SparseTraceShim( + sparse_module=sparse_module, + uih_keys=list(uih_kjt.keys()), + candidates_keys=list(candidates_kjt.keys()), + ) + traced_sparse = torch.jit.trace( + sparse_shim, + example_inputs=( + uih_kjt.lengths(), + uih_kjt.values(), + candidates_kjt.lengths(), + candidates_kjt.values(), + ), + strict=False, + check_trace=False, + ) + traced_dense = torch.jit.trace( + dense_module, + example_inputs=tuple(dense_inputs_e), + strict=False, + check_trace=False, + ) + + with torch.no_grad(): + sparse_out_t = traced_sparse( + uih_kjt.lengths(), + uih_kjt.values(), + candidates_kjt.lengths(), + candidates_kjt.values(), + ) + dense_inputs_t = _move_dense_inputs(*sparse_out_t, device=device) + preds_traced = traced_dense(*dense_inputs_t) + + torch.testing.assert_close( + preds_eager.float(), + preds_traced.float(), + atol=1e-2, + rtol=1e-2, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/.clang-format b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/.clang-format new file mode 100644 index 000000000..f08c9c2c8 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/.clang-format @@ -0,0 +1,2 @@ +BasedOnStyle: Google +Standard: Cpp11 diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/CMakeLists.txt b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/CMakeLists.txt new file mode 100644 index 000000000..4fec0e44f --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/CMakeLists.txt @@ -0,0 +1,113 @@ +cmake_minimum_required(VERSION 3.12) + +project(mlperf_loadgen) + +# Read the version file +file(READ "${CMAKE_SOURCE_DIR}/VERSION.txt" VERSION_CONTENTS) + +# Extract the major, minor, and patch versions from the VERSION file (assuming "MAJOR.MINOR.PATCH" format) 
+string(REGEX MATCH "^([0-9]+)\\.([0-9]+)\\.([0-9]+)" VERSION_MATCH ${VERSION_CONTENTS}) + +# Set the variables for the major, minor, and patch versions +set(mlperf_loadgen_VERSION_MAJOR "${CMAKE_MATCH_1}") +set(mlperf_loadgen_VERSION_MINOR "${CMAKE_MATCH_2}") +set(mlperf_loadgen_VERSION_PATCH "${CMAKE_MATCH_3}") + +# Check if the version format was parsed correctly +if(NOT DEFINED mlperf_loadgen_VERSION_MAJOR OR NOT DEFINED mlperf_loadgen_VERSION_MINOR OR NOT DEFINED mlperf_loadgen_VERSION_PATCH) + message(FATAL_ERROR "Version format in VERSION.txt is incorrect. Expected format: MAJOR.MINOR.PATCH") +endif() + +# Print out the version +message("mlperf_loadgen v${mlperf_loadgen_VERSION_MAJOR}.${mlperf_loadgen_VERSION_MINOR}.${mlperf_loadgen_VERSION_PATCH}") + +# Set build options. NB: CXX_STANDARD is supported since CMake 3.1. +if (NOT MSVC) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -W -Wall") +endif() +# Extra build options can be specified by setting the MLPERF_LOADGEN_CXX_FLAGS variable +if (MLPERF_LOADGEN_CXX_FLAGS) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MLPERF_LOADGEN_CXX_FLAGS}") +endif() +message(STATUS "Using C++ compiler flags: ${CMAKE_CXX_FLAGS}") +set(CMAKE_CXX_STANDARD "14") +message(STATUS "Using C++ standard: ${CMAKE_CXX_STANDARD}") +message(STATUS "Using static linker flags: ${CMAKE_STATIC_LINKER_FLAGS}") +message(STATUS "Using shared linker flags: ${CMAKE_SHARED_LINKER_FLAGS}") + +# Output directory for libraries. +set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR}) +message(STATUS "Using output path: ${LIBRARY_OUTPUT_PATH}") + +# Detect Python to use for generating source file with version info. +# NB: PythonInterp has been deprecated since CMake 3.12 +# but it works with earlier versions of CMake. 
+find_package(PythonInterp) +message(STATUS "Using Python interpreter: ${PYTHON_EXECUTABLE}") + +# Specify the source and destination files +set(CONF_FILE "mlperf.conf") +set(HEADER_FILE "mlperf_conf.h") + +# Read the content of the configuration file +file(READ ${CONF_FILE} CONF_CONTENTS) + +# Escape all double quotes and backslashes +string(REPLACE "\\" "\\\\" CONF_CONTENTS "${CONF_CONTENTS}") +string(REPLACE "\"" "\\\"" CONF_CONTENTS "${CONF_CONTENTS}") + +# Handle new lines +string(REPLACE "\n" "\\n\"\n\"" CONF_CONTENTS "${CONF_CONTENTS}") + +# Wrap the content in a C++ string declaration +set(FORMATTED_CONTENT "const char* mlperf_conf =\n\"${CONF_CONTENTS}\";\n") + +# Write the formatted content to the header file +file(WRITE ${HEADER_FILE} "${FORMATTED_CONTENT}") + +message(STATUS "Output config: ${CMAKE_BINARY_DIR}/mlperf_conf.h") + +# Generate source file with version info. +execute_process(COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/version_generator.py ${CMAKE_BINARY_DIR}/version_generated.cc ${CMAKE_CURRENT_SOURCE_DIR}) + +# Add source files. 
+set(SOURCE + ${CMAKE_CURRENT_SOURCE_DIR}/bindings/c_api.h + ${CMAKE_CURRENT_SOURCE_DIR}/bindings/c_api.cc + ${CMAKE_CURRENT_SOURCE_DIR}/early_stopping.cc + ${CMAKE_CURRENT_SOURCE_DIR}/issue_query_controller.cc + ${CMAKE_CURRENT_SOURCE_DIR}/loadgen.cc + ${CMAKE_CURRENT_SOURCE_DIR}/logging.cc + ${CMAKE_CURRENT_SOURCE_DIR}/logging.h + ${CMAKE_CURRENT_SOURCE_DIR}/test_settings_internal.cc + ${CMAKE_CURRENT_SOURCE_DIR}/test_settings_internal.h + ${CMAKE_CURRENT_SOURCE_DIR}/utils.cc + ${CMAKE_CURRENT_SOURCE_DIR}/utils.h + ${CMAKE_CURRENT_SOURCE_DIR}/results.h + ${CMAKE_CURRENT_SOURCE_DIR}/results.cc + ${CMAKE_CURRENT_SOURCE_DIR}/version.cc + ${CMAKE_CURRENT_SOURCE_DIR}/version.h + ${CMAKE_CURRENT_SOURCE_DIR}/mlperf_conf.h + ${CMAKE_CURRENT_SOURCE_DIR}/VERSION.txt + ${CMAKE_BINARY_DIR}/version_generated.cc +) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) + +add_library(mlperf_loadgen STATIC ${SOURCE}) +target_link_libraries(mlperf_loadgen) + +if(WIN32) +set (LIBS "") +else() +set (LIBS pthread) +endif() + +add_executable(benchmark benchmark/repro.cpp) +target_link_libraries(benchmark PUBLIC mlperf_loadgen ${LIBS}) + +# Install library and headers. 
+install(TARGETS mlperf_loadgen + DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) +install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/ + DESTINATION ${CMAKE_INSTALL_PREFIX}/include FILES_MATCHING PATTERN "*.h") diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/MANIFEST.in b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/MANIFEST.in new file mode 100644 index 000000000..152b53111 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/MANIFEST.in @@ -0,0 +1,2 @@ +include VERSION.txt +include mlperf.conf diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README.md new file mode 100644 index 000000000..212c8a53c --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README.md @@ -0,0 +1,223 @@ +# Overview {#mainpage} + +## Introduction + +* The LoadGen is a *reusable* module that *efficiently* and *fairly* measures + the performance of inference systems. +* It generates traffic for scenarios as formulated by a diverse set of experts + in the [MLCommons working group](https://mlcommons.org/). +* The scenarios emulate the workloads seen in mobile devices, + autonomous vehicles, robotics, and cloud-based setups. +* Although the LoadGen is not model or dataset aware, its strength is in its + reusability with logic that is. + +## Integration Example and Flow +The following is an diagram of how the LoadGen can be integrated into an +inference system, resembling how some of the MLPerf reference models are +implemented. +
+ +
    +
  1. Benchmark knows the model, dataset, and preprocessing.
  2. +
  3. Benchmark hands dataset sample IDs to LoadGen.
  4. +
  5. LoadGen starts generating queries of sample IDs.
  6. +
  7. Benchmark creates requests to backend.
  8. +
  9. Result is post processed and forwarded to LoadGen.
  10. +
  11. LoadGen outputs logs for analysis.
    +
+
+ +## Useful Links +* [FAQ](README_FAQ.md) +* [LoadGen Build Instructions](README_BUILD.md) +* [LoadGen API](loadgen.h) +* [Test Settings](test_settings.h) - + A good description of available scenarios, modes, and knobs. +* [MLPerf Inference Code](https://github.com/mlcommons/inference) - + Includes source for the LoadGen and reference models that use the LoadGen. +* [MLPerf Inference Rules](https://github.com/mlcommons/inference_policies) - + Any mismatch with this is a bug in the LoadGen. + +## Scope of the LoadGen's Responsibilities + +### In Scope +* **Provide a reusable** C++ library with python bindings. +* **Implement** the traffic patterns of the MLPerf Inference scenarios and + modes. +* **Record** all traffic generated and received for later analysis and + verification. +* **Summarize** the results and whether performance constraints were met. +* **Target high-performance** systems with efficient multi-thread friendly + logging utilities. +* **Generate trust** via a shared, well-tested, and community-hardened + code base. + +### Out of Scope +The LoadGen is: +* **NOT** aware of the ML model it is running against. +* **NOT** aware of the data formats of the model's inputs and outputs. +* **NOT** aware of how to score the accuracy of a model's outputs. +* **NOT** aware of MLPerf rules regarding scenario-specific constraints. + +Limiting the scope of the LoadGen in this way keeps it reusable across +different models and datasets without modification. Using composition and +dependency injection, the user can define their own model, datasets, and +metrics. + +Additionally, not hardcoding MLPerf-specific test constraints, like test +duration and performance targets, allows users to use the LoadGen unmodified +for custom testing and continuous integration purposes. + +## Submission Considerations + +### Upstream all local modifications +* As a rule, no local modifications to the LoadGen's C++ library are allowed +for submission. 
+* Please upstream early and often to keep the playing field level. + +### Choose your TestSettings carefully! +* Since the LoadGen is oblivious to the model, it can't enforce the MLPerf +requirements for submission. *e.g.:* target percentiles and latencies. +* For verification, the values in TestSettings are logged. +* To help make sure your settings are spec compliant, use +TestSettings::FromConfig in conjunction with the relevant config file provided +with the reference models. + +## Responsibilities of a LoadGen User + +### Implement the Interfaces +* Implement the SystemUnderTest and QuerySampleLibrary interfaces and pass + them to the StartTest function. +* Call QuerySampleComplete for every sample received by + SystemUnderTest::IssueQuery. + +### Assess Accuracy +* Process the *mlperf_log_accuracy.json* output by the LoadGen to determine + the accuracy of your system. +* For the official models, Python scripts will be provided by the MLPerf model + owners for you to do this automatically. + +For templates of how to do the above in detail, refer to code for the demos, +tests, and reference models. + + +## LoadGen over the Network + +For reference, on a high level a submission looks like this: + +
+ +
+ +The LoadGen implementation is common to all submissions, while the QSL (“Query Sample Library”) and SUT (“System Under Test”) are implemented by submitters. QSL is responsible for loading the data and includes untimed preprocessing. + +A submission over the network introduces a new component “QDL” (query dispatch library) that is added to the system as presented in the following diagram: + +
+ +
+ +QDL is a proxy for a load-balancer, that dispatches queries to SUT over a physical network, receives the responses and passes them back to LoadGen. It is implemented by the submitter. The interface of the QDL is the same as the API to SUT. + +In scenarios using QDL, data may be compressed in QSL at the choice of the submitter in order to reduce network transmission time. Decompression is part of the timed processing in SUT. A set of approved standard compression schemes will be specified for each benchmark; additional compression schemes must be approved in advance by the Working Group. + +All communication between LoadGen/QSL and SUT is via QDL, and all communication between QDL and SUT must pass over a physical network. + +QDL implements the protocol to transmit queries over the network and receive responses. It also implements decompression of any response returned by the SUT, where compression of responses is allowed. Performing any part of the timed preprocessing or inference in QDL is specifically disallowed. Currently no batching is allowed in QDL, although this may be revisited in future. + +The MLperf over the Network will run in Server mode and Offline mode. All LoadGen modes are expected to work as is with insignificant changes. These include running the test in performance mode, accuracy mode, find peak performance mode and compliance mode. The same applies for power measurements. + +### QDL details +The Query Dispatch Library is implemented by the submitter and interfaces with LoadGen using the same SUT API. All MLPerf Inference SUTs implement the `mlperf::SystemUnderTest` class which is defined in system_under_test.h. The QDL implements `mlperf::QueryDispatchLibrary` class which inherits the `mlperf::SystemUnderTest` class and has the same API and support all existing `mlperf::SystemUnderTest` methods. It has a separate header file query_dispatch_library.h. 
Using sut with `mlperf::SystemUnderTest` class in LoadGen StartTest is natively upcasting `mlperf::QueryDispatchLibrary` class. + +#### QDL Query issue and response over the network + +The QDL gets the queries from the LoadGen through +```CPP +void IssueQuery(const std::vector& samples) +``` + +The QDL dispatches the queries to the SUT over the physical media. The exact method and implementation for it are submitter specific and would not be specified at MLCommons. Submitter implementation includes all methods required to serialize the query, load balance, drive it to the Operating system and network interface card and send to the SUT. + +The QDL receives the query responses over the network from the SUT. The exact method and implementation for it are submitter specific and would not be specified at MLCommons. The submitter implementation includes all methods required to receive the network data from the Network Interface card, go through the Operating system, deserialize the query response, and provide it back to the LoadGen through query completion by: + +```CPP +struct QuerySampleResponse { + ResponseId id; + uintptr_t data; + size_t size; +}; +void QuerySamplesComplete(QuerySampleResponse* responses, + size_t response_count); + +``` + +#### QDL Additional Methods + +In addition to that the QDL needs to implement the following methods that are provided by the SUT interface to the LoadGen: +```CPP +const std::string& Name(); +``` +The `Name` function returns a known string for over the Network SUTs to identify it as over the network benchmark. +```CPP +void FlushQueries(); +``` + +It is not specified here how the QDL would query and configure the SUT to execute the above methods. The QDL responds to the LoadGen after receiving its own response from the SUT. + +### Example + +Refer to [LON demo](demos/lon) for a reference example illustrating usage of Loadgen over the network. 
+ +## Find Peak Performance Mode + +The Find Peak Performance mode can be used to find the optimal queries per second (QPS) for the server scenario. + +### Setup + +You can set up loadgen to run this mode by setting the `mode` variable in the `test_settings` used to run the test. Using the Python API: + +```python +settings = mlperf_loadgen.TestSettings() +settings.server_target_qps = 100 +settings.scenario = mlperf_loadgen.TestScenario.Server +settings.mode = mlperf_loadgen.TestMode.FindPeakPerformance +... + +mlperf_loadgen.StartTest(sut, qsl, settings) +``` + +Using the C/C++ API: +```CPP +mlperf::TestSettings settings; +settings.server_target_qps = 100; +settings.scenario = mlperf::TestScenario::Server; +settings.mode = mlperf::TestMode::FindPeakPerformance; +mlperf::LogSettings log_settings; +/* +Construct QSL and SUT +*/ +mlperf::StartTest(&sut, &qsl, settings, log_settings); +``` + +**Note:** Make sure you are setting the TestScenario to server and you are providing an initial target QPS. + +### Description + +The Find Peak Performance mode works by finding a lower and upper boundary for the optimal QPS. It then performs a binary search between the lower and upper bound to find the optimal QPS. + +#### Finding lower and upper boundary + +LoadGen begins by running performance mode at the specified target QPS. If the test passes, this value is used as the lower bound; otherwise, an error is raised. The algorithm then guesses the upper bound as twice the target QPS. + +Then LoadGen will run performance mode using the upper bound guess. If the test is successful, both the lower bound and upper bound will be doubled. This repeats until the upper bound guess fails the test. + +``` +[initial_target_qps, 2*initial_target_qps] -> [2*initial_target_qps, 4*initial_target_qps] -> [4*initial_target_qps, 8*initial_target_qps]... +``` + +Finally, the final lower bound and upper bound are set to their current values. 
This process assures that the lower bound passes the performance mode, but the upper bound doesn’t. + +#### Binary Search + +Once the lower and upper bounds are set, binary search can be performed over the range `[lower, upper]` to find the optimal QPS. If a given QPS fails in performance mode, the optimal value lies below it; if it passes, the optimal is higher. diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_BUILD.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_BUILD.md new file mode 100644 index 000000000..499cc360a --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_BUILD.md @@ -0,0 +1,47 @@ +# Building the LoadGen {#ReadmeBuild} + +## Prerequisites + + sudo apt-get install libglib2.0-dev python-pip python3-pip + pip2 install absl-py numpy + pip3 install absl-py numpy + +## Quick Start +### Installation - Python + + pip install absl-py numpy + git clone --recurse-submodules https://github.com/mlcommons/inference.git mlperf_inference + cd mlperf_inference/loadgen + CFLAGS="-std=c++14 -O3" python -m pip install . + +This will fetch the loadgen source, build and install the loadgen as a python module, and run a simple end-to-end demo. + +Alternatively, we provide wheels for several python versions and operating systems that can be installed using pip directly. + + pip install mlperf-loadgen + +**NOTE:** Take into account that we only update the published wheels after an official release, so they may not include the latest changes. + +### Testing your Installation +The following command will run a simple end-to-end demo: + + python mlperf_inference/loadgen/demos/py_demo_single_stream.py + +A summary of the test results can be found in the *"mlperf_log_summary.txt"* logfile. 
+ +For a timeline visualization of what happened during the test, open the *"mlperf_log_trace.json"* file in Chrome: +* Type “chrome://tracing” in the address bar, then drag-n-drop the json. +* This may be useful for SUT performance tuning and understanding + debugging the loadgen. + +### Installation - C++ +To build the loadgen as a C++ library, rather than a python module: + + git clone https://github.com/mlcommons/inference.git mlperf_inference + cd mlperf_inference + mkdir loadgen/build/ && cd loadgen/build/ + cmake .. && cmake --build . + cp libmlperf_loadgen.a .. + +## Quick start: Loadgen Over the Network + +Refer to [LON demo](demos/lon/README.md) for a basic example. diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_FAQ.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_FAQ.md new file mode 100644 index 000000000..ab4e0c75d --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_FAQ.md @@ -0,0 +1,78 @@ +# LoadGen FAQ {#ReadmeFAQ} + +## Q: The LoadGen does not match the MLPerf specification. Who is right? +**A:** +The MLPerf spec is *always* right. +Please file a LoadGen bug so it may be resolved. + +## Q: How can I file a bug? +**A:** +On GitHub: https://github.com/mlcommons/inference/issues/new + +## Q: Can I make local modifications to the LoadGen for submission? +**A:** +No. To keep the playing field level, please upstream any local +modifications you need to make. Ideally upstream such changes behind a runtime +flag or via an abstract interface the client can implement. This will help +with testability. + +## Q: Where can I find the results of a test? +**A:** +By default, the loadgen will output an *mlperf_log_summary.txt* file +that summarizes the target metrics and constraints of the test, along with +other stats about the run. 
+ +*Note:* LogSettings also has a flag to forward the results to stdout and +there's an outstanding TODO to make this more programmable. + +## Q: The reference implementation for \<*some_model*\> prints out results of its own. Are those for submission? +**A:** +They are not. The LoadGen results are the ground truth for submission +results since they will work even for systems that forgo the python bindings. +If you notice a bug in the LoadGen's results, please file a bug or submit a +patch. + +## Q: I'm getting linker errors for LoadgenVersion definitions. Where is *version_generated.cc*? +**A:** +If you have a custom build setup, make sure you run the *version_generator.py* +script, which will create the cc file you are looking for. The official build +files that come with the LoadGen do this for you out of the box. + +## Q: What is this *version_generator.py* script? +**A:** +The LoadGen records git stats (if available) and the SHA1 of all its +source files (always) at build time for verification purposes. This is easy +to circumvent, but try your best to run *version_generator.py* correctly; +ideally integrated with your build system if you have a custom build. +The intention is more to help with debugging efforts and detect accidental +version mismatches than to detect bad actors. + +## Q: How do I view the *mlperf_log_trace.json* file? +**A:** +This file uses the [Trace Event Format] +(https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/edit) +to record a timeline of all the threads involved. +You can view the file by typing [chrome://tracing](chrome://tracing) into +Chrome's address bar and dragging the json file there. +This file zips well and you can drag the zip file directly into +[chrome://tracing](chrome://tracing) too. +Please include zipped traces (and the other logs) when filing bug reports. + +## Q: Why is the code littered with so many lambdas? My eyes hurt. 
+**A:** +Lambdas are a convenient and efficient way to ship arbitrary data + deferred +logic over to the logging thread without much boilerplate. +Much of the loadgen is built on top of the logging utilities. +Thus the lambdas. (Sorry about the eyes.) + +## Q: What C++ version does the LoadGen target? +**A:** +It currently targets and requires C++14. It should compile with recent +versions of clang, gcc, and msvc. + +## Q: What dependencies does the LoadGen code have? +**A:** +The C++ code has no external dependencies. The loadgen itself, logging +utilities, and unit test utilities are built solely on the C++ Standard Library. +The python bindings, however, do require +[pybind11](https://github.com/pybind/pybind11). diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/VERSION.txt b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/VERSION.txt new file mode 100644 index 000000000..ac14c3dfa --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/VERSION.txt @@ -0,0 +1 @@ +5.1.1 diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/.gitignore b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/.gitignore new file mode 100644 index 000000000..e792c8e55 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/.gitignore @@ -0,0 +1,2 @@ +loadgen_build +build \ No newline at end of file diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/README.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/README.md new file mode 100644 index 000000000..24e872983 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/README.md @@ -0,0 +1,10 @@ +Note: please install jemalloc first. 
See: http://jemalloc.net/ +Command: bash run.sh <target_qps> <0=Basic,1=Queue> <numCompleteThreads> <maxSize> <server_coalesce_queries> <num_issue_threads> + +Experiments: +- On Intel(R) Xeon(R) CPU E5-1650 v4 @ 3.60GHz +- Basic SUT : 500-600k i/s +- Basic SUT + jemalloc: 800-900k i/s (`bash run.sh 800000 0`) +- Queued SUT (2 complete threads) + jemalloc: 1.2-1.3M i/s (`bash run.sh 1200000 1 2 2048`) +- Queued SUT (2 complete threads) + jemalloc + server_coalesce_queries: 1.4-1.5M i/s (`bash run.sh 1400000 1 2 512 1`) +- Basic SUT + jemalloc + server_coalesce_queries + 4 IssueQueryThreads: 2.4-2.5M i/s (`bash run.sh 2400000 0 2 512 1 4`) diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/repro.cpp b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/repro.cpp new file mode 100644 index 000000000..44ff53efa --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/repro.cpp @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "loadgen.h" +#include "query_sample_library.h" +#include "system_under_test.h" +#include "test_settings.h" + +class QSL : public mlperf::QuerySampleLibrary { + public: + ~QSL() override{}; + const std::string& Name() override { return mName; } + size_t TotalSampleCount() override { return 1000000; } + size_t PerformanceSampleCount() override { return TotalSampleCount(); } + void LoadSamplesToRam(const std::vector&) override { + } + void UnloadSamplesFromRam( + const std::vector&) override {} + + private: + std::string mName{"Dummy QSL"}; +}; + +class BasicSUT : public mlperf::SystemUnderTest { + public: + BasicSUT() { + // Start with some large value so that we don't reallocate memory. + initResponse(10000); + } + ~BasicSUT() override {} + const std::string& Name() override { return mName; } + void IssueQuery(const std::vector& samples) override { + size_t n = samples.size(); + if (n > mResponses.size()) { + std::cerr << "Warning: reallocating response buffer in BasicSUT. Maybe " + "you should initResponse with larger value!?" + << std::endl; + initResponse(samples.size()); + } + for (size_t i = 0; i < n; i++) { + mResponses[i].id = samples[i].id; + } + mlperf::QuerySamplesComplete(mResponses.data(), n); + } + void FlushQueries() override {} + + private: + void initResponse(int size) { + mResponses.resize(size, + {0, reinterpret_cast(&mBuf), sizeof(int)}); + } + int mBuf{0}; + std::string mName{"BasicSUT"}; + std::vector mResponses; +}; + +class QueueSUT : public mlperf::SystemUnderTest { + public: + QueueSUT(int numCompleteThreads, int maxSize) { + // Each thread handle at most maxSize at a time. 
+ std::cout << "QueueSUT: maxSize = " << maxSize << std::endl; + initResponse(numCompleteThreads, maxSize); + // Launch complete threads + for (int i = 0; i < numCompleteThreads; i++) { + mThreads.emplace_back(&QueueSUT::CompleteThread, this, i); + } + } + ~QueueSUT() override { + { + std::unique_lock lck(mMtx); + mDone = true; + mCondVar.notify_all(); + } + for (auto& thread : mThreads) { + thread.join(); + } + } + const std::string& Name() override { return mName; } + void IssueQuery(const std::vector& samples) override { + std::unique_lock lck(mMtx); + for (const auto& sample : samples) { + mIdQueue.push_back(sample.id); + } + // Let some worker thread to consume tasks + mCondVar.notify_one(); + } + void FlushQueries() override {} + + private: + void CompleteThread(int threadIdx) { + auto& responses = mResponses[threadIdx]; + size_t maxSize{responses.size()}; + size_t actualSize{0}; + while (true) { + { + std::unique_lock lck(mMtx); + mCondVar.wait(lck, [&]() { return !mIdQueue.empty() || mDone; }); + + if (mDone) { + break; + } + + actualSize = std::min(maxSize, mIdQueue.size()); + for (size_t i = 0; i < actualSize; i++) { + responses[i].id = mIdQueue.front(); + mIdQueue.pop_front(); + } + mCondVar.notify_one(); + } + mlperf::QuerySamplesComplete(responses.data(), actualSize); + } + } + void initResponse(int numCompleteThreads, int size) { + mResponses.resize(numCompleteThreads); + for (auto& responses : mResponses) { + responses.resize(size, + {0, reinterpret_cast(&mBuf), sizeof(int)}); + } + } + int mBuf{0}; + std::string mName{"QueueSUT"}; + std::vector> mResponses; + std::vector mThreads; + std::deque mIdQueue; + std::mutex mMtx; + std::condition_variable mCondVar; + bool mDone{false}; +}; + +class MultiBasicSUT : public mlperf::SystemUnderTest { + public: + MultiBasicSUT(int numThreads) + : mNumThreads(numThreads), mResponses(numThreads) { + // Start with some large value so that we don't reallocate memory. 
+ initResponse(10000); + for (int i = 0; i < mNumThreads; ++i) { + mThreads.emplace_back(&MultiBasicSUT::startIssueThread, this, i); + } + } + ~MultiBasicSUT() override { + for (auto& thread : mThreads) { + thread.join(); + } + } + const std::string& Name() override { return mName; } + void IssueQuery(const std::vector& samples) override { + int thread_idx = mThreadMap[std::this_thread::get_id()]; + size_t n = samples.size(); + auto& reponses = mResponses[thread_idx]; + if (n > reponses.size()) { + std::cout + << "Warning: reallocating response buffer in MultiBasicSUT. Maybe " + "you should initResponse with larger value!?" + << std::endl; + initResponse(samples.size()); + } + for (size_t i = 0; i < n; i++) { + reponses[i].id = samples[i].id; + } + mlperf::QuerySamplesComplete(reponses.data(), n); + } + void FlushQueries() override {} + + private: + void initResponse(int size) { + for (auto& responses : mResponses) { + responses.resize(size, + {0, reinterpret_cast(&mBuf), sizeof(int)}); + } + } + void startIssueThread(int thread_idx) { + { + std::lock_guard lock(mMtx); + mThreadMap[std::this_thread::get_id()] = thread_idx; + } + mlperf::RegisterIssueQueryThread(); + } + int mBuf{0}; + int mNumThreads{0}; + std::string mName{"MultiBasicSUT"}; + std::vector> mResponses; + std::mutex mMtx; + std::vector mThreads; + std::map mThreadMap; +}; + +int main(int argc, char** argv) { + assert(argc >= 2 && "Need to pass in at least one argument: target_qps"); + int target_qps = std::stoi(argv[1]); + std::cout << "target_qps = " << target_qps << std::endl; + + bool useQueue{false}; + int numCompleteThreads{4}; + int maxSize{1}; + bool server_coalesce_queries{false}; + int num_issue_threads{0}; + if (argc >= 3) { + useQueue = std::stoi(argv[2]) != 0; + } + if (argc >= 4) { + numCompleteThreads = std::stoi(argv[3]); + } + if (argc >= 5) { + maxSize = std::stoi(argv[4]); + } + if (argc >= 6) { + server_coalesce_queries = std::stoi(argv[5]) != 0; + } + if (argc >= 7) { + 
num_issue_threads = std::stoi(argv[6]); + } + + QSL qsl; + std::unique_ptr sut; + + // Configure the test settings + mlperf::TestSettings testSettings; + testSettings.scenario = mlperf::TestScenario::Server; + testSettings.mode = mlperf::TestMode::PerformanceOnly; + testSettings.server_target_qps = target_qps; + testSettings.server_target_latency_ns = 10000000; // 10ms + testSettings.server_target_latency_percentile = 0.99; + testSettings.min_duration_ms = 60000; + testSettings.min_query_count = 270000; + testSettings.server_coalesce_queries = server_coalesce_queries; + std::cout << "testSettings.server_coalesce_queries = " + << (server_coalesce_queries ? "True" : "False") << std::endl; + testSettings.server_num_issue_query_threads = num_issue_threads; + std::cout << "num_issue_threads = " << num_issue_threads << std::endl; + + // Configure the logging settings + mlperf::LogSettings logSettings; + logSettings.log_output.outdir = "build"; + logSettings.log_output.prefix = "mlperf_log_"; + logSettings.log_output.suffix = ""; + logSettings.log_output.prefix_with_datetime = false; + logSettings.log_output.copy_detail_to_stdout = false; + logSettings.log_output.copy_summary_to_stdout = true; + logSettings.log_mode = mlperf::LoggingMode::AsyncPoll; + logSettings.log_mode_async_poll_interval_ms = 1000; + logSettings.enable_trace = false; + + // Choose SUT + if (num_issue_threads == 0) { + if (useQueue) { + std::cout << "Using QueueSUT with " << numCompleteThreads + << " complete threads" << std::endl; + sut.reset(new QueueSUT(numCompleteThreads, maxSize)); + } else { + std::cout << "Using BasicSUT" << std::endl; + sut.reset(new BasicSUT()); + } + } else { + if (useQueue) { + std::cout << "Using MultiQueueSUT with " << numCompleteThreads + << " complete threads" << std::endl; + std::cerr << "!!!! MultiQueueSUT is NOT implemented yet !!!!" 
+ << std::endl; + return 1; + // sut.reset(new MultiQueueSUT(num_issue_threads, numCompleteThreads, + // maxSize)); + } else { + std::cout << "Using MultiBasicSUT" << std::endl; + sut.reset(new MultiBasicSUT(num_issue_threads)); + } + } + + // Start test + std::cout << "Start test..." << std::endl; + mlperf::StartTest(sut.get(), &qsl, testSettings, logSettings); + std::cout << "Test done. Clean up SUT..." << std::endl; + sut.reset(); + std::cout << "Done!" << std::endl; + return 0; +} diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run.sh b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run.sh new file mode 100644 index 000000000..62559c1a8 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run.sh @@ -0,0 +1,21 @@ +#!/usr/bin/bash +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +echo "Building loadgen..." +if [ ! -e loadgen_build ]; then mkdir loadgen_build; fi; +cd loadgen_build && cmake ../.. && make -j && cd .. +echo "Building test program..." +if [ ! -e build ]; then mkdir build; fi; +g++ --std=c++11 -O3 -I.. 
-o build/repro.exe repro.cpp -Lloadgen_build -lmlperf_loadgen -lpthread && \ +LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2 build/repro.exe $1 $2 $3 $4 $5 $6 diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run_debug.sh b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run_debug.sh new file mode 100644 index 000000000..ba63727c8 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run_debug.sh @@ -0,0 +1,21 @@ +#!/usr/bin/bash +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +echo "Building loadgen in Debug mode..." +if [ ! -e loadgen_build ]; then mkdir loadgen_build; fi; +cd loadgen_build && cmake -DCMAKE_BUILD_TYPE=Debug ../.. && make -j && cd .. +echo "Building test program in Debug mode..." +if [ ! -e build ]; then mkdir build; fi; +g++ --std=c++11 -O0 -g -I.. 
-o build/repro.exe repro.cpp -Lloadgen_build -lmlperf_loadgen -lpthread && \ +gdb --args build/repro.exe $1 $2 $3 $4 $5 $6 diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.cc new file mode 100644 index 000000000..0248a1c16 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.cc @@ -0,0 +1,176 @@ +/* Copyright 2019 The MLPerf Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "c_api.h" + +#include + +#include "../loadgen.h" +#include "../query_sample.h" +#include "../query_sample_library.h" +#include "../system_under_test.h" +#include "../test_settings.h" + +namespace mlperf { +namespace c { +namespace { + +// Forwards SystemUnderTest calls to relevant callbacks. 
+class SystemUnderTestTrampoline : public SystemUnderTest { + public: + SystemUnderTestTrampoline(ClientData client_data, std::string name, + IssueQueryCallback issue_cb, + FlushQueriesCallback flush_queries_cb) + : client_data_(client_data), + name_(std::move(name)), + issue_cb_(issue_cb), + flush_queries_cb_(flush_queries_cb) {} + ~SystemUnderTestTrampoline() override = default; + + const std::string& Name() override { return name_; } + + void IssueQuery(const std::vector& samples) override { + (*issue_cb_)(client_data_, samples.data(), samples.size()); + } + + void FlushQueries() override { (*flush_queries_cb_)(); } + + private: + ClientData client_data_; + std::string name_; + IssueQueryCallback issue_cb_; + FlushQueriesCallback flush_queries_cb_; +}; + +} // namespace + +void* ConstructSUT(ClientData client_data, const char* name, size_t name_length, + IssueQueryCallback issue_cb, + FlushQueriesCallback flush_queries_cb) { + SystemUnderTestTrampoline* sut = new SystemUnderTestTrampoline( + client_data, std::string(name, name_length), issue_cb, flush_queries_cb); + return reinterpret_cast(sut); +} + +void DestroySUT(void* sut) { + SystemUnderTestTrampoline* sut_cast = + reinterpret_cast(sut); + delete sut_cast; +} + +namespace { + +// Forwards QuerySampleLibrary calls to relevant callbacks. 
+class QuerySampleLibraryTrampoline : public QuerySampleLibrary { + public: + QuerySampleLibraryTrampoline( + ClientData client_data, std::string name, size_t total_sample_count, + size_t performance_sample_count, + LoadSamplesToRamCallback load_samples_to_ram_cb, + UnloadSamplesFromRamCallback unload_samples_from_ram_cb) + : client_data_(client_data), + name_(std::move(name)), + total_sample_count_(total_sample_count), + performance_sample_count_(performance_sample_count), + load_samples_to_ram_cb_(load_samples_to_ram_cb), + unload_samples_from_ram_cb_(unload_samples_from_ram_cb) {} + ~QuerySampleLibraryTrampoline() override = default; + + const std::string& Name() override { return name_; } + size_t TotalSampleCount() override { return total_sample_count_; } + size_t PerformanceSampleCount() override { return performance_sample_count_; } + + void LoadSamplesToRam(const std::vector& samples) override { + (*load_samples_to_ram_cb_)(client_data_, samples.data(), samples.size()); + } + void UnloadSamplesFromRam( + const std::vector& samples) override { + (*unload_samples_from_ram_cb_)(client_data_, samples.data(), + samples.size()); + } + + private: + ClientData client_data_; + std::string name_; + size_t total_sample_count_; + size_t performance_sample_count_; + LoadSamplesToRamCallback load_samples_to_ram_cb_; + UnloadSamplesFromRamCallback unload_samples_from_ram_cb_; +}; + +} // namespace + +void* ConstructQSL(ClientData client_data, const char* name, size_t name_length, + size_t total_sample_count, size_t performance_sample_count, + LoadSamplesToRamCallback load_samples_to_ram_cb, + UnloadSamplesFromRamCallback unload_samples_from_ram_cb) { + QuerySampleLibraryTrampoline* qsl = new QuerySampleLibraryTrampoline( + client_data, std::string(name, name_length), total_sample_count, + performance_sample_count, load_samples_to_ram_cb, + unload_samples_from_ram_cb); + return reinterpret_cast(qsl); +} + +void DestroyQSL(void* qsl) { + QuerySampleLibraryTrampoline* 
qsl_cast = + reinterpret_cast(qsl); + delete qsl_cast; +} + +// mlperf::c::StartTest just forwards to mlperf::StartTest after doing the +// proper cast. +void StartTest(void* sut, void* qsl, const TestSettings& settings, + const std::string& audit_config_filename = "audit.config") { + SystemUnderTestTrampoline* sut_cast = + reinterpret_cast(sut); + QuerySampleLibraryTrampoline* qsl_cast = + reinterpret_cast(qsl); + LogSettings default_log_settings; + mlperf::StartTest(sut_cast, qsl_cast, settings, default_log_settings, + audit_config_filename); +} + +void QuerySamplesComplete(QuerySampleResponse* responses, + size_t response_count) { + mlperf::QuerySamplesComplete(responses, response_count); +} + +void QuerySamplesCompleteResponseCb(QuerySampleResponse* responses, + size_t response_count, + ResponseCallback response_cb, + ClientData client_data) { + mlperf::QuerySamplesComplete( + responses, response_count, + [client_data, response_cb](QuerySampleResponse* response) { + response_cb(client_data, response); + }); +} + +void FirstTokenComplete(QuerySampleResponse* responses, size_t response_count) { + mlperf::FirstTokenComplete(responses, response_count); +} + +void FirstTokenCompleteResponseCb(QuerySampleResponse* responses, + size_t response_count, + ResponseCallback response_cb, + ClientData client_data) { + mlperf::FirstTokenComplete( + responses, response_count, + [client_data, response_cb](QuerySampleResponse* response) { + response_cb(client_data, response); + }); +} + +void RegisterIssueQueryThread() { mlperf::RegisterIssueQueryThread(); } + +} // namespace c +} // namespace mlperf diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.h new file mode 100644 index 000000000..0ee44fb71 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.h @@ -0,0 +1,95 
@@ +/* Copyright 2019 The MLPerf Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +/// \file +/// \brief A C API wrapping the C++ loadgen. Not tested. Needs work. +/// \details The C API allows a C or Python client to easily create +/// a SystemUnderTest without having to expose the SystemUnderTest class +/// directly. +/// ConstructSUT works with a bunch of function pointers instead that are +/// called from an underlying trampoline class. + +#ifndef SYSTEM_UNDER_TEST_C_API_H_ +#define SYSTEM_UNDER_TEST_C_API_H_ + +#include +#include + +#include "../query_sample.h" +#include "../test_settings.h" + +namespace mlperf { + +namespace c { + +/// \brief Optional opaque client data that creators of SUTs and QSLs can have +/// the loadgen pass back to their callback invocations. +/// Helps avoid global variables.
+typedef uintptr_t ClientData; + +typedef void (*IssueQueryCallback)(ClientData, const QuerySample*, size_t); +typedef void (*FlushQueriesCallback)(); +typedef void (*ResponseCallback)(ClientData, QuerySampleResponse*); + +/// \brief SUT calls this function to report query result back to loadgen +void QuerySamplesComplete(QuerySampleResponse* responses, + size_t response_count); + +/// \brief Forwards to mlperf::QuerySamplesComplete with a callback that +/// invokes response_cb(client_data, response). +void QuerySamplesCompleteResponseCb(QuerySampleResponse* responses, + size_t response_count, + ResponseCallback response_cb, + ClientData client_data); + +/// \brief Forwards to mlperf::FirstTokenComplete. +void FirstTokenComplete(QuerySampleResponse* responses, size_t response_count); + +/// \brief Forwards to mlperf::FirstTokenComplete with a callback that +/// invokes response_cb(client_data, response). +void FirstTokenCompleteResponseCb(QuerySampleResponse* responses, + size_t response_count, + ResponseCallback response_cb, + ClientData client_data); + +/// \brief Create an opaque SUT pointer based on C callbacks. +void* ConstructSUT(ClientData client_data, const char* name, size_t name_length, + IssueQueryCallback issue_cb, + FlushQueriesCallback flush_queries_cb); +/// \brief Destroys the SUT created by ConstructSUT. +void DestroySUT(void* sut); + +typedef void (*LoadSamplesToRamCallback)(ClientData, const QuerySampleIndex*, + size_t); +typedef void (*UnloadSamplesFromRamCallback)(ClientData, + const QuerySampleIndex*, size_t); + +/// \brief Create an opaque QSL pointer based on C callbacks. +void* ConstructQSL(ClientData client_data, const char* name, size_t name_length, + size_t total_sample_count, size_t performance_sample_count, + LoadSamplesToRamCallback load_samples_to_ram_cb, + UnloadSamplesFromRamCallback unload_samples_from_ram_cb); +/// \brief Destroys the QSL created by ConstructQSL. +void DestroyQSL(void* qsl); + +/// \brief Run tests on a SUT created by ConstructSUT(). +/// \details This is the C entry point. See mlperf::StartTest for the C++ entry +/// point.
+void StartTest(void* sut, void* qsl, const TestSettings& settings, + const std::string& audit_config_filename); + +/// +/// \brief Register a thread for query issuing in Server scenario. +/// \details This is the C entry point. See mlperf::RegisterIssueQueryThread for +/// the C++ entry point. +/// +void RegisterIssueQueryThread(); + +} // namespace c +} // namespace mlperf + +#endif // SYSTEM_UNDER_TEST_C_API_H_ diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/python_api.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/python_api.cc new file mode 100644 index 000000000..96396dab9 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/python_api.cc @@ -0,0 +1,484 @@ +/* Copyright 2019 The MLPerf Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +/// \file +/// \brief Python bindings for the loadgen using pybind11. 
+ +#ifndef PYTHON_BINDINGS_H +#define PYTHON_BINDINGS_H + +#include + +#include "../loadgen.h" +#include "../query_dispatch_library.h" +#include "../query_sample.h" +#include "../query_sample_library.h" +#include "../system_under_test.h" +#include "../test_settings.h" +#include "pybind11/functional.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" +#include "pybind11/stl_bind.h" + +namespace mlperf { + +namespace { + +using IssueQueryCallback = std::function)>; +using FastIssueQueriesCallback = + std::function, std::vector)>; +using FlushQueriesCallback = std::function; +using NameCallback = std::function; + +// Forwards SystemUnderTest calls to relevant callbacks. +class SystemUnderTestTrampoline : public SystemUnderTest { + public: + SystemUnderTestTrampoline(std::string name, IssueQueryCallback issue_cb, + FlushQueriesCallback flush_queries_cb) + : name_(std::move(name)), + issue_cb_(issue_cb), + flush_queries_cb_(flush_queries_cb) {} + ~SystemUnderTestTrampoline() override = default; + + const std::string& Name() override { return name_; } + + void IssueQuery(const std::vector& samples) override { + pybind11::gil_scoped_acquire gil_acquirer; + issue_cb_(samples); + } + + void FlushQueries() override { flush_queries_cb_(); } + + protected: + std::string name_; + IssueQueryCallback issue_cb_; + FlushQueriesCallback flush_queries_cb_; +}; + +class FastSystemUnderTestTrampoline : public SystemUnderTestTrampoline { + public: + FastSystemUnderTestTrampoline(std::string name, + FastIssueQueriesCallback fast_issue_cb, + FlushQueriesCallback flush_queries_cb) + : SystemUnderTestTrampoline(name, nullptr, flush_queries_cb), + fast_issue_cb_(fast_issue_cb) {} + ~FastSystemUnderTestTrampoline() override = default; + + void IssueQuery(const std::vector& samples) override { + pybind11::gil_scoped_acquire gil_acquirer; + std::vector responseIds; + std::vector querySampleIndices; + for (auto& s : samples) { + responseIds.push_back(s.id); + 
querySampleIndices.push_back(s.index); + } + fast_issue_cb_(responseIds, querySampleIndices); + } + + private: + FastIssueQueriesCallback fast_issue_cb_; +}; + +using LoadSamplesToRamCallback = + std::function)>; +using UnloadSamplesFromRamCallback = + std::function)>; + +// Forwards QuerySampleLibrary calls to relevant callbacks. +class QuerySampleLibraryTrampoline : public QuerySampleLibrary { + public: + QuerySampleLibraryTrampoline( + std::string name, size_t total_sample_count, + size_t performance_sample_count, + LoadSamplesToRamCallback load_samples_to_ram_cb, + UnloadSamplesFromRamCallback unload_samples_from_ram_cb) + : name_(std::move(name)), + total_sample_count_(total_sample_count), + performance_sample_count_(performance_sample_count), + load_samples_to_ram_cb_(load_samples_to_ram_cb), + unload_samples_from_ram_cb_(unload_samples_from_ram_cb) {} + ~QuerySampleLibraryTrampoline() override = default; + + const std::string& Name() override { return name_; } + size_t TotalSampleCount() { return total_sample_count_; } + size_t PerformanceSampleCount() { return performance_sample_count_; } + + void LoadSamplesToRam(const std::vector& samples) override { + pybind11::gil_scoped_acquire gil_acquirer; + load_samples_to_ram_cb_(samples); + } + void UnloadSamplesFromRam( + const std::vector& samples) override { + pybind11::gil_scoped_acquire gil_acquirer; + unload_samples_from_ram_cb_(samples); + } + + private: + std::string name_; + size_t total_sample_count_; + size_t performance_sample_count_; + LoadSamplesToRamCallback load_samples_to_ram_cb_; + UnloadSamplesFromRamCallback unload_samples_from_ram_cb_; +}; + +// A QDL that allows defining callbacks for +// IssueQuery, FlushQueries, and Name methods. 
+class QueryDispatchLibraryTrampoline : public QueryDispatchLibrary { + public: + QueryDispatchLibraryTrampoline(IssueQueryCallback issue_query_callback, + FlushQueriesCallback flush_queries_callback, + NameCallback name_callback) + : issue_query_callback_(issue_query_callback), + flush_queries_callback_(flush_queries_callback), + name_callback_(name_callback) {} + + // Returns the name of the SUT. Name shall be returned over the network + // TODO: other bindings should also be fixed eventually to be used over the + // network + const std::string& Name() override { + static std::string name; // HACK: avoid returning a reference to temporary. + pybind11::gil_scoped_acquire gil_acquirer; + name = name_callback_(); // name_callback_() shall returned name over the + // network. + return name; + } + + void IssueQuery(const std::vector& samples) override { + pybind11::gil_scoped_acquire gil_acquirer; + issue_query_callback_(samples); + } + + void FlushQueries() override { flush_queries_callback_(); } + + protected: + IssueQueryCallback issue_query_callback_; + FlushQueriesCallback flush_queries_callback_; + NameCallback name_callback_; +}; + +} // namespace + +/// \brief Python bindings. 
+namespace py { + +uintptr_t ConstructSUT(IssueQueryCallback issue_cb, + FlushQueriesCallback flush_queries_cb) { + SystemUnderTestTrampoline* sut = + new SystemUnderTestTrampoline("PySUT", issue_cb, flush_queries_cb); + return reinterpret_cast(sut); +} + +void DestroySUT(uintptr_t sut) { + SystemUnderTestTrampoline* sut_cast = + reinterpret_cast(sut); + delete sut_cast; +} + +uintptr_t ConstructFastSUT(FastIssueQueriesCallback fast_issue_cb, + FlushQueriesCallback flush_queries_cb) { + FastSystemUnderTestTrampoline* sut = new FastSystemUnderTestTrampoline( + "PyFastSUT", fast_issue_cb, flush_queries_cb); + return reinterpret_cast(sut); +} + +void DestroyFastSUT(uintptr_t sut) { + FastSystemUnderTestTrampoline* sut_cast = + reinterpret_cast(sut); + delete sut_cast; +} + +uintptr_t ConstructQSL( + size_t total_sample_count, size_t performance_sample_count, + LoadSamplesToRamCallback load_samples_to_ram_cb, + UnloadSamplesFromRamCallback unload_samples_from_ram_cb) { + QuerySampleLibraryTrampoline* qsl = new QuerySampleLibraryTrampoline( + "PyQSL", total_sample_count, performance_sample_count, + load_samples_to_ram_cb, unload_samples_from_ram_cb); + return reinterpret_cast(qsl); +} + +void DestroyQSL(uintptr_t qsl) { + QuerySampleLibraryTrampoline* qsl_cast = + reinterpret_cast(qsl); + delete qsl_cast; +} + +uintptr_t ConstructQDL(IssueQueryCallback issue_cb, + FlushQueriesCallback flush_queries_cb, + NameCallback name_callback) { + QueryDispatchLibraryTrampoline* qdl = new QueryDispatchLibraryTrampoline( + issue_cb, flush_queries_cb, name_callback); + return reinterpret_cast(qdl); +} + +void DestroyQDL(uintptr_t qdl) { + QueryDispatchLibraryTrampoline* qdl_cast = + reinterpret_cast(qdl); + delete qdl_cast; +} + +void StartTest(uintptr_t sut, uintptr_t qsl, mlperf::TestSettings test_settings, + const std::string& audit_config_filename) { + pybind11::gil_scoped_release gil_releaser; + SystemUnderTestTrampoline* sut_cast = + reinterpret_cast(sut); + 
QuerySampleLibraryTrampoline* qsl_cast = + reinterpret_cast(qsl); + LogSettings default_log_settings; + mlperf::StartTest(sut_cast, qsl_cast, test_settings, default_log_settings, + audit_config_filename); +} + +void StartTestWithLogSettings(uintptr_t sut, uintptr_t qsl, + mlperf::TestSettings test_settings, + mlperf::LogSettings log_settings, + const std::string& audit_config_filename) { + pybind11::gil_scoped_release gil_releaser; + SystemUnderTestTrampoline* sut_cast = + reinterpret_cast(sut); + QuerySampleLibraryTrampoline* qsl_cast = + reinterpret_cast(qsl); + mlperf::StartTest(sut_cast, qsl_cast, test_settings, log_settings, + audit_config_filename); +} + +using ResponseCallback = std::function; + +/// TODO: Get rid of copies. +void QuerySamplesComplete(std::vector responses, + ResponseCallback response_cb = {}) { + pybind11::gil_scoped_release gil_releaser; + mlperf::QuerySamplesComplete(responses.data(), responses.size(), response_cb); +} + +void FirstTokenComplete(std::vector responses, + ResponseCallback response_cb = {}) { + pybind11::gil_scoped_release gil_releaser; + mlperf::FirstTokenComplete(responses.data(), responses.size(), response_cb); +} + +PYBIND11_MODULE(mlperf_loadgen, m) { + m.doc() = "MLPerf Inference load generator."; + + pybind11::enum_(m, "TestScenario") + .value("SingleStream", TestScenario::SingleStream) + .value("MultiStream", TestScenario::MultiStream) + .value("Server", TestScenario::Server) + .value("Offline", TestScenario::Offline); + + pybind11::enum_(m, "TestMode") + .value("SubmissionRun", TestMode::SubmissionRun) + .value("AccuracyOnly", TestMode::AccuracyOnly) + .value("PerformanceOnly", TestMode::PerformanceOnly) + .value("FindPeakPerformance", TestMode::FindPeakPerformance); + + pybind11::class_(m, "TestSettings") + .def(pybind11::init<>()) + .def_readwrite("scenario", &TestSettings::scenario) + .def_readwrite("mode", &TestSettings::mode) + .def_readwrite("single_stream_expected_latency_ns", + 
&TestSettings::single_stream_expected_latency_ns) + .def_readwrite("single_stream_target_latency_percentile", + &TestSettings::single_stream_target_latency_percentile) + .def_readwrite("multi_stream_expected_latency_ns", + &TestSettings::multi_stream_expected_latency_ns) + .def_readwrite("multi_stream_target_latency_percentile", + &TestSettings::multi_stream_target_latency_percentile) + .def_readwrite("multi_stream_samples_per_query", + &TestSettings::multi_stream_samples_per_query) + .def_readwrite("server_target_qps", &TestSettings::server_target_qps) + .def_readwrite("server_target_latency_ns", + &TestSettings::server_target_latency_ns) + .def_readwrite("server_target_latency_percentile", + &TestSettings::server_target_latency_percentile) + .def_readwrite("server_coalesce_queries", + &TestSettings::server_coalesce_queries) + .def_readwrite("server_find_peak_qps_decimals_of_precision", + &TestSettings::server_find_peak_qps_decimals_of_precision) + .def_readwrite("server_find_peak_qps_boundary_step_size", + &TestSettings::server_find_peak_qps_boundary_step_size) + .def_readwrite("server_max_async_queries", + &TestSettings::server_max_async_queries) + .def_readwrite("server_num_issue_query_threads", + &TestSettings::server_num_issue_query_threads) + .def_readwrite("offline_expected_qps", + &TestSettings::offline_expected_qps) + .def_readwrite("min_duration_ms", &TestSettings::min_duration_ms) + .def_readwrite("max_duration_ms", &TestSettings::max_duration_ms) + .def_readwrite("min_query_count", &TestSettings::min_query_count) + .def_readwrite("max_query_count", &TestSettings::max_query_count) + .def_readwrite("qsl_rng_seed", &TestSettings::qsl_rng_seed) + .def_readwrite("sample_index_rng_seed", + &TestSettings::sample_index_rng_seed) + .def_readwrite("schedule_rng_seed", &TestSettings::schedule_rng_seed) + .def_readwrite("accuracy_log_rng_seed", + &TestSettings::accuracy_log_rng_seed) + .def_readwrite("accuracy_log_probability", + 
&TestSettings::accuracy_log_probability) + .def_readwrite("print_timestamps", &TestSettings::print_timestamps) + .def_readwrite("performance_issue_unique", + &TestSettings::performance_issue_unique) + .def_readwrite("performance_issue_same", + &TestSettings::performance_issue_same) + .def_readwrite("performance_issue_same_index", + &TestSettings::performance_issue_same_index) + .def_readwrite("performance_sample_count_override", + &TestSettings::performance_sample_count_override) + .def_readwrite("test05", &TestSettings::test05) + .def_readwrite("test05_qsl_rng_seed", &TestSettings::test05_qsl_rng_seed) + .def_readwrite("test05_sample_index_rng_seed", + &TestSettings::test05_sample_index_rng_seed) + .def_readwrite("test05_schedule_rng_seed", + &TestSettings::test05_schedule_rng_seed) + .def_readwrite("use_token_latencies", &TestSettings::use_token_latencies) + .def_readwrite("ttft_latency", &TestSettings::server_ttft_latency) + .def_readwrite("tpot_latency", &TestSettings::server_tpot_latency) + .def_readwrite("infer_token_latencies", + &TestSettings::infer_token_latencies) + .def_readwrite("token_latency_scaling_factor", + &TestSettings::token_latency_scaling_factor) + .def("FromConfig", &TestSettings::FromConfig, pybind11::arg("path"), + pybind11::arg("model"), pybind11::arg("scenario"), + pybind11::arg("conf_type") = 1, + "This function configures settings from the given user " + "configuration file, model, and scenario. 
The conf_type flag " + "should be set to 1 for loading user.conf or else only the default " + "mlperf_conf file " + "will be loaded by the loadgen."); + + pybind11::enum_(m, "LoggingMode") + .value("AsyncPoll", LoggingMode::AsyncPoll) + .value("EndOfTestOnly", LoggingMode::EndOfTestOnly) + .value("Synchronous", LoggingMode::Synchronous); + + pybind11::class_(m, "LogOutputSettings") + .def(pybind11::init<>()) + .def_readwrite("outdir", &LogOutputSettings::outdir) + .def_readwrite("prefix", &LogOutputSettings::prefix) + .def_readwrite("suffix", &LogOutputSettings::suffix) + .def_readwrite("prefix_with_datetime", + &LogOutputSettings::prefix_with_datetime) + .def_readwrite("copy_detail_to_stdout", + &LogOutputSettings::copy_detail_to_stdout) + .def_readwrite("copy_summary_to_stdout", + &LogOutputSettings::copy_summary_to_stdout); + + pybind11::class_(m, "LogSettings") + .def(pybind11::init<>()) + .def_readwrite("log_output", &LogSettings::log_output) + .def_readwrite("log_mode", &LogSettings::log_mode) + .def_readwrite("log_mode_async_poll_interval_ms", + &LogSettings::log_mode_async_poll_interval_ms) + .def_readwrite("enable_trace", &LogSettings::enable_trace); + + pybind11::class_(m, "QuerySample") + .def(pybind11::init<>()) + .def(pybind11::init()) + .def_readwrite("id", &QuerySample::id) + .def_readwrite("index", &QuerySample::index) + .def(pybind11::pickle( + [](const QuerySample& qs) { // __getstate__ + /*Return a tuple that fully encodes state of object*/ + return pybind11::make_tuple(qs.id, qs.index); + }, + [](pybind11::tuple t) { // __setstate__ + if (t.size() != 2) + throw std::runtime_error("Invalid state for QuerySample"); + /* Create a new C++ instance*/ + QuerySample q; + q.id = t[0].cast(); + q.index = t[1].cast(); + return q; + })); + + pybind11::class_(m, "QuerySampleResponse") + .def(pybind11::init<>()) + .def(pybind11::init()) + .def(pybind11::init()) + .def_readwrite("id", &QuerySampleResponse::id) + .def_readwrite("data", 
&QuerySampleResponse::data) + .def_readwrite("size", &QuerySampleResponse::size) + .def_readwrite("n_tokens", &QuerySampleResponse::n_tokens) + .def(pybind11::pickle( + [](const QuerySampleResponse& qsr) { // __getstate__ + /* Return a tuple that fully encodes state of object; n_tokens is included so it survives a pickle round trip. */ + return pybind11::make_tuple(qsr.id, qsr.data, qsr.size, qsr.n_tokens); + }, + [](pybind11::tuple t) { // __setstate__ + /* Accept the 3-field state (no n_tokens) and the 4-field state. The previous `||` test was true for every size, so unpickling always threw. */ + if ((t.size() != 3) && (t.size() != 4)) + throw std::runtime_error("Invalid state for QuerySampleResponse"); + /* Create a new C++ instance*/ + QuerySampleResponse q; + q.id = t[0].cast(); + q.data = t[1].cast(); + q.size = t[2].cast(); + if (t.size() == 4) { + q.n_tokens = t[3].cast(); + } else { + q.n_tokens = 0; + } + return q; + })); + + // TODO: Use PYBIND11_MAKE_OPAQUE for the following vector types. + pybind11::bind_vector>(m, "VectorQuerySample"); + pybind11::bind_vector>( + m, "VectorQuerySampleResponse"); + + m.def("ConstructSUT", &py::ConstructSUT, "Construct the system under test."); + m.def("DestroySUT", &py::DestroySUT, + "Destroy the object created by ConstructSUT."); + + m.def("ConstructFastSUT", &py::ConstructFastSUT, + "Construct the system under test, fast issue query"); + m.def("DestroyFastSUT", &py::DestroyFastSUT, + "Destroy the object created by ConstructFastSUT."); + + m.def("ConstructQSL", &py::ConstructQSL, + "Construct the query sample library."); + m.def("DestroyQSL", &py::DestroyQSL, + "Destroy the object created by ConstructQSL."); + + m.def("ConstructQDL", &py::ConstructQDL, + "Construct the query dispatch library, communicating with the SUT over " + "the network."); + m.def("DestroyQDL", &py::DestroyQDL, + "Destroy the object created by ConstructQDL."); + + m.def("StartTest", &py::StartTest, + "Run tests on a SUT created by ConstructSUT() with the provided QSL.
" + "Uses default log settings.", + pybind11::arg("sut"), pybind11::arg("qsl"), + pybind11::arg("test_settings"), + pybind11::arg("audit_config_filename") = "audit.config"); + m.def("StartTestWithLogSettings", &py::StartTestWithLogSettings, + "Run tests on a SUT created by ConstructSUT() with the provided QSL. " + "Accepts custom log settings.", + pybind11::arg("sut"), pybind11::arg("qsl"), + pybind11::arg("test_settings"), pybind11::arg("log_settings"), + pybind11::arg("audit_config_filename") = "audit.config"); + m.def("QuerySamplesComplete", &py::QuerySamplesComplete, + "Called by the SUT to indicate that samples from some combination of " + "IssueQuery calls have finished.", + pybind11::arg("responses"), + pybind11::arg("response_cb") = ResponseCallback{}); + m.def("FirstTokenComplete", &py::FirstTokenComplete, + "Called by the SUT to indicate that tokens from some combination of " + "IssueQuery calls have finished.", + pybind11::arg("responses"), + pybind11::arg("response_cb") = ResponseCallback{}); +} + +} // namespace py +} // namespace mlperf + +#endif // PYTHON_BINDINGS_H diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/README.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/README.md new file mode 100644 index 000000000..f46e22a65 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/README.md @@ -0,0 +1,67 @@ +# Demo + +## Loadgen Over the Network + +### Overview + + +This folder provides a demo implementation for LoadGen over the network.\ +Two sides are implemented: + +1. The SUT side which is implemented in [sut_over_network_demo.py](sut_over_network_demo.py). Each Node should run it for multiple Nodes operation. +2. The LoadGen node running the LoadGen, QSL and QDL instances, implemented in [py_demo_server_lon.py](py_demo_server_lon.py) + +The demo SUT is implemented with a Flask server.
The LON node implements a Flask client for network operation. + +The test runs in MLPerf Server mode. The SUT does not implement a benchmark; it contains dummy interfaces to the preprocessing, postprocessing and model-calling functions. + +### Setup + +Install python packages: + +```sh +pip install absl-py numpy wheel flask requests +``` + +Clone: + +```sh +git clone --recurse-submodules https://github.com/mlcommons/inference.git mlperf_inference +``` + +Build: + +```sh +cd mlperf_inference/loadgen +CFLAGS="-std=c++14 -O3" python setup.py bdist_wheel +cd ..; pip install --force-reinstall loadgen/dist/`ls -r loadgen/dist/ | head -n1` ; cd - +``` + +### Run the demo (single machine) + +Start the demo SUT server (run this in a separate terminal): + +```sh +python demos/lon/sut_over_network_demo.py --port 8000 +``` + +Start the test: + +```sh +python demos/lon/py_demo_server_lon.py --sut_server http://localhost:8000 +``` + +### Run the demo (over the network) + +To run over a network, simply run the demo SUT on a different machine. For multiple nodes, run the demo SUT on each machine, specifying the node name. + +```sh +python demos/lon/sut_over_network_demo.py --port 8000 --node N1 +``` + +Then, when running the client, replace `localhost` with the correct IP (keep the `http://` scheme on every address). + + +```sh +python demos/lon/py_demo_server_lon.py --sut_server http://IP1:8000,http://IP2:8000,http://IP3:8000 +``` diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/py_demo_server_lon.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/py_demo_server_lon.py new file mode 100644 index 000000000..1248215db --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/py_demo_server_lon.py @@ -0,0 +1,191 @@ +# Copyright 2019 The MLPerf Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +""" +Python demo showing how to use the MLPerf Inference LoadGen over the Network bindings. +This program runs on the LON node side. +It runs the demo in MLPerf server mode over the network. +It communicates over the network with a Network SUT node, +which runs the Network SUT demo based on a Flask server, implemented in sut_over_network_demo.py +""" + +import threading +import requests +import array +import time + +from absl import app +from absl import flags +import mlperf_loadgen + +FLAGS = flags.FLAGS + +flags.DEFINE_list( + "sut_server", "http://localhost:8000", "Address of the server(s) under test." +) + + +class QSL: + """Demo QuerySampleLibrary with dummy features.""" + + def __init__(self, total_sample_count, performance_sample_count): + self.eval_features = { + i: f"what_is_my_dummy_feature_{i}?" for i in range(total_sample_count) + } + self.qsl = mlperf_loadgen.ConstructQSL( + total_sample_count, + performance_sample_count, + self.load_samples_to_ram, + self.unload_samples_from_ram, + ) + + def get_features(self, sample_id): + """Returns the feature for a given sample id.""" + return self.eval_features[sample_id] + + def load_samples_to_ram(self, query_samples): + """Loads the features for the given query samples into RAM.""" + # Current implementation is not using this functionality. 
+ del query_samples + return + + def unload_samples_from_ram(self, query_samples): + """Unloads the features for the given query samples from RAM.""" + # Current implementation is not using this functionality. + del query_samples + return + + def __del__(self): + mlperf_loadgen.DestroyQSL(self.qsl) + + +class QDL: + """QDL acting as a proxy to the SUT. + This QDL communicates with the SUT via HTTP. + It uses two endpoints to communicate with the SUT: + - /predict/ : Send a query to the SUT and get a response. + - /getname/ : Get the name of the SUT. Send a getname to the SUT and get a response. + """ + + def __init__(self, qsl: QSL, sut_server_addr: list): + """ + Constructor for the QDL. + Args: + qsl: The QSL to use. + sut_server_addr: A list of addresses of the SUT. + """ + self.qsl = qsl + + # Construct QDL from the python binding + self.qdl = mlperf_loadgen.ConstructQDL( + self.issue_query, self.flush_queries, self.client_get_name + ) + self.sut_server_addr = sut_server_addr + self.num_nodes = len(sut_server_addr) + + # For round robin between the SUTs: + self.next_sut_id = 0 + self.lock = threading.Lock() + + def issue_query(self, query_samples): + """Process the query to send to the SUT""" + threading.Thread( + target=self.process_query_async, + args=[query_samples]).start() + + def flush_queries(self): + """Flush the queries. Dummy implementation.""" + pass + + def process_query_async(self, query_samples): + """ + This function is called by the Loadgen in a separate thread. + It is responsible for + 1. Creating a query for the SUT, by reading the features from the QSL. + 2. Sending the query to the SUT. + 3. Waiting for the response from the SUT. + 4. Deserializing the response. + 5. Calling mlperf_loadgen.QuerySamplesComplete(query_samples, response) + Args: + query_samples: A list of QuerySample objects. 
+ """ + responses = [] + for s in query_samples: + # Overall process: + # QDL builds a real-world query and sends to SUT --> SUT processes --> SUT sends back to QDL + # Read features from the QSL + features = self.qsl.get_features(s.index) + + time.sleep(0.001) # Ensure a maximal rate of queries to the SUT + + # Send the query to SUT in round robin + # Wait for a response + sut_result = self.client_predict(features, s.index) + response_array = array.array("B", sut_result.encode("utf-8")) + bi = response_array.buffer_info() + responses.append( + mlperf_loadgen.QuerySampleResponse( + s.id, bi[0], bi[1])) + mlperf_loadgen.QuerySamplesComplete(responses) + + def get_sut_id_round_robin(self): + """Get the SUT id in round robin.""" + with self.lock: + res = self.next_sut_id + self.next_sut_id = (self.next_sut_id + 1) % self.num_nodes + return res + + def client_predict(self, query, id): + """Serialize the query, send it to the SUT in round robin, and return the deserialized response.""" + url = "{}/predict/".format( + self.sut_server_addr[self.get_sut_id_round_robin()]) + response = requests.post(url, json={"query": query, id: id}) + return response.json()["result"] + + def client_get_name(self): + """Get the name of the SUT from ALL the SUTS.""" + if len(self.sut_server_addr) == 1: + return requests.post( + f"{self.sut_server_addr[0]}/getname/").json()["name"] + + sut_names = [ + requests.post(f"{addr}/getname/").json()["name"] + for addr in self.sut_server_addr + ] + return "Multi-node SUT: " + ", ".join(sut_names) + + def __del__(self): + mlperf_loadgen.DestroyQDL(self.qdl) + + +def main(argv): + del argv + settings = mlperf_loadgen.TestSettings() + settings.scenario = mlperf_loadgen.TestScenario.Server + settings.mode = mlperf_loadgen.TestMode.PerformanceOnly + settings.server_target_qps = 100 + settings.server_target_latency_ns = 100000000 + settings.min_query_count = 100 + settings.min_duration_ms = 10000 + + # QDL and QSL + qsl = QSL(1024, 128) + qdl = QDL(qsl, 
sut_server_addr=FLAGS.sut_server) + + mlperf_loadgen.StartTest(qdl.qdl, qsl.qsl, settings) + + +if __name__ == "__main__": + app.run(main) diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/sut_over_network_demo.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/sut_over_network_demo.py new file mode 100644 index 000000000..55e5e038d --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/sut_over_network_demo.py @@ -0,0 +1,88 @@ +# Copyright 2019 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + + +""" +Python demo showing how to use the MLPerf Inference load generator bindings over the network. +This part of the demo runs the "demo SUT" which is connected over the network to the LON node. +A corresponding "demo LON node" with the demo test is implemented in py_demo_server_lon.py. + +The SUT is implemented using a Flask server, with dummy implementation of the inference processing. +Two endpoints are exposed: +- /predict/ : Receives a query (e.g., a text) runs inference, and returns a prediction. +- /getname/ : Get the name of the SUT. 
+ +The current implementation is a dummy implementation, which does not use +a real DNN model, batching, or pre/postprocessing code, +but rather just returns subset of the input query as a response, +Yet, it illustrates the basic structure of a SUT server. +""" + +import argparse +from flask import Flask, request, jsonify + + +app = Flask(__name__) + + +node = "" + + +def preprocess(query): + """[SUT Node] A dummy preprocess.""" + # Here may come for example batching, tokenization, resizing, + # normalization, etc. + response = query + return response + + +def dnn_model(query): + """[SUT Node] A dummy DNN model.""" + # Here may come for example a call to a dnn model such as resnet, bert, + # etc. + response = query + return response + + +def postprocess(query): + """[SUT Node] A dummy postprocess.""" + # Here may come for example a postprocessing call, e.g., NMS, + # detokenization, etc. + response = query + return response + + +@app.route("/predict/", methods=["POST"]) +def predict(): + """Receives a query (e.g., a text) runs inference, and returns a prediction.""" + query = request.get_json(force=True)["query"] + result = postprocess(dnn_model(preprocess(query))) + return jsonify(result=result) + + +@app.route("/getname/", methods=["POST", "GET"]) +def getname(): + """Returns the name of the SUT.""" + return jsonify(name=f"Demo SUT (Network SUT) node" + + (" " + node) if node else "") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--port", type=int, default=8000) + parser.add_argument("--node", type=str, default="") + args = parser.parse_args() + node = args.node + app.run(debug=False, port=args.port) diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_multi_stream.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_multi_stream.py new file mode 100644 index 000000000..f6082cad6 --- /dev/null +++ 
b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_multi_stream.py @@ -0,0 +1,86 @@ +# Copyright 2019 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +"""Python demo showing how to use the MLPerf Inference load generator bindings. +""" + +from __future__ import print_function + +import threading +import time + +from absl import app +import mlperf_loadgen + +from datetime import datetime + +# Global var +NUM_AGENTS = 8 +LOOPBACK_LATENCY_S = 0.001 + + +def load_samples_to_ram(query_samples): + del query_samples + return + + +def unload_samples_from_ram(query_samples): + del query_samples + return + + +# Processes queries in NUM_AGENTS slices that complete at different times. 
+def process_query_async(query_samples, i_slice): + time.sleep(LOOPBACK_LATENCY_S * (i_slice + 1)) + responses = [] + samples_to_complete = query_samples[i_slice: len( + query_samples): NUM_AGENTS] + for j, s in enumerate(samples_to_complete): + responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0)) + mlperf_loadgen.QuerySamplesComplete(responses) + + +def issue_query(query_samples): + for i in range(8): + threading.Thread( + target=process_query_async, args=( + query_samples, i)).start() + + +def flush_queries(): + pass + + +def main(argv): + del argv + settings = mlperf_loadgen.TestSettings() + settings.scenario = mlperf_loadgen.TestScenario.MultiStream + settings.mode = mlperf_loadgen.TestMode.PerformanceOnly + settings.multi_stream_expected_latency_ns = 8000000 + settings.multi_stream_samples_per_query = 8 + settings.min_query_count = 100 + settings.min_duration_ms = 10000 + + sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries) + qsl = mlperf_loadgen.ConstructQSL( + 1024, 128, load_samples_to_ram, unload_samples_from_ram + ) + mlperf_loadgen.StartTest(sut, qsl, settings) + mlperf_loadgen.DestroyQSL(qsl) + mlperf_loadgen.DestroySUT(sut) + + +if __name__ == "__main__": + app.run(main) diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_offline.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_offline.py new file mode 100644 index 000000000..909585edc --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_offline.py @@ -0,0 +1,81 @@ +# Copyright 2019 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +"""Python demo showing how to use the MLPerf Inference load generator bindings. +""" + +from __future__ import print_function + +import threading +import time + +from absl import app +import mlperf_loadgen + + +def load_samples_to_ram(query_samples): + del query_samples + return + + +def unload_samples_from_ram(query_samples): + del query_samples + return + + +# Processes queries in 3 slices that complete at different times. +def process_query_async(query_samples, i_slice): + time.sleep(3 * (i_slice + 1)) + responses = [] + samples_to_complete = query_samples[i_slice: len(query_samples): 3] + for s in samples_to_complete: + responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0)) + mlperf_loadgen.QuerySamplesComplete(responses) + + +def issue_query(query_samples): + threading.Thread( + target=process_query_async, args=( + query_samples, 0)).start() + threading.Thread( + target=process_query_async, args=( + query_samples, 1)).start() + threading.Thread( + target=process_query_async, args=( + query_samples, 2)).start() + + +def flush_queries(): + pass + + +def main(argv): + del argv + settings = mlperf_loadgen.TestSettings() + settings.scenario = mlperf_loadgen.TestScenario.Offline + settings.mode = mlperf_loadgen.TestMode.PerformanceOnly + settings.offline_expected_qps = 1000 + + sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries) + qsl = mlperf_loadgen.ConstructQSL( + 1024, 128, load_samples_to_ram, unload_samples_from_ram + ) + 
mlperf_loadgen.StartTest(sut, qsl, settings) + mlperf_loadgen.DestroyQSL(qsl) + mlperf_loadgen.DestroySUT(sut) + + +if __name__ == "__main__": + app.run(main) diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_server.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_server.py new file mode 100644 index 000000000..8b6f2b826 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_server.py @@ -0,0 +1,74 @@ +# Copyright 2019 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +"""Python demo showing how to use the MLPerf Inference load generator bindings. 
+""" + +from __future__ import print_function + +import threading +import time + +from absl import app +import mlperf_loadgen + + +def load_samples_to_ram(query_samples): + del query_samples + return + + +def unload_samples_from_ram(query_samples): + del query_samples + return + + +def process_query_async(query_samples): + time.sleep(0.001) + responses = [] + for s in query_samples: + responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0)) + mlperf_loadgen.QuerySamplesComplete(responses) + + +def issue_query(query_samples): + threading.Thread(target=process_query_async, args=[query_samples]).start() + + +def flush_queries(): + pass + + +def main(argv): + del argv + settings = mlperf_loadgen.TestSettings() + settings.scenario = mlperf_loadgen.TestScenario.Server + settings.mode = mlperf_loadgen.TestMode.PerformanceOnly + settings.server_target_qps = 100 + settings.server_target_latency_ns = 100000000 + settings.min_query_count = 100 + settings.min_duration_ms = 10000 + + sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries) + qsl = mlperf_loadgen.ConstructQSL( + 1024, 128, load_samples_to_ram, unload_samples_from_ram + ) + mlperf_loadgen.StartTest(sut, qsl, settings) + mlperf_loadgen.DestroyQSL(qsl) + mlperf_loadgen.DestroySUT(sut) + + +if __name__ == "__main__": + app.run(main) diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_single_stream.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_single_stream.py new file mode 100644 index 000000000..8806271bd --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_single_stream.py @@ -0,0 +1,84 @@ +# Copyright 2019 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +"""Python demo showing how to use the MLPerf Inference load generator bindings. +""" + +from __future__ import print_function + +import array +import threading +import time + +from absl import app +import mlperf_loadgen + + +def load_samples_to_ram(query_samples): + del query_samples + return + + +def unload_samples_from_ram(query_samples): + del query_samples + return + + +def process_query_async(query_samples): + """Processes the list of queries.""" + time.sleep(0.001) + responses = [] + response_array = array.array( + "f", [0, 1, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128, 254, 255] + ) + response_info = response_array.buffer_info() + response_data = response_info[0] + response_size = response_info[1] * response_array.itemsize + for s in query_samples: + responses.append( + mlperf_loadgen.QuerySampleResponse( + s.id, response_data, response_size) + ) + mlperf_loadgen.QuerySamplesComplete(responses) + + +def issue_query(query_samples): + threading.Thread(target=process_query_async, args=[query_samples]).start() + + +def flush_queries(): + pass + + +def main(argv): + del argv + settings = mlperf_loadgen.TestSettings() + settings.scenario = mlperf_loadgen.TestScenario.SingleStream + settings.mode = mlperf_loadgen.TestMode.PerformanceOnly + settings.single_stream_expected_latency_ns = 1000000 + settings.min_query_count = 100 + settings.min_duration_ms = 10000 + + sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries) + qsl = mlperf_loadgen.ConstructQSL( + 1024, 128, 
load_samples_to_ram, unload_samples_from_ram + ) + mlperf_loadgen.StartTest(sut, qsl, settings) + mlperf_loadgen.DestroyQSL(qsl) + mlperf_loadgen.DestroySUT(sut) + + +if __name__ == "__main__": + app.run(main) diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_multi_stream.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_multi_stream.py new file mode 100644 index 000000000..e4b083853 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_multi_stream.py @@ -0,0 +1,142 @@ +# Copyright 2019 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +"""Python demo showing how to use the MLPerf Inference load generator bindings. 
+""" + +from __future__ import print_function + +import argparse +import threading +import time +import numpy as np +import array + +import mlperf_loadgen + +from datetime import datetime + +# Global var +NUM_AGENTS = 8 +LOOPBACK_LATENCY_S = 0.001 + + +def f(x, y): + return 4 + 3 * x * y + x**3 + y**2 + + +def create_responses(n, m, mod=4): + r = [] + for i in range(n): + r.append([f(i, j) for j in range(m + (i % mod))]) + return r + + +responses = create_responses(1024, 20) + + +def load_samples_to_ram(query_samples): + del query_samples + return + + +def unload_samples_from_ram(query_samples): + del query_samples + return + + +# Processes queries in NUM_AGENTS slices that complete at different times. +def process_query_async(query_samples, i_slice): + time.sleep(LOOPBACK_LATENCY_S * (i_slice + 1)) + query_responses = [] + samples_to_complete = query_samples[i_slice: len( + query_samples): NUM_AGENTS] + for j, s in enumerate(samples_to_complete): + response_array = np.array(responses[s.index], np.int32) + token = response_array[0] + time.sleep(0.0002) + response_token = array.array("B", token.tobytes()) + response_token_info = response_token.buffer_info() + response_token_data = response_token_info[0] + response_token_size = response_token_info[1] * response_token.itemsize + mlperf_loadgen.FirstTokenComplete( + [ + mlperf_loadgen.QuerySampleResponse( + s.id, response_token_data, response_token_size + ) + ] + ) + time.sleep(0.02) + n_tokens = len(response_array) + response_array = array.array("B", response_array.tobytes()) + response_info = response_array.buffer_info() + response_data = response_info[0] + response_size = response_info[1] * response_array.itemsize + query_responses.append( + mlperf_loadgen.QuerySampleResponse( + s.id, response_data, response_size, n_tokens + ) + ) + mlperf_loadgen.QuerySamplesComplete(query_responses) + + +def issue_query(query_samples): + for i in range(8): + threading.Thread( + target=process_query_async, args=( + query_samples, 
i)).start() + + +def flush_queries(): + pass + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--mode", choices=["performance", "accuracy"], default="performance" + ) + parser.add_argument("--expected-latency", type=int, default=8000000) + parser.add_argument("--samples-per-query", type=int, default=8) + parser.add_argument("--min-query-count", type=int, default=100) + parser.add_argument("--min-duration-ms", type=int, default=30000) + return parser.parse_args() + + +def main(): + args = get_args() + settings = mlperf_loadgen.TestSettings() + settings.scenario = mlperf_loadgen.TestScenario.MultiStream + if args.mode == "performance": + settings.mode = mlperf_loadgen.TestMode.PerformanceOnly + else: + settings.mode = mlperf_loadgen.TestMode.AccuracyOnly + settings.multi_stream_expected_latency_ns = args.expected_latency + settings.multi_stream_samples_per_query = args.samples_per_query + settings.min_query_count = args.min_query_count + settings.min_duration_ms = args.min_duration_ms + settings.use_token_latencies = True + + sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries) + qsl = mlperf_loadgen.ConstructQSL( + 1024, 128, load_samples_to_ram, unload_samples_from_ram + ) + mlperf_loadgen.StartTest(sut, qsl, settings) + mlperf_loadgen.DestroyQSL(qsl) + mlperf_loadgen.DestroySUT(sut) + + +if __name__ == "__main__": + main() diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline.py new file mode 100644 index 000000000..2e190cdd5 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline.py @@ -0,0 +1,130 @@ +# Copyright 2019 The MLPerf Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +"""Python demo showing how to use the MLPerf Inference load generator bindings. +""" + +from __future__ import print_function + +import argparse +import threading +import time +import numpy as np +import array + +import mlperf_loadgen + + +def f(x, y): + return 4 + 3 * x * y + x**3 + y**2 + + +def create_responses(n, m, mod=4): + r = [] + for i in range(n): + r.append([f(i, j) for j in range(m + (i % mod))]) + return r + + +responses = create_responses(1024, 20) + + +def load_samples_to_ram(query_samples): + del query_samples + return + + +def unload_samples_from_ram(query_samples): + del query_samples + return + + +# Processes queries in 3 slices that complete at different times. 
+def process_query_async(query_samples, i_slice): + time.sleep(3 * (i_slice + 1)) + query_responses = [] + samples_to_complete = query_samples[i_slice: len(query_samples): 3] + for s in samples_to_complete: + response_array = np.array(responses[s.index], np.int32) + token = response_array[0] + time.sleep(0.0002) + response_token = array.array("B", token.tobytes()) + response_token_info = response_token.buffer_info() + response_token_data = response_token_info[0] + response_token_size = response_token_info[1] * response_token.itemsize + # mlperf_loadgen.FirstTokenComplete([mlperf_loadgen.QuerySampleResponse(s.id, response_token_data, response_token_size)]) + time.sleep(0.02) + n_tokens = len(response_array) + response_array = array.array("B", response_array.tobytes()) + response_info = response_array.buffer_info() + response_data = response_info[0] + response_size = response_info[1] * response_array.itemsize + query_responses.append( + mlperf_loadgen.QuerySampleResponse( + s.id, response_data, response_size, n_tokens + ) + ) + mlperf_loadgen.QuerySamplesComplete(query_responses) + + +def issue_query(query_samples): + threading.Thread( + target=process_query_async, args=( + query_samples, 0)).start() + threading.Thread( + target=process_query_async, args=( + query_samples, 1)).start() + threading.Thread( + target=process_query_async, args=( + query_samples, 2)).start() + + +def flush_queries(): + pass + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--mode", choices=["performance", "accuracy"], default="performance" + ) + parser.add_argument("--expected-qps", type=int, default=1000) + parser.add_argument("--min-duration-ms", type=int, default=30000) + return parser.parse_args() + + +def main(): + args = get_args() + settings = mlperf_loadgen.TestSettings() + settings.scenario = mlperf_loadgen.TestScenario.Offline + if args.mode == "performance": + settings.mode = mlperf_loadgen.TestMode.PerformanceOnly + else: + settings.mode = 
mlperf_loadgen.TestMode.AccuracyOnly + settings.offline_expected_qps = args.expected_qps + settings.min_duration_ms = args.min_duration_ms + settings.use_token_latencies = True + + sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries) + qsl = mlperf_loadgen.ConstructQSL( + 1024, 128, load_samples_to_ram, unload_samples_from_ram + ) + mlperf_loadgen.StartTest(sut, qsl, settings) + mlperf_loadgen.DestroyQSL(qsl) + mlperf_loadgen.DestroySUT(sut) + + +if __name__ == "__main__": + main() diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline_inferred.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline_inferred.py new file mode 100644 index 000000000..9325b8410 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline_inferred.py @@ -0,0 +1,130 @@ +# Copyright 2019 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +"""Python demo showing how to use the MLPerf Inference load generator bindings. 
+""" + +from __future__ import print_function + +import argparse +import threading +import time +import numpy as np +import array + +import mlperf_loadgen + + +def f(x, y): + return 4 + 3 * x * y + x**3 + y**2 + + +def create_responses(n, m, mod=4): + r = [] + for i in range(n): + r.append([f(i, j) for j in range(m + (i % mod))]) + return r + + +responses = create_responses(1024, 20, mod=3) + + +def load_samples_to_ram(query_samples): + del query_samples + return + + +def unload_samples_from_ram(query_samples): + del query_samples + return + + +# Processes queries in 3 slices that complete at different times. +def process_query_async(query_samples, i_slice): + time.sleep(3 * (i_slice + 1)) + query_responses = [] + samples_to_complete = query_samples[i_slice: len(query_samples): 3] + for s in samples_to_complete: + response_array = np.array(responses[s.index], np.int32) + token = response_array[0] + time.sleep(0.0002) + response_token = array.array("B", token.tobytes()) + response_token_info = response_token.buffer_info() + response_token_data = response_token_info[0] + response_token_size = response_token_info[1] * response_token.itemsize + # mlperf_loadgen.FirstTokenComplete([mlperf_loadgen.QuerySampleResponse(s.id, response_token_data, response_token_size)]) + time.sleep(0.02) + n_tokens = len(response_array) + response_array = array.array("B", response_array.tobytes()) + response_info = response_array.buffer_info() + response_data = response_info[0] + response_size = response_info[1] * response_array.itemsize + query_responses.append( + mlperf_loadgen.QuerySampleResponse( + s.id, response_data, response_size) + ) + mlperf_loadgen.QuerySamplesComplete(query_responses) + + +def issue_query(query_samples): + threading.Thread( + target=process_query_async, args=( + query_samples, 0)).start() + threading.Thread( + target=process_query_async, args=( + query_samples, 1)).start() + threading.Thread( + target=process_query_async, args=( + query_samples, 2)).start() + + 
+def flush_queries(): + pass + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--mode", choices=["performance", "accuracy"], default="performance" + ) + parser.add_argument("--expected-qps", type=int, default=1000) + parser.add_argument("--min-duration-ms", type=int, default=30000) + return parser.parse_args() + + +def main(): + args = get_args() + settings = mlperf_loadgen.TestSettings() + settings.scenario = mlperf_loadgen.TestScenario.Offline + if args.mode == "performance": + settings.mode = mlperf_loadgen.TestMode.PerformanceOnly + else: + settings.mode = mlperf_loadgen.TestMode.AccuracyOnly + settings.offline_expected_qps = args.expected_qps + settings.min_duration_ms = args.min_duration_ms + settings.infer_token_latencies = 1 + settings.token_latency_scaling_factor = 21 + + sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries) + qsl = mlperf_loadgen.ConstructQSL( + 1024, 128, load_samples_to_ram, unload_samples_from_ram + ) + mlperf_loadgen.StartTest(sut, qsl, settings) + mlperf_loadgen.DestroyQSL(qsl) + mlperf_loadgen.DestroySUT(sut) + + +if __name__ == "__main__": + main() diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server.py new file mode 100644 index 000000000..b564543cd --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server.py @@ -0,0 +1,132 @@ +# Copyright 2019 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +"""Python demo showing how to use the MLPerf Inference load generator bindings. +""" + +from __future__ import print_function + +import argparse +import array +import threading +import time +import numpy as np + +from absl import app +import mlperf_loadgen + + +def f(x, y): + return 4 + 3 * x * y + x**3 + y**2 + + +def create_responses(n, m, mod=4): + r = [] + for i in range(n): + r.append([f(i, j) for j in range(m + (i % mod))]) + return r + + +responses = create_responses(1024, 20) + + +def load_samples_to_ram(query_samples): + del query_samples + return + + +def unload_samples_from_ram(query_samples): + del query_samples + return + + +def process_query_async(query_samples): + """Processes the list of queries.""" + query_responses = [] + for s in query_samples: + response_array = np.array(responses[s.index], np.int32) + token = response_array[0] + time.sleep(0.0002) + response_token = array.array("B", token.tobytes()) + response_token_info = response_token.buffer_info() + response_token_data = response_token_info[0] + response_token_size = response_token_info[1] * response_token.itemsize + mlperf_loadgen.FirstTokenComplete( + [ + mlperf_loadgen.QuerySampleResponse( + s.id, response_token_data, response_token_size + ) + ] + ) + time.sleep(0.02) + n_tokens = len(response_array) + response_array = array.array("B", response_array.tobytes()) + response_info = response_array.buffer_info() + response_data = response_info[0] + response_size = response_info[1] * 
response_array.itemsize + # print(f"Reported size python: {n_tokens}") + query_responses.append( + mlperf_loadgen.QuerySampleResponse( + s.id, response_data, response_size, n_tokens + ) + ) + mlperf_loadgen.QuerySamplesComplete(query_responses) + + +def issue_query(query_samples): + threading.Thread(target=process_query_async, args=[query_samples]).start() + + +def flush_queries(): + pass + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--mode", choices=["performance", "accuracy"], default="performance" + ) + parser.add_argument("--target-qps", type=int, default=100) + parser.add_argument("--target-latency-ns", type=int, default=100000000) + parser.add_argument("--min-query-count", type=int, default=100) + parser.add_argument("--min-duration-ms", type=int, default=30000) + return parser.parse_args() + + +def main(): + args = get_args() + settings = mlperf_loadgen.TestSettings() + settings.scenario = mlperf_loadgen.TestScenario.Server + if args.mode == "performance": + settings.mode = mlperf_loadgen.TestMode.PerformanceOnly + else: + settings.mode = mlperf_loadgen.TestMode.AccuracyOnly + settings.server_target_qps = args.target_qps + settings.server_target_latency_ns = args.target_latency_ns + settings.min_query_count = args.min_query_count + settings.min_duration_ms = args.min_duration_ms + settings.use_token_latencies = True + + sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries) + qsl = mlperf_loadgen.ConstructQSL( + 1024, 128, load_samples_to_ram, unload_samples_from_ram + ) + mlperf_loadgen.StartTest(sut, qsl, settings) + mlperf_loadgen.DestroyQSL(qsl) + mlperf_loadgen.DestroySUT(sut) + + +if __name__ == "__main__": + main() diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server_inferred.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server_inferred.py new file mode 100644 index 
000000000..76461a75d --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server_inferred.py @@ -0,0 +1,125 @@ +# Copyright 2019 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +"""Python demo showing how to use the MLPerf Inference load generator bindings. +""" + +from __future__ import print_function + +import argparse +import array +import threading +import time +import numpy as np + +from absl import app +import mlperf_loadgen + + +def f(x, y): + return 4 + 3 * x * y + x**3 + y**2 + + +def create_responses(n, m, mod=4): + r = [] + for i in range(n): + r.append([f(i, j) for j in range(m + (i % mod))]) + return r + + +responses = create_responses(1024, 20, mod=3) + + +def load_samples_to_ram(query_samples): + del query_samples + return + + +def unload_samples_from_ram(query_samples): + del query_samples + return + + +def process_query_async(query_samples): + """Processes the list of queries.""" + query_responses = [] + for s in query_samples: + response_array = np.array(responses[s.index], np.int32) + token = response_array[0] + time.sleep(0.0002) + response_token = array.array("B", token.tobytes()) + response_token_info = response_token.buffer_info() + response_token_data = response_token_info[0] + response_token_size = response_token_info[1] * response_token.itemsize + 
time.sleep(0.02) + n_tokens = len(response_array) + response_array = array.array("B", response_array.tobytes()) + response_info = response_array.buffer_info() + response_data = response_info[0] + response_size = response_info[1] * response_array.itemsize + # print(f"Reported size python: {n_tokens}") + query_responses.append( + mlperf_loadgen.QuerySampleResponse( + s.id, response_data, response_size) + ) + mlperf_loadgen.QuerySamplesComplete(query_responses) + + +def issue_query(query_samples): + threading.Thread(target=process_query_async, args=[query_samples]).start() + + +def flush_queries(): + pass + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--mode", choices=["performance", "accuracy"], default="performance" + ) + parser.add_argument("--target-qps", type=int, default=100) + parser.add_argument("--target-latency-ns", type=int, default=100000000) + parser.add_argument("--min-query-count", type=int, default=100) + parser.add_argument("--min-duration-ms", type=int, default=30000) + return parser.parse_args() + + +def main(): + args = get_args() + settings = mlperf_loadgen.TestSettings() + settings.scenario = mlperf_loadgen.TestScenario.Server + if args.mode == "performance": + settings.mode = mlperf_loadgen.TestMode.PerformanceOnly + else: + settings.mode = mlperf_loadgen.TestMode.AccuracyOnly + settings.server_target_qps = args.target_qps + settings.server_target_latency_ns = args.target_latency_ns + settings.min_query_count = args.min_query_count + settings.min_duration_ms = args.min_duration_ms + settings.infer_token_latencies = 1 + settings.token_latency_scaling_factor = 21 + + sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries) + qsl = mlperf_loadgen.ConstructQSL( + 1024, 128, load_samples_to_ram, unload_samples_from_ram + ) + mlperf_loadgen.StartTest(sut, qsl, settings) + mlperf_loadgen.DestroyQSL(qsl) + mlperf_loadgen.DestroySUT(sut) + + +if __name__ == "__main__": + main() diff --git 
a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_single_stream.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_single_stream.py new file mode 100644 index 000000000..ca8d84591 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_single_stream.py @@ -0,0 +1,129 @@ +# Copyright 2019 The MLPerf Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +"""Python demo showing how to use the MLPerf Inference load generator bindings. 
+""" + +from __future__ import print_function + +import argparse +import array +import threading +import time +import numpy as np + +from absl import app +import mlperf_loadgen + + +def f(x, y): + return 4 + 3 * x * y + x**3 + y**2 + + +def create_responses(n, m, mod=4): + r = [] + for i in range(n): + r.append([f(i, j) for j in range(m + (i % mod))]) + return r + + +responses = create_responses(1024, 20) + + +def load_samples_to_ram(query_samples): + del query_samples + return + + +def unload_samples_from_ram(query_samples): + del query_samples + return + + +def process_query_async(query_samples): + """Processes the list of queries.""" + query_responses = [] + for s in query_samples: + response_array = np.array(responses[s.index], np.int32) + time.sleep(0.0002) + token = response_array[:1] + response_token = array.array("B", token.tobytes()) + response_token_info = response_token.buffer_info() + response_token_data = response_token_info[0] + response_token_size = response_token_info[1] * response_token.itemsize + mlperf_loadgen.FirstTokenComplete( + [ + mlperf_loadgen.QuerySampleResponse( + s.id, response_token_data, response_token_size + ) + ] + ) + time.sleep(0.02) + n_tokens = len(response_array) + response_array = array.array("B", response_array.tobytes()) + response_info = response_array.buffer_info() + response_data = response_info[0] + response_size = response_info[1] * response_array.itemsize + query_responses.append( + mlperf_loadgen.QuerySampleResponse( + s.id, response_data, response_size, n_tokens + ) + ) + mlperf_loadgen.QuerySamplesComplete(query_responses) + + +def issue_query(query_samples): + threading.Thread(target=process_query_async, args=[query_samples]).start() + + +def flush_queries(): + pass + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--mode", choices=["performance", "accuracy"], default="performance" + ) + parser.add_argument("--expected-latency", type=int, default=2050000) + 
parser.add_argument("--min-query-count", type=int, default=100) + parser.add_argument("--min-duration-ms", type=int, default=30000) + return parser.parse_args() + + +def main(): + args = get_args() + settings = mlperf_loadgen.TestSettings() + settings.scenario = mlperf_loadgen.TestScenario.SingleStream + if args.mode == "performance": + settings.mode = mlperf_loadgen.TestMode.PerformanceOnly + else: + settings.mode = mlperf_loadgen.TestMode.AccuracyOnly + settings.single_stream_expected_latency_ns = args.expected_latency + settings.min_query_count = args.min_query_count + settings.min_duration_ms = args.min_duration_ms + settings.use_token_latencies = True + + sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries) + qsl = mlperf_loadgen.ConstructQSL( + 1024, 128, load_samples_to_ram, unload_samples_from_ram + ) + mlperf_loadgen.StartTest(sut, qsl, settings) + mlperf_loadgen.DestroyQSL(qsl) + mlperf_loadgen.DestroySUT(sut) + + +if __name__ == "__main__": + main() diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/diagram_network_submission.png b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/diagram_network_submission.png new file mode 100644 index 000000000..35663b97f Binary files /dev/null and b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/diagram_network_submission.png differ diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/diagram_submission.png b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/diagram_submission.png new file mode 100644 index 000000000..3ed6429c0 Binary files /dev/null and b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/diagram_submission.png differ diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/BUILD.gn 
b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/BUILD.gn new file mode 100644 index 000000000..865bc4d3b --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/BUILD.gn @@ -0,0 +1,33 @@ +generated_doxygen_out_dir = + get_path_info(".", "gen_dir") + "/.." + +loadgen_doxygen_sources = [ + "doxygen.cfg", + "doxygen_footer.html", + "doxygen_header.html", + "doxygen_layout.xml", + "doxygen_stylesheet.css", + "loadgen-integration_diagram.dia", + "mlperf_icon.png", + "mlperf_logo_horizontal_color.svg", + "README.md" +] + +source_set("loadgen_doxygen_sources") { + sources = loadgen_doxygen_sources +} + +source_set("doxygen_html_generator_script") { + sources = [ "doxygen_html_generator.py" ] +} + +action("generate_doxygen_html") { + script = "doxygen_html_generator.py" + args = [ rebase_path(generated_doxygen_out_dir, root_build_dir), + rebase_path("../..") ] + outputs = [ generated_doxygen_out_dir ] + deps = [ ":loadgen_doxygen_sources", + ":doxygen_html_generator_script", + "../..:mlperf_loadgen_sources_no_gen", + "../..:docs" ] +} diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/README.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/README.md new file mode 100644 index 000000000..d5cf5fe18 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/README.md @@ -0,0 +1,34 @@ +# Generating the HTML docs {#ReadmeHtmlDocs} + +This document is generated from inline docstrings in the source and +various markdown files checked into the git repository. If you've +checked out the code, you can generate this documentation. 
+ +*Prerequisite:* You must have [doxygen](http://www.doxygen.nl) installed +on your system: + +## With gn / ninja + +If you are using the gn build flow, you may run: + + ninja -C out/Release generate_doxygen_html + +* This will output the documentation to out/Release/gen/loadgen/docs/gen and +avoid polluting the source directory. + +## Manually + +Alternatively, you can manually run: + + python docs/src/doxygen_html_generator.py TARGET_DIR LOADGEN_ROOT + +* If `LOADGEN_ROOT` is omitted, it will default to ".". +* If `TARGET_DIR` is also omitted, it will default to "./docs/gen". + +## Hosting + +A version of this doc is currently hosted online at +https://mlperf.github.io/inference/loadgen/index.html + +To update the hosted version, submit a PR to the +[mlperf.github.io](https://github.com/mlperf/mlperf.github.io) repository. diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen.cfg b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen.cfg new file mode 100644 index 000000000..fc05853d1 --- /dev/null +++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen.cfg @@ -0,0 +1,2495 @@ +# Doxyfile 1.8.13 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \").
+ +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all text +# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv +# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv +# for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. + +PROJECT_NAME = "LoadGen Guide" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/mlperf_logo_horizontal_color.svg + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. 
If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = $(MLPERF_DOXYGEN_OUT_PATH) + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. 
+ +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = YES + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. 
+ +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. 
+ +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". 
You can put \n's in the value part of an alias to insert +# newlines. + +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. + +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, Javascript, +# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: +# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: +# Fortran. 
In the latter case the parser tries to guess whether the code is fixed +# or free formatted code, this is the default for Fortran type files), VHDL. For +# instance to make doxygen treat .inc files as Fortran files (default is PHP), +# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See http://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibility issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 0. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 1 + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.)
but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also makes the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = NO + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group.
By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. + +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. + +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. + +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. 
This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = NO + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. + +EXTRACT_ALL = NO + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = YES + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. 
+# The default value is: NO. + +EXTRACT_PACKAGE = YES + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = YES + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. 
+ +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# (class|struct|union) declarations. If set to NO, these declarations will be +# included in the documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES, upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. +# The default value is: system dependent. + +CASE_SENSE_NAMES = YES + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. 
+ +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. 
+ +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. 
+ +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if ... \endif and \cond +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. 
+ +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/doxygen_layout.xml + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. 
+ +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. + +WARN_IF_UNDOCUMENTED = NO + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. +# The default value is: NO. + +WARN_NO_PARAMDOC = NO + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. +# The default value is: NO. + +WARN_AS_ERROR = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. 
The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. + +INPUT = $(MLPERF_LOADGEN_SRC_PATH) + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: http://www.gnu.org/software/libiconv) for the list of +# possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. 
+# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, +# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf. + +FILE_PATTERNS = *.c \ + *.cc \ + *.cxx \ + *.cpp \ + *.c++ \ + *.java \ + *.ii \ + *.ixx \ + *.ipp \ + *.i++ \ + *.inl \ + *.idl \ + *.ddl \ + *.odl \ + *.h \ + *.hh \ + *.hxx \ + *.hpp \ + *.h++ \ + *.cs \ + *.d \ + *.php \ + *.php4 \ + *.php5 \ + *.phtml \ + *.inc \ + *.m \ + *.markdown \ + *.md \ + *.mm \ + *.dox \ + *.py \ + *.pyw \ + *.f90 \ + *.f95 \ + *.f03 \ + *.f08 \ + *.f \ + *.for \ + *.tcl \ + *.vhd \ + *.vhdl \ + *.ucf \ + *.qsf + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = depot_tools + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. 
+# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. + +EXAMPLE_PATTERNS = * + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. +# The default value is: NO. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or directories +# that contain images that are to be included in the documentation (see the +# \image command). + +IMAGE_PATH = $(MLPERF_LOADGEN_SRC_PATH)/docs/src + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command: +# +# <filter> <input-file> +# +# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the +# name of an input file.
Doxygen will then use the output that the filter +# program writes to standard output. If FILTER_PATTERNS is specified, this tag +# will be ignored. +# +# Note that the filter must not add or remove lines; it is applied before the +# code is scanned, but not when the output code is generated. If lines are added +# or removed, the anchors will not be placed correctly. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: pattern=filter +# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how +# filters are used. If the FILTER_PATTERNS tag is empty or if none of the +# patterns match the file name, INPUT_FILTER is applied. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will also be used to filter the input files that are used for +# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). +# The default value is: NO. + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and +# it is also possible to disable source filtering for a specific pattern using +# *.ext= (so without naming a filter). +# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. 
+ +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. + +USE_MDFILE_AS_MAINPAGE = + +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. +# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = YES + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# function all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. 
Otherwise they will +# link to the documentation. +# The default value is: YES. + +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see http://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. 
+ +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the +# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the +# cost of reduced performance. This can be particularly helpful with template +# rich C++ code for which doxygen's built-in parser lacks the necessary type +# information. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse-libclang=ON option for CMake. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = YES + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. + +CLANG_OPTIONS = -I ../third_party/pybind/include --std=c++14 + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in +# which the alphabetical index list will be split. +# Minimum value: 1, maximum value: 20, default value: 5. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. 
The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output. +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file must include any scripts and style sheets +# that doxygen needs, which depends on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. 
For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/doxygen_header.html + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/doxygen_footer.html + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. 
the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/doxygen_stylesheet.css + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/mlperf_icon.png \ + $(MLPERF_LOADGEN_SRC_PATH)/loadgen_integration_diagram.svg + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# http://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. +# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +HTML_COLORSTYLE_SAT = 127 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = YES + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries to 1 will produce a fully collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a fully expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +HTML_INDEX_NUM_ENTRIES = 50 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: http://developer.apple.com/tools/xcode/), introduced with +# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory. Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. 
+ +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the master .chm file (NO). 
+# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. 
+ +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- +# folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. 
To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. 
Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = YES + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 4 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. + +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes take effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_TRANSPARENT = YES + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# http://www.mathjax.org) which uses client side Javascript for the rendering +# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX +# installed or if you want formulas to look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +USE_MATHJAX = NO + +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. See the MathJax site (see: +# http://docs.mathjax.org/en/latest/output.html) for more details. +# Possible values are: HTML-CSS (which is slower, but has the best +# compatibility), NativeMML (i.e. MathML) and SVG. +# The default value is: HTML-CSS. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the HTML +# output directory using the MATHJAX_RELPATH option. The destination directory +# should contain the MathJax.js script. For instance, if the mathjax directory +# is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax +# Content Delivery Network so you can quickly see the result without installing +# MathJax. 
However, it is strongly recommended to install a local copy of +# MathJax from http://www.mathjax.org before deployment. +# The default value is: http://cdn.mathjax.org/mathjax/latest. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax +# extension names that should be enabled during MathJax rendering. For example +# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_EXTENSIONS = + +# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces +# of code that will be used on startup of the MathJax code. See the MathJax site +# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an +# example see the documentation. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_CODEFILE = + +# When the SEARCHENGINE tag is enabled doxygen will generate a search box for +# the HTML output. The underlying search engine uses javascript and DHTML and +# should work on any modern browser. Note that when using HTML help +# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) +# there is already a search function so this one should typically be disabled. +# For large projects the javascript based search engine can be slow, then +# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to +# search using the keyboard; to jump to the search box use + S +# (what the is depends on the OS and browser, but it is typically +# , /