mlcommons · FileSystemGuy · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026
@@ -46,6 +46,7 @@ CLAUDE.md
 .roomodes
 LOCAL_BRANCH_NOTES.md
 .planning/
+.gsd-tmp/
 
 # DLIO test artifacts — created in cwd when running dlio_benchmark tests
 output/
@@ -70,6 +71,12 @@ env-fast
 sim_*.tsv
 sim_*.tsv.zst
 
+# Submission checker default CSV output (utility_args.py default --csv path,
+# README.md §submission-checker). Runtime artifact, not source — keep
+# untracked so contributors don't see a dirty working tree after running
+# `mlpstorage validate ...` without an explicit --csv path.
+summary.csv
+
 # Sweep run logs and results (local benchmark output)
 sweep_logs/
 sweep_flux_master.log

@@ -801,7 +801,18 @@ def generate_output_location(self) -> str:
         """
         if not self.BENCHMARK_TYPE:
             raise ValueError('No benchmark specified. Unable to generate output location')
-        return generate_output_location(self, self.run_datetime)
+        # Thread the validated orgname/systemname stashed by
+        # capture_or_verify_code_image (code_image.py: args._validated_orgname /
+        # args._validated_systemname) so generate_output_location's
+        # OPEN/CLOSED ConfigurationError path doesn't fire. For legacy /
+        # whatif modes these attrs are absent (getattr default None) and the
+        # function's mode check skips the orgname/systemname requirement.
+        return generate_output_location(
+            self,
+            self.run_datetime,
+            orgname=getattr(self.args, "_validated_orgname", None),
+            systemname=getattr(self.args, "_validated_systemname", None),
+        )
 
     _COLLISION_BUMP_BUDGET = DEFAULT_COLLISION_BUMP_BUDGET
 

@@ -118,6 +118,7 @@ def get_datetime_string():
 # VDB Benchmark Configuration
 VDB_INDEX_TYPES = ["DISKANN", "HNSW", "AISAQ", "IVF_FLAT", "IVF_SQ8", "FLAT"]
 VDB_INDEX_TYPES_CLOSED = ["DISKANN", "HNSW", "AISAQ"]
+
 VDB_ORCHESTRATION_MODES = ["ssh", "mpi"]
 VDB_BENCHMARK_MODES = ["timed", "query_count", "sweep"]
 # Vector-database engines. Only milvus is wired up today; the slot exists so
@@ -154,6 +155,8 @@ class EXIT_CODE(enum.IntEnum):
     SUCCESS = 0
     GENERAL_ERROR = 1
     INVALID_ARGUMENTS = 2
+    # CAP/VALR failure exit code (per 02-CONTEXT.md D-22). Aliased with INVALID_ARGUMENTS=2 for ergonomic naming at the typed-exception → exit mapping in main.py.
+    CODE_IMAGE_ERROR = 2
     FILE_NOT_FOUND = 3
     PERMISSION_DENIED = 4
     CONFIGURATION_ERROR = 5

@@ -147,6 +147,13 @@ def __init__(self, message: str, parameter: str = None,
             expected=expected,
             actual=actual
         )
+        # Expose the missing/invalid parameter name as a direct attribute so
+        # the CLI dispatch layer (and tests) can inspect it without poking at
+        # the structured-error context dict. Documented use case: the Phase 2
+        # generate_output_location trust-contract raises ConfigurationError
+        # with parameter="orgname" or "systemname" so the dispatch helper
+        # can map it back to the MLPSTORAGE_* env-var the user must set.
+        self.parameter = parameter
 
     @staticmethod
     def _default_suggestion(code: ErrorCode) -> str:

@@ -39,6 +39,7 @@
 )
 from mlpstorage_py.validation_helpers import validate_benchmark_environment
 from mlpstorage_py.progress import progress_context
+from mlpstorage_py.submission_checker.tools.code_image import capture_or_verify_code_image, CodeImageError
 
 logger = setup_logging("MLPerfStorage")
 signal_received = False
@@ -202,6 +203,18 @@ def run_benchmark(args, run_datetime):
     else:
         logger.warning("Skipping environment validation (--skip-validation flag)")
 
+    # Capture/verify code image BEFORE benchmark instantiation (Phase 2 D-07).
+    # Helper internally gates on (args.mode, args.command) per D-10, so it is
+    # safe to call unconditionally — non-result-generating commands no-op.
+    # Helper also owns ALL env-var reading and validation (POSIX regex + inline
+    # `.`/`..` path-traversal guard) — see Plan 02 REVIEWS.md consensus finding.
+    with progress_context(
+        "Capturing or verifying code image...",
+        total=None,
+        logger=logger
+    ) as (update, set_desc):
+        capture_or_verify_code_image(args, os.environ, logger)
+
     program_switch_dict = dict(
         training=TrainingBenchmark,
         checkpointing=CheckpointingBenchmark,
@@ -404,6 +417,15 @@ def main():
             logger.info(f"Suggestion: {e.suggestion}")
         return EXIT_CODE.FAILURE
 
+    except CodeImageError as e:
+        # Phase 2 D-22: code-image capture/verify failures (incl. MissingHashFile,
+        # MalformedHashFile, hash-mismatch CodeImageError) return CODE_IMAGE_ERROR
+        # so CI/scripts can tell them from generic FAILURE; it aliases
+        # INVALID_ARGUMENTS (both 2). CodeImageError is NOT a MLPStorageException
+        # subclass, so it needs an explicit handler BEFORE that catch-all.
+        logger.error(str(e))
+        return EXIT_CODE.CODE_IMAGE_ERROR
+
     except MLPStorageException as e:
         # Catch-all for any other custom exceptions
         logger.error(str(e))
@@ -424,8 +446,12 @@ def main():
         logger.error(f"Unexpected error: {str(e)}")
         logger.error(format_error('INTERNAL_ERROR', error=str(e)))
 
-        # Show traceback if in debug mode
-        if MLPS_DEBUG:
+        # Show traceback if in debug mode. MLPS_DEBUG is the env-var path
+        # (read at import time); also check `--debug` directly via sys.argv
+        # so the CLI flag emits a trace even though `args` is not in scope
+        # here. `--debug` is store_true so a bare-token check suffices.
+        debug_cli = '--debug' in sys.argv
+        if MLPS_DEBUG or debug_cli:
             logger.debug("Stack trace:")
             traceback.print_exc()
         else:

@@ -6,10 +6,44 @@
 """
 
 import os
+import re
 import sys
 from typing import Tuple, List, Optional
 
 from mlpstorage_py.config import BENCHMARK_TYPES, DATETIME_STR
+from mlpstorage_py.errors import ConfigurationError, ErrorCode
+
+# Env-var names used by the Phase 2 CLI dispatch layer to source orgname/systemname (D-01, D-02).
+# generate_output_location itself does NOT read these; the helper in
+# mlpstorage_py/submission_checker/tools/code_image.py reads + validates them and threads
+# the values through as keyword arguments. The names are exported here so the helper has a
+# single source of truth for the env-var spelling.
+MLPSTORAGE_ORGNAME_ENVVAR = "MLPSTORAGE_ORGNAME"
+MLPSTORAGE_SYSTEMNAME_ENVVAR = "MLPSTORAGE_SYSTEMNAME"
+
+# Each path segment appended to results_dir by generate_output_location must
+# match this — POSIX-safe alphanumeric plus '.', '_', '-' — and must not be
+# '.' or '..'. Blocks path-traversal ('../') and absolute-path resets ('/')
+# at the trust boundary between args/env-var input and os.path.join, even
+# for callers that bypass the CLI's argparse choices= validation.
+_SAFE_PATH_COMPONENT_RE = re.compile(r"^[A-Za-z0-9._-]+$")
+
+
+def _check_safe_path_component(name: str, value: str) -> None:
+    """Raise ValueError unless ``value`` is safe as a single path segment.
+
+    None/empty is handled upstream; ``value`` must be a non-empty string.
+    """
+    if value in (".", ".."):
+        raise ValueError(
+            f"{name}={value!r} is not a safe path component (reserved name)"
+        )
+    # fullmatch, not match: "$" also matches just before a trailing newline.
+    if not _SAFE_PATH_COMPONENT_RE.fullmatch(value):
+        raise ValueError(
+            f"{name}={value!r} is not a safe path component "
+            f"(must match {_SAFE_PATH_COMPONENT_RE.pattern})"
+        )
 
 
 def calculate_training_data_size(args, cluster_information, dataset_params, reader_params, logger,
@@ -118,28 +152,61 @@ def calculate_training_data_size(args, cluster_information, dataset_params, read
     return int(required_file_count), int(required_subfolders_count), int(total_disk_bytes)
 
 
-def generate_output_location(benchmark, datetime_str=None, **kwargs) -> str:
+def generate_output_location(
+    benchmark,
+    datetime_str=None,
+    *,
+    orgname: Optional[str] = None,
+    systemname: Optional[str] = None,
+    **kwargs,
+) -> str:
     """
     Generate a standardized output location for benchmark results.
 
     Output structure follows this pattern:
-    RESULTS_DIR:
-        <benchmark_name>:
-            <model>:
-                <command>:
-                        <datetime>:
-                            run_<run_number> (Optional)
+
+      CLOSED (args.mode == "closed"):
+        <results_dir>/closed/<orgname>/<benchmark_name>/<model>/<command>/<datetime>/
+
+      OPEN (args.mode == "open"):
+        <results_dir>/open/<orgname>/results/<systemname>/<benchmark_name>/<model>/<command>/<datetime>/
+
+      Legacy (args.mode not in {"closed", "open"}, or attribute missing —
+      e.g. whatif, programmatic callers from tests):
+        <results_dir>/<benchmark_name>/<model>/<command>/<datetime>/
+
+    The per-``BENCHMARK_TYPES`` tail (training/checkpointing/vector_database/
+    kv_cache) is unchanged below the new prefix.
 
     Args:
         benchmark: Benchmark instance.
         datetime_str: Optional datetime string for the run.
-        **kwargs: Additional benchmark-specific parameters.
+        orgname: Keyword-only. Submitter organization name; required when
+            ``benchmark.args.mode`` is "closed" or "open". The CLI dispatch
+            layer (Plan 02-02) reads ``MLPSTORAGE_ORGNAME`` from the
+            environment, validates it per Rules.md §2.1.1, and threads the
+            validated value through as this keyword argument. This function
+            does NOT read ``os.environ`` — passing the value explicitly is a
+            trust-contract requirement so programmatic callers (tests,
+            future tooling) receive a typed ``ConfigurationError`` if they
+            forget to thread it through, rather than a hidden ``KeyError``.
+        systemname: Keyword-only. System name; required when
+            ``benchmark.args.mode`` is "open". Same trust-contract semantics
+            as ``orgname``; sourced from ``MLPSTORAGE_SYSTEMNAME`` by the
+            dispatch layer.
+        **kwargs: Additional benchmark-specific parameters (reserved).
 
     Returns:
         Full path to the output location.
 
     Raises:
-        ValueError: If required parameters are missing.
+        ValueError: If a required parameter is missing (e.g. ``args.model``
+            for training/checkpointing), or a path component is unsafe.
+        ConfigurationError: If ``benchmark.args.mode`` is "closed" or "open"
+            but ``orgname`` (and, for "open", ``systemname``) was not threaded
+            through by the caller. The ``parameter`` attribute identifies the
+            missing kwarg; the ``suggestion`` field references the
+            ``MLPSTORAGE_*`` env-var the dispatch layer must read.
     """
     if datetime_str is None:
         datetime_str = DATETIME_STR
@@ -151,17 +218,68 @@ def generate_output_location(benchmark, datetime_str=None, **kwargs) -> str:
     else:
         run_number = 0
 
+    # New D-03 prefix: insert {closed|open}/<orgname>[/results/<systemname>]/
+    # before the legacy per-type chain. The values are explicit kwargs threaded
+    # by the CLI dispatch layer (Plan 02-02); env-var reading is owned by that
+    # helper, not this function (see module-level constants above for the
+    # env-var-name source of truth).
+    mode = getattr(benchmark.args, "mode", None)
+    if mode in ("closed", "open"):
+        if not orgname:
+            raise ConfigurationError(
+                "orgname is required when args.mode in {closed, open} but was "
+                "not provided to generate_output_location",
+                parameter="orgname",
+                suggestion=(
+                    f"The CLI dispatch layer should read the "
+                    f"{MLPSTORAGE_ORGNAME_ENVVAR} environment variable and "
+                    "thread the validated value through as the orgname "
+                    "keyword. Programmatic callers must pass orgname= "
+                    "explicitly."
+                ),
+                code=ErrorCode.CONFIG_MISSING_REQUIRED,
+            )
+        _check_safe_path_component("orgname", orgname)
+        output_location = os.path.join(output_location, mode, orgname)
+
+        if mode == "open":
+            if not systemname:
+                raise ConfigurationError(
+                    "systemname is required when args.mode == 'open' but was "
+                    "not provided to generate_output_location",
+                    parameter="systemname",
+                    suggestion=(
+                        f"The CLI dispatch layer should read the "
+                        f"{MLPSTORAGE_SYSTEMNAME_ENVVAR} environment "
+                        "variable and thread the validated value through "
+                        "as the systemname keyword. Programmatic callers "
+                        "must pass systemname= explicitly."
+                    ),
+                    code=ErrorCode.CONFIG_MISSING_REQUIRED,
+                )
+            _check_safe_path_component("systemname", systemname)
+            output_location = os.path.join(output_location, "results", systemname)
+
+    # datetime_str is built into every per-type path below; validate once here.
+    _check_safe_path_component("datetime_str", datetime_str)
+
     # Handle different benchmark types
     if benchmark.BENCHMARK_TYPE == BENCHMARK_TYPES.training:
         if not hasattr(benchmark.args, "model"):
             raise ValueError("Model name is required for training benchmark output location")
 
+        _check_safe_path_component("model", benchmark.args.model)
+        _check_safe_path_component("command", benchmark.args.command)
         output_location = os.path.join(output_location, benchmark.BENCHMARK_TYPE.name)
         output_location = os.path.join(output_location, benchmark.args.model)
         output_location = os.path.join(output_location, benchmark.args.command)
         output_location = os.path.join(output_location, datetime_str)
 
     elif benchmark.BENCHMARK_TYPE == BENCHMARK_TYPES.vector_database:
+        # Results split by index_type because AISAQ is not comparable to
+        # DISKANN/HNSW — they must live in separate on-disk trees so
+        # submission validation and downstream tooling never collate them
+        # (per Rules.md §2.1.27).
         engine = getattr(benchmark.args, "vdb_engine", None)
         if not engine:
             raise ValueError(
@@ -177,7 +295,10 @@ def generate_output_location(benchmark, datetime_str=None, **kwargs) -> str:
                 "VectorDB index is required for output location "
                 "(set --vdb-index on the CLI)."
             )
-
+
+        _check_safe_path_component("vdb_engine", engine)
+        _check_safe_path_component("vdb_index", vdb_index)
+        _check_safe_path_component("command", benchmark.args.command)
         output_location = os.path.join(output_location, benchmark.BENCHMARK_TYPE.name)
         output_location = os.path.join(output_location, engine)
         output_location = os.path.join(output_location, vdb_index)
@@ -192,6 +313,8 @@ def generate_output_location(benchmark, datetime_str=None, **kwargs) -> str:
                 "args.model before calling generate_output_location "
                 "(KVCacheBenchmark.__init__ defaults this from KVCACHE_MODEL_DEFAULT)."
             )
+        _check_safe_path_component("model", model)
+        _check_safe_path_component("command", benchmark.args.command)
         output_location = os.path.join(output_location, benchmark.BENCHMARK_TYPE.name)
         output_location = os.path.join(output_location, model)
         output_location = os.path.join(output_location, benchmark.args.command)
@@ -201,6 +324,7 @@ def generate_output_location(benchmark, datetime_str=None, **kwargs) -> str:
         if not hasattr(benchmark.args, "model"):
             raise ValueError("Model name is required for checkpointing benchmark output location")
 
+        _check_safe_path_component("model", benchmark.args.model)
         output_location = os.path.join(output_location, benchmark.BENCHMARK_TYPE.name)
         output_location = os.path.join(output_location, benchmark.args.model)
         output_location = os.path.join(output_location, datetime_str)