NVIDIA-NeMo · wprazuch · Jun 26, 2026 · Jun 26, 2026 · Jun 26, 2026 · Jun 26, 2026
diff --git a/fern/versions/latest/pages/data/download-huggingface.mdx b/fern/versions/latest/pages/data/download-huggingface.mdx
@@ -229,14 +229,15 @@ gym dataset download \
 
 <Accordion title="Automatic Downloads During Data Preparation">
 
-NeMo Gym can automatically download missing datasets during data preparation. Configure `huggingface_identifier` in your resources server config:
+NeMo Gym can automatically download missing datasets during data preparation. Declare a `source:` block (`type: huggingface`) in your resources server config:
 
 ```yaml
 datasets:
   - name: train
     type: train
     jsonl_fpath: resources_servers/code_gen/data/train.jsonl
-    huggingface_identifier:
+    source:
+      type: huggingface
       repo_id: nvidia/nemotron-RL-coding-competitive_coding
       artifact_fpath: opencodereasoning_filtered_25k_train.jsonl
     license: Apache 2.0
@@ -253,7 +254,7 @@ gym dataset collate \
     +data_source=huggingface
 ```
 
-If `jsonl_fpath` doesn't exist locally, NeMo Gym downloads from `huggingface_identifier` before processing.
+If `jsonl_fpath` doesn't exist locally, NeMo Gym downloads from the dataset's `source:` (here, the Hugging Face `repo_id`) before processing. The legacy `huggingface_identifier:` block still works but is deprecated in favor of `source:`.
 </Accordion>
 
 <Accordion title="Caching Behavior">

diff --git a/fern/versions/latest/pages/data/index.mdx b/fern/versions/latest/pages/data/index.mdx
@@ -101,7 +101,9 @@ datasets:
   - name: train
     type: train
     jsonl_fpath: resources_servers/workplace_assistant/data/train.jsonl
-    huggingface_identifier:
+    # Unified dataset source. `type` selects the backend; the other fields are backend-specific.
+    source:
+      type: huggingface
       repo_id: nvidia/Nemotron-RL-agent-workplace_assistant
       artifact_fpath: train.jsonl
     license: Apache 2.0
@@ -113,9 +115,33 @@ datasets:
 | `type` | Yes | `example`, `train`, or `validation` |
 | `jsonl_fpath` | Yes | Path to data file |
 | `license` | Train/validation | See valid values below |
-| `huggingface_identifier` | No | Remote download location |
+| `source` | No | Where to fetch the data from when it's missing locally (see below) |
 | `num_repeats` | No | Repeat count (default: `1`) |
 
+### Dataset `source`
+
+`source` is the unified way to declare where a dataset is fetched from. `type` selects the backend; the remaining fields are backend-specific:
+
+```yaml
+# Hugging Face Hub
+source:
+  type: huggingface
+  repo_id: nvidia/Nemotron-RL-agent-workplace_assistant
+  artifact_fpath: train.jsonl
+
+# Alternatively: GitLab dataset registry
+source:
+  type: gitlab
+  dataset_name: example_multi_step
+  version: 0.0.1
+  artifact_fpath: train.jsonl
+```
+
+<Note>
+The legacy `huggingface_identifier:` / `gitlab_identifier:` blocks still work (a deprecation warning is emitted), so existing configs keep running — but new configs should use `source:`.
+
+</Note>
+
 ### Valid Licenses
 
 `Apache 2.0` · `MIT` · `GNU General Public License v3.0` · `Creative Commons Attribution 4.0 International` · `Creative Commons Attribution-ShareAlike 4.0 International` · `TBD` · `NVIDIA Internal Use Only, Do Not Distribute`

diff --git a/fern/versions/latest/pages/environment-tutorials/single-step-environment.mdx b/fern/versions/latest/pages/environment-tutorials/single-step-environment.mdx
@@ -294,8 +294,8 @@ my_weather_tool_simple_agent:
         type: example
         jsonl_fpath: resources_servers/my_weather_tool/data/example.jsonl
       # The scaffold also generates train/validation dataset entries
-      # with gitlab_identifier blocks. Those are omitted here since
-      # we only have example data at this stage.
+      # (which can declare a `source:` block — the unified dataset source).
+      # Those are omitted here since we only have example data at this stage.
 ```
 
 The `domain` field categorizes your resource server and is **required**. Common values: `math`, `coding`, `agent`, `knowledge`, `instruction_following`, `long_context`, `safety`, `games`, `e2e`, `other`.

diff --git a/fern/versions/latest/pages/reference/cli-commands.mdx b/fern/versions/latest/pages/reference/cli-commands.mdx
@@ -19,9 +19,11 @@ gym --help                   # list all command groups
 gym --version [--json]        # print version and system info
 ng ...                        # 'ng' is an alias for 'gym'
 
-# Discover benchmarks
-gym list benchmarks [--json]  # list available benchmarks
-gym search <query> [--json]   # filter the benchmark list by name
+# Discover
+gym list benchmarks [--json]    # list available benchmarks
+gym list environments [--json]  # list available environments by name
+gym list agents [--json]        # list agent harnesses and how each composes
+gym search <query> [--json]     # find benchmarks matching a query (name, agent, resources server, dataset, domain)
 
 # Datasets
 gym dataset upload            # upload a prepared dataset to HF (default) or GitLab
@@ -34,6 +36,7 @@ gym dataset collate           # validate and collate a dataset
 # Environments
 gym env init                  # scaffold a new resources server
 gym env resolve               # resolve and print the final merged config
+gym env validate              # validate a config (no Ray, no servers) — fast pre-flight check
 gym env packages              # list packages in a server's virtual environment
 gym env test                  # test resources server(s); all of them if none is given
 gym env start                   # start the servers
@@ -143,6 +146,32 @@ gym list benchmarks
 gym list benchmarks --json | jq '.[].name'
 ```
 
+### `gym list environments`
+
+List the environments available under `environments/`, by short name (with their domain and description). The names map to `environments/<name>/config.yaml` and can be passed to `--environment` on commands like `gym env start` / `gym eval run`.
+
+| Option | Description |
+| --- | --- |
+| `--json` | Output the environment list as JSON. |
+
+```bash
+gym list environments
+gym list environments --json | jq '.[].name'
+```
+
+### `gym list agents`
+
+List the agent harnesses under `responses_api_agents/`, with each one's composition pattern — **composable** (Pattern A: references a separate resources server, so it can be wired into a matching environment) vs **self-contained** (Pattern B: ships its own framework/environment) — plus its config variants.
+
+| Option | Description |
+| --- | --- |
+| `--json` | Output the agent list as JSON (includes the `self_contained` flag). |
+
+```bash
+gym list agents
+gym list agents --json | jq '.[] | {name, self_contained}'
+```
+
 ### `gym search`
 
 List benchmarks whose name, agent, resources server, dataset, or domain matches a query.
@@ -323,6 +352,26 @@ gym env resolve \
     ++responses_create_params.temperature=0.6
 ```
 
+### `gym env validate`
+
+Validate a config without starting Ray or any server subprocess — a fast pre-flight check that catches config mistakes (missing/malformed `config_paths`, unknown server cross-references, unset mandatory `???` values, schema errors) in well under a second instead of after a Ray bootstrap. Exits `0` when valid, or `1` with a clean message (no traceback) when not. A model config is **not** required — model interpolations resolve against a dummy `policy_model`; pass a model config with `--config` (or name one with `--model-type`) if you want it validated too.
+
+| Option | Description |
+| --- | --- |
+| `--config PATH` | Config file to load. Repeatable. |
+| `--environment NAME` / `--benchmark NAME` | Validate a named environment / benchmark config. |
+| `--resources-server NAME` | Validate a named resources server config. |
+| `--model-type NAME` | Also load a named model config (otherwise a dummy `policy_model` is used). |
+| `--model` / `--model-url` / `--model-api-key` | Override model name, base URL, and API key. |
+
+```bash
+gym env validate --environment workplace_assistant
+gym env validate --benchmark gsm8k
+
+# or explicit config path(s)
+gym env validate --config resources_servers/example_single_tool_call/configs/example_single_tool_call.yaml
+```
+
 ### `gym env packages`
 
 Each server has its own isolated virtual environment. List the packages installed in a server's environment.

diff --git a/fern/versions/latest/pages/reference/configuration.mdx b/fern/versions/latest/pages/reference/configuration.mdx
@@ -57,7 +57,7 @@ my_resource:                                  # Server ID (your choice — agent
       value: "What this improves"             # Training value provided
 ```
 
-**Domain values:** `math`, `coding`, `agent`, `knowledge`, `instruction_following`, `long_context`, `safety`, `games`, `e2e`, `other` (see `Domain`)
+**Domain values:** `math`, `coding`, `agent`, `knowledge`, `instruction_following`, `long_context`, `safety`, `games`, `translation`, `e2e`, `rlhf`, `other` (see `Domain`)
 
 ### Agent Server Fields
 
@@ -100,6 +100,7 @@ datasets:
 | `type` | Yes | `example`, `train`, or `validation` |
 | `jsonl_fpath` | Yes | Path to data file |
 | `license` | For train/validation | License identifier (see values below) |
+| `source` | No | Where to fetch the data from when it's missing locally: a `source:` block with `type: gitlab` (`dataset_name`, `version`, `artifact_fpath`) or `type: huggingface` (`repo_id`, optional `artifact_fpath`). Replaces the deprecated `gitlab_identifier:` / `huggingface_identifier:` blocks. |
 | `num_repeats` | No | Repeat dataset n times (default: `1`) |
 
 **Dataset types:**

diff --git a/fern/versions/latest/pages/reference/faq.mdx b/fern/versions/latest/pages/reference/faq.mdx
@@ -168,27 +168,20 @@ example_multi_step_simple_agent:
         license: Apache 2.0
         jsonl_fpath: resources_servers/example_multi_step/data/train.jsonl
         num_repeats: 1
-        gitlab_identifier:
+        source:
+          type: gitlab
           dataset_name: example_multi_step
           version: 0.0.1
           artifact_fpath: example_multi_step/train.jsonl
-        huggingface_identifier:
-          repo_id: nvidia/Nemotron-RL-instruction_following
-          artifact_fpath: instruction_following.jsonl
-        license: Apache 2.0
       - name: validation
         type: validation
         license: Apache 2.0
         jsonl_fpath: resources_servers/example_multi_step/data/validation.jsonl
         num_repeats: 1
-        gitlab_identifier:
-          dataset_name: example_multi_step
-          version: 0.0.1
-          artifact_fpath: example_multi_step/validation.jsonl
-        huggingface_identifier:
+        source:
+          type: huggingface
           repo_id: nvidia/Nemotron-RL-instruction_following
           artifact_fpath: if_validation.jsonl
-        license: Apache 2.0
       - name: example
         type: example
         jsonl_fpath: resources_servers/example_multi_step/data/example.jsonl
@@ -200,21 +193,22 @@ A dataset object consists of:
 - Type: train, validation, or example. Train and validation are as used in NeMo RL or other train frameworks. More information about the example type is in the next section.
 - Jsonl fpath: the local file path to your jsonl file for this dataset.
 - Num repeats: optionally repeat each row when preparing or collating data. Defaults to 1 if unspecified.
-- GitLab identifier: The remote path to the dataset as held in the GitLab dataset registry. This field is required for train and validation datasets. Not required for example datasets since those are required to be committed to Git.
-- Hugging Face identifier: The remote path to the dataset on Hugging Face. Contains `repo_id` (required) and optionally `artifact_fpath` for raw file repos. If `artifact_fpath` is omitted, the datasets library will infer the `split` from the dataset `type`.
+- Source: the unified `source:` block declaring where the dataset is fetched from when it's missing locally. `type` selects the backend:
+  - `gitlab` — the GitLab dataset registry: `dataset_name`, `version`, `artifact_fpath`.
+  - `huggingface` — the Hugging Face Hub: `repo_id` (required) and optionally `artifact_fpath` for raw-file repos (if omitted, the split is inferred from the dataset `type`).
+
+  Required for train and validation datasets; not for example datasets (those are committed to Git). The legacy `gitlab_identifier:` / `huggingface_identifier:` fields still work (with a deprecation warning) and may be set together as a GitLab-primary / Hugging Face-fallback pair, but `source:` is preferred for new configs.
 - License: The license of that dataset. Required for train and validation datasets; not required for example datasets.
 - Start idx, end idx: used for slicing your dataset.
 ```yaml
 - name: train
   type: train
   jsonl_fpath: resources_servers/example_multi_step/data/train.jsonl
-  gitlab_identifier:
+  source:
+    type: gitlab
     dataset_name: example_multi_step
     version: 0.0.1
-    artifact_fpath: example_multi_step/validation.jsonl
-  huggingface_identifier:
-    repo_id: nvidia/example_multi_step
-    artifact_fpath: example_validation.jsonl
+    artifact_fpath: example_multi_step/train.jsonl
   license: Apache 2.0
 ```
 

diff --git a/fern/versions/latest/pages/troubleshooting/configuration.mdx b/fern/versions/latest/pages/troubleshooting/configuration.mdx
@@ -52,7 +52,9 @@ Did you mean: 'weather'?
 ```
 
 **When**: A server config references another server that doesn't exist or isn't loaded. The error
-names the referencing instance/field and, when a similarly named server exists, suggests it.
+names the referencing instance/field and then either suggests a similarly named server
+(`Did you mean: …`) or, when nothing is close, lists the available servers of that type
+(`Available resources_servers: …`).
 
 **Common causes**:
 - Typo in the server name
@@ -70,6 +72,56 @@ gym env start \
     --config agent.yaml
 ```
 
+### Config Path Not Found
+
+```
+nemo_gym.config_types.ConfigPathNotFoundError: config_paths entry 'resources_servers/weather/config.yaml' was not found. Looked in:
+  - /path/to/cwd/resources_servers/weather/config.yaml
+  - /path/to/gym/resources_servers/weather/config.yaml
+Check the path is spelled correctly and is relative to your working directory or the Gym install root.
+```
+
+**When**: A `config_paths` entry (in a config file or a `--config` flag) points at a file that doesn't
+exist under any search root. The error lists every location that was searched.
+
+**Fix**: Correct the path, or run from the repository root so relative `config_paths` resolve.
+
+### Malformed `config_paths`
+
+```
+nemo_gym.config_types.MalformedConfigPathsError: 'config_paths' must be a list of paths. Got: 'resources_servers/weather/config.yaml'.
+Pass each config with --config (it builds the list for you), e.g.:
+  gym env start --config resources_servers/<env>/configs/<env>.yaml
+```
+
+**When**: `config_paths` is set to a scalar or mapping instead of a list of path strings.
+
+**Fix**: Make it a YAML list, or pass each config with `--config`:
+
+```yaml
+config_paths:
+  - resources_servers/weather/config.yaml
+```
+
+### No Server Instances Configured
+
+```
+nemo_gym.config_types.NoServerInstancesError: No server instances are configured, so there is nothing to run. Pass one or more configs, e.g.:
+  gym env start --config resources_servers/<env>/configs/<env>.yaml --config responses_api_models/<model>/configs/<model>.yaml
+```
+
+**When**: The merged config defines no server instances — usually a wrong/empty `--config` or a
+`config_paths` list that resolved to nothing.
+
+**Fix**: Pass the configs that define your servers, or check the `config_paths` entries actually load.
+
+<Tip>
+Catch all of the above before launch with [`gym env validate`](/reference/cli-commands#gym-env-validate) — it
+merges and validates the same config `gym env start` uses, then exits with a clean message (no traceback)
+and a non-zero status on failure, so it fits in CI pre-flight checks.
+
+</Tip>
+
 ---
 
 ## Validation Errors
@@ -86,7 +138,7 @@ Configuration Warnings: Almost-Servers Detected
 
 - ResourcesServerInstanceConfig -> resources_servers -> example_server -> domain: 
   Input should be 'math', 'coding', 'agent', 'knowledge', 'instruction_following', 
-  'long_context', 'safety', 'games', 'e2e' or 'other'
+  'long_context', 'safety', 'games', 'translation', 'e2e', 'rlhf' or 'other'
 ```
 
 **When**: Config has the right structure (server type, entrypoint) but contains invalid field values.