diff --git a/docs.json b/docs.json index 6f1b72afe5..53cb9b7968 100644 --- a/docs.json +++ b/docs.json @@ -1261,7 +1261,10 @@ "support/models/tags/administrator", "support/models/tags/alerts", "support/models/tags/anonymous", + "support/models/tags/api", + "support/models/tags/api-keys", "support/models/tags/artifacts", + "support/models/tags/authentication", "support/models/tags/aws", "support/models/tags/billing", "support/models/tags/charts", @@ -1273,6 +1276,7 @@ "support/models/tags/logs", "support/models/tags/metrics", "support/models/tags/notebooks", + "support/models/tags/org-management", "support/models/tags/outage", "support/models/tags/privacy", "support/models/tags/projects", @@ -1281,13 +1285,16 @@ "support/models/tags/resuming", "support/models/tags/run-crashes", "support/models/tags/runs", + "support/models/tags/sdk", "support/models/tags/security", "support/models/tags/storage", "support/models/tags/sweeps", "support/models/tags/tables", "support/models/tags/team-management", + "support/models/tags/teams", "support/models/tags/tensorboard", "support/models/tags/user-management", + "support/models/tags/workspace", "support/models/tags/workspaces", "support/models/tags/wysiwyg" ] @@ -1393,6 +1400,80 @@ } ], "tab": "W&B Launch" + }, + { + "tab": "Support: W&B Models", + "hidden": true, + "pages": [ + "support/models", + "support/models/tags/academic", + "support/models/tags/administrator", + "support/models/tags/alerts", + "support/models/tags/anonymous", + "support/models/tags/api", + "support/models/tags/api-keys", + "support/models/tags/artifacts", + "support/models/tags/authentication", + "support/models/tags/aws", + "support/models/tags/billing", + "support/models/tags/charts", + "support/models/tags/connectivity", + "support/models/tags/environment-variables", + "support/models/tags/experiments", + "support/models/tags/hyperparameter", + "support/models/tags/inference", + "support/models/tags/logs", + "support/models/tags/metrics", + 
"support/models/tags/notebooks", + "support/models/tags/org-management", + "support/models/tags/outage", + "support/models/tags/privacy", + "support/models/tags/projects", + "support/models/tags/python", + "support/models/tags/reports", + "support/models/tags/resuming", + "support/models/tags/run-crashes", + "support/models/tags/runs", + "support/models/tags/sdk", + "support/models/tags/security", + "support/models/tags/storage", + "support/models/tags/sweeps", + "support/models/tags/tables", + "support/models/tags/team-management", + "support/models/tags/teams", + "support/models/tags/tensorboard", + "support/models/tags/user-management", + "support/models/tags/workspace", + "support/models/tags/workspaces", + "support/models/tags/wysiwyg" + ] + }, + { + "tab": "Support: W&B Weave", + "hidden": true, + "pages": [ + "support/weave", + "support/weave/tags/client-info", + "support/weave/tags/code-capture", + "support/weave/tags/data-capture", + "support/weave/tags/evaluation", + "support/weave/tags/performance", + "support/weave/tags/system-info", + "support/weave/tags/trace-data", + "support/weave/tags/ui-rendering" + ] + }, + { + "tab": "Support: W&B Inference", + "hidden": true, + "pages": [ + "support/inference", + "support/inference/tags/administrator", + "support/inference/tags/authentication-access", + "support/inference/tags/billing", + "support/inference/tags/quotas-rate-limits", + "support/inference/tags/server-errors" + ] } ] }, @@ -5725,6 +5806,10 @@ "destination": "/support/models/articles/how-do-i-change-my-billing-address", "source": "/models/support/change_billing_address" }, + { + "destination": "/support/models/articles/how-do-i-connect-to-wandb-self-managed", + "source": "/support/models/articles/how-do-i-connect-to-a-self-hosted-wb-server" + }, { "destination": "/support/models/articles/how-can-i-change-the-directory-my-sweep-", "source": "/models/support/change_directory_sweep_logs_locally" @@ -5922,9 +6007,13 @@ "source": 
"/models/support/how_can_i_disable_logging_of_system_metrics_to_wb" }, { - "destination": "/support/models/articles/how-can-i-log-in-to-wb-server", + "destination": "/support/models/articles/how-do-i-log-in-to-wandb-self-managed", "source": "/models/support/how_can_i_log_in_to_wb_server" }, + { + "destination": "/support/models/articles/how-do-i-log-in-to-wandb-self-managed", + "source": "/support/models/articles/how-can-i-log-in-to-wb-server" + }, { "destination": "/support/models/articles/how-can-i-overwrite-the-logs-from-previo", "source": "/models/support/how_can_i_overwrite_the_logs_from_previous_steps" diff --git a/models/app/console-logs.mdx b/models/app/console-logs.mdx index 5a6e822f4d..ef1eaf44d6 100644 --- a/models/app/console-logs.mdx +++ b/models/app/console-logs.mdx @@ -36,24 +36,53 @@ W&B captures three types of console logs and adds a prefix to indicate each log' ## Console log settings -To control which types of console output W&B captures and displays, pass a `wandb.Settings` object to `wandb.init()` when you initialize a run. The relevant parameters are `show_errors`, `show_warnings`, `show_info`, and `silent`. For details on each parameter and its default value, see the [`wandb.Settings` reference](/models/ref/python/experiments/settings). +To control which types of console output W&B captures and displays, pass a [`wandb.Settings`](/models/ref/python/experiments/settings) object to `wandb.init()` when you initialize a run. The relevant parameters include `show_errors`, `show_warnings`, `show_info`, and `silent`. The [Settings reference](/models/ref/python/experiments/settings) lists every parameter and is generated from the SDK, so it stays current as new options are added. -The following example shows how to configure these settings: +The example below toggles whether informational messages, warnings, and errors from W&B appear in the **Logs** tab. 
Set `silent=True` to suppress all W&B console output (useful when you want a quiet training script): ```python import wandb settings = wandb.Settings( - show_errors=True, # Show error messages in the W&B App - silent=False, # Disable all W&B console output - show_warnings=True # Show warning messages in the W&B App + show_errors=True, + silent=False, + show_warnings=True, ) with wandb.init(settings=settings) as run: - # Your training code here run.log({"accuracy": 0.95}) ``` +For stdout and stderr capture (`console`), multipart uploads (`console_multipart`, `console_chunk_max_bytes`, `console_chunk_max_seconds`), and troubleshooting, see the sections below and the Settings reference. Distributed training and other edge cases are covered in [Why is console output not captured for my run?](/support/models/articles/why-is-console-output-not-captured-for-my-run). + +### Multipart console logging + +By default, W&B stores your script's stdout and stderr as a single `output.log` file and uploads it *when the run finishes*. While a run is active, the **Logs** tab streams output for viewing, but `output.log` does not appear on the **Files** tab until the run completes. + +Enable multipart console logging when you need downloadable logs while a run is still active, when a run may crash before it finishes, or when you resume a run and want to preserve log output from earlier sessions. Set `console_multipart=True` (SDK v0.22.3 or later) so the SDK writes timestamped chunks under `logs/` and uploads each chunk when it closes. Use `console_chunk_max_bytes` and `console_chunk_max_seconds` to control rollover; see the [Settings reference](/models/ref/python/experiments/settings) for defaults and behavior when both are `0`. + + +Uploaded chunks are immutable. Terminal control sequences that modify previous lines (for example, progress bars that use carriage returns) only affect the current chunk. 
+ + +```python +import wandb + +with wandb.init( + project="my-project", + settings=wandb.Settings( + console_multipart=True, + console_chunk_max_bytes=1_000_000, # rotate at ~1 MB + console_chunk_max_seconds=60, # or every 60 seconds, whichever comes first + ), +) as run: + print("Logs upload in chunks while this run is active.") +``` + +You must set `console_multipart` at `wandb.init` time. Upload cadence cannot be changed after a run has started. + +For troubleshooting (console capture disabled, distributed training, resumed runs, and display limits), see [Why is console output not captured for my run?](/support/models/articles/why-is-console-output-not-captured-for-my-run) and [How do I download the console log file from a run?](/support/models/articles/how-do-i-download-the-console-log-file-from-a-run). + ## Custom logging If you already have your own logging setup, you can continue to use it alongside W&B. W&B captures console logs from your application, but it doesn't interfere with your own logging setup. You can use Python's built-in `print()` function or the `logging` module to log messages. diff --git a/models/integrations/pytorch.mdx b/models/integrations/pytorch.mdx index bd5718c7e5..a89de9edf1 100644 --- a/models/integrations/pytorch.mdx +++ b/models/integrations/pytorch.mdx @@ -288,7 +288,7 @@ Two `wandb` functions come into play here: `watch` and `log`. `run.watch()` will log the gradients and the parameters of your model, every `log_freq` steps of training. -All you need to do is call it before you start training. +Call `run.watch()` before you start training. For log modes, multiple models, and performance tips, see [How do I log gradients and model weights with wandb.watch()?](/support/models/articles/how-do-i-log-gradients-and-model-weights-with-wandb-watch). 
The rest of the training code remains the same: we iterate over epochs and batches, diff --git a/models/ref/cli/wandb-login.mdx b/models/ref/cli/wandb-login.mdx index 8beb0b3468..73fd5a3ef9 100644 --- a/models/ref/cli/wandb-login.mdx +++ b/models/ref/cli/wandb-login.mdx @@ -73,7 +73,13 @@ wandb login --host https://my-wandb-server.example.com wandb login --relogin ``` +## API key formats +W&B issues API keys in the `wandb_v1_` format (about 92 characters). Older SDK versions expect exactly 40 characters and reject the new format with `API key must be 40 characters long`. Upgrade the SDK (`pip install -U wandb`) and run `wandb login --relogin`. + +Keys generated at [wandb.ai/authorize](https://wandb.ai/authorize) are shown only once at creation. Copy them immediately; you cannot retrieve them again after closing the dialog. + +For pinned SDK versions, legacy keys, and related errors, see [Why does my API key fail with 'must be 40 characters long'?](/support/models/articles/why-does-my-api-key-fail-with-must-be-40-characters). ## Arguments diff --git a/models/sweeps/signal-handling-sweep-runs.mdx b/models/sweeps/signal-handling-sweep-runs.mdx index ff830025f1..f97dd6b079 100644 --- a/models/sweeps/signal-handling-sweep-runs.mdx +++ b/models/sweeps/signal-handling-sweep-runs.mdx @@ -97,6 +97,8 @@ On preemption, the **training process** must receive the signal, mark the run as **Multi-agent coordination:** When many agents run at once (such as SLURM array jobs), they can race to claim the same preempted run. This is a known limitation. Stagger agent startup or use external coordination mechanisms like locks to help work around this potential issue. +For multi-GPU SLURM jobs where only one process should call `wandb.agent()`, see [How do I run sweeps with distributed training on SLURM?](/support/models/articles/how-do-i-run-sweeps-with-distributed-training-on-slurm). + ## `wandb sweep --cancel` You cancel a sweep using the W&B API, not an OS signal. 
Run a command like `wandb sweep --cancel entity/project/sweep_ID`. The server tells the agent to exit, and the agent then terminates running child processes and stops. There can be a short delay (on the order of the agent's API polling interval) before cancellation takes effect. diff --git a/scripts/knowledgebase-nav/config.yaml b/scripts/knowledgebase-nav/config.yaml index a75af25ad5..96cdade6b4 100644 --- a/scripts/knowledgebase-nav/config.yaml +++ b/scripts/knowledgebase-nav/config.yaml @@ -39,7 +39,10 @@ products: - Administrator - Alerts - Anonymous + - API + - API Keys - Artifacts + - Authentication - AWS - Billing - Charts @@ -51,6 +54,7 @@ products: - Logs - Metrics - Notebooks + - Org Management - Outage - Privacy - Projects @@ -59,13 +63,16 @@ products: - Resuming - Run Crashes - Runs + - SDK - Security - Storage - Sweeps - Tables - Team Management + - Teams - Tensorboard - User Management + - Workspace - Workspaces - Wysiwyg diff --git a/snippets/kb_article_map.mdx b/snippets/kb_article_map.mdx index 893087f94d..083f0ded54 100644 --- a/snippets/kb_article_map.mdx +++ b/snippets/kb_article_map.mdx @@ -516,7 +516,7 @@ export const kbArticleMap = [ "tags": [ "user management" ], - "title": "How can I log in to W&B Server?" + "title": "How do I log in to W&B Self-Managed?" }, { "page": "/models/support/how_can_i_overwrite_the_logs_from_previous_steps", diff --git a/support.mdx b/support.mdx index 545aa89e74..f5a3c99d9a 100644 --- a/support.mdx +++ b/support.mdx @@ -47,17 +47,17 @@ and the W&B community. 
{/* AUTO-GENERATED: counts */} - 190 articles · 33 tags + 210 articles · 40 tags {/* END AUTO-GENERATED: counts */} {/* AUTO-GENERATED: counts */} - 16 articles · 8 tags + 17 articles · 8 tags {/* END AUTO-GENERATED: counts */} {/* AUTO-GENERATED: counts */} - 7 articles · 5 tags + 9 articles · 5 tags {/* END AUTO-GENERATED: counts */} diff --git a/support/inference.mdx b/support/inference.mdx index 77107a8e38..c7446e08b5 100644 --- a/support/inference.mdx +++ b/support/inference.mdx @@ -19,5 +19,5 @@ template: "scripts/knowledgebase-nav/templates/support_product_index.mdx.j2" 2 articles - 2 articles + 4 articles diff --git a/support/inference/articles/api-error-code-404-model-not-found.mdx b/support/inference/articles/api-error-code-404-model-not-found.mdx new file mode 100644 index 0000000000..3af6a03ee4 --- /dev/null +++ b/support/inference/articles/api-error-code-404-model-not-found.mdx @@ -0,0 +1,33 @@ +--- +title: "API error code 404 - Model not found" +keywords: ["Server Errors"] +--- + +A 404 error with the message "Model not found" means the model identifier in your request does not match any model available through the W&B Inference API. + +## Why this happens + +- **Incorrect model ID**: The model name or path in your request is misspelled or formatted incorrectly. +- **Model is not available on W&B Inference**: Not all models hosted on other providers are available through the W&B Inference API. The model you are requesting may not have been onboarded. +- **Using the wrong base URL**: If you are pointing at a different provider's endpoint but using a W&B API key, the model ID will not resolve correctly. +- **Model was removed**: A model that was previously available may have been deprecated or removed. + +## What you can do + +1. **Check the model ID** + - Verify the exact model identifier against the [W&B Inference supported models list](/inference/supported-models). 
+ - Model IDs are case-sensitive and typically use the format `provider/model-name` (for example, `meta-llama/Llama-3.3-70B-Instruct`). + +2. **Confirm the base URL** + - The W&B Inference base URL is `https://api.inference.wandb.ai/v1`. Ensure your client is pointed at this URL and not another provider's endpoint. + +3. **Request access to a new model** + - If the model you need is not currently supported, contact [W&B support](mailto:support@wandb.com) to request that it be added. + +For more information, see [Serverless Inference](/inference) and the [list models API reference](/inference/api-reference/list-models). + +--- + +{/* AUTO-GENERATED: tab badges */} +[Server Errors](/support/inference/tags/server-errors) +{/* END AUTO-GENERATED: tab badges */} diff --git a/support/inference/articles/api-error-code-422-invalid-request-parameters.mdx b/support/inference/articles/api-error-code-422-invalid-request-parameters.mdx new file mode 100644 index 0000000000..a058a0f45f --- /dev/null +++ b/support/inference/articles/api-error-code-422-invalid-request-parameters.mdx @@ -0,0 +1,35 @@ +--- +title: "API error code 422 - Invalid request parameters" +keywords: ["Server Errors"] +--- + +A 422 error means the request was understood by the server but could not be processed because one or more parameters are invalid, missing, or out of range. + +## Why this happens + +- **Unsupported parameter for the model**: Some parameters (such as `frequency_penalty`, `logprobs`, or `response_format`) are not supported by all models. Passing an unsupported parameter returns a 422. +- **Parameter value out of range**: Values like `temperature` must fall within a valid range (typically 0–2). Passing a value outside that range causes a 422. +- **Malformed messages array**: The `messages` field must be a list of objects with `role` and `content` keys. Missing or incorrectly typed fields return a 422. 
+- **Invalid `response_format`**: Requesting JSON mode (`response_format: {"type": "json_object"}`) with a model that does not support structured outputs causes a 422. + +## What you can do + +1. **Check the error message body** + - The 422 response body contains a `detail` field that identifies which parameter is invalid and why. Read it before troubleshooting. + +2. **Verify parameter support for your model** + - Review the [W&B Inference supported models list](/inference/supported-models) for the specific parameters and ranges each model accepts. + +3. **Validate your messages array** + - Each message must have a `role` (`system`, `user`, or `assistant`) and a `content` string. Tool call messages require additional fields—consult the API reference for the correct schema. + +4. **Remove unsupported parameters** + - If you are adapting code from another provider, remove any parameters that are not in the W&B Inference API reference. Extra parameters that the model does not support will trigger a 422. + +For more information, see [Serverless Inference](/inference) and the [chat completions API reference](/inference/api-reference/chat-completions). + +--- + +{/* AUTO-GENERATED: tab badges */} +[Server Errors](/support/inference/tags/server-errors) +{/* END AUTO-GENERATED: tab badges */} diff --git a/support/inference/tags/server-errors.mdx b/support/inference/tags/server-errors.mdx index 2c758a2dd3..f820da9f3f 100644 --- a/support/inference/tags/server-errors.mdx +++ b/support/inference/tags/server-errors.mdx @@ -1,10 +1,16 @@ --- title: "Server Errors" -tag: "2" +tag: "4" generator: "knowledgebase-nav" template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" --- + + A 404 error with the message "Model not found" means the model identifier in your request does not match any model avail ... + + + A 422 error means the request was understood by the server but could not be processed because one or more parameters are ... 
+ A 500 error with the message The server had an error while processing your request indicates an internal server error in ... diff --git a/support/models.mdx b/support/models.mdx index b6be37d00d..63c8298eb9 100644 --- a/support/models.mdx +++ b/support/models.mdx @@ -28,6 +28,12 @@ template: "scripts/knowledgebase-nav/templates/support_product_index.mdx.j2" ## Browse by category + + 2 articles + + + 1 article + 2 articles @@ -35,16 +41,19 @@ template: "scripts/knowledgebase-nav/templates/support_product_index.mdx.j2" 2 articles - 25 articles + 26 articles - 2 articles + 3 articles 2 articles - 14 articles + 15 articles + + + 1 article 4 articles @@ -53,13 +62,13 @@ template: "scripts/knowledgebase-nav/templates/support_product_index.mdx.j2" 4 articles - 4 articles + 5 articles - 12 articles + 13 articles - 39 articles + 51 articles 3 articles @@ -68,14 +77,17 @@ template: "scripts/knowledgebase-nav/templates/support_product_index.mdx.j2" 5 articles - 7 articles + 10 articles - 19 articles + 23 articles 3 articles + + 2 articles + 3 articles @@ -92,13 +104,16 @@ template: "scripts/knowledgebase-nav/templates/support_product_index.mdx.j2" 15 articles - 2 articles + 3 articles - 10 articles + 11 articles - 19 articles + 31 articles + + + 1 article 7 articles @@ -107,19 +122,25 @@ template: "scripts/knowledgebase-nav/templates/support_product_index.mdx.j2" 2 articles - 19 articles + 23 articles 2 articles - 12 articles + 13 articles + + + 3 articles 2 articles - 13 articles + 14 articles + + + 2 articles 7 articles diff --git a/support/models/articles/can-i-resume-a-run-inside-a-sweep.mdx b/support/models/articles/can-i-resume-a-run-inside-a-sweep.mdx new file mode 100644 index 0000000000..5e624f509a --- /dev/null +++ b/support/models/articles/can-i-resume-a-run-inside-a-sweep.mdx @@ -0,0 +1,30 @@ +--- +title: "Can I resume a run inside a sweep?" +keywords: ["Sweeps", "Resuming"] +--- + +Run resumption is not supported inside a W&B sweep. 
If you pass a `run_id` or use `wandb.init(resume=...)` while a sweep agent is running, W&B ignores the run ID and starts a fresh run instead. You will see the following warning: + +``` +wandb: WARNING Ignoring run_id when running a sweep +``` + +This is expected behavior, not a bug. Sweep agents are designed to launch independent runs for each hyperparameter configuration. Resuming a specific run would conflict with the sweep controller's job scheduling. + +**Workarounds** + +If you need fault tolerance for long sweep runs, consider these approaches: + +- **Checkpoint and reload within a single run**: Save model checkpoints at regular intervals inside your training function. On restart, load the latest checkpoint at the beginning of `train()`. The sweep starts a new run, but training picks up from the saved state. + +- **Use `--count 1` on SLURM with requeue**: Submit each sweep agent job with `wandb agent --count 1 SWEEP_ID`. If the job is preempted, SLURM can requeue it and the sweep controller will assign a new configuration. + +- **Mark a run as failed and requeue manually**: If a run crashes mid-way, the sweep controller will eventually mark it as failed and may assign the same configuration to a new agent depending on your sweep settings. + +If you need to continue an interrupted training job outside of a sweep, use `wandb.init(resume="allow", id="YOUR_RUN_ID")` in a standalone script instead. See [Resume runs](/models/runs/resuming) and [Troubleshoot sweeps](/models/sweeps/troubleshoot-sweeps). 
+ +--- + +{/* AUTO-GENERATED: tab badges */} +[Sweeps](/support/models/tags/sweeps)[Resuming](/support/models/tags/resuming) +{/* END AUTO-GENERATED: tab badges */} diff --git a/support/models/articles/how-can-i-log-in-to-wb-server.mdx b/support/models/articles/how-can-i-log-in-to-wb-server.mdx deleted file mode 100644 index 5075160ba0..0000000000 --- a/support/models/articles/how-can-i-log-in-to-wb-server.mdx +++ /dev/null @@ -1,15 +0,0 @@ ---- -title: "How can I log in to W&B Server?" -keywords: ["User Management"] ---- - -Set the login URL by either of these methods: - -- Set the [environment variable](/models/track/environment-variables) `WANDB_BASE_URL` to the Server URL. -- Set the `--host` flag of [`wandb login`](/models/ref/cli/wandb-login) to the Server URL. - ---- - -{/* AUTO-GENERATED: tab badges */} -[User Management](/support/models/tags/user-management) -{/* END AUTO-GENERATED: tab badges */} \ No newline at end of file diff --git a/support/models/articles/how-do-i-connect-to-wandb-self-managed.mdx b/support/models/articles/how-do-i-connect-to-wandb-self-managed.mdx new file mode 100644 index 0000000000..b30182e3b8 --- /dev/null +++ b/support/models/articles/how-do-i-connect-to-wandb-self-managed.mdx @@ -0,0 +1,43 @@ +--- +title: "How do I connect to W&B Self-Managed?" +keywords: ["Administrator", "User Management", "Environment Variables"] +--- + +[W&B Self-Managed](/platform/hosting/hosting-options/self-managed) is a self-hosted deployment that runs in your infrastructure. Point the SDK and CLI at your instance URL instead of the [Multi-tenant Cloud](/platform/hosting/hosting-options/multi_tenant_cloud) API (`api.wandb.ai`). For login commands, `WANDB_BASE_URL`, and `--host`, see [`wandb login`](/models/ref/cli/wandb-login) and [Environment variables](/models/track/environment-variables). + +## Verify the connection + +```bash +wandb status +``` + +Confirm the host matches your instance URL and the API key is present (masked). 
+ +## SSL / certificate issues + +If your instance uses a self-signed or internal CA certificate, set the CA bundle before initializing W&B: + +```bash +export REQUESTS_CA_BUNDLE=/path/to/ca-bundle.crt +``` + +## Switching between Multi-tenant Cloud and Self-Managed + +To use Multi-tenant Cloud again, unset `WANDB_BASE_URL` and run `wandb login` without `--host`: + +```bash +unset WANDB_BASE_URL +wandb login +``` + +## Common issues + +- **401 on `wandb.init()`**: API keys are not shared between Multi-tenant Cloud and a Self-Managed instance. Create a key on your Self-Managed instance profile. +- **Connection refused / timeout**: Check the instance URL and VPN or network access to your deployment. +- **License not recognized**: See [Why is my enterprise license not recognized?](/support/models/articles/enterprise-license-not-recognized). + +--- + +{/* AUTO-GENERATED: tab badges */} +[Administrator](/support/models/tags/administrator)[User Management](/support/models/tags/user-management)[Environment Variables](/support/models/tags/environment-variables) +{/* END AUTO-GENERATED: tab badges */} diff --git a/support/models/articles/how-do-i-create-a-new-team-in-wandb.mdx b/support/models/articles/how-do-i-create-a-new-team-in-wandb.mdx new file mode 100644 index 0000000000..5600173cae --- /dev/null +++ b/support/models/articles/how-do-i-create-a-new-team-in-wandb.mdx @@ -0,0 +1,47 @@ +--- +title: "How do I create a new team in W&B?" +keywords: ["Teams", "Org Management"] +--- + +Teams are the primary unit of collaboration in W&B. A team has its own entity name, shared projects, and member roster. Anyone with a W&B account can create a team. + +**Creating a team** + +1. Log in to [wandb.ai](https://wandb.ai). +2. Click the **+** icon or **Create new team** in the left sidebar (or navigate to [wandb.ai/create-team](https://wandb.ai/create-team)). +3. Enter a **team name**. This becomes the team's entity name (e.g., `wandb.ai/my-team`). 
Team names must be globally unique, lowercase, and can contain letters, numbers, and hyphens. +4. Click **Create team**. + +You are automatically made an admin of the new team. + +**Inviting members after creation** + +After creating the team, go to **Team settings > Members > Invite members** to add colleagues. See [How do I invite a user to my W&B team?](/support/models/articles/how-do-i-invite-a-user-to-my-wb-team) for full details. + +**Choosing a team name** + +The team name becomes part of every project and run URL under that team, so choose a name that will remain meaningful long-term. Team names cannot be changed after creation without contacting W&B support. + +**Personal accounts vs. team accounts** + +Your personal W&B account has its own entity (your username) that is separate from any teams you belong to. Runs logged to your personal entity are not visible to your team unless you explicitly log them to the team entity: + +```python +import wandb + +# Log to a team project (visible to all team members) +wandb.init(entity="my-team", project="shared-project") + +# Log to personal entity (only visible to you) +wandb.init(entity="my-username", project="personal-experiments") +``` + +**Team limits** + +On the free plan, teams support a limited number of seats and storage. Check your plan details at **Team settings > Billing** or [wandb.ai/site/pricing](https://wandb.ai/site/pricing). + +--- + +{/* AUTO-GENERATED: tab badges */} +[Teams](/support/models/tags/teams)[Org Management](/support/models/tags/org-management) +{/* END AUTO-GENERATED: tab badges */} diff --git a/support/models/articles/how-do-i-delete-a-team-from-my-account.mdx b/support/models/articles/how-do-i-delete-a-team-from-my-account.mdx index 5d075e6537..0e41c067ea 100644 --- a/support/models/articles/how-do-i-delete-a-team-from-my-account.mdx +++ b/support/models/articles/how-do-i-delete-a-team-from-my-account.mdx @@ -3,13 +3,33 @@ title: "How do I delete a team from my account?" 
keywords: ["Administrator", "Team Management"] --- -To delete a team from an account: +## Delete a team -- Access team settings as an admin. -- Click the **Delete** button at the bottom of the page. +1. Log in as a team admin. +2. Open **Team settings** from the team page in the W&B UI. +3. Scroll to the bottom of the settings page and click **Delete team**. +4. Confirm the deletion when prompted. + + +If you delete your *only* team, your account is locked until [support@wandb.com](mailto:support@wandb.com) restores access. You may also see a "rate limit exceeded" error when you log in. W&B accounts must belong to at least one team to create projects and run experiments, including the personal team created at signup. + + +See [I deleted my team and now I can't create a new one — what do I do?](/support/models/articles/i-deleted-my-team-and-now-i-cant-create-a-new-one) for recovery steps. + +## If you want a different team name + +Team names cannot be changed after creation. To use a new name without locking your account: + +1. [Create a new team](/support/models/articles/how-do-i-create-a-new-team-in-wandb) with the name you want. +2. Move projects and runs to the new team as needed. +3. Delete the old team *only if* you still belong to at least one other team (including your personal team). + +If the old team was your only team, contact [support@wandb.com](mailto:support@wandb.com) before deleting so support can help you transition without losing access. + +On [Dedicated Cloud](/platform/hosting/hosting-options/dedicated-cloud) or [Self-Managed](/platform/hosting/hosting-options/self-managed), contact your account team or support for rename requests before you delete a team. 
--- {/* AUTO-GENERATED: tab badges */} [Administrator](/support/models/tags/administrator)[Team Management](/support/models/tags/team-management) -{/* END AUTO-GENERATED: tab badges */} \ No newline at end of file +{/* END AUTO-GENERATED: tab badges */} diff --git a/support/models/articles/how-do-i-download-the-console-log-file-from-a-run.mdx b/support/models/articles/how-do-i-download-the-console-log-file-from-a-run.mdx new file mode 100644 index 0000000000..c67eeca2ef --- /dev/null +++ b/support/models/articles/how-do-i-download-the-console-log-file-from-a-run.mdx @@ -0,0 +1,48 @@ +--- +title: "How do I download the console log file from a run?" +keywords: ["Logs", "Runs"] +--- + +W&B stores your script's stdout and stderr as `output.log` (or multipart chunks under `logs/`). Where to retrieve it depends on whether the run finished, is still active, or crashed. For capture settings and `console_multipart`, see [Console logs](/models/app/console-logs). + +## From the UI + +1. Open the run page. +2. Click the **Files** tab. +3. Find `output.log` (or files under `logs/`) and click the download icon. + +By default, `output.log` uploads *when the run finishes*. It may not appear on **Files** while the run is still active unless you enabled multipart logging at init time. + +## Programmatically via the API + +```python +import wandb + +api = wandb.Api() +run = api.run("entity/project/run_id") +run.file("output.log").download(replace=True) +``` + +For multipart runs, list and download individual files under `logs/` the same way. + +## While the run is still running + +Enable [multipart console logging](/models/app/console-logs) at `wandb.init` time (SDK v0.22.3+). Chunks upload under `logs/` as they close so you can download them before the run finishes. Upload cadence cannot be changed after the run starts. + +## If the run crashed + +Without multipart logging, a crashed run may have no `output.log` on the server. 
If multipart logging was enabled, chunks uploaded before the crash remain downloadable. Check the local copy at `wandb/run-TIMESTAMP-RUN_ID/logs/output.log` if server-side files are missing. + +## Resuming a run + +Without multipart logging, `wandb.init(resume="allow", id=...)` can overwrite a single `output.log`. With SDK v0.20.1+ and `console_multipart=True`, each session keeps its own chunks. See [Console logs](/models/app/console-logs). + +## More lines than the Logs tab shows + +The **Logs** tab shows at most 10,000 lines at a time for performance. Download from **Files** or the API for the full log. For capture issues, see [Why is console output not captured for my run?](/support/models/articles/why-is-console-output-not-captured-for-my-run). + +--- + +{/* AUTO-GENERATED: tab badges */} +[Logs](/support/models/tags/logs)[Runs](/support/models/tags/runs) +{/* END AUTO-GENERATED: tab badges */} diff --git a/support/models/articles/how-do-i-invite-a-user-to-my-wb-team.mdx b/support/models/articles/how-do-i-invite-a-user-to-my-wb-team.mdx new file mode 100644 index 0000000000..4315568be6 --- /dev/null +++ b/support/models/articles/how-do-i-invite-a-user-to-my-wb-team.mdx @@ -0,0 +1,35 @@ +--- +title: "How do I invite a user to my W&B team?" +keywords: ["Teams", "User Management"] +--- + +Only team admins can send invitations. The invited person receives an email and must accept before they appear as an active team member. + +**Inviting by email** + +1. Navigate to your team's page at `wandb.ai/TEAM_NAME`. +2. Click **Team settings** (gear icon or Settings in the sidebar). +3. Select the **Members** tab. +4. Click **Invite members** and enter the invitee's email address. +5. Choose the role they should have: **Member** or **Admin**. +6. Click **Send invite**. + +The invitee receives an email with an acceptance link. Once they accept, they appear in the Members list and can access the team's projects. 
+ +**Inviting someone who already has a W&B account** + +If the person already has a W&B account under a different email, invite using the email address associated with their existing account. They can also accept by logging in to W&B and navigating to the team URL directly — pending invitations appear as a banner prompt. + +**What happens if the invitation email isn't received** + +Ask the invitee to check their spam folder for an email from `no-reply@wandb.ai`. If the email still isn't found, a team admin can cancel the pending invitation and re-send it: in **Team settings > Members**, look for the **Pending** section and click **Resend** next to the invitee's address. + +**Inviting to a specific project only** + +W&B teams do not have project-level membership. Access is controlled at the team level — all team members can see all non-private projects in the team. To restrict access, set a project's visibility to **Private** (only team admins and explicitly added members can view it) via **Project settings > Privacy**. + +--- + +{/* AUTO-GENERATED: tab badges */} +[Teams](/support/models/tags/teams)[User Management](/support/models/tags/user-management) +{/* END AUTO-GENERATED: tab badges */} diff --git a/support/models/articles/how-do-i-log-gradients-and-model-weights-with-wandb-watch.mdx b/support/models/articles/how-do-i-log-gradients-and-model-weights-with-wandb-watch.mdx new file mode 100644 index 0000000000..76fc675bf5 --- /dev/null +++ b/support/models/articles/how-do-i-log-gradients-and-model-weights-with-wandb-watch.mdx @@ -0,0 +1,84 @@ +--- +title: "How do I log gradients and model weights with wandb.watch()?" +keywords: ["Experiments", "Metrics", "Runs"] +--- + +`wandb.watch()` hooks into a PyTorch model's parameters and gradients and logs histograms of their values at regular intervals. This is useful for diagnosing training instability, vanishing gradients, and dead neurons. 
+ +**Basic usage** + +Call `wandb.watch()` after `wandb.init()` and before the first training step: + +```python +import wandb +import torch.nn as nn + +wandb.init(project="my-project") + +model = MyModel() +wandb.watch(model, log="gradients", log_freq=100) + +for step, batch in enumerate(dataloader): + loss = train_step(model, batch) + loss.backward() + optimizer.step() + optimizer.zero_grad() + + wandb.log({"train/loss": loss.item()}, step=step) + +wandb.finish() +``` + +Gradient histograms are logged every `log_freq` steps. They appear in the **Charts** tab under keys like `gradients/layer_name.weight`. + +**`log` parameter options** + +| Value | What is logged | +|---|---| +| `"gradients"` | Gradient histograms only (default) | +| `"parameters"` | Weight/parameter histograms only | +| `"all"` | Both gradients and parameters | +| `None` | Neither — only logs model graph topology | + +```python +wandb.watch(model, log="all", log_freq=50) +``` + +**Logging model graph topology** + +`wandb.watch()` also captures the model's computational graph and logs it as a summary. View the graph in the run's **Overview** tab under **Model**. This works even with `log=None`: + +```python +wandb.watch(model, log=None) # graph only, no histograms +``` + +**Watching multiple models** + +Call `wandb.watch()` separately for each model (useful in GAN training): + +```python +wandb.watch(generator, log="gradients", log_freq=100) +wandb.watch(discriminator, log="gradients", log_freq=100) +``` + +Each model's gradients are logged with its parameter names as prefixes. + +**Performance considerations** + +Gradient logging overhead grows as `log_freq` gets smaller, because histograms are computed more often. Logging every step (`log_freq=1`) can significantly slow training. A value between 50 and 200 is typical for most training runs. If performance is critical, set `log="parameters"` rather than `log="gradients"` — parameter histograms are computed without a backward pass hook and are cheaper.
+ +**Stopping the watch** + +To stop logging gradients mid-training: + +```python +wandb.unwatch(model) +``` + +This removes the hooks without ending the run, so metric logging continues unaffected. + +--- + +{/* AUTO-GENERATED: tab badges */} +[Experiments](/support/models/tags/experiments)[Metrics](/support/models/tags/metrics)[Runs](/support/models/tags/runs) +{/* END AUTO-GENERATED: tab badges */} diff --git a/support/models/articles/how-do-i-log-nlp-metrics-and-text-outputs-in-wandb.mdx b/support/models/articles/how-do-i-log-nlp-metrics-and-text-outputs-in-wandb.mdx new file mode 100644 index 0000000000..63cebecdcc --- /dev/null +++ b/support/models/articles/how-do-i-log-nlp-metrics-and-text-outputs-in-wandb.mdx @@ -0,0 +1,107 @@ +--- +title: "How do I log NLP metrics and text outputs in W&B?" +keywords: ["Experiments", "Metrics", "Runs"] +--- + +Log corpus-level NLP scores (BLEU, ROUGE, perplexity) with `wandb.log()` and per-example outputs with `wandb.Table`. For general logging patterns, see [Log objects and media](/models/track/log) and [Log tables](/models/track/log/log-tables). 
+ +## Log scalar NLP metrics + +Log BLEU, ROUGE, perplexity, and other scalar scores the same way you log loss: + +```python +import wandb +from sacrebleu.metrics import BLEU +from rouge_score import rouge_scorer + +with wandb.init(project="nmt-project") as run: + bleu = BLEU() + scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"]) + + for epoch in range(num_epochs): + train(model) + hypotheses, references = evaluate(model, val_set) + + bleu_score = bleu.corpus_score(hypotheses, [references]) + rouge_scores = [scorer.score(ref, hyp) for ref, hyp in zip(references, hypotheses)] + + run.log({ + "epoch": epoch, + "val/bleu": bleu_score.score, + "val/rouge1": sum(s["rouge1"].fmeasure for s in rouge_scores) / len(rouge_scores), + "val/rougeL": sum(s["rougeL"].fmeasure for s in rouge_scores) / len(rouge_scores), + "val/perplexity": compute_perplexity(model, val_loader), + }) +``` + +## Log text predictions as a table + +Track model outputs alongside source and reference text to spot qualitative changes across epochs: + +```python +text_table = wandb.Table(columns=["source", "reference", "hypothesis", "bleu"]) + +for src, ref, hyp in zip(sources[:50], references[:50], hypotheses[:50]): + sent_bleu = bleu.sentence_score(hyp, [ref]).score + text_table.add_data(src, ref, hyp, round(sent_bleu, 2)) + +run.log({"val/text_outputs": text_table}) +``` + +In the UI, sort the table by `bleu` ascending to surface the worst-performing examples.
+ +## Log token-level probabilities + +For language model analysis, log per-token log-probabilities as a table: + +```python +token_table = wandb.Table(columns=["token", "log_prob", "position"]) +for pos, (tok, lp) in enumerate(zip(tokens, log_probs)): + token_table.add_data(tok, float(lp), pos) + +run.log({"token_probs": token_table}) +``` + +## Track vocabulary and data statistics + +Log dataset characteristics as config values so they are searchable across runs: + +```python +with wandb.init( + project="lm-project", + config={ + "vocab_size": tokenizer.vocab_size, + "max_seq_len": 512, + "train_tokens": total_train_tokens, + "dataset": "c4-en", + }, +) as run: + ... +``` + +## Log metrics with Hugging Face evaluate + +The Hugging Face `evaluate` library returns dicts that map to `wandb.log()`. The example below uses `rouge` only (BLEU is covered above with `sacrebleu.metrics`): + +```python +import evaluate +import wandb + +rouge_metric = evaluate.load("rouge") + +with wandb.init(project="nmt-project") as run: + rouge_result = rouge_metric.compute( + predictions=hypotheses, + references=references, + ) + run.log({ + "val/rouge1": rouge_result["rouge1"], + "val/rougeL": rouge_result["rougeL"], + }) +``` + +--- + +{/* AUTO-GENERATED: tab badges */} +[Experiments](/support/models/tags/experiments)[Metrics](/support/models/tags/metrics)[Runs](/support/models/tags/runs) +{/* END AUTO-GENERATED: tab badges */} diff --git a/support/models/articles/how-do-i-paginate-through-large-api-results-in-wandb.mdx b/support/models/articles/how-do-i-paginate-through-large-api-results-in-wandb.mdx new file mode 100644 index 0000000000..1ccb11a164 --- /dev/null +++ b/support/models/articles/how-do-i-paginate-through-large-api-results-in-wandb.mdx @@ -0,0 +1,56 @@ +--- +title: "How do I paginate through large API results in W&B?" +keywords: ["Runs", "Experiments", "API", "Artifacts"] +--- + +The W&B Public API loads runs and artifacts in pages. 
For the standard lazy-iterator pattern and `per_page`, see the [Public API guide](/models/track/public-api-guide). This article covers checkpointing and bulk-download tips not spelled out in the main guide. + +## Checkpoint processed run IDs + +For very large projects, record IDs you have already handled and skip them on restart: + +```python +import json +import pathlib +import wandb + +checkpoint_file = pathlib.Path("processed_ids.json") +processed = set(json.loads(checkpoint_file.read_text())) if checkpoint_file.exists() else set() + +api = wandb.Api() +for run in api.runs("my-entity/my-project"): + if run.id in processed: + continue + process(run) + processed.add(run.id) + checkpoint_file.write_text(json.dumps(list(processed))) +``` + +Avoid `list(api.runs(...))` on huge projects unless you need random access — it forces every page into memory. + +## Artifacts + +The same iterator pattern applies to `api.artifacts()`: + +```python +for artifact in api.artifacts(type_name="model", project="my-entity/my-project", per_page=100): + print(artifact.name, artifact.version) +``` + +## Rate limits on bulk downloads + +If each run triggers extra API calls (for example `run.file("output.log").download()`), add a short delay to avoid `429` errors: + +```python +import time + +for run in api.runs("my-entity/my-project"): + run.file("output.log").download() + time.sleep(0.1) +``` + +--- + +{/* AUTO-GENERATED: tab badges */} +[Runs](/support/models/tags/runs)[Experiments](/support/models/tags/experiments)[API](/support/models/tags/api)[Artifacts](/support/models/tags/artifacts) +{/* END AUTO-GENERATED: tab badges */} diff --git a/support/models/articles/how-do-i-run-sweeps-with-distributed-training-on-slurm.mdx b/support/models/articles/how-do-i-run-sweeps-with-distributed-training-on-slurm.mdx new file mode 100644 index 0000000000..bc3baa0757 --- /dev/null +++ b/support/models/articles/how-do-i-run-sweeps-with-distributed-training-on-slurm.mdx @@ -0,0 +1,39 @@ +--- +title: 
"How do I run sweeps with distributed training on SLURM?" +keywords: ["Sweeps", "Experiments"] +--- + +When running a W&B sweep with distributed training on SLURM (for example, multi-GPU jobs with `--gpus-per-node`), only one process per SLURM job should call `wandb.agent()`. All other processes on the same node should join the run directly. + +Use the `SLURM_PROCID` environment variable to restrict `wandb.agent()` to rank 0: + +```python +import os +import wandb + +def train(): + wandb.init() + # your training code here + +if os.environ.get("SLURM_PROCID", "0") == "0": + wandb.agent(sweep_id, function=train, count=1) +else: + # Non-rank-0 processes join the run created by rank 0 + train() +``` + +This pattern ensures that: + +- Each SLURM job registers exactly one run with the sweep controller. +- Other ranks on the same node participate in the distributed run without creating duplicate sweep entries. +- The sweep controller correctly tracks progress and schedules new hyperparameter configurations. + +If you use `submitit` or a similar launcher, apply the same check in your training entry point before calling `wandb.agent()`. + +For single-GPU or non-distributed jobs, use `wandb agent --count 1 SWEEP_ID` as described in [How should I run sweeps on SLURM?](/support/models/articles/how-should-i-run-sweeps-on-slurm). + +--- + +{/* AUTO-GENERATED: tab badges */} +[Sweeps](/support/models/tags/sweeps)[Experiments](/support/models/tags/experiments) +{/* END AUTO-GENERATED: tab badges */} diff --git a/support/models/articles/how-do-i-set-up-wandb-alerts-and-notifications.mdx b/support/models/articles/how-do-i-set-up-wandb-alerts-and-notifications.mdx new file mode 100644 index 0000000000..6ee5d8ac6f --- /dev/null +++ b/support/models/articles/how-do-i-set-up-wandb-alerts-and-notifications.mdx @@ -0,0 +1,40 @@ +--- +title: "How do I set up W&B alerts and notifications?" 
+keywords: ["Alerts", "Experiments", "Runs"] +--- + +W&B supports automated alerts in User Settings (run finished, run crashed) and programmatic alerts from training code. For `run.alert()` setup, Slack connection, and examples, see [Send an alert](/models/runs/alert). For event-driven Slack notifications on artifacts or metrics, see [Automations](/models/automations/). + +## Automated alerts via User Settings + +Go to [wandb.ai/settings](https://wandb.ai/settings) and open **Alerts**. You can enable: + +- **Run finished** — notifies when a run completes successfully. +- **Run crashed** — notifies when a run exits without calling `wandb.finish()`. + +Deliver to email, Slack, or both. **Run finished** alerts do not fire in Jupyter notebooks — use `run.alert()` in notebooks instead. + +Team-wide alerting (for example, a shared Slack channel when any teammate's run crashes) is not available in the UI today. Each member connects Slack in their own settings, or use `run.alert()` with a shared workflow. 
+ +## Throttling programmatic alerts + +If the same `run.alert()` could fire every step, set `wait_duration` so repeated calls with the same title are skipped until the interval elapses: + +```python +import datetime +import wandb + +with wandb.init(project="my-project") as run: + run.alert( + title="High loss", + text=f"Loss is {loss:.2f}", + level=wandb.AlertLevel.WARN, + wait_duration=datetime.timedelta(minutes=30), + ) +``` + +--- + +{/* AUTO-GENERATED: tab badges */} +[Alerts](/support/models/tags/alerts)[Experiments](/support/models/tags/experiments)[Runs](/support/models/tags/runs) +{/* END AUTO-GENERATED: tab badges */} diff --git a/support/models/articles/how-do-i-update-run-config-tags-and-notes-via-the-wandb-api.mdx b/support/models/articles/how-do-i-update-run-config-tags-and-notes-via-the-wandb-api.mdx new file mode 100644 index 0000000000..64cece261c --- /dev/null +++ b/support/models/articles/how-do-i-update-run-config-tags-and-notes-via-the-wandb-api.mdx @@ -0,0 +1,43 @@ +--- +title: "How do I update run config, tags, and notes via the W&B API?" +keywords: ["Runs", "Experiments", "API"] +--- + +After a run finishes, use the [Public API guide](/models/track/public-api-guide) to edit config, display name, tags, and notes without re-running the experiment. For tag concepts and UI workflows, see [Tags](/models/runs/tags). For config during a run, see [Configuration](/models/track/config). + +The examples below use `run.update()` to persist pending changes in one API call. + +```python +import wandb + +api = wandb.Api() +run = api.run("my-entity/my-project/run_id_here") + +run.config["post_hoc_label"] = "baseline-v2" +run.name = "resnet50-lr1e-3-aug-v2" +run.tags.append("reviewed") +run.notes = "Final baseline. Val accuracy: 92.4%." 
+run.update() +``` + +## Bulk updates + +To tag or annotate many runs (for example, all runs from a sweep), iterate with filters and call `update()` per run: + +```python +for run in api.runs("my-entity/my-project", filters={"sweep": "sweep_id_here"}): + run.tags.append("sweep-reviewed") + run.update() +``` + +Each `update()` is a separate API call. For large sets, add a short sleep between calls to avoid rate limits. + +## What you cannot change after a run finishes + +You cannot add new logged metrics (summary or history) to a finished run via the Public API. To attach post-hoc metrics, start a new run (for example `job_type="evaluation"`), log there, and link via tags or artifact lineage. + +--- + +{/* AUTO-GENERATED: tab badges */} +[Runs](/support/models/tags/runs)[Experiments](/support/models/tags/experiments)[API](/support/models/tags/api) +{/* END AUTO-GENERATED: tab badges */} diff --git a/support/models/articles/how-do-i-use-the-parallel-coordinates-chart-in-wandb.mdx b/support/models/articles/how-do-i-use-the-parallel-coordinates-chart-in-wandb.mdx new file mode 100644 index 0000000000..ab500aeb42 --- /dev/null +++ b/support/models/articles/how-do-i-use-the-parallel-coordinates-chart-in-wandb.mdx @@ -0,0 +1,28 @@ +--- +title: "How do I use the parallel coordinates chart in W&B?" +keywords: ["Runs", "Experiments", "Workspace", "Sweeps"] +--- + +The parallel coordinates chart shows how hyperparameters relate to metrics across many runs. Each line is one run; each axis is a config key or summary metric. To add a panel, configure axes, colors, and brushes, see [Parallel coordinates](/models/app/features/panels/parallel-coordinates) and [Visualize sweep results](/models/sweeps/visualize-sweep-results). + +## Reading the chart + +- Lines that **cross** between adjacent axes suggest an inverse relationship between those parameters. +- Lines that **run parallel** suggest a positive correlation. 
+- A tight **bundle** of lines at a good metric value marks a promising hyperparameter region. + +Color lines by a summary metric (for example `val/accuracy`) so strong runs stand out on a gradient from cool to warm colors. + +## Filtering with brushes + +Drag on an axis to brush a value range. Only runs in range stay highlighted. Brushing multiple axes narrows to runs that satisfy all ranges — useful for questions like which learning-rate and batch-size pairs exceed 90% accuracy. + +## Sweeps + +After a sweep, filter to the sweep (**Filter > Sweep > your-sweep-id**) so the panel shows trial runs only. Use the chart to see the explored space and plan a tighter follow-up sweep. + +--- + +{/* AUTO-GENERATED: tab badges */} +[Runs](/support/models/tags/runs)[Experiments](/support/models/tags/experiments)[Workspace](/support/models/tags/workspace)[Sweeps](/support/models/tags/sweeps) +{/* END AUTO-GENERATED: tab badges */} diff --git a/support/models/articles/how-do-i-use-wandb-with-jax.mdx b/support/models/articles/how-do-i-use-wandb-with-jax.mdx new file mode 100644 index 0000000000..9cecde99b5 --- /dev/null +++ b/support/models/articles/how-do-i-use-wandb-with-jax.mdx @@ -0,0 +1,91 @@ +--- +title: "How do I use W&B with JAX?" +keywords: ["Experiments", "Runs", "Metrics"] +--- + +W&B has no JAX-specific integration. Use `wandb.log()` in your training loop like any other Python workflow. Convert JAX device arrays to Python scalars before logging so values serialize correctly. + +For experiment setup and logging patterns, see [Create an experiment](/models/track/create-an-experiment) and [Log objects and media](/models/track/log). For Flax checkpoints, see [Artifacts](/models/artifacts/). 
+ +## Log metrics from a JAX training loop + +```python +import jax +import wandb + +with wandb.init( + project="my-jax-project", + config={"learning_rate": 1e-3, "batch_size": 64, "epochs": 50}, +) as run: + lr = run.config.learning_rate + + @jax.jit + def train_step(params, batch): + loss, grads = jax.value_and_grad(loss_fn)(params, batch) + params = update_params(params, grads) + return params, loss + + for step, batch in enumerate(dataloader): + params, loss = train_step(params, batch) + run.log({"train/loss": float(loss)}, step=step) +``` + +Values returned from `@jax.jit` are device arrays. Pass scalars to `run.log()` with `float(loss)` or `.item()` on 0-dimensional arrays. Logging a raw JAX array can fail serialization or record an unexpected value depending on your SDK version. + +## Log validation metrics + +Aggregate validation metrics in Python and log once per epoch: + +```python +for epoch in range(num_epochs): + val_losses = [] + for val_batch in val_loader: + val_loss = eval_step(params, val_batch) + val_losses.append(float(val_loss)) + + run.log({ + "epoch": epoch, + "val/loss": sum(val_losses) / len(val_losses), + }) +``` + +## Save checkpoints as artifacts + +Save JAX or Flax parameters with `orbax` or `flax.serialization`, then log them as artifacts: + +```python +import orbax.checkpoint as ocp + +checkpointer = ocp.StandardCheckpointer() +checkpointer.save("/tmp/checkpoint", params) + +artifact = wandb.Artifact("jax-model", type="model") +artifact.add_dir("/tmp/checkpoint") +run.log_artifact(artifact) +``` + +## Debug NaN values + +JAX does not raise on NaN values by default. 
Log a NaN flag with your loss: + +```python +import jax.numpy as jnp + +run.log({ + "train/loss": float(loss), + "train/loss_is_nan": bool(jnp.isnan(loss)), +}) +``` + +During development you can enable JAX debug NaN checking (at a performance cost): + +```python +from jax import config +config.update("jax_debug_nans", True) +``` + +--- + +{/* AUTO-GENERATED: tab badges */} +[Experiments](/support/models/tags/experiments)[Runs](/support/models/tags/runs)[Metrics](/support/models/tags/metrics) +{/* END AUTO-GENERATED: tab badges */} diff --git a/support/models/articles/i-deleted-my-team-and-now-i-cant-create-a-new-one.mdx b/support/models/articles/i-deleted-my-team-and-now-i-cant-create-a-new-one.mdx new file mode 100644 index 0000000000..77513fb371 --- /dev/null +++ b/support/models/articles/i-deleted-my-team-and-now-i-cant-create-a-new-one.mdx @@ -0,0 +1,44 @@ +--- +title: "I deleted my team and now I can't create a new one — what do I do?" +keywords: ["Teams", "Team Management", "Org Management"] +--- + +When you sign up for W&B, the platform automatically creates a personal team with the same name as your username. If you delete that team — perhaps to rename it or because it looked like a duplicate — you may find that you can no longer create projects or that the **Create team** button has disappeared entirely. You may also see a "rate limit exceeded" error every time you log in. + +**Why this happens** + +W&B accounts must belong to at least one team to create projects and run experiments. When the auto-generated team is deleted, the account loses its team membership. In this state: + +- The **Create team** button in **Account Settings > Teams** may not appear. +- Attempting to log in or navigate to your workspace may immediately redirect to a "rate limit exceeded" or "An application error occurred" page. +- You cannot create a new team to resolve the issue yourself — team creation is blocked while the account is in this state. 
+ +**What to do** + +This state cannot be resolved through the W&B UI. Contact W&B support and include: + +- Your W&B **username** (the entity name shown in your profile URL, e.g. `wandb.ai/your-username`) +- Your **organization name** if you are on a team or enterprise plan +- The **desired team name** you want created (team names must be globally unique, lowercase, with no spaces) + +Support will manually create a team for your account and restore your access. Response times vary by plan; enterprise customers can escalate via their dedicated support channel. + +**If you are on an academic or free plan** + +Academic accounts may not have the ability to create teams directly through the UI — the "Create team" button may be absent even on a fresh account. This is not a bug; the free plan limits team creation in some cases. Contact support with your username and the team name you want, and include a note that you are using W&B for academic or research purposes. + +**Preventing this in the future** + +- Do not delete your initial auto-generated team unless you plan to contact support immediately to replace it. +- If you want a different team name, see [How do I delete a team from my account?](/support/models/articles/how-do-i-delete-a-team-from-my-account) for creating a new team before you delete your only team. +- To check whether your account belongs to a team, go to [wandb.ai/settings](https://wandb.ai/settings) and look under **Teams**. + +**Related: rate limit exceeded loop** + +If you are stuck in a redirect loop with a "rate limit exceeded" error after deleting your team, this is a symptom of the same issue. The error appears because W&B's routing logic expects every logged-in user to belong to a team. Logging out and back in will not fix it — only support restoring a team on your account will resolve the loop. 
+ +--- + +{/* AUTO-GENERATED: tab badges */} +[Teams](/support/models/tags/teams)[Team Management](/support/models/tags/team-management)[Org Management](/support/models/tags/org-management) +{/* END AUTO-GENERATED: tab badges */} diff --git a/support/models/articles/why-are-my-metrics-missing-from-wandb-log.mdx b/support/models/articles/why-are-my-metrics-missing-from-wandb-log.mdx new file mode 100644 index 0000000000..5514244d69 --- /dev/null +++ b/support/models/articles/why-are-my-metrics-missing-from-wandb-log.mdx @@ -0,0 +1,66 @@ +--- +title: "Why are my metrics missing from wandb.log()?" +keywords: ["Logs", "Metrics", "Experiments"] +--- + +If metrics logged with `wandb.log()` are not appearing in the W&B UI, there are several common causes. + +**SDK version issue** + +Certain SDK versions have known bugs that cause logged metrics to be silently dropped. If you recently upgraded or installed W&B and are seeing missing metrics, check your SDK version: + +```bash +pip show wandb +``` + +Update to the latest stable release: + +```bash +pip install --upgrade wandb +``` + +If the issue started after a specific upgrade, you can pin to an earlier version while the bug is investigated. Check the [W&B release notes](https://github.com/wandb/wandb/releases) for known issues with your version. + +**Metrics logged before `wandb.init()`** + +Metrics logged before `wandb.init()` is called are silently discarded. Ensure `wandb.init()` is called before any `wandb.log()` calls: + +```python +import wandb + +wandb.init(project="my-project") # must come first + +for step in range(100): + wandb.log({"loss": loss, "accuracy": acc}) +``` + +**Metrics logged after `wandb.finish()`** + +Any `wandb.log()` calls after `wandb.finish()` are also discarded. If you call `wandb.finish()` mid-script and then log more data, those metrics will not appear. 
+ +**Offline mode without syncing** + +If `WANDB_MODE=offline` is set, metrics are saved locally but not uploaded until you run `wandb sync`. Check whether the run shows data locally in your `wandb/` directory and sync it: + +```bash +wandb sync wandb/offline-run-TIMESTAMP-RUN_ID +``` + +**Non-deterministic drops in distributed training** + +In distributed training setups, only the process that called `wandb.init()` should call `wandb.log()`. If multiple processes log to the same run without coordination, metrics can overwrite each other or be dropped. Use rank checks to ensure only one process logs: + +```python +import os + +if int(os.environ.get("RANK", 0)) == 0: + wandb.log({"loss": loss}) +``` + +For distributed logging patterns, see [Log distributed experiments](/models/track/log/distributed-training). For offline sync, see [Environment variables](/models/track/environment-variables). + +--- + +{/* AUTO-GENERATED: tab badges */} +[Logs](/support/models/tags/logs)[Metrics](/support/models/tags/metrics)[Experiments](/support/models/tags/experiments) +{/* END AUTO-GENERATED: tab badges */} diff --git a/support/models/articles/why-do-my-workspace-settings-not-persist.mdx b/support/models/articles/why-do-my-workspace-settings-not-persist.mdx new file mode 100644 index 0000000000..8963813179 --- /dev/null +++ b/support/models/articles/why-do-my-workspace-settings-not-persist.mdx @@ -0,0 +1,29 @@ +--- +title: "Why do my workspace settings not persist between sessions?" +keywords: ["Workspace", "Runs", "Experiments"] +--- + +Workspace layout (panels, filters, grouping) persists _only_ when you save a view. Unsaved changes stay in your browser session. To create and manage views, see [Workspaces](/models/track/workspaces). + +## Unsaved changes + +W&B does not auto-save workspace edits. After changing filters or panels, choose **Views > Save view** or **Save changes** on an existing view. The **Views** dropdown shows an asterisk (`*`) while changes are pending.
+ +## Browser-only vs server-side settings + +| Setting | Stored | +|---------|--------| +| Panel layout, filters, grouping, run colors in a view | Server-side (saved view) | +| Browser zoom, personally hidden runs | Local browser only | + +Incognito or a different browser loads saved views correctly but not local-only preferences. + +## Team projects and conflicting edits + +On shared projects, whoever saves last updates the default view. Use **Views > Save as new view** for a personal copy so teammates do not overwrite each other's layouts. + +--- + +{/* AUTO-GENERATED: tab badges */} +[Workspace](/support/models/tags/workspace)[Runs](/support/models/tags/runs)[Experiments](/support/models/tags/experiments) +{/* END AUTO-GENERATED: tab badges */} diff --git a/support/models/articles/why-does-my-api-key-fail-with-must-be-40-characters.mdx b/support/models/articles/why-does-my-api-key-fail-with-must-be-40-characters.mdx new file mode 100644 index 0000000000..75b8d6cda7 --- /dev/null +++ b/support/models/articles/why-does-my-api-key-fail-with-must-be-40-characters.mdx @@ -0,0 +1,46 @@ +--- +title: "Why does my API key fail with 'must be 40 characters long'?" +keywords: ["API Keys", "Authentication", "SDK"] +--- + +W&B issues API keys in the `wandb_v1_` format (about 92 characters). Older `wandb` SDK versions validate exactly 40 characters and reject the new format: + +`ValueError: API key must be 40 characters long, yours was 92` + +On [Dedicated Cloud](/platform/hosting/hosting-options/dedicated-cloud) or [Self-Managed](/platform/hosting/hosting-options/self-managed) instances, the message may instead be `ERROR API key must be 40 characters long, yours was 92`. + +For key formats and login steps, see [`wandb login`](/models/ref/cli/wandb-login). 
+ +## Upgrade the wandb SDK + +```bash +pip install -U wandb +wandb login --relogin +``` + +## Save your API key at creation time + +Keys generated at [wandb.ai/authorize](https://wandb.ai/authorize) are shown *only once* at creation. Copy the key before you close the dialog. If you lose it, generate a new key. + +## If you cannot upgrade the SDK + +- If a legacy 40-character key is still active on your account, keep using it. New-format keys do not automatically revoke legacy keys. +- If you deleted the legacy key and cannot upgrade, contact [support@wandb.com](mailto:support@wandb.com). [Dedicated Cloud](/platform/hosting/hosting-options/dedicated-cloud) and [Self-Managed](/platform/hosting/hosting-options/self-managed) deployments may be able to issue a legacy-format key. +- As a workaround, set `WANDB_API_KEY` in the environment instead of running `wandb login`. Some older SDK versions skip local length checks: + + ```bash + export WANDB_API_KEY=YOUR_API_KEY + python train.py + ``` + +## Different error: key reported as too short + +`API key must have 40+ characters, has 20` can appear when calling `wandb.Api().viewer.generate_api_key()` in certain SDK versions. This is separate from the `wandb_v1_` format migration. Upgrade the SDK. If the error persists after upgrading, contact [support@wandb.com](mailto:support@wandb.com). + +For more information, see [How do I find my API key?](/support/models/articles/how-do-i-find-my-api-key) and [SDK release notes](/release-notes/sdk-releases).
+ +--- + +{/* AUTO-GENERATED: tab badges */} +[API Keys](/support/models/tags/api-keys)[Authentication](/support/models/tags/authentication)[SDK](/support/models/tags/sdk) +{/* END AUTO-GENERATED: tab badges */} diff --git a/support/models/articles/why-is-console-output-not-captured-for-my-run.mdx b/support/models/articles/why-is-console-output-not-captured-for-my-run.mdx new file mode 100644 index 0000000000..425700563f --- /dev/null +++ b/support/models/articles/why-is-console-output-not-captured-for-my-run.mdx @@ -0,0 +1,60 @@ +--- +title: "Why is console output not captured for my run?" +keywords: ["Logs", "Runs"] +--- + +W&B captures your script's stdout and stderr and stores it as `output.log` on the run's **Files** tab. By default, that file uploads *when the run finishes*, so an empty or missing log is often a timing issue, not failed capture. + +For multipart uploads (`console_multipart`), chunk rollover, and when to enable them, see [Console logs](/models/app/console-logs) and [`wandb.Settings`](/models/ref/python/experiments/settings). To download logs, see [How do I download the console log file from a run?](/support/models/articles/how-do-i-download-the-console-log-file-from-a-run). + +## Console capture is disabled + +Console capture can be turned off via settings or environment variable: + +```python +wandb.init(settings=wandb.Settings(console="off")) +``` + +```bash +WANDB_CONSOLE=off python my_script.py +``` + +Check whether either is set in your environment or launch configuration. To re-enable, remove the setting or set `WANDB_CONSOLE=wrap`. + +## Distributed training (DDP / multiprocessing) + +The **Logs** tab records output only from the process that owns the active W&B run. In Lightning/DDP, `print()` or `wandb.termlog()` from worker processes that do not own the run appear in the local terminal only. 
Initialize the run on rank 0 and use `console="wrap"`, which is more robust than the default under DDP: + +```python +from lightning.pytorch.loggers import WandbLogger + +wandb_logger = WandbLogger( + project="my_project", + settings=wandb.Settings(console="wrap"), # or WANDB_CONSOLE=wrap +) +trainer = Trainer(logger=wandb_logger, strategy="ddp", devices=..., accelerator="gpu") +``` + +If the **Logs** tab stays empty, try `console="redirect"` — output may appear in `output.log` on the **Files** tab even when it does not stream live. See [Log distributed experiments](/models/track/log/distributed-training) for rank-0 logging patterns. + +## Run still active but no file on Files tab + +While a run is active, the **Logs** tab streams output, but `output.log` usually does not appear on **Files** until the run finishes unless you enabled [multipart console logging](/models/app/console-logs) at `wandb.init` time. Upload cadence cannot be changed after the run starts. + +## Run crashed before flush + +Without multipart logging, a killed run (OOM, SIGKILL, and similar) may upload no `output.log` and show no download button. Enable `console_multipart` before the run starts so chunks uploaded before the crash remain on the server. A local copy is always written to `wandb/run--/logs/output.log`. + +## Resumed runs lose earlier console output + +On older SDKs, `wandb.init(resume="allow", id=...)` can overwrite a single `output.log`. From SDK v0.20.1+ with `console_multipart=True`, each session gets separate chunks under `logs/`. See [Console logs](/models/app/console-logs) for setup. + +## Logs tab shows fewer lines than expected + +The **Logs** tab caps display for performance (latest 10,000 lines for runs under 100,000 total lines). The full log is in `output.log` or multipart chunks — download via the **Files** tab or API. See [How do I download the console log file from a run?](/support/models/articles/how-do-i-download-the-console-log-file-from-a-run). 
+ +--- + +{/* AUTO-GENERATED: tab badges */} +[Logs](/support/models/tags/logs)[Runs](/support/models/tags/runs) +{/* END AUTO-GENERATED: tab badges */} diff --git a/support/models/articles/why-is-my-run-showing-as-crashed.mdx b/support/models/articles/why-is-my-run-showing-as-crashed.mdx new file mode 100644 index 0000000000..ddd8d14c2a --- /dev/null +++ b/support/models/articles/why-is-my-run-showing-as-crashed.mdx @@ -0,0 +1,55 @@ +--- +title: "Why is my run showing as crashed?" +keywords: ["Runs", "Run Crashes"] +--- + +W&B marks a run as **Crashed** when it stops receiving heartbeats from the process that called `wandb.init()`, without the process having called `wandb.finish()`. This happens when the training process is killed, exits unexpectedly, or loses connectivity before it can report a clean finish. + +**Common causes** + +- **Out-of-memory (OOM) error**: The process is killed by the OS or GPU driver when it exceeds available memory. Check `output.log` for `CUDA out of memory` or `Killed` messages. +- **Uncaught exception**: An unhandled Python exception causes the process to exit without calling `wandb.finish()`. The exception appears in `output.log`. +- **Job scheduler preemption**: On SLURM or other cluster schedulers, jobs can be preempted and killed without warning. The run never gets a chance to finish cleanly. +- **Network loss**: In rare cases, a long network outage causes the W&B backend to time out waiting for heartbeats and mark the run as crashed, even though the process is still running. The run will resume uploading when connectivity is restored. +- **Process killed manually**: Using `kill -9` or `SIGKILL` bypasses Python's signal handlers, preventing `wandb.finish()` from being called. + +**How to debug** + +1. Open the run page and click the **Files** tab. +2. Download `output.log` for stdout/stderr — this usually contains the error that caused the crash. +3. 
Download `debug.log` and `debug-internal.log` for W&B-level diagnostics (connectivity issues, upload errors). +4. If the run was on a cluster, also check the scheduler's job log for preemption or OOM signals. + +**Data from a crashed run** + +Metrics logged before the crash are preserved and visible in the UI. The run's charts, system metrics, and any artifacts that were fully uploaded before the crash are all accessible. Partially-uploaded artifacts may be incomplete. + +**Preventing crashes from losing data** + +Wrap your training loop in a try/except and call `wandb.finish(exit_code=1)` explicitly on error to ensure the run is marked as **Failed** (rather than **Crashed**) and all buffered data is flushed: + +```python +import wandb + +wandb.init(project="my-project") + +try: + for step in range(1000): + # training logic + wandb.log({"loss": loss}) +except Exception as e: + wandb.finish(exit_code=1) + raise +``` + +**Re-marking a crashed run** + +Crashed runs can be manually re-marked as **Failed** in the UI (run page > kebab menu > **Mark as failed**). This is useful for sweeps, where crashed runs may block the controller from scheduling new configurations. + +For run state definitions, see [Run states](/models/runs/run-states). For console logs after a crash, see [Why is console output not captured for my run?](/support/models/articles/why-is-console-output-not-captured-for-my-run). + +--- + +{/* AUTO-GENERATED: tab badges */} +[Runs](/support/models/tags/runs)[Run Crashes](/support/models/tags/run-crashes) +{/* END AUTO-GENERATED: tab badges */} diff --git a/support/models/articles/why-is-my-sweep-agent-not-picking-up-new-runs.mdx b/support/models/articles/why-is-my-sweep-agent-not-picking-up-new-runs.mdx new file mode 100644 index 0000000000..6477eb59c7 --- /dev/null +++ b/support/models/articles/why-is-my-sweep-agent-not-picking-up-new-runs.mdx @@ -0,0 +1,50 @@ +--- +title: "Why is my sweep agent not picking up new runs?" 
+keywords: ["Sweeps", "Experiments"] +--- + +If your sweep agent starts but does not receive new run configurations, or receives one run and then idles, there are several common causes. + +**The sweep has exhausted its search space (grid search)** + +In `grid` search, the sweep controller assigns every combination of hyperparameter values exactly once. Once all combinations are assigned, no new runs are generated. If you started multiple agents simultaneously, they may have collectively consumed all configurations before any single agent finished its current run. + +To confirm: open the sweep page in the W&B UI and check the run count against the total grid size. If they match, the sweep is complete. + +**The `--count` flag is limiting the agent** + +Running `wandb agent --count N SWEEP_ID` tells the agent to accept at most N runs before exiting. If you set `--count 1`, the agent exits after a single run. This is intentional for SLURM and other job schedulers, but can be surprising if you expected the agent to loop. + +Remove `--count` (or increase it) to allow the agent to keep pulling runs: + +```bash +wandb agent SWEEP_ID +``` + +**The sweep is paused or stopped** + +Check the sweep status in the W&B UI (**Sweeps > your sweep > Status**). If the sweep was manually paused or stopped, agents will not receive new configurations until the sweep is resumed. + +**The agent is waiting for a crashed run to time out** + +By default, the sweep controller marks a run as failed after it does not report progress for a configurable timeout. If an agent crashes mid-run without cleanly signaling failure, the controller holds the run's slot until the timeout expires. You can monitor this in the sweep UI and manually mark hung runs as failed to unblock the queue. 
+ +**Multiple processes calling `wandb.agent()` on the same job** + +In distributed training setups, if every process on a node calls `wandb.agent()`, each process registers as a separate agent and consumes a run configuration. This leads to runs that crash immediately (because only one process was meant to drive the sweep) and a quickly exhausted configuration pool. Restrict `wandb.agent()` to rank 0 only. See [How do I run sweeps with distributed training on SLURM?](/support/models/articles/how-do-i-run-sweeps-with-distributed-training-on-slurm) for the recommended pattern. + +**SDK version bug after upgrade** + +Some SDK versions between 0.19.6 and 0.19.10 introduced a regression where the sweep agent teardown raised an error that caused the agent loop to exit prematurely rather than requesting the next run. If you recently upgraded and agents stop after one run with a teardown-related traceback, upgrade to the latest SDK version: + +```bash +pip install --upgrade wandb +``` + +For more causes and fixes, see [Troubleshoot sweeps](/models/sweeps/troubleshoot-sweeps). For SLURM distributed jobs, see [How do I run sweeps with distributed training on SLURM?](/support/models/articles/how-do-i-run-sweeps-with-distributed-training-on-slurm). + +--- + +{/* AUTO-GENERATED: tab badges */} +[Sweeps](/support/models/tags/sweeps)[Experiments](/support/models/tags/experiments) +{/* END AUTO-GENERATED: tab badges */} diff --git a/support/models/articles/why-is-my-wandb-run-slow-to-initialize-or-upload.mdx b/support/models/articles/why-is-my-wandb-run-slow-to-initialize-or-upload.mdx new file mode 100644 index 0000000000..5579165e17 --- /dev/null +++ b/support/models/articles/why-is-my-wandb-run-slow-to-initialize-or-upload.mdx @@ -0,0 +1,71 @@ +--- +title: "Why is my W&B run slow to initialize or upload?" 
+keywords: ["Runs", "Experiments", "Connectivity"] +--- + +Slow `wandb.init()` or sluggish metric uploads are usually caused by network latency, large media payloads, high logging frequency, or slow startup of the W&B service process. + +## Slow wandb.init() + +`wandb.init()` contacts the W&B API to create the run and verify credentials. If it hangs for more than a few seconds: + +- **Check connectivity**: Run `curl -I https://api.wandb.ai` to confirm your machine can reach the W&B API. Firewall rules or proxy configurations on clusters are a common cause. +- **Increase the init timeout**: If the connection is intermittent, give `wandb.init()` more time before it gives up: + + ```python + import os + os.environ["WANDB_INIT_TIMEOUT"] = "120" # seconds + ``` + +- **Use offline mode during testing**: If you do not need live syncing while iterating, run offline and sync later: + + ```bash + WANDB_MODE=offline python train.py + wandb sync wandb/run-- + ``` + +## Slow metric uploads during training + +W&B uploads metrics asynchronously in background threads so your training loop is not blocked. Uploads can fall behind when: + +- **You log too frequently**: Calling `wandb.log()` every step on a fast GPU can generate more data than the background threads can upload. Log every N steps instead: + + ```python + if step % 50 == 0: + wandb.log({"loss": loss}, step=step) + ``` + +- **You log large media on every step**: `wandb.Image`, `wandb.Table`, and `wandb.Video` objects are significantly larger than scalar metrics. Log rich media every epoch or every N steps rather than every step. +- **Rate limits**: If you hit the `429 Rate limit exceeded` error, see [How do I fix rate limit exceeded errors?](/support/models/articles/rate-limit-exceeded-on-metric-logging). + +## Run finalization is slow + +After your script calls `wandb.finish()` (or exits), W&B flushes any remaining buffered data. This can take time if a large backlog built up during training. 
Keep logging frequency reasonable throughout training rather than batching everything at the end. + +## Slow startup of `wandb-service` process + +Recent SDK versions use a separate service process (`wandb-service`) to handle uploads. On some machines, starting this process for the first time can be slow due to Python startup overhead. Subsequent runs on the same machine are faster. If the service process is consistently slow, you can disable it (reverts to the older thread-based backend): + +```bash +WANDB_DISABLE_SERVICE=true python train.py +``` + +Disabling the service process removes some reliability improvements in newer SDK versions. + +## Diagnosing with debug logs + +Enable debug logging to see where time is spent: + +```bash +WANDB_DEBUG=true python train.py +``` + +This writes detailed timing information to `wandb/debug.log` and `wandb/debug-internal.log`. + +For more information, see [Experiments limits and performance](/models/track/limits) and [How do I deal with network issues?](/support/models/articles/how-do-i-deal-with-network-issues). + +--- + +{/* AUTO-GENERATED: tab badges */} +[Runs](/support/models/tags/runs)[Experiments](/support/models/tags/experiments)[Connectivity](/support/models/tags/connectivity) +{/* END AUTO-GENERATED: tab badges */} diff --git a/support/models/tags/administrator.mdx b/support/models/tags/administrator.mdx index 409c801eb6..ddf95c7647 100644 --- a/support/models/tags/administrator.mdx +++ b/support/models/tags/administrator.mdx @@ -1,6 +1,6 @@ --- title: "Administrator" -tag: "25" +tag: "26" generator: "knowledgebase-nav" template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" --- @@ -32,8 +32,11 @@ template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" To change the billing address, contact the support team (support@wandb.com). + + W&B Self-Managed is a self-hosted deployment that runs in your infrastructure. Point the SDK and CLI at your instance UR ... 
+ - To delete a team from an account: Access team settings as an admin. Click the Delete button at the bottom of the page. + Delete a team 1. Log in as a team admin. 2. Open Team settings from the team page in the W&B UI. 3. Scroll to the bottom ... To delete an organization account, follow these steps, contact the support team (support@wandb.com). diff --git a/support/models/tags/alerts.mdx b/support/models/tags/alerts.mdx index 9f16e1b659..41852835cb 100644 --- a/support/models/tags/alerts.mdx +++ b/support/models/tags/alerts.mdx @@ -1,6 +1,6 @@ --- title: "Alerts" -tag: "2" +tag: "3" generator: "knowledgebase-nav" template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" --- @@ -11,3 +11,6 @@ template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" To receive W&B alerts in Teams, follow these steps: Set up an email address for your Teams channel. Create an email addr ... + + W&B supports automated alerts in User Settings (run finished, run crashed) and programmatic alerts from training code. F ... + diff --git a/support/models/tags/api-keys.mdx b/support/models/tags/api-keys.mdx new file mode 100644 index 0000000000..f62d946dd0 --- /dev/null +++ b/support/models/tags/api-keys.mdx @@ -0,0 +1,10 @@ +--- +title: "API Keys" +tag: "1" +generator: "knowledgebase-nav" +template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" +--- + + + W&B issues API keys in the wandb_v1_ format (about 92 characters). Older wandb SDK versions validate exactly 40 characte ... + diff --git a/support/models/tags/api.mdx b/support/models/tags/api.mdx new file mode 100644 index 0000000000..c5430121ad --- /dev/null +++ b/support/models/tags/api.mdx @@ -0,0 +1,13 @@ +--- +title: "API" +tag: "2" +generator: "knowledgebase-nav" +template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" +--- + + + The W&B Public API loads runs and artifacts in pages. For the standard lazy-iterator pattern and per_page, see the Publi ... 
+ + + After a run finishes, use the Public API guide to edit config, display name, tags, and notes without re-running the expe ... + diff --git a/support/models/tags/artifacts.mdx b/support/models/tags/artifacts.mdx index 2c9bd54302..cde8b4a98c 100644 --- a/support/models/tags/artifacts.mdx +++ b/support/models/tags/artifacts.mdx @@ -1,6 +1,6 @@ --- title: "Artifacts" -tag: "14" +tag: "15" generator: "knowledgebase-nav" template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" --- @@ -23,6 +23,9 @@ template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" Occasionally, it is necessary to mark an artifact as the output of a previously logged run. In this case, reinitialize t ... + + The W&B Public API loads runs and artifacts in pages. For the standard lazy-iterator pattern and per_page, see the Publi ... + Use save_code=True in wandb.init to save the main script or notebook that launches the run. To save all code for a run, ... diff --git a/support/models/tags/authentication.mdx b/support/models/tags/authentication.mdx new file mode 100644 index 0000000000..8f3b91d233 --- /dev/null +++ b/support/models/tags/authentication.mdx @@ -0,0 +1,10 @@ +--- +title: "Authentication" +tag: "1" +generator: "knowledgebase-nav" +template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" +--- + + + W&B issues API keys in the wandb_v1_ format (about 92 characters). Older wandb SDK versions validate exactly 40 characte ... 
+ diff --git a/support/models/tags/connectivity.mdx b/support/models/tags/connectivity.mdx index 740acd480a..841c5435f9 100644 --- a/support/models/tags/connectivity.mdx +++ b/support/models/tags/connectivity.mdx @@ -1,6 +1,6 @@ --- title: "Connectivity" -tag: "4" +tag: "5" generator: "knowledgebase-nav" template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" --- @@ -17,3 +17,6 @@ template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" Check if the W&B Multi-tenant Cloud at wandb.ai is experiencing an outage by visiting the W&B status page. + + Slow wandb.init() or sluggish metric uploads are usually caused by network latency, large media payloads, high logging f ... + diff --git a/support/models/tags/environment-variables.mdx b/support/models/tags/environment-variables.mdx index 4ae803357c..87dd5cade1 100644 --- a/support/models/tags/environment-variables.mdx +++ b/support/models/tags/environment-variables.mdx @@ -1,6 +1,6 @@ --- title: "Environment Variables" -tag: "12" +tag: "13" generator: "knowledgebase-nav" template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" --- @@ -14,6 +14,9 @@ template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" WANDB_DIR= or wandb.init(dir= ): Controls the location of the wandb folder created for your training script. Defaults to ... + + W&B Self-Managed is a self-hosted deployment that runs in your infrastructure. Point the SDK and CLI at your instance UR ... + If you encounter the error message "Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_N ... 
diff --git a/support/models/tags/experiments.mdx b/support/models/tags/experiments.mdx index 55099a165d..7dcfaff7f9 100644 --- a/support/models/tags/experiments.mdx +++ b/support/models/tags/experiments.mdx @@ -1,6 +1,6 @@ --- title: "Experiments" -tag: "39" +tag: "51" generator: "knowledgebase-nav" template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" --- @@ -59,15 +59,39 @@ template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" Finish previous runs before starting new runs to log multiple runs within a single script. The recommended way to do thi ... + + Log corpus-level NLP scores (BLEU, ROUGE, perplexity) with wandb.log() and per-example outputs with wandb.Table. For gen ... + These examples show logging losses a couple of different ways using wandb.Run.log(). For more, see the documentation on ... + + wandb.watch() hooks into a PyTorch model's parameters and gradients and logs histograms of their values at regular inter ... + + + The W&B Public API loads runs and artifacts in pages. For the standard lazy-iterator pattern and per_page, see the Publi ... + Create a multi-line custom chart with wandb.plot.line_series(). Navigate to the project page to view the line chart. To ... The .name attribute of a wandb.Run is accessible as follows: + + When running a W&B sweep with distributed training on SLURM (for example, multi-GPU jobs with --gpus-per-node), only one ... + + + W&B supports automated alerts in User Settings (run finished, run crashed) and programmatic alerts from training code. F ... + + + After a run finishes, use the Public API guide to edit config, display name, tags, and notes without re-running the expe ... + + + W&B has no JAX-specific integration. Use wandb.log() in your training loop like any other Python workflow. Convert JAX d ... + + + The parallel coordinates chart shows how hyperparameters relate to metrics across many runs. Each line is one run; each ... 
+ If a run is not explicitly named, W&B assigns a random name to identify it in your project. Examples of random names are ... @@ -101,18 +125,30 @@ template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" When visualizing metrics against an X-axis other than Step, expect to see fewer data points. Metrics must log at the sam ... + + If metrics logged with wandb.log() are not appearing in the W&B UI, there are several common causes. SDK version issue C ... + Export limits can prevent the entire run history from being exported as a CSV or using the run.history API. To access th ... Metric names in W&B must follow GraphQL naming conventions to ensure they can be properly sorted and filtered in the UI. ... + + Workspace layout (panels, filters, grouping) persists only when you save a view. Unsaved changes stay in your browser se ... + If your process hangs when started with Hydra, this is likely caused by a multiprocessing conflict between Hydra and W&B ... There are two common reasons training hangs when using W&B with distributed training: 1. Hanging at the beginning of tra ... + + Slow wandb.init() or sluggish metric uploads are usually caused by network latency, large media payloads, high logging f ... + + + If your sweep agent starts but does not receive new run configurations, or receives one run and then idles, there are se ... + If the message "No visualization data logged yet" appears, the script has not executed the first wandb.log call. This si ... diff --git a/support/models/tags/logs.mdx b/support/models/tags/logs.mdx index 783c233fe9..b4a63fb245 100644 --- a/support/models/tags/logs.mdx +++ b/support/models/tags/logs.mdx @@ -1,6 +1,6 @@ --- title: "Logs" -tag: "7" +tag: "10" generator: "knowledgebase-nav" template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" --- @@ -11,6 +11,9 @@ template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" To overwrite logs from previous steps, use forking and rewind. 
+ + W&B stores your script's stdout and stderr as output.log (or multipart chunks under logs/). Where to retrieve it depends ... + These examples show logging losses a couple of different ways using wandb.Run.log(). For more, see the documentation on ... @@ -26,3 +29,9 @@ template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" For the affected run, check debug.log and debug-internal.log in wandb/run- _ - /logs in the directory where your code is ... + + If metrics logged with wandb.log() are not appearing in the W&B UI, there are several common causes. SDK version issue C ... + + + W&B captures your script's stdout and stderr and stores it as output.log on the run's Files tab. By default, that file u ... + diff --git a/support/models/tags/metrics.mdx b/support/models/tags/metrics.mdx index 3c92261b69..85603d15aa 100644 --- a/support/models/tags/metrics.mdx +++ b/support/models/tags/metrics.mdx @@ -1,6 +1,6 @@ --- title: "Metrics" -tag: "19" +tag: "23" generator: "knowledgebase-nav" template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" --- @@ -29,6 +29,15 @@ template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" The following error usually occurs when you do not log the metric that you are optimizing: To fix this, make sure you ar ... + + Log corpus-level NLP scores (BLEU, ROUGE, perplexity) with wandb.log() and per-example outputs with wandb.Table. For gen ... + + + wandb.watch() hooks into a PyTorch model's parameters and gradients and logs histograms of their values at regular inter ... + + + W&B has no JAX-specific integration. Use wandb.log() in your training loop like any other Python workflow. Convert JAX d ... + Metrics collect by default every 10 seconds. For higher resolution metrics, email contact@wandb.com. @@ -50,6 +59,9 @@ template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" When visualizing metrics against an X-axis other than Step, expect to see fewer data points. Metrics must log at the sam ... 
+ + If metrics logged with wandb.log() are not appearing in the W&B UI, there are several common causes. SDK version issue C ... + Metric names in W&B must follow GraphQL naming conventions to ensure they can be properly sorted and filtered in the UI. ... diff --git a/support/models/tags/org-management.mdx b/support/models/tags/org-management.mdx new file mode 100644 index 0000000000..c788f24755 --- /dev/null +++ b/support/models/tags/org-management.mdx @@ -0,0 +1,13 @@ +--- +title: "Org Management" +tag: "2" +generator: "knowledgebase-nav" +template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" +--- + + + Teams are the primary unit of collaboration in W&B. A team has its own entity name, shared projects, and member roster. ... + + + When you sign up for W&B, the platform automatically creates a personal team with the same name as your username. If you ... + diff --git a/support/models/tags/resuming.mdx b/support/models/tags/resuming.mdx index f70eddd754..d215559558 100644 --- a/support/models/tags/resuming.mdx +++ b/support/models/tags/resuming.mdx @@ -1,10 +1,13 @@ --- title: "Resuming" -tag: "2" +tag: "3" generator: "knowledgebase-nav" template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" --- + + Run resumption is not supported inside a W&B sweep. If you pass a run_id or use wandb.init(resume=...) while a sweep age ... + If you encounter the error resume='must' but run ( ) doesn't exist, the run you are attempting to resume does not exist ... diff --git a/support/models/tags/run-crashes.mdx b/support/models/tags/run-crashes.mdx index 2605cdaae8..5af714965a 100644 --- a/support/models/tags/run-crashes.mdx +++ b/support/models/tags/run-crashes.mdx @@ -1,6 +1,6 @@ --- title: "Run Crashes" -tag: "10" +tag: "11" generator: "knowledgebase-nav" template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" --- @@ -35,3 +35,6 @@ template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" This indicates a connection problem. 
If the server loses internet access and data stops syncing to W&B, the system marks ... + + W&B marks a run as Crashed when it stops receiving heartbeats from the process that called wandb.init(), without the pro ... + diff --git a/support/models/tags/runs.mdx b/support/models/tags/runs.mdx index 85ec520d99..8c7457bc06 100644 --- a/support/models/tags/runs.mdx +++ b/support/models/tags/runs.mdx @@ -1,6 +1,6 @@ --- title: "Runs" -tag: "19" +tag: "31" generator: "knowledgebase-nav" template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" --- @@ -38,18 +38,42 @@ template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" Learn how to recover recently deleted runs in the W&B App for up to 7 days. + + W&B stores your script's stdout and stderr as output.log (or multipart chunks under logs/). Where to retrieve it depends ... + If you see both CommError, Run does not exist and ERROR Error uploading during a sweep, the most likely cause is that yo ... If you encounter the error resume='must' but run ( ) doesn't exist, the run you are attempting to resume does not exist ... + + Log corpus-level NLP scores (BLEU, ROUGE, perplexity) with wandb.log() and per-example outputs with wandb.Table. For gen ... + + + wandb.watch() hooks into a PyTorch model's parameters and gradients and logs histograms of their values at regular inter ... + To launch automated tests or internal tools that log to W&B, create a Service Account on the team settings page. This ac ... + + The W&B Public API loads runs and artifacts in pages. For the standard lazy-iterator pattern and per_page, see the Publi ... + To resolve permission errors when logging a run to a W&B entity, follow these steps: Verify entity and project names: En ... + + W&B supports automated alerts in User Settings (run finished, run crashed) and programmatic alerts from training code. F ... 
+ + + After a run finishes, use the Public API guide to edit config, display name, tags, and notes without re-running the expe ... + + + W&B has no JAX-specific integration. Use wandb.log() in your training loop like any other Python workflow. Convert JAX d ... + + + The parallel coordinates chart shows how hyperparameters relate to metrics across many runs. Each line is one run; each ... + Limit each project to approximately 10,000 runs for optimal performance. @@ -62,3 +86,15 @@ template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" Export limits can prevent the entire run history from being exported as a CSV or using the run.history API. To access th ... + + Workspace layout (panels, filters, grouping) persists only when you save a view. Unsaved changes stay in your browser se ... + + + W&B captures your script's stdout and stderr and stores it as output.log on the run's Files tab. By default, that file u ... + + + Slow wandb.init() or sluggish metric uploads are usually caused by network latency, large media payloads, high logging f ... + + + W&B marks a run as Crashed when it stops receiving heartbeats from the process that called wandb.init(), without the pro ... + diff --git a/support/models/tags/sdk.mdx b/support/models/tags/sdk.mdx new file mode 100644 index 0000000000..bc138a7f92 --- /dev/null +++ b/support/models/tags/sdk.mdx @@ -0,0 +1,10 @@ +--- +title: "SDK" +tag: "1" +generator: "knowledgebase-nav" +template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" +--- + + + W&B issues API keys in the wandb_v1_ format (about 92 characters). Older wandb SDK versions validate exactly 40 characte ... 
+ diff --git a/support/models/tags/sweeps.mdx b/support/models/tags/sweeps.mdx index 1f907f0f4f..e6d7bf2d83 100644 --- a/support/models/tags/sweeps.mdx +++ b/support/models/tags/sweeps.mdx @@ -1,6 +1,6 @@ --- title: "Sweeps" -tag: "19" +tag: "23" generator: "knowledgebase-nav" template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" --- @@ -11,6 +11,9 @@ template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" If a grid search completes but some W&B Runs need re-execution due to crashes, delete the specific W&B Runs to re-run. T ... + + Run resumption is not supported inside a W&B sweep. If you pass a run_id or use wandb.init(resume=...) while a sweep age ... + To authenticate W&B, complete the following steps: create a requirements.txt file if using a built-in Amazon SageMaker e ... @@ -44,9 +47,15 @@ template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" The following error usually occurs when you do not log the metric that you are optimizing: To fix this, make sure you ar ... + + When running a W&B sweep with distributed training on SLURM (for example, multi-GPU jobs with --gpus-per-node), only one ... + You can use W&B Sweeps with custom CLI commands if training configuration passes command-line arguments. In the example ... + + The parallel coordinates chart shows how hyperparameters relate to metrics across many runs. Each line is one run; each ... + When using sweeps with the SLURM scheduling system, run wandb agent --count 1 SWEEP_ID in each scheduled job. This comma ... @@ -62,3 +71,6 @@ template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" W&B provides an estimated number of Runs generated when creating a W&B Sweep with a discrete search space. This total re ... + + If your sweep agent starts but does not receive new run configurations, or receives one run and then idles, there are se ... 
+ diff --git a/support/models/tags/team-management.mdx b/support/models/tags/team-management.mdx index d5d7836468..9d345592f5 100644 --- a/support/models/tags/team-management.mdx +++ b/support/models/tags/team-management.mdx @@ -1,6 +1,6 @@ --- title: "Team Management" -tag: "12" +tag: "13" generator: "knowledgebase-nav" template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" --- @@ -12,11 +12,14 @@ template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" A team admin can remove you from a team from the Users tab of the team settings. - To delete a team from an account: Access team settings as an admin. Click the Delete button at the bottom of the page. + Delete a team 1. Log in as a team admin. 2. Open Team settings from the team page in the W&B UI. 3. Scroll to the bottom ... To join a team, follow these steps: Contact a team admin or someone with administrative privileges to request an invite. ... + + When you sign up for W&B, the platform automatically creates a personal team with the same name as your username. If you ... + A service account cannot be added to multiple teams in W&B. Each service account is tied to a specific team. diff --git a/support/models/tags/teams.mdx b/support/models/tags/teams.mdx new file mode 100644 index 0000000000..4f6dad7550 --- /dev/null +++ b/support/models/tags/teams.mdx @@ -0,0 +1,16 @@ +--- +title: "Teams" +tag: "3" +generator: "knowledgebase-nav" +template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" +--- + + + Teams are the primary unit of collaboration in W&B. A team has its own entity name, shared projects, and member roster. ... + + + Only team admins can send invitations. The invited person receives an email and must accept before they appear as an act ... + + + When you sign up for W&B, the platform automatically creates a personal team with the same name as your username. If you ... 
+ diff --git a/support/models/tags/user-management.mdx b/support/models/tags/user-management.mdx index 94930136c7..15afecbb0a 100644 --- a/support/models/tags/user-management.mdx +++ b/support/models/tags/user-management.mdx @@ -1,6 +1,6 @@ --- title: "User Management" -tag: "13" +tag: "14" generator: "knowledgebase-nav" template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" --- @@ -14,21 +14,24 @@ template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" Delete your user account by clicking Delete account in your user settings. Note that this action is irreversible and it ... - - Set the login URL by either of these methods: Set the environment variable WANDB_BASE_URL to the Server URL. Set the --h ... - To regain access to an account when unable to receive a password reset email: 1. Check Spam or Junk Folders: Ensure the ... To resolve login issues, follow these steps: Verify access: Confirm you are using the correct email or username and chec ... + + W&B Self-Managed is a self-hosted deployment that runs in your infrastructure. Point the SDK and CLI at your instance UR ... + To export a list of users from a W&B organization, an admin uses the SCIM API with the following code: Modify the script ... import ApiKeyFind from "/snippets/_includes/api-key-find.mdx"; + + Only team admins can send invitations. The invited person receives an email and must accept before they appear as an act ... + To delete a W&B account, navigate to the User settings page, scroll to the bottom, and click the Delete Account button. diff --git a/support/models/tags/workspace.mdx b/support/models/tags/workspace.mdx new file mode 100644 index 0000000000..f9c45ed32f --- /dev/null +++ b/support/models/tags/workspace.mdx @@ -0,0 +1,13 @@ +--- +title: "Workspace" +tag: "2" +generator: "knowledgebase-nav" +template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" +--- + + + The parallel coordinates chart shows how hyperparameters relate to metrics across many runs. 
Each line is one run; each ... + + + Workspace layout (panels, filters, grouping) persists only when you save a view. Unsaved changes stay in your browser se ... + diff --git a/support/weave.mdx b/support/weave.mdx index 16454173b9..c291496f7f 100644 --- a/support/weave.mdx +++ b/support/weave.mdx @@ -13,7 +13,7 @@ template: "scripts/knowledgebase-nav/templates/support_product_index.mdx.j2" 2 articles - 2 articles + 3 articles 2 articles @@ -25,7 +25,7 @@ template: "scripts/knowledgebase-nav/templates/support_product_index.mdx.j2" 2 articles - 4 articles + 5 articles 2 articles diff --git a/support/weave/articles/why-does-my-weave-cost-or-token-estimate-differ-from-my-provider.mdx b/support/weave/articles/why-does-my-weave-cost-or-token-estimate-differ-from-my-provider.mdx new file mode 100644 index 0000000000..25b499c8be --- /dev/null +++ b/support/weave/articles/why-does-my-weave-cost-or-token-estimate-differ-from-my-provider.mdx @@ -0,0 +1,44 @@ +--- +title: "Why does my Weave cost or token estimate differ from my provider?" +keywords: ["Data Capture", "Trace Data"] +--- + +Weave displays cost and token usage estimates based on data captured from your LLM calls. Discrepancies between Weave's numbers and your provider's invoice or dashboard are common and have several causes. + +**Token counts come from the provider response, not Weave** + +For supported integrations (OpenAI, Anthropic, Google, etc.), Weave reads token usage directly from the API response object—the same `usage` field your code receives. If your provider reports a different count on their billing page, the difference is on the provider side (for example, they may aggregate tokens across streaming chunks differently than per-call reporting). + +**Weave cost estimates use a static pricing table** + +Weave calculates estimated cost by multiplying token counts by known per-token prices for each model. This table is updated periodically but may lag behind provider pricing changes. 
If a provider recently changed pricing for a model, Weave's estimate will be stale until the next SDK release that updates the table. + +To check the model pricing Weave is using, see the [pricing reference in the Weave source](https://github.com/wandb/weave/blob/master/weave/trace/util/inference_cost). + +**Custom or fine-tuned models may not have pricing entries** + +If you use a fine-tuned model or a model identifier that is not in Weave's pricing table, the cost column will show `—` or `$0.00`. You can see token counts but cost will not be estimated for unknown models. + +**Sampling reduces total captured tokens** + +If you set `tracing_sample_rate` on an op, only a fraction of calls are traced. The token totals in Weave reflect only the sampled calls, not your full usage: + +```python +@weave.op(tracing_sample_rate=0.1) +def my_llm_call(prompt): + ... +``` + +In this case Weave captures roughly 10% of calls, so the token and cost totals in the UI represent only that fraction. + +**Prompt caching and batch API calls** + +Some providers (for example, OpenAI with prompt caching enabled) apply discounts to cached input tokens. Weave captures the `usage` object as returned by the provider, which includes cached-token counts if the provider reports them in the response. However, Weave's static pricing table reflects standard (non-cached) prices for each token category. If you use prompt caching heavily, the gap between Weave's estimate and your actual bill will be larger. + +Batch API requests may report token usage differently than real-time requests; verify that your batch responses include standard `usage` fields if you expect Weave to capture them. 
+ +--- + +{/* AUTO-GENERATED: tab badges */} +[Data Capture](/support/weave/tags/data-capture)[Trace Data](/support/weave/tags/trace-data) +{/* END AUTO-GENERATED: tab badges */} diff --git a/support/weave/tags/data-capture.mdx b/support/weave/tags/data-capture.mdx index 9dca64c08d..eee560ad63 100644 --- a/support/weave/tags/data-capture.mdx +++ b/support/weave/tags/data-capture.mdx @@ -1,6 +1,6 @@ --- title: "Data Capture" -tag: "2" +tag: "3" generator: "knowledgebase-nav" template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" --- @@ -11,3 +11,6 @@ template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" A function can be designated as a Weave Op either manually through a decorator or automatically as part of an enabled in ... + + Weave displays cost and token usage estimates based on data captured from your LLM calls. Discrepancies between Weave's ... + diff --git a/support/weave/tags/trace-data.mdx b/support/weave/tags/trace-data.mdx index 288afc1fa4..8aa1c79596 100644 --- a/support/weave/tags/trace-data.mdx +++ b/support/weave/tags/trace-data.mdx @@ -1,6 +1,6 @@ --- title: "Trace Data" -tag: "4" +tag: "5" generator: "knowledgebase-nav" template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" --- @@ -14,6 +14,9 @@ template: "scripts/knowledgebase-nav/templates/support_tag.mdx.j2" If trace pages load slowly, reduce the number of rows displayed to improve load time. The default value is 50. You can r ... + + Weave displays cost and token usage estimates based on data captured from your LLM calls. Discrepancies between Weave's ... + By default, Weave's .call() method captures exceptions and stores them in call.exception instead of raising them. This i ...