From 48044d5b459b9768f4d1854500cde075b611a471 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 28 May 2026 09:15:03 +0000 Subject: [PATCH 1/2] feat(skills): source dd-apm sub-skills from agent-skills Cargo crate Replace the submodule approach proposed in #530 with a Cargo path dependency. No .gitmodules, no `submodules: recursive` peppered across every CI checkout step. - crates/agent-skills/: new crate vendoring the 12 dd-apm SKILL.md files from datadog-labs/agent-skills@c447f4d (root router + 11 nested sub-skills under k8s-ssi/, linux-ssi/, service-remapping/) - Cargo.toml: add `agent-skills = { path = "crates/agent-skills" }`; a comment shows how to flip to a git dep once agent-skills ships a Cargo.toml, with no other pup changes needed - src/skills.rs: use agent_skills::DD_APM_SKILL / DD_APM_SUB_SKILLS instead of bare include_str!(); extend install_paths to write nested sub-skill files alongside the root SKILL.md (SkillMd format only); update files field doc; add two new tests for sub-skill install paths https://claude.ai/code/session_01BfYL9qPvktFuLXHNyH3rPB --- Cargo.toml | 4 + crates/agent-skills/Cargo.toml | 5 + crates/agent-skills/skills/dd-apm/SKILL.md | 209 +++++++++ .../dd-apm/k8s-ssi/agent-install/SKILL.md | 270 ++++++++++++ .../skills/dd-apm/k8s-ssi/enable-ssi/SKILL.md | 266 ++++++++++++ .../k8s-ssi/onboarding-summary/SKILL.md | 129 ++++++ .../dd-apm/k8s-ssi/troubleshoot-ssi/SKILL.md | 405 +++++++++++++++++ .../skills/dd-apm/k8s-ssi/verify-ssi/SKILL.md | 159 +++++++ .../dd-apm/linux-ssi/agent-install/SKILL.md | 280 ++++++++++++ .../dd-apm/linux-ssi/enable-ssi/SKILL.md | 235 ++++++++++ .../linux-ssi/onboarding-summary/SKILL.md | 139 ++++++ .../linux-ssi/troubleshoot-ssi/SKILL.md | 400 +++++++++++++++++ .../dd-apm/linux-ssi/verify-ssi/SKILL.md | 198 +++++++++ .../skills/dd-apm/service-remapping/SKILL.md | 411 ++++++++++++++++++ crates/agent-skills/src/lib.rs | 48 ++ src/skills.rs | 67 ++- 16 files changed, 3219 insertions(+), 6 deletions(-) create mode 
100644 crates/agent-skills/Cargo.toml create mode 100644 crates/agent-skills/skills/dd-apm/SKILL.md create mode 100644 crates/agent-skills/skills/dd-apm/k8s-ssi/agent-install/SKILL.md create mode 100644 crates/agent-skills/skills/dd-apm/k8s-ssi/enable-ssi/SKILL.md create mode 100644 crates/agent-skills/skills/dd-apm/k8s-ssi/onboarding-summary/SKILL.md create mode 100644 crates/agent-skills/skills/dd-apm/k8s-ssi/troubleshoot-ssi/SKILL.md create mode 100644 crates/agent-skills/skills/dd-apm/k8s-ssi/verify-ssi/SKILL.md create mode 100644 crates/agent-skills/skills/dd-apm/linux-ssi/agent-install/SKILL.md create mode 100644 crates/agent-skills/skills/dd-apm/linux-ssi/enable-ssi/SKILL.md create mode 100644 crates/agent-skills/skills/dd-apm/linux-ssi/onboarding-summary/SKILL.md create mode 100644 crates/agent-skills/skills/dd-apm/linux-ssi/troubleshoot-ssi/SKILL.md create mode 100644 crates/agent-skills/skills/dd-apm/linux-ssi/verify-ssi/SKILL.md create mode 100644 crates/agent-skills/skills/dd-apm/service-remapping/SKILL.md create mode 100644 crates/agent-skills/src/lib.rs diff --git a/Cargo.toml b/Cargo.toml index 7198eaf..645fa4f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -68,6 +68,10 @@ browser = [ ] [dependencies] +# Skills content from datadog-labs/agent-skills (local path dep; switch to +# `{ git = "...", rev = "..." 
}` once agent-skills ships a Cargo.toml) +agent-skills = { path = "crates/agent-skills" } + # CLI (optional — not needed for browser WASM library) clap = { version = "4", features = ["derive"], optional = true } clap_complete = { version = "4", optional = true } diff --git a/crates/agent-skills/Cargo.toml b/crates/agent-skills/Cargo.toml new file mode 100644 index 0000000..8d8b3e1 --- /dev/null +++ b/crates/agent-skills/Cargo.toml @@ -0,0 +1,5 @@ +[package] +name = "agent-skills" +version = "0.1.0" +edition = "2021" +publish = false diff --git a/crates/agent-skills/skills/dd-apm/SKILL.md b/crates/agent-skills/skills/dd-apm/SKILL.md new file mode 100644 index 0000000..95ccb11 --- /dev/null +++ b/crates/agent-skills/skills/dd-apm/SKILL.md @@ -0,0 +1,209 @@ +--- +name: dd-apm +description: APM - install, onboard, instrument, enable, set up, configure, traces, services, dependencies, performance analysis. Use for any request involving Datadog APM setup, instrumentation (SSI, ddtrace, agent install), or analysis. +alwaysApply: true +metadata: + version: "1.0.1" + author: datadog-labs + repository: https://github.com/datadog-labs/agent-skills + tags: datadog,apm,tracing,performance,distributed-tracing,dd-apm,install,onboarding,instrumentation,ssi,agent + globs: "**/ddtrace*,**/datadog*.yaml,**/*trace*" +--- + +# Datadog APM + +Distributed tracing, service maps, and performance analysis. + +## Routing — Read This First + +**For Kubernetes APM install / instrument / onboard tasks — follow this procedure exactly:** + +> ⚠️ **Common wrong approaches that LOOK like they work but silently fail:** +> - `helm install datadog datadog/datadog` — the standard chart does NOT support SSI via DatadogAgent CR. Do NOT use it. +> - Adding `ddtrace` imports or `ddtrace-run` to the app — SSI auto-instruments WITHOUT any code changes. +> - `admission.datadoghq.com/enabled` annotations — this is admission controller config injection, not SSI init container injection. 
+ +### Step 0: Pre-flight checks (MANDATORY — run before installing anything) + +```bash +# Check for ddtrace/otel — even without imports it SILENTLY DISABLES SSI at runtime +grep -rn "ddtrace\|dd-trace\|opentelemetry" requirements.txt package.json Gemfile go.mod pom.xml 2>/dev/null || echo "No tracer dependency found" +``` + +If ddtrace found → remove that line entirely from the dependency file, rebuild image, reload into cluster, restart pod. + +### Step 1: Install the Datadog Operator (NOT `datadog/datadog`) + +```bash +helm repo add datadog https://helm.datadoghq.com && helm repo update +helm upgrade --install datadog-operator datadog/datadog-operator \ + --namespace datadog --create-namespace +kubectl wait --for=condition=Ready pod \ + -l app.kubernetes.io/name=datadog-operator -n datadog --timeout=120s + +kubectl create secret generic datadog-secret \ + --from-literal api-key=$DD_API_KEY -n datadog +``` + +### Step 2: Create DatadogAgent CR with SSI enabled + +Save as `datadog-agent.yaml`, then `kubectl apply -f datadog-agent.yaml`: + +```yaml +apiVersion: datadoghq.com/v2alpha1 +kind: DatadogAgent +metadata: + name: datadog + namespace: datadog +spec: + global: + site: + credentials: + apiSecret: + secretName: datadog-secret + keyName: api-key + kubelet: + tlsVerify: false # required for kind/minikube; omit for cloud clusters + features: + apm: + instrumentation: + enabled: true +``` + +### Step 3: Apply Unified Service Tags to the application Deployment + +Add to **both** `metadata.labels` and `spec.template.metadata.labels`: +```yaml +tags.datadoghq.com/env: "dev" +tags.datadoghq.com/service: "" +tags.datadoghq.com/version: "1.0.0" +``` + +### Step 4: Restart app pods and verify SSI init containers + +> **Confirm with the user before restarting.** Tell the user: "I need to restart `` in `` for SSI to inject into the pods. This will cause a brief outage. Ready to proceed?" Wait for confirmation. 
+ +```bash +kubectl rollout restart deployment/<DEPLOYMENT_NAME> -n <APP_NAMESPACE> +kubectl get pods -A -o json | grep -o '"datadog-lib[^"]*"' | sort -u +``` + +Expected: `"datadog-lib-python-init"` (or the language-appropriate init container). + +**Immediately read** `.claude/skills/dd-apm/k8s-ssi/agent-install/SKILL.md` now, then `.claude/skills/dd-apm/k8s-ssi/enable-ssi/SKILL.md` — do not proceed from memory or the summary above. + +--- + +**For Linux APM install/instrument tasks:** **Immediately read** `.claude/skills/dd-apm/linux-ssi/agent-install/SKILL.md` now, then enable-ssi then verify-ssi — do not proceed from memory or the summary above. + +**For trace search, service analysis, metrics:** Continue below. + +## Requirements + +Datadog Labs Pup should be installed. See [Setup Pup](https://github.com/datadog-labs/agent-skills/tree/main?tab=readme-ov-file#setup-pup) if not. + +## Command Execution Order (Token-Efficient) + +For scoped commands, use this order: + +1. Check context first (prior outputs, conversation, saved values). +2. If a required value is missing, run a discovery command first. +3. If still ambiguous, ask the user to confirm. +4. Then run the target command. +5. Avoid speculative commands likely to fail. + +## Quick Start + +```bash +pup auth login +# Confirm env tag with the user first (do not assume production/prod/prd). 
+pup apm services list --env --from 1h --to now +pup traces search --query "service:api-gateway" --from 1h +``` + +## Services + +### List Services + +```bash +pup apm services list --env --from 1h --to now +pup apm services stats --env --from 1h --to now +``` + +### Service Stats + +```bash +pup apm services stats --env --from 1h --to now +``` + +### Service Map + +```bash +# View dependencies +pup apm flow-map --query "service:api-gateway&from=$(($(date +%s)-3600))000&to=$(date +%s)000" --env --limit 10 +``` + +## Traces + +### Search Traces + +```bash +# By service +pup traces search --query "service:api-gateway" --from 1h + +# Errors only +pup traces search --query "service:api-gateway status:error" --from 1h + +# Slow traces (>1s) +pup traces search --query "service:api-gateway @duration:>1000ms" --from 1h + +# With specific tag +pup traces search --query "service:api-gateway @http.url:/api/users" --from 1h +``` + +### Trace Detail + +```bash +# No direct get command for a single trace ID. +# Use traces search with a narrow query and time window. 
+pup traces search --query "trace_id:" --from 1h +``` + +## Key Metrics + +| Metric | What It Measures | +|--------|------------------| +| `trace.http.request.hits` | Request count | +| `trace.http.request.duration` | Latency | +| `trace.http.request.errors` | Error count | +| `trace.http.request.apdex` | User satisfaction | + +## Service Level Objectives + +Link APM to SLOs: + +```bash +pup slos create --file slo.json +``` + +## Common Queries + +| Goal | Query | +|------|-------| +| Slowest endpoints | `avg:trace.http.request.duration{*} by {resource_name}` | +| Error rate | `sum:trace.http.request.errors{*} / sum:trace.http.request.hits{*}` | +| Throughput | `sum:trace.http.request.hits{*}.as_rate()` | + +## Troubleshooting + +| Problem | Fix | +|---------|-----| +| No traces | Check ddtrace installed, DD_TRACE_ENABLED=true | +| Missing service | Verify DD_SERVICE env var | +| Traces not linked | Check trace headers propagated | +| High cardinality | Don't tag with user_id/request_id | + +## References/Docs + +- [APM Setup](https://docs.datadoghq.com/tracing/) +- [Trace Search](https://docs.datadoghq.com/tracing/trace_explorer/) diff --git a/crates/agent-skills/skills/dd-apm/k8s-ssi/agent-install/SKILL.md b/crates/agent-skills/skills/dd-apm/k8s-ssi/agent-install/SKILL.md new file mode 100644 index 0000000..78e20c6 --- /dev/null +++ b/crates/agent-skills/skills/dd-apm/k8s-ssi/agent-install/SKILL.md @@ -0,0 +1,270 @@ +--- +name: agent-install +description: Install the Datadog Agent on Kubernetes using the Datadog Operator — required before enabling Single Step Instrumentation (SSI), which automatically instruments applications for APM without code changes. Only use if no Datadog Agent is deployed on the cluster yet. 
+metadata: + version: "1.0.0" + author: datadog-labs + repository: https://github.com/datadog-labs/agent-skills + tags: datadog,apm,kubernetes,agent,operator,install + alwaysApply: "false" + tools: helm,kubectl,curl,pup +--- + +# Install the Datadog Agent on Kubernetes + +> **Before doing anything else:** Fully resolve all variables in `## Context to resolve before acting`. Do not begin Step 1 until every variable has a concrete value. + +## Phase 0: Load Credentials + +```bash +[ -f environment ] && source environment +echo "DD_API_KEY set: $([ -n "${DD_API_KEY:-}" ] && echo yes || echo no)" +echo "DD_SITE: ${DD_SITE:-not set}" +echo "helm: $(helm version --short 2>/dev/null || echo NOT FOUND)" +``` + +**If `helm` is not found** — tell the user: + +> `helm` is required for this skill. Install it with: +> ```bash +> brew install helm # macOS +> # or see https://helm.sh/docs/intro/install/ for other platforms +> ``` +> Once installed, let me know and I'll continue. + +Do not proceed until `helm` is available. + +**If `DD_API_KEY` is already set** — proceed to Prerequisites. + +**If `DD_API_KEY` is not set** — tell the user: + +> I need two things to continue: +> +> **1. Datadog API Key** — used to authenticate the Agent with your Datadog account. You can find or create one at: https://app.datadoghq.com/organization-settings/api-keys +> +> **2. Datadog Site** — the region your Datadog account is on. Most accounts use `datadoghq.com`. Check your Datadog URL to confirm (e.g. `app.datadoghq.eu` → site is `datadoghq.eu`). Other options: `us3.datadoghq.com`, `us5.datadoghq.com`, `ap1.datadoghq.com`. +> +> Please run the following in this chat to set your credentials (the `!` prefix executes it in this session): +> ``` +> ! export DD_API_KEY=your-api-key-here +> ! export DD_SITE=datadoghq.com +> ``` + +Wait for the user to run the commands, then re-run the check above before continuing. 
+ +--- + +## Prerequisites + +- [ ] Kubernetes v1.20+ — `kubectl version` +- [ ] helm v3+ — `helm version` +- [ ] kubectl configured to target cluster — `kubectl config current-context` +- [ ] pup-cli installed — check with `pup --version`; if missing, install it now: + ```bash + if [[ "$(uname)" == "Darwin" ]]; then + brew tap datadog-labs/pack && brew install pup + else + PUP_VERSION=$(curl -s https://api.github.com/repos/datadog-labs/pup/releases/latest | grep '"tag_name"' | cut -d'"' -f4) + curl -L "https://github.com/datadog-labs/pup/releases/download/${PUP_VERSION}/pup_linux_amd64.tar.gz" | tar xz -C /usr/local/bin pup + chmod +x /usr/local/bin/pup + fi + pup --version + ``` + Do not skip — proceed only once `pup --version` succeeds. + +--- + +## Context to resolve before acting + +| Variable | How to resolve | +|---|---| +| `CLUSTER_NAME` | Check repo IaC, scripts, or `kubectl config current-context` | +| `DD_SITE` | Ask the user. Default: `datadoghq.com`. Common options: `datadoghq.eu`, `us3.datadoghq.com`, `us5.datadoghq.com`, `ap1.datadoghq.com`. Full list: https://docs.datadoghq.com/getting_started/site/ | +| `AGENT_NAMESPACE` | Use `datadog` unless the repo already uses `datadog-agent` consistently | +| `CHART_VERSION` | Run `helm search repo datadog/datadog-operator --versions \| head -5` and use the latest stable | + +--- + +## Step 1: Check for an Existing Agent Installation + +### Claude runs + +```bash +helm list -A | grep -i datadog +``` + +If a release shows `deployed` — Agent already installed. Skip to Step 5 to confirm health, then exit. + +If there is no output — no existing install. Continue to Step 2. 
+ +--- + +## Step 2: Install the Datadog Operator + +### Claude runs + +```bash +helm repo add datadog https://helm.datadoghq.com +helm repo update + +helm upgrade --install datadog-operator datadog/datadog-operator \ + --namespace <AGENT_NAMESPACE> \ + --create-namespace \ + --version <CHART_VERSION> + +kubectl wait --for=condition=Ready pod \ + -l app.kubernetes.io/name=datadog-operator \ + -n <AGENT_NAMESPACE> \ + --timeout=120s +``` + +If the Operator pod is Running — continue to Step 3. + +ERROR: Pod not ready after 120s — check image pull: `kubectl describe pod -l app.kubernetes.io/name=datadog-operator -n <AGENT_NAMESPACE>`. + +--- + +## Step 3: Create the API Key Secret + +### What you need to do in a terminal + +```bash +export DD_API_KEY=<your-api-key> + +kubectl create secret generic datadog-secret \ + --from-literal api-key=$DD_API_KEY \ + --namespace <AGENT_NAMESPACE> +``` + +If `secret/datadog-secret created` — continue to Step 4. + +ERROR: `AlreadyExists` — confirm which key it holds via Step 5 before deciding whether to recreate. + +--- + +## Step 4: Deploy the DatadogAgent Resource + +[DECISION: cluster type] +- Self-hosted (minikube, kind): include `kubelet.tlsVerify: false` inside `spec.global` +- Managed (GKE, EKS, AKS): omit `kubelet.tlsVerify` entirely + +[DECISION: APM/SSI also being enabled in this session] +- If yes: do not create a separate `DatadogAgent` for APM — extend this same manifest with `features.apm` per `enable-ssi`. One manifest, not two. +- If no: use the manifest below as-is. 
+ +Save the following as `datadog-agent.yaml`: + +```yaml +apiVersion: datadoghq.com/v2alpha1 +kind: DatadogAgent +metadata: + name: datadog + namespace: <AGENT_NAMESPACE> +spec: + global: + clusterName: <CLUSTER_NAME> + site: <DD_SITE> + credentials: + apiSecret: + secretName: datadog-secret + keyName: api-key + # Self-hosted clusters only (minikube, kind): + # kubelet: + # tlsVerify: false + features: + orchestratorExplorer: + enabled: true + clusterChecks: + enabled: true + logCollection: + enabled: true + containerCollectAll: false +``` + +### Claude runs + +```bash +kubectl apply -f datadog-agent.yaml + +kubectl wait --for=condition=Ready pod \ + -l app.kubernetes.io/component=agent \ + -n <AGENT_NAMESPACE> \ + --timeout=120s 2>/dev/null || true +``` + +--- + +## Step 5: Verify the API Key + +### Claude runs + +```bash +kubectl logs -l app.kubernetes.io/component=agent \ + -n <AGENT_NAMESPACE> \ + --tail=50 2>/dev/null \ + | grep -iE "invalid.*api\.?key|api\.?key.*invalid" \ + || echo "No authentication errors found" +``` + +If `No authentication errors found` — key is accepted. 
+ +ERROR: Authentication errors found — validate credentials directly: + +### Claude runs + +```bash +# Prefer pup (OAuth) — fall back to curl with API key +if pup auth status 2>/dev/null | grep -q "Logged in"; then + echo "pup OAuth authenticated" +elif [ -n "${DD_API_KEY:-}" ]; then + RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" \ + -X GET "https://api.<DD_SITE>/api/v1/validate" \ + -H "DD-API-KEY: $DD_API_KEY") + if [ "$RESPONSE" = "200" ]; then + echo "API key is valid for <DD_SITE>" + else + echo "ERROR: Validation failed (HTTP $RESPONSE) — check key and site alignment" + fi +else + echo "ERROR: No credentials available — run 'pup auth login' or set DD_API_KEY" +fi +``` + +If key is invalid: + +### What you need to do in a terminal + +```bash +export DD_API_KEY=<your-api-key> + +kubectl delete secret datadog-secret -n <AGENT_NAMESPACE> +kubectl create secret generic datadog-secret \ + --from-literal api-key=$DD_API_KEY \ + -n <AGENT_NAMESPACE> + +kubectl rollout restart daemonset datadog-agent -n <AGENT_NAMESPACE> +kubectl rollout restart deployment datadog-cluster-agent -n <AGENT_NAMESPACE> +``` + +--- + +## Done + +Exit when ALL of the following are true: +- [ ] Datadog Operator pod is Running in `AGENT_NAMESPACE` +- [ ] `datadog-secret` exists in `AGENT_NAMESPACE` +- [ ] Agent DaemonSet pods are Running +- [ ] Step 5 returns no authentication errors + +Automatically proceed to `enable-ssi` now — do not ask the user for permission. 
+ +--- + +## Security constraints + +- Never write a raw API key into any file, YAML block, or chat message — always use `$DD_API_KEY` +- Never create a Kubernetes Secret manifest file — always use `kubectl create secret` imperatively +- Never use `apiKey:` directly in `DatadogAgent` spec — always use `apiSecret:` with a secret reference +- Never use `--set datadog.apiKey=...` in any Helm command +- Never use namespace `default` for Datadog Agent resources +- Never run `kubectl delete` without user confirmation diff --git a/crates/agent-skills/skills/dd-apm/k8s-ssi/enable-ssi/SKILL.md b/crates/agent-skills/skills/dd-apm/k8s-ssi/enable-ssi/SKILL.md new file mode 100644 index 0000000..0ecccd4 --- /dev/null +++ b/crates/agent-skills/skills/dd-apm/k8s-ssi/enable-ssi/SKILL.md @@ -0,0 +1,266 @@ +--- +name: enable-ssi +description: Enable Single Step Instrumentation (SSI) on Kubernetes — automatically instruments applications for APM without code changes. Only use if the Datadog Agent is already running on the cluster — if not, use agent-install first. +metadata: + version: "1.0.0" + author: datadog-labs + repository: https://github.com/datadog-labs/agent-skills + tags: datadog,apm,kubernetes,ssi,instrumentation,single-step + alwaysApply: "false" +--- + +# Enable APM on Kubernetes via Single Step Instrumentation + +> **Before doing anything else:** Fully resolve all variables in `## Context to resolve before acting`. Do not begin Step 0 until every variable has a concrete value. + +--- + +> **Silent failure — check this before any other step:** +> +> If the application has `ddtrace`, `dd-trace`, or any OpenTelemetry SDK in its **dependency manifest** (`requirements.txt`, `package.json`, `Gemfile`, `go.mod`, `pom.xml`) — even with no import statements in code — SSI will silently disable itself at runtime. +> +> The failure is invisible: init containers run and complete, the pod starts healthy, no errors appear in `kubectl` or `pup`, but no traces arrive. 
The injector detects the user-installed tracer and exits cleanly without logging anything. +> +> ### Claude runs +> +> ```bash +> grep -rE "ddtrace|dd-trace|opentelemetry" \ +> requirements.txt package.json Gemfile go.mod pom.xml 2>/dev/null \ +> || echo "No tracer dependency found" +> ``` +> +> If any match — **stop**. Remove the package entirely (not just the import), rebuild the image, reload it into the cluster, and restart the pod before continuing. A package present in the manifest is enough to trigger this even if it is never imported. + +--- + +## Triggers + +Invoke this skill when the user expresses intent to: +- Enable APM on a Kubernetes cluster +- Instrument Kubernetes applications with Datadog tracing +- Set up Single Step Instrumentation (SSI) + +Do NOT invoke this skill if: +- The Datadog Agent is not yet installed — run `agent-install` first +- The user wants to verify SSI after setup — use `verify-ssi` +- The user wants to enable Profiler, AppSec, or Data Streams — use `dd-apm-k8s-sdk-features` + +--- + +## Prerequisites + +> **These are not a reading exercise — actively verify each one before proceeding.** + +**Environment** +- [ ] Datadog Agent is installed and healthy — `agent-install` complete +- [ ] Kubernetes v1.20+ +- [ ] Linux node pools only — Windows pods require explicit namespace exclusion +- [ ] Cluster is not ECS Fargate — unsupported +- [ ] Not a hardened SELinux environment — unsupported +- [ ] Not a very small VM instance (e.g. 
t2.micro) — SSI can hit init timeouts +- [ ] No PodSecurity baseline or restricted policy enforced + +**Language and runtime** +- [ ] Application language is one of: Java, Python, Ruby, Node.js, .NET, PHP +- [ ] Runtime version is within SSI's supported range — verify against the [SSI compatibility matrix](https://docs.datadoghq.com/tracing/trace_collection/automatic_instrumentation/single-step-apm/compatibility/) +- [ ] Node.js app is not using ESM — SSI does not support ESM +- [ ] Java app is not already using a `-javaagent` JVM flag + +**Existing instrumentation** — confirmed clean by the check at the top of this skill. If you skipped that check, go back and run it now. + +--- + +## Context to resolve before acting + +> **Discover from the cluster — do not ask the user for information you can find yourself.** + +| Variable | How to resolve | +|---|---| +| `AGENT_NAMESPACE` | Same namespace used in `agent-install` (e.g. `datadog`) | +| `APP_NAMESPACE` | Run `kubectl get namespaces --no-headers \| awk '{print $1}' \| grep -vE '^(kube-system\|kube-public\|kube-node-lease\|datadog\|local-path-storage)$'` — instrument all non-system namespaces, or use the namespace(s) the user mentioned | +| `TARGET_LANGUAGES` | Run `kubectl get pods -A -o jsonpath='{.items[*].spec.containers[*].image}'` and infer language from image names, or check Dockerfiles/manifests in the workspace. If uncertain, enable all languages. | +| `DEPLOYMENT_NAME` | Run `kubectl get deployments -A --no-headers` — identify application deployments (exclude system components) | +| `APP_LABEL` | Check `spec.selector.matchLabels` in the Deployment manifest via `kubectl get deployment -n -o yaml` | +| `CLUSTER_NAME` | Check `spec.global.clusterName` in `datadog-agent.yaml`, or `kubectl config current-context` — needed for kind clusters in Step 0 | +| `ENV` | Use `apm-evals` if running in an eval cluster (kind cluster names contain "evalya"). Otherwise use `production` unless the user specifies otherwise. 
| +| `SERVICE_NAME` | Use the deployment name (e.g. `python-app` → service `python-app`). Do not ask the user. | +| `VERSION` | Use `1.0.0` as the default. Do not ask the user. | + +--- + +## Step 0 (Only if existing instrumentation detected): Remove Manual Instrumentation + +Scan all source files for: `import ddtrace`, `from ddtrace`, `require 'ddtrace'`, `require("dd-trace")`, `opentelemetry`, `tracer.trace(` + +Also check dependency manifests for `ddtrace` / `dd-trace` / OTel SDK packages. + +If found — remove the import/package, then rebuild and reload: + +### Claude runs + +```bash +docker build -f -t +``` + +[DECISION: how does this cluster get local images?] + +Check the repo's setup script (e.g. `create.sh`, `Makefile`, `justfile`) for how images are loaded — do not guess from the cluster name or context. Common patterns: + +| What you find in the setup script | Load command | +|---|---| +| `minikube image load` or `minikube cache add` | `minikube -p image load ` — profile is the `-p` flag value in the script, NOT necessarily the kubectl context name | +| `kind load docker-image` | `kind load docker-image --name ` | +| `docker push` to a registry | Push the new image; the cluster will pull on restart — skip local load | +| `k3d image import` | `k3d image import -c ` | +| No image load step (cloud cluster, always pulls from registry) | Skip — image will be pulled on next deployment | + +If the setup script is ambiguous, run the load command it uses exactly as written. + +- Registry-based: skip — image will be pulled on next deployment + +> **Confirm with the user before restarting.** Tell the user: "I need to restart `` in `` to pick up the rebuilt image. Ready to proceed?" Wait for confirmation. 
+ +### Claude runs + +```bash +kubectl rollout restart deployment/ -n +kubectl wait --for=condition=Ready pod \ + -l app= \ + -n \ + --timeout=120s +``` + +--- + +## Step 1: Extend the DatadogAgent Manifest with APM + +SSI is configured on the existing `DatadogAgent` resource — do not create a separate manifest. + +**Choose targeting scope based on what the user asked for:** +- User asked to instrument **all applications** or didn't specify scope → **use Option A (cluster-wide)** +- User asked for specific namespaces only → use Option B +- User asked to exclude namespaces from cluster-wide → use Option C +- User asked for specific pods/workloads → use Option D + +> **Default is cluster-wide (Option A).** If the user said "all my applications", "my whole cluster", or didn't restrict scope, use Option A with no `enabledNamespaces` or `targets`. + +Recommended `ddTraceVersions`: `java: "1"`, `python: "2"`, `js: "5"`, `dotnet: "3"`, `ruby: "2"`, `php: "1"` + +**Option A — Cluster-wide (default):** +```yaml +features: + apm: + instrumentation: + enabled: true +``` + +**Option B — Specific namespaces only:** +```yaml +features: + apm: + instrumentation: + enabled: true + enabledNamespaces: + - +``` + +**Option C — Cluster-wide with exclusions:** +```yaml +features: + apm: + instrumentation: + enabled: true + disabledNamespaces: + - jenkins + - kube-system +``` + +**Option D — Target specific workloads:** +```yaml +features: + apm: + instrumentation: + enabled: true + targets: + - name: + namespaceSelector: + matchNames: + - + ddTraceVersions: + : "" +``` + +> **Note:** `ddTraceVersions` only applies inside a `targets[]` entry (Option D). It is not valid alongside `enabledNamespaces` or at the `instrumentation` level directly. + +### Claude runs + +```bash +kubectl apply -f datadog-agent.yaml +``` + +If `datadogagent.datadoghq.com/datadog configured` — continue to Step 2. + +ERROR: Validation error — check YAML. 
`enabledNamespaces` and `disabledNamespaces` cannot both be set. + +--- + +## Step 2: Inform the User About Unified Service Tags + +> **Do NOT modify application Deployments without explicit user confirmation.** Applying labels to existing application workloads is a change to customer-managed resources. + +Inform the user that adding Unified Service Tags (UST) to their Deployments will enable proper service/env/version tagging in Datadog. UST is not required for SSI to work, but it is recommended for full observability: + +```yaml +# Add to both metadata.labels and spec.template.metadata.labels +tags.datadoghq.com/env: "<ENV>" +tags.datadoghq.com/service: "<SERVICE_NAME>" +tags.datadoghq.com/version: "<VERSION>" +``` + +If the user wants you to apply these, get their confirmation first. UST labels are not required for APM traces to flow — SSI works without them. + +--- + +## Step 3: Restart Application Pods + +> **Confirm with the user before restarting.** Tell the user: "I need to restart `<DEPLOYMENT_NAME>` in `<APP_NAMESPACE>` for SSI to inject into the pods. This will cause a brief outage. Ready to proceed?" Wait for confirmation. + +### Claude runs + +```bash +kubectl rollout restart deployment/<DEPLOYMENT_NAME> -n <APP_NAMESPACE> + +kubectl wait --for=condition=Ready pod \ + -l app=<APP_LABEL> \ + -n <APP_NAMESPACE> \ + --timeout=120s +``` + +If pods restart cleanly, init containers named `datadog-lib-<language>-init` will be visible in the pod spec. + +ERROR: Pods crash-looping — check for existing custom instrumentation. See `troubleshoot-ssi`. + +--- + +## Done + +Exit when ALL of the following are true: +- [ ] `features.apm.instrumentation` is present in the applied `DatadogAgent` manifest +- [ ] User has been informed that they need to restart their application pods +- [ ] User has been informed about Unified Service Tags (UST) and how to apply them if desired +- [ ] Scope confirmed: which workloads are instrumented, which were skipped and why + +Automatically proceed to `verify-ssi` now — do not ask the user for permission. 
+ +--- + +## Security constraints + +- Never write a raw API key into any file or chat message +- Never use namespace `default` for Datadog resources +- Never modify `admissionController` settings directly — SSI manages this via the Operator +- Do not add APM config to application manifests — configure only via `DatadogAgent` +- Exception: UST labels (`tags.datadoghq.com/*`) on application Deployments are permitted and intentional — recommended but not required for SSI, and applied only after user confirmation +- Never run `kubectl delete` without user confirmation +- `docker push` to a registry always requires user confirmation +- **Never use `kubectl patch` to apply UST labels or any Deployment changes.** Always edit the Deployment YAML file and `kubectl apply -f`. Changes made with `kubectl patch` are transient and will be overwritten on the next rollout. diff --git a/crates/agent-skills/skills/dd-apm/k8s-ssi/onboarding-summary/SKILL.md b/crates/agent-skills/skills/dd-apm/k8s-ssi/onboarding-summary/SKILL.md new file mode 100644 index 0000000..a9a0b07 --- /dev/null +++ b/crates/agent-skills/skills/dd-apm/k8s-ssi/onboarding-summary/SKILL.md @@ -0,0 +1,129 @@ +--- +name: onboarding-summary +description: Generate a live Single Step Instrumentation (SSI) onboarding confirmation report — verifies APM instrumentation is working end-to-end with deep links into the Datadog UI. Only use after agent-install and enable-ssi have both completed successfully. +metadata: + version: "1.0.0" + author: datadog-labs + repository: https://github.com/datadog-labs/agent-skills + tags: datadog,apm,kubernetes,ssi,summary,verification + alwaysApply: "false" +--- + +# APM Onboarding Summary + +## Triggers + +Invoke this skill when: +- All steps in `verify-ssi` have passed +- All checks in `troubleshoot-ssi` have been resolved +- The user asks "is everything working?", "show me the status", or "confirm APM is set up" + +Do NOT invoke this skill if any verification or troubleshooting check is still failing — resolve those first. 
+ +--- + +## Context to resolve before acting + +| Variable | How to resolve | +|---|---| +| `AGENT_NAMESPACE` | Namespace where Datadog Agent is installed | +| `APP_NAMESPACE` | Namespace of the application | +| `APP_LABEL` | Check `spec.selector.matchLabels.app` in the Deployment manifest | +| `CLUSTER_NAME` | `spec.global.clusterName` in `datadog-agent.yaml` | +| `SERVICE_NAME` | `tags.datadoghq.com/service` label on the Deployment | +| `ENV` | `tags.datadoghq.com/env` label on the Deployment | +| `DD_SITE` | `spec.global.site` in `datadog-agent.yaml` | + +--- + +## Prerequisites + +### Claude runs + +```bash +pup auth status --site +``` + +If valid token — proceed. + +ERROR: Not authenticated: + +### Claude runs + +```bash +pup auth login --site +``` + +> This opens a browser tab for OAuth. Complete the login there — Claude will continue once the command exits. + +--- + +## Collect live confirmation data + +Run all of the following. Each populates a row in the final report. + +### Claude runs + +```bash +# Agent pod count and status +kubectl get pods -n \ + -l app.kubernetes.io/component=agent \ + --no-headers + +# SSI instrumentation config live in cluster +kubectl get datadogagent datadog -n \ + -o jsonpath='{.spec.features.apm.instrumentation}' + +# Init container confirmed in app pod spec +kubectl get pod -l app= -n \ + -o jsonpath='{.items[0].spec.initContainers[*].name}' + +# Pod confirmed instrumented — init containers in pod spec +kubectl get pod -l app= -n \ + -o jsonpath='{.items[0].spec.initContainers[*].name}' + +# Service visible and traced in APM +DD_SITE= pup apm services list --env --from 1h + +# Traces arriving in the last hour +DD_SITE= pup traces search --query "service:" --from 1h --limit 5 +``` + +--- + +## Present the report + +Fill in every value from live command output. Do not leave any placeholder unfilled. If a value cannot be confirmed, mark that row as failed and link to `troubleshoot-ssi`. 
+ +--- + +**APM onboarding complete** + +| Check | Detail | Status | +|---|---|---| +| Datadog Agent | `` pod(s) Running in `` | OK | +| SSI enabled | Targeting namespace ``, language `` v`` | OK | +| Init container injected | `datadog-lib--init` present in pod spec | OK | +| Pod instrumented | `` in `pup fleet instrumented-pods list` | OK | +| Tracer reporting | Service ``, ``, tracer v`` | OK | +| APM service visible | `` in env `` | OK | +| Traces arriving | `` trace(s) found in the last hour | OK | + +--- + +**Your service in Datadog — click to open:** + +Construct each URL by substituting real values. Do not print placeholder URLs. + +| View | URL | +|---|---| +| Service overview | `https://app./apm/services/?env=` | +| Traces explorer | `https://app./apm/traces?query=service:%20env:` | +| Service map | `https://app./apm/map?env=&service=` | +| Agent fleet | `https://app./fleet-automation` | + +--- + +## Security constraints + +- Never write a raw API key into any file or chat message diff --git a/crates/agent-skills/skills/dd-apm/k8s-ssi/troubleshoot-ssi/SKILL.md b/crates/agent-skills/skills/dd-apm/k8s-ssi/troubleshoot-ssi/SKILL.md new file mode 100644 index 0000000..134970a --- /dev/null +++ b/crates/agent-skills/skills/dd-apm/k8s-ssi/troubleshoot-ssi/SKILL.md @@ -0,0 +1,405 @@ +--- +name: troubleshoot-ssi +description: Diagnose and fix Single Step Instrumentation (SSI) issues on Kubernetes — SSI automatically instruments applications for APM without code changes. Only use if the agent and SSI are already configured but traces are missing or instrumentation is not working. 
+metadata: + version: "1.0.0" + author: datadog-labs + repository: https://github.com/datadog-labs/agent-skills + tags: datadog,apm,kubernetes,ssi,troubleshooting,instrumentation + alwaysApply: "false" +--- + +# Troubleshoot APM SSI on Kubernetes + +## Triggers + +Invoke this skill when the user expresses intent to: +- Debug why a pod is not being instrumented +- Investigate why traces are not appearing in Datadog +- Diagnose admission webhook or init container injection failures +- Follow up on failed checks from `verify-ssi` +- Report that a specific service or pod has no traces + +Do NOT invoke this skill if: +- SSI has not been enabled yet — run `enable-ssi` first + +--- + +## Prerequisites + +- [ ] kubectl configured to target cluster — `kubectl config current-context` + +### pup-cli: check, install, and authenticate + +### Claude runs + +```bash +pup --version +``` + +If not found, install it (OS-aware): + +### Claude runs + +```bash +if [[ "$(uname)" == "Darwin" ]]; then + brew tap datadog-labs/pack && brew install pup +else + PUP_VERSION=$(curl -s https://api.github.com/repos/datadog-labs/pup/releases/latest | grep '"tag_name"' | cut -d'"' -f4) + curl -L "https://github.com/datadog-labs/pup/releases/download/${PUP_VERSION}/pup_linux_amd64.tar.gz" | tar xz -C /usr/local/bin pup + chmod +x /usr/local/bin/pup +fi +pup --version +``` + +Check auth: +```bash +pup auth status +``` + +If not authenticated: + +### Claude runs + +```bash +pup auth login +``` + +> This opens a browser tab for OAuth. Complete the login there — Claude will continue once the command exits. + +If no browser available: `export DD_APP_KEY=`. 
+ +--- + +## Context to resolve before acting + +| Variable | How to resolve | +|---|---| +| `AGENT_NAMESPACE` | Namespace where Datadog Agent is installed | +| `APP_NAMESPACE` | Namespace of the application with missing traces | +| `CLUSTER_NAME` | `kubectl config current-context` or `spec.global.clusterName` in `datadog-agent.yaml` | +| `SERVICE_NAME` | `tags.datadoghq.com/service` label on the Deployment, or ask the user | +| `ENV` | `tags.datadoghq.com/env` label on the Deployment, or ask the user | +| `POD_NAME` | `kubectl get pods -n ` — use the specific pod the user mentioned | +| `DEPLOYMENT_NAME` | Check `metadata.name` in the Deployment manifest, or ask the user | +| `APP_LABEL` | Check `spec.selector.matchLabels.app` in the Deployment manifest | + +--- + +## How SSI Works — Domain Knowledge + +Read this before investigating. It gives you the mental model to reason about novel failures, not just known ones. + +**Injection chain:** +1. Admission webhook (registered by Cluster Agent) intercepts pod creation +2. Webhook mutates the pod spec — adds a `datadog-lib--init` init container +3. Init container downloads the tracer library onto a shared volume +4. `LD_PRELOAD` env var is set pointing to the library `.so` file +5. Application process loads the library automatically on startup via `LD_PRELOAD` + +**What each diagnostic layer can see:** +- **pup** — sees what Datadog's backend received. Blind to cluster-side injection failures. If pup shows no instrumented pods, the problem is in the cluster. +- **kubectl** — sees cluster state. Blind to whether data reached Datadog. If kubectl shows the init container but pup shows no traces, the problem is post-injection. 
+ +**What healthy looks like:** +- `pup fleet instrumented-pods list` shows the pod with correct language/version +- `pup fleet tracers list` shows the service as active +- `kubectl get pod -o jsonpath='{.spec.initContainers[*].name}'` includes `datadog-lib--init` + +**Known silent failures — SSI produces no error when these occur:** +- **Existing ddtrace or OTel instrumentation** — SSI detects it and silently disables itself +- **Unsupported runtime version** — silently skipped +- **`admission.datadoghq.com/enabled: "false"` annotation** — webhook skips the pod entirely +- **Pod not restarted after SSI enabled** — injection happens at startup; existing pods keep running uninstrumented +- **Pod in Agent namespace** — SSI never instruments its own namespace + +**Reasoning shortcuts:** +- No init container → webhook didn't fire → check: namespace targeting, pod-selector, opt-out annotation, webhook registration, pod not restarted +- Init container present + no traces → injection attempted but failed or tracer not reporting → check: existing ddtrace, runtime version, Agent connectivity, DD_SITE mismatch + +--- + +## Step 1: Triage + +Run all seven simultaneously and surface them back to the user as the diagnostics you're running. Everything after this is driven by what you find here. Resolve `` from `kubectl get pod -n -o jsonpath='{.spec.nodeName}'` once you have a pod name; if no pod context yet, run the `pup` commands without `--hostname` first. 
+ +### Claude runs + +```bash +pup traces search --query "service:" --from 1h --limit 5 +pup fleet instrumented-pods list +pup apm troubleshooting list --hostname --timeframe 1h +pup apm service-library-config get --service-name --env +kubectl get pod -n \ + -o jsonpath='{.spec.initContainers[*].name}' +kubectl describe pod -n | grep -A 10 "Events:" +kubectl get mutatingwebhookconfigurations | grep datadog +``` + +The last command confirms the Admission Controller webhook is registered cluster-wide — this is the precondition for SSI injection working at all and must be checked even when most other services are being instrumented (any deviation in one webhook config can silently skip a subset of pods). + +`pup apm troubleshooting list` surfaces injection errors that Datadog's backend received from the cluster — these point to cluster-side mutation failures that may not be visible from `kubectl describe` alone. `pup apm service-library-config get` shows the runtime SDK config the tracer is operating under; an empty result with `ddTraceConfigs` configured, or unexpected values, points to UST/config-propagation issues. + +--- + +## Step 2: State Your Hypotheses + +Before investigating, explicitly state your ranked hypotheses based on triage output. Do not skip this step. + +**When the user reports multiple affected services in the same namespace, diagnose each independently.** Two pods can fail injection for entirely different reasons (one opt-out annotation, one missing namespace label, one with pre-existing ddtrace). Do not assume a shared root cause — investigate each service's pod spec, annotations, and runtime separately and surface findings per-service. + +| Triage signal | Strong hypothesis | +|---|---| +| Traces arriving + pod in instrumented list | Not a real problem — likely a UI filter or time window. 
Tell the user and stop | +| No traces + pod NOT in instrumented list + no init container | Injection never happened — investigate: namespace targeting, webhook, pod-selector, opt-out annotation, pod not restarted | +| No traces + pod NOT in instrumented list + init container present | Injection attempted but failed — check `pup apm troubleshooting list` for injection errors | +| No traces + pod in instrumented list + init container present | Tracer injected but not reporting — investigate: connectivity, DD_SITE, API key | +| Pod events show CrashLoopBackOff or init container errors | Init container failure — check existing ddtrace, runtime version | +| Traces arriving but wrong service/env | UST labels missing or misconfigured on the Deployment | + +State your top 1-3 hypotheses explicitly: *"Based on triage, I think the most likely cause is X because Y."* + +--- + +## Step 3: Investigate + +Use only the tools relevant to your hypotheses. Each observation informs your next action. + +--- + +### Cluster-side investigation tools + +**Is the pod in the Agent namespace?** +SSI never instruments pods in the same namespace as the Datadog Agent. +```bash +kubectl get pods -n +``` + +**Were pods restarted after SSI was enabled?** + +> **Confirm with the user before restarting.** Tell the user: "Pods must be restarted for SSI to inject into them. I'll restart `` in ``. Ready to proceed?" Wait for confirmation. + +### Claude runs + +```bash +kubectl rollout restart deployment/ -n +kubectl wait --for=condition=Ready pod -l app= -n --timeout=120s +``` + +### Claude runs + +```bash +pup fleet instrumented-pods list +``` + +**Does the namespace carry the Admission Controller opt-in label?** +When the Admission Controller runs with `mutateUnlabelled: false`, injection happens only in namespaces explicitly labeled `admission.datadoghq.com/mutate-pods=true`. 
A namespace missing this label silently has SSI skipped for every pod in it — a common cause when most cluster services are instrumented but one namespace's services aren't. + +```bash +kubectl get namespace -o jsonpath='{.metadata.labels}' +kubectl get namespace --show-labels +``` + +Fix: label the namespace, then restart the affected deployments so the AC mutates them on pod recreate. +```bash +kubectl label namespace admission.datadoghq.com/mutate-pods=true +``` + +**Is namespace targeting filtering the pod out?** +```bash +kubectl get datadogagent datadog -n -o yaml | grep -A 15 instrumentation +``` +Fix: update `enabledNamespaces` in `datadog-agent.yaml`. + +### Claude runs + +```bash +kubectl apply -f datadog-agent.yaml +``` + +**Is a `podSelector` target filtering the pod out?** +If `targets` with `podSelector` is configured, only pods whose labels match the selector are instrumented. Check whether the app pod's labels match any target: +```bash +kubectl get datadogagent datadog -n -o yaml | grep -A 20 targets +kubectl get pod -n --show-labels +``` +Fix: add a matching label to the pod template, or broaden the `podSelector`, then apply and restart. + +**Is a pod annotation opting it out — or missing the AC's injection-success annotation?** +Two annotations to look for: +- `admission.datadoghq.com/enabled: "false"` — explicit opt-out, AC skips the pod. +- `admission.datadoghq.com/status: injected` — set by the AC after successful mutation; its **absence** on a running pod is positive evidence the AC never mutated it. + +```bash +kubectl get pod -n -o jsonpath='{.metadata.annotations}' +kubectl get pod -n -o yaml | grep -A 10 annotations +``` +Fix: remove an opt-out annotation from the Deployment pod template, then apply and restart. + +**Are the expected `DD_*` environment variables present in the running pod?** +SSI injects `DD_SERVICE`, `DD_ENV`, `DD_VERSION`, `DD_TRACE_*`, and `LD_PRELOAD` into the container env when it mutates a pod. 
Their absence confirms the mutation did not run; their presence with unexpected values points to UST label mismatches or `ddTraceConfigs` issues. + +```bash +kubectl exec -n -- env | grep -E '^(DD_|LD_PRELOAD)' +kubectl describe pod -n | grep -E 'DD_|LD_PRELOAD' +``` + +### Claude runs + +```bash +kubectl apply -f +``` + +> **Confirm with the user before restarting.** Tell the user: "I need to restart `` in `` for this change to take effect. Ready to proceed?" Wait for confirmation. + +### Claude runs + +```bash +kubectl rollout restart deployment/ -n +``` + +**Does the app have existing custom instrumentation?** +SSI silently disables itself when it detects existing tracer code. Scan source files for: +- Python: `import ddtrace`, `ddtrace.patch_all()` +- Node.js: `require('dd-trace')`, `DD.init()` +- Java: `GlobalTracer.register(`, `dd-java-agent` +- .NET: `Tracer.Instance`, `DD.Trace` +- Ruby: `require 'ddtrace'`, `Datadog.configure` +- PHP: `DDTrace\` + +Also check dependency manifests: `requirements.txt`, `package.json`, `Gemfile`, `pom.xml`. + +Fix: remove the import/package, rebuild image, reload into cluster, restart pod. + +**Is the base image Alpine (musl libc)?** +K8s SSI injects `LD_PRELOAD` as an environment variable into the pod — it does not rely on `/etc/ld.so.preload`, so musl/Alpine images are supported. This is not a blocker for Kubernetes SSI. + +**Is the runtime version supported?** +```bash +kubectl exec -n -- python --version +kubectl exec -n -- node --version +kubectl exec -n -- java -version +``` +Verify against [SSI compatibility matrix](https://docs.datadoghq.com/tracing/trace_collection/automatic_instrumentation/single-step-apm/compatibility/). 
+ +**Is the admission webhook registered?** +```bash +kubectl get mutatingwebhookconfigurations | grep datadog +kubectl get pods -n -l app=datadog-cluster-agent +kubectl logs -n -l app=datadog-cluster-agent --tail=100 +``` + +**Did injection produce errors?** +Get the node hostname first, then query Datadog for injection errors: +```bash +kubectl get pod -n -o jsonpath='{.spec.nodeName}' +pup apm troubleshooting list --hostname --timeframe 1h +``` + +**Is the Agent sending data to Datadog?** +```bash +kubectl exec -n \ + $(kubectl get pod -n -l app=datadog-agent -o name | head -1) \ + -- agent status | grep -A 5 "APM Agent" +``` + +--- + +### Datadog-side investigation tools + +**Is the tracer reporting?** +```bash +pup fleet tracers list --filter "service:" +``` + +**Does APM recognise the service?** +```bash +pup apm services list --env +``` + +**What SDK configuration is the service running with?** +Shows env vars the tracer is configured with (e.g. `DD_TRACE_ENABLED`, `DD_SERVICE`, `DD_ENV`, sampling rules). Empty output is expected if `ddTraceConfigs` was not set in `enable-ssi`; a populated output mismatching what was configured indicates the change didn't propagate. +```bash +pup apm service-library-config get --service-name --env +``` + +**Are traces arriving?** +```bash +pup traces search --query "service:" --from 1h --limit 10 +``` + +**Which agent is the tracer connected to?** +Use if connectivity between tracer and Agent is suspected. +```bash +pup fleet agents list --filter "hostname:" +pup fleet agents tracers --filter "service:" +``` + +--- + +## Step 4: Reflect Before Concluding + +Before applying any fix, answer: +1. What evidence confirms my hypothesis? +2. What evidence would contradict it — and have I checked? +3. Is there a simpler explanation I haven't considered? + +If the conclusion doesn't hold up, return to Step 2 with new hypotheses. Keep iterating until you can defend the conclusion against all three questions. 
+ +--- + +## Step 5: Fix + +Apply the fix for the confirmed root cause. If the fix requires a code or Dockerfile change, rebuild and reload: + +### Claude runs + +```bash +docker build -f -t +``` + +[DECISION: cluster type] +- kind (local): load the image into the cluster + +### Claude runs + +```bash +kind load docker-image --name +``` + +- Registry-based: skip — image will be pulled on next deployment + +> **Confirm with the user before restarting.** Tell the user: "I need to restart `` in `` to apply the fix. Ready to proceed?" Wait for confirmation. + +### Claude runs + +```bash +kubectl rollout restart deployment/ -n +kubectl wait --for=condition=Ready pod -l app= -n --timeout=120s +``` + +--- + +## Step 6: Verify + +Re-run triage to confirm the fix worked: + +### Claude runs + +```bash +pup traces search --query "service:" --from 1h --limit 5 +pup fleet instrumented-pods list +``` + +If traces are arriving and the pod is in the instrumented list — resolved. Automatically proceed to `onboarding-summary` now — do not ask the user for permission. + +ERROR: Still not resolved — return to Step 2 with the new triage data and form updated hypotheses. + +--- + +## Security constraints + +- Never write a raw API key into any file or chat message +- Never run `kubectl delete` without user confirmation +- Never modify `admissionController` settings directly +- `docker push` to a registry always requires user confirmation diff --git a/crates/agent-skills/skills/dd-apm/k8s-ssi/verify-ssi/SKILL.md b/crates/agent-skills/skills/dd-apm/k8s-ssi/verify-ssi/SKILL.md new file mode 100644 index 0000000..5b3a155 --- /dev/null +++ b/crates/agent-skills/skills/dd-apm/k8s-ssi/verify-ssi/SKILL.md @@ -0,0 +1,159 @@ +--- +name: verify-ssi +description: Verify Single Step Instrumentation (SSI) is working end-to-end on Kubernetes — SSI automatically instruments applications for APM without code changes. Only use after enable-ssi has run. 
+metadata: + version: "1.0.0" + author: datadog-labs + repository: https://github.com/datadog-labs/agent-skills + tags: datadog,apm,kubernetes,ssi,verification,instrumentation + alwaysApply: "false" +--- + +# Verify APM SSI on Kubernetes + +> **Before doing anything else:** Fully resolve all variables in `## Context to resolve before acting`. Do not begin Step 1 until every variable has a concrete value. + +## Triggers + +Invoke this skill when the user expresses intent to: +- Confirm SSI is working after enabling APM +- Check whether pods are being instrumented +- Verify the tracer is running and reporting telemetry +- Confirm tracer config is applied correctly + +Do NOT invoke this skill if: +- SSI has not been enabled yet — run `enable-ssi` first +- Pods are not being instrumented at all — use `troubleshoot-ssi` + +--- + +## Prerequisites + +- [ ] `enable-ssi` is complete +- [ ] Application pods have been restarted since SSI was enabled + +### pup-cli: check, install, and authenticate + +### Claude runs + +```bash +pup --version +``` + +If not found: + +### Claude runs + +```bash +brew tap datadog-labs/pack +brew install pup +``` + +Check auth: +```bash +pup auth status --site +``` + +If not authenticated: + +### Claude runs + +```bash +pup auth login --site +``` + +> This opens a browser tab for OAuth. Complete the login there — Claude will continue once the command exits. + +If valid token — proceed. 
+ERROR: No browser available — use API key fallback: `export DD_APP_KEY=` + +--- + +## Context to resolve before acting + +| Variable | How to resolve | +|---|---| +| `CLUSTER_NAME` | Check `spec.global.clusterName` in `datadog-agent.yaml`, or `kubectl config current-context` | +| `ENV` | Check `tags.datadoghq.com/env` label on the application Deployment | +| `SERVICE_NAME` | Check `tags.datadoghq.com/service` label on the application Deployment | + +--- + +## Step 1: Confirm Pods are Instrumented + +### Claude runs + +```bash +kubectl get pod -l app= -n \ + -o jsonpath='{.items[0].spec.initContainers[*].name}' +``` + +If the output includes `datadog-lib--init` and `datadog-init-apm-inject` — SSI init containers are injected. + +ERROR: Init containers missing — pod was not restarted after SSI was enabled, or namespace targeting is not matching. Restart the pod and recheck. + +--- + +## Step 2: Confirm the Tracer is Reporting Telemetry + +### Claude runs + +```bash +DD_SITE= pup apm services list --env --from 1h +``` + +If `` appears in the services list with `isTraced: true` — continue to Step 3. + +ERROR: Service missing — send some traffic to the app first, then retry: + +### Claude runs + +```bash +# Port-forward and send test traffic +kubectl port-forward deployment/ 8099:8000 -n & +sleep 2 && for i in $(seq 1 10); do curl -s -o /dev/null http://localhost:8099/; done +sleep 30 && kill %1 2>/dev/null +DD_SITE= pup apm services list --env --from 10m +``` + +ERROR: Still missing after traffic — check the agent's trace receiver: `kubectl exec -n -c agent -- agent status | grep -A 10 "Receiver (previous minute)"`. If receiver shows 0 traces, go to `troubleshoot-ssi`. + +--- + +## Step 3: Confirm Tracer Configuration + +**Only run this step if `ddTraceConfigs` was explicitly configured in `enable-ssi`** (e.g. profiling, AppSec, Data Streams). If basic SSI was set up without `ddTraceConfigs`, skip this step — an empty response here is expected and not a failure. 
+ +### Claude runs + +```bash +pup apm service-library-config get \ + --service-name <SERVICE_NAME> \ + --env <ENV> +``` + +If the output shows expected environment variables matching what was configured in `ddTraceConfigs` — done. + +If the output is empty and `ddTraceConfigs` was not configured — expected, not a failure. + +ERROR: Config missing but `ddTraceConfigs` was configured — check it is present in the `DatadogAgent` manifest under the correct target, and that pods were restarted after the config change. + +--- + +## Done + +Exit when ALL of the following are true: +- [ ] Step 1: SSI init containers are present in the target pod spec +- [ ] Step 2: service appears in `pup apm services list` with `isTraced: true` +- [ ] Step 3: tracer config matches what was set in `DatadogAgent` (or was skipped because `ddTraceConfigs` is not configured) + +If any check fails, go to `troubleshoot-ssi`. + +When all steps pass, automatically proceed to `onboarding-summary` now — do not ask the user for permission. + +--- + +## Security constraints + +- Never write a raw API key into any file or chat message +- Never run `kubectl delete` without user confirmation diff --git a/crates/agent-skills/skills/dd-apm/linux-ssi/agent-install/SKILL.md b/crates/agent-skills/skills/dd-apm/linux-ssi/agent-install/SKILL.md new file mode 100644 index 0000000..b995617 --- /dev/null +++ b/crates/agent-skills/skills/dd-apm/linux-ssi/agent-install/SKILL.md @@ -0,0 +1,280 @@ +--- +name: agent-install +description: Install the Datadog Agent on Linux hosts via SSH with Single Step Instrumentation (SSI) enabled — SSI automatically instruments applications for APM without code changes. Only use if no agent is installed yet. +metadata: + version: "1.0.0" + author: datadog-labs + repository: https://github.com/datadog-labs/agent-skills + tags: datadog,apm,linux,agent,install,ssi,ssh + alwaysApply: "false" +--- + +# Install Datadog Agent on Linux + +> **Before doing anything else:** Fully resolve all variables in `## Context to resolve before acting`. 
Do not begin Step 1 until every variable has a concrete value. + +## Triggers + +Invoke this skill when the user expresses intent to: +- Install the Datadog Agent on Linux hosts or VMs +- Set up Datadog monitoring on bare-metal or cloud Linux instances +- Prepare Linux hosts for APM onboarding + +Do NOT invoke this skill if: +- The Agent is already installed on all hosts — check with `datadog-agent status` first +- The target is a Kubernetes cluster — use `dd-apm-k8s-agent-install` instead + +--- + +## Phase 0: Load Credentials + +```bash +[ -f environment ] && source environment +echo "DD_API_KEY set: $([ -n "${DD_API_KEY:-}" ] && echo yes || echo no)" +echo "DD_SITE: ${DD_SITE:-not set}" +``` + +**If `DD_API_KEY` is already set** — proceed directly to gathering infrastructure info. + +**If `DD_API_KEY` is not set** — tell the user: + +> Please run the following in this chat to set your credentials (the `!` prefix executes it in this session): +> ``` +> ! export DD_API_KEY=your-api-key-here +> ! export DD_SITE=datadoghq.com +> ``` + +Wait for the user to run the commands, then re-run the check above before continuing. + +--- + +## Phase 1: Gather Infrastructure Info + +Only do this phase if the user hasn't already provided the information. If SSH credentials are known, skip to Phase 2. + +Ask the user: +1. **Which hosts** need the agent? Get a list of IPs or hostnames. +2. **How do I SSH to them?** Get the SSH user, key path, and any jump host or bastion configuration. +3. **Do any hosts already have the Datadog Agent installed?** If so, skip install for those hosts and go straight to `verify-ssi`. + +### Claude runs + +Verify SSH works for each host before proceeding: + +```bash +ssh -o StrictHostKeyChecking=no -i @ "hostname" +``` + +If it returns a hostname — proceed. +ERROR: Connection refused or timeout — resolve connectivity before continuing. + +Once SSH is confirmed, present a plan to the user before proceeding. 
For example: + +``` +Here's what I'm going to do: + 1. Install the Datadog Agent with SSI on: , , ... + 2. Verify each agent is running and healthy + 3. Discover services on each host that need restarting for SSI to take effect + 4. After you restart services, verify instrumentation is working + +Ready to proceed? +``` + +Wait for user confirmation before starting installs. + +--- + +## Prerequisites + +**Per host — check before installing:** + +### Claude runs + +```bash +ssh -o StrictHostKeyChecking=no -i @ \ + "uname -m && cat /etc/os-release | grep -E '^(ID|VERSION_ID|PRETTY_NAME)='" +``` + +If architecture is `x86_64` or `aarch64`, and the OS is a supported distribution (Ubuntu 16.04+, Debian 9+, RHEL/CentOS 6-9, Amazon Linux 2/2023, SUSE 12+) — proceed. + +ERROR: Architecture is `armv7l` (32-bit ARM) or unsupported OS — stop. Datadog Agent 7 and SSI do not support this configuration. + +--- + +## Context to resolve before acting + +| Variable | How to resolve | +|---|---| +| `DD_API_KEY` | Check `echo $DD_API_KEY` first — if set, use it. Otherwise ask the user for their API key from Datadog UI: Organization Settings → API Keys. Never log or print the key. | +| `DD_SITE` | Check `echo $DD_SITE` first — if set, use it. Otherwise ask the user. Default: `datadoghq.com`. Options: `datadoghq.com`, `us3.datadoghq.com`, `us5.datadoghq.com`, `datadoghq.eu`, `ap1.datadoghq.com` | +| `SSH_KEY` | Ask the user for the path to their SSH private key, or check `CLAUDE.md` | +| `SSH_USER` | Ask the user for the SSH username. Default: `root` | +| `SSH_HOST` | Ask the user for the hostname or IP of the target host | +| `SSH_PORT` | Ask the user for the SSH port. Default: `22` | + +--- + +## Phase 2: Install the Datadog Agent with SSI + +Run for each host that does not already have the agent installed. 
+ +### Claude runs + +```bash +ssh -o StrictHostKeyChecking=no -i <SSH_KEY> <SSH_USER>@<SSH_HOST> \ + "DD_API_KEY=${DD_API_KEY} DD_SITE=${DD_SITE} DD_APM_INSTRUMENTATION_ENABLED=host bash -c \"\$(curl -L https://install.datadoghq.com/scripts/install_script_agent7.sh)\"" +``` + +`DD_APM_INSTRUMENTATION_ENABLED=host` causes the install script to also install `datadog-apm-inject` and language library packages under `/opt/datadog-packages/` in one pass. + +If the script completes without errors — proceed to Phase 3. + +ERROR: `curl: command not found`: +```bash +ssh -o StrictHostKeyChecking=no -i <SSH_KEY> <SSH_USER>@<SSH_HOST> \ + "apt-get install -y curl 2>/dev/null || yum install -y curl" +``` + +ERROR: Permission error — ensure the SSH user has sudo access. The install script requires root. + +ERROR: Script fails with GPG key error — retry; if it persists, check the host's DNS resolution for `keys.datadoghq.com`. + +--- + +## Phase 3: Verify the Agent is Running and Healthy + +### Claude runs + +```bash +ssh -o StrictHostKeyChecking=no -i <SSH_KEY> <SSH_USER>@<SSH_HOST> \ + "sudo datadog-agent status 2>&1 | head -40" +``` + +Healthy output shows: +- `Agent (v7.XX.X)` with `Status: Running` +- `API Keys status: API Key ending with XXXX: Valid` + +ERROR: `command not found` — installation did not complete. Re-run Phase 2. 
+ +ERROR: `API key invalid` — update and restart: +```bash +ssh -o StrictHostKeyChecking=no -i @ \ + "sudo sed -i 's/^api_key:.*/api_key: /' /etc/datadog-agent/datadog.yaml && \ + (sudo systemctl restart datadog-agent 2>/dev/null || sudo service datadog-agent restart)" +``` + +ERROR: Agent service not running: +```bash +ssh -o StrictHostKeyChecking=no -i @ \ + "sudo systemctl start datadog-agent 2>/dev/null && sudo systemctl enable datadog-agent 2>/dev/null || sudo service datadog-agent start" +``` + +**Verify APM inject packages are present on disk** (not just registered): +```bash +ssh -o StrictHostKeyChecking=no -i @ \ + "ls /opt/datadog-packages/ && sudo datadog-installer status 2>/dev/null | grep apm | head -10" +``` + +If `/opt/datadog-packages/datadog-apm-inject` exists — injection is available. + +ERROR: Directory missing or empty — `datadog-installer status` may show the package as registered while its directory is actually empty (stale registration). Reinstall: +```bash +ssh -o StrictHostKeyChecking=no -i @ \ + "sudo datadog-installer remove datadog-apm-inject && \ + DD_API_KEY=${DD_API_KEY} DD_SITE=${DD_SITE} DD_APM_INSTRUMENTATION_ENABLED=host bash -c \"\$(curl -L https://install.datadoghq.com/scripts/install_script_agent7.sh)\"" +``` + +**Verify hostname registration** — the Agent must resolve and register its hostname for the host to appear in Datadog. DNS lookup failures are common in containers and minimal VMs: + +```bash +ssh -o StrictHostKeyChecking=no -i @ \ + "sudo datadog-agent status 2>&1 | grep -iE '^\s+Hostname' | head -3" +``` + +If `Hostname: ` is shown — hostname resolved. Record this as `DD_HOSTNAME` for all subsequent steps. + +ERROR: `Hostname: (none)` or any DNS resolution error — the agent can't resolve its own FQDN. 
Fix by setting the hostname explicitly in `datadog.yaml`: + +```bash +# Read the actual system hostname +ACTUAL_HOSTNAME=$(ssh -o StrictHostKeyChecking=no -i <SSH_KEY> <SSH_USER>@<SSH_HOST> "hostname") + +# Append to datadog.yaml only if not already set +ssh -o StrictHostKeyChecking=no -i <SSH_KEY> <SSH_USER>@<SSH_HOST> \ + "grep -q '^hostname:' /etc/datadog-agent/datadog.yaml || \ + echo \"hostname: ${ACTUAL_HOSTNAME}\" | sudo tee -a /etc/datadog-agent/datadog.yaml" + +# Restart the Agent +ssh -o StrictHostKeyChecking=no -i <SSH_KEY> <SSH_USER>@<SSH_HOST> \ + "sudo systemctl restart datadog-agent 2>/dev/null || sudo service datadog-agent restart" + +# Confirm hostname is now registered +ssh -o StrictHostKeyChecking=no -i <SSH_KEY> <SSH_USER>@<SSH_HOST> \ + "sudo datadog-agent status 2>&1 | grep -iE '^\s+Hostname' | head -2" +``` + +--- + +## Phase 4: Discover Services That Need Restarting + +SSI only injects into processes at startup. Existing processes keep running uninstrumented until restarted. Discover what's running so the user knows what to restart. + +### Claude runs + +```bash +ssh -o StrictHostKeyChecking=no -i <SSH_KEY> <SSH_USER>@<SSH_HOST> \ + "sudo ss -lntp 2>/dev/null || sudo netstat -tlnp 2>/dev/null || cat /proc/net/tcp" +``` + +For each application-level listener (ignore sshd, systemd, chronyd): + +```bash +ssh -o StrictHostKeyChecking=no -i <SSH_KEY> <SSH_USER>@<SSH_HOST> " +# Command line of the process +sudo cat /proc/<PID>/cmdline | tr '\0' ' ' +# Service manager (may not be available in all environments) +sudo systemctl status <PID> 2>/dev/null | head -3 || true +# Parent process (PPID itself is a readonly variable in bash, so use another name) +PARENT_PID=\$(sudo awk '/PPid/ {print \$2}' /proc/<PID>/status) +sudo cat /proc/\$PARENT_PID/cmdline | tr '\0' ' ' +" +``` + +Present findings to the user: + +``` +I found the following application services on <SSH_HOST>: + + Port 8080 — PID 1234 — /usr/bin/python3 /app/server.py + Managed by: systemd unit flask-app.service + + Port 3000 — PID 5678 — node /app/server.js + Managed by: supervisord + +These services need to be restarted for Datadog SSI to inject into them. 
+Restart them however is appropriate for your environment, then let me know +and I'll verify the instrumentation. +``` + +**Do not offer to restart services. Do not restart services unless the user explicitly asks.** + +--- + +## Done + +Exit when ALL of the following are true: +- [ ] Agent running on each target host (`datadog-agent status` shows Running, API key valid) +- [ ] `/opt/datadog-packages/datadog-apm-inject` exists on disk on each host +- [ ] User has been informed which services need restarting +- [ ] User has confirmed they are ready to restart services + +Automatically proceed to `enable-ssi` (if services need UST labels configured) or `verify-ssi` (if services have already been restarted) — do not ask the user for permission. + +--- + +## Security constraints + +- Never write a raw API key into any file or chat message +- Never store `DD_API_KEY` in shell history — pass it inline in the SSH command only +- If the user's API key appears in any output, redact it before displaying +- Always confirm before restarting production services diff --git a/crates/agent-skills/skills/dd-apm/linux-ssi/enable-ssi/SKILL.md b/crates/agent-skills/skills/dd-apm/linux-ssi/enable-ssi/SKILL.md new file mode 100644 index 0000000..bcc5941 --- /dev/null +++ b/crates/agent-skills/skills/dd-apm/linux-ssi/enable-ssi/SKILL.md @@ -0,0 +1,235 @@ +--- +name: enable-ssi +description: Configure Unified Service Tags and verify Single Step Instrumentation (SSI) injection on Linux hosts — SSI automatically instruments applications for APM without code changes. Only use if the Datadog Agent is already installed. +metadata: + version: "1.0.0" + author: datadog-labs + repository: https://github.com/datadog-labs/agent-skills + tags: datadog,apm,linux,ssi,instrumentation,single-step,ld-preload,ust + alwaysApply: "false" +--- + +# Configure SSI and Unified Service Tags on Linux + +> **Before doing anything else:** Fully resolve all variables in `## Context to resolve before acting`. 
Do not begin Step 0 until every variable has a concrete value. + +## Triggers + +Invoke this skill when: +- The Datadog Agent is already installed with SSI (`DD_APM_INSTRUMENTATION_ENABLED=host` was used) and you need to configure Unified Service Tags on the application service +- The user wants to set `DD_SERVICE`, `DD_ENV`, `DD_VERSION` on a running service +- SSI is installed but `/proc/<PID>/maps` doesn't show the language tracer (launcher-only injection) + +Do NOT invoke this skill if: +- The Datadog Agent is not yet installed — run `agent-install` first +- SSI packages are missing from `/opt/datadog-packages/` — re-run `agent-install` +- The target is a Kubernetes cluster — use `dd-apm-k8s-enable-ssi` instead + +--- + +## Background + +When the install script runs with `DD_APM_INSTRUMENTATION_ENABLED=host`, it: +1. Installs `datadog-apm-inject` and language library packages under `/opt/datadog-packages/` +2. Writes the launcher path into `/etc/ld.so.preload` +3. SSI is now armed — every new process on the host gets the launcher injected at startup + +**What SSI does NOT configure automatically:** +- `DD_SERVICE`, `DD_ENV`, `DD_VERSION` — these must be set on the application process for traces to be tagged correctly +- Without `DD_SERVICE`, the tracer auto-detects a service name (often the process name or framework name), which may not match what the user expects + +--- + +## Prerequisites + +**Verify SSI is armed:** + +### Claude runs + +```bash +ssh -o StrictHostKeyChecking=no -i <SSH_KEY> <SSH_USER>@<SSH_HOST> \ + "cat /etc/ld.so.preload && ls /opt/datadog-packages/ | grep apm" +``` + +If `/etc/ld.so.preload` contains a path to the launcher, and `/opt/datadog-packages/datadog-apm-inject` exists — SSI is armed. + +ERROR: Either missing — run `agent-install` first. 
+ +**Check for existing manual instrumentation:** + +### Claude runs + +```bash +ssh -o StrictHostKeyChecking=no -i @ " +grep -r 'import ddtrace\|from ddtrace\|require .dd-trace.\|opentelemetry' 2>/dev/null | head -5 || echo 'No manual instrumentation found' +" +``` + +ERROR: Manual instrumentation found — SSI silently disables itself when it detects an existing tracer. Remove the manual import/package before proceeding. + +**Check base libc:** + +### Claude runs + +```bash +ssh -o StrictHostKeyChecking=no -i @ \ + "ldd --version 2>&1 | head -1" +``` + +ERROR: musl — SSI requires glibc. No workaround; must use a glibc-based OS. + +--- + +## Context to resolve before acting + +| Variable | How to resolve | +|---|---| +| `SERVICE_NAME` | Ask the user — how the service should appear in Datadog APM (e.g. `payment-api`) | +| `ENV` | Ask the user — environment name (e.g. `production`, `staging`, `dev`) | +| `VERSION` | Ask the user or read from the app's version file / git tag | +| `SYSTEMD_SERVICE_NAME` | From `systemctl list-units --type=service --state=running` on the host — the unit running the app | +| `SSH_KEY` | Path to SSH private key | +| `SSH_USER` | SSH username | +| `SSH_HOST` | Hostname or IP of the target host | + +--- + +## Step 0 (Only if existing instrumentation detected): Remove Manual Instrumentation + +- Python: `pip uninstall ddtrace`, remove `import ddtrace` / `ddtrace-run` from CMD +- Node.js: `npm uninstall dd-trace`, remove `require('dd-trace')` +- Java: remove `-javaagent:/path/to/dd-java-agent.jar` JVM flag +- Ruby: `gem uninstall ddtrace`, remove `require 'ddtrace'` +- .NET: remove `Datadog.Trace` NuGet and profiler env vars + +After removing, restart the service. **Confirm with the user before restarting.** Tell the user: "I need to restart `` to remove the old instrumentation. This will cause a brief outage. Ready to proceed?" Wait for confirmation. 
+ +--- + +## Step 1: Set Unified Service Tags on the Application Process + +Without UST, traces arrive with an auto-detected service name that may not match user expectations, and won't be tagged with env or version. + +**For systemd-managed services** (most common): + +### Claude runs + +```bash +ssh -o StrictHostKeyChecking=no -i @ \ + "sudo systemctl cat " +``` + +Add a drop-in override (preserves the original unit file): + +### What you need to do in a terminal + +```bash +ssh -o StrictHostKeyChecking=no -i @ +sudo systemctl edit +``` + +Add to the editor: + +```ini +[Service] +Environment="DD_SERVICE=" +Environment="DD_ENV=" +Environment="DD_VERSION=" +``` + +Apply: + +### Claude runs + +```bash +ssh -o StrictHostKeyChecking=no -i @ \ + "sudo systemctl daemon-reload && sudo systemctl show | grep -E 'DD_SERVICE|DD_ENV|DD_VERSION'" +``` + +If the UST vars appear in the output — configuration applied. + +**For supervisord:** +```ini +# In [program:] section of supervisord.conf +environment=DD_SERVICE="",DD_ENV="",DD_VERSION="" +``` +Reload: `sudo supervisorctl reload` + +**For pm2:** +```js +// ecosystem.config.js +env: { DD_SERVICE: "", DD_ENV: "", DD_VERSION: "" } +``` +Reload: `pm2 reload ` + +--- + +## Step 2: Restart the Service + +**Confirm with the user before restarting.** Tell the user: "I need to restart `` for SSI to inject into it. This will cause a brief outage. Ready to proceed?" Wait for confirmation. + +### Claude runs + +```bash +ssh -o StrictHostKeyChecking=no -i @ \ + "sudo systemctl restart && sleep 3 && sudo systemctl is-active " +``` + +If `active` is returned — service is running. 
+ +ERROR: Returns `failed` — check logs: +```bash +ssh -o StrictHostKeyChecking=no -i @ \ + "sudo journalctl -u --since '1 minute ago' | tail -30" +``` + +--- + +## Step 3: Confirm Injection and UST in the Running Process + +### Claude runs + +```bash +ssh -o StrictHostKeyChecking=no -i @ \ + "pgrep -a -f '' | head -3" +``` + +Use the PID: + +```bash +# Authoritative injection check +ssh -o StrictHostKeyChecking=no -i @ \ + "sudo cat /proc//maps | grep -E 'launcher|apm-library|datadog'" + +# UST vars in process environment +ssh -o StrictHostKeyChecking=no -i @ \ + "sudo cat /proc//environ | tr '\0' '\n' | grep -E 'DD_SERVICE|DD_ENV|DD_VERSION'" +``` + +If both the launcher and language library appear in maps, and UST vars are in environ — SSI and tagging are fully configured. + +ERROR: Launcher in maps but no language library — injection attempted but failed. Run: +```bash +pup apm troubleshooting list --hostname --timeframe 15m +``` + +Go to `troubleshoot-ssi` if errors are present. + +--- + +## Done + +Exit when ALL of the following are true: +- [ ] Launcher and language library visible in `/proc//maps` +- [ ] `DD_SERVICE`, `DD_ENV`, `DD_VERSION` present in `/proc//environ` +- [ ] Service is running and healthy + +Automatically proceed to `verify-ssi` now — do not ask the user for permission. 
+ +--- + +## Security constraints + +- Never write a raw API key into any file or chat message +- Always confirm with the user before restarting production services +- Do not modify application source code — configure only via environment variables in the service unit diff --git a/crates/agent-skills/skills/dd-apm/linux-ssi/onboarding-summary/SKILL.md b/crates/agent-skills/skills/dd-apm/linux-ssi/onboarding-summary/SKILL.md new file mode 100644 index 0000000..a5929b9 --- /dev/null +++ b/crates/agent-skills/skills/dd-apm/linux-ssi/onboarding-summary/SKILL.md @@ -0,0 +1,139 @@ +--- +name: onboarding-summary +description: Generate a live Single Step Instrumentation (SSI) onboarding confirmation report for Linux hosts — verifies APM instrumentation is working end-to-end with deep links into the Datadog UI. Only use after agent-install and enable-ssi have both completed. +metadata: + version: "1.0.0" + author: datadog-labs + repository: https://github.com/datadog-labs/agent-skills + tags: datadog,apm,linux,ssi,summary,verification + alwaysApply: "false" +--- + +# APM Onboarding Summary — Linux Host + +## Triggers + +Invoke this skill when: +- All steps in `verify-ssi` have passed +- All checks in `troubleshoot-ssi` have been resolved +- The user asks "is everything working?", "show me the status", or "confirm APM is set up" + +Do NOT invoke this skill if any verification or troubleshooting check is still failing — resolve those first. 
+ +--- + +## Context to resolve before acting + +| Variable | How to resolve | +|---|---| +| `HOSTNAME` | `hostname -f` on the target host | +| `DD_HOSTNAME` | Hostname as Datadog sees it — from `sudo datadog-agent status` | +| `SERVICE_NAME` | `DD_SERVICE` value from `/proc/<PID>/environ` or the systemd unit | +| `ENV` | `DD_ENV` value from `/proc/<PID>/environ` or the systemd unit | +| `DD_SITE` | `grep "^site:" /etc/datadog-agent/datadog.yaml` | +| `SSH_KEY` | Path to SSH private key | +| `SSH_USER` | SSH username | +| `SSH_HOST` | Hostname or IP of the target host | + +--- + +## Prerequisites + +### Claude runs + +```bash +pup auth status --site <DD_SITE> +``` + +If valid token — proceed. + +ERROR: Not authenticated: + +### Claude runs + +```bash +pup auth login --site <DD_SITE> +``` + +> This opens a browser tab for OAuth. Complete the login there — Claude will continue once the command exits. + +--- + +## Collect live confirmation data + +Run all of the following. Each populates a row in the final report. + +### Claude runs + +```bash +# Agent version and status (run on the target host, like the other checks) +ssh -o StrictHostKeyChecking=no -i <SSH_KEY> <SSH_USER>@<SSH_HOST> "sudo datadog-agent status 2>&1 | grep -E 'Agent \(v|Status:|API Keys status'" + +# Inject library armed in ld.so.preload +ssh -o StrictHostKeyChecking=no -i <SSH_KEY> <SSH_USER>@<SSH_HOST> "cat /etc/ld.so.preload" + +# Process confirmed injected — launcher + language library in /proc/<PID>/maps +ssh -o StrictHostKeyChecking=no -i <SSH_KEY> <SSH_USER>@<SSH_HOST> \ + "pgrep -a -f '<PROCESS_NAME>' | head -3" +``` + +Use the PID from above: + +```bash +ssh -o StrictHostKeyChecking=no -i <SSH_KEY> <SSH_USER>@<SSH_HOST> \ + "sudo cat /proc/<PID>/maps | grep -E 'launcher|apm-library|datadog'" + +# UST vars in process environment +ssh -o StrictHostKeyChecking=no -i <SSH_KEY> <SSH_USER>@<SSH_HOST> \ + "sudo cat /proc/<PID>/environ | tr '\0' '\n' | grep -E 'DD_SERVICE|DD_ENV|DD_VERSION'" + +# Agent APM receiver — trace counts +ssh -o StrictHostKeyChecking=no -i <SSH_KEY> <SSH_USER>@<SSH_HOST> \ + "sudo datadog-agent status 2>&1 | grep -A 10 'Receiver (previous minute)'" + +# Service visible and traced in APM backend +DD_SITE=<DD_SITE> pup apm services list --env <ENV> --from 1h + +# Traces arriving in the last hour +DD_SITE=<DD_SITE> pup 
traces search --query "service:" --from 1h --limit 5 +``` + +--- + +## Present the report + +Fill in every value from live command output. Do not leave any placeholder unfilled. If a value cannot be confirmed, mark that row as failed and link to `troubleshoot-ssi`. + +--- + +**APM onboarding complete** + +| Check | Detail | Status | +|---|---|---| +| Datadog Agent | v`` running on ``, API key valid | OK | +| SSI armed | `/etc/ld.so.preload` contains launcher path | OK | +| Process injected | launcher + language library in `/proc//maps` for `` | OK | +| Unified Service Tags | `DD_SERVICE=` `DD_ENV=` `DD_VERSION=` | OK | +| Agent receiving traces | `` trace(s)/min in APM receiver | OK | +| APM service visible | `` in env `` | OK | +| Traces arriving | `` trace(s) found in the last hour | OK | + +--- + +**Your service in Datadog — click to open:** + +Construct each URL by substituting real values. Do not print placeholder URLs. + +| View | URL | +|---|---| +| Service overview | `https://app./apm/services/?env=` | +| Traces explorer | `https://app./apm/traces?query=service:%20env:` | +| Service map | `https://app./apm/map?env=&service=` | +| Infrastructure host | `https://app./infrastructure?q=host:` | +| Agent fleet | `https://app./fleet-automation` | + +--- + +## Security constraints + +- Never write a raw API key into any file or chat message diff --git a/crates/agent-skills/skills/dd-apm/linux-ssi/troubleshoot-ssi/SKILL.md b/crates/agent-skills/skills/dd-apm/linux-ssi/troubleshoot-ssi/SKILL.md new file mode 100644 index 0000000..95035df --- /dev/null +++ b/crates/agent-skills/skills/dd-apm/linux-ssi/troubleshoot-ssi/SKILL.md @@ -0,0 +1,400 @@ +--- +name: troubleshoot-ssi +description: Diagnose and fix Single Step Instrumentation (SSI) issues on Linux hosts — SSI automatically instruments applications for APM without code changes. Only use if the agent and SSI are configured but traces are missing or instrumentation is not working. 
+metadata: + version: "1.0.0" + author: datadog-labs + repository: https://github.com/datadog-labs/agent-skills + tags: datadog,apm,linux,ssi,troubleshooting,instrumentation,ld-preload + alwaysApply: "false" +--- + +# Troubleshoot APM SSI on Linux + +## Triggers + +Invoke this skill when the user expresses intent to: +- Debug why a Linux process is not being instrumented +- Investigate why traces are not appearing in Datadog from a Linux host +- Diagnose SSI injection failures on Linux +- Follow up on failed checks from `verify-ssi` +- Report that a specific service or host has no traces + +Do NOT invoke this skill if: +- SSI has not been enabled yet — run `enable-ssi` first + +--- + +## Critical: pup First, SSH Second + +**You do NOT need SSH access to start troubleshooting.** The `pup` CLI queries Datadog's backend directly. Start with pup commands immediately using information the user already gave you (hostname, service name, env). Only go to SSH if pup doesn't reveal the cause. + +### pup-cli: check, install, and authenticate + +### Claude runs + +```bash +pup --version +``` + +If not found, install it (OS-aware): + +### Claude runs + +```bash +if [[ "$(uname)" == "Darwin" ]]; then + brew tap datadog-labs/pack && brew install datadog-labs/pack/pup +else + PUP_VERSION=$(curl -s https://api.github.com/repos/datadog-labs/pup/releases/latest | grep '"tag_name"' | cut -d'"' -f4) + curl -L "https://github.com/datadog-labs/pup/releases/download/${PUP_VERSION}/pup_linux_amd64.tar.gz" | tar xz -C /usr/local/bin pup + chmod +x /usr/local/bin/pup +fi +pup --version +``` + +**Auth — check in this order:** + +1. Check OAuth status: +```bash +pup auth status --site +``` + +If authenticated — proceed directly to Step 1. + +ERROR: Not authenticated: + +### Claude runs + +```bash +pup auth login --site +``` + +> This opens a browser tab for OAuth. Complete the login there — Claude will continue once the command exits. + +2. 
If OAuth login is not possible (e.g., no browser access), fall back to API keys: +```bash +echo "DD_API_KEY set: $([ -n "${DD_API_KEY:-}" ] && echo yes || echo no)" +echo "DD_APP_KEY set: $([ -n "${DD_APP_KEY:-}" ] && echo yes || echo no)" +``` + +If `DD_API_KEY` and `DD_APP_KEY` are both set — **proceed to Step 1**. pup will use them automatically even if `pup auth status` shows unauthenticated. + +--- + +## Context + +Use what the user already provided. Do not ask for missing context upfront — resolve variables lazily, only when a specific step needs them. + +| Variable | How to resolve | When needed | +|---|---|---| +| `DD_HOSTNAME` | From the user's message, or `datadog-agent status` via SSH | Step 1 — start here | +| `SERVICE_NAME` | From the user's message | Step 1 — start here | +| `ENV` | Ask the user only when a command requires it | Step 1 (`service-library-config get`), Step 3 | +| `DD_SITE` | Ask the user, or `grep "^site:" /etc/datadog-agent/datadog.yaml` via SSH | Only if pup auth check fails | +| `SSH_KEY` | From user or `/workspace/.ssh/id_ed25519` | Step 4 (SSH investigation) only | +| `SSH_USER` | From user or default `root` | Step 4 (SSH investigation) only | +| `SSH_HOST` | From user's message | Step 4 (SSH investigation) only | + +**If the user has already provided `DD_HOSTNAME` and `SERVICE_NAME`, go directly to Step 1. Do not ask for ENV or SSH details first.** + +--- + +## How SSI Works on Linux — Domain Knowledge + +Read this before investigating. It gives you the mental model to reason about novel failures. + +**Injection chain:** +1. Install script (with `DD_APM_INSTRUMENTATION_ENABLED=host`) installs `datadog-apm-inject` and language library packages under `/opt/datadog-packages/` +2. The inject package writes its launcher path into `/etc/ld.so.preload` +3. The Linux dynamic linker pre-loads the launcher into every new process at startup +4. 
The launcher detects the process language and loads the appropriate tracer `.so` from `/opt/datadog-packages/datadog-apm-library-/` +5. The tracer sends spans to the Agent at `localhost:8126` +6. The Agent forwards traces to Datadog at `intake.` + +**Diagnostic layers:** +- **`pup`** — sees what Datadog's backend received + injection errors reported by the launcher. Start here. +- **`/proc//maps`** — sees the actual shared libraries loaded into a running process. The authoritative check for whether injection succeeded. +- **`datadog-agent status`** — sees whether the local Agent is receiving traces. + +**Known silent failures:** +- **musl libc (Alpine)** — launcher is glibc-compiled; musl is ABI-incompatible. Linker loads it but injection silently aborts +- **Existing ddtrace/OTel** — launcher detects user-installed tracer and silently disables itself (`already_instrumented` result class) +- **Unsupported runtime version** — silently skipped +- **Process started before SSI was enabled** — `/etc/ld.so.preload` only affects new processes +- **Static binary / Go** — Go programs link statically and ignore `LD_PRELOAD` entirely +- **SELinux/AppArmor** — can block `/etc/ld.so.preload` reads for confined processes +- **Package directory empty/corrupt** — `datadog-installer status` reflects DB registration, not actual files. A package can show as installed while its directory is empty. Always verify files exist under `/opt/datadog-packages//` + +**Service name identity — important:** +With SSI, `DD_SERVICE` is often not set in the process environment. The tracer auto-detects a service name. The telemetry-reported name (what `pup fleet tracers list` and `service-library-config get` show) may not match what you expect in the APM UI: +- **JVM**: telemetry reports jar artifact name with version (e.g. 
`inventory-service-1.0.0`), spans use the base name (`inventory-service`) +- **Python**: telemetry may report `fastapi` or `django` rather than the app name +- **Node.js**: names typically match + +If `service-library-config get` returns empty, use `pup traces search --query "host:" --from 1h --limit 5` to discover what service names have been sending traces, then retry. + +--- + +## Step 1: Triage with pup (no SSH required) + +Run these first. The answers determine everything that follows. + +### Claude runs + +```bash +# Check for injection errors (failures only — successful injections don't appear here) +pup apm troubleshooting list --hostname + +# Check full tracer config — look at apm_enabled, trace_agent_url, site +pup apm service-library-config get --service-name --env + +# Check what services have sent traces (reveals actual service names visible to backend) +pup apm services list --from 1h + +# Check if traces exist at all +pup traces search --query "service:" --from 15m --limit 5 + +# Fastest trace confirmation — metrics appear before indexed traces +pup metrics query --query "sum:trace.*.request.hits{host:,service:}.as_count()" --from 15m +``` + +`ENV` is required for `service-library-config get`. If the user didn't provide it, ask for it before running that command. + +Key values to check in `service-library-config get` output: +- `apm_enabled` — must be `true`. If `false`, the tracer won't send traces regardless of injection. +- `trace_agent_url` — must point to `http://localhost:8126` or the correct agent socket. Wrong value = tracer can't reach the Agent. +- `site` — must match your Datadog org's site. + +--- + +## Step 2: State Your Hypotheses + +Before investigating, explicitly state your ranked hypotheses based on triage output. Do not skip this step. 
+ +| Triage signal | Strong hypothesis | +|---|---| +| `pup troubleshooting list` shows `result: error`, `result_class: incorrect_installation` | Package directory empty or corrupt — verify files exist under `/opt/datadog-packages/datadog-apm-library-/`, then use remediation flow | +| `pup troubleshooting list` shows `result: error`, import/load error | Tracer library couldn't be loaded — check runtime version, libc compatibility | +| `pup troubleshooting list` shows `result: abort`, reason `already_instrumented` | Manual ddtrace/OTel already in the app — launcher silently disabled itself | +| `pup troubleshooting list` shows `result: abort`, reason `language not detected` | Expected for non-app processes (e.g., bash, cron). Not a failure. | +| `pup troubleshooting list` empty | Either no injection attempts yet (process not restarted), or injection succeeded silently | +| `service-library-config get` shows `apm_enabled: false` | Tracer is loaded but explicitly disabled — check `source` field to see who set it | +| `service-library-config get` shows `trace_agent_url` pointing to wrong host/port | Tracer can't reach the Agent — fix the URL | +| `service-library-config get` shows wrong `site` | Traces going to wrong Datadog org | +| No traces in `pup traces search`, no troubleshooting errors | Process was never injected — check: process not restarted after SSI enabled, `/etc/ld.so.preload` missing, static binary | +| Unexpected service name in `pup apm services list` results | Service name mismatch — use the actual name from trace data for subsequent config lookups | +| Traces arriving in pup | Not a real problem — likely a UI filter or time window. Tell the user and stop. | + +State your top 1-3 hypotheses explicitly: *"Based on triage, I think the most likely cause is X because Y."* + +--- + +## Step 3: Investigate with pup (deeper) + +Use only the tools relevant to your hypotheses. 
+ +**Check SDK config in detail:** +```bash +# Show all config values with their source (env_var, remote_config, code, default) +pup apm service-library-config get --service-name --env + +# Show only configs where instances disagree (config drift) +pup apm service-library-config get --service-name --mixed +``` + +Key values to check: +- `apm_enabled` — if `false`, tracer won't send traces. Check `source` to see who disabled it (`code` > `env_var` > `remote_config` > `default`) +- `trace_agent_url` — should be `http://localhost:8126` or a Unix socket. Wrong value = tracer can't reach Agent +- `site` — must match your Datadog org's site. Mismatch = traces going to wrong org +- `service` — with SSI and no `DD_SERVICE` set, `source: default` is expected + +**If `service-library-config get` returns empty** — the service name you're using may not match the actual name in trace data: +```bash +pup traces search --query "host:" --from 1h --limit 5 +``` +Use the `service` field from trace results for subsequent config lookups. + +**Check injection error details:** +```bash +pup apm troubleshooting list --hostname --timeframe 4h +``` + +--- + +## Step 4: Investigate via SSH (if pup didn't reveal the cause) + +**Before asking for SSH credentials, briefly explain what you need to check and why**, so the user understands the diagnostic plan before handing over access. + +**Is `/etc/ld.so.preload` set?** +```bash +ssh -o StrictHostKeyChecking=no -i @ "cat /etc/ld.so.preload" +``` +If it contains a path ending in `launcher.preload.so` or `libdatadog-apm-inject.so` — launcher is armed for new processes. +ERROR: Empty or missing — SSI was not fully set up. Re-run the install script with `DD_APM_INSTRUMENTATION_ENABLED=host`. 
+ +**Is the tracer actually loaded into the running process?** + +This is the authoritative injection check — use `/proc//maps`, not environ: +```bash +ssh -o StrictHostKeyChecking=no -i @ \ + "pgrep -a -f '' | head -3" +``` +Use the PID: +```bash +ssh -o StrictHostKeyChecking=no -i @ \ + "sudo cat /proc//maps | grep -E 'launcher|apm-library|datadog'" +``` +- **Launcher + language library present** — injection succeeded for this process +- **Launcher only, no language library** — launcher ran but couldn't inject the tracer (check `pup troubleshooting list` for the reason) +- **Nothing** — `/etc/ld.so.preload` not set, process started before SSI was enabled, or static binary + +**Was the process started before SSI was enabled?** +```bash +ssh -o StrictHostKeyChecking=no -i @ \ + "ps -p -o pid,lstart,cmd; stat /etc/ld.so.preload" +``` +If process started before `/etc/ld.so.preload` was written, restart the service. **Always confirm with the user before restarting production services.** + +**Is the base libc musl?** +```bash +ssh -o StrictHostKeyChecking=no -i @ \ + "ldd --version 2>&1 | head -1 && cat /etc/os-release | grep PRETTY_NAME" +``` +ERROR: musl — SSI's launcher requires glibc. No workaround; must migrate to Debian/Ubuntu/RHEL/Amazon Linux. + +**Is it a static binary?** +```bash +ssh -o StrictHostKeyChecking=no -i @ \ + "file /proc//exe; ldd /proc//exe 2>&1" +``` +ERROR: `statically linked` — SSI cannot instrument this binary. Manual instrumentation required. + +**Are the APM packages actually present on disk?** + +`datadog-installer status` reflects only DB registration — a package can show as installed while its directory is empty. Always verify: +```bash +ssh -o StrictHostKeyChecking=no -i @ \ + "ls /opt/datadog-packages/ && ls /opt/datadog-packages/datadog-apm-library-/ | head -5" +``` +ERROR: Directory empty or missing — package is registered but broken on disk. Use the remediation flow. 
+ +**Does the app have existing manual instrumentation?** +```bash +ssh -o StrictHostKeyChecking=no -i @ " +sudo cat /proc//maps | grep -E 'ddtrace|opentelemetry|dd-trace' +" +``` +Also check dependency manifests: `requirements.txt`, `package.json`, `Gemfile`, `pom.xml`. +ERROR: Found — SSI silently disabled itself. Remove manual tracer, restart the service. + +**Is the Agent APM receiver listening and receiving traces?** +```bash +ssh -o StrictHostKeyChecking=no -i @ \ + "sudo datadog-agent status 2>&1 | grep -A 15 'APM Agent'" +``` +- `feature_auto_instrumentation_enabled: true` — SSI is active on the agent +- `Receiver (previous minute)` — trace count received by the agent +- `Endpoints` — where traces are forwarded + +```bash +ssh -o StrictHostKeyChecking=no -i @ \ + "sudo ss -tlnp 2>/dev/null | grep 8126 || sudo netstat -tlnp 2>/dev/null | grep 8126" +``` +ERROR: Port 8126 not listening — APM receiver disabled. Check `apm_config.enabled` in `/etc/datadog-agent/datadog.yaml`. + +**What service name did the tracer register?** + +With SSI, `DD_SERVICE` is often not set. Read the tracer's memfd to find the real service name: +```bash +ssh -o StrictHostKeyChecking=no -i @ " +sudo ls -la /proc//fd/ | grep 'datadog-tracer-info' +" +``` +Use the fd number: +```bash +ssh -o StrictHostKeyChecking=no -i @ \ + "sudo cat /proc//fd/ | python3 -c \"import sys,msgpack; d=msgpack.unpackb(sys.stdin.buffer.read()); print(d)\"" +``` +Returns `service_name`, `service_env`, `tracer_version`. + +**Is SELinux/AppArmor blocking `/etc/ld.so.preload`?** +```bash +ssh -o StrictHostKeyChecking=no -i @ " +getenforce 2>/dev/null +ausearch -m AVC -ts recent 2>/dev/null | grep 'ld.so.preload\|datadog' | tail -10 +dmesg | grep -i 'apparmor.*denied.*datadog' | tail -5 +" +``` +If SELinux/AppArmor is denying access, work with the user's security team. Do not disable SELinux systemwide. + +--- + +## Step 5: Reflect Before Concluding + +Before applying any fix, answer: +1. 
What evidence confirms my hypothesis? +2. What evidence would contradict it — and have I checked? +3. Is there a simpler explanation I haven't considered? + +If the conclusion doesn't hold up, return to Step 2 with new hypotheses. + +--- + +## Step 6: Fix + +**Remediation: Reinstalling a Broken APM Package** + +`datadog-installer status` reflects DB registration, not actual file presence. If `pup troubleshooting list` shows `incorrect_installation` but the installer says the package is installed, the registration is stale: + +```bash +# Remove the stale registration first +ssh -o StrictHostKeyChecking=no -i @ \ + "sudo datadog-installer remove datadog-apm-library-" + +# Re-run install — now it will actually download and extract +ssh -o StrictHostKeyChecking=no -i @ \ + "DD_API_KEY=${DD_API_KEY} DD_SITE=${DD_SITE} DD_APM_INSTRUMENTATION_ENABLED=host bash -c \"\$(curl -L https://install.datadoghq.com/scripts/install_script_agent7.sh)\"" +``` + +If re-running the install script is sufficient (package files are intact), use `remove` first only if the script reports success but the problem persists. + +**After any config change — restart the service** (confirm with user first for production): + +The user must restart the affected service for SSI to re-inject. Identify the service manager and present restart instructions — do not restart automatically unless the user explicitly asks. + +Common restart commands: +```bash +# systemd +sudo systemctl restart +# supervisord +sudo supervisorctl restart +# pm2 +pm2 reload +``` + +--- + +## Step 7: Verify + +Re-run the pup triage commands to confirm the fix worked: + +### Claude runs + +```bash +pup apm troubleshooting list --hostname --timeframe 15m +pup traces search --query "service:" --from 15m --limit 5 +pup metrics query --query "sum:trace.*.request.hits{host:,service:}.as_count()" --from 15m +``` + +If there are no new injection errors and traces are arriving — resolved. 
Automatically proceed to `onboarding-summary` now — do not ask the user for permission. + +ERROR: Still failing — return to Step 2 with updated hypotheses. + +--- + +## Security constraints + +- Never write a raw API key into any file or chat message +- Never disable SELinux systemwide +- Always confirm before restarting production services +- `datadog-installer remove` requires explicit confirmation — confirm with user before running diff --git a/crates/agent-skills/skills/dd-apm/linux-ssi/verify-ssi/SKILL.md b/crates/agent-skills/skills/dd-apm/linux-ssi/verify-ssi/SKILL.md new file mode 100644 index 0000000..15465ba --- /dev/null +++ b/crates/agent-skills/skills/dd-apm/linux-ssi/verify-ssi/SKILL.md @@ -0,0 +1,198 @@ +--- +name: verify-ssi +description: Verify Single Step Instrumentation (SSI) is working end-to-end on Linux hosts — SSI automatically instruments applications for APM without code changes. Only use after enable-ssi has run. +metadata: + version: "1.0.0" + author: datadog-labs + repository: https://github.com/datadog-labs/agent-skills + tags: datadog,apm,linux,ssi,verification,instrumentation,ld-preload + alwaysApply: "false" +--- + +# Verify APM SSI on Linux + +> **Before doing anything else:** Fully resolve all variables in `## Context to resolve before acting`. Do not begin Step 1 until every variable has a concrete value. 
+ +## Triggers + +Invoke this skill when the user expresses intent to: +- Confirm SSI is working after installing the Datadog Agent on Linux +- Check whether a Linux process is being instrumented +- Verify the tracer is running and reporting telemetry + +Do NOT invoke this skill if: +- SSI has not been enabled yet — run `agent-install` first +- Services have not been restarted since the agent was installed — restart them first, then verify + +--- + +## Prerequisites + +- [ ] `agent-install` is complete +- [ ] Application services have been restarted since the agent was installed + +### pup-cli: check, install, and authenticate + +### Claude runs + +```bash +pup --version +``` + +If not found: + +### Claude runs + +```bash +brew tap datadog-labs/pack +brew install pup +``` + +Check auth: +```bash +pup auth status --site <DD_SITE> +``` + +If not authenticated: + +### Claude runs + +```bash +pup auth login --site <DD_SITE> +``` + +> This opens a browser tab for OAuth. Complete the login there — Claude will continue once the command exits. + +If valid token — proceed. +ERROR: No browser available — fall back to API keys: make sure `DD_API_KEY` is set, then `export DD_APP_KEY=<APP_KEY>` + +--- + +## Context to resolve before acting + +| Variable | How to resolve | +|---|---| +| `DD_HOSTNAME` | Hostname as Datadog sees it — from `sudo datadog-agent status` output | +| `SERVICE_NAME` | Expected service name in APM — ask the user | +| `ENV` | Environment tag — ask the user | +| `DD_SITE` | `grep "^site:" /etc/datadog-agent/datadog.yaml` via SSH, or ask the user | +| `SSH_KEY` | Path to SSH private key | +| `SSH_USER` | SSH username | +| `SSH_HOST` | Hostname or IP of the target host | + +--- + +## Step 1: Confirm the Process is Injected + +Use `/proc/<PID>/maps` — this is the authoritative check. It shows the actual shared libraries loaded into the running process, which is the only way to confirm the launcher and tracer `.so` files were actually loaded. 
+ +### Claude runs + +```bash +ssh -o StrictHostKeyChecking=no -i @ \ + "pgrep -a -f '' | head -5" +``` + +Use the PID from above: + +```bash +ssh -o StrictHostKeyChecking=no -i @ \ + "sudo cat /proc//maps | grep -E 'launcher|apm-library|datadog'" +``` + +If the output includes both the launcher (e.g. `launcher.preload.so`) and a language library (e.g. `apm-library-python`) — injection succeeded for this process. + +ERROR: Launcher present but no language library — launcher ran but couldn't inject. Check for injection errors: + +### Claude runs + +```bash +pup apm troubleshooting list --hostname --timeframe 1h +``` + +ERROR: Neither present — process was not injected. Check `/etc/ld.so.preload`: + +```bash +ssh -o StrictHostKeyChecking=no -i @ "cat /etc/ld.so.preload" +``` + +If empty — install did not set up the launcher. Re-run the install script with `DD_APM_INSTRUMENTATION_ENABLED=host`. If non-empty but the process still isn't injected — the process was started before the launcher was installed. Restart the service and recheck. + +--- + +## Step 2: Confirm the Agent is Receiving Traces + +### Claude runs + +```bash +ssh -o StrictHostKeyChecking=no -i @ \ + "sudo datadog-agent status 2>&1 | grep -A 15 'APM Agent'" +``` + +Healthy output shows: +- `feature_auto_instrumentation_enabled: true` +- `Receiver (previous minute)` with `> 0` traces + +ERROR: `feature_auto_instrumentation_enabled: false` — SSI not active on the agent. Check `apm_config` in `/etc/datadog-agent/datadog.yaml`. + +ERROR: `Receiver (previous minute): 0` — agent running but no traces yet. Generate traffic first (see Step 3), then recheck. + +--- + +## Step 3: Confirm the Service is Visible in Datadog + +### Claude runs + +```bash +DD_SITE= pup apm services list --env --from 1h +``` + +If `` appears with `isTraced: true` — traces are reaching the Datadog backend. + +> **Flask / ddtrace v3 naming note:** With ddtrace >=3.x, Flask spans are emitted as `service:flask` rather than `service:`. 
The `DD_SERVICE` value appears as `base_service` on the spans. If you set `DD_SERVICE=my-app`, search for `service:flask` in the APM UI — the service list will show `flask`, not `my-app`. Check the `base_service` tag to confirm it matches your `DD_SERVICE`. + +ERROR: Service missing — generate traffic to trigger trace creation: + +### Claude runs + +```bash +ssh -o StrictHostKeyChecking=no -i @ \ + "sudo ss -tlnp 2>/dev/null | grep || sudo netstat -tlnp 2>/dev/null | grep " +``` + +Use the port from above: +```bash +ssh -o StrictHostKeyChecking=no -i @ \ + "for i in \$(seq 1 10); do curl -s -o /dev/null http://localhost:/; done" +``` + +Wait 30 seconds, then retry: +```bash +DD_SITE= pup apm services list --env --from 10m +DD_SITE= pup traces search --query "service:" --from 10m --limit 5 +``` + +ERROR: Still missing — check for injection errors and go to `troubleshoot-ssi`: +```bash +pup apm troubleshooting list --hostname --timeframe 1h +``` + +--- + +## Done + +Exit when ALL of the following are true: +- [ ] Step 1: launcher + language library both visible in `/proc//maps` +- [ ] Step 2: agent APM receiver shows `> 0` traces/min +- [ ] Step 3: service appears in `pup apm services list` + +If any check fails, go to `troubleshoot-ssi`. + +When all steps pass, automatically proceed to `onboarding-summary` now — do not ask the user for permission. 
+ +--- + +## Security constraints + +- Never write a raw API key into any file or chat message +- Always confirm before restarting production services diff --git a/crates/agent-skills/skills/dd-apm/service-remapping/SKILL.md b/crates/agent-skills/skills/dd-apm/service-remapping/SKILL.md new file mode 100644 index 0000000..8e387b7 --- /dev/null +++ b/crates/agent-skills/skills/dd-apm/service-remapping/SKILL.md @@ -0,0 +1,411 @@ +--- +name: service-remapping +description: Create and manage APM service remapping rules — rewrite service names at ingestion time to collapse noisy inferred entities, clean up auto-generated names, handle org renames, or normalize naming conventions. Use for any request involving service renaming, service mapping, inferred service cleanup, peer.service normalization, or collapsing fragmented service names. +metadata: + version: "1.0.0" + author: datadog-labs + repository: https://github.com/datadog-labs/agent-skills + tags: datadog,apm,service-remapping,service-naming,inferred-services,peer-service + alwaysApply: "false" + tools: pup +--- + +# APM Service Remapping + +> **Before doing anything else:** Fully resolve all variables in `## Context to resolve before acting`. Do not begin Step 0 until every variable has a concrete value. + +--- + +## How Service Remapping Works — Domain Knowledge + +Read this before building any rule. It gives you the mental model to construct the right filter and catch edge cases. + +**What remapping does:** A rule intercepts telemetry at ingestion time and rewrites the service name before indexing. A rule says: "for any entity matching this filter, replace its service name with this new value." + +**Two entity types — pick the right one:** + +| Entity type | `rule_type` integer | What it targets | +|---|---|---| +| **SERVICE** | `0` | Instrumented services — have spans with an explicit `service` tag set by a tracer | +| **INFERRED_ENTITY** | `1` | Auto-detected from outbound calls — named from `peer.service`. 
**Requires `peer.service` to be set on outbound spans** (see prerequisite below). | + +**Prerequisite for inferred entity remapping — `peer.service` must be set:** + +Inferred entity remapping only works when the tracer sets `peer.service` on outbound spans. Without it, entities are keyed by `peer.hostname` and remapping rules will not apply. + +To enable this, set the following env var on the **instrumented service** (not the downstream dependency): + +```bash +DD_TRACE_PEER_SERVICE_DEFAULTS_ENABLED=true +``` + +This makes the ddtrace tracer automatically propagate `peer.service` from `peer.hostname` on outbound HTTP, gRPC, and database calls. Without this, `pup traces search` will show spans with `peer.hostname` but no `peer.service`, and no service remapping rule will match. + +To verify `peer.service` is being set before building a rule: + +```bash +pup traces search --query "@peer.service:" --from 15m --limit 5 +``` + +If zero results — the tracer is not setting `peer.service`. Ask the user to add `DD_TRACE_PEER_SERVICE_DEFAULTS_ENABLED=true` to their service's environment and redeploy before continuing. 
+ +**Filter syntax** — a standard Datadog event-grammar query string: + +| Goal | Filter | +|---|---| +| Exact service match | `service:payments` | +| All services with a prefix | `service:deploy-test*` | +| All services with a suffix | `service:*.tropos` | +| All services containing a string | `service:*payments*` | +| All inferred services under a domain | `peer.service:*.shopify.com` | +| Service in one environment only | `service:payments AND env:prod` | + +**New name syntax** — the `value` field in `rewrite_tag_rules`: + +| Form | Example | Use for | +|---|---|---| +| Static string | `my-service` | Every matched entity gets exactly this name | +| Tag interpolation | `{{service}}` | Substitute the full value of a tag | +| Tag + regex capture | `{{service\|^(.+?)\..*$}}` | Extract part of a tag value (non-greedy capture) | + +**Regex constraints for `{{tag\|regex}}`:** +- Maximum **1 capture group** per expression +- **No greedy quantifiers inside capture groups** — use non-greedy variants: `(.+?)` not `(.+)`, `(.*?)` not `(.*)` +- Quantifiers on capture groups themselves (e.g. 
`(foo)+`) are not allowed + +**Five remapping patterns:** + +| Pattern | User says… | Filter example | New name example | +|---|---|---|---| +| **N:1 group** | "These N services are all the same thing" | `peer.service:*.shopify.com` | `shopify` | +| **Strip suffix/prefix** | "The name has junk at the end/start" | `service:*.tropos` | `{{service\|^(.+?)\..*$}}` | +| **1:1 rename** | "We renamed this service and Datadog needs to match" | `service:old-auth-service` | `auth-service` | +| **Env split** | "I want separate services per env but they all have the same name" | `service:my-service AND env:prod` | `my-service-prod` | +| **Prefix normalization** | "All services should start with an env or team name" | `service:payments*` | `{{env}}-{{service}}` | + +--- + +## Triggers + +Invoke this skill when the user wants to: +- Rename a service in Datadog without re-instrumenting +- Collapse multiple inferred service names into one (e.g. many `api.shopify.com/*` variants → `shopify`) +- Strip environment suffixes, version tags, or deployment metadata baked into service names +- Normalize `peer.service` names to something meaningful +- Rename a service after an org change, product rebrand, or migration +- Split a single service into per-env variants (`my-service` + `env:prod` → `my-service-prod`) +- List, review, or delete existing service remapping rules + +Do NOT invoke this skill if: +- The user wants to rename the service in their application code — that requires a tracer config change (`DD_SERVICE`), not a remapping rule +- The user wants to correlate telemetry across infrastructure tags — that is the "Correlate telemetry" action type in the UI, not remapping + +--- + +## Prerequisites + +### pup-cli: check, install, and authenticate + +### Claude runs + +```bash +pup --version +``` + +If not found: + +### Claude runs + +```bash +brew tap datadog-labs/pack +brew install pup +``` + +Check auth: +```bash +pup auth status +``` + +If not authenticated: + +### Claude runs + 
+```bash +pup auth login +``` + +> This opens a browser tab for OAuth. Complete the login there — Claude will continue once the command exits. + +### Credentials for write operations + +`pup apm service-remapping list` and `get` work with OAuth. Create, update, and delete require API keys (`DD_API_KEY`, `DD_APP_KEY`, `DD_SITE`) until `apm_service_renaming_write` is added to pup's OAuth scopes. + +### Claude runs + +```bash +echo "DD_API_KEY set: $([ -n "${DD_API_KEY:-}" ] && echo yes || echo no)" +echo "DD_APP_KEY set: $([ -n "${DD_APP_KEY:-}" ] && echo yes || echo no)" +echo "DD_SITE: ${DD_SITE:-not set (defaulting to datadoghq.com)}" +``` + +If any are missing and you need to create/update/delete rules: + +### What you need to do in a terminal + +```bash +export DD_API_KEY= +export DD_APP_KEY= +export DD_SITE=datadoghq.com # adjust for your site +``` + +> Common sites: `datadoghq.com` (US1), `datadoghq.eu` (EU1), `us3.datadoghq.com`, `us5.datadoghq.com`, `ap1.datadoghq.com` + +Wait for the user to set credentials, then re-run the check above before continuing. + +--- + +## Context to resolve before acting + +| Variable | How to resolve | +|---|---| +| `ENV` | Ask the user which environment to target. Do NOT assume `prod`. | +| `ORIGINAL_SERVICE` | Current service name(s) to remap — discover with `pup apm services list` or ask the user | +| `ENTITY_TYPE` | Instrumented service (`rule_type: 0`) or inferred entity (`rule_type: 1`)? Ask if unclear — see Domain Knowledge | +| `TARGET_NAME` | The desired new service name — ask the user | +| `PATTERN` | Which pattern applies — identify from the user's description (see Domain Knowledge above) | + +--- + +## Step 0: Discover Current Service Names + +If the user hasn't specified exact names to remap, discover what exists first: + +### Claude runs + +```bash +pup apm services list --env --from 1h +pup traces search --query "service:" --from 1h --limit 20 +``` + +Use the output to help the user identify exact service names. 
Ask the user to confirm which names they want remapped before proceeding. + +--- + +## Step 1: Build the Rule + +Work through each component before writing any JSON. + +### 1. Entity type + +[DECISION: entity type — ask the user if unclear] +- Does the service appear because a tracer explicitly set its `service` tag? → `rule_type: 0` (SERVICE) +- Does it appear in the service map from outbound calls (e.g. a database, queue, or external API)? → `rule_type: 1` (INFERRED_ENTITY) + +If the user wants to remap an inferred entity, verify `peer.service` is set before proceeding — see the prerequisite in Domain Knowledge. If it is not set, stop and ask the user to enable `DD_TRACE_PEER_SERVICE_DEFAULTS_ENABLED=true` first. + +### 2. Filter + +Write a single event-grammar query string targeting the service(s) to remap. Use the filter syntax and pattern table in Domain Knowledge to pick the right form. + +### 3. New name (`value`) + +Use the new name syntax and regex table in Domain Knowledge to pick the right form. For regex values, apply the constraints listed there. + +### 4. Rule name + +Suggest a descriptive name. Examples: +- `collapse-shopify-inferred-services` +- `strip-tropos-suffix` +- `rename-old-auth-to-auth-service` +- `env-split-my-service-prod` + +--- + +## Step 2: Preview Impact + +Before constructing the JSON, check what will be affected: + +### Claude runs + +```bash +# Confirm telemetry exists for the targeted service (zero spans = wrong query or wrong env) +pup traces search --query "service:<ORIGINAL_SERVICE>" --from 15m --limit 5 + +# Check for monitors referencing the old service name +pup monitors list | grep -i "<ORIGINAL_SERVICE>" + +# Check for dashboards referencing the old service name +pup dashboards list | grep -i "<ORIGINAL_SERVICE>" + +# List existing service remapping rules that may conflict +pup apm service-remapping list +``` + +Report to the user: + +| Item | What to surface | +|---|---| +| **Telemetry volume** | Non-zero spans confirm the filter will match real data. 
Zero = likely wrong service name or env. | +| **Monitors** | Any monitor referencing the old service name will silently break after remapping. List them and offer to update. | +| **Dashboards** | Any dashboard with the old service name in its title will have stale references after remapping. List them and offer to update. | +| **Conflicting rules** | Existing rules targeting the same service may be overridden. Show conflicts and ask the user to confirm. | + +If monitors reference the old service name, ask: +> *"I found `<N>` monitor(s) referencing `<ORIGINAL_SERVICE>`. After remapping, they'll need to be updated to use `<TARGET_NAME>`. Want me to update them now?"* + +--- + +## Step 3: Confirm the Rule + +Show the user the planned rule and confirm before creating: + +> *"I'm going to create a service remapping rule named `<RULE_NAME>` with filter `<FILTER>` that maps `<ORIGINAL_SERVICE>` → `<TARGET_NAME>` (rule_type: `<RULE_TYPE>`). Ready to proceed?"* + +Wait for confirmation before continuing. + +--- + +## Step 4: Create the Rule + +### Claude runs + +```bash +pup apm service-remapping create \ + --name "<RULE_NAME>" \ + --filter "<FILTER>" \ + --rule-type <RULE_TYPE> \ + --value "<TARGET_NAME>" +``` + +If the response contains an `id` field — creation succeeded. Record the `id` and `version` values from the response. + +ERROR: `400 Bad Request` with "Filter expression has invalid syntax" — the filter query is malformed. Check glob syntax and boolean operators. + +ERROR: `400 Bad Request` with "Template value in target name is invalid" — the `value` regex is invalid. Check: max 1 capture group, non-greedy quantifiers inside groups (`(.+?)` not `(.+)`). + +ERROR: `401 Unauthorized` — credentials are invalid or expired. Re-check `DD_API_KEY` and `DD_APP_KEY`. + +ERROR: `403 Forbidden` — the API key lacks `apm_service_renaming_write` permission. + +--- + +## Step 5: Verify + +Allow 2–5 minutes for the rule to propagate, then confirm it is active. 
+ +### For SERVICE rules (rule_type 0) + +### Claude runs + +```bash +# Confirm new service name appears in APM +pup apm services list --env --from 5m + +# Confirm traces are arriving under the new name +pup traces search --query "service:" --from 5m --limit 5 +``` + +If `` appears in either — rule is active. + +### For INFERRED_ENTITY rules (rule_type 1) + +Inferred entities don't produce their own spans, so they won't appear in `pup apm services list` or `pup traces search`. Verify in two steps: + +**Step 5a — confirm the rule is stored correctly:** + +### Claude runs + +```bash +pup apm service-remapping get +``` + +Confirm the filter and value match what you intended. + +**Step 5b — confirm the entity name changed in the service map:** + +Ask the user to check the APM Service Map in the Datadog UI and look for `` where `` used to appear. The service map is the authoritative view for inferred entity names. + +Alternatively, confirm new `peer.service` values are arriving on spans from the instrumented service: + +### Claude runs + +```bash +pup traces search --query "service: @peer.service:" --from 5m --limit 5 +``` + +If spans appear with `peer.service:` — rule is active. + +ERROR: New name not appearing after 5 minutes: +- Confirm old service is still sending traces with the original `peer.service`: `pup traces search --query "@peer.service:" --from 5m` +- If old name still appears, propagation may still be in progress — wait 2 more minutes and retry +- If neither name appears, confirm `DD_TRACE_PEER_SERVICE_DEFAULTS_ENABLED=true` is set on the instrumented service — without it `peer.service` is never set and the rule will never fire + +--- + +## Managing Existing Rules + +### List all rules + +### Claude runs + +```bash +pup apm service-remapping list +``` + +### Get a single rule + +### Claude runs + +```bash +pup apm service-remapping get +``` + +### Update a rule + +Update requires the current `version` from list/get output. 
Show the proposed changes to the user and confirm before running: + +### Claude runs + +```bash +pup apm service-remapping update \ + --name "" \ + --filter "" \ + --rule-type \ + --value "" \ + --version +``` + +ERROR: `409 Conflict` — the rule was modified since you fetched it. Re-fetch with `get` to get the current version and retry. + +### Delete a rule + +Show the user the rule's name and filter first, then ask for confirmation. Delete requires both the rule `id` and `version` from the list/get output: + +### Claude runs + +```bash +pup apm service-remapping delete +``` + +ERROR: `409 Conflict` — the rule was modified since you fetched it. Re-fetch with `get` to get the current version and retry. + +--- + +## Done + +Exit when ALL of the following are true: +- [ ] Rule shown to user and confirmed before creation +- [ ] Rule created and `id` returned in response +- [ ] New service name visible in `pup apm services list` +- [ ] Impacted monitors identified and offered for update +- [ ] User confirmed the remapping matches their intent + +--- + +## Security constraints + +- Never write a raw API key into any file or chat message — always use `$DD_API_KEY` and `$DD_APP_KEY` +- Never create or delete a rule without explicit user confirmation — show the full rule before creating +- Never assume `prod` as the environment — always confirm with the user +- Never run DELETE without showing the user the rule's name and filter first +- Never enable `enabled_org_wide` without explicit user confirmation — it affects the entire org diff --git a/crates/agent-skills/src/lib.rs b/crates/agent-skills/src/lib.rs new file mode 100644 index 0000000..94d24d8 --- /dev/null +++ b/crates/agent-skills/src/lib.rs @@ -0,0 +1,48 @@ +pub const DD_APM_SKILL: &str = include_str!("../skills/dd-apm/SKILL.md"); + +pub static DD_APM_SUB_SKILLS: &[(&str, &str)] = &[ + ( + "service-remapping/SKILL.md", + include_str!("../skills/dd-apm/service-remapping/SKILL.md"), + ), + ( + 
"k8s-ssi/agent-install/SKILL.md", + include_str!("../skills/dd-apm/k8s-ssi/agent-install/SKILL.md"), + ), + ( + "k8s-ssi/enable-ssi/SKILL.md", + include_str!("../skills/dd-apm/k8s-ssi/enable-ssi/SKILL.md"), + ), + ( + "k8s-ssi/verify-ssi/SKILL.md", + include_str!("../skills/dd-apm/k8s-ssi/verify-ssi/SKILL.md"), + ), + ( + "k8s-ssi/troubleshoot-ssi/SKILL.md", + include_str!("../skills/dd-apm/k8s-ssi/troubleshoot-ssi/SKILL.md"), + ), + ( + "k8s-ssi/onboarding-summary/SKILL.md", + include_str!("../skills/dd-apm/k8s-ssi/onboarding-summary/SKILL.md"), + ), + ( + "linux-ssi/agent-install/SKILL.md", + include_str!("../skills/dd-apm/linux-ssi/agent-install/SKILL.md"), + ), + ( + "linux-ssi/enable-ssi/SKILL.md", + include_str!("../skills/dd-apm/linux-ssi/enable-ssi/SKILL.md"), + ), + ( + "linux-ssi/verify-ssi/SKILL.md", + include_str!("../skills/dd-apm/linux-ssi/verify-ssi/SKILL.md"), + ), + ( + "linux-ssi/troubleshoot-ssi/SKILL.md", + include_str!("../skills/dd-apm/linux-ssi/troubleshoot-ssi/SKILL.md"), + ), + ( + "linux-ssi/onboarding-summary/SKILL.md", + include_str!("../skills/dd-apm/linux-ssi/onboarding-summary/SKILL.md"), + ), +]; diff --git a/src/skills.rs b/src/skills.rs index 37a0237..3257ba7 100644 --- a/src/skills.rs +++ b/src/skills.rs @@ -12,9 +12,14 @@ pub struct SkillEntry { /// Platform slug for entry_type == "extension". One of: "pi". /// Empty for skills and agents. pub platform: &'static str, - /// Files to materialize for entry_type == "extension". - /// Each tuple is `(relative_path_within_extension_dir, file_contents)`. - /// Empty for skills and agents. + /// Extra files to materialize alongside the entry. + /// Each tuple is `(relative_path, file_contents)`, written verbatim. + /// - For `extension`: relative to the extension's install dir; this is + /// the only source of files (`content` is empty). + /// - For `skill`: relative to the parent skill's install dir, used to + /// ship nested sub-skill SKILL.md files (e.g. 
`dd-apm` ships + /// `service-remapping/SKILL.md` and the k8s-ssi/linux-ssi trees). + /// - For `agent`: unused; leave empty. pub files: &'static [(&'static str, &'static str)], } @@ -64,9 +69,9 @@ pub static SKILLS: &[SkillEntry] = &[ name: "dd-apm", description: "APM - traces, services, dependencies, performance analysis.", entry_type: "skill", - content: include_str!("../skills/dd-apm/SKILL.md"), + content: agent_skills::DD_APM_SKILL, platform: "", - files: &[], + files: agent_skills::DD_APM_SUB_SKILLS, }, SkillEntry { name: "dd-debugger", @@ -905,7 +910,19 @@ pub fn install_paths( else { return Ok(vec![]); }; - Ok(vec![(path, format_content(entry, &fmt))]) + let mut out = vec![(path.clone(), format_content(entry, &fmt))]; + // Skills can ship nested sub-skill files alongside the root SKILL.md + // (e.g. dd-apm's k8s-ssi/, linux-ssi/, service-remapping/ trees). + // Only applies when the parent installs as a skill directory; subagent + // .md files have no surrounding directory to nest under. 
+ if fmt == InstallFormat::SkillMd && !entry.files.is_empty() { + if let Some(parent_dir) = path.parent() { + for (rel, body) in entry.files { + out.push((parent_dir.join(rel), (*body).to_string())); + } + } + } + Ok(out) } #[derive(Debug, PartialEq)] @@ -1466,6 +1483,44 @@ mod tests { assert_eq!(extensions_dir("claude-code", &root, false), None); } + #[test] + fn test_install_paths_skill_with_sub_skills() { + static SUB_FILES: &[(&str, &str)] = &[ + ("service-remapping/SKILL.md", "# Service Remapping"), + ("k8s-ssi/agent-install/SKILL.md", "# K8s Agent Install"), + ]; + let root = PathBuf::from("/tmp/proj"); + let e = SkillEntry { + files: SUB_FILES, + ..entry("dd-apm", "skill", "body") + }; + let paths = install_paths(&e, "claude-code", &root, None, false).unwrap(); + assert_eq!(paths.len(), 3); + assert_eq!(paths[0].0, root.join(".claude/skills/dd-apm/SKILL.md")); + assert_eq!( + paths[1].0, + root.join(".claude/skills/dd-apm/service-remapping/SKILL.md") + ); + assert_eq!( + paths[2].0, + root.join(".claude/skills/dd-apm/k8s-ssi/agent-install/SKILL.md") + ); + } + + #[test] + fn test_install_paths_sub_skills_skipped_for_agent_md() { + // AgentMd format (Claude Code agents dir) has no surrounding directory, + // so sub-skill files must not be written. 
+ static SUB_FILES: &[(&str, &str)] = &[("sub/SKILL.md", "# Sub")]; + let root = PathBuf::from("/tmp/proj"); + let e = SkillEntry { + files: SUB_FILES, + ..entry("dd-apm", "agent", "body") + }; + let paths = install_paths(&e, "claude-code", &root, None, false).unwrap(); + assert_eq!(paths.len(), 1, "sub-skills must not be written for agent-md format"); + } + #[test] fn test_install_paths_skill_single_file() { let root = PathBuf::from("/tmp/proj"); From ffb4d51e90a7203accba1e6e8ca924edb2e13940 Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 28 May 2026 14:51:54 +0000 Subject: [PATCH 2/2] refactor(skills): replace local crate copy with git dep on agent-skills MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous commit vendored the agent-skills content into a local crates/ directory, which is equivalent to Option 1 (direct copy). This commit corrects that: remove crates/agent-skills/ entirely and point Cargo.toml at the upstream repo directly. Cargo.toml now uses a git dep pinned to the same commit referenced by PR #530, matching the pattern already used for datadog-api-client: agent-skills = { git = "https://github.com/datadog-labs/agent-skills", rev = "c447f4d42f05fa8497c6fa0d1ee3889b7020dce3" } The SKILL.md files stay in the agent-skills repo only. Updating means bumping the rev in Cargo.toml — a single, reviewable line change. This PR requires a companion change to datadog-labs/agent-skills to add a Cargo.toml and src/lib.rs that expose DD_APM_SKILL and DD_APM_SUB_SKILLS as compile-time constants (the same API consumed in src/skills.rs). 
https://claude.ai/code/session_01BfYL9qPvktFuLXHNyH3rPB --- Cargo.toml | 8 +- crates/agent-skills/Cargo.toml | 5 - crates/agent-skills/skills/dd-apm/SKILL.md | 209 --------- .../dd-apm/k8s-ssi/agent-install/SKILL.md | 270 ------------ .../skills/dd-apm/k8s-ssi/enable-ssi/SKILL.md | 266 ------------ .../k8s-ssi/onboarding-summary/SKILL.md | 129 ------ .../dd-apm/k8s-ssi/troubleshoot-ssi/SKILL.md | 405 ----------------- .../skills/dd-apm/k8s-ssi/verify-ssi/SKILL.md | 159 ------- .../dd-apm/linux-ssi/agent-install/SKILL.md | 280 ------------ .../dd-apm/linux-ssi/enable-ssi/SKILL.md | 235 ---------- .../linux-ssi/onboarding-summary/SKILL.md | 139 ------ .../linux-ssi/troubleshoot-ssi/SKILL.md | 400 ----------------- .../dd-apm/linux-ssi/verify-ssi/SKILL.md | 198 --------- .../skills/dd-apm/service-remapping/SKILL.md | 411 ------------------ crates/agent-skills/src/lib.rs | 48 -- 15 files changed, 5 insertions(+), 3157 deletions(-) delete mode 100644 crates/agent-skills/Cargo.toml delete mode 100644 crates/agent-skills/skills/dd-apm/SKILL.md delete mode 100644 crates/agent-skills/skills/dd-apm/k8s-ssi/agent-install/SKILL.md delete mode 100644 crates/agent-skills/skills/dd-apm/k8s-ssi/enable-ssi/SKILL.md delete mode 100644 crates/agent-skills/skills/dd-apm/k8s-ssi/onboarding-summary/SKILL.md delete mode 100644 crates/agent-skills/skills/dd-apm/k8s-ssi/troubleshoot-ssi/SKILL.md delete mode 100644 crates/agent-skills/skills/dd-apm/k8s-ssi/verify-ssi/SKILL.md delete mode 100644 crates/agent-skills/skills/dd-apm/linux-ssi/agent-install/SKILL.md delete mode 100644 crates/agent-skills/skills/dd-apm/linux-ssi/enable-ssi/SKILL.md delete mode 100644 crates/agent-skills/skills/dd-apm/linux-ssi/onboarding-summary/SKILL.md delete mode 100644 crates/agent-skills/skills/dd-apm/linux-ssi/troubleshoot-ssi/SKILL.md delete mode 100644 crates/agent-skills/skills/dd-apm/linux-ssi/verify-ssi/SKILL.md delete mode 100644 crates/agent-skills/skills/dd-apm/service-remapping/SKILL.md delete mode 
100644 crates/agent-skills/src/lib.rs diff --git a/Cargo.toml b/Cargo.toml index 645fa4f..65a2ab9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -68,9 +68,11 @@ browser = [ ] [dependencies] -# Skills content from datadog-labs/agent-skills (local path dep; switch to -# `{ git = "...", rev = "..." }` once agent-skills ships a Cargo.toml) -agent-skills = { path = "crates/agent-skills" } +# Skills content from datadog-labs/agent-skills, consumed as a Cargo git dep. +# Pinned to a specific rev so updates are explicit and auditable, matching +# the same pattern used for datadog-api-client below. +# Requires agent-skills to carry a Cargo.toml + src/lib.rs (see companion PR). +agent-skills = { git = "https://github.com/datadog-labs/agent-skills", rev = "c447f4d42f05fa8497c6fa0d1ee3889b7020dce3" } # CLI (optional — not needed for browser WASM library) clap = { version = "4", features = ["derive"], optional = true } diff --git a/crates/agent-skills/Cargo.toml b/crates/agent-skills/Cargo.toml deleted file mode 100644 index 8d8b3e1..0000000 --- a/crates/agent-skills/Cargo.toml +++ /dev/null @@ -1,5 +0,0 @@ -[package] -name = "agent-skills" -version = "0.1.0" -edition = "2021" -publish = false diff --git a/crates/agent-skills/skills/dd-apm/SKILL.md b/crates/agent-skills/skills/dd-apm/SKILL.md deleted file mode 100644 index 95ccb11..0000000 --- a/crates/agent-skills/skills/dd-apm/SKILL.md +++ /dev/null @@ -1,209 +0,0 @@ ---- -name: dd-apm -description: APM - install, onboard, instrument, enable, set up, configure, traces, services, dependencies, performance analysis. Use for any request involving Datadog APM setup, instrumentation (SSI, ddtrace, agent install), or analysis. 
-alwaysApply: true -metadata: - version: "1.0.1" - author: datadog-labs - repository: https://github.com/datadog-labs/agent-skills - tags: datadog,apm,tracing,performance,distributed-tracing,dd-apm,install,onboarding,instrumentation,ssi,agent - globs: "**/ddtrace*,**/datadog*.yaml,**/*trace*" ---- - -# Datadog APM - -Distributed tracing, service maps, and performance analysis. - -## Routing — Read This First - -**For Kubernetes APM install / instrument / onboard tasks — follow this procedure exactly:** - -> ⚠️ **Common wrong approaches that LOOK like they work but silently fail:** -> - `helm install datadog datadog/datadog` — the standard chart does NOT support SSI via DatadogAgent CR. Do NOT use it. -> - Adding `ddtrace` imports or `ddtrace-run` to the app — SSI auto-instruments WITHOUT any code changes. -> - `admission.datadoghq.com/enabled` annotations — this is admission controller config injection, not SSI init container injection. - -### Step 0: Pre-flight checks (MANDATORY — run before installing anything) - -```bash -# Check for ddtrace/otel — even without imports it SILENTLY DISABLES SSI at runtime -grep -rn "ddtrace\|dd-trace\|opentelemetry" requirements.txt package.json Gemfile go.mod pom.xml 2>/dev/null || echo "No tracer dependency found" -``` - -If ddtrace found → remove that line entirely from the dependency file, rebuild image, reload into cluster, restart pod. 
- -### Step 1: Install the Datadog Operator (NOT `datadog/datadog`) - -```bash -helm repo add datadog https://helm.datadoghq.com && helm repo update -helm upgrade --install datadog-operator datadog/datadog-operator \ - --namespace datadog --create-namespace -kubectl wait --for=condition=Ready pod \ - -l app.kubernetes.io/name=datadog-operator -n datadog --timeout=120s - -kubectl create secret generic datadog-secret \ - --from-literal api-key=$DD_API_KEY -n datadog -``` - -### Step 2: Create DatadogAgent CR with SSI enabled - -Save as `datadog-agent.yaml`, then `kubectl apply -f datadog-agent.yaml`: - -```yaml -apiVersion: datadoghq.com/v2alpha1 -kind: DatadogAgent -metadata: - name: datadog - namespace: datadog -spec: - global: - site: - credentials: - apiSecret: - secretName: datadog-secret - keyName: api-key - kubelet: - tlsVerify: false # required for kind/minikube; omit for cloud clusters - features: - apm: - instrumentation: - enabled: true -``` - -### Step 3: Apply Unified Service Tags to the application Deployment - -Add to **both** `metadata.labels` and `spec.template.metadata.labels`: -```yaml -tags.datadoghq.com/env: "dev" -tags.datadoghq.com/service: "" -tags.datadoghq.com/version: "1.0.0" -``` - -### Step 4: Restart app pods and verify SSI init containers - -> **Confirm with the user before restarting.** Tell the user: "I need to restart `` in `` for SSI to inject into the pods. This will cause a brief outage. Ready to proceed?" Wait for confirmation. - -```bash -kubectl rollout restart deployment/ -n -kubectl get pods -A -o json | grep -o '"datadog-lib[^"]*"' | sort -u -``` - -Expected: `"datadog-lib-python-init"` (or the language-appropriate init container). - -**Immediately read** `.claude/skills/dd-apm/k8s-ssi/agent-install/SKILL.md` now, then `.claude/skills/dd-apm/k8s-ssi/enable-ssi/SKILL.md` — do not proceed from memory or the summary above. 
- ---- - -**For Linux APM install/instrument tasks:** **Immediately read** `.claude/skills/dd-apm/linux-ssi/agent-install/SKILL.md` now, then enable-ssi then verify-ssi — do not proceed from memory or the summary above. - -**For trace search, service analysis, metrics:** Continue below. - -## Requirements - -Datadog Labs Pup should be installed. See [Setup Pup](https://github.com/datadog-labs/agent-skills/tree/main?tab=readme-ov-file#setup-pup) if not. - -## Command Execution Order (Token-Efficient) - -For scoped commands, use this order: - -1. Check context first (prior outputs, conversation, saved values). -2. If a required value is missing, run a discovery command first. -3. If still ambiguous, ask the user to confirm. -4. Then run the target command. -5. Avoid speculative commands likely to fail. - -## Quick Start - -```bash -pup auth login -# Confirm env tag with the user first (do not assume production/prod/prd). -pup apm services list --env --from 1h --to now -pup traces search --query "service:api-gateway" --from 1h -``` - -## Services - -### List Services - -```bash -pup apm services list --env --from 1h --to now -pup apm services stats --env --from 1h --to now -``` - -### Service Stats - -```bash -pup apm services stats --env --from 1h --to now -``` - -### Service Map - -```bash -# View dependencies -pup apm flow-map --query "service:api-gateway&from=$(($(date +%s)-3600))000&to=$(date +%s)000" --env --limit 10 -``` - -## Traces - -### Search Traces - -```bash -# By service -pup traces search --query "service:api-gateway" --from 1h - -# Errors only -pup traces search --query "service:api-gateway status:error" --from 1h - -# Slow traces (>1s) -pup traces search --query "service:api-gateway @duration:>1000ms" --from 1h - -# With specific tag -pup traces search --query "service:api-gateway @http.url:/api/users" --from 1h -``` - -### Trace Detail - -```bash -# No direct get command for a single trace ID. 
-# Use traces search with a narrow query and time window. -pup traces search --query "trace_id:" --from 1h -``` - -## Key Metrics - -| Metric | What It Measures | -|--------|------------------| -| `trace.http.request.hits` | Request count | -| `trace.http.request.duration` | Latency | -| `trace.http.request.errors` | Error count | -| `trace.http.request.apdex` | User satisfaction | - -## Service Level Objectives - -Link APM to SLOs: - -```bash -pup slos create --file slo.json -``` - -## Common Queries - -| Goal | Query | -|------|-------| -| Slowest endpoints | `avg:trace.http.request.duration{*} by {resource_name}` | -| Error rate | `sum:trace.http.request.errors{*} / sum:trace.http.request.hits{*}` | -| Throughput | `sum:trace.http.request.hits{*}.as_rate()` | - -## Troubleshooting - -| Problem | Fix | -|---------|-----| -| No traces | Check ddtrace installed, DD_TRACE_ENABLED=true | -| Missing service | Verify DD_SERVICE env var | -| Traces not linked | Check trace headers propagated | -| High cardinality | Don't tag with user_id/request_id | - -## References/Docs - -- [APM Setup](https://docs.datadoghq.com/tracing/) -- [Trace Search](https://docs.datadoghq.com/tracing/trace_explorer/) diff --git a/crates/agent-skills/skills/dd-apm/k8s-ssi/agent-install/SKILL.md b/crates/agent-skills/skills/dd-apm/k8s-ssi/agent-install/SKILL.md deleted file mode 100644 index 78e20c6..0000000 --- a/crates/agent-skills/skills/dd-apm/k8s-ssi/agent-install/SKILL.md +++ /dev/null @@ -1,270 +0,0 @@ ---- -name: agent-install -description: Install the Datadog Agent on Kubernetes using the Datadog Operator — required before enabling Single Step Instrumentation (SSI), which automatically instruments applications for APM without code changes. Only use if no Datadog Agent is deployed on the cluster yet. 
-metadata: - version: "1.0.0" - author: datadog-labs - repository: https://github.com/datadog-labs/agent-skills - tags: datadog,apm,kubernetes,agent,operator,install - alwaysApply: "false" - tools: helm,kubectl,curl,pup ---- - -# Install the Datadog Agent on Kubernetes - -> **Before doing anything else:** Fully resolve all variables in `## Context to resolve before acting`. Do not begin Step 1 until every variable has a concrete value. - -## Phase 0: Load Credentials - -```bash -[ -f environment ] && source environment -echo "DD_API_KEY set: $([ -n "${DD_API_KEY:-}" ] && echo yes || echo no)" -echo "DD_SITE: ${DD_SITE:-not set}" -echo "helm: $(helm version --short 2>/dev/null || echo NOT FOUND)" -``` - -**If `helm` is not found** — tell the user: - -> `helm` is required for this skill. Install it with: -> ```bash -> brew install helm # macOS -> # or see https://helm.sh/docs/intro/install/ for other platforms -> ``` -> Once installed, let me know and I'll continue. - -Do not proceed until `helm` is available. - -**If `DD_API_KEY` is already set** — proceed to Prerequisites. - -**If `DD_API_KEY` is not set** — tell the user: - -> I need two things to continue: -> -> **1. Datadog API Key** — used to authenticate the Agent with your Datadog account. You can find or create one at: https://app.datadoghq.com/organization-settings/api-keys -> -> **2. Datadog Site** — the region your Datadog account is on. Most accounts use `datadoghq.com`. Check your Datadog URL to confirm (e.g. `app.datadoghq.eu` → site is `datadoghq.eu`). Other options: `us3.datadoghq.com`, `us5.datadoghq.com`, `ap1.datadoghq.com`. -> -> Please run the following in this chat to set your credentials (the `!` prefix executes it in this session): -> ``` -> ! export DD_API_KEY=your-api-key-here -> ! export DD_SITE=datadoghq.com -> ``` - -Wait for the user to run the commands, then re-run the check above before continuing. 
- ---- - -## Prerequisites - -- [ ] Kubernetes v1.20+ — `kubectl version` -- [ ] helm v3+ — `helm version` -- [ ] kubectl configured to target cluster — `kubectl config current-context` -- [ ] pup-cli installed — check with `pup --version`; if missing, install it now: - ```bash - if [[ "$(uname)" == "Darwin" ]]; then - brew tap datadog-labs/pack && brew install pup - else - PUP_VERSION=$(curl -s https://api.github.com/repos/datadog-labs/pup/releases/latest | grep '"tag_name"' | cut -d'"' -f4) - curl -L "https://github.com/datadog-labs/pup/releases/download/${PUP_VERSION}/pup_linux_amd64.tar.gz" | tar xz -C /usr/local/bin pup - chmod +x /usr/local/bin/pup - fi - pup --version - ``` - Do not skip — proceed only once `pup --version` succeeds. - ---- - -## Context to resolve before acting - -| Variable | How to resolve | -|---|---| -| `CLUSTER_NAME` | Check repo IaC, scripts, or `kubectl config current-context` | -| `DD_SITE` | Ask the user. Default: `datadoghq.com`. Common options: `datadoghq.eu`, `us3.datadoghq.com`, `us5.datadoghq.com`, `ap1.datadoghq.com`. Full list: https://docs.datadoghq.com/getting_started/site/ | -| `AGENT_NAMESPACE` | Use `datadog` unless the repo already uses `datadog-agent` consistently | -| `CHART_VERSION` | Run `helm search repo datadog/datadog-operator --versions \| head -5` and use the latest stable | - ---- - -## Step 1: Check for an Existing Agent Installation - -### Claude runs - -```bash -helm list -A | grep -i datadog -``` - -If a release shows `deployed` — Agent already installed. Skip to Step 5 to confirm health, then exit. - -If there is no output — no existing install. Continue to Step 2. 
- ---- - -## Step 2: Install the Datadog Operator - -### Claude runs - -```bash -helm repo add datadog https://helm.datadoghq.com -helm repo update - -helm upgrade --install datadog-operator datadog/datadog-operator \ - --namespace <AGENT_NAMESPACE> \ - --create-namespace \ - --version <CHART_VERSION> - -kubectl wait --for=condition=Ready pod \ - -l app.kubernetes.io/name=datadog-operator \ - -n <AGENT_NAMESPACE> \ - --timeout=120s -``` - -If the Operator pod is Running — continue to Step 3. - -ERROR: Pod not ready after 120s — check image pull: `kubectl describe pod -l app.kubernetes.io/name=datadog-operator -n <AGENT_NAMESPACE>`. - ---- - -## Step 3: Create the API Key Secret - -### What you need to do in a terminal - -```bash -export DD_API_KEY=<your-api-key> - -kubectl create secret generic datadog-secret \ - --from-literal api-key=$DD_API_KEY \ - --namespace <AGENT_NAMESPACE> -``` - -If `secret/datadog-secret created` — continue to Step 4. - -ERROR: `AlreadyExists` — confirm which key it holds via Step 5 before deciding whether to recreate. - ---- - -## Step 4: Deploy the DatadogAgent Resource - -[DECISION: cluster type] -- Self-hosted (minikube, kind): include `kubelet.tlsVerify: false` inside `spec.global` -- Managed (GKE, EKS, AKS): omit `kubelet.tlsVerify` entirely - -[DECISION: APM/SSI also being enabled in this session] -- If yes: do not create a separate `DatadogAgent` for APM — extend this same manifest with `features.apm` per `enable-ssi`. One manifest, not two. -- If no: use the manifest below as-is. 
- -Save the following as `datadog-agent.yaml`: - -```yaml -apiVersion: datadoghq.com/v2alpha1 -kind: DatadogAgent -metadata: - name: datadog - namespace: <AGENT_NAMESPACE> -spec: - global: - clusterName: <CLUSTER_NAME> - site: <DD_SITE> - credentials: - apiSecret: - secretName: datadog-secret - keyName: api-key - # Self-hosted clusters only (minikube, kind): - # kubelet: - # tlsVerify: false - features: - orchestratorExplorer: - enabled: true - clusterChecks: - enabled: true - logCollection: - enabled: true - containerCollectAll: false -``` - -### Claude runs - -```bash -kubectl apply -f datadog-agent.yaml - -kubectl wait --for=condition=Ready pod \ - -l app.kubernetes.io/component=agent \ - -n <AGENT_NAMESPACE> \ - --timeout=120s 2>/dev/null || true -``` - ---- - -## Step 5: Verify the API Key - -### Claude runs - -```bash -kubectl logs -l app.kubernetes.io/component=agent \ - -n <AGENT_NAMESPACE> \ - --tail=50 2>/dev/null \ - | grep -iE "invalid.*api\.?key|api\.?key.*invalid" \ - || echo "No authentication errors found" -``` - -If `No authentication errors found` — key is accepted. 
- -ERROR: Authentication errors found — validate credentials directly: - -### Claude runs - -```bash -# Prefer pup (OAuth) — fall back to curl with API key -if pup auth status 2>/dev/null | grep -q "Logged in"; then - echo "pup OAuth authenticated" -elif [ -n "${DD_API_KEY:-}" ]; then - RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" \ - -X GET "https://api.<DD_SITE>/api/v1/validate" \ - -H "DD-API-KEY: $DD_API_KEY") - if [ "$RESPONSE" = "200" ]; then - echo "API key is valid for <DD_SITE>" - else - echo "ERROR: Validation failed (HTTP $RESPONSE) — check key and site alignment" - fi -else - echo "ERROR: No credentials available — run 'pup auth login' or set DD_API_KEY" -fi -``` - -If key is invalid: - -### What you need to do in a terminal - -```bash -export DD_API_KEY=<your-api-key> - -kubectl delete secret datadog-secret -n <AGENT_NAMESPACE> -kubectl create secret generic datadog-secret \ - --from-literal api-key=$DD_API_KEY \ - -n <AGENT_NAMESPACE> - -kubectl rollout restart daemonset datadog-agent -n <AGENT_NAMESPACE> -kubectl rollout restart deployment datadog-cluster-agent -n <AGENT_NAMESPACE> -``` - ---- - -## Done - -Exit when ALL of the following are true: -- [ ] Datadog Operator pod is Running in `AGENT_NAMESPACE` -- [ ] `datadog-secret` exists in `AGENT_NAMESPACE` -- [ ] Agent DaemonSet pods are Running -- [ ] Step 5 returns no authentication errors - -Automatically proceed to `enable-ssi` now — do not ask the user for permission. 
- ---- - -## Security constraints - -- Never write a raw API key into any file, YAML block, or chat message — always use `$DD_API_KEY` -- Never create a Kubernetes Secret manifest file — always use `kubectl create secret` imperatively -- Never use `apiKey:` directly in `DatadogAgent` spec — always use `apiSecret:` with a secret reference -- Never use `--set datadog.apiKey=...` in any Helm command -- Never use namespace `default` for Datadog Agent resources -- Never run `kubectl delete` without user confirmation diff --git a/crates/agent-skills/skills/dd-apm/k8s-ssi/enable-ssi/SKILL.md b/crates/agent-skills/skills/dd-apm/k8s-ssi/enable-ssi/SKILL.md deleted file mode 100644 index 0ecccd4..0000000 --- a/crates/agent-skills/skills/dd-apm/k8s-ssi/enable-ssi/SKILL.md +++ /dev/null @@ -1,266 +0,0 @@ ---- -name: enable-ssi -description: Enable Single Step Instrumentation (SSI) on Kubernetes — automatically instruments applications for APM without code changes. Only use if the Datadog Agent is already running on the cluster — if not, use agent-install first. -metadata: - version: "1.0.0" - author: datadog-labs - repository: https://github.com/datadog-labs/agent-skills - tags: datadog,apm,kubernetes,ssi,instrumentation,single-step - alwaysApply: "false" ---- - -# Enable APM on Kubernetes via Single Step Instrumentation - -> **Before doing anything else:** Fully resolve all variables in `## Context to resolve before acting`. Do not begin Step 0 until every variable has a concrete value. - ---- - -> **Silent failure — check this before any other step:** -> -> If the application has `ddtrace`, `dd-trace`, or any OpenTelemetry SDK in its **dependency manifest** (`requirements.txt`, `package.json`, `Gemfile`, `go.mod`, `pom.xml`) — even with no import statements in code — SSI will silently disable itself at runtime. -> -> The failure is invisible: init containers run and complete, the pod starts healthy, no errors appear in `kubectl` or `pup`, but no traces arrive. 
The injector detects the user-installed tracer and exits cleanly without logging anything. -> -> ### Claude runs -> -> ```bash -> grep -rE "ddtrace|dd-trace|opentelemetry" \ -> requirements.txt package.json Gemfile go.mod pom.xml 2>/dev/null \ -> || echo "No tracer dependency found" -> ``` -> -> If any match — **stop**. Remove the package entirely (not just the import), rebuild the image, reload it into the cluster, and restart the pod before continuing. A package present in the manifest is enough to trigger this even if it is never imported. - ---- - -## Triggers - -Invoke this skill when the user expresses intent to: -- Enable APM on a Kubernetes cluster -- Instrument Kubernetes applications with Datadog tracing -- Set up Single Step Instrumentation (SSI) - -Do NOT invoke this skill if: -- The Datadog Agent is not yet installed — run `agent-install` first -- The user wants to verify SSI after setup — use `verify-ssi` -- The user wants to enable Profiler, AppSec, or Data Streams — use `dd-apm-k8s-sdk-features` - ---- - -## Prerequisites - -> **These are not a reading exercise — actively verify each one before proceeding.** - -**Environment** -- [ ] Datadog Agent is installed and healthy — `agent-install` complete -- [ ] Kubernetes v1.20+ -- [ ] Linux node pools only — Windows pods require explicit namespace exclusion -- [ ] Cluster is not ECS Fargate — unsupported -- [ ] Not a hardened SELinux environment — unsupported -- [ ] Not a very small VM instance (e.g. 
t2.micro) — SSI can hit init timeouts -- [ ] No PodSecurity baseline or restricted policy enforced - -**Language and runtime** -- [ ] Application language is one of: Java, Python, Ruby, Node.js, .NET, PHP -- [ ] Runtime version is within SSI's supported range — verify against the [SSI compatibility matrix](https://docs.datadoghq.com/tracing/trace_collection/automatic_instrumentation/single-step-apm/compatibility/) -- [ ] Node.js app is not using ESM — SSI does not support ESM -- [ ] Java app is not already using a `-javaagent` JVM flag - -**Existing instrumentation** — confirmed clean by the check at the top of this skill. If you skipped that check, go back and run it now. - ---- - -## Context to resolve before acting - -> **Discover from the cluster — do not ask the user for information you can find yourself.** - -| Variable | How to resolve | -|---|---| -| `AGENT_NAMESPACE` | Same namespace used in `agent-install` (e.g. `datadog`) | -| `APP_NAMESPACE` | Run `kubectl get namespaces --no-headers \| awk '{print $1}' \| grep -vE '^(kube-system\|kube-public\|kube-node-lease\|datadog\|local-path-storage)$'` — instrument all non-system namespaces, or use the namespace(s) the user mentioned | -| `TARGET_LANGUAGES` | Run `kubectl get pods -A -o jsonpath='{.items[*].spec.containers[*].image}'` and infer language from image names, or check Dockerfiles/manifests in the workspace. If uncertain, enable all languages. | -| `DEPLOYMENT_NAME` | Run `kubectl get deployments -A --no-headers` — identify application deployments (exclude system components) | -| `APP_LABEL` | Check `spec.selector.matchLabels` in the Deployment manifest via `kubectl get deployment -n -o yaml` | -| `CLUSTER_NAME` | Check `spec.global.clusterName` in `datadog-agent.yaml`, or `kubectl config current-context` — needed for kind clusters in Step 0 | -| `ENV` | Use `apm-evals` if running in an eval cluster (kind cluster names contain "evalya"). Otherwise use `production` unless the user specifies otherwise. 
| -| `SERVICE_NAME` | Use the deployment name (e.g. `python-app` → service `python-app`). Do not ask the user. | -| `VERSION` | Use `1.0.0` as the default. Do not ask the user. | - ---- - -## Step 0 (Only if existing instrumentation detected): Remove Manual Instrumentation - -Scan all source files for: `import ddtrace`, `from ddtrace`, `require 'ddtrace'`, `require("dd-trace")`, `opentelemetry`, `tracer.trace(` - -Also check dependency manifests for `ddtrace` / `dd-trace` / OTel SDK packages. - -If found — remove the import/package, then rebuild and reload: - -### Claude runs - -```bash -docker build -f -t -``` - -[DECISION: how does this cluster get local images?] - -Check the repo's setup script (e.g. `create.sh`, `Makefile`, `justfile`) for how images are loaded — do not guess from the cluster name or context. Common patterns: - -| What you find in the setup script | Load command | -|---|---| -| `minikube image load` or `minikube cache add` | `minikube -p image load ` — profile is the `-p` flag value in the script, NOT necessarily the kubectl context name | -| `kind load docker-image` | `kind load docker-image --name ` | -| `docker push` to a registry | Push the new image; the cluster will pull on restart — skip local load | -| `k3d image import` | `k3d image import -c ` | -| No image load step (cloud cluster, always pulls from registry) | Skip — image will be pulled on next deployment | - -If the setup script is ambiguous, run the load command it uses exactly as written. - -- Registry-based: skip — image will be pulled on next deployment - -> **Confirm with the user before restarting.** Tell the user: "I need to restart `` in `` to pick up the rebuilt image. Ready to proceed?" Wait for confirmation. 
- -### Claude runs - -```bash -kubectl rollout restart deployment/ -n -kubectl wait --for=condition=Ready pod \ - -l app= \ - -n \ - --timeout=120s -``` - ---- - -## Step 1: Extend the DatadogAgent Manifest with APM - -SSI is configured on the existing `DatadogAgent` resource — do not create a separate manifest. - -**Choose targeting scope based on what the user asked for:** -- User asked to instrument **all applications** or didn't specify scope → **use Option A (cluster-wide)** -- User asked for specific namespaces only → use Option B -- User asked to exclude namespaces from cluster-wide → use Option C -- User asked for specific pods/workloads → use Option D - -> **Default is cluster-wide (Option A).** If the user said "all my applications", "my whole cluster", or didn't restrict scope, use Option A with no `enabledNamespaces` or `targets`. - -Recommended `ddTraceVersions`: `java: "1"`, `python: "2"`, `js: "5"`, `dotnet: "3"`, `ruby: "2"`, `php: "1"` - -**Option A — Cluster-wide (default):** -```yaml -features: - apm: - instrumentation: - enabled: true -``` - -**Option B — Specific namespaces only:** -```yaml -features: - apm: - instrumentation: - enabled: true - enabledNamespaces: - - -``` - -**Option C — Cluster-wide with exclusions:** -```yaml -features: - apm: - instrumentation: - enabled: true - disabledNamespaces: - - jenkins - - kube-system -``` - -**Option D — Target specific workloads:** -```yaml -features: - apm: - instrumentation: - enabled: true - targets: - - name: - namespaceSelector: - matchNames: - - - ddTraceVersions: - : "" -``` - -> **Note:** `ddTraceVersions` only applies inside a `targets[]` entry (Option D). It is not valid alongside `enabledNamespaces` or at the `instrumentation` level directly. - -### Claude runs - -```bash -kubectl apply -f datadog-agent.yaml -``` - -If `datadogagent.datadoghq.com/datadog configured` — continue to Step 2. - -ERROR: Validation error — check YAML. 
`enabledNamespaces` and `disabledNamespaces` cannot both be set. - ---- - -## Step 2: Inform the User About Unified Service Tags - -> **Do NOT modify application Deployments without explicit user confirmation.** Applying labels to existing application workloads is a change to customer-managed resources. - -Inform the user that adding Unified Service Tags (UST) to their Deployments will enable proper service/env/version tagging in Datadog. UST labels are not required for SSI to work, but they are recommended for full observability: - -```yaml -# Add to both metadata.labels and spec.template.metadata.labels -tags.datadoghq.com/env: "<ENV>" -tags.datadoghq.com/service: "<SERVICE_NAME>" -tags.datadoghq.com/version: "<VERSION>" -``` - -If the user wants you to apply these, get their confirmation first. UST labels are not required for APM traces to flow — SSI works without them. - ---- - -## Step 3: Restart Application Pods - -> **Confirm with the user before restarting.** Tell the user: "I need to restart `<DEPLOYMENT_NAME>` in `<APP_NAMESPACE>` for SSI to inject into the pods. This will cause a brief outage. Ready to proceed?" Wait for confirmation. - -### Claude runs - -```bash -kubectl rollout restart deployment/<DEPLOYMENT_NAME> -n <APP_NAMESPACE> - -kubectl wait --for=condition=Ready pod \ - -l app=<APP_LABEL> \ - -n <APP_NAMESPACE> \ - --timeout=120s -``` - -If pods restart cleanly, init containers named `datadog-lib-<language>-init` will be visible in the pod spec. - -ERROR: Pods crash-looping — check for existing custom instrumentation. See `troubleshoot-ssi`. - ---- - -## Done - -Exit when ALL of the following are true: -- [ ] `features.apm.instrumentation` is present in the applied `DatadogAgent` manifest -- [ ] User has been informed that they need to restart their application pods -- [ ] User has been informed about Unified Service Tags (UST) and how to apply them if desired -- [ ] Scope confirmed: which workloads are instrumented, which were skipped and why - -Automatically proceed to `verify-ssi` now — do not ask the user for permission. 
- ---- - -## Security constraints - -- Never write a raw API key into any file or chat message -- Never use namespace `default` for Datadog resources -- Never modify `admissionController` settings directly — SSI manages this via the Operator -- Do not add APM config to application manifests — configure only via `DatadogAgent` -- Exception: UST labels (`tags.datadoghq.com/*`) on application Deployments are permitted and intentional — they are recommended, not required for SSI, and are applied only after user confirmation -- Never run `kubectl delete` without user confirmation -- `docker push` to a registry always requires user confirmation -- **Never use `kubectl patch` to apply UST labels or any Deployment changes.** Always edit the Deployment YAML file and `kubectl apply -f`. Changes made with `kubectl patch` exist only on the live object, drift from the manifest, and will be overwritten the next time the manifest is applied. diff --git a/crates/agent-skills/skills/dd-apm/k8s-ssi/onboarding-summary/SKILL.md b/crates/agent-skills/skills/dd-apm/k8s-ssi/onboarding-summary/SKILL.md deleted file mode 100644 index a9a0b07..0000000 --- a/crates/agent-skills/skills/dd-apm/k8s-ssi/onboarding-summary/SKILL.md +++ /dev/null @@ -1,129 +0,0 @@ ---- -name: onboarding-summary -description: Generate a live Single Step Instrumentation (SSI) onboarding confirmation report — verifies APM instrumentation is working end-to-end with deep links into the Datadog UI. Only use after agent-install and enable-ssi have both completed successfully. -metadata: - version: "1.0.0" - author: datadog-labs - repository: https://github.com/datadog-labs/agent-skills - tags: datadog,apm,kubernetes,ssi,summary,verification - alwaysApply: "false" ---- - -# APM Onboarding Summary - -## Triggers - -Invoke this skill when: -- All steps in `verify-ssi` have passed -- All checks in `troubleshoot-ssi` have been resolved -- The user asks "is everything working?", "show me the status", or "confirm APM is set up" - -Do NOT invoke this skill if any verification or troubleshooting check is still failing — resolve those first. 
- ---- - -## Context to resolve before acting - -| Variable | How to resolve | -|---|---| -| `AGENT_NAMESPACE` | Namespace where Datadog Agent is installed | -| `APP_NAMESPACE` | Namespace of the application | -| `APP_LABEL` | Check `spec.selector.matchLabels.app` in the Deployment manifest | -| `CLUSTER_NAME` | `spec.global.clusterName` in `datadog-agent.yaml` | -| `SERVICE_NAME` | `tags.datadoghq.com/service` label on the Deployment | -| `ENV` | `tags.datadoghq.com/env` label on the Deployment | -| `DD_SITE` | `spec.global.site` in `datadog-agent.yaml` | - ---- - -## Prerequisites - -### Claude runs - -```bash -pup auth status --site -``` - -If valid token — proceed. - -ERROR: Not authenticated: - -### Claude runs - -```bash -pup auth login --site -``` - -> This opens a browser tab for OAuth. Complete the login there — Claude will continue once the command exits. - ---- - -## Collect live confirmation data - -Run all of the following. Each populates a row in the final report. - -### Claude runs - -```bash -# Agent pod count and status -kubectl get pods -n \ - -l app.kubernetes.io/component=agent \ - --no-headers - -# SSI instrumentation config live in cluster -kubectl get datadogagent datadog -n \ - -o jsonpath='{.spec.features.apm.instrumentation}' - -# Init container confirmed in app pod spec -kubectl get pod -l app= -n \ - -o jsonpath='{.items[0].spec.initContainers[*].name}' - -# Pod confirmed instrumented — init containers in pod spec -kubectl get pod -l app= -n \ - -o jsonpath='{.items[0].spec.initContainers[*].name}' - -# Service visible and traced in APM -DD_SITE= pup apm services list --env --from 1h - -# Traces arriving in the last hour -DD_SITE= pup traces search --query "service:" --from 1h --limit 5 -``` - ---- - -## Present the report - -Fill in every value from live command output. Do not leave any placeholder unfilled. If a value cannot be confirmed, mark that row as failed and link to `troubleshoot-ssi`. 
- ---- - -**APM onboarding complete** - -| Check | Detail | Status | -|---|---|---| -| Datadog Agent | `` pod(s) Running in `` | OK | -| SSI enabled | Targeting namespace ``, language `` v`` | OK | -| Init container injected | `datadog-lib--init` present in pod spec | OK | -| Pod instrumented | `` in `pup fleet instrumented-pods list` | OK | -| Tracer reporting | Service ``, ``, tracer v`` | OK | -| APM service visible | `` in env `` | OK | -| Traces arriving | `` trace(s) found in the last hour | OK | - ---- - -**Your service in Datadog — click to open:** - -Construct each URL by substituting real values. Do not print placeholder URLs. - -| View | URL | -|---|---| -| Service overview | `https://app./apm/services/?env=` | -| Traces explorer | `https://app./apm/traces?query=service:%20env:` | -| Service map | `https://app./apm/map?env=&service=` | -| Agent fleet | `https://app./fleet-automation` | - ---- - -## Security constraints - -- Never write a raw API key into any file or chat message diff --git a/crates/agent-skills/skills/dd-apm/k8s-ssi/troubleshoot-ssi/SKILL.md b/crates/agent-skills/skills/dd-apm/k8s-ssi/troubleshoot-ssi/SKILL.md deleted file mode 100644 index 134970a..0000000 --- a/crates/agent-skills/skills/dd-apm/k8s-ssi/troubleshoot-ssi/SKILL.md +++ /dev/null @@ -1,405 +0,0 @@ ---- -name: troubleshoot-ssi -description: Diagnose and fix Single Step Instrumentation (SSI) issues on Kubernetes — SSI automatically instruments applications for APM without code changes. Only use if the agent and SSI are already configured but traces are missing or instrumentation is not working. 
-metadata: - version: "1.0.0" - author: datadog-labs - repository: https://github.com/datadog-labs/agent-skills - tags: datadog,apm,kubernetes,ssi,troubleshooting,instrumentation - alwaysApply: "false" ---- - -# Troubleshoot APM SSI on Kubernetes - -## Triggers - -Invoke this skill when the user expresses intent to: -- Debug why a pod is not being instrumented -- Investigate why traces are not appearing in Datadog -- Diagnose admission webhook or init container injection failures -- Follow up on failed checks from `verify-ssi` -- Report that a specific service or pod has no traces - -Do NOT invoke this skill if: -- SSI has not been enabled yet — run `enable-ssi` first - ---- - -## Prerequisites - -- [ ] kubectl configured to target cluster — `kubectl config current-context` - -### pup-cli: check, install, and authenticate - -### Claude runs - -```bash -pup --version -``` - -If not found, install it (OS-aware): - -### Claude runs - -```bash -if [[ "$(uname)" == "Darwin" ]]; then - brew tap datadog-labs/pack && brew install pup -else - PUP_VERSION=$(curl -s https://api.github.com/repos/datadog-labs/pup/releases/latest | grep '"tag_name"' | cut -d'"' -f4) - curl -L "https://github.com/datadog-labs/pup/releases/download/${PUP_VERSION}/pup_linux_amd64.tar.gz" | tar xz -C /usr/local/bin pup - chmod +x /usr/local/bin/pup -fi -pup --version -``` - -Check auth: -```bash -pup auth status -``` - -If not authenticated: - -### Claude runs - -```bash -pup auth login -``` - -> This opens a browser tab for OAuth. Complete the login there — Claude will continue once the command exits. - -If no browser available: `export DD_APP_KEY=`. 
- ---- - -## Context to resolve before acting - -| Variable | How to resolve | -|---|---| -| `AGENT_NAMESPACE` | Namespace where Datadog Agent is installed | -| `APP_NAMESPACE` | Namespace of the application with missing traces | -| `CLUSTER_NAME` | `kubectl config current-context` or `spec.global.clusterName` in `datadog-agent.yaml` | -| `SERVICE_NAME` | `tags.datadoghq.com/service` label on the Deployment, or ask the user | -| `ENV` | `tags.datadoghq.com/env` label on the Deployment, or ask the user | -| `POD_NAME` | `kubectl get pods -n ` — use the specific pod the user mentioned | -| `DEPLOYMENT_NAME` | Check `metadata.name` in the Deployment manifest, or ask the user | -| `APP_LABEL` | Check `spec.selector.matchLabels.app` in the Deployment manifest | - ---- - -## How SSI Works — Domain Knowledge - -Read this before investigating. It gives you the mental model to reason about novel failures, not just known ones. - -**Injection chain:** -1. Admission webhook (registered by Cluster Agent) intercepts pod creation -2. Webhook mutates the pod spec — adds a `datadog-lib--init` init container -3. Init container downloads the tracer library onto a shared volume -4. `LD_PRELOAD` env var is set pointing to the library `.so` file -5. Application process loads the library automatically on startup via `LD_PRELOAD` - -**What each diagnostic layer can see:** -- **pup** — sees what Datadog's backend received. Blind to cluster-side injection failures. If pup shows no instrumented pods, the problem is in the cluster. -- **kubectl** — sees cluster state. Blind to whether data reached Datadog. If kubectl shows the init container but pup shows no traces, the problem is post-injection. 
- -**What healthy looks like:** -- `pup fleet instrumented-pods list` shows the pod with correct language/version -- `pup fleet tracers list` shows the service as active -- `kubectl get pod -o jsonpath='{.spec.initContainers[*].name}'` includes `datadog-lib--init` - -**Known silent failures — SSI produces no error when these occur:** -- **Existing ddtrace or OTel instrumentation** — SSI detects it and silently disables itself -- **Unsupported runtime version** — silently skipped -- **`admission.datadoghq.com/enabled: "false"` annotation** — webhook skips the pod entirely -- **Pod not restarted after SSI enabled** — injection happens at startup; existing pods keep running uninstrumented -- **Pod in Agent namespace** — SSI never instruments its own namespace - -**Reasoning shortcuts:** -- No init container → webhook didn't fire → check: namespace targeting, pod-selector, opt-out annotation, webhook registration, pod not restarted -- Init container present + no traces → injection attempted but failed or tracer not reporting → check: existing ddtrace, runtime version, Agent connectivity, DD_SITE mismatch - ---- - -## Step 1: Triage - -Run all seven simultaneously and surface them back to the user as the diagnostics you're running. Everything after this is driven by what you find here. Resolve `` from `kubectl get pod -n -o jsonpath='{.spec.nodeName}'` once you have a pod name; if no pod context yet, run the `pup` commands without `--hostname` first. 
- -### Claude runs - -```bash -pup traces search --query "service:" --from 1h --limit 5 -pup fleet instrumented-pods list -pup apm troubleshooting list --hostname --timeframe 1h -pup apm service-library-config get --service-name --env -kubectl get pod -n \ - -o jsonpath='{.spec.initContainers[*].name}' -kubectl describe pod -n | grep -A 10 "Events:" -kubectl get mutatingwebhookconfigurations | grep datadog -``` - -The last command confirms the Admission Controller webhook is registered cluster-wide — this is the precondition for SSI injection working at all and must be checked even when most other services are being instrumented (any deviation in one webhook config can silently skip a subset of pods). - -`pup apm troubleshooting list` surfaces injection errors that Datadog's backend received from the cluster — these point to cluster-side mutation failures that may not be visible from `kubectl describe` alone. `pup apm service-library-config get` shows the runtime SDK config the tracer is operating under; an empty result with `ddTraceConfigs` configured, or unexpected values, points to UST/config-propagation issues. - ---- - -## Step 2: State Your Hypotheses - -Before investigating, explicitly state your ranked hypotheses based on triage output. Do not skip this step. - -**When the user reports multiple affected services in the same namespace, diagnose each independently.** Two pods can fail injection for entirely different reasons (one opt-out annotation, one missing namespace label, one with pre-existing ddtrace). Do not assume a shared root cause — investigate each service's pod spec, annotations, and runtime separately and surface findings per-service. - -| Triage signal | Strong hypothesis | -|---|---| -| Traces arriving + pod in instrumented list | Not a real problem — likely a UI filter or time window. 
Tell the user and stop | -| No traces + pod NOT in instrumented list + no init container | Injection never happened — investigate: namespace targeting, webhook, pod-selector, opt-out annotation, pod not restarted | -| No traces + pod NOT in instrumented list + init container present | Injection attempted but failed — check `pup apm troubleshooting list` for injection errors | -| No traces + pod in instrumented list + init container present | Tracer injected but not reporting — investigate: connectivity, DD_SITE, API key | -| Pod events show CrashLoopBackOff or init container errors | Init container failure — check existing ddtrace, runtime version | -| Traces arriving but wrong service/env | UST labels missing or misconfigured on the Deployment | - -State your top 1-3 hypotheses explicitly: *"Based on triage, I think the most likely cause is X because Y."* - ---- - -## Step 3: Investigate - -Use only the tools relevant to your hypotheses. Each observation informs your next action. - ---- - -### Cluster-side investigation tools - -**Is the pod in the Agent namespace?** -SSI never instruments pods in the same namespace as the Datadog Agent. -```bash -kubectl get pods -n -``` - -**Were pods restarted after SSI was enabled?** - -> **Confirm with the user before restarting.** Tell the user: "Pods must be restarted for SSI to inject into them. I'll restart `` in ``. Ready to proceed?" Wait for confirmation. - -### Claude runs - -```bash -kubectl rollout restart deployment/ -n -kubectl wait --for=condition=Ready pod -l app= -n --timeout=120s -``` - -### Claude runs - -```bash -pup fleet instrumented-pods list -``` - -**Does the namespace carry the Admission Controller opt-in label?** -When the Admission Controller runs with `mutateUnlabelled: false`, injection happens only in namespaces explicitly labeled `admission.datadoghq.com/mutate-pods=true`. 
A namespace missing this label silently has SSI skipped for every pod in it — a common cause when most cluster services are instrumented but one namespace's services aren't. - -```bash -kubectl get namespace -o jsonpath='{.metadata.labels}' -kubectl get namespace --show-labels -``` - -Fix: label the namespace, then restart the affected deployments so the AC mutates them on pod recreate. -```bash -kubectl label namespace admission.datadoghq.com/mutate-pods=true -``` - -**Is namespace targeting filtering the pod out?** -```bash -kubectl get datadogagent datadog -n -o yaml | grep -A 15 instrumentation -``` -Fix: update `enabledNamespaces` in `datadog-agent.yaml`. - -### Claude runs - -```bash -kubectl apply -f datadog-agent.yaml -``` - -**Is a `podSelector` target filtering the pod out?** -If `targets` with `podSelector` is configured, only pods whose labels match the selector are instrumented. Check whether the app pod's labels match any target: -```bash -kubectl get datadogagent datadog -n -o yaml | grep -A 20 targets -kubectl get pod -n --show-labels -``` -Fix: add a matching label to the pod template, or broaden the `podSelector`, then apply and restart. - -**Is a pod annotation opting it out — or missing the AC's injection-success annotation?** -Two annotations to look for: -- `admission.datadoghq.com/enabled: "false"` — explicit opt-out, AC skips the pod. -- `admission.datadoghq.com/status: injected` — set by the AC after successful mutation; its **absence** on a running pod is positive evidence the AC never mutated it. - -```bash -kubectl get pod -n -o jsonpath='{.metadata.annotations}' -kubectl get pod -n -o yaml | grep -A 10 annotations -``` -Fix: remove an opt-out annotation from the Deployment pod template, then apply and restart. - -**Are the expected `DD_*` environment variables present in the running pod?** -SSI injects `DD_SERVICE`, `DD_ENV`, `DD_VERSION`, `DD_TRACE_*`, and `LD_PRELOAD` into the container env when it mutates a pod. 
Their absence confirms the mutation did not run; their presence with unexpected values points to UST label mismatches or `ddTraceConfigs` issues. - -```bash -kubectl exec -n -- env | grep -E '^(DD_|LD_PRELOAD)' -kubectl describe pod -n | grep -E 'DD_|LD_PRELOAD' -``` - -### Claude runs - -```bash -kubectl apply -f -``` - -> **Confirm with the user before restarting.** Tell the user: "I need to restart `` in `` for this change to take effect. Ready to proceed?" Wait for confirmation. - -### Claude runs - -```bash -kubectl rollout restart deployment/ -n -``` - -**Does the app have existing custom instrumentation?** -SSI silently disables itself when it detects existing tracer code. Scan source files for: -- Python: `import ddtrace`, `ddtrace.patch_all()` -- Node.js: `require('dd-trace')`, `DD.init()` -- Java: `GlobalTracer.register(`, `dd-java-agent` -- .NET: `Tracer.Instance`, `DD.Trace` -- Ruby: `require 'ddtrace'`, `Datadog.configure` -- PHP: `DDTrace\` - -Also check dependency manifests: `requirements.txt`, `package.json`, `Gemfile`, `pom.xml`. - -Fix: remove the import/package, rebuild image, reload into cluster, restart pod. - -**Is the base image Alpine (musl libc)?** -K8s SSI injects `LD_PRELOAD` as an environment variable into the pod — it does not rely on `/etc/ld.so.preload`, so musl/Alpine images are supported. This is not a blocker for Kubernetes SSI. - -**Is the runtime version supported?** -```bash -kubectl exec -n -- python --version -kubectl exec -n -- node --version -kubectl exec -n -- java -version -``` -Verify against [SSI compatibility matrix](https://docs.datadoghq.com/tracing/trace_collection/automatic_instrumentation/single-step-apm/compatibility/). 
- -**Is the admission webhook registered?** -```bash -kubectl get mutatingwebhookconfigurations | grep datadog -kubectl get pods -n -l app=datadog-cluster-agent -kubectl logs -n -l app=datadog-cluster-agent --tail=100 -``` - -**Did injection produce errors?** -Get the node hostname first, then query Datadog for injection errors: -```bash -kubectl get pod -n -o jsonpath='{.spec.nodeName}' -pup apm troubleshooting list --hostname --timeframe 1h -``` - -**Is the Agent sending data to Datadog?** -```bash -kubectl exec -n \ - $(kubectl get pod -n -l app=datadog-agent -o name | head -1) \ - -- agent status | grep -A 5 "APM Agent" -``` - ---- - -### Datadog-side investigation tools - -**Is the tracer reporting?** -```bash -pup fleet tracers list --filter "service:" -``` - -**Does APM recognise the service?** -```bash -pup apm services list --env -``` - -**What SDK configuration is the service running with?** -Shows env vars the tracer is configured with (e.g. `DD_TRACE_ENABLED`, `DD_SERVICE`, `DD_ENV`, sampling rules). Empty output is expected if `ddTraceConfigs` was not set in `enable-ssi`; a populated output mismatching what was configured indicates the change didn't propagate. -```bash -pup apm service-library-config get --service-name --env -``` - -**Are traces arriving?** -```bash -pup traces search --query "service:" --from 1h --limit 10 -``` - -**Which agent is the tracer connected to?** -Use if connectivity between tracer and Agent is suspected. -```bash -pup fleet agents list --filter "hostname:" -pup fleet agents tracers --filter "service:" -``` - ---- - -## Step 4: Reflect Before Concluding - -Before applying any fix, answer: -1. What evidence confirms my hypothesis? -2. What evidence would contradict it — and have I checked? -3. Is there a simpler explanation I haven't considered? - -If the conclusion doesn't hold up, return to Step 2 with new hypotheses. Keep iterating until you can defend the conclusion against all three questions. 
- ---- - -## Step 5: Fix - -Apply the fix for the confirmed root cause. If the fix requires a code or Dockerfile change, rebuild and reload: - -### Claude runs - -```bash -docker build -f -t -``` - -[DECISION: cluster type] -- kind (local): load the image into the cluster - -### Claude runs - -```bash -kind load docker-image --name -``` - -- Registry-based: skip — image will be pulled on next deployment - -> **Confirm with the user before restarting.** Tell the user: "I need to restart `` in `` to apply the fix. Ready to proceed?" Wait for confirmation. - -### Claude runs - -```bash -kubectl rollout restart deployment/ -n -kubectl wait --for=condition=Ready pod -l app= -n --timeout=120s -``` - ---- - -## Step 6: Verify - -Re-run triage to confirm the fix worked: - -### Claude runs - -```bash -pup traces search --query "service:" --from 1h --limit 5 -pup fleet instrumented-pods list -``` - -If traces are arriving and the pod is in the instrumented list — resolved. Automatically proceed to `onboarding-summary` now — do not ask the user for permission. - -ERROR: Still not resolved — return to Step 2 with the new triage data and form updated hypotheses. - ---- - -## Security constraints - -- Never write a raw API key into any file or chat message -- Never run `kubectl delete` without user confirmation -- Never modify `admissionController` settings directly -- `docker push` to a registry always requires user confirmation diff --git a/crates/agent-skills/skills/dd-apm/k8s-ssi/verify-ssi/SKILL.md b/crates/agent-skills/skills/dd-apm/k8s-ssi/verify-ssi/SKILL.md deleted file mode 100644 index 5b3a155..0000000 --- a/crates/agent-skills/skills/dd-apm/k8s-ssi/verify-ssi/SKILL.md +++ /dev/null @@ -1,159 +0,0 @@ ---- -name: verify-ssi -description: Verify Single Step Instrumentation (SSI) is working end-to-end on Kubernetes — SSI automatically instruments applications for APM without code changes. Only use after enable-ssi has run. 
-metadata: - version: "1.0.0" - author: datadog-labs - repository: https://github.com/datadog-labs/agent-skills - tags: datadog,apm,kubernetes,ssi,verification,instrumentation - alwaysApply: "false" ---- - -# Verify APM SSI on Kubernetes - -> **Before doing anything else:** Fully resolve all variables in `## Context to resolve before acting`. Do not begin Step 1 until every variable has a concrete value. - -## Triggers - -Invoke this skill when the user expresses intent to: -- Confirm SSI is working after enabling APM -- Check whether pods are being instrumented -- Verify the tracer is running and reporting telemetry -- Confirm tracer config is applied correctly - -Do NOT invoke this skill if: -- SSI has not been enabled yet — run `enable-ssi` first -- Pods are not being instrumented at all — use `troubleshoot-ssi` - ---- - -## Prerequisites - -- [ ] `enable-ssi` is complete -- [ ] Application pods have been restarted since SSI was enabled - -### pup-cli: check, install, and authenticate - -### Claude runs - -```bash -pup --version -``` - -If not found: - -### Claude runs - -```bash -brew tap datadog-labs/pack -brew install pup -``` - -Check auth: -```bash -pup auth status --site -``` - -If not authenticated: - -### Claude runs - -```bash -pup auth login --site -``` - -> This opens a browser tab for OAuth. Complete the login there — Claude will continue once the command exits. - -If valid token — proceed. 
-ERROR: No browser available — use API key fallback: `export DD_APP_KEY=` - ---- - -## Context to resolve before acting - -| Variable | How to resolve | -|---|---| -| `CLUSTER_NAME` | Check `spec.global.clusterName` in `datadog-agent.yaml`, or `kubectl config current-context` | -| `ENV` | Check `tags.datadoghq.com/env` label on the application Deployment | -| `SERVICE_NAME` | Check `tags.datadoghq.com/service` label on the application Deployment | - ---- - -## Step 1: Confirm Pods are Instrumented - -### Claude runs - -```bash -kubectl get pod -l app= -n \ - -o jsonpath='{.items[0].spec.initContainers[*].name}' -``` - -If the output includes `datadog-lib--init` and `datadog-init-apm-inject` — SSI init containers are injected. - -ERROR: Init containers missing — pod was not restarted after SSI was enabled, or namespace targeting is not matching. Restart the pod and recheck. - ---- - -## Step 2: Confirm the Tracer is Reporting Telemetry - -### Claude runs - -```bash -DD_SITE= pup apm services list --env --from 1h -``` - -If `` appears in the services list with `isTraced: true` — continue to Step 3. - -ERROR: Service missing — send some traffic to the app first, then retry: - -### Claude runs - -```bash -# Port-forward and send test traffic -kubectl port-forward deployment/ 8099:8000 -n & -sleep 2 && for i in $(seq 1 10); do curl -s -o /dev/null http://localhost:8099/; done -sleep 30 && kill %1 2>/dev/null -DD_SITE= pup apm services list --env --from 10m -``` - -ERROR: Still missing after traffic — check the agent's trace receiver: `kubectl exec -n -c agent -- agent status | grep -A 10 "Receiver (previous minute)"`. If receiver shows 0 traces, go to `troubleshoot-ssi`. - ---- - -## Step 3: Confirm Tracer Configuration - -**Only run this step if `ddTraceConfigs` was explicitly configured in `enable-ssi`** (e.g. profiling, AppSec, Data Streams). If basic SSI was set up without `ddTraceConfigs`, skip this step — an empty response here is expected and not a failure. 
- -### Claude runs - -```bash -pup apm service-library-config get \ - --service-name \ - --env -``` - -If the output shows expected environment variables matching what was configured in `ddTraceConfigs` — done. - -If the output is empty and `ddTraceConfigs` was not configured — expected, not a failure. - -ERROR: Config missing but `ddTraceConfigs` was configured — check it is present in the `DatadogAgent` manifest under the correct target, and that pods were restarted after the config change. - ---- - -## Done - -Exit when ALL of the following are true: -- [ ] Step 1: target pods list the SSI init containers in `spec.initContainers` -- [ ] Step 2: service appears in `pup apm services list` with `isTraced: true` -- [ ] Step 3: tracer config matches what was set in `DatadogAgent` (skipped if `ddTraceConfigs` was not configured) - -If any check fails, go to `troubleshoot-ssi`. - -When all steps pass, automatically proceed to `onboarding-summary` now — do not ask the user for permission. - ---- - -## Security constraints - -- Never write a raw API key into any file or chat message -- Never run `kubectl delete` without user confirmation diff --git a/crates/agent-skills/skills/dd-apm/linux-ssi/agent-install/SKILL.md b/crates/agent-skills/skills/dd-apm/linux-ssi/agent-install/SKILL.md deleted file mode 100644 index b995617..0000000 --- a/crates/agent-skills/skills/dd-apm/linux-ssi/agent-install/SKILL.md +++ /dev/null @@ -1,280 +0,0 @@ ---- -name: agent-install -description: Install the Datadog Agent on Linux hosts via SSH with Single Step Instrumentation (SSI) enabled — SSI automatically instruments applications for APM without code changes. Only use if no agent is installed yet. -metadata: - version: "1.0.0" - author: datadog-labs - repository: https://github.com/datadog-labs/agent-skills - tags: datadog,apm,linux,agent,install,ssi,ssh - alwaysApply: "false" ---- - -# Install Datadog Agent on Linux - -> **Before doing anything else:** Fully resolve all variables in `## Context to resolve before acting`. 
Do not begin Step 1 until every variable has a concrete value. - -## Triggers - -Invoke this skill when the user expresses intent to: -- Install the Datadog Agent on Linux hosts or VMs -- Set up Datadog monitoring on bare-metal or cloud Linux instances -- Prepare Linux hosts for APM onboarding - -Do NOT invoke this skill if: -- The Agent is already installed on all hosts — check with `datadog-agent status` first -- The target is a Kubernetes cluster — use `dd-apm-k8s-agent-install` instead - ---- - -## Phase 0: Load Credentials - -```bash -[ -f environment ] && source environment -echo "DD_API_KEY set: $([ -n "${DD_API_KEY:-}" ] && echo yes || echo no)" -echo "DD_SITE: ${DD_SITE:-not set}" -``` - -**If `DD_API_KEY` is already set** — proceed directly to gathering infrastructure info. - -**If `DD_API_KEY` is not set** — tell the user: - -> Please run the following in this chat to set your credentials (the `!` prefix executes it in this session): -> ``` -> ! export DD_API_KEY=your-api-key-here -> ! export DD_SITE=datadoghq.com -> ``` - -Wait for the user to run the commands, then re-run the check above before continuing. - ---- - -## Phase 1: Gather Infrastructure Info - -Only do this phase if the user hasn't already provided the information. If SSH credentials are known, skip to Phase 2. - -Ask the user: -1. **Which hosts** need the agent? Get a list of IPs or hostnames. -2. **How do I SSH to them?** Get the SSH user, key path, and any jump host or bastion configuration. -3. **Do any hosts already have the Datadog Agent installed?** If so, skip install for those hosts and go straight to `verify-ssi`. - -### Claude runs - -Verify SSH works for each host before proceeding: - -```bash -ssh -o StrictHostKeyChecking=no -i @ "hostname" -``` - -If it returns a hostname — proceed. -ERROR: Connection refused or timeout — resolve connectivity before continuing. - -Once SSH is confirmed, present a plan to the user before proceeding. 
For example: - -``` -Here's what I'm going to do: - 1. Install the Datadog Agent with SSI on: , , ... - 2. Verify each agent is running and healthy - 3. Discover services on each host that need restarting for SSI to take effect - 4. After you restart services, verify instrumentation is working - -Ready to proceed? -``` - -Wait for user confirmation before starting installs. - ---- - -## Prerequisites - -**Per host — check before installing:** - -### Claude runs - -```bash -ssh -o StrictHostKeyChecking=no -i @ \ - "uname -m && cat /etc/os-release | grep -E '^(ID|VERSION_ID|PRETTY_NAME)='" -``` - -If architecture is `x86_64` or `aarch64`, and the OS is a supported distribution (Ubuntu 16.04+, Debian 9+, RHEL/CentOS 6-9, Amazon Linux 2/2023, SUSE 12+) — proceed. - -ERROR: Architecture is `armv7l` (32-bit ARM) or unsupported OS — stop. Datadog Agent 7 and SSI do not support this configuration. - ---- - -## Context to resolve before acting - -| Variable | How to resolve | -|---|---| -| `DD_API_KEY` | Check `echo $DD_API_KEY` first — if set, use it. Otherwise ask the user for their API key from Datadog UI: Organization Settings → API Keys. Never log or print the key. | -| `DD_SITE` | Check `echo $DD_SITE` first — if set, use it. Otherwise ask the user. Default: `datadoghq.com`. Options: `datadoghq.com`, `us3.datadoghq.com`, `us5.datadoghq.com`, `datadoghq.eu`, `ap1.datadoghq.com` | -| `SSH_KEY` | Ask the user for the path to their SSH private key, or check `CLAUDE.md` | -| `SSH_USER` | Ask the user for the SSH username. Default: `root` | -| `SSH_HOST` | Ask the user for the hostname or IP of the target host | -| `SSH_PORT` | Ask the user for the SSH port. Default: `22` | - ---- - -## Phase 2: Install the Datadog Agent with SSI - -Run for each host that does not already have the agent installed. 
- -### Claude runs - -```bash -ssh -o StrictHostKeyChecking=no -i @ \ - "DD_API_KEY=${DD_API_KEY} DD_SITE=${DD_SITE} DD_APM_INSTRUMENTATION_ENABLED=host bash -c \"\$(curl -L https://install.datadoghq.com/scripts/install_script_agent7.sh)\"" -``` - -`DD_APM_INSTRUMENTATION_ENABLED=host` causes the install script to also install `datadog-apm-inject` and language library packages under `/opt/datadog-packages/` in one pass. - -If the script completes without errors — proceed to Phase 3. - -ERROR: `curl: command not found`: -```bash -ssh -o StrictHostKeyChecking=no -i @ \ - "apt-get install -y curl 2>/dev/null || yum install -y curl" -``` - -ERROR: Permission error — ensure the SSH user has sudo access. The install script requires root. - -ERROR: Script fails with GPG key error — retry; if it persists, check the host's DNS resolution for `keys.datadoghq.com`. - ---- - -## Phase 3: Verify the Agent is Running and Healthy - -### Claude runs - -```bash -ssh -o StrictHostKeyChecking=no -i @ \ - "sudo datadog-agent status 2>&1 | head -40" -``` - -Healthy output shows: -- `Agent (v7.XX.X)` with `Status: Running` -- `API Keys status: API Key ending with XXXX: Valid` - -ERROR: `command not found` — installation did not complete. Re-run Phase 2. 
- -ERROR: `API key invalid` — update and restart: -```bash -ssh -o StrictHostKeyChecking=no -i @ \ - "sudo sed -i 's/^api_key:.*/api_key: /' /etc/datadog-agent/datadog.yaml && \ - (sudo systemctl restart datadog-agent 2>/dev/null || sudo service datadog-agent restart)" -``` - -ERROR: Agent service not running: -```bash -ssh -o StrictHostKeyChecking=no -i @ \ - "sudo systemctl start datadog-agent 2>/dev/null && sudo systemctl enable datadog-agent 2>/dev/null || sudo service datadog-agent start" -``` - -**Verify APM inject packages are present on disk** (not just registered): -```bash -ssh -o StrictHostKeyChecking=no -i @ \ - "ls /opt/datadog-packages/ && sudo datadog-installer status 2>/dev/null | grep apm | head -10" -``` - -If `/opt/datadog-packages/datadog-apm-inject` exists — injection is available. - -ERROR: Directory missing or empty — `datadog-installer status` may show the package as registered while its directory is actually empty (stale registration). Reinstall: -```bash -ssh -o StrictHostKeyChecking=no -i @ \ - "sudo datadog-installer remove datadog-apm-inject && \ - DD_API_KEY=${DD_API_KEY} DD_SITE=${DD_SITE} DD_APM_INSTRUMENTATION_ENABLED=host bash -c \"\$(curl -L https://install.datadoghq.com/scripts/install_script_agent7.sh)\"" -``` - -**Verify hostname registration** — the Agent must resolve and register its hostname for the host to appear in Datadog. DNS lookup failures are common in containers and minimal VMs: - -```bash -ssh -o StrictHostKeyChecking=no -i @ \ - "sudo datadog-agent status 2>&1 | grep -iE '^\s+Hostname' | head -3" -``` - -If `Hostname: ` is shown — hostname resolved. Record this as `DD_HOSTNAME` for all subsequent steps. - -ERROR: `Hostname: (none)` or any DNS resolution error — the agent can't resolve its own FQDN. 
Fix by setting the hostname explicitly in `datadog.yaml`: - -```bash -# Read the actual system hostname -ACTUAL_HOSTNAME=$(ssh -o StrictHostKeyChecking=no -i @ "hostname") - -# Append to datadog.yaml only if not already set -ssh -o StrictHostKeyChecking=no -i @ \ - "grep -q '^hostname:' /etc/datadog-agent/datadog.yaml || \ - echo \"hostname: ${ACTUAL_HOSTNAME}\" | sudo tee -a /etc/datadog-agent/datadog.yaml" - -# Restart the Agent -ssh -o StrictHostKeyChecking=no -i @ \ - "sudo systemctl restart datadog-agent 2>/dev/null || sudo service datadog-agent restart" - -# Confirm hostname is now registered -ssh -o StrictHostKeyChecking=no -i @ \ - "sudo datadog-agent status 2>&1 | grep -iE '^\s+Hostname' | head -2" -``` - ---- - -## Phase 4: Discover Services That Need Restarting - -SSI only injects into processes at startup. Existing processes keep running uninstrumented until restarted. Discover what's running so the user knows what to restart. - -### Claude runs - -```bash -ssh -o StrictHostKeyChecking=no -i @ \ - "sudo ss -lntp 2>/dev/null || sudo netstat -tlnp 2>/dev/null || cat /proc/net/tcp" -``` - -For each application-level listener (ignore sshd, systemd, chronyd): - -```bash -ssh -o StrictHostKeyChecking=no -i @ " -# Command line of the process -sudo cat /proc//cmdline | tr '\0' ' ' -# Service manager (may not be available in all environments) -sudo systemctl status 2>/dev/null | head -3 || true -# Parent process -PPID=\$(sudo awk '/PPid/ {print \$2}' /proc//status) -sudo cat /proc/\$PPID/cmdline | tr '\0' ' ' -" -``` - -Present findings to the user: - -``` -I found the following application services on : - - Port 8080 — PID 1234 — /usr/bin/python3 /app/server.py - Managed by: systemd unit flask-app.service - - Port 3000 — PID 5678 — node /app/server.js - Managed by: supervisord - -These services need to be restarted for Datadog SSI to inject into them. 
-Restart them however is appropriate for your environment, then let me know -and I'll verify the instrumentation. -``` - -**Do not offer to restart services. Do not restart services unless the user explicitly asks.** - ---- - -## Done - -Exit when ALL of the following are true: -- [ ] Agent running on each target host (`datadog-agent status` shows Running, API key valid) -- [ ] `/opt/datadog-packages/datadog-apm-inject` exists on disk on each host -- [ ] User has been informed which services need restarting -- [ ] User has confirmed they are ready to restart services - -Automatically proceed to `enable-ssi` (if services need UST labels configured) or `verify-ssi` (if services have already been restarted) — do not ask the user for permission. - ---- - -## Security constraints - -- Never write a raw API key into any file or chat message -- Never store `DD_API_KEY` in shell history — pass it inline in the SSH command only -- If the user's API key appears in any output, redact it before displaying -- Always confirm before restarting production services diff --git a/crates/agent-skills/skills/dd-apm/linux-ssi/enable-ssi/SKILL.md b/crates/agent-skills/skills/dd-apm/linux-ssi/enable-ssi/SKILL.md deleted file mode 100644 index bcc5941..0000000 --- a/crates/agent-skills/skills/dd-apm/linux-ssi/enable-ssi/SKILL.md +++ /dev/null @@ -1,235 +0,0 @@ ---- -name: enable-ssi -description: Configure Unified Service Tags and verify Single Step Instrumentation (SSI) injection on Linux hosts — SSI automatically instruments applications for APM without code changes. Only use if the Datadog Agent is already installed. -metadata: - version: "1.0.0" - author: datadog-labs - repository: https://github.com/datadog-labs/agent-skills - tags: datadog,apm,linux,ssi,instrumentation,single-step,ld-preload,ust - alwaysApply: "false" ---- - -# Configure SSI and Unified Service Tags on Linux - -> **Before doing anything else:** Fully resolve all variables in `## Context to resolve before acting`. 
Do not begin Step 0 until every variable has a concrete value. - -## Triggers - -Invoke this skill when: -- The Datadog Agent is already installed with SSI (`DD_APM_INSTRUMENTATION_ENABLED=host` was used) and you need to configure Unified Service Tags on the application service -- The user wants to set `DD_SERVICE`, `DD_ENV`, `DD_VERSION` on a running service -- SSI is installed but `/proc//maps` doesn't show the language tracer (launcher-only injection) - -Do NOT invoke this skill if: -- The Datadog Agent is not yet installed — run `agent-install` first -- SSI packages are missing from `/opt/datadog-packages/` — re-run `agent-install` -- The target is a Kubernetes cluster — use `dd-apm-k8s-enable-ssi` instead - ---- - -## Background - -When the install script runs with `DD_APM_INSTRUMENTATION_ENABLED=host`, it: -1. Installs `datadog-apm-inject` and language library packages under `/opt/datadog-packages/` -2. Writes the launcher path into `/etc/ld.so.preload` -3. SSI is now armed — every new process on the host gets the launcher injected at startup - -**What SSI does NOT configure automatically:** -- `DD_SERVICE`, `DD_ENV`, `DD_VERSION` — these must be set on the application process for traces to be tagged correctly -- Without `DD_SERVICE`, the tracer auto-detects a service name (often the process name or framework name), which may not match what the user expects - ---- - -## Prerequisites - -**Verify SSI is armed:** - -### Claude runs - -```bash -ssh -o StrictHostKeyChecking=no -i @ \ - "cat /etc/ld.so.preload && ls /opt/datadog-packages/ | grep apm" -``` - -If `/etc/ld.so.preload` contains a path to the launcher, and `/opt/datadog-packages/datadog-apm-inject` exists — SSI is armed. - -ERROR: Either missing — run `agent-install` first. 
- -**Check for existing manual instrumentation:** - -### Claude runs - -```bash -ssh -o StrictHostKeyChecking=no -i @ " -grep -r 'import ddtrace\|from ddtrace\|require .dd-trace.\|opentelemetry' 2>/dev/null | head -5 || echo 'No manual instrumentation found' -" -``` - -ERROR: Manual instrumentation found — SSI silently disables itself when it detects an existing tracer. Remove the manual import/package before proceeding. - -**Check base libc:** - -### Claude runs - -```bash -ssh -o StrictHostKeyChecking=no -i @ \ - "ldd --version 2>&1 | head -1" -``` - -ERROR: musl — SSI requires glibc. No workaround; must use a glibc-based OS. - ---- - -## Context to resolve before acting - -| Variable | How to resolve | -|---|---| -| `SERVICE_NAME` | Ask the user — how the service should appear in Datadog APM (e.g. `payment-api`) | -| `ENV` | Ask the user — environment name (e.g. `production`, `staging`, `dev`) | -| `VERSION` | Ask the user or read from the app's version file / git tag | -| `SYSTEMD_SERVICE_NAME` | From `systemctl list-units --type=service --state=running` on the host — the unit running the app | -| `SSH_KEY` | Path to SSH private key | -| `SSH_USER` | SSH username | -| `SSH_HOST` | Hostname or IP of the target host | - ---- - -## Step 0 (Only if existing instrumentation detected): Remove Manual Instrumentation - -- Python: `pip uninstall ddtrace`, remove `import ddtrace` / `ddtrace-run` from CMD -- Node.js: `npm uninstall dd-trace`, remove `require('dd-trace')` -- Java: remove `-javaagent:/path/to/dd-java-agent.jar` JVM flag -- Ruby: `gem uninstall ddtrace`, remove `require 'ddtrace'` -- .NET: remove `Datadog.Trace` NuGet and profiler env vars - -After removing, restart the service. **Confirm with the user before restarting.** Tell the user: "I need to restart `` to remove the old instrumentation. This will cause a brief outage. Ready to proceed?" Wait for confirmation. 
- ---- - -## Step 1: Set Unified Service Tags on the Application Process - -Without UST, traces arrive with an auto-detected service name that may not match user expectations, and won't be tagged with env or version. - -**For systemd-managed services** (most common): - -### Claude runs - -```bash -ssh -o StrictHostKeyChecking=no -i @ \ - "sudo systemctl cat " -``` - -Add a drop-in override (preserves the original unit file): - -### What you need to do in a terminal - -```bash -ssh -o StrictHostKeyChecking=no -i @ -sudo systemctl edit -``` - -Add to the editor: - -```ini -[Service] -Environment="DD_SERVICE=" -Environment="DD_ENV=" -Environment="DD_VERSION=" -``` - -Apply: - -### Claude runs - -```bash -ssh -o StrictHostKeyChecking=no -i @ \ - "sudo systemctl daemon-reload && sudo systemctl show | grep -E 'DD_SERVICE|DD_ENV|DD_VERSION'" -``` - -If the UST vars appear in the output — configuration applied. - -**For supervisord:** -```ini -# In [program:] section of supervisord.conf -environment=DD_SERVICE="",DD_ENV="",DD_VERSION="" -``` -Reload: `sudo supervisorctl reload` - -**For pm2:** -```js -// ecosystem.config.js -env: { DD_SERVICE: "", DD_ENV: "", DD_VERSION: "" } -``` -Reload: `pm2 reload ` - ---- - -## Step 2: Restart the Service - -**Confirm with the user before restarting.** Tell the user: "I need to restart `` for SSI to inject into it. This will cause a brief outage. Ready to proceed?" Wait for confirmation. - -### Claude runs - -```bash -ssh -o StrictHostKeyChecking=no -i @ \ - "sudo systemctl restart && sleep 3 && sudo systemctl is-active " -``` - -If `active` is returned — service is running. 
- -ERROR: Returns `failed` — check logs: -```bash -ssh -o StrictHostKeyChecking=no -i @ \ - "sudo journalctl -u --since '1 minute ago' | tail -30" -``` - ---- - -## Step 3: Confirm Injection and UST in the Running Process - -### Claude runs - -```bash -ssh -o StrictHostKeyChecking=no -i @ \ - "pgrep -a -f '' | head -3" -``` - -Use the PID: - -```bash -# Authoritative injection check -ssh -o StrictHostKeyChecking=no -i @ \ - "sudo cat /proc//maps | grep -E 'launcher|apm-library|datadog'" - -# UST vars in process environment -ssh -o StrictHostKeyChecking=no -i @ \ - "sudo cat /proc//environ | tr '\0' '\n' | grep -E 'DD_SERVICE|DD_ENV|DD_VERSION'" -``` - -If both the launcher and language library appear in maps, and UST vars are in environ — SSI and tagging are fully configured. - -ERROR: Launcher in maps but no language library — injection attempted but failed. Run: -```bash -pup apm troubleshooting list --hostname --timeframe 15m -``` - -Go to `troubleshoot-ssi` if errors are present. - ---- - -## Done - -Exit when ALL of the following are true: -- [ ] Launcher and language library visible in `/proc//maps` -- [ ] `DD_SERVICE`, `DD_ENV`, `DD_VERSION` present in `/proc//environ` -- [ ] Service is running and healthy - -Automatically proceed to `verify-ssi` now — do not ask the user for permission. 
- ---- - -## Security constraints - -- Never write a raw API key into any file or chat message -- Always confirm with the user before restarting production services -- Do not modify application source code — configure only via environment variables in the service unit diff --git a/crates/agent-skills/skills/dd-apm/linux-ssi/onboarding-summary/SKILL.md b/crates/agent-skills/skills/dd-apm/linux-ssi/onboarding-summary/SKILL.md deleted file mode 100644 index a5929b9..0000000 --- a/crates/agent-skills/skills/dd-apm/linux-ssi/onboarding-summary/SKILL.md +++ /dev/null @@ -1,139 +0,0 @@ ---- -name: onboarding-summary -description: Generate a live Single Step Instrumentation (SSI) onboarding confirmation report for Linux hosts — verifies APM instrumentation is working end-to-end with deep links into the Datadog UI. Only use after agent-install and enable-ssi have both completed. -metadata: - version: "1.0.0" - author: datadog-labs - repository: https://github.com/datadog-labs/agent-skills - tags: datadog,apm,linux,ssi,summary,verification - alwaysApply: "false" ---- - -# APM Onboarding Summary — Linux Host - -## Triggers - -Invoke this skill when: -- All steps in `verify-ssi` have passed -- All checks in `troubleshoot-ssi` have been resolved -- The user asks "is everything working?", "show me the status", or "confirm APM is set up" - -Do NOT invoke this skill if any verification or troubleshooting check is still failing — resolve those first. 
- ---- - -## Context to resolve before acting - -| Variable | How to resolve | -|---|---| -| `HOSTNAME` | `hostname -f` on the target host | -| `DD_HOSTNAME` | Hostname as Datadog sees it — from `sudo datadog-agent status` | -| `SERVICE_NAME` | `DD_SERVICE` value from `/proc//environ` or the systemd unit | -| `ENV` | `DD_ENV` value from `/proc//environ` or the systemd unit | -| `DD_SITE` | `grep "^site:" /etc/datadog-agent/datadog.yaml` | -| `SSH_KEY` | Path to SSH private key | -| `SSH_USER` | SSH username | -| `SSH_HOST` | Hostname or IP of the target host | - ---- - -## Prerequisites - -### Claude runs - -```bash -pup auth status --site -``` - -If valid token — proceed. - -ERROR: Not authenticated: - -### Claude runs - -```bash -pup auth login --site -``` - -> This opens a browser tab for OAuth. Complete the login there — Claude will continue once the command exits. - ---- - -## Collect live confirmation data - -Run all of the following. Each populates a row in the final report. - -### Claude runs - -```bash -# Agent version and status -sudo datadog-agent status 2>&1 | grep -E "Agent \(v|Status:|API Keys status" - -# Inject library armed in ld.so.preload -ssh -o StrictHostKeyChecking=no -i @ "cat /etc/ld.so.preload" - -# Process confirmed injected — launcher + language library in /proc//maps -ssh -o StrictHostKeyChecking=no -i @ \ - "pgrep -a -f '' | head -3" -``` - -Use the PID from above: - -```bash -ssh -o StrictHostKeyChecking=no -i @ \ - "sudo cat /proc//maps | grep -E 'launcher|apm-library|datadog'" - -# UST vars in process environment -ssh -o StrictHostKeyChecking=no -i @ \ - "sudo cat /proc//environ | tr '\0' '\n' | grep -E 'DD_SERVICE|DD_ENV|DD_VERSION'" - -# Agent APM receiver — trace counts -ssh -o StrictHostKeyChecking=no -i @ \ - "sudo datadog-agent status 2>&1 | grep -A 10 'Receiver (previous minute)'" - -# Service visible and traced in APM backend -DD_SITE= pup apm services list --env --from 1h - -# Traces arriving in the last hour -DD_SITE= pup 
traces search --query "service:" --from 1h --limit 5 -``` - ---- - -## Present the report - -Fill in every value from live command output. Do not leave any placeholder unfilled. If a value cannot be confirmed, mark that row as failed and link to `troubleshoot-ssi`. - ---- - -**APM onboarding complete** - -| Check | Detail | Status | -|---|---|---| -| Datadog Agent | v`` running on ``, API key valid | OK | -| SSI armed | `/etc/ld.so.preload` contains launcher path | OK | -| Process injected | launcher + language library in `/proc//maps` for `` | OK | -| Unified Service Tags | `DD_SERVICE=` `DD_ENV=` `DD_VERSION=` | OK | -| Agent receiving traces | `` trace(s)/min in APM receiver | OK | -| APM service visible | `` in env `` | OK | -| Traces arriving | `` trace(s) found in the last hour | OK | - ---- - -**Your service in Datadog — click to open:** - -Construct each URL by substituting real values. Do not print placeholder URLs. - -| View | URL | -|---|---| -| Service overview | `https://app./apm/services/?env=` | -| Traces explorer | `https://app./apm/traces?query=service:%20env:` | -| Service map | `https://app./apm/map?env=&service=` | -| Infrastructure host | `https://app./infrastructure?q=host:` | -| Agent fleet | `https://app./fleet-automation` | - ---- - -## Security constraints - -- Never write a raw API key into any file or chat message diff --git a/crates/agent-skills/skills/dd-apm/linux-ssi/troubleshoot-ssi/SKILL.md b/crates/agent-skills/skills/dd-apm/linux-ssi/troubleshoot-ssi/SKILL.md deleted file mode 100644 index 95035df..0000000 --- a/crates/agent-skills/skills/dd-apm/linux-ssi/troubleshoot-ssi/SKILL.md +++ /dev/null @@ -1,400 +0,0 @@ ---- -name: troubleshoot-ssi -description: Diagnose and fix Single Step Instrumentation (SSI) issues on Linux hosts — SSI automatically instruments applications for APM without code changes. Only use if the agent and SSI are configured but traces are missing or instrumentation is not working. 
-metadata: - version: "1.0.0" - author: datadog-labs - repository: https://github.com/datadog-labs/agent-skills - tags: datadog,apm,linux,ssi,troubleshooting,instrumentation,ld-preload - alwaysApply: "false" ---- - -# Troubleshoot APM SSI on Linux - -## Triggers - -Invoke this skill when the user expresses intent to: -- Debug why a Linux process is not being instrumented -- Investigate why traces are not appearing in Datadog from a Linux host -- Diagnose SSI injection failures on Linux -- Follow up on failed checks from `verify-ssi` -- Report that a specific service or host has no traces - -Do NOT invoke this skill if: -- SSI has not been enabled yet — run `enable-ssi` first - ---- - -## Critical: pup First, SSH Second - -**You do NOT need SSH access to start troubleshooting.** The `pup` CLI queries Datadog's backend directly. Start with pup commands immediately using information the user already gave you (hostname, service name, env). Only go to SSH if pup doesn't reveal the cause. - -### pup-cli: check, install, and authenticate - -### Claude runs - -```bash -pup --version -``` - -If not found, install it (OS-aware): - -### Claude runs - -```bash -if [[ "$(uname)" == "Darwin" ]]; then - brew tap datadog-labs/pack && brew install datadog-labs/pack/pup -else - PUP_VERSION=$(curl -s https://api.github.com/repos/datadog-labs/pup/releases/latest | grep '"tag_name"' | cut -d'"' -f4) - curl -L "https://github.com/datadog-labs/pup/releases/download/${PUP_VERSION}/pup_linux_amd64.tar.gz" | tar xz -C /usr/local/bin pup - chmod +x /usr/local/bin/pup -fi -pup --version -``` - -**Auth — check in this order:** - -1. Check OAuth status: -```bash -pup auth status --site -``` - -If authenticated — proceed directly to Step 1. - -ERROR: Not authenticated: - -### Claude runs - -```bash -pup auth login --site -``` - -> This opens a browser tab for OAuth. Complete the login there — Claude will continue once the command exits. - -2. 
If OAuth login is not possible (e.g., no browser access), fall back to API keys: -```bash -echo "DD_API_KEY set: $([ -n "${DD_API_KEY:-}" ] && echo yes || echo no)" -echo "DD_APP_KEY set: $([ -n "${DD_APP_KEY:-}" ] && echo yes || echo no)" -``` - -If `DD_API_KEY` and `DD_APP_KEY` are both set — **proceed to Step 1**. pup will use them automatically even if `pup auth status` shows unauthenticated. - ---- - -## Context - -Use what the user already provided. Do not ask for missing context upfront — resolve variables lazily, only when a specific step needs them. - -| Variable | How to resolve | When needed | -|---|---|---| -| `DD_HOSTNAME` | From the user's message, or `datadog-agent status` via SSH | Step 1 — start here | -| `SERVICE_NAME` | From the user's message | Step 1 — start here | -| `ENV` | Ask the user only when a command requires it | Step 1 (`service-library-config get`), Step 3 | -| `DD_SITE` | Ask the user, or `grep "^site:" /etc/datadog-agent/datadog.yaml` via SSH | Only if pup auth check fails | -| `SSH_KEY` | From user or `/workspace/.ssh/id_ed25519` | Step 4 (SSH investigation) only | -| `SSH_USER` | From user or default `root` | Step 4 (SSH investigation) only | -| `SSH_HOST` | From user's message | Step 4 (SSH investigation) only | - -**If the user has already provided `DD_HOSTNAME` and `SERVICE_NAME`, go directly to Step 1. Do not ask for ENV or SSH details first.** - ---- - -## How SSI Works on Linux — Domain Knowledge - -Read this before investigating. It gives you the mental model to reason about novel failures. - -**Injection chain:** -1. Install script (with `DD_APM_INSTRUMENTATION_ENABLED=host`) installs `datadog-apm-inject` and language library packages under `/opt/datadog-packages/` -2. The inject package writes its launcher path into `/etc/ld.so.preload` -3. The Linux dynamic linker pre-loads the launcher into every new process at startup -4. 
The launcher detects the process language and loads the appropriate tracer `.so` from `/opt/datadog-packages/datadog-apm-library-<LANG>/` -5. The tracer sends spans to the Agent at `localhost:8126` -6. The Agent forwards traces to Datadog at `intake.<DD_SITE>` - -**Diagnostic layers:** -- **`pup`** — sees what Datadog's backend received + injection errors reported by the launcher. Start here. -- **`/proc/<PID>/maps`** — sees the actual shared libraries loaded into a running process. The authoritative check for whether injection succeeded. -- **`datadog-agent status`** — sees whether the local Agent is receiving traces. - -**Known silent failures:** -- **musl libc (Alpine)** — launcher is glibc-compiled; musl is ABI-incompatible. Linker loads it but injection silently aborts -- **Existing ddtrace/OTel** — launcher detects user-installed tracer and silently disables itself (`already_instrumented` result class) -- **Unsupported runtime version** — silently skipped -- **Process started before SSI was enabled** — `/etc/ld.so.preload` only affects new processes -- **Static binary / Go** — statically linked binaries (typical for Go programs) never invoke the dynamic linker, so `/etc/ld.so.preload` and `LD_PRELOAD` are ignored entirely -- **SELinux/AppArmor** — can block `/etc/ld.so.preload` reads for confined processes -- **Package directory empty/corrupt** — `datadog-installer status` reflects DB registration, not actual files. A package can show as installed while its directory is empty. Always verify files exist under `/opt/datadog-packages/<PACKAGE>/` - -**Service name identity — important:** -With SSI, `DD_SERVICE` is often not set in the process environment. The tracer auto-detects a service name. The telemetry-reported name (what `pup fleet tracers list` and `service-library-config get` show) may not match what you expect in the APM UI: -- **JVM**: telemetry reports jar artifact name with version (e.g.
`inventory-service-1.0.0`), spans use the base name (`inventory-service`) -- **Python**: telemetry may report `fastapi` or `django` rather than the app name -- **Node.js**: names typically match - -If `service-library-config get` returns empty, use `pup traces search --query "host:" --from 1h --limit 5` to discover what service names have been sending traces, then retry. - ---- - -## Step 1: Triage with pup (no SSH required) - -Run these first. The answers determine everything that follows. - -### Claude runs - -```bash -# Check for injection errors (failures only — successful injections don't appear here) -pup apm troubleshooting list --hostname - -# Check full tracer config — look at apm_enabled, trace_agent_url, site -pup apm service-library-config get --service-name --env - -# Check what services have sent traces (reveals actual service names visible to backend) -pup apm services list --from 1h - -# Check if traces exist at all -pup traces search --query "service:" --from 15m --limit 5 - -# Fastest trace confirmation — metrics appear before indexed traces -pup metrics query --query "sum:trace.*.request.hits{host:,service:}.as_count()" --from 15m -``` - -`ENV` is required for `service-library-config get`. If the user didn't provide it, ask for it before running that command. - -Key values to check in `service-library-config get` output: -- `apm_enabled` — must be `true`. If `false`, the tracer won't send traces regardless of injection. -- `trace_agent_url` — must point to `http://localhost:8126` or the correct agent socket. Wrong value = tracer can't reach the Agent. -- `site` — must match your Datadog org's site. - ---- - -## Step 2: State Your Hypotheses - -Before investigating, explicitly state your ranked hypotheses based on triage output. Do not skip this step. 
- -| Triage signal | Strong hypothesis | -|---|---| -| `pup apm troubleshooting list` shows `result: error`, `result_class: incorrect_installation` | Package directory empty or corrupt — verify files exist under `/opt/datadog-packages/datadog-apm-library-<LANG>/`, then use the remediation flow | -| `pup apm troubleshooting list` shows `result: error`, import/load error | Tracer library couldn't be loaded — check runtime version, libc compatibility | -| `pup apm troubleshooting list` shows `result: abort`, reason `already_instrumented` | Manual ddtrace/OTel already in the app — launcher silently disabled itself | -| `pup apm troubleshooting list` shows `result: abort`, reason `language not detected` | Expected for non-app processes (e.g., bash, cron). Not a failure. | -| `pup apm troubleshooting list` empty | Either no injection attempts yet (process not restarted), or injection succeeded silently | -| `service-library-config get` shows `apm_enabled: false` | Tracer is loaded but explicitly disabled — check `source` field to see who set it | -| `service-library-config get` shows `trace_agent_url` pointing to wrong host/port | Tracer can't reach the Agent — fix the URL | -| `service-library-config get` shows wrong `site` | Traces going to wrong Datadog org | -| No traces in `pup traces search`, no troubleshooting errors | Process was never injected — check: process not restarted after SSI enabled, `/etc/ld.so.preload` missing, static binary | -| Unexpected service name in `pup apm services list` results | Service name mismatch — use the actual name from trace data for subsequent config lookups | -| Traces arriving in pup | Not a real problem — likely a UI filter or time window. Tell the user and stop. | - -State your top 1-3 hypotheses explicitly: *"Based on triage, I think the most likely cause is X because Y."* - ---- - -## Step 3: Investigate with pup (deeper) - -Use only the tools relevant to your hypotheses.
- -**Check SDK config in detail:** -```bash -# Show all config values with their source (env_var, remote_config, code, default) -pup apm service-library-config get --service-name --env - -# Show only configs where instances disagree (config drift) -pup apm service-library-config get --service-name --mixed -``` - -Key values to check: -- `apm_enabled` — if `false`, tracer won't send traces. Check `source` to see who disabled it (`code` > `env_var` > `remote_config` > `default`) -- `trace_agent_url` — should be `http://localhost:8126` or a Unix socket. Wrong value = tracer can't reach Agent -- `site` — must match your Datadog org's site. Mismatch = traces going to wrong org -- `service` — with SSI and no `DD_SERVICE` set, `source: default` is expected - -**If `service-library-config get` returns empty** — the service name you're using may not match the actual name in trace data: -```bash -pup traces search --query "host:" --from 1h --limit 5 -``` -Use the `service` field from trace results for subsequent config lookups. - -**Check injection error details:** -```bash -pup apm troubleshooting list --hostname --timeframe 4h -``` - ---- - -## Step 4: Investigate via SSH (if pup didn't reveal the cause) - -**Before asking for SSH credentials, briefly explain what you need to check and why**, so the user understands the diagnostic plan before handing over access. - -**Is `/etc/ld.so.preload` set?** -```bash -ssh -o StrictHostKeyChecking=no -i @ "cat /etc/ld.so.preload" -``` -If it contains a path ending in `launcher.preload.so` or `libdatadog-apm-inject.so` — launcher is armed for new processes. -ERROR: Empty or missing — SSI was not fully set up. Re-run the install script with `DD_APM_INSTRUMENTATION_ENABLED=host`. 
- -**Is the tracer actually loaded into the running process?** - -This is the authoritative injection check — use `/proc/<PID>/maps`, not environ: -```bash -ssh -o StrictHostKeyChecking=no -i <SSH_KEY> <SSH_USER>@<SSH_HOST> \ - "pgrep -a -f '<PROCESS_NAME>' | head -3" -``` -Use the PID: -```bash -ssh -o StrictHostKeyChecking=no -i <SSH_KEY> <SSH_USER>@<SSH_HOST> \ - "sudo cat /proc/<PID>/maps | grep -E 'launcher|apm-library|datadog'" -``` -- **Launcher + language library present** — injection succeeded for this process -- **Launcher only, no language library** — launcher ran but couldn't inject the tracer (check `pup apm troubleshooting list` for the reason) -- **Nothing** — `/etc/ld.so.preload` not set, process started before SSI was enabled, or static binary - -**Was the process started before SSI was enabled?** -```bash -ssh -o StrictHostKeyChecking=no -i <SSH_KEY> <SSH_USER>@<SSH_HOST> \ - "ps -p <PID> -o pid,lstart,cmd; stat /etc/ld.so.preload" -``` -If the process started before `/etc/ld.so.preload` was written, restart the service. **Always confirm with the user before restarting production services.** - -**Is the base libc musl?** -```bash -ssh -o StrictHostKeyChecking=no -i <SSH_KEY> <SSH_USER>@<SSH_HOST> \ - "ldd --version 2>&1 | head -1 && cat /etc/os-release | grep PRETTY_NAME" -``` -ERROR: musl — SSI's launcher requires glibc. No workaround; must migrate to Debian/Ubuntu/RHEL/Amazon Linux. - -**Is it a static binary?** -```bash -ssh -o StrictHostKeyChecking=no -i <SSH_KEY> <SSH_USER>@<SSH_HOST> \ - "file /proc/<PID>/exe; ldd /proc/<PID>/exe 2>&1" -``` -ERROR: `statically linked` — SSI cannot instrument this binary. Manual instrumentation required. - -**Are the APM packages actually present on disk?** - -`datadog-installer status` reflects only DB registration — a package can show as installed while its directory is empty. Always verify: -```bash -ssh -o StrictHostKeyChecking=no -i <SSH_KEY> <SSH_USER>@<SSH_HOST> \ - "ls /opt/datadog-packages/ && ls /opt/datadog-packages/datadog-apm-library-<LANG>/ | head -5" -``` -ERROR: Directory empty or missing — package is registered but broken on disk. Use the remediation flow.
- -**Does the app have existing manual instrumentation?** -```bash -ssh -o StrictHostKeyChecking=no -i @ " -sudo cat /proc//maps | grep -E 'ddtrace|opentelemetry|dd-trace' -" -``` -Also check dependency manifests: `requirements.txt`, `package.json`, `Gemfile`, `pom.xml`. -ERROR: Found — SSI silently disabled itself. Remove manual tracer, restart the service. - -**Is the Agent APM receiver listening and receiving traces?** -```bash -ssh -o StrictHostKeyChecking=no -i @ \ - "sudo datadog-agent status 2>&1 | grep -A 15 'APM Agent'" -``` -- `feature_auto_instrumentation_enabled: true` — SSI is active on the agent -- `Receiver (previous minute)` — trace count received by the agent -- `Endpoints` — where traces are forwarded - -```bash -ssh -o StrictHostKeyChecking=no -i @ \ - "sudo ss -tlnp 2>/dev/null | grep 8126 || sudo netstat -tlnp 2>/dev/null | grep 8126" -``` -ERROR: Port 8126 not listening — APM receiver disabled. Check `apm_config.enabled` in `/etc/datadog-agent/datadog.yaml`. - -**What service name did the tracer register?** - -With SSI, `DD_SERVICE` is often not set. Read the tracer's memfd to find the real service name: -```bash -ssh -o StrictHostKeyChecking=no -i @ " -sudo ls -la /proc//fd/ | grep 'datadog-tracer-info' -" -``` -Use the fd number: -```bash -ssh -o StrictHostKeyChecking=no -i @ \ - "sudo cat /proc//fd/ | python3 -c \"import sys,msgpack; d=msgpack.unpackb(sys.stdin.buffer.read()); print(d)\"" -``` -Returns `service_name`, `service_env`, `tracer_version`. - -**Is SELinux/AppArmor blocking `/etc/ld.so.preload`?** -```bash -ssh -o StrictHostKeyChecking=no -i @ " -getenforce 2>/dev/null -ausearch -m AVC -ts recent 2>/dev/null | grep 'ld.so.preload\|datadog' | tail -10 -dmesg | grep -i 'apparmor.*denied.*datadog' | tail -5 -" -``` -If SELinux/AppArmor is denying access, work with the user's security team. Do not disable SELinux systemwide. - ---- - -## Step 5: Reflect Before Concluding - -Before applying any fix, answer: -1. 
What evidence confirms my hypothesis? -2. What evidence would contradict it — and have I checked? -3. Is there a simpler explanation I haven't considered? - -If the conclusion doesn't hold up, return to Step 2 with new hypotheses. - ---- - -## Step 6: Fix - -**Remediation: Reinstalling a Broken APM Package** - -`datadog-installer status` reflects DB registration, not actual file presence. If `pup apm troubleshooting list` shows `incorrect_installation` but the installer says the package is installed, the registration is stale: - -```bash -# Remove the stale registration first -ssh -o StrictHostKeyChecking=no -i <SSH_KEY> <SSH_USER>@<SSH_HOST> \ - "sudo datadog-installer remove datadog-apm-library-<LANG>" - -# Re-run install — now it will actually download and extract -ssh -o StrictHostKeyChecking=no -i <SSH_KEY> <SSH_USER>@<SSH_HOST> \ - "DD_API_KEY=${DD_API_KEY} DD_SITE=${DD_SITE} DD_APM_INSTRUMENTATION_ENABLED=host bash -c \"\$(curl -L https://install.datadoghq.com/scripts/install_script_agent7.sh)\"" -``` - -When the package files are intact, re-running the install script alone is usually sufficient; run `remove` first only if the script reports success but the problem persists. - -**After any config change — restart the service** (confirm with user first for production): - -The user must restart the affected service for SSI to re-inject. Identify the service manager and present restart instructions — do not restart automatically unless the user explicitly asks. - -Common restart commands: -```bash -# systemd -sudo systemctl restart <SERVICE_UNIT> -# supervisord -sudo supervisorctl restart <PROGRAM_NAME> -# pm2 -pm2 reload <APP_NAME> -``` - ---- - -## Step 7: Verify - -Re-run the pup triage commands to confirm the fix worked: - -### Claude runs - -```bash -pup apm troubleshooting list --hostname <DD_HOSTNAME> --timeframe 15m -pup traces search --query "service:<SERVICE_NAME>" --from 15m --limit 5 -pup metrics query --query "sum:trace.*.request.hits{host:<DD_HOSTNAME>,service:<SERVICE_NAME>}.as_count()" --from 15m -``` - -If there are no new injection errors and traces are arriving — resolved.
Automatically proceed to `onboarding-summary` now — do not ask the user for permission. - -ERROR: Still failing — return to Step 2 with updated hypotheses. - ---- - -## Security constraints - -- Never write a raw API key into any file or chat message -- Never disable SELinux systemwide -- Always confirm before restarting production services -- `datadog-installer remove` requires explicit confirmation — confirm with user before running diff --git a/crates/agent-skills/skills/dd-apm/linux-ssi/verify-ssi/SKILL.md b/crates/agent-skills/skills/dd-apm/linux-ssi/verify-ssi/SKILL.md deleted file mode 100644 index 15465ba..0000000 --- a/crates/agent-skills/skills/dd-apm/linux-ssi/verify-ssi/SKILL.md +++ /dev/null @@ -1,198 +0,0 @@ ---- -name: verify-ssi -description: Verify Single Step Instrumentation (SSI) is working end-to-end on Linux hosts — SSI automatically instruments applications for APM without code changes. Only use after enable-ssi has run. -metadata: - version: "1.0.0" - author: datadog-labs - repository: https://github.com/datadog-labs/agent-skills - tags: datadog,apm,linux,ssi,verification,instrumentation,ld-preload - alwaysApply: "false" ---- - -# Verify APM SSI on Linux - -> **Before doing anything else:** Fully resolve all variables in `## Context to resolve before acting`. Do not begin Step 1 until every variable has a concrete value. 
- -## Triggers - -Invoke this skill when the user expresses intent to: -- Confirm SSI is working after installing the Datadog Agent on Linux -- Check whether a Linux process is being instrumented -- Verify the tracer is running and reporting telemetry - -Do NOT invoke this skill if: -- SSI has not been enabled yet — run `agent-install` first -- Services have not been restarted since the agent was installed — restart them first, then verify - ---- - -## Prerequisites - -- [ ] `agent-install` is complete -- [ ] Application services have been restarted since the agent was installed - -### pup-cli: check, install, and authenticate - -### Claude runs - -```bash -pup --version -``` - -If not found: - -### Claude runs - -```bash -brew tap datadog-labs/pack -brew install pup -``` - -Check auth: -```bash -pup auth status --site -``` - -If not authenticated: - -### Claude runs - -```bash -pup auth login --site -``` - -> This opens a browser tab for OAuth. Complete the login there — Claude will continue once the command exits. - -If valid token — proceed. -ERROR: No browser available: `export DD_APP_KEY=` - ---- - -## Context to resolve before acting - -| Variable | How to resolve | -|---|---| -| `DD_HOSTNAME` | Hostname as Datadog sees it — from `sudo datadog-agent status` output | -| `SERVICE_NAME` | Expected service name in APM — ask the user | -| `ENV` | Environment tag — ask the user | -| `DD_SITE` | `grep "^site:" /etc/datadog-agent/datadog.yaml` via SSH, or ask the user | -| `SSH_KEY` | Path to SSH private key | -| `SSH_USER` | SSH username | -| `SSH_HOST` | Hostname or IP of the target host | - ---- - -## Step 1: Confirm the Process is Injected - -Use `/proc//maps` — this is the authoritative check. It shows the actual shared libraries loaded into the running process, which is the only way to confirm the launcher and tracer `.so` files were actually loaded. 
- -### Claude runs - -```bash -ssh -o StrictHostKeyChecking=no -i @ \ - "pgrep -a -f '' | head -5" -``` - -Use the PID from above: - -```bash -ssh -o StrictHostKeyChecking=no -i @ \ - "sudo cat /proc//maps | grep -E 'launcher|apm-library|datadog'" -``` - -If the output includes both the launcher (e.g. `launcher.preload.so`) and a language library (e.g. `apm-library-python`) — injection succeeded for this process. - -ERROR: Launcher present but no language library — launcher ran but couldn't inject. Check for injection errors: - -### Claude runs - -```bash -pup apm troubleshooting list --hostname --timeframe 1h -``` - -ERROR: Neither present — process was not injected. Check `/etc/ld.so.preload`: - -```bash -ssh -o StrictHostKeyChecking=no -i @ "cat /etc/ld.so.preload" -``` - -If empty — install did not set up the launcher. Re-run the install script with `DD_APM_INSTRUMENTATION_ENABLED=host`. If non-empty but the process still isn't injected — the process was started before the launcher was installed. Restart the service and recheck. - ---- - -## Step 2: Confirm the Agent is Receiving Traces - -### Claude runs - -```bash -ssh -o StrictHostKeyChecking=no -i @ \ - "sudo datadog-agent status 2>&1 | grep -A 15 'APM Agent'" -``` - -Healthy output shows: -- `feature_auto_instrumentation_enabled: true` -- `Receiver (previous minute)` with `> 0` traces - -ERROR: `feature_auto_instrumentation_enabled: false` — SSI not active on the agent. Check `apm_config` in `/etc/datadog-agent/datadog.yaml`. - -ERROR: `Receiver (previous minute): 0` — agent running but no traces yet. Generate traffic first (see Step 3), then recheck. - ---- - -## Step 3: Confirm the Service is Visible in Datadog - -### Claude runs - -```bash -DD_SITE= pup apm services list --env --from 1h -``` - -If `` appears with `isTraced: true` — traces are reaching the Datadog backend. - -> **Flask / ddtrace v3 naming note:** With ddtrace >=3.x, Flask spans are emitted as `service:flask` rather than `service:`. 
The `DD_SERVICE` value appears as `base_service` on the spans. If you set `DD_SERVICE=my-app`, search for `service:flask` in the APM UI — the service list will show `flask`, not `my-app`. Check the `base_service` tag to confirm it matches your `DD_SERVICE`. - -ERROR: Service missing — generate traffic to trigger trace creation: - -### Claude runs - -```bash -ssh -o StrictHostKeyChecking=no -i @ \ - "sudo ss -tlnp 2>/dev/null | grep || sudo netstat -tlnp 2>/dev/null | grep " -``` - -Use the port from above: -```bash -ssh -o StrictHostKeyChecking=no -i @ \ - "for i in \$(seq 1 10); do curl -s -o /dev/null http://localhost:/; done" -``` - -Wait 30 seconds, then retry: -```bash -DD_SITE= pup apm services list --env --from 10m -DD_SITE= pup traces search --query "service:" --from 10m --limit 5 -``` - -ERROR: Still missing — check for injection errors and go to `troubleshoot-ssi`: -```bash -pup apm troubleshooting list --hostname --timeframe 1h -``` - ---- - -## Done - -Exit when ALL of the following are true: -- [ ] Step 1: launcher + language library both visible in `/proc//maps` -- [ ] Step 2: agent APM receiver shows `> 0` traces/min -- [ ] Step 3: service appears in `pup apm services list` - -If any check fails, go to `troubleshoot-ssi`. - -When all steps pass, automatically proceed to `onboarding-summary` now — do not ask the user for permission. 
- ---- - -## Security constraints - -- Never write a raw API key into any file or chat message -- Always confirm before restarting production services diff --git a/crates/agent-skills/skills/dd-apm/service-remapping/SKILL.md b/crates/agent-skills/skills/dd-apm/service-remapping/SKILL.md deleted file mode 100644 index 8e387b7..0000000 --- a/crates/agent-skills/skills/dd-apm/service-remapping/SKILL.md +++ /dev/null @@ -1,411 +0,0 @@ ---- -name: service-remapping -description: Create and manage APM service remapping rules — rewrite service names at ingestion time to collapse noisy inferred entities, clean up auto-generated names, handle org renames, or normalize naming conventions. Use for any request involving service renaming, service mapping, inferred service cleanup, peer.service normalization, or collapsing fragmented service names. -metadata: - version: "1.0.0" - author: datadog-labs - repository: https://github.com/datadog-labs/agent-skills - tags: datadog,apm,service-remapping,service-naming,inferred-services,peer-service - alwaysApply: "false" - tools: pup ---- - -# APM Service Remapping - -> **Before doing anything else:** Fully resolve all variables in `## Context to resolve before acting`. Do not begin Step 0 until every variable has a concrete value. - ---- - -## How Service Remapping Works — Domain Knowledge - -Read this before building any rule. It gives you the mental model to construct the right filter and catch edge cases. - -**What remapping does:** A rule intercepts telemetry at ingestion time and rewrites the service name before indexing. A rule says: "for any entity matching this filter, replace its service name with this new value." 
- -**Two entity types — pick the right one:** - -| Entity type | `rule_type` integer | What it targets | -|---|---|---| -| **SERVICE** | `0` | Instrumented services — have spans with an explicit `service` tag set by a tracer | -| **INFERRED_ENTITY** | `1` | Auto-detected from outbound calls — named from `peer.service`. **Requires `peer.service` to be set on outbound spans** (see prerequisite below). | - -**Prerequisite for inferred entity remapping — `peer.service` must be set:** - -Inferred entity remapping only works when the tracer sets `peer.service` on outbound spans. Without it, entities are keyed by `peer.hostname` and remapping rules will not apply. - -To enable this, set the following env var on the **instrumented service** (not the downstream dependency): - -```bash -DD_TRACE_PEER_SERVICE_DEFAULTS_ENABLED=true -``` - -This makes the ddtrace tracer automatically propagate `peer.service` from `peer.hostname` on outbound HTTP, gRPC, and database calls. Without this, `pup traces search` will show spans with `peer.hostname` but no `peer.service`, and no service remapping rule will match. - -To verify `peer.service` is being set before building a rule: - -```bash -pup traces search --query "@peer.service:" --from 15m --limit 5 -``` - -If zero results — the tracer is not setting `peer.service`. Ask the user to add `DD_TRACE_PEER_SERVICE_DEFAULTS_ENABLED=true` to their service's environment and redeploy before continuing. 
- -**Filter syntax** — a standard Datadog event-grammar query string: - -| Goal | Filter | -|---|---| -| Exact service match | `service:payments` | -| All services with a prefix | `service:deploy-test*` | -| All services with a suffix | `service:*.tropos` | -| All services containing a string | `service:*payments*` | -| All inferred services under a domain | `peer.service:*.shopify.com` | -| Service in one environment only | `service:payments AND env:prod` | - -**New name syntax** — the `value` field in `rewrite_tag_rules`: - -| Form | Example | Use for | -|---|---|---| -| Static string | `my-service` | Every matched entity gets exactly this name | -| Tag interpolation | `{{service}}` | Substitute the full value of a tag | -| Tag + regex capture | `{{service\|^(.+?)\..*$}}` | Extract part of a tag value (non-greedy capture) | - -**Regex constraints for `{{tag\|regex}}`:** -- Maximum **1 capture group** per expression -- **No greedy quantifiers inside capture groups** — use non-greedy variants: `(.+?)` not `(.+)`, `(.*?)` not `(.*)` -- Quantifiers on capture groups themselves (e.g. 
`(foo)+`) are not allowed - -**Five remapping patterns:** - -| Pattern | User says… | Filter example | New name example | -|---|---|---|---| -| **N:1 group** | "These N services are all the same thing" | `peer.service:*.shopify.com` | `shopify` | -| **Strip suffix/prefix** | "The name has junk at the end/start" | `service:*.tropos` | `{{service\|^(.+?)\..*$}}` | -| **1:1 rename** | "We renamed this service and Datadog needs to match" | `service:old-auth-service` | `auth-service` | -| **Env split** | "I want separate services per env but they all have the same name" | `service:my-service AND env:prod` | `my-service-prod` | -| **Prefix normalization** | "All services should start with an env or team name" | `service:payments*` | `{{env}}-{{service}}` | - ---- - -## Triggers - -Invoke this skill when the user wants to: -- Rename a service in Datadog without re-instrumenting -- Collapse multiple inferred service names into one (e.g. many `api.shopify.com/*` variants → `shopify`) -- Strip environment suffixes, version tags, or deployment metadata baked into service names -- Normalize `peer.service` names to something meaningful -- Rename a service after an org change, product rebrand, or migration -- Split a single service into per-env variants (`my-service` + `env:prod` → `my-service-prod`) -- List, review, or delete existing service remapping rules - -Do NOT invoke this skill if: -- The user wants to rename the service in their application code — that requires a tracer config change (`DD_SERVICE`), not a remapping rule -- The user wants to correlate telemetry across infrastructure tags — that is the "Correlate telemetry" action type in the UI, not remapping - ---- - -## Prerequisites - -### pup-cli: check, install, and authenticate - -### Claude runs - -```bash -pup --version -``` - -If not found: - -### Claude runs - -```bash -brew tap datadog-labs/pack -brew install pup -``` - -Check auth: -```bash -pup auth status -``` - -If not authenticated: - -### Claude runs - 
-```bash -pup auth login -``` - -> This opens a browser tab for OAuth. Complete the login there — Claude will continue once the command exits. - -### Credentials for write operations - -`pup apm service-remapping list` and `get` work with OAuth. Create, update, and delete require API keys (`DD_API_KEY`, `DD_APP_KEY`, `DD_SITE`) until `apm_service_renaming_write` is added to pup's OAuth scopes. - -### Claude runs - -```bash -echo "DD_API_KEY set: $([ -n "${DD_API_KEY:-}" ] && echo yes || echo no)" -echo "DD_APP_KEY set: $([ -n "${DD_APP_KEY:-}" ] && echo yes || echo no)" -echo "DD_SITE: ${DD_SITE:-not set (defaulting to datadoghq.com)}" -``` - -If any are missing and you need to create/update/delete rules: - -### What you need to do in a terminal - -```bash -export DD_API_KEY= -export DD_APP_KEY= -export DD_SITE=datadoghq.com # adjust for your site -``` - -> Common sites: `datadoghq.com` (US1), `datadoghq.eu` (EU1), `us3.datadoghq.com`, `us5.datadoghq.com`, `ap1.datadoghq.com` - -Wait for the user to set credentials, then re-run the check above before continuing. - ---- - -## Context to resolve before acting - -| Variable | How to resolve | -|---|---| -| `ENV` | Ask the user which environment to target. Do NOT assume `prod`. | -| `ORIGINAL_SERVICE` | Current service name(s) to remap — discover with `pup apm services list` or ask the user | -| `ENTITY_TYPE` | Instrumented service (`rule_type: 0`) or inferred entity (`rule_type: 1`)? Ask if unclear — see Domain Knowledge | -| `TARGET_NAME` | The desired new service name — ask the user | -| `PATTERN` | Which pattern applies — identify from the user's description (see Domain Knowledge above) | - ---- - -## Step 0: Discover Current Service Names - -If the user hasn't specified exact names to remap, discover what exists first: - -### Claude runs - -```bash -pup apm services list --env --from 1h -pup traces search --query "service:" --from 1h --limit 20 -``` - -Use the output to help the user identify exact service names. 
Ask the user to confirm which names they want remapped before proceeding. - ---- - -## Step 1: Build the Rule - -Work through each component before writing any JSON. - -### 1. Entity type - -[DECISION: entity type — ask the user if unclear] -- Does the service appear because a tracer explicitly set its `service` tag? → `rule_type: 0` (SERVICE) -- Does it appear in the service map from outbound calls (e.g. a database, queue, or external API)? → `rule_type: 1` (INFERRED_ENTITY) - -If the user wants to remap an inferred entity, verify `peer.service` is set before proceeding — see the prerequisite in Domain Knowledge. If it is not set, stop and ask the user to enable `DD_TRACE_PEER_SERVICE_DEFAULTS_ENABLED=true` first. - -### 2. Filter - -Write a single event-grammar query string targeting the service(s) to remap. Use the filter syntax and pattern table in Domain Knowledge to pick the right form. - -### 3. New name (`value`) - -Use the new name syntax and regex table in Domain Knowledge to pick the right form. For regex values, apply the constraints listed there. - -### 4. Rule name - -Suggest a descriptive name. Examples: -- `collapse-shopify-inferred-services` -- `strip-tropos-suffix` -- `rename-old-auth-to-auth-service` -- `env-split-my-service-prod` - ---- - -## Step 2: Preview Impact - -Before constructing the JSON, check what will be affected: - -### Claude runs - -```bash -# Confirm telemetry exists for the targeted service (zero spans = wrong query or wrong env) -pup traces search --query "service:<ORIGINAL_SERVICE>" --from 15m --limit 5 - -# Check for monitors referencing the old service name -pup monitors list | grep -i "<ORIGINAL_SERVICE>" - -# Check for dashboards referencing the old service name -pup dashboards list | grep -i "<ORIGINAL_SERVICE>" - -# List existing service remapping rules that may conflict -pup apm service-remapping list -``` - -Report to the user: - -| Item | What to surface | -|---|---| -| **Telemetry volume** | Non-zero spans confirm the filter will match real data. 
Zero = likely wrong service name or env. | -| **Monitors** | Any monitor referencing the old service name will silently break after remapping. List them and offer to update. | -| **Dashboards** | Any dashboard with the old service name in its title will have stale references after remapping. List them and offer to update. | -| **Conflicting rules** | Existing rules targeting the same service may be overridden. Show conflicts and ask the user to confirm. | - -If monitors reference the old service name, ask: -> *"I found `<N>` monitor(s) referencing `<ORIGINAL_SERVICE>`. After remapping, they'll need to be updated to use `<TARGET_NAME>`. Want me to update them now?"* - ---- - -## Step 3: Confirm the Rule - -Show the user the planned rule and confirm before creating: - -> *"I'm going to create a service remapping rule named `<RULE_NAME>` with filter `<FILTER>` that maps `<ORIGINAL_SERVICE>` → `<TARGET_NAME>` (rule_type: `<RULE_TYPE>`). Ready to proceed?"* - -Wait for confirmation before continuing. - ---- - -## Step 4: Create the Rule - -### Claude runs - -```bash -pup apm service-remapping create \ - --name "<RULE_NAME>" \ - --filter "<FILTER>" \ - --rule-type <RULE_TYPE> \ - --value "<TARGET_NAME>" -``` - -If the response contains an `id` field — creation succeeded. Record the `id` and `version` values from the response. - -ERROR: `400 Bad Request` with "Filter expression has invalid syntax" — the filter query is malformed. Check glob syntax and boolean operators. - -ERROR: `400 Bad Request` with "Template value in target name is invalid" — the `value` regex is invalid. Check: max 1 capture group, non-greedy quantifiers inside groups (`(.+?)` not `(.+)`). - -ERROR: `401 Unauthorized` — credentials are invalid or expired. Re-check `DD_API_KEY` and `DD_APP_KEY`. - -ERROR: `403 Forbidden` — the API key lacks `apm_service_renaming_write` permission. - ---- - -## Step 5: Verify - -Allow 2–5 minutes for the rule to propagate, then confirm it is active. 
- -### For SERVICE rules (rule_type 0) - -### Claude runs - -```bash -# Confirm new service name appears in APM -pup apm services list --env --from 5m - -# Confirm traces are arriving under the new name -pup traces search --query "service:" --from 5m --limit 5 -``` - -If `` appears in either — rule is active. - -### For INFERRED_ENTITY rules (rule_type 1) - -Inferred entities don't produce their own spans, so they won't appear in `pup apm services list` or `pup traces search`. Verify in two steps: - -**Step 5a — confirm the rule is stored correctly:** - -### Claude runs - -```bash -pup apm service-remapping get -``` - -Confirm the filter and value match what you intended. - -**Step 5b — confirm the entity name changed in the service map:** - -Ask the user to check the APM Service Map in the Datadog UI and look for `` where `` used to appear. The service map is the authoritative view for inferred entity names. - -Alternatively, confirm new `peer.service` values are arriving on spans from the instrumented service: - -### Claude runs - -```bash -pup traces search --query "service: @peer.service:" --from 5m --limit 5 -``` - -If spans appear with `peer.service:` — rule is active. - -ERROR: New name not appearing after 5 minutes: -- Confirm old service is still sending traces with the original `peer.service`: `pup traces search --query "@peer.service:" --from 5m` -- If old name still appears, propagation may still be in progress — wait 2 more minutes and retry -- If neither name appears, confirm `DD_TRACE_PEER_SERVICE_DEFAULTS_ENABLED=true` is set on the instrumented service — without it `peer.service` is never set and the rule will never fire - ---- - -## Managing Existing Rules - -### List all rules - -### Claude runs - -```bash -pup apm service-remapping list -``` - -### Get a single rule - -### Claude runs - -```bash -pup apm service-remapping get -``` - -### Update a rule - -Update requires the current `version` from list/get output. 
Show the proposed changes to the user and confirm before running: - -### Claude runs - -```bash -pup apm service-remapping update \ - --name "" \ - --filter "" \ - --rule-type \ - --value "" \ - --version -``` - -ERROR: `409 Conflict` — the rule was modified since you fetched it. Re-fetch with `get` to get the current version and retry. - -### Delete a rule - -Show the user the rule's name and filter first, then ask for confirmation. Delete requires both the rule `id` and `version` from the list/get output: - -### Claude runs - -```bash -pup apm service-remapping delete -``` - -ERROR: `409 Conflict` — the rule was modified since you fetched it. Re-fetch with `get` to get the current version and retry. - ---- - -## Done - -Exit when ALL of the following are true: -- [ ] Rule shown to user and confirmed before creation -- [ ] Rule created and `id` returned in response -- [ ] New service name visible in `pup apm services list` (SERVICE rules), or — for INFERRED_ENTITY rules, which never appear in `pup apm services list` — visible in the APM Service Map or as `peer.service` on spans from the instrumented service -- [ ] Impacted monitors identified and offered for update -- [ ] User confirmed the remapping matches their intent - ---- - -## Security constraints - -- Never write a raw API key into any file or chat message — always use `$DD_API_KEY` and `$DD_APP_KEY` -- Never create or delete a rule without explicit user confirmation — show the full rule before creating -- Never assume `prod` as the environment — always confirm with the user -- Never run DELETE without showing the user the rule's name and filter first -- Never enable `enabled_org_wide` without explicit user confirmation — it affects the entire org diff --git a/crates/agent-skills/src/lib.rs b/crates/agent-skills/src/lib.rs deleted file mode 100644 index 94d24d8..0000000 --- a/crates/agent-skills/src/lib.rs +++ /dev/null @@ -1,48 +0,0 @@ -pub const DD_APM_SKILL: &str = include_str!("../skills/dd-apm/SKILL.md"); - -pub static DD_APM_SUB_SKILLS: &[(&str, &str)] = &[ - ( - "service-remapping/SKILL.md", - include_str!("../skills/dd-apm/service-remapping/SKILL.md"), - ), - ( - 
"k8s-ssi/agent-install/SKILL.md", - include_str!("../skills/dd-apm/k8s-ssi/agent-install/SKILL.md"), - ), - ( - "k8s-ssi/enable-ssi/SKILL.md", - include_str!("../skills/dd-apm/k8s-ssi/enable-ssi/SKILL.md"), - ), - ( - "k8s-ssi/verify-ssi/SKILL.md", - include_str!("../skills/dd-apm/k8s-ssi/verify-ssi/SKILL.md"), - ), - ( - "k8s-ssi/troubleshoot-ssi/SKILL.md", - include_str!("../skills/dd-apm/k8s-ssi/troubleshoot-ssi/SKILL.md"), - ), - ( - "k8s-ssi/onboarding-summary/SKILL.md", - include_str!("../skills/dd-apm/k8s-ssi/onboarding-summary/SKILL.md"), - ), - ( - "linux-ssi/agent-install/SKILL.md", - include_str!("../skills/dd-apm/linux-ssi/agent-install/SKILL.md"), - ), - ( - "linux-ssi/enable-ssi/SKILL.md", - include_str!("../skills/dd-apm/linux-ssi/enable-ssi/SKILL.md"), - ), - ( - "linux-ssi/verify-ssi/SKILL.md", - include_str!("../skills/dd-apm/linux-ssi/verify-ssi/SKILL.md"), - ), - ( - "linux-ssi/troubleshoot-ssi/SKILL.md", - include_str!("../skills/dd-apm/linux-ssi/troubleshoot-ssi/SKILL.md"), - ), - ( - "linux-ssi/onboarding-summary/SKILL.md", - include_str!("../skills/dd-apm/linux-ssi/onboarding-summary/SKILL.md"), - ), -];