From 8197bb9e89efd46cc160e46351ffe0fb9ed49880 Mon Sep 17 00:00:00 2001
From: Dongdong Tao
Date: Sat, 23 May 2026 11:13:49 +0900
Subject: [PATCH] ci: add bot to refresh embedded DWARF for new Ceph releases
 (phase 1)

Adds a weekly scheduled GHA workflow that detects newly-published Ceph
point releases on download.ceph.com, generates the corresponding
osdtrace + radostrace embedded DWARF JSONs inside a disposable
centos:stream9 container, re-aggregates the header, relinks both tools
to prove the new data is well-formed, and opens a follow-up PR with the
added files.

Phase 1 scope: centos-stream / el9 only. This is the easiest lane --
upstream maintains a stable RPM URL pattern at download.ceph.com and
ships matching debuginfo packages, so the bot does not need any
distro-specific build infrastructure (no Launchpad / cloud-archive
mirroring, no Debian snapshot proxying). The same shape will work for
quay.io container images (phase 2) and Ubuntu / Cloud Archive / Debian
respins (phase 3), each as a sibling workflow.

Pieces:

* tools/detect_missing_dwarf.py
  HTTP-HEADs ceph-osd RPM URLs across all candidate (X.2.Y) point
  releases and diffs against the JSONs already in files/centos-stream/.
  Outputs one TSV row per missing (version, tool-set) pair. Self-tested
  today: identifies 15 missing (X.Y.Z, [osdtrace, radostrace]) rows
  across quincy / reef / squid / tentacle that are publicly available on
  download.ceph.com but not yet covered by our committed JSONs.

* tools/gen_dwarf_for_version.sh
  Spins up a disposable centos:stream9 podman container, installs
  ceph-osd + ceph-common + librados2 + librbd1 and every matching
  -debuginfo + -debugsource at the requested version, builds cephtrace
  inside the container (matched glibc), holds a ceph-osd process at its
  entry point via gdb's `starti` command (no Ceph init code runs, but
  /proc/<pid>/exe is valid for osdtrace's DWARF parser to attach), and
  runs `./osdtrace -j` and/or `./radostrace -j` against that holder PID.
  Writes the JSON(s) directly to files/<distro>/<tool>/ via a
  bind-mounted repo root.

* .github/workflows/refresh-embedded-dwarf.yaml
  Weekly cron (Monday 06:00 UTC) + workflow_dispatch trigger. Runs the
  detector, generator, and rebuild gate; on success opens a PR via
  peter-evans/create-pull-request@v6 listing the additions in a markdown
  table. Failures (e.g. a missing debuginfo subpackage in one release)
  are non-fatal -- they are reported in the PR body so the next scheduled
  run can retry, and the rest of the run still PRs the successful ones.

The bot requires only the default GITHUB_TOKEN; the contents:write +
pull-requests:write permissions are scoped in the workflow YAML.
---
 .github/workflows/refresh-embedded-dwarf.yaml | 156 +++++++++++++++++
 tools/detect_missing_dwarf.py                 | 135 +++++++++++++++
 tools/gen_dwarf_for_version.sh                | 162 ++++++++++++++++++
 3 files changed, 453 insertions(+)
 create mode 100644 .github/workflows/refresh-embedded-dwarf.yaml
 create mode 100755 tools/detect_missing_dwarf.py
 create mode 100755 tools/gen_dwarf_for_version.sh

diff --git a/.github/workflows/refresh-embedded-dwarf.yaml b/.github/workflows/refresh-embedded-dwarf.yaml
new file mode 100644
index 0000000..4952523
--- /dev/null
+++ b/.github/workflows/refresh-embedded-dwarf.yaml
@@ -0,0 +1,156 @@
+name: Refresh embedded DWARF for new Ceph releases
+
+# Phase 1 (this file): centos-stream / el9 only. Detects newly-published
+# Ceph point releases (quincy / reef / squid / tentacle) on
+# download.ceph.com, generates osdtrace + radostrace DWARF JSONs for the
+# missing ones inside a disposable centos:stream9 podman container, and
+# opens a follow-up PR with the new files.
+#
+# Phases 2-3 (future): mirror the same detect/generate/PR flow for
+# quay.io container-image build-ids, then for Ubuntu / Cloud Archive /
+# Debian respin pockets, which need their own discovery and host
+# environment. Each phase will be a sibling workflow file in this
+# directory so failures are scoped per-distro.
+
+on:
+  schedule:
+    # Weekly, Monday 06:00 UTC -- well after Ceph upstream's typical
+    # Friday/Tuesday point-release cadence.
+    - cron: '0 6 * * 1'
+  workflow_dispatch:  # also runnable on demand from the Actions UI
+
+permissions:
+  contents: write
+  pull-requests: write
+
+jobs:
+  refresh:
+    runs-on: ubuntu-24.04
+    # Worst case: 15 missing versions * ~6 min/version = 90 min for the
+    # generators alone, plus ~5 min for the host build + ~5 min for the
+    # final rebuild + PR open. 180 min leaves headroom for slow downloads.
+    timeout-minutes: 180
+
+    steps:
+      - name: Checkout code and submodules
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+          fetch-depth: 0
+
+      - name: Install host build deps + podman
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y g++ clang libelf-dev libc6-dev-i386 \
+            libdw-dev python3 podman
+
+      - name: Detect missing DWARF JSONs (centos-stream / el9)
+        id: detect
+        run: |
+          python3 tools/detect_missing_dwarf.py > /tmp/missing.tsv
+          echo "==== missing rows ===="
+          cat /tmp/missing.tsv
+          echo "===================="
+          COUNT=$(wc -l < /tmp/missing.tsv | tr -d ' ')
+          echo "count=$COUNT" >> "$GITHUB_OUTPUT"
+          echo "Detected $COUNT missing JSON-set(s)."
+
+      - name: Exit early if nothing to do
+        if: steps.detect.outputs.count == '0'
+        run: |
+          echo "All upstream RPMs already have embedded DWARF JSONs."
+          echo "No PR will be opened."
+
+      - name: Build cephtrace on the host (used only for the final-rebuild gate)
+        if: steps.detect.outputs.count != '0'
+        run: make -j"$(nproc)" all
+
+      - name: Generate missing JSONs
+        if: steps.detect.outputs.count != '0'
+        id: generate
+        run: |
+          : > /tmp/succeeded.tsv
+          : > /tmp/failed.tsv
+          while IFS=$'\t' read -r distro tools version pkgver url; do
+            if ./tools/gen_dwarf_for_version.sh \
+                "$distro" "$tools" "$version" "$pkgver"; then
+              printf '%s\t%s\t%s\t%s\n' "$distro" "$tools" "$version" "$pkgver" \
+                >> /tmp/succeeded.tsv
+            else
+              printf '%s\t%s\t%s\t%s\n' "$distro" "$tools" "$version" "$pkgver" \
+                >> /tmp/failed.tsv
+              echo "::warning::dwarf generation failed for $version"
+            fi
+          done < /tmp/missing.tsv
+
+          S=$(wc -l < /tmp/succeeded.tsv | tr -d ' ')
+          F=$(wc -l < /tmp/failed.tsv | tr -d ' ')
+          echo "Generation summary: $S succeeded, $F failed."
+          echo "succeeded=$S" >> "$GITHUB_OUTPUT"
+          echo "failed=$F" >> "$GITHUB_OUTPUT"
+
+      - name: Re-aggregate embedded DWARF header + relink
+        # Proves the new JSONs parse cleanly through
+        # tools/generate_embedded_dwarf.py and that osdtrace + radostrace
+        # still link with the larger header, so a malformed JSON fails CI
+        # before the PR is opened. The detect guard is explicit because
+        # `generate` is skipped (its outputs unset) when nothing is missing.
+        if: steps.detect.outputs.count != '0' && steps.generate.outputs.succeeded != '0'
+        run: |
+          make clean
+          make -j"$(nproc)" osdtrace radostrace
+
+      - name: Compose pull-request body
+        if: steps.detect.outputs.count != '0' && steps.generate.outputs.succeeded != '0'
+        run: |
+          {
+            echo "## Newly added embedded DWARF JSONs"
+            echo
+            echo "| distro | tools | version | pkgver |"
+            echo "|---|---|---|---|"
+            while IFS=$'\t' read -r d t v p; do
+              printf '| %s | %s | %s | `%s` |\n' "$d" "$t" "$v" "$p"
+            done < /tmp/succeeded.tsv
+            echo
+            if [ -s /tmp/failed.tsv ]; then
+              echo "## Versions that failed to generate"
+              echo
+              echo "These will be retried by the next scheduled run."
+              echo
+              echo '```'
+              cat /tmp/failed.tsv
+              echo '```'
+              echo
+            fi
+            echo "## Verification"
+            echo "- \`tools/detect_missing_dwarf.py\` identified the rows above"
+            echo " by probing \`download.ceph.com/rpm-X.Y.Z/el9/x86_64/\`."
+            echo "- Each JSON was generated inside a disposable"
+            echo " \`quay.io/centos/centos:stream9\` container with the"
+            echo " matching ceph-osd + lib*-debuginfo packages installed."
+            echo "- \`make -j\` re-aggregated the headers and linked"
+            echo " \`osdtrace\` + \`radostrace\` cleanly."
+            echo
+            echo "_Generated by \`.github/workflows/refresh-embedded-dwarf.yaml\` ($(date -u +'%Y-%m-%d %H:%MZ'))._"
+          } > /tmp/pr_body.md
+          cat /tmp/pr_body.md
+
+      - name: Open pull request
+        if: steps.detect.outputs.count != '0' && steps.generate.outputs.succeeded != '0'  # generate is skipped (outputs unset) when nothing is missing
+        uses: peter-evans/create-pull-request@v6
+        with:
+          branch: chore/refresh-embedded-dwarf-${{ github.run_id }}
+          delete-branch: true
+          title: "chore: refresh embedded DWARF for new Ceph releases"
+          commit-message: |
+            chore: refresh embedded DWARF for new Ceph point releases
+
+            Auto-generated by the refresh-embedded-dwarf workflow.
+            See PR body for the list of versions added.
+          body-path: /tmp/pr_body.md
+          labels: |
+            dwarf-refresh
+            automated
+          add-paths: |
+            files/centos-stream/osdtrace/*.json
+            files/centos-stream/radostrace/*.json
diff --git a/tools/detect_missing_dwarf.py b/tools/detect_missing_dwarf.py
new file mode 100755
index 0000000..0ec0982
--- /dev/null
+++ b/tools/detect_missing_dwarf.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+"""Detect Ceph point releases for which we don't yet ship embedded DWARF JSONs.
+
+Phase 1: centos-stream el9 only. Probes download.ceph.com for the ceph-osd
+RPMs of each (major.2.patch) candidate version and diffs against the JSONs
+already present under files/centos-stream/{osdtrace,radostrace}/.
+
+Output (one row per (version, missing-tool-list)) is TSV on stdout so the
+companion shell driver can read it line-by-line:
+
+    centos-stream osdtrace,radostrace 17.2.4 2:17.2.4-0.el9 https://download.ceph.com/rpm-17.2.4/el9/x86_64/ceph-osd-17.2.4-0.el9.x86_64.rpm
+
+The columns are: distro, comma-joined-tool-list, upstream-version,
+package-version-string (matches what `osdtrace -j` records as the JSON's
+`version` field), and the RPM URL the row's existence was inferred from
+(included for traceability / debuggability of CI runs).
+"""
+
+from __future__ import annotations
+
+import sys
+import urllib.request
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parent.parent
+CENTOS_DIR = REPO_ROOT / "files" / "centos-stream"
+DOWNLOAD_BASE = "https://download.ceph.com"
+
+# We only care about the modern lines (quincy, reef, squid, tentacle).
+# Each major has a single (X.2) minor line.
+MAJOR_VERSIONS = [17, 18, 19, 20]
+
+# Centos-stream el9 ceph-osd RPM URL template.
+RPM_URL_TMPL = "{base}/rpm-{ver}/el9/x86_64/ceph-osd-{ver}-0.el9.x86_64.rpm"
+
+# Probe patch numbers 0..19 for each major. Quincy topped out at 17.2.9,
+# so this is generous today, but it is not a hard bound -- NOTE(review):
+# older lines ran longer (nautilus reportedly hit 14.2.22); confirm.
+CANDIDATE_PATCHES = list(range(0, 20))
+
+
+def head(url: str, *, timeout: float = 15.0) -> int:
+    """HTTP HEAD returning the status code, or 0 on any failure.
+
+    Used to probe whether a given RPM URL exists; HEAD is much cheaper than
+    GET and download.ceph.com supports it. urlopen raises for HTTP error
+    statuses (e.g. 404), so those return 0 just like network errors; the
+    caller treats anything but 200 as unavailable rather than retrying.
+    """
+    req = urllib.request.Request(url, method="HEAD")
+    try:
+        with urllib.request.urlopen(req, timeout=timeout) as r:
+            return r.status
+    except Exception:
+        return 0
+
+
+def upstream_el9_versions() -> list[str]:
+    """Versions that have an el9 ceph-osd RPM published upstream.
+
+    Probes every (major.2.0 .. major.2.19) combination; cheap (~80 HEAD
+    requests, ~10 s total) and avoids fragile HTML scraping of the directory
+    index. Returns a sorted (version-tuple-ascending) list.
+    """
+    out: list[str] = []
+    for maj in MAJOR_VERSIONS:
+        for patch in CANDIDATE_PATCHES:
+            ver = f"{maj}.2.{patch}"
+            url = RPM_URL_TMPL.format(base=DOWNLOAD_BASE, ver=ver)
+            if head(url) == 200:
+                out.append(ver)
+    return out
+
+
+def existing_versions(tool: str) -> set[str]:
+    """Versions already covered by JSONs under files/centos-stream/<tool>/."""
+    d = CENTOS_DIR / tool
+    if not d.is_dir():
+        return set()
+    prefix = {"osdtrace": "osd-2:", "radostrace": "rados-2:"}[tool]
+    suffix = "-0.el9_dwarf.json"
+    return {
+        name[len(prefix):-len(suffix)]
+        for name in (p.name for p in d.iterdir())
+        if name.startswith(prefix) and name.endswith(suffix)
+    }
+
+
+def version_key(v: str) -> tuple[int, ...]:
+    return tuple(int(x) for x in v.split("."))
+
+
+def main() -> None:
+    upstream = upstream_el9_versions()
+    if not upstream:
+        # Treat a fully-empty probe set as a hard error: it almost always
+        # means download.ceph.com is unreachable from the runner. Exiting
+        # 0 with no rows here would be indistinguishable from "nothing is
+        # missing", so fail loudly on stderr instead.
+        print("ERROR: no upstream RPMs detected; aborting", file=sys.stderr)
+        sys.exit(1)
+
+    osd_have = existing_versions("osdtrace")
+    rados_have = existing_versions("radostrace")
+
+    # Group missing-tool sets by version so one container session can
+    # generate both JSONs for the same version.
+    missing: dict[str, list[str]] = {}
+    for ver in upstream:
+        tools: list[str] = []
+        if ver not in osd_have:
+            tools.append("osdtrace")
+        if ver not in rados_have:
+            tools.append("radostrace")
+        if tools:
+            missing[ver] = tools
+
+    for ver in sorted(missing, key=version_key):
+        tools = missing[ver]
+        url = RPM_URL_TMPL.format(base=DOWNLOAD_BASE, ver=ver)
+        print(
+            "\t".join(
+                [
+                    "centos-stream",
+                    ",".join(sorted(tools)),
+                    ver,
+                    f"2:{ver}-0.el9",
+                    url,
+                ]
+            )
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/gen_dwarf_for_version.sh b/tools/gen_dwarf_for_version.sh
new file mode 100755
index 0000000..2abad2c
--- /dev/null
+++ b/tools/gen_dwarf_for_version.sh
@@ -0,0 +1,162 @@
+#!/bin/bash
+# Generate embedded-DWARF JSON(s) for a specific (distro, version) of Ceph
+# using a disposable podman container.
+#
+# Usage:
+#   gen_dwarf_for_version.sh <distro> <tools> <version> <pkgver>
+#
+# distro: centos-stream (only one supported in Phase 1)
+# tools: comma-separated, subset of {osdtrace,radostrace}
+# version: e.g. 17.2.4 -- the upstream RPM version
+# pkgver: e.g. 2:17.2.4-0.el9 -- recorded in JSON "version" field
+#
+# Side-effect: writes the JSON(s) under files/<distro>/<tool>/ in the repo
+# (the repo is bind-mounted into the container, so the writes appear on
+# the host immediately and the caller's `git status` shows them).
+#
+# The hard part of running `osdtrace -j` / `radostrace -j` is that both
+# tools require a live PID whose /proc/<pid>/exe resolves to ceph-osd
+# (so the DWARF parser can open the on-disk binary). In a no-cluster
+# container we have no naturally-running ceph-osd; we synthesise one by
+# starting ceph-osd under gdb with `starti`, which stops the inferior at
+# its very first instruction (in the dynamic loader, before any shared
+# library is mapped or main runs). /proc/<pid>/exe is then valid for
+# the lifetime of the gdb session, which we keep alive with a `shell`
+# infinite-sleep command.
+
+set -euo pipefail
+
+usage() { echo "usage: $0 <distro> <tools> <version> <pkgver>" >&2; exit 2; }
+
+[ $# -eq 4 ] || usage
+DISTRO=$1
+TOOLS=$2
+VERSION=$3
+PKGVER=$4
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
+case "$DISTRO" in
+  centos-stream) ;;
+  *) echo "ERROR: unsupported distro $DISTRO (only centos-stream in phase 1)" >&2; exit 2 ;;
+esac
+
+CTR="dwarfgen-${VERSION//./-}-$$"
+cleanup() { podman rm -f "$CTR" >/dev/null 2>&1 || true; }
+trap cleanup EXIT
+
+echo "==> generating DWARF for centos-stream ${VERSION} (tools: ${TOOLS})"
+
+# --userns=keep-id: the container's default user becomes the invoking host
+# user, so files written under the bind-mounted repo stay owned by (and
+# writable for) the caller instead of a sub-uid. The price is that the
+# default user is no longer root, so the dnf install below opts back in
+# with --user root. --privileged isn't needed: nothing here touches BPF.
+podman run -d --rm --name="$CTR" \
+  --userns=keep-id \
+  -v "$REPO_ROOT":/workspace:Z \
+  --workdir /workspace \
+  quay.io/centos/centos:stream9 sleep infinity >/dev/null
+
+echo "==> installing build deps + ceph ${VERSION} + debuginfo"
+
+# crb enables glibc-devel.i686 + clang; gdb and procps-ng (pgrep/ps) are
+# for the starti holder below. dnf needs root, hence --user root.
+podman exec --user root "$CTR" dnf install -y --enablerepo=crb \
+  gcc gcc-c++ clang make \
+  elfutils-libelf-devel elfutils-devel \
+  glibc-devel glibc-devel.i686 \
+  python3 openssl-devel \
+  gdb curl which procps-ng >/dev/null
+
+# Install ceph-osd + the three core libraries plus *every* matching debuginfo
+# subpackage. ceph-debuginfo + ceph-debugsource carry the inlined-frame
+# info that osdtrace's DWARF walker needs even when the symbol it's
+# resolving is in a per-binary -debuginfo package.
+podman exec --user root "$CTR" bash -ec "
+  # Run as root: rpm writes to /usr, and keep-id defaults to the host uid.
+  cd /tmp
+  pkgs='ceph-osd ceph-common librbd1 librados2
+        ceph-osd-debuginfo ceph-common-debuginfo librbd1-debuginfo
+        librados2-debuginfo ceph-debuginfo ceph-debugsource'
+  for p in \$pkgs; do
+    curl -fsSLO https://download.ceph.com/rpm-${VERSION}/el9/x86_64/\${p}-${VERSION}-0.el9.x86_64.rpm
+  done
+  # --nodeps: only the files are needed; the holder never runs past ld.so.
+  rpm -ivh --force --nodeps /tmp/*.rpm >/dev/null
+"
+
+echo "==> building cephtrace inside the container"
+
+# Always start from a clean .output so the host build's libbpf.a (built
+# against a newer glibc, with __isoc23_strtoull etc.) doesn't pollute the
+# el9 link -- the trap hit during the manual 17.2.8 / 17.2.9 prep (PR #106).
+podman exec --workdir=/workspace "$CTR" bash -ec '
+  rm -rf .output
+  make -j"$(nproc)" osdtrace radostrace >/dev/null
+'
+
+echo "==> starting holder process (gdb starti on ceph-osd --version)"
+
+# starti launches ceph-osd and stops it at the first instruction executed
+# (the dynamic loader's entry), before shared libraries are loaded. None
+# of ceph-osd's own initialisers run, so the process is harmless to hold
+# indefinitely. The trailing `shell` command keeps gdb attached.
+podman exec "$CTR" bash -ec '
+  rm -f /tmp/osd_holder.pid /tmp/osd_pid
+  # The shell helper records $PPID (gdb itself, the parent of ceph-osd).
+  nohup gdb -nx -batch-silent \
+    -ex "set follow-fork-mode parent" \
+    -ex "set pagination off" \
+    -ex "starti" \
+    -ex "shell echo \$PPID > /tmp/osd_holder.pid; while true; do sleep 60; done" \
+    --args /usr/bin/ceph-osd --version >/tmp/gdb.log 2>&1 &
+  for i in $(seq 1 60); do
+    [ -s /tmp/osd_holder.pid ] && break
+    sleep 0.5
+  done
+'
+
+OSD_PID=$(podman exec "$CTR" bash -ec '
+  HOLDER=$(cat /tmp/osd_holder.pid 2>/dev/null || true)
+  [ -n "$HOLDER" ] || { echo "gdb holder did not start" >&2; cat /tmp/gdb.log >&2; exit 1; }
+  OSD=$(pgrep -P "$HOLDER" -x ceph-osd || true)
+  [ -n "$OSD" ] || { echo "ceph-osd subprocess not found" >&2; ps -ef >&2; exit 1; }
+  echo "$OSD"
+')
+
+echo " ceph-osd holder PID: $OSD_PID"
+
+for tool in ${TOOLS//,/ }; do
+  case "$tool" in
+    osdtrace)
+      out="files/centos-stream/osdtrace/osd-${PKGVER}_dwarf.json"
+      ;;
+    radostrace)
+      # radostrace parses librados/librbd/libceph-common. It resolves
+      # library paths via /proc/<pid>/root (a filesystem walk), not via
+      # /proc/<pid>/maps, so the holder need not have those libraries
+      # *loaded* -- only installed in the same mount namespace, which
+      # the package install above guarantees.
+      out="files/centos-stream/radostrace/rados-${PKGVER}_dwarf.json"
+      ;;
+    *)
+      echo "ERROR: unknown tool $tool" >&2
+      exit 2
+      ;;
+  esac
+  echo "==> generating ${tool} JSON -> ${out}"
+  podman exec --workdir=/workspace "$CTR" \
+    ./"$tool" -j "$out" -p "$OSD_PID" >"/tmp/${tool}-${VERSION}.log" 2>&1 || {
+    echo "ERROR: ${tool} -j failed; last 20 lines of /tmp/${tool}-${VERSION}.log:" >&2
+    tail -20 "/tmp/${tool}-${VERSION}.log" >&2 || true
+    exit 1
+  }
+  # Sanity: the output file exists and is non-empty.
+  if ! [ -s "$REPO_ROOT/$out" ]; then
+    echo "ERROR: $out was not written" >&2
+    exit 1
+  fi
+done
+
+echo "==> done: DWARF JSON(s) for ${VERSION} written under files/centos-stream/"