diff --git a/.github/workflows/refresh-embedded-dwarf.yaml b/.github/workflows/refresh-embedded-dwarf.yaml
new file mode 100644
index 0000000..4952523
--- /dev/null
+++ b/.github/workflows/refresh-embedded-dwarf.yaml
@@ -0,0 +1,156 @@
+name: Refresh embedded DWARF for new Ceph releases
+
+# Phase 1 (this file): centos-stream / el9 only. Detects newly-published
+# Ceph point releases (quincy / reef / squid / tentacle) on
+# download.ceph.com, generates osdtrace + radostrace DWARF JSONs for the
+# missing ones inside a disposable centos:stream9 podman container, and
+# opens a follow-up PR with the new files.
+#
+# Phases 2-3 (future): mirror the same detect/generate/PR flow for
+# quay.io container-image build-ids, then for Ubuntu / Cloud Archive /
+# Debian respin pockets, which need their own discovery and host
+# environment. Each phase will be a sibling workflow file in this
+# directory so failures are scoped per-distro.
+
+on:
+  schedule:
+    # Weekly, Monday 06:00 UTC -- well after Ceph upstream's typical
+    # Friday/Tuesday point-release cadence.
+    - cron: '0 6 * * 1'
+  workflow_dispatch:  # also runnable on demand from the Actions UI
+
+permissions:
+  contents: write
+  pull-requests: write
+
+jobs:
+  refresh:
+    runs-on: ubuntu-24.04
+    # Worst case: 15 missing versions * ~6 min/version = 90 min for the
+    # generators alone, plus ~5 min for the host build + ~5 min for the
+    # final rebuild + PR open. 180 min leaves headroom for slow downloads.
+    timeout-minutes: 180
+
+    steps:
+      - name: Checkout code and submodules
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+          fetch-depth: 0
+
+      - name: Install host build deps + podman
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y g++ clang libelf-dev libc6-dev-i386 \
+            libdw-dev python3 podman
+
+      - name: Detect missing DWARF JSONs (centos-stream / el9)
+        id: detect
+        run: |
+          python3 tools/detect_missing_dwarf.py > /tmp/missing.tsv
+          echo "==== missing rows ===="
+          cat /tmp/missing.tsv
+          echo "===================="
+          COUNT=$(wc -l < /tmp/missing.tsv | tr -d ' ')
+          echo "count=$COUNT" >> "$GITHUB_OUTPUT"
+          echo "Detected $COUNT missing JSON-set(s)."
+
+      - name: Exit early if nothing to do
+        if: steps.detect.outputs.count == '0'
+        run: |
+          echo "All upstream RPMs already have embedded DWARF JSONs."
+          echo "No PR will be opened."
+
+      - name: Build cephtrace on the host (used only for the final-rebuild gate)
+        if: steps.detect.outputs.count != '0'
+        run: make -j"$(nproc)" all
+
+      - name: Generate missing JSONs
+        if: steps.detect.outputs.count != '0'
+        id: generate
+        run: |
+          : > /tmp/succeeded.tsv
+          : > /tmp/failed.tsv
+          while IFS=$'\t' read -r distro tools version pkgver url; do
+            if ./tools/gen_dwarf_for_version.sh \
+                "$distro" "$tools" "$version" "$pkgver" </dev/null; then
+              printf '%s\t%s\t%s\t%s\n' "$distro" "$tools" "$version" "$pkgver" \
+                >> /tmp/succeeded.tsv
+            else
+              printf '%s\t%s\t%s\t%s\n' "$distro" "$tools" "$version" "$pkgver" \
+                >> /tmp/failed.tsv
+              echo "::warning::dwarf generation failed for $version"
+            fi
+          done < /tmp/missing.tsv
+
+          S=$(wc -l < /tmp/succeeded.tsv | tr -d ' ')
+          F=$(wc -l < /tmp/failed.tsv | tr -d ' ')
+          echo "Generation summary: $S succeeded, $F failed."
+          echo "succeeded=$S" >> "$GITHUB_OUTPUT"
+          echo "failed=$F" >> "$GITHUB_OUTPUT"
+
+      - name: Re-aggregate embedded DWARF header + relink
+        # This step proves the new JSONs parse cleanly through
+        # tools/generate_embedded_dwarf.py and that osdtrace + radostrace
+        # still link with the larger header. Failing here means one of
+        # the just-generated JSONs is malformed -- we want CI to catch that
+        # before the PR is opened, not after.
+        if: steps.detect.outputs.count != '0' && steps.generate.outputs.succeeded != '0'
+        run: |
+          make clean
+          make -j"$(nproc)" osdtrace radostrace
+
+      - name: Compose pull-request body
+        if: steps.detect.outputs.count != '0' && steps.generate.outputs.succeeded != '0'
+        run: |
+          {
+            echo "## Newly added embedded DWARF JSONs"
+            echo
+            echo "| distro | tools | version | pkgver |"
+            echo "|---|---|---|---|"
+            while IFS=$'\t' read -r d t v p; do
+              printf '| %s | %s | %s | `%s` |\n' "$d" "$t" "$v" "$p"
+            done < /tmp/succeeded.tsv
+            echo
+            if [ -s /tmp/failed.tsv ]; then
+              echo "## Versions that failed to generate"
+              echo
+              echo "These will be retried by the next scheduled run."
+              echo
+              echo '```'
+              cat /tmp/failed.tsv
+              echo '```'
+              echo
+            fi
+            echo "## Verification"
+            echo "- \`tools/detect_missing_dwarf.py\` identified the rows above"
+            echo "  by probing \`download.ceph.com/rpm-X.Y.Z/el9/x86_64/\`."
+            echo "- Each JSON was generated inside a disposable"
+            echo "  \`quay.io/centos/centos:stream9\` container with the"
+            echo "  matching ceph-osd + lib*-debuginfo packages installed."
+            echo "- \`make -j\` re-aggregated the headers and linked"
+            echo "  \`osdtrace\` + \`radostrace\` cleanly."
+            echo
+            echo "_Generated by \`.github/workflows/refresh-embedded-dwarf.yaml\` ($(date -u +'%Y-%m-%d %H:%MZ'))._"
+          } > /tmp/pr_body.md
+          cat /tmp/pr_body.md
+
+      - name: Open pull request
+        if: steps.detect.outputs.count != '0' && steps.generate.outputs.succeeded != '0'
+        uses: peter-evans/create-pull-request@v6
+        with:
+          branch: chore/refresh-embedded-dwarf-${{ github.run_id }}
+          delete-branch: true
+          title: "chore: refresh embedded DWARF for new Ceph releases"
+          commit-message: |
+            chore: refresh embedded DWARF for new Ceph point releases
+
+            Auto-generated by the refresh-embedded-dwarf workflow.
+            See PR body for the list of versions added.
+          body-path: /tmp/pr_body.md
+          labels: |
+            dwarf-refresh
+            automated
+          add-paths: |
+            files/centos-stream/osdtrace/*.json
+            files/centos-stream/radostrace/*.json
diff --git a/tools/detect_missing_dwarf.py b/tools/detect_missing_dwarf.py
new file mode 100755
index 0000000..0ec0982
--- /dev/null
+++ b/tools/detect_missing_dwarf.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+"""Detect Ceph point releases for which we don't yet ship embedded DWARF JSONs.
+
+Phase 1: centos-stream el9 only. Probes download.ceph.com for the ceph-osd
+RPMs of each (major.2.patch) candidate version and diffs against the JSONs
+already present under files/centos-stream/{osdtrace,radostrace}/.
+
+Output (one row per (version, missing-tool-list)) is TSV on stdout so the
+companion shell driver can read it line-by-line:
+
+    centos-stream osdtrace,radostrace 17.2.4 2:17.2.4-0.el9 https://download.ceph.com/rpm-17.2.4/el9/x86_64/ceph-osd-17.2.4-0.el9.x86_64.rpm
+
+The columns are: distro, comma-joined-tool-list, upstream-version,
+package-version-string (matches what `osdtrace -j` records as the JSON's
+`version` field), and the RPM URL the row's existence was inferred from
+(included for traceability / debuggability of CI runs).
+"""
+
+from __future__ import annotations
+
+import sys
+import urllib.request
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parent.parent
+CENTOS_DIR = REPO_ROOT / "files" / "centos-stream"
+DOWNLOAD_BASE = "https://download.ceph.com"
+
+# We only care about the modern lines (quincy, reef, squid, tentacle).
+# Each major has a single (X.2) minor line.
+MAJOR_VERSIONS = [17, 18, 19, 20]
+
+# Centos-stream el9 ceph-osd RPM URL template.
+RPM_URL_TMPL = "{base}/rpm-{ver}/el9/x86_64/ceph-osd-{ver}-0.el9.x86_64.rpm"
+
+# Probe up to this many patch releases per major. 20 is generous; quincy
+# topped out at 17.2.9 and the longest historical Ceph line (octopus) ran
+# through 15.2.17 so this leaves plenty of headroom.
+CANDIDATE_PATCHES = list(range(0, 20))
+
+
+def head(url: str, *, timeout: float = 15.0) -> int:
+    """HTTP HEAD returning the response status, or 0 on any failure.
+
+    Used to probe whether a given RPM URL exists; HEAD is much cheaper than
+    GET and download.ceph.com supports it. urlopen() raises for HTTP error
+    statuses (e.g. 404) as well as for network errors, so both come back as
+    0 and the URL is simply treated as unavailable for this run.
+    """
+    req = urllib.request.Request(url, method="HEAD")
+    try:
+        with urllib.request.urlopen(req, timeout=timeout) as r:
+            return r.status
+    except Exception:
+        return 0
+
+
+def upstream_el9_versions() -> list[str]:
+    """Versions that have an el9 ceph-osd RPM published upstream.
+
+    Probes every (major.2.0 .. major.2.19) combination; cheap (~80 HEAD
+    requests, ~10 s total) and avoids fragile HTML scraping of the directory
+    index. Returns a sorted (version-tuple-ascending) list.
+    """
+    out: list[str] = []
+    for maj in MAJOR_VERSIONS:
+        for patch in CANDIDATE_PATCHES:
+            ver = f"{maj}.2.{patch}"
+            url = RPM_URL_TMPL.format(base=DOWNLOAD_BASE, ver=ver)
+            if head(url) == 200:
+                out.append(ver)
+    return out
+
+
+def existing_versions(tool: str) -> set[str]:
+    """Versions already covered by JSONs under files/centos-stream/<tool>/."""
+    d = CENTOS_DIR / tool
+    if not d.is_dir():
+        return set()
+    prefix = {"osdtrace": "osd-2:", "radostrace": "rados-2:"}[tool]
+    suffix = "-0.el9_dwarf.json"
+    return {
+        name[len(prefix):-len(suffix)]
+        for name in (p.name for p in d.iterdir())
+        if name.startswith(prefix) and name.endswith(suffix)
+    }
+
+
+def version_key(v: str) -> tuple[int, ...]:
+    return tuple(int(x) for x in v.split("."))
+
+
+def main() -> None:
+    upstream = upstream_el9_versions()
+    if not upstream:
+        # Treat a fully-empty probe set as a hard error: it almost always
+        # means download.ceph.com is unreachable from the runner. Printing
+        # zero rows instead would make the workflow report "nothing to do"
+        # and hide the outage.
+        print("ERROR: no upstream RPMs detected; aborting", file=sys.stderr)
+        sys.exit(1)
+
+    osd_have = existing_versions("osdtrace")
+    rados_have = existing_versions("radostrace")
+
+    # Group missing-tool sets by version so one container session can
+    # generate both JSONs for the same version.
+    missing: dict[str, list[str]] = {}
+    for ver in upstream:
+        tools: list[str] = []
+        if ver not in osd_have:
+            tools.append("osdtrace")
+        if ver not in rados_have:
+            tools.append("radostrace")
+        if tools:
+            missing[ver] = tools
+
+    for ver in sorted(missing, key=version_key):
+        tools = missing[ver]
+        url = RPM_URL_TMPL.format(base=DOWNLOAD_BASE, ver=ver)
+        print(
+            "\t".join(
+                [
+                    "centos-stream",
+                    ",".join(sorted(tools)),
+                    ver,
+                    f"2:{ver}-0.el9",
+                    url,
+                ]
+            )
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/gen_dwarf_for_version.sh b/tools/gen_dwarf_for_version.sh
new file mode 100755
index 0000000..2abad2c
--- /dev/null
+++ b/tools/gen_dwarf_for_version.sh
@@ -0,0 +1,162 @@
+#!/bin/bash
+# Generate embedded-DWARF JSON(s) for a specific (distro, version) of Ceph
+# using a disposable podman container.
+#
+# Usage:
+#   gen_dwarf_for_version.sh <distro> <tools> <version> <pkgver>
+#
+#   distro:  centos-stream  (only one supported in Phase 1)
+#   tools:   comma-separated, subset of {osdtrace,radostrace}
+#   version: e.g. 17.2.4  -- the upstream RPM version
+#   pkgver:  e.g. 2:17.2.4-0.el9  -- recorded in JSON "version" field
+#
+# Side-effect: writes the JSON(s) under files/<distro>/<tool>/ in the repo
+# (the repo is bind-mounted into the container, so the writes appear on
+# the host immediately and the caller's `git status` shows them).
+#
+# The hard part of running `osdtrace -j` / `radostrace -j` is that both
+# tools require a live PID whose /proc/<pid>/exe resolves to ceph-osd
+# (so the DWARF parser can open the on-disk binary). In a no-cluster
+# container we have no naturally-running ceph-osd; we synthesise one by
+# starting ceph-osd under gdb with `starti`, which stops the inferior at
+# its very first instruction (the dynamic loader's entry point: before any
+# shared lib is loaded or main runs). /proc/<pid>/exe is then valid and
+# stable for the lifetime of the gdb session, which we keep alive with a
+# `shell` infinite-sleep command.
+
+set -euo pipefail
+
+usage() { echo "usage: $0 <distro> <tools> <version> <pkgver>" >&2; exit 2; }
+
+[ $# -eq 4 ] || usage
+DISTRO=$1
+TOOLS=$2
+VERSION=$3
+PKGVER=$4
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
+case "$DISTRO" in
+  centos-stream) ;;
+  *) echo "ERROR: unsupported distro $DISTRO (only centos-stream in phase 1)" >&2; exit 2 ;;
+esac
+
+CTR="dwarfgen-${VERSION//./-}-$$"
+cleanup() { podman rm -f "$CTR" >/dev/null 2>&1 || true; }
+trap cleanup EXIT
+
+echo "==> generating DWARF for centos-stream ${VERSION} (tools: ${TOOLS})"
+
+# Run with podman's default user mapping (no --userns=keep-id). Rootless
+# podman already maps container root to the invoking host user, so files
+# written to the bind-mounted repo stay owned by that user. keep-id would
+# instead run every `podman exec` as the non-root host UID, and the dnf/rpm
+# installs below need root inside the container. --privileged isn't needed:
+# we don't manipulate the BPF subsystem from inside the container.
+podman run -d --rm --name="$CTR" \
+    -v "$REPO_ROOT":/workspace:Z \
+    --workdir /workspace \
+    quay.io/centos/centos:stream9 sleep infinity >/dev/null
+
+echo "==> installing build deps + ceph ${VERSION} + debuginfo"
+
+# crb enables glibc-devel.i686 + clang. The build needs gdb only for the
+# starti trick below.
+podman exec "$CTR" dnf install -y --enablerepo=crb \
+    gcc gcc-c++ clang make \
+    elfutils-libelf-devel elfutils-devel \
+    glibc-devel glibc-devel.i686 \
+    python3 openssl-devel \
+    gdb curl which >/dev/null
+
+# Install ceph-osd + the three core libraries plus *every* matching debuginfo
+# subpackage. ceph-debuginfo + ceph-debugsource carry the inlined-frame
+# info that osdtrace's DWARF walker needs even when the symbol it's
+# resolving is in a per-binary -debuginfo package.
+podman exec "$CTR" bash -ec "
+    cd /tmp
+    pkgs='ceph-osd ceph-common librbd1 librados2
+          ceph-osd-debuginfo ceph-common-debuginfo
+          librbd1-debuginfo librados2-debuginfo
+          ceph-debuginfo ceph-debugsource'
+    for p in \$pkgs; do
+        curl -sSfLO https://download.ceph.com/rpm-${VERSION}/el9/x86_64/\${p}-${VERSION}-0.el9.x86_64.rpm
+    done
+    rpm -ivh --force --nodeps /tmp/*.rpm >/dev/null
+"
+
+echo "==> building cephtrace inside the container"
+
+# Always start from a clean .output so the previous host build's libbpf.a
+# (compiled against a newer glibc, with __isoc23_strtoull etc.) doesn't
+# pollute the el9 link. This is the same trap we hit during the manual
+# 17.2.8 / 17.2.9 prep work for PR #106.
+podman exec --workdir=/workspace "$CTR" bash -ec '
+    rm -rf .output
+    make -j"$(nproc)" osdtrace radostrace >/dev/null
+'
+
+echo "==> starting holder process (gdb starti on ceph-osd --version)"
+
+# starti runs ceph-osd only up to its very first instruction -- for a
+# dynamically linked binary that is ld.so's entry point, before any shared
+# library is mapped -- and stops it there. None of ceph-osd's own initialisers
+# run, so it is harmless to hold. The trailing `shell` command keeps gdb alive.
+# NB: the shell child records $PPID (gdb, ceph-osd's parent), not its own $$.
+podman exec "$CTR" bash -ec '
+    rm -f /tmp/osd_holder.pid /tmp/osd_pid
+    nohup gdb -nx -batch-silent \
+        -ex "set follow-fork-mode parent" \
+        -ex "set pagination off" \
+        -ex "starti" \
+        -ex "shell echo \$PPID > /tmp/osd_holder.pid; while true; do sleep 60; done" \
+        --args /usr/bin/ceph-osd --version >/tmp/gdb.log 2>&1 &
+    for i in $(seq 1 60); do
+        [ -s /tmp/osd_holder.pid ] && break
+        sleep 0.5
+    done
+'
+
+OSD_PID=$(podman exec "$CTR" bash -ec '
+    HOLDER=$(cat /tmp/osd_holder.pid 2>/dev/null || true)
+    [ -n "$HOLDER" ] || { echo "gdb holder did not start" >&2; cat /tmp/gdb.log >&2; exit 1; }
+    OSD=$(pgrep -P "$HOLDER" -x ceph-osd || true)
+    [ -n "$OSD" ] || { echo "ceph-osd subprocess not found" >&2; ps -ef >&2; exit 1; }
+    echo "$OSD"
+')
+
+echo "    ceph-osd holder PID: $OSD_PID"
+
+for tool in ${TOOLS//,/ }; do
+    case "$tool" in
+        osdtrace)
+            out="files/centos-stream/osdtrace/osd-${PKGVER}_dwarf.json"
+            ;;
+        radostrace)
+            # radostrace's DWARF parse target is librados/librbd/libceph-common.
+            # It resolves library paths via /proc/<pid>/root (a chroot-based
+            # filesystem walk), not via /proc/<pid>/maps -- so the holder needn't
+            # have those libraries *loaded*; they only have to be installed in
+            # the same mount namespace, which the package install above ensures.
+            out="files/centos-stream/radostrace/rados-${PKGVER}_dwarf.json"
+            ;;
+        *)
+            echo "ERROR: unknown tool $tool" >&2
+            exit 2
+            ;;
+    esac
+    echo "==> generating ${tool} JSON -> ${out}"
+    podman exec --workdir=/workspace "$CTR" \
+        ./"$tool" -j "$out" -p "$OSD_PID" >"/tmp/${tool}-${VERSION}.log" 2>&1 || {
+        echo "ERROR: ${tool} -j failed; last 20 lines of /tmp/${tool}-${VERSION}.log:" >&2
+        tail -20 "/tmp/${tool}-${VERSION}.log" >&2 || true
+        exit 1
+    }
+    # Sanity: the output file exists on the host and is non-empty.
+    if ! [ -s "$REPO_ROOT/$out" ]; then
+        echo "ERROR: $out was not written" >&2
+        exit 1
+    fi
+done
+
+echo "==> done: DWARF JSON(s) for ${VERSION} written under files/centos-stream/"