From cf846e9917cda6cd485732f51f2d8d3c7956d2e2 Mon Sep 17 00:00:00 2001 From: Zhang Hua Date: Thu, 14 May 2026 22:10:10 +0800 Subject: [PATCH] Report incomplete ops on SIGINT or timeout Previously, radostrace silently discarded any ops that were in-flight (tracked in the BPF ops map via _send_op) but had not yet completed (_finish_op not yet fired) when the session ended. This made it impossible for users to identify hung or slow operations when interrupting a trace or when a timeout expired. This commit adds a report_hung_ops() function that iterates the BPF ops hash map at exit and prints any remaining entries as "incomplete" rows, using the same tabular format as the normal "complete" output. Signed-off-by: Zhang Hua --- src/radostrace.bpf.c | 14 +- src/radostrace.cc | 270 ++++++++++++++++++++--------- tests/functional-test-microceph.sh | 89 +++++++++- tests/lib/verify-trace-output.sh | 4 +- 4 files changed, 288 insertions(+), 89 deletions(-) diff --git a/src/radostrace.bpf.c b/src/radostrace.bpf.c index f6bde46..b7bcb26 100644 --- a/src/radostrace.bpf.c +++ b/src/radostrace.bpf.c @@ -82,7 +82,6 @@ int uprobe_send_op(struct pt_regs *ctx) { return 0; } memset(val, 0, sizeof(struct client_op_v)); - val->sent_stamp = bpf_ktime_get_boot_ns(); val->tid = key.tid; val->cid = key.cid; val->rw = 0; @@ -180,6 +179,7 @@ int uprobe_send_op(struct pt_regs *ctx) { bpf_printk("uprobe_send_op got M_start %d\n", M_start); } else { bpf_printk("uprobe_send_op got NULL vf at varid %d\n", varid); + bpf_map_delete_elem(&ops, &key); return 0; } @@ -195,6 +195,7 @@ int uprobe_send_op(struct pt_regs *ctx) { bpf_printk("uprobe_send_op got m_finish %d\n", m_finish); } else { bpf_printk("uprobe_send_op got NULL vf at varid %d\n", varid); + bpf_map_delete_elem(&ops, &key); return 0; } @@ -220,6 +221,7 @@ int uprobe_send_op(struct pt_regs *ctx) { bpf_printk("uprobe_send_op got m_start %lld\n", m_start); } else { bpf_printk("uprobe_send_op got NULL vf at varid %d\n", varid); + bpf_map_delete_elem(&ops, &key); return 0; } @@ -234,10 +236,13 @@ int uprobe_send_op(struct pt_regs *ctx) { bpf_printk("uprobe_send_op got m_start %d\n", val->ops_size); } else { bpf_printk("uprobe_send_op got NULL vf at varid %d\n", varid); + bpf_map_delete_elem(&ops, &key); return 0; } - val->ops_size &= 3; + if (val->ops_size > 3) { + val->ops_size = 3; + } val->offset = 0; val->length = 0; for (__u32 i = 0; i < 3; ++i) { @@ -276,6 +281,9 @@ int uprobe_send_op(struct pt_regs *ctx) { } } + val->sent_stamp = bpf_ktime_get_boot_ns(); + val->pid = get_pid(); + //bpf_map_update_elem(&ops, &key, val, 0);// no need to update again here return 0; } @@ -318,10 +326,10 @@ int uprobe_finish_op(struct pt_regs *ctx) { return 0; } opv->finish_stamp = bpf_ktime_get_boot_ns(); - opv->pid = get_pid(); // submit to ringbuf struct client_op_v *e = bpf_ringbuf_reserve(&rb, sizeof(struct client_op_v), 0); if (NULL == e) { + bpf_map_delete_elem(&ops, &key); return 0; } *e = *opv; diff --git a/src/radostrace.cc b/src/radostrace.cc index eec2b13..a18a909 100644 --- a/src/radostrace.cc +++ b/src/radostrace.cc @@ -2,6 +2,7 @@ //observe the client <-> osds latency for each request. #include +#include #include #include #include @@ -85,11 +86,20 @@ DwarfParser::probes_t rados_probes = { }; volatile sig_atomic_t timeout_occurred = 0; +volatile sig_atomic_t got_sigint = 0; // CSV Output bool export_csv = false; std::string csv_output_file = "radostrace_events.csv"; +static __u64 get_boottime_ns() { + struct timespec ts; + if (clock_gettime(CLOCK_BOOTTIME, &ts) != 0) { + return 0; + } + return (__u64)ts.tv_sec * 1000000000ULL + (__u64)ts.tv_nsec; +} + const char * ceph_osd_op_str(int opc) { const char *op_str = NULL; #define GENERATE_CASE_ENTRY(op, opcode, str) case CEPH_OSD_OP_##op: op_str=str; break; @@ -124,11 +134,11 @@ void fill_map_hprobes(std::string mod_path, DwarfParser &dwarfparser, struct bpf } void signal_handler(int signum){ - clog << "Caught signal " << signum << endl; if (signum == SIGINT) { - clog << "process killed" << endl; + got_sigint = 1; + } else { + _exit(signum); } - exit(signum); } void timeout_handler(int signum) { @@ -223,6 +233,142 @@ int attach_retuprobe(struct radostrace_bpf *skel, return 0; } +static void print_op_output_tail(FILE *out, bool print_offset_length, + long long offset, long long length) { + if (print_offset_length) + fprintf(out, "[%lld, %lld]\n", offset, length); + else + fprintf(out, "\n"); +} + +struct FieldWidths { + int pid = 8; + int client = 8; + int tid = 8; + int pool = 6; + int pg = 4; + int acting = 18; + int wr = 4; + int size = 9; + int latency = 9; +}; + +static void print_op_header(FILE *out, const FieldWidths &w) { + fprintf(out, "%*s%*s%*s%*s%*s %*s%*s%*s%*s%11s%s\n", + w.pid, "pid", w.client, "client", w.tid, "tid", + w.pool, "pool", w.pg, "pg", w.acting, "acting", + w.wr, "WR", w.size, "size", w.latency, "latency", + "Complete", " object[ops]"); +} + +static void print_op_row(FILE *out, const FieldWidths &w, + const char *pid_str, long long cid, long long tid, + long long pool, const char *pgid, const char *acting, + const char *wr, long long size, long long latency, + const char *status, const char *name, const char *ops, + bool pol, long long offset, long long length) { + fprintf(out, "%*s%*lld%*lld%*lld%*s %*s%*s%*lld%*lld%11s %s %s", + w.pid, pid_str, w.client, cid, w.tid, tid, + w.pool, pool, w.pg, pgid, w.acting, acting, + w.wr, wr, w.size, size, w.latency, latency, + status, name, ops); + print_op_output_tail(out, pol, offset, length); +} + +static FieldWidths g_widths; +static bool g_firsttime = true; + +static int report_hung_ops(struct radostrace_bpf *skel, + __u64 min_duration_ns = 0) { + __u64 now_ns = get_boottime_ns(); + if (now_ns == 0) { + fprintf(stderr, "\nFailed to read CLOCK_BOOTTIME.\n"); + return -1; + } + + int map_fd = bpf_map__fd(skel->maps.ops); + struct client_op_k current_key, next_key; + struct client_op_v val; + struct client_op_k *key_ptr = NULL; + int found = 0; + + while (bpf_map_get_next_key(map_fd, key_ptr, &next_key) == 0) { + if (bpf_map_lookup_elem(map_fd, &next_key, &val) == 0) { + if (val.sent_stamp == 0) { + current_key = next_key; + key_ptr = ¤t_key; + continue; + } + __u64 duration_ns = now_ns - val.sent_stamp; + if (duration_ns < min_duration_ns) { + current_key = next_key; + key_ptr = ¤t_key; + continue; + } + std::stringstream ss; + ss << std::hex << val.m_seed; + std::string pgid(ss.str()); + + std::stringstream acting_osd_list; + acting_osd_list << "["; + { + bool first = true; + for (int i = 0; i < MAX_ACTING; ++i) { + if (val.acting[i] < 0) break; + if (!first) acting_osd_list << ","; + acting_osd_list << val.acting[i]; + first = false; + } + acting_osd_list << "]"; + } + std::string acting_str = acting_osd_list.str(); + + bool print_offset_length = false; + std::stringstream ops_list; + ops_list << "["; + for (__u32 i = 0; i < val.ops_size; ++i) { + if (i) ops_list << " "; + if (ceph_osd_op_extent(val.ops[i])) { + ops_list << ceph_osd_op_str(val.ops[i]); + print_offset_length = true; + } else if (ceph_osd_op_call(val.ops[i])) { + ops_list << "call(" << val.cls_ops[i].cls_name + << "." << val.cls_ops[i].method_name << ")"; + } else { + ops_list << ceph_osd_op_str(val.ops[i]); + } + } + ops_list << "]"; + std::string ops_str = ops_list.str(); + + long long latency_us = (long long)(duration_ns / 1000ULL); + if (g_firsttime) { + print_op_header(stdout, g_widths); + g_firsttime = false; + } + + char safe_name[sizeof(val.object_name) + 1]; + memcpy(safe_name, val.object_name, sizeof(val.object_name)); + safe_name[sizeof(val.object_name)] = '\0'; + const char *wr_str = (val.rw & CEPH_OSD_FLAG_WRITE) ? "W" : "R"; + char pid_buf[16]; + snprintf(pid_buf, sizeof(pid_buf), "%u", val.pid); + print_op_row(stdout, g_widths, pid_buf, val.cid, val.tid, val.m_pool, + pgid.c_str(), acting_str.c_str(), wr_str, + val.length, latency_us, "0", + safe_name, ops_str.c_str(), + print_offset_length, val.offset, val.length); + found++; + } + current_key = next_key; + key_ptr = ¤t_key; + } + if (found > 0) { + fprintf(stderr, "Total reported ops: %d\n", found); + } + return found; +} + int digitnum(int x) { int cnt = 1; while(x / 10) { @@ -261,23 +407,6 @@ static int handle_event(void *ctx, void *data, size_t size) { return out; }; - // Define field widths based on actual data - struct FieldWidths { - int pid = 8; - int client = 8; - int tid = 8; - int pool = 6; - int pg = 4; - int acting = 18; - int wr = 3; - int size = 7; - int latency = 8; - }; - - static FieldWidths widths; - static bool firsttime = true; - - // Compile acting OSD list std::stringstream acting_osd_list; acting_osd_list << "["; { @@ -292,9 +421,8 @@ static int handle_event(void *ctx, void *data, size_t size) { } std::string acting_str = acting_osd_list.str(); - // Compile Ops list - std::stringstream ops_list; bool print_offset_length = false; + std::stringstream ops_list; ops_list << "["; for (__u32 i = 0; i < op_v->ops_size; ++i) { if (i) ops_list << " "; @@ -358,59 +486,28 @@ static int handle_event(void *ctx, void *data, size_t size) { } // Standard output - if (firsttime) { - // Calculate field widths based on actual data from first event - widths.pid = MAX(8, (int)std::to_string(op_v->pid).length() + 1); - widths.client = MAX(8, (int)std::to_string(op_v->cid).length() + 1); - widths.tid = MAX(8, (int)std::to_string(op_v->tid).length() + 1); - widths.pool = MAX(6, (int)std::to_string(op_v->m_pool).length() + 1); - widths.pg = MAX(4, (int)pgid.length() + 1); - widths.acting = MAX(15, (int)acting_str.length() + 1); - widths.wr = 4; - widths.size = MAX(9, (int)std::to_string(op_v->length).length() + 1); - widths.latency = MAX(9, (int)std::to_string(latency_us).length() + 1); - - // Print header using calculated widths - printf("%*s%*s%*s%*s%*s %*s%*s%*s%*s%s\n", - widths.pid, "pid", - widths.client, "client", - widths.tid, "tid", - widths.pool, "pool", - widths.pg, "pg", - widths.acting, "acting", - widths.wr, "WR", - widths.size, "size", - widths.latency, "latency", - " object[ops]"); - - firsttime = false; + if (g_firsttime) { + g_widths.pid = MAX(8, (int)std::to_string(op_v->pid).length() + 1); + g_widths.client = MAX(8, (int)std::to_string(op_v->cid).length() + 1); + g_widths.tid = MAX(8, (int)std::to_string(op_v->tid).length() + 1); + g_widths.pool = MAX(6, (int)std::to_string(op_v->m_pool).length() + 1); + g_widths.pg = MAX(4, (int)pgid.length() + 1); + g_widths.acting = MAX(15, (int)acting_str.length() + 1); + g_widths.wr = 4; + g_widths.size = MAX(9, (int)std::to_string(op_v->length).length() + 1); + g_widths.latency = MAX(9, (int)std::to_string(latency_us).length() + 1); + + print_op_header(stdout, g_widths); + g_firsttime = false; } - // Format output using calculated widths - // Note: explicit space after pg ensures separation from acting column - // even when acting_str exceeds its calculated width - printf("%*d%*lld%*lld%*lld%*s %*s", - widths.pid, op_v->pid, - widths.client, op_v->cid, - widths.tid, op_v->tid, - widths.pool, op_v->m_pool, - widths.pg, pgid.c_str(), - widths.acting, acting_str.c_str()); - - printf("%*s%*lld%*lld", - widths.wr, wr_str.c_str(), - widths.size, op_v->length, - widths.latency, latency_us); - - // Object name and operations (no fixed width needed) - printf(" %s ", op_v->object_name); - printf("%s", ops_str.c_str()); - - if (print_offset_length) { - printf("[%lld, %lld]\n", op_v->offset, op_v->length); - } else { - printf("\n"); - } + char pid_buf[16]; + snprintf(pid_buf, sizeof(pid_buf), "%d", op_v->pid); + print_op_row(stdout, g_widths, pid_buf, op_v->cid, op_v->tid, op_v->m_pool, + pgid.c_str(), acting_str.c_str(), wr_str.c_str(), + op_v->length, latency_us, "1", + op_v->object_name, ops_str.c_str(), + print_offset_length, op_v->offset, op_v->length); return 0; } @@ -477,7 +574,7 @@ int main(int argc, char **argv) { return 0; } else if (arg == "-h" || arg == "--help") { std::cout << "Usage: " << argv[0] << " [-t ] [-j [filename]] [-i ] [-o [filename]] [-p ] [--skip-version-check]\n"; - std::cout << " -t, --timeout Set execution timeout in seconds\n"; + std::cout << " -t, --timeout Set execution timeout in seconds and report pending ops before exit\n"; std::cout << " -j, --export-json Export DWARF info to JSON (default: radostrace_dwarf.json)\n"; std::cout << " -i, --import-json Import DWARF info from JSON file\n"; std::cout << " -o, --output Export events data info to CSV (default: radostrace_events.csv)\n"; @@ -486,6 +583,9 @@ int main(int argc, char **argv) { std::cout << " -V, --version Print version information and exit\n"; std::cout << " -h, --help Show this help message\n"; return 0; + } else { + std::cerr << "Unknown argument: " << arg << "\n"; + return 1; } } @@ -501,7 +601,8 @@ int main(int argc, char **argv) { struct radostrace_bpf *skel; // long uprobe_offset; int ret = 0; - struct ring_buffer *rb; + int exit_code = 0; + struct ring_buffer *rb = nullptr; DwarfParser dwarfparser(rados_probes, probe_units); @@ -708,6 +809,7 @@ int main(int argc, char **argv) { rb = ring_buffer__new(bpf_map__fd(skel->maps.rb), handle_event, NULL, NULL); if (!rb) { cerr << "failed to setup ring_buffer" << endl; + exit_code = 1; goto cleanup; } @@ -722,18 +824,28 @@ int main(int argc, char **argv) { clog << "Started to poll from ring buffer" << endl; - while ((!timeout_occurred || timeout == -1) && (ret = ring_buffer__poll(rb, 1000)) >= 0) { - // Continue polling while timeout hasn't occurred or if unlimited execution time + while (!got_sigint && (!timeout_occurred || timeout == -1) && (ret = ring_buffer__poll(rb, 1000)) >= 0) { + } + + if (ret == -EINTR && (got_sigint || timeout_occurred)) { + ret = 0; + } + + if (ret < 0 && ret != -EINTR) { + exit_code = 1; + cerr << "Error polling ring buffer: " << -ret << endl; } if (timeout_occurred) { - cerr << "Timeout occurred. Exiting." << endl; + cerr << "Timeout occurred. Reporting pending ops before exit." << endl; } + report_hung_ops(skel); + cleanup: clog << "Clean up the eBPF program" << endl; ring_buffer__free(rb); radostrace_bpf__destroy(skel); - return timeout_occurred ? -1 : -errno; + return exit_code; } diff --git a/tests/functional-test-microceph.sh b/tests/functional-test-microceph.sh index 1c5df8b..94ec7bf 100755 --- a/tests/functional-test-microceph.sh +++ b/tests/functional-test-microceph.sh @@ -33,24 +33,27 @@ cleanup() { pkill -f radostrace || true pkill -f "rbd bench" || true + # Ensure OSD processes are resumed in case the hung-op test failed mid-way + kill -CONT $(pgrep -f "ceph-osd") 2>/dev/null || true + if [[ -e $OSDTRACE_LOG ]]; then info "OSD trace output:" - cat $OSDTRACE_LOG + cat $OSDTRACE_LOG || true info " === END of OSD trace === " fi if [[ -e $RADOSTRACE_LOG ]]; then info "RADOS trace output:" - cat $RADOSTRACE_LOG + cat $RADOSTRACE_LOG || true info " === END of RADOS trace === " fi # Remove test files - rm -f $OSDTRACE_LOG $RADOSTRACE_LOG + rm -f $OSDTRACE_LOG $RADOSTRACE_LOG /tmp/radostrace_hung.log # Remove test RBD resources - microceph.rbd rm test_pool/testimage 2>/dev/null || true - microceph.ceph osd pool delete test_pool test_pool --yes-i-really-really-mean-it 2>/dev/null || true + timeout 15 microceph.rbd rm test_pool/testimage 2>/dev/null || true + timeout 15 microceph.ceph osd pool delete test_pool test_pool --yes-i-really-really-mean-it 2>/dev/null || true info "Cleanup completed" } @@ -229,9 +232,85 @@ verify_osdtrace_output "$OSDTRACE_LOG" "$TEST_POOL_ID" "$MAX_OSD_ID" "$TOT_PG" 5 info "=== Step 12: Verify radostrace output ===" verify_radostrace_output "$RADOSTRACE_LOG" "$TEST_POOL_ID" "$MAX_OSD_ID" 50 +info "=== Step 12: Test hung op detection ===" +HUNG_OP_LOG="/tmp/radostrace_hung.log" + +# Start a continuous rados bench so we have a long-lived librados process +# and a steady stream of ops to capture. +microceph.rados -p test_pool bench 120 write -O 4M --no-cleanup > /dev/null 2>&1 & +BENCH_PID=$! +sleep 1 + +# Find the bench process that has librados loaded +BENCH_RADOS_PID="" +for i in $(seq 1 30); do + for pid in $(pgrep -f "rados" 2>/dev/null); do + if grep -q "librados" /proc/$pid/maps 2>/dev/null; then + BENCH_RADOS_PID=$pid + break 2 + fi + done + sleep 0.5 +done + +if [ -z "$BENCH_RADOS_PID" ]; then + err "Could not find rados bench process with librados for hung op test" + kill $BENCH_PID 2>/dev/null || true + exit 1 +fi +info "Attaching radostrace to bench PID $BENCH_RADOS_PID" + +# Start radostrace with a 12-second timeout BEFORE freezing the OSD. +# This ensures _send_op uprobes are active when the OSD is frozen, so +# in-flight ops (tracked via _send_op but never finished via _finish_op) +# appear as pending when radostrace reports on timeout. +$PROJECT_ROOT/radostrace -p $BENCH_RADOS_PID -i $RADOS_DWARF --skip-version-check -t 12 \ + >$HUNG_OP_LOG 2>&1 & +RADOSTRACE_HUNG_PID=$! + +# Wait for uprobe to be fully attached before freezing the OSD processes +for i in $(seq 1 30); do + if grep -q "Started to poll" $HUNG_OP_LOG 2>/dev/null; then + info "uprobe attached, now freezing OSD processes with SIGSTOP..." + break + fi + sleep 0.5 +done + +# Freeze ceph-osd processes with SIGSTOP so they cannot reply to any pending +# requests. Unlike 'ceph osd pause' (a soft cluster flag that only prevents +# new client dispatch while in-flight ops continue to complete), SIGSTOP halts +# the OS-level process immediately, guaranteeing that _finish_op never fires +# for ops already sent — they remain in the BPF map as hung ops. +OSD_PIDS=$(pgrep -f "ceph-osd" | tr '\n' ' ') +info "Freezing OSD PIDs with SIGSTOP: $OSD_PIDS" +kill -STOP $OSD_PIDS 2>/dev/null || true + +# Wait for radostrace to timeout and call report_hung_ops +wait $RADOSTRACE_HUNG_PID 2>/dev/null || true + +# Resume OSD processes and clean up bench +kill -CONT $OSD_PIDS 2>/dev/null || true +kill $BENCH_PID 2>/dev/null || true +wait $BENCH_PID 2>/dev/null || true + +info "Hung op radostrace output:" +cat $HUNG_OP_LOG + +# Verify reported hung ops: at least one row with Complete=0 must be present +if ! awk '$1 ~ /^[0-9]+$/ && NF >= 9 && $10 == "0" { found=1 } END { exit !found }' $HUNG_OP_LOG; then + err "Expected at least one incomplete op (Complete=0) not found in radostrace hung-op output" + rm -f $HUNG_OP_LOG + exit 1 +fi + +info "✓ Hung op detection validated: radostrace correctly reported incomplete ops (Complete=0)" +rm -f $HUNG_OP_LOG + info "=== Test Summary ===" info "✓ MicroCeph cluster deployed successfully" info "✓ osdtrace and radostrace output validated" +info "✓ Hung op detection confirmed with 'incomplete' status output" info "✓ All functional tests passed!" exit 0 diff --git a/tests/lib/verify-trace-output.sh b/tests/lib/verify-trace-output.sh index 33b3e26..5fac6af 100644 --- a/tests/lib/verify-trace-output.sh +++ b/tests/lib/verify-trace-output.sh @@ -68,9 +68,9 @@ _osdtrace_rows() { # the size/latency fields. _radostrace_rows() { awk ' - $1 ~ /^[0-9]+$/ && NF >= 10 { + $1 ~ /^[0-9]+$/ && NF >= 11 { print $1 "|" $2 "|" $3 "|" $4 "|" $5 "|" $6 "|" $7 "|" \ - ($8 + 0) "|" ($9 + 0) "|" $10 + ($8 + 0) "|" ($9 + 0) "|" $11 } ' "$1" }