From ed223b581d5dc84498ea18b2e5131a6e799ae95f Mon Sep 17 00:00:00 2001 From: Maxime Leroy Date: Tue, 2 Jun 2026 16:28:06 +0200 Subject: [PATCH 1/2] infra: add the --napi adaptive interrupt rx mode Add an opt-in --napi mode where an idle worker arms the interrupts on its rx queues and blocks on them through the generic rte_eth_dev_rx_intr_* / rte_epoll_wait API instead of busy-polling. A packet wakes the worker, which disarms and resumes polling: the usual poll/interrupt hybrid, with the interrupt acting only as a doorbell since frames are still pulled by the graph walk. A worker blocks only after staying idle for NAPI_EMPTY_WINDOWS housekeeping windows with all of its queues empty, so a single busy queue keeps it polling. --napi implies poll-mode and replaces the micro-sleep ramp with the interrupt block. As that block can last up to a second it is measured explicitly and the timestamp advanced past it, keeping the sleep in total_cycles but out of busy_cycles. napi_wait() tracks the queues it actually armed and disarms them through a single exit path, so a queue without interrupt support does not leave its predecessors armed, and only marks a queue epoll-registered once. A PMD without rx queue interrupt support keeps polling. Signed-off-by: Maxime Leroy --- main/config.h | 1 + main/main.c | 10 +- modules/infra/control/iface_test.c | 1 + modules/infra/control/port.c | 5 + modules/infra/control/worker.c | 25 +++++ modules/infra/control/worker.h | 2 + modules/infra/datapath/main_loop.c | 152 +++++++++++++++++++++++++++-- 7 files changed, 186 insertions(+), 10 deletions(-) diff --git a/main/config.h b/main/config.h index 7ad34a54f..1065be67a 100644 --- a/main/config.h +++ b/main/config.h @@ -19,6 +19,7 @@ struct gr_config { unsigned max_mtu; bool test_mode; bool poll_mode; + bool napi; // --napi: adaptive interrupt (NAPI) rx; implies poll_mode bool log_syslog; bool log_packets; bool override_default_route; diff --git a/main/main.c b/main/main.c index d0fb2922e..d66d90446 100644 --- a/main/main.c +++ b/main/main.c @@ -65,6 +65,9 @@ static void usage(void) { puts(" -m, --socket-mode PERMISSIONS API socket file permissions (Default: 0660)."); puts(" -o, --socket-owner USER:GROUP API socket file ownership"); puts(" -p, --poll-mode Disable automatic micro-sleep."); + puts(" -n, --napi Adaptive interrupt (NAPI) rx: idle"); + puts(" workers block on rxq interrupts"); + puts(" instead of polling. Implies poll-mode."); puts(" -s, --socket PATH Path the control plane API socket."); puts(" Default: GROUT_SOCK_PATH from env or"); printf(" %s).\n", GR_DEFAULT_SOCK_PATH); @@ -185,11 +188,12 @@ static bool parse_bool_env(const char *name) { static int parse_args(int argc, char **argv) { int c; -#define FLAGS ":M:Vhm:o:pSs:tu:vx" +#define FLAGS ":M:Vhm:no:pSs:tu:vx" static struct option long_options[] = { {"help", no_argument, NULL, 'h'}, {"max-mtu", required_argument, NULL, 'u'}, {"metrics", required_argument, NULL, 'M'}, + {"napi", no_argument, NULL, 'n'}, {"poll-mode", no_argument, NULL, 'p'}, {"socket", required_argument, NULL, 's'}, {"socket-mode", required_argument, NULL, 'm'}, @@ -233,6 +237,10 @@ static int parse_args(int argc, char **argv) { case 'p': gr_config.poll_mode = true; break; + case 'n': + gr_config.napi = true; + gr_config.poll_mode = true; + break; case 'M': if (parse_metrics_addr(optarg) < 0) return errno_set(EINVAL); diff --git a/modules/infra/control/iface_test.c b/modules/infra/control/iface_test.c index 5612764ec..3307fdfd9 100644 --- a/modules/infra/control/iface_test.c +++ b/modules/infra/control/iface_test.c @@ -43,6 +43,7 @@ bool vrf_has_interfaces(uint16_t) { return false; } void control_queue_drain(uint32_t, const void *) { } +void worker_wakeup_all(void) { } mock_func(struct iface *, __wrap_iface_from_id(uint16_t)); mock_func(int, port_unplug(struct iface_info_port *)); mock_func(int, port_plug(struct iface_info_port *)); diff --git a/modules/infra/control/port.c b/modules/infra/control/port.c index 7b7937fac..ed83d4dbb 100644 --- a/modules/infra/control/port.c +++ b/modules/infra/control/port.c @@ -152,6 +152,9 @@ int port_configure(struct iface_info_port *p, uint16_t n_txq_min) { if (info.dev_flags != NULL && *info.dev_flags & RTE_ETH_DEV_INTR_LSC) { conf.intr_conf.lsc = 1; } + // --napi: request RX queue interrupts so idle workers can block on them. + if (gr_config.napi) + conf.intr_conf.rxq = 1; if ((ret = rte_eth_dev_configure(p->port_id, p->n_rxq, p->n_txq, &conf)) < 0) return errno_log(-ret, "rte_eth_dev_configure"); @@ -269,6 +272,7 @@ static int port_mtu_set(struct iface *iface, uint16_t mtu) { if ((ret = rte_eth_dev_start(p->port_id)) < 0) return errno_log(-ret, "rte_eth_dev_start"); p->started = true; + worker_wakeup_all(); // let workers (re)arm now that the port is started iface->mtu = mtu; vec_foreach (struct iface *s, iface->subinterfaces) @@ -361,6 +365,7 @@ static int iface_port_reconfig( return errno_log(-ret, "rte_eth_dev_start"); p->started = true; + worker_wakeup_all(); // let workers (re)arm now that the port is started return 0; } diff --git a/modules/infra/control/worker.c b/modules/infra/control/worker.c index ad28fd1f0..dec898c49 100644 --- a/modules/infra/control/worker.c +++ b/modules/infra/control/worker.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -42,9 +43,16 @@ int worker_create(unsigned cpu_id) { worker->cpu_id = cpu_id; worker->lcore_id = LCORE_ID_ANY; + worker->wakeup_fd = -1; pthread_mutex_init(&worker->wakeup.lock, NULL); pthread_cond_init(&worker->wakeup.cond, NULL); + worker->wakeup_fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + if (worker->wakeup_fd < 0) { + ret = errno; + goto end; + } + CPU_ZERO(&cpuset); CPU_SET(cpu_id, &cpuset); pthread_attr_init(&attr); @@ -76,6 +84,8 @@ int worker_create(unsigned cpu_id) { pthread_cancel(worker->thread); pthread_cond_destroy(&worker->wakeup.cond); pthread_mutex_destroy(&worker->wakeup.lock); + if (worker->wakeup_fd >= 0) + close(worker->wakeup_fd); rte_free(worker); } LOG(ERR, "worker %u start failed: %s", cpu_id, strerror(ret)); @@ -97,6 +107,7 @@ int worker_destroy(unsigned cpu_id) { pthread_join(worker->thread, NULL); pthread_cond_destroy(&worker->wakeup.cond); pthread_mutex_destroy(&worker->wakeup.lock); + close(worker->wakeup_fd); worker_graph_free(worker); vec_free(worker->rxqs); vec_free(worker->txqs); @@ -115,10 +126,24 @@ void worker_wait_wakeup(struct worker *w) { } void worker_wakeup(struct worker *w) { + uint64_t one = 1; + pthread_mutex_lock(&w->wakeup.lock); w->wakeup.set = true; pthread_cond_signal(&w->wakeup.cond); pthread_mutex_unlock(&w->wakeup.lock); + + // The condvar above only reaches a worker parked on graph == NULL. A napi + // worker idle on rte_epoll_wait is woken through the eventfd instead. + // EAGAIN means a kick is already pending and undrained, which is enough. + if (write(w->wakeup_fd, &one, sizeof(one)) != sizeof(one) && errno != EAGAIN) + LOG(ERR, "worker %u wakeup_fd write: %s", w->cpu_id, strerror(errno)); +} + +void worker_wakeup_all(void) { + struct worker *worker; + STAILQ_FOREACH (worker, &workers, next) + worker_wakeup(worker); } unsigned worker_count(void) { diff --git a/modules/infra/control/worker.h b/modules/infra/control/worker.h index 5a4e91169..b15a5c9cf 100644 --- a/modules/infra/control/worker.h +++ b/modules/infra/control/worker.h @@ -66,6 +66,7 @@ struct worker { unsigned cpu_id; unsigned lcore_id; pid_t tid; + int wakeup_fd; // eventfd: ctlplane writes, dataplane epoll-waits + drains (napi) struct { pthread_mutex_t lock; @@ -87,6 +88,7 @@ int worker_rxq_assign(uint16_t port_id, uint16_t rxq_id, uint16_t cpu_id); int worker_queue_distribute(const cpu_set_t *affinity, vec struct iface_info_port **ports); void worker_wait_wakeup(struct worker *); void worker_wakeup(struct worker *); +void worker_wakeup_all(void); vec struct gr_stat *worker_dump_stats(uint16_t cpu_id); int port_unplug(struct iface_info_port *); diff --git a/modules/infra/datapath/main_loop.c b/modules/infra/datapath/main_loop.c index f462cfbd4..bad34e2b5 100644 --- a/modules/infra/datapath/main_loop.c +++ b/modules/infra/datapath/main_loop.c @@ -13,12 +13,15 @@ #include #include #include +#include #include +#include #include #include #include #include +#include #include #include @@ -182,6 +185,115 @@ static int stats_reload(const struct rte_graph *graph, struct stats_context *ctx static struct rte_rcu_qsbr *rcu; +#define NAPI_EMPTY_WINDOWS 2 +#define NAPI_MAX_EVENTS 32 +// A few short timeout waits after going idle give schedutil repeated wakes to +// ratchet the frequency back down from the uclamp_min max; one wait alone is +// not enough to drop it from the boot-time peak. After NAPI_SETTLE_TRIES the +// worker blocks indefinitely. +#define NAPI_SETTLE_MS 100 +#define NAPI_SETTLE_TRIES 3 + +// --napi: idle worker blocks on rxq interrupts instead of polling. +// +// Each owned rxq is armed via the generic rte_eth_dev_rx_intr_* API and its +// eventfd added to this thread's epoll, then the worker blocks until one fires. +// rx_intr_enable returning < 0 means the queue has no interrupt support (e.g. no +// notification channel) -- fall back to polling. Newly owned rxqs (runtime queue +// reassignment) are added to the epoll set on the fly; entries are never +// removed, as several queues may share a single portal eventfd and a per-queue +// removal would also disarm its siblings. The worker's wakeup_fd is also in the +// epoll set (registered once at start), so a reconfig or shutdown kick breaks +// the block immediately instead of waiting for a packet. +static void napi_wait(struct worker *w, vec struct queue_map **registered) { + static __thread struct rte_epoll_event wakeup_ev; + static __thread bool wakeup_done; + struct rte_epoll_event events[NAPI_MAX_EVENTS]; + vec struct queue_map *armed = NULL; + struct queue_map *qm, *e; + int n, ret; + + // Register the wakeup eventfd on the SAME per-thread epfd this wait uses, + // so a reconfig/shutdown kick breaks epoll_wait. Done from napi_wait (not + // worker start) to guarantee it lands on the epfd rte_epoll_wait blocks on. + if (!wakeup_done) { + wakeup_ev.epdata.event = EPOLLIN; + ret = rte_epoll_ctl(RTE_EPOLL_PER_THREAD, EPOLL_CTL_ADD, w->wakeup_fd, &wakeup_ev); + if (ret == 0 || errno == EEXIST) + wakeup_done = true; + } + + vec_foreach_ref (qm, w->rxqs) { + const struct iface *iface = port_get_iface(qm->port_id); + // skip a port not yet registered or not started (the rx node has its + // iface in ctx; here port_get_iface() may still be NULL) + if (iface == NULL || !iface_info_port(iface)->started) + continue; + if (rte_eth_dev_rx_intr_enable(qm->port_id, qm->queue_id) < 0) + goto disarm; + vec_add(armed, *qm); + } + + // register any rxq not yet in this thread's epoll set + vec_foreach_ref (qm, armed) { + bool reg = false; + for (uint32_t i = 0; i < vec_len(*registered); i++) { + if ((*registered)[i].port_id == qm->port_id + && (*registered)[i].queue_id == qm->queue_id) { + reg = true; + break; + } + } + if (reg) + continue; + ret = rte_eth_dev_rx_intr_ctl_q( + qm->port_id, qm->queue_id, RTE_EPOLL_PER_THREAD, RTE_INTR_EVENT_ADD, NULL + ); + // -EEXIST: the fd is already in the epoll set (several queues can + // share one DPAA2 portal eventfd); treat it as registered. + if (ret == 0 || ret == -EEXIST) + vec_add(*registered, *qm); + } + + // A packet may have arrived between the empty-poll decision and arming + // the interrupts; recheck and resume polling instead of blocking until + // the next packet. rte_eth_rx_queue_count() is now safe against a + // concurrent port stop (the reset fast-path op returns -ENOTSUP, no + // longer a NULL-deref segfault). + vec_foreach_ref (qm, armed) { + if (rte_eth_rx_queue_count(qm->port_id, qm->queue_id) > 0) + goto disarm; + } + + rte_rcu_qsbr_thread_offline(rcu, rte_lcore_id()); + // A few short waits first: each timeout wake lets schedutil re-evaluate the + // decayed utilization and ratchet the frequency down from the uclamp_min + // max, then block indefinitely so an idle worker stops waking (a packet + // wakes it via the rxq interrupt, a reconfig/shutdown via the wakeup + // eventfd). Each is a single wait, never a drain loop: epoll_wait does not + // consume the events (the graph walk pulls the frames), so looping on a + // full batch would spin forever on any level-ready fd. + for (unsigned i = 0; i < NAPI_SETTLE_TRIES; i++) { + n = rte_epoll_wait(RTE_EPOLL_PER_THREAD, events, NAPI_MAX_EVENTS, NAPI_SETTLE_MS); + if (n != 0) + break; + } + if (n == 0) + n = rte_epoll_wait(RTE_EPOLL_PER_THREAD, events, NAPI_MAX_EVENTS, -1); + rte_rcu_qsbr_thread_online(rcu, rte_lcore_id()); + + // drain a reconfig/shutdown kick so the eventfd does not keep the epoll + // readable on the next block. + uint64_t kick; + while (read(w->wakeup_fd, &kick, sizeof(kick)) > 0) + ; + +disarm: + vec_foreach_ref (e, armed) + rte_eth_dev_rx_intr_disable(e->port_id, e->queue_id); + vec_free(armed); +} + void *gr_datapath_loop(void *priv) { struct stats_context ctx = { .stats = NULL, @@ -193,7 +305,8 @@ void *gr_datapath_loop(void *priv) { uint32_t sleep, max_sleep_us; struct worker *w = priv; struct rte_graph *graph; - unsigned cur, loop; + unsigned cur, loop, napi_empty = 0; + vec struct queue_map *napi_registered = NULL; char name[16]; #define log(lvl, fmt, ...) LOG(lvl, "[CPU %d] " fmt, w->cpu_id __VA_OPT__(, ) __VA_ARGS__) @@ -256,6 +369,7 @@ void *gr_datapath_loop(void *priv) { loop = 0; sleep = 0; + napi_empty = 0; timestamp = rte_rdtsc(); for (;;) { rte_graph_walk(graph); @@ -276,15 +390,34 @@ void *gr_datapath_loop(void *priv) { rte_graph_cluster_stats_get(ctx.stats, false); timestamp_tmp = rte_rdtsc(); cycles = timestamp_tmp - timestamp; - max_sleep_us = atomic_load(&w->max_sleep_us); - if (ctx.last_count == 0 && max_sleep_us > 0) { - sleep = sleep >= max_sleep_us ? max_sleep_us : (sleep + 1); - usleep(sleep); - ctx.w_stats->sleep_cycles += rte_rdtsc() - timestamp_tmp; - ctx.w_stats->n_sleeps += 1; + if (gr_config.napi) { + if (ctx.last_count == 0 && ++napi_empty >= NAPI_EMPTY_WINDOWS) { + uint64_t now; + napi_empty = 0; + napi_wait(w, &napi_registered); + now = rte_rdtsc(); + ctx.w_stats->sleep_cycles += now - timestamp_tmp; + ctx.w_stats->n_sleeps += 1; + // fold the block into timestamp_tmp/cycles so the + // shared accounting below bills it as sleep, not busy + timestamp_tmp = now; + cycles = now - timestamp; + } else { + if (ctx.last_count) + napi_empty = 0; + ctx.w_stats->busy_cycles += cycles; + } } else { - sleep = 0; - ctx.w_stats->busy_cycles += cycles; + max_sleep_us = atomic_load(&w->max_sleep_us); + if (ctx.last_count == 0 && max_sleep_us > 0) { + sleep = sleep >= max_sleep_us ? max_sleep_us : (sleep + 1); + usleep(sleep); + ctx.w_stats->sleep_cycles += rte_rdtsc() - timestamp_tmp; + ctx.w_stats->n_sleeps += 1; + } else { + sleep = 0; + ctx.w_stats->busy_cycles += cycles; + } } loop = 0; @@ -305,6 +438,7 @@ void *gr_datapath_loop(void *priv) { rte_rcu_qsbr_thread_unregister(rcu, rte_lcore_id()); rte_thread_unregister(); w->lcore_id = LCORE_ID_ANY; + vec_free(napi_registered); return NULL; } From 8667ee695abc46a9191919c44a888915b2b6b20a Mon Sep 17 00:00:00 2001 From: Maxime Leroy Date: Tue, 9 Jun 2026 19:41:27 +0200 Subject: [PATCH 2/2] infra: pin worker cpufreq to max in napi mode via uclamp_min In --napi mode an idle worker blocks on the rxq interrupt, so the schedutil governor sees a low utilization and downclocks the core even when it later runs at line rate. Pin the worker's uclamp_min to the max capacity through sched_setattr(): the governor keeps the core at full speed while the worker is runnable and lets it drop only when it actually sleeps on the interrupt. glibc exposes neither struct sched_attr nor a sched_setattr() wrapper, so both are declared locally. The syscall fails on kernels without uclamp support or without the privilege to set it, which would otherwise warn on every worker. Report the expected EOPNOTSUPP/ENOSYS/EPERM/EINVAL cases at NOTICE and keep WARNING for anything unexpected. Signed-off-by: Maxime Leroy --- meson.build | 5 +++ modules/infra/datapath/main_loop.c | 55 ++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/meson.build b/meson.build index 4415e729d..1f4fe5517 100644 --- a/meson.build +++ b/meson.build @@ -141,6 +141,11 @@ elif compiler.has_function( grout_cflags += ['-DHAVE_RTE_FIB_TBL8_GET_STATS'] endif +# glibc >= 2.41 provides struct sched_attr and the sched_setattr() wrapper. +if compiler.has_header_symbol('sched.h', 'sched_setattr', args: '-D_GNU_SOURCE') + grout_cflags += ['-DHAVE_SCHED_SETATTR'] +endif + src = [] inc = [] diff --git a/modules/infra/datapath/main_loop.c b/modules/infra/datapath/main_loop.c index bad34e2b5..8ee7b45e2 100644 --- a/modules/infra/datapath/main_loop.c +++ b/modules/infra/datapath/main_loop.c @@ -20,9 +20,11 @@ #include #include +#include #include #include #include +#include #include LOG_TYPE("graph"); @@ -294,6 +296,57 @@ static void napi_wait(struct worker *w, vec struct queue_map **registered) { vec_free(armed); } +// napi: the worker blocks on the rxq IRQ when idle, so schedutil sees low +// utilization and downclocks the core even at line rate. Pin uclamp_min to the +// max capacity: the governor runs the core at full speed while the worker is +// runnable and lets it drop only when it actually sleeps on the interrupt. +// glibc < 2.41 exposes neither struct sched_attr nor a sched_setattr() wrapper +// (see HAVE_SCHED_SETATTR in meson.build); define them locally and fall back to +// the raw syscall there. +#ifndef HAVE_SCHED_SETATTR +struct sched_attr { + uint32_t size; + uint32_t sched_policy; + uint64_t sched_flags; + int32_t sched_nice; + uint32_t sched_priority; + uint64_t sched_runtime; + uint64_t sched_deadline; + uint64_t sched_period; + uint32_t sched_util_min; + uint32_t sched_util_max; +}; +static inline int sched_setattr(pid_t pid, struct sched_attr *attr, unsigned int flags) { + return syscall(SYS_sched_setattr, pid, attr, flags); +} +#endif +#ifndef SCHED_FLAG_KEEP_ALL +#define SCHED_FLAG_KEEP_POLICY 0x08 +#define SCHED_FLAG_KEEP_PARAMS 0x10 +#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | SCHED_FLAG_KEEP_PARAMS) +#endif +#ifndef SCHED_FLAG_UTIL_CLAMP_MIN +#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 +#endif + +static void worker_perf_floor(const struct worker *w) { + struct sched_attr attr = { + .size = sizeof(attr), + .sched_flags = SCHED_FLAG_KEEP_ALL | SCHED_FLAG_UTIL_CLAMP_MIN, + .sched_util_min = 1024, // SCHED_CAPACITY_SCALE + }; + + if (sched_setattr(0, &attr, 0) < 0) { + if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EPERM || errno == EINVAL) + LOG(NOTICE, + "[CPU %d] uclamp_min unavailable: %s", + w->cpu_id, + strerror(errno)); + else + LOG(WARNING, "[CPU %d] uclamp_min: %s", w->cpu_id, strerror(errno)); + } +} + void *gr_datapath_loop(void *priv) { struct stats_context ctx = { .stats = NULL, @@ -332,6 +385,8 @@ void *gr_datapath_loop(void *priv) { return NULL; } } + if (gr_config.napi) + worker_perf_floor(w); log(INFO, "lcore_id = %d", w->lcore_id);