From 835aa18db8fc21e88d53a21c722d386558eb5f81 Mon Sep 17 00:00:00 2001 From: Zizhi Wo Date: Mon, 22 Jun 2026 15:07:12 +0800 Subject: [PATCH 1/2] blk-cgroup: fix blkg leak in blkg_create() error path When radix_tree_insert() fails in blkg_create(), the error path calls blkg_put() to release the blkg. This was correct when blkg->refcnt was an atomic_t: blkg_put() dropped it to 0 and triggered the release path. But commit 7fcf2b033b84 ("blkcg: change blkg reference counting to use percpu_ref") switched refcnt to a percpu_ref. In percpu mode percpu_ref_put() never checks for zero, so the release callback is never invoked. This blkg is on neither blkcg->blkg_list nor queue->blkg_list, so blkg_destroy_all() / blkcg_destroy_blkgs() can never reach it to call blkg_destroy()->percpu_ref_kill() either, cause the leak. Fix it by killing the percpu_ref instead, which switches it to atomic mode and drops the initial ref. Fixes: 7fcf2b033b84 ("blkcg: change blkg reference counting to use percpu_ref") Signed-off-by: Zizhi Wo Signed-off-by: Zizhi Wo --- block/blk-cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 3093c1c039022..7d2b4c2e82af8 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -439,7 +439,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, return blkg; /* @blkg failed fully initialized, use the usual release path */ - blkg_put(blkg); + percpu_ref_kill(&blkg->refcnt); return ERR_PTR(ret); err_put_css: From b1b04ac50371fd3796d041dfc02a813158373774 Mon Sep 17 00:00:00 2001 From: Zizhi Wo Date: Mon, 22 Jun 2026 15:07:13 +0800 Subject: [PATCH 2/2] blk-cgroup: fix null-ptr-deref by freeing blkg pd on blkg_create error path When blkg_create() fails before the blkg is linked onto blkcg->blkg_list and q->blkg_list (e.g. radix_tree_insert() fails or the blkg_lookup() returns NULL), the blkg is freed asynchronously via blkg_free_workfn(). Since such a blkg was never linked, it is invisible to blkcg_deactivate_policy(), so its blkg->pd[] entries can not be cleared in it. blkg_free_workfn() then calls blkcg_policy->pd_free_fn() on them, which can race with bfq module exit (bfq_exit() -> blkcg_policy_unregister()) clearing the blkcg_policy[] slot, leading to a NULL pointer dereference: [ 72.597786] KASAN: null-ptr-deref in range [0x0000000000000048-0x000000000000004f] [ 72.598690] CPU: 35 UID: 0 PID: 458 Comm: kworker/35:1 Not tainted 7.1.0+ #33 PREEMPT(full) [ 72.599518] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-4.fc41 04/01/2014 [ 72.600342] Workqueue: events blkg_free_workfn [ 72.600991] RIP: 0010:blkg_free_workfn+0x115/0x3d0 ...... [ 72.613278] Call Trace: [ 72.613988] [ 72.614357] process_one_work+0x6b4/0xff0 [ 72.615251] ? __pfx_blkg_free_workfn+0x10/0x10 [ 72.616041] ? assign_work+0x131/0x3f0 [ 72.616962] worker_thread+0x4eb/0xd50 [ 72.617599] ? __kthread_parkme+0x8d/0x170 [ 72.618565] ? __pfx_worker_thread+0x10/0x10 [ 72.619566] ? __pfx_worker_thread+0x10/0x10 [ 72.620213] kthread+0x327/0x410 ...... Fix this by introducing blkg_free_pd() to synchronously free the pd and clear blkg->pd[] in the blkg_create() error path, while the blkcg_policy is still valid. Signed-off-by: Zizhi Wo --- block/blk-cgroup.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 7d2b4c2e82af8..eb362843b8e2e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -111,12 +111,23 @@ static struct cgroup_subsys_state *blkcg_css(void) return task_css(current, io_cgrp_id); } +static void blkg_free_pd(struct blkcg_gq *blkg) +{ + int i; + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + if (blkg->pd[i]) { + blkcg_policy[i]->pd_free_fn(blkg->pd[i]); + blkg->pd[i] = NULL; + } + } +} + static void blkg_free_workfn(struct work_struct *work) { struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, free_work); struct request_queue *q = blkg->q; - int i; /* * pd_free_fn() can also be called from blkcg_deactivate_policy(), @@ -126,9 +137,7 @@ static void blkg_free_workfn(struct work_struct *work) * blkcg_deactivate_policy(). */ mutex_lock(&q->blkcg_mutex); - for (i = 0; i < BLKCG_MAX_POLS; i++) - if (blkg->pd[i]) - blkcg_policy[i]->pd_free_fn(blkg->pd[i]); + blkg_free_pd(blkg); if (blkg->parent) blkg_put(blkg->parent); spin_lock_irq(&q->queue_lock); @@ -438,15 +447,24 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, if (!ret) return blkg; - /* @blkg failed fully initialized, use the usual release path */ + /* + * @blkg failed fully initialized and never linked, so its pd[] is + * invisible to blkcg_deactivate_policy(). Free pd[] synchronously + * while blkcg_policy[] is still valid, otherwise the async free path + * may call pd_free_fn() after the policy is unregistered (e.g. rmmod bfq). + * The err_free_blkg path below frees pd[] for the same reason. + */ + blkg_free_pd(blkg); percpu_ref_kill(&blkg->refcnt); return ERR_PTR(ret); err_put_css: css_put(&blkcg->css); err_free_blkg: - if (new_blkg) + if (new_blkg) { + blkg_free_pd(new_blkg); blkg_free(new_blkg); + } return ERR_PTR(ret); }