summaryrefslogtreecommitdiff
path: root/io_uring/io-wq.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-02-09 17:22:00 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2026-02-09 17:22:00 -0800
commitf5d4feed174ce9fb3c42886a3c36038fd5a43e25 (patch)
tree2e1940643a141621ef28b2ecb757d0dbce6ef9d7 /io_uring/io-wq.c
parent26c9342bb761e463774a64fb6210b4f95f5bc035 (diff)
parent442ae406603a94f1a263654494f425302ceb0445 (diff)
Merge tag 'for-7.0/io_uring-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull io_uring updates from Jens Axboe: - Clean up the IORING_SETUP_R_DISABLED and submitter task checking, mostly just in preparation for relaxing the locking for SINGLE_ISSUER in the future. - Improve IOPOLL by using a doubly linked list to manage completions. Previously it was singly linked, which meant that to complete request N in the chain 0..N-1 had to have completed first. With a doubly linked list we can complete whatever request completes in that order, rather than needing to wait for a consecutive range to be available. This reduces latencies. - Improve the restriction setup and checking. Mostly in preparation for adding further features on top of that. Coming in a separate pull request. - Split out task_work and wait handling into separate files. These are mostly nicely abstracted already, but still remained in the io_uring.c file which is on the larger side. - Use GFP_KERNEL_ACCOUNT in a few more spots, where appropriate. - Ensure even the idle io-wq worker exits if a task no longer has any rings open. - Add support for a non-circular submission queue. By default, the SQ ring keeps moving around, even if only a few entries are used for each submission. This can be wasteful in terms of cachelines. If IORING_SETUP_SQ_REWIND is set for the ring when created, each submission will start at offset 0 instead of where we last left off doing submissions. 
- Various little cleanups * tag 'for-7.0/io_uring-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (30 commits) io_uring/kbuf: fix memory leak if io_buffer_add_list fails io_uring: Add SPDX id lines to remaining source files io_uring: allow io-wq workers to exit when unused io_uring/io-wq: add exit-on-idle state io_uring/net: don't continue send bundle if poll was required for retry io_uring/rsrc: use GFP_KERNEL_ACCOUNT consistently io_uring/futex: use GFP_KERNEL_ACCOUNT for futex data allocation io_uring/io-wq: handle !sysctl_hung_task_timeout_secs io_uring: fix bad indentation for setup flags if statement io_uring/rsrc: take unsigned index in io_rsrc_node_lookup() io_uring: introduce non-circular SQ io_uring: split out CQ waiting code into wait.c io_uring: split out task work code into tw.c io_uring/io-wq: don't trigger hung task for syzbot craziness io_uring: add IO_URING_EXIT_WAIT_MAX definition io_uring/sync: validate passed in offset io_uring/eventfd: remove unused ctx->evfd_last_cq_tail member io_uring/timeout: annotate data race in io_flush_timeouts() io_uring/uring_cmd: explicitly disallow cancelations for IOPOLL io_uring: fix IOPOLL with passthrough I/O ...
Diffstat (limited to 'io_uring/io-wq.c')
-rw-r--r--io_uring/io-wq.c51
1 files changed, 48 insertions, 3 deletions
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 2fa7d3601edb..7ed04911f7b9 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -17,6 +17,7 @@
#include <linux/task_work.h>
#include <linux/audit.h>
#include <linux/mmu_context.h>
+#include <linux/sched/sysctl.h>
#include <uapi/linux/io_uring.h>
#include "io-wq.h"
@@ -34,6 +35,7 @@ enum {
enum {
IO_WQ_BIT_EXIT = 0, /* wq exiting */
+ IO_WQ_BIT_EXIT_ON_IDLE = 1, /* allow all workers to exit on idle */
};
enum {
@@ -706,9 +708,13 @@ static int io_wq_worker(void *data)
raw_spin_lock(&acct->workers_lock);
/*
* Last sleep timed out. Exit if we're not the last worker,
- * or if someone modified our affinity.
+ * or if someone modified our affinity. If wq is marked
+ * idle-exit, drop the worker as well. This is used to avoid
+ * keeping io-wq workers around for tasks that no longer have
+ * any active io_uring instances.
*/
- if (last_timeout && (exit_mask || acct->nr_workers > 1)) {
+ if ((last_timeout && (exit_mask || acct->nr_workers > 1)) ||
+ test_bit(IO_WQ_BIT_EXIT_ON_IDLE, &wq->state)) {
acct->nr_workers--;
raw_spin_unlock(&acct->workers_lock);
__set_current_state(TASK_RUNNING);
@@ -963,6 +969,24 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data)
return false;
}
+void io_wq_set_exit_on_idle(struct io_wq *wq, bool enable)
+{
+ if (!wq->task)
+ return;
+
+ if (!enable) {
+ clear_bit(IO_WQ_BIT_EXIT_ON_IDLE, &wq->state);
+ return;
+ }
+
+ if (test_and_set_bit(IO_WQ_BIT_EXIT_ON_IDLE, &wq->state))
+ return;
+
+ rcu_read_lock();
+ io_wq_for_each_worker(wq, io_wq_worker_wake, NULL);
+ rcu_read_unlock();
+}
+
static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq)
{
do {
@@ -1313,6 +1337,8 @@ static void io_wq_cancel_tw_create(struct io_wq *wq)
static void io_wq_exit_workers(struct io_wq *wq)
{
+ unsigned long timeout, warn_timeout;
+
if (!wq->task)
return;
@@ -1322,7 +1348,26 @@ static void io_wq_exit_workers(struct io_wq *wq)
io_wq_for_each_worker(wq, io_wq_worker_wake, NULL);
rcu_read_unlock();
io_worker_ref_put(wq);
- wait_for_completion(&wq->worker_done);
+
+ /*
+ * Shut up hung task complaint, see for example
+ *
+ * https://lore.kernel.org/all/696fc9e7.a70a0220.111c58.0006.GAE@google.com/
+ *
+ * where completely overloading the system with tons of long running
+ * io-wq items can easily trigger the hung task timeout. Only sleep
+ * uninterruptibly for half that time, and warn if we end up
+ * waiting more than IO_URING_EXIT_WAIT_MAX.
+ */
+ timeout = sysctl_hung_task_timeout_secs * HZ / 2;
+ if (!timeout)
+ timeout = MAX_SCHEDULE_TIMEOUT;
+ warn_timeout = jiffies + IO_URING_EXIT_WAIT_MAX;
+ do {
+ if (wait_for_completion_timeout(&wq->worker_done, timeout))
+ break;
+ WARN_ON_ONCE(time_after(jiffies, warn_timeout));
+ } while (1);
spin_lock_irq(&wq->hash->wait.lock);
list_del_init(&wq->wait.entry);