diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-02-09 17:22:00 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-02-09 17:22:00 -0800 |
| commit | f5d4feed174ce9fb3c42886a3c36038fd5a43e25 (patch) | |
| tree | 2e1940643a141621ef28b2ecb757d0dbce6ef9d7 /io_uring/io-wq.c | |
| parent | 26c9342bb761e463774a64fb6210b4f95f5bc035 (diff) | |
| parent | 442ae406603a94f1a263654494f425302ceb0445 (diff) | |
Merge tag 'for-7.0/io_uring-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull io_uring updates from Jens Axboe:
- Clean up the IORING_SETUP_R_DISABLED and submitter task checking,
mostly just in preparation for relaxing the locking for SINGLE_ISSUER
in the future.
- Improve IOPOLL by using a doubly linked list to manage completions.
Previously it was singly linked, which meant that to complete request
N, requests 0..N-1 in the chain had to have completed first. With a
doubly linked list we can complete each request in whatever order it
finishes, rather than needing to wait for a consecutive range to be
available.
This reduces latencies.
- Improve the restriction setup and checking. Mostly in preparation for
adding further features on top of that. Coming in a separate pull
request.
- Split out task_work and wait handling into separate files. These are
mostly nicely abstracted already, but still remained in the
io_uring.c file which is on the larger side.
- Use GFP_KERNEL_ACCOUNT in a few more spots, where appropriate.
- Ensure even the idle io-wq worker exits if a task no longer has any
rings open.
- Add support for a non-circular submission queue.
By default, the SQ ring keeps moving around, even if only a few
entries are used for each submission. This can be wasteful in terms
of cachelines.
If IORING_SETUP_SQ_REWIND is set for the ring when created, each
submission will start at offset 0 instead of where we last left off
doing submissions.
- Various little cleanups
* tag 'for-7.0/io_uring-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (30 commits)
io_uring/kbuf: fix memory leak if io_buffer_add_list fails
io_uring: Add SPDX id lines to remaining source files
io_uring: allow io-wq workers to exit when unused
io_uring/io-wq: add exit-on-idle state
io_uring/net: don't continue send bundle if poll was required for retry
io_uring/rsrc: use GFP_KERNEL_ACCOUNT consistently
io_uring/futex: use GFP_KERNEL_ACCOUNT for futex data allocation
io_uring/io-wq: handle !sysctl_hung_task_timeout_secs
io_uring: fix bad indentation for setup flags if statement
io_uring/rsrc: take unsigned index in io_rsrc_node_lookup()
io_uring: introduce non-circular SQ
io_uring: split out CQ waiting code into wait.c
io_uring: split out task work code into tw.c
io_uring/io-wq: don't trigger hung task for syzbot craziness
io_uring: add IO_URING_EXIT_WAIT_MAX definition
io_uring/sync: validate passed in offset
io_uring/eventfd: remove unused ctx->evfd_last_cq_tail member
io_uring/timeout: annotate data race in io_flush_timeouts()
io_uring/uring_cmd: explicitly disallow cancelations for IOPOLL
io_uring: fix IOPOLL with passthrough I/O
...
Diffstat (limited to 'io_uring/io-wq.c')
| -rw-r--r-- | io_uring/io-wq.c | 51 |
1 files changed, 48 insertions, 3 deletions
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 2fa7d3601edb..7ed04911f7b9 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -17,6 +17,7 @@ #include <linux/task_work.h> #include <linux/audit.h> #include <linux/mmu_context.h> +#include <linux/sched/sysctl.h> #include <uapi/linux/io_uring.h> #include "io-wq.h" @@ -34,6 +35,7 @@ enum { enum { IO_WQ_BIT_EXIT = 0, /* wq exiting */ + IO_WQ_BIT_EXIT_ON_IDLE = 1, /* allow all workers to exit on idle */ }; enum { @@ -706,9 +708,13 @@ static int io_wq_worker(void *data) raw_spin_lock(&acct->workers_lock); /* * Last sleep timed out. Exit if we're not the last worker, - * or if someone modified our affinity. + * or if someone modified our affinity. If wq is marked + * idle-exit, drop the worker as well. This is used to avoid + * keeping io-wq workers around for tasks that no longer have + * any active io_uring instances. */ - if (last_timeout && (exit_mask || acct->nr_workers > 1)) { + if ((last_timeout && (exit_mask || acct->nr_workers > 1)) || + test_bit(IO_WQ_BIT_EXIT_ON_IDLE, &wq->state)) { acct->nr_workers--; raw_spin_unlock(&acct->workers_lock); __set_current_state(TASK_RUNNING); @@ -963,6 +969,24 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data) return false; } +void io_wq_set_exit_on_idle(struct io_wq *wq, bool enable) +{ + if (!wq->task) + return; + + if (!enable) { + clear_bit(IO_WQ_BIT_EXIT_ON_IDLE, &wq->state); + return; + } + + if (test_and_set_bit(IO_WQ_BIT_EXIT_ON_IDLE, &wq->state)) + return; + + rcu_read_lock(); + io_wq_for_each_worker(wq, io_wq_worker_wake, NULL); + rcu_read_unlock(); +} + static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq) { do { @@ -1313,6 +1337,8 @@ static void io_wq_cancel_tw_create(struct io_wq *wq) static void io_wq_exit_workers(struct io_wq *wq) { + unsigned long timeout, warn_timeout; + if (!wq->task) return; @@ -1322,7 +1348,26 @@ static void io_wq_exit_workers(struct io_wq *wq) io_wq_for_each_worker(wq, io_wq_worker_wake, NULL); 
rcu_read_unlock(); io_worker_ref_put(wq); - wait_for_completion(&wq->worker_done); + + /* + * Shut up hung task complaint, see for example + * + * https://lore.kernel.org/all/696fc9e7.a70a0220.111c58.0006.GAE@google.com/ + * + * where completely overloading the system with tons of long running + * io-wq items can easily trigger the hung task timeout. Only sleep + * uninterruptibly for half that time, and warn if we exceeded end + * up waiting more than IO_URING_EXIT_WAIT_MAX. + */ + timeout = sysctl_hung_task_timeout_secs * HZ / 2; + if (!timeout) + timeout = MAX_SCHEDULE_TIMEOUT; + warn_timeout = jiffies + IO_URING_EXIT_WAIT_MAX; + do { + if (wait_for_completion_timeout(&wq->worker_done, timeout)) + break; + WARN_ON_ONCE(time_after(jiffies, warn_timeout)); + } while (1); spin_lock_irq(&wq->hash->wait.lock); list_del_init(&wq->wait.entry); |
