From b52fe51f724385b3ed81e37e510a4a33107e8161 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Tue, 17 Feb 2026 17:35:42 +0000 Subject: btrfs: fix super block offset in error message in btrfs_validate_super() Fix the superblock offset mismatch error message in btrfs_validate_super(): we changed it so that it considers all the superblocks, but the message still assumes we're only looking at the first one. The change from %u to %llu is because we're changing from a constant to a u64. Fixes: 069ec957c35e ("btrfs: Refactor btrfs_check_super_valid") Reviewed-by: Qu Wenruo Signed-off-by: Mark Harmstone Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b1b53d713ee9..3524976ccc1d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2531,8 +2531,8 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, if (mirror_num >= 0 && btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) { - btrfs_err(fs_info, "super offset mismatch %llu != %u", - btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET); + btrfs_err(fs_info, "super offset mismatch %llu != %llu", + btrfs_super_bytenr(sb), btrfs_sb_offset(mirror_num)); ret = -EINVAL; } -- cgit v1.2.3 From 5254d4181add9dfaa5e3519edd71cc8f752b2f85 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 17 Feb 2026 14:46:50 +0000 Subject: btrfs: fix zero size inode with non-zero size after log replay When logging that an inode exists, as part of logging a new name or logging new dir entries for a directory, we always set the generation of the logged inode item to 0. This is to signal during log replay (in overwrite_item()), that we should not set the i_size since we only logged that an inode exists, so the i_size of the inode in the subvolume tree must be preserved (as when we log new names or that an inode exists, we don't log extents). This works fine except when we have already logged an inode in full mode or it's the first time we are logging an inode created in a past transaction, that inode has a new i_size of 0 and then we log a new name for the inode (due to a new hardlink or a rename), in which case we log an i_size of 0 for the inode and a generation of 0, which causes the log replay code to not update the inode's i_size to 0 (in overwrite_item()). An example scenario: mkdir /mnt/dir xfs_io -f -c "pwrite 0 64K" /mnt/dir/foo sync xfs_io -c "truncate 0" -c "fsync" /mnt/dir/foo ln /mnt/dir/foo /mnt/dir/bar xfs_io -c "fsync" /mnt/dir After log replay the file remains with a size of 64K. This is because when we first log the inode, when we fsync file foo, we log its current i_size of 0, and then when we create a hard link we log again the inode in exists mode (LOG_INODE_EXISTS) but we set a generation of 0 for the inode item we add to the log tree, so during log replay overwrite_item() sees that the generation is 0 and i_size is 0 so we skip updating the inode's i_size from 64K to 0. Fix this by making sure at fill_inode_item() we always log the real generation of the inode if it was logged in the current transaction with the i_size we logged before. Also if an inode created in a previous transaction is logged in exists mode only, make sure we log the i_size stored in the inode item located from the commit root, so that if we log multiple times that the inode exists we get the correct i_size. A test case for fstests will follow soon. Reported-by: Vyacheslav Kovalevsky Link: https://lore.kernel.org/linux-btrfs/af8c15fa-4e41-4bb2-885c-0bc4e97532a6@gmail.com/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 98 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 65 insertions(+), 33 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9ff3933bc382..fce1b16a882b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4616,21 +4616,32 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct inode *inode, bool log_inode_only, u64 logged_isize) { + u64 gen = BTRFS_I(inode)->generation; u64 flags; if (log_inode_only) { - /* set the generation to zero so the recover code - * can tell the difference between an logging - * just to say 'this inode exists' and a logging - * to say 'update this inode with these values' + /* + * Set the generation to zero so the recover code can tell the + * difference between a logging just to say 'this inode exists' + * and a logging to say 'update this inode with these values'. + * But only if the inode was not already logged before. + * We access ->logged_trans directly since it was already set + * up in the call chain by btrfs_log_inode(), and data_race() + * to avoid false alerts from KCSAN and since it was set already + * and one can set it to 0 since that only happens on eviction + * and we are holding a ref on the inode. */ - btrfs_set_inode_generation(leaf, item, 0); + ASSERT(data_race(BTRFS_I(inode)->logged_trans) > 0); + if (data_race(BTRFS_I(inode)->logged_trans) < trans->transid) + gen = 0; + btrfs_set_inode_size(leaf, item, logged_isize); } else { - btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); btrfs_set_inode_size(leaf, item, inode->i_size); } + btrfs_set_inode_generation(leaf, item, gen); + btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); btrfs_set_inode_mode(leaf, item, inode->i_mode); @@ -5448,42 +5459,63 @@ process: return 0; } -static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, - struct btrfs_path *path, u64 *size_ret) +static int get_inode_size_to_log(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, u64 *size_ret) { struct btrfs_key key; + struct btrfs_inode_item *item; int ret; key.objectid = btrfs_ino(inode); key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, log, &key, path, 0, 0); - if (ret < 0) { - return ret; - } else if (ret > 0) { - *size_ret = 0; - } else { - struct btrfs_inode_item *item; + /* + * Our caller called inode_logged(), so logged_trans is up to date. + * Use data_race() to silence any warning from KCSAN. Once logged_trans + * is set, it can only be reset to 0 after inode eviction. + */ + if (data_race(inode->logged_trans) == trans->transid) { + ret = btrfs_search_slot(NULL, inode->root->log_root, &key, path, 0, 0); + } else if (inode->generation < trans->transid) { + path->search_commit_root = true; + path->skip_locking = true; + ret = btrfs_search_slot(NULL, inode->root, &key, path, 0, 0); + path->search_commit_root = false; + path->skip_locking = false; - item = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_inode_item); - *size_ret = btrfs_inode_size(path->nodes[0], item); - /* - * If the in-memory inode's i_size is smaller then the inode - * size stored in the btree, return the inode's i_size, so - * that we get a correct inode size after replaying the log - * when before a power failure we had a shrinking truncate - * followed by addition of a new name (rename / new hard link). - * Otherwise return the inode size from the btree, to avoid - * data loss when replaying a log due to previously doing a - * write that expands the inode's size and logging a new name - * immediately after. - */ - if (*size_ret > inode->vfs_inode.i_size) - *size_ret = inode->vfs_inode.i_size; + } else { + *size_ret = 0; + return 0; } + /* + * If the inode was logged before or is from a past transaction, then + * its inode item must exist in the log root or in the commit root. + */ + ASSERT(ret <= 0); + if (WARN_ON_ONCE(ret > 0)) + ret = -ENOENT; + + if (ret < 0) + return ret; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + *size_ret = btrfs_inode_size(path->nodes[0], item); + /* + * If the in-memory inode's i_size is smaller then the inode size stored + * in the btree, return the inode's i_size, so that we get a correct + * inode size after replaying the log when before a power failure we had + * a shrinking truncate followed by addition of a new name (rename / new + * hard link). Otherwise return the inode size from the btree, to avoid + * data loss when replaying a log due to previously doing a write that + * expands the inode's size and logging a new name immediately after. + */ + if (*size_ret > inode->vfs_inode.i_size) + *size_ret = inode->vfs_inode.i_size; + btrfs_release_path(path); return 0; } @@ -6996,7 +7028,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, ret = drop_inode_items(trans, log, path, inode, BTRFS_XATTR_ITEM_KEY); } else { - if (inode_only == LOG_INODE_EXISTS && ctx->logged_before) { + if (inode_only == LOG_INODE_EXISTS) { /* * Make sure the new inode item we write to the log has * the same isize as the current one (if it exists). @@ -7010,7 +7042,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * (zeroes), as if an expanding truncate happened, * instead of getting a file of 4Kb only. */ - ret = logged_inode_size(log, inode, path, &logged_isize); + ret = get_inode_size_to_log(trans, inode, path, &logged_isize); if (ret) goto out_unlock; } -- cgit v1.2.3 From a4376d9a5d4c9610e69def3fc0b32c86a7ab7a41 Mon Sep 17 00:00:00 2001 From: Shin'ichiro Kawasaki Date: Sun, 1 Mar 2026 21:17:04 +0900 Subject: btrfs: fix leak of kobject name for sub-group space_info When create_space_info_sub_group() allocates elements of space_info->sub_group[], kobject_init_and_add() is called for each element via btrfs_sysfs_add_space_info_type(). However, when check_removing_space_info() frees these elements, it does not call btrfs_sysfs_remove_space_info() on them. As a result, kobject_put() is not called and the associated kobj->name objects are leaked. This memory leak is reproduced by running the blktests test case zbd/009 on kernels built with CONFIG_DEBUG_KMEMLEAK. The kmemleak feature reports the following error: unreferenced object 0xffff888112877d40 (size 16): comm "mount", pid 1244, jiffies 4294996972 hex dump (first 16 bytes): 64 61 74 61 2d 72 65 6c 6f 63 00 c4 c6 a7 cb 7f data-reloc...... backtrace (crc 53ffde4d): __kmalloc_node_track_caller_noprof+0x619/0x870 kstrdup+0x42/0xc0 kobject_set_name_vargs+0x44/0x110 kobject_init_and_add+0xcf/0x150 btrfs_sysfs_add_space_info_type+0xfc/0x210 [btrfs] create_space_info_sub_group.constprop.0+0xfb/0x1b0 [btrfs] create_space_info+0x211/0x320 [btrfs] btrfs_init_space_info+0x15a/0x1b0 [btrfs] open_ctree+0x33c7/0x4a50 [btrfs] btrfs_get_tree.cold+0x9f/0x1ee [btrfs] vfs_get_tree+0x87/0x2f0 vfs_cmd_create+0xbd/0x280 __do_sys_fsconfig+0x3df/0x990 do_syscall_64+0x136/0x1540 entry_SYSCALL_64_after_hwframe+0x76/0x7e To avoid the leak, call btrfs_sysfs_remove_space_info() instead of kfree() for the elements. Fixes: f92ee31e031c ("btrfs: introduce btrfs_space_info sub-group") Link: https://lore.kernel.org/linux-block/b9488881-f18d-4f47-91a5-3c9bf63955a5@wdc.com/ Reviewed-by: Johannes Thumshirn Signed-off-by: Shin'ichiro Kawasaki Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index fa55d868ecd8..f7fcff7dca8f 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -4584,7 +4584,7 @@ static void check_removing_space_info(struct btrfs_space_info *space_info) for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) { if (space_info->sub_group[i]) { check_removing_space_info(space_info->sub_group[i]); - kfree(space_info->sub_group[i]); + btrfs_sysfs_remove_space_info(space_info->sub_group[i]); space_info->sub_group[i] = NULL; } } -- cgit v1.2.3 From 0dcabcb920a5c143c568f37c26c6f2b4b9206bd1 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 13 Mar 2026 18:35:26 +1030 Subject: btrfs: zlib: handle page aligned compressed size correctly [BUG] Since commit 3d74a7556fba ("btrfs: zlib: introduce zlib_compress_bio() helper"), there are some reports about different crashes in zlib compression path. One of the symptoms is list corruption like the following: list_del corruption. next->prev should be fffffbb340204a08, but was ffff8d6517cb7de0. (next=fffffbb3402d62c8) ------------[ cut here ]------------ kernel BUG at lib/list_debug.c:65! Oops: invalid opcode: 0000 [#1] SMP NOPTI CPU: 1 UID: 0 PID: 21436 Comm: kworker/u16:7 Not tainted 7.0.0-rc2-jcg+ #1 PREEMPT Hardware name: LENOVO 10VGS02P00/3130, BIOS M1XKT57A 02/10/2022 Workqueue: btrfs-delalloc btrfs_work_helper [btrfs] RIP: 0010:__list_del_entry_valid_or_report+0xec/0xf0 Call Trace: btrfs_alloc_compr_folio+0xae/0xc0 [btrfs] zlib_compress_bio+0x39d/0x6a0 [btrfs] btrfs_compress_bio+0x2e3/0x3d0 [btrfs] compress_file_range+0x2b0/0x660 [btrfs] btrfs_work_helper+0xdb/0x3e0 [btrfs] process_one_work+0x192/0x3d0 worker_thread+0x19a/0x310 kthread+0xdf/0x120 ret_from_fork+0x22e/0x310 ret_from_fork_asm+0x1a/0x30 ---[ end trace 0000000000000000 ]--- Other symptoms include VM_BUG_ON() during folio_put() but it's rarer. David Sterba firstly reported this during his CI runs but unfortunately I'm unable to hit it. Meanwhile zstd/lzo doesn't seem to have the same problem. [CAUSE] During zlib_compress_bio() every time the output buffer is full, we queue the full folio into the compressed bio, and allocate a new folio as the output folio. After the input has finished, we loop through zlib_deflate() with Z_FINISH to flush all output. And when that is done, we still need to check if the last folio has any content, and if so we still need to queue that part into the compressed bio. The problem is in the final folio handling, if the final folio is full (for x86_64 the folio size is 4K), the length to queue is calculated by u32 cur_len = offset_in_folio(out_folio, workspace->strm.total_out); But since total_out is 4K aligned, the resulted @cur_len will be 0, then we hit the bio_add_folio(), which has a quirk that if bio_add_folio() got an length 0, it will still queue the folio into the bio, but return false. In that case we go to out: tag, which calls btrfs_free_compr_folio() to release @out_folio, which may put the out folio into the btrfs global pool list. On the other hand, that @out_folio is already added to the compressed bio, and will later be released again by cleanup_compressed_bio(), which results double release. And if this time we still need to put the folio into the btrfs global pool list, it will result a list corruption because it's already in the list. [FIX] Instead of offset_inside_folio(), directly use the difference between strm.total_out and bi_size. So that if the last folio is completely full, we can still properly queue the full folio other than queueing zero byte. Fixes: 3d74a7556fba ("btrfs: zlib: introduce zlib_compress_bio() helper") Reported-by: David Sterba Reported-by: Jean-Christophe Guillain Reported-by: syzbot+3c4d8371d65230f852a2@syzkaller.appspotmail.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=221176 Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/zlib.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 0a8fcee16428..27fc2b828002 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -308,7 +308,9 @@ int zlib_compress_bio(struct list_head *ws, struct compressed_bio *cb) } /* Queue the remaining part of the folio. */ if (workspace->strm.total_out > bio->bi_iter.bi_size) { - u32 cur_len = offset_in_folio(out_folio, workspace->strm.total_out); + const u32 cur_len = workspace->strm.total_out - bio->bi_iter.bi_size; + + ASSERT(cur_len <= folio_size(out_folio)); if (!bio_add_folio(bio, out_folio, cur_len, 0)) { ret = -E2BIG; -- cgit v1.2.3 From a85b46db143fda5869e7d8df8f258ccef5fa1719 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Fri, 13 Mar 2026 14:11:39 -0400 Subject: btrfs: tracepoints: get correct superblock from dentry in event btrfs_sync_file() If overlay is used on top of btrfs, dentry->d_sb translates to overlay's super block and fsid assignment will lead to a crash. Use file_inode(file)->i_sb to always get btrfs_sb. Reviewed-by: Boris Burkov Signed-off-by: Goldwyn Rodrigues Signed-off-by: David Sterba --- include/trace/events/btrfs.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 125bdc166bfe..0864700f76e0 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -769,12 +769,15 @@ TRACE_EVENT(btrfs_sync_file, ), TP_fast_assign( - const struct dentry *dentry = file->f_path.dentry; - const struct inode *inode = d_inode(dentry); + struct dentry *dentry = file_dentry(file); + struct inode *inode = file_inode(file); + struct dentry *parent = dget_parent(dentry); + struct inode *parent_inode = d_inode(parent); - TP_fast_assign_fsid(btrfs_sb(file->f_path.dentry->d_sb)); + dput(parent); + TP_fast_assign_fsid(btrfs_sb(inode->i_sb)); __entry->ino = btrfs_ino(BTRFS_I(inode)); - __entry->parent = btrfs_ino(BTRFS_I(d_inode(dentry->d_parent))); + __entry->parent = btrfs_ino(BTRFS_I(parent_inode)); __entry->datasync = datasync; __entry->root_objectid = btrfs_root_id(BTRFS_I(inode)->root); ), -- cgit v1.2.3 From 1c37d896b12dfd0d4c96e310b0033c6676933917 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 18 Mar 2026 16:17:59 +0000 Subject: btrfs: fix lost error when running device stats on multiple devices fs Whenever we get an error updating the device stats item for a device in btrfs_run_dev_stats() we allow the loop to go to the next device, and if updating the stats item for the next device succeeds, we end up losing the error we had from the previous device. Fix this by breaking out of the loop once we get an error and make sure it's returned to the caller. Since we are in the transaction commit path (and in the critical section actually), returning the error will result in a transaction abort. Fixes: 733f4fbbc108 ("Btrfs: read device stats on mount, write modified ones during commit") Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8fbd736aad9f..117e13d245f6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -8099,8 +8099,9 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans) smp_rmb(); ret = update_dev_stat_item(trans, device); - if (!ret) - atomic_sub(stats_cnt, &device->dev_stats_ccnt); + if (ret) + break; + atomic_sub(stats_cnt, &device->dev_stats_ccnt); } mutex_unlock(&fs_devices->device_list_mutex); -- cgit v1.2.3