fs/xfs/xfs_verify_media.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443

// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2026 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs_platform.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_bit.h"
#include "xfs_btree.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_ag.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_health.h"
#include "xfs_healthmon.h"
#include "xfs_trace.h"
#include "xfs_verify_media.h"

#include <linux/fserror.h>

/*
 * Describes the damaged block range inside a single allocation group or
 * realtime group.  Passed as the private data of the rmap query so that the
 * callback can trim each reverse mapping to the part that was actually lost.
 */
struct xfs_group_data_lost {
	/* first lost block, relative to the start of the group */
	xfs_agblock_t		startblock;
	/* number of lost blocks starting at startblock */
	xfs_extlen_t		blockcount;
};

/* Report lost file data from rmap records */
static int
xfs_verify_report_data_lost(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*data)
{
	struct xfs_mount		*mp = cur->bc_mp;
	struct xfs_inode		*ip;
	struct xfs_group_data_lost	*lost = data;
	xfs_fileoff_t			fileoff = rec->rm_offset;
	xfs_extlen_t			blocks = rec->rm_blockcount;
	const bool			is_attr =
			(rec->rm_flags & XFS_RMAP_ATTR_FORK);
	const xfs_agblock_t		lost_end =
			lost->startblock + lost->blockcount;
	const xfs_agblock_t		rmap_end =
			rec->rm_startblock + rec->rm_blockcount;
	int				error = 0;

	if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
	       return 0;

	error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, 0, 0, &ip);
	if (error)
		return 0;

	if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) {
		xfs_bmap_mark_sick(ip, is_attr ? XFS_ATTR_FORK : XFS_DATA_FORK);
		goto out_rele;
	}

	if (is_attr) {
		xfs_inode_mark_sick(ip, XFS_SICK_INO_XATTR);
		goto out_rele;
	}

	if (lost->startblock > rec->rm_startblock) {
		fileoff += lost->startblock - rec->rm_startblock;
		blocks -= lost->startblock - rec->rm_startblock;
	}
	if (rmap_end > lost_end)
		blocks -= rmap_end - lost_end;

	fserror_report_data_lost(VFS_I(ip), XFS_FSB_TO_B(mp, fileoff),
			XFS_FSB_TO_B(mp, blocks), GFP_NOFS);

out_rele:
	xfs_irele(ip);
	return 0;
}

/*
 * Walk reverse mappings to look for all file data loss.
 *
 * The failed disk range [daddr, daddr + bblen) is converted to filesystem
 * blocks, and the rmap btree of every allocation group (XG_TYPE_AG) or
 * realtime group (XG_TYPE_RTG) that the range touches is queried.  Each
 * overlapping record is handed to xfs_verify_report_data_lost().
 *
 * This is a best-effort scan: a failure to read a group's rmap metadata ends
 * the walk early, but the error is not propagated and this function always
 * returns 0.
 */
static int
xfs_verify_report_losses(
	struct xfs_mount	*mp,
	enum xfs_group_type	type,
	xfs_daddr_t		daddr,
	u64			bblen)
{
	struct xfs_group	*xg = NULL;
	struct xfs_trans	*tp;
	xfs_fsblock_t		start_bno, end_bno;
	uint32_t		start_gno, end_gno;
	int			error;

	/* end_bno is the block containing the last bad sector (inclusive). */
	if (type == XG_TYPE_RTG) {
		start_bno = xfs_daddr_to_rtb(mp, daddr);
		end_bno = xfs_daddr_to_rtb(mp, daddr + bblen - 1);
	} else {
		start_bno = XFS_DADDR_TO_FSB(mp, daddr);
		end_bno = XFS_DADDR_TO_FSB(mp, daddr + bblen - 1);
	}

	tp = xfs_trans_alloc_empty(mp);
	start_gno = xfs_fsb_to_gno(mp, start_bno, type);
	end_gno = xfs_fsb_to_gno(mp, end_bno, type);
	while ((xg = xfs_group_next_range(mp, xg, start_gno, end_gno, type))) {
		struct xfs_buf		*agf_bp = NULL;
		struct xfs_rtgroup	*rtg = NULL;
		struct xfs_btree_cur	*cur;
		struct xfs_rmap_irec	ri_low = { };
		struct xfs_rmap_irec	ri_high;
		struct xfs_group_data_lost lost;
		xfs_agblock_t		last_gbno;

		if (type == XG_TYPE_AG) {
			struct xfs_perag	*pag = to_perag(xg);

			error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
			if (error) {
				/* Leaving the loop early; drop our reference. */
				xfs_perag_rele(pag);
				break;
			}

			cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, pag);
		} else {
			rtg = to_rtg(xg);
			xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
			cur = xfs_rtrmapbt_init_cursor(tp, rtg);
		}

		/*
		 * Set the rmap range from ri_low to ri_high, which represents
		 * a [start, end] where we are looking for the files or
		 * metadata.  The keys default to "everything in this group";
		 * only the first and last groups of the range narrow them.
		 */
		memset(&ri_high, 0xFF, sizeof(ri_high));
		if (xg->xg_gno == start_gno)
			ri_low.rm_startblock =
				xfs_fsb_to_gbno(mp, start_bno, type);
		if (xg->xg_gno == end_gno)
			ri_high.rm_startblock =
				xfs_fsb_to_gbno(mp, end_bno, type);

		/*
		 * Work out the part of this group that was lost.  Clamp the
		 * last lost block to the end of the group before converting
		 * it to a length: when the bad range continues into a later
		 * group, ri_high.rm_startblock is still all-ones from the
		 * memset above, and computing "rm_startblock + 1" first would
		 * wrap around to zero and produce a bogus blockcount.
		 */
		last_gbno = min(xg->xg_block_count - 1, ri_high.rm_startblock);
		lost.startblock = ri_low.rm_startblock;
		lost.blockcount = last_gbno - ri_low.rm_startblock + 1;

		error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
				xfs_verify_report_data_lost, &lost);
		xfs_btree_del_cursor(cur, error);
		if (agf_bp)
			xfs_trans_brelse(tp, agf_bp);
		if (rtg)
			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
		if (error) {
			/* Leaving the loop early; drop our reference. */
			xfs_group_rele(xg);
			break;
		}
	}

	xfs_trans_cancel(tp);
	return 0;
}

/*
 * Pick the size of each verification read, in bytes.
 *
 * We aim for 1MB bios to keep the per-command overhead low, but honour a
 * smaller nonzero me_max_io_size from the caller.  The result is never
 * smaller than one logical sector of @btp and never larger than the @bbcount
 * basic blocks that remain to be verified.
 */
static unsigned int
xfs_verify_iosize(
	const struct xfs_verify_media	*me,
	struct xfs_buftarg		*btp,
	uint64_t			bbcount)
{
	unsigned int			want;

	/* daddrs are used directly as bio sector numbers. */
	BUILD_BUG_ON(BBSHIFT != SECTOR_SHIFT);
	ASSERT(BBTOB(bbcount) >= btp->bt_logical_sectorsize);

	/* A max_io_size of zero means "no preference"; use the default. */
	want = min_not_zero(SZ_1M, me->me_max_io_size);

	return clamp(want, btp->bt_logical_sectorsize, BBTOB(bbcount));
}

/*
 * Allocate the read buffer for media verification.
 *
 * Start with a folio big enough to hold @iosize bytes and halve the request
 * each time the allocator refuses.  The multi-page attempts use
 * __GFP_NORETRY; the final single-page attempt uses plain GFP_KERNEL.
 * Returns NULL only if that last attempt fails as well.
 */
static struct folio *
xfs_verify_alloc_folio(
	const unsigned int	iosize)
{
	struct folio		*folio;
	unsigned int		order;

	for (order = get_order(iosize); order > 0; order--) {
		folio = folio_alloc(GFP_KERNEL | __GFP_NORETRY, order);
		if (folio)
			return folio;
	}

	return folio_alloc(GFP_KERNEL, 0);
}

/*
 * Deal with a failed verification read of @bio_bbcount basic blocks starting
 * at @daddr on @btp.
 *
 * The failure is always traced.  If it happened on the very first read of the
 * request, the status is converted with blk_status_to_errno(), negated, and
 * stored in me->me_ioerror for the caller.  Failures that look like real data
 * loss are additionally reported to healthmon and, when reverse mappings are
 * available, to the owners of the affected blocks -- but only if the caller
 * set XFS_VERIFY_MEDIA_REPORT.
 */
static void
xfs_verify_media_error(
	struct xfs_mount	*mp,
	struct xfs_verify_media	*me,
	struct xfs_buftarg	*btp,
	xfs_daddr_t		daddr,
	unsigned int		bio_bbcount,
	blk_status_t		bio_status)
{
	trace_xfs_verify_media_error(mp, me, btp->bt_dev, daddr, bio_bbcount,
			bio_status);

	/*
	 * Nothing at all was verified if we failed right at the start of the
	 * requested range, so hand the error, whatever it was, back to the
	 * caller.
	 */
	if (daddr == me->me_start_daddr)
		me->me_ioerror = -blk_status_to_errno(bio_status);

	/*
	 * Only protection information failures, medium errors, and general
	 * IO errors are taken as signs of data loss.  Any other status is
	 * (hopefully) transient and is not passed on to healthmon or
	 * fsnotify.
	 */
	if (bio_status != BLK_STS_PROTECTION &&
	    bio_status != BLK_STS_IOERR &&
	    bio_status != BLK_STS_MEDIUM)
		return;

	/* The caller did not ask for losses to be reported. */
	if (!(me->me_flags & XFS_VERIFY_MEDIA_REPORT))
		return;

	xfs_healthmon_report_media(mp, me->me_dev, daddr, bio_bbcount);

	/* Without reverse mappings we cannot find the owners. */
	if (!xfs_has_rmapbt(mp))
		return;

	/* Only the data and realtime devices are walked for owners. */
	if (me->me_dev == XFS_DEV_DATA)
		xfs_verify_report_losses(mp, XG_TYPE_AG, daddr, bio_bbcount);
	else if (me->me_dev == XFS_DEV_RT)
		xfs_verify_report_losses(mp, XG_TYPE_RTG, daddr, bio_bbcount);
}

/*
 * Verify the media of an xfs device by submitting read requests to the disk.
 *
 * Reads [me->me_start_daddr, me->me_end_daddr) from the device selected by
 * me->me_dev, one bio at a time, optionally resting me->me_rest_us
 * microseconds between reads.  On return me->me_start_daddr has been advanced
 * past everything that was read successfully and me->me_end_daddr has been
 * trimmed to the end of the device.
 *
 * A failed read is not returned as an error from this function: it stops the
 * scan and is handed to xfs_verify_media_error(), and the function returns 0.
 *
 * Returns:
 *   0        the scan finished, hit an I/O error, or had nothing to do
 *   -ENODEV  the requested device does not exist, or the log device was
 *            requested but it is the same target as the data device
 *   -EINVAL  the range is not aligned to the logical sector size
 *   -ENOMEM  the read buffer or bio could not be allocated
 *   -EINTR   a fatal signal arrived; me->me_start_daddr is not updated
 */
static int
xfs_verify_media(
	struct xfs_mount	*mp,
	struct xfs_verify_media	*me)
{
	struct xfs_buftarg	*btp = NULL;
	struct bio		*bio;
	struct folio		*folio;
	xfs_daddr_t		daddr;
	uint64_t		bbcount;
	int			error = 0;

	me->me_ioerror = 0;

	switch (me->me_dev) {
	case XFS_DEV_DATA:
		btp = mp->m_ddev_targp;
		break;
	case XFS_DEV_LOG:
		/* Only a log target distinct from the data target counts. */
		if (mp->m_logdev_targp != mp->m_ddev_targp)
			btp = mp->m_logdev_targp;
		break;
	case XFS_DEV_RT:
		btp = mp->m_rtdev_targp;
		break;
	}
	if (!btp)
		return -ENODEV;

	/*
	 * If the caller told us to verify beyond the end of the disk, tell the
	 * user exactly where that was.
	 */
	if (me->me_end_daddr > btp->bt_nr_sectors)
		me->me_end_daddr = btp->bt_nr_sectors;

	/* start and end have to be aligned to the lba size */
	if (!IS_ALIGNED(BBTOB(me->me_start_daddr | me->me_end_daddr),
			btp->bt_logical_sectorsize))
		return -EINVAL;

	/*
	 * end_daddr is the exclusive end of the range, so if start_daddr
	 * reaches there (or beyond), there's no work to be done.
	 */
	if (me->me_start_daddr >= me->me_end_daddr)
		return 0;

	/*
	 * There are three ranges involved here:
	 *
	 *  - [me->me_start_daddr, me->me_end_daddr) is the range that the
	 *    user wants to verify.  end_daddr can be beyond the end of the
	 *    disk; we'll constrain it to the end if necessary.
	 *
	 *  - [daddr, me->me_end_daddr) is the range that we have not yet
	 *    verified.  We update daddr after each successful read.
	 *    me->me_start_daddr is set to daddr before returning.
	 *
	 *  - [daddr, daddr + bio_bbcount) is the range that we're currently
	 *    verifying.
	 */
	daddr = me->me_start_daddr;
	bbcount = min_t(sector_t, me->me_end_daddr, btp->bt_nr_sectors) -
			  me->me_start_daddr;

	folio = xfs_verify_alloc_folio(xfs_verify_iosize(me, btp, bbcount));
	if (!folio)
		return -ENOMEM;

	trace_xfs_verify_media(mp, me, btp->bt_dev, daddr, bbcount, folio);

	/* One bio with a single segment is reused for every read. */
	bio = bio_alloc(btp->bt_bdev, 1, REQ_OP_READ, GFP_KERNEL);
	if (!bio) {
		error = -ENOMEM;
		goto out_folio;
	}

	while (bbcount > 0) {
		unsigned int	bio_bbcount;
		blk_status_t	bio_status;

		bio_reset(bio, btp->bt_bdev, REQ_OP_READ);
		bio->bi_iter.bi_sector = daddr;
		bio_add_folio_nofail(bio, folio,
				min(bbcount << SECTOR_SHIFT, folio_size(folio)),
				0);

		/*
		 * Save the length of the bio before we submit it, because we
		 * need the original daddr and length for reporting IO errors
		 * if the bio fails.
		 */
		bio_bbcount = bio->bi_iter.bi_size >> SECTOR_SHIFT;
		submit_bio_wait(bio);
		bio_status = bio->bi_status;
		if (bio_status != BLK_STS_OK) {
			/*
			 * A media failure ends the scan but is not an
			 * operational error; daddr still marks how far we got.
			 */
			xfs_verify_media_error(mp, me, btp, daddr, bio_bbcount,
					bio_status);
			error = 0;
			break;
		}

		daddr += bio_bbcount;
		bbcount -= bio_bbcount;

		/* Done; skip the rest period and the signal check. */
		if (bbcount == 0)
			break;

		if (me->me_rest_us) {
			ktime_t	expires;

			/*
			 * Convert microseconds to nanoseconds in 64 bits so
			 * that a long rest period cannot wrap around in a
			 * narrower multiplication and cut the sleep short.
			 */
			expires = ktime_add_ns(ktime_get(),
					(u64)me->me_rest_us * 1000);
			set_current_state(TASK_KILLABLE);
			schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);
		}

		if (fatal_signal_pending(current)) {
			error = -EINTR;
			break;
		}

		cond_resched();
	}

	bio_put(bio);
out_folio:
	folio_put(folio);

	if (error)
		return error;

	/*
	 * Advance start_daddr to the end of what we verified if there wasn't
	 * an operational error.
	 */
	me->me_start_daddr = daddr;
	trace_xfs_verify_media_end(mp, me, btp->bt_dev);
	return 0;
}

/*
 * ioctl entry point for media verification.
 *
 * Requires CAP_SYS_ADMIN.  The request is copied in from @arg, validated,
 * handed to xfs_verify_media(), and copied back out to @arg on success so
 * that userspace can see the fields that verification updated.
 *
 * Returns 0 on success, -EPERM without the capability, -EFAULT if @arg cannot
 * be read or written, -EINVAL for nonzero padding, unknown flags, or an
 * unknown device, or whatever error xfs_verify_media() returned.
 */
int
xfs_ioc_verify_media(
	struct file			*file,
	struct xfs_verify_media __user	*arg)
{
	struct xfs_mount		*mp = XFS_I(file_inode(file))->i_mount;
	struct xfs_verify_media		me;
	int				error;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (copy_from_user(&me, arg, sizeof(me)))
		return -EFAULT;

	/* Padding must be zero and only known flags may be set. */
	if (me.me_pad || (me.me_flags & ~XFS_VERIFY_MEDIA_FLAGS))
		return -EINVAL;

	/* Only the data, log, and realtime devices can be named. */
	if (me.me_dev != XFS_DEV_DATA &&
	    me.me_dev != XFS_DEV_LOG &&
	    me.me_dev != XFS_DEV_RT)
		return -EINVAL;

	error = xfs_verify_media(mp, &me);
	if (error)
		return error;

	return copy_to_user(arg, &me, sizeof(me)) ? -EFAULT : 0;
}