CoCalc -- direct-io.c

GitHub Repository: torvalds/linux
Path: blob/master/fs/btrfs/direct-io.c
²⁹²⁶⁷ views
1
// SPDX-License-Identifier: GPL-2.0
2

3
#include <linux/fsverity.h>
4
#include <linux/iomap.h>
5
#include "ctree.h"
6
#include "delalloc-space.h"
7
#include "direct-io.h"
8
#include "extent-tree.h"
9
#include "file.h"
10
#include "fs.h"
11
#include "transaction.h"
12
#include "volumes.h"
13

14
struct btrfs_dio_data {
15
	ssize_t submitted;
16
	struct extent_changeset *data_reserved;
17
	struct btrfs_ordered_extent *ordered;
18
	bool data_space_reserved;
19
	bool nocow_done;
20
};
21

22
struct btrfs_dio_private {
23
	/* Range of I/O */
24
	u64 file_offset;
25
	u32 bytes;
26

27
	/* This must be last */
28
	struct btrfs_bio bbio;
29
};
30

31
static struct bio_set btrfs_dio_bioset;
32

33
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
34
			      struct extent_state **cached_state,
35
			      unsigned int iomap_flags)
36
{
37
	const bool writing = (iomap_flags & IOMAP_WRITE);
38
	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
39
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
40
	struct btrfs_ordered_extent *ordered;
41
	int ret = 0;
42

43
	/* Direct lock must be taken before the extent lock. */
44
	if (nowait) {
45
		if (!btrfs_try_lock_dio_extent(io_tree, lockstart, lockend, cached_state))
46
			return -EAGAIN;
47
	} else {
48
		btrfs_lock_dio_extent(io_tree, lockstart, lockend, cached_state);
49
	}
50

51
	while (1) {
52
		if (nowait) {
53
			if (!btrfs_try_lock_extent(io_tree, lockstart, lockend,
54
						   cached_state)) {
55
				ret = -EAGAIN;
56
				break;
57
			}
58
		} else {
59
			btrfs_lock_extent(io_tree, lockstart, lockend, cached_state);
60
		}
61
		/*
62
		 * We're concerned with the entire range that we're going to be
63
		 * doing DIO to, so we need to make sure there's no ordered
64
		 * extents in this range.
65
		 */
66
		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
67
						     lockend - lockstart + 1);
68

69
		/*
70
		 * We need to make sure there are no buffered pages in this
71
		 * range either, we could have raced between the invalidate in
72
		 * generic_file_direct_write and locking the extent.  The
73
		 * invalidate needs to happen so that reads after a write do not
74
		 * get stale data.
75
		 */
76
		if (!ordered &&
77
		    (!writing || !filemap_range_has_page(inode->i_mapping,
78
							 lockstart, lockend)))
79
			break;
80

81
		btrfs_unlock_extent(io_tree, lockstart, lockend, cached_state);
82

83
		if (ordered) {
84
			if (nowait) {
85
				btrfs_put_ordered_extent(ordered);
86
				ret = -EAGAIN;
87
				break;
88
			}
89
			/*
90
			 * If we are doing a DIO read and the ordered extent we
91
			 * found is for a buffered write, we can not wait for it
92
			 * to complete and retry, because if we do so we can
93
			 * deadlock with concurrent buffered writes on page
94
			 * locks. This happens only if our DIO read covers more
95
			 * than one extent map, if at this point has already
96
			 * created an ordered extent for a previous extent map
97
			 * and locked its range in the inode's io tree, and a
98
			 * concurrent write against that previous extent map's
99
			 * range and this range started (we unlock the ranges
100
			 * in the io tree only when the bios complete and
101
			 * buffered writes always lock pages before attempting
102
			 * to lock range in the io tree).
103
			 */
104
			if (writing ||
105
			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
106
				btrfs_start_ordered_extent(ordered);
107
			else
108
				ret = nowait ? -EAGAIN : -ENOTBLK;
109
			btrfs_put_ordered_extent(ordered);
110
		} else {
111
			/*
112
			 * We could trigger writeback for this range (and wait
113
			 * for it to complete) and then invalidate the pages for
114
			 * this range (through invalidate_inode_pages2_range()),
115
			 * but that can lead us to a deadlock with a concurrent
116
			 * call to readahead (a buffered read or a defrag call
117
			 * triggered a readahead) on a page lock due to an
118
			 * ordered dio extent we created before but did not have
119
			 * yet a corresponding bio submitted (whence it can not
120
			 * complete), which makes readahead wait for that
121
			 * ordered extent to complete while holding a lock on
122
			 * that page.
123
			 */
124
			ret = nowait ? -EAGAIN : -ENOTBLK;
125
		}
126

127
		if (ret)
128
			break;
129

130
		cond_resched();
131
	}
132

133
	if (ret)
134
		btrfs_unlock_dio_extent(io_tree, lockstart, lockend, cached_state);
135
	return ret;
136
}
137

138
static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
139
						  struct btrfs_dio_data *dio_data,
140
						  const u64 start,
141
						  const struct btrfs_file_extent *file_extent,
142
						  const int type)
143
{
144
	struct extent_map *em = NULL;
145
	struct btrfs_ordered_extent *ordered;
146

147
	if (type != BTRFS_ORDERED_NOCOW) {
148
		em = btrfs_create_io_em(inode, start, file_extent, type);
149
		if (IS_ERR(em))
150
			goto out;
151
	}
152

153
	ordered = btrfs_alloc_ordered_extent(inode, start, file_extent,
154
					     (1U << type) |
155
					     (1U << BTRFS_ORDERED_DIRECT));
156
	if (IS_ERR(ordered)) {
157
		if (em) {
158
			btrfs_free_extent_map(em);
159
			btrfs_drop_extent_map_range(inode, start,
160
					start + file_extent->num_bytes - 1, false);
161
		}
162
		em = ERR_CAST(ordered);
163
	} else {
164
		ASSERT(!dio_data->ordered);
165
		dio_data->ordered = ordered;
166
	}
167
 out:
168

169
	return em;
170
}
171

172
static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
173
						  struct btrfs_dio_data *dio_data,
174
						  u64 start, u64 len)
175
{
176
	struct btrfs_root *root = inode->root;
177
	struct btrfs_fs_info *fs_info = root->fs_info;
178
	struct btrfs_file_extent file_extent;
179
	struct extent_map *em;
180
	struct btrfs_key ins;
181
	u64 alloc_hint;
182
	int ret;
183

184
	alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len);
185
again:
186
	ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
187
				   0, alloc_hint, &ins, 1, 1);
188
	if (ret == -EAGAIN) {
189
		ASSERT(btrfs_is_zoned(fs_info));
190
		wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
191
			       TASK_UNINTERRUPTIBLE);
192
		goto again;
193
	}
194
	if (ret)
195
		return ERR_PTR(ret);
196

197
	file_extent.disk_bytenr = ins.objectid;
198
	file_extent.disk_num_bytes = ins.offset;
199
	file_extent.num_bytes = ins.offset;
200
	file_extent.ram_bytes = ins.offset;
201
	file_extent.offset = 0;
202
	file_extent.compression = BTRFS_COMPRESS_NONE;
203
	em = btrfs_create_dio_extent(inode, dio_data, start, &file_extent,
204
				     BTRFS_ORDERED_REGULAR);
205
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
206
	if (IS_ERR(em))
207
		btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
208

209
	return em;
210
}
211

212
static int btrfs_get_blocks_direct_write(struct extent_map **map,
213
					 struct inode *inode,
214
					 struct btrfs_dio_data *dio_data,
215
					 u64 start, u64 *lenp,
216
					 unsigned int iomap_flags)
217
{
218
	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
219
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
220
	struct btrfs_file_extent file_extent;
221
	struct extent_map *em = *map;
222
	int type;
223
	u64 block_start;
224
	struct btrfs_block_group *bg;
225
	bool can_nocow = false;
226
	bool space_reserved = false;
227
	u64 len = *lenp;
228
	u64 prev_len;
229
	int ret = 0;
230

231
	/*
232
	 * We don't allocate a new extent in the following cases
233
	 *
234
	 * 1) The inode is marked as NODATACOW. In this case we'll just use the
235
	 * existing extent.
236
	 * 2) The extent is marked as PREALLOC. We're good to go here and can
237
	 * just use the extent.
238
	 *
239
	 */
240
	if ((em->flags & EXTENT_FLAG_PREALLOC) ||
241
	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
242
	     em->disk_bytenr != EXTENT_MAP_HOLE)) {
243
		if (em->flags & EXTENT_FLAG_PREALLOC)
244
			type = BTRFS_ORDERED_PREALLOC;
245
		else
246
			type = BTRFS_ORDERED_NOCOW;
247
		len = min(len, em->len - (start - em->start));
248
		block_start = btrfs_extent_map_block_start(em) + (start - em->start);
249

250
		if (can_nocow_extent(BTRFS_I(inode), start, &len, &file_extent,
251
				     false) == 1) {
252
			bg = btrfs_inc_nocow_writers(fs_info, block_start);
253
			if (bg)
254
				can_nocow = true;
255
		}
256
	}
257

258
	prev_len = len;
259
	if (can_nocow) {
260
		struct extent_map *em2;
261

262
		/* We can NOCOW, so only need to reserve metadata space. */
263
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
264
						      nowait);
265
		if (ret < 0) {
266
			/* Our caller expects us to free the input extent map. */
267
			btrfs_free_extent_map(em);
268
			*map = NULL;
269
			btrfs_dec_nocow_writers(bg);
270
			if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
271
				ret = -EAGAIN;
272
			goto out;
273
		}
274
		space_reserved = true;
275

276
		em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start,
277
					      &file_extent, type);
278
		btrfs_dec_nocow_writers(bg);
279
		if (type == BTRFS_ORDERED_PREALLOC) {
280
			btrfs_free_extent_map(em);
281
			*map = em2;
282
			em = em2;
283
		}
284

285
		if (IS_ERR(em2)) {
286
			ret = PTR_ERR(em2);
287
			goto out;
288
		}
289

290
		dio_data->nocow_done = true;
291
	} else {
292
		/* Our caller expects us to free the input extent map. */
293
		btrfs_free_extent_map(em);
294
		*map = NULL;
295

296
		if (nowait) {
297
			ret = -EAGAIN;
298
			goto out;
299
		}
300

301
		/*
302
		 * If we could not allocate data space before locking the file
303
		 * range and we can't do a NOCOW write, then we have to fail.
304
		 */
305
		if (!dio_data->data_space_reserved) {
306
			ret = -ENOSPC;
307
			goto out;
308
		}
309

310
		/*
311
		 * We have to COW and we have already reserved data space before,
312
		 * so now we reserve only metadata.
313
		 */
314
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
315
						      false);
316
		if (ret < 0)
317
			goto out;
318
		space_reserved = true;
319

320
		em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
321
		if (IS_ERR(em)) {
322
			ret = PTR_ERR(em);
323
			goto out;
324
		}
325
		*map = em;
326
		len = min(len, em->len - (start - em->start));
327
		if (len < prev_len)
328
			btrfs_delalloc_release_metadata(BTRFS_I(inode),
329
							prev_len - len, true);
330
	}
331

332
	/*
333
	 * We have created our ordered extent, so we can now release our reservation
334
	 * for an outstanding extent.
335
	 */
336
	btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
337

338
	/*
339
	 * Need to update the i_size under the extent lock so buffered
340
	 * readers will get the updated i_size when we unlock.
341
	 */
342
	if (start + len > i_size_read(inode))
343
		i_size_write(inode, start + len);
344
out:
345
	if (ret && space_reserved) {
346
		btrfs_delalloc_release_extents(BTRFS_I(inode), len);
347
		btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
348
	}
349
	*lenp = len;
350
	return ret;
351
}
352

353
static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
354
		loff_t length, unsigned int flags, struct iomap *iomap,
355
		struct iomap *srcmap)
356
{
357
	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
358
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
359
	struct extent_map *em;
360
	struct extent_state *cached_state = NULL;
361
	struct btrfs_dio_data *dio_data = iter->private;
362
	u64 lockstart, lockend;
363
	const bool write = !!(flags & IOMAP_WRITE);
364
	int ret = 0;
365
	u64 len = length;
366
	const u64 data_alloc_len = length;
367
	u32 unlock_bits = EXTENT_LOCKED;
368

369
	/*
370
	 * We could potentially fault if we have a buffer > PAGE_SIZE, and if
371
	 * we're NOWAIT we may submit a bio for a partial range and return
372
	 * EIOCBQUEUED, which would result in an errant short read.
373
	 *
374
	 * The best way to handle this would be to allow for partial completions
375
	 * of iocb's, so we could submit the partial bio, return and fault in
376
	 * the rest of the pages, and then submit the io for the rest of the
377
	 * range.  However we don't have that currently, so simply return
378
	 * -EAGAIN at this point so that the normal path is used.
379
	 */
380
	if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
381
		return -EAGAIN;
382

383
	/*
384
	 * Cap the size of reads to that usually seen in buffered I/O as we need
385
	 * to allocate a contiguous array for the checksums.
386
	 */
387
	if (!write)
388
		len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
389

390
	lockstart = start;
391
	lockend = start + len - 1;
392

393
	/*
394
	 * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
395
	 * enough if we've written compressed pages to this area, so we need to
396
	 * flush the dirty pages again to make absolutely sure that any
397
	 * outstanding dirty pages are on disk - the first flush only starts
398
	 * compression on the data, while keeping the pages locked, so by the
399
	 * time the second flush returns we know bios for the compressed pages
400
	 * were submitted and finished, and the pages no longer under writeback.
401
	 *
402
	 * If we have a NOWAIT request and we have any pages in the range that
403
	 * are locked, likely due to compression still in progress, we don't want
404
	 * to block on page locks. We also don't want to block on pages marked as
405
	 * dirty or under writeback (same as for the non-compression case).
406
	 * iomap_dio_rw() did the same check, but after that and before we got
407
	 * here, mmap'ed writes may have happened or buffered reads started
408
	 * (readpage() and readahead(), which lock pages), as we haven't locked
409
	 * the file range yet.
410
	 */
411
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
412
		     &BTRFS_I(inode)->runtime_flags)) {
413
		if (flags & IOMAP_NOWAIT) {
414
			if (filemap_range_needs_writeback(inode->i_mapping,
415
							  lockstart, lockend))
416
				return -EAGAIN;
417
		} else {
418
			ret = filemap_fdatawrite_range(inode->i_mapping, start,
419
						       start + length - 1);
420
			if (ret)
421
				return ret;
422
		}
423
	}
424

425
	memset(dio_data, 0, sizeof(*dio_data));
426

427
	/*
428
	 * We always try to allocate data space and must do it before locking
429
	 * the file range, to avoid deadlocks with concurrent writes to the same
430
	 * range if the range has several extents and the writes don't expand the
431
	 * current i_size (the inode lock is taken in shared mode). If we fail to
432
	 * allocate data space here we continue and later, after locking the
433
	 * file range, we fail with ENOSPC only if we figure out we can not do a
434
	 * NOCOW write.
435
	 */
436
	if (write && !(flags & IOMAP_NOWAIT)) {
437
		ret = btrfs_check_data_free_space(BTRFS_I(inode),
438
						  &dio_data->data_reserved,
439
						  start, data_alloc_len, false);
440
		if (!ret)
441
			dio_data->data_space_reserved = true;
442
		else if (!(BTRFS_I(inode)->flags &
443
			   (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
444
			goto err;
445
	}
446

447
	/*
448
	 * If this errors out it's because we couldn't invalidate pagecache for
449
	 * this range and we need to fallback to buffered IO, or we are doing a
450
	 * NOWAIT read/write and we need to block.
451
	 */
452
	ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
453
	if (ret < 0)
454
		goto err;
455

456
	em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);
457
	if (IS_ERR(em)) {
458
		ret = PTR_ERR(em);
459
		goto unlock_err;
460
	}
461

462
	/*
463
	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
464
	 * io.  INLINE is special, and we could probably kludge it in here, but
465
	 * it's still buffered so for safety lets just fall back to the generic
466
	 * buffered path.
467
	 *
468
	 * For COMPRESSED we _have_ to read the entire extent in so we can
469
	 * decompress it, so there will be buffering required no matter what we
470
	 * do, so go ahead and fallback to buffered.
471
	 *
472
	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
473
	 * to buffered IO.  Don't blame me, this is the price we pay for using
474
	 * the generic code.
475
	 */
476
	if (btrfs_extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {
477
		btrfs_free_extent_map(em);
478
		/*
479
		 * If we are in a NOWAIT context, return -EAGAIN in order to
480
		 * fallback to buffered IO. This is not only because we can
481
		 * block with buffered IO (no support for NOWAIT semantics at
482
		 * the moment) but also to avoid returning short reads to user
483
		 * space - this happens if we were able to read some data from
484
		 * previous non-compressed extents and then when we fallback to
485
		 * buffered IO, at btrfs_file_read_iter() by calling
486
		 * filemap_read(), we fail to fault in pages for the read buffer,
487
		 * in which case filemap_read() returns a short read (the number
488
		 * of bytes previously read is > 0, so it does not return -EFAULT).
489
		 */
490
		ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
491
		goto unlock_err;
492
	}
493

494
	len = min(len, em->len - (start - em->start));
495

496
	/*
497
	 * If we have a NOWAIT request and the range contains multiple extents
498
	 * (or a mix of extents and holes), then we return -EAGAIN to make the
499
	 * caller fallback to a context where it can do a blocking (without
500
	 * NOWAIT) request. This way we avoid doing partial IO and returning
501
	 * success to the caller, which is not optimal for writes and for reads
502
	 * it can result in unexpected behaviour for an application.
503
	 *
504
	 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
505
	 * iomap_dio_rw(), we can end up returning less data then what the caller
506
	 * asked for, resulting in an unexpected, and incorrect, short read.
507
	 * That is, the caller asked to read N bytes and we return less than that,
508
	 * which is wrong unless we are crossing EOF. This happens if we get a
509
	 * page fault error when trying to fault in pages for the buffer that is
510
	 * associated to the struct iov_iter passed to iomap_dio_rw(), and we
511
	 * have previously submitted bios for other extents in the range, in
512
	 * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
513
	 * those bios have completed by the time we get the page fault error,
514
	 * which we return back to our caller - we should only return EIOCBQUEUED
515
	 * after we have submitted bios for all the extents in the range.
516
	 */
517
	if ((flags & IOMAP_NOWAIT) && len < length) {
518
		btrfs_free_extent_map(em);
519
		ret = -EAGAIN;
520
		goto unlock_err;
521
	}
522

523
	if (write) {
524
		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
525
						    start, &len, flags);
526
		if (ret < 0)
527
			goto unlock_err;
528
		/* Recalc len in case the new em is smaller than requested */
529
		len = min(len, em->len - (start - em->start));
530
		if (dio_data->data_space_reserved) {
531
			u64 release_offset;
532
			u64 release_len = 0;
533

534
			if (dio_data->nocow_done) {
535
				release_offset = start;
536
				release_len = data_alloc_len;
537
			} else if (len < data_alloc_len) {
538
				release_offset = start + len;
539
				release_len = data_alloc_len - len;
540
			}
541

542
			if (release_len > 0)
543
				btrfs_free_reserved_data_space(BTRFS_I(inode),
544
							       dio_data->data_reserved,
545
							       release_offset,
546
							       release_len);
547
		}
548
	}
549

550
	/*
551
	 * Translate extent map information to iomap.
552
	 * We trim the extents (and move the addr) even though iomap code does
553
	 * that, since we have locked only the parts we are performing I/O in.
554
	 */
555
	if ((em->disk_bytenr == EXTENT_MAP_HOLE) ||
556
	    ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
557
		iomap->addr = IOMAP_NULL_ADDR;
558
		iomap->type = IOMAP_HOLE;
559
	} else {
560
		iomap->addr = btrfs_extent_map_block_start(em) + (start - em->start);
561
		iomap->type = IOMAP_MAPPED;
562
	}
563
	iomap->offset = start;
564
	iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
565
	iomap->length = len;
566
	btrfs_free_extent_map(em);
567

568
	/*
569
	 * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed,
570
	 * writes only hold it for this part.  We hold the extent lock until
571
	 * we're completely done with the extent map to make sure it remains
572
	 * valid.
573
	 */
574
	if (write)
575
		unlock_bits |= EXTENT_DIO_LOCKED;
576

577
	btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
578
			       unlock_bits, &cached_state);
579

580
	/* We didn't use everything, unlock the dio extent for the remainder. */
581
	if (!write && (start + len) < lockend)
582
		btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len,
583
					lockend, NULL);
584

585
	return 0;
586

587
unlock_err:
588
	/*
589
	 * Don't use EXTENT_LOCK_BITS here in case we extend it later and forget
590
	 * to update this, be explicit that we expect EXTENT_LOCKED and
591
	 * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing.
592
	 */
593
	btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
594
			       EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state);
595
err:
596
	if (dio_data->data_space_reserved) {
597
		btrfs_free_reserved_data_space(BTRFS_I(inode),
598
					       dio_data->data_reserved,
599
					       start, data_alloc_len);
600
		extent_changeset_free(dio_data->data_reserved);
601
	}
602

603
	return ret;
604
}
605

606
static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
607
		ssize_t written, unsigned int flags, struct iomap *iomap)
608
{
609
	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
610
	struct btrfs_dio_data *dio_data = iter->private;
611
	size_t submitted = dio_data->submitted;
612
	const bool write = !!(flags & IOMAP_WRITE);
613
	int ret = 0;
614

615
	if (!write && (iomap->type == IOMAP_HOLE)) {
616
		/* If reading from a hole, unlock and return */
617
		btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
618
					pos + length - 1, NULL);
619
		return 0;
620
	}
621

622
	if (submitted < length) {
623
		pos += submitted;
624
		length -= submitted;
625
		if (write)
626
			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
627
						    pos, length, false);
628
		else
629
			btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,
630
						pos + length - 1, NULL);
631
		ret = -ENOTBLK;
632
	}
633
	if (write) {
634
		btrfs_put_ordered_extent(dio_data->ordered);
635
		dio_data->ordered = NULL;
636
	}
637

638
	if (write)
639
		extent_changeset_free(dio_data->data_reserved);
640
	return ret;
641
}
642

643
static void btrfs_dio_end_io(struct btrfs_bio *bbio)
644
{
645
	struct btrfs_dio_private *dip =
646
		container_of(bbio, struct btrfs_dio_private, bbio);
647
	struct btrfs_inode *inode = bbio->inode;
648
	struct bio *bio = &bbio->bio;
649

650
	if (bio->bi_status) {
651
		btrfs_warn(inode->root->fs_info,
652
		"direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
653
			   btrfs_ino(inode), bio->bi_opf,
654
			   dip->file_offset, dip->bytes, bio->bi_status);
655
	}
656

657
	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
658
		btrfs_finish_ordered_extent(bbio->ordered, NULL,
659
					    dip->file_offset, dip->bytes,
660
					    !bio->bi_status);
661
	} else {
662
		btrfs_unlock_dio_extent(&inode->io_tree, dip->file_offset,
663
					dip->file_offset + dip->bytes - 1, NULL);
664
	}
665

666
	bbio->bio.bi_private = bbio->private;
667
	iomap_dio_bio_end_io(bio);
668
}
669

670
static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
671
					struct btrfs_ordered_extent *ordered)
672
{
673
	u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
674
	u64 len = bbio->bio.bi_iter.bi_size;
675
	struct btrfs_ordered_extent *new;
676
	int ret;
677

678
	/* Must always be called for the beginning of an ordered extent. */
679
	if (WARN_ON_ONCE(start != ordered->disk_bytenr))
680
		return -EINVAL;
681

682
	/* No need to split if the ordered extent covers the entire bio. */
683
	if (ordered->disk_num_bytes == len) {
684
		refcount_inc(&ordered->refs);
685
		bbio->ordered = ordered;
686
		return 0;
687
	}
688

689
	/*
690
	 * Don't split the extent_map for NOCOW extents, as we're writing into
691
	 * a pre-existing one.
692
	 */
693
	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
694
		ret = btrfs_split_extent_map(bbio->inode, bbio->file_offset,
695
					     ordered->num_bytes, len,
696
					     ordered->disk_bytenr);
697
		if (ret)
698
			return ret;
699
	}
700

701
	new = btrfs_split_ordered_extent(ordered, len);
702
	if (IS_ERR(new))
703
		return PTR_ERR(new);
704
	bbio->ordered = new;
705
	return 0;
706
}
707

708
static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
709
				loff_t file_offset)
710
{
711
	struct btrfs_bio *bbio = btrfs_bio(bio);
712
	struct btrfs_dio_private *dip =
713
		container_of(bbio, struct btrfs_dio_private, bbio);
714
	struct btrfs_dio_data *dio_data = iter->private;
715

716
	btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
717
		       btrfs_dio_end_io, bio->bi_private);
718
	bbio->inode = BTRFS_I(iter->inode);
719
	bbio->file_offset = file_offset;
720

721
	dip->file_offset = file_offset;
722
	dip->bytes = bio->bi_iter.bi_size;
723

724
	dio_data->submitted += bio->bi_iter.bi_size;
725

726
	/*
727
	 * Check if we are doing a partial write.  If we are, we need to split
728
	 * the ordered extent to match the submitted bio.  Hang on to the
729
	 * remaining unfinishable ordered_extent in dio_data so that it can be
730
	 * cancelled in iomap_end to avoid a deadlock wherein faulting the
731
	 * remaining pages is blocked on the outstanding ordered extent.
732
	 */
733
	if (iter->flags & IOMAP_WRITE) {
734
		int ret;
735

736
		ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
737
		if (ret) {
738
			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
739
						    file_offset, dip->bytes,
740
						    !ret);
741
			bio->bi_status = errno_to_blk_status(ret);
742
			iomap_dio_bio_end_io(bio);
743
			return;
744
		}
745
	}
746

747
	btrfs_submit_bbio(bbio, 0);
748
}
749

750
static const struct iomap_ops btrfs_dio_iomap_ops = {
751
	.iomap_begin            = btrfs_dio_iomap_begin,
752
	.iomap_end              = btrfs_dio_iomap_end,
753
};
754

755
static const struct iomap_dio_ops btrfs_dio_ops = {
756
	.submit_io		= btrfs_dio_submit_io,
757
	.bio_set		= &btrfs_dio_bioset,
758
};
759

760
static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
761
			      size_t done_before)
762
{
763
	struct btrfs_dio_data data = { 0 };
764

765
	return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
766
			    IOMAP_DIO_PARTIAL, &data, done_before);
767
}
768

769
static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
770
					 size_t done_before)
771
{
772
	struct btrfs_dio_data data = { 0 };
773

774
	return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
775
			    IOMAP_DIO_PARTIAL, &data, done_before);
776
}
777

778
static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
779
			       const struct iov_iter *iter, loff_t offset)
780
{
781
	const u32 blocksize_mask = fs_info->sectorsize - 1;
782

783
	if (offset & blocksize_mask)
784
		return -EINVAL;
785

786
	if (iov_iter_alignment(iter) & blocksize_mask)
787
		return -EINVAL;
788

789
	/*
790
	 * For bs > ps support, we heavily rely on large folios to make sure no
791
	 * block will cross large folio boundaries.
792
	 *
793
	 * But memory provided by direct IO is only virtually contiguous, not
794
	 * physically contiguous, and will break the btrfs' large folio requirement.
795
	 *
796
	 * So for bs > ps support, all direct IOs should fallback to buffered ones.
797
	 */
798
	if (fs_info->sectorsize > PAGE_SIZE)
799
		return -EINVAL;
800

801
	return 0;
802
}
803

804
ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
805
{
806
	struct file *file = iocb->ki_filp;
807
	struct inode *inode = file_inode(file);
808
	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
809
	loff_t pos;
810
	ssize_t written = 0;
811
	ssize_t written_buffered;
812
	size_t prev_left = 0;
813
	loff_t endbyte;
814
	ssize_t ret;
815
	unsigned int ilock_flags = 0;
816
	struct iomap_dio *dio;
817

818
	if (iocb->ki_flags & IOCB_NOWAIT)
819
		ilock_flags |= BTRFS_ILOCK_TRY;
820

821
	/*
822
	 * If the write DIO is within EOF, use a shared lock and also only if
823
	 * security bits will likely not be dropped by file_remove_privs() called
824
	 * from btrfs_write_check(). Either will need to be rechecked after the
825
	 * lock was acquired.
826
	 */
827
	if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
828
		ilock_flags |= BTRFS_ILOCK_SHARED;
829

830
relock:
831
	ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
832
	if (ret < 0)
833
		return ret;
834

835
	/* Shared lock cannot be used with security bits set. */
836
	if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
837
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
838
		ilock_flags &= ~BTRFS_ILOCK_SHARED;
839
		goto relock;
840
	}
841

842
	ret = generic_write_checks(iocb, from);
843
	if (ret <= 0) {
844
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
845
		return ret;
846
	}
847

848
	ret = btrfs_write_check(iocb, ret);
849
	if (ret < 0) {
850
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
851
		goto out;
852
	}
853

854
	pos = iocb->ki_pos;
855
	/*
856
	 * Re-check since file size may have changed just before taking the
857
	 * lock or pos may have changed because of O_APPEND in generic_write_check()
858
	 */
859
	if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
860
	    pos + iov_iter_count(from) > i_size_read(inode)) {
861
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
862
		ilock_flags &= ~BTRFS_ILOCK_SHARED;
863
		goto relock;
864
	}
865

866
	if (check_direct_IO(fs_info, from, pos)) {
867
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
868
		goto buffered;
869
	}
870
	/*
871
	 * We can't control the folios being passed in, applications can write
872
	 * to them while a direct IO write is in progress.  This means the
873
	 * content might change after we calculated the data checksum.
874
	 * Therefore we can end up storing a checksum that doesn't match the
875
	 * persisted data.
876
	 *
877
	 * To be extra safe and avoid false data checksum mismatch, if the
878
	 * inode requires data checksum, just fallback to buffered IO.
879
	 * For buffered IO we have full control of page cache and can ensure
880
	 * no one is modifying the content during writeback.
881
	 */
882
	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
883
		btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
884
		goto buffered;
885
	}
886

887
	/*
888
	 * The iov_iter can be mapped to the same file range we are writing to.
889
	 * If that's the case, then we will deadlock in the iomap code, because
890
	 * it first calls our callback btrfs_dio_iomap_begin(), which will create
891
	 * an ordered extent, and after that it will fault in the pages that the
892
	 * iov_iter refers to. During the fault in we end up in the readahead
893
	 * pages code (starting at btrfs_readahead()), which will lock the range,
894
	 * find that ordered extent and then wait for it to complete (at
895
	 * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
896
	 * obviously the ordered extent can never complete as we didn't submit
897
	 * yet the respective bio(s). This always happens when the buffer is
898
	 * memory mapped to the same file range, since the iomap DIO code always
899
	 * invalidates pages in the target file range (after starting and waiting
900
	 * for any writeback).
901
	 *
902
	 * So here we disable page faults in the iov_iter and then retry if we
903
	 * got -EFAULT, faulting in the pages before the retry.
904
	 */
905
again:
906
	from->nofault = true;
907
	dio = btrfs_dio_write(iocb, from, written);
908
	from->nofault = false;
909

910
	if (IS_ERR_OR_NULL(dio)) {
911
		ret = PTR_ERR_OR_ZERO(dio);
912
	} else {
913
		/*
914
		 * If we have a synchronous write, we must make sure the fsync
915
		 * triggered by the iomap_dio_complete() call below doesn't
916
		 * deadlock on the inode lock - we are already holding it and we
917
		 * can't call it after unlocking because we may need to complete
918
		 * partial writes due to the input buffer (or parts of it) not
919
		 * being already faulted in.
920
		 */
921
		ASSERT(current->journal_info == NULL);
922
		current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;
923
		ret = iomap_dio_complete(dio);
924
		current->journal_info = NULL;
925
	}
926

927
	/* No increment (+=) because iomap returns a cumulative value. */
928
	if (ret > 0)
929
		written = ret;
930

931
	if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {
932
		const size_t left = iov_iter_count(from);
933
		/*
934
		 * We have more data left to write. Try to fault in as many as
935
		 * possible of the remainder pages and retry. We do this without
936
		 * releasing and locking again the inode, to prevent races with
937
		 * truncate.
938
		 *
939
		 * Also, in case the iov refers to pages in the file range of the
940
		 * file we want to write to (due to a mmap), we could enter an
941
		 * infinite loop if we retry after faulting the pages in, since
942
		 * iomap will invalidate any pages in the range early on, before
943
		 * it tries to fault in the pages of the iov. So we keep track of
944
		 * how much was left of iov in the previous EFAULT and fallback
945
		 * to buffered IO in case we haven't made any progress.
946
		 */
947
		if (left == prev_left) {
948
			ret = -ENOTBLK;
949
		} else {
950
			fault_in_iov_iter_readable(from, left);
951
			prev_left = left;
952
			goto again;
953
		}
954
	}
955

956
	btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
957

958
	/*
959
	 * If 'ret' is -ENOTBLK or we have not written all data, then it means
960
	 * we must fallback to buffered IO.
961
	 */
962
	if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))
963
		goto out;
964

965
buffered:
966
	/*
967
	 * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
968
	 * it must retry the operation in a context where blocking is acceptable,
969
	 * because even if we end up not blocking during the buffered IO attempt
970
	 * below, we will block when flushing and waiting for the IO.
971
	 */
972
	if (iocb->ki_flags & IOCB_NOWAIT) {
973
		ret = -EAGAIN;
974
		goto out;
975
	}
976

977
	pos = iocb->ki_pos;
978
	written_buffered = btrfs_buffered_write(iocb, from);
979
	if (written_buffered < 0) {
980
		ret = written_buffered;
981
		goto out;
982
	}
983
	/*
984
	 * Ensure all data is persisted. We want the next direct IO read to be
985
	 * able to read what was just written.
986
	 */
987
	endbyte = pos + written_buffered - 1;
988
	ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte);
989
	if (ret)
990
		goto out;
991
	ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
992
	if (ret)
993
		goto out;
994
	written += written_buffered;
995
	iocb->ki_pos = pos + written_buffered;
996
	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
997
				 endbyte >> PAGE_SHIFT);
998
out:
999
	return ret < 0 ? ret : written;
1000
}
1001

1002
static int check_direct_read(struct btrfs_fs_info *fs_info,
1003
			     const struct iov_iter *iter, loff_t offset)
1004
{
1005
	int ret;
1006
	int i, seg;
1007

1008
	ret = check_direct_IO(fs_info, iter, offset);
1009
	if (ret < 0)
1010
		return ret;
1011

1012
	if (!iter_is_iovec(iter))
1013
		return 0;
1014

1015
	for (seg = 0; seg < iter->nr_segs; seg++) {
1016
		for (i = seg + 1; i < iter->nr_segs; i++) {
1017
			const struct iovec *iov1 = iter_iov(iter) + seg;
1018
			const struct iovec *iov2 = iter_iov(iter) + i;
1019

1020
			if (iov1->iov_base == iov2->iov_base)
1021
				return -EINVAL;
1022
		}
1023
	}
1024
	return 0;
1025
}
1026

1027
ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
1028
{
1029
	struct inode *inode = file_inode(iocb->ki_filp);
1030
	size_t prev_left = 0;
1031
	ssize_t read = 0;
1032
	ssize_t ret;
1033

1034
	if (fsverity_active(inode))
1035
		return 0;
1036

1037
	if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))
1038
		return 0;
1039

1040
	btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
1041
again:
1042
	/*
1043
	 * This is similar to what we do for direct IO writes, see the comment
1044
	 * at btrfs_direct_write(), but we also disable page faults in addition
1045
	 * to disabling them only at the iov_iter level. This is because when
1046
	 * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
1047
	 * which can still trigger page fault ins despite having set ->nofault
1048
	 * to true of our 'to' iov_iter.
1049
	 *
1050
	 * The difference to direct IO writes is that we deadlock when trying
1051
	 * to lock the extent range in the inode's tree during he page reads
1052
	 * triggered by the fault in (while for writes it is due to waiting for
1053
	 * our own ordered extent). This is because for direct IO reads,
1054
	 * btrfs_dio_iomap_begin() returns with the extent range locked, which
1055
	 * is only unlocked in the endio callback (end_bio_extent_readpage()).
1056
	 */
1057
	pagefault_disable();
1058
	to->nofault = true;
1059
	ret = btrfs_dio_read(iocb, to, read);
1060
	to->nofault = false;
1061
	pagefault_enable();
1062

1063
	/* No increment (+=) because iomap returns a cumulative value. */
1064
	if (ret > 0)
1065
		read = ret;
1066

1067
	if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
1068
		const size_t left = iov_iter_count(to);
1069

1070
		if (left == prev_left) {
1071
			/*
1072
			 * We didn't make any progress since the last attempt,
1073
			 * fallback to a buffered read for the remainder of the
1074
			 * range. This is just to avoid any possibility of looping
1075
			 * for too long.
1076
			 */
1077
			ret = read;
1078
		} else {
1079
			/*
1080
			 * We made some progress since the last retry or this is
1081
			 * the first time we are retrying. Fault in as many pages
1082
			 * as possible and retry.
1083
			 */
1084
			fault_in_iov_iter_writeable(to, left);
1085
			prev_left = left;
1086
			goto again;
1087
		}
1088
	}
1089
	btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
1090
	return ret < 0 ? ret : read;
1091
}
1092

1093
int __init btrfs_init_dio(void)
1094
{
1095
	if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
1096
			offsetof(struct btrfs_dio_private, bbio.bio),
1097
			BIOSET_NEED_BVECS))
1098
		return -ENOMEM;
1099

1100
	return 0;
1101
}
1102

1103
void __cold btrfs_destroy_dio(void)
1104
{
1105
	bioset_exit(&btrfs_dio_bioset);
1106
}
1107

1108
Product

Resources

Company