GitHub Repository: torvalds/linux
Path: blob/master/fs/btrfs/bio.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 * Copyright (C) 2022 Christoph Hellwig.
 */

#include <linux/bio.h>
#include "bio.h"
#include "ctree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "dev-replace.h"
#include "zoned.h"
#include "file-item.h"
#include "raid-stripe-tree.h"

static struct bio_set btrfs_bioset;
static struct bio_set btrfs_clone_bioset;
static struct bio_set btrfs_repair_bioset;
static mempool_t btrfs_failed_bio_pool;

struct btrfs_failed_bio {
	struct btrfs_bio *bbio;
	int num_copies;
	atomic_t repair_count;
};

/* Is this a data path I/O that needs storage layer checksum and repair? */
static inline bool is_data_bbio(const struct btrfs_bio *bbio)
{
	return bbio->inode && is_data_inode(bbio->inode);
}

static bool bbio_has_ordered_extent(const struct btrfs_bio *bbio)
{
	return is_data_bbio(bbio) && btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE;
}

/*
 * Initialize a btrfs_bio structure. This skips the embedded bio itself as it
 * is already initialized by the block layer.
 */
void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
		    btrfs_bio_end_io_t end_io, void *private)
{
	memset(bbio, 0, offsetof(struct btrfs_bio, bio));
	bbio->fs_info = fs_info;
	bbio->end_io = end_io;
	bbio->private = private;
	atomic_set(&bbio->pending_ios, 1);
	WRITE_ONCE(bbio->status, BLK_STS_OK);
}

/*
 * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for
 * btrfs, and is used for all I/O submitted through btrfs_submit_bbio().
 *
 * Just like the underlying bio_alloc_bioset it will not fail as it is backed by
 * a mempool.
 */
struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
				  struct btrfs_fs_info *fs_info,
				  btrfs_bio_end_io_t end_io, void *private)
{
	struct btrfs_bio *bbio;
	struct bio *bio;

	bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
	bbio = btrfs_bio(bio);
	btrfs_bio_init(bbio, fs_info, end_io, private);
	return bbio;
}

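/*
 * Illustrative usage sketch (not part of the original source): a typical
 * caller allocates a btrfs_bio, fills in the logical address and payload,
 * and hands it to btrfs_submit_bbio().  The surrounding context (inode,
 * folio, end_io handler, mirror_num 0 meaning "let btrfs pick a mirror")
 * is assumed here purely for illustration.
 *
 *	bbio = btrfs_bio_alloc(1, REQ_OP_READ, fs_info, my_end_io, ctx);
 *	bbio->inode = inode;
 *	bbio->file_offset = file_offset;
 *	bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT;
 *	bio_add_folio_nofail(&bbio->bio, folio, fs_info->sectorsize, 0);
 *	btrfs_submit_bbio(bbio, 0);
 */

/*
 * Split the first @map_length bytes off @orig_bbio into a new btrfs_bio
 * backed by btrfs_clone_bioset.  The original bbio keeps the remainder;
 * its pending_ios count is bumped so its end_io handler only runs once
 * every split part has completed.
 */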
static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info,
					 struct btrfs_bio *orig_bbio,
					 u64 map_length)
{
	struct btrfs_bio *bbio;
	struct bio *bio;

	bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, GFP_NOFS,
			&btrfs_clone_bioset);
	if (IS_ERR(bio))
		return ERR_CAST(bio);

	bbio = btrfs_bio(bio);
	btrfs_bio_init(bbio, fs_info, NULL, orig_bbio);
	bbio->inode = orig_bbio->inode;
	bbio->file_offset = orig_bbio->file_offset;
	orig_bbio->file_offset += map_length;
	if (bbio_has_ordered_extent(bbio)) {
		refcount_inc(&orig_bbio->ordered->refs);
		bbio->ordered = orig_bbio->ordered;
	}
	bbio->csum_search_commit_root = orig_bbio->csum_search_commit_root;
	atomic_inc(&orig_bbio->pending_ios);
	return bbio;
}

void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
	bbio->bio.bi_status = status;
	if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
		struct btrfs_bio *orig_bbio = bbio->private;

		/* Free bio that was never submitted to the underlying device. */
		if (bbio_has_ordered_extent(bbio))
			btrfs_put_ordered_extent(bbio->ordered);
		bio_put(&bbio->bio);

		bbio = orig_bbio;
	}

	/*
	 * At this point, bbio always points to the original btrfs_bio. Save
	 * the first error in it.
	 */
	if (status != BLK_STS_OK)
		cmpxchg(&bbio->status, BLK_STS_OK, status);

	if (atomic_dec_and_test(&bbio->pending_ios)) {
		/* Load split bio's error which might be set above. */
		if (status == BLK_STS_OK)
			bbio->bio.bi_status = READ_ONCE(bbio->status);

		if (bbio_has_ordered_extent(bbio)) {
			struct btrfs_ordered_extent *ordered = bbio->ordered;

			bbio->end_io(bbio);
			btrfs_put_ordered_extent(ordered);
		} else {
			bbio->end_io(bbio);
		}
	}
}

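/*
 * Mirror numbers are 1-based.  next_repair_mirror() and prev_repair_mirror()
 * rotate through the available copies with wrap-around: with num_copies == 3
 * and a failed read from mirror 2, repair reads try mirror 3, then mirror 1,
 * and give up once the rotation arrives back at mirror 2.
 */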
static int next_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror)
{
	if (cur_mirror == fbio->num_copies)
		return cur_mirror + 1 - fbio->num_copies;
	return cur_mirror + 1;
}

static int prev_repair_mirror(const struct btrfs_failed_bio *fbio, int cur_mirror)
{
	if (cur_mirror == 1)
		return fbio->num_copies;
	return cur_mirror - 1;
}

static void btrfs_repair_done(struct btrfs_failed_bio *fbio)
{
	if (atomic_dec_and_test(&fbio->repair_count)) {
		btrfs_bio_end_io(fbio->bbio, fbio->bbio->bio.bi_status);
		mempool_free(fbio, &btrfs_failed_bio_pool);
	}
}

static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
				 struct btrfs_device *dev)
{
	struct btrfs_failed_bio *fbio = repair_bbio->private;
	struct btrfs_inode *inode = repair_bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio);
	int mirror = repair_bbio->mirror_num;

	if (repair_bbio->bio.bi_status ||
	    !btrfs_data_csum_ok(repair_bbio, dev, 0, bvec_phys(bv))) {
		bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
		repair_bbio->bio.bi_iter = repair_bbio->saved_iter;

		mirror = next_repair_mirror(fbio, mirror);
		if (mirror == fbio->bbio->mirror_num) {
			btrfs_debug(fs_info, "no mirror left");
			fbio->bbio->bio.bi_status = BLK_STS_IOERR;
			goto done;
		}

		btrfs_submit_bbio(repair_bbio, mirror);
		return;
	}

	do {
		mirror = prev_repair_mirror(fbio, mirror);
		btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
				repair_bbio->file_offset, fs_info->sectorsize,
				repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
				bvec_phys(bv), mirror);
	} while (mirror != fbio->bbio->mirror_num);

done:
	btrfs_repair_done(fbio);
	bio_put(&repair_bbio->bio);
}

/*
 * Try to kick off a repair read to the next available mirror for a bad sector.
 *
 * This primarily tries to recover good data to serve the actual read request,
 * but when a repair read succeeds it also tries to write the good data back
 * to the bad mirror(s) to restore the redundancy.
 */
static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio,
						  u32 bio_offset,
						  phys_addr_t paddr,
						  struct btrfs_failed_bio *fbio)
{
	struct btrfs_inode *inode = failed_bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct folio *folio = page_folio(phys_to_page(paddr));
	const u32 sectorsize = fs_info->sectorsize;
	const u32 foff = offset_in_folio(folio, paddr);
	const u64 logical = (failed_bbio->saved_iter.bi_sector << SECTOR_SHIFT);
	struct btrfs_bio *repair_bbio;
	struct bio *repair_bio;
	int num_copies;
	int mirror;

	ASSERT(foff + sectorsize <= folio_size(folio));
	btrfs_debug(fs_info, "repair read error: read error at %llu",
		    failed_bbio->file_offset + bio_offset);

	num_copies = btrfs_num_copies(fs_info, logical, sectorsize);
	if (num_copies == 1) {
		btrfs_debug(fs_info, "no copy to repair from");
		failed_bbio->bio.bi_status = BLK_STS_IOERR;
		return fbio;
	}

	if (!fbio) {
		fbio = mempool_alloc(&btrfs_failed_bio_pool, GFP_NOFS);
		fbio->bbio = failed_bbio;
		fbio->num_copies = num_copies;
		atomic_set(&fbio->repair_count, 1);
	}

	atomic_inc(&fbio->repair_count);

	repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS,
				      &btrfs_repair_bioset);
	repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector;
	bio_add_folio_nofail(repair_bio, folio, sectorsize, foff);

	repair_bbio = btrfs_bio(repair_bio);
	btrfs_bio_init(repair_bbio, fs_info, NULL, fbio);
	repair_bbio->inode = failed_bbio->inode;
	repair_bbio->file_offset = failed_bbio->file_offset + bio_offset;

	mirror = next_repair_mirror(fbio, failed_bbio->mirror_num);
	btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror);
	btrfs_submit_bbio(repair_bbio, mirror);
	return fbio;
}

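/*
 * Verify the data checksum of every block in a completed read and kick off
 * a repair read for each block that failed or is corrupted, then complete
 * the original bbio (directly, or once the last repair read finishes).
 */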
static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *dev)
{
	struct btrfs_inode *inode = bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u32 sectorsize = fs_info->sectorsize;
	struct bvec_iter *iter = &bbio->saved_iter;
	blk_status_t status = bbio->bio.bi_status;
	struct btrfs_failed_bio *fbio = NULL;
	phys_addr_t paddr;
	u32 offset = 0;

	/* Read-repair requires the inode field to be set by the submitter. */
	ASSERT(inode);

	/*
	 * Hand off repair bios to the repair code as there is no upper level
	 * submitter for them.
	 */
	if (bbio->bio.bi_pool == &btrfs_repair_bioset) {
		btrfs_end_repair_bio(bbio, dev);
		return;
	}

	/* Clear the I/O error. A failed repair will reset it. */
	bbio->bio.bi_status = BLK_STS_OK;

	btrfs_bio_for_each_block(paddr, &bbio->bio, iter, fs_info->sectorsize) {
		if (status || !btrfs_data_csum_ok(bbio, dev, offset, paddr))
			fbio = repair_one_sector(bbio, offset, paddr, fbio);
		offset += sectorsize;
	}
	if (bbio->csum != bbio->csum_inline)
		kfree(bbio->csum);

	if (fbio)
		btrfs_repair_done(fbio);
	else
		btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}

static void btrfs_log_dev_io_error(const struct bio *bio, struct btrfs_device *dev)
{
	if (!dev || !dev->bdev)
		return;
	if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
		return;

	if (btrfs_op(bio) == BTRFS_MAP_WRITE)
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
	else if (!(bio->bi_opf & REQ_RAHEAD))
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
	if (bio->bi_opf & REQ_PREFLUSH)
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
}

static struct workqueue_struct *btrfs_end_io_wq(const struct btrfs_fs_info *fs_info,
						const struct bio *bio)
{
	if (bio->bi_opf & REQ_META)
		return fs_info->endio_meta_workers;
	return fs_info->endio_workers;
}

static void btrfs_end_bio_work(struct work_struct *work)
{
	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);

	/* Metadata reads are checked and repaired by the submitter. */
	if (is_data_bbio(bbio))
		btrfs_check_read_bio(bbio, bbio->bio.bi_private);
	else
		btrfs_bio_end_io(bbio, bbio->bio.bi_status);
}

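/*
 * End I/O handler for the single device fast path, where bi_private points
 * at the btrfs_device the bio was submitted to.  Reads are punted to a
 * workqueue (data reads get checksum verification and read-repair there);
 * writes are completed here directly.
 */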
static void btrfs_simple_end_io(struct bio *bio)
{
	struct btrfs_bio *bbio = btrfs_bio(bio);
	struct btrfs_device *dev = bio->bi_private;
	struct btrfs_fs_info *fs_info = bbio->fs_info;

	btrfs_bio_counter_dec(fs_info);

	if (bio->bi_status)
		btrfs_log_dev_io_error(bio, dev);

	if (bio_op(bio) == REQ_OP_READ) {
		INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
		queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
	} else {
		if (bio_is_zone_append(bio) && !bio->bi_status)
			btrfs_record_physical_zoned(bbio);
		btrfs_bio_end_io(bbio, bbio->bio.bi_status);
	}
}

static void btrfs_raid56_end_io(struct bio *bio)
{
	struct btrfs_io_context *bioc = bio->bi_private;
	struct btrfs_bio *bbio = btrfs_bio(bio);

	btrfs_bio_counter_dec(bioc->fs_info);
	bbio->mirror_num = bioc->mirror_num;
	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio))
		btrfs_check_read_bio(bbio, NULL);
	else
		btrfs_bio_end_io(bbio, bbio->bio.bi_status);

	btrfs_put_bioc(bioc);
}

static void btrfs_orig_write_end_io(struct bio *bio)
{
	struct btrfs_io_stripe *stripe = bio->bi_private;
	struct btrfs_io_context *bioc = stripe->bioc;
	struct btrfs_bio *bbio = btrfs_bio(bio);

	btrfs_bio_counter_dec(bioc->fs_info);

	if (bio->bi_status) {
		atomic_inc(&bioc->error);
		btrfs_log_dev_io_error(bio, stripe->dev);
	}

	/*
	 * Only send an error to the higher layers if it is beyond the tolerance
	 * threshold.
	 */
	if (atomic_read(&bioc->error) > bioc->max_errors)
		bio->bi_status = BLK_STS_IOERR;
	else
		bio->bi_status = BLK_STS_OK;

	if (bio_is_zone_append(bio) && !bio->bi_status)
		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;

	btrfs_bio_end_io(bbio, bbio->bio.bi_status);
	btrfs_put_bioc(bioc);
}

static void btrfs_clone_write_end_io(struct bio *bio)
{
	struct btrfs_io_stripe *stripe = bio->bi_private;

	if (bio->bi_status) {
		atomic_inc(&stripe->bioc->error);
		btrfs_log_dev_io_error(bio, stripe->dev);
	} else if (bio_is_zone_append(bio)) {
		stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
	}

	/* Pass on control to the original bio this one was cloned from */
	bio_endio(stripe->bioc->orig_bio);
	bio_put(bio);
}

static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
{
	if (!dev || !dev->bdev ||
	    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
	    (btrfs_op(bio) == BTRFS_MAP_WRITE &&
	     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
		bio_io_error(bio);
		return;
	}

	bio_set_dev(bio, dev->bdev);

	/*
	 * For zone append writing, bi_sector must point to the beginning of
	 * the zone.
	 */
	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
		u64 zone_start = round_down(physical, dev->fs_info->zone_size);

		ASSERT(btrfs_dev_is_sequential(dev, physical));
		bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
	}
	btrfs_debug(dev->fs_info,
		    "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
		    __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
		    (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
		    dev->devid, bio->bi_iter.bi_size);

	/*
	 * Track reads if tracking is enabled; ignore I/O operations before the
	 * filesystem is fully initialized.
	 */
	if (dev->fs_devices->collect_fs_stats && bio_op(bio) == REQ_OP_READ && dev->fs_info)
		percpu_counter_add(&dev->fs_info->stats_read_blocks,
				   bio->bi_iter.bi_size >> dev->fs_info->sectorsize_bits);

	if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT)
		blkcg_punt_bio_submit(bio);
	else
		submit_bio(bio);
}

static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
{
	struct bio *orig_bio = bioc->orig_bio, *bio;

	ASSERT(bio_op(orig_bio) != REQ_OP_READ);

	/* Reuse the bio embedded into the btrfs_bio for the last mirror */
	if (dev_nr == bioc->num_stripes - 1) {
		bio = orig_bio;
		bio->bi_end_io = btrfs_orig_write_end_io;
	} else {
		bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
		bio_inc_remaining(orig_bio);
		bio->bi_end_io = btrfs_clone_write_end_io;
	}

	bio->bi_private = &bioc->stripes[dev_nr];
	bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
	bioc->stripes[dev_nr].bioc = bioc;
	bioc->size = bio->bi_iter.bi_size;
	btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}

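/*
 * Hand a fully mapped bio to the storage layer.  Without an io_context the
 * bio goes straight to the single target device; RAID5/6 bios are routed
 * through the parity RAID code; everything else is a write that is
 * duplicated to every mirror stripe.
 */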
static void btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
			     struct btrfs_io_stripe *smap, int mirror_num)
{
	if (!bioc) {
		/* Single mirror read/write fast path. */
		btrfs_bio(bio)->mirror_num = mirror_num;
		bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
		if (bio_op(bio) != REQ_OP_READ)
			btrfs_bio(bio)->orig_physical = smap->physical;
		bio->bi_private = smap->dev;
		bio->bi_end_io = btrfs_simple_end_io;
		btrfs_submit_dev_bio(smap->dev, bio);
	} else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		/* Parity RAID write or read recovery. */
		bio->bi_private = bioc;
		bio->bi_end_io = btrfs_raid56_end_io;
		if (bio_op(bio) == REQ_OP_READ)
			raid56_parity_recover(bio, bioc, mirror_num);
		else
			raid56_parity_write(bio, bioc);
	} else {
		/* Write to multiple mirrors. */
		int total_devs = bioc->num_stripes;

		bioc->orig_bio = bio;
		for (int dev_nr = 0; dev_nr < total_devs; dev_nr++)
			btrfs_submit_mirrored_bio(bioc, dev_nr);
	}
}

static int btrfs_bio_csum(struct btrfs_bio *bbio)
{
	if (bbio->bio.bi_opf & REQ_META)
		return btree_csum_one_bio(bbio);
	return btrfs_csum_one_bio(bbio);
}

/*
 * Async submit bios are used to offload expensive checksumming onto the worker
 * threads.
 */
struct async_submit_bio {
	struct btrfs_bio *bbio;
	struct btrfs_io_context *bioc;
	struct btrfs_io_stripe smap;
	int mirror_num;
	struct btrfs_work work;
};

/*
 * In order to insert checksums into the metadata in large chunks, we wait
 * until bio submission time. All the pages in the bio are checksummed and
 * sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record are
 * inserted into the btree.
 */
static void run_one_async_start(struct btrfs_work *work)
{
	struct async_submit_bio *async =
		container_of(work, struct async_submit_bio, work);
	int ret;

	ret = btrfs_bio_csum(async->bbio);
	if (ret)
		async->bbio->bio.bi_status = errno_to_blk_status(ret);
}

/*
 * In order to insert checksums into the metadata in large chunks, we wait
 * until bio submission time. All the pages in the bio are checksummed and
 * sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record are
 * inserted into the tree.
 *
 * If called with @do_free == true, then it will free the work struct.
 */
static void run_one_async_done(struct btrfs_work *work, bool do_free)
{
	struct async_submit_bio *async =
		container_of(work, struct async_submit_bio, work);
	struct bio *bio = &async->bbio->bio;

	if (do_free) {
		kfree(container_of(work, struct async_submit_bio, work));
		return;
	}

	/* If an error occurred we just want to clean up the bio and move on. */
	if (bio->bi_status) {
		btrfs_bio_end_io(async->bbio, bio->bi_status);
		return;
	}

	/*
	 * All of the bios that pass through here are from async helpers.
	 * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's
	 * context. This changes nothing when cgroups aren't in use.
	 */
	bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT;
	btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num);
}

static bool should_async_write(struct btrfs_bio *bbio)
{
	bool auto_csum_mode = true;

#ifdef CONFIG_BTRFS_EXPERIMENTAL
	struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices;
	enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode);

	if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_OFF)
		return false;

	auto_csum_mode = (csum_mode == BTRFS_OFFLOAD_CSUM_AUTO);
#endif

	/* Submit synchronously if the checksum implementation is fast. */
	if (auto_csum_mode && test_bit(BTRFS_FS_CSUM_IMPL_FAST, &bbio->fs_info->flags))
		return false;

	/*
	 * Try to defer the submission to a workqueue to parallelize the
	 * checksum calculation unless the I/O is issued synchronously.
	 */
	if (op_is_sync(bbio->bio.bi_opf))
		return false;

	/* Zoned devices require I/O to be submitted in order. */
	if ((bbio->bio.bi_opf & REQ_META) && btrfs_is_zoned(bbio->fs_info))
		return false;

	return true;
}

/*
 * Submit bio to an async queue.
 *
 * Return true if the work has been successfully submitted, else false.
 */
static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
				struct btrfs_io_context *bioc,
				struct btrfs_io_stripe *smap, int mirror_num)
{
	struct btrfs_fs_info *fs_info = bbio->fs_info;
	struct async_submit_bio *async;

	async = kmalloc(sizeof(*async), GFP_NOFS);
	if (!async)
		return false;

	async->bbio = bbio;
	async->bioc = bioc;
	async->smap = *smap;
	async->mirror_num = mirror_num;

	btrfs_init_work(&async->work, run_one_async_start, run_one_async_done);
	btrfs_queue_work(fs_info->workers, &async->work);
	return true;
}

static u64 btrfs_append_map_length(struct btrfs_bio *bbio, u64 map_length)
{
	unsigned int nr_segs;
	int sector_offset;

	map_length = min(map_length, bbio->fs_info->max_zone_append_size);
	sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits,
					&nr_segs, map_length);
	if (sector_offset) {
		/*
		 * bio_split_rw_at() could split at a size smaller than our
		 * sectorsize and thus cause unaligned I/Os. Fix that by
		 * always rounding down to the nearest boundary.
		 */
		return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, bbio->fs_info->sectorsize);
	}
	return map_length;
}

static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
{
	struct btrfs_inode *inode = bbio->inode;
	struct btrfs_fs_info *fs_info = bbio->fs_info;
	struct bio *bio = &bbio->bio;
	u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
	u64 length = bio->bi_iter.bi_size;
	u64 map_length = length;
	bool use_append = btrfs_use_zone_append(bbio);
	struct btrfs_io_context *bioc = NULL;
	struct btrfs_io_stripe smap;
	blk_status_t status;
	int ret;

	if (!bbio->inode || btrfs_is_data_reloc_root(inode->root))
		smap.rst_search_commit_root = true;
	else
		smap.rst_search_commit_root = false;

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
			      &bioc, &smap, &mirror_num);
	if (ret) {
		status = errno_to_blk_status(ret);
		btrfs_bio_counter_dec(fs_info);
		goto end_bbio;
	}

	map_length = min(map_length, length);
	if (use_append)
		map_length = btrfs_append_map_length(bbio, map_length);

	if (map_length < length) {
		struct btrfs_bio *split;

		split = btrfs_split_bio(fs_info, bbio, map_length);
		if (IS_ERR(split)) {
			status = errno_to_blk_status(PTR_ERR(split));
			btrfs_bio_counter_dec(fs_info);
			goto end_bbio;
		}
		bbio = split;
		bio = &bbio->bio;
	}

	/*
	 * Save the iter for the end_io handler and preload the checksums for
	 * data reads.
	 */
	if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) {
		bbio->saved_iter = bio->bi_iter;
		ret = btrfs_lookup_bio_sums(bbio);
		status = errno_to_blk_status(ret);
		if (status)
			goto fail;
	}

	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
		if (use_append) {
			bio->bi_opf &= ~REQ_OP_WRITE;
			bio->bi_opf |= REQ_OP_ZONE_APPEND;
		}

		if (is_data_bbio(bbio) && bioc && bioc->use_rst) {
			/*
			 * No locking for the list update, as we only add to
			 * the list in the I/O submission path, and list
			 * iteration only happens in the completion path, which
			 * can't happen until after the last submission.
			 */
			btrfs_get_bioc(bioc);
			list_add_tail(&bioc->rst_ordered_entry, &bbio->ordered->bioc_list);
		}

		/*
		 * Csum items for reloc roots have already been cloned at this
		 * point, so they are handled as part of the no-checksum case.
		 */
		if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) &&
		    !test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) &&
		    !btrfs_is_data_reloc_root(inode->root)) {
			if (should_async_write(bbio) &&
			    btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num))
				goto done;

			ret = btrfs_bio_csum(bbio);
			status = errno_to_blk_status(ret);
			if (status)
				goto fail;
		} else if (use_append ||
			   (btrfs_is_zoned(fs_info) && inode &&
			    inode->flags & BTRFS_INODE_NODATASUM)) {
			ret = btrfs_alloc_dummy_sum(bbio);
			status = errno_to_blk_status(ret);
			if (status)
				goto fail;
		}
	}

	btrfs_submit_bio(bio, bioc, &smap, mirror_num);
done:
	return map_length == length;

fail:
	btrfs_bio_counter_dec(fs_info);
	/*
	 * We have split the original bbio, now we have to end both the current
	 * @bbio and the remaining one, as the remaining one will never be
	 * submitted.
	 */
	if (map_length < length) {
		struct btrfs_bio *remaining = bbio->private;

		ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset);
		ASSERT(remaining);

		btrfs_bio_end_io(remaining, status);
	}
end_bbio:
	btrfs_bio_end_io(bbio, status);
	/* Do not submit another chunk */
	return true;
}

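/*
 * Assert that every bio_vec of a data bio is aligned to the filesystem block
 * size.  Compiled out unless CONFIG_BTRFS_ASSERT is set.
 */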
static void assert_bbio_alignment(struct btrfs_bio *bbio)
{
#ifdef CONFIG_BTRFS_ASSERT
	struct btrfs_fs_info *fs_info = bbio->fs_info;
	struct bio_vec bvec;
	struct bvec_iter iter;
	const u32 blocksize = fs_info->sectorsize;

	/* Metadata has no extra bs > ps alignment requirement. */
	if (!is_data_bbio(bbio))
		return;

	bio_for_each_bvec(bvec, &bbio->bio, iter)
		ASSERT(IS_ALIGNED(bvec.bv_offset, blocksize) &&
		       IS_ALIGNED(bvec.bv_len, blocksize),
		       "root=%llu inode=%llu logical=%llu length=%u index=%u bv_offset=%u bv_len=%u",
		       btrfs_root_id(bbio->inode->root),
		       btrfs_ino(bbio->inode),
		       bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT,
		       bbio->bio.bi_iter.bi_size, iter.bi_idx,
		       bvec.bv_offset,
		       bvec.bv_len);
#endif
}

807
void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num)
808
{
809
/* If bbio->inode is not populated, its file_offset must be 0. */
810
ASSERT(bbio->inode || bbio->file_offset == 0);
811
812
assert_bbio_alignment(bbio);
813
814
while (!btrfs_submit_chunk(bbio, mirror_num))
815
;
816
}
817
818
/*
819
* Submit a repair write.
820
*
821
* This bypasses btrfs_submit_bbio() deliberately, as that writes all copies in a
822
* RAID setup. Here we only want to write the one bad copy, so we do the
823
* mapping ourselves and submit the bio directly.
824
*
825
* The I/O is issued synchronously to block the repair read completion from
826
* freeing the bio.
827
*/
828
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
829
u64 length, u64 logical, phys_addr_t paddr, int mirror_num)
830
{
831
struct btrfs_io_stripe smap = { 0 };
832
struct bio_vec bvec;
833
struct bio bio;
834
int ret = 0;
835
836
ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
837
BUG_ON(!mirror_num);
838
839
if (btrfs_repair_one_zone(fs_info, logical))
840
return 0;
841
842
/*
843
* Avoid races with device replace and make sure our bioc has devices
844
* associated to its stripes that don't go away while we are doing the
845
* read repair operation.
846
*/
847
btrfs_bio_counter_inc_blocked(fs_info);
848
ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
849
if (ret < 0)
850
goto out_counter_dec;
851
852
if (unlikely(!smap.dev->bdev ||
853
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state))) {
854
ret = -EIO;
855
goto out_counter_dec;
856
}
857
858
bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
859
bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
860
__bio_add_page(&bio, phys_to_page(paddr), length, offset_in_page(paddr));
861
ret = submit_bio_wait(&bio);
862
if (ret) {
863
/* try to remap that extent elsewhere? */
864
btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS);
865
goto out_bio_uninit;
866
}
867
868
btrfs_info_rl(fs_info,
869
"read error corrected: ino %llu off %llu (dev %s sector %llu)",
870
ino, start, btrfs_dev_name(smap.dev),
871
smap.physical >> SECTOR_SHIFT);
872
ret = 0;
873
874
out_bio_uninit:
875
bio_uninit(&bio);
876
out_counter_dec:
877
btrfs_bio_counter_dec(fs_info);
878
return ret;
879
}
880
/*
 * Submit a btrfs_bio based repair write.
 *
 * If @dev_replace is true, the write will be submitted to the dev-replace
 * target device.
 */
void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace)
{
	struct btrfs_fs_info *fs_info = bbio->fs_info;
	u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
	u64 length = bbio->bio.bi_iter.bi_size;
	struct btrfs_io_stripe smap = { 0 };
	int ret;

	ASSERT(fs_info);
	ASSERT(mirror_num > 0);
	ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE);
	ASSERT(!bbio->inode);

	btrfs_bio_counter_inc_blocked(fs_info);
	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
	if (ret < 0)
		goto fail;

	if (dev_replace) {
		ASSERT(smap.dev == fs_info->dev_replace.srcdev);
		smap.dev = fs_info->dev_replace.tgtdev;
	}
	btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num);
	return;

fail:
	btrfs_bio_counter_dec(fs_info);
	btrfs_bio_end_io(bbio, errno_to_blk_status(ret));
}

int __init btrfs_bioset_init(void)
{
	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio),
			BIOSET_NEED_BVECS))
		return -ENOMEM;
	if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio), 0))
		goto out;
	if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio),
			BIOSET_NEED_BVECS))
		goto out;
	if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE,
				      sizeof(struct btrfs_failed_bio)))
		goto out;
	return 0;

out:
	btrfs_bioset_exit();
	return -ENOMEM;
}

void __cold btrfs_bioset_exit(void)
{
	mempool_exit(&btrfs_failed_bio_pool);
	bioset_exit(&btrfs_repair_bioset);
	bioset_exit(&btrfs_clone_bioset);
	bioset_exit(&btrfs_bioset);
}