GitHub Repository: torvalds/linux
Path: blob/master/fs/btrfs/block-group.c
1
// SPDX-License-Identifier: GPL-2.0
2
3
#include <linux/sizes.h>
4
#include <linux/list_sort.h>
5
#include "misc.h"
6
#include "ctree.h"
7
#include "block-group.h"
8
#include "space-info.h"
9
#include "disk-io.h"
10
#include "free-space-cache.h"
11
#include "free-space-tree.h"
12
#include "volumes.h"
13
#include "transaction.h"
14
#include "ref-verify.h"
15
#include "sysfs.h"
16
#include "tree-log.h"
17
#include "delalloc-space.h"
18
#include "discard.h"
19
#include "raid56.h"
20
#include "zoned.h"
21
#include "fs.h"
22
#include "accessors.h"
23
#include "extent-tree.h"
24
25
#ifdef CONFIG_BTRFS_DEBUG
26
int btrfs_should_fragment_free_space(const struct btrfs_block_group *block_group)
27
{
28
struct btrfs_fs_info *fs_info = block_group->fs_info;
29
30
return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
31
block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
32
(btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
33
block_group->flags & BTRFS_BLOCK_GROUP_DATA);
34
}
35
#endif
36
37
static inline bool has_unwritten_metadata(struct btrfs_block_group *block_group)
38
{
39
/* The meta_write_pointer is available only on the zoned setup. */
40
if (!btrfs_is_zoned(block_group->fs_info))
41
return false;
42
43
if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
44
return false;
45
46
return block_group->start + block_group->alloc_offset >
47
block_group->meta_write_pointer;
48
}
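/*
 * Illustration of the check above: on a zoned filesystem, start + alloc_offset
 * is how far metadata allocation has advanced in this block group, while
 * meta_write_pointer is roughly how far metadata writeback has progressed, so
 * the group has unwritten metadata whenever allocation is ahead of the write
 * pointer.
 */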
49
50
/*
51
* Return target flags in extended format or 0 if restripe for this chunk_type
52
* is not in progress
53
*
54
* Should be called with balance_lock held
55
*/
56
static u64 get_restripe_target(const struct btrfs_fs_info *fs_info, u64 flags)
57
{
58
const struct btrfs_balance_control *bctl = fs_info->balance_ctl;
59
u64 target = 0;
60
61
if (!bctl)
62
return 0;
63
64
if (flags & BTRFS_BLOCK_GROUP_DATA &&
65
bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
66
target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
67
} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
68
bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
69
target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
70
} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
71
bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
72
target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
73
}
74
75
return target;
76
}
77
78
/*
79
* @flags: available profiles in extended format (see ctree.h)
80
*
81
* Return reduced profile in chunk format. If profile changing is in progress
82
* (either running or paused), this picks the target profile (if it's already
83
* available), otherwise falls back to plain reducing.
84
*/
85
static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
86
{
87
u64 num_devices = fs_info->fs_devices->rw_devices;
88
u64 target;
89
u64 raid_type;
90
u64 allowed = 0;
91
92
/*
93
* See if restripe for this chunk_type is in progress, if so try to
94
* reduce to the target profile
95
*/
96
spin_lock(&fs_info->balance_lock);
97
target = get_restripe_target(fs_info, flags);
98
if (target) {
99
spin_unlock(&fs_info->balance_lock);
100
return extended_to_chunk(target);
101
}
102
spin_unlock(&fs_info->balance_lock);
103
104
/* First, mask out the RAID levels which aren't possible */
105
for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
106
if (num_devices >= btrfs_raid_array[raid_type].devs_min)
107
allowed |= btrfs_raid_array[raid_type].bg_flag;
108
}
109
allowed &= flags;
110
111
/* Select the highest-redundancy RAID level. */
112
if (allowed & BTRFS_BLOCK_GROUP_RAID1C4)
113
allowed = BTRFS_BLOCK_GROUP_RAID1C4;
114
else if (allowed & BTRFS_BLOCK_GROUP_RAID6)
115
allowed = BTRFS_BLOCK_GROUP_RAID6;
116
else if (allowed & BTRFS_BLOCK_GROUP_RAID1C3)
117
allowed = BTRFS_BLOCK_GROUP_RAID1C3;
118
else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
119
allowed = BTRFS_BLOCK_GROUP_RAID5;
120
else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
121
allowed = BTRFS_BLOCK_GROUP_RAID10;
122
else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
123
allowed = BTRFS_BLOCK_GROUP_RAID1;
124
else if (allowed & BTRFS_BLOCK_GROUP_DUP)
125
allowed = BTRFS_BLOCK_GROUP_DUP;
126
else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
127
allowed = BTRFS_BLOCK_GROUP_RAID0;
128
129
flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
130
131
return extended_to_chunk(flags | allowed);
132
}
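/*
 * Worked example of the reduction above: with two rw devices, no restripe in
 * progress and flags = DATA | RAID0 | RAID1, both RAID0 and RAID1 pass the
 * devs_min check, RAID1 wins as the highest-redundancy profile in the if/else
 * chain, and the result is DATA | RAID1 converted to chunk format.
 */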
133
134
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
135
{
136
unsigned seq;
137
u64 flags;
138
139
do {
140
flags = orig_flags;
141
seq = read_seqbegin(&fs_info->profiles_lock);
142
143
if (flags & BTRFS_BLOCK_GROUP_DATA)
144
flags |= fs_info->avail_data_alloc_bits;
145
else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
146
flags |= fs_info->avail_system_alloc_bits;
147
else if (flags & BTRFS_BLOCK_GROUP_METADATA)
148
flags |= fs_info->avail_metadata_alloc_bits;
149
} while (read_seqretry(&fs_info->profiles_lock, seq));
150
151
return btrfs_reduce_alloc_profile(fs_info, flags);
152
}
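/*
 * For example, passing just BTRFS_BLOCK_GROUP_DATA returns the data profile
 * currently in use (say DATA | RAID1, reduced to chunk format), with the
 * seqlock retry loop above guarding against a concurrent profile update.
 */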
153
154
void btrfs_get_block_group(struct btrfs_block_group *cache)
155
{
156
refcount_inc(&cache->refs);
157
}
158
159
void btrfs_put_block_group(struct btrfs_block_group *cache)
160
{
161
if (refcount_dec_and_test(&cache->refs)) {
162
WARN_ON(cache->pinned > 0);
163
/*
164
* If there was a failure to clean up a log tree, very likely due
165
* to an IO failure on a writeback attempt of one or more of its
166
* extent buffers, we could not do proper (and cheap) unaccounting
167
* of their reserved space, so don't warn on reserved > 0 in that
168
* case.
169
*/
170
if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) ||
171
!BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info))
172
WARN_ON(cache->reserved > 0);
173
174
/*
175
* A block_group shouldn't be on the discard_list anymore.
176
* Remove the block_group from the discard_list to prevent us
177
* from causing a panic due to a NULL pointer dereference.
178
*/
179
if (WARN_ON(!list_empty(&cache->discard_list)))
180
btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
181
cache);
182
183
kfree(cache->free_space_ctl);
184
btrfs_free_chunk_map(cache->physical_map);
185
kfree(cache);
186
}
187
}
188
189
static int btrfs_bg_start_cmp(const struct rb_node *new,
190
const struct rb_node *exist)
191
{
192
const struct btrfs_block_group *new_bg =
193
rb_entry(new, struct btrfs_block_group, cache_node);
194
const struct btrfs_block_group *exist_bg =
195
rb_entry(exist, struct btrfs_block_group, cache_node);
196
197
if (new_bg->start < exist_bg->start)
198
return -1;
199
if (new_bg->start > exist_bg->start)
200
return 1;
201
return 0;
202
}
203
204
/*
205
* This adds the block group to the fs_info rb tree for the block group cache
206
*/
207
static int btrfs_add_block_group_cache(struct btrfs_block_group *block_group)
208
{
209
struct btrfs_fs_info *fs_info = block_group->fs_info;
210
struct rb_node *exist;
211
int ret = 0;
212
213
ASSERT(block_group->length != 0);
214
215
write_lock(&fs_info->block_group_cache_lock);
216
217
exist = rb_find_add_cached(&block_group->cache_node,
218
&fs_info->block_group_cache_tree, btrfs_bg_start_cmp);
219
if (exist)
220
ret = -EEXIST;
221
write_unlock(&fs_info->block_group_cache_lock);
222
223
return ret;
224
}
225
226
/*
227
* This will return the block group at or after bytenr if contains is 0, else
228
* it will return the block group that contains the bytenr
229
*/
230
static struct btrfs_block_group *block_group_cache_tree_search(
231
struct btrfs_fs_info *info, u64 bytenr, int contains)
232
{
233
struct btrfs_block_group *cache, *ret = NULL;
234
struct rb_node *n;
235
u64 end, start;
236
237
read_lock(&info->block_group_cache_lock);
238
n = info->block_group_cache_tree.rb_root.rb_node;
239
240
while (n) {
241
cache = rb_entry(n, struct btrfs_block_group, cache_node);
242
end = cache->start + cache->length - 1;
243
start = cache->start;
244
245
if (bytenr < start) {
246
if (!contains && (!ret || start < ret->start))
247
ret = cache;
248
n = n->rb_left;
249
} else if (bytenr > start) {
250
if (contains && bytenr <= end) {
251
ret = cache;
252
break;
253
}
254
n = n->rb_right;
255
} else {
256
ret = cache;
257
break;
258
}
259
}
260
if (ret)
261
btrfs_get_block_group(ret);
262
read_unlock(&info->block_group_cache_lock);
263
264
return ret;
265
}
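/*
 * Example of the @contains semantics: if @bytenr falls in a hole between two
 * block groups, a search with contains == 0 returns the next block group
 * starting after @bytenr, while a search with contains == 1 returns NULL.
 */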
266
267
/*
268
* Return the block group that starts at or after bytenr
269
*/
270
struct btrfs_block_group *btrfs_lookup_first_block_group(
271
struct btrfs_fs_info *info, u64 bytenr)
272
{
273
return block_group_cache_tree_search(info, bytenr, 0);
274
}
275
276
/*
277
* Return the block group that contains the given bytenr
278
*/
279
struct btrfs_block_group *btrfs_lookup_block_group(
280
struct btrfs_fs_info *info, u64 bytenr)
281
{
282
return block_group_cache_tree_search(info, bytenr, 1);
283
}
284
285
struct btrfs_block_group *btrfs_next_block_group(
286
struct btrfs_block_group *cache)
287
{
288
struct btrfs_fs_info *fs_info = cache->fs_info;
289
struct rb_node *node;
290
291
read_lock(&fs_info->block_group_cache_lock);
292
293
/* If our block group was removed, we need a full search. */
294
if (RB_EMPTY_NODE(&cache->cache_node)) {
295
const u64 next_bytenr = cache->start + cache->length;
296
297
read_unlock(&fs_info->block_group_cache_lock);
298
btrfs_put_block_group(cache);
299
return btrfs_lookup_first_block_group(fs_info, next_bytenr);
300
}
301
node = rb_next(&cache->cache_node);
302
btrfs_put_block_group(cache);
303
if (node) {
304
cache = rb_entry(node, struct btrfs_block_group, cache_node);
305
btrfs_get_block_group(cache);
306
} else
307
cache = NULL;
308
read_unlock(&fs_info->block_group_cache_lock);
309
return cache;
310
}
311
312
/*
313
* Check if we can do a NOCOW write for a given extent.
314
*
315
* @fs_info: The filesystem information object.
316
* @bytenr: Logical start address of the extent.
317
*
318
* Check if we can do a NOCOW write for the given extent and increment the
319
* number of NOCOW writers in the block group that contains the extent, as long
320
* as the block group exists and it's currently not in read-only mode.
321
*
322
* Returns: A non-NULL block group pointer if we can do a NOCOW write; the caller
323
* is responsible for calling btrfs_dec_nocow_writers() later.
324
*
325
* Or NULL if we cannot do a NOCOW write
326
*/
327
struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
328
u64 bytenr)
329
{
330
struct btrfs_block_group *bg;
331
bool can_nocow = true;
332
333
bg = btrfs_lookup_block_group(fs_info, bytenr);
334
if (!bg)
335
return NULL;
336
337
spin_lock(&bg->lock);
338
if (bg->ro)
339
can_nocow = false;
340
else
341
atomic_inc(&bg->nocow_writers);
342
spin_unlock(&bg->lock);
343
344
if (!can_nocow) {
345
btrfs_put_block_group(bg);
346
return NULL;
347
}
348
349
/* No put on block group, done by btrfs_dec_nocow_writers(). */
350
return bg;
351
}
352
353
/*
354
* Decrement the number of NOCOW writers in a block group.
355
*
356
* This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
357
* and on the block group returned by that call. Typically this is called after
358
* creating an ordered extent for a NOCOW write, to prevent races with scrub and
359
* relocation.
360
*
361
* After this call, the caller should not use the block group anymore. If it wants
362
* to use it, then it should get a reference on it before calling this function.
363
*/
364
void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
365
{
366
if (atomic_dec_and_test(&bg->nocow_writers))
367
wake_up_var(&bg->nocow_writers);
368
369
/* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
370
btrfs_put_block_group(bg);
371
}
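/*
 * Typical pairing of the two helpers above (illustrative sketch only):
 *
 *	bg = btrfs_inc_nocow_writers(fs_info, bytenr);
 *	if (bg) {
 *		... create the ordered extent for the NOCOW write ...
 *		btrfs_dec_nocow_writers(bg);
 *	}
 */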
372
373
void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
374
{
375
wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
376
}
377
378
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
379
const u64 start)
380
{
381
struct btrfs_block_group *bg;
382
383
bg = btrfs_lookup_block_group(fs_info, start);
384
ASSERT(bg);
385
if (atomic_dec_and_test(&bg->reservations))
386
wake_up_var(&bg->reservations);
387
btrfs_put_block_group(bg);
388
}
389
390
void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
391
{
392
struct btrfs_space_info *space_info = bg->space_info;
393
394
ASSERT(bg->ro);
395
396
if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
397
return;
398
399
/*
400
* Our block group is read only but before we set it to read only,
401
* some task might have allocated an extent from it already, but it
402
* has not yet created a respective ordered extent (and added it to a
403
* root's list of ordered extents).
404
* Therefore wait for any task currently allocating extents, since the
405
* block group's reservations counter is incremented while a read lock
406
* on the groups' semaphore is held and decremented after releasing
407
* the read access on that semaphore and creating the ordered extent.
408
*/
409
down_write(&space_info->groups_sem);
410
up_write(&space_info->groups_sem);
411
412
wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
413
}
414
415
struct btrfs_caching_control *btrfs_get_caching_control(
416
struct btrfs_block_group *cache)
417
{
418
struct btrfs_caching_control *ctl;
419
420
spin_lock(&cache->lock);
421
if (!cache->caching_ctl) {
422
spin_unlock(&cache->lock);
423
return NULL;
424
}
425
426
ctl = cache->caching_ctl;
427
refcount_inc(&ctl->count);
428
spin_unlock(&cache->lock);
429
return ctl;
430
}
431
432
static void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
433
{
434
if (refcount_dec_and_test(&ctl->count))
435
kfree(ctl);
436
}
437
438
/*
439
* When we wait for progress in the block group caching, it's because our
440
* allocation attempt failed at least once. So, we must sleep and let some
441
* progress happen before we try again.
442
*
443
* This function will sleep at least once waiting for new free space to show
444
* up, and then it will check the block group free space numbers for our min
445
* num_bytes. Another option is to have it go ahead and look in the rbtree for
446
* a free extent of a given size, but this is a good start.
447
*
448
* Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
449
* any of the information in this block group.
450
*/
451
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
452
u64 num_bytes)
453
{
454
struct btrfs_caching_control *caching_ctl;
455
int progress;
456
457
caching_ctl = btrfs_get_caching_control(cache);
458
if (!caching_ctl)
459
return;
460
461
/*
462
* We've already failed to allocate from this block group, so even if
463
* there's enough space in the block group it isn't contiguous enough to
464
* allow for an allocation, so wait for at least the next wakeup tick,
465
* or for the thing to be done.
466
*/
467
progress = atomic_read(&caching_ctl->progress);
468
469
wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
470
(progress != atomic_read(&caching_ctl->progress) &&
471
(cache->free_space_ctl->free_space >= num_bytes)));
472
473
btrfs_put_caching_control(caching_ctl);
474
}
475
476
static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache,
477
struct btrfs_caching_control *caching_ctl)
478
{
479
wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
480
return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0;
481
}
482
483
static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
484
{
485
struct btrfs_caching_control *caching_ctl;
486
int ret;
487
488
caching_ctl = btrfs_get_caching_control(cache);
489
if (!caching_ctl)
490
return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
491
ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
492
btrfs_put_caching_control(caching_ctl);
493
return ret;
494
}
495
496
#ifdef CONFIG_BTRFS_DEBUG
497
static void fragment_free_space(struct btrfs_block_group *block_group)
498
{
499
struct btrfs_fs_info *fs_info = block_group->fs_info;
500
u64 start = block_group->start;
501
u64 len = block_group->length;
502
u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
503
fs_info->nodesize : fs_info->sectorsize;
504
u64 step = chunk << 1;
505
506
while (len > chunk) {
507
btrfs_remove_free_space(block_group, start, chunk);
508
start += step;
509
if (len < step)
510
len = 0;
511
else
512
len -= step;
513
}
514
}
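/*
 * Illustration: for a metadata block group the loop above removes one
 * nodesize-sized chunk of free space out of every two, leaving an alternating
 * pattern of present and missing ranges that deliberately fragments the free
 * space for testing.
 */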
515
#endif
516
517
/*
518
* Add a free space range to the in memory free space cache of a block group.
519
* This checks if the range contains super block locations and any such
520
* locations are not added to the free space cache.
521
*
522
* @block_group: The target block group.
523
* @start: Start offset of the range.
524
* @end: End offset of the range (exclusive).
525
* @total_added_ret: Optional pointer to return the total amount of space
526
* added to the block group's free space cache.
527
*
528
* Returns 0 on success or < 0 on error.
529
*/
530
int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start,
531
u64 end, u64 *total_added_ret)
532
{
533
struct btrfs_fs_info *info = block_group->fs_info;
534
u64 extent_start, extent_end, size;
535
int ret;
536
537
if (total_added_ret)
538
*total_added_ret = 0;
539
540
while (start < end) {
541
if (!btrfs_find_first_extent_bit(&info->excluded_extents, start,
542
&extent_start, &extent_end,
543
EXTENT_DIRTY, NULL))
544
break;
545
546
if (extent_start <= start) {
547
start = extent_end + 1;
548
} else if (extent_start > start && extent_start < end) {
549
size = extent_start - start;
550
ret = btrfs_add_free_space_async_trimmed(block_group,
551
start, size);
552
if (ret)
553
return ret;
554
if (total_added_ret)
555
*total_added_ret += size;
556
start = extent_end + 1;
557
} else {
558
break;
559
}
560
}
561
562
if (start < end) {
563
size = end - start;
564
ret = btrfs_add_free_space_async_trimmed(block_group, start,
565
size);
566
if (ret)
567
return ret;
568
if (total_added_ret)
569
*total_added_ret += size;
570
}
571
572
return 0;
573
}
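/*
 * Example: if [start, end) covers an excluded superblock extent in the
 * middle, the helper above adds the free space before and after that
 * excluded range but skips the range itself, so superblock mirrors never
 * show up as allocatable space.
 */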
574
575
/*
576
* Sample an arbitrary extent item roughly index / max_index of the way through the block group
577
*
578
* @block_group the block group to sample from
579
* @index: the integral step through the block group to grab from
580
* @max_index: the granularity of the sampling
581
* @key: return value parameter for the item we find
582
*
583
* Pre-conditions on indices:
584
* 0 <= index <= max_index
585
* 0 < max_index
586
*
587
* Returns: 0 on success, 1 if the search didn't yield a useful item, negative
588
* error code on error.
589
*/
590
static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl,
591
struct btrfs_block_group *block_group,
592
int index, int max_index,
593
struct btrfs_key *found_key)
594
{
595
struct btrfs_fs_info *fs_info = block_group->fs_info;
596
struct btrfs_root *extent_root;
597
u64 search_offset;
598
u64 search_end = block_group->start + block_group->length;
599
BTRFS_PATH_AUTO_FREE(path);
600
struct btrfs_key search_key;
601
int ret = 0;
602
603
ASSERT(index >= 0);
604
ASSERT(index <= max_index);
605
ASSERT(max_index > 0);
606
lockdep_assert_held(&caching_ctl->mutex);
607
lockdep_assert_held_read(&fs_info->commit_root_sem);
608
609
path = btrfs_alloc_path();
610
if (!path)
611
return -ENOMEM;
612
613
extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start,
614
BTRFS_SUPER_INFO_OFFSET));
615
616
path->skip_locking = 1;
617
path->search_commit_root = 1;
618
path->reada = READA_FORWARD;
619
620
search_offset = index * div_u64(block_group->length, max_index);
621
search_key.objectid = block_group->start + search_offset;
622
search_key.type = BTRFS_EXTENT_ITEM_KEY;
623
search_key.offset = 0;
624
625
btrfs_for_each_slot(extent_root, &search_key, found_key, path, ret) {
626
/* Success; sampled an extent item in the block group */
627
if (found_key->type == BTRFS_EXTENT_ITEM_KEY &&
628
found_key->objectid >= block_group->start &&
629
found_key->objectid + found_key->offset <= search_end)
630
break;
631
632
/* We can't possibly find a valid extent item anymore */
633
if (found_key->objectid >= search_end) {
634
ret = 1;
635
break;
636
}
637
}
638
639
lockdep_assert_held(&caching_ctl->mutex);
640
lockdep_assert_held_read(&fs_info->commit_root_sem);
641
return ret;
642
}
643
644
/*
645
* Best effort attempt to compute a block group's size class while caching it.
646
*
647
* @block_group: the block group we are caching
648
*
649
* We cannot infer the size class while adding free space extents, because that
650
* logic doesn't care about contiguous file extents (it doesn't differentiate
651
* between a 100M extent and 100 contiguous 1M extents). So we need to read the
652
* file extent items. Reading all of them is quite wasteful, because usually
653
* only a handful are enough to give a good answer. Therefore, we just grab 5 of
654
* them at even steps through the block group and pick the smallest size class
655
* we see. Since size class is best effort, and not guaranteed in general,
656
* inaccuracy is acceptable.
657
*
658
* To be more explicit about why this algorithm makes sense:
659
*
660
* If we are caching in a block group from disk, then there are three major cases
661
* to consider:
662
* 1. the block group is well behaved and all extents in it are the same size
663
* class.
664
* 2. the block group is mostly one size class with rare exceptions for last
665
* ditch allocations
666
* 3. the block group was populated before size classes and can have a totally
667
* arbitrary mix of size classes.
668
*
669
* In case 1, looking at any extent in the block group will yield the correct
670
* result. For the mixed cases, taking the minimum size class seems like a good
671
* approximation, since gaps from frees will be usable by that size class. For
672
* 2., a small handful of file extents is likely to yield the right answer. For
673
* 3, we can either read every file extent, or admit that this is best effort
674
* anyway and try to stay fast.
675
*
676
* Returns: 0 on success, negative error code on error.
677
*/
678
static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl,
679
struct btrfs_block_group *block_group)
680
{
681
struct btrfs_fs_info *fs_info = block_group->fs_info;
682
struct btrfs_key key;
683
int i;
684
u64 min_size = block_group->length;
685
enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE;
686
int ret;
687
688
if (!btrfs_block_group_should_use_size_class(block_group))
689
return 0;
690
691
lockdep_assert_held(&caching_ctl->mutex);
692
lockdep_assert_held_read(&fs_info->commit_root_sem);
693
for (i = 0; i < 5; ++i) {
694
ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key);
695
if (ret < 0)
696
goto out;
697
if (ret > 0)
698
continue;
699
min_size = min_t(u64, min_size, key.offset);
700
size_class = btrfs_calc_block_group_size_class(min_size);
701
}
702
if (size_class != BTRFS_BG_SZ_NONE) {
703
spin_lock(&block_group->lock);
704
block_group->size_class = size_class;
705
spin_unlock(&block_group->lock);
706
}
707
out:
708
return ret;
709
}
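/*
 * Illustration of the sampling above: five extent items are looked up at
 * roughly 0/5, 1/5, ... 4/5 of the way through the block group, and the
 * smallest extent size seen (key.offset) decides the size class, so a single
 * small sample is enough to settle on the smaller class.
 */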
710
711
static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
712
{
713
struct btrfs_block_group *block_group = caching_ctl->block_group;
714
struct btrfs_fs_info *fs_info = block_group->fs_info;
715
struct btrfs_root *extent_root;
716
BTRFS_PATH_AUTO_FREE(path);
717
struct extent_buffer *leaf;
718
struct btrfs_key key;
719
u64 total_found = 0;
720
u64 last = 0;
721
u32 nritems;
722
int ret;
723
bool wakeup = true;
724
725
path = btrfs_alloc_path();
726
if (!path)
727
return -ENOMEM;
728
729
last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
730
extent_root = btrfs_extent_root(fs_info, last);
731
732
#ifdef CONFIG_BTRFS_DEBUG
733
/*
734
* If we're fragmenting we don't want to make anybody think we can
735
* allocate from this block group until we've had a chance to fragment
736
* the free space.
737
*/
738
if (btrfs_should_fragment_free_space(block_group))
739
wakeup = false;
740
#endif
741
/*
742
* We don't want to deadlock with somebody trying to allocate a new
743
* extent for the extent root while also trying to search the extent
744
* root to add free space. So we skip locking and search the commit
745
* root, since it's read-only
746
*/
747
path->skip_locking = 1;
748
path->search_commit_root = 1;
749
path->reada = READA_FORWARD;
750
751
key.objectid = last;
752
key.type = BTRFS_EXTENT_ITEM_KEY;
753
key.offset = 0;
754
755
next:
756
ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
757
if (ret < 0)
758
goto out;
759
760
leaf = path->nodes[0];
761
nritems = btrfs_header_nritems(leaf);
762
763
while (1) {
764
if (btrfs_fs_closing(fs_info) > 1) {
765
last = (u64)-1;
766
break;
767
}
768
769
if (path->slots[0] < nritems) {
770
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
771
} else {
772
ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
773
if (ret)
774
break;
775
776
if (need_resched() ||
777
rwsem_is_contended(&fs_info->commit_root_sem)) {
778
btrfs_release_path(path);
779
up_read(&fs_info->commit_root_sem);
780
mutex_unlock(&caching_ctl->mutex);
781
cond_resched();
782
mutex_lock(&caching_ctl->mutex);
783
down_read(&fs_info->commit_root_sem);
784
goto next;
785
}
786
787
ret = btrfs_next_leaf(extent_root, path);
788
if (ret < 0)
789
goto out;
790
if (ret)
791
break;
792
leaf = path->nodes[0];
793
nritems = btrfs_header_nritems(leaf);
794
continue;
795
}
796
797
if (key.objectid < last) {
798
key.objectid = last;
799
key.type = BTRFS_EXTENT_ITEM_KEY;
800
key.offset = 0;
801
btrfs_release_path(path);
802
goto next;
803
}
804
805
if (key.objectid < block_group->start) {
806
path->slots[0]++;
807
continue;
808
}
809
810
if (key.objectid >= block_group->start + block_group->length)
811
break;
812
813
if (key.type == BTRFS_EXTENT_ITEM_KEY ||
814
key.type == BTRFS_METADATA_ITEM_KEY) {
815
u64 space_added;
816
817
ret = btrfs_add_new_free_space(block_group, last,
818
key.objectid, &space_added);
819
if (ret)
820
goto out;
821
total_found += space_added;
822
if (key.type == BTRFS_METADATA_ITEM_KEY)
823
last = key.objectid +
824
fs_info->nodesize;
825
else
826
last = key.objectid + key.offset;
827
828
if (total_found > CACHING_CTL_WAKE_UP) {
829
total_found = 0;
830
if (wakeup) {
831
atomic_inc(&caching_ctl->progress);
832
wake_up(&caching_ctl->wait);
833
}
834
}
835
}
836
path->slots[0]++;
837
}
838
839
ret = btrfs_add_new_free_space(block_group, last,
840
block_group->start + block_group->length,
841
NULL);
842
out:
843
return ret;
844
}
845
846
static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg)
847
{
848
btrfs_clear_extent_bit(&bg->fs_info->excluded_extents, bg->start,
849
bg->start + bg->length - 1, EXTENT_DIRTY, NULL);
850
}
851
852
static noinline void caching_thread(struct btrfs_work *work)
853
{
854
struct btrfs_block_group *block_group;
855
struct btrfs_fs_info *fs_info;
856
struct btrfs_caching_control *caching_ctl;
857
int ret;
858
859
caching_ctl = container_of(work, struct btrfs_caching_control, work);
860
block_group = caching_ctl->block_group;
861
fs_info = block_group->fs_info;
862
863
mutex_lock(&caching_ctl->mutex);
864
down_read(&fs_info->commit_root_sem);
865
866
load_block_group_size_class(caching_ctl, block_group);
867
if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
868
ret = load_free_space_cache(block_group);
869
if (ret == 1) {
870
ret = 0;
871
goto done;
872
}
873
874
/*
875
* We failed to load the space cache, set ourselves to
876
* CACHE_STARTED and carry on.
877
*/
878
spin_lock(&block_group->lock);
879
block_group->cached = BTRFS_CACHE_STARTED;
880
spin_unlock(&block_group->lock);
881
wake_up(&caching_ctl->wait);
882
}
883
884
/*
885
* If we are in the transaction that populated the free space tree we
886
* can't actually cache from the free space tree as our commit root and
887
* real root are the same, so we could change the contents of the blocks
888
* while caching. Instead do the slow caching in this case, and after
889
* the transaction has committed we will be safe.
890
*/
891
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
892
!(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
893
ret = btrfs_load_free_space_tree(caching_ctl);
894
else
895
ret = load_extent_tree_free(caching_ctl);
896
done:
897
spin_lock(&block_group->lock);
898
block_group->caching_ctl = NULL;
899
block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
900
spin_unlock(&block_group->lock);
901
902
#ifdef CONFIG_BTRFS_DEBUG
903
if (btrfs_should_fragment_free_space(block_group)) {
904
u64 bytes_used;
905
906
spin_lock(&block_group->space_info->lock);
907
spin_lock(&block_group->lock);
908
bytes_used = block_group->length - block_group->used;
909
block_group->space_info->bytes_used += bytes_used >> 1;
910
spin_unlock(&block_group->lock);
911
spin_unlock(&block_group->space_info->lock);
912
fragment_free_space(block_group);
913
}
914
#endif
915
916
up_read(&fs_info->commit_root_sem);
917
btrfs_free_excluded_extents(block_group);
918
mutex_unlock(&caching_ctl->mutex);
919
920
wake_up(&caching_ctl->wait);
921
922
btrfs_put_caching_control(caching_ctl);
923
btrfs_put_block_group(block_group);
924
}
925
926
int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
927
{
928
struct btrfs_fs_info *fs_info = cache->fs_info;
929
struct btrfs_caching_control *caching_ctl = NULL;
930
int ret = 0;
931
932
/* Allocator for zoned filesystems does not use the cache at all */
933
if (btrfs_is_zoned(fs_info))
934
return 0;
935
936
caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
937
if (!caching_ctl)
938
return -ENOMEM;
939
940
INIT_LIST_HEAD(&caching_ctl->list);
941
mutex_init(&caching_ctl->mutex);
942
init_waitqueue_head(&caching_ctl->wait);
943
caching_ctl->block_group = cache;
944
refcount_set(&caching_ctl->count, 2);
945
atomic_set(&caching_ctl->progress, 0);
946
btrfs_init_work(&caching_ctl->work, caching_thread, NULL);
947
948
spin_lock(&cache->lock);
949
if (cache->cached != BTRFS_CACHE_NO) {
950
kfree(caching_ctl);
951
952
caching_ctl = cache->caching_ctl;
953
if (caching_ctl)
954
refcount_inc(&caching_ctl->count);
955
spin_unlock(&cache->lock);
956
goto out;
957
}
958
WARN_ON(cache->caching_ctl);
959
cache->caching_ctl = caching_ctl;
960
cache->cached = BTRFS_CACHE_STARTED;
961
spin_unlock(&cache->lock);
962
963
write_lock(&fs_info->block_group_cache_lock);
964
refcount_inc(&caching_ctl->count);
965
list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
966
write_unlock(&fs_info->block_group_cache_lock);
967
968
btrfs_get_block_group(cache);
969
970
btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
971
out:
972
if (wait && caching_ctl)
973
ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
974
if (caching_ctl)
975
btrfs_put_caching_control(caching_ctl);
976
977
return ret;
978
}
979
980
static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
981
{
982
u64 extra_flags = chunk_to_extended(flags) &
983
BTRFS_EXTENDED_PROFILE_MASK;
984
985
write_seqlock(&fs_info->profiles_lock);
986
if (flags & BTRFS_BLOCK_GROUP_DATA)
987
fs_info->avail_data_alloc_bits &= ~extra_flags;
988
if (flags & BTRFS_BLOCK_GROUP_METADATA)
989
fs_info->avail_metadata_alloc_bits &= ~extra_flags;
990
if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
991
fs_info->avail_system_alloc_bits &= ~extra_flags;
992
write_sequnlock(&fs_info->profiles_lock);
993
}
994
995
/*
996
* Clear incompat bits for the following feature(s):
997
*
998
* - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
999
* in the whole filesystem
1000
*
1001
* - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
1002
*/
1003
static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
1004
{
1005
bool found_raid56 = false;
1006
bool found_raid1c34 = false;
1007
1008
if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
1009
(flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
1010
(flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
1011
struct list_head *head = &fs_info->space_info;
1012
struct btrfs_space_info *sinfo;
1013
1014
list_for_each_entry_rcu(sinfo, head, list) {
1015
down_read(&sinfo->groups_sem);
1016
if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
1017
found_raid56 = true;
1018
if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
1019
found_raid56 = true;
1020
if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
1021
found_raid1c34 = true;
1022
if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
1023
found_raid1c34 = true;
1024
up_read(&sinfo->groups_sem);
1025
}
1026
if (!found_raid56)
1027
btrfs_clear_fs_incompat(fs_info, RAID56);
1028
if (!found_raid1c34)
1029
btrfs_clear_fs_incompat(fs_info, RAID1C34);
1030
}
1031
}
1032
1033
static struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
1034
{
1035
if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
1036
return fs_info->block_group_root;
1037
return btrfs_extent_root(fs_info, 0);
1038
}
1039
1040
static int remove_block_group_item(struct btrfs_trans_handle *trans,
1041
struct btrfs_path *path,
1042
struct btrfs_block_group *block_group)
1043
{
1044
struct btrfs_fs_info *fs_info = trans->fs_info;
1045
struct btrfs_root *root;
1046
struct btrfs_key key;
1047
int ret;
1048
1049
root = btrfs_block_group_root(fs_info);
1050
key.objectid = block_group->start;
1051
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
1052
key.offset = block_group->length;
1053
1054
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1055
if (ret > 0)
1056
ret = -ENOENT;
1057
if (ret < 0)
1058
return ret;
1059
1060
ret = btrfs_del_item(trans, root, path);
1061
return ret;
1062
}
1063
1064
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
1065
struct btrfs_chunk_map *map)
1066
{
1067
struct btrfs_fs_info *fs_info = trans->fs_info;
1068
struct btrfs_path *path;
1069
struct btrfs_block_group *block_group;
1070
struct btrfs_free_cluster *cluster;
1071
struct inode *inode;
1072
struct kobject *kobj = NULL;
1073
int ret;
1074
int index;
1075
int factor;
1076
struct btrfs_caching_control *caching_ctl = NULL;
1077
bool remove_map;
1078
bool remove_rsv = false;
1079
1080
block_group = btrfs_lookup_block_group(fs_info, map->start);
1081
if (!block_group)
1082
return -ENOENT;
1083
1084
BUG_ON(!block_group->ro);
1085
1086
trace_btrfs_remove_block_group(block_group);
1087
/*
1088
* Free the reserved super bytes from this block group before
1089
* removing it.
1090
*/
1091
btrfs_free_excluded_extents(block_group);
1092
btrfs_free_ref_tree_range(fs_info, block_group->start,
1093
block_group->length);
1094
1095
index = btrfs_bg_flags_to_raid_index(block_group->flags);
1096
factor = btrfs_bg_type_to_factor(block_group->flags);
1097
1098
/* make sure this block group isn't part of an allocation cluster */
1099
cluster = &fs_info->data_alloc_cluster;
1100
spin_lock(&cluster->refill_lock);
1101
btrfs_return_cluster_to_free_space(block_group, cluster);
1102
spin_unlock(&cluster->refill_lock);
1103
1104
/*
1105
* make sure this block group isn't part of a metadata
1106
* allocation cluster
1107
*/
1108
cluster = &fs_info->meta_alloc_cluster;
1109
spin_lock(&cluster->refill_lock);
1110
btrfs_return_cluster_to_free_space(block_group, cluster);
1111
spin_unlock(&cluster->refill_lock);
1112
1113
btrfs_clear_treelog_bg(block_group);
1114
btrfs_clear_data_reloc_bg(block_group);
1115
1116
path = btrfs_alloc_path();
1117
if (!path) {
1118
ret = -ENOMEM;
1119
goto out;
1120
}
1121
1122
/*
1123
* get the inode first so any iput calls done for the io_list
1124
* aren't the final iput (no unlinks allowed now)
1125
*/
1126
inode = lookup_free_space_inode(block_group, path);
1127
1128
mutex_lock(&trans->transaction->cache_write_mutex);
1129
/*
1130
* Make sure our free space cache IO is done before removing the
1131
* free space inode
1132
*/
1133
spin_lock(&trans->transaction->dirty_bgs_lock);
1134
if (!list_empty(&block_group->io_list)) {
1135
list_del_init(&block_group->io_list);
1136
1137
WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
1138
1139
spin_unlock(&trans->transaction->dirty_bgs_lock);
1140
btrfs_wait_cache_io(trans, block_group, path);
1141
btrfs_put_block_group(block_group);
1142
spin_lock(&trans->transaction->dirty_bgs_lock);
1143
}
1144
1145
if (!list_empty(&block_group->dirty_list)) {
1146
list_del_init(&block_group->dirty_list);
1147
remove_rsv = true;
1148
btrfs_put_block_group(block_group);
1149
}
1150
spin_unlock(&trans->transaction->dirty_bgs_lock);
1151
mutex_unlock(&trans->transaction->cache_write_mutex);
1152
1153
ret = btrfs_remove_free_space_inode(trans, inode, block_group);
1154
if (ret)
1155
goto out;
1156
1157
write_lock(&fs_info->block_group_cache_lock);
1158
rb_erase_cached(&block_group->cache_node,
1159
&fs_info->block_group_cache_tree);
1160
RB_CLEAR_NODE(&block_group->cache_node);
1161
1162
/* Once for the block groups rbtree */
1163
btrfs_put_block_group(block_group);
1164
1165
write_unlock(&fs_info->block_group_cache_lock);
1166
1167
down_write(&block_group->space_info->groups_sem);
1168
/*
1169
* we must use list_del_init so people can check to see if they
1170
* are still on the list after taking the semaphore
1171
*/
1172
list_del_init(&block_group->list);
1173
if (list_empty(&block_group->space_info->block_groups[index])) {
1174
kobj = block_group->space_info->block_group_kobjs[index];
1175
block_group->space_info->block_group_kobjs[index] = NULL;
1176
clear_avail_alloc_bits(fs_info, block_group->flags);
1177
}
1178
up_write(&block_group->space_info->groups_sem);
1179
clear_incompat_bg_bits(fs_info, block_group->flags);
1180
if (kobj) {
1181
kobject_del(kobj);
1182
kobject_put(kobj);
1183
}
1184
1185
if (block_group->cached == BTRFS_CACHE_STARTED)
1186
btrfs_wait_block_group_cache_done(block_group);
1187
1188
write_lock(&fs_info->block_group_cache_lock);
1189
caching_ctl = btrfs_get_caching_control(block_group);
1190
if (!caching_ctl) {
1191
struct btrfs_caching_control *ctl;
1192
1193
list_for_each_entry(ctl, &fs_info->caching_block_groups, list) {
1194
if (ctl->block_group == block_group) {
1195
caching_ctl = ctl;
1196
refcount_inc(&caching_ctl->count);
1197
break;
1198
}
1199
}
1200
}
1201
if (caching_ctl)
1202
list_del_init(&caching_ctl->list);
1203
write_unlock(&fs_info->block_group_cache_lock);
1204
1205
if (caching_ctl) {
1206
/* Once for the caching bgs list and once for us. */
1207
btrfs_put_caching_control(caching_ctl);
1208
btrfs_put_caching_control(caching_ctl);
1209
}
1210
1211
spin_lock(&trans->transaction->dirty_bgs_lock);
1212
WARN_ON(!list_empty(&block_group->dirty_list));
1213
WARN_ON(!list_empty(&block_group->io_list));
1214
spin_unlock(&trans->transaction->dirty_bgs_lock);
1215
1216
btrfs_remove_free_space_cache(block_group);
1217
1218
spin_lock(&block_group->space_info->lock);
1219
list_del_init(&block_group->ro_list);
1220
1221
if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
1222
WARN_ON(block_group->space_info->total_bytes
1223
< block_group->length);
1224
WARN_ON(block_group->space_info->bytes_readonly
1225
< block_group->length - block_group->zone_unusable);
1226
WARN_ON(block_group->space_info->bytes_zone_unusable
1227
< block_group->zone_unusable);
1228
WARN_ON(block_group->space_info->disk_total
1229
< block_group->length * factor);
1230
}
1231
block_group->space_info->total_bytes -= block_group->length;
1232
block_group->space_info->bytes_readonly -=
1233
(block_group->length - block_group->zone_unusable);
1234
btrfs_space_info_update_bytes_zone_unusable(block_group->space_info,
1235
-block_group->zone_unusable);
1236
block_group->space_info->disk_total -= block_group->length * factor;
1237
1238
spin_unlock(&block_group->space_info->lock);
1239
1240
/*
1241
* Remove the free space for the block group from the free space tree
1242
* and the block group's item from the extent tree before marking the
1243
* block group as removed. This is to prevent races with tasks that
1244
* freeze and unfreeze a block group, this task and another task
1245
* allocating a new block group - the unfreeze task ends up removing
1246
* the block group's extent map before the task calling this function
1247
* deletes the block group item from the extent tree, allowing for
1248
* another task to attempt to create another block group with the same
1249
* item key (and failing with -EEXIST and a transaction abort).
1250
*/
1251
ret = btrfs_remove_block_group_free_space(trans, block_group);
1252
if (ret)
1253
goto out;
1254
1255
ret = remove_block_group_item(trans, path, block_group);
1256
if (ret < 0)
1257
goto out;
1258
1259
spin_lock(&block_group->lock);
1260
/*
1261
* Hitting this WARN means we removed a block group with an unwritten
1262
* region. It will cause "unable to find chunk map for logical" errors.
1263
*/
1264
if (WARN_ON(has_unwritten_metadata(block_group)))
1265
btrfs_warn(fs_info,
1266
"block group %llu is removed before metadata write out",
1267
block_group->start);
1268
1269
set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);
1270
1271
/*
1272
* At this point trimming or scrub can't start on this block group,
1273
* because we removed the block group from the rbtree
1274
* fs_info->block_group_cache_tree so no one can find it anymore and
1275
* even if someone already got this block group before we removed it
1276
* from the rbtree, they have already incremented block_group->frozen -
1277
* if they didn't, for the trimming case they won't find any free space
1278
* entries because we already removed them all when we called
1279
* btrfs_remove_free_space_cache().
1280
*
1281
* And we must not remove the chunk map from the fs_info->mapping_tree
1282
* to prevent the same logical address range and physical device space
1283
* ranges from being reused for a new block group. This is needed to
1284
* avoid races with trimming and scrub.
1285
*
1286
* An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
1287
* completely transactionless, so while it is trimming a range the
1288
* currently running transaction might finish and a new one start,
1289
* allowing for new block groups to be created that can reuse the same
1290
* physical device locations unless we take this special care.
1291
*
1292
* There may also be an implicit trim operation if the file system
1293
* is mounted with -odiscard. The same protections must remain
1294
* in place until the extents have been discarded completely when
1295
* the transaction commit has completed.
1296
*/
1297
remove_map = (atomic_read(&block_group->frozen) == 0);
1298
spin_unlock(&block_group->lock);
1299
1300
if (remove_map)
1301
btrfs_remove_chunk_map(fs_info, map);
1302
1303
out:
1304
/* Once for the lookup reference */
1305
btrfs_put_block_group(block_group);
1306
if (remove_rsv)
1307
btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
1308
btrfs_free_path(path);
1309
return ret;
1310
}
1311
1312
struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
1313
struct btrfs_fs_info *fs_info, const u64 chunk_offset)
1314
{
1315
struct btrfs_root *root = btrfs_block_group_root(fs_info);
1316
struct btrfs_chunk_map *map;
1317
unsigned int num_items;
1318
1319
map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
1320
ASSERT(map != NULL);
1321
ASSERT(map->start == chunk_offset);
1322
1323
/*
1324
* We need to reserve 3 + N units from the metadata space info in order
1325
* to remove a block group (done at btrfs_remove_chunk() and at
1326
* btrfs_remove_block_group()), which are used for:
1327
*
1328
* 1 unit for adding the free space inode's orphan (located in the tree
1329
* of tree roots).
1330
* 1 unit for deleting the block group item (located in the extent
1331
* tree).
1332
* 1 unit for deleting the free space item (located in tree of tree
1333
* roots).
1334
* N units for deleting N device extent items corresponding to each
1335
* stripe (located in the device tree).
1336
*
1337
* In order to remove a block group we also need to reserve units in the
1338
* system space info in order to update the chunk tree (update one or
1339
* more device items and remove one chunk item), but this is done at
1340
* btrfs_remove_chunk() through a call to check_system_chunk().
1341
*/
1342
num_items = 3 + map->num_stripes;
1343
btrfs_free_chunk_map(map);
1344
1345
return btrfs_start_transaction_fallback_global_rsv(root, num_items);
1346
}
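/*
 * Worked example for the reservation above: removing a block group whose
 * chunk has two stripes (e.g. RAID1 on two devices) needs
 * num_items = 3 + 2 = 5 metadata units: the orphan item, the block group
 * item, the free space item, plus one device extent item per stripe.
 */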
1347
1348
/*
1349
* Mark block group @cache read-only, so later writes won't happen to block
1350
* group @cache.
1351
*
1352
* If @force is not set, this function will only mark the block group readonly
1353
* if we have enough free space (1M) in other metadata/system block groups.
1354
* If @force is set, this function will mark the block group readonly
1355
* without checking free space.
1356
*
1357
* NOTE: This function doesn't care if other block groups can contain all the
1358
* data in this block group. That check should be done by relocation routine,
1359
* not this function.
1360
*/
1361
static int inc_block_group_ro(struct btrfs_block_group *cache, bool force)
1362
{
1363
struct btrfs_space_info *sinfo = cache->space_info;
1364
u64 num_bytes;
1365
int ret = -ENOSPC;
1366
1367
spin_lock(&sinfo->lock);
1368
spin_lock(&cache->lock);
1369
1370
if (cache->swap_extents) {
1371
ret = -ETXTBSY;
1372
goto out;
1373
}
1374
1375
if (cache->ro) {
1376
cache->ro++;
1377
ret = 0;
1378
goto out;
1379
}
1380
1381
num_bytes = cache->length - cache->reserved - cache->pinned -
1382
cache->bytes_super - cache->zone_unusable - cache->used;
1383
1384
/*
1385
* Data never overcommits, even in mixed mode, so do just the straight
1386
* check of left over space in how much we have allocated.
1387
*/
1388
if (force) {
1389
ret = 0;
1390
} else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
1391
u64 sinfo_used = btrfs_space_info_used(sinfo, true);
1392
1393
/*
1394
* Here we make sure if we mark this bg RO, we still have enough
1395
* free space left as a buffer.
1396
*/
1397
if (sinfo_used + num_bytes <= sinfo->total_bytes)
1398
ret = 0;
1399
} else {
1400
/*
1401
* We overcommit metadata, so we need to do the
1402
* btrfs_can_overcommit check here, and we need to pass in
1403
* BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
1404
* leeway to allow us to mark this block group as read only.
1405
*/
1406
if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
1407
BTRFS_RESERVE_NO_FLUSH))
1408
ret = 0;
1409
}
1410
1411
if (!ret) {
1412
sinfo->bytes_readonly += num_bytes;
1413
if (btrfs_is_zoned(cache->fs_info)) {
1414
/* Migrate zone_unusable bytes to readonly */
1415
sinfo->bytes_readonly += cache->zone_unusable;
1416
btrfs_space_info_update_bytes_zone_unusable(sinfo, -cache->zone_unusable);
1417
cache->zone_unusable = 0;
1418
}
1419
cache->ro++;
1420
list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
1421
}
1422
out:
1423
spin_unlock(&cache->lock);
1424
spin_unlock(&sinfo->lock);
1425
if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
1426
btrfs_info(cache->fs_info,
1427
"unable to make block group %llu ro", cache->start);
1428
btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, false);
1429
}
1430
return ret;
1431
}
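/*
 * Numeric illustration of the data case above: with sinfo->total_bytes = 10G
 * and sinfo_used = 7G, a data block group with num_bytes = 2G of unallocated
 * space can be flipped read-only (7G + 2G <= 10G), while one with 4G cannot,
 * since data space is never overcommitted.
 */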
1432
1433
static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
1434
const struct btrfs_block_group *bg)
1435
{
1436
struct btrfs_fs_info *fs_info = trans->fs_info;
1437
struct btrfs_transaction *prev_trans = NULL;
1438
const u64 start = bg->start;
1439
const u64 end = start + bg->length - 1;
1440
int ret;
1441
1442
spin_lock(&fs_info->trans_lock);
1443
if (!list_is_first(&trans->transaction->list, &fs_info->trans_list)) {
1444
prev_trans = list_prev_entry(trans->transaction, list);
1445
refcount_inc(&prev_trans->use_count);
1446
}
1447
spin_unlock(&fs_info->trans_lock);
1448
1449
/*
1450
* Hold the unused_bg_unpin_mutex lock to avoid racing with
1451
* btrfs_finish_extent_commit(). If we are at transaction N, another
1452
* task might be running finish_extent_commit() for the previous
1453
* transaction N - 1, and have seen a range belonging to the block
1454
* group in pinned_extents before we were able to clear the whole block
1455
* group range from pinned_extents. This means that task can look up
1456
* the block group after we unpinned it from pinned_extents and removed
1457
* it, leading to an error at unpin_extent_range().
1458
*/
1459
mutex_lock(&fs_info->unused_bg_unpin_mutex);
1460
if (prev_trans) {
1461
ret = btrfs_clear_extent_bit(&prev_trans->pinned_extents, start, end,
1462
EXTENT_DIRTY, NULL);
1463
if (ret)
1464
goto out;
1465
}
1466
1467
ret = btrfs_clear_extent_bit(&trans->transaction->pinned_extents, start, end,
1468
EXTENT_DIRTY, NULL);
1469
out:
1470
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
1471
if (prev_trans)
1472
btrfs_put_transaction(prev_trans);
1473
1474
return ret == 0;
1475
}
1476
1477
/*
1478
* Link the block_group to a list via bg_list.
1479
*
1480
* @bg: The block_group to link to the list.
1481
* @list: The list to link it to.
1482
*
1483
* Use this rather than list_add_tail() directly to ensure proper respect
1484
* for locking and refcounting.
1485
*
1486
* Returns: true if the bg was linked with a refcount bump and false otherwise.
1487
*/
1488
static bool btrfs_link_bg_list(struct btrfs_block_group *bg, struct list_head *list)
1489
{
1490
struct btrfs_fs_info *fs_info = bg->fs_info;
1491
bool added = false;
1492
1493
spin_lock(&fs_info->unused_bgs_lock);
1494
if (list_empty(&bg->bg_list)) {
1495
btrfs_get_block_group(bg);
1496
list_add_tail(&bg->bg_list, list);
1497
added = true;
1498
}
1499
spin_unlock(&fs_info->unused_bgs_lock);
1500
return added;
1501
}
1502
1503
/*
1504
* Process the unused_bgs list and remove any that don't have any allocated
1505
* space inside of them.
1506
*/
1507
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
1508
{
1509
LIST_HEAD(retry_list);
1510
struct btrfs_block_group *block_group;
1511
struct btrfs_space_info *space_info;
1512
struct btrfs_trans_handle *trans;
1513
const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
1514
int ret = 0;
1515
1516
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1517
return;
1518
1519
if (btrfs_fs_closing(fs_info))
1520
return;
1521
1522
/*
1523
* Long running balances can keep us blocked here for eternity, so
1524
* simply skip deletion if we're unable to get the mutex.
1525
*/
1526
if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
1527
return;
1528
1529
spin_lock(&fs_info->unused_bgs_lock);
1530
while (!list_empty(&fs_info->unused_bgs)) {
1531
u64 used;
1532
int trimming;
1533
1534
block_group = list_first_entry(&fs_info->unused_bgs,
1535
struct btrfs_block_group,
1536
bg_list);
1537
list_del_init(&block_group->bg_list);
1538
1539
space_info = block_group->space_info;
1540
1541
if (ret || btrfs_mixed_space_info(space_info)) {
1542
btrfs_put_block_group(block_group);
1543
continue;
1544
}
1545
spin_unlock(&fs_info->unused_bgs_lock);
1546
1547
btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
1548
1549
/* Don't want to race with allocators so take the groups_sem */
1550
down_write(&space_info->groups_sem);
1551
1552
/*
1553
* Async discard moves the final block group discard to be prior
1554
* to the unused_bgs code path. Therefore, if it's not fully
1555
* trimmed, punt it back to the async discard lists.
1556
*/
1557
if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
1558
!btrfs_is_free_space_trimmed(block_group)) {
1559
trace_btrfs_skip_unused_block_group(block_group);
1560
up_write(&space_info->groups_sem);
1561
/* Requeue if we failed because of async discard */
1562
btrfs_discard_queue_work(&fs_info->discard_ctl,
1563
block_group);
1564
goto next;
1565
}
1566
1567
spin_lock(&space_info->lock);
1568
spin_lock(&block_group->lock);
1569
if (btrfs_is_block_group_used(block_group) || block_group->ro ||
1570
list_is_singular(&block_group->list)) {
1571
/*
1572
* We want to bail if we made new allocations or have
1573
* outstanding allocations in this block group. We do
1574
* the ro check in case balance is currently acting on
1575
* this block group.
1576
*
1577
* Also bail out if this is the only block group for its
1578
* type, because otherwise we would lose profile
1579
* information from fs_info->avail_*_alloc_bits and the
1580
* next block group of this type would be created with a
1581
* "single" profile (even if we're in a raid fs) because
1582
* fs_info->avail_*_alloc_bits would be 0.
1583
*/
1584
trace_btrfs_skip_unused_block_group(block_group);
1585
spin_unlock(&block_group->lock);
1586
spin_unlock(&space_info->lock);
1587
up_write(&space_info->groups_sem);
1588
goto next;
1589
}
1590
1591
/*
1592
* The block group may be unused but there may be space reservation
1593
* accounting tied to the existence of that block group, that is,
1594
* space_info->bytes_may_use was incremented by a task but no
1595
* space was yet allocated from the block group by the task.
1596
* That space may or may not be allocated, as we are generally
1597
* pessimistic about space reservation for metadata as well as
1598
* for data when using compression (as we reserve space based on
1599
* the worst case, when data can't be compressed, and before
1600
* actually attempting compression, before starting writeback).
1601
*
1602
* So check if the total space of the space_info minus the size
1603
* of this block group is less than the used space of the
1604
* space_info - if that's the case, then it means we have tasks
1605
* that might be relying on the block group in order to allocate
1606
* extents, and add back the block group to the unused list when
1607
* we finish, so that we retry later in case no tasks ended up
1608
* needing to allocate extents from the block group.
1609
*/
1610
used = btrfs_space_info_used(space_info, true);
1611
if ((space_info->total_bytes - block_group->length < used &&
1612
block_group->zone_unusable < block_group->length) ||
1613
has_unwritten_metadata(block_group)) {
1614
/*
1615
* Add a reference for the list, compensate for the ref
1616
* drop under the "next" label for the
1617
* fs_info->unused_bgs list.
1618
*/
1619
btrfs_link_bg_list(block_group, &retry_list);
1620
1621
trace_btrfs_skip_unused_block_group(block_group);
1622
spin_unlock(&block_group->lock);
1623
spin_unlock(&space_info->lock);
1624
up_write(&space_info->groups_sem);
1625
goto next;
1626
}
1627
1628
spin_unlock(&block_group->lock);
1629
spin_unlock(&space_info->lock);
1630
1631
/* We don't want to force the issue, only flip if it's ok. */
1632
ret = inc_block_group_ro(block_group, 0);
1633
up_write(&space_info->groups_sem);
1634
if (ret < 0) {
1635
ret = 0;
1636
goto next;
1637
}
1638
1639
ret = btrfs_zone_finish(block_group);
1640
if (ret < 0) {
1641
btrfs_dec_block_group_ro(block_group);
1642
if (ret == -EAGAIN) {
1643
btrfs_link_bg_list(block_group, &retry_list);
1644
ret = 0;
1645
}
1646
goto next;
1647
}
1648
1649
/*
1650
* Want to do this before we do anything else so we can recover
1651
* properly if we fail to join the transaction.
1652
*/
1653
trans = btrfs_start_trans_remove_block_group(fs_info,
1654
block_group->start);
1655
if (IS_ERR(trans)) {
1656
btrfs_dec_block_group_ro(block_group);
1657
ret = PTR_ERR(trans);
1658
goto next;
1659
}
1660
1661
/*
1662
* We could have pending pinned extents for this block group,
1663
* just delete them, we don't care about them anymore.
1664
*/
1665
if (!clean_pinned_extents(trans, block_group)) {
1666
btrfs_dec_block_group_ro(block_group);
1667
goto end_trans;
1668
}
1669
1670
/*
1671
* At this point, the block_group is read only and should fail
1672
* new allocations. However, btrfs_finish_extent_commit() can
1673
* cause this block_group to be placed back on the discard
1674
* lists because now the block_group isn't fully discarded.
1675
* Bail here and try again later after discarding everything.
1676
*/
1677
spin_lock(&fs_info->discard_ctl.lock);
1678
if (!list_empty(&block_group->discard_list)) {
1679
spin_unlock(&fs_info->discard_ctl.lock);
1680
btrfs_dec_block_group_ro(block_group);
1681
btrfs_discard_queue_work(&fs_info->discard_ctl,
1682
block_group);
1683
goto end_trans;
1684
}
1685
spin_unlock(&fs_info->discard_ctl.lock);
1686
1687
/* Reset pinned so btrfs_put_block_group doesn't complain */
1688
spin_lock(&space_info->lock);
1689
spin_lock(&block_group->lock);
1690
1691
btrfs_space_info_update_bytes_pinned(space_info, -block_group->pinned);
1692
space_info->bytes_readonly += block_group->pinned;
1693
block_group->pinned = 0;
1694
1695
spin_unlock(&block_group->lock);
1696
spin_unlock(&space_info->lock);
1697
1698
/*
1699
* The normal path here is that an unused block group is passed in;
1700
* then trimming is handled in the transaction commit path.
1701
* Async discard interposes before this to do the trimming
1702
* before coming down the unused block group path as trimming
1703
* will no longer be done later in the transaction commit path.
1704
*/
1705
if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
1706
goto flip_async;
1707
1708
/*
1709
* DISCARD can flip during remount. On zoned filesystems, we
1710
* need to reset sequential-required zones.
1711
*/
1712
trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) ||
1713
btrfs_is_zoned(fs_info);
1714
1715
/* Implicit trim during transaction commit. */
1716
if (trimming)
1717
btrfs_freeze_block_group(block_group);
1718
1719
/*
1720
* Btrfs_remove_chunk will abort the transaction if things go
1721
* horribly wrong.
1722
*/
1723
ret = btrfs_remove_chunk(trans, block_group->start);
1724
1725
if (ret) {
1726
if (trimming)
1727
btrfs_unfreeze_block_group(block_group);
1728
goto end_trans;
1729
}
1730
1731
/*
1732
* If we're not mounted with -odiscard, we can just forget
1733
* about this block group. Otherwise we'll need to wait
1734
* until transaction commit to do the actual discard.
1735
*/
1736
if (trimming) {
1737
spin_lock(&fs_info->unused_bgs_lock);
1738
/*
1739
* A concurrent scrub might have added us to the list
1740
* fs_info->unused_bgs, so use a list_move operation
1741
* to add the block group to the deleted_bgs list.
1742
*/
1743
list_move(&block_group->bg_list,
1744
&trans->transaction->deleted_bgs);
1745
spin_unlock(&fs_info->unused_bgs_lock);
1746
btrfs_get_block_group(block_group);
1747
}
1748
end_trans:
1749
btrfs_end_transaction(trans);
1750
next:
1751
btrfs_put_block_group(block_group);
1752
spin_lock(&fs_info->unused_bgs_lock);
1753
}
1754
list_splice_tail(&retry_list, &fs_info->unused_bgs);
1755
spin_unlock(&fs_info->unused_bgs_lock);
1756
mutex_unlock(&fs_info->reclaim_bgs_lock);
1757
return;
1758
1759
flip_async:
1760
btrfs_end_transaction(trans);
1761
spin_lock(&fs_info->unused_bgs_lock);
1762
list_splice_tail(&retry_list, &fs_info->unused_bgs);
1763
spin_unlock(&fs_info->unused_bgs_lock);
1764
mutex_unlock(&fs_info->reclaim_bgs_lock);
1765
btrfs_put_block_group(block_group);
1766
btrfs_discard_punt_unused_bgs_list(fs_info);
1767
}
1768
1769
void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
1770
{
1771
struct btrfs_fs_info *fs_info = bg->fs_info;
1772
1773
spin_lock(&fs_info->unused_bgs_lock);
1774
if (list_empty(&bg->bg_list)) {
1775
btrfs_get_block_group(bg);
1776
trace_btrfs_add_unused_block_group(bg);
1777
list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
1778
} else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) {
1779
/* Pull out the block group from the reclaim_bgs list. */
1780
trace_btrfs_add_unused_block_group(bg);
1781
list_move_tail(&bg->bg_list, &fs_info->unused_bgs);
1782
}
1783
spin_unlock(&fs_info->unused_bgs_lock);
1784
}
1785
1786
/*
1787
* We want block groups with a low number of used bytes to be in the beginning
1788
* of the list, so they will get reclaimed first.
1789
*/
1790
static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
1791
const struct list_head *b)
1792
{
1793
const struct btrfs_block_group *bg1, *bg2;
1794
1795
bg1 = list_entry(a, struct btrfs_block_group, bg_list);
1796
bg2 = list_entry(b, struct btrfs_block_group, bg_list);
1797
1798
/*
1799
* Some other task may be updating the ->used field concurrently, but it
1800
* is not serious if we get a stale value or suffer load/store tearing,
1801
* as sorting the list of block groups to reclaim is not critical and an
1802
* occasional imperfect order is ok. So silence KCSAN and avoid the
1803
* overhead of locking or any other synchronization.
1804
*/
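/*
* list_sort() places @a after @b when the cmp function returns a
* positive value, so returning (bg1->used > bg2->used) sorts the list
* in ascending order of ->used, i.e. the least used block groups first.
*/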
1805
return data_race(bg1->used > bg2->used);
1806
}
1807
1808
static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info)
1809
{
1810
if (btrfs_is_zoned(fs_info))
1811
return btrfs_zoned_should_reclaim(fs_info);
1812
return true;
1813
}
1814
1815
static bool should_reclaim_block_group(const struct btrfs_block_group *bg, u64 bytes_freed)
1816
{
1817
const int thresh_pct = btrfs_calc_reclaim_threshold(bg->space_info);
1818
u64 thresh_bytes = mult_perc(bg->length, thresh_pct);
1819
const u64 new_val = bg->used;
1820
const u64 old_val = new_val + bytes_freed;
1821
1822
if (thresh_bytes == 0)
1823
return false;
1824
1825
/*
1826
* If we were below the threshold before don't reclaim, we are likely a
1827
* brand new block group and we don't want to relocate new block groups.
1828
*/
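/*
* Reclaim only when this freeing crossed the threshold downwards:
* old_val (before the free) was at or above thresh_bytes and new_val
* (after the free) dropped below it. As a hypothetical example, with a
* 75% threshold on a 1GiB block group thresh_bytes is 768MiB, so going
* from 800MiB used down to 700MiB used triggers reclaim, while a group
* that was already at 300MiB used does not.
*/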
1829
if (old_val < thresh_bytes)
1830
return false;
1831
if (new_val >= thresh_bytes)
1832
return false;
1833
return true;
1834
}
1835
1836
void btrfs_reclaim_bgs_work(struct work_struct *work)
1837
{
1838
struct btrfs_fs_info *fs_info =
1839
container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
1840
struct btrfs_block_group *bg;
1841
struct btrfs_space_info *space_info;
1842
LIST_HEAD(retry_list);
1843
1844
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1845
return;
1846
1847
if (btrfs_fs_closing(fs_info))
1848
return;
1849
1850
if (!btrfs_should_reclaim(fs_info))
1851
return;
1852
1853
sb_start_write(fs_info->sb);
1854
1855
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
1856
sb_end_write(fs_info->sb);
1857
return;
1858
}
1859
1860
/*
1861
* Long running balances can keep us blocked here for eternity, so
1862
* simply skip reclaim if we're unable to get the mutex.
1863
*/
1864
if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
1865
btrfs_exclop_finish(fs_info);
1866
sb_end_write(fs_info->sb);
1867
return;
1868
}
1869
1870
spin_lock(&fs_info->unused_bgs_lock);
1871
/*
1872
* Sort happens under lock because we can't simply splice it and sort.
1873
* The block groups might still be in use and reachable via bg_list,
1874
* and their presence in the reclaim_bgs list must be preserved.
1875
*/
1876
list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
1877
while (!list_empty(&fs_info->reclaim_bgs)) {
1878
u64 used;
1879
u64 reserved;
1880
int ret = 0;
1881
1882
bg = list_first_entry(&fs_info->reclaim_bgs,
1883
struct btrfs_block_group,
1884
bg_list);
1885
list_del_init(&bg->bg_list);
1886
1887
space_info = bg->space_info;
1888
spin_unlock(&fs_info->unused_bgs_lock);
1889
1890
/* Don't race with allocators so take the groups_sem */
1891
down_write(&space_info->groups_sem);
1892
1893
spin_lock(&space_info->lock);
1894
spin_lock(&bg->lock);
1895
if (bg->reserved || bg->pinned || bg->ro) {
1896
/*
1897
* We want to bail if we made new allocations or have
1898
* outstanding allocations in this block group. We do
1899
* the ro check in case balance is currently acting on
1900
* this block group.
1901
*/
1902
spin_unlock(&bg->lock);
1903
spin_unlock(&space_info->lock);
1904
up_write(&space_info->groups_sem);
1905
goto next;
1906
}
1907
if (bg->used == 0) {
1908
/*
1909
* It is possible that we trigger relocation on a block
1910
* group as its extents are deleted and it first goes
1911
* below the threshold, then shortly after goes empty.
1912
*
1913
* In this case, relocating it does delete it, but has
1914
* some overhead in relocation specific metadata, looking
1915
* for the non-existent extents and running some extra
1916
* transactions, which we can avoid by using one of the
1917
* other mechanisms for dealing with empty block groups.
1918
*/
1919
if (!btrfs_test_opt(fs_info, DISCARD_ASYNC))
1920
btrfs_mark_bg_unused(bg);
1921
spin_unlock(&bg->lock);
1922
spin_unlock(&space_info->lock);
1923
up_write(&space_info->groups_sem);
1924
goto next;
1925
1926
}
1927
/*
1928
* The block group might no longer meet the reclaim condition by
* the time we get around to reclaiming it, so skip it here to
* avoid relocating an overly full block group.
1931
*
1932
* Since the decision making process also depends on the amount
1933
* being freed, pass in a fake giant value to skip that extra
1934
* check, which is more meaningful when adding to the list in
1935
* the first place.
1936
*/
1937
if (!should_reclaim_block_group(bg, bg->length)) {
1938
spin_unlock(&bg->lock);
1939
spin_unlock(&space_info->lock);
1940
up_write(&space_info->groups_sem);
1941
goto next;
1942
}
1943
1944
spin_unlock(&bg->lock);
1945
spin_unlock(&space_info->lock);
1946
1947
/*
1948
* Get out fast, in case we're read-only or unmounting the
1949
* filesystem. It is OK to drop block groups from the list even
1950
* for the read-only case. As we did sb_start_write(),
1951
* "mount -o remount,ro" won't happen and read-only filesystem
1952
* means it is forced read-only due to a fatal error. So, it
1953
* never gets back to read-write to let us reclaim again.
1954
*/
1955
if (btrfs_need_cleaner_sleep(fs_info)) {
1956
up_write(&space_info->groups_sem);
1957
goto next;
1958
}
1959
1960
ret = inc_block_group_ro(bg, 0);
1961
up_write(&space_info->groups_sem);
1962
if (ret < 0)
1963
goto next;
1964
1965
/*
1966
* The amount of bytes reclaimed corresponds to the sum of the
1967
* "used" and "reserved" counters. We have set the block group
1968
* to RO above, which prevents reservations from happening but
1969
* we may have existing reservations for which allocation has
1970
* not yet been done - btrfs_update_block_group() was not yet
1971
* called, which is where we will transfer a reserved extent's
1972
* size from the "reserved" counter to the "used" counter - this
1973
* happens when running delayed references. When we relocate the
1974
* chunk below, relocation first flushes delalloc, waits for
1975
* ordered extent completion (which is where we create delayed
1976
* references for data extents) and commits the current
1977
* transaction (which runs delayed references), and only after
1978
* it does the actual work to move extents out of the block
1979
* group. So the reported amount of reclaimed bytes is
1980
* effectively the sum of the 'used' and 'reserved' counters.
1981
*/
1982
spin_lock(&bg->lock);
1983
used = bg->used;
1984
reserved = bg->reserved;
1985
spin_unlock(&bg->lock);
1986
1987
trace_btrfs_reclaim_block_group(bg);
1988
ret = btrfs_relocate_chunk(fs_info, bg->start, false);
1989
if (ret) {
1990
btrfs_dec_block_group_ro(bg);
1991
btrfs_err(fs_info, "error relocating chunk %llu",
1992
bg->start);
1993
used = 0;
1994
reserved = 0;
1995
spin_lock(&space_info->lock);
1996
space_info->reclaim_errors++;
1997
if (READ_ONCE(space_info->periodic_reclaim))
1998
space_info->periodic_reclaim_ready = false;
1999
spin_unlock(&space_info->lock);
2000
}
2001
spin_lock(&space_info->lock);
2002
space_info->reclaim_count++;
2003
space_info->reclaim_bytes += used;
2004
space_info->reclaim_bytes += reserved;
2005
spin_unlock(&space_info->lock);
2006
2007
next:
2008
if (ret && !READ_ONCE(space_info->periodic_reclaim))
2009
btrfs_link_bg_list(bg, &retry_list);
2010
btrfs_put_block_group(bg);
2011
2012
mutex_unlock(&fs_info->reclaim_bgs_lock);
2013
/*
2014
* Reclaiming all the block groups in the list can take really
2015
* long. Prioritize cleaning up unused block groups.
2016
*/
2017
btrfs_delete_unused_bgs(fs_info);
2018
/*
* If we are interrupted by a balance, we can just bail out. The
* cleaner thread will restart this work again if necessary.
*/
2022
if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
2023
goto end;
2024
spin_lock(&fs_info->unused_bgs_lock);
2025
}
2026
spin_unlock(&fs_info->unused_bgs_lock);
2027
mutex_unlock(&fs_info->reclaim_bgs_lock);
2028
end:
2029
spin_lock(&fs_info->unused_bgs_lock);
2030
list_splice_tail(&retry_list, &fs_info->reclaim_bgs);
2031
spin_unlock(&fs_info->unused_bgs_lock);
2032
btrfs_exclop_finish(fs_info);
2033
sb_end_write(fs_info->sb);
2034
}
2035
2036
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
2037
{
2038
btrfs_reclaim_sweep(fs_info);
2039
spin_lock(&fs_info->unused_bgs_lock);
2040
if (!list_empty(&fs_info->reclaim_bgs))
2041
queue_work(system_dfl_wq, &fs_info->reclaim_bgs_work);
2042
spin_unlock(&fs_info->unused_bgs_lock);
2043
}
2044
2045
void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
2046
{
2047
struct btrfs_fs_info *fs_info = bg->fs_info;
2048
2049
if (btrfs_link_bg_list(bg, &fs_info->reclaim_bgs))
2050
trace_btrfs_add_reclaim_block_group(bg);
2051
}
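/*
* Sanity check a block group item against the chunk map covering the
* same range: the start, length and type flags must all match, otherwise
* the filesystem metadata is inconsistent and we return -EUCLEAN
* (a missing chunk map gives -ENOENT).
*/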
2052
2053
static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key *key,
2054
const struct btrfs_path *path)
2055
{
2056
struct btrfs_chunk_map *map;
2057
struct btrfs_block_group_item bg;
2058
struct extent_buffer *leaf;
2059
int slot;
2060
u64 flags;
2061
int ret = 0;
2062
2063
slot = path->slots[0];
2064
leaf = path->nodes[0];
2065
2066
map = btrfs_find_chunk_map(fs_info, key->objectid, key->offset);
2067
if (!map) {
2068
btrfs_err(fs_info,
2069
"logical %llu len %llu found bg but no related chunk",
2070
key->objectid, key->offset);
2071
return -ENOENT;
2072
}
2073
2074
if (unlikely(map->start != key->objectid || map->chunk_len != key->offset)) {
2075
btrfs_err(fs_info,
2076
"block group %llu len %llu mismatch with chunk %llu len %llu",
2077
key->objectid, key->offset, map->start, map->chunk_len);
2078
ret = -EUCLEAN;
2079
goto out_free_map;
2080
}
2081
2082
read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
2083
sizeof(bg));
2084
flags = btrfs_stack_block_group_flags(&bg) &
2085
BTRFS_BLOCK_GROUP_TYPE_MASK;
2086
2087
if (unlikely(flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK))) {
2088
btrfs_err(fs_info,
2089
"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
2090
key->objectid, key->offset, flags,
2091
(BTRFS_BLOCK_GROUP_TYPE_MASK & map->type));
2092
ret = -EUCLEAN;
2093
}
2094
2095
out_free_map:
2096
btrfs_free_chunk_map(map);
2097
return ret;
2098
}
2099
2100
static int find_first_block_group(struct btrfs_fs_info *fs_info,
2101
struct btrfs_path *path,
2102
const struct btrfs_key *key)
2103
{
2104
struct btrfs_root *root = btrfs_block_group_root(fs_info);
2105
int ret;
2106
struct btrfs_key found_key;
2107
2108
btrfs_for_each_slot(root, key, &found_key, path, ret) {
2109
if (found_key.objectid >= key->objectid &&
2110
found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
2111
return read_bg_from_eb(fs_info, &found_key, path);
2112
}
2113
}
2114
return ret;
2115
}
2116
2117
static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
2118
{
2119
u64 extra_flags = chunk_to_extended(flags) &
2120
BTRFS_EXTENDED_PROFILE_MASK;
2121
2122
write_seqlock(&fs_info->profiles_lock);
2123
if (flags & BTRFS_BLOCK_GROUP_DATA)
2124
fs_info->avail_data_alloc_bits |= extra_flags;
2125
if (flags & BTRFS_BLOCK_GROUP_METADATA)
2126
fs_info->avail_metadata_alloc_bits |= extra_flags;
2127
if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2128
fs_info->avail_system_alloc_bits |= extra_flags;
2129
write_sequnlock(&fs_info->profiles_lock);
2130
}
2131
2132
/*
2133
* Map a physical disk address to a list of logical addresses.
2134
*
2135
* @fs_info: the filesystem
2136
* @chunk_start: logical address of block group
2137
* @physical: physical address to map to logical addresses
2138
* @logical: return array of logical addresses which map to @physical
2139
* @naddrs: length of @logical
2140
* @stripe_len: size of IO stripe for the given block group
2141
*
2142
* Maps a particular @physical disk address to a list of @logical addresses.
2143
* Used primarily to exclude those portions of a block group that contain super
2144
* block copies.
2145
*/
2146
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
2147
u64 physical, u64 **logical, int *naddrs, int *stripe_len)
2148
{
2149
struct btrfs_chunk_map *map;
2150
u64 *buf;
2151
u64 bytenr;
2152
u64 data_stripe_length;
2153
u64 io_stripe_size;
2154
int i, nr = 0;
2155
int ret = 0;
2156
2157
map = btrfs_get_chunk_map(fs_info, chunk_start, 1);
2158
if (IS_ERR(map))
2159
return -EIO;
2160
2161
data_stripe_length = map->stripe_size;
2162
io_stripe_size = BTRFS_STRIPE_LEN;
2163
chunk_start = map->start;
2164
2165
/* For RAID5/6 adjust to a full IO stripe length */
2166
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2167
io_stripe_size = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
2168
2169
buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
2170
if (!buf) {
2171
ret = -ENOMEM;
2172
goto out;
2173
}
2174
2175
for (i = 0; i < map->num_stripes; i++) {
2176
bool already_inserted = false;
2177
u32 stripe_nr;
2178
u32 offset;
2179
int j;
2180
2181
if (!in_range(physical, map->stripes[i].physical,
2182
data_stripe_length))
2183
continue;
2184
2185
stripe_nr = (physical - map->stripes[i].physical) >>
2186
BTRFS_STRIPE_LEN_SHIFT;
2187
offset = (physical - map->stripes[i].physical) &
2188
BTRFS_STRIPE_LEN_MASK;
2189
2190
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2191
BTRFS_BLOCK_GROUP_RAID10))
2192
stripe_nr = div_u64(stripe_nr * map->num_stripes + i,
2193
map->sub_stripes);
2194
/*
2195
* The remaining case would be for RAID56, multiply by
2196
* nr_data_stripes(). Alternatively, just use rmap_len below
2197
* instead of map->stripe_len
2198
*/
2199
bytenr = chunk_start + stripe_nr * io_stripe_size + offset;
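/*
* Hypothetical example for RAID0 over 2 devices with 64KiB stripes:
* a physical address 192KiB into stripe i=1 gives stripe_nr = 3,
* offset = 0, adjusted stripe_nr = (3 * 2 + 1) / 1 = 7, so the logical
* address is chunk_start + 7 * 64KiB.
*/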
2200
2201
/* Ensure we don't add duplicate addresses */
2202
for (j = 0; j < nr; j++) {
2203
if (buf[j] == bytenr) {
2204
already_inserted = true;
2205
break;
2206
}
2207
}
2208
2209
if (!already_inserted)
2210
buf[nr++] = bytenr;
2211
}
2212
2213
*logical = buf;
2214
*naddrs = nr;
2215
*stripe_len = io_stripe_size;
2216
out:
2217
btrfs_free_chunk_map(map);
2218
return ret;
2219
}
2220
2221
static int exclude_super_stripes(struct btrfs_block_group *cache)
2222
{
2223
struct btrfs_fs_info *fs_info = cache->fs_info;
2224
const bool zoned = btrfs_is_zoned(fs_info);
2225
u64 bytenr;
2226
u64 *logical;
2227
int stripe_len;
2228
int i, nr, ret;
2229
2230
if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
2231
stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
2232
cache->bytes_super += stripe_len;
2233
ret = btrfs_set_extent_bit(&fs_info->excluded_extents, cache->start,
2234
cache->start + stripe_len - 1,
2235
EXTENT_DIRTY, NULL);
2236
if (ret)
2237
return ret;
2238
}
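/*
* Each super block mirror lives at a fixed physical offset on every
* device (btrfs_sb_offset()); btrfs_rmap_block() maps that back to the
* logical addresses inside this block group. Those ranges are accounted
* in bytes_super and marked in the excluded_extents tree so that free
* space caching never hands them out.
*/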
2239
2240
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2241
bytenr = btrfs_sb_offset(i);
2242
ret = btrfs_rmap_block(fs_info, cache->start,
2243
bytenr, &logical, &nr, &stripe_len);
2244
if (ret)
2245
return ret;
2246
2247
/* Shouldn't have super stripes in sequential zones */
2248
if (unlikely(zoned && nr)) {
2249
kfree(logical);
2250
btrfs_err(fs_info,
2251
"zoned: block group %llu must not contain super block",
2252
cache->start);
2253
return -EUCLEAN;
2254
}
2255
2256
while (nr--) {
2257
u64 len = min_t(u64, stripe_len,
2258
cache->start + cache->length - logical[nr]);
2259
2260
cache->bytes_super += len;
2261
ret = btrfs_set_extent_bit(&fs_info->excluded_extents,
2262
logical[nr], logical[nr] + len - 1,
2263
EXTENT_DIRTY, NULL);
2264
if (ret) {
2265
kfree(logical);
2266
return ret;
2267
}
2268
}
2269
2270
kfree(logical);
2271
}
2272
return 0;
2273
}
2274
2275
static struct btrfs_block_group *btrfs_create_block_group_cache(
2276
struct btrfs_fs_info *fs_info, u64 start)
2277
{
2278
struct btrfs_block_group *cache;
2279
2280
cache = kzalloc(sizeof(*cache), GFP_NOFS);
2281
if (!cache)
2282
return NULL;
2283
2284
cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
2285
GFP_NOFS);
2286
if (!cache->free_space_ctl) {
2287
kfree(cache);
2288
return NULL;
2289
}
2290
2291
cache->start = start;
2292
2293
cache->fs_info = fs_info;
2294
cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
2295
2296
cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
2297
2298
refcount_set(&cache->refs, 1);
2299
spin_lock_init(&cache->lock);
2300
init_rwsem(&cache->data_rwsem);
2301
INIT_LIST_HEAD(&cache->list);
2302
INIT_LIST_HEAD(&cache->cluster_list);
2303
INIT_LIST_HEAD(&cache->bg_list);
2304
INIT_LIST_HEAD(&cache->ro_list);
2305
INIT_LIST_HEAD(&cache->discard_list);
2306
INIT_LIST_HEAD(&cache->dirty_list);
2307
INIT_LIST_HEAD(&cache->io_list);
2308
INIT_LIST_HEAD(&cache->active_bg_list);
2309
btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
2310
atomic_set(&cache->frozen, 0);
2311
mutex_init(&cache->free_space_lock);
2312
2313
return cache;
2314
}
2315
2316
/*
2317
* Iterate all chunks and verify that each of them has the corresponding block
2318
* group
2319
*/
2320
static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
2321
{
2322
u64 start = 0;
2323
int ret = 0;
2324
2325
while (1) {
2326
struct btrfs_chunk_map *map;
2327
struct btrfs_block_group *bg;
2328
2329
/*
2330
* btrfs_find_chunk_map() will return the first chunk map
2331
* intersecting the range, so setting @length to 1 is enough to
2332
* get the first chunk.
2333
*/
2334
map = btrfs_find_chunk_map(fs_info, start, 1);
2335
if (!map)
2336
break;
2337
2338
bg = btrfs_lookup_block_group(fs_info, map->start);
2339
if (unlikely(!bg)) {
2340
btrfs_err(fs_info,
2341
"chunk start=%llu len=%llu doesn't have corresponding block group",
2342
map->start, map->chunk_len);
2343
ret = -EUCLEAN;
2344
btrfs_free_chunk_map(map);
2345
break;
2346
}
2347
if (unlikely(bg->start != map->start || bg->length != map->chunk_len ||
2348
(bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
2349
(map->type & BTRFS_BLOCK_GROUP_TYPE_MASK))) {
2350
btrfs_err(fs_info,
2351
"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
2352
map->start, map->chunk_len,
2353
map->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
2354
bg->start, bg->length,
2355
bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
2356
ret = -EUCLEAN;
2357
btrfs_free_chunk_map(map);
2358
btrfs_put_block_group(bg);
2359
break;
2360
}
2361
start = map->start + map->chunk_len;
2362
btrfs_free_chunk_map(map);
2363
btrfs_put_block_group(bg);
2364
}
2365
return ret;
2366
}
2367
2368
static int read_one_block_group(struct btrfs_fs_info *info,
2369
struct btrfs_block_group_item *bgi,
2370
const struct btrfs_key *key,
2371
int need_clear)
2372
{
2373
struct btrfs_block_group *cache;
2374
const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
2375
int ret;
2376
2377
ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
2378
2379
cache = btrfs_create_block_group_cache(info, key->objectid);
2380
if (!cache)
2381
return -ENOMEM;
2382
2383
cache->length = key->offset;
2384
cache->used = btrfs_stack_block_group_used(bgi);
2385
cache->commit_used = cache->used;
2386
cache->flags = btrfs_stack_block_group_flags(bgi);
2387
cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
2388
cache->space_info = btrfs_find_space_info(info, cache->flags);
2389
2390
btrfs_set_free_space_tree_thresholds(cache);
2391
2392
if (need_clear) {
2393
/*
2394
* When we mount with old space cache, we need to
2395
* set BTRFS_DC_CLEAR and set dirty flag.
2396
*
2397
* a) Setting 'BTRFS_DC_CLEAR' makes sure that we
2398
* truncate the old free space cache inode and
2399
* setup a new one.
2400
* b) Setting 'dirty flag' makes sure that we flush
2401
* the new space cache info onto disk.
2402
*/
2403
if (btrfs_test_opt(info, SPACE_CACHE))
2404
cache->disk_cache_state = BTRFS_DC_CLEAR;
2405
}
2406
if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
2407
(cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
2408
btrfs_err(info,
2409
"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
2410
cache->start);
2411
ret = -EINVAL;
2412
goto error;
2413
}
2414
2415
ret = btrfs_load_block_group_zone_info(cache, false);
2416
if (ret) {
2417
btrfs_err(info, "zoned: failed to load zone info of bg %llu",
2418
cache->start);
2419
goto error;
2420
}
2421
2422
/*
2423
* We need to exclude the super stripes now so that the space info has
2424
* super bytes accounted for, otherwise we'll think we have more space
2425
* than we actually do.
2426
*/
2427
ret = exclude_super_stripes(cache);
2428
if (ret) {
2429
/* We may have excluded something, so call this just in case. */
2430
btrfs_free_excluded_extents(cache);
2431
goto error;
2432
}
2433
2434
/*
2435
* For zoned filesystem, space after the allocation offset is the only
2436
* free space for a block group. So, we don't need any caching work.
2437
* btrfs_calc_zone_unusable() will set the amount of free space and
2438
* zone_unusable space.
2439
*
2440
* For regular filesystem, check for two cases, either we are full, and
2441
* therefore don't need to bother with the caching work since we won't
2442
* find any space, or we are empty, and we can just add all the space
2443
* in and be done with it. This saves us _a_lot_ of time, particularly
2444
* in the full case.
2445
*/
2446
if (btrfs_is_zoned(info)) {
2447
btrfs_calc_zone_unusable(cache);
2448
/* Should not have any excluded extents. Just in case, though. */
2449
btrfs_free_excluded_extents(cache);
2450
} else if (cache->length == cache->used) {
2451
cache->cached = BTRFS_CACHE_FINISHED;
2452
btrfs_free_excluded_extents(cache);
2453
} else if (cache->used == 0) {
2454
cache->cached = BTRFS_CACHE_FINISHED;
2455
ret = btrfs_add_new_free_space(cache, cache->start,
2456
cache->start + cache->length, NULL);
2457
btrfs_free_excluded_extents(cache);
2458
if (ret)
2459
goto error;
2460
}
2461
2462
ret = btrfs_add_block_group_cache(cache);
2463
if (ret) {
2464
btrfs_remove_free_space_cache(cache);
2465
goto error;
2466
}
2467
2468
trace_btrfs_add_block_group(info, cache, 0);
2469
btrfs_add_bg_to_space_info(info, cache);
2470
2471
set_avail_alloc_bits(info, cache->flags);
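/*
* Block groups on chunks that are not writeable (e.g. because a device
* is missing) are force-marked read-only so the allocator never uses
* them; writeable but empty groups are queued for discard or marked
* unused right away.
*/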
2472
if (btrfs_chunk_writeable(info, cache->start)) {
2473
if (cache->used == 0) {
2474
ASSERT(list_empty(&cache->bg_list));
2475
if (btrfs_test_opt(info, DISCARD_ASYNC))
2476
btrfs_discard_queue_work(&info->discard_ctl, cache);
2477
else
2478
btrfs_mark_bg_unused(cache);
2479
}
2480
} else {
2481
inc_block_group_ro(cache, 1);
2482
}
2483
2484
return 0;
2485
error:
2486
btrfs_put_block_group(cache);
2487
return ret;
2488
}
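/*
* Build in-memory only block groups straight from the chunk maps,
* marked as full, for the cases where the real block group items can't
* or shouldn't be read (see btrfs_read_block_groups()). This is enough
* for a read-only mount to work.
*/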
2489
2490
static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
2491
{
2492
struct rb_node *node;
2493
int ret = 0;
2494
2495
for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) {
2496
struct btrfs_chunk_map *map;
2497
struct btrfs_block_group *bg;
2498
2499
map = rb_entry(node, struct btrfs_chunk_map, rb_node);
2500
bg = btrfs_create_block_group_cache(fs_info, map->start);
2501
if (!bg) {
2502
ret = -ENOMEM;
2503
break;
2504
}
2505
2506
/* Fill dummy cache as FULL */
2507
bg->length = map->chunk_len;
2508
bg->flags = map->type;
2509
bg->cached = BTRFS_CACHE_FINISHED;
2510
bg->used = map->chunk_len;
2512
bg->space_info = btrfs_find_space_info(fs_info, bg->flags);
2513
ret = btrfs_add_block_group_cache(bg);
2514
/*
2515
* We may have some valid block group cache added already, in
2516
* that case we skip to the next one.
2517
*/
2518
if (ret == -EEXIST) {
2519
ret = 0;
2520
btrfs_put_block_group(bg);
2521
continue;
2522
}
2523
2524
if (ret) {
2525
btrfs_remove_free_space_cache(bg);
2526
btrfs_put_block_group(bg);
2527
break;
2528
}
2529
2530
btrfs_add_bg_to_space_info(fs_info, bg);
2531
2532
set_avail_alloc_bits(fs_info, bg->flags);
2533
}
2534
if (!ret)
2535
btrfs_init_global_block_rsv(fs_info);
2536
return ret;
2537
}
2538
2539
int btrfs_read_block_groups(struct btrfs_fs_info *info)
2540
{
2541
struct btrfs_root *root = btrfs_block_group_root(info);
2542
struct btrfs_path *path;
2543
int ret;
2544
struct btrfs_block_group *cache;
2545
struct btrfs_space_info *space_info;
2546
struct btrfs_key key;
2547
int need_clear = 0;
2548
u64 cache_gen;
2549
2550
/*
2551
* Either no extent root (with ibadroots rescue option) or we have
2552
* unsupported RO options. The fs can never be mounted read-write, so no
2553
* need to waste time searching block group items.
2554
*
2555
* This also allows new extent tree related changes to be RO compat,
2556
* no need for a full incompat flag.
2557
*/
2558
if (!root || (btrfs_super_compat_ro_flags(info->super_copy) &
2559
~BTRFS_FEATURE_COMPAT_RO_SUPP))
2560
return fill_dummy_bgs(info);
2561
2562
key.objectid = 0;
2563
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2564
key.offset = 0;
2565
path = btrfs_alloc_path();
2566
if (!path)
2567
return -ENOMEM;
2568
2569
cache_gen = btrfs_super_cache_generation(info->super_copy);
2570
if (btrfs_test_opt(info, SPACE_CACHE) &&
2571
btrfs_super_generation(info->super_copy) != cache_gen)
2572
need_clear = 1;
2573
if (btrfs_test_opt(info, CLEAR_CACHE))
2574
need_clear = 1;
2575
2576
while (1) {
2577
struct btrfs_block_group_item bgi;
2578
struct extent_buffer *leaf;
2579
int slot;
2580
2581
ret = find_first_block_group(info, path, &key);
2582
if (ret > 0)
2583
break;
2584
if (ret != 0)
2585
goto error;
2586
2587
leaf = path->nodes[0];
2588
slot = path->slots[0];
2589
2590
read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
2591
sizeof(bgi));
2592
2593
btrfs_item_key_to_cpu(leaf, &key, slot);
2594
btrfs_release_path(path);
2595
ret = read_one_block_group(info, &bgi, &key, need_clear);
2596
if (ret < 0)
2597
goto error;
2598
key.objectid += key.offset;
2599
key.offset = 0;
2600
}
2601
btrfs_release_path(path);
2602
2603
list_for_each_entry(space_info, &info->space_info, list) {
2604
int i;
2605
2606
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
2607
if (list_empty(&space_info->block_groups[i]))
2608
continue;
2609
cache = list_first_entry(&space_info->block_groups[i],
2610
struct btrfs_block_group,
2611
list);
2612
btrfs_sysfs_add_block_group_type(cache);
2613
}
2614
2615
if (!(btrfs_get_alloc_profile(info, space_info->flags) &
2616
(BTRFS_BLOCK_GROUP_RAID10 |
2617
BTRFS_BLOCK_GROUP_RAID1_MASK |
2618
BTRFS_BLOCK_GROUP_RAID56_MASK |
2619
BTRFS_BLOCK_GROUP_DUP)))
2620
continue;
2621
/*
2622
* Avoid allocating from un-mirrored block group if there are
2623
* mirrored block groups.
2624
*/
2625
list_for_each_entry(cache,
2626
&space_info->block_groups[BTRFS_RAID_RAID0],
2627
list)
2628
inc_block_group_ro(cache, 1);
2629
list_for_each_entry(cache,
2630
&space_info->block_groups[BTRFS_RAID_SINGLE],
2631
list)
2632
inc_block_group_ro(cache, 1);
2633
}
2634
2635
btrfs_init_global_block_rsv(info);
2636
ret = check_chunk_block_group_mappings(info);
2637
error:
2638
btrfs_free_path(path);
2639
/*
2640
* We've hit some error while reading the extent tree, and have
2641
* rescue=ibadroots mount option.
2642
* Try to fill the tree using dummy block groups so that the user can
2643
* continue to mount and grab their data.
2644
*/
2645
if (ret && btrfs_test_opt(info, IGNOREBADROOTS))
2646
ret = fill_dummy_bgs(info);
2647
return ret;
2648
}
2649
2650
/*
2651
* This function, insert_block_group_item(), belongs to the phase 2 of chunk
2652
* allocation.
2653
*
2654
* See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2655
* phases.
2656
*/
2657
static int insert_block_group_item(struct btrfs_trans_handle *trans,
2658
struct btrfs_block_group *block_group)
2659
{
2660
struct btrfs_fs_info *fs_info = trans->fs_info;
2661
struct btrfs_block_group_item bgi;
2662
struct btrfs_root *root = btrfs_block_group_root(fs_info);
2663
struct btrfs_key key;
2664
u64 old_commit_used;
2665
int ret;
2666
2667
spin_lock(&block_group->lock);
2668
btrfs_set_stack_block_group_used(&bgi, block_group->used);
2669
btrfs_set_stack_block_group_chunk_objectid(&bgi,
2670
block_group->global_root_id);
2671
btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
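/*
* Record the value we are about to persist in commit_used; if the item
* insertion fails below we restore the old value so that a later
* update_block_group_item() doesn't wrongly skip the update because
* commit_used already matches ->used.
*/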
2672
old_commit_used = block_group->commit_used;
2673
block_group->commit_used = block_group->used;
2674
key.objectid = block_group->start;
2675
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2676
key.offset = block_group->length;
2677
spin_unlock(&block_group->lock);
2678
2679
ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
2680
if (ret < 0) {
2681
spin_lock(&block_group->lock);
2682
block_group->commit_used = old_commit_used;
2683
spin_unlock(&block_group->lock);
2684
}
2685
2686
return ret;
2687
}
2688
2689
static int insert_dev_extent(struct btrfs_trans_handle *trans,
2690
const struct btrfs_device *device, u64 chunk_offset,
2691
u64 start, u64 num_bytes)
2692
{
2693
struct btrfs_fs_info *fs_info = device->fs_info;
2694
struct btrfs_root *root = fs_info->dev_root;
2695
BTRFS_PATH_AUTO_FREE(path);
2696
struct btrfs_dev_extent *extent;
2697
struct extent_buffer *leaf;
2698
struct btrfs_key key;
2699
int ret;
2700
2701
WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
2702
WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
2703
path = btrfs_alloc_path();
2704
if (!path)
2705
return -ENOMEM;
2706
2707
key.objectid = device->devid;
2708
key.type = BTRFS_DEV_EXTENT_KEY;
2709
key.offset = start;
2710
ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
2711
if (ret)
2712
return ret;
2713
2714
leaf = path->nodes[0];
2715
extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
2716
btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID);
2717
btrfs_set_dev_extent_chunk_objectid(leaf, extent,
2718
BTRFS_FIRST_CHUNK_TREE_OBJECTID);
2719
btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
2720
btrfs_set_dev_extent_length(leaf, extent, num_bytes);
2721
2722
return ret;
2723
}
2724
2725
/*
2726
* This function belongs to phase 2.
2727
*
2728
* See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2729
* phases.
2730
*/
2731
static int insert_dev_extents(struct btrfs_trans_handle *trans,
2732
u64 chunk_offset, u64 chunk_size)
2733
{
2734
struct btrfs_fs_info *fs_info = trans->fs_info;
2735
struct btrfs_device *device;
2736
struct btrfs_chunk_map *map;
2737
u64 dev_offset;
2738
int i;
2739
int ret = 0;
2740
2741
map = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
2742
if (IS_ERR(map))
2743
return PTR_ERR(map);
2744
2745
/*
2746
* Take the device list mutex to prevent races with the final phase of
2747
* a device replace operation that replaces the device object associated
2748
* with the map's stripes, because the device object's id can change
2749
* at any time during that final phase of the device replace operation
2750
* (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
2751
* replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
2752
* resulting in persisting a device extent item with such ID.
2753
*/
2754
mutex_lock(&fs_info->fs_devices->device_list_mutex);
2755
for (i = 0; i < map->num_stripes; i++) {
2756
device = map->stripes[i].dev;
2757
dev_offset = map->stripes[i].physical;
2758
2759
ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
2760
map->stripe_size);
2761
if (ret)
2762
break;
2763
}
2764
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2765
2766
btrfs_free_chunk_map(map);
2767
return ret;
2768
}
2769
2770
/*
2771
* This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
2772
* chunk allocation.
2773
*
2774
* See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2775
* phases.
2776
*/
2777
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
2778
{
2779
struct btrfs_fs_info *fs_info = trans->fs_info;
2780
struct btrfs_block_group *block_group;
2781
int ret = 0;
2782
2783
while (!list_empty(&trans->new_bgs)) {
2784
int index;
2785
2786
block_group = list_first_entry(&trans->new_bgs,
2787
struct btrfs_block_group,
2788
bg_list);
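/*
* If a previous iteration failed (and already aborted the transaction),
* skip the on-disk updates for the remaining block groups but still
* fall through to the "next" label so the list is drained and the
* references are dropped.
*/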
2789
if (ret)
2790
goto next;
2791
2792
index = btrfs_bg_flags_to_raid_index(block_group->flags);
2793
2794
ret = insert_block_group_item(trans, block_group);
2795
if (ret)
2796
btrfs_abort_transaction(trans, ret);
2797
if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
2798
&block_group->runtime_flags)) {
2799
mutex_lock(&fs_info->chunk_mutex);
2800
ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
2801
mutex_unlock(&fs_info->chunk_mutex);
2802
if (ret)
2803
btrfs_abort_transaction(trans, ret);
2804
}
2805
ret = insert_dev_extents(trans, block_group->start,
2806
block_group->length);
2807
if (ret)
2808
btrfs_abort_transaction(trans, ret);
2809
btrfs_add_block_group_free_space(trans, block_group);
2810
2811
/*
2812
* If we restriped during balance, we may have added a new raid
2813
* type, so now add the sysfs entries when it is safe to do so.
2814
* We don't have to worry about locking here as it's handled in
2815
* btrfs_sysfs_add_block_group_type.
2816
*/
2817
if (block_group->space_info->block_group_kobjs[index] == NULL)
2818
btrfs_sysfs_add_block_group_type(block_group);
2819
2820
/* Already aborted the transaction if it failed. */
2821
next:
2822
btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
2823
2824
spin_lock(&fs_info->unused_bgs_lock);
2825
list_del_init(&block_group->bg_list);
2826
clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
2827
btrfs_put_block_group(block_group);
2828
spin_unlock(&fs_info->unused_bgs_lock);
2829
2830
/*
2831
* If the block group is still unused, add it to the list of
2832
* unused block groups. The block group may have been created in
2833
* order to satisfy a space reservation, in which case the
2834
* extent allocation only happens later. But often we don't
2835
* actually need to allocate space that we previously reserved,
2836
* so the block group may become unused for a long time. For
2837
* example for metadata we generally reserve space for a worst
2838
* possible scenario, but then don't end up allocating all that
2839
* space or none at all (due to no need to COW, extent buffers
2840
* were already COWed in the current transaction and still
2841
* unwritten, tree heights lower than the maximum possible
2842
* height, etc). For data we generally reserve the exact amount
2843
* of space we are going to allocate later, the exception is
2844
* when using compression, as we must reserve space based on the
2845
* uncompressed data size, because the compression is only done
2846
* when writeback triggered and we don't know how much space we
2847
* are actually going to need, so we reserve the uncompressed
2848
* size because the data may be incompressible in the worst case.
2849
*/
2850
if (ret == 0) {
2851
bool used;
2852
2853
spin_lock(&block_group->lock);
2854
used = btrfs_is_block_group_used(block_group);
2855
spin_unlock(&block_group->lock);
2856
2857
if (!used)
2858
btrfs_mark_bg_unused(block_group);
2859
}
2860
}
2861
btrfs_trans_release_chunk_metadata(trans);
2862
}
2863
2864
/*
2865
* For extent tree v2 we use the block_group_item->chunk_offset to point at our
2866
* global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
2867
*/
2868
static u64 calculate_global_root_id(const struct btrfs_fs_info *fs_info, u64 offset)
2869
{
2870
u64 div = SZ_1G;
2871
u64 index;
2872
2873
if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
2874
return BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2875
2876
/* If we have a smaller fs (10GiB or less), index based on 128MiB instead. */
2877
if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL))
2878
div = SZ_128M;
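/*
* Hypothetical example: with nr_global_roots == 4 on a filesystem
* larger than 10GiB, a block group at offset 7GiB maps to global root
* (7GiB / 1GiB) % 4 == 3.
*/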
2879
2880
offset = div64_u64(offset, div);
2881
div64_u64_rem(offset, fs_info->nr_global_roots, &index);
2882
return index;
2883
}
2884
2885
struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
2886
struct btrfs_space_info *space_info,
2887
u64 type, u64 chunk_offset, u64 size)
2888
{
2889
struct btrfs_fs_info *fs_info = trans->fs_info;
2890
struct btrfs_block_group *cache;
2891
int ret;
2892
2893
btrfs_set_log_full_commit(trans);
2894
2895
cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
2896
if (!cache)
2897
return ERR_PTR(-ENOMEM);
2898
2899
/*
2900
* Mark it as new before adding it to the rbtree of block groups or any
2901
* list, so that no other task finds it and calls btrfs_mark_bg_unused()
2902
* before the new flag is set.
2903
*/
2904
set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags);
2905
2906
cache->length = size;
2907
btrfs_set_free_space_tree_thresholds(cache);
2908
cache->flags = type;
2909
cache->cached = BTRFS_CACHE_FINISHED;
2910
cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
2911
2912
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
2913
set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags);
2914
2915
ret = btrfs_load_block_group_zone_info(cache, true);
2916
if (ret) {
2917
btrfs_put_block_group(cache);
2918
return ERR_PTR(ret);
2919
}
2920
2921
ret = exclude_super_stripes(cache);
2922
if (ret) {
2923
/* We may have excluded something, so call this just in case */
2924
btrfs_free_excluded_extents(cache);
2925
btrfs_put_block_group(cache);
2926
return ERR_PTR(ret);
2927
}
2928
2929
ret = btrfs_add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL);
2930
btrfs_free_excluded_extents(cache);
2931
if (ret) {
2932
btrfs_put_block_group(cache);
2933
return ERR_PTR(ret);
2934
}
2935
2936
/*
2937
* Ensure the corresponding space_info object is created and
2938
* assigned to our block group. We want our bg to be added to the rbtree
2939
* with its ->space_info set.
2940
*/
2941
cache->space_info = space_info;
2942
ASSERT(cache->space_info);
2943
2944
ret = btrfs_add_block_group_cache(cache);
2945
if (ret) {
2946
btrfs_remove_free_space_cache(cache);
2947
btrfs_put_block_group(cache);
2948
return ERR_PTR(ret);
2949
}
2950
2951
/*
2952
* Now that our block group has its ->space_info set and is inserted in
2953
* the rbtree, update the space info's counters.
2954
*/
2955
trace_btrfs_add_block_group(fs_info, cache, 1);
2956
btrfs_add_bg_to_space_info(fs_info, cache);
2957
btrfs_update_global_block_rsv(fs_info);
2958
2959
#ifdef CONFIG_BTRFS_DEBUG
2960
if (btrfs_should_fragment_free_space(cache)) {
2961
cache->space_info->bytes_used += size >> 1;
2962
fragment_free_space(cache);
2963
}
2964
#endif
2965
2966
btrfs_link_bg_list(cache, &trans->new_bgs);
2967
btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info);
2968
2969
set_avail_alloc_bits(fs_info, type);
2970
return cache;
2971
}
2972
2973
/*
2974
* Mark one block group RO, can be called several times for the same block
2975
* group.
2976
*
2977
* @cache: the destination block group
2978
* @do_chunk_alloc: whether need to do chunk pre-allocation, this is to
2979
* ensure we still have some free space after marking this
2980
* block group RO.
2981
*/
2982
int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
2983
bool do_chunk_alloc)
2984
{
2985
struct btrfs_fs_info *fs_info = cache->fs_info;
2986
struct btrfs_space_info *space_info = cache->space_info;
2987
struct btrfs_trans_handle *trans;
2988
struct btrfs_root *root = btrfs_block_group_root(fs_info);
2989
u64 alloc_flags;
2990
int ret;
2991
bool dirty_bg_running;
2992
2993
/*
2994
* This can only happen when we are doing read-only scrub on read-only
2995
* mount.
2996
* In that case we should not start a new transaction on read-only fs.
2997
* Thus here we skip all chunk allocations.
2998
*/
2999
if (sb_rdonly(fs_info->sb)) {
3000
mutex_lock(&fs_info->ro_block_group_mutex);
3001
ret = inc_block_group_ro(cache, 0);
3002
mutex_unlock(&fs_info->ro_block_group_mutex);
3003
return ret;
3004
}
3005
3006
do {
3007
trans = btrfs_join_transaction(root);
3008
if (IS_ERR(trans))
3009
return PTR_ERR(trans);
3010
3011
dirty_bg_running = false;
3012
3013
/*
3014
* We're not allowed to set block groups readonly after the dirty
3015
* block group cache has started writing. If it already started,
3016
* back off and let this transaction commit.
3017
*/
3018
mutex_lock(&fs_info->ro_block_group_mutex);
3019
if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
3020
u64 transid = trans->transid;
3021
3022
mutex_unlock(&fs_info->ro_block_group_mutex);
3023
btrfs_end_transaction(trans);
3024
3025
ret = btrfs_wait_for_commit(fs_info, transid);
3026
if (ret)
3027
return ret;
3028
dirty_bg_running = true;
3029
}
3030
} while (dirty_bg_running);
3031
3032
if (do_chunk_alloc) {
3033
/*
3034
* If we are changing raid levels, try to allocate a
3035
* corresponding block group with the new raid level.
3036
*/
3037
alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
3038
if (alloc_flags != cache->flags) {
3039
ret = btrfs_chunk_alloc(trans, space_info, alloc_flags,
3040
CHUNK_ALLOC_FORCE);
3041
/*
3042
* ENOSPC is allowed here, we may have enough space
3043
* already allocated at the new raid level to carry on
3044
*/
3045
if (ret == -ENOSPC)
3046
ret = 0;
3047
if (ret < 0)
3048
goto out;
3049
}
3050
}
3051
3052
ret = inc_block_group_ro(cache, 0);
3053
if (!ret)
3054
goto out;
3055
if (ret == -ETXTBSY)
3056
goto unlock_out;
3057
3058
/*
3059
* Skip chunk allocation if the bg is SYSTEM, this is to avoid system
3060
* chunk allocation storm to exhaust the system chunk array. Otherwise
3061
* we still want to try our best to mark the block group read-only.
3062
*/
3063
if (!do_chunk_alloc && ret == -ENOSPC &&
3064
(cache->flags & BTRFS_BLOCK_GROUP_SYSTEM))
3065
goto unlock_out;
3066
3067
alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags);
3068
ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
3069
if (ret < 0)
3070
goto out;
3071
/*
3072
* We have allocated a new chunk. We also need to activate that chunk to
3073
* grant metadata tickets for zoned filesystem.
3074
*/
3075
ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true);
3076
if (ret < 0)
3077
goto out;
3078
3079
ret = inc_block_group_ro(cache, 0);
3080
if (ret == -ETXTBSY)
3081
goto unlock_out;
3082
out:
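/*
* Marking a system block group read-only reduces the allocatable system
* space, so re-run check_system_chunk() here; it reserves space in the
* SYSTEM space_info and may allocate a new system chunk if what is left
* looks insufficient for upcoming chunk tree modifications.
*/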
3083
if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
3084
alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
3085
mutex_lock(&fs_info->chunk_mutex);
3086
check_system_chunk(trans, alloc_flags);
3087
mutex_unlock(&fs_info->chunk_mutex);
3088
}
3089
unlock_out:
3090
mutex_unlock(&fs_info->ro_block_group_mutex);
3091
3092
btrfs_end_transaction(trans);
3093
return ret;
3094
}
3095
3096
void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
3097
{
3098
struct btrfs_space_info *sinfo = cache->space_info;
3099
u64 num_bytes;
3100
3101
BUG_ON(!cache->ro);
3102
3103
spin_lock(&sinfo->lock);
3104
spin_lock(&cache->lock);
3105
if (!--cache->ro) {
3106
if (btrfs_is_zoned(cache->fs_info)) {
3107
/* Migrate zone_unusable bytes back */
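/*
* Everything below the zone write pointer (alloc_offset) that is not
* currently used, pinned or reserved can only be reused after a zone
* reset, and the tail beyond zone_capacity is never usable, so both
* become zone_unusable again once the group leaves read-only mode.
*/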
3108
cache->zone_unusable =
3109
(cache->alloc_offset - cache->used - cache->pinned -
3110
cache->reserved) +
3111
(cache->length - cache->zone_capacity);
3112
btrfs_space_info_update_bytes_zone_unusable(sinfo, cache->zone_unusable);
3113
sinfo->bytes_readonly -= cache->zone_unusable;
3114
}
3115
num_bytes = cache->length - cache->reserved -
3116
cache->pinned - cache->bytes_super -
3117
cache->zone_unusable - cache->used;
3118
sinfo->bytes_readonly -= num_bytes;
3119
list_del_init(&cache->ro_list);
3120
}
3121
spin_unlock(&cache->lock);
3122
spin_unlock(&sinfo->lock);
3123
}
3124
3125
static int update_block_group_item(struct btrfs_trans_handle *trans,
3126
struct btrfs_path *path,
3127
struct btrfs_block_group *cache)
3128
{
3129
struct btrfs_fs_info *fs_info = trans->fs_info;
3130
int ret;
3131
struct btrfs_root *root = btrfs_block_group_root(fs_info);
3132
unsigned long bi;
3133
struct extent_buffer *leaf;
3134
struct btrfs_block_group_item bgi;
3135
struct btrfs_key key;
3136
u64 old_commit_used;
3137
u64 used;
3138
3139
/*
3140
* Block group item updates can be triggered outside of the transaction
* commit critical section, thus we need a consistent view of the used
* bytes. We cannot use cache->used directly outside of the spin lock,
* as it may change.
3144
*/
3145
spin_lock(&cache->lock);
3146
old_commit_used = cache->commit_used;
3147
used = cache->used;
3148
/* No change in used bytes, can safely skip it. */
3149
if (cache->commit_used == used) {
3150
spin_unlock(&cache->lock);
3151
return 0;
3152
}
3153
cache->commit_used = used;
3154
spin_unlock(&cache->lock);
3155
3156
key.objectid = cache->start;
3157
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
3158
key.offset = cache->length;
3159
3160
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3161
if (ret) {
3162
if (ret > 0)
3163
ret = -ENOENT;
3164
goto fail;
3165
}
3166
3167
leaf = path->nodes[0];
3168
bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3169
btrfs_set_stack_block_group_used(&bgi, used);
3170
btrfs_set_stack_block_group_chunk_objectid(&bgi,
3171
cache->global_root_id);
3172
btrfs_set_stack_block_group_flags(&bgi, cache->flags);
3173
write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
3174
fail:
3175
btrfs_release_path(path);
3176
/*
3177
* We didn't update the block group item, so we need to revert commit_used
* unless the block group item didn't exist yet - this is to prevent a
* race with a concurrent insertion of the block group item, with
* insert_block_group_item(), that happened just after we attempted to
* update. In that case we would reset commit_used to 0 just after the
* insertion set it to a value greater than 0 - if the block group later
* ends up with 0 used bytes, we would incorrectly skip its update.
3184
*/
3185
if (ret < 0 && ret != -ENOENT) {
3186
spin_lock(&cache->lock);
3187
cache->commit_used = old_commit_used;
3188
spin_unlock(&cache->lock);
3189
}
3190
return ret;
3191
3192
}
3193
3194
static int cache_save_setup(struct btrfs_block_group *block_group,
3195
struct btrfs_trans_handle *trans,
3196
struct btrfs_path *path)
3197
{
3198
struct btrfs_fs_info *fs_info = block_group->fs_info;
3199
struct inode *inode = NULL;
3200
struct extent_changeset *data_reserved = NULL;
3201
u64 alloc_hint = 0;
3202
int dcs = BTRFS_DC_ERROR;
3203
u64 cache_size = 0;
3204
int retries = 0;
3205
int ret = 0;
3206
3207
if (!btrfs_test_opt(fs_info, SPACE_CACHE))
3208
return 0;
3209
3210
/*
3211
* If this block group is smaller than 100MiB, don't bother caching it.
3213
*/
3214
if (block_group->length < (100 * SZ_1M)) {
3215
spin_lock(&block_group->lock);
3216
block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3217
spin_unlock(&block_group->lock);
3218
return 0;
3219
}
3220
3221
if (TRANS_ABORTED(trans))
3222
return 0;
3223
again:
3224
inode = lookup_free_space_inode(block_group, path);
3225
if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3226
ret = PTR_ERR(inode);
3227
btrfs_release_path(path);
3228
goto out;
3229
}
3230
3231
if (IS_ERR(inode)) {
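/*
* Only -ENOENT can reach this point (any other error was returned
* above): the free space inode doesn't exist yet, so create it and
* retry the lookup exactly once.
*/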
3232
BUG_ON(retries);
3233
retries++;
3234
3235
if (block_group->ro)
3236
goto out_free;
3237
3238
ret = create_free_space_inode(trans, block_group, path);
3239
if (ret)
3240
goto out_free;
3241
goto again;
3242
}
3243
3244
/*
3245
* We want to set the generation to 0 so that if anything goes wrong
* from here on out we know not to trust this cache when we load it up
* next time.
3248
*/
3249
BTRFS_I(inode)->generation = 0;
3250
ret = btrfs_update_inode(trans, BTRFS_I(inode));
3251
if (unlikely(ret)) {
3252
/*
3253
* So theoretically we could recover from this by simply setting the
* super cache generation to 0 so we know to invalidate the
* cache, but then we'd have to keep track of the block groups
* that fail this way so we know we _have_ to reset this cache
* before the next commit or risk reading stale cache. So to
* limit our exposure to horrible edge cases, let's just abort the
* transaction; this only happens in really bad situations
* anyway.
3261
*/
3262
btrfs_abort_transaction(trans, ret);
3263
goto out_put;
3264
}
3265
WARN_ON(ret);
3266
3267
/* We've already setup this transaction, go ahead and exit */
3268
if (block_group->cache_generation == trans->transid &&
3269
i_size_read(inode)) {
3270
dcs = BTRFS_DC_SETUP;
3271
goto out_put;
3272
}
3273
3274
if (i_size_read(inode) > 0) {
3275
ret = btrfs_check_trunc_cache_free_space(fs_info,
3276
&fs_info->global_block_rsv);
3277
if (ret)
3278
goto out_put;
3279
3280
ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
3281
if (ret)
3282
goto out_put;
3283
}
3284
3285
spin_lock(&block_group->lock);
3286
if (block_group->cached != BTRFS_CACHE_FINISHED ||
3287
!btrfs_test_opt(fs_info, SPACE_CACHE)) {
3288
/*
3289
* don't bother trying to write stuff out _if_
3290
* a) we're not cached,
3291
* b) we're with nospace_cache mount option,
3292
* c) we're with v2 space_cache (FREE_SPACE_TREE).
3293
*/
3294
dcs = BTRFS_DC_WRITTEN;
3295
spin_unlock(&block_group->lock);
3296
goto out_put;
3297
}
3298
spin_unlock(&block_group->lock);
3299
3300
/*
3301
* We hit an ENOSPC when setting up the cache in this transaction, just
3302
* skip doing the setup, we've already cleared the cache so we're safe.
3303
*/
3304
if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3305
ret = -ENOSPC;
3306
goto out_put;
3307
}
3308
3309
/*
3310
* Try to preallocate enough space based on how big the block group is.
3311
* Keep in mind this has to include any pinned space which could end up
3312
* taking up quite a bit since it's not folded into the other space
3313
* cache.
3314
*/
3315
cache_size = div_u64(block_group->length, SZ_256M);
3316
if (!cache_size)
3317
cache_size = 1;
3318
3319
cache_size *= 16;
3320
cache_size *= fs_info->sectorsize;
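/*
* I.e. 16 sectors of cache space per 256MiB of block group: for a
* hypothetical 1GiB block group with 4KiB sectors that is
* 4 * 16 * 4KiB = 256KiB preallocated for the free space cache file.
*/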
3321
3322
ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
3323
cache_size, false);
3324
if (ret)
3325
goto out_put;
3326
3327
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size,
3328
cache_size, cache_size,
3329
&alloc_hint);
3330
/*
3331
* Our cache requires contiguous chunks so that we don't modify a bunch
3332
* of metadata or split extents when writing the cache out, which means
3333
* we can hit ENOSPC if we are heavily fragmented, in addition to the
* normal out of space conditions. So if we hit this, just skip setting
* up any other block groups for this transaction; maybe we'll unpin
* enough space the next time around.
3337
*/
3338
if (!ret)
3339
dcs = BTRFS_DC_SETUP;
3340
else if (ret == -ENOSPC)
3341
set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3342
3343
out_put:
3344
iput(inode);
3345
out_free:
3346
btrfs_release_path(path);
3347
out:
3348
spin_lock(&block_group->lock);
3349
if (!ret && dcs == BTRFS_DC_SETUP)
3350
block_group->cache_generation = trans->transid;
3351
block_group->disk_cache_state = dcs;
3352
spin_unlock(&block_group->lock);
3353
3354
extent_changeset_free(data_reserved);
3355
return ret;
3356
}
3357
3358
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
3359
{
3360
struct btrfs_fs_info *fs_info = trans->fs_info;
3361
struct btrfs_block_group *cache, *tmp;
3362
struct btrfs_transaction *cur_trans = trans->transaction;
3363
BTRFS_PATH_AUTO_FREE(path);
3364
3365
if (list_empty(&cur_trans->dirty_bgs) ||
3366
!btrfs_test_opt(fs_info, SPACE_CACHE))
3367
return 0;
3368
3369
path = btrfs_alloc_path();
3370
if (!path)
3371
return -ENOMEM;
3372
3373
/* Could add new block groups, use _safe just in case */
3374
list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
3375
dirty_list) {
3376
if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3377
cache_save_setup(cache, trans, path);
3378
}
3379
3380
return 0;
3381
}
3382
3383
/*
3384
* Transaction commit does final block group cache writeback during a critical
3385
* section where nothing is allowed to change the FS. This is required in
3386
* order for the cache to actually match the block group, but can introduce a
3387
* lot of latency into the commit.
3388
*
3389
* So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
3390
* There's a chance we'll have to redo some of it if the block group changes
3391
* again during the commit, but it greatly reduces the commit latency by
3392
* getting rid of the easy block groups while we're still allowing others to
3393
* join the commit.
3394
*/
3395
int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
3396
{
3397
struct btrfs_fs_info *fs_info = trans->fs_info;
3398
struct btrfs_block_group *cache;
3399
struct btrfs_transaction *cur_trans = trans->transaction;
3400
int ret = 0;
3401
int should_put;
3402
BTRFS_PATH_AUTO_FREE(path);
3403
LIST_HEAD(dirty);
3404
struct list_head *io = &cur_trans->io_bgs;
3405
int loops = 0;
3406
3407
spin_lock(&cur_trans->dirty_bgs_lock);
3408
if (list_empty(&cur_trans->dirty_bgs)) {
3409
spin_unlock(&cur_trans->dirty_bgs_lock);
3410
return 0;
3411
}
3412
list_splice_init(&cur_trans->dirty_bgs, &dirty);
3413
spin_unlock(&cur_trans->dirty_bgs_lock);
3414
3415
again:
3416
/* Make sure all the block groups on our dirty list actually exist */
3417
btrfs_create_pending_block_groups(trans);
3418
3419
if (!path) {
3420
path = btrfs_alloc_path();
3421
if (!path) {
3422
ret = -ENOMEM;
3423
goto out;
3424
}
3425
}
3426
3427
/*
3428
* cache_write_mutex is here only to save us from balance or automatic
3429
* removal of empty block groups deleting this block group while we are
3430
* writing out the cache
3431
*/
3432
mutex_lock(&trans->transaction->cache_write_mutex);
3433
while (!list_empty(&dirty)) {
3434
bool drop_reserve = true;
3435
3436
cache = list_first_entry(&dirty, struct btrfs_block_group,
3437
dirty_list);
3438
/*
3439
* This can happen if something re-dirties a block group that
3440
* is already under IO. Just wait for it to finish and then do
3441
* it all again
3442
*/
3443
if (!list_empty(&cache->io_list)) {
3444
list_del_init(&cache->io_list);
3445
btrfs_wait_cache_io(trans, cache, path);
3446
btrfs_put_block_group(cache);
3447
}
3448
3449
3450
/*
3451
* btrfs_wait_cache_io uses the cache->dirty_list to decide if
3452
* it should update the cache_state. Don't delete until after
3453
* we wait.
3454
*
3455
* Since we're not running in the commit critical section
3456
* we need the dirty_bgs_lock to protect from update_block_group
3457
*/
3458
spin_lock(&cur_trans->dirty_bgs_lock);
3459
list_del_init(&cache->dirty_list);
3460
spin_unlock(&cur_trans->dirty_bgs_lock);
3461
3462
should_put = 1;
3463
3464
cache_save_setup(cache, trans, path);
3465
3466
if (cache->disk_cache_state == BTRFS_DC_SETUP) {
3467
cache->io_ctl.inode = NULL;
3468
ret = btrfs_write_out_cache(trans, cache, path);
3469
if (ret == 0 && cache->io_ctl.inode) {
3470
should_put = 0;
3471
3472
/*
* The cache_write_mutex protects the io_list; see the
* definition of btrfs_transaction::io_bgs for more details.
*/
3477
list_add_tail(&cache->io_list, io);
3478
} else {
3479
/*
3480
* If we failed to write the cache, the
3481
* generation will be bad and life goes on
3482
*/
3483
ret = 0;
3484
}
3485
}
3486
if (!ret) {
3487
ret = update_block_group_item(trans, path, cache);
3488
/*
3489
* Our block group might still be attached to the list
3490
* of new block groups in the transaction handle of some
3491
* other task (struct btrfs_trans_handle->new_bgs). This
3492
* means its block group item isn't yet in the extent
3493
* tree. If this happens ignore the error, as we will
3494
* try again later in the critical section of the
3495
* transaction commit.
3496
*/
3497
if (ret == -ENOENT) {
3498
ret = 0;
3499
spin_lock(&cur_trans->dirty_bgs_lock);
3500
if (list_empty(&cache->dirty_list)) {
3501
list_add_tail(&cache->dirty_list,
3502
&cur_trans->dirty_bgs);
3503
btrfs_get_block_group(cache);
3504
drop_reserve = false;
3505
}
3506
spin_unlock(&cur_trans->dirty_bgs_lock);
3507
} else if (ret) {
3508
btrfs_abort_transaction(trans, ret);
3509
}
3510
}
3511
3512
/* If it's not on the io list, we need to put the block group */
3513
if (should_put)
3514
btrfs_put_block_group(cache);
3515
if (drop_reserve)
3516
btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
3517
/*
3518
* Avoid blocking other tasks for too long. It might even save
3519
* us from writing caches for block groups that are going to be
3520
* removed.
3521
*/
3522
mutex_unlock(&trans->transaction->cache_write_mutex);
3523
if (ret)
3524
goto out;
3525
mutex_lock(&trans->transaction->cache_write_mutex);
3526
}
3527
mutex_unlock(&trans->transaction->cache_write_mutex);
3528
3529
/*
3530
* Go through delayed refs for all the stuff we've just kicked off
3531
* and then loop back (just once)
3532
*/
3533
if (!ret)
3534
ret = btrfs_run_delayed_refs(trans, 0);
3535
if (!ret && loops == 0) {
3536
loops++;
3537
spin_lock(&cur_trans->dirty_bgs_lock);
3538
list_splice_init(&cur_trans->dirty_bgs, &dirty);
3539
/*
3540
* dirty_bgs_lock protects us from concurrent block group
3541
* deletes too (not just cache_write_mutex).
3542
*/
3543
if (!list_empty(&dirty)) {
3544
spin_unlock(&cur_trans->dirty_bgs_lock);
3545
goto again;
3546
}
3547
spin_unlock(&cur_trans->dirty_bgs_lock);
3548
}
3549
out:
3550
if (ret < 0) {
3551
spin_lock(&cur_trans->dirty_bgs_lock);
3552
list_splice_init(&dirty, &cur_trans->dirty_bgs);
3553
spin_unlock(&cur_trans->dirty_bgs_lock);
3554
btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
3555
}
3556
3557
return ret;
3558
}
3559
3560
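/*
 * Write out all remaining dirty block groups during the critical section of
 * the transaction commit: set up the space caches, run delayed refs, write
 * the caches and block group items, then wait for the cache IO started here
 * or earlier by btrfs_start_dirty_block_groups().
 *
 * A minimal sketch of the intended ordering, based on the comments above
 * btrfs_start_dirty_block_groups() (not a verbatim copy of the commit path):
 *
 *    btrfs_start_dirty_block_groups(trans);   (before the critical section,
 *                                              other tasks may still join)
 *    btrfs_write_dirty_block_groups(trans);   (inside the critical section)
 */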
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
3561
{
3562
struct btrfs_fs_info *fs_info = trans->fs_info;
3563
struct btrfs_block_group *cache;
3564
struct btrfs_transaction *cur_trans = trans->transaction;
3565
int ret = 0;
3566
int should_put;
3567
BTRFS_PATH_AUTO_FREE(path);
3568
struct list_head *io = &cur_trans->io_bgs;
3569
3570
path = btrfs_alloc_path();
3571
if (!path)
3572
return -ENOMEM;
3573
3574
/*
3575
* Even though we are in the critical section of the transaction commit,
3576
* we can still have concurrent tasks adding elements to this
3577
* transaction's list of dirty block groups. These tasks correspond to
3578
* endio free space workers started when writeback finishes for a
3579
* space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3580
* allocate new block groups as a result of COWing nodes of the root
3581
* tree when updating the free space inode. The writeback for the space
3582
* caches is triggered by an earlier call to
3583
* btrfs_start_dirty_block_groups() and iterations of the following
3584
* loop.
3585
* Also we want to do the cache_save_setup first and then run the
3586
* delayed refs to make sure we have the best chance at doing this all
3587
* in one shot.
3588
*/
3589
spin_lock(&cur_trans->dirty_bgs_lock);
3590
while (!list_empty(&cur_trans->dirty_bgs)) {
3591
cache = list_first_entry(&cur_trans->dirty_bgs,
3592
struct btrfs_block_group,
3593
dirty_list);
3594
3595
/*
3596
* This can happen if cache_save_setup re-dirties a block group
3597
* that is already under IO. Just wait for it to finish and
3598
* then do it all again
3599
*/
3600
if (!list_empty(&cache->io_list)) {
3601
spin_unlock(&cur_trans->dirty_bgs_lock);
3602
list_del_init(&cache->io_list);
3603
btrfs_wait_cache_io(trans, cache, path);
3604
btrfs_put_block_group(cache);
3605
spin_lock(&cur_trans->dirty_bgs_lock);
3606
}
3607
3608
/*
3609
* Don't remove from the dirty list until after we've waited on
3610
* any pending IO
3611
*/
3612
list_del_init(&cache->dirty_list);
3613
spin_unlock(&cur_trans->dirty_bgs_lock);
3614
should_put = 1;
3615
3616
cache_save_setup(cache, trans, path);
3617
3618
if (!ret)
3619
ret = btrfs_run_delayed_refs(trans, U64_MAX);
3620
3621
if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3622
cache->io_ctl.inode = NULL;
3623
ret = btrfs_write_out_cache(trans, cache, path);
3624
if (ret == 0 && cache->io_ctl.inode) {
3625
should_put = 0;
3626
list_add_tail(&cache->io_list, io);
3627
} else {
3628
/*
3629
* If we failed to write the cache, the
3630
* generation will be bad and life goes on
3631
*/
3632
ret = 0;
3633
}
3634
}
3635
if (!ret) {
3636
ret = update_block_group_item(trans, path, cache);
3637
/*
3638
* One of the free space endio workers might have
3639
* created a new block group while updating a free space
3640
* cache's inode (at inode.c:btrfs_finish_ordered_io())
3641
* and hasn't released its transaction handle yet, in
3642
* which case the new block group is still attached to
3643
* its transaction handle and its creation has not
3644
* finished yet (no block group item in the extent tree
3645
* yet, etc). If this is the case, wait for all free
3646
* space endio workers to finish and retry. This is a
3647
* very rare case so no need for a more efficient and
3648
* complex approach.
3649
*/
3650
if (ret == -ENOENT) {
3651
wait_event(cur_trans->writer_wait,
3652
atomic_read(&cur_trans->num_writers) == 1);
3653
ret = update_block_group_item(trans, path, cache);
3654
if (ret)
3655
btrfs_abort_transaction(trans, ret);
3656
} else if (ret) {
3657
btrfs_abort_transaction(trans, ret);
3658
}
3659
}
3660
3661
/* If it's not on the io list, we need to put the block group */
3662
if (should_put)
3663
btrfs_put_block_group(cache);
3664
btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
3665
spin_lock(&cur_trans->dirty_bgs_lock);
3666
}
3667
spin_unlock(&cur_trans->dirty_bgs_lock);
3668
3669
/*
* Refer to the definition of the io_bgs member for details on why it's
* safe to use it without any locking.
*/
3673
while (!list_empty(io)) {
3674
cache = list_first_entry(io, struct btrfs_block_group,
3675
io_list);
3676
list_del_init(&cache->io_list);
3677
btrfs_wait_cache_io(trans, cache, path);
3678
btrfs_put_block_group(cache);
3679
}
3680
3681
return ret;
3682
}
3683
3684
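/*
 * Update the used-bytes accounting for the block group containing the range
 * [bytenr, bytenr + num_bytes) after an allocation (@alloc == true) or a
 * free (@alloc == false). Freed space is pinned until the transaction
 * commits. The block group is added to the transaction's dirty list and, if
 * it became empty, may be queued as unused or for reclaim.
 */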
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
3685
u64 bytenr, u64 num_bytes, bool alloc)
3686
{
3687
struct btrfs_fs_info *info = trans->fs_info;
3688
struct btrfs_space_info *space_info;
3689
struct btrfs_block_group *cache;
3690
u64 old_val;
3691
bool reclaim = false;
3692
bool bg_already_dirty = true;
3693
int factor;
3694
3695
/* Block accounting for super block */
3696
spin_lock(&info->delalloc_root_lock);
3697
old_val = btrfs_super_bytes_used(info->super_copy);
3698
if (alloc)
3699
old_val += num_bytes;
3700
else
3701
old_val -= num_bytes;
3702
btrfs_set_super_bytes_used(info->super_copy, old_val);
3703
spin_unlock(&info->delalloc_root_lock);
3704
3705
cache = btrfs_lookup_block_group(info, bytenr);
3706
if (!cache)
3707
return -ENOENT;
3708
3709
/* An extent can not span multiple block groups. */
3710
ASSERT(bytenr + num_bytes <= cache->start + cache->length);
3711
3712
space_info = cache->space_info;
3713
factor = btrfs_bg_type_to_factor(cache->flags);
3714
3715
/*
3716
* If this block group has free space cache written out, we need to make
3717
* sure to load it if we are removing space. This is because we need
3718
* the unpinning stage to actually add the space back to the block group,
3719
* otherwise we will leak space.
3720
*/
3721
if (!alloc && !btrfs_block_group_done(cache))
3722
btrfs_cache_block_group(cache, true);
3723
3724
spin_lock(&space_info->lock);
3725
spin_lock(&cache->lock);
3726
3727
if (btrfs_test_opt(info, SPACE_CACHE) &&
3728
cache->disk_cache_state < BTRFS_DC_CLEAR)
3729
cache->disk_cache_state = BTRFS_DC_CLEAR;
3730
3731
old_val = cache->used;
3732
if (alloc) {
3733
old_val += num_bytes;
3734
cache->used = old_val;
3735
cache->reserved -= num_bytes;
3736
cache->reclaim_mark = 0;
3737
space_info->bytes_reserved -= num_bytes;
3738
space_info->bytes_used += num_bytes;
3739
space_info->disk_used += num_bytes * factor;
3740
if (READ_ONCE(space_info->periodic_reclaim))
3741
btrfs_space_info_update_reclaimable(space_info, -num_bytes);
3742
spin_unlock(&cache->lock);
3743
spin_unlock(&space_info->lock);
3744
} else {
3745
old_val -= num_bytes;
3746
cache->used = old_val;
3747
cache->pinned += num_bytes;
3748
btrfs_space_info_update_bytes_pinned(space_info, num_bytes);
3749
space_info->bytes_used -= num_bytes;
3750
space_info->disk_used -= num_bytes * factor;
3751
if (READ_ONCE(space_info->periodic_reclaim))
3752
btrfs_space_info_update_reclaimable(space_info, num_bytes);
3753
else
3754
reclaim = should_reclaim_block_group(cache, num_bytes);
3755
3756
spin_unlock(&cache->lock);
3757
spin_unlock(&space_info->lock);
3758
3759
btrfs_set_extent_bit(&trans->transaction->pinned_extents, bytenr,
3760
bytenr + num_bytes - 1, EXTENT_DIRTY, NULL);
3761
}
3762
3763
spin_lock(&trans->transaction->dirty_bgs_lock);
3764
if (list_empty(&cache->dirty_list)) {
3765
list_add_tail(&cache->dirty_list, &trans->transaction->dirty_bgs);
3766
bg_already_dirty = false;
3767
btrfs_get_block_group(cache);
3768
}
3769
spin_unlock(&trans->transaction->dirty_bgs_lock);
3770
3771
/*
3772
* No longer have used bytes in this block group, queue it for deletion.
3773
* We do this after adding the block group to the dirty list to avoid
3774
* races between cleaner kthread and space cache writeout.
3775
*/
3776
if (!alloc && old_val == 0) {
3777
if (!btrfs_test_opt(info, DISCARD_ASYNC))
3778
btrfs_mark_bg_unused(cache);
3779
} else if (!alloc && reclaim) {
3780
btrfs_mark_bg_to_reclaim(cache);
3781
}
3782
3783
btrfs_put_block_group(cache);
3784
3785
/* Modified block groups are accounted for in the delayed_refs_rsv. */
3786
if (!bg_already_dirty)
3787
btrfs_inc_delayed_refs_rsv_bg_updates(info);
3788
3789
return 0;
3790
}
3791
3792
/*
3793
* Update the block_group and space info counters.
3794
*
3795
* @cache: The cache we are manipulating
3796
* @ram_bytes: The number of bytes of file content; the same as
* @num_bytes except for the compressed path.
3798
* @num_bytes: The number of bytes in question
3799
* @delalloc: The blocks are allocated for the delalloc write
3800
*
3801
* This is called by the allocator when it reserves space. If this is a
3802
* reservation and the block group has become read-only, we cannot make the
* reservation and return -EAGAIN; otherwise this function always succeeds.
3804
*/
3805
int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
3806
u64 ram_bytes, u64 num_bytes, int delalloc,
3807
bool force_wrong_size_class)
3808
{
3809
struct btrfs_space_info *space_info = cache->space_info;
3810
enum btrfs_block_group_size_class size_class;
3811
int ret = 0;
3812
3813
spin_lock(&space_info->lock);
3814
spin_lock(&cache->lock);
3815
if (cache->ro) {
3816
ret = -EAGAIN;
3817
goto out;
3818
}
3819
3820
if (btrfs_block_group_should_use_size_class(cache)) {
3821
size_class = btrfs_calc_block_group_size_class(num_bytes);
3822
ret = btrfs_use_block_group_size_class(cache, size_class, force_wrong_size_class);
3823
if (ret)
3824
goto out;
3825
}
3826
cache->reserved += num_bytes;
3827
space_info->bytes_reserved += num_bytes;
3828
trace_btrfs_space_reservation(cache->fs_info, "space_info",
3829
space_info->flags, num_bytes, 1);
3830
btrfs_space_info_update_bytes_may_use(space_info, -ram_bytes);
3831
if (delalloc)
3832
cache->delalloc_bytes += num_bytes;
3833
3834
/*
3835
* Compression can use less space than we reserved, so wake tickets if
3836
* that happens.
3837
*/
3838
if (num_bytes < ram_bytes)
3839
btrfs_try_granting_tickets(cache->fs_info, space_info);
3840
out:
3841
spin_unlock(&cache->lock);
3842
spin_unlock(&space_info->lock);
3843
return ret;
3844
}
3845
3846
/*
3847
* Update the block_group and space info counters.
3848
*
3849
* @cache: The cache we are manipulating.
3850
* @num_bytes: The number of bytes in question.
3851
* @is_delalloc: Whether the blocks are allocated for a delalloc write.
3852
*
3853
* This is called by somebody who is freeing space that was never actually used
3854
* on disk. For example if you reserve some space for a new leaf in transaction
3855
* A and, before transaction A commits, you free that leaf, you call this to
* clear the reservation.
3857
*/
3858
void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes,
3859
bool is_delalloc)
3860
{
3861
struct btrfs_space_info *space_info = cache->space_info;
3862
3863
spin_lock(&space_info->lock);
3864
spin_lock(&cache->lock);
3865
if (cache->ro)
3866
space_info->bytes_readonly += num_bytes;
3867
else if (btrfs_is_zoned(cache->fs_info))
3868
space_info->bytes_zone_unusable += num_bytes;
3869
cache->reserved -= num_bytes;
3870
space_info->bytes_reserved -= num_bytes;
3871
space_info->max_extent_size = 0;
3872
3873
if (is_delalloc)
3874
cache->delalloc_bytes -= num_bytes;
3875
spin_unlock(&cache->lock);
3876
3877
btrfs_try_granting_tickets(cache->fs_info, space_info);
3878
spin_unlock(&space_info->lock);
3879
}
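/*
 * A minimal illustration of how the two helpers above pair up (a sketch,
 * not copied from a specific caller, error handling elided): the allocator
 * reserves space in a chosen block group and, if the reserved extent ends
 * up not being used, releases the reservation again:
 *
 *    ret = btrfs_add_reserved_bytes(cache, ram_bytes, num_bytes,
 *                                   delalloc, false);
 *    ...
 *    btrfs_free_reserved_bytes(cache, num_bytes, delalloc);
 */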
3880
3881
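/*
 * Mark all metadata space_infos so that the next allocation attempt forces
 * a new metadata chunk (CHUNK_ALLOC_FORCE). Used to keep a minimum ratio of
 * metadata to data chunks when the metadata_ratio mount option is set.
 */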
static void force_metadata_allocation(struct btrfs_fs_info *info)
3882
{
3883
struct list_head *head = &info->space_info;
3884
struct btrfs_space_info *found;
3885
3886
list_for_each_entry(found, head, list) {
3887
if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3888
found->force_alloc = CHUNK_ALLOC_FORCE;
3889
}
3890
}
3891
3892
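/*
 * Decide whether a new chunk should be allocated for @sinfo: always for
 * CHUNK_ALLOC_FORCE, for CHUNK_ALLOC_LIMITED when less than
 * max(64M, 1% of the filesystem size) is unused in @sinfo, and otherwise
 * only once roughly 80% of @sinfo is used.
 */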
static bool should_alloc_chunk(const struct btrfs_fs_info *fs_info,
3893
const struct btrfs_space_info *sinfo, int force)
3894
{
3895
u64 bytes_used = btrfs_space_info_used(sinfo, false);
3896
u64 thresh;
3897
3898
if (force == CHUNK_ALLOC_FORCE)
3899
return true;
3900
3901
/*
3902
* in limited mode, we want to have some free space up to
3903
* about 1% of the FS size.
3904
*/
3905
if (force == CHUNK_ALLOC_LIMITED) {
3906
thresh = btrfs_super_total_bytes(fs_info->super_copy);
3907
thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1));
3908
3909
if (sinfo->total_bytes - bytes_used < thresh)
3910
return true;
3911
}
3912
3913
if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80))
3914
return false;
3915
return true;
3916
}
3917
3918
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
3919
{
3920
u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
3921
struct btrfs_space_info *space_info;
3922
3923
space_info = btrfs_find_space_info(trans->fs_info, type);
3924
if (!space_info) {
3925
DEBUG_WARN();
3926
return -EINVAL;
3927
}
3928
3929
return btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
3930
}
3931
3932
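/*
 * Phase 1 worker for chunk allocation: create a chunk of @flags for
 * @space_info and insert its chunk item into the chunk btree. On -ENOSPC
 * from the chunk item insertion, force allocation of a new system chunk and
 * retry once. Returns the new block group with an extra reference held for
 * the caller, or an error pointer.
 */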
static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans,
3933
struct btrfs_space_info *space_info,
3934
u64 flags)
3935
{
3936
struct btrfs_block_group *bg;
3937
int ret;
3938
3939
/*
3940
* Check if we have enough space in the system space info because we
3941
* will need to update device items in the chunk btree and insert a new
3942
* chunk item in the chunk btree as well. This will allocate a new
3943
* system block group if needed.
3944
*/
3945
check_system_chunk(trans, flags);
3946
3947
bg = btrfs_create_chunk(trans, space_info, flags);
3948
if (IS_ERR(bg)) {
3949
ret = PTR_ERR(bg);
3950
goto out;
3951
}
3952
3953
ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
3954
/*
3955
* Normally we are not expected to fail with -ENOSPC here, since we have
3956
* previously reserved space in the system space_info and allocated one
3957
* new system chunk if necessary. However there are three exceptions:
3958
*
3959
* 1) We may have enough free space in the system space_info but all the
3960
* existing system block groups have a profile which can not be used
3961
* for extent allocation.
3962
*
3963
* This happens when mounting in degraded mode. For example we have a
3964
* RAID1 filesystem with 2 devices, lose one device and mount the fs
3965
* using the other device in degraded mode. If we then allocate a chunk,
3966
* we may have enough free space in the existing system space_info, but
3967
* none of the block groups can be used for extent allocation since they
3968
* have a RAID1 profile, and because we are in degraded mode with a
3969
* single device, we are forced to allocate a new system chunk with a
3970
* SINGLE profile. Making check_system_chunk() iterate over all system
3971
* block groups and check if they have a usable profile and enough space
3972
* can be slow on very large filesystems, so we tolerate the -ENOSPC and
3973
* try again after forcing allocation of a new system chunk. Like this
3974
* we avoid paying the cost of that search in normal circumstances, when
3975
* we were not mounted in degraded mode;
3976
*
3977
* 2) We had enough free space in the system space_info, and one suitable
3978
* block group to allocate from when we called check_system_chunk()
3979
* above. However right after we called it, the only system block group
3980
* with enough free space got turned into RO mode by a running scrub,
3981
* and in this case we have to allocate a new one and retry. We only
3982
* need to allocate and retry once, since we have a transaction
3983
* handle and scrub uses the commit root to search for block groups;
3984
*
3985
* 3) We had one system block group with enough free space when we called
3986
* check_system_chunk(), but after that, right before we tried to
3987
* allocate the last extent buffer we needed, a discard operation came
3988
* in and it temporarily removed the last free space entry from the
3989
* block group (discard removes a free space entry, discards it, and
3990
* then adds back the entry to the block group cache).
3991
*/
3992
if (ret == -ENOSPC) {
3993
const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
3994
struct btrfs_block_group *sys_bg;
3995
struct btrfs_space_info *sys_space_info;
3996
3997
sys_space_info = btrfs_find_space_info(trans->fs_info, sys_flags);
3998
if (unlikely(!sys_space_info)) {
3999
ret = -EINVAL;
4000
btrfs_abort_transaction(trans, ret);
4001
goto out;
4002
}
4003
4004
sys_bg = btrfs_create_chunk(trans, sys_space_info, sys_flags);
4005
if (IS_ERR(sys_bg)) {
4006
ret = PTR_ERR(sys_bg);
4007
btrfs_abort_transaction(trans, ret);
4008
goto out;
4009
}
4010
4011
ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
4012
if (unlikely(ret)) {
4013
btrfs_abort_transaction(trans, ret);
4014
goto out;
4015
}
4016
4017
ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
4018
if (unlikely(ret)) {
4019
btrfs_abort_transaction(trans, ret);
4020
goto out;
4021
}
4022
} else if (unlikely(ret)) {
4023
btrfs_abort_transaction(trans, ret);
4024
goto out;
4025
}
4026
out:
4027
btrfs_trans_release_chunk_metadata(trans);
4028
4029
if (ret)
4030
return ERR_PTR(ret);
4031
4032
btrfs_get_block_group(bg);
4033
return bg;
4034
}
4035
4036
/*
4037
* Chunk allocation is done in 2 phases:
4038
*
4039
* 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
4040
* the chunk, the chunk mapping, create its block group and add the items
4041
* that belong in the chunk btree to it - more specifically, we need to
4042
* update device items in the chunk btree and add a new chunk item to it.
4043
*
4044
* 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
4045
* group item to the extent btree and the device extent items to the devices
4046
* btree.
4047
*
4048
* This is done to prevent deadlocks. For example when COWing a node from the
4049
* extent btree we are holding a write lock on the node's parent and if we
4050
* trigger chunk allocation and attempted to insert the new block group item
4051
* in the extent btree right away, we could deadlock because the path for the
4052
* insertion can include that parent node. At first glance it seems impossible
4053
* to trigger chunk allocation after starting a transaction since tasks should
4054
* reserve enough transaction units (metadata space), however while that is true
4055
* most of the time, chunk allocation may still be triggered for several reasons:
4056
*
4057
* 1) When reserving metadata, we check if there is enough free space in the
4058
* metadata space_info and therefore don't trigger allocation of a new chunk.
4059
* However later when the task actually tries to COW an extent buffer from
4060
* the extent btree or from the device btree for example, it is forced to
4061
* allocate a new block group (chunk) because the only one that had enough
4062
* free space was just turned to RO mode by a running scrub for example (or
4063
* device replace, block group reclaim thread, etc), so we can not use it
4064
* for allocating an extent and end up being forced to allocate a new one;
4065
*
4066
* 2) Because we only check that the metadata space_info has enough free bytes,
4067
* we end up not allocating a new metadata chunk in that case. However if
4068
* the filesystem was mounted in degraded mode, none of the existing block
4069
* groups might be suitable for extent allocation due to their incompatible
4070
* profile (e.g. mounting a 2-device filesystem, where all block groups
4071
* use a RAID1 profile, in degraded mode using a single device). In this case
4072
* when the task attempts to COW some extent buffer of the extent btree for
4073
* example, it will trigger allocation of a new metadata block group with a
4074
* suitable profile (SINGLE profile in the example of the degraded mount of
4075
* the RAID1 filesystem);
4076
*
4077
* 3) The task has reserved enough transaction units / metadata space, but when
4078
* it attempts to COW an extent buffer from the extent or device btree for
4079
* example, it does not find any free extent in any metadata block group,
4080
* therefore forced to try to allocate a new metadata block group.
4081
* This is because some other task allocated all available extents in the
4082
* meanwhile - this typically happens with tasks that don't reserve space
4083
* properly, either intentionally or as a bug. One example where this is
4084
* done intentionally is fsync, as it does not reserve any transaction units
4085
* and ends up allocating a variable number of metadata extents for log
4086
* tree extent buffers;
4087
*
4088
* 4) The task has reserved enough transaction units / metadata space, but right
4089
* before it tries to allocate the last extent buffer it needs, a discard
4090
* operation comes in and, temporarily, removes the last free space entry from
4091
* the only metadata block group that had free space (discard starts by
4092
* removing a free space entry from a block group, then does the discard
4093
* operation and, once it's done, it adds back the free space entry to the
4094
* block group).
4095
*
4096
* We also need this 2 phases setup when adding a device to a filesystem with
4097
* a seed device - we must create new metadata and system chunks without adding
4098
* any of the block group items to the chunk, extent and device btrees. If we
4099
* did not do it this way, we would get ENOSPC when attempting to update those
4100
* btrees, since all the chunks from the seed device are read-only.
4101
*
4102
* Phase 1 does the updates and insertions to the chunk btree because if we had
4103
* it done in phase 2 and have a thundering herd of tasks allocating chunks in
4104
* parallel, we risk having too many system chunks allocated by many tasks if
4105
* many tasks reach phase 1 without the previous ones completing phase 2. In the
4106
* extreme case this leads to exhaustion of the system chunk array in the
4107
* superblock. This is easier to trigger if using a btree node/leaf size of 64K
4108
* and with RAID filesystems (so we have more device items in the chunk btree).
4109
* This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
4110
* the system chunk array due to concurrent allocations") provides more details.
4111
*
4112
* Allocation of system chunks does not happen through this function. A task that
4113
* needs to update the chunk btree (the only btree that uses system chunks), must
4114
* preallocate chunk space by calling either check_system_chunk() or
4115
* btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
4116
* metadata chunk or when removing a chunk, while the latter is used before doing
4117
* a modification to the chunk btree - use cases for the latter are adding,
4118
* removing and resizing a device as well as relocation of a system chunk.
4119
* See the comment below for more details.
4120
*
4121
* The reservation of system space, done through check_system_chunk(), as well
4122
* as all the updates and insertions into the chunk btree must be done while
4123
* holding fs_info->chunk_mutex. This is important to guarantee that while COWing
4124
* an extent buffer from the chunks btree we never trigger allocation of a new
4125
* system chunk, which would result in a deadlock (trying to lock twice an
4126
* extent buffer of the chunk btree, first time before triggering the chunk
4127
* allocation and the second time during chunk allocation while attempting to
4128
* update the chunks btree). The system chunk array is also updated while holding
4129
* that mutex. The same logic applies to removing chunks - we must reserve system
4130
* space, update the chunk btree and the system chunk array in the superblock
4131
* while holding fs_info->chunk_mutex.
4132
*
4133
* This function, btrfs_chunk_alloc(), belongs to phase 1.
4134
*
4135
* @space_info: specify which space_info the new chunk should belong to.
4136
*
4137
* If @force is CHUNK_ALLOC_FORCE:
4138
* - return 1 if it successfully allocates a chunk,
4139
* - return errors including -ENOSPC otherwise.
4140
* If @force is NOT CHUNK_ALLOC_FORCE:
4141
* - return 0 if it doesn't need to allocate a new chunk,
4142
* - return 1 if it successfully allocates a chunk,
4143
* - return errors including -ENOSPC otherwise.
4144
*/
4145
int btrfs_chunk_alloc(struct btrfs_trans_handle *trans,
4146
struct btrfs_space_info *space_info, u64 flags,
4147
enum btrfs_chunk_alloc_enum force)
4148
{
4149
struct btrfs_fs_info *fs_info = trans->fs_info;
4150
struct btrfs_block_group *ret_bg;
4151
bool wait_for_alloc = false;
4152
bool should_alloc = false;
4153
bool from_extent_allocation = false;
4154
int ret = 0;
4155
4156
if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) {
4157
from_extent_allocation = true;
4158
force = CHUNK_ALLOC_FORCE;
4159
}
4160
4161
/* Don't re-enter if we're already allocating a chunk */
4162
if (trans->allocating_chunk)
4163
return -ENOSPC;
4164
/*
4165
* Allocation of system chunks can not happen through this path, as we
4166
* could end up in a deadlock if we are allocating a data or metadata
4167
* chunk and there is another task modifying the chunk btree.
4168
*
4169
* This is because while we are holding the chunk mutex, we will attempt
4170
* to add the new chunk item to the chunk btree or update an existing
4171
* device item in the chunk btree, while the other task that is modifying
4172
* the chunk btree is attempting to COW an extent buffer while holding a
4173
* lock on it and on its parent - if the COW operation triggers a system
4174
* chunk allocation, then we can deadlock because we are holding the
4175
* chunk mutex and we may need to access that extent buffer or its parent
4176
* in order to add the chunk item or update a device item.
4177
*
4178
* Tasks that want to modify the chunk tree should reserve system space
4179
* before updating the chunk btree, by calling either
4180
* btrfs_reserve_chunk_metadata() or check_system_chunk().
4181
* It's possible that after a task reserves the space, it still ends up
4182
* here - this happens in the cases described above at do_chunk_alloc().
4183
* The task will have to either retry or fail.
4184
*/
4185
if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
4186
return -ENOSPC;
4187
4188
do {
4189
spin_lock(&space_info->lock);
4190
if (force < space_info->force_alloc)
4191
force = space_info->force_alloc;
4192
should_alloc = should_alloc_chunk(fs_info, space_info, force);
4193
if (space_info->full) {
4194
/* No more free physical space */
4195
if (should_alloc)
4196
ret = -ENOSPC;
4197
else
4198
ret = 0;
4199
spin_unlock(&space_info->lock);
4200
return ret;
4201
} else if (!should_alloc) {
4202
spin_unlock(&space_info->lock);
4203
return 0;
4204
} else if (space_info->chunk_alloc) {
4205
/*
4206
* Someone is already allocating, so we need to block
4207
* until this someone is finished and then loop to
4208
* recheck if we should continue with our allocation
4209
* attempt.
4210
*/
4211
wait_for_alloc = true;
4212
force = CHUNK_ALLOC_NO_FORCE;
4213
spin_unlock(&space_info->lock);
4214
mutex_lock(&fs_info->chunk_mutex);
4215
mutex_unlock(&fs_info->chunk_mutex);
4216
} else {
4217
/* Proceed with allocation */
4218
space_info->chunk_alloc = 1;
4219
wait_for_alloc = false;
4220
spin_unlock(&space_info->lock);
4221
}
4222
4223
cond_resched();
4224
} while (wait_for_alloc);
4225
4226
mutex_lock(&fs_info->chunk_mutex);
4227
trans->allocating_chunk = true;
4228
4229
/*
4230
* If we have mixed data/metadata chunks we want to make sure we keep
4231
* allocating mixed chunks instead of individual chunks.
4232
*/
4233
if (btrfs_mixed_space_info(space_info))
4234
flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
4235
4236
/*
4237
* if we're doing a data chunk, go ahead and make sure that
4238
* we keep a reasonable number of metadata chunks allocated in the
4239
* FS as well.
4240
*/
4241
if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4242
fs_info->data_chunk_allocations++;
4243
if (!(fs_info->data_chunk_allocations %
4244
fs_info->metadata_ratio))
4245
force_metadata_allocation(fs_info);
4246
}
4247
4248
ret_bg = do_chunk_alloc(trans, space_info, flags);
4249
trans->allocating_chunk = false;
4250
4251
if (IS_ERR(ret_bg)) {
4252
ret = PTR_ERR(ret_bg);
4253
} else if (from_extent_allocation && (flags & BTRFS_BLOCK_GROUP_DATA)) {
4254
/*
4255
* New block group is likely to be used soon. Try to activate
4256
* it now. Failure is OK for now.
4257
*/
4258
btrfs_zone_activate(ret_bg);
4259
}
4260
4261
if (!ret)
4262
btrfs_put_block_group(ret_bg);
4263
4264
spin_lock(&space_info->lock);
4265
if (ret < 0) {
4266
if (ret == -ENOSPC)
4267
space_info->full = 1;
4268
else
4269
goto out;
4270
} else {
4271
ret = 1;
4272
space_info->max_extent_size = 0;
4273
}
4274
4275
space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4276
out:
4277
space_info->chunk_alloc = 0;
4278
spin_unlock(&space_info->lock);
4279
mutex_unlock(&fs_info->chunk_mutex);
4280
4281
return ret;
4282
}
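/*
 * Illustrative caller pattern for btrfs_chunk_alloc(), following the return
 * contract documented above (a sketch, not a specific call site):
 *
 *    flags = btrfs_get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
 *    space_info = btrfs_find_space_info(fs_info, flags);
 *    ret = btrfs_chunk_alloc(trans, space_info, flags, CHUNK_ALLOC_NO_FORCE);
 *
 * Here ret == 0 means no new chunk was needed, ret == 1 means a chunk was
 * allocated, and ret < 0 (including -ENOSPC) is an error.
 */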
4283
4284
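/*
 * Return the number of devices a chunk of profile @type spans: the
 * profile's devs_max if it is bounded, otherwise the current number of
 * writeable devices.
 */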
static u64 get_profile_num_devs(const struct btrfs_fs_info *fs_info, u64 type)
4285
{
4286
u64 num_dev;
4287
4288
num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
4289
if (!num_dev)
4290
num_dev = fs_info->fs_devices->rw_devices;
4291
4292
return num_dev;
4293
}
4294
4295
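/*
 * Ensure the system space_info has at least @bytes free for upcoming chunk
 * btree modifications related to chunks of @type, allocating (and, on zoned
 * filesystems, activating) a new system chunk if needed, and then move the
 * reservation into the chunk block reserve.
 */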
static void reserve_chunk_space(struct btrfs_trans_handle *trans,
4296
u64 bytes,
4297
u64 type)
4298
{
4299
struct btrfs_fs_info *fs_info = trans->fs_info;
4300
struct btrfs_space_info *info;
4301
u64 left;
4302
int ret = 0;
4303
4304
/*
4305
* Needed because we can end up allocating a system chunk and need an
* atomic and race-free space reservation in the chunk block reserve.
4307
*/
4308
lockdep_assert_held(&fs_info->chunk_mutex);
4309
4310
info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4311
spin_lock(&info->lock);
4312
left = info->total_bytes - btrfs_space_info_used(info, true);
4313
spin_unlock(&info->lock);
4314
4315
if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4316
btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
4317
left, bytes, type);
4318
btrfs_dump_space_info(fs_info, info, 0, false);
4319
}
4320
4321
if (left < bytes) {
4322
u64 flags = btrfs_system_alloc_profile(fs_info);
4323
struct btrfs_block_group *bg;
4324
struct btrfs_space_info *space_info;
4325
4326
space_info = btrfs_find_space_info(fs_info, flags);
4327
ASSERT(space_info);
4328
4329
/*
4330
* Ignore failure to create system chunk. We might end up not
4331
* needing it, as we might not need to COW all nodes/leafs from
4332
* the paths we visit in the chunk tree (they were already COWed
4333
* or created in the current transaction for example).
4334
*/
4335
bg = btrfs_create_chunk(trans, space_info, flags);
4336
if (IS_ERR(bg)) {
4337
ret = PTR_ERR(bg);
4338
} else {
4339
/*
4340
* We have a new chunk. We also need to activate it for
4341
* zoned filesystem.
4342
*/
4343
ret = btrfs_zoned_activate_one_bg(fs_info, info, true);
4344
if (ret < 0)
4345
return;
4346
4347
/*
4348
* If we fail to add the chunk item here, we end up
4349
* trying again at phase 2 of chunk allocation, at
4350
* btrfs_create_pending_block_groups(). So ignore
4351
* any error here. An ENOSPC here could happen, due to
4352
* the cases described at do_chunk_alloc() - the system
4353
* block group we just created was just turned into RO
4354
* mode by a scrub for example, or a running discard
4355
* temporarily removed its free space entries, etc.
4356
*/
4357
btrfs_chunk_alloc_add_chunk_item(trans, bg);
4358
}
4359
}
4360
4361
if (!ret) {
4362
ret = btrfs_block_rsv_add(fs_info,
4363
&fs_info->chunk_block_rsv,
4364
bytes, BTRFS_RESERVE_NO_FLUSH);
4365
if (!ret)
4366
trans->chunk_bytes_reserved += bytes;
4367
}
4368
}
4369
4370
/*
4371
* Reserve space in the system space for allocating or removing a chunk.
4372
* The caller must be holding fs_info->chunk_mutex.
4373
*/
4374
void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
4375
{
4376
struct btrfs_fs_info *fs_info = trans->fs_info;
4377
const u64 num_devs = get_profile_num_devs(fs_info, type);
4378
u64 bytes;
4379
4380
/* num_devs device items to update and 1 chunk item to add or remove. */
4381
bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
4382
btrfs_calc_insert_metadata_size(fs_info, 1);
4383
4384
reserve_chunk_space(trans, bytes, type);
4385
}
4386
4387
/*
4388
* Reserve space in the system space, if needed, for doing a modification to the
4389
* chunk btree.
4390
*
4391
* @trans: A transaction handle.
4392
* @is_item_insertion: Indicate if the modification is for inserting a new item
4393
* in the chunk btree or if it's for the deletion or update
4394
* of an existing item.
4395
*
4396
* This is used in a context where we need to update the chunk btree outside
4397
* block group allocation and removal, to avoid a deadlock with a concurrent
4398
* task that is allocating a metadata or data block group and therefore needs to
4399
* update the chunk btree while holding the chunk mutex. After the update to the
4400
* chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
4401
*
4402
*/
4403
void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
4404
bool is_item_insertion)
4405
{
4406
struct btrfs_fs_info *fs_info = trans->fs_info;
4407
u64 bytes;
4408
4409
if (is_item_insertion)
4410
bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
4411
else
4412
bytes = btrfs_calc_metadata_size(fs_info, 1);
4413
4414
mutex_lock(&fs_info->chunk_mutex);
4415
reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
4416
mutex_unlock(&fs_info->chunk_mutex);
4417
}
4418
4419
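/*
 * Drop the free space cache inode reference held by every block group that
 * has BLOCK_GROUP_FLAG_IREF set, waiting for in-progress caching to finish
 * first.
 */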
void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
4420
{
4421
struct btrfs_block_group *block_group;
4422
4423
block_group = btrfs_lookup_first_block_group(info, 0);
4424
while (block_group) {
4425
btrfs_wait_block_group_cache_done(block_group);
4426
spin_lock(&block_group->lock);
4427
if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF,
4428
&block_group->runtime_flags)) {
4429
struct btrfs_inode *inode = block_group->inode;
4430
4431
block_group->inode = NULL;
4432
spin_unlock(&block_group->lock);
4433
4434
ASSERT(block_group->io_ctl.inode == NULL);
4435
iput(&inode->vfs_inode);
4436
} else {
4437
spin_unlock(&block_group->lock);
4438
}
4439
block_group = btrfs_next_block_group(block_group);
4440
}
4441
}
4442
4443
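/*
 * Final sanity checks for a space_info being torn down: recurse into and
 * free the subgroups of a primary space_info first, then warn and dump the
 * space_info if pinned, may_use or (in most cases) reserved bytes are still
 * accounted.
 */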
static void check_removing_space_info(struct btrfs_space_info *space_info)
4444
{
4445
struct btrfs_fs_info *info = space_info->fs_info;
4446
4447
if (space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY) {
4448
/* This is a top space_info, proceed with its children first. */
4449
for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) {
4450
if (space_info->sub_group[i]) {
4451
check_removing_space_info(space_info->sub_group[i]);
4452
kfree(space_info->sub_group[i]);
4453
space_info->sub_group[i] = NULL;
4454
}
4455
}
4456
}
4457
4458
/*
4459
* Do not hide this behind enospc_debug; this is actually important and
4460
* indicates a real bug if this happens.
4461
*/
4462
if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_may_use > 0))
4463
btrfs_dump_space_info(info, space_info, 0, false);
4464
4465
/*
4466
* If there was a failure to cleanup a log tree, very likely due to an
4467
* IO failure on a writeback attempt of one or more of its extent
4468
* buffers, we could not do proper (and cheap) unaccounting of their
4469
* reserved space, so don't warn on bytes_reserved > 0 in that case.
4470
*/
4471
if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
4472
!BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
4473
if (WARN_ON(space_info->bytes_reserved > 0))
4474
btrfs_dump_space_info(info, space_info, 0, false);
4475
}
4476
4477
WARN_ON(space_info->reclaim_size > 0);
4478
}
4479
4480
/*
4481
* Must be called only after stopping all workers, since we could have block
4482
* group caching kthreads running, and therefore they could race with us if we
4483
* freed the block groups before stopping them.
4484
*/
4485
int btrfs_free_block_groups(struct btrfs_fs_info *info)
4486
{
4487
struct btrfs_block_group *block_group;
4488
struct btrfs_space_info *space_info;
4489
struct btrfs_caching_control *caching_ctl;
4490
struct rb_node *n;
4491
4492
if (btrfs_is_zoned(info)) {
4493
if (info->active_meta_bg) {
4494
btrfs_put_block_group(info->active_meta_bg);
4495
info->active_meta_bg = NULL;
4496
}
4497
if (info->active_system_bg) {
4498
btrfs_put_block_group(info->active_system_bg);
4499
info->active_system_bg = NULL;
4500
}
4501
}
4502
4503
write_lock(&info->block_group_cache_lock);
4504
while (!list_empty(&info->caching_block_groups)) {
4505
caching_ctl = list_first_entry(&info->caching_block_groups,
4506
struct btrfs_caching_control, list);
4507
list_del(&caching_ctl->list);
4508
btrfs_put_caching_control(caching_ctl);
4509
}
4510
write_unlock(&info->block_group_cache_lock);
4511
4512
spin_lock(&info->unused_bgs_lock);
4513
while (!list_empty(&info->unused_bgs)) {
4514
block_group = list_first_entry(&info->unused_bgs,
4515
struct btrfs_block_group,
4516
bg_list);
4517
list_del_init(&block_group->bg_list);
4518
btrfs_put_block_group(block_group);
4519
}
4520
4521
while (!list_empty(&info->reclaim_bgs)) {
4522
block_group = list_first_entry(&info->reclaim_bgs,
4523
struct btrfs_block_group,
4524
bg_list);
4525
list_del_init(&block_group->bg_list);
4526
btrfs_put_block_group(block_group);
4527
}
4528
spin_unlock(&info->unused_bgs_lock);
4529
4530
spin_lock(&info->zone_active_bgs_lock);
4531
while (!list_empty(&info->zone_active_bgs)) {
4532
block_group = list_first_entry(&info->zone_active_bgs,
4533
struct btrfs_block_group,
4534
active_bg_list);
4535
list_del_init(&block_group->active_bg_list);
4536
btrfs_put_block_group(block_group);
4537
}
4538
spin_unlock(&info->zone_active_bgs_lock);
4539
4540
write_lock(&info->block_group_cache_lock);
4541
while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) {
4542
block_group = rb_entry(n, struct btrfs_block_group,
4543
cache_node);
4544
rb_erase_cached(&block_group->cache_node,
4545
&info->block_group_cache_tree);
4546
RB_CLEAR_NODE(&block_group->cache_node);
4547
write_unlock(&info->block_group_cache_lock);
4548
4549
down_write(&block_group->space_info->groups_sem);
4550
list_del(&block_group->list);
4551
up_write(&block_group->space_info->groups_sem);
4552
4553
/*
4554
* We haven't cached this block group, which means we could
4555
* possibly have excluded extents on this block group.
4556
*/
4557
if (block_group->cached == BTRFS_CACHE_NO ||
4558
block_group->cached == BTRFS_CACHE_ERROR)
4559
btrfs_free_excluded_extents(block_group);
4560
4561
btrfs_remove_free_space_cache(block_group);
4562
ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
4563
ASSERT(list_empty(&block_group->dirty_list));
4564
ASSERT(list_empty(&block_group->io_list));
4565
ASSERT(list_empty(&block_group->bg_list));
4566
ASSERT(refcount_read(&block_group->refs) == 1);
4567
ASSERT(block_group->swap_extents == 0);
4568
btrfs_put_block_group(block_group);
4569
4570
write_lock(&info->block_group_cache_lock);
4571
}
4572
write_unlock(&info->block_group_cache_lock);
4573
4574
btrfs_release_global_block_rsv(info);
4575
4576
while (!list_empty(&info->space_info)) {
4577
space_info = list_first_entry(&info->space_info,
4578
struct btrfs_space_info, list);
4579
4580
check_removing_space_info(space_info);
4581
list_del(&space_info->list);
4582
btrfs_sysfs_remove_space_info(space_info);
4583
}
4584
return 0;
4585
}
4586
4587
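/*
 * Pin the chunk map of @cache while an operation (e.g. trimming) is using
 * it. Must be paired with btrfs_unfreeze_block_group(), which performs the
 * deferred chunk map removal once the last freezer is gone and the block
 * group has been marked removed.
 */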
void btrfs_freeze_block_group(struct btrfs_block_group *cache)
4588
{
4589
atomic_inc(&cache->frozen);
4590
}
4591
4592
void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
4593
{
4594
struct btrfs_fs_info *fs_info = block_group->fs_info;
4595
bool cleanup;
4596
4597
spin_lock(&block_group->lock);
4598
cleanup = (atomic_dec_and_test(&block_group->frozen) &&
4599
test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags));
4600
spin_unlock(&block_group->lock);
4601
4602
if (cleanup) {
4603
struct btrfs_chunk_map *map;
4604
4605
map = btrfs_find_chunk_map(fs_info, block_group->start, 1);
4606
/* Logic error, can't happen. */
4607
ASSERT(map);
4608
4609
btrfs_remove_chunk_map(fs_info, map);
4610
4611
/* Once for our lookup reference. */
4612
btrfs_free_chunk_map(map);
4613
4614
/*
4615
* We may have left one free space entry, and other tasks
* trimming this block group may have left one entry each.
* Free them if any.
4618
*/
4619
btrfs_remove_free_space_cache(block_group);
4620
}
4621
}
4622
4623
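/*
 * Account one more active swap extent in @bg. Returns false if the block
 * group is read-only, in which case it must not be used to back swap space.
 */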
bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
4624
{
4625
bool ret = true;
4626
4627
spin_lock(&bg->lock);
4628
if (bg->ro)
4629
ret = false;
4630
else
4631
bg->swap_extents++;
4632
spin_unlock(&bg->lock);
4633
4634
return ret;
4635
}
4636
4637
void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
4638
{
4639
spin_lock(&bg->lock);
4640
ASSERT(!bg->ro);
4641
ASSERT(bg->swap_extents >= amount);
4642
bg->swap_extents -= amount;
4643
spin_unlock(&bg->lock);
4644
}
4645
4646
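/*
 * Map an allocation size to a block group size class: up to 128K is small,
 * up to 8M is medium, anything larger is large. For example a 64K extent is
 * small, a 1M extent medium and a 128M extent large.
 */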
enum btrfs_block_group_size_class btrfs_calc_block_group_size_class(u64 size)
4647
{
4648
if (size <= SZ_128K)
4649
return BTRFS_BG_SZ_SMALL;
4650
if (size <= SZ_8M)
4651
return BTRFS_BG_SZ_MEDIUM;
4652
return BTRFS_BG_SZ_LARGE;
4653
}
4654
4655
/*
4656
* Handle a block group allocating an extent in a size class
4657
*
4658
* @bg: The block group we allocated in.
4659
* @size_class: The size class of the allocation.
4660
* @force_wrong_size_class: Whether we are desperate enough to allow
4661
* mismatched size classes.
4662
*
4663
* Returns: 0 if the size class was valid for this block_group, -EAGAIN in the
4664
* case of a race that leads to the wrong size class without
4665
* force_wrong_size_class set.
4666
*
4667
* find_free_extent will skip block groups with a mismatched size class until
4668
* it really needs to avoid ENOSPC. In that case it will set
4669
* force_wrong_size_class. However, if a block group is newly allocated and
4670
* doesn't yet have a size class, then it is possible for two allocations of
4671
* different sizes to race and both try to use it. The loser is caught here and
4672
* has to retry.
4673
*/
4674
int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
4675
enum btrfs_block_group_size_class size_class,
4676
bool force_wrong_size_class)
4677
{
4678
ASSERT(size_class != BTRFS_BG_SZ_NONE);
4679
4680
/* The new allocation is in the right size class, do nothing */
4681
if (bg->size_class == size_class)
4682
return 0;
4683
/*
4684
* The new allocation is in a mismatched size class.
4685
* This means one of two things:
4686
*
4687
* 1. Two tasks in find_free_extent for different size_classes raced
4688
* and hit the same empty block_group. Make the loser try again.
4689
* 2. A call to find_free_extent got desperate enough to set
4690
* 'force_wrong_size_class'. Don't change the size_class, but allow the
4691
* allocation.
4692
*/
4693
if (bg->size_class != BTRFS_BG_SZ_NONE) {
4694
if (force_wrong_size_class)
4695
return 0;
4696
return -EAGAIN;
4697
}
4698
/*
4699
* The happy new block group case: the new allocation is the first
4700
* one in the block_group so we set size_class.
4701
*/
4702
bg->size_class = size_class;
4703
4704
return 0;
4705
}
4706
4707
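/*
 * Size classes are only tracked for data-only block groups on non-zoned
 * filesystems; zoned allocation and metadata or mixed block groups do not
 * use them.
 */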
bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg)
4708
{
4709
if (btrfs_is_zoned(bg->fs_info))
4710
return false;
4711
if (!btrfs_is_block_group_data_only(bg))
4712
return false;
4713
return true;
4714
}
4715
4716