GitHub Repository: torvalds/linux
Path: blob/master/mm/backing-dev.c
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/blkdev.h>
#include <linux/wait.h>
#include <linux/rbtree.h>
#include <linux/kthread.h>
#include <linux/backing-dev.h>
#include <linux/blk-cgroup.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/device.h>
#include <trace/events/writeback.h>
#include "internal.h"

struct backing_dev_info noop_backing_dev_info;
EXPORT_SYMBOL_GPL(noop_backing_dev_info);

static const char *bdi_unknown_name = "(unknown)";

/*
 * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
 * reader side locking.
 */
DEFINE_SPINLOCK(bdi_lock);
static u64 bdi_id_cursor;
static struct rb_root bdi_tree = RB_ROOT;
LIST_HEAD(bdi_list);

/* bdi_wq serves all asynchronous writeback tasks */
struct workqueue_struct *bdi_wq;

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>

struct wb_stats {
	unsigned long nr_dirty;
	unsigned long nr_io;
	unsigned long nr_more_io;
	unsigned long nr_dirty_time;
	unsigned long nr_writeback;
	unsigned long nr_reclaimable;
	unsigned long nr_dirtied;
	unsigned long nr_written;
	unsigned long dirty_thresh;
	unsigned long wb_thresh;
};

static struct dentry *bdi_debug_root;

static void bdi_debug_init(void)
{
	bdi_debug_root = debugfs_create_dir("bdi", NULL);
}

static void collect_wb_stats(struct wb_stats *stats,
			     struct bdi_writeback *wb)
{
	struct inode *inode;

	spin_lock(&wb->list_lock);
	list_for_each_entry(inode, &wb->b_dirty, i_io_list)
		stats->nr_dirty++;
	list_for_each_entry(inode, &wb->b_io, i_io_list)
		stats->nr_io++;
	list_for_each_entry(inode, &wb->b_more_io, i_io_list)
		stats->nr_more_io++;
	list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
		if (inode->i_state & I_DIRTY_TIME)
			stats->nr_dirty_time++;
	spin_unlock(&wb->list_lock);

	stats->nr_writeback += wb_stat(wb, WB_WRITEBACK);
	stats->nr_reclaimable += wb_stat(wb, WB_RECLAIMABLE);
	stats->nr_dirtied += wb_stat(wb, WB_DIRTIED);
	stats->nr_written += wb_stat(wb, WB_WRITTEN);
	stats->wb_thresh += wb_calc_thresh(wb, stats->dirty_thresh);
}

#ifdef CONFIG_CGROUP_WRITEBACK
static void bdi_collect_stats(struct backing_dev_info *bdi,
			      struct wb_stats *stats)
{
	struct bdi_writeback *wb;

	rcu_read_lock();
	list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
		if (!wb_tryget(wb))
			continue;

		collect_wb_stats(stats, wb);
		wb_put(wb);
	}
	rcu_read_unlock();
}
#else
static void bdi_collect_stats(struct backing_dev_info *bdi,
			      struct wb_stats *stats)
{
	collect_wb_stats(stats, &bdi->wb);
}
#endif

static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
	struct backing_dev_info *bdi = m->private;
	unsigned long background_thresh;
	unsigned long dirty_thresh;
	struct wb_stats stats;
	unsigned long tot_bw;

	global_dirty_limits(&background_thresh, &dirty_thresh);

	memset(&stats, 0, sizeof(stats));
	stats.dirty_thresh = dirty_thresh;
	bdi_collect_stats(bdi, &stats);
	tot_bw = atomic_long_read(&bdi->tot_write_bandwidth);

	seq_printf(m,
		   "BdiWriteback:       %10lu kB\n"
		   "BdiReclaimable:     %10lu kB\n"
		   "BdiDirtyThresh:     %10lu kB\n"
		   "DirtyThresh:        %10lu kB\n"
		   "BackgroundThresh:   %10lu kB\n"
		   "BdiDirtied:         %10lu kB\n"
		   "BdiWritten:         %10lu kB\n"
		   "BdiWriteBandwidth:  %10lu kBps\n"
		   "b_dirty:            %10lu\n"
		   "b_io:               %10lu\n"
		   "b_more_io:          %10lu\n"
		   "b_dirty_time:       %10lu\n"
		   "bdi_list:           %10u\n"
		   "state:              %10lx\n",
		   K(stats.nr_writeback),
		   K(stats.nr_reclaimable),
		   K(stats.wb_thresh),
		   K(dirty_thresh),
		   K(background_thresh),
		   K(stats.nr_dirtied),
		   K(stats.nr_written),
		   K(tot_bw),
		   stats.nr_dirty,
		   stats.nr_io,
		   stats.nr_more_io,
		   stats.nr_dirty_time,
		   !list_empty(&bdi->bdi_list), bdi->wb.state);

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats);
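
/*
 * Note: DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats) above generates a
 * single_open()-based bdi_debug_stats_fops, which bdi_debug_register()
 * below attaches to each bdi's "stats" debugfs file.
 */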

static void wb_stats_show(struct seq_file *m, struct bdi_writeback *wb,
			  struct wb_stats *stats)
{

	seq_printf(m,
		   "WbCgIno:           %10lu\n"
		   "WbWriteback:       %10lu kB\n"
		   "WbReclaimable:     %10lu kB\n"
		   "WbDirtyThresh:     %10lu kB\n"
		   "WbDirtied:         %10lu kB\n"
		   "WbWritten:         %10lu kB\n"
		   "WbWriteBandwidth:  %10lu kBps\n"
		   "b_dirty:           %10lu\n"
		   "b_io:              %10lu\n"
		   "b_more_io:         %10lu\n"
		   "b_dirty_time:      %10lu\n"
		   "state:             %10lx\n\n",
#ifdef CONFIG_CGROUP_WRITEBACK
		   cgroup_ino(wb->memcg_css->cgroup),
#else
		   1ul,
#endif
		   K(stats->nr_writeback),
		   K(stats->nr_reclaimable),
		   K(stats->wb_thresh),
		   K(stats->nr_dirtied),
		   K(stats->nr_written),
		   K(wb->avg_write_bandwidth),
		   stats->nr_dirty,
		   stats->nr_io,
		   stats->nr_more_io,
		   stats->nr_dirty_time,
		   wb->state);
}

static int cgwb_debug_stats_show(struct seq_file *m, void *v)
{
	struct backing_dev_info *bdi = m->private;
	unsigned long background_thresh;
	unsigned long dirty_thresh;
	struct bdi_writeback *wb;

	global_dirty_limits(&background_thresh, &dirty_thresh);

	rcu_read_lock();
	list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
		struct wb_stats stats = { .dirty_thresh = dirty_thresh };

		if (!wb_tryget(wb))
			continue;

		collect_wb_stats(&stats, wb);

		/*
		 * Calculate thresh of wb in writeback cgroup which is min of
		 * thresh in global domain and thresh in cgroup domain. Drop
		 * rcu lock because cgwb_calc_thresh may sleep in
		 * cgroup_rstat_flush. We can do so here because we have a ref.
		 */
		if (mem_cgroup_wb_domain(wb)) {
			rcu_read_unlock();
			stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb));
			rcu_read_lock();
		}

		wb_stats_show(m, wb, &stats);

		wb_put(wb);
	}
	rcu_read_unlock();

	return 0;
}
DEFINE_SHOW_ATTRIBUTE(cgwb_debug_stats);

static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
{
	bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);

	debugfs_create_file("stats", 0444, bdi->debug_dir, bdi,
			    &bdi_debug_stats_fops);
	debugfs_create_file("wb_stats", 0444, bdi->debug_dir, bdi,
			    &cgwb_debug_stats_fops);
}
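
/*
 * With debugfs mounted in the usual place, the files created above are
 * visible as /sys/kernel/debug/bdi/<dev>/stats and .../wb_stats, where
 * <dev> is the name this bdi was registered under, e.g. "8:0" for a disk:
 *
 *	cat /sys/kernel/debug/bdi/8:0/stats
 */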

static void bdi_debug_unregister(struct backing_dev_info *bdi)
{
	debugfs_remove_recursive(bdi->debug_dir);
}
#else /* CONFIG_DEBUG_FS */
static inline void bdi_debug_init(void)
{
}
static inline void bdi_debug_register(struct backing_dev_info *bdi,
				      const char *name)
{
}
static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
{
}
#endif /* CONFIG_DEBUG_FS */

static ssize_t read_ahead_kb_store(struct device *dev,
				  struct device_attribute *attr,
				  const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned long read_ahead_kb;
	ssize_t ret;

	ret = kstrtoul(buf, 10, &read_ahead_kb);
	if (ret < 0)
		return ret;
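
	/* sysfs value is in kB; one page is 1 << (PAGE_SHIFT - 10) kB */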
	bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10);

	return count;
}

#define BDI_SHOW(name, expr)					\
static ssize_t name##_show(struct device *dev,			\
			   struct device_attribute *attr, char *buf)	\
{								\
	struct backing_dev_info *bdi = dev_get_drvdata(dev);	\
								\
	return sysfs_emit(buf, "%lld\n", (long long)expr);	\
}								\
static DEVICE_ATTR_RW(name);

BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
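
/*
 * For reference, the line above expands (sketch, modulo whitespace) to:
 *
 *	static ssize_t read_ahead_kb_show(struct device *dev,
 *					  struct device_attribute *attr, char *buf)
 *	{
 *		struct backing_dev_info *bdi = dev_get_drvdata(dev);
 *
 *		return sysfs_emit(buf, "%lld\n", (long long)K(bdi->ra_pages));
 *	}
 *	static DEVICE_ATTR_RW(read_ahead_kb);
 *
 * DEVICE_ATTR_RW() pairs read_ahead_kb_show() with the read_ahead_kb_store()
 * defined above purely by naming convention.
 */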

static ssize_t min_ratio_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_min_ratio(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
BDI_SHOW(min_ratio, bdi->min_ratio / BDI_RATIO_SCALE)

static ssize_t min_ratio_fine_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_min_ratio_no_scale(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
BDI_SHOW(min_ratio_fine, bdi->min_ratio)

static ssize_t max_ratio_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_max_ratio(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
BDI_SHOW(max_ratio, bdi->max_ratio / BDI_RATIO_SCALE)

static ssize_t max_ratio_fine_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int ratio;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &ratio);
	if (ret < 0)
		return ret;

	ret = bdi_set_max_ratio_no_scale(bdi, ratio);
	if (!ret)
		ret = count;

	return ret;
}
BDI_SHOW(max_ratio_fine, bdi->max_ratio)

static ssize_t min_bytes_show(struct device *dev,
			      struct device_attribute *attr,
			      char *buf)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);

	return sysfs_emit(buf, "%llu\n", bdi_get_min_bytes(bdi));
}

static ssize_t min_bytes_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	u64 bytes;
	ssize_t ret;

	ret = kstrtoull(buf, 10, &bytes);
	if (ret < 0)
		return ret;

	ret = bdi_set_min_bytes(bdi, bytes);
	if (!ret)
		ret = count;

	return ret;
}
static DEVICE_ATTR_RW(min_bytes);

static ssize_t max_bytes_show(struct device *dev,
			      struct device_attribute *attr,
			      char *buf)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);

	return sysfs_emit(buf, "%llu\n", bdi_get_max_bytes(bdi));
}

static ssize_t max_bytes_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	u64 bytes;
	ssize_t ret;

	ret = kstrtoull(buf, 10, &bytes);
	if (ret < 0)
		return ret;

	ret = bdi_set_max_bytes(bdi, bytes);
	if (!ret)
		ret = count;

	return ret;
}
static DEVICE_ATTR_RW(max_bytes);

static ssize_t stable_pages_required_show(struct device *dev,
					  struct device_attribute *attr,
					  char *buf)
{
	dev_warn_once(dev,
		"the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n");
	return sysfs_emit(buf, "%d\n", 0);
}
static DEVICE_ATTR_RO(stable_pages_required);

static ssize_t strict_limit_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);
	unsigned int strict_limit;
	ssize_t ret;

	ret = kstrtouint(buf, 10, &strict_limit);
	if (ret < 0)
		return ret;

	ret = bdi_set_strict_limit(bdi, strict_limit);
	if (!ret)
		ret = count;

	return ret;
}

static ssize_t strict_limit_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct backing_dev_info *bdi = dev_get_drvdata(dev);

	return sysfs_emit(buf, "%d\n",
			!!(bdi->capabilities & BDI_CAP_STRICTLIMIT));
}
static DEVICE_ATTR_RW(strict_limit);

static struct attribute *bdi_dev_attrs[] = {
	&dev_attr_read_ahead_kb.attr,
	&dev_attr_min_ratio.attr,
	&dev_attr_min_ratio_fine.attr,
	&dev_attr_max_ratio.attr,
	&dev_attr_max_ratio_fine.attr,
	&dev_attr_min_bytes.attr,
	&dev_attr_max_bytes.attr,
	&dev_attr_stable_pages_required.attr,
	&dev_attr_strict_limit.attr,
	NULL,
};
ATTRIBUTE_GROUPS(bdi_dev);

static const struct class bdi_class = {
	.name		= "bdi",
	.dev_groups	= bdi_dev_groups,
};
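
/*
 * Devices registered against bdi_class show up under /sys/class/bdi/<name>/
 * with the attributes defined above, e.g. /sys/class/bdi/8:0/read_ahead_kb
 * for a disk whose bdi was registered as "8:0" (see bdi_register_va() below).
 */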

static __init int bdi_class_init(void)
{
	int ret;

	ret = class_register(&bdi_class);
	if (ret)
		return ret;

	bdi_debug_init();

	return 0;
}
postcore_initcall(bdi_class_init);

static int __init default_bdi_init(void)
{
	bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND |
				 WQ_SYSFS, 0);
	if (!bdi_wq)
		return -ENOMEM;
	return 0;
}
subsys_initcall(default_bdi_init);

static void wb_update_bandwidth_workfn(struct work_struct *work)
{
	struct bdi_writeback *wb = container_of(to_delayed_work(work),
						struct bdi_writeback, bw_dwork);

	wb_update_bandwidth(wb);
}

/*
 * Initial write bandwidth: 100 MB/s
 */
#define INIT_BW		MB_TO_PAGES(100)
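/* i.e. 100 MB/s expressed in pages per second, the unit used by the
 * bandwidth and ratelimit fields initialized below */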

static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
		   gfp_t gfp)
{
	int err;

	memset(wb, 0, sizeof(*wb));

	wb->bdi = bdi;
	wb->last_old_flush = jiffies;
	INIT_LIST_HEAD(&wb->b_dirty);
	INIT_LIST_HEAD(&wb->b_io);
	INIT_LIST_HEAD(&wb->b_more_io);
	INIT_LIST_HEAD(&wb->b_dirty_time);
	spin_lock_init(&wb->list_lock);

	atomic_set(&wb->writeback_inodes, 0);
	wb->bw_time_stamp = jiffies;
	wb->balanced_dirty_ratelimit = INIT_BW;
	wb->dirty_ratelimit = INIT_BW;
	wb->write_bandwidth = INIT_BW;
	wb->avg_write_bandwidth = INIT_BW;

	spin_lock_init(&wb->work_lock);
	INIT_LIST_HEAD(&wb->work_list);
	INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
	INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);

	err = fprop_local_init_percpu(&wb->completions, gfp);
	if (err)
		return err;

	err = percpu_counter_init_many(wb->stat, 0, gfp, NR_WB_STAT_ITEMS);
	if (err)
		fprop_local_destroy_percpu(&wb->completions);

	return err;
}

static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb);

/*
 * Remove this wb from its bdi's wb_list and shut down any writeback work
 * we have queued for it.
 */
static void wb_shutdown(struct bdi_writeback *wb)
{
	/* Make sure nobody queues further work */
	spin_lock_irq(&wb->work_lock);
	if (!test_and_clear_bit(WB_registered, &wb->state)) {
		spin_unlock_irq(&wb->work_lock);
		return;
	}
	spin_unlock_irq(&wb->work_lock);

	cgwb_remove_from_bdi_list(wb);
	/*
	 * Drain work list and shutdown the delayed_work. !WB_registered
	 * tells wb_workfn() that @wb is dying and its work_list needs to
	 * be drained no matter what.
	 */
	mod_delayed_work(bdi_wq, &wb->dwork, 0);
	flush_delayed_work(&wb->dwork);
	WARN_ON(!list_empty(&wb->work_list));
	flush_delayed_work(&wb->bw_dwork);
}

static void wb_exit(struct bdi_writeback *wb)
{
	WARN_ON(delayed_work_pending(&wb->dwork));
	percpu_counter_destroy_many(wb->stat, NR_WB_STAT_ITEMS);
	fprop_local_destroy_percpu(&wb->completions);
}

#ifdef CONFIG_CGROUP_WRITEBACK

#include <linux/memcontrol.h>

/*
 * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and
 * memcg->cgwb_list. bdi->cgwb_tree is also RCU protected.
 */
static DEFINE_SPINLOCK(cgwb_lock);
static struct workqueue_struct *cgwb_release_wq;

static LIST_HEAD(offline_cgwbs);
static void cleanup_offline_cgwbs_workfn(struct work_struct *work);
static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn);

static void cgwb_free_rcu(struct rcu_head *rcu_head)
{
	struct bdi_writeback *wb = container_of(rcu_head,
			struct bdi_writeback, rcu);

	percpu_ref_exit(&wb->refcnt);
	kfree(wb);
}

static void cgwb_release_workfn(struct work_struct *work)
{
	struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
						release_work);
	struct backing_dev_info *bdi = wb->bdi;

	mutex_lock(&wb->bdi->cgwb_release_mutex);
	wb_shutdown(wb);

	css_put(wb->memcg_css);
	css_put(wb->blkcg_css);
	mutex_unlock(&wb->bdi->cgwb_release_mutex);

	/* triggers blkg destruction if no online users left */
	blkcg_unpin_online(wb->blkcg_css);

	fprop_local_destroy_percpu(&wb->memcg_completions);

	spin_lock_irq(&cgwb_lock);
	list_del(&wb->offline_node);
	spin_unlock_irq(&cgwb_lock);

	wb_exit(wb);
	bdi_put(bdi);
	WARN_ON_ONCE(!list_empty(&wb->b_attached));
	WARN_ON_ONCE(work_pending(&wb->switch_work));
	call_rcu(&wb->rcu, cgwb_free_rcu);
}

static void cgwb_release(struct percpu_ref *refcnt)
{
	struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
						refcnt);
	queue_work(cgwb_release_wq, &wb->release_work);
}

static void cgwb_kill(struct bdi_writeback *wb)
{
	lockdep_assert_held(&cgwb_lock);

	WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
	list_del(&wb->memcg_node);
	list_del(&wb->blkcg_node);
	list_add(&wb->offline_node, &offline_cgwbs);
	percpu_ref_kill(&wb->refcnt);
}

static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
{
	spin_lock_irq(&cgwb_lock);
	list_del_rcu(&wb->bdi_node);
	spin_unlock_irq(&cgwb_lock);
}

static int cgwb_create(struct backing_dev_info *bdi,
		       struct cgroup_subsys_state *memcg_css, gfp_t gfp)
{
	struct mem_cgroup *memcg;
	struct cgroup_subsys_state *blkcg_css;
	struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
	struct bdi_writeback *wb;
	unsigned long flags;
	int ret = 0;

	memcg = mem_cgroup_from_css(memcg_css);
	blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
	memcg_cgwb_list = &memcg->cgwb_list;
	blkcg_cgwb_list = blkcg_get_cgwb_list(blkcg_css);

	/* look up again under lock and discard on blkcg mismatch */
	spin_lock_irqsave(&cgwb_lock, flags);
	wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
	if (wb && wb->blkcg_css != blkcg_css) {
		cgwb_kill(wb);
		wb = NULL;
	}
	spin_unlock_irqrestore(&cgwb_lock, flags);
	if (wb)
		goto out_put;

	/* need to create a new one */
	wb = kmalloc(sizeof(*wb), gfp);
	if (!wb) {
		ret = -ENOMEM;
		goto out_put;
	}

	ret = wb_init(wb, bdi, gfp);
	if (ret)
		goto err_free;

	ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
	if (ret)
		goto err_wb_exit;

	ret = fprop_local_init_percpu(&wb->memcg_completions, gfp);
	if (ret)
		goto err_ref_exit;

	wb->memcg_css = memcg_css;
	wb->blkcg_css = blkcg_css;
	INIT_LIST_HEAD(&wb->b_attached);
	INIT_WORK(&wb->switch_work, inode_switch_wbs_work_fn);
	init_llist_head(&wb->switch_wbs_ctxs);
	INIT_WORK(&wb->release_work, cgwb_release_workfn);
	set_bit(WB_registered, &wb->state);
	bdi_get(bdi);

	/*
	 * The root wb determines the registered state of the whole bdi and
	 * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate
	 * whether they're still online. Don't link @wb if any is dead.
	 * See wb_memcg_offline() and wb_blkcg_offline().
	 */
	ret = -ENODEV;
	spin_lock_irqsave(&cgwb_lock, flags);
	if (test_bit(WB_registered, &bdi->wb.state) &&
	    blkcg_cgwb_list->next && memcg_cgwb_list->next) {
		/* we might have raced another instance of this function */
		ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
		if (!ret) {
			list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
			list_add(&wb->memcg_node, memcg_cgwb_list);
			list_add(&wb->blkcg_node, blkcg_cgwb_list);
			blkcg_pin_online(blkcg_css);
			css_get(memcg_css);
			css_get(blkcg_css);
		}
	}
	spin_unlock_irqrestore(&cgwb_lock, flags);
	if (ret) {
		if (ret == -EEXIST)
			ret = 0;
		goto err_fprop_exit;
	}
	goto out_put;

err_fprop_exit:
	bdi_put(bdi);
	fprop_local_destroy_percpu(&wb->memcg_completions);
err_ref_exit:
	percpu_ref_exit(&wb->refcnt);
err_wb_exit:
	wb_exit(wb);
err_free:
	kfree(wb);
out_put:
	css_put(blkcg_css);
	return ret;
}

/**
 * wb_get_lookup - get wb for a given memcg
 * @bdi: target bdi
 * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
 *
 * Try to get the wb for @memcg_css on @bdi. The returned wb has its
 * refcount incremented.
 *
 * This function uses css_get() on @memcg_css and thus expects its refcnt
 * to be positive on invocation. IOW, rcu_read_lock() protection on
 * @memcg_css isn't enough. try_get it before calling this function.
 *
 * A wb is keyed by its associated memcg. As blkcg implicitly enables
 * memcg on the default hierarchy, memcg association is guaranteed to be
 * more specific (equal or descendant to the associated blkcg) and thus can
 * identify both the memcg and blkcg associations.
 *
 * Because the blkcg associated with a memcg may change as blkcg is enabled
 * and disabled closer to root in the hierarchy, each wb keeps track of
 * both the memcg and blkcg associated with it and verifies the blkcg on
 * each lookup. On mismatch, the existing wb is discarded and a new one is
 * created.
 */
struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
				    struct cgroup_subsys_state *memcg_css)
{
	struct bdi_writeback *wb;

	if (!memcg_css->parent)
		return &bdi->wb;

	rcu_read_lock();
	wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
	if (wb) {
		struct cgroup_subsys_state *blkcg_css;

		/* see whether the blkcg association has changed */
		blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
		if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb)))
			wb = NULL;
		css_put(blkcg_css);
	}
	rcu_read_unlock();

	return wb;
}

/**
 * wb_get_create - get wb for a given memcg, create if necessary
 * @bdi: target bdi
 * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
 * @gfp: allocation mask to use
 *
 * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to
 * create one. See wb_get_lookup() for more details.
 */
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
				    struct cgroup_subsys_state *memcg_css,
				    gfp_t gfp)
{
	struct bdi_writeback *wb;

	might_alloc(gfp);

	do {
		wb = wb_get_lookup(bdi, memcg_css);
	} while (!wb && !cgwb_create(bdi, memcg_css, gfp));

	return wb;
}
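
/*
 * Minimal usage sketch (assuming the caller already holds a reference on
 * memcg_css, as wb_get_lookup() requires):
 *
 *	struct bdi_writeback *wb;
 *
 *	wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
 *	if (wb) {
 *		... use wb ...
 *		wb_put(wb);
 *	}
 *
 * The do/while above keeps retrying until a live wb is found or
 * cgwb_create() fails with a real error; a create that loses the race
 * (-EEXIST) is folded into another lookup pass.
 */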

static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
	int ret;

	INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
	mutex_init(&bdi->cgwb_release_mutex);
	init_rwsem(&bdi->wb_switch_rwsem);

	ret = wb_init(&bdi->wb, bdi, GFP_KERNEL);
	if (!ret) {
		bdi->wb.memcg_css = &root_mem_cgroup->css;
		bdi->wb.blkcg_css = blkcg_root_css;
		INIT_WORK(&bdi->wb.switch_work, inode_switch_wbs_work_fn);
		init_llist_head(&bdi->wb.switch_wbs_ctxs);
	}
	return ret;
}

static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
{
	struct radix_tree_iter iter;
	void **slot;
	struct bdi_writeback *wb;

	WARN_ON(test_bit(WB_registered, &bdi->wb.state));

	spin_lock_irq(&cgwb_lock);
	radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
		cgwb_kill(*slot);
	spin_unlock_irq(&cgwb_lock);

	mutex_lock(&bdi->cgwb_release_mutex);
	spin_lock_irq(&cgwb_lock);
	while (!list_empty(&bdi->wb_list)) {
		wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
				      bdi_node);
		spin_unlock_irq(&cgwb_lock);
		wb_shutdown(wb);
		spin_lock_irq(&cgwb_lock);
	}
	spin_unlock_irq(&cgwb_lock);
	mutex_unlock(&bdi->cgwb_release_mutex);
}

/*
 * cleanup_offline_cgwbs_workfn - try to release dying cgwbs
 *
 * Try to release dying cgwbs by switching attached inodes to the nearest
 * living ancestor's writeback. Processed wbs are placed at the end
 * of the list to guarantee the forward progress.
 */
static void cleanup_offline_cgwbs_workfn(struct work_struct *work)
{
	struct bdi_writeback *wb;
	LIST_HEAD(processed);

	spin_lock_irq(&cgwb_lock);

	while (!list_empty(&offline_cgwbs)) {
		wb = list_first_entry(&offline_cgwbs, struct bdi_writeback,
				      offline_node);
		list_move(&wb->offline_node, &processed);

		/*
		 * If wb is dirty, cleaning up the writeback by switching
		 * attached inodes will result in an effective removal of any
		 * bandwidth restrictions, which isn't the goal. Instead,
		 * it can be postponed until the next time, when all io
		 * will be likely completed. If in the meantime some inodes
		 * will get re-dirtied, they should be eventually switched to
		 * a new cgwb.
		 */
		if (wb_has_dirty_io(wb))
			continue;

		if (!wb_tryget(wb))
			continue;

		spin_unlock_irq(&cgwb_lock);
		while (cleanup_offline_cgwb(wb))
			cond_resched();
		spin_lock_irq(&cgwb_lock);

		wb_put(wb);
	}

	if (!list_empty(&processed))
		list_splice_tail(&processed, &offline_cgwbs);

	spin_unlock_irq(&cgwb_lock);
}

/**
 * wb_memcg_offline - kill all wb's associated with a memcg being offlined
 * @memcg: memcg being offlined
 *
 * Also prevents creation of any new wb's associated with @memcg.
 */
void wb_memcg_offline(struct mem_cgroup *memcg)
{
	struct list_head *memcg_cgwb_list = &memcg->cgwb_list;
	struct bdi_writeback *wb, *next;

	spin_lock_irq(&cgwb_lock);
	list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)
		cgwb_kill(wb);
	memcg_cgwb_list->next = NULL;	/* prevent new wb's */
	spin_unlock_irq(&cgwb_lock);

	queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work);
}

/**
 * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
 * @css: blkcg being offlined
 *
 * Also prevents creation of any new wb's associated with @css.
 */
void wb_blkcg_offline(struct cgroup_subsys_state *css)
{
	struct bdi_writeback *wb, *next;
	struct list_head *list = blkcg_get_cgwb_list(css);

	spin_lock_irq(&cgwb_lock);
	list_for_each_entry_safe(wb, next, list, blkcg_node)
		cgwb_kill(wb);
	list->next = NULL;	/* prevent new wb's */
	spin_unlock_irq(&cgwb_lock);
}

static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
	spin_lock_irq(&cgwb_lock);
	list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
	spin_unlock_irq(&cgwb_lock);
}

static int __init cgwb_init(void)
{
	/*
	 * There can be many concurrent release work items overwhelming
	 * system_wq. Put them in a separate wq and limit concurrency.
	 * There's no point in executing many of these in parallel.
	 */
	cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1);
	if (!cgwb_release_wq)
		return -ENOMEM;

	return 0;
}
subsys_initcall(cgwb_init);

#else	/* CONFIG_CGROUP_WRITEBACK */

static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
	return wb_init(&bdi->wb, bdi, GFP_KERNEL);
}

static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { }

static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
	list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
}

static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
{
	list_del_rcu(&wb->bdi_node);
}

#endif	/* CONFIG_CGROUP_WRITEBACK */

int bdi_init(struct backing_dev_info *bdi)
{
	bdi->dev = NULL;

	kref_init(&bdi->refcnt);
	bdi->min_ratio = 0;
	bdi->max_ratio = 100 * BDI_RATIO_SCALE;
	bdi->max_prop_frac = FPROP_FRAC_BASE;
	INIT_LIST_HEAD(&bdi->bdi_list);
	INIT_LIST_HEAD(&bdi->wb_list);
	init_waitqueue_head(&bdi->wb_waitq);
	bdi->last_bdp_sleep = jiffies;

	return cgwb_bdi_init(bdi);
}

struct backing_dev_info *bdi_alloc(int node_id)
{
	struct backing_dev_info *bdi;

	bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id);
	if (!bdi)
		return NULL;

	if (bdi_init(bdi)) {
		kfree(bdi);
		return NULL;
	}
	bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT;
	bdi->ra_pages = VM_READAHEAD_PAGES;
	bdi->io_pages = VM_READAHEAD_PAGES;
	timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0);
	return bdi;
}
EXPORT_SYMBOL(bdi_alloc);
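
/*
 * Typical driver-side lifecycle (sketch; the block layer does roughly this
 * when adding a gendisk, using "%u:%u" major:minor as the name):
 *
 *	bdi = bdi_alloc(NUMA_NO_NODE);
 *	if (!bdi)
 *		return -ENOMEM;
 *	ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt));
 *	...
 *	bdi_unregister(bdi);
 *	bdi_put(bdi);
 */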

static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp)
{
	struct rb_node **p = &bdi_tree.rb_node;
	struct rb_node *parent = NULL;
	struct backing_dev_info *bdi;

	lockdep_assert_held(&bdi_lock);

	while (*p) {
		parent = *p;
		bdi = rb_entry(parent, struct backing_dev_info, rb_node);

		if (bdi->id > id)
			p = &(*p)->rb_left;
		else if (bdi->id < id)
			p = &(*p)->rb_right;
		else
			break;
	}

	if (parentp)
		*parentp = parent;
	return p;
}

/**
 * bdi_get_by_id - lookup and get bdi from its id
 * @id: bdi id to lookup
 *
 * Find bdi matching @id and get it. Returns NULL if the matching bdi
 * doesn't exist or is already unregistered.
 */
struct backing_dev_info *bdi_get_by_id(u64 id)
{
	struct backing_dev_info *bdi = NULL;
	struct rb_node **p;

	spin_lock_bh(&bdi_lock);
	p = bdi_lookup_rb_node(id, NULL);
	if (*p) {
		bdi = rb_entry(*p, struct backing_dev_info, rb_node);
		bdi_get(bdi);
	}
	spin_unlock_bh(&bdi_lock);

	return bdi;
}

int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
{
	struct device *dev;
	struct rb_node *parent, **p;

	if (bdi->dev)	/* The driver needs to use separate queues per device */
		return 0;

	vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args);
	dev = device_create(&bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name);
	if (IS_ERR(dev))
		return PTR_ERR(dev);

	cgwb_bdi_register(bdi);
	bdi->dev = dev;

	bdi_debug_register(bdi, dev_name(dev));
	set_bit(WB_registered, &bdi->wb.state);

	spin_lock_bh(&bdi_lock);

	bdi->id = ++bdi_id_cursor;

	p = bdi_lookup_rb_node(bdi->id, &parent);
	rb_link_node(&bdi->rb_node, parent, p);
	rb_insert_color(&bdi->rb_node, &bdi_tree);

	list_add_tail_rcu(&bdi->bdi_list, &bdi_list);

	spin_unlock_bh(&bdi_lock);

	trace_writeback_bdi_register(bdi);
	return 0;
}

int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
{
	va_list args;
	int ret;

	va_start(args, fmt);
	ret = bdi_register_va(bdi, fmt, args);
	va_end(args);
	return ret;
}
EXPORT_SYMBOL(bdi_register);

void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner)
{
	WARN_ON_ONCE(bdi->owner);
	bdi->owner = owner;
	get_device(owner);
}

/*
 * Remove bdi from bdi_list, and ensure that it is no longer visible
 */
static void bdi_remove_from_list(struct backing_dev_info *bdi)
{
	spin_lock_bh(&bdi_lock);
	rb_erase(&bdi->rb_node, &bdi_tree);
	list_del_rcu(&bdi->bdi_list);
	spin_unlock_bh(&bdi_lock);

	synchronize_rcu_expedited();
}

void bdi_unregister(struct backing_dev_info *bdi)
{
	timer_delete_sync(&bdi->laptop_mode_wb_timer);

	/* make sure nobody finds us on the bdi_list anymore */
	bdi_remove_from_list(bdi);
	wb_shutdown(&bdi->wb);
	cgwb_bdi_unregister(bdi);

	/*
	 * If this BDI's min ratio has been set, use bdi_set_min_ratio() to
	 * update the global bdi_min_ratio.
	 */
	if (bdi->min_ratio)
		bdi_set_min_ratio(bdi, 0);

	if (bdi->dev) {
		bdi_debug_unregister(bdi);
		device_unregister(bdi->dev);
		bdi->dev = NULL;
	}

	if (bdi->owner) {
		put_device(bdi->owner);
		bdi->owner = NULL;
	}
}
EXPORT_SYMBOL(bdi_unregister);

static void release_bdi(struct kref *ref)
{
	struct backing_dev_info *bdi =
		container_of(ref, struct backing_dev_info, refcnt);

	WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state));
	WARN_ON_ONCE(bdi->dev);
	wb_exit(&bdi->wb);
	kfree(bdi);
}

void bdi_put(struct backing_dev_info *bdi)
{
	kref_put(&bdi->refcnt, release_bdi);
}
EXPORT_SYMBOL(bdi_put);

struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
	struct super_block *sb;

	if (!inode)
		return &noop_backing_dev_info;

	sb = inode->i_sb;
#ifdef CONFIG_BLOCK
	if (sb_is_blkdev_sb(sb))
		return I_BDEV(inode)->bd_disk->bdi;
#endif
	return sb->s_bdi;
}
EXPORT_SYMBOL(inode_to_bdi);

const char *bdi_dev_name(struct backing_dev_info *bdi)
{
	if (!bdi || !bdi->dev)
		return bdi_unknown_name;
	return bdi->dev_name;
}
EXPORT_SYMBOL_GPL(bdi_dev_name);