GitHub Repository: torvalds/linux
Path: blob/master/fs/btrfs/compression.c
1
// SPDX-License-Identifier: GPL-2.0
2
/*
3
* Copyright (C) 2008 Oracle. All rights reserved.
4
*/
5
6
#include <linux/kernel.h>
7
#include <linux/bio.h>
8
#include <linux/file.h>
9
#include <linux/fs.h>
10
#include <linux/pagemap.h>
11
#include <linux/pagevec.h>
12
#include <linux/highmem.h>
13
#include <linux/kthread.h>
14
#include <linux/time.h>
15
#include <linux/init.h>
16
#include <linux/string.h>
17
#include <linux/backing-dev.h>
18
#include <linux/writeback.h>
19
#include <linux/psi.h>
20
#include <linux/slab.h>
21
#include <linux/sched/mm.h>
22
#include <linux/log2.h>
23
#include <linux/shrinker.h>
24
#include <crypto/hash.h>
25
#include "misc.h"
26
#include "ctree.h"
27
#include "fs.h"
28
#include "btrfs_inode.h"
29
#include "bio.h"
30
#include "ordered-data.h"
31
#include "compression.h"
32
#include "extent_io.h"
33
#include "extent_map.h"
34
#include "subpage.h"
35
#include "messages.h"
36
#include "super.h"
37
38
static struct bio_set btrfs_compressed_bioset;
39
40
static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
41
42
const char* btrfs_compress_type2str(enum btrfs_compression_type type)
43
{
44
switch (type) {
45
case BTRFS_COMPRESS_ZLIB:
46
case BTRFS_COMPRESS_LZO:
47
case BTRFS_COMPRESS_ZSTD:
48
case BTRFS_COMPRESS_NONE:
49
return btrfs_compress_types[type];
50
default:
51
break;
52
}
53
54
return NULL;
55
}
56
57
static inline struct compressed_bio *to_compressed_bio(struct btrfs_bio *bbio)
58
{
59
return container_of(bbio, struct compressed_bio, bbio);
60
}
61
62
static struct compressed_bio *alloc_compressed_bio(struct btrfs_inode *inode,
63
u64 start, blk_opf_t op,
64
btrfs_bio_end_io_t end_io)
65
{
66
struct btrfs_bio *bbio;
67
68
bbio = btrfs_bio(bio_alloc_bioset(NULL, BTRFS_MAX_COMPRESSED_PAGES, op,
69
GFP_NOFS, &btrfs_compressed_bioset));
70
btrfs_bio_init(bbio, inode->root->fs_info, end_io, NULL);
71
bbio->inode = inode;
72
bbio->file_offset = start;
73
return to_compressed_bio(bbio);
74
}
75
76
bool btrfs_compress_is_valid_type(const char *str, size_t len)
77
{
78
int i;
79
80
for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) {
81
size_t comp_len = strlen(btrfs_compress_types[i]);
82
83
if (len < comp_len)
84
continue;
85
86
if (!strncmp(btrfs_compress_types[i], str, comp_len))
87
return true;
88
}
89
return false;
90
}
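
/*
 * Illustrative only: the comparison is a prefix match of strlen(name) bytes,
 * so a value carrying a level suffix is still recognized, e.g.:
 *
 *    btrfs_compress_is_valid_type("zstd:3", 6)  -> true ("zstd" prefix)
 *    btrfs_compress_is_valid_type("lz", 2)      -> false (too short)
 */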
91
92
static int compression_compress_pages(int type, struct list_head *ws,
93
struct btrfs_inode *inode, u64 start,
94
struct folio **folios, unsigned long *out_folios,
95
unsigned long *total_in, unsigned long *total_out)
96
{
97
switch (type) {
98
case BTRFS_COMPRESS_ZLIB:
99
return zlib_compress_folios(ws, inode, start, folios,
100
out_folios, total_in, total_out);
101
case BTRFS_COMPRESS_LZO:
102
return lzo_compress_folios(ws, inode, start, folios,
103
out_folios, total_in, total_out);
104
case BTRFS_COMPRESS_ZSTD:
105
return zstd_compress_folios(ws, inode, start, folios,
106
out_folios, total_in, total_out);
107
case BTRFS_COMPRESS_NONE:
108
default:
109
/*
110
* This can happen when compression races with remount setting
111
* it to 'no compress', while caller doesn't call
112
* inode_need_compress() to check if we really need to
113
* compress.
114
*
115
* Not a big deal, just need to inform caller that we
116
* haven't allocated any pages yet.
117
*/
118
*out_folios = 0;
119
return -E2BIG;
120
}
121
}
122
123
static int compression_decompress_bio(struct list_head *ws,
124
struct compressed_bio *cb)
125
{
126
switch (cb->compress_type) {
127
case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb);
128
case BTRFS_COMPRESS_LZO: return lzo_decompress_bio(ws, cb);
129
case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb);
130
case BTRFS_COMPRESS_NONE:
131
default:
132
/*
133
* This can't happen, the type is validated several times
134
* before we get here.
135
*/
136
BUG();
137
}
138
}
139
140
static int compression_decompress(int type, struct list_head *ws,
141
const u8 *data_in, struct folio *dest_folio,
142
unsigned long dest_pgoff, size_t srclen, size_t destlen)
143
{
144
switch (type) {
145
case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_folio,
146
dest_pgoff, srclen, destlen);
147
case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_folio,
148
dest_pgoff, srclen, destlen);
149
case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_folio,
150
dest_pgoff, srclen, destlen);
151
case BTRFS_COMPRESS_NONE:
152
default:
153
/*
154
* This can't happen, the type is validated several times
155
* before we get here.
156
*/
157
BUG();
158
}
159
}
160
161
static void btrfs_free_compressed_folios(struct compressed_bio *cb)
162
{
163
for (unsigned int i = 0; i < cb->nr_folios; i++)
164
btrfs_free_compr_folio(cb->compressed_folios[i]);
165
kfree(cb->compressed_folios);
166
}
167
168
static int btrfs_decompress_bio(struct compressed_bio *cb);
169
170
/*
171
* Global cache of last unused pages for compression/decompression.
172
*/
173
static struct btrfs_compr_pool {
174
struct shrinker *shrinker;
175
spinlock_t lock;
176
struct list_head list;
177
int count;
178
int thresh;
179
} compr_pool;
180
181
static unsigned long btrfs_compr_pool_count(struct shrinker *sh, struct shrink_control *sc)
182
{
183
int ret;
184
185
/*
186
* We must not read the values more than once if 'ret' gets expanded in
187
* the return statement so we don't accidentally return a negative
188
* number, even if the first condition finds it positive.
189
*/
190
ret = READ_ONCE(compr_pool.count) - READ_ONCE(compr_pool.thresh);
191
192
return ret > 0 ? ret : 0;
193
}
194
195
static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_control *sc)
196
{
197
struct list_head remove;
198
struct list_head *tmp, *next;
199
int freed;
200
201
if (compr_pool.count == 0)
202
return SHRINK_STOP;
203
204
INIT_LIST_HEAD(&remove);
205
206
/* For now, just simply drain the whole list. */
207
spin_lock(&compr_pool.lock);
208
list_splice_init(&compr_pool.list, &remove);
209
freed = compr_pool.count;
210
compr_pool.count = 0;
211
spin_unlock(&compr_pool.lock);
212
213
list_for_each_safe(tmp, next, &remove) {
214
struct page *page = list_entry(tmp, struct page, lru);
215
216
ASSERT(page_ref_count(page) == 1);
217
put_page(page);
218
}
219
220
return freed;
221
}
222
223
/*
224
* Common wrappers for page allocation from compression wrappers
225
*/
226
struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info)
227
{
228
struct folio *folio = NULL;
229
230
/* For bs > ps cases, no cached folio pool for now. */
231
if (fs_info->block_min_order)
232
goto alloc;
233
234
spin_lock(&compr_pool.lock);
235
if (compr_pool.count > 0) {
236
folio = list_first_entry(&compr_pool.list, struct folio, lru);
237
list_del_init(&folio->lru);
238
compr_pool.count--;
239
}
240
spin_unlock(&compr_pool.lock);
241
242
if (folio)
243
return folio;
244
245
alloc:
246
return folio_alloc(GFP_NOFS, fs_info->block_min_order);
247
}
248
249
void btrfs_free_compr_folio(struct folio *folio)
250
{
251
bool do_free = false;
252
253
/* The folio is from bs > ps fs, no cached pool for now. */
254
if (folio_order(folio))
255
goto free;
256
257
spin_lock(&compr_pool.lock);
258
if (compr_pool.count > compr_pool.thresh) {
259
do_free = true;
260
} else {
261
list_add(&folio->lru, &compr_pool.list);
262
compr_pool.count++;
263
}
264
spin_unlock(&compr_pool.lock);
265
266
if (!do_free)
267
return;
268
269
free:
270
ASSERT(folio_ref_count(folio) == 1);
271
folio_put(folio);
272
}
273
274
static void end_bbio_compressed_read(struct btrfs_bio *bbio)
275
{
276
struct compressed_bio *cb = to_compressed_bio(bbio);
277
blk_status_t status = bbio->bio.bi_status;
278
279
if (!status)
280
status = errno_to_blk_status(btrfs_decompress_bio(cb));
281
282
btrfs_free_compressed_folios(cb);
283
btrfs_bio_end_io(cb->orig_bbio, status);
284
bio_put(&bbio->bio);
285
}
286
287
/*
288
* Clear the writeback bits on all of the file
289
* pages for a compressed write
290
*/
291
static noinline void end_compressed_writeback(const struct compressed_bio *cb)
292
{
293
struct inode *inode = &cb->bbio.inode->vfs_inode;
294
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
295
pgoff_t index = cb->start >> PAGE_SHIFT;
296
const pgoff_t end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
297
struct folio_batch fbatch;
298
int i;
299
int ret;
300
301
ret = blk_status_to_errno(cb->bbio.bio.bi_status);
302
if (ret)
303
mapping_set_error(inode->i_mapping, ret);
304
305
folio_batch_init(&fbatch);
306
while (index <= end_index) {
307
ret = filemap_get_folios(inode->i_mapping, &index, end_index,
308
&fbatch);
309
310
if (ret == 0)
311
return;
312
313
for (i = 0; i < ret; i++) {
314
struct folio *folio = fbatch.folios[i];
315
316
btrfs_folio_clamp_clear_writeback(fs_info, folio,
317
cb->start, cb->len);
318
}
319
folio_batch_release(&fbatch);
320
}
321
/* the inode may be gone now */
322
}
323
324
static void btrfs_finish_compressed_write_work(struct work_struct *work)
325
{
326
struct compressed_bio *cb =
327
container_of(work, struct compressed_bio, write_end_work);
328
329
btrfs_finish_ordered_extent(cb->bbio.ordered, NULL, cb->start, cb->len,
330
cb->bbio.bio.bi_status == BLK_STS_OK);
331
332
if (cb->writeback)
333
end_compressed_writeback(cb);
334
/* Note, our inode could be gone now */
335
336
btrfs_free_compressed_folios(cb);
337
bio_put(&cb->bbio.bio);
338
}
339
340
/*
341
* Do the cleanup once all the compressed pages hit the disk. This will clear
342
* writeback on the file pages and free the compressed pages.
343
*
344
* This also calls the writeback end hooks for the file pages so that metadata
345
* and checksums can be updated in the file.
346
*/
347
static void end_bbio_compressed_write(struct btrfs_bio *bbio)
348
{
349
struct compressed_bio *cb = to_compressed_bio(bbio);
350
struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
351
352
queue_work(fs_info->compressed_write_workers, &cb->write_end_work);
353
}
354
355
static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb)
356
{
357
struct btrfs_fs_info *fs_info = cb->bbio.fs_info;
358
struct bio *bio = &cb->bbio.bio;
359
u32 offset = 0;
360
361
while (offset < cb->compressed_len) {
362
struct folio *folio;
363
int ret;
364
u32 len = min_t(u32, cb->compressed_len - offset,
365
btrfs_min_folio_size(fs_info));
366
367
folio = cb->compressed_folios[offset >> (PAGE_SHIFT + fs_info->block_min_order)];
368
/* Maximum compressed extent is smaller than bio size limit. */
369
ret = bio_add_folio(bio, folio, len, 0);
370
ASSERT(ret);
371
offset += len;
372
}
373
}
374
375
/*
376
* worker function to build and submit bios for previously compressed pages.
377
* The corresponding pages in the inode should be marked for writeback
378
* and the compressed pages should have a reference on them for dropping
379
* when the IO is complete.
380
*
381
* This also checksums the file bytes and gets things ready for
382
* the end io hooks.
383
*/
384
void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
385
struct folio **compressed_folios,
386
unsigned int nr_folios,
387
blk_opf_t write_flags,
388
bool writeback)
389
{
390
struct btrfs_inode *inode = ordered->inode;
391
struct btrfs_fs_info *fs_info = inode->root->fs_info;
392
struct compressed_bio *cb;
393
394
ASSERT(IS_ALIGNED(ordered->file_offset, fs_info->sectorsize));
395
ASSERT(IS_ALIGNED(ordered->num_bytes, fs_info->sectorsize));
396
397
cb = alloc_compressed_bio(inode, ordered->file_offset,
398
REQ_OP_WRITE | write_flags,
399
end_bbio_compressed_write);
400
cb->start = ordered->file_offset;
401
cb->len = ordered->num_bytes;
402
cb->compressed_folios = compressed_folios;
403
cb->compressed_len = ordered->disk_num_bytes;
404
cb->writeback = writeback;
405
INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work);
406
cb->nr_folios = nr_folios;
407
cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT;
408
cb->bbio.ordered = ordered;
409
btrfs_add_compressed_bio_folios(cb);
410
411
btrfs_submit_bbio(&cb->bbio, 0);
412
}
413
414
/*
415
* Add extra pages in the same compressed file extent so that we don't need to
416
* re-read the same extent again and again.
417
*
418
* NOTE: this won't work well for subpage, as for subpage read, we lock the
419
* full page then submit a bio for each compressed/regular extent.
420
*
421
* This means that if several sectors in the same page point to the same
422
* on-disk compressed data, we will re-read the same extent many times and
423
* this function can only help for the next page.
424
*/
425
static noinline int add_ra_bio_pages(struct inode *inode,
426
u64 compressed_end,
427
struct compressed_bio *cb,
428
int *memstall, unsigned long *pflags)
429
{
430
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
431
pgoff_t end_index;
432
struct bio *orig_bio = &cb->orig_bbio->bio;
433
u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size;
434
u64 isize = i_size_read(inode);
435
int ret;
436
struct folio *folio;
437
struct extent_map *em;
438
struct address_space *mapping = inode->i_mapping;
439
struct extent_map_tree *em_tree;
440
struct extent_io_tree *tree;
441
int sectors_missed = 0;
442
443
em_tree = &BTRFS_I(inode)->extent_tree;
444
tree = &BTRFS_I(inode)->io_tree;
445
446
if (isize == 0)
447
return 0;
448
449
/*
450
* For current subpage support, we only support 64K page size,
451
* which means maximum compressed extent size (128K) is just 2x page
452
* size.
453
* This makes readahead less effective, so here disable readahead for
454
* subpage for now, until full compressed write is supported.
455
*/
456
if (fs_info->sectorsize < PAGE_SIZE)
457
return 0;
458
459
/* For bs > ps cases, we don't support readahead for compressed folios for now. */
460
if (fs_info->block_min_order)
461
return 0;
462
463
end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
464
465
while (cur < compressed_end) {
466
pgoff_t page_end;
467
pgoff_t pg_index = cur >> PAGE_SHIFT;
468
u32 add_size;
469
470
if (pg_index > end_index)
471
break;
472
473
folio = filemap_get_folio(mapping, pg_index);
474
if (!IS_ERR(folio)) {
475
u64 folio_sz = folio_size(folio);
476
u64 offset = offset_in_folio(folio, cur);
477
478
folio_put(folio);
479
sectors_missed += (folio_sz - offset) >>
480
fs_info->sectorsize_bits;
481
482
/* Beyond threshold, no need to continue */
483
if (sectors_missed > 4)
484
break;
485
486
/*
487
* Jump to next page start as we already have page for
488
* current offset.
489
*/
490
cur += (folio_sz - offset);
491
continue;
492
}
493
494
folio = filemap_alloc_folio(mapping_gfp_constraint(mapping,
495
~__GFP_FS), 0);
496
if (!folio)
497
break;
498
499
if (filemap_add_folio(mapping, folio, pg_index, GFP_NOFS)) {
500
/* There is already a page, skip to page end */
501
cur += folio_size(folio);
502
folio_put(folio);
503
continue;
504
}
505
506
if (!*memstall && folio_test_workingset(folio)) {
507
psi_memstall_enter(pflags);
508
*memstall = 1;
509
}
510
511
ret = set_folio_extent_mapped(folio);
512
if (ret < 0) {
513
folio_unlock(folio);
514
folio_put(folio);
515
break;
516
}
517
518
page_end = (pg_index << PAGE_SHIFT) + folio_size(folio) - 1;
519
btrfs_lock_extent(tree, cur, page_end, NULL);
520
read_lock(&em_tree->lock);
521
em = btrfs_lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
522
read_unlock(&em_tree->lock);
523
524
/*
525
* At this point, we have a locked page in the page cache for
526
* these bytes in the file. But, we have to make sure they map
527
* to this compressed extent on disk.
528
*/
529
if (!em || cur < em->start ||
530
(cur + fs_info->sectorsize > btrfs_extent_map_end(em)) ||
531
(btrfs_extent_map_block_start(em) >> SECTOR_SHIFT) !=
532
orig_bio->bi_iter.bi_sector) {
533
btrfs_free_extent_map(em);
534
btrfs_unlock_extent(tree, cur, page_end, NULL);
535
folio_unlock(folio);
536
folio_put(folio);
537
break;
538
}
539
add_size = min(em->start + em->len, page_end + 1) - cur;
540
btrfs_free_extent_map(em);
541
btrfs_unlock_extent(tree, cur, page_end, NULL);
542
543
if (folio_contains(folio, end_index)) {
544
size_t zero_offset = offset_in_folio(folio, isize);
545
546
if (zero_offset) {
547
int zeros;
548
zeros = folio_size(folio) - zero_offset;
549
folio_zero_range(folio, zero_offset, zeros);
550
}
551
}
552
553
if (!bio_add_folio(orig_bio, folio, add_size,
554
offset_in_folio(folio, cur))) {
555
folio_unlock(folio);
556
folio_put(folio);
557
break;
558
}
559
/*
560
* If it's subpage, we also need to increase its
561
* subpage::readers number, as at endio we will decrease
562
* subpage::readers and unlock the page.
563
*/
564
if (fs_info->sectorsize < PAGE_SIZE)
565
btrfs_folio_set_lock(fs_info, folio, cur, add_size);
566
folio_put(folio);
567
cur += add_size;
568
}
569
return 0;
570
}
571
572
/*
573
* for a compressed read, the bio we get passed has all the inode pages
574
* in it. We don't actually do IO on those pages but allocate new ones
575
* to hold the compressed pages on disk.
576
*
577
* bio->bi_iter.bi_sector points to the compressed extent on disk
578
* bio->bi_io_vec points to all of the inode pages
579
*
580
* After the compressed pages are read, we copy the bytes into the
581
* bio we were passed and then call the bio end_io handlers
582
*/
583
void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
584
{
585
struct btrfs_inode *inode = bbio->inode;
586
struct btrfs_fs_info *fs_info = inode->root->fs_info;
587
struct extent_map_tree *em_tree = &inode->extent_tree;
588
struct compressed_bio *cb;
589
unsigned int compressed_len;
590
u64 file_offset = bbio->file_offset;
591
u64 em_len;
592
u64 em_start;
593
struct extent_map *em;
594
unsigned long pflags;
595
int memstall = 0;
596
blk_status_t status;
597
int ret;
598
599
/* we need the actual starting offset of this extent in the file */
600
read_lock(&em_tree->lock);
601
em = btrfs_lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
602
read_unlock(&em_tree->lock);
603
if (!em) {
604
status = BLK_STS_IOERR;
605
goto out;
606
}
607
608
ASSERT(btrfs_extent_map_is_compressed(em));
609
compressed_len = em->disk_num_bytes;
610
611
cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ,
612
end_bbio_compressed_read);
613
614
cb->start = em->start - em->offset;
615
em_len = em->len;
616
em_start = em->start;
617
618
cb->len = bbio->bio.bi_iter.bi_size;
619
cb->compressed_len = compressed_len;
620
cb->compress_type = btrfs_extent_map_compression(em);
621
cb->orig_bbio = bbio;
622
cb->bbio.csum_search_commit_root = bbio->csum_search_commit_root;
623
624
btrfs_free_extent_map(em);
625
626
cb->nr_folios = DIV_ROUND_UP(compressed_len, btrfs_min_folio_size(fs_info));
627
cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct folio *), GFP_NOFS);
628
if (!cb->compressed_folios) {
629
status = BLK_STS_RESOURCE;
630
goto out_free_bio;
631
}
632
633
ret = btrfs_alloc_folio_array(cb->nr_folios, fs_info->block_min_order,
634
cb->compressed_folios);
635
if (ret) {
636
status = BLK_STS_RESOURCE;
637
goto out_free_compressed_pages;
638
}
639
640
add_ra_bio_pages(&inode->vfs_inode, em_start + em_len, cb, &memstall,
641
&pflags);
642
643
/* Include any pages we added in add_ra_bio_pages. */
644
cb->len = bbio->bio.bi_iter.bi_size;
645
cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector;
646
btrfs_add_compressed_bio_folios(cb);
647
648
if (memstall)
649
psi_memstall_leave(&pflags);
650
651
btrfs_submit_bbio(&cb->bbio, 0);
652
return;
653
654
out_free_compressed_pages:
655
kfree(cb->compressed_folios);
656
out_free_bio:
657
bio_put(&cb->bbio.bio);
658
out:
659
btrfs_bio_end_io(bbio, status);
660
}
661
662
/*
663
* Heuristic uses systematic sampling to collect data from the input data
664
* range, the logic can be tuned by the following constants:
665
*
666
* @SAMPLING_READ_SIZE - how many bytes will be copied from for each sample
667
* @SAMPLING_INTERVAL - range from which the sampled data can be collected
668
*/
669
#define SAMPLING_READ_SIZE (16)
670
#define SAMPLING_INTERVAL (256)
671
672
/*
673
* For statistical analysis of the input data we consider bytes that form a
674
* Galois Field of 256 objects. Each object has an attribute count, ie. how
675
* many times the object appeared in the sample.
676
*/
677
#define BUCKET_SIZE (256)
678
679
/*
680
* The size of the sample is based on a statistical sampling rule of thumb.
681
* The common way is to perform sampling tests as long as the number of
682
* elements in each cell is at least 5.
683
*
684
* Instead of 5, we choose 32 to obtain more accurate results.
685
* If the data contain the maximum number of symbols, which is 256, we obtain a
686
* sample size bound by 8192.
687
*
688
* For a sample of at most 8KB of data per data range: 16 consecutive bytes
689
* from up to 512 locations.
690
*/
691
#define MAX_SAMPLE_SIZE (BTRFS_MAX_UNCOMPRESSED * \
692
SAMPLING_READ_SIZE / SAMPLING_INTERVAL)
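
/*
 * A quick check of the bound mentioned above, assuming the 128KiB
 * (BTRFS_MAX_UNCOMPRESSED) chunking described later in this file:
 *
 *    128KiB / SAMPLING_INTERVAL = 512 sample locations
 *    512 * SAMPLING_READ_SIZE   = 8192 bytes of sample data
 */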
693
694
struct bucket_item {
695
u32 count;
696
};
697
698
struct heuristic_ws {
699
/* Partial copy of input data */
700
u8 *sample;
701
u32 sample_size;
702
/* Buckets store counters for each byte value */
703
struct bucket_item *bucket;
704
/* Sorting buffer */
705
struct bucket_item *bucket_b;
706
struct list_head list;
707
};
708
709
static void free_heuristic_ws(struct list_head *ws)
710
{
711
struct heuristic_ws *workspace;
712
713
workspace = list_entry(ws, struct heuristic_ws, list);
714
715
kvfree(workspace->sample);
716
kfree(workspace->bucket);
717
kfree(workspace->bucket_b);
718
kfree(workspace);
719
}
720
721
static struct list_head *alloc_heuristic_ws(struct btrfs_fs_info *fs_info)
722
{
723
struct heuristic_ws *ws;
724
725
ws = kzalloc(sizeof(*ws), GFP_KERNEL);
726
if (!ws)
727
return ERR_PTR(-ENOMEM);
728
729
ws->sample = kvmalloc(MAX_SAMPLE_SIZE, GFP_KERNEL);
730
if (!ws->sample)
731
goto fail;
732
733
ws->bucket = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket), GFP_KERNEL);
734
if (!ws->bucket)
735
goto fail;
736
737
ws->bucket_b = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket_b), GFP_KERNEL);
738
if (!ws->bucket_b)
739
goto fail;
740
741
INIT_LIST_HEAD(&ws->list);
742
return &ws->list;
743
fail:
744
free_heuristic_ws(&ws->list);
745
return ERR_PTR(-ENOMEM);
746
}
747
748
const struct btrfs_compress_levels btrfs_heuristic_compress = { 0 };
749
750
static const struct btrfs_compress_levels * const btrfs_compress_levels[] = {
751
/* The heuristic is represented as compression type 0 */
752
&btrfs_heuristic_compress,
753
&btrfs_zlib_compress,
754
&btrfs_lzo_compress,
755
&btrfs_zstd_compress,
756
};
757
758
static struct list_head *alloc_workspace(struct btrfs_fs_info *fs_info, int type, int level)
759
{
760
switch (type) {
761
case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(fs_info);
762
case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(fs_info, level);
763
case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(fs_info);
764
case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(fs_info, level);
765
default:
766
/*
767
* This can't happen, the type is validated several times
768
* before we get here.
769
*/
770
BUG();
771
}
772
}
773
774
static void free_workspace(int type, struct list_head *ws)
775
{
776
switch (type) {
777
case BTRFS_COMPRESS_NONE: return free_heuristic_ws(ws);
778
case BTRFS_COMPRESS_ZLIB: return zlib_free_workspace(ws);
779
case BTRFS_COMPRESS_LZO: return lzo_free_workspace(ws);
780
case BTRFS_COMPRESS_ZSTD: return zstd_free_workspace(ws);
781
default:
782
/*
783
* This can't happen, the type is validated several times
784
* before we get here.
785
*/
786
BUG();
787
}
788
}
789
790
static int alloc_workspace_manager(struct btrfs_fs_info *fs_info,
791
enum btrfs_compression_type type)
792
{
793
struct workspace_manager *gwsm;
794
struct list_head *workspace;
795
796
ASSERT(fs_info->compr_wsm[type] == NULL);
797
gwsm = kzalloc(sizeof(*gwsm), GFP_KERNEL);
798
if (!gwsm)
799
return -ENOMEM;
800
801
INIT_LIST_HEAD(&gwsm->idle_ws);
802
spin_lock_init(&gwsm->ws_lock);
803
atomic_set(&gwsm->total_ws, 0);
804
init_waitqueue_head(&gwsm->ws_wait);
805
fs_info->compr_wsm[type] = gwsm;
806
807
/*
808
* Preallocate one workspace for each compression type so we can
809
* guarantee forward progress in the worst case
810
*/
811
workspace = alloc_workspace(fs_info, type, 0);
812
if (IS_ERR(workspace)) {
813
btrfs_warn(fs_info,
814
"cannot preallocate compression workspace for %s, will try later",
815
btrfs_compress_type2str(type));
816
} else {
817
atomic_set(&gwsm->total_ws, 1);
818
gwsm->free_ws = 1;
819
list_add(workspace, &gwsm->idle_ws);
820
}
821
return 0;
822
}
823
824
static void free_workspace_manager(struct btrfs_fs_info *fs_info,
825
enum btrfs_compression_type type)
826
{
827
struct list_head *ws;
828
struct workspace_manager *gwsm = fs_info->compr_wsm[type];
829
830
/* ZSTD uses its own workspace manager and should never enter here. */
831
ASSERT(type != BTRFS_COMPRESS_ZSTD && type < BTRFS_NR_COMPRESS_TYPES);
832
if (!gwsm)
833
return;
834
fs_info->compr_wsm[type] = NULL;
835
while (!list_empty(&gwsm->idle_ws)) {
836
ws = gwsm->idle_ws.next;
837
list_del(ws);
838
free_workspace(type, ws);
839
atomic_dec(&gwsm->total_ws);
840
}
841
kfree(gwsm);
842
}
843
844
/*
845
* This finds an available workspace or allocates a new one.
846
* If it's not possible to allocate a new one, waits until there's one.
847
* Preallocation provides a forward progress guarantee and we do not return
848
* errors.
849
*/
850
struct list_head *btrfs_get_workspace(struct btrfs_fs_info *fs_info, int type, int level)
851
{
852
struct workspace_manager *wsm = fs_info->compr_wsm[type];
853
struct list_head *workspace;
854
int cpus = num_online_cpus();
855
unsigned nofs_flag;
856
struct list_head *idle_ws;
857
spinlock_t *ws_lock;
858
atomic_t *total_ws;
859
wait_queue_head_t *ws_wait;
860
int *free_ws;
861
862
ASSERT(wsm);
863
idle_ws = &wsm->idle_ws;
864
ws_lock = &wsm->ws_lock;
865
total_ws = &wsm->total_ws;
866
ws_wait = &wsm->ws_wait;
867
free_ws = &wsm->free_ws;
868
869
again:
870
spin_lock(ws_lock);
871
if (!list_empty(idle_ws)) {
872
workspace = idle_ws->next;
873
list_del(workspace);
874
(*free_ws)--;
875
spin_unlock(ws_lock);
876
return workspace;
877
878
}
879
if (atomic_read(total_ws) > cpus) {
880
DEFINE_WAIT(wait);
881
882
spin_unlock(ws_lock);
883
prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE);
884
if (atomic_read(total_ws) > cpus && !*free_ws)
885
schedule();
886
finish_wait(ws_wait, &wait);
887
goto again;
888
}
889
atomic_inc(total_ws);
890
spin_unlock(ws_lock);
891
892
/*
893
* Allocation helpers call vmalloc that can't use GFP_NOFS, so we have
894
* to turn it off here because we might get called from the restricted
895
* context of btrfs_compress_bio/btrfs_compress_pages
896
*/
897
nofs_flag = memalloc_nofs_save();
898
workspace = alloc_workspace(fs_info, type, level);
899
memalloc_nofs_restore(nofs_flag);
900
901
if (IS_ERR(workspace)) {
902
atomic_dec(total_ws);
903
wake_up(ws_wait);
904
905
/*
906
* Do not return the error but go back to waiting. There's a
907
* workspace preallocated for each type and the compression
908
* time is bounded so we get to a workspace eventually. This
909
* makes our caller's life easier.
910
*
911
* To prevent silent and low-probability deadlocks (when the
912
* initial preallocation fails), check if there are any
913
* workspaces at all.
914
*/
915
if (atomic_read(total_ws) == 0) {
916
static DEFINE_RATELIMIT_STATE(_rs,
917
/* once per minute */ 60 * HZ,
918
/* no burst */ 1);
919
920
if (__ratelimit(&_rs))
921
btrfs_warn(fs_info,
922
"no compression workspaces, low memory, retrying");
923
}
924
goto again;
925
}
926
return workspace;
927
}
928
929
static struct list_head *get_workspace(struct btrfs_fs_info *fs_info, int type, int level)
930
{
931
switch (type) {
932
case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(fs_info, type, level);
933
case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(fs_info, level);
934
case BTRFS_COMPRESS_LZO: return btrfs_get_workspace(fs_info, type, level);
935
case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(fs_info, level);
936
default:
937
/*
938
* This can't happen, the type is validated several times
939
* before we get here.
940
*/
941
BUG();
942
}
943
}
944
945
/*
946
* put a workspace struct back on the list or free it if we have enough
947
* idle ones sitting around
948
*/
949
void btrfs_put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws)
950
{
951
struct workspace_manager *gwsm = fs_info->compr_wsm[type];
952
struct list_head *idle_ws;
953
spinlock_t *ws_lock;
954
atomic_t *total_ws;
955
wait_queue_head_t *ws_wait;
956
int *free_ws;
957
958
ASSERT(gwsm);
959
idle_ws = &gwsm->idle_ws;
960
ws_lock = &gwsm->ws_lock;
961
total_ws = &gwsm->total_ws;
962
ws_wait = &gwsm->ws_wait;
963
free_ws = &gwsm->free_ws;
964
965
spin_lock(ws_lock);
966
if (*free_ws <= num_online_cpus()) {
967
list_add(ws, idle_ws);
968
(*free_ws)++;
969
spin_unlock(ws_lock);
970
goto wake;
971
}
972
spin_unlock(ws_lock);
973
974
free_workspace(type, ws);
975
atomic_dec(total_ws);
976
wake:
977
cond_wake_up(ws_wait);
978
}
979
980
static void put_workspace(struct btrfs_fs_info *fs_info, int type, struct list_head *ws)
981
{
982
switch (type) {
983
case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(fs_info, type, ws);
984
case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(fs_info, type, ws);
985
case BTRFS_COMPRESS_LZO: return btrfs_put_workspace(fs_info, type, ws);
986
case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(fs_info, ws);
987
default:
988
/*
989
* This can't happen, the type is validated several times
990
* before we get here.
991
*/
992
BUG();
993
}
994
}
995
996
/*
997
* Adjust @level according to the limits of the compression algorithm or
998
* fall back to the default.
999
*/
1000
static int btrfs_compress_set_level(unsigned int type, int level)
1001
{
1002
const struct btrfs_compress_levels *levels = btrfs_compress_levels[type];
1003
1004
if (level == 0)
1005
level = levels->default_level;
1006
else
1007
level = clamp(level, levels->min_level, levels->max_level);
1008
1009
return level;
1010
}
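
/*
 * For example (ranges are illustrative, the real ones come from the
 * per-algorithm btrfs_compress_levels tables): if an algorithm supported
 * levels 1..9 with default 3, then
 *
 *    btrfs_compress_set_level(type, 0)  -> 3    (default)
 *    btrfs_compress_set_level(type, 42) -> 9    (clamped to max)
 */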
1011
1012
/*
1013
* Check whether the @level is within the valid range for the given type.
1014
*/
1015
bool btrfs_compress_level_valid(unsigned int type, int level)
1016
{
1017
const struct btrfs_compress_levels *levels = btrfs_compress_levels[type];
1018
1019
return levels->min_level <= level && level <= levels->max_level;
1020
}
1021
1022
/* Wrapper around find_get_page(), with extra error message. */
1023
int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start,
1024
struct folio **in_folio_ret)
1025
{
1026
struct folio *in_folio;
1027
1028
/*
1029
* The compressed write path should have the folio locked already, thus
1030
* we only need to grab one reference.
1031
*/
1032
in_folio = filemap_get_folio(mapping, start >> PAGE_SHIFT);
1033
if (IS_ERR(in_folio)) {
1034
struct btrfs_inode *inode = BTRFS_I(mapping->host);
1035
1036
btrfs_crit(inode->root->fs_info,
1037
"failed to get page cache, root %lld ino %llu file offset %llu",
1038
btrfs_root_id(inode->root), btrfs_ino(inode), start);
1039
return -ENOENT;
1040
}
1041
*in_folio_ret = in_folio;
1042
return 0;
1043
}
1044
1045
/*
1046
* Given an inode, a start offset and a length, compress the bytes into @folios
* that are allocated on demand.
*
* @type and @level are the compression algorithm and its level; level 0 means
* whatever default the algorithm chooses and is opaque here, out-of-range
* levels are clamped to the algorithm's supported range.
1053
*
1054
* @out_folios is an in/out parameter, holds maximum number of folios to allocate
1055
* and returns number of actually allocated folios
1056
*
1057
* @total_in is used to return the number of bytes actually read. It
1058
* may be smaller than the input length if we had to exit early because we
1059
* ran out of room in the folios array or because we cross the
1060
* max_out threshold.
1061
*
1062
* @total_out is an in/out parameter, must be set to the input length and will
1063
* also be used to return the total number of compressed bytes
1064
*/
1065
int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inode,
1066
u64 start, struct folio **folios, unsigned long *out_folios,
1067
unsigned long *total_in, unsigned long *total_out)
1068
{
1069
struct btrfs_fs_info *fs_info = inode->root->fs_info;
1070
const unsigned long orig_len = *total_out;
1071
struct list_head *workspace;
1072
int ret;
1073
1074
level = btrfs_compress_set_level(type, level);
1075
workspace = get_workspace(fs_info, type, level);
1076
ret = compression_compress_pages(type, workspace, inode, start, folios,
1077
out_folios, total_in, total_out);
1078
/* The total read-in bytes should be no larger than the input. */
1079
ASSERT(*total_in <= orig_len);
1080
put_workspace(fs_info, type, workspace);
1081
return ret;
1082
}
1083
1084
static int btrfs_decompress_bio(struct compressed_bio *cb)
1085
{
1086
struct btrfs_fs_info *fs_info = cb_to_fs_info(cb);
1087
struct list_head *workspace;
1088
int ret;
1089
int type = cb->compress_type;
1090
1091
workspace = get_workspace(fs_info, type, 0);
1092
ret = compression_decompress_bio(workspace, cb);
1093
put_workspace(fs_info, type, workspace);
1094
1095
if (!ret)
1096
zero_fill_bio(&cb->orig_bbio->bio);
1097
return ret;
1098
}
1099
1100
/*
1101
* a less complex decompression routine. Our compressed data fits in a
1102
* single page, and we want to read a single page out of it.
1103
* @dest_pgoff tells us the offset inside the destination folio we're interested in
1104
*/
1105
int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio,
1106
unsigned long dest_pgoff, size_t srclen, size_t destlen)
1107
{
1108
struct btrfs_fs_info *fs_info = folio_to_fs_info(dest_folio);
1109
struct list_head *workspace;
1110
const u32 sectorsize = fs_info->sectorsize;
1111
int ret;
1112
1113
/*
1114
* The full destination folio range should not exceed the folio size.
1115
* And the @destlen should not exceed sectorsize, as this is only called for
1116
* inline file extents, which should not exceed sectorsize.
1117
*/
1118
ASSERT(dest_pgoff + destlen <= folio_size(dest_folio) && destlen <= sectorsize);
1119
1120
workspace = get_workspace(fs_info, type, 0);
1121
ret = compression_decompress(type, workspace, data_in, dest_folio,
1122
dest_pgoff, srclen, destlen);
1123
put_workspace(fs_info, type, workspace);
1124
1125
return ret;
1126
}
1127
1128
int btrfs_alloc_compress_wsm(struct btrfs_fs_info *fs_info)
1129
{
1130
int ret;
1131
1132
ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_NONE);
1133
if (ret < 0)
1134
goto error;
1135
ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_ZLIB);
1136
if (ret < 0)
1137
goto error;
1138
ret = alloc_workspace_manager(fs_info, BTRFS_COMPRESS_LZO);
1139
if (ret < 0)
1140
goto error;
1141
ret = zstd_alloc_workspace_manager(fs_info);
1142
if (ret < 0)
1143
goto error;
1144
return 0;
1145
error:
1146
btrfs_free_compress_wsm(fs_info);
1147
return ret;
1148
}
1149
1150
void btrfs_free_compress_wsm(struct btrfs_fs_info *fs_info)
1151
{
1152
free_workspace_manager(fs_info, BTRFS_COMPRESS_NONE);
1153
free_workspace_manager(fs_info, BTRFS_COMPRESS_ZLIB);
1154
free_workspace_manager(fs_info, BTRFS_COMPRESS_LZO);
1155
zstd_free_workspace_manager(fs_info);
1156
}
1157
1158
int __init btrfs_init_compress(void)
1159
{
1160
if (bioset_init(&btrfs_compressed_bioset, BIO_POOL_SIZE,
1161
offsetof(struct compressed_bio, bbio.bio),
1162
BIOSET_NEED_BVECS))
1163
return -ENOMEM;
1164
1165
compr_pool.shrinker = shrinker_alloc(SHRINKER_NONSLAB, "btrfs-compr-pages");
1166
if (!compr_pool.shrinker)
1167
return -ENOMEM;
1168
1169
spin_lock_init(&compr_pool.lock);
1170
INIT_LIST_HEAD(&compr_pool.list);
1171
compr_pool.count = 0;
1172
/* 128K / 4K = 32, for 8 threads is 256 pages. */
1173
compr_pool.thresh = BTRFS_MAX_COMPRESSED / PAGE_SIZE * 8;
1174
compr_pool.shrinker->count_objects = btrfs_compr_pool_count;
1175
compr_pool.shrinker->scan_objects = btrfs_compr_pool_scan;
1176
compr_pool.shrinker->batch = 32;
1177
compr_pool.shrinker->seeks = DEFAULT_SEEKS;
1178
shrinker_register(compr_pool.shrinker);
1179
1180
return 0;
1181
}
1182
1183
void __cold btrfs_exit_compress(void)
1184
{
1185
/* For now scan drains all pages and does not touch the parameters. */
1186
btrfs_compr_pool_scan(NULL, NULL);
1187
shrinker_free(compr_pool.shrinker);
1188
1189
bioset_exit(&btrfs_compressed_bioset);
1190
}
1191
1192
/*
1193
* The bvec is a single page bvec from a bio that contains folios from a filemap.
1194
*
1195
* Since the folio may be a large one, and if the bv_page is not a head page of
1196
* a large folio, then page->index is unreliable.
1197
*
1198
* Thus we need this helper to grab the proper file offset.
1199
*/
1200
static u64 file_offset_from_bvec(const struct bio_vec *bvec)
1201
{
1202
const struct page *page = bvec->bv_page;
1203
const struct folio *folio = page_folio(page);
1204
1205
return (page_pgoff(folio, page) << PAGE_SHIFT) + bvec->bv_offset;
1206
}
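
/*
 * Example with made-up numbers (4KiB pages): for a 64KiB folio cached at
 * file offset 128KiB, a bvec pointing at its third page with bv_offset 512
 * gives page_pgoff(folio, page) = 32 + 2 = 34 and thus a file offset of
 * (34 << PAGE_SHIFT) + 512 = 139776, whereas page->index of such a tail
 * page could not be trusted.
 */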
1207
1208
/*
1209
* Copy decompressed data from working buffer to pages.
1210
*
1211
* @buf: The decompressed data buffer
1212
* @buf_len: The decompressed data length
1213
* @decompressed: Number of bytes that are already decompressed inside the
1214
* compressed extent
1215
* @cb: The compressed extent descriptor
1216
* @orig_bio: The original bio that the caller wants to read for
1217
*
1218
* An easier to understand graph is like below:
1219
*
1220
* |<- orig_bio ->| |<- orig_bio->|
1221
* |<------- full decompressed extent ----->|
1222
* |<----------- @cb range ---->|
1223
* | |<-- @buf_len -->|
1224
* |<--- @decompressed --->|
1225
*
1226
* Note that, @cb can be a subpage of the full decompressed extent, but
1227
* @cb->start always matches the file offset of the full
1228
* decompressed extent.
1229
*
1230
* When reading compressed extent, we have to read the full compressed extent,
1231
* while @orig_bio may only want part of the range.
1232
* Thus this function will ensure only data covered by @orig_bio will be copied
1233
* to its pages.
1234
*
1235
* Return 0 if we have copied all needed contents for @orig_bio.
1236
* Return >0 if we need to continue decompressing.
1237
*/
1238
int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
1239
struct compressed_bio *cb, u32 decompressed)
1240
{
1241
struct bio *orig_bio = &cb->orig_bbio->bio;
1242
/* Offset inside the full decompressed extent */
1243
u32 cur_offset;
1244
1245
cur_offset = decompressed;
1246
/* The main loop to do the copy */
1247
while (cur_offset < decompressed + buf_len) {
1248
struct bio_vec bvec;
1249
size_t copy_len;
1250
u32 copy_start;
1251
/* Offset inside the full decompressed extent */
1252
u32 bvec_offset;
1253
void *kaddr;
1254
1255
bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter);
1256
/*
1257
* cb->start may underflow, but subtracting that value can still
1258
* give us correct offset inside the full decompressed extent.
1259
*/
1260
bvec_offset = file_offset_from_bvec(&bvec) - cb->start;
1261
1262
/* Haven't reached the bvec range, exit */
1263
if (decompressed + buf_len <= bvec_offset)
1264
return 1;
1265
1266
copy_start = max(cur_offset, bvec_offset);
1267
copy_len = min(bvec_offset + bvec.bv_len,
1268
decompressed + buf_len) - copy_start;
1269
ASSERT(copy_len);
1270
1271
/*
1272
* Extra range check to ensure we didn't go beyond
1273
* @buf + @buf_len.
1274
*/
1275
ASSERT(copy_start - decompressed < buf_len);
1276
1277
kaddr = bvec_kmap_local(&bvec);
1278
memcpy(kaddr, buf + copy_start - decompressed, copy_len);
1279
kunmap_local(kaddr);
1280
1281
cur_offset += copy_len;
1282
bio_advance(orig_bio, copy_len);
1283
/* Finished the bio */
1284
if (!orig_bio->bi_iter.bi_size)
1285
return 0;
1286
}
1287
return 1;
1288
}
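
/*
 * A minimal walk-through with made-up numbers: suppose cb->start is the
 * extent start, @orig_bio only wants the first 4KiB of the extent, and one
 * decompression step hands us @buf_len = 16KiB at @decompressed = 0.  The
 * first bvec then maps to bvec_offset 0, so copy_start = 0 and
 * copy_len = min(4KiB, 16KiB) = 4KiB; after that copy the bio is fully
 * advanced and we return 0 without touching the remaining 12KiB.
 */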
1289
1290
/*
1291
* Shannon Entropy calculation
1292
*
1293
* Pure byte distribution analysis fails to determine compressibility of data.
1294
* Try calculating entropy to estimate the average minimum number of bits
1295
* needed to encode the sampled data.
1296
*
1297
* For convenience, return the percentage of needed bits, instead of amount of
1298
* bits directly.
1299
*
1300
* @ENTROPY_LVL_ACEPTABLE - below that threshold, sample has low byte entropy
1301
* and can be compressible with high probability
1302
*
1303
* @ENTROPY_LVL_HIGH - data are not compressible with high probability
1304
*
1305
* Use of ilog2() decreases precision, we lower the LVL to 5 to compensate.
1306
*/
1307
#define ENTROPY_LVL_ACEPTABLE (65)
1308
#define ENTROPY_LVL_HIGH (80)
1309
1310
/*
1311
* For increased precision in shannon_entropy calculation,
1312
* let's do pow(n, M) to save more digits after the decimal point:
1313
*
1314
* - maximum int bit length is 64
1315
* - ilog2(MAX_SAMPLE_SIZE) -> 13
1316
* - 13 * 4 = 52 < 64 -> M = 4
1317
*
1318
* So use pow(n, 4).
1319
*/
1320
static inline u32 ilog2_w(u64 n)
1321
{
1322
return ilog2(n * n * n * n);
1323
}
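
/*
 * Example: ilog2_w(8192) = ilog2(8192^4) = ilog2(2^52) = 52, i.e. roughly
 * 4 * log2(n), which is where the factor 4 from the comment above shows up.
 */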
1324
1325
static u32 shannon_entropy(struct heuristic_ws *ws)
1326
{
1327
const u32 entropy_max = 8 * ilog2_w(2);
1328
u32 entropy_sum = 0;
1329
u32 p, p_base, sz_base;
1330
u32 i;
1331
1332
sz_base = ilog2_w(ws->sample_size);
1333
for (i = 0; i < BUCKET_SIZE && ws->bucket[i].count > 0; i++) {
1334
p = ws->bucket[i].count;
1335
p_base = ilog2_w(p);
1336
entropy_sum += p * (sz_base - p_base);
1337
}
1338
1339
entropy_sum /= ws->sample_size;
1340
return entropy_sum * 100 / entropy_max;
1341
}
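
/*
 * Rough derivation of the return value: with ilog2_w(n) ~= 4 * log2(n) and
 * p_i = count_i / sample_size, entropy_sum after the division above is
 * approximately 4 * sum(p_i * log2(1 / p_i)) = 4 * H bits per byte, and
 * entropy_max = 8 * ilog2_w(2) = 32, so the result is roughly H / 8 * 100,
 * the sample entropy as a percentage of the 8-bit maximum.
 */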
1342
1343
#define RADIX_BASE 4U
1344
#define COUNTERS_SIZE (1U << RADIX_BASE)
1345
1346
static u8 get4bits(u64 num, int shift)
{
1347
u8 low4bits;
1348
1349
num >>= shift;
1350
/* Reverse order */
1351
low4bits = (COUNTERS_SIZE - 1) - (num % COUNTERS_SIZE);
1352
return low4bits;
1353
}
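
/*
 * Example: get4bits(0x5A, 0) looks at the low nibble 0xA and returns
 * 15 - 10 = 5.  This inversion is what makes the radix sort below produce
 * descending order, which byte_core_set_size() relies on.
 */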
1354
1355
/*
1356
* Use 4 bits as radix base
1357
* Use 16 u32 counters for calculating new position in buf array
1358
*
1359
* @array - array that will be sorted
1360
* @array_buf - buffer array to store sorting results
1361
* must be equal in size to @array
1362
* @num - array size
1363
*/
1364
static void radix_sort(struct bucket_item *array, struct bucket_item *array_buf,
1365
int num)
1366
{
1367
u64 max_num;
1368
u64 buf_num;
1369
u32 counters[COUNTERS_SIZE];
1370
u32 new_addr;
1371
u32 addr;
1372
int bitlen;
1373
int shift;
1374
int i;
1375
1376
/*
1377
* Try to avoid useless loop iterations for small numbers stored in big
1378
* counters. Example: 48 33 4 ... in 64bit array
1379
*/
1380
max_num = array[0].count;
1381
for (i = 1; i < num; i++) {
1382
buf_num = array[i].count;
1383
if (buf_num > max_num)
1384
max_num = buf_num;
1385
}
1386
1387
buf_num = ilog2(max_num);
1388
bitlen = ALIGN(buf_num, RADIX_BASE * 2);
1389
1390
shift = 0;
1391
while (shift < bitlen) {
1392
memset(counters, 0, sizeof(counters));
1393
1394
for (i = 0; i < num; i++) {
1395
buf_num = array[i].count;
1396
addr = get4bits(buf_num, shift);
1397
counters[addr]++;
1398
}
1399
1400
for (i = 1; i < COUNTERS_SIZE; i++)
1401
counters[i] += counters[i - 1];
1402
1403
for (i = num - 1; i >= 0; i--) {
1404
buf_num = array[i].count;
1405
addr = get4bits(buf_num, shift);
1406
counters[addr]--;
1407
new_addr = counters[addr];
1408
array_buf[new_addr] = array[i];
1409
}
1410
1411
shift += RADIX_BASE;
1412
1413
/*
1414
* Normal radix expects to move data from a temporary array, to
1415
* the main one. But that requires some CPU time. Avoid that
1416
* by doing another sort iteration to original array instead of
1417
* memcpy()
1418
*/
1419
memset(counters, 0, sizeof(counters));
1420
1421
for (i = 0; i < num; i++) {
1422
buf_num = array_buf[i].count;
1423
addr = get4bits(buf_num, shift);
1424
counters[addr]++;
1425
}
1426
1427
for (i = 1; i < COUNTERS_SIZE; i++)
1428
counters[i] += counters[i - 1];
1429
1430
for (i = num - 1; i >= 0; i--) {
1431
buf_num = array_buf[i].count;
1432
addr = get4bits(buf_num, shift);
1433
counters[addr]--;
1434
new_addr = counters[addr];
1435
array[new_addr] = array_buf[i];
1436
}
1437
1438
shift += RADIX_BASE;
1439
}
1440
}
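
/*
 * For instance, if the largest bucket count is 1000, then ilog2(1000) = 9
 * and bitlen = ALIGN(9, 8) = 16, so the loop above runs twice, each pass
 * sorting by two 4-bit digits (once into @array_buf, once back into @array).
 */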
1441
1442
/*
1443
* Size of the core byte set - how many bytes cover 90% of the sample
1444
*
1445
* There are several types of structured binary data that use nearly all byte
1446
* values. The distribution can be uniform and counts in all buckets will be
1447
* nearly the same (eg. encrypted data). Unlikely to be compressible.
1448
*
1449
* Other possibility is normal (Gaussian) distribution, where the data could
1450
* be potentially compressible, but we have to take a few more steps to decide
1451
* how much.
1452
*
1453
* @BYTE_CORE_SET_LOW - main part of byte values repeated frequently,
1454
* compression algo can easily fix that
1455
* @BYTE_CORE_SET_HIGH - data have uniform distribution and with high
1456
* probability is not compressible
1457
*/
1458
#define BYTE_CORE_SET_LOW (64)
1459
#define BYTE_CORE_SET_HIGH (200)
1460
1461
static int byte_core_set_size(struct heuristic_ws *ws)
1462
{
1463
u32 i;
1464
u32 coreset_sum = 0;
1465
const u32 core_set_threshold = ws->sample_size * 90 / 100;
1466
struct bucket_item *bucket = ws->bucket;
1467
1468
/* Sort in reverse order */
1469
radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE);
1470
1471
for (i = 0; i < BYTE_CORE_SET_LOW; i++)
1472
coreset_sum += bucket[i].count;
1473
1474
if (coreset_sum > core_set_threshold)
1475
return i;
1476
1477
for (; i < BYTE_CORE_SET_HIGH && bucket[i].count > 0; i++) {
1478
coreset_sum += bucket[i].count;
1479
if (coreset_sum > core_set_threshold)
1480
break;
1481
}
1482
1483
return i;
1484
}
1485
1486
/*
1487
* Count byte values in buckets.
1488
* This heuristic can detect textual data (configs, xml, json, html, etc).
1489
* Because in most text-like data the byte set is restricted to a limited number of
1490
* possible characters, and that restriction in most cases makes data easy to
1491
* compress.
1492
*
1493
* @BYTE_SET_THRESHOLD - consider all data within this byte set size:
1494
* less - compressible
1495
* more - need additional analysis
1496
*/
1497
#define BYTE_SET_THRESHOLD (64)
1498
1499
static u32 byte_set_size(const struct heuristic_ws *ws)
1500
{
1501
u32 i;
1502
u32 byte_set_size = 0;
1503
1504
for (i = 0; i < BYTE_SET_THRESHOLD; i++) {
1505
if (ws->bucket[i].count > 0)
1506
byte_set_size++;
1507
}
1508
1509
/*
1510
* Continue collecting count of byte values in buckets. If the byte
1511
* set size is bigger than the threshold, it's pointless to continue,
1512
* the detection technique would fail for this type of data.
1513
*/
1514
for (; i < BUCKET_SIZE; i++) {
1515
if (ws->bucket[i].count > 0) {
1516
byte_set_size++;
1517
if (byte_set_size > BYTE_SET_THRESHOLD)
1518
return byte_set_size;
1519
}
1520
}
1521
1522
return byte_set_size;
1523
}
1524
1525
static bool sample_repeated_patterns(struct heuristic_ws *ws)
1526
{
1527
const u32 half_of_sample = ws->sample_size / 2;
1528
const u8 *data = ws->sample;
1529
1530
return memcmp(&data[0], &data[half_of_sample], half_of_sample) == 0;
1531
}
1532
1533
static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
1534
struct heuristic_ws *ws)
1535
{
1536
struct page *page;
1537
pgoff_t index, index_end;
1538
u32 i, curr_sample_pos;
1539
u8 *in_data;
1540
1541
/*
1542
* Compression handles the input data by chunks of 128KiB
1543
* (defined by BTRFS_MAX_UNCOMPRESSED)
1544
*
1545
* We do the same for the heuristic and loop over the whole range.
1546
*
1547
* MAX_SAMPLE_SIZE - calculated under assumption that heuristic will
1548
* process no more than BTRFS_MAX_UNCOMPRESSED at a time.
1549
*/
1550
if (end - start > BTRFS_MAX_UNCOMPRESSED)
1551
end = start + BTRFS_MAX_UNCOMPRESSED;
1552
1553
index = start >> PAGE_SHIFT;
1554
index_end = end >> PAGE_SHIFT;
1555
1556
/* Don't miss unaligned end */
1557
if (!PAGE_ALIGNED(end))
1558
index_end++;
1559
1560
curr_sample_pos = 0;
1561
while (index < index_end) {
1562
page = find_get_page(inode->i_mapping, index);
1563
in_data = kmap_local_page(page);
1564
/* Handle case where the start is not aligned to PAGE_SIZE */
1565
i = start % PAGE_SIZE;
1566
while (i < PAGE_SIZE - SAMPLING_READ_SIZE) {
1567
/* Don't sample any garbage from the last page */
1568
if (start > end - SAMPLING_READ_SIZE)
1569
break;
1570
memcpy(&ws->sample[curr_sample_pos], &in_data[i],
1571
SAMPLING_READ_SIZE);
1572
i += SAMPLING_INTERVAL;
1573
start += SAMPLING_INTERVAL;
1574
curr_sample_pos += SAMPLING_READ_SIZE;
1575
}
1576
kunmap_local(in_data);
1577
put_page(page);
1578
1579
index++;
1580
}
1581
1582
ws->sample_size = curr_sample_pos;
1583
}
1584
1585
/*
1586
* Compression heuristic.
1587
*
1588
* The following types of analysis can be performed:
1589
* - detect mostly zero data
1590
* - detect data with low "byte set" size (text, etc)
1591
* - detect data with low/high "core byte" set
1592
*
1593
* Return non-zero if the compression should be done, 0 otherwise.
1594
*/
1595
int btrfs_compress_heuristic(struct btrfs_inode *inode, u64 start, u64 end)
1596
{
1597
struct btrfs_fs_info *fs_info = inode->root->fs_info;
1598
struct list_head *ws_list = get_workspace(fs_info, 0, 0);
1599
struct heuristic_ws *ws;
1600
u32 i;
1601
u8 byte;
1602
int ret = 0;
1603
1604
ws = list_entry(ws_list, struct heuristic_ws, list);
1605
1606
heuristic_collect_sample(&inode->vfs_inode, start, end, ws);
1607
1608
if (sample_repeated_patterns(ws)) {
1609
ret = 1;
1610
goto out;
1611
}
1612
1613
memset(ws->bucket, 0, sizeof(*ws->bucket) * BUCKET_SIZE);
1614
1615
for (i = 0; i < ws->sample_size; i++) {
1616
byte = ws->sample[i];
1617
ws->bucket[byte].count++;
1618
}
1619
1620
i = byte_set_size(ws);
1621
if (i < BYTE_SET_THRESHOLD) {
1622
ret = 2;
1623
goto out;
1624
}
1625
1626
i = byte_core_set_size(ws);
1627
if (i <= BYTE_CORE_SET_LOW) {
1628
ret = 3;
1629
goto out;
1630
}
1631
1632
if (i >= BYTE_CORE_SET_HIGH) {
1633
ret = 0;
1634
goto out;
1635
}
1636
1637
i = shannon_entropy(ws);
1638
if (i <= ENTROPY_LVL_ACEPTABLE) {
1639
ret = 4;
1640
goto out;
1641
}
1642
1643
/*
1644
* For the levels below ENTROPY_LVL_HIGH, additional analysis would be
1645
* needed to give green light to compression.
1646
*
1647
* For now just assume that compression at that level is not worth the
1648
* resources because:
1649
*
1650
* 1. it is possible to defrag the data later
1651
*
1652
* 2. the data would turn out to be hardly compressible, eg. 150 byte
1653
* values, every bucket has counter at level ~54. The heuristic would
1654
* be confused. This can happen when data have some internal repeated
1655
* patterns like "abbacbbc...". This can be detected by analyzing
1656
* pairs of bytes, which is too costly.
1657
*/
1658
if (i < ENTROPY_LVL_HIGH) {
1659
ret = 5;
1660
goto out;
1661
} else {
1662
ret = 0;
1663
goto out;
1664
}
1665
1666
out:
1667
put_workspace(fs_info, 0, ws_list);
1668
return ret;
1669
}
1670
1671
/*
1672
* Convert the compression suffix (eg. after "zlib" starting with ":") to level.
1673
*
1674
* If the resulting level exceeds the algo's supported levels, it will be clamped.
1675
*
1676
* Return <0 if no valid string can be found.
1677
* Return 0 if everything is fine.
1678
*/
1679
int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret)
1680
{
1681
int level = 0;
1682
int ret;
1683
1684
if (!type) {
1685
*level_ret = btrfs_compress_set_level(type, level);
1686
return 0;
1687
}
1688
1689
if (str[0] == ':') {
1690
ret = kstrtoint(str + 1, 10, &level);
1691
if (ret)
1692
return ret;
1693
}
1694
1695
*level_ret = btrfs_compress_set_level(type, level);
1696
return 0;
1697
}
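
/*
 * For example, the mount option "compress=zstd:3" ends up passing the ":3"
 * suffix here and yields level 3, while a bare "compress=zlib" passes an
 * empty suffix and level 0, which btrfs_compress_set_level() then maps to
 * the zlib default.
 */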
1698
1699