// SPDX-License-Identifier: GPL-2.012#include <linux/fsverity.h>3#include <linux/iomap.h>4#include "ctree.h"5#include "delalloc-space.h"6#include "direct-io.h"7#include "extent-tree.h"8#include "file.h"9#include "fs.h"10#include "transaction.h"11#include "volumes.h"1213struct btrfs_dio_data {14ssize_t submitted;15struct extent_changeset *data_reserved;16struct btrfs_ordered_extent *ordered;17bool data_space_reserved;18bool nocow_done;19};2021struct btrfs_dio_private {22/* Range of I/O */23u64 file_offset;24u32 bytes;2526/* This must be last */27struct btrfs_bio bbio;28};2930static struct bio_set btrfs_dio_bioset;3132static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,33struct extent_state **cached_state,34unsigned int iomap_flags)35{36const bool writing = (iomap_flags & IOMAP_WRITE);37const bool nowait = (iomap_flags & IOMAP_NOWAIT);38struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;39struct btrfs_ordered_extent *ordered;40int ret = 0;4142/* Direct lock must be taken before the extent lock. */43if (nowait) {44if (!btrfs_try_lock_dio_extent(io_tree, lockstart, lockend, cached_state))45return -EAGAIN;46} else {47btrfs_lock_dio_extent(io_tree, lockstart, lockend, cached_state);48}4950while (1) {51if (nowait) {52if (!btrfs_try_lock_extent(io_tree, lockstart, lockend,53cached_state)) {54ret = -EAGAIN;55break;56}57} else {58btrfs_lock_extent(io_tree, lockstart, lockend, cached_state);59}60/*61* We're concerned with the entire range that we're going to be62* doing DIO to, so we need to make sure there's no ordered63* extents in this range.64*/65ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,66lockend - lockstart + 1);6768/*69* We need to make sure there are no buffered pages in this70* range either, we could have raced between the invalidate in71* generic_file_direct_write and locking the extent. The72* invalidate needs to happen so that reads after a write do not73* get stale data.74*/75if (!ordered &&76(!writing || !filemap_range_has_page(inode->i_mapping,77lockstart, lockend)))78break;7980btrfs_unlock_extent(io_tree, lockstart, lockend, cached_state);8182if (ordered) {83if (nowait) {84btrfs_put_ordered_extent(ordered);85ret = -EAGAIN;86break;87}88/*89* If we are doing a DIO read and the ordered extent we90* found is for a buffered write, we can not wait for it91* to complete and retry, because if we do so we can92* deadlock with concurrent buffered writes on page93* locks. This happens only if our DIO read covers more94* than one extent map, if at this point has already95* created an ordered extent for a previous extent map96* and locked its range in the inode's io tree, and a97* concurrent write against that previous extent map's98* range and this range started (we unlock the ranges99* in the io tree only when the bios complete and100* buffered writes always lock pages before attempting101* to lock range in the io tree).102*/103if (writing ||104test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))105btrfs_start_ordered_extent(ordered);106else107ret = nowait ? -EAGAIN : -ENOTBLK;108btrfs_put_ordered_extent(ordered);109} else {110/*111* We could trigger writeback for this range (and wait112* for it to complete) and then invalidate the pages for113* this range (through invalidate_inode_pages2_range()),114* but that can lead us to a deadlock with a concurrent115* call to readahead (a buffered read or a defrag call116* triggered a readahead) on a page lock due to an117* ordered dio extent we created before but did not have118* yet a corresponding bio submitted (whence it can not119* complete), which makes readahead wait for that120* ordered extent to complete while holding a lock on121* that page.122*/123ret = nowait ? -EAGAIN : -ENOTBLK;124}125126if (ret)127break;128129cond_resched();130}131132if (ret)133btrfs_unlock_dio_extent(io_tree, lockstart, lockend, cached_state);134return ret;135}136137static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,138struct btrfs_dio_data *dio_data,139const u64 start,140const struct btrfs_file_extent *file_extent,141const int type)142{143struct extent_map *em = NULL;144struct btrfs_ordered_extent *ordered;145146if (type != BTRFS_ORDERED_NOCOW) {147em = btrfs_create_io_em(inode, start, file_extent, type);148if (IS_ERR(em))149goto out;150}151152ordered = btrfs_alloc_ordered_extent(inode, start, file_extent,153(1U << type) |154(1U << BTRFS_ORDERED_DIRECT));155if (IS_ERR(ordered)) {156if (em) {157btrfs_free_extent_map(em);158btrfs_drop_extent_map_range(inode, start,159start + file_extent->num_bytes - 1, false);160}161em = ERR_CAST(ordered);162} else {163ASSERT(!dio_data->ordered);164dio_data->ordered = ordered;165}166out:167168return em;169}170171static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,172struct btrfs_dio_data *dio_data,173u64 start, u64 len)174{175struct btrfs_root *root = inode->root;176struct btrfs_fs_info *fs_info = root->fs_info;177struct btrfs_file_extent file_extent;178struct extent_map *em;179struct btrfs_key ins;180u64 alloc_hint;181int ret;182183alloc_hint = btrfs_get_extent_allocation_hint(inode, start, len);184again:185ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,1860, alloc_hint, &ins, 1, 1);187if (ret == -EAGAIN) {188ASSERT(btrfs_is_zoned(fs_info));189wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,190TASK_UNINTERRUPTIBLE);191goto again;192}193if (ret)194return ERR_PTR(ret);195196file_extent.disk_bytenr = ins.objectid;197file_extent.disk_num_bytes = ins.offset;198file_extent.num_bytes = ins.offset;199file_extent.ram_bytes = ins.offset;200file_extent.offset = 0;201file_extent.compression = BTRFS_COMPRESS_NONE;202em = btrfs_create_dio_extent(inode, dio_data, start, &file_extent,203BTRFS_ORDERED_REGULAR);204btrfs_dec_block_group_reservations(fs_info, ins.objectid);205if (IS_ERR(em))206btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);207208return em;209}210211static int btrfs_get_blocks_direct_write(struct extent_map **map,212struct inode *inode,213struct btrfs_dio_data *dio_data,214u64 start, u64 *lenp,215unsigned int iomap_flags)216{217const bool nowait = (iomap_flags & IOMAP_NOWAIT);218struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);219struct btrfs_file_extent file_extent;220struct extent_map *em = *map;221int type;222u64 block_start;223struct btrfs_block_group *bg;224bool can_nocow = false;225bool space_reserved = false;226u64 len = *lenp;227u64 prev_len;228int ret = 0;229230/*231* We don't allocate a new extent in the following cases232*233* 1) The inode is marked as NODATACOW. In this case we'll just use the234* existing extent.235* 2) The extent is marked as PREALLOC. We're good to go here and can236* just use the extent.237*238*/239if ((em->flags & EXTENT_FLAG_PREALLOC) ||240((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&241em->disk_bytenr != EXTENT_MAP_HOLE)) {242if (em->flags & EXTENT_FLAG_PREALLOC)243type = BTRFS_ORDERED_PREALLOC;244else245type = BTRFS_ORDERED_NOCOW;246len = min(len, em->len - (start - em->start));247block_start = btrfs_extent_map_block_start(em) + (start - em->start);248249if (can_nocow_extent(BTRFS_I(inode), start, &len, &file_extent,250false) == 1) {251bg = btrfs_inc_nocow_writers(fs_info, block_start);252if (bg)253can_nocow = true;254}255}256257prev_len = len;258if (can_nocow) {259struct extent_map *em2;260261/* We can NOCOW, so only need to reserve metadata space. */262ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,263nowait);264if (ret < 0) {265/* Our caller expects us to free the input extent map. */266btrfs_free_extent_map(em);267*map = NULL;268btrfs_dec_nocow_writers(bg);269if (nowait && (ret == -ENOSPC || ret == -EDQUOT))270ret = -EAGAIN;271goto out;272}273space_reserved = true;274275em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start,276&file_extent, type);277btrfs_dec_nocow_writers(bg);278if (type == BTRFS_ORDERED_PREALLOC) {279btrfs_free_extent_map(em);280*map = em2;281em = em2;282}283284if (IS_ERR(em2)) {285ret = PTR_ERR(em2);286goto out;287}288289dio_data->nocow_done = true;290} else {291/* Our caller expects us to free the input extent map. */292btrfs_free_extent_map(em);293*map = NULL;294295if (nowait) {296ret = -EAGAIN;297goto out;298}299300/*301* If we could not allocate data space before locking the file302* range and we can't do a NOCOW write, then we have to fail.303*/304if (!dio_data->data_space_reserved) {305ret = -ENOSPC;306goto out;307}308309/*310* We have to COW and we have already reserved data space before,311* so now we reserve only metadata.312*/313ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,314false);315if (ret < 0)316goto out;317space_reserved = true;318319em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);320if (IS_ERR(em)) {321ret = PTR_ERR(em);322goto out;323}324*map = em;325len = min(len, em->len - (start - em->start));326if (len < prev_len)327btrfs_delalloc_release_metadata(BTRFS_I(inode),328prev_len - len, true);329}330331/*332* We have created our ordered extent, so we can now release our reservation333* for an outstanding extent.334*/335btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);336337/*338* Need to update the i_size under the extent lock so buffered339* readers will get the updated i_size when we unlock.340*/341if (start + len > i_size_read(inode))342i_size_write(inode, start + len);343out:344if (ret && space_reserved) {345btrfs_delalloc_release_extents(BTRFS_I(inode), len);346btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);347}348*lenp = len;349return ret;350}351352static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,353loff_t length, unsigned int flags, struct iomap *iomap,354struct iomap *srcmap)355{356struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);357struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);358struct extent_map *em;359struct extent_state *cached_state = NULL;360struct btrfs_dio_data *dio_data = iter->private;361u64 lockstart, lockend;362const bool write = !!(flags & IOMAP_WRITE);363int ret = 0;364u64 len = length;365const u64 data_alloc_len = length;366u32 unlock_bits = EXTENT_LOCKED;367368/*369* We could potentially fault if we have a buffer > PAGE_SIZE, and if370* we're NOWAIT we may submit a bio for a partial range and return371* EIOCBQUEUED, which would result in an errant short read.372*373* The best way to handle this would be to allow for partial completions374* of iocb's, so we could submit the partial bio, return and fault in375* the rest of the pages, and then submit the io for the rest of the376* range. However we don't have that currently, so simply return377* -EAGAIN at this point so that the normal path is used.378*/379if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)380return -EAGAIN;381382/*383* Cap the size of reads to that usually seen in buffered I/O as we need384* to allocate a contiguous array for the checksums.385*/386if (!write)387len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);388389lockstart = start;390lockend = start + len - 1;391392/*393* iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't394* enough if we've written compressed pages to this area, so we need to395* flush the dirty pages again to make absolutely sure that any396* outstanding dirty pages are on disk - the first flush only starts397* compression on the data, while keeping the pages locked, so by the398* time the second flush returns we know bios for the compressed pages399* were submitted and finished, and the pages no longer under writeback.400*401* If we have a NOWAIT request and we have any pages in the range that402* are locked, likely due to compression still in progress, we don't want403* to block on page locks. We also don't want to block on pages marked as404* dirty or under writeback (same as for the non-compression case).405* iomap_dio_rw() did the same check, but after that and before we got406* here, mmap'ed writes may have happened or buffered reads started407* (readpage() and readahead(), which lock pages), as we haven't locked408* the file range yet.409*/410if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,411&BTRFS_I(inode)->runtime_flags)) {412if (flags & IOMAP_NOWAIT) {413if (filemap_range_needs_writeback(inode->i_mapping,414lockstart, lockend))415return -EAGAIN;416} else {417ret = filemap_fdatawrite_range(inode->i_mapping, start,418start + length - 1);419if (ret)420return ret;421}422}423424memset(dio_data, 0, sizeof(*dio_data));425426/*427* We always try to allocate data space and must do it before locking428* the file range, to avoid deadlocks with concurrent writes to the same429* range if the range has several extents and the writes don't expand the430* current i_size (the inode lock is taken in shared mode). If we fail to431* allocate data space here we continue and later, after locking the432* file range, we fail with ENOSPC only if we figure out we can not do a433* NOCOW write.434*/435if (write && !(flags & IOMAP_NOWAIT)) {436ret = btrfs_check_data_free_space(BTRFS_I(inode),437&dio_data->data_reserved,438start, data_alloc_len, false);439if (!ret)440dio_data->data_space_reserved = true;441else if (!(BTRFS_I(inode)->flags &442(BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))443goto err;444}445446/*447* If this errors out it's because we couldn't invalidate pagecache for448* this range and we need to fallback to buffered IO, or we are doing a449* NOWAIT read/write and we need to block.450*/451ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);452if (ret < 0)453goto err;454455em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len);456if (IS_ERR(em)) {457ret = PTR_ERR(em);458goto unlock_err;459}460461/*462* Ok for INLINE and COMPRESSED extents we need to fallback on buffered463* io. INLINE is special, and we could probably kludge it in here, but464* it's still buffered so for safety lets just fall back to the generic465* buffered path.466*467* For COMPRESSED we _have_ to read the entire extent in so we can468* decompress it, so there will be buffering required no matter what we469* do, so go ahead and fallback to buffered.470*471* We return -ENOTBLK because that's what makes DIO go ahead and go back472* to buffered IO. Don't blame me, this is the price we pay for using473* the generic code.474*/475if (btrfs_extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) {476btrfs_free_extent_map(em);477/*478* If we are in a NOWAIT context, return -EAGAIN in order to479* fallback to buffered IO. This is not only because we can480* block with buffered IO (no support for NOWAIT semantics at481* the moment) but also to avoid returning short reads to user482* space - this happens if we were able to read some data from483* previous non-compressed extents and then when we fallback to484* buffered IO, at btrfs_file_read_iter() by calling485* filemap_read(), we fail to fault in pages for the read buffer,486* in which case filemap_read() returns a short read (the number487* of bytes previously read is > 0, so it does not return -EFAULT).488*/489ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;490goto unlock_err;491}492493len = min(len, em->len - (start - em->start));494495/*496* If we have a NOWAIT request and the range contains multiple extents497* (or a mix of extents and holes), then we return -EAGAIN to make the498* caller fallback to a context where it can do a blocking (without499* NOWAIT) request. This way we avoid doing partial IO and returning500* success to the caller, which is not optimal for writes and for reads501* it can result in unexpected behaviour for an application.502*503* When doing a read, because we use IOMAP_DIO_PARTIAL when calling504* iomap_dio_rw(), we can end up returning less data then what the caller505* asked for, resulting in an unexpected, and incorrect, short read.506* That is, the caller asked to read N bytes and we return less than that,507* which is wrong unless we are crossing EOF. This happens if we get a508* page fault error when trying to fault in pages for the buffer that is509* associated to the struct iov_iter passed to iomap_dio_rw(), and we510* have previously submitted bios for other extents in the range, in511* which case iomap_dio_rw() may return us EIOCBQUEUED if not all of512* those bios have completed by the time we get the page fault error,513* which we return back to our caller - we should only return EIOCBQUEUED514* after we have submitted bios for all the extents in the range.515*/516if ((flags & IOMAP_NOWAIT) && len < length) {517btrfs_free_extent_map(em);518ret = -EAGAIN;519goto unlock_err;520}521522if (write) {523ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,524start, &len, flags);525if (ret < 0)526goto unlock_err;527/* Recalc len in case the new em is smaller than requested */528len = min(len, em->len - (start - em->start));529if (dio_data->data_space_reserved) {530u64 release_offset;531u64 release_len = 0;532533if (dio_data->nocow_done) {534release_offset = start;535release_len = data_alloc_len;536} else if (len < data_alloc_len) {537release_offset = start + len;538release_len = data_alloc_len - len;539}540541if (release_len > 0)542btrfs_free_reserved_data_space(BTRFS_I(inode),543dio_data->data_reserved,544release_offset,545release_len);546}547}548549/*550* Translate extent map information to iomap.551* We trim the extents (and move the addr) even though iomap code does552* that, since we have locked only the parts we are performing I/O in.553*/554if ((em->disk_bytenr == EXTENT_MAP_HOLE) ||555((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {556iomap->addr = IOMAP_NULL_ADDR;557iomap->type = IOMAP_HOLE;558} else {559iomap->addr = btrfs_extent_map_block_start(em) + (start - em->start);560iomap->type = IOMAP_MAPPED;561}562iomap->offset = start;563iomap->bdev = fs_info->fs_devices->latest_dev->bdev;564iomap->length = len;565btrfs_free_extent_map(em);566567/*568* Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed,569* writes only hold it for this part. We hold the extent lock until570* we're completely done with the extent map to make sure it remains571* valid.572*/573if (write)574unlock_bits |= EXTENT_DIO_LOCKED;575576btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,577unlock_bits, &cached_state);578579/* We didn't use everything, unlock the dio extent for the remainder. */580if (!write && (start + len) < lockend)581btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len,582lockend, NULL);583584return 0;585586unlock_err:587/*588* Don't use EXTENT_LOCK_BITS here in case we extend it later and forget589* to update this, be explicit that we expect EXTENT_LOCKED and590* EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing.591*/592btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,593EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state);594err:595if (dio_data->data_space_reserved) {596btrfs_free_reserved_data_space(BTRFS_I(inode),597dio_data->data_reserved,598start, data_alloc_len);599extent_changeset_free(dio_data->data_reserved);600}601602return ret;603}604605static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,606ssize_t written, unsigned int flags, struct iomap *iomap)607{608struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);609struct btrfs_dio_data *dio_data = iter->private;610size_t submitted = dio_data->submitted;611const bool write = !!(flags & IOMAP_WRITE);612int ret = 0;613614if (!write && (iomap->type == IOMAP_HOLE)) {615/* If reading from a hole, unlock and return */616btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,617pos + length - 1, NULL);618return 0;619}620621if (submitted < length) {622pos += submitted;623length -= submitted;624if (write)625btrfs_finish_ordered_extent(dio_data->ordered, NULL,626pos, length, false);627else628btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos,629pos + length - 1, NULL);630ret = -ENOTBLK;631}632if (write) {633btrfs_put_ordered_extent(dio_data->ordered);634dio_data->ordered = NULL;635}636637if (write)638extent_changeset_free(dio_data->data_reserved);639return ret;640}641642static void btrfs_dio_end_io(struct btrfs_bio *bbio)643{644struct btrfs_dio_private *dip =645container_of(bbio, struct btrfs_dio_private, bbio);646struct btrfs_inode *inode = bbio->inode;647struct bio *bio = &bbio->bio;648649if (bio->bi_status) {650btrfs_warn(inode->root->fs_info,651"direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",652btrfs_ino(inode), bio->bi_opf,653dip->file_offset, dip->bytes, bio->bi_status);654}655656if (btrfs_op(bio) == BTRFS_MAP_WRITE) {657btrfs_finish_ordered_extent(bbio->ordered, NULL,658dip->file_offset, dip->bytes,659!bio->bi_status);660} else {661btrfs_unlock_dio_extent(&inode->io_tree, dip->file_offset,662dip->file_offset + dip->bytes - 1, NULL);663}664665bbio->bio.bi_private = bbio->private;666iomap_dio_bio_end_io(bio);667}668669static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,670struct btrfs_ordered_extent *ordered)671{672u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;673u64 len = bbio->bio.bi_iter.bi_size;674struct btrfs_ordered_extent *new;675int ret;676677/* Must always be called for the beginning of an ordered extent. */678if (WARN_ON_ONCE(start != ordered->disk_bytenr))679return -EINVAL;680681/* No need to split if the ordered extent covers the entire bio. */682if (ordered->disk_num_bytes == len) {683refcount_inc(&ordered->refs);684bbio->ordered = ordered;685return 0;686}687688/*689* Don't split the extent_map for NOCOW extents, as we're writing into690* a pre-existing one.691*/692if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {693ret = btrfs_split_extent_map(bbio->inode, bbio->file_offset,694ordered->num_bytes, len,695ordered->disk_bytenr);696if (ret)697return ret;698}699700new = btrfs_split_ordered_extent(ordered, len);701if (IS_ERR(new))702return PTR_ERR(new);703bbio->ordered = new;704return 0;705}706707static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,708loff_t file_offset)709{710struct btrfs_bio *bbio = btrfs_bio(bio);711struct btrfs_dio_private *dip =712container_of(bbio, struct btrfs_dio_private, bbio);713struct btrfs_dio_data *dio_data = iter->private;714715btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,716btrfs_dio_end_io, bio->bi_private);717bbio->inode = BTRFS_I(iter->inode);718bbio->file_offset = file_offset;719720dip->file_offset = file_offset;721dip->bytes = bio->bi_iter.bi_size;722723dio_data->submitted += bio->bi_iter.bi_size;724725/*726* Check if we are doing a partial write. If we are, we need to split727* the ordered extent to match the submitted bio. Hang on to the728* remaining unfinishable ordered_extent in dio_data so that it can be729* cancelled in iomap_end to avoid a deadlock wherein faulting the730* remaining pages is blocked on the outstanding ordered extent.731*/732if (iter->flags & IOMAP_WRITE) {733int ret;734735ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);736if (ret) {737btrfs_finish_ordered_extent(dio_data->ordered, NULL,738file_offset, dip->bytes,739!ret);740bio->bi_status = errno_to_blk_status(ret);741iomap_dio_bio_end_io(bio);742return;743}744}745746btrfs_submit_bbio(bbio, 0);747}748749static const struct iomap_ops btrfs_dio_iomap_ops = {750.iomap_begin = btrfs_dio_iomap_begin,751.iomap_end = btrfs_dio_iomap_end,752};753754static const struct iomap_dio_ops btrfs_dio_ops = {755.submit_io = btrfs_dio_submit_io,756.bio_set = &btrfs_dio_bioset,757};758759static ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,760size_t done_before)761{762struct btrfs_dio_data data = { 0 };763764return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,765IOMAP_DIO_PARTIAL, &data, done_before);766}767768static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,769size_t done_before)770{771struct btrfs_dio_data data = { 0 };772773return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,774IOMAP_DIO_PARTIAL, &data, done_before);775}776777static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,778const struct iov_iter *iter, loff_t offset)779{780const u32 blocksize_mask = fs_info->sectorsize - 1;781782if (offset & blocksize_mask)783return -EINVAL;784785if (iov_iter_alignment(iter) & blocksize_mask)786return -EINVAL;787788/*789* For bs > ps support, we heavily rely on large folios to make sure no790* block will cross large folio boundaries.791*792* But memory provided by direct IO is only virtually contiguous, not793* physically contiguous, and will break the btrfs' large folio requirement.794*795* So for bs > ps support, all direct IOs should fallback to buffered ones.796*/797if (fs_info->sectorsize > PAGE_SIZE)798return -EINVAL;799800return 0;801}802803ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)804{805struct file *file = iocb->ki_filp;806struct inode *inode = file_inode(file);807struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);808loff_t pos;809ssize_t written = 0;810ssize_t written_buffered;811size_t prev_left = 0;812loff_t endbyte;813ssize_t ret;814unsigned int ilock_flags = 0;815struct iomap_dio *dio;816817if (iocb->ki_flags & IOCB_NOWAIT)818ilock_flags |= BTRFS_ILOCK_TRY;819820/*821* If the write DIO is within EOF, use a shared lock and also only if822* security bits will likely not be dropped by file_remove_privs() called823* from btrfs_write_check(). Either will need to be rechecked after the824* lock was acquired.825*/826if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))827ilock_flags |= BTRFS_ILOCK_SHARED;828829relock:830ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);831if (ret < 0)832return ret;833834/* Shared lock cannot be used with security bits set. */835if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {836btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);837ilock_flags &= ~BTRFS_ILOCK_SHARED;838goto relock;839}840841ret = generic_write_checks(iocb, from);842if (ret <= 0) {843btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);844return ret;845}846847ret = btrfs_write_check(iocb, ret);848if (ret < 0) {849btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);850goto out;851}852853pos = iocb->ki_pos;854/*855* Re-check since file size may have changed just before taking the856* lock or pos may have changed because of O_APPEND in generic_write_check()857*/858if ((ilock_flags & BTRFS_ILOCK_SHARED) &&859pos + iov_iter_count(from) > i_size_read(inode)) {860btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);861ilock_flags &= ~BTRFS_ILOCK_SHARED;862goto relock;863}864865if (check_direct_IO(fs_info, from, pos)) {866btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);867goto buffered;868}869/*870* We can't control the folios being passed in, applications can write871* to them while a direct IO write is in progress. This means the872* content might change after we calculated the data checksum.873* Therefore we can end up storing a checksum that doesn't match the874* persisted data.875*876* To be extra safe and avoid false data checksum mismatch, if the877* inode requires data checksum, just fallback to buffered IO.878* For buffered IO we have full control of page cache and can ensure879* no one is modifying the content during writeback.880*/881if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {882btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);883goto buffered;884}885886/*887* The iov_iter can be mapped to the same file range we are writing to.888* If that's the case, then we will deadlock in the iomap code, because889* it first calls our callback btrfs_dio_iomap_begin(), which will create890* an ordered extent, and after that it will fault in the pages that the891* iov_iter refers to. During the fault in we end up in the readahead892* pages code (starting at btrfs_readahead()), which will lock the range,893* find that ordered extent and then wait for it to complete (at894* btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since895* obviously the ordered extent can never complete as we didn't submit896* yet the respective bio(s). This always happens when the buffer is897* memory mapped to the same file range, since the iomap DIO code always898* invalidates pages in the target file range (after starting and waiting899* for any writeback).900*901* So here we disable page faults in the iov_iter and then retry if we902* got -EFAULT, faulting in the pages before the retry.903*/904again:905from->nofault = true;906dio = btrfs_dio_write(iocb, from, written);907from->nofault = false;908909if (IS_ERR_OR_NULL(dio)) {910ret = PTR_ERR_OR_ZERO(dio);911} else {912/*913* If we have a synchronous write, we must make sure the fsync914* triggered by the iomap_dio_complete() call below doesn't915* deadlock on the inode lock - we are already holding it and we916* can't call it after unlocking because we may need to complete917* partial writes due to the input buffer (or parts of it) not918* being already faulted in.919*/920ASSERT(current->journal_info == NULL);921current->journal_info = BTRFS_TRANS_DIO_WRITE_STUB;922ret = iomap_dio_complete(dio);923current->journal_info = NULL;924}925926/* No increment (+=) because iomap returns a cumulative value. */927if (ret > 0)928written = ret;929930if (iov_iter_count(from) > 0 && (ret == -EFAULT || ret > 0)) {931const size_t left = iov_iter_count(from);932/*933* We have more data left to write. Try to fault in as many as934* possible of the remainder pages and retry. We do this without935* releasing and locking again the inode, to prevent races with936* truncate.937*938* Also, in case the iov refers to pages in the file range of the939* file we want to write to (due to a mmap), we could enter an940* infinite loop if we retry after faulting the pages in, since941* iomap will invalidate any pages in the range early on, before942* it tries to fault in the pages of the iov. So we keep track of943* how much was left of iov in the previous EFAULT and fallback944* to buffered IO in case we haven't made any progress.945*/946if (left == prev_left) {947ret = -ENOTBLK;948} else {949fault_in_iov_iter_readable(from, left);950prev_left = left;951goto again;952}953}954955btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);956957/*958* If 'ret' is -ENOTBLK or we have not written all data, then it means959* we must fallback to buffered IO.960*/961if ((ret < 0 && ret != -ENOTBLK) || !iov_iter_count(from))962goto out;963964buffered:965/*966* If we are in a NOWAIT context, then return -EAGAIN to signal the caller967* it must retry the operation in a context where blocking is acceptable,968* because even if we end up not blocking during the buffered IO attempt969* below, we will block when flushing and waiting for the IO.970*/971if (iocb->ki_flags & IOCB_NOWAIT) {972ret = -EAGAIN;973goto out;974}975976pos = iocb->ki_pos;977written_buffered = btrfs_buffered_write(iocb, from);978if (written_buffered < 0) {979ret = written_buffered;980goto out;981}982/*983* Ensure all data is persisted. We want the next direct IO read to be984* able to read what was just written.985*/986endbyte = pos + written_buffered - 1;987ret = btrfs_fdatawrite_range(BTRFS_I(inode), pos, endbyte);988if (ret)989goto out;990ret = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);991if (ret)992goto out;993written += written_buffered;994iocb->ki_pos = pos + written_buffered;995invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,996endbyte >> PAGE_SHIFT);997out:998return ret < 0 ? ret : written;999}10001001static int check_direct_read(struct btrfs_fs_info *fs_info,1002const struct iov_iter *iter, loff_t offset)1003{1004int ret;1005int i, seg;10061007ret = check_direct_IO(fs_info, iter, offset);1008if (ret < 0)1009return ret;10101011if (!iter_is_iovec(iter))1012return 0;10131014for (seg = 0; seg < iter->nr_segs; seg++) {1015for (i = seg + 1; i < iter->nr_segs; i++) {1016const struct iovec *iov1 = iter_iov(iter) + seg;1017const struct iovec *iov2 = iter_iov(iter) + i;10181019if (iov1->iov_base == iov2->iov_base)1020return -EINVAL;1021}1022}1023return 0;1024}10251026ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)1027{1028struct inode *inode = file_inode(iocb->ki_filp);1029size_t prev_left = 0;1030ssize_t read = 0;1031ssize_t ret;10321033if (fsverity_active(inode))1034return 0;10351036if (check_direct_read(inode_to_fs_info(inode), to, iocb->ki_pos))1037return 0;10381039btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);1040again:1041/*1042* This is similar to what we do for direct IO writes, see the comment1043* at btrfs_direct_write(), but we also disable page faults in addition1044* to disabling them only at the iov_iter level. This is because when1045* reading from a hole or prealloc extent, iomap calls iov_iter_zero(),1046* which can still trigger page fault ins despite having set ->nofault1047* to true of our 'to' iov_iter.1048*1049* The difference to direct IO writes is that we deadlock when trying1050* to lock the extent range in the inode's tree during he page reads1051* triggered by the fault in (while for writes it is due to waiting for1052* our own ordered extent). This is because for direct IO reads,1053* btrfs_dio_iomap_begin() returns with the extent range locked, which1054* is only unlocked in the endio callback (end_bio_extent_readpage()).1055*/1056pagefault_disable();1057to->nofault = true;1058ret = btrfs_dio_read(iocb, to, read);1059to->nofault = false;1060pagefault_enable();10611062/* No increment (+=) because iomap returns a cumulative value. */1063if (ret > 0)1064read = ret;10651066if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {1067const size_t left = iov_iter_count(to);10681069if (left == prev_left) {1070/*1071* We didn't make any progress since the last attempt,1072* fallback to a buffered read for the remainder of the1073* range. This is just to avoid any possibility of looping1074* for too long.1075*/1076ret = read;1077} else {1078/*1079* We made some progress since the last retry or this is1080* the first time we are retrying. Fault in as many pages1081* as possible and retry.1082*/1083fault_in_iov_iter_writeable(to, left);1084prev_left = left;1085goto again;1086}1087}1088btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);1089return ret < 0 ? ret : read;1090}10911092int __init btrfs_init_dio(void)1093{1094if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,1095offsetof(struct btrfs_dio_private, bbio.bio),1096BIOSET_NEED_BVECS))1097return -ENOMEM;10981099return 0;1100}11011102void __cold btrfs_destroy_dio(void)1103{1104bioset_exit(&btrfs_dio_bioset);1105}110611071108