Path: blob/master/drivers/accel/habanalabs/common/hldio.c
// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright 2024 HabanaLabs, Ltd.
 * All Rights Reserved.
 */

#include "habanalabs.h"
#include "hldio.h"
#include <generated/uapi/linux/version.h>
#include <linux/pci-p2pdma.h>
#include <linux/blkdev.h>
#include <linux/vmalloc.h>

/*
 * NVMe Direct I/O implementation for the habanalabs driver
 *
 * ASSUMPTIONS
 * ===========
 * 1. No IOMMU (well, technically it can work with an IOMMU, but it is
 *    *almost* useless).
 * 2. Only READ operations (can be extended in the future).
 * 3. No sparse files (can be overcome in the future).
 * 4. Kernel version >= 6.9.
 * 5. Requiring page alignment is OK (I don't see a solution to this one
 *    right now; how do we read partial pages?).
 * 6. Kernel compiled with CONFIG_PCI_P2PDMA. This requires a CUSTOM kernel.
 *    Theoretically I have a slight idea of how this could be solved, but it
 *    is probably unacceptable for upstream. It also may not work in the end.
 * 7. Either make sure our cards and disks are under the same PCI bridge, or
 *    compile a custom kernel to hack around this.
 */

#define IO_STABILIZE_TIMEOUT	10000000 /* 10 seconds in microseconds */

/*
 * This struct contains all the useful data I could milk out of the file
 * handle provided by the user.
 * @TODO: right now it is retrieved on each IO, but it could be fetched once
 * with a dedicated IOCTL, call it for example HL_REGISTER_HANDLE.
 */
struct hl_dio_fd {
	/* Back pointer in case we need it in async completion */
	struct hl_ctx *ctx;
	/* Associated fd struct */
	struct file *filp;
};

/*
 * This is a single IO descriptor
 */
struct hl_direct_io {
	struct hl_dio_fd f;
	struct kiocb kio;
	struct bio_vec *bv;
	struct iov_iter iter;
	u64 device_va;
	u64 off_bytes;
	u64 len_bytes;
	u32 type;
};

bool hl_device_supports_nvme(struct hl_device *hdev)
{
	return hdev->asic_prop.supports_nvme;
}

static int hl_dio_fd_register(struct hl_ctx *ctx, int fd, struct hl_dio_fd *f)
{
	struct hl_device *hdev = ctx->hdev;
	struct block_device *bd;
	struct super_block *sb;
	struct inode *inode;
	struct gendisk *gd;
	struct device *disk_dev;
	int rc;

	f->filp = fget(fd);
	if (!f->filp) {
		rc = -EBADF;
		goto out;
	}

	if (!(f->filp->f_flags & O_DIRECT)) {
		dev_err(hdev->dev, "file is not opened in direct mode\n");
		rc = -EINVAL;
		goto fput;
	}

	if (!f->filp->f_op->read_iter) {
		dev_err(hdev->dev, "read_iter is not supported, need to fall back to legacy\n");
		rc = -EINVAL;
		goto fput;
	}

	inode = file_inode(f->filp);
	sb = inode->i_sb;
	bd = sb->s_bdev;

	/* Validate the block device before dereferencing it */
	if (!bd || !bd->bd_disk) {
		dev_err(hdev->dev, "invalid block device\n");
		rc = -ENODEV;
		goto fput;
	}
	gd = bd->bd_disk;

	/* i_blocks is counted in 512-byte sectors, regardless of blocksize */
	if (inode->i_blocks << 9 < i_size_read(inode)) {
		dev_err(hdev->dev, "sparse files are not currently supported\n");
		rc = -EINVAL;
		goto fput;
	}

	/* Get the underlying device from the block device */
	disk_dev = disk_to_dev(gd);
	if (!dma_pci_p2pdma_supported(disk_dev)) {
		dev_err(hdev->dev, "device does not support PCI P2P DMA\n");
		rc = -EOPNOTSUPP;
		goto fput;
	}

	/*
	 * @TODO: Maybe we need additional checks here
	 */

	f->ctx = ctx;
	rc = 0;

	goto out;
fput:
	fput(f->filp);
out:
	return rc;
}

static void hl_dio_fd_unregister(struct hl_dio_fd *f)
{
	fput(f->filp);
}

static long hl_dio_count_io(struct hl_device *hdev)
{
	s64 sum = 0;
	int i;

	for_each_possible_cpu(i)
		sum += per_cpu(*hdev->hldio.inflight_ios, i);

	return sum;
}

static bool hl_dio_get_iopath(struct hl_ctx *ctx)
{
	struct hl_device *hdev = ctx->hdev;

	if (hdev->hldio.io_enabled) {
		this_cpu_inc(*hdev->hldio.inflight_ios);

		/*
		 * Re-check after the increment is published: hl_dio_stop()
		 * clears io_enabled first and then waits for the in-flight
		 * counter to drain, so an IO racing with the disable must
		 * back out here instead of slipping through.
		 */
		if (!hdev->hldio.io_enabled) {
			this_cpu_dec(*hdev->hldio.inflight_ios);
			return false;
		}

		hl_ctx_get(ctx);

		return true;
	}

	return false;
}
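/*
 * Illustrative usage sketch (not part of the driver): every IO path is
 * expected to bracket the actual IO with the get/put pair above, so that
 * hl_dio_stop() can disable new IOs and drain the in-flight counter safely.
 * Both hl_dio_example_io() and do_example_io() are hypothetical names:
 *
 *	static ssize_t hl_dio_example_io(struct hl_ctx *ctx)
 *	{
 *		ssize_t rc;
 *
 *		if (!hl_dio_get_iopath(ctx))
 *			return -ESHUTDOWN;	// IO is disabled, don't start
 *
 *		rc = do_example_io();		// hypothetical IO body
 *
 *		hl_dio_put_iopath(ctx);
 *		return rc;
 *	}
 */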
static void hl_dio_put_iopath(struct hl_ctx *ctx)
{
	struct hl_device *hdev = ctx->hdev;

	hl_ctx_put(ctx);
	this_cpu_dec(*hdev->hldio.inflight_ios);
}

static void hl_dio_set_io_enabled(struct hl_device *hdev, bool enabled)
{
	hdev->hldio.io_enabled = enabled;
}

static bool hl_dio_validate_io(struct hl_device *hdev, struct hl_direct_io *io)
{
	if (io->device_va & ~PAGE_MASK) {
		dev_dbg(hdev->dev, "device address must be page aligned\n");
		return false;
	}

	if (io->len_bytes & ~PAGE_MASK) {
		dev_dbg(hdev->dev, "IO length must be page aligned\n");
		return false;
	}

	if (io->off_bytes & ~PAGE_MASK) {
		dev_dbg(hdev->dev, "IO offset must be page aligned\n");
		return false;
	}

	return true;
}

static struct page *hl_dio_va2page(struct hl_device *hdev, struct hl_ctx *ctx, u64 device_va)
{
	struct hl_dio *hldio = &hdev->hldio;
	u64 device_pa;
	int rc, i;

	rc = hl_mmu_va_to_pa(ctx, device_va, &device_pa);
	if (rc) {
		dev_err(hdev->dev, "device virtual address translation error: %#llx (%d)",
			device_va, rc);
		return NULL;
	}

	for (i = 0 ; i < hldio->np2prs ; ++i) {
		if (device_pa >= hldio->p2prs[i].device_pa &&
		    device_pa < hldio->p2prs[i].device_pa + hldio->p2prs[i].size)
			return hldio->p2prs[i].p2ppages[(device_pa - hldio->p2prs[i].device_pa) >>
							PAGE_SHIFT];
	}

	return NULL;
}

static ssize_t hl_direct_io(struct hl_device *hdev, struct hl_direct_io *io)
{
	u64 npages, device_va;
	ssize_t rc;
	int i;

	if (!hl_dio_validate_io(hdev, io))
		return -EINVAL;

	if (!hl_dio_get_iopath(io->f.ctx)) {
		dev_info(hdev->dev, "can't schedule a new IO, IO is disabled\n");
		return -ESHUTDOWN;
	}

	init_sync_kiocb(&io->kio, io->f.filp);
	io->kio.ki_pos = io->off_bytes;

	npages = io->len_bytes >> PAGE_SHIFT;

	/*
	 * @TODO: this can be implemented smarter, vmalloc in the iopath is
	 * not ideal. Maybe some variation of genpool. The number of pages may
	 * differ greatly between IOs, so maybe even use pools of different
	 * sizes and choose the closest one. See the sketch following this
	 * function.
	 */
	io->bv = vzalloc(array_size(npages, sizeof(struct bio_vec)));
	if (!io->bv) {
		rc = -ENOMEM;
		goto put_iopath;
	}

	for (i = 0, device_va = io->device_va ; i < npages ; ++i, device_va += PAGE_SIZE) {
		io->bv[i].bv_page = hl_dio_va2page(hdev, io->f.ctx, device_va);
		if (!io->bv[i].bv_page) {
			dev_err(hdev->dev, "error getting page struct for device va %#llx",
				device_va);
			rc = -EFAULT;
			goto free_bv;
		}
		io->bv[i].bv_offset = 0;
		io->bv[i].bv_len = PAGE_SIZE;
	}

	iov_iter_bvec(&io->iter, io->type, io->bv, npages, io->len_bytes);
	if (io->f.filp->f_op && io->f.filp->f_op->read_iter)
		rc = io->f.filp->f_op->read_iter(&io->kio, &io->iter);
	else
		rc = -EINVAL;

free_bv:
	vfree(io->bv);
put_iopath:
	hl_dio_put_iopath(io->f.ctx);

	dev_dbg(hdev->dev, "IO ended with %zd\n", rc);

	return rc;
}
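/*
 * A minimal sketch of the @TODO above, assuming the kernel genalloc API
 * (<linux/genalloc.h>). Everything here (hl_bv_pool, HL_BV_POOL_PAGES and
 * the helpers) is hypothetical, not part of the driver: preallocate one
 * chunk of bio_vecs at init time, carve per-IO arrays out of it, and fall
 * back to vzalloc() only when the pool is exhausted or the IO is too big.
 * Unlike vzalloc(), pool memory is not zeroed; the caller must set every
 * bio_vec field, as hl_direct_io() already does.
 */
#include <linux/genalloc.h>

#define HL_BV_POOL_PAGES	4096	/* hypothetical pool capacity */

static struct gen_pool *hl_bv_pool;

static int __maybe_unused hl_bv_pool_init(void)
{
	size_t size = HL_BV_POOL_PAGES * sizeof(struct bio_vec);
	void *buf;

	/* Allocation granularity of one bio_vec keeps fragmentation low */
	hl_bv_pool = gen_pool_create(ilog2(sizeof(struct bio_vec)), -1);
	if (!hl_bv_pool)
		return -ENOMEM;

	buf = vzalloc(size);
	if (!buf) {
		gen_pool_destroy(hl_bv_pool);
		return -ENOMEM;
	}

	return gen_pool_add(hl_bv_pool, (unsigned long)buf, size, -1);
}

static struct bio_vec * __maybe_unused hl_bv_pool_alloc(u64 npages)
{
	struct bio_vec *bv = (struct bio_vec *)
		gen_pool_alloc(hl_bv_pool, npages * sizeof(struct bio_vec));

	/* Fall back to vzalloc() for oversized or contended requests */
	return bv ? bv : vzalloc(array_size(npages, sizeof(struct bio_vec)));
}

static void __maybe_unused hl_bv_pool_free(struct bio_vec *bv, u64 npages)
{
	size_t size = npages * sizeof(struct bio_vec);

	if (gen_pool_has_addr(hl_bv_pool, (unsigned long)bv, size))
		gen_pool_free(hl_bv_pool, (unsigned long)bv, size);
	else
		vfree(bv);
}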
/*
 * @TODO: This function can be used as a callback for IO completion under
 * kio->ki_complete in order to implement async IO (see the sketch following
 * it). Note that on more recent kernels there is no ret2.
 */
__maybe_unused static void hl_direct_io_complete(struct kiocb *kio, long ret, long ret2)
{
	struct hl_direct_io *io = container_of(kio, struct hl_direct_io, kio);

	dev_dbg(io->f.ctx->hdev->dev, "IO completed with %ld\n", ret);

	/* Do something to copy the result to the user / notify completion */

	hl_dio_put_iopath(io->f.ctx);

	hl_dio_fd_unregister(&io->f);
}
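/*
 * A minimal sketch of the async wiring the @TODO above refers to, assuming a
 * kernel >= 5.16 where ki_complete takes no ret2 argument. Both functions
 * below are hypothetical: instead of init_sync_kiocb(), the kiocb gets a
 * completion callback, and read_iter() returning -EIOCBQUEUED means the
 * callback will fire later; any other return value means the IO completed
 * inline and we invoke the callback ourselves to unify the cleanup path.
 */
static void __maybe_unused hl_direct_io_complete_async(struct kiocb *kio, long ret)
{
	struct hl_direct_io *io = container_of(kio, struct hl_direct_io, kio);

	dev_dbg(io->f.ctx->hdev->dev, "async IO completed with %ld\n", ret);

	hl_dio_put_iopath(io->f.ctx);
	hl_dio_fd_unregister(&io->f);
}

static ssize_t __maybe_unused hl_direct_io_async(struct hl_direct_io *io)
{
	ssize_t rc;

	io->kio.ki_filp = io->f.filp;
	io->kio.ki_pos = io->off_bytes;
	io->kio.ki_flags = IOCB_DIRECT;
	io->kio.ki_complete = hl_direct_io_complete_async;

	rc = io->f.filp->f_op->read_iter(&io->kio, &io->iter);
	if (rc != -EIOCBQUEUED)
		hl_direct_io_complete_async(&io->kio, rc); /* completed inline */

	return rc;
}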
/*
 * DMA disk to ASIC, wait for results. Must be invoked from the user context.
 */
int hl_dio_ssd2hl(struct hl_device *hdev, struct hl_ctx *ctx, int fd,
		  u64 device_va, off_t off_bytes, size_t len_bytes,
		  size_t *len_read)
{
	struct hl_direct_io *io;
	ssize_t rc;

	dev_dbg(hdev->dev, "SSD2HL fd=%d va=%#llx len=%#zx\n", fd, device_va, len_bytes);

	io = kzalloc(sizeof(*io), GFP_KERNEL);
	if (!io) {
		rc = -ENOMEM;
		goto out;
	}

	*io = (struct hl_direct_io){
		.device_va = device_va,
		.len_bytes = len_bytes,
		.off_bytes = off_bytes,
		.type = READ,
	};

	rc = hl_dio_fd_register(ctx, fd, &io->f);
	if (rc)
		goto kfree_io;

	rc = hl_direct_io(hdev, io);
	if (rc >= 0) {
		*len_read = rc;
		rc = 0;
	}

	/* This shall be called only in the case of a sync IO */
	hl_dio_fd_unregister(&io->f);
kfree_io:
	kfree(io);
out:
	return rc;
}

static void hl_p2p_region_fini(struct hl_device *hdev, struct hl_p2p_region *p2pr)
{
	if (p2pr->p2ppages) {
		vfree(p2pr->p2ppages);
		p2pr->p2ppages = NULL;
	}

	if (p2pr->p2pmem) {
		dev_dbg(hdev->dev, "freeing P2P mem from %p, size=%#llx\n",
			p2pr->p2pmem, p2pr->size);
		pci_free_p2pmem(hdev->pdev, p2pr->p2pmem, p2pr->size);
		p2pr->p2pmem = NULL;
	}
}

void hl_p2p_region_fini_all(struct hl_device *hdev)
{
	int i;

	for (i = 0 ; i < hdev->hldio.np2prs ; ++i)
		hl_p2p_region_fini(hdev, &hdev->hldio.p2prs[i]);

	kvfree(hdev->hldio.p2prs);
	hdev->hldio.p2prs = NULL;
	hdev->hldio.np2prs = 0;
}

int hl_p2p_region_init(struct hl_device *hdev, struct hl_p2p_region *p2pr)
{
	void *addr;
	int rc, i;

	/* Start by publishing our p2p memory */
	rc = pci_p2pdma_add_resource(hdev->pdev, p2pr->bar, p2pr->size, p2pr->bar_offset);
	if (rc) {
		dev_err(hdev->dev, "error adding p2p resource: %d\n", rc);
		goto err;
	}

	/* Alloc all p2p mem */
	p2pr->p2pmem = pci_alloc_p2pmem(hdev->pdev, p2pr->size);
	if (!p2pr->p2pmem) {
		dev_err(hdev->dev, "error allocating p2p memory\n");
		rc = -ENOMEM;
		goto err;
	}

	p2pr->p2ppages = vmalloc(array_size(p2pr->size >> PAGE_SHIFT, sizeof(struct page *)));
	if (!p2pr->p2ppages) {
		rc = -ENOMEM;
		goto err;
	}

	for (i = 0, addr = p2pr->p2pmem ; i < (p2pr->size >> PAGE_SHIFT) ; ++i, addr += PAGE_SIZE) {
		p2pr->p2ppages[i] = virt_to_page(addr);
		if (!p2pr->p2ppages[i]) {
			rc = -EFAULT;
			goto err;
		}
	}

	return 0;
err:
	hl_p2p_region_fini(hdev, p2pr);
	return rc;
}

int hl_dio_start(struct hl_device *hdev)
{
	dev_dbg(hdev->dev, "initializing HLDIO\n");

	/* Initialize the IO counter and enable IO */
	hdev->hldio.inflight_ios = alloc_percpu(s64);
	if (!hdev->hldio.inflight_ios)
		return -ENOMEM;

	hl_dio_set_io_enabled(hdev, true);

	return 0;
}

void hl_dio_stop(struct hl_device *hdev)
{
	dev_dbg(hdev->dev, "deinitializing HLDIO\n");

	if (hdev->hldio.io_enabled) {
		/* Disable new IOs and wait for all in-flight IOs to finish */
		hl_dio_set_io_enabled(hdev, false);
		hl_poll_timeout_condition(hdev, !hl_dio_count_io(hdev), 1000, IO_STABILIZE_TIMEOUT);
	}

	if (hdev->hldio.inflight_ios) {
		free_percpu(hdev->hldio.inflight_ios);
		hdev->hldio.inflight_ios = NULL;
	}
}
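/*
 * Usage sketch (hypothetical, not part of the driver): how an IOCTL handler
 * could drive a direct disk-to-device read via hl_dio_ssd2hl(). The fd comes
 * from userspace and must be opened with O_DIRECT; device_va must be a
 * page-aligned, MMU-mapped device address backed by a registered P2P region.
 */
static int __maybe_unused example_ioctl_ssd2hl(struct hl_device *hdev,
					       struct hl_ctx *ctx,
					       int fd, u64 device_va)
{
	size_t len_read = 0;
	int rc;

	/* Read one 2MB chunk from the start of the file into device memory */
	rc = hl_dio_ssd2hl(hdev, ctx, fd, device_va, 0, SZ_2M, &len_read);
	if (rc)
		return rc;

	/* A short read is possible, e.g. near the end of the file */
	if (len_read != SZ_2M)
		dev_dbg(hdev->dev, "short read: %#zx bytes\n", len_read);

	return 0;
}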