GitHub Repository: torvalds/linux
Path: blob/master/drivers/accel/habanalabs/common/hldio.c
// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright 2024 HabanaLabs, Ltd.
 * All Rights Reserved.
 */

#include "habanalabs.h"
#include "hldio.h"
#include <generated/uapi/linux/version.h>
#include <linux/pci-p2pdma.h>
#include <linux/blkdev.h>
#include <linux/vmalloc.h>

/*
 * NVMe Direct I/O implementation for the habanalabs driver
 *
 * ASSUMPTIONS
 * ===========
 * 1. No IOMMU (well, technically it can work with an IOMMU, but it is *almost*
 *    useless).
 * 2. Only READ operations (can be extended in the future).
 * 3. No sparse files (can be overcome in the future).
 * 4. Kernel version >= 6.9.
 * 5. Requiring page alignment is OK (I don't see a solution to this one right
 *    now; how do we read partial pages?).
 * 6. Kernel compiled with CONFIG_PCI_P2PDMA. This requires a CUSTOM kernel.
 *    Theoretically I have a slight idea of how this could be solved, but it
 *    is probably unacceptable for upstream. It also may not work in the end.
 * 7. Either make sure our cards and disks are under the same PCI bridge, or
 *    compile a custom kernel to hack around this.
 */
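
/*
 * Illustrative sketch only, not wired into the driver: assumptions 6 and 7
 * above boil down to "the kernel must be able to route P2P DMA between the
 * accelerator and the disk". Something along these lines could verify that
 * explicitly instead of relying on the topology being correct. The helper
 * name is made up for this example; pci_p2pdma_distance_many() is the
 * existing kernel API.
 */
__maybe_unused static bool hl_dio_p2p_path_usable(struct hl_device *hdev,
						  struct device *disk_dev)
{
	struct device *clients[] = { disk_dev };

	/* A negative distance means the P2P path is unusable */
	return pci_p2pdma_distance_many(hdev->pdev, clients,
					ARRAY_SIZE(clients), true) >= 0;
}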

#define IO_STABILIZE_TIMEOUT 10000000 /* 10 seconds in microseconds */

/*
 * This struct contains all the useful data I could milk out of the file handle
 * provided by the user.
 * @TODO: right now it is retrieved on each IO, but it could be done once with a
 * dedicated IOCTL, e.g. HL_REGISTER_HANDLE (see the illustrative sketch after
 * hl_dio_fd_unregister() below).
 */
struct hl_dio_fd {
	/* Back pointer in case we need it in async completion */
	struct hl_ctx *ctx;
	/* Associated fd struct */
	struct file *filp;
};

/*
 * This is a single IO descriptor
 */
struct hl_direct_io {
	struct hl_dio_fd f;
	struct kiocb kio;
	struct bio_vec *bv;
	struct iov_iter iter;
	u64 device_va;
	u64 off_bytes;
	u64 len_bytes;
	u32 type;
};

bool hl_device_supports_nvme(struct hl_device *hdev)
{
	return hdev->asic_prop.supports_nvme;
}

static int hl_dio_fd_register(struct hl_ctx *ctx, int fd, struct hl_dio_fd *f)
{
	struct hl_device *hdev = ctx->hdev;
	struct block_device *bd;
	struct super_block *sb;
	struct inode *inode;
	struct gendisk *gd;
	struct device *disk_dev;
	int rc;

	f->filp = fget(fd);
	if (!f->filp) {
		rc = -ENOENT;
		goto out;
	}

	if (!(f->filp->f_flags & O_DIRECT)) {
		dev_err(hdev->dev, "file is not open in O_DIRECT mode\n");
		rc = -EINVAL;
		goto fput;
	}

	if (!f->filp->f_op->read_iter) {
		dev_err(hdev->dev, "read iter is not supported, need to fall back to legacy\n");
		rc = -EINVAL;
		goto fput;
	}

	inode = file_inode(f->filp);
	sb = inode->i_sb;
	bd = sb->s_bdev;
	gd = bd ? bd->bd_disk : NULL;

	if (!bd || !gd) {
		dev_err(hdev->dev, "invalid block device\n");
		rc = -ENODEV;
		goto fput;
	}

	if (inode->i_blocks << sb->s_blocksize_bits < i_size_read(inode)) {
		dev_err(hdev->dev, "sparse files are not currently supported\n");
		rc = -EINVAL;
		goto fput;
	}

	/* Get the underlying device from the block device */
	disk_dev = disk_to_dev(gd);
	if (!dma_pci_p2pdma_supported(disk_dev)) {
		dev_err(hdev->dev, "device does not support PCI P2P DMA\n");
		rc = -EOPNOTSUPP;
		goto fput;
	}

	/*
	 * @TODO: Maybe we need additional checks here
	 */

	f->ctx = ctx;
	rc = 0;

	goto out;
fput:
	fput(f->filp);
out:
	return rc;
}

static void hl_dio_fd_unregister(struct hl_dio_fd *f)
{
	fput(f->filp);
}
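
/*
 * Illustrative sketch only, not part of the driver: the @TODO above
 * struct hl_dio_fd suggests registering the file handle once via a dedicated
 * IOCTL (e.g. a hypothetical HL_REGISTER_HANDLE) instead of resolving it on
 * every IO. Such a handler would boil down to validating and pinning the fd up
 * front and caching the result; the function name below is made up.
 */
__maybe_unused static int hl_dio_register_handle(struct hl_ctx *ctx, int fd,
						 struct hl_dio_fd *f)
{
	/*
	 * Validate and pin the fd once; the caller caches *f, reuses it for
	 * subsequent IOs and releases it with hl_dio_fd_unregister() when the
	 * handle is torn down.
	 */
	return hl_dio_fd_register(ctx, fd, f);
}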

static long hl_dio_count_io(struct hl_device *hdev)
{
	s64 sum = 0;
	int i;

	for_each_possible_cpu(i)
		sum += per_cpu(*hdev->hldio.inflight_ios, i);

	return sum;
}

static bool hl_dio_get_iopath(struct hl_ctx *ctx)
{
	struct hl_device *hdev = ctx->hdev;

	if (hdev->hldio.io_enabled) {
		this_cpu_inc(*hdev->hldio.inflight_ios);

		/*
		 * Re-check after the increment: hl_dio_stop() clears
		 * io_enabled and then waits for the counter to drain, so this
		 * closes the race with a concurrent disable.
		 */
		if (!hdev->hldio.io_enabled) {
			this_cpu_dec(*hdev->hldio.inflight_ios);
			return false;
		}

		hl_ctx_get(ctx);

		return true;
	}

	return false;
}

static void hl_dio_put_iopath(struct hl_ctx *ctx)
{
	struct hl_device *hdev = ctx->hdev;

	hl_ctx_put(ctx);
	this_cpu_dec(*hdev->hldio.inflight_ios);
}

static void hl_dio_set_io_enabled(struct hl_device *hdev, bool enabled)
{
	hdev->hldio.io_enabled = enabled;
}

static bool hl_dio_validate_io(struct hl_device *hdev, struct hl_direct_io *io)
{
	if ((u64)io->device_va & ~PAGE_MASK) {
		dev_dbg(hdev->dev, "device address must be 4K aligned\n");
		return false;
	}

	if (io->len_bytes & ~PAGE_MASK) {
		dev_dbg(hdev->dev, "IO length must be 4K aligned\n");
		return false;
	}

	if (io->off_bytes & ~PAGE_MASK) {
		dev_dbg(hdev->dev, "IO offset must be 4K aligned\n");
		return false;
	}

	return true;
}

static struct page *hl_dio_va2page(struct hl_device *hdev, struct hl_ctx *ctx, u64 device_va)
{
	struct hl_dio *hldio = &hdev->hldio;
	u64 device_pa;
	int rc, i;

	rc = hl_mmu_va_to_pa(ctx, device_va, &device_pa);
	if (rc) {
		dev_err(hdev->dev, "device virtual address translation error: %#llx (%d)\n",
			device_va, rc);
		return NULL;
	}

	for (i = 0 ; i < hldio->np2prs ; ++i) {
		if (device_pa >= hldio->p2prs[i].device_pa &&
		    device_pa < hldio->p2prs[i].device_pa + hldio->p2prs[i].size)
			return hldio->p2prs[i].p2ppages[(device_pa - hldio->p2prs[i].device_pa) >>
							PAGE_SHIFT];
	}

	return NULL;
}

static ssize_t hl_direct_io(struct hl_device *hdev, struct hl_direct_io *io)
{
	u64 npages, device_va;
	ssize_t rc;
	int i;

	if (!hl_dio_validate_io(hdev, io))
		return -EINVAL;

	if (!hl_dio_get_iopath(io->f.ctx)) {
		dev_info(hdev->dev, "can't schedule a new IO, IO is disabled\n");
		return -ESHUTDOWN;
	}

	init_sync_kiocb(&io->kio, io->f.filp);
	io->kio.ki_pos = io->off_bytes;

	npages = (io->len_bytes >> PAGE_SHIFT);

	/* @TODO: this can be implemented smarter, vmalloc in the iopath is not
	 * ideal. Maybe some variation of genpool. The number of pages may differ
	 * greatly, so maybe even use pools of different sizes and choose the
	 * closest one (an illustrative kvcalloc()-based sketch follows this
	 * function).
	 */
	io->bv = vzalloc(npages * sizeof(struct bio_vec));
	if (!io->bv) {
		rc = -ENOMEM;
		goto put_iopath;
	}

	for (i = 0, device_va = io->device_va; i < npages ; ++i, device_va += PAGE_SIZE) {
		io->bv[i].bv_page = hl_dio_va2page(hdev, io->f.ctx, device_va);
		if (!io->bv[i].bv_page) {
			dev_err(hdev->dev, "error getting page struct for device va %#llx\n",
				device_va);
			rc = -EFAULT;
			goto cleanup;
		}
		io->bv[i].bv_offset = 0;
		io->bv[i].bv_len = PAGE_SIZE;
	}

	iov_iter_bvec(&io->iter, io->type, io->bv, npages, io->len_bytes);
	if (io->f.filp->f_op && io->f.filp->f_op->read_iter)
		rc = io->f.filp->f_op->read_iter(&io->kio, &io->iter);
	else
		rc = -EINVAL;

cleanup:
	vfree(io->bv);
put_iopath:
	hl_dio_put_iopath(io->f.ctx);

	dev_dbg(hdev->dev, "IO ended with %zd\n", rc);

	return rc;
}
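
/*
 * Illustrative sketch for the @TODO in hl_direct_io() above, not wired in:
 * a simple middle ground before a full genpool is to keep small bio_vec
 * arrays on the slab and only fall back to vmalloc for large IOs, which is
 * what kvcalloc()/kvfree() do. The helper names are made up for this example.
 */
__maybe_unused static struct bio_vec *hl_dio_bv_alloc(u64 npages)
{
	/* kvcalloc() tries kmalloc first and transparently falls back to vmalloc */
	return kvcalloc(npages, sizeof(struct bio_vec), GFP_KERNEL);
}

__maybe_unused static void hl_dio_bv_free(struct bio_vec *bv)
{
	kvfree(bv);
}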

/*
 * @TODO: This function can be used as a callback for io completion under
 * kio->ki_complete in order to implement async IO.
 * Note that on more recent kernels there is no ret2.
 */
__maybe_unused static void hl_direct_io_complete(struct kiocb *kio, long ret, long ret2)
{
	struct hl_direct_io *io = container_of(kio, struct hl_direct_io, kio);

	dev_dbg(io->f.ctx->hdev->dev, "IO completed with %ld\n", ret);

	/* Do something to copy result to user / notify completion */

	hl_dio_put_iopath(io->f.ctx);

	hl_dio_fd_unregister(&io->f);
}

/*
 * DMA from disk to ASIC and wait for the result. Must be invoked from user context.
 */
int hl_dio_ssd2hl(struct hl_device *hdev, struct hl_ctx *ctx, int fd,
		  u64 device_va, off_t off_bytes, size_t len_bytes,
		  size_t *len_read)
{
	struct hl_direct_io *io;
	ssize_t rc;

	dev_dbg(hdev->dev, "SSD2HL fd=%d va=%#llx len=%#zx\n", fd, device_va, len_bytes);

	io = kzalloc(sizeof(*io), GFP_KERNEL);
	if (!io) {
		rc = -ENOMEM;
		goto out;
	}

	*io = (struct hl_direct_io){
		.device_va = device_va,
		.len_bytes = len_bytes,
		.off_bytes = off_bytes,
		.type = READ,
	};

	rc = hl_dio_fd_register(ctx, fd, &io->f);
	if (rc)
		goto kfree_io;

	rc = hl_direct_io(hdev, io);
	if (rc >= 0) {
		*len_read = rc;
		rc = 0;
	}

	/* For sync IO the handle is unregistered here; async IO would do it in the completion callback */
	hl_dio_fd_unregister(&io->f);
kfree_io:
	kfree(io);
out:
	return rc;
}
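
/*
 * Illustrative sketch only (hypothetical, not part of the driver): how an
 * IOCTL handler could drive hl_dio_ssd2hl() for a synchronous read. The args
 * structure and handler name are made up for this example; a real handler
 * would also copy len_read back to user space.
 */
struct hl_dio_read_sketch_args {
	u64 device_va;
	u64 off_bytes;
	u64 len_bytes;
	s32 fd;
};

__maybe_unused static int hl_dio_read_sketch(struct hl_device *hdev, struct hl_ctx *ctx,
					     struct hl_dio_read_sketch_args *args)
{
	size_t len_read = 0;
	int rc;

	rc = hl_dio_ssd2hl(hdev, ctx, args->fd, args->device_va,
			   args->off_bytes, args->len_bytes, &len_read);
	if (rc)
		return rc;

	dev_dbg(hdev->dev, "read %zu bytes from fd %d\n", len_read, args->fd);

	return 0;
}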

static void hl_p2p_region_fini(struct hl_device *hdev, struct hl_p2p_region *p2pr)
{
	if (p2pr->p2ppages) {
		vfree(p2pr->p2ppages);
		p2pr->p2ppages = NULL;
	}

	if (p2pr->p2pmem) {
		dev_dbg(hdev->dev, "freeing P2P mem from %p, size=%#llx\n",
			p2pr->p2pmem, p2pr->size);
		pci_free_p2pmem(hdev->pdev, p2pr->p2pmem, p2pr->size);
		p2pr->p2pmem = NULL;
	}
}

void hl_p2p_region_fini_all(struct hl_device *hdev)
{
	int i;

	for (i = 0 ; i < hdev->hldio.np2prs ; ++i)
		hl_p2p_region_fini(hdev, &hdev->hldio.p2prs[i]);

	kvfree(hdev->hldio.p2prs);
	hdev->hldio.p2prs = NULL;
	hdev->hldio.np2prs = 0;
}

int hl_p2p_region_init(struct hl_device *hdev, struct hl_p2p_region *p2pr)
{
	void *addr;
	int rc, i;

	/* Start by publishing our p2p memory */
	rc = pci_p2pdma_add_resource(hdev->pdev, p2pr->bar, p2pr->size, p2pr->bar_offset);
	if (rc) {
		dev_err(hdev->dev, "error adding p2p resource: %d\n", rc);
		goto err;
	}

	/* Alloc all p2p mem */
	p2pr->p2pmem = pci_alloc_p2pmem(hdev->pdev, p2pr->size);
	if (!p2pr->p2pmem) {
		dev_err(hdev->dev, "error allocating p2p memory\n");
		rc = -ENOMEM;
		goto err;
	}

	p2pr->p2ppages = vmalloc((p2pr->size >> PAGE_SHIFT) * sizeof(struct page *));
	if (!p2pr->p2ppages) {
		rc = -ENOMEM;
		goto err;
	}

	for (i = 0, addr = p2pr->p2pmem ; i < (p2pr->size >> PAGE_SHIFT) ; ++i, addr += PAGE_SIZE) {
		p2pr->p2ppages[i] = virt_to_page(addr);
		if (!p2pr->p2ppages[i]) {
			rc = -EFAULT;
			goto err;
		}
	}

	return 0;
err:
	hl_p2p_region_fini(hdev, p2pr);
	return rc;
}
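
/*
 * Illustrative sketch only, not wired into the driver: how ASIC-specific init
 * code might describe and register a single P2P region backed by a device BAR.
 * The function name is made up and the caller would supply real BAR/address
 * values; on teardown hl_p2p_region_fini_all() releases everything.
 */
__maybe_unused static int hl_p2p_single_region_setup(struct hl_device *hdev, int bar,
						     u64 bar_offset, u64 device_pa, u64 size)
{
	struct hl_p2p_region *p2pr;
	int rc;

	p2pr = kvcalloc(1, sizeof(*p2pr), GFP_KERNEL);
	if (!p2pr)
		return -ENOMEM;

	p2pr->bar = bar;
	p2pr->bar_offset = bar_offset;
	p2pr->device_pa = device_pa;
	p2pr->size = size;

	rc = hl_p2p_region_init(hdev, p2pr);
	if (rc) {
		kvfree(p2pr);
		return rc;
	}

	hdev->hldio.p2prs = p2pr;
	hdev->hldio.np2prs = 1;

	return 0;
}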

int hl_dio_start(struct hl_device *hdev)
{
	dev_dbg(hdev->dev, "initializing HLDIO\n");

	/* Initialize the IO counter and enable IO */
	hdev->hldio.inflight_ios = alloc_percpu(s64);
	if (!hdev->hldio.inflight_ios)
		return -ENOMEM;

	hl_dio_set_io_enabled(hdev, true);

	return 0;
}

void hl_dio_stop(struct hl_device *hdev)
{
	dev_dbg(hdev->dev, "deinitializing HLDIO\n");

	if (hdev->hldio.io_enabled) {
		/* Wait for all the IO to finish */
		hl_dio_set_io_enabled(hdev, false);
		hl_poll_timeout_condition(hdev, !hl_dio_count_io(hdev), 1000, IO_STABILIZE_TIMEOUT);
	}

	if (hdev->hldio.inflight_ios) {
		free_percpu(hdev->hldio.inflight_ios);
		hdev->hldio.inflight_ios = NULL;
	}
}