Path: blob/master/drivers/accel/habanalabs/common/hldio.c
// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright 2024 HabanaLabs, Ltd.
 * All Rights Reserved.
 */

#include "habanalabs.h"
#include "hldio.h"
#include <generated/uapi/linux/version.h>
#include <linux/pci-p2pdma.h>
#include <linux/blkdev.h>
#include <linux/vmalloc.h>

/*
 * NVMe Direct I/O implementation for the habanalabs driver
 *
 * ASSUMPTIONS
 * ===========
 * 1. No IOMMU (well, technically it can work with an IOMMU, but it is
 *    *almost* useless).
 * 2. Only READ operations (can be extended in the future).
 * 3. No sparse files (can be overcome in the future).
 * 4. Kernel version >= 6.9.
 * 5. Requiring page alignment is OK (I don't see a solution to this one
 *    right now; how do we read partial pages?).
 * 6. Kernel compiled with CONFIG_PCI_P2PDMA. This requires a CUSTOM kernel.
 *    Theoretically I have a slight idea of how this could be solved, but it
 *    is probably unacceptable for upstream. It also may not work in the end.
 * 7. Either make sure our cards and disks are under the same PCI bridge, or
 *    compile a custom kernel to hack around this.
 */

#define IO_STABILIZE_TIMEOUT	10000000 /* 10 seconds in microseconds */

/*
 * This struct contains all the useful data I could milk out of the file
 * handle provided by the user.
 * @TODO: right now it is retrieved on each IO, but it could be fetched once
 * with a dedicated IOCTL, call it for example HL_REGISTER_HANDLE.
 */
struct hl_dio_fd {
	/* Back pointer in case we need it in async completion */
	struct hl_ctx *ctx;
	/* Associated fd struct */
	struct file *filp;
};

/*
 * This is a single IO descriptor
 */
struct hl_direct_io {
	struct hl_dio_fd f;
	struct kiocb kio;
	struct bio_vec *bv;
	struct iov_iter iter;
	u64 device_va;
	u64 off_bytes;
	u64 len_bytes;
	u32 type;
};

bool hl_device_supports_nvme(struct hl_device *hdev)
{
	return hdev->asic_prop.supports_nvme;
}

static int hl_dio_fd_register(struct hl_ctx *ctx, int fd, struct hl_dio_fd *f)
{
	struct hl_device *hdev = ctx->hdev;
	struct block_device *bd;
	struct super_block *sb;
	struct inode *inode;
	struct gendisk *gd;
	struct device *disk_dev;
	int rc;

	f->filp = fget(fd);
	if (!f->filp) {
		rc = -EBADF;
		goto out;
	}

	if (!(f->filp->f_flags & O_DIRECT)) {
		dev_err(hdev->dev, "file is not opened in direct mode\n");
		rc = -EINVAL;
		goto fput;
	}

	if (!f->filp->f_op->read_iter) {
		dev_err(hdev->dev, "read_iter is not supported, need to fall back to legacy\n");
		rc = -EINVAL;
		goto fput;
	}

	inode = file_inode(f->filp);
	sb = inode->i_sb;
	bd = sb->s_bdev;

	/* Validate the block device before dereferencing it */
	if (!bd || !bd->bd_disk) {
		dev_err(hdev->dev, "invalid block device\n");
		rc = -ENODEV;
		goto fput;
	}
	gd = bd->bd_disk;

	/* i_blocks is counted in 512-byte sectors, regardless of blocksize */
	if (inode->i_blocks << 9 < i_size_read(inode)) {
		dev_err(hdev->dev, "sparse files are not currently supported\n");
		rc = -EINVAL;
		goto fput;
	}

	/* Get the underlying device from the block device */
	disk_dev = disk_to_dev(gd);
	if (!dma_pci_p2pdma_supported(disk_dev)) {
		dev_err(hdev->dev, "device does not support PCI P2P DMA\n");
		rc = -EOPNOTSUPP;
		goto fput;
	}

	/*
	 * @TODO: Maybe we need additional checks here
	 */

	f->ctx = ctx;
	rc = 0;

	goto out;
fput:
	fput(f->filp);
out:
	return rc;
}

static void hl_dio_fd_unregister(struct hl_dio_fd *f)
{
	fput(f->filp);
}

static long hl_dio_count_io(struct hl_device *hdev)
{
	s64 sum = 0;
	int i;

	for_each_possible_cpu(i)
		sum += per_cpu(*hdev->hldio.inflight_ios, i);

	return sum;
}

static bool hl_dio_get_iopath(struct hl_ctx *ctx)
{
	struct hl_device *hdev = ctx->hdev;

	if (hdev->hldio.io_enabled) {
		this_cpu_inc(*hdev->hldio.inflight_ios);

		/*
		 * Re-check after the increment is published: hl_dio_stop()
		 * clears io_enabled first and then waits for the in-flight
		 * counter to drain, so an IO racing with the disable must
		 * back out here instead of slipping through.
		 */
		if (!hdev->hldio.io_enabled) {
			this_cpu_dec(*hdev->hldio.inflight_ios);
			return false;
		}

		hl_ctx_get(ctx);

		return true;
	}

	return false;
}
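/*
 * Illustrative usage sketch (not part of the driver): every IO path is
 * expected to bracket the actual IO with the get/put pair above, so that
 * hl_dio_stop() can disable new IOs and drain the in-flight counter safely.
 * Both hl_dio_example_io() and do_example_io() are hypothetical names:
 *
 *	static ssize_t hl_dio_example_io(struct hl_ctx *ctx)
 *	{
 *		ssize_t rc;
 *
 *		if (!hl_dio_get_iopath(ctx))
 *			return -ESHUTDOWN;	// IO is disabled, don't start
 *
 *		rc = do_example_io();		// hypothetical IO body
 *
 *		hl_dio_put_iopath(ctx);
 *		return rc;
 *	}
 */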
static void hl_dio_put_iopath(struct hl_ctx *ctx)
{
	struct hl_device *hdev = ctx->hdev;

	hl_ctx_put(ctx);
	this_cpu_dec(*hdev->hldio.inflight_ios);
}

static void hl_dio_set_io_enabled(struct hl_device *hdev, bool enabled)
{
	hdev->hldio.io_enabled = enabled;
}

static bool hl_dio_validate_io(struct hl_device *hdev, struct hl_direct_io *io)
{
	if (io->device_va & ~PAGE_MASK) {
		dev_dbg(hdev->dev, "device address must be page aligned\n");
		return false;
	}

	if (io->len_bytes & ~PAGE_MASK) {
		dev_dbg(hdev->dev, "IO length must be page aligned\n");
		return false;
	}

	if (io->off_bytes & ~PAGE_MASK) {
		dev_dbg(hdev->dev, "IO offset must be page aligned\n");
		return false;
	}

	return true;
}

static struct page *hl_dio_va2page(struct hl_device *hdev, struct hl_ctx *ctx, u64 device_va)
{
	struct hl_dio *hldio = &hdev->hldio;
	u64 device_pa;
	int rc, i;

	rc = hl_mmu_va_to_pa(ctx, device_va, &device_pa);
	if (rc) {
		dev_err(hdev->dev, "device virtual address translation error: %#llx (%d)",
			device_va, rc);
		return NULL;
	}

	for (i = 0 ; i < hldio->np2prs ; ++i) {
		if (device_pa >= hldio->p2prs[i].device_pa &&
		    device_pa < hldio->p2prs[i].device_pa + hldio->p2prs[i].size)
			return hldio->p2prs[i].p2ppages[(device_pa - hldio->p2prs[i].device_pa) >>
							PAGE_SHIFT];
	}

	return NULL;
}

static ssize_t hl_direct_io(struct hl_device *hdev, struct hl_direct_io *io)
{
	u64 npages, device_va;
	ssize_t rc;
	int i;

	if (!hl_dio_validate_io(hdev, io))
		return -EINVAL;

	if (!hl_dio_get_iopath(io->f.ctx)) {
		dev_info(hdev->dev, "can't schedule a new IO, IO is disabled\n");
		return -ESHUTDOWN;
	}

	init_sync_kiocb(&io->kio, io->f.filp);
	io->kio.ki_pos = io->off_bytes;

	npages = io->len_bytes >> PAGE_SHIFT;

	/*
	 * @TODO: this can be implemented smarter, vmalloc in the iopath is
	 * not ideal. Maybe some variation of genpool. The number of pages may
	 * differ greatly between IOs, so maybe even use pools of different
	 * sizes and choose the closest one. See the sketch following this
	 * function.
	 */
	io->bv = vzalloc(array_size(npages, sizeof(struct bio_vec)));
	if (!io->bv) {
		rc = -ENOMEM;
		goto put_iopath;
	}

	for (i = 0, device_va = io->device_va ; i < npages ; ++i, device_va += PAGE_SIZE) {
		io->bv[i].bv_page = hl_dio_va2page(hdev, io->f.ctx, device_va);
		if (!io->bv[i].bv_page) {
			dev_err(hdev->dev, "error getting page struct for device va %#llx",
				device_va);
			rc = -EFAULT;
			goto free_bv;
		}
		io->bv[i].bv_offset = 0;
		io->bv[i].bv_len = PAGE_SIZE;
	}

	iov_iter_bvec(&io->iter, io->type, io->bv, npages, io->len_bytes);
	if (io->f.filp->f_op && io->f.filp->f_op->read_iter)
		rc = io->f.filp->f_op->read_iter(&io->kio, &io->iter);
	else
		rc = -EINVAL;

free_bv:
	vfree(io->bv);
put_iopath:
	hl_dio_put_iopath(io->f.ctx);

	dev_dbg(hdev->dev, "IO ended with %zd\n", rc);

	return rc;
}
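/*
 * A minimal sketch of the @TODO above, assuming the kernel genalloc API
 * (<linux/genalloc.h>). Everything here (hl_bv_pool, HL_BV_POOL_PAGES and
 * the helpers) is hypothetical, not part of the driver: preallocate one
 * chunk of bio_vecs at init time, carve per-IO arrays out of it, and fall
 * back to vzalloc() only when the pool is exhausted or the IO is too big.
 * Unlike vzalloc(), pool memory is not zeroed; the caller must set every
 * bio_vec field, as hl_direct_io() already does.
 */
#include <linux/genalloc.h>

#define HL_BV_POOL_PAGES	4096	/* hypothetical pool capacity */

static struct gen_pool *hl_bv_pool;

static int __maybe_unused hl_bv_pool_init(void)
{
	size_t size = HL_BV_POOL_PAGES * sizeof(struct bio_vec);
	void *buf;

	/* Allocation granularity of one bio_vec keeps fragmentation low */
	hl_bv_pool = gen_pool_create(ilog2(sizeof(struct bio_vec)), -1);
	if (!hl_bv_pool)
		return -ENOMEM;

	buf = vzalloc(size);
	if (!buf) {
		gen_pool_destroy(hl_bv_pool);
		return -ENOMEM;
	}

	return gen_pool_add(hl_bv_pool, (unsigned long)buf, size, -1);
}

static struct bio_vec * __maybe_unused hl_bv_pool_alloc(u64 npages)
{
	struct bio_vec *bv = (struct bio_vec *)
		gen_pool_alloc(hl_bv_pool, npages * sizeof(struct bio_vec));

	/* Fall back to vzalloc() for oversized or contended requests */
	return bv ? bv : vzalloc(array_size(npages, sizeof(struct bio_vec)));
}

static void __maybe_unused hl_bv_pool_free(struct bio_vec *bv, u64 npages)
{
	size_t size = npages * sizeof(struct bio_vec);

	if (gen_pool_has_addr(hl_bv_pool, (unsigned long)bv, size))
		gen_pool_free(hl_bv_pool, (unsigned long)bv, size);
	else
		vfree(bv);
}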
/*
 * @TODO: This function can be used as a callback for IO completion under
 * kio->ki_complete in order to implement async IO (see the sketch following
 * it). Note that on more recent kernels there is no ret2.
 */
__maybe_unused static void hl_direct_io_complete(struct kiocb *kio, long ret, long ret2)
{
	struct hl_direct_io *io = container_of(kio, struct hl_direct_io, kio);

	dev_dbg(io->f.ctx->hdev->dev, "IO completed with %ld\n", ret);

	/* Do something to copy the result to the user / notify completion */

	hl_dio_put_iopath(io->f.ctx);

	hl_dio_fd_unregister(&io->f);
}
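/*
 * A minimal sketch of the async wiring the @TODO above refers to, assuming a
 * kernel >= 5.16 where ki_complete takes no ret2 argument. Both functions
 * below are hypothetical: instead of init_sync_kiocb(), the kiocb gets a
 * completion callback, and read_iter() returning -EIOCBQUEUED means the
 * callback will fire later; any other return value means the IO completed
 * inline and we invoke the callback ourselves to unify the cleanup path.
 */
static void __maybe_unused hl_direct_io_complete_async(struct kiocb *kio, long ret)
{
	struct hl_direct_io *io = container_of(kio, struct hl_direct_io, kio);

	dev_dbg(io->f.ctx->hdev->dev, "async IO completed with %ld\n", ret);

	hl_dio_put_iopath(io->f.ctx);
	hl_dio_fd_unregister(&io->f);
}

static ssize_t __maybe_unused hl_direct_io_async(struct hl_direct_io *io)
{
	ssize_t rc;

	io->kio.ki_filp = io->f.filp;
	io->kio.ki_pos = io->off_bytes;
	io->kio.ki_flags = IOCB_DIRECT;
	io->kio.ki_complete = hl_direct_io_complete_async;

	rc = io->f.filp->f_op->read_iter(&io->kio, &io->iter);
	if (rc != -EIOCBQUEUED)
		hl_direct_io_complete_async(&io->kio, rc); /* completed inline */

	return rc;
}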
/*
 * DMA disk to ASIC, wait for results. Must be invoked from the user context.
 */
int hl_dio_ssd2hl(struct hl_device *hdev, struct hl_ctx *ctx, int fd,
		  u64 device_va, off_t off_bytes, size_t len_bytes,
		  size_t *len_read)
{
	struct hl_direct_io *io;
	ssize_t rc;

	dev_dbg(hdev->dev, "SSD2HL fd=%d va=%#llx len=%#zx\n", fd, device_va, len_bytes);

	io = kzalloc(sizeof(*io), GFP_KERNEL);
	if (!io) {
		rc = -ENOMEM;
		goto out;
	}

	*io = (struct hl_direct_io){
		.device_va = device_va,
		.len_bytes = len_bytes,
		.off_bytes = off_bytes,
		.type = READ,
	};

	rc = hl_dio_fd_register(ctx, fd, &io->f);
	if (rc)
		goto kfree_io;

	rc = hl_direct_io(hdev, io);
	if (rc >= 0) {
		*len_read = rc;
		rc = 0;
	}

	/* This shall be called only in the case of a sync IO */
	hl_dio_fd_unregister(&io->f);
kfree_io:
	kfree(io);
out:
	return rc;
}

static void hl_p2p_region_fini(struct hl_device *hdev, struct hl_p2p_region *p2pr)
{
	if (p2pr->p2ppages) {
		vfree(p2pr->p2ppages);
		p2pr->p2ppages = NULL;
	}

	if (p2pr->p2pmem) {
		dev_dbg(hdev->dev, "freeing P2P mem from %p, size=%#llx\n",
			p2pr->p2pmem, p2pr->size);
		pci_free_p2pmem(hdev->pdev, p2pr->p2pmem, p2pr->size);
		p2pr->p2pmem = NULL;
	}
}

void hl_p2p_region_fini_all(struct hl_device *hdev)
{
	int i;

	for (i = 0 ; i < hdev->hldio.np2prs ; ++i)
		hl_p2p_region_fini(hdev, &hdev->hldio.p2prs[i]);

	kvfree(hdev->hldio.p2prs);
	hdev->hldio.p2prs = NULL;
	hdev->hldio.np2prs = 0;
}

int hl_p2p_region_init(struct hl_device *hdev, struct hl_p2p_region *p2pr)
{
	void *addr;
	int rc, i;

	/* Start by publishing our p2p memory */
	rc = pci_p2pdma_add_resource(hdev->pdev, p2pr->bar, p2pr->size, p2pr->bar_offset);
	if (rc) {
		dev_err(hdev->dev, "error adding p2p resource: %d\n", rc);
		goto err;
	}

	/* Alloc all p2p mem */
	p2pr->p2pmem = pci_alloc_p2pmem(hdev->pdev, p2pr->size);
	if (!p2pr->p2pmem) {
		dev_err(hdev->dev, "error allocating p2p memory\n");
		rc = -ENOMEM;
		goto err;
	}

	p2pr->p2ppages = vmalloc(array_size(p2pr->size >> PAGE_SHIFT, sizeof(struct page *)));
	if (!p2pr->p2ppages) {
		rc = -ENOMEM;
		goto err;
	}

	for (i = 0, addr = p2pr->p2pmem ; i < (p2pr->size >> PAGE_SHIFT) ; ++i, addr += PAGE_SIZE) {
		p2pr->p2ppages[i] = virt_to_page(addr);
		if (!p2pr->p2ppages[i]) {
			rc = -EFAULT;
			goto err;
		}
	}

	return 0;
err:
	hl_p2p_region_fini(hdev, p2pr);
	return rc;
}

int hl_dio_start(struct hl_device *hdev)
{
	dev_dbg(hdev->dev, "initializing HLDIO\n");

	/* Initialize the IO counter and enable IO */
	hdev->hldio.inflight_ios = alloc_percpu(s64);
	if (!hdev->hldio.inflight_ios)
		return -ENOMEM;

	hl_dio_set_io_enabled(hdev, true);

	return 0;
}

void hl_dio_stop(struct hl_device *hdev)
{
	dev_dbg(hdev->dev, "deinitializing HLDIO\n");

	if (hdev->hldio.io_enabled) {
		/* Disable new IOs and wait for all in-flight IOs to finish */
		hl_dio_set_io_enabled(hdev, false);
		hl_poll_timeout_condition(hdev, !hl_dio_count_io(hdev), 1000, IO_STABILIZE_TIMEOUT);
	}

	if (hdev->hldio.inflight_ios) {
		free_percpu(hdev->hldio.inflight_ios);
		hdev->hldio.inflight_ios = NULL;
	}
}
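/*
 * Usage sketch (hypothetical, not part of the driver): how an IOCTL handler
 * could drive a direct disk-to-device read via hl_dio_ssd2hl(). The fd comes
 * from userspace and must be opened with O_DIRECT; device_va must be a
 * page-aligned, MMU-mapped device address backed by a registered P2P region.
 */
static int __maybe_unused example_ioctl_ssd2hl(struct hl_device *hdev,
					       struct hl_ctx *ctx,
					       int fd, u64 device_va)
{
	size_t len_read = 0;
	int rc;

	/* Read one 2MB chunk from the start of the file into device memory */
	rc = hl_dio_ssd2hl(hdev, ctx, fd, device_va, 0, SZ_2M, &len_read);
	if (rc)
		return rc;

	/* A short read is possible, e.g. near the end of the file */
	if (len_read != SZ_2M)
		dev_dbg(hdev->dev, "short read: %#zx bytes\n", len_read);

	return 0;
}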