// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Userspace block device - a block device whose IO is handled from userspace
 *
 * Makes full use of the io_uring passthrough command for communicating with
 * the ublk userspace daemon (ublksrvd) to handle basic IO requests.
 *
 * Copyright 2022 Ming Lei <[email protected]>
 *
 * (part of code stolen from loop.c)
 */
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/major.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/compat.h>
#include <linux/mutex.h>
#include <linux/writeback.h>
#include <linux/completion.h>
#include <linux/highmem.h>
#include <linux/sysfs.h>
#include <linux/miscdevice.h>
#include <linux/falloc.h>
#include <linux/uio.h>
#include <linux/ioprio.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/cdev.h>
#include <linux/io_uring/cmd.h>
#include <linux/blk-mq.h>
#include <linux/delay.h>
#include <linux/mm.h>
#include <asm/page.h>
#include <linux/task_work.h>
#include <linux/namei.h>
#include <linux/kref.h>
#include <uapi/linux/ublk_cmd.h>

#define UBLK_MINORS		(1U << MINORBITS)

#define UBLK_INVALID_BUF_IDX	((u16)-1)

/* private ioctl command mirror */
#define UBLK_CMD_DEL_DEV_ASYNC	_IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC)
#define UBLK_CMD_UPDATE_SIZE	_IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
#define UBLK_CMD_QUIESCE_DEV	_IOC_NR(UBLK_U_CMD_QUIESCE_DEV)

#define UBLK_IO_REGISTER_IO_BUF		_IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
#define UBLK_IO_UNREGISTER_IO_BUF	_IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)

/* All UBLK_F_* have to be included into UBLK_F_ALL */
#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
		| UBLK_F_URING_CMD_COMP_IN_TASK \
		| UBLK_F_NEED_GET_DATA \
		| UBLK_F_USER_RECOVERY \
		| UBLK_F_USER_RECOVERY_REISSUE \
		| UBLK_F_UNPRIVILEGED_DEV \
		| UBLK_F_CMD_IOCTL_ENCODE \
		| UBLK_F_USER_COPY \
		| UBLK_F_ZONED \
		| UBLK_F_USER_RECOVERY_FAIL_IO \
		| UBLK_F_UPDATE_SIZE \
		| UBLK_F_AUTO_BUF_REG \
		| UBLK_F_QUIESCE \
		| UBLK_F_PER_IO_DAEMON \
		| UBLK_F_BUF_REG_OFF_DAEMON)

#define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
		| UBLK_F_USER_RECOVERY_REISSUE \
		| UBLK_F_USER_RECOVERY_FAIL_IO)

/* All UBLK_PARAM_TYPE_* should be included here */
#define UBLK_PARAM_TYPE_ALL                                \
	(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
	 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED |    \
	 UBLK_PARAM_TYPE_DMA_ALIGN | UBLK_PARAM_TYPE_SEGMENT)

struct ublk_uring_cmd_pdu {
89
/*
 * Temporarily store requests belonging to the same batch for queueing
 * them to the daemon context.
 *
 * They could have been stored in the request payload, but we want to
 * avoid the extra pre-allocation, and the uring_cmd payload is always
 * free for our use.
 */
97
union {
98
struct request *req;
99
struct request *req_list;
100
};
101
102
/*
 * The following two fields are valid for this cmd's whole lifetime and
 * are set up in the ublk uring_cmd handler.
 */
106
struct ublk_queue *ubq;
107
108
u16 tag;
109
};
110
111
/*
 * The io command is active: its sqe cmd has been received and its cqe is
 * not done yet.
 *
 * If the flag is set, the io command is owned by the ublk driver and is
 * waiting for an incoming blk-mq request from the ublk block device.
 *
 * If the flag is cleared, the io command has been completed and is owned
 * by the ublk server.
 */
120
#define UBLK_IO_FLAG_ACTIVE 0x01
121
122
/*
 * The IO command has been completed via its cqe and is being handled by
 * ublksrv, but is not committed yet.
 *
 * Basically mutually exclusive with UBLK_IO_FLAG_ACTIVE, so it can be used
 * for cross verification.
 */
129
#define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
130
131
/*
 * UBLK_IO_FLAG_NEED_GET_DATA is set when the IO command still needs to get
 * the data buffer address from ublksrv.
 *
 * Bio data can then be copied into this data buffer for a WRITE request
 * once the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is
 * cleared.
 */
138
#define UBLK_IO_FLAG_NEED_GET_DATA 0x08
139
140
/*
141
* request buffer is registered automatically, so we have to unregister it
142
* before completing this request.
143
*
144
* io_uring will unregister buffer automatically for us during exiting.
145
*/
146
#define UBLK_IO_FLAG_AUTO_BUF_REG 0x10
147
148
/* atomic RW with ubq->cancel_lock */
149
#define UBLK_IO_FLAG_CANCELED 0x80000000
150
151
/*
152
* Initialize refcount to a large number to include any registered buffers.
153
* UBLK_IO_COMMIT_AND_FETCH_REQ will release these references minus those for
154
* any buffers registered on the io daemon task.
155
*/
156
#define UBLK_REFCOUNT_INIT (REFCOUNT_MAX / 2)
157
158
struct ublk_io {
159
/* userspace buffer address from io cmd */
160
union {
161
__u64 addr;
162
struct ublk_auto_buf_reg buf;
163
};
164
unsigned int flags;
165
int res;
166
167
union {
168
/* valid if UBLK_IO_FLAG_ACTIVE is set */
169
struct io_uring_cmd *cmd;
170
/* valid if UBLK_IO_FLAG_OWNED_BY_SRV is set */
171
struct request *req;
172
};
173
174
struct task_struct *task;
175
176
/*
177
* The number of uses of this I/O by the ublk server
178
* if user copy or zero copy are enabled:
179
* - UBLK_REFCOUNT_INIT from dispatch to the server
180
* until UBLK_IO_COMMIT_AND_FETCH_REQ
181
* - 1 for each inflight ublk_ch_{read,write}_iter() call
182
* - 1 for each io_uring registered buffer not registered on task
183
* The I/O can only be completed once all references are dropped.
184
* User copy and buffer registration operations are only permitted
185
* if the reference count is nonzero.
186
*/
187
refcount_t ref;
188
/* Count of buffers registered on task and not yet unregistered */
189
unsigned task_registered_buffers;
190
191
void *buf_ctx_handle;
192
} ____cacheline_aligned_in_smp;
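
/*
 * Note (descriptive): each per-tag ublk_io above is cacheline aligned,
 * presumably so that adjacent entries of ubq->ios[] do not share a
 * cacheline when they are touched concurrently from different contexts.
 */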
193
194
struct ublk_queue {
195
int q_id;
196
int q_depth;
197
198
unsigned long flags;
199
struct ublksrv_io_desc *io_cmd_buf;
200
201
bool force_abort;
202
bool canceling;
203
bool fail_io; /* copy of dev->state == UBLK_S_DEV_FAIL_IO */
204
spinlock_t cancel_lock;
205
struct ublk_device *dev;
206
struct ublk_io ios[];
207
};
208
209
struct ublk_device {
210
struct gendisk *ub_disk;
211
212
char *__queues;
213
214
unsigned int queue_size;
215
struct ublksrv_ctrl_dev_info dev_info;
216
217
struct blk_mq_tag_set tag_set;
218
219
struct cdev cdev;
220
struct device cdev_dev;
221
222
#define UB_STATE_OPEN 0
223
#define UB_STATE_USED 1
224
#define UB_STATE_DELETED 2
225
unsigned long state;
226
int ub_number;
227
228
struct mutex mutex;
229
230
spinlock_t lock;
231
struct mm_struct *mm;
232
233
struct ublk_params params;
234
235
struct completion completion;
236
u32 nr_io_ready;
237
bool unprivileged_daemons;
238
struct mutex cancel_mutex;
239
bool canceling;
240
pid_t ublksrv_tgid;
241
struct delayed_work exit_work;
242
};
243
244
/* header of ublk_params */
245
struct ublk_params_header {
246
__u32 len;
247
__u32 types;
248
};
249
250
static void ublk_io_release(void *priv);
251
static void ublk_stop_dev_unlocked(struct ublk_device *ub);
252
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
253
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
254
u16 q_id, u16 tag, struct ublk_io *io, size_t offset);
255
static inline unsigned int ublk_req_build_flags(struct request *req);
256
257
static inline struct ublksrv_io_desc *
258
ublk_get_iod(const struct ublk_queue *ubq, unsigned tag)
259
{
260
return &ubq->io_cmd_buf[tag];
261
}
262
263
static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
264
{
265
return ub->dev_info.flags & UBLK_F_ZONED;
266
}
267
268
static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq)
269
{
270
return ubq->flags & UBLK_F_ZONED;
271
}
272
273
#ifdef CONFIG_BLK_DEV_ZONED
274
275
struct ublk_zoned_report_desc {
276
__u64 sector;
277
__u32 operation;
278
__u32 nr_zones;
279
};
280
281
static DEFINE_XARRAY(ublk_zoned_report_descs);
282
283
static int ublk_zoned_insert_report_desc(const struct request *req,
284
struct ublk_zoned_report_desc *desc)
285
{
286
return xa_insert(&ublk_zoned_report_descs, (unsigned long)req,
287
desc, GFP_KERNEL);
288
}
289
290
static struct ublk_zoned_report_desc *ublk_zoned_erase_report_desc(
291
const struct request *req)
292
{
293
return xa_erase(&ublk_zoned_report_descs, (unsigned long)req);
294
}
295
296
static struct ublk_zoned_report_desc *ublk_zoned_get_report_desc(
297
const struct request *req)
298
{
299
return xa_load(&ublk_zoned_report_descs, (unsigned long)req);
300
}
301
302
static int ublk_get_nr_zones(const struct ublk_device *ub)
303
{
304
const struct ublk_param_basic *p = &ub->params.basic;
305
306
/* Zone size is a power of 2 */
307
return p->dev_sectors >> ilog2(p->chunk_sectors);
308
}
309
310
static int ublk_revalidate_disk_zones(struct ublk_device *ub)
311
{
312
return blk_revalidate_disk_zones(ub->ub_disk);
313
}
314
315
static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
316
{
317
const struct ublk_param_zoned *p = &ub->params.zoned;
318
int nr_zones;
319
320
if (!ublk_dev_is_zoned(ub))
321
return -EINVAL;
322
323
if (!p->max_zone_append_sectors)
324
return -EINVAL;
325
326
nr_zones = ublk_get_nr_zones(ub);
327
328
if (p->max_active_zones > nr_zones)
329
return -EINVAL;
330
331
if (p->max_open_zones > nr_zones)
332
return -EINVAL;
333
334
return 0;
335
}
336
337
static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
338
{
339
ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
340
}
341
342
/* Based on virtblk_alloc_report_buffer */
343
static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
344
unsigned int nr_zones, size_t *buflen)
345
{
346
struct request_queue *q = ublk->ub_disk->queue;
347
size_t bufsize;
348
void *buf;
349
350
nr_zones = min_t(unsigned int, nr_zones,
351
ublk->ub_disk->nr_zones);
352
353
bufsize = nr_zones * sizeof(struct blk_zone);
354
bufsize =
355
min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
356
357
while (bufsize >= sizeof(struct blk_zone)) {
358
buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
359
if (buf) {
360
*buflen = bufsize;
361
return buf;
362
}
363
bufsize >>= 1;
364
}
365
366
*buflen = 0;
367
return NULL;
368
}
369
370
static int ublk_report_zones(struct gendisk *disk, sector_t sector,
371
unsigned int nr_zones, report_zones_cb cb, void *data)
372
{
373
struct ublk_device *ub = disk->private_data;
374
unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
375
unsigned int first_zone = sector >> ilog2(zone_size_sectors);
376
unsigned int done_zones = 0;
377
unsigned int max_zones_per_request;
378
int ret;
379
struct blk_zone *buffer;
380
size_t buffer_length;
381
382
nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
383
nr_zones);
384
385
buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
386
if (!buffer)
387
return -ENOMEM;
388
389
max_zones_per_request = buffer_length / sizeof(struct blk_zone);
390
391
while (done_zones < nr_zones) {
392
unsigned int remaining_zones = nr_zones - done_zones;
393
unsigned int zones_in_request =
394
min_t(unsigned int, remaining_zones, max_zones_per_request);
395
struct request *req;
396
struct ublk_zoned_report_desc desc;
397
blk_status_t status;
398
399
memset(buffer, 0, buffer_length);
400
401
req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
402
if (IS_ERR(req)) {
403
ret = PTR_ERR(req);
404
goto out;
405
}
406
407
desc.operation = UBLK_IO_OP_REPORT_ZONES;
408
desc.sector = sector;
409
desc.nr_zones = zones_in_request;
410
ret = ublk_zoned_insert_report_desc(req, &desc);
411
if (ret)
412
goto free_req;
413
414
ret = blk_rq_map_kern(req, buffer, buffer_length, GFP_KERNEL);
415
if (ret)
416
goto erase_desc;
417
418
status = blk_execute_rq(req, 0);
419
ret = blk_status_to_errno(status);
420
erase_desc:
421
ublk_zoned_erase_report_desc(req);
422
free_req:
423
blk_mq_free_request(req);
424
if (ret)
425
goto out;
426
427
for (unsigned int i = 0; i < zones_in_request; i++) {
428
struct blk_zone *zone = buffer + i;
429
430
/* A zero length zone means no more zones in this response */
431
if (!zone->len)
432
break;
433
434
ret = cb(zone, i, data);
435
if (ret)
436
goto out;
437
438
done_zones++;
439
sector += zone_size_sectors;
440
441
}
442
}
443
444
ret = done_zones;
445
446
out:
447
kvfree(buffer);
448
return ret;
449
}
450
451
static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
452
struct request *req)
453
{
454
struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
455
struct ublk_io *io = &ubq->ios[req->tag];
456
struct ublk_zoned_report_desc *desc;
457
u32 ublk_op;
458
459
switch (req_op(req)) {
460
case REQ_OP_ZONE_OPEN:
461
ublk_op = UBLK_IO_OP_ZONE_OPEN;
462
break;
463
case REQ_OP_ZONE_CLOSE:
464
ublk_op = UBLK_IO_OP_ZONE_CLOSE;
465
break;
466
case REQ_OP_ZONE_FINISH:
467
ublk_op = UBLK_IO_OP_ZONE_FINISH;
468
break;
469
case REQ_OP_ZONE_RESET:
470
ublk_op = UBLK_IO_OP_ZONE_RESET;
471
break;
472
case REQ_OP_ZONE_APPEND:
473
ublk_op = UBLK_IO_OP_ZONE_APPEND;
474
break;
475
case REQ_OP_ZONE_RESET_ALL:
476
ublk_op = UBLK_IO_OP_ZONE_RESET_ALL;
477
break;
478
case REQ_OP_DRV_IN:
479
desc = ublk_zoned_get_report_desc(req);
480
if (!desc)
481
return BLK_STS_IOERR;
482
ublk_op = desc->operation;
483
switch (ublk_op) {
484
case UBLK_IO_OP_REPORT_ZONES:
485
iod->op_flags = ublk_op | ublk_req_build_flags(req);
486
iod->nr_zones = desc->nr_zones;
487
iod->start_sector = desc->sector;
488
return BLK_STS_OK;
489
default:
490
return BLK_STS_IOERR;
491
}
492
case REQ_OP_DRV_OUT:
493
/* We do not support drv_out */
494
return BLK_STS_NOTSUPP;
495
default:
496
return BLK_STS_IOERR;
497
}
498
499
iod->op_flags = ublk_op | ublk_req_build_flags(req);
500
iod->nr_sectors = blk_rq_sectors(req);
501
iod->start_sector = blk_rq_pos(req);
502
iod->addr = io->addr;
503
504
return BLK_STS_OK;
505
}
506
507
#else
508
509
#define ublk_report_zones (NULL)
510
511
static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
512
{
513
return -EOPNOTSUPP;
514
}
515
516
static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
517
{
518
}
519
520
static int ublk_revalidate_disk_zones(struct ublk_device *ub)
521
{
522
return 0;
523
}
524
525
static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
526
struct request *req)
527
{
528
return BLK_STS_NOTSUPP;
529
}
530
531
#endif
532
533
static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
534
bool need_map);
535
536
static dev_t ublk_chr_devt;
537
static const struct class ublk_chr_class = {
538
.name = "ublk-char",
539
};
540
541
static DEFINE_IDR(ublk_index_idr);
542
static DEFINE_SPINLOCK(ublk_idr_lock);
543
static wait_queue_head_t ublk_idr_wq; /* wait until one idr is freed */
544
545
static DEFINE_MUTEX(ublk_ctl_mutex);
546
547
548
#define UBLK_MAX_UBLKS UBLK_MINORS
549
550
/*
 * Max number of unprivileged ublk devices allowed to be added
 *
 * It can be extended to a per-user limit in the future, or even be
 * controlled by cgroup.
 */
556
static unsigned int unprivileged_ublks_max = 64;
557
static unsigned int unprivileged_ublks_added; /* protected by ublk_ctl_mutex */
558
559
static struct miscdevice ublk_misc;
560
561
static inline unsigned ublk_pos_to_hwq(loff_t pos)
562
{
563
return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_QID_OFF) &
564
UBLK_QID_BITS_MASK;
565
}
566
567
static inline unsigned ublk_pos_to_buf_off(loff_t pos)
568
{
569
return (pos - UBLKSRV_IO_BUF_OFFSET) & UBLK_IO_BUF_BITS_MASK;
570
}
571
572
static inline unsigned ublk_pos_to_tag(loff_t pos)
573
{
574
return ((pos - UBLKSRV_IO_BUF_OFFSET) >> UBLK_TAG_OFF) &
575
UBLK_TAG_BITS_MASK;
576
}
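
/*
 * Illustrative sketch (not driver code): for user copy, the ublk server
 * addresses a request's payload through the char device at an offset that
 * packs the queue id and tag, conceptually
 *
 *	pos = UBLKSRV_IO_BUF_OFFSET +
 *		(((__u64)q_id << UBLK_QID_OFF) |
 *		 ((__u64)tag << UBLK_TAG_OFF) |
 *		 byte_offset_in_request);
 *
 * where q_id, tag and byte_offset_in_request are the server's own values.
 * The three helpers above simply invert this encoding.
 */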
577
578
static void ublk_dev_param_basic_apply(struct ublk_device *ub)
579
{
580
const struct ublk_param_basic *p = &ub->params.basic;
581
582
if (p->attrs & UBLK_ATTR_READ_ONLY)
583
set_disk_ro(ub->ub_disk, true);
584
585
set_capacity(ub->ub_disk, p->dev_sectors);
586
}
587
588
static int ublk_validate_params(const struct ublk_device *ub)
589
{
590
/* basic param is the only one which must be set */
591
if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
592
const struct ublk_param_basic *p = &ub->params.basic;
593
594
if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
595
return -EINVAL;
596
597
if (p->logical_bs_shift > p->physical_bs_shift)
598
return -EINVAL;
599
600
if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
601
return -EINVAL;
602
603
if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
604
return -EINVAL;
605
} else
606
return -EINVAL;
607
608
if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
609
const struct ublk_param_discard *p = &ub->params.discard;
610
611
/* So far, only support single segment discard */
612
if (p->max_discard_sectors && p->max_discard_segments != 1)
613
return -EINVAL;
614
615
if (!p->discard_granularity)
616
return -EINVAL;
617
}
618
619
/* dev_t is read-only */
620
if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
621
return -EINVAL;
622
623
if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
624
return ublk_dev_param_zoned_validate(ub);
625
else if (ublk_dev_is_zoned(ub))
626
return -EINVAL;
627
628
if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN) {
629
const struct ublk_param_dma_align *p = &ub->params.dma;
630
631
if (p->alignment >= PAGE_SIZE)
632
return -EINVAL;
633
634
if (!is_power_of_2(p->alignment + 1))
635
return -EINVAL;
636
}
637
638
if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
639
const struct ublk_param_segment *p = &ub->params.seg;
640
641
if (!is_power_of_2(p->seg_boundary_mask + 1))
642
return -EINVAL;
643
644
if (p->seg_boundary_mask + 1 < UBLK_MIN_SEGMENT_SIZE)
645
return -EINVAL;
646
if (p->max_segment_size < UBLK_MIN_SEGMENT_SIZE)
647
return -EINVAL;
648
}
649
650
return 0;
651
}
652
653
static void ublk_apply_params(struct ublk_device *ub)
654
{
655
ublk_dev_param_basic_apply(ub);
656
657
if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
658
ublk_dev_param_zoned_apply(ub);
659
}
660
661
static inline bool ublk_support_zero_copy(const struct ublk_queue *ubq)
662
{
663
return ubq->flags & UBLK_F_SUPPORT_ZERO_COPY;
664
}
665
666
static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub)
667
{
668
return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY;
669
}
670
671
static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
672
{
673
return ubq->flags & UBLK_F_AUTO_BUF_REG;
674
}
675
676
static inline bool ublk_dev_support_auto_buf_reg(const struct ublk_device *ub)
677
{
678
return ub->dev_info.flags & UBLK_F_AUTO_BUF_REG;
679
}
680
681
static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
682
{
683
return ubq->flags & UBLK_F_USER_COPY;
684
}
685
686
static inline bool ublk_dev_support_user_copy(const struct ublk_device *ub)
687
{
688
return ub->dev_info.flags & UBLK_F_USER_COPY;
689
}
690
691
static inline bool ublk_need_map_io(const struct ublk_queue *ubq)
692
{
693
return !ublk_support_user_copy(ubq) && !ublk_support_zero_copy(ubq) &&
694
!ublk_support_auto_buf_reg(ubq);
695
}
696
697
static inline bool ublk_dev_need_map_io(const struct ublk_device *ub)
698
{
699
return !ublk_dev_support_user_copy(ub) &&
700
!ublk_dev_support_zero_copy(ub) &&
701
!ublk_dev_support_auto_buf_reg(ub);
702
}
703
704
static inline bool ublk_need_req_ref(const struct ublk_queue *ubq)
705
{
706
/*
 * For user copy, read()/write() is involved, so a request reference
 * has to be grabbed.
 *
 * For zero copy, the request buffer needs to be registered in the
 * io_uring buffer table, so a reference is needed.
 *
 * For auto buffer register, the ublk server may still issue
 * UBLK_IO_COMMIT_AND_FETCH_REQ before a registered buffer is used up,
 * so a reference is required too.
 */
717
return ublk_support_user_copy(ubq) || ublk_support_zero_copy(ubq) ||
718
ublk_support_auto_buf_reg(ubq);
719
}
720
721
static inline bool ublk_dev_need_req_ref(const struct ublk_device *ub)
722
{
723
return ublk_dev_support_user_copy(ub) ||
724
ublk_dev_support_zero_copy(ub) ||
725
ublk_dev_support_auto_buf_reg(ub);
726
}
727
728
static inline void ublk_init_req_ref(const struct ublk_queue *ubq,
729
struct ublk_io *io)
730
{
731
if (ublk_need_req_ref(ubq))
732
refcount_set(&io->ref, UBLK_REFCOUNT_INIT);
733
}
734
735
static inline bool ublk_get_req_ref(struct ublk_io *io)
736
{
737
return refcount_inc_not_zero(&io->ref);
738
}
739
740
static inline void ublk_put_req_ref(struct ublk_io *io, struct request *req)
741
{
742
if (!refcount_dec_and_test(&io->ref))
743
return;
744
745
/* ublk_need_map_io() and ublk_need_req_ref() are mutually exclusive */
746
__ublk_complete_rq(req, io, false);
747
}
748
749
static inline bool ublk_sub_req_ref(struct ublk_io *io)
750
{
751
unsigned sub_refs = UBLK_REFCOUNT_INIT - io->task_registered_buffers;
752
753
io->task_registered_buffers = 0;
754
return refcount_sub_and_test(sub_refs, &io->ref);
755
}
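
/*
 * Worked example (descriptive, under the counting scheme above): with auto
 * buf reg, dispatch sets ref to UBLK_REFCOUNT_INIT and
 * task_registered_buffers to 1. If the server commits while that buffer is
 * still registered, the commit path drops UBLK_REFCOUNT_INIT - 1 references
 * via ublk_sub_req_ref(), leaving one reference that is dropped (completing
 * the request) only once io_uring unregisters the buffer and
 * ublk_io_release() runs.
 */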
756
757
static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
758
{
759
return ubq->flags & UBLK_F_NEED_GET_DATA;
760
}
761
762
static inline bool ublk_dev_need_get_data(const struct ublk_device *ub)
763
{
764
return ub->dev_info.flags & UBLK_F_NEED_GET_DATA;
765
}
766
767
/* Called in the slow path only, keep it noinline for tracing purposes */
768
static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
769
{
770
if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
771
return ub;
772
return NULL;
773
}
774
775
/* Called in the slow path only, keep it noinline for tracing purposes */
776
static noinline void ublk_put_device(struct ublk_device *ub)
777
{
778
put_device(&ub->cdev_dev);
779
}
780
781
static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
782
int qid)
783
{
784
return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
785
}
786
787
static inline bool ublk_rq_has_data(const struct request *rq)
788
{
789
return bio_has_data(rq->bio);
790
}
791
792
static inline struct ublksrv_io_desc *
793
ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
794
{
795
return ublk_get_queue(ub, q_id)->io_cmd_buf;
796
}
797
798
static inline int __ublk_queue_cmd_buf_size(int depth)
799
{
800
return round_up(depth * sizeof(struct ublksrv_io_desc), PAGE_SIZE);
801
}
802
803
static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub)
804
{
805
return __ublk_queue_cmd_buf_size(ub->dev_info.queue_depth);
806
}
807
808
static int ublk_max_cmd_buf_size(void)
809
{
810
return __ublk_queue_cmd_buf_size(UBLK_MAX_QUEUE_DEPTH);
811
}
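
/*
 * Illustrative userspace-side sketch (an assumption, not driver code): a
 * ublk server typically maps queue q_id's descriptor array through the
 * char device with something like
 *
 *	off_t off = UBLKSRV_CMD_BUF_OFFSET + q_id * max_cmd_buf_size;
 *	struct ublksrv_io_desc *iods = mmap(NULL, cmd_buf_size, PROT_READ,
 *					    MAP_SHARED, cdev_fd, off);
 *
 * where cdev_fd and the two sizes are the server's own values (the sizes
 * correspond to ublk_max_cmd_buf_size() and ublk_queue_cmd_buf_size()
 * here); see ublk_ch_mmap() below for the layout the driver enforces.
 */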
812
813
/*
 * Should I/O that is outstanding to the ublk server when it exits be
 * reissued? If not, the outstanding I/O will get errors.
 */
817
static inline bool ublk_nosrv_should_reissue_outstanding(struct ublk_device *ub)
818
{
819
return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
820
(ub->dev_info.flags & UBLK_F_USER_RECOVERY_REISSUE);
821
}
822
823
/*
 * Should I/O issued while there is no ublk server be queued? If not, I/O
 * issued while there is no ublk server will get errors.
 */
827
static inline bool ublk_nosrv_dev_should_queue_io(struct ublk_device *ub)
828
{
829
return (ub->dev_info.flags & UBLK_F_USER_RECOVERY) &&
830
!(ub->dev_info.flags & UBLK_F_USER_RECOVERY_FAIL_IO);
831
}
832
833
/*
834
* Same as ublk_nosrv_dev_should_queue_io, but uses a queue-local copy
835
* of the device flags for smaller cache footprint - better for fast
836
* paths.
837
*/
838
static inline bool ublk_nosrv_should_queue_io(struct ublk_queue *ubq)
839
{
840
return (ubq->flags & UBLK_F_USER_RECOVERY) &&
841
!(ubq->flags & UBLK_F_USER_RECOVERY_FAIL_IO);
842
}
843
844
/*
845
* Should ublk devices be stopped (i.e. no recovery possible) when the
846
* ublk server exits? If not, devices can be used again by a future
847
* incarnation of a ublk server via the start_recovery/end_recovery
848
* commands.
849
*/
850
static inline bool ublk_nosrv_should_stop_dev(struct ublk_device *ub)
851
{
852
return !(ub->dev_info.flags & UBLK_F_USER_RECOVERY);
853
}
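
/*
 * Summary of the behavior selected by the recovery flags, as implemented
 * by the helpers above:
 *
 *	no UBLK_F_USER_RECOVERY         -> stop the device when the server exits
 *	UBLK_F_USER_RECOVERY alone      -> queue I/O until a new server attaches
 *	 + UBLK_F_USER_RECOVERY_REISSUE -> also reissue I/O outstanding at exit
 *	 + UBLK_F_USER_RECOVERY_FAIL_IO -> fail I/O while no server is attached
 */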
854
855
static inline bool ublk_dev_in_recoverable_state(struct ublk_device *ub)
856
{
857
return ub->dev_info.state == UBLK_S_DEV_QUIESCED ||
858
ub->dev_info.state == UBLK_S_DEV_FAIL_IO;
859
}
860
861
static void ublk_free_disk(struct gendisk *disk)
862
{
863
struct ublk_device *ub = disk->private_data;
864
865
clear_bit(UB_STATE_USED, &ub->state);
866
ublk_put_device(ub);
867
}
868
869
static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
870
unsigned int *owner_gid)
871
{
872
kuid_t uid;
873
kgid_t gid;
874
875
current_uid_gid(&uid, &gid);
876
877
*owner_uid = from_kuid(&init_user_ns, uid);
878
*owner_gid = from_kgid(&init_user_ns, gid);
879
}
880
881
static int ublk_open(struct gendisk *disk, blk_mode_t mode)
882
{
883
struct ublk_device *ub = disk->private_data;
884
885
if (capable(CAP_SYS_ADMIN))
886
return 0;
887
888
/*
 * If this is an unprivileged device, only its owner may open the
 * disk. Otherwise it could be a trap set by a malicious user who
 * deliberately grants this disk's privileges to other users.
 *
 * This policy is also reasonable given that anyone can create an
 * unprivileged device without needing anyone else's grant.
 */
897
if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV) {
898
unsigned int curr_uid, curr_gid;
899
900
ublk_store_owner_uid_gid(&curr_uid, &curr_gid);
901
902
if (curr_uid != ub->dev_info.owner_uid || curr_gid !=
903
ub->dev_info.owner_gid)
904
return -EPERM;
905
}
906
907
return 0;
908
}
909
910
static const struct block_device_operations ub_fops = {
911
.owner = THIS_MODULE,
912
.open = ublk_open,
913
.free_disk = ublk_free_disk,
914
.report_zones = ublk_report_zones,
915
};
916
917
#define UBLK_MAX_PIN_PAGES 32
918
919
struct ublk_io_iter {
920
struct page *pages[UBLK_MAX_PIN_PAGES];
921
struct bio *bio;
922
struct bvec_iter iter;
923
};
924
925
/* copy 'total' bytes between the bio vectors and the pinned pages */
926
static void ublk_copy_io_pages(struct ublk_io_iter *data,
927
size_t total, size_t pg_off, int dir)
928
{
929
unsigned done = 0;
930
unsigned pg_idx = 0;
931
932
while (done < total) {
933
struct bio_vec bv = bio_iter_iovec(data->bio, data->iter);
934
unsigned int bytes = min3(bv.bv_len, (unsigned)total - done,
935
(unsigned)(PAGE_SIZE - pg_off));
936
void *bv_buf = bvec_kmap_local(&bv);
937
void *pg_buf = kmap_local_page(data->pages[pg_idx]);
938
939
if (dir == ITER_DEST)
940
memcpy(pg_buf + pg_off, bv_buf, bytes);
941
else
942
memcpy(bv_buf, pg_buf + pg_off, bytes);
943
944
kunmap_local(pg_buf);
945
kunmap_local(bv_buf);
946
947
/* advance page array */
948
pg_off += bytes;
949
if (pg_off == PAGE_SIZE) {
950
pg_idx += 1;
951
pg_off = 0;
952
}
953
954
done += bytes;
955
956
/* advance bio */
957
bio_advance_iter_single(data->bio, &data->iter, bytes);
958
if (!data->iter.bi_size) {
959
data->bio = data->bio->bi_next;
960
if (data->bio == NULL)
961
break;
962
data->iter = data->bio->bi_iter;
963
}
964
}
965
}
966
967
static bool ublk_advance_io_iter(const struct request *req,
968
struct ublk_io_iter *iter, unsigned int offset)
969
{
970
struct bio *bio = req->bio;
971
972
for_each_bio(bio) {
973
if (bio->bi_iter.bi_size > offset) {
974
iter->bio = bio;
975
iter->iter = bio->bi_iter;
976
bio_advance_iter(iter->bio, &iter->iter, offset);
977
return true;
978
}
979
offset -= bio->bi_iter.bi_size;
980
}
981
return false;
982
}
983
984
/*
 * Copy data between the request pages and the iov_iter; 'offset' is the
 * starting linear offset within the request.
 */
988
static size_t ublk_copy_user_pages(const struct request *req,
989
unsigned offset, struct iov_iter *uiter, int dir)
990
{
991
struct ublk_io_iter iter;
992
size_t done = 0;
993
994
if (!ublk_advance_io_iter(req, &iter, offset))
995
return 0;
996
997
while (iov_iter_count(uiter) && iter.bio) {
998
unsigned nr_pages;
999
ssize_t len;
1000
size_t off;
1001
int i;
1002
1003
len = iov_iter_get_pages2(uiter, iter.pages,
1004
iov_iter_count(uiter),
1005
UBLK_MAX_PIN_PAGES, &off);
1006
if (len <= 0)
1007
return done;
1008
1009
ublk_copy_io_pages(&iter, len, off, dir);
1010
nr_pages = DIV_ROUND_UP(len + off, PAGE_SIZE);
1011
for (i = 0; i < nr_pages; i++) {
1012
if (dir == ITER_DEST)
1013
set_page_dirty(iter.pages[i]);
1014
put_page(iter.pages[i]);
1015
}
1016
done += len;
1017
}
1018
1019
return done;
1020
}
1021
1022
static inline bool ublk_need_map_req(const struct request *req)
1023
{
1024
return ublk_rq_has_data(req) && req_op(req) == REQ_OP_WRITE;
1025
}
1026
1027
static inline bool ublk_need_unmap_req(const struct request *req)
1028
{
1029
return ublk_rq_has_data(req) &&
1030
(req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
1031
}
1032
1033
static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
1034
const struct ublk_io *io)
1035
{
1036
const unsigned int rq_bytes = blk_rq_bytes(req);
1037
1038
if (!ublk_need_map_io(ubq))
1039
return rq_bytes;
1040
1041
/*
 * Without zero copy, copying WRITE request data is delayed until we are
 * in the ublksrv context, and the big benefit is that pinning pages in
 * the current context is pretty fast, see ublk_copy_user_pages().
 */
1046
if (ublk_need_map_req(req)) {
1047
struct iov_iter iter;
1048
const int dir = ITER_DEST;
1049
1050
import_ubuf(dir, u64_to_user_ptr(io->addr), rq_bytes, &iter);
1051
return ublk_copy_user_pages(req, 0, &iter, dir);
1052
}
1053
return rq_bytes;
1054
}
1055
1056
static int ublk_unmap_io(bool need_map,
1057
const struct request *req,
1058
const struct ublk_io *io)
1059
{
1060
const unsigned int rq_bytes = blk_rq_bytes(req);
1061
1062
if (!need_map)
1063
return rq_bytes;
1064
1065
if (ublk_need_unmap_req(req)) {
1066
struct iov_iter iter;
1067
const int dir = ITER_SOURCE;
1068
1069
WARN_ON_ONCE(io->res > rq_bytes);
1070
1071
import_ubuf(dir, u64_to_user_ptr(io->addr), io->res, &iter);
1072
return ublk_copy_user_pages(req, 0, &iter, dir);
1073
}
1074
return rq_bytes;
1075
}
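
/*
 * Summary (descriptive): when none of user copy, zero copy or auto buffer
 * registration is enabled, ublk_map_io() copies WRITE payloads from the
 * request pages into the server buffer at io->addr at dispatch time, and
 * ublk_unmap_io() copies READ payloads from that buffer back into the
 * request pages at completion time; both return the byte count handled.
 */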
1076
1077
static inline unsigned int ublk_req_build_flags(struct request *req)
1078
{
1079
unsigned flags = 0;
1080
1081
if (req->cmd_flags & REQ_FAILFAST_DEV)
1082
flags |= UBLK_IO_F_FAILFAST_DEV;
1083
1084
if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
1085
flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
1086
1087
if (req->cmd_flags & REQ_FAILFAST_DRIVER)
1088
flags |= UBLK_IO_F_FAILFAST_DRIVER;
1089
1090
if (req->cmd_flags & REQ_META)
1091
flags |= UBLK_IO_F_META;
1092
1093
if (req->cmd_flags & REQ_FUA)
1094
flags |= UBLK_IO_F_FUA;
1095
1096
if (req->cmd_flags & REQ_NOUNMAP)
1097
flags |= UBLK_IO_F_NOUNMAP;
1098
1099
if (req->cmd_flags & REQ_SWAP)
1100
flags |= UBLK_IO_F_SWAP;
1101
1102
return flags;
1103
}
1104
1105
static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
1106
{
1107
struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
1108
struct ublk_io *io = &ubq->ios[req->tag];
1109
u32 ublk_op;
1110
1111
switch (req_op(req)) {
1112
case REQ_OP_READ:
1113
ublk_op = UBLK_IO_OP_READ;
1114
break;
1115
case REQ_OP_WRITE:
1116
ublk_op = UBLK_IO_OP_WRITE;
1117
break;
1118
case REQ_OP_FLUSH:
1119
ublk_op = UBLK_IO_OP_FLUSH;
1120
break;
1121
case REQ_OP_DISCARD:
1122
ublk_op = UBLK_IO_OP_DISCARD;
1123
break;
1124
case REQ_OP_WRITE_ZEROES:
1125
ublk_op = UBLK_IO_OP_WRITE_ZEROES;
1126
break;
1127
default:
1128
if (ublk_queue_is_zoned(ubq))
1129
return ublk_setup_iod_zoned(ubq, req);
1130
return BLK_STS_IOERR;
1131
}
1132
1133
/* need to translate to UAPI flags, since in-kernel flags may change */
1134
iod->op_flags = ublk_op | ublk_req_build_flags(req);
1135
iod->nr_sectors = blk_rq_sectors(req);
1136
iod->start_sector = blk_rq_pos(req);
1137
iod->addr = io->addr;
1138
1139
return BLK_STS_OK;
1140
}
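
/*
 * Illustrative example: for a plain 4KB WRITE at sector 2048, the server
 * would observe an io descriptor roughly like
 *
 *	iod->op_flags     == UBLK_IO_OP_WRITE	(no extra UBLK_IO_F_* flags)
 *	iod->nr_sectors   == 8
 *	iod->start_sector == 2048
 *	iod->addr         == the io->addr the server supplied (if any)
 */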
1141
1142
static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
1143
struct io_uring_cmd *ioucmd)
1144
{
1145
return io_uring_cmd_to_pdu(ioucmd, struct ublk_uring_cmd_pdu);
1146
}
1147
1148
/* todo: handle partial completion */
1149
static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
1150
bool need_map)
1151
{
1152
unsigned int unmapped_bytes;
1153
blk_status_t res = BLK_STS_OK;
1154
1155
/* fail the READ IO if nothing was read */
1156
if (!io->res && req_op(req) == REQ_OP_READ)
1157
io->res = -EIO;
1158
1159
if (io->res < 0) {
1160
res = errno_to_blk_status(io->res);
1161
goto exit;
1162
}
1163
1164
/*
 * FLUSH, DISCARD and WRITE_ZEROES usually don't return a byte count, so
 * end them directly.
 *
 * None of them needs unmapping either.
 */
1170
if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
1171
req_op(req) != REQ_OP_DRV_IN)
1172
goto exit;
1173
1174
/* for a READ request, write the data at iod->addr into the rq buffers */
1175
unmapped_bytes = ublk_unmap_io(need_map, req, io);
1176
1177
/*
 * Extremely unlikely, since the data was filled in just before.
 *
 * Simply clamp the result for this unlikely case; any remainder is
 * requeued below.
 */
1182
if (unlikely(unmapped_bytes < io->res))
1183
io->res = unmapped_bytes;
1184
1185
if (blk_update_request(req, BLK_STS_OK, io->res))
1186
blk_mq_requeue_request(req, true);
1187
else if (likely(!blk_should_fake_timeout(req->q)))
1188
__blk_mq_end_request(req, BLK_STS_OK);
1189
1190
return;
1191
exit:
1192
blk_mq_end_request(req, res);
1193
}
1194
1195
static struct io_uring_cmd *__ublk_prep_compl_io_cmd(struct ublk_io *io,
1196
struct request *req)
1197
{
1198
/* read cmd first because req will overwrite it */
1199
struct io_uring_cmd *cmd = io->cmd;
1200
1201
/* mark this cmd owned by ublksrv */
1202
io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
1203
1204
/*
 * Clear ACTIVE since we are done with this sqe/cmd slot.
 * A new io cmd can only be accepted while it is not active.
 */
1208
io->flags &= ~UBLK_IO_FLAG_ACTIVE;
1209
1210
io->req = req;
1211
return cmd;
1212
}
1213
1214
static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
1215
int res, unsigned issue_flags)
1216
{
1217
struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req);
1218
1219
/* tell ublksrv one io request is coming */
1220
io_uring_cmd_done(cmd, res, issue_flags);
1221
}
1222
1223
#define UBLK_REQUEUE_DELAY_MS 3
1224
1225
static inline void __ublk_abort_rq(struct ublk_queue *ubq,
1226
struct request *rq)
1227
{
1228
/* We cannot process this rq so just requeue it. */
1229
if (ublk_nosrv_dev_should_queue_io(ubq->dev))
1230
blk_mq_requeue_request(rq, false);
1231
else
1232
blk_mq_end_request(rq, BLK_STS_IOERR);
1233
}
1234
1235
static void
1236
ublk_auto_buf_reg_fallback(const struct ublk_queue *ubq, struct ublk_io *io)
1237
{
1238
unsigned tag = io - ubq->ios;
1239
struct ublksrv_io_desc *iod = ublk_get_iod(ubq, tag);
1240
1241
iod->op_flags |= UBLK_IO_F_NEED_REG_BUF;
1242
}
1243
1244
static bool ublk_auto_buf_reg(const struct ublk_queue *ubq, struct request *req,
1245
struct ublk_io *io, unsigned int issue_flags)
1246
{
1247
int ret;
1248
1249
ret = io_buffer_register_bvec(io->cmd, req, ublk_io_release,
1250
io->buf.index, issue_flags);
1251
if (ret) {
1252
if (io->buf.flags & UBLK_AUTO_BUF_REG_FALLBACK) {
1253
ublk_auto_buf_reg_fallback(ubq, io);
1254
return true;
1255
}
1256
blk_mq_end_request(req, BLK_STS_IOERR);
1257
return false;
1258
}
1259
1260
io->task_registered_buffers = 1;
1261
io->buf_ctx_handle = io_uring_cmd_ctx_handle(io->cmd);
1262
io->flags |= UBLK_IO_FLAG_AUTO_BUF_REG;
1263
return true;
1264
}
1265
1266
static bool ublk_prep_auto_buf_reg(struct ublk_queue *ubq,
1267
struct request *req, struct ublk_io *io,
1268
unsigned int issue_flags)
1269
{
1270
ublk_init_req_ref(ubq, io);
1271
if (ublk_support_auto_buf_reg(ubq) && ublk_rq_has_data(req))
1272
return ublk_auto_buf_reg(ubq, req, io, issue_flags);
1273
1274
return true;
1275
}
1276
1277
static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
1278
struct ublk_io *io)
1279
{
1280
unsigned mapped_bytes = ublk_map_io(ubq, req, io);
1281
1282
/* partially mapped, update io descriptor */
1283
if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
1284
/*
1285
* Nothing mapped, retry until we succeed.
1286
*
1287
* We may never succeed in mapping any bytes here because
1288
* of OOM. TODO: reserve one buffer with single page pinned
1289
* for providing forward progress guarantee.
1290
*/
1291
if (unlikely(!mapped_bytes)) {
1292
blk_mq_requeue_request(req, false);
1293
blk_mq_delay_kick_requeue_list(req->q,
1294
UBLK_REQUEUE_DELAY_MS);
1295
return false;
1296
}
1297
1298
ublk_get_iod(ubq, req->tag)->nr_sectors =
1299
mapped_bytes >> 9;
1300
}
1301
1302
return true;
1303
}
1304
1305
static void ublk_dispatch_req(struct ublk_queue *ubq,
1306
struct request *req,
1307
unsigned int issue_flags)
1308
{
1309
int tag = req->tag;
1310
struct ublk_io *io = &ubq->ios[tag];
1311
1312
pr_devel("%s: complete: qid %d tag %d io_flags %x addr %llx\n",
1313
__func__, ubq->q_id, req->tag, io->flags,
1314
ublk_get_iod(ubq, req->tag)->addr);
1315
1316
/*
1317
* Task is exiting if either:
1318
*
1319
* (1) current != io->task.
1320
* io_uring_cmd_complete_in_task() tries to run task_work
1321
* in a workqueue if cmd's task is PF_EXITING.
1322
*
1323
* (2) current->flags & PF_EXITING.
1324
*/
1325
if (unlikely(current != io->task || current->flags & PF_EXITING)) {
1326
__ublk_abort_rq(ubq, req);
1327
return;
1328
}
1329
1330
if (ublk_need_get_data(ubq) && ublk_need_map_req(req)) {
1331
/*
1332
* We have not handled UBLK_IO_NEED_GET_DATA command yet,
1333
* so immediately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
1334
* and notify it.
1335
*/
1336
io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
1337
pr_devel("%s: need get data. qid %d tag %d io_flags %x\n",
1338
__func__, ubq->q_id, req->tag, io->flags);
1339
ublk_complete_io_cmd(io, req, UBLK_IO_RES_NEED_GET_DATA,
1340
issue_flags);
1341
return;
1342
}
1343
1344
if (!ublk_start_io(ubq, req, io))
1345
return;
1346
1347
if (ublk_prep_auto_buf_reg(ubq, req, io, issue_flags))
1348
ublk_complete_io_cmd(io, req, UBLK_IO_RES_OK, issue_flags);
1349
}
1350
1351
static void ublk_cmd_tw_cb(struct io_uring_cmd *cmd,
1352
unsigned int issue_flags)
1353
{
1354
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1355
struct ublk_queue *ubq = pdu->ubq;
1356
1357
ublk_dispatch_req(ubq, pdu->req, issue_flags);
1358
}
1359
1360
static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
1361
{
1362
struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd;
1363
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1364
1365
pdu->req = rq;
1366
io_uring_cmd_complete_in_task(cmd, ublk_cmd_tw_cb);
1367
}
1368
1369
static void ublk_cmd_list_tw_cb(struct io_uring_cmd *cmd,
1370
unsigned int issue_flags)
1371
{
1372
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1373
struct request *rq = pdu->req_list;
1374
struct request *next;
1375
1376
do {
1377
next = rq->rq_next;
1378
rq->rq_next = NULL;
1379
ublk_dispatch_req(rq->mq_hctx->driver_data, rq, issue_flags);
1380
rq = next;
1381
} while (rq);
1382
}
1383
1384
static void ublk_queue_cmd_list(struct ublk_io *io, struct rq_list *l)
1385
{
1386
struct io_uring_cmd *cmd = io->cmd;
1387
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1388
1389
pdu->req_list = rq_list_peek(l);
1390
rq_list_init(l);
1391
io_uring_cmd_complete_in_task(cmd, ublk_cmd_list_tw_cb);
1392
}
1393
1394
static enum blk_eh_timer_return ublk_timeout(struct request *rq)
1395
{
1396
struct ublk_queue *ubq = rq->mq_hctx->driver_data;
1397
pid_t tgid = ubq->dev->ublksrv_tgid;
1398
struct task_struct *p;
1399
struct pid *pid;
1400
1401
if (!(ubq->flags & UBLK_F_UNPRIVILEGED_DEV))
1402
return BLK_EH_RESET_TIMER;
1403
1404
if (unlikely(!tgid))
1405
return BLK_EH_RESET_TIMER;
1406
1407
rcu_read_lock();
1408
pid = find_vpid(tgid);
1409
p = pid_task(pid, PIDTYPE_PID);
1410
if (p)
1411
send_sig(SIGKILL, p, 0);
1412
rcu_read_unlock();
1413
return BLK_EH_DONE;
1414
}
1415
1416
static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
1417
bool check_cancel)
1418
{
1419
blk_status_t res;
1420
1421
if (unlikely(READ_ONCE(ubq->fail_io)))
1422
return BLK_STS_TARGET;
1423
1424
/* With the recovery feature enabled, force_abort is set in
 * ublk_stop_dev() before calling del_gendisk(). We have to
 * abort all requeued and new rqs here to let del_gendisk()
 * move on. Besides, we cannot call io_uring_cmd_complete_in_task()
 * here, to avoid a UAF on the io_uring ctx.
 *
 * Note: force_abort is guaranteed to be seen because it is set
 * before the request queue is unquiesced.
 */
1433
if (ublk_nosrv_should_queue_io(ubq) &&
1434
unlikely(READ_ONCE(ubq->force_abort)))
1435
return BLK_STS_IOERR;
1436
1437
if (check_cancel && unlikely(ubq->canceling))
1438
return BLK_STS_IOERR;
1439
1440
/* fill iod to slot in io cmd buffer */
1441
res = ublk_setup_iod(ubq, rq);
1442
if (unlikely(res != BLK_STS_OK))
1443
return BLK_STS_IOERR;
1444
1445
blk_mq_start_request(rq);
1446
return BLK_STS_OK;
1447
}
1448
1449
static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
1450
const struct blk_mq_queue_data *bd)
1451
{
1452
struct ublk_queue *ubq = hctx->driver_data;
1453
struct request *rq = bd->rq;
1454
blk_status_t res;
1455
1456
res = ublk_prep_req(ubq, rq, false);
1457
if (res != BLK_STS_OK)
1458
return res;
1459
1460
/*
 * ->canceling has to be handled after ->force_abort and ->fail_io
 * are dealt with, otherwise this request may not be failed in the
 * recovery case, causing a hang when deleting the disk.
 */
1465
if (unlikely(ubq->canceling)) {
1466
__ublk_abort_rq(ubq, rq);
1467
return BLK_STS_OK;
1468
}
1469
1470
ublk_queue_cmd(ubq, rq);
1471
return BLK_STS_OK;
1472
}
1473
1474
static inline bool ublk_belong_to_same_batch(const struct ublk_io *io,
1475
const struct ublk_io *io2)
1476
{
1477
return (io_uring_cmd_ctx_handle(io->cmd) ==
1478
io_uring_cmd_ctx_handle(io2->cmd)) &&
1479
(io->task == io2->task);
1480
}
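
/*
 * Note: a batch is flushed through a single io_uring_cmd_complete_in_task()
 * call on one member's cmd (see ublk_queue_cmd_list() and
 * ublk_cmd_list_tw_cb() above), whose task work dispatches the whole list,
 * so all requests in a batch must share both the io_uring context and the
 * daemon task.
 */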
1481
1482
static void ublk_queue_rqs(struct rq_list *rqlist)
1483
{
1484
struct rq_list requeue_list = { };
1485
struct rq_list submit_list = { };
1486
struct ublk_io *io = NULL;
1487
struct request *req;
1488
1489
while ((req = rq_list_pop(rqlist))) {
1490
struct ublk_queue *this_q = req->mq_hctx->driver_data;
1491
struct ublk_io *this_io = &this_q->ios[req->tag];
1492
1493
if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
1494
rq_list_add_tail(&requeue_list, req);
1495
continue;
1496
}
1497
1498
if (io && !ublk_belong_to_same_batch(io, this_io) &&
1499
!rq_list_empty(&submit_list))
1500
ublk_queue_cmd_list(io, &submit_list);
1501
io = this_io;
1502
rq_list_add_tail(&submit_list, req);
1503
}
1504
1505
if (!rq_list_empty(&submit_list))
1506
ublk_queue_cmd_list(io, &submit_list);
1507
*rqlist = requeue_list;
1508
}
1509
1510
static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
1511
unsigned int hctx_idx)
1512
{
1513
struct ublk_device *ub = driver_data;
1514
struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
1515
1516
hctx->driver_data = ubq;
1517
return 0;
1518
}
1519
1520
static const struct blk_mq_ops ublk_mq_ops = {
1521
.queue_rq = ublk_queue_rq,
1522
.queue_rqs = ublk_queue_rqs,
1523
.init_hctx = ublk_init_hctx,
1524
.timeout = ublk_timeout,
1525
};
1526
1527
static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
1528
{
1529
int i;
1530
1531
for (i = 0; i < ubq->q_depth; i++) {
1532
struct ublk_io *io = &ubq->ios[i];
1533
1534
/*
 * UBLK_IO_FLAG_CANCELED is kept so that io->cmd is not
 * touched again
 */
1538
io->flags &= UBLK_IO_FLAG_CANCELED;
1539
io->cmd = NULL;
1540
io->addr = 0;
1541
1542
/*
 * The old task is PF_EXITING, put it now.
 *
 * It could be NULL when closing a quiesced device.
 */
1548
if (io->task) {
1549
put_task_struct(io->task);
1550
io->task = NULL;
1551
}
1552
1553
WARN_ON_ONCE(refcount_read(&io->ref));
1554
WARN_ON_ONCE(io->task_registered_buffers);
1555
}
1556
}
1557
1558
static int ublk_ch_open(struct inode *inode, struct file *filp)
1559
{
1560
struct ublk_device *ub = container_of(inode->i_cdev,
1561
struct ublk_device, cdev);
1562
1563
if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
1564
return -EBUSY;
1565
filp->private_data = ub;
1566
ub->ublksrv_tgid = current->tgid;
1567
return 0;
1568
}
1569
1570
static void ublk_reset_ch_dev(struct ublk_device *ub)
1571
{
1572
int i;
1573
1574
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1575
ublk_queue_reinit(ub, ublk_get_queue(ub, i));
1576
1577
/* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */
1578
ub->mm = NULL;
1579
ub->nr_io_ready = 0;
1580
ub->unprivileged_daemons = false;
1581
ub->ublksrv_tgid = -1;
1582
}
1583
1584
static struct gendisk *ublk_get_disk(struct ublk_device *ub)
1585
{
1586
struct gendisk *disk;
1587
1588
spin_lock(&ub->lock);
1589
disk = ub->ub_disk;
1590
if (disk)
1591
get_device(disk_to_dev(disk));
1592
spin_unlock(&ub->lock);
1593
1594
return disk;
1595
}
1596
1597
static void ublk_put_disk(struct gendisk *disk)
1598
{
1599
if (disk)
1600
put_device(disk_to_dev(disk));
1601
}
1602
1603
/*
1604
* Use this function to ensure that ->canceling is consistently set for
1605
* the device and all queues. Do not set these flags directly.
1606
*
1607
* Caller must ensure that:
1608
* - cancel_mutex is held. This ensures that there is no concurrent
1609
* access to ub->canceling and no concurrent writes to ubq->canceling.
1610
* - there are no concurrent reads of ubq->canceling from the queue_rq
1611
* path. This can be done by quiescing the queue, or through other
1612
* means.
1613
*/
1614
static void ublk_set_canceling(struct ublk_device *ub, bool canceling)
1615
__must_hold(&ub->cancel_mutex)
1616
{
1617
int i;
1618
1619
ub->canceling = canceling;
1620
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1621
ublk_get_queue(ub, i)->canceling = canceling;
1622
}
1623
1624
static bool ublk_check_and_reset_active_ref(struct ublk_device *ub)
1625
{
1626
int i, j;
1627
1628
if (!(ub->dev_info.flags & (UBLK_F_SUPPORT_ZERO_COPY |
1629
UBLK_F_AUTO_BUF_REG)))
1630
return false;
1631
1632
for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
1633
struct ublk_queue *ubq = ublk_get_queue(ub, i);
1634
1635
for (j = 0; j < ubq->q_depth; j++) {
1636
struct ublk_io *io = &ubq->ios[j];
1637
unsigned int refs = refcount_read(&io->ref) +
1638
io->task_registered_buffers;
1639
1640
/*
1641
* UBLK_REFCOUNT_INIT or zero means no active
1642
* reference
1643
*/
1644
if (refs != UBLK_REFCOUNT_INIT && refs != 0)
1645
return true;
1646
1647
/* reset to zero if the io has no active references */
1648
refcount_set(&io->ref, 0);
1649
io->task_registered_buffers = 0;
1650
}
1651
}
1652
return false;
1653
}
1654
1655
static void ublk_ch_release_work_fn(struct work_struct *work)
1656
{
1657
struct ublk_device *ub =
1658
container_of(work, struct ublk_device, exit_work.work);
1659
struct gendisk *disk;
1660
int i;
1661
1662
/*
 * For the zero-copy and auto buffer register modes, I/O references
 * might not be dropped naturally when the daemon is killed, but
 * io_uring guarantees that registered bvec kernel buffers are finally
 * unregistered when the io_uring context is freed, at which point the
 * active references are dropped.
 *
 * Wait until the active references are dropped to avoid a use-after-free.
 *
 * A registered buffer may be unregistered in io_uring's release handler,
 * so we have to wait by rescheduling this work function, to avoid a
 * release dependency between the two files.
 */
1675
if (ublk_check_and_reset_active_ref(ub)) {
1676
schedule_delayed_work(&ub->exit_work, 1);
1677
return;
1678
}
1679
1680
/*
 * The disk isn't attached: either the device isn't live yet, or it has
 * already been removed, so there is nothing to do.
 */
1684
disk = ublk_get_disk(ub);
1685
if (!disk)
1686
goto out;
1687
1688
/*
 * All uring_cmds are done now, so abort any request still outstanding
 * to the ublk server.
 *
 * This can be done locklessly because the ublk server is gone.
 *
 * More importantly, we have to provide a forward progress guarantee
 * without holding ub->mutex, otherwise a control task grabbing
 * ub->mutex could trigger a deadlock.
 *
 * All requests may be inflight, so ->canceling may not be set; set
 * it now.
 */
1702
mutex_lock(&ub->cancel_mutex);
1703
ublk_set_canceling(ub, true);
1704
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1705
ublk_abort_queue(ub, ublk_get_queue(ub, i));
1706
mutex_unlock(&ub->cancel_mutex);
1707
blk_mq_kick_requeue_list(disk->queue);
1708
1709
/*
 * All inflight requests have been completed or requeued, and any new
 * request will be failed or requeued via ->canceling now, so it is
 * fine to grab ub->mutex.
 */
1714
mutex_lock(&ub->mutex);
1715
1716
/* double check after grabbing lock */
1717
if (!ub->ub_disk)
1718
goto unlock;
1719
1720
/*
1721
* Transition the device to the nosrv state. What exactly this
1722
* means depends on the recovery flags
1723
*/
1724
if (ublk_nosrv_should_stop_dev(ub)) {
1725
/*
1726
* Allow any pending/future I/O to pass through quickly
1727
* with an error. This is needed because del_gendisk
1728
* waits for all pending I/O to complete
1729
*/
1730
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1731
WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true);
1732
1733
ublk_stop_dev_unlocked(ub);
1734
} else {
1735
if (ublk_nosrv_dev_should_queue_io(ub)) {
1736
/* ->canceling is set and all requests are aborted */
1737
ub->dev_info.state = UBLK_S_DEV_QUIESCED;
1738
} else {
1739
ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
1740
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1741
WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true);
1742
}
1743
}
1744
unlock:
1745
mutex_unlock(&ub->mutex);
1746
ublk_put_disk(disk);
1747
1748
/* all uring_cmds are done now, reset the device & ubqs */
1749
ublk_reset_ch_dev(ub);
1750
out:
1751
clear_bit(UB_STATE_OPEN, &ub->state);
1752
1753
/* put the reference grabbed in ublk_ch_release() */
1754
ublk_put_device(ub);
1755
}
1756
1757
static int ublk_ch_release(struct inode *inode, struct file *filp)
1758
{
1759
struct ublk_device *ub = filp->private_data;
1760
1761
/*
 * Grab a ublk device reference, so it won't go away until we are
 * really done in the release work function.
 */
1765
ublk_get_device(ub);
1766
1767
INIT_DELAYED_WORK(&ub->exit_work, ublk_ch_release_work_fn);
1768
schedule_delayed_work(&ub->exit_work, 0);
1769
return 0;
1770
}
1771
1772
/* map pre-allocated per-queue cmd buffer to ublksrv daemon */
1773
static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
1774
{
1775
struct ublk_device *ub = filp->private_data;
1776
size_t sz = vma->vm_end - vma->vm_start;
1777
unsigned max_sz = ublk_max_cmd_buf_size();
1778
unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
1779
int q_id, ret = 0;
1780
1781
spin_lock(&ub->lock);
1782
if (!ub->mm)
1783
ub->mm = current->mm;
1784
if (current->mm != ub->mm)
1785
ret = -EINVAL;
1786
spin_unlock(&ub->lock);
1787
1788
if (ret)
1789
return ret;
1790
1791
if (vma->vm_flags & VM_WRITE)
1792
return -EPERM;
1793
1794
end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
1795
if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
1796
return -EINVAL;
1797
1798
q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
1799
pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
1800
__func__, q_id, current->pid, vma->vm_start,
1801
phys_off, (unsigned long)sz);
1802
1803
if (sz != ublk_queue_cmd_buf_size(ub))
1804
return -EINVAL;
1805
1806
pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
1807
return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
1808
}
1809
1810
static void __ublk_fail_req(struct ublk_device *ub, struct ublk_io *io,
1811
struct request *req)
1812
{
1813
WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
1814
1815
if (ublk_nosrv_should_reissue_outstanding(ub))
1816
blk_mq_requeue_request(req, false);
1817
else {
1818
io->res = -EIO;
1819
__ublk_complete_rq(req, io, ublk_dev_need_map_io(ub));
1820
}
1821
}
1822
1823
/*
 * Called from the ublk char device release handler, once every uring_cmd
 * is done; meanwhile the request queue is effectively "quiesced" since all
 * inflight requests can't be completed because the ublk server is dead.
 *
 * So no one can hold our request IO reference any more; simply ignore the
 * reference and complete the request immediately.
 */
1831
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
1832
{
1833
int i;
1834
1835
for (i = 0; i < ubq->q_depth; i++) {
1836
struct ublk_io *io = &ubq->ios[i];
1837
1838
if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
1839
__ublk_fail_req(ub, io, io->req);
1840
}
1841
}
1842
1843
static void ublk_start_cancel(struct ublk_device *ub)
1844
{
1845
struct gendisk *disk = ublk_get_disk(ub);
1846
1847
/* Our disk is already gone */
1848
if (!disk)
1849
return;
1850
1851
mutex_lock(&ub->cancel_mutex);
1852
if (ub->canceling)
1853
goto out;
1854
/*
 * Now we are serialized with ublk_queue_rq()
 *
 * Make sure that ubq->canceling is set while the queue is quiesced,
 * because ublk_queue_rq() relies on this flag to avoid touching a
 * completed uring_cmd
 */
1861
blk_mq_quiesce_queue(disk->queue);
1862
ublk_set_canceling(ub, true);
1863
blk_mq_unquiesce_queue(disk->queue);
1864
out:
1865
mutex_unlock(&ub->cancel_mutex);
1866
ublk_put_disk(disk);
1867
}
1868
1869
static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag,
1870
unsigned int issue_flags)
1871
{
1872
struct ublk_io *io = &ubq->ios[tag];
1873
struct ublk_device *ub = ubq->dev;
1874
struct request *req;
1875
bool done;
1876
1877
if (!(io->flags & UBLK_IO_FLAG_ACTIVE))
1878
return;
1879
1880
/*
 * Don't try to cancel this command if the request has been started, to
 * avoid a race between io_uring_cmd_done() and
 * io_uring_cmd_complete_in_task().
 *
 * Either the started request will be aborted via __ublk_abort_rq() and
 * this uring_cmd will be canceled next time, or it will be done in the
 * task work function ublk_dispatch_req(), because io_uring guarantees
 * that ublk_dispatch_req() is always called.
 */
1890
req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag);
1891
if (req && blk_mq_request_started(req) && req->tag == tag)
1892
return;
1893
1894
spin_lock(&ubq->cancel_lock);
1895
done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
1896
if (!done)
1897
io->flags |= UBLK_IO_FLAG_CANCELED;
1898
spin_unlock(&ubq->cancel_lock);
1899
1900
if (!done)
1901
io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags);
1902
}
1903
1904
/*
 * The ublk char device won't be closed while the cancel fn is called, so
 * both the ublk device and the queue are guaranteed to be live.
 *
 * Two-stage cancel:
 *
 * - make every active uring_cmd done in ->cancel_fn()
 *
 * - abort inflight ublk IO requests in the ublk char device release
 *   handler, which depends on the 1st stage because the device can only
 *   be closed once all uring_cmds are done
 *
 * Do _not_ try to acquire ub->mutex before all inflight requests are
 * aborted, otherwise a deadlock may be caused.
 */
1919
static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd,
1920
unsigned int issue_flags)
1921
{
1922
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
1923
struct ublk_queue *ubq = pdu->ubq;
1924
struct task_struct *task;
1925
struct ublk_io *io;
1926
1927
if (WARN_ON_ONCE(!ubq))
1928
return;
1929
1930
if (WARN_ON_ONCE(pdu->tag >= ubq->q_depth))
1931
return;
1932
1933
task = io_uring_cmd_get_task(cmd);
1934
io = &ubq->ios[pdu->tag];
1935
if (WARN_ON_ONCE(task && task != io->task))
1936
return;
1937
1938
ublk_start_cancel(ubq->dev);
1939
1940
WARN_ON_ONCE(io->cmd != cmd);
1941
ublk_cancel_cmd(ubq, pdu->tag, issue_flags);
1942
}
1943
1944
static inline bool ublk_dev_ready(const struct ublk_device *ub)
1945
{
1946
u32 total = (u32)ub->dev_info.nr_hw_queues * ub->dev_info.queue_depth;
1947
1948
return ub->nr_io_ready == total;
1949
}
1950
1951
static void ublk_cancel_queue(struct ublk_queue *ubq)
1952
{
1953
int i;
1954
1955
for (i = 0; i < ubq->q_depth; i++)
1956
ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED);
1957
}
1958
1959
/* Cancel all pending commands, must be called after del_gendisk() returns */
1960
static void ublk_cancel_dev(struct ublk_device *ub)
1961
{
1962
int i;
1963
1964
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
1965
ublk_cancel_queue(ublk_get_queue(ub, i));
1966
}
1967
1968
static bool ublk_check_inflight_rq(struct request *rq, void *data)
1969
{
1970
bool *idle = data;
1971
1972
if (blk_mq_request_started(rq)) {
1973
*idle = false;
1974
return false;
1975
}
1976
return true;
1977
}
1978
1979
static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
1980
{
1981
bool idle;
1982
1983
WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
1984
while (true) {
1985
idle = true;
1986
blk_mq_tagset_busy_iter(&ub->tag_set,
1987
ublk_check_inflight_rq, &idle);
1988
if (idle)
1989
break;
1990
msleep(UBLK_REQUEUE_DELAY_MS);
1991
}
1992
}
1993
1994
static void ublk_force_abort_dev(struct ublk_device *ub)
1995
{
1996
int i;
1997
1998
pr_devel("%s: force abort ub: dev_id %d state %s\n",
1999
__func__, ub->dev_info.dev_id,
2000
ub->dev_info.state == UBLK_S_DEV_LIVE ?
2001
"LIVE" : "QUIESCED");
2002
blk_mq_quiesce_queue(ub->ub_disk->queue);
2003
if (ub->dev_info.state == UBLK_S_DEV_LIVE)
2004
ublk_wait_tagset_rqs_idle(ub);
2005
2006
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
2007
ublk_get_queue(ub, i)->force_abort = true;
2008
blk_mq_unquiesce_queue(ub->ub_disk->queue);
2009
/* We may have requeued some rqs in ublk_quiesce_queue() */
2010
blk_mq_kick_requeue_list(ub->ub_disk->queue);
2011
}
2012
2013
static struct gendisk *ublk_detach_disk(struct ublk_device *ub)
2014
{
2015
struct gendisk *disk;
2016
2017
/* Sync with ublk_abort_queue() by holding the lock */
2018
spin_lock(&ub->lock);
2019
disk = ub->ub_disk;
2020
ub->dev_info.state = UBLK_S_DEV_DEAD;
2021
ub->dev_info.ublksrv_pid = -1;
2022
ub->ub_disk = NULL;
2023
spin_unlock(&ub->lock);
2024
2025
return disk;
2026
}
2027
2028
static void ublk_stop_dev_unlocked(struct ublk_device *ub)
2029
__must_hold(&ub->mutex)
2030
{
2031
struct gendisk *disk;
2032
2033
if (ub->dev_info.state == UBLK_S_DEV_DEAD)
2034
return;
2035
2036
if (ublk_nosrv_dev_should_queue_io(ub))
2037
ublk_force_abort_dev(ub);
2038
del_gendisk(ub->ub_disk);
2039
disk = ublk_detach_disk(ub);
2040
put_disk(disk);
2041
}
2042
2043
static void ublk_stop_dev(struct ublk_device *ub)
2044
{
2045
mutex_lock(&ub->mutex);
2046
ublk_stop_dev_unlocked(ub);
2047
mutex_unlock(&ub->mutex);
2048
ublk_cancel_dev(ub);
2049
}
2050
2051
/* reset ublk io_uring queue & io flags */
2052
static void ublk_reset_io_flags(struct ublk_device *ub)
2053
{
2054
int i, j;
2055
2056
for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
2057
struct ublk_queue *ubq = ublk_get_queue(ub, i);
2058
2059
/* UBLK_IO_FLAG_CANCELED can be cleared now */
2060
spin_lock(&ubq->cancel_lock);
2061
for (j = 0; j < ubq->q_depth; j++)
2062
ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED;
2063
spin_unlock(&ubq->cancel_lock);
2064
ubq->fail_io = false;
2065
}
2066
mutex_lock(&ub->cancel_mutex);
2067
ublk_set_canceling(ub, false);
2068
mutex_unlock(&ub->cancel_mutex);
2069
}
2070
2071
/* device can only be started after all IOs are ready */
2072
static void ublk_mark_io_ready(struct ublk_device *ub)
2073
__must_hold(&ub->mutex)
2074
{
2075
if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN))
2076
ub->unprivileged_daemons = true;
2077
2078
ub->nr_io_ready++;
2079
if (ublk_dev_ready(ub)) {
2080
/* now we are ready for handling ublk io request */
2081
ublk_reset_io_flags(ub);
2082
complete_all(&ub->completion);
2083
}
2084
}
2085
2086
static inline int ublk_check_cmd_op(u32 cmd_op)
2087
{
2088
u32 ioc_type = _IOC_TYPE(cmd_op);
2089
2090
if (!IS_ENABLED(CONFIG_BLKDEV_UBLK_LEGACY_OPCODES) && ioc_type != 'u')
2091
return -EOPNOTSUPP;
2092
2093
if (ioc_type != 'u' && ioc_type != 0)
2094
return -EOPNOTSUPP;
2095
2096
return 0;
2097
}
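/*
* Example (sketch): with UBLK_F_CMD_IOCTL_ENCODE the opcodes are
* ioctl-encoded, e.g.
*
*   _IOC_TYPE(UBLK_U_IO_FETCH_REQ) == 'u'
*   _IOC_NR(UBLK_U_IO_FETCH_REQ)   == UBLK_IO_FETCH_REQ
*
* while legacy opcodes are plain numbers with _IOC_TYPE() == 0, which is
* why ublk_check_cmd_op() accepts both 'u' and 0 when
* CONFIG_BLKDEV_UBLK_LEGACY_OPCODES is enabled.
*/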
2098
2099
static inline int ublk_set_auto_buf_reg(struct ublk_io *io, struct io_uring_cmd *cmd)
2100
{
2101
io->buf = ublk_sqe_addr_to_auto_buf_reg(READ_ONCE(cmd->sqe->addr));
2102
2103
if (io->buf.reserved0 || io->buf.reserved1)
2104
return -EINVAL;
2105
2106
if (io->buf.flags & ~UBLK_AUTO_BUF_REG_F_MASK)
2107
return -EINVAL;
2108
return 0;
2109
}
2110
2111
static int ublk_handle_auto_buf_reg(struct ublk_io *io,
2112
struct io_uring_cmd *cmd,
2113
u16 *buf_idx)
2114
{
2115
if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) {
2116
io->flags &= ~UBLK_IO_FLAG_AUTO_BUF_REG;
2117
2118
/*
* `UBLK_F_AUTO_BUF_REG` only works if `UBLK_IO_FETCH_REQ`
* and `UBLK_IO_COMMIT_AND_FETCH_REQ` are issued from the same
* `io_ring_ctx`.
*
* If this uring_cmd's io_ring_ctx isn't the same as the one
* used for registering the buffer, it is the ublk server's
* responsibility to unregister the buffer, otherwise this
* ublk request gets stuck.
*/
2128
if (io->buf_ctx_handle == io_uring_cmd_ctx_handle(cmd))
2129
*buf_idx = io->buf.index;
2130
}
2131
2132
return ublk_set_auto_buf_reg(io, cmd);
2133
}
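/*
* Rough flow (sketch): the server packs a struct ublk_auto_buf_reg
* (index, flags, reserved fields) into sqe->addr of the FETCH /
* COMMIT_AND_FETCH uring_cmd. The driver registers the request's pages
* at buf.index in that ring's buffer table before delivering the I/O,
* and the index is handed back here at commit time for automatic
* unregistration only if the commit comes from the same io_ring_ctx.
*/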
2134
2135
/* Once we return, `io->req` can't be used any more */
2136
static inline struct request *
2137
ublk_fill_io_cmd(struct ublk_io *io, struct io_uring_cmd *cmd)
2138
{
2139
struct request *req = io->req;
2140
2141
io->cmd = cmd;
2142
io->flags |= UBLK_IO_FLAG_ACTIVE;
2143
/* now this cmd slot is owned by ublk driver */
2144
io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
2145
2146
return req;
2147
}
2148
2149
static inline int
2150
ublk_config_io_buf(const struct ublk_device *ub, struct ublk_io *io,
2151
struct io_uring_cmd *cmd, unsigned long buf_addr,
2152
u16 *buf_idx)
2153
{
2154
if (ublk_dev_support_auto_buf_reg(ub))
2155
return ublk_handle_auto_buf_reg(io, cmd, buf_idx);
2156
2157
io->addr = buf_addr;
2158
return 0;
2159
}
2160
2161
static inline void ublk_prep_cancel(struct io_uring_cmd *cmd,
2162
unsigned int issue_flags,
2163
struct ublk_queue *ubq, unsigned int tag)
2164
{
2165
struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
2166
2167
/*
* Safe to refer to @ubq since the ublk_queue won't die until its
* commands are completed
*/
2171
pdu->ubq = ubq;
2172
pdu->tag = tag;
2173
io_uring_cmd_mark_cancelable(cmd, issue_flags);
2174
}
2175
2176
static void ublk_io_release(void *priv)
2177
{
2178
struct request *rq = priv;
2179
struct ublk_queue *ubq = rq->mq_hctx->driver_data;
2180
struct ublk_io *io = &ubq->ios[rq->tag];
2181
2182
/*
* task_registered_buffers may be 0 if buffers were registered off task
* but unregistered on task, or after UBLK_IO_COMMIT_AND_FETCH_REQ.
*/
2186
if (current == io->task && io->task_registered_buffers)
2187
io->task_registered_buffers--;
2188
else
2189
ublk_put_req_ref(io, rq);
2190
}
2191
2192
static int ublk_register_io_buf(struct io_uring_cmd *cmd,
2193
struct ublk_device *ub,
2194
u16 q_id, u16 tag,
2195
struct ublk_io *io,
2196
unsigned int index, unsigned int issue_flags)
2197
{
2198
struct request *req;
2199
int ret;
2200
2201
if (!ublk_dev_support_zero_copy(ub))
2202
return -EINVAL;
2203
2204
req = __ublk_check_and_get_req(ub, q_id, tag, io, 0);
2205
if (!req)
2206
return -EINVAL;
2207
2208
ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
2209
issue_flags);
2210
if (ret) {
2211
ublk_put_req_ref(io, req);
2212
return ret;
2213
}
2214
2215
return 0;
2216
}
2217
2218
static int
2219
ublk_daemon_register_io_buf(struct io_uring_cmd *cmd,
2220
struct ublk_device *ub,
2221
u16 q_id, u16 tag, struct ublk_io *io,
2222
unsigned index, unsigned issue_flags)
2223
{
2224
unsigned new_registered_buffers;
2225
struct request *req = io->req;
2226
int ret;
2227
2228
/*
2229
* Ensure there are still references for ublk_sub_req_ref() to release.
2230
* If not, fall back on the thread-safe buffer registration.
2231
*/
2232
new_registered_buffers = io->task_registered_buffers + 1;
2233
if (unlikely(new_registered_buffers >= UBLK_REFCOUNT_INIT))
2234
return ublk_register_io_buf(cmd, ub, q_id, tag, io, index,
2235
issue_flags);
2236
2237
if (!ublk_dev_support_zero_copy(ub) || !ublk_rq_has_data(req))
2238
return -EINVAL;
2239
2240
ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index,
2241
issue_flags);
2242
if (ret)
2243
return ret;
2244
2245
io->task_registered_buffers = new_registered_buffers;
2246
return 0;
2247
}
2248
2249
static int ublk_unregister_io_buf(struct io_uring_cmd *cmd,
2250
const struct ublk_device *ub,
2251
unsigned int index, unsigned int issue_flags)
2252
{
2253
if (!(ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY))
2254
return -EINVAL;
2255
2256
return io_buffer_unregister_bvec(cmd, index, issue_flags);
2257
}
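/*
* Typical zero-copy usage (sketch, server side): register the ublk
* request's pages into the ring's fixed-buffer table with
* UBLK_U_IO_REGISTER_IO_BUF, passing the buffer-table index in addr, do
* the backing I/O with fixed-buffer operations on that index, then drop
* the registration with UBLK_U_IO_UNREGISTER_IO_BUF before committing
* the result via UBLK_IO_COMMIT_AND_FETCH_REQ.
*/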
2258
2259
static int ublk_check_fetch_buf(const struct ublk_device *ub, __u64 buf_addr)
2260
{
2261
if (ublk_dev_need_map_io(ub)) {
2262
/*
2263
* FETCH_RQ has to provide IO buffer if NEED GET
2264
* DATA is not enabled
2265
*/
2266
if (!buf_addr && !ublk_dev_need_get_data(ub))
2267
return -EINVAL;
2268
} else if (buf_addr) {
2269
/* User copy requires addr to be unset */
2270
return -EINVAL;
2271
}
2272
return 0;
2273
}
2274
2275
static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
2276
struct ublk_io *io, __u64 buf_addr)
2277
{
2278
int ret = 0;
2279
2280
/*
2281
* When handling FETCH command for setting up ublk uring queue,
2282
* ub->mutex is the innermost lock, and we won't block for handling
2283
* FETCH, so it is fine even for IO_URING_F_NONBLOCK.
2284
*/
2285
mutex_lock(&ub->mutex);
2286
/* UBLK_IO_FETCH_REQ is only allowed before the dev is set up */
2287
if (ublk_dev_ready(ub)) {
2288
ret = -EBUSY;
2289
goto out;
2290
}
2291
2292
/* allow each command to be FETCHed at most once */
2293
if (io->flags & UBLK_IO_FLAG_ACTIVE) {
2294
ret = -EINVAL;
2295
goto out;
2296
}
2297
2298
WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV);
2299
2300
ublk_fill_io_cmd(io, cmd);
2301
ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL);
2302
if (ret)
2303
goto out;
2304
2305
WRITE_ONCE(io->task, get_task_struct(current));
2306
ublk_mark_io_ready(ub);
2307
out:
2308
mutex_unlock(&ub->mutex);
2309
return ret;
2310
}
2311
2312
static int ublk_check_commit_and_fetch(const struct ublk_device *ub,
2313
struct ublk_io *io, __u64 buf_addr)
2314
{
2315
struct request *req = io->req;
2316
2317
if (ublk_dev_need_map_io(ub)) {
2318
/*
2319
* COMMIT_AND_FETCH_REQ has to provide IO buffer if
2320
* NEED GET DATA is not enabled or it is Read IO.
2321
*/
2322
if (!buf_addr && (!ublk_dev_need_get_data(ub) ||
2323
req_op(req) == REQ_OP_READ))
2324
return -EINVAL;
2325
} else if (req_op(req) != REQ_OP_ZONE_APPEND && buf_addr) {
2326
/*
2327
* User copy requires addr to be unset when command is
2328
* not zone append
2329
*/
2330
return -EINVAL;
2331
}
2332
2333
return 0;
2334
}
2335
2336
static bool ublk_need_complete_req(const struct ublk_device *ub,
2337
struct ublk_io *io)
2338
{
2339
if (ublk_dev_need_req_ref(ub))
2340
return ublk_sub_req_ref(io);
2341
return true;
2342
}
2343
2344
static bool ublk_get_data(const struct ublk_queue *ubq, struct ublk_io *io,
2345
struct request *req)
2346
{
2347
/*
2348
* We have handled UBLK_IO_NEED_GET_DATA command,
2349
* so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
2350
* do the copy work.
2351
*/
2352
io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
2353
/* update iod->addr because ublksrv may have passed a new io buffer */
2354
ublk_get_iod(ubq, req->tag)->addr = io->addr;
2355
pr_devel("%s: update iod->addr: qid %d tag %d io_flags %x addr %llx\n",
2356
__func__, ubq->q_id, req->tag, io->flags,
2357
ublk_get_iod(ubq, req->tag)->addr);
2358
2359
return ublk_start_io(ubq, req, io);
2360
}
2361
2362
static int ublk_ch_uring_cmd_local(struct io_uring_cmd *cmd,
2363
unsigned int issue_flags)
2364
{
2365
/* May point to userspace-mapped memory */
2366
const struct ublksrv_io_cmd *ub_src = io_uring_sqe_cmd(cmd->sqe);
2367
u16 buf_idx = UBLK_INVALID_BUF_IDX;
2368
struct ublk_device *ub = cmd->file->private_data;
2369
struct ublk_queue *ubq;
2370
struct ublk_io *io;
2371
u32 cmd_op = cmd->cmd_op;
2372
u16 q_id = READ_ONCE(ub_src->q_id);
2373
u16 tag = READ_ONCE(ub_src->tag);
2374
s32 result = READ_ONCE(ub_src->result);
2375
u64 addr = READ_ONCE(ub_src->addr); /* unioned with zone_append_lba */
2376
struct request *req;
2377
int ret;
2378
bool compl;
2379
2380
WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED);
2381
2382
pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
2383
__func__, cmd->cmd_op, q_id, tag, result);
2384
2385
ret = ublk_check_cmd_op(cmd_op);
2386
if (ret)
2387
goto out;
2388
2389
/*
2390
* io_buffer_unregister_bvec() doesn't access the ubq or io,
2391
* so no need to validate the q_id, tag, or task
2392
*/
2393
if (_IOC_NR(cmd_op) == UBLK_IO_UNREGISTER_IO_BUF)
2394
return ublk_unregister_io_buf(cmd, ub, addr, issue_flags);
2395
2396
ret = -EINVAL;
2397
if (q_id >= ub->dev_info.nr_hw_queues)
2398
goto out;
2399
2400
ubq = ublk_get_queue(ub, q_id);
2401
2402
if (tag >= ub->dev_info.queue_depth)
2403
goto out;
2404
2405
io = &ubq->ios[tag];
2406
/* UBLK_IO_FETCH_REQ can be handled on any task, which sets io->task */
2407
if (unlikely(_IOC_NR(cmd_op) == UBLK_IO_FETCH_REQ)) {
2408
ret = ublk_check_fetch_buf(ub, addr);
2409
if (ret)
2410
goto out;
2411
ret = ublk_fetch(cmd, ub, io, addr);
2412
if (ret)
2413
goto out;
2414
2415
ublk_prep_cancel(cmd, issue_flags, ubq, tag);
2416
return -EIOCBQUEUED;
2417
}
2418
2419
if (READ_ONCE(io->task) != current) {
2420
/*
2421
* ublk_register_io_buf() accesses only the io's refcount,
2422
* so can be handled on any task
2423
*/
2424
if (_IOC_NR(cmd_op) == UBLK_IO_REGISTER_IO_BUF)
2425
return ublk_register_io_buf(cmd, ub, q_id, tag, io,
2426
addr, issue_flags);
2427
2428
goto out;
2429
}
2430
2431
/* there is pending io cmd, something must be wrong */
2432
if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) {
2433
ret = -EBUSY;
2434
goto out;
2435
}
2436
2437
/*
* ensure that the user issues UBLK_IO_NEED_GET_DATA
* iff the driver has set UBLK_IO_FLAG_NEED_GET_DATA.
*/
2441
if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
2442
^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
2443
goto out;
2444
2445
switch (_IOC_NR(cmd_op)) {
2446
case UBLK_IO_REGISTER_IO_BUF:
2447
return ublk_daemon_register_io_buf(cmd, ub, q_id, tag, io, addr,
2448
issue_flags);
2449
case UBLK_IO_COMMIT_AND_FETCH_REQ:
2450
ret = ublk_check_commit_and_fetch(ub, io, addr);
2451
if (ret)
2452
goto out;
2453
io->res = result;
2454
req = ublk_fill_io_cmd(io, cmd);
2455
ret = ublk_config_io_buf(ub, io, cmd, addr, &buf_idx);
2456
compl = ublk_need_complete_req(ub, io);
2457
2458
/* can't touch 'ublk_io' any more */
2459
if (buf_idx != UBLK_INVALID_BUF_IDX)
2460
io_buffer_unregister_bvec(cmd, buf_idx, issue_flags);
2461
if (req_op(req) == REQ_OP_ZONE_APPEND)
2462
req->__sector = addr;
2463
if (compl)
2464
__ublk_complete_rq(req, io, ublk_dev_need_map_io(ub));
2465
2466
if (ret)
2467
goto out;
2468
break;
2469
case UBLK_IO_NEED_GET_DATA:
2470
/*
* ublk_get_data() may fail and fall back to requeue, so keep the
* uring_cmd active first and prepare for handling the newly
* requeued request
*/
2475
req = ublk_fill_io_cmd(io, cmd);
2476
ret = ublk_config_io_buf(ub, io, cmd, addr, NULL);
2477
WARN_ON_ONCE(ret);
2478
if (likely(ublk_get_data(ubq, io, req))) {
2479
__ublk_prep_compl_io_cmd(io, req);
2480
return UBLK_IO_RES_OK;
2481
}
2482
break;
2483
default:
2484
goto out;
2485
}
2486
ublk_prep_cancel(cmd, issue_flags, ubq, tag);
2487
return -EIOCBQUEUED;
2488
2489
out:
2490
pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
2491
__func__, cmd_op, tag, ret, io->flags);
2492
return ret;
2493
}
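/*
* Per-IO command lifecycle (sketch, server side): for each (q_id, tag)
* the server issues UBLK_IO_FETCH_REQ once to hand the slot to the
* driver; when a block request arrives the uring_cmd completes, the
* server services the I/O, then issues UBLK_IO_COMMIT_AND_FETCH_REQ
* with ublksrv_io_cmd.result set, which completes the request and
* re-arms the slot for the next one, roughly:
*
*   FETCH_REQ -> (cqe: new request) -> handle I/O ->
*   COMMIT_AND_FETCH_REQ(result) -> (cqe: next request) -> ...
*/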
2494
2495
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
2496
u16 q_id, u16 tag, struct ublk_io *io, size_t offset)
2497
{
2498
struct request *req;
2499
2500
/*
2501
* can't use io->req in case of concurrent UBLK_IO_COMMIT_AND_FETCH_REQ,
2502
* which would overwrite it with io->cmd
2503
*/
2504
req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
2505
if (!req)
2506
return NULL;
2507
2508
if (!ublk_get_req_ref(io))
2509
return NULL;
2510
2511
if (unlikely(!blk_mq_request_started(req) || req->tag != tag))
2512
goto fail_put;
2513
2514
if (!ublk_rq_has_data(req))
2515
goto fail_put;
2516
2517
if (offset > blk_rq_bytes(req))
2518
goto fail_put;
2519
2520
return req;
2521
fail_put:
2522
ublk_put_req_ref(io, req);
2523
return NULL;
2524
}
2525
2526
static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd,
2527
unsigned int issue_flags)
2528
{
2529
int ret = ublk_ch_uring_cmd_local(cmd, issue_flags);
2530
2531
if (ret != -EIOCBQUEUED)
2532
io_uring_cmd_done(cmd, ret, issue_flags);
2533
}
2534
2535
static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
2536
{
2537
if (unlikely(issue_flags & IO_URING_F_CANCEL)) {
2538
ublk_uring_cmd_cancel_fn(cmd, issue_flags);
2539
return 0;
2540
}
2541
2542
/* a well-implemented server won't run into the unlocked path */
2543
if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) {
2544
io_uring_cmd_complete_in_task(cmd, ublk_ch_uring_cmd_cb);
2545
return -EIOCBQUEUED;
2546
}
2547
2548
return ublk_ch_uring_cmd_local(cmd, issue_flags);
2549
}
2550
2551
static inline bool ublk_check_ubuf_dir(const struct request *req,
2552
int ubuf_dir)
2553
{
2554
/* copy ubuf to request pages */
2555
if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
2556
ubuf_dir == ITER_SOURCE)
2557
return true;
2558
2559
/* copy request pages to ubuf */
2560
if ((req_op(req) == REQ_OP_WRITE ||
2561
req_op(req) == REQ_OP_ZONE_APPEND) &&
2562
ubuf_dir == ITER_DEST)
2563
return true;
2564
2565
return false;
2566
}
2567
2568
static struct request *ublk_check_and_get_req(struct kiocb *iocb,
2569
struct iov_iter *iter, size_t *off, int dir,
2570
struct ublk_io **io)
2571
{
2572
struct ublk_device *ub = iocb->ki_filp->private_data;
2573
struct ublk_queue *ubq;
2574
struct request *req;
2575
size_t buf_off;
2576
u16 tag, q_id;
2577
2578
if (!ub)
2579
return ERR_PTR(-EACCES);
2580
2581
if (!user_backed_iter(iter))
2582
return ERR_PTR(-EACCES);
2583
2584
if (ub->dev_info.state == UBLK_S_DEV_DEAD)
2585
return ERR_PTR(-EACCES);
2586
2587
tag = ublk_pos_to_tag(iocb->ki_pos);
2588
q_id = ublk_pos_to_hwq(iocb->ki_pos);
2589
buf_off = ublk_pos_to_buf_off(iocb->ki_pos);
2590
2591
if (q_id >= ub->dev_info.nr_hw_queues)
2592
return ERR_PTR(-EINVAL);
2593
2594
ubq = ublk_get_queue(ub, q_id);
2595
if (!ublk_dev_support_user_copy(ub))
2596
return ERR_PTR(-EACCES);
2597
2598
if (tag >= ub->dev_info.queue_depth)
2599
return ERR_PTR(-EINVAL);
2600
2601
*io = &ubq->ios[tag];
2602
req = __ublk_check_and_get_req(ub, q_id, tag, *io, buf_off);
2603
if (!req)
2604
return ERR_PTR(-EINVAL);
2605
2606
if (!req->mq_hctx || !req->mq_hctx->driver_data)
2607
goto fail;
2608
2609
if (!ublk_check_ubuf_dir(req, dir))
2610
goto fail;
2611
2612
*off = buf_off;
2613
return req;
2614
fail:
2615
ublk_put_req_ref(*io, req);
2616
return ERR_PTR(-EACCES);
2617
}
2618
2619
static ssize_t ublk_ch_read_iter(struct kiocb *iocb, struct iov_iter *to)
2620
{
2621
struct request *req;
2622
struct ublk_io *io;
2623
size_t buf_off;
2624
size_t ret;
2625
2626
req = ublk_check_and_get_req(iocb, to, &buf_off, ITER_DEST, &io);
2627
if (IS_ERR(req))
2628
return PTR_ERR(req);
2629
2630
ret = ublk_copy_user_pages(req, buf_off, to, ITER_DEST);
2631
ublk_put_req_ref(io, req);
2632
2633
return ret;
2634
}
2635
2636
static ssize_t ublk_ch_write_iter(struct kiocb *iocb, struct iov_iter *from)
2637
{
2638
struct request *req;
2639
struct ublk_io *io;
2640
size_t buf_off;
2641
size_t ret;
2642
2643
req = ublk_check_and_get_req(iocb, from, &buf_off, ITER_SOURCE, &io);
2644
if (IS_ERR(req))
2645
return PTR_ERR(req);
2646
2647
ret = ublk_copy_user_pages(req, buf_off, from, ITER_SOURCE);
2648
ublk_put_req_ref(io, req);
2649
2650
return ret;
2651
}
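/*
* User-copy data path (sketch, server side): with UBLK_F_USER_COPY the
* server moves request data via pread()/pwrite() on /dev/ublkcN, using
* a file offset that encodes the hw queue, tag and buffer offset (see
* ublk_pos_to_hwq()/ublk_pos_to_tag()/ublk_pos_to_buf_off()); reading
* copies request pages out to the server, writing fills them in.
*/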
2652
2653
static const struct file_operations ublk_ch_fops = {
2654
.owner = THIS_MODULE,
2655
.open = ublk_ch_open,
2656
.release = ublk_ch_release,
2657
.read_iter = ublk_ch_read_iter,
2658
.write_iter = ublk_ch_write_iter,
2659
.uring_cmd = ublk_ch_uring_cmd,
2660
.mmap = ublk_ch_mmap,
2661
};
2662
2663
static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
2664
{
2665
int size = ublk_queue_cmd_buf_size(ub);
2666
struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
2667
int i;
2668
2669
for (i = 0; i < ubq->q_depth; i++) {
2670
struct ublk_io *io = &ubq->ios[i];
2671
if (io->task)
2672
put_task_struct(io->task);
2673
WARN_ON_ONCE(refcount_read(&io->ref));
2674
WARN_ON_ONCE(io->task_registered_buffers);
2675
}
2676
2677
if (ubq->io_cmd_buf)
2678
free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
2679
}
2680
2681
static int ublk_init_queue(struct ublk_device *ub, int q_id)
2682
{
2683
struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
2684
gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
2685
void *ptr;
2686
int size;
2687
2688
spin_lock_init(&ubq->cancel_lock);
2689
ubq->flags = ub->dev_info.flags;
2690
ubq->q_id = q_id;
2691
ubq->q_depth = ub->dev_info.queue_depth;
2692
size = ublk_queue_cmd_buf_size(ub);
2693
2694
ptr = (void *) __get_free_pages(gfp_flags, get_order(size));
2695
if (!ptr)
2696
return -ENOMEM;
2697
2698
ubq->io_cmd_buf = ptr;
2699
ubq->dev = ub;
2700
return 0;
2701
}
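/*
* Note (sketch): io_cmd_buf is the per-queue array of I/O descriptors
* (one ublksrv_io_desc per tag, see ublk_get_iod()) that the server maps
* with mmap() on /dev/ublkcN via ublk_ch_mmap(); it is sized by
* ublk_queue_cmd_buf_size() and allocated as whole pages above.
*/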
2702
2703
static void ublk_deinit_queues(struct ublk_device *ub)
2704
{
2705
int nr_queues = ub->dev_info.nr_hw_queues;
2706
int i;
2707
2708
if (!ub->__queues)
2709
return;
2710
2711
for (i = 0; i < nr_queues; i++)
2712
ublk_deinit_queue(ub, i);
2713
kvfree(ub->__queues);
2714
}
2715
2716
static int ublk_init_queues(struct ublk_device *ub)
2717
{
2718
int nr_queues = ub->dev_info.nr_hw_queues;
2719
int depth = ub->dev_info.queue_depth;
2720
int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io);
2721
int i, ret = -ENOMEM;
2722
2723
ub->queue_size = ubq_size;
2724
ub->__queues = kvcalloc(nr_queues, ubq_size, GFP_KERNEL);
2725
if (!ub->__queues)
2726
return ret;
2727
2728
for (i = 0; i < nr_queues; i++) {
2729
if (ublk_init_queue(ub, i))
2730
goto fail;
2731
}
2732
2733
init_completion(&ub->completion);
2734
return 0;
2735
2736
fail:
2737
ublk_deinit_queues(ub);
2738
return ret;
2739
}
2740
2741
static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
2742
{
2743
int i = idx;
2744
int err;
2745
2746
spin_lock(&ublk_idr_lock);
2747
/* allocate id, if @id >= 0, we're requesting that specific id */
2748
if (i >= 0) {
2749
err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
2750
if (err == -ENOSPC)
2751
err = -EEXIST;
2752
} else {
2753
err = idr_alloc(&ublk_index_idr, ub, 0, UBLK_MAX_UBLKS,
2754
GFP_NOWAIT);
2755
}
2756
spin_unlock(&ublk_idr_lock);
2757
2758
if (err >= 0)
2759
ub->ub_number = err;
2760
2761
return err;
2762
}
2763
2764
static void ublk_free_dev_number(struct ublk_device *ub)
2765
{
2766
spin_lock(&ublk_idr_lock);
2767
idr_remove(&ublk_index_idr, ub->ub_number);
2768
wake_up_all(&ublk_idr_wq);
2769
spin_unlock(&ublk_idr_lock);
2770
}
2771
2772
static void ublk_cdev_rel(struct device *dev)
2773
{
2774
struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);
2775
2776
blk_mq_free_tag_set(&ub->tag_set);
2777
ublk_deinit_queues(ub);
2778
ublk_free_dev_number(ub);
2779
mutex_destroy(&ub->mutex);
2780
mutex_destroy(&ub->cancel_mutex);
2781
kfree(ub);
2782
}
2783
2784
static int ublk_add_chdev(struct ublk_device *ub)
2785
{
2786
struct device *dev = &ub->cdev_dev;
2787
int minor = ub->ub_number;
2788
int ret;
2789
2790
dev->parent = ublk_misc.this_device;
2791
dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
2792
dev->class = &ublk_chr_class;
2793
dev->release = ublk_cdev_rel;
2794
device_initialize(dev);
2795
2796
ret = dev_set_name(dev, "ublkc%d", minor);
2797
if (ret)
2798
goto fail;
2799
2800
cdev_init(&ub->cdev, &ublk_ch_fops);
2801
ret = cdev_device_add(&ub->cdev, dev);
2802
if (ret)
2803
goto fail;
2804
2805
if (ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV)
2806
unprivileged_ublks_added++;
2807
return 0;
2808
fail:
2809
put_device(dev);
2810
return ret;
2811
}
2812
2813
/* align max io buffer size with PAGE_SIZE */
2814
static void ublk_align_max_io_size(struct ublk_device *ub)
2815
{
2816
unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
2817
2818
ub->dev_info.max_io_buf_bytes =
2819
round_down(max_io_bytes, PAGE_SIZE);
2820
}
2821
2822
static int ublk_add_tag_set(struct ublk_device *ub)
2823
{
2824
ub->tag_set.ops = &ublk_mq_ops;
2825
ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
2826
ub->tag_set.queue_depth = ub->dev_info.queue_depth;
2827
ub->tag_set.numa_node = NUMA_NO_NODE;
2828
ub->tag_set.driver_data = ub;
2829
return blk_mq_alloc_tag_set(&ub->tag_set);
2830
}
2831
2832
static void ublk_remove(struct ublk_device *ub)
2833
{
2834
bool unprivileged;
2835
2836
ublk_stop_dev(ub);
2837
cdev_device_del(&ub->cdev, &ub->cdev_dev);
2838
unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
2839
ublk_put_device(ub);
2840
2841
if (unprivileged)
2842
unprivileged_ublks_added--;
2843
}
2844
2845
static struct ublk_device *ublk_get_device_from_id(int idx)
2846
{
2847
struct ublk_device *ub = NULL;
2848
2849
if (idx < 0)
2850
return NULL;
2851
2852
spin_lock(&ublk_idr_lock);
2853
ub = idr_find(&ublk_index_idr, idx);
2854
if (ub)
2855
ub = ublk_get_device(ub);
2856
spin_unlock(&ublk_idr_lock);
2857
2858
return ub;
2859
}
2860
2861
static int ublk_ctrl_start_dev(struct ublk_device *ub,
2862
const struct ublksrv_ctrl_cmd *header)
2863
{
2864
const struct ublk_param_basic *p = &ub->params.basic;
2865
int ublksrv_pid = (int)header->data[0];
2866
struct queue_limits lim = {
2867
.logical_block_size = 1 << p->logical_bs_shift,
2868
.physical_block_size = 1 << p->physical_bs_shift,
2869
.io_min = 1 << p->io_min_shift,
2870
.io_opt = 1 << p->io_opt_shift,
2871
.max_hw_sectors = p->max_sectors,
2872
.chunk_sectors = p->chunk_sectors,
2873
.virt_boundary_mask = p->virt_boundary_mask,
2874
.max_segments = USHRT_MAX,
2875
.max_segment_size = UINT_MAX,
2876
.dma_alignment = 3,
2877
};
2878
struct gendisk *disk;
2879
int ret = -EINVAL;
2880
2881
if (ublksrv_pid <= 0)
2882
return -EINVAL;
2883
if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
2884
return -EINVAL;
2885
2886
if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
2887
const struct ublk_param_discard *pd = &ub->params.discard;
2888
2889
lim.discard_alignment = pd->discard_alignment;
2890
lim.discard_granularity = pd->discard_granularity;
2891
lim.max_hw_discard_sectors = pd->max_discard_sectors;
2892
lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors;
2893
lim.max_discard_segments = pd->max_discard_segments;
2894
}
2895
2896
if (ub->params.types & UBLK_PARAM_TYPE_ZONED) {
2897
const struct ublk_param_zoned *p = &ub->params.zoned;
2898
2899
if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
2900
return -EOPNOTSUPP;
2901
2902
lim.features |= BLK_FEAT_ZONED;
2903
lim.max_active_zones = p->max_active_zones;
2904
lim.max_open_zones = p->max_open_zones;
2905
lim.max_hw_zone_append_sectors = p->max_zone_append_sectors;
2906
}
2907
2908
if (ub->params.basic.attrs & UBLK_ATTR_VOLATILE_CACHE) {
2909
lim.features |= BLK_FEAT_WRITE_CACHE;
2910
if (ub->params.basic.attrs & UBLK_ATTR_FUA)
2911
lim.features |= BLK_FEAT_FUA;
2912
}
2913
2914
if (ub->params.basic.attrs & UBLK_ATTR_ROTATIONAL)
2915
lim.features |= BLK_FEAT_ROTATIONAL;
2916
2917
if (ub->params.types & UBLK_PARAM_TYPE_DMA_ALIGN)
2918
lim.dma_alignment = ub->params.dma.alignment;
2919
2920
if (ub->params.types & UBLK_PARAM_TYPE_SEGMENT) {
2921
lim.seg_boundary_mask = ub->params.seg.seg_boundary_mask;
2922
lim.max_segment_size = ub->params.seg.max_segment_size;
2923
lim.max_segments = ub->params.seg.max_segments;
2924
}
2925
2926
if (wait_for_completion_interruptible(&ub->completion) != 0)
2927
return -EINTR;
2928
2929
if (ub->ublksrv_tgid != ublksrv_pid)
2930
return -EINVAL;
2931
2932
mutex_lock(&ub->mutex);
2933
if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
2934
test_bit(UB_STATE_USED, &ub->state)) {
2935
ret = -EEXIST;
2936
goto out_unlock;
2937
}
2938
2939
disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL);
2940
if (IS_ERR(disk)) {
2941
ret = PTR_ERR(disk);
2942
goto out_unlock;
2943
}
2944
sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
2945
disk->fops = &ub_fops;
2946
disk->private_data = ub;
2947
2948
ub->dev_info.ublksrv_pid = ublksrv_pid;
2949
ub->ub_disk = disk;
2950
2951
ublk_apply_params(ub);
2952
2953
/* don't probe partitions if any daemon task is untrusted */
2954
if (ub->unprivileged_daemons)
2955
set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
2956
2957
ublk_get_device(ub);
2958
ub->dev_info.state = UBLK_S_DEV_LIVE;
2959
2960
if (ublk_dev_is_zoned(ub)) {
2961
ret = ublk_revalidate_disk_zones(ub);
2962
if (ret)
2963
goto out_put_cdev;
2964
}
2965
2966
ret = add_disk(disk);
2967
if (ret)
2968
goto out_put_cdev;
2969
2970
set_bit(UB_STATE_USED, &ub->state);
2971
2972
out_put_cdev:
2973
if (ret) {
2974
ublk_detach_disk(ub);
2975
ublk_put_device(ub);
2976
}
2977
if (ret)
2978
put_disk(disk);
2979
out_unlock:
2980
mutex_unlock(&ub->mutex);
2981
return ret;
2982
}
2983
2984
static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
2985
const struct ublksrv_ctrl_cmd *header)
2986
{
2987
void __user *argp = (void __user *)(unsigned long)header->addr;
2988
cpumask_var_t cpumask;
2989
unsigned long queue;
2990
unsigned int retlen;
2991
unsigned int i;
2992
int ret;
2993
2994
if (header->len * BITS_PER_BYTE < nr_cpu_ids)
2995
return -EINVAL;
2996
if (header->len & (sizeof(unsigned long)-1))
2997
return -EINVAL;
2998
if (!header->addr)
2999
return -EINVAL;
3000
3001
queue = header->data[0];
3002
if (queue >= ub->dev_info.nr_hw_queues)
3003
return -EINVAL;
3004
3005
if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
3006
return -ENOMEM;
3007
3008
for_each_possible_cpu(i) {
3009
if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
3010
cpumask_set_cpu(i, cpumask);
3011
}
3012
3013
ret = -EFAULT;
3014
retlen = min_t(unsigned short, header->len, cpumask_size());
3015
if (copy_to_user(argp, cpumask, retlen))
3016
goto out_free_cpumask;
3017
if (retlen != header->len &&
3018
clear_user(argp + retlen, header->len - retlen))
3019
goto out_free_cpumask;
3020
3021
ret = 0;
3022
out_free_cpumask:
3023
free_cpumask_var(cpumask);
3024
return ret;
3025
}
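/*
* Usage (sketch): for UBLK_CMD_GET_QUEUE_AFFINITY the server passes the
* queue id in header->data[0] and a buffer of header->len bytes at
* header->addr; the driver fills it with the cpumask of CPUs whose
* default blk-mq mapping targets that queue, so the server can pin the
* queue's daemon task accordingly.
*/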
3026
3027
static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
3028
{
3029
pr_devel("%s: dev id %d flags %llx\n", __func__,
3030
info->dev_id, info->flags);
3031
pr_devel("\t nr_hw_queues %d queue_depth %d\n",
3032
info->nr_hw_queues, info->queue_depth);
3033
}
3034
3035
static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
3036
{
3037
void __user *argp = (void __user *)(unsigned long)header->addr;
3038
struct ublksrv_ctrl_dev_info info;
3039
struct ublk_device *ub;
3040
int ret = -EINVAL;
3041
3042
if (header->len < sizeof(info) || !header->addr)
3043
return -EINVAL;
3044
if (header->queue_id != (u16)-1) {
3045
pr_warn("%s: queue_id is wrong %x\n",
3046
__func__, header->queue_id);
3047
return -EINVAL;
3048
}
3049
3050
if (copy_from_user(&info, argp, sizeof(info)))
3051
return -EFAULT;
3052
3053
if (info.queue_depth > UBLK_MAX_QUEUE_DEPTH || !info.queue_depth ||
3054
info.nr_hw_queues > UBLK_MAX_NR_QUEUES || !info.nr_hw_queues)
3055
return -EINVAL;
3056
3057
if (capable(CAP_SYS_ADMIN))
3058
info.flags &= ~UBLK_F_UNPRIVILEGED_DEV;
3059
else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
3060
return -EPERM;
3061
3062
/* forbid nonsense combinations of recovery flags */
3063
switch (info.flags & UBLK_F_ALL_RECOVERY_FLAGS) {
3064
case 0:
3065
case UBLK_F_USER_RECOVERY:
3066
case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_REISSUE):
3067
case (UBLK_F_USER_RECOVERY | UBLK_F_USER_RECOVERY_FAIL_IO):
3068
break;
3069
default:
3070
pr_warn("%s: invalid recovery flags %llx\n", __func__,
3071
info.flags & UBLK_F_ALL_RECOVERY_FLAGS);
3072
return -EINVAL;
3073
}
3074
3075
if ((info.flags & UBLK_F_QUIESCE) && !(info.flags & UBLK_F_USER_RECOVERY)) {
3076
pr_warn("UBLK_F_QUIESCE requires UBLK_F_USER_RECOVERY\n");
3077
return -EINVAL;
3078
}
3079
3080
/*
* An unprivileged device can't be trusted, but RECOVERY and
* RECOVERY_REISSUE may still hang error handling, so recovery
* features can't be supported for unprivileged ublk yet
*
* TODO: provide forward progress for the RECOVERY handler, so that
* unprivileged devices can benefit from it
*/
3088
if (info.flags & UBLK_F_UNPRIVILEGED_DEV) {
3089
info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE |
3090
UBLK_F_USER_RECOVERY);
3091
3092
/*
* For USER_COPY, we depend on userspace to fill the request
* buffer via pwrite() to the ublk char device, which can't be
* used for an unprivileged device
*
* The same applies to zero copy and auto buffer registration.
*/
3099
if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
3100
UBLK_F_AUTO_BUF_REG))
3101
return -EINVAL;
3102
}
3103
3104
/* the created device is always owned by current user */
3105
ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);
3106
3107
if (header->dev_id != info.dev_id) {
3108
pr_warn("%s: dev id not match %u %u\n",
3109
__func__, header->dev_id, info.dev_id);
3110
return -EINVAL;
3111
}
3112
3113
if (header->dev_id != U32_MAX && header->dev_id >= UBLK_MAX_UBLKS) {
3114
pr_warn("%s: dev id is too large. Max supported is %d\n",
3115
__func__, UBLK_MAX_UBLKS - 1);
3116
return -EINVAL;
3117
}
3118
3119
ublk_dump_dev_info(&info);
3120
3121
ret = mutex_lock_killable(&ublk_ctl_mutex);
3122
if (ret)
3123
return ret;
3124
3125
ret = -EACCES;
3126
if ((info.flags & UBLK_F_UNPRIVILEGED_DEV) &&
3127
unprivileged_ublks_added >= unprivileged_ublks_max)
3128
goto out_unlock;
3129
3130
ret = -ENOMEM;
3131
ub = kzalloc(sizeof(*ub), GFP_KERNEL);
3132
if (!ub)
3133
goto out_unlock;
3134
mutex_init(&ub->mutex);
3135
spin_lock_init(&ub->lock);
3136
mutex_init(&ub->cancel_mutex);
3137
3138
ret = ublk_alloc_dev_number(ub, header->dev_id);
3139
if (ret < 0)
3140
goto out_free_ub;
3141
3142
memcpy(&ub->dev_info, &info, sizeof(info));
3143
3144
/* update device id */
3145
ub->dev_info.dev_id = ub->ub_number;
3146
3147
/*
* The 64-bit flags will be copied back to userspace as the feature
* negotiation result, so clear any flags the driver doesn't support
* yet; this way userspace gets the correct set of flags (features)
* to handle.
*/
3153
ub->dev_info.flags &= UBLK_F_ALL;
3154
3155
ub->dev_info.flags |= UBLK_F_CMD_IOCTL_ENCODE |
3156
UBLK_F_URING_CMD_COMP_IN_TASK |
3157
UBLK_F_PER_IO_DAEMON |
3158
UBLK_F_BUF_REG_OFF_DAEMON;
3159
3160
/* GET_DATA isn't needed any more with USER_COPY or ZERO COPY */
3161
if (ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY |
3162
UBLK_F_AUTO_BUF_REG))
3163
ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
3164
3165
/*
* Zoned storage support requires reusing `ublksrv_io_cmd->addr` for
* returning write_append_lba, which is only allowed in case of
* user copy or zero copy
*/
3170
if (ublk_dev_is_zoned(ub) &&
3171
(!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !(ub->dev_info.flags &
3172
(UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)))) {
3173
ret = -EINVAL;
3174
goto out_free_dev_number;
3175
}
3176
3177
ub->dev_info.nr_hw_queues = min_t(unsigned int,
3178
ub->dev_info.nr_hw_queues, nr_cpu_ids);
3179
ublk_align_max_io_size(ub);
3180
3181
ret = ublk_init_queues(ub);
3182
if (ret)
3183
goto out_free_dev_number;
3184
3185
ret = ublk_add_tag_set(ub);
3186
if (ret)
3187
goto out_deinit_queues;
3188
3189
ret = -EFAULT;
3190
if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
3191
goto out_free_tag_set;
3192
3193
/*
* Add the char dev so that the ublksrv daemon can be set up.
* ublk_add_chdev() will clean up everything if it fails.
*/
3197
ret = ublk_add_chdev(ub);
3198
goto out_unlock;
3199
3200
out_free_tag_set:
3201
blk_mq_free_tag_set(&ub->tag_set);
3202
out_deinit_queues:
3203
ublk_deinit_queues(ub);
3204
out_free_dev_number:
3205
ublk_free_dev_number(ub);
3206
out_free_ub:
3207
mutex_destroy(&ub->mutex);
3208
mutex_destroy(&ub->cancel_mutex);
3209
kfree(ub);
3210
out_unlock:
3211
mutex_unlock(&ublk_ctl_mutex);
3212
return ret;
3213
}
3214
3215
static inline bool ublk_idr_freed(int id)
3216
{
3217
void *ptr;
3218
3219
spin_lock(&ublk_idr_lock);
3220
ptr = idr_find(&ublk_index_idr, id);
3221
spin_unlock(&ublk_idr_lock);
3222
3223
return ptr == NULL;
3224
}
3225
3226
static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait)
3227
{
3228
struct ublk_device *ub = *p_ub;
3229
int idx = ub->ub_number;
3230
int ret;
3231
3232
ret = mutex_lock_killable(&ublk_ctl_mutex);
3233
if (ret)
3234
return ret;
3235
3236
if (!test_bit(UB_STATE_DELETED, &ub->state)) {
3237
ublk_remove(ub);
3238
set_bit(UB_STATE_DELETED, &ub->state);
3239
}
3240
3241
/* Mark the reference as consumed */
3242
*p_ub = NULL;
3243
ublk_put_device(ub);
3244
mutex_unlock(&ublk_ctl_mutex);
3245
3246
/*
* Wait until the idr entry is removed, so the device number can be
* reused after the DEL_DEV command returns.
*
* If we return because of a user interrupt, a future delete command
* may come, and then either:
*
* - the device number isn't freed yet: this device won't and needn't
* be deleted again, since UB_STATE_DELETED is set, and the device
* will be released after the last reference is dropped
*
* - the device number is already freed: we will not find this
* device via ublk_get_device_from_id()
*/
3260
if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
3261
return -EINTR;
3262
return 0;
3263
}
3264
3265
static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd)
3266
{
3267
const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
3268
3269
pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
3270
__func__, cmd->cmd_op, header->dev_id, header->queue_id,
3271
header->data[0], header->addr, header->len);
3272
}
3273
3274
static int ublk_ctrl_stop_dev(struct ublk_device *ub)
3275
{
3276
ublk_stop_dev(ub);
3277
return 0;
3278
}
3279
3280
static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
3281
const struct ublksrv_ctrl_cmd *header)
3282
{
3283
void __user *argp = (void __user *)(unsigned long)header->addr;
3284
3285
if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
3286
return -EINVAL;
3287
3288
if (copy_to_user(argp, &ub->dev_info, sizeof(ub->dev_info)))
3289
return -EFAULT;
3290
3291
return 0;
3292
}
3293
3294
/* TYPE_DEVT is readonly, so fill it up before returning to userspace */
3295
static void ublk_ctrl_fill_params_devt(struct ublk_device *ub)
3296
{
3297
ub->params.devt.char_major = MAJOR(ub->cdev_dev.devt);
3298
ub->params.devt.char_minor = MINOR(ub->cdev_dev.devt);
3299
3300
if (ub->ub_disk) {
3301
ub->params.devt.disk_major = MAJOR(disk_devt(ub->ub_disk));
3302
ub->params.devt.disk_minor = MINOR(disk_devt(ub->ub_disk));
3303
} else {
3304
ub->params.devt.disk_major = 0;
3305
ub->params.devt.disk_minor = 0;
3306
}
3307
ub->params.types |= UBLK_PARAM_TYPE_DEVT;
3308
}
3309
3310
static int ublk_ctrl_get_params(struct ublk_device *ub,
3311
const struct ublksrv_ctrl_cmd *header)
3312
{
3313
void __user *argp = (void __user *)(unsigned long)header->addr;
3314
struct ublk_params_header ph;
3315
int ret;
3316
3317
if (header->len <= sizeof(ph) || !header->addr)
3318
return -EINVAL;
3319
3320
if (copy_from_user(&ph, argp, sizeof(ph)))
3321
return -EFAULT;
3322
3323
if (ph.len > header->len || !ph.len)
3324
return -EINVAL;
3325
3326
if (ph.len > sizeof(struct ublk_params))
3327
ph.len = sizeof(struct ublk_params);
3328
3329
mutex_lock(&ub->mutex);
3330
ublk_ctrl_fill_params_devt(ub);
3331
if (copy_to_user(argp, &ub->params, ph.len))
3332
ret = -EFAULT;
3333
else
3334
ret = 0;
3335
mutex_unlock(&ub->mutex);
3336
3337
return ret;
3338
}
3339
3340
static int ublk_ctrl_set_params(struct ublk_device *ub,
3341
const struct ublksrv_ctrl_cmd *header)
3342
{
3343
void __user *argp = (void __user *)(unsigned long)header->addr;
3344
struct ublk_params_header ph;
3345
int ret = -EFAULT;
3346
3347
if (header->len <= sizeof(ph) || !header->addr)
3348
return -EINVAL;
3349
3350
if (copy_from_user(&ph, argp, sizeof(ph)))
3351
return -EFAULT;
3352
3353
if (ph.len > header->len || !ph.len || !ph.types)
3354
return -EINVAL;
3355
3356
if (ph.len > sizeof(struct ublk_params))
3357
ph.len = sizeof(struct ublk_params);
3358
3359
mutex_lock(&ub->mutex);
3360
if (test_bit(UB_STATE_USED, &ub->state)) {
3361
/*
3362
* Parameters can only be changed when device hasn't
3363
* been started yet
3364
*/
3365
ret = -EACCES;
3366
} else if (copy_from_user(&ub->params, argp, ph.len)) {
3367
ret = -EFAULT;
3368
} else {
3369
/* clear all we don't support yet */
3370
ub->params.types &= UBLK_PARAM_TYPE_ALL;
3371
ret = ublk_validate_params(ub);
3372
if (ret)
3373
ub->params.types = 0;
3374
}
3375
mutex_unlock(&ub->mutex);
3376
3377
return ret;
3378
}
3379
3380
static int ublk_ctrl_start_recovery(struct ublk_device *ub,
3381
const struct ublksrv_ctrl_cmd *header)
3382
{
3383
int ret = -EINVAL;
3384
3385
mutex_lock(&ub->mutex);
3386
if (ublk_nosrv_should_stop_dev(ub))
3387
goto out_unlock;
3388
/*
* START_RECOVERY is only allowed after:
*
* (1) UB_STATE_OPEN is not set, which means the dying process has exited
* and the related io_uring ctx is freed, so the file struct of /dev/ublkcX
* is released.
*
* and one of the following holds:
*
* (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
* (a) has quiesced the request queue
* (b) has requeued every inflight rq whose io_flags is ACTIVE
* (c) has requeued/aborted every inflight rq whose io_flags is NOT ACTIVE
* (d) has completed/canceled all ioucmds owned by the dying process
*
* (3) UBLK_S_DEV_FAIL_IO is set, which means the queue is not
* quiesced, but all I/O is being immediately errored
*/
3406
if (test_bit(UB_STATE_OPEN, &ub->state) || !ublk_dev_in_recoverable_state(ub)) {
3407
ret = -EBUSY;
3408
goto out_unlock;
3409
}
3410
pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id);
3411
init_completion(&ub->completion);
3412
ret = 0;
3413
out_unlock:
3414
mutex_unlock(&ub->mutex);
3415
return ret;
3416
}
3417
3418
static int ublk_ctrl_end_recovery(struct ublk_device *ub,
3419
const struct ublksrv_ctrl_cmd *header)
3420
{
3421
int ublksrv_pid = (int)header->data[0];
3422
int ret = -EINVAL;
3423
3424
pr_devel("%s: Waiting for all FETCH_REQs, dev id %d...\n", __func__,
3425
header->dev_id);
3426
3427
if (wait_for_completion_interruptible(&ub->completion))
3428
return -EINTR;
3429
3430
pr_devel("%s: All FETCH_REQs received, dev id %d\n", __func__,
3431
header->dev_id);
3432
3433
if (ub->ublksrv_tgid != ublksrv_pid)
3434
return -EINVAL;
3435
3436
mutex_lock(&ub->mutex);
3437
if (ublk_nosrv_should_stop_dev(ub))
3438
goto out_unlock;
3439
3440
if (!ublk_dev_in_recoverable_state(ub)) {
3441
ret = -EBUSY;
3442
goto out_unlock;
3443
}
3444
ub->dev_info.ublksrv_pid = ublksrv_pid;
3445
ub->dev_info.state = UBLK_S_DEV_LIVE;
3446
pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
3447
__func__, ublksrv_pid, header->dev_id);
3448
blk_mq_kick_requeue_list(ub->ub_disk->queue);
3449
ret = 0;
3450
out_unlock:
3451
mutex_unlock(&ub->mutex);
3452
return ret;
3453
}
3454
3455
static int ublk_ctrl_get_features(const struct ublksrv_ctrl_cmd *header)
3456
{
3457
void __user *argp = (void __user *)(unsigned long)header->addr;
3458
u64 features = UBLK_F_ALL;
3459
3460
if (header->len != UBLK_FEATURES_LEN || !header->addr)
3461
return -EINVAL;
3462
3463
if (copy_to_user(argp, &features, UBLK_FEATURES_LEN))
3464
return -EFAULT;
3465
3466
return 0;
3467
}
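/*
* Usage (sketch): a server typically probes UBLK_U_CMD_GET_FEATURES
* first, passing a UBLK_FEATURES_LEN-byte buffer in header->addr, and
* masks the returned u64 against the UBLK_F_* flags it understands
* before choosing which features to request in UBLK_CMD_ADD_DEV.
*/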
3468
3469
static void ublk_ctrl_set_size(struct ublk_device *ub, const struct ublksrv_ctrl_cmd *header)
3470
{
3471
struct ublk_param_basic *p = &ub->params.basic;
3472
u64 new_size = header->data[0];
3473
3474
mutex_lock(&ub->mutex);
3475
p->dev_sectors = new_size;
3476
set_capacity_and_notify(ub->ub_disk, p->dev_sectors);
3477
mutex_unlock(&ub->mutex);
3478
}
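/*
* Usage (sketch): UBLK_CMD_UPDATE_SIZE passes the new device size, in
* sectors, via header->data[0]; the capacity change takes effect
* immediately through set_capacity_and_notify().
*/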
3479
3480
struct count_busy {
3481
const struct ublk_queue *ubq;
3482
unsigned int nr_busy;
3483
};
3484
3485
static bool ublk_count_busy_req(struct request *rq, void *data)
3486
{
3487
struct count_busy *idle = data;
3488
3489
if (!blk_mq_request_started(rq) && rq->mq_hctx->driver_data == idle->ubq)
3490
idle->nr_busy += 1;
3491
return true;
3492
}
3493
3494
/* uring_cmd is guaranteed to be active if the associated request is idle */
3495
static bool ubq_has_idle_io(const struct ublk_queue *ubq)
3496
{
3497
struct count_busy data = {
3498
.ubq = ubq,
3499
};
3500
3501
blk_mq_tagset_busy_iter(&ubq->dev->tag_set, ublk_count_busy_req, &data);
3502
return data.nr_busy < ubq->q_depth;
3503
}
3504
3505
/* Wait until each hw queue has at least one idle IO */
3506
static int ublk_wait_for_idle_io(struct ublk_device *ub,
3507
unsigned int timeout_ms)
3508
{
3509
unsigned int elapsed = 0;
3510
int ret;
3511
3512
while (elapsed < timeout_ms && !signal_pending(current)) {
3513
unsigned int queues_cancelable = 0;
3514
int i;
3515
3516
for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
3517
struct ublk_queue *ubq = ublk_get_queue(ub, i);
3518
3519
queues_cancelable += !!ubq_has_idle_io(ubq);
3520
}
3521
3522
/*
* Each queue needs at least one active command for
* notifying the ublk server
*/
3526
if (queues_cancelable == ub->dev_info.nr_hw_queues)
3527
break;
3528
3529
msleep(UBLK_REQUEUE_DELAY_MS);
3530
elapsed += UBLK_REQUEUE_DELAY_MS;
3531
}
3532
3533
if (signal_pending(current))
3534
ret = -EINTR;
3535
else if (elapsed >= timeout_ms)
3536
ret = -EBUSY;
3537
else
3538
ret = 0;
3539
3540
return ret;
3541
}
3542
3543
static int ublk_ctrl_quiesce_dev(struct ublk_device *ub,
3544
const struct ublksrv_ctrl_cmd *header)
3545
{
3546
/* zero means wait forever */
3547
u64 timeout_ms = header->data[0];
3548
struct gendisk *disk;
3549
int ret = -ENODEV;
3550
3551
if (!(ub->dev_info.flags & UBLK_F_QUIESCE))
3552
return -EOPNOTSUPP;
3553
3554
mutex_lock(&ub->mutex);
3555
disk = ublk_get_disk(ub);
3556
if (!disk)
3557
goto unlock;
3558
if (ub->dev_info.state == UBLK_S_DEV_DEAD)
3559
goto put_disk;
3560
3561
ret = 0;
3562
/* already in expected state */
3563
if (ub->dev_info.state != UBLK_S_DEV_LIVE)
3564
goto put_disk;
3565
3566
/* Mark the device as canceling */
3567
mutex_lock(&ub->cancel_mutex);
3568
blk_mq_quiesce_queue(disk->queue);
3569
ublk_set_canceling(ub, true);
3570
blk_mq_unquiesce_queue(disk->queue);
3571
mutex_unlock(&ub->cancel_mutex);
3572
3573
if (!timeout_ms)
3574
timeout_ms = UINT_MAX;
3575
ret = ublk_wait_for_idle_io(ub, timeout_ms);
3576
3577
put_disk:
3578
ublk_put_disk(disk);
3579
unlock:
3580
mutex_unlock(&ub->mutex);
3581
3582
/* Cancel pending uring_cmd */
3583
if (!ret)
3584
ublk_cancel_dev(ub);
3585
return ret;
3586
}
3587
3588
/*
3589
* All control commands are sent via /dev/ublk-control, so we have to check
3590
* the destination device's permission
3591
*/
3592
static int ublk_char_dev_permission(struct ublk_device *ub,
3593
const char *dev_path, int mask)
3594
{
3595
int err;
3596
struct path path;
3597
struct kstat stat;
3598
3599
err = kern_path(dev_path, LOOKUP_FOLLOW, &path);
3600
if (err)
3601
return err;
3602
3603
err = vfs_getattr(&path, &stat, STATX_TYPE, AT_STATX_SYNC_AS_STAT);
3604
if (err)
3605
goto exit;
3606
3607
err = -EPERM;
3608
if (stat.rdev != ub->cdev_dev.devt || !S_ISCHR(stat.mode))
3609
goto exit;
3610
3611
err = inode_permission(&nop_mnt_idmap,
3612
d_backing_inode(path.dentry), mask);
3613
exit:
3614
path_put(&path);
3615
return err;
3616
}
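/*
* Example (sketch): for an unprivileged device, a command such as
* UBLK_CMD_GET_PARAMS must carry the /dev/ublkcN path in its payload;
* the driver resolves that path, verifies it is really this device's
* char dev, and then checks the caller's MAY_READ/MAY_WRITE permission
* on the inode instead of requiring CAP_SYS_ADMIN.
*/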
3617
3618
static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
3619
struct io_uring_cmd *cmd)
3620
{
3621
struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)io_uring_sqe_cmd(cmd->sqe);
3622
bool unprivileged = ub->dev_info.flags & UBLK_F_UNPRIVILEGED_DEV;
3623
void __user *argp = (void __user *)(unsigned long)header->addr;
3624
char *dev_path = NULL;
3625
int ret = 0;
3626
int mask;
3627
3628
if (!unprivileged) {
3629
if (!capable(CAP_SYS_ADMIN))
3630
return -EPERM;
3631
/*
* The newly added command UBLK_CMD_GET_DEV_INFO2 includes
* char_dev_path in its payload too, since userspace may not
* know whether the specified device was created in
* unprivileged mode.
*/
3637
if (_IOC_NR(cmd->cmd_op) != UBLK_CMD_GET_DEV_INFO2)
3638
return 0;
3639
}
3640
3641
/*
3642
* User has to provide the char device path for unprivileged ublk
3643
*
3644
* header->addr always points to the dev path buffer, and
3645
* header->dev_path_len records length of dev path buffer.
3646
*/
3647
if (!header->dev_path_len || header->dev_path_len > PATH_MAX)
3648
return -EINVAL;
3649
3650
if (header->len < header->dev_path_len)
3651
return -EINVAL;
3652
3653
dev_path = memdup_user_nul(argp, header->dev_path_len);
3654
if (IS_ERR(dev_path))
3655
return PTR_ERR(dev_path);
3656
3657
ret = -EINVAL;
3658
switch (_IOC_NR(cmd->cmd_op)) {
3659
case UBLK_CMD_GET_DEV_INFO:
3660
case UBLK_CMD_GET_DEV_INFO2:
3661
case UBLK_CMD_GET_QUEUE_AFFINITY:
3662
case UBLK_CMD_GET_PARAMS:
3663
case (_IOC_NR(UBLK_U_CMD_GET_FEATURES)):
3664
mask = MAY_READ;
3665
break;
3666
case UBLK_CMD_START_DEV:
3667
case UBLK_CMD_STOP_DEV:
3668
case UBLK_CMD_ADD_DEV:
3669
case UBLK_CMD_DEL_DEV:
3670
case UBLK_CMD_SET_PARAMS:
3671
case UBLK_CMD_START_USER_RECOVERY:
3672
case UBLK_CMD_END_USER_RECOVERY:
3673
case UBLK_CMD_UPDATE_SIZE:
3674
case UBLK_CMD_QUIESCE_DEV:
3675
mask = MAY_READ | MAY_WRITE;
3676
break;
3677
default:
3678
goto exit;
3679
}
3680
3681
ret = ublk_char_dev_permission(ub, dev_path, mask);
3682
if (!ret) {
3683
header->len -= header->dev_path_len;
3684
header->addr += header->dev_path_len;
3685
}
3686
pr_devel("%s: dev id %d cmd_op %x uid %d gid %d path %s ret %d\n",
3687
__func__, ub->ub_number, cmd->cmd_op,
3688
ub->dev_info.owner_uid, ub->dev_info.owner_gid,
3689
dev_path, ret);
3690
exit:
3691
kfree(dev_path);
3692
return ret;
3693
}
3694
3695
static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
3696
unsigned int issue_flags)
3697
{
3698
const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
3699
struct ublk_device *ub = NULL;
3700
u32 cmd_op = cmd->cmd_op;
3701
int ret = -EINVAL;
3702
3703
if (issue_flags & IO_URING_F_NONBLOCK)
3704
return -EAGAIN;
3705
3706
ublk_ctrl_cmd_dump(cmd);
3707
3708
if (!(issue_flags & IO_URING_F_SQE128))
3709
goto out;
3710
3711
ret = ublk_check_cmd_op(cmd_op);
3712
if (ret)
3713
goto out;
3714
3715
if (cmd_op == UBLK_U_CMD_GET_FEATURES) {
3716
ret = ublk_ctrl_get_features(header);
3717
goto out;
3718
}
3719
3720
if (_IOC_NR(cmd_op) != UBLK_CMD_ADD_DEV) {
3721
ret = -ENODEV;
3722
ub = ublk_get_device_from_id(header->dev_id);
3723
if (!ub)
3724
goto out;
3725
3726
ret = ublk_ctrl_uring_cmd_permission(ub, cmd);
3727
if (ret)
3728
goto put_dev;
3729
}
3730
3731
switch (_IOC_NR(cmd_op)) {
3732
case UBLK_CMD_START_DEV:
3733
ret = ublk_ctrl_start_dev(ub, header);
3734
break;
3735
case UBLK_CMD_STOP_DEV:
3736
ret = ublk_ctrl_stop_dev(ub);
3737
break;
3738
case UBLK_CMD_GET_DEV_INFO:
3739
case UBLK_CMD_GET_DEV_INFO2:
3740
ret = ublk_ctrl_get_dev_info(ub, header);
3741
break;
3742
case UBLK_CMD_ADD_DEV:
3743
ret = ublk_ctrl_add_dev(header);
3744
break;
3745
case UBLK_CMD_DEL_DEV:
3746
ret = ublk_ctrl_del_dev(&ub, true);
3747
break;
3748
case UBLK_CMD_DEL_DEV_ASYNC:
3749
ret = ublk_ctrl_del_dev(&ub, false);
3750
break;
3751
case UBLK_CMD_GET_QUEUE_AFFINITY:
3752
ret = ublk_ctrl_get_queue_affinity(ub, header);
3753
break;
3754
case UBLK_CMD_GET_PARAMS:
3755
ret = ublk_ctrl_get_params(ub, header);
3756
break;
3757
case UBLK_CMD_SET_PARAMS:
3758
ret = ublk_ctrl_set_params(ub, header);
3759
break;
3760
case UBLK_CMD_START_USER_RECOVERY:
3761
ret = ublk_ctrl_start_recovery(ub, header);
3762
break;
3763
case UBLK_CMD_END_USER_RECOVERY:
3764
ret = ublk_ctrl_end_recovery(ub, header);
3765
break;
3766
case UBLK_CMD_UPDATE_SIZE:
3767
ublk_ctrl_set_size(ub, header);
3768
ret = 0;
3769
break;
3770
case UBLK_CMD_QUIESCE_DEV:
3771
ret = ublk_ctrl_quiesce_dev(ub, header);
3772
break;
3773
default:
3774
ret = -EOPNOTSUPP;
3775
break;
3776
}
3777
3778
put_dev:
3779
if (ub)
3780
ublk_put_device(ub);
3781
out:
3782
pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
3783
__func__, ret, cmd->cmd_op, header->dev_id, header->queue_id);
3784
return ret;
3785
}
3786
3787
static const struct file_operations ublk_ctl_fops = {
3788
.open = nonseekable_open,
3789
.uring_cmd = ublk_ctrl_uring_cmd,
3790
.owner = THIS_MODULE,
3791
.llseek = noop_llseek,
3792
};
3793
3794
static struct miscdevice ublk_misc = {
3795
.minor = MISC_DYNAMIC_MINOR,
3796
.name = "ublk-control",
3797
.fops = &ublk_ctl_fops,
3798
};
3799
3800
static int __init ublk_init(void)
3801
{
3802
int ret;
3803
3804
BUILD_BUG_ON((u64)UBLKSRV_IO_BUF_OFFSET +
3805
UBLKSRV_IO_BUF_TOTAL_SIZE < UBLKSRV_IO_BUF_OFFSET);
3806
BUILD_BUG_ON(sizeof(struct ublk_auto_buf_reg) != 8);
3807
3808
init_waitqueue_head(&ublk_idr_wq);
3809
3810
ret = misc_register(&ublk_misc);
3811
if (ret)
3812
return ret;
3813
3814
ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
3815
if (ret)
3816
goto unregister_mis;
3817
3818
ret = class_register(&ublk_chr_class);
3819
if (ret)
3820
goto free_chrdev_region;
3821
3822
return 0;
3823
3824
free_chrdev_region:
3825
unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
3826
unregister_mis:
3827
misc_deregister(&ublk_misc);
3828
return ret;
3829
}
3830
3831
static void __exit ublk_exit(void)
3832
{
3833
struct ublk_device *ub;
3834
int id;
3835
3836
idr_for_each_entry(&ublk_index_idr, ub, id)
3837
ublk_remove(ub);
3838
3839
class_unregister(&ublk_chr_class);
3840
misc_deregister(&ublk_misc);
3841
3842
idr_destroy(&ublk_index_idr);
3843
unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
3844
}
3845
3846
module_init(ublk_init);
3847
module_exit(ublk_exit);
3848
3849
static int ublk_set_max_unprivileged_ublks(const char *buf,
3850
const struct kernel_param *kp)
3851
{
3852
return param_set_uint_minmax(buf, kp, 0, UBLK_MAX_UBLKS);
3853
}
3854
3855
static int ublk_get_max_unprivileged_ublks(char *buf,
3856
const struct kernel_param *kp)
3857
{
3858
return sysfs_emit(buf, "%u\n", unprivileged_ublks_max);
3859
}
3860
3861
static const struct kernel_param_ops ublk_max_unprivileged_ublks_ops = {
3862
.set = ublk_set_max_unprivileged_ublks,
3863
.get = ublk_get_max_unprivileged_ublks,
3864
};
3865
3866
module_param_cb(ublks_max, &ublk_max_unprivileged_ublks_ops,
3867
&unprivileged_ublks_max, 0644);
3868
MODULE_PARM_DESC(ublks_max, "max number of unprivileged ublk devices allowed to add (default: 64)");
3869
3870
MODULE_AUTHOR("Ming Lei <[email protected]>");
3871
MODULE_DESCRIPTION("Userspace block device");
3872
MODULE_LICENSE("GPL");
3873
3874