GitHub Repository: torvalds/linux
Path: blob/master/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
1
/*
2
* Copyright 2008 Jerome Glisse.
3
* All Rights Reserved.
4
*
5
* Permission is hereby granted, free of charge, to any person obtaining a
6
* copy of this software and associated documentation files (the "Software"),
7
* to deal in the Software without restriction, including without limitation
8
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
9
* and/or sell copies of the Software, and to permit persons to whom the
10
* Software is furnished to do so, subject to the following conditions:
11
*
12
* The above copyright notice and this permission notice (including the next
13
* paragraph) shall be included in all copies or substantial portions of the
14
* Software.
15
*
16
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
* PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
22
* DEALINGS IN THE SOFTWARE.
23
*
24
* Authors:
25
* Jerome Glisse <[email protected]>
26
*/
27
28
#include <linux/file.h>
29
#include <linux/pagemap.h>
30
#include <linux/sync_file.h>
31
#include <linux/dma-buf.h>
32
#include <linux/hmm.h>
33
34
#include <drm/amdgpu_drm.h>
35
#include <drm/drm_syncobj.h>
36
#include <drm/ttm/ttm_tt.h>
37
38
#include "amdgpu_cs.h"
39
#include "amdgpu.h"
40
#include "amdgpu_trace.h"
41
#include "amdgpu_gmc.h"
42
#include "amdgpu_gem.h"
43
#include "amdgpu_ras.h"
44
45
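/* Set up the parser: take a reference on the submission context, reject
 * submissions on contexts already marked guilty, and initialize the sync
 * object and drm_exec state used to lock BOs later on.
 */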
static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p,
46
struct amdgpu_device *adev,
47
struct drm_file *filp,
48
union drm_amdgpu_cs *cs)
49
{
50
struct amdgpu_fpriv *fpriv = filp->driver_priv;
51
52
if (cs->in.num_chunks == 0)
53
return -EINVAL;
54
55
memset(p, 0, sizeof(*p));
56
p->adev = adev;
57
p->filp = filp;
58
59
p->ctx = amdgpu_ctx_get(fpriv, cs->in.ctx_id);
60
if (!p->ctx)
61
return -EINVAL;
62
63
if (atomic_read(&p->ctx->guilty)) {
64
amdgpu_ctx_put(p->ctx);
65
return -ECANCELED;
66
}
67
68
amdgpu_sync_create(&p->sync);
69
drm_exec_init(&p->exec, DRM_EXEC_INTERRUPTIBLE_WAIT |
70
DRM_EXEC_IGNORE_DUPLICATES, 0);
71
return 0;
72
}
73
74
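/* Map an IB chunk to a gang member: return the index of an existing job that
 * uses the same scheduler entity, or grow the gang by one if there is room.
 */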
static int amdgpu_cs_job_idx(struct amdgpu_cs_parser *p,
75
struct drm_amdgpu_cs_chunk_ib *chunk_ib)
76
{
77
struct drm_sched_entity *entity;
78
unsigned int i;
79
int r;
80
81
r = amdgpu_ctx_get_entity(p->ctx, chunk_ib->ip_type,
82
chunk_ib->ip_instance,
83
chunk_ib->ring, &entity);
84
if (r)
85
return r;
86
87
/*
88
* Abort if there is no run queue associated with this entity.
89
* Possibly because of disabled HW IP.
90
*/
91
if (entity->rq == NULL)
92
return -EINVAL;
93
94
/* Check if we can add this IB to some existing job */
95
for (i = 0; i < p->gang_size; ++i)
96
if (p->entities[i] == entity)
97
return i;
98
99
/* If not, increase the gang size if possible */
100
if (i == AMDGPU_CS_GANG_SIZE)
101
return -EINVAL;
102
103
p->entities[i] = entity;
104
p->gang_size = i + 1;
105
return i;
106
}
107
108
static int amdgpu_cs_p1_ib(struct amdgpu_cs_parser *p,
109
struct drm_amdgpu_cs_chunk_ib *chunk_ib,
110
unsigned int *num_ibs)
111
{
112
int r;
113
114
r = amdgpu_cs_job_idx(p, chunk_ib);
115
if (r < 0)
116
return r;
117
118
if (num_ibs[r] >= amdgpu_ring_max_ibs(chunk_ib->ip_type))
119
return -EINVAL;
120
121
++(num_ibs[r]);
122
p->gang_leader_idx = r;
123
return 0;
124
}
125
126
static int amdgpu_cs_p1_user_fence(struct amdgpu_cs_parser *p,
127
struct drm_amdgpu_cs_chunk_fence *data,
128
uint32_t *offset)
129
{
130
struct drm_gem_object *gobj;
131
unsigned long size;
132
133
gobj = drm_gem_object_lookup(p->filp, data->handle);
134
if (gobj == NULL)
135
return -EINVAL;
136
137
p->uf_bo = amdgpu_bo_ref(gem_to_amdgpu_bo(gobj));
138
drm_gem_object_put(gobj);
139
140
size = amdgpu_bo_size(p->uf_bo);
141
if (size != PAGE_SIZE || data->offset > (size - 8))
142
return -EINVAL;
143
144
if (amdgpu_ttm_tt_get_usermm(p->uf_bo->tbo.ttm))
145
return -EINVAL;
146
147
*offset = data->offset;
148
return 0;
149
}
150
151
static int amdgpu_cs_p1_bo_handles(struct amdgpu_cs_parser *p,
152
struct drm_amdgpu_bo_list_in *data)
153
{
154
struct drm_amdgpu_bo_list_entry *info;
155
int r;
156
157
r = amdgpu_bo_create_list_entry_array(data, &info);
158
if (r)
159
return r;
160
161
r = amdgpu_bo_list_create(p->adev, p->filp, info, data->bo_number,
162
&p->bo_list);
163
if (r)
164
goto error_free;
165
166
kvfree(info);
167
return 0;
168
169
error_free:
170
kvfree(info);
171
172
return r;
173
}
174
175
/* Copy the data from userspace and go over it the first time */
176
static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
177
union drm_amdgpu_cs *cs)
178
{
179
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
180
unsigned int num_ibs[AMDGPU_CS_GANG_SIZE] = { };
181
struct amdgpu_vm *vm = &fpriv->vm;
182
uint64_t *chunk_array;
183
uint32_t uf_offset = 0;
184
size_t size;
185
int ret;
186
int i;
187
188
chunk_array = memdup_array_user(u64_to_user_ptr(cs->in.chunks),
189
cs->in.num_chunks,
190
sizeof(uint64_t));
191
if (IS_ERR(chunk_array))
192
return PTR_ERR(chunk_array);
193
194
p->nchunks = cs->in.num_chunks;
195
p->chunks = kvmalloc_array(p->nchunks, sizeof(struct amdgpu_cs_chunk),
196
GFP_KERNEL);
197
if (!p->chunks) {
198
ret = -ENOMEM;
199
goto free_chunk;
200
}
201
202
for (i = 0; i < p->nchunks; i++) {
203
struct drm_amdgpu_cs_chunk __user *chunk_ptr = NULL;
204
struct drm_amdgpu_cs_chunk user_chunk;
205
206
chunk_ptr = u64_to_user_ptr(chunk_array[i]);
207
if (copy_from_user(&user_chunk, chunk_ptr,
208
sizeof(struct drm_amdgpu_cs_chunk))) {
209
ret = -EFAULT;
210
i--;
211
goto free_partial_kdata;
212
}
213
p->chunks[i].chunk_id = user_chunk.chunk_id;
214
p->chunks[i].length_dw = user_chunk.length_dw;
215
216
size = p->chunks[i].length_dw;
217
218
p->chunks[i].kdata = vmemdup_array_user(u64_to_user_ptr(user_chunk.chunk_data),
219
size,
220
sizeof(uint32_t));
221
if (IS_ERR(p->chunks[i].kdata)) {
222
ret = PTR_ERR(p->chunks[i].kdata);
223
i--;
224
goto free_partial_kdata;
225
}
226
size *= sizeof(uint32_t);
227
228
/* Assume the worst on the following checks */
229
ret = -EINVAL;
230
switch (p->chunks[i].chunk_id) {
231
case AMDGPU_CHUNK_ID_IB:
232
if (size < sizeof(struct drm_amdgpu_cs_chunk_ib))
233
goto free_partial_kdata;
234
235
ret = amdgpu_cs_p1_ib(p, p->chunks[i].kdata, num_ibs);
236
if (ret)
237
goto free_partial_kdata;
238
break;
239
240
case AMDGPU_CHUNK_ID_FENCE:
241
if (size < sizeof(struct drm_amdgpu_cs_chunk_fence))
242
goto free_partial_kdata;
243
244
ret = amdgpu_cs_p1_user_fence(p, p->chunks[i].kdata,
245
&uf_offset);
246
if (ret)
247
goto free_partial_kdata;
248
break;
249
250
case AMDGPU_CHUNK_ID_BO_HANDLES:
251
if (size < sizeof(struct drm_amdgpu_bo_list_in))
252
goto free_partial_kdata;
253
254
/* Only a single BO list is allowed to simplify handling. */
255
if (p->bo_list)
256
goto free_partial_kdata;
257
258
ret = amdgpu_cs_p1_bo_handles(p, p->chunks[i].kdata);
259
if (ret)
260
goto free_partial_kdata;
261
break;
262
263
case AMDGPU_CHUNK_ID_DEPENDENCIES:
264
case AMDGPU_CHUNK_ID_SYNCOBJ_IN:
265
case AMDGPU_CHUNK_ID_SYNCOBJ_OUT:
266
case AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES:
267
case AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT:
268
case AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL:
269
case AMDGPU_CHUNK_ID_CP_GFX_SHADOW:
270
break;
271
272
default:
273
goto free_partial_kdata;
274
}
275
}
276
277
if (!p->gang_size || (amdgpu_sriov_vf(p->adev) && p->gang_size > 1)) {
278
ret = -EINVAL;
279
goto free_all_kdata;
280
}
281
282
for (i = 0; i < p->gang_size; ++i) {
283
ret = amdgpu_job_alloc(p->adev, vm, p->entities[i], vm,
284
num_ibs[i], &p->jobs[i],
285
p->filp->client_id);
286
if (ret)
287
goto free_all_kdata;
288
switch (p->adev->enforce_isolation[fpriv->xcp_id]) {
289
case AMDGPU_ENFORCE_ISOLATION_DISABLE:
290
default:
291
p->jobs[i]->enforce_isolation = false;
292
p->jobs[i]->run_cleaner_shader = false;
293
break;
294
case AMDGPU_ENFORCE_ISOLATION_ENABLE:
295
p->jobs[i]->enforce_isolation = true;
296
p->jobs[i]->run_cleaner_shader = true;
297
break;
298
case AMDGPU_ENFORCE_ISOLATION_ENABLE_LEGACY:
299
p->jobs[i]->enforce_isolation = true;
300
p->jobs[i]->run_cleaner_shader = false;
301
break;
302
case AMDGPU_ENFORCE_ISOLATION_NO_CLEANER_SHADER:
303
p->jobs[i]->enforce_isolation = true;
304
p->jobs[i]->run_cleaner_shader = false;
305
break;
306
}
307
}
308
p->gang_leader = p->jobs[p->gang_leader_idx];
309
310
if (p->ctx->generation != p->gang_leader->generation) {
311
ret = -ECANCELED;
312
goto free_all_kdata;
313
}
314
315
if (p->uf_bo)
316
p->gang_leader->uf_addr = uf_offset;
317
kvfree(chunk_array);
318
319
/* Use this opportunity to fill in task info for the vm */
320
amdgpu_vm_set_task_info(vm);
321
322
return 0;
323
324
free_all_kdata:
325
i = p->nchunks - 1;
326
free_partial_kdata:
327
for (; i >= 0; i--)
328
kvfree(p->chunks[i].kdata);
329
kvfree(p->chunks);
330
p->chunks = NULL;
331
p->nchunks = 0;
332
free_chunk:
333
kvfree(chunk_array);
334
335
return ret;
336
}
337
338
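/* Second-pass handling of an IB chunk: allocate the amdgpu_ib inside the
 * right job and enforce the per-submission CE/DE preemption limits.
 */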
static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
339
struct amdgpu_cs_chunk *chunk,
340
unsigned int *ce_preempt,
341
unsigned int *de_preempt)
342
{
343
struct drm_amdgpu_cs_chunk_ib *chunk_ib = chunk->kdata;
344
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
345
struct amdgpu_vm *vm = &fpriv->vm;
346
struct amdgpu_ring *ring;
347
struct amdgpu_job *job;
348
struct amdgpu_ib *ib;
349
int r;
350
351
r = amdgpu_cs_job_idx(p, chunk_ib);
352
if (r < 0)
353
return r;
354
355
job = p->jobs[r];
356
ring = amdgpu_job_ring(job);
357
ib = &job->ibs[job->num_ibs++];
358
359
/* submissions to kernel queues are disabled */
360
if (ring->no_user_submission)
361
return -EINVAL;
362
363
/* MM engine doesn't support user fences */
364
if (p->uf_bo && ring->funcs->no_user_fence)
365
return -EINVAL;
366
367
if (chunk_ib->ip_type == AMDGPU_HW_IP_GFX &&
368
chunk_ib->flags & AMDGPU_IB_FLAG_PREEMPT) {
369
if (chunk_ib->flags & AMDGPU_IB_FLAG_CE)
370
(*ce_preempt)++;
371
else
372
(*de_preempt)++;
373
374
/* Each GFX command submit allows only 1 IB max
375
* preemptible for CE & DE */
376
if (*ce_preempt > 1 || *de_preempt > 1)
377
return -EINVAL;
378
}
379
380
if (chunk_ib->flags & AMDGPU_IB_FLAG_PREAMBLE)
381
job->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT;
382
383
r = amdgpu_ib_get(p->adev, vm, ring->funcs->parse_cs ?
384
chunk_ib->ib_bytes : 0,
385
AMDGPU_IB_POOL_DELAYED, ib);
386
if (r) {
387
drm_err(adev_to_drm(p->adev), "Failed to get ib !\n");
388
return r;
389
}
390
391
ib->gpu_addr = chunk_ib->va_start;
392
ib->length_dw = chunk_ib->ib_bytes / 4;
393
ib->flags = chunk_ib->flags;
394
return 0;
395
}
396
397
static int amdgpu_cs_p2_dependencies(struct amdgpu_cs_parser *p,
398
struct amdgpu_cs_chunk *chunk)
399
{
400
struct drm_amdgpu_cs_chunk_dep *deps = chunk->kdata;
401
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
402
unsigned int num_deps;
403
int i, r;
404
405
num_deps = chunk->length_dw * 4 /
406
sizeof(struct drm_amdgpu_cs_chunk_dep);
407
408
for (i = 0; i < num_deps; ++i) {
409
struct amdgpu_ctx *ctx;
410
struct drm_sched_entity *entity;
411
struct dma_fence *fence;
412
413
ctx = amdgpu_ctx_get(fpriv, deps[i].ctx_id);
414
if (ctx == NULL)
415
return -EINVAL;
416
417
r = amdgpu_ctx_get_entity(ctx, deps[i].ip_type,
418
deps[i].ip_instance,
419
deps[i].ring, &entity);
420
if (r) {
421
amdgpu_ctx_put(ctx);
422
return r;
423
}
424
425
fence = amdgpu_ctx_get_fence(ctx, entity, deps[i].handle);
426
amdgpu_ctx_put(ctx);
427
428
if (IS_ERR(fence))
429
return PTR_ERR(fence);
430
else if (!fence)
431
continue;
432
433
if (chunk->chunk_id == AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES) {
434
struct drm_sched_fence *s_fence;
435
struct dma_fence *old = fence;
436
437
s_fence = to_drm_sched_fence(fence);
438
fence = dma_fence_get(&s_fence->scheduled);
439
dma_fence_put(old);
440
}
441
442
r = amdgpu_sync_fence(&p->sync, fence, GFP_KERNEL);
443
dma_fence_put(fence);
444
if (r)
445
return r;
446
}
447
return 0;
448
}
449
450
static int amdgpu_syncobj_lookup_and_add(struct amdgpu_cs_parser *p,
451
uint32_t handle, u64 point,
452
u64 flags)
453
{
454
struct dma_fence *fence;
455
int r;
456
457
r = drm_syncobj_find_fence(p->filp, handle, point, flags, &fence);
458
if (r) {
459
drm_err(adev_to_drm(p->adev), "syncobj %u failed to find fence @ %llu (%d)!\n",
460
handle, point, r);
461
return r;
462
}
463
464
r = amdgpu_sync_fence(&p->sync, fence, GFP_KERNEL);
465
dma_fence_put(fence);
466
return r;
467
}
468
469
static int amdgpu_cs_p2_syncobj_in(struct amdgpu_cs_parser *p,
470
struct amdgpu_cs_chunk *chunk)
471
{
472
struct drm_amdgpu_cs_chunk_sem *deps = chunk->kdata;
473
unsigned int num_deps;
474
int i, r;
475
476
num_deps = chunk->length_dw * 4 /
477
sizeof(struct drm_amdgpu_cs_chunk_sem);
478
for (i = 0; i < num_deps; ++i) {
479
r = amdgpu_syncobj_lookup_and_add(p, deps[i].handle, 0, 0);
480
if (r)
481
return r;
482
}
483
484
return 0;
485
}
486
487
static int amdgpu_cs_p2_syncobj_timeline_wait(struct amdgpu_cs_parser *p,
488
struct amdgpu_cs_chunk *chunk)
489
{
490
struct drm_amdgpu_cs_chunk_syncobj *syncobj_deps = chunk->kdata;
491
unsigned int num_deps;
492
int i, r;
493
494
num_deps = chunk->length_dw * 4 /
495
sizeof(struct drm_amdgpu_cs_chunk_syncobj);
496
for (i = 0; i < num_deps; ++i) {
497
r = amdgpu_syncobj_lookup_and_add(p, syncobj_deps[i].handle,
498
syncobj_deps[i].point,
499
syncobj_deps[i].flags);
500
if (r)
501
return r;
502
}
503
504
return 0;
505
}
506
507
static int amdgpu_cs_p2_syncobj_out(struct amdgpu_cs_parser *p,
508
struct amdgpu_cs_chunk *chunk)
509
{
510
struct drm_amdgpu_cs_chunk_sem *deps = chunk->kdata;
511
unsigned int num_deps;
512
int i;
513
514
num_deps = chunk->length_dw * 4 /
515
sizeof(struct drm_amdgpu_cs_chunk_sem);
516
517
if (p->post_deps)
518
return -EINVAL;
519
520
p->post_deps = kmalloc_array(num_deps, sizeof(*p->post_deps),
521
GFP_KERNEL);
522
p->num_post_deps = 0;
523
524
if (!p->post_deps)
525
return -ENOMEM;
526
527
528
for (i = 0; i < num_deps; ++i) {
529
p->post_deps[i].syncobj =
530
drm_syncobj_find(p->filp, deps[i].handle);
531
if (!p->post_deps[i].syncobj)
532
return -EINVAL;
533
p->post_deps[i].chain = NULL;
534
p->post_deps[i].point = 0;
535
p->num_post_deps++;
536
}
537
538
return 0;
539
}
540
541
static int amdgpu_cs_p2_syncobj_timeline_signal(struct amdgpu_cs_parser *p,
542
struct amdgpu_cs_chunk *chunk)
543
{
544
struct drm_amdgpu_cs_chunk_syncobj *syncobj_deps = chunk->kdata;
545
unsigned int num_deps;
546
int i;
547
548
num_deps = chunk->length_dw * 4 /
549
sizeof(struct drm_amdgpu_cs_chunk_syncobj);
550
551
if (p->post_deps)
552
return -EINVAL;
553
554
p->post_deps = kmalloc_array(num_deps, sizeof(*p->post_deps),
555
GFP_KERNEL);
556
p->num_post_deps = 0;
557
558
if (!p->post_deps)
559
return -ENOMEM;
560
561
for (i = 0; i < num_deps; ++i) {
562
struct amdgpu_cs_post_dep *dep = &p->post_deps[i];
563
564
dep->chain = NULL;
565
if (syncobj_deps[i].point) {
566
dep->chain = dma_fence_chain_alloc();
567
if (!dep->chain)
568
return -ENOMEM;
569
}
570
571
dep->syncobj = drm_syncobj_find(p->filp,
572
syncobj_deps[i].handle);
573
if (!dep->syncobj) {
574
dma_fence_chain_free(dep->chain);
575
return -EINVAL;
576
}
577
dep->point = syncobj_deps[i].point;
578
p->num_post_deps++;
579
}
580
581
return 0;
582
}
583
584
static int amdgpu_cs_p2_shadow(struct amdgpu_cs_parser *p,
585
struct amdgpu_cs_chunk *chunk)
586
{
587
struct drm_amdgpu_cs_chunk_cp_gfx_shadow *shadow = chunk->kdata;
588
int i;
589
590
if (shadow->flags & ~AMDGPU_CS_CHUNK_CP_GFX_SHADOW_FLAGS_INIT_SHADOW)
591
return -EINVAL;
592
593
for (i = 0; i < p->gang_size; ++i) {
594
p->jobs[i]->shadow_va = shadow->shadow_va;
595
p->jobs[i]->csa_va = shadow->csa_va;
596
p->jobs[i]->gds_va = shadow->gds_va;
597
p->jobs[i]->init_shadow =
598
shadow->flags & AMDGPU_CS_CHUNK_CP_GFX_SHADOW_FLAGS_INIT_SHADOW;
599
}
600
601
return 0;
602
}
603
604
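/* Second pass over the copied chunks: create the IBs and collect all
 * dependencies, syncobj waits/signals and CP shadow state for the jobs.
 */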
static int amdgpu_cs_pass2(struct amdgpu_cs_parser *p)
605
{
606
unsigned int ce_preempt = 0, de_preempt = 0;
607
int i, r;
608
609
for (i = 0; i < p->nchunks; ++i) {
610
struct amdgpu_cs_chunk *chunk;
611
612
chunk = &p->chunks[i];
613
614
switch (chunk->chunk_id) {
615
case AMDGPU_CHUNK_ID_IB:
616
r = amdgpu_cs_p2_ib(p, chunk, &ce_preempt, &de_preempt);
617
if (r)
618
return r;
619
break;
620
case AMDGPU_CHUNK_ID_DEPENDENCIES:
621
case AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES:
622
r = amdgpu_cs_p2_dependencies(p, chunk);
623
if (r)
624
return r;
625
break;
626
case AMDGPU_CHUNK_ID_SYNCOBJ_IN:
627
r = amdgpu_cs_p2_syncobj_in(p, chunk);
628
if (r)
629
return r;
630
break;
631
case AMDGPU_CHUNK_ID_SYNCOBJ_OUT:
632
r = amdgpu_cs_p2_syncobj_out(p, chunk);
633
if (r)
634
return r;
635
break;
636
case AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT:
637
r = amdgpu_cs_p2_syncobj_timeline_wait(p, chunk);
638
if (r)
639
return r;
640
break;
641
case AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL:
642
r = amdgpu_cs_p2_syncobj_timeline_signal(p, chunk);
643
if (r)
644
return r;
645
break;
646
case AMDGPU_CHUNK_ID_CP_GFX_SHADOW:
647
r = amdgpu_cs_p2_shadow(p, chunk);
648
if (r)
649
return r;
650
break;
651
}
652
}
653
654
return 0;
655
}
656
657
/* Convert microseconds to bytes. */
658
static u64 us_to_bytes(struct amdgpu_device *adev, s64 us)
659
{
660
if (us <= 0 || !adev->mm_stats.log2_max_MBps)
661
return 0;
662
663
/* Since accum_us is incremented by a million per second, just
664
* multiply it by the number of MB/s to get the number of bytes.
665
*/
666
return us << adev->mm_stats.log2_max_MBps;
667
}
668
669
static s64 bytes_to_us(struct amdgpu_device *adev, u64 bytes)
670
{
671
if (!adev->mm_stats.log2_max_MBps)
672
return 0;
673
674
return bytes >> adev->mm_stats.log2_max_MBps;
675
}
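/* Worked example (values assumed for illustration): with log2_max_MBps = 6,
 * i.e. 2^6 = 64 MB/s, one second of accumulated time (1000000 us) converts to
 * 1000000 << 6 = 64000000 bytes of allowed buffer movement; bytes_to_us() is
 * the inverse used to repay that budget after a move.
 */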
676
677
/* Returns how many bytes TTM can move right now. If no bytes can be moved,
678
* it returns 0. If it returns non-zero, it's OK to move at least one buffer,
679
* which means it can go over the threshold once. If that happens, the driver
680
* will be in debt and no other buffer migrations can be done until that debt
681
* is repaid.
682
*
683
* This approach allows moving a buffer of any size (it's important to allow
684
* that).
685
*
686
* The currency is simply time in microseconds and it increases as the clock
687
* ticks. The accumulated microseconds (us) are converted to bytes and
688
* returned.
689
*/
690
static void amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev,
691
u64 *max_bytes,
692
u64 *max_vis_bytes)
693
{
694
s64 time_us, increment_us;
695
u64 free_vram, total_vram, used_vram;
696
/* Allow a maximum of 200 accumulated ms. This is basically per-IB
697
* throttling.
698
*
699
* It means that in order to get full max MBps, at least 5 IBs per
700
* second must be submitted and not more than 200ms apart from each
701
* other.
702
*/
703
const s64 us_upper_bound = 200000;
704
705
if (!adev->mm_stats.log2_max_MBps) {
706
*max_bytes = 0;
707
*max_vis_bytes = 0;
708
return;
709
}
710
711
total_vram = adev->gmc.real_vram_size - atomic64_read(&adev->vram_pin_size);
712
used_vram = ttm_resource_manager_usage(&adev->mman.vram_mgr.manager);
713
free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;
714
715
spin_lock(&adev->mm_stats.lock);
716
717
/* Increase the amount of accumulated us. */
718
time_us = ktime_to_us(ktime_get());
719
increment_us = time_us - adev->mm_stats.last_update_us;
720
adev->mm_stats.last_update_us = time_us;
721
adev->mm_stats.accum_us = min(adev->mm_stats.accum_us + increment_us,
722
us_upper_bound);
723
724
/* This prevents the short period of low performance when the VRAM
725
* usage is low and the driver is in debt or doesn't have enough
726
* accumulated us to fill VRAM quickly.
727
*
728
* The situation can occur in these cases:
729
* - a lot of VRAM is freed by userspace
730
* - the presence of a big buffer causes a lot of evictions
731
* (solution: split buffers into smaller ones)
732
*
733
* If 128 MB or 1/8th of VRAM is free, start filling it now by setting
734
* accum_us to a positive number.
735
*/
736
if (free_vram >= 128 * 1024 * 1024 || free_vram >= total_vram / 8) {
737
s64 min_us;
738
739
/* Be more aggressive on dGPUs. Try to fill a portion of free
740
* VRAM now.
741
*/
742
if (!(adev->flags & AMD_IS_APU))
743
min_us = bytes_to_us(adev, free_vram / 4);
744
else
745
min_us = 0; /* Reset accum_us on APUs. */
746
747
adev->mm_stats.accum_us = max(min_us, adev->mm_stats.accum_us);
748
}
749
750
/* This is set to 0 if the driver is in debt to disallow (optional)
751
* buffer moves.
752
*/
753
*max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
754
755
/* Do the same for visible VRAM if half of it is free */
756
if (!amdgpu_gmc_vram_full_visible(&adev->gmc)) {
757
u64 total_vis_vram = adev->gmc.visible_vram_size;
758
u64 used_vis_vram =
759
amdgpu_vram_mgr_vis_usage(&adev->mman.vram_mgr);
760
761
if (used_vis_vram < total_vis_vram) {
762
u64 free_vis_vram = total_vis_vram - used_vis_vram;
763
764
adev->mm_stats.accum_us_vis = min(adev->mm_stats.accum_us_vis +
765
increment_us, us_upper_bound);
766
767
if (free_vis_vram >= total_vis_vram / 2)
768
adev->mm_stats.accum_us_vis =
769
max(bytes_to_us(adev, free_vis_vram / 2),
770
adev->mm_stats.accum_us_vis);
771
}
772
773
*max_vis_bytes = us_to_bytes(adev, adev->mm_stats.accum_us_vis);
774
} else {
775
*max_vis_bytes = 0;
776
}
777
778
spin_unlock(&adev->mm_stats.lock);
779
}
780
781
/* Report how many bytes have really been moved for the last command
782
* submission. This can result in a debt that can stop buffer migrations
783
* temporarily.
784
*/
785
void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes,
786
u64 num_vis_bytes)
787
{
788
spin_lock(&adev->mm_stats.lock);
789
adev->mm_stats.accum_us -= bytes_to_us(adev, num_bytes);
790
adev->mm_stats.accum_us_vis -= bytes_to_us(adev, num_vis_bytes);
791
spin_unlock(&adev->mm_stats.lock);
792
}
793
794
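/* Per-BO validation callback: pick the preferred or merely allowed placement
 * depending on the remaining move budget and validate the BO there, falling
 * back to the allowed domains on -ENOMEM.
 */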
static int amdgpu_cs_bo_validate(void *param, struct amdgpu_bo *bo)
795
{
796
struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
797
struct amdgpu_cs_parser *p = param;
798
struct ttm_operation_ctx ctx = {
799
.interruptible = true,
800
.no_wait_gpu = false,
801
.resv = bo->tbo.base.resv
802
};
803
uint32_t domain;
804
int r;
805
806
if (bo->tbo.pin_count)
807
return 0;
808
809
/* Don't move this buffer if we have depleted our allowance
810
* to move it. Don't move anything if the threshold is zero.
811
*/
812
if (p->bytes_moved < p->bytes_moved_threshold &&
813
(!bo->tbo.base.dma_buf ||
814
list_empty(&bo->tbo.base.dma_buf->attachments))) {
815
if (!amdgpu_gmc_vram_full_visible(&adev->gmc) &&
816
(bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)) {
817
/* And don't move a CPU_ACCESS_REQUIRED BO to limited
818
* visible VRAM if we've depleted our allowance to do
819
* that.
820
*/
821
if (p->bytes_moved_vis < p->bytes_moved_vis_threshold)
822
domain = bo->preferred_domains;
823
else
824
domain = bo->allowed_domains;
825
} else {
826
domain = bo->preferred_domains;
827
}
828
} else {
829
domain = bo->allowed_domains;
830
}
831
832
retry:
833
amdgpu_bo_placement_from_domain(bo, domain);
834
r = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
835
836
p->bytes_moved += ctx.bytes_moved;
837
if (!amdgpu_gmc_vram_full_visible(&adev->gmc) &&
838
amdgpu_res_cpu_visible(adev, bo->tbo.resource))
839
p->bytes_moved_vis += ctx.bytes_moved;
840
841
if (unlikely(r == -ENOMEM) && domain != bo->allowed_domains) {
842
domain = bo->allowed_domains;
843
goto retry;
844
}
845
846
return r;
847
}
848
849
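/* Look up or create the BO list, grab the userptr backing pages, lock every
 * BO with drm_exec and validate the BOs against the per-submission move
 * budget.
 */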
static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
850
union drm_amdgpu_cs *cs)
851
{
852
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
853
struct ttm_operation_ctx ctx = { true, false };
854
struct amdgpu_vm *vm = &fpriv->vm;
855
struct amdgpu_bo_list_entry *e;
856
struct drm_gem_object *obj;
857
unsigned long index;
858
unsigned int i;
859
int r;
860
861
/* p->bo_list could already be assigned if AMDGPU_CHUNK_ID_BO_HANDLES is present */
862
if (cs->in.bo_list_handle) {
863
if (p->bo_list)
864
return -EINVAL;
865
866
r = amdgpu_bo_list_get(fpriv, cs->in.bo_list_handle,
867
&p->bo_list);
868
if (r)
869
return r;
870
} else if (!p->bo_list) {
871
/* Create an empty bo_list when no handle is provided */
872
r = amdgpu_bo_list_create(p->adev, p->filp, NULL, 0,
873
&p->bo_list);
874
if (r)
875
return r;
876
}
877
878
mutex_lock(&p->bo_list->bo_list_mutex);
879
880
/* Get userptr backing pages. If pages are updated after being registered
881
* in amdgpu_gem_userptr_ioctl(), amdgpu_cs_list_validate() will do
882
* amdgpu_ttm_backend_bind() to flush and invalidate new pages
883
*/
884
amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
885
bool userpage_invalidated = false;
886
struct amdgpu_bo *bo = e->bo;
887
888
r = amdgpu_ttm_tt_get_user_pages(bo, &e->range);
889
if (r)
890
goto out_free_user_pages;
891
892
for (i = 0; i < bo->tbo.ttm->num_pages; i++) {
893
if (bo->tbo.ttm->pages[i] != hmm_pfn_to_page(e->range->hmm_pfns[i])) {
894
userpage_invalidated = true;
895
break;
896
}
897
}
898
e->user_invalidated = userpage_invalidated;
899
}
900
901
drm_exec_until_all_locked(&p->exec) {
902
r = amdgpu_vm_lock_pd(&fpriv->vm, &p->exec, 1 + p->gang_size);
903
drm_exec_retry_on_contention(&p->exec);
904
if (unlikely(r))
905
goto out_free_user_pages;
906
907
amdgpu_bo_list_for_each_entry(e, p->bo_list) {
908
/* One fence for TTM and one for each CS job */
909
r = drm_exec_prepare_obj(&p->exec, &e->bo->tbo.base,
910
1 + p->gang_size);
911
drm_exec_retry_on_contention(&p->exec);
912
if (unlikely(r))
913
goto out_free_user_pages;
914
915
e->bo_va = amdgpu_vm_bo_find(vm, e->bo);
916
}
917
918
if (p->uf_bo) {
919
r = drm_exec_prepare_obj(&p->exec, &p->uf_bo->tbo.base,
920
1 + p->gang_size);
921
drm_exec_retry_on_contention(&p->exec);
922
if (unlikely(r))
923
goto out_free_user_pages;
924
}
925
}
926
927
amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
928
struct mm_struct *usermm;
929
930
usermm = amdgpu_ttm_tt_get_usermm(e->bo->tbo.ttm);
931
if (usermm && usermm != current->mm) {
932
r = -EPERM;
933
goto out_free_user_pages;
934
}
935
936
if (amdgpu_ttm_tt_is_userptr(e->bo->tbo.ttm) &&
937
e->user_invalidated) {
938
amdgpu_bo_placement_from_domain(e->bo,
939
AMDGPU_GEM_DOMAIN_CPU);
940
r = ttm_bo_validate(&e->bo->tbo, &e->bo->placement,
941
&ctx);
942
if (r)
943
goto out_free_user_pages;
944
945
amdgpu_ttm_tt_set_user_pages(e->bo->tbo.ttm,
946
e->range);
947
}
948
}
949
950
amdgpu_cs_get_threshold_for_moves(p->adev, &p->bytes_moved_threshold,
951
&p->bytes_moved_vis_threshold);
952
p->bytes_moved = 0;
953
p->bytes_moved_vis = 0;
954
955
r = amdgpu_vm_validate(p->adev, &fpriv->vm, NULL,
956
amdgpu_cs_bo_validate, p);
957
if (r) {
958
drm_err(adev_to_drm(p->adev), "amdgpu_vm_validate() failed.\n");
959
goto out_free_user_pages;
960
}
961
962
drm_exec_for_each_locked_object(&p->exec, index, obj) {
963
r = amdgpu_cs_bo_validate(p, gem_to_amdgpu_bo(obj));
964
if (unlikely(r))
965
goto out_free_user_pages;
966
}
967
968
if (p->uf_bo) {
969
r = amdgpu_ttm_alloc_gart(&p->uf_bo->tbo);
970
if (unlikely(r))
971
goto out_free_user_pages;
972
973
p->gang_leader->uf_addr += amdgpu_bo_gpu_offset(p->uf_bo);
974
}
975
976
amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
977
p->bytes_moved_vis);
978
979
for (i = 0; i < p->gang_size; ++i)
980
amdgpu_job_set_resources(p->jobs[i], p->bo_list->gds_obj,
981
p->bo_list->gws_obj,
982
p->bo_list->oa_obj);
983
return 0;
984
985
out_free_user_pages:
986
amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
987
struct amdgpu_bo *bo = e->bo;
988
989
amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm, e->range);
990
e->range = NULL;
991
}
992
mutex_unlock(&p->bo_list->bo_list_mutex);
993
return r;
994
}
995
996
static void trace_amdgpu_cs_ibs(struct amdgpu_cs_parser *p)
997
{
998
int i, j;
999
1000
if (!trace_amdgpu_cs_enabled())
1001
return;
1002
1003
for (i = 0; i < p->gang_size; ++i) {
1004
struct amdgpu_job *job = p->jobs[i];
1005
1006
for (j = 0; j < job->num_ibs; ++j)
1007
trace_amdgpu_cs(p, job, &job->ibs[j]);
1008
}
1009
}
1010
1011
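/* For rings that still parse or patch IBs in software (UVD/VCE VM emulation),
 * map the IB's backing BO and let the ring specific code check or patch it.
 */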
static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p,
1012
struct amdgpu_job *job)
1013
{
1014
struct amdgpu_ring *ring = amdgpu_job_ring(job);
1015
unsigned int i;
1016
int r;
1017
1018
/* Only for UVD/VCE VM emulation */
1019
if (!ring->funcs->parse_cs && !ring->funcs->patch_cs_in_place)
1020
return 0;
1021
1022
for (i = 0; i < job->num_ibs; ++i) {
1023
struct amdgpu_ib *ib = &job->ibs[i];
1024
struct amdgpu_bo_va_mapping *m;
1025
struct amdgpu_bo *aobj;
1026
uint64_t va_start;
1027
uint8_t *kptr;
1028
1029
va_start = ib->gpu_addr & AMDGPU_GMC_HOLE_MASK;
1030
r = amdgpu_cs_find_mapping(p, va_start, &aobj, &m);
1031
if (r) {
1032
drm_err(adev_to_drm(p->adev), "IB va_start is invalid\n");
1033
return r;
1034
}
1035
1036
if ((va_start + ib->length_dw * 4) >
1037
(m->last + 1) * AMDGPU_GPU_PAGE_SIZE) {
1038
drm_err(adev_to_drm(p->adev), "IB va_start+ib_bytes is invalid\n");
1039
return -EINVAL;
1040
}
1041
1042
/* the IB should be reserved at this point */
1043
r = amdgpu_bo_kmap(aobj, (void **)&kptr);
1044
if (r)
1045
return r;
1046
1047
kptr += va_start - (m->start * AMDGPU_GPU_PAGE_SIZE);
1048
1049
if (ring->funcs->parse_cs) {
1050
memcpy(ib->ptr, kptr, ib->length_dw * 4);
1051
amdgpu_bo_kunmap(aobj);
1052
1053
r = amdgpu_ring_parse_cs(ring, p, job, ib);
1054
if (r)
1055
return r;
1056
1057
if (ib->sa_bo)
1058
ib->gpu_addr = amdgpu_sa_bo_gpu_addr(ib->sa_bo);
1059
} else {
1060
ib->ptr = (uint32_t *)kptr;
1061
r = amdgpu_ring_patch_cs_in_place(ring, p, job, ib);
1062
amdgpu_bo_kunmap(aobj);
1063
if (r)
1064
return r;
1065
}
1066
}
1067
1068
return 0;
1069
}
1070
1071
static int amdgpu_cs_patch_jobs(struct amdgpu_cs_parser *p)
1072
{
1073
unsigned int i;
1074
int r;
1075
1076
for (i = 0; i < p->gang_size; ++i) {
1077
r = amdgpu_cs_patch_ibs(p, p->jobs[i]);
1078
if (r)
1079
return r;
1080
}
1081
return 0;
1082
}
1083
1084
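/* Update the page tables for all BOs used by the submission and make the
 * jobs wait for the resulting page table updates.
 */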
static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
1085
{
1086
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
1087
struct amdgpu_job *job = p->gang_leader;
1088
struct amdgpu_device *adev = p->adev;
1089
struct amdgpu_vm *vm = &fpriv->vm;
1090
struct amdgpu_bo_list_entry *e;
1091
struct amdgpu_bo_va *bo_va;
1092
unsigned int i;
1093
int r;
1094
1095
/*
1096
* We can't use gang submit together with reserved VMIDs when the VM changes
1097
* can't be invalidated by more than one engine at the same time.
1098
*/
1099
if (p->gang_size > 1 && !adev->vm_manager.concurrent_flush) {
1100
for (i = 0; i < p->gang_size; ++i) {
1101
struct drm_sched_entity *entity = p->entities[i];
1102
struct drm_gpu_scheduler *sched = entity->rq->sched;
1103
struct amdgpu_ring *ring = to_amdgpu_ring(sched);
1104
1105
if (amdgpu_vmid_uses_reserved(vm, ring->vm_hub))
1106
return -EINVAL;
1107
}
1108
}
1109
1110
if (!amdgpu_vm_ready(vm))
1111
return -EINVAL;
1112
1113
r = amdgpu_vm_clear_freed(adev, vm, NULL);
1114
if (r)
1115
return r;
1116
1117
r = amdgpu_vm_bo_update(adev, fpriv->prt_va, false);
1118
if (r)
1119
return r;
1120
1121
r = amdgpu_sync_fence(&p->sync, fpriv->prt_va->last_pt_update,
1122
GFP_KERNEL);
1123
if (r)
1124
return r;
1125
1126
if (fpriv->csa_va) {
1127
bo_va = fpriv->csa_va;
1128
BUG_ON(!bo_va);
1129
r = amdgpu_vm_bo_update(adev, bo_va, false);
1130
if (r)
1131
return r;
1132
1133
r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update,
1134
GFP_KERNEL);
1135
if (r)
1136
return r;
1137
}
1138
1139
/* FIXME: In theory this loop shouldn't be needed any more when
1140
* amdgpu_vm_handle_moved handles all moved BOs that are reserved
1141
* with p->ticket. But removing it caused test regressions, so I'm
1142
* leaving it here for now.
1143
*/
1144
amdgpu_bo_list_for_each_entry(e, p->bo_list) {
1145
bo_va = e->bo_va;
1146
if (bo_va == NULL)
1147
continue;
1148
1149
r = amdgpu_vm_bo_update(adev, bo_va, false);
1150
if (r)
1151
return r;
1152
1153
r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update,
1154
GFP_KERNEL);
1155
if (r)
1156
return r;
1157
}
1158
1159
r = amdgpu_vm_handle_moved(adev, vm, &p->exec.ticket);
1160
if (r)
1161
return r;
1162
1163
r = amdgpu_vm_update_pdes(adev, vm, false);
1164
if (r)
1165
return r;
1166
1167
r = amdgpu_sync_fence(&p->sync, vm->last_update, GFP_KERNEL);
1168
if (r)
1169
return r;
1170
1171
for (i = 0; i < p->gang_size; ++i) {
1172
job = p->jobs[i];
1173
1174
if (!job->vm)
1175
continue;
1176
1177
job->vm_pd_addr = amdgpu_gmc_pd_addr(vm->root.bo);
1178
}
1179
1180
if (adev->debug_vm) {
1181
/* Invalidate all BOs to test for userspace bugs */
1182
amdgpu_bo_list_for_each_entry(e, p->bo_list) {
1183
struct amdgpu_bo *bo = e->bo;
1184
1185
/* ignore duplicates */
1186
if (!bo)
1187
continue;
1188
1189
amdgpu_vm_bo_invalidate(bo, false);
1190
}
1191
}
1192
1193
return 0;
1194
}
1195
1196
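/* Wait for the previous submission on the context, then collect the implicit
 * (reservation object) and explicit fences the jobs have to wait for before
 * they can run.
 */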
static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
1197
{
1198
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
1199
struct drm_gpu_scheduler *sched;
1200
struct drm_gem_object *obj;
1201
struct dma_fence *fence;
1202
unsigned long index;
1203
unsigned int i;
1204
int r;
1205
1206
r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entities[p->gang_leader_idx]);
1207
if (r) {
1208
if (r != -ERESTARTSYS)
1209
drm_err(adev_to_drm(p->adev), "amdgpu_ctx_wait_prev_fence failed.\n");
1210
return r;
1211
}
1212
1213
drm_exec_for_each_locked_object(&p->exec, index, obj) {
1214
struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj);
1215
1216
struct dma_resv *resv = bo->tbo.base.resv;
1217
enum amdgpu_sync_mode sync_mode;
1218
1219
sync_mode = amdgpu_bo_explicit_sync(bo) ?
1220
AMDGPU_SYNC_EXPLICIT : AMDGPU_SYNC_NE_OWNER;
1221
r = amdgpu_sync_resv(p->adev, &p->sync, resv, sync_mode,
1222
&fpriv->vm);
1223
if (r)
1224
return r;
1225
}
1226
1227
for (i = 0; i < p->gang_size; ++i) {
1228
r = amdgpu_sync_push_to_job(&p->sync, p->jobs[i]);
1229
if (r)
1230
return r;
1231
}
1232
1233
sched = p->gang_leader->base.entity->rq->sched;
1234
while ((fence = amdgpu_sync_get_fence(&p->sync))) {
1235
struct drm_sched_fence *s_fence = to_drm_sched_fence(fence);
1236
1237
/*
1238
* When we have a dependency it might be necessary to insert a
1239
* pipeline sync to make sure that all caches etc are flushed and the
1240
* next job actually sees the results from the previous one
1241
* before we start executing on the same scheduler ring.
1242
*/
1243
if (!s_fence || s_fence->sched != sched) {
1244
dma_fence_put(fence);
1245
continue;
1246
}
1247
1248
r = amdgpu_sync_fence(&p->gang_leader->explicit_sync, fence,
1249
GFP_KERNEL);
1250
dma_fence_put(fence);
1251
if (r)
1252
return r;
1253
}
1254
return 0;
1255
}
1256
1257
static void amdgpu_cs_post_dependencies(struct amdgpu_cs_parser *p)
1258
{
1259
int i;
1260
1261
for (i = 0; i < p->num_post_deps; ++i) {
1262
if (p->post_deps[i].chain && p->post_deps[i].point) {
1263
drm_syncobj_add_point(p->post_deps[i].syncobj,
1264
p->post_deps[i].chain,
1265
p->fence, p->post_deps[i].point);
1266
p->post_deps[i].chain = NULL;
1267
} else {
1268
drm_syncobj_replace_fence(p->post_deps[i].syncobj,
1269
p->fence);
1270
}
1271
}
1272
}
1273
1274
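/* Arm the scheduler jobs, re-check the userptrs under the notifier lock, add
 * the resulting fences to all locked BOs and push the jobs to their entities.
 */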
static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
1275
union drm_amdgpu_cs *cs)
1276
{
1277
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
1278
struct amdgpu_job *leader = p->gang_leader;
1279
struct amdgpu_bo_list_entry *e;
1280
struct drm_gem_object *gobj;
1281
unsigned long index;
1282
unsigned int i;
1283
uint64_t seq;
1284
int r;
1285
1286
for (i = 0; i < p->gang_size; ++i)
1287
drm_sched_job_arm(&p->jobs[i]->base);
1288
1289
for (i = 0; i < p->gang_size; ++i) {
1290
struct dma_fence *fence;
1291
1292
if (p->jobs[i] == leader)
1293
continue;
1294
1295
fence = &p->jobs[i]->base.s_fence->scheduled;
1296
dma_fence_get(fence);
1297
r = drm_sched_job_add_dependency(&leader->base, fence);
1298
if (r) {
1299
dma_fence_put(fence);
1300
return r;
1301
}
1302
}
1303
1304
if (p->gang_size > 1) {
1305
for (i = 0; i < p->gang_size; ++i)
1306
amdgpu_job_set_gang_leader(p->jobs[i], leader);
1307
}
1308
1309
/* No memory allocation is allowed while holding the notifier lock.
1310
* The lock is held until amdgpu_cs_submit is finished and the fence is
1311
* added to BOs.
1312
*/
1313
mutex_lock(&p->adev->notifier_lock);
1314
1315
/* If userptrs were invalidated after amdgpu_cs_parser_bos(), return
1316
* -EAGAIN; drmIoctl() in libdrm will then restart the amdgpu_cs_ioctl.
1317
*/
1318
r = 0;
1319
amdgpu_bo_list_for_each_userptr_entry(e, p->bo_list) {
1320
r |= !amdgpu_ttm_tt_get_user_pages_done(e->bo->tbo.ttm,
1321
e->range);
1322
e->range = NULL;
1323
}
1324
if (r) {
1325
r = -EAGAIN;
1326
mutex_unlock(&p->adev->notifier_lock);
1327
return r;
1328
}
1329
1330
p->fence = dma_fence_get(&leader->base.s_fence->finished);
1331
drm_exec_for_each_locked_object(&p->exec, index, gobj) {
1332
1333
ttm_bo_move_to_lru_tail_unlocked(&gem_to_amdgpu_bo(gobj)->tbo);
1334
1335
/* Everybody except for the gang leader uses READ */
1336
for (i = 0; i < p->gang_size; ++i) {
1337
if (p->jobs[i] == leader)
1338
continue;
1339
1340
dma_resv_add_fence(gobj->resv,
1341
&p->jobs[i]->base.s_fence->finished,
1342
DMA_RESV_USAGE_READ);
1343
}
1344
1345
/* The gang leader is remembered as the writer */
1346
dma_resv_add_fence(gobj->resv, p->fence, DMA_RESV_USAGE_WRITE);
1347
}
1348
1349
seq = amdgpu_ctx_add_fence(p->ctx, p->entities[p->gang_leader_idx],
1350
p->fence);
1351
amdgpu_cs_post_dependencies(p);
1352
1353
if ((leader->preamble_status & AMDGPU_PREAMBLE_IB_PRESENT) &&
1354
!p->ctx->preamble_presented) {
1355
leader->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT_FIRST;
1356
p->ctx->preamble_presented = true;
1357
}
1358
1359
cs->out.handle = seq;
1360
leader->uf_sequence = seq;
1361
1362
amdgpu_vm_bo_trace_cs(&fpriv->vm, &p->exec.ticket);
1363
for (i = 0; i < p->gang_size; ++i) {
1364
amdgpu_job_free_resources(p->jobs[i]);
1365
trace_amdgpu_cs_ioctl(p->jobs[i]);
1366
drm_sched_entity_push_job(&p->jobs[i]->base);
1367
p->jobs[i] = NULL;
1368
}
1369
1370
amdgpu_vm_move_to_lru_tail(p->adev, &fpriv->vm);
1371
1372
mutex_unlock(&p->adev->notifier_lock);
1373
mutex_unlock(&p->bo_list->bo_list_mutex);
1374
return 0;
1375
}
1376
1377
/* Clean up the parser structure */
1378
static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser)
1379
{
1380
unsigned int i;
1381
1382
amdgpu_sync_free(&parser->sync);
1383
drm_exec_fini(&parser->exec);
1384
1385
for (i = 0; i < parser->num_post_deps; i++) {
1386
drm_syncobj_put(parser->post_deps[i].syncobj);
1387
kfree(parser->post_deps[i].chain);
1388
}
1389
kfree(parser->post_deps);
1390
1391
dma_fence_put(parser->fence);
1392
1393
if (parser->ctx)
1394
amdgpu_ctx_put(parser->ctx);
1395
if (parser->bo_list)
1396
amdgpu_bo_list_put(parser->bo_list);
1397
1398
for (i = 0; i < parser->nchunks; i++)
1399
kvfree(parser->chunks[i].kdata);
1400
kvfree(parser->chunks);
1401
for (i = 0; i < parser->gang_size; ++i) {
1402
if (parser->jobs[i])
1403
amdgpu_job_free(parser->jobs[i]);
1404
}
1405
amdgpu_bo_unref(&parser->uf_bo);
1406
}
1407
1408
int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
1409
{
1410
struct amdgpu_device *adev = drm_to_adev(dev);
1411
struct amdgpu_cs_parser parser;
1412
int r;
1413
1414
if (amdgpu_ras_intr_triggered())
1415
return -EHWPOISON;
1416
1417
if (!adev->accel_working)
1418
return -EBUSY;
1419
1420
r = amdgpu_cs_parser_init(&parser, adev, filp, data);
1421
if (r) {
1422
drm_err_ratelimited(dev, "Failed to initialize parser %d!\n", r);
1423
return r;
1424
}
1425
1426
r = amdgpu_cs_pass1(&parser, data);
1427
if (r)
1428
goto error_fini;
1429
1430
r = amdgpu_cs_pass2(&parser);
1431
if (r)
1432
goto error_fini;
1433
1434
r = amdgpu_cs_parser_bos(&parser, data);
1435
if (r) {
1436
if (r == -ENOMEM)
1437
drm_err(dev, "Not enough memory for command submission!\n");
1438
else if (r != -ERESTARTSYS && r != -EAGAIN)
1439
drm_dbg(dev, "Failed to process the buffer list %d!\n", r);
1440
goto error_fini;
1441
}
1442
1443
r = amdgpu_cs_patch_jobs(&parser);
1444
if (r)
1445
goto error_backoff;
1446
1447
r = amdgpu_cs_vm_handling(&parser);
1448
if (r)
1449
goto error_backoff;
1450
1451
r = amdgpu_cs_sync_rings(&parser);
1452
if (r)
1453
goto error_backoff;
1454
1455
trace_amdgpu_cs_ibs(&parser);
1456
1457
r = amdgpu_cs_submit(&parser, data);
1458
if (r)
1459
goto error_backoff;
1460
1461
amdgpu_cs_parser_fini(&parser);
1462
return 0;
1463
1464
error_backoff:
1465
mutex_unlock(&parser.bo_list->bo_list_mutex);
1466
1467
error_fini:
1468
amdgpu_cs_parser_fini(&parser);
1469
return r;
1470
}
1471
1472
/**
1473
* amdgpu_cs_wait_ioctl - wait for a command submission to finish
1474
*
1475
* @dev: drm device
1476
* @data: data from userspace
1477
* @filp: file private
1478
*
1479
* Wait for the command submission identified by handle to finish.
1480
*/
1481
int amdgpu_cs_wait_ioctl(struct drm_device *dev, void *data,
1482
struct drm_file *filp)
1483
{
1484
union drm_amdgpu_wait_cs *wait = data;
1485
unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout);
1486
struct drm_sched_entity *entity;
1487
struct amdgpu_ctx *ctx;
1488
struct dma_fence *fence;
1489
long r;
1490
1491
ctx = amdgpu_ctx_get(filp->driver_priv, wait->in.ctx_id);
1492
if (ctx == NULL)
1493
return -EINVAL;
1494
1495
r = amdgpu_ctx_get_entity(ctx, wait->in.ip_type, wait->in.ip_instance,
1496
wait->in.ring, &entity);
1497
if (r) {
1498
amdgpu_ctx_put(ctx);
1499
return r;
1500
}
1501
1502
fence = amdgpu_ctx_get_fence(ctx, entity, wait->in.handle);
1503
if (IS_ERR(fence))
1504
r = PTR_ERR(fence);
1505
else if (fence) {
1506
r = dma_fence_wait_timeout(fence, true, timeout);
1507
if (r > 0 && fence->error)
1508
r = fence->error;
1509
dma_fence_put(fence);
1510
} else
1511
r = 1;
1512
1513
amdgpu_ctx_put(ctx);
1514
if (r < 0)
1515
return r;
1516
1517
memset(wait, 0, sizeof(*wait));
1518
wait->out.status = (r == 0);
1519
1520
return 0;
1521
}
1522
1523
/**
1524
* amdgpu_cs_get_fence - helper to get fence from drm_amdgpu_fence
1525
*
1526
* @adev: amdgpu device
1527
* @filp: file private
1528
* @user: drm_amdgpu_fence copied from user space
1529
*/
1530
static struct dma_fence *amdgpu_cs_get_fence(struct amdgpu_device *adev,
1531
struct drm_file *filp,
1532
struct drm_amdgpu_fence *user)
1533
{
1534
struct drm_sched_entity *entity;
1535
struct amdgpu_ctx *ctx;
1536
struct dma_fence *fence;
1537
int r;
1538
1539
ctx = amdgpu_ctx_get(filp->driver_priv, user->ctx_id);
1540
if (ctx == NULL)
1541
return ERR_PTR(-EINVAL);
1542
1543
r = amdgpu_ctx_get_entity(ctx, user->ip_type, user->ip_instance,
1544
user->ring, &entity);
1545
if (r) {
1546
amdgpu_ctx_put(ctx);
1547
return ERR_PTR(r);
1548
}
1549
1550
fence = amdgpu_ctx_get_fence(ctx, entity, user->seq_no);
1551
amdgpu_ctx_put(ctx);
1552
1553
return fence;
1554
}
1555
1556
int amdgpu_cs_fence_to_handle_ioctl(struct drm_device *dev, void *data,
1557
struct drm_file *filp)
1558
{
1559
struct amdgpu_device *adev = drm_to_adev(dev);
1560
union drm_amdgpu_fence_to_handle *info = data;
1561
struct dma_fence *fence;
1562
struct drm_syncobj *syncobj;
1563
struct sync_file *sync_file;
1564
int fd, r;
1565
1566
fence = amdgpu_cs_get_fence(adev, filp, &info->in.fence);
1567
if (IS_ERR(fence))
1568
return PTR_ERR(fence);
1569
1570
if (!fence)
1571
fence = dma_fence_get_stub();
1572
1573
switch (info->in.what) {
1574
case AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ:
1575
r = drm_syncobj_create(&syncobj, 0, fence);
1576
dma_fence_put(fence);
1577
if (r)
1578
return r;
1579
r = drm_syncobj_get_handle(filp, syncobj, &info->out.handle);
1580
drm_syncobj_put(syncobj);
1581
return r;
1582
1583
case AMDGPU_FENCE_TO_HANDLE_GET_SYNCOBJ_FD:
1584
r = drm_syncobj_create(&syncobj, 0, fence);
1585
dma_fence_put(fence);
1586
if (r)
1587
return r;
1588
r = drm_syncobj_get_fd(syncobj, (int *)&info->out.handle);
1589
drm_syncobj_put(syncobj);
1590
return r;
1591
1592
case AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD:
1593
fd = get_unused_fd_flags(O_CLOEXEC);
1594
if (fd < 0) {
1595
dma_fence_put(fence);
1596
return fd;
1597
}
1598
1599
sync_file = sync_file_create(fence);
1600
dma_fence_put(fence);
1601
if (!sync_file) {
1602
put_unused_fd(fd);
1603
return -ENOMEM;
1604
}
1605
1606
fd_install(fd, sync_file->file);
1607
info->out.handle = fd;
1608
return 0;
1609
1610
default:
1611
dma_fence_put(fence);
1612
return -EINVAL;
1613
}
1614
}
1615
1616
/**
1617
* amdgpu_cs_wait_all_fences - wait on all fences to signal
1618
*
1619
* @adev: amdgpu device
1620
* @filp: file private
1621
* @wait: wait parameters
1622
* @fences: array of drm_amdgpu_fence
1623
*/
1624
static int amdgpu_cs_wait_all_fences(struct amdgpu_device *adev,
1625
struct drm_file *filp,
1626
union drm_amdgpu_wait_fences *wait,
1627
struct drm_amdgpu_fence *fences)
1628
{
1629
uint32_t fence_count = wait->in.fence_count;
1630
unsigned int i;
1631
long r = 1;
1632
1633
for (i = 0; i < fence_count; i++) {
1634
struct dma_fence *fence;
1635
unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout_ns);
1636
1637
fence = amdgpu_cs_get_fence(adev, filp, &fences[i]);
1638
if (IS_ERR(fence))
1639
return PTR_ERR(fence);
1640
else if (!fence)
1641
continue;
1642
1643
r = dma_fence_wait_timeout(fence, true, timeout);
1644
if (r > 0 && fence->error)
1645
r = fence->error;
1646
1647
dma_fence_put(fence);
1648
if (r < 0)
1649
return r;
1650
1651
if (r == 0)
1652
break;
1653
}
1654
1655
memset(wait, 0, sizeof(*wait));
1656
wait->out.status = (r > 0);
1657
1658
return 0;
1659
}
1660
1661
/**
1662
* amdgpu_cs_wait_any_fence - wait on any fence to signal
1663
*
1664
* @adev: amdgpu device
1665
* @filp: file private
1666
* @wait: wait parameters
1667
* @fences: array of drm_amdgpu_fence
1668
*/
1669
static int amdgpu_cs_wait_any_fence(struct amdgpu_device *adev,
1670
struct drm_file *filp,
1671
union drm_amdgpu_wait_fences *wait,
1672
struct drm_amdgpu_fence *fences)
1673
{
1674
unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout_ns);
1675
uint32_t fence_count = wait->in.fence_count;
1676
uint32_t first = ~0;
1677
struct dma_fence **array;
1678
unsigned int i;
1679
long r;
1680
1681
/* Prepare the fence array */
1682
array = kcalloc(fence_count, sizeof(struct dma_fence *), GFP_KERNEL);
1683
1684
if (array == NULL)
1685
return -ENOMEM;
1686
1687
for (i = 0; i < fence_count; i++) {
1688
struct dma_fence *fence;
1689
1690
fence = amdgpu_cs_get_fence(adev, filp, &fences[i]);
1691
if (IS_ERR(fence)) {
1692
r = PTR_ERR(fence);
1693
goto err_free_fence_array;
1694
} else if (fence) {
1695
array[i] = fence;
1696
} else { /* NULL, the fence has been already signaled */
1697
r = 1;
1698
first = i;
1699
goto out;
1700
}
1701
}
1702
1703
r = dma_fence_wait_any_timeout(array, fence_count, true, timeout,
1704
&first);
1705
if (r < 0)
1706
goto err_free_fence_array;
1707
1708
out:
1709
memset(wait, 0, sizeof(*wait));
1710
wait->out.status = (r > 0);
1711
wait->out.first_signaled = first;
1712
1713
if (first < fence_count && array[first])
1714
r = array[first]->error;
1715
else
1716
r = 0;
1717
1718
err_free_fence_array:
1719
for (i = 0; i < fence_count; i++)
1720
dma_fence_put(array[i]);
1721
kfree(array);
1722
1723
return r;
1724
}
1725
1726
/**
1727
* amdgpu_cs_wait_fences_ioctl - wait for multiple command submissions to finish
1728
*
1729
* @dev: drm device
1730
* @data: data from userspace
1731
* @filp: file private
1732
*/
1733
int amdgpu_cs_wait_fences_ioctl(struct drm_device *dev, void *data,
1734
struct drm_file *filp)
1735
{
1736
struct amdgpu_device *adev = drm_to_adev(dev);
1737
union drm_amdgpu_wait_fences *wait = data;
1738
struct drm_amdgpu_fence *fences;
1739
int r;
1740
1741
/* Get the fences from userspace */
1742
fences = memdup_array_user(u64_to_user_ptr(wait->in.fences),
1743
wait->in.fence_count,
1744
sizeof(struct drm_amdgpu_fence));
1745
if (IS_ERR(fences))
1746
return PTR_ERR(fences);
1747
1748
if (wait->in.wait_all)
1749
r = amdgpu_cs_wait_all_fences(adev, filp, wait, fences);
1750
else
1751
r = amdgpu_cs_wait_any_fence(adev, filp, wait, fences);
1752
1753
kfree(fences);
1754
1755
return r;
1756
}
1757
1758
/**
1759
* amdgpu_cs_find_mapping - find bo_va for VM address
1760
*
1761
* @parser: command submission parser context
1762
* @addr: VM address
1763
* @bo: resulting BO of the mapping found
1764
* @map: Placeholder to return found BO mapping
1765
*
1766
* Search the buffer objects in the command submission context for a certain
1767
* virtual memory address. Returns 0 and fills in @bo and @map when the
1768
* mapping is found, a negative error code otherwise.
1769
*/
1770
int amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser,
1771
uint64_t addr, struct amdgpu_bo **bo,
1772
struct amdgpu_bo_va_mapping **map)
1773
{
1774
struct amdgpu_fpriv *fpriv = parser->filp->driver_priv;
1775
struct ttm_operation_ctx ctx = { false, false };
1776
struct amdgpu_vm *vm = &fpriv->vm;
1777
struct amdgpu_bo_va_mapping *mapping;
1778
int i, r;
1779
1780
addr /= AMDGPU_GPU_PAGE_SIZE;
1781
1782
mapping = amdgpu_vm_bo_lookup_mapping(vm, addr);
1783
if (!mapping || !mapping->bo_va || !mapping->bo_va->base.bo)
1784
return -EINVAL;
1785
1786
*bo = mapping->bo_va->base.bo;
1787
*map = mapping;
1788
1789
/* Double check that the BO is reserved by this CS */
1790
if (dma_resv_locking_ctx((*bo)->tbo.base.resv) != &parser->exec.ticket)
1791
return -EINVAL;
1792
1793
/* Make sure VRAM is allocated contiguously */
1794
(*bo)->flags |= AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
1795
if ((*bo)->tbo.resource->mem_type == TTM_PL_VRAM &&
1796
!((*bo)->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) {
1797
1798
amdgpu_bo_placement_from_domain(*bo, (*bo)->allowed_domains);
1799
for (i = 0; i < (*bo)->placement.num_placement; i++)
1800
(*bo)->placements[i].flags |= TTM_PL_FLAG_CONTIGUOUS;
1801
r = ttm_bo_validate(&(*bo)->tbo, &(*bo)->placement, &ctx);
1802
if (r)
1803
return r;
1804
}
1805
1806
return amdgpu_ttm_alloc_gart(&(*bo)->tbo);
1807
}
1808
1809