GitHub Repository: torvalds/linux
Path: blob/master/arch/x86/kvm/svm/nested.c
1
// SPDX-License-Identifier: GPL-2.0-only
2
/*
3
* Kernel-based Virtual Machine driver for Linux
4
*
5
* AMD SVM support
6
*
7
* Copyright (C) 2006 Qumranet, Inc.
8
* Copyright 2010 Red Hat, Inc. and/or its affiliates.
9
*
10
* Authors:
11
* Yaniv Kamay <[email protected]>
12
* Avi Kivity <[email protected]>
13
*/
14
15
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
16
17
#include <linux/kvm_types.h>
18
#include <linux/kvm_host.h>
19
#include <linux/kernel.h>
20
21
#include <asm/msr-index.h>
22
#include <asm/debugreg.h>
23
24
#include "kvm_emulate.h"
25
#include "trace.h"
26
#include "mmu.h"
27
#include "x86.h"
28
#include "smm.h"
29
#include "cpuid.h"
30
#include "lapic.h"
31
#include "svm.h"
32
#include "hyperv.h"
33
34
#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
35
36
static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
37
struct x86_exception *fault)
38
{
39
struct vcpu_svm *svm = to_svm(vcpu);
40
struct vmcb *vmcb = svm->vmcb;
41
42
if (vmcb->control.exit_code != SVM_EXIT_NPF) {
43
/*
44
* TODO: track the cause of the nested page fault, and
45
* correctly fill in the high bits of exit_info_1.
46
*/
47
vmcb->control.exit_code = SVM_EXIT_NPF;
48
vmcb->control.exit_info_1 = (1ULL << 32);
49
vmcb->control.exit_info_2 = fault->address;
50
}
51
52
vmcb->control.exit_info_1 &= ~0xffffffffULL;
53
vmcb->control.exit_info_1 |= fault->error_code;
54
55
nested_svm_vmexit(svm);
56
}
57
58
static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
59
{
60
struct vcpu_svm *svm = to_svm(vcpu);
61
u64 cr3 = svm->nested.ctl.nested_cr3;
62
u64 pdpte;
63
int ret;
64
65
/*
66
* Note, nCR3 is "assumed" to be 32-byte aligned, i.e. the CPU ignores
67
* nCR3[4:0] when loading PDPTEs from memory.
68
*/
69
ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
70
(cr3 & GENMASK(11, 5)) + index * 8, 8);
71
if (ret)
72
return 0;
73
return pdpte;
74
}
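/*
 * Worked example (hypothetical values): with nested_cr3 = 0x12345e0 and
 * index = 2, the read above targets gfn 0x1234 at page offset
 * (0x5e0 & GENMASK(11, 5)) + 2 * 8 = 0x5f0, i.e. the third 8-byte PDPTE
 * of the 32-byte aligned table selected by nCR3[11:5].
 */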
75
76
static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
77
{
78
struct vcpu_svm *svm = to_svm(vcpu);
79
80
return svm->nested.ctl.nested_cr3;
81
}
82
83
static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
84
{
85
struct vcpu_svm *svm = to_svm(vcpu);
86
87
WARN_ON(mmu_is_nested(vcpu));
88
89
vcpu->arch.mmu = &vcpu->arch.guest_mmu;
90
91
/*
92
* The NPT format depends on L1's CR4 and EFER, which is in vmcb01. Note,
93
* when called via KVM_SET_NESTED_STATE, that state may _not_ match current
94
* vCPU state. CR0.WP is explicitly ignored, while CR0.PG is required.
95
*/
96
kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4,
97
svm->vmcb01.ptr->save.efer,
98
svm->nested.ctl.nested_cr3);
99
vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3;
100
vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
101
vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
102
vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
103
}
104
105
static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
106
{
107
vcpu->arch.mmu = &vcpu->arch.root_mmu;
108
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
109
}
110
111
static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm)
112
{
113
if (!guest_cpu_cap_has(&svm->vcpu, X86_FEATURE_V_VMSAVE_VMLOAD))
114
return true;
115
116
if (!nested_npt_enabled(svm))
117
return true;
118
119
if (!(svm->nested.ctl.virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK))
120
return true;
121
122
return false;
123
}
124
125
void recalc_intercepts(struct vcpu_svm *svm)
126
{
127
struct vmcb_control_area *c, *h;
128
struct vmcb_ctrl_area_cached *g;
129
unsigned int i;
130
131
vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
132
133
if (!is_guest_mode(&svm->vcpu))
134
return;
135
136
c = &svm->vmcb->control;
137
h = &svm->vmcb01.ptr->control;
138
g = &svm->nested.ctl;
139
140
for (i = 0; i < MAX_INTERCEPT; i++)
141
c->intercepts[i] = h->intercepts[i];
142
143
if (g->int_ctl & V_INTR_MASKING_MASK) {
144
/*
145
* If L2 is active and V_INTR_MASKING is enabled in vmcb12,
146
* disable intercept of CR8 writes as L2's CR8 does not affect
147
* any interrupt KVM may want to inject.
148
*
149
* Similarly, disable intercept of virtual interrupts (used to
150
* detect interrupt windows) if the saved RFLAGS.IF is '0', as
151
* the effective RFLAGS.IF for L1 interrupts will never be set
152
* while L2 is running (L2's RFLAGS.IF doesn't affect L1 IRQs).
153
*/
154
vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
155
if (!(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF))
156
vmcb_clr_intercept(c, INTERCEPT_VINTR);
157
}
158
159
/*
160
* We want to see VMMCALLs from a nested guest only when Hyper-V L2 TLB
161
* flush feature is enabled.
162
*/
163
if (!nested_svm_l2_tlb_flush_enabled(&svm->vcpu))
164
vmcb_clr_intercept(c, INTERCEPT_VMMCALL);
165
166
for (i = 0; i < MAX_INTERCEPT; i++)
167
c->intercepts[i] |= g->intercepts[i];
168
169
/* If SMI is not intercepted, ignore guest SMI intercept as well */
170
if (!intercept_smi)
171
vmcb_clr_intercept(c, INTERCEPT_SMI);
172
173
if (nested_vmcb_needs_vls_intercept(svm)) {
174
/*
175
* If the virtual VMLOAD/VMSAVE is not enabled for the L2,
176
* we must intercept these instructions to correctly
177
* emulate them in case L1 doesn't intercept them.
178
*/
179
vmcb_set_intercept(c, INTERCEPT_VMLOAD);
180
vmcb_set_intercept(c, INTERCEPT_VMSAVE);
181
} else {
182
WARN_ON(!(c->virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK));
183
}
184
}
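/*
 * Net effect of the merge above: apart from the special cases handled
 * explicitly (CR8 writes, VINTR, VMMCALL, SMI, VMLOAD/VMSAVE), every
 * intercept bit in vmcb02 is the OR of vmcb01's (L0) and vmcb12's (L1)
 * bits, so L1 can only ever add intercepts on top of what KVM already
 * requires for itself.
 */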
185
186
/*
187
* This array (and its actual size) holds the set of offsets (indexing by chunk
188
* size) to process when merging vmcb12's MSRPM with vmcb01's MSRPM. Note, the
189
* set of MSRs for which interception is disabled in vmcb01 is per-vCPU, e.g.
190
* based on CPUID features. This array only tracks MSRs that *might* be passed
191
* through to the guest.
192
*
193
* Hardcode the capacity of the array based on the maximum number of _offsets_.
194
* MSRs are batched together, so there are fewer offsets than MSRs.
195
*/
196
static int nested_svm_msrpm_merge_offsets[10] __ro_after_init;
197
static int nested_svm_nr_msrpm_merge_offsets __ro_after_init;
198
typedef unsigned long nsvm_msrpm_merge_t;
199
200
int __init nested_svm_init_msrpm_merge_offsets(void)
201
{
202
static const u32 merge_msrs[] __initconst = {
203
MSR_STAR,
204
MSR_IA32_SYSENTER_CS,
205
MSR_IA32_SYSENTER_EIP,
206
MSR_IA32_SYSENTER_ESP,
207
#ifdef CONFIG_X86_64
208
MSR_GS_BASE,
209
MSR_FS_BASE,
210
MSR_KERNEL_GS_BASE,
211
MSR_LSTAR,
212
MSR_CSTAR,
213
MSR_SYSCALL_MASK,
214
#endif
215
MSR_IA32_SPEC_CTRL,
216
MSR_IA32_PRED_CMD,
217
MSR_IA32_FLUSH_CMD,
218
MSR_IA32_APERF,
219
MSR_IA32_MPERF,
220
MSR_IA32_LASTBRANCHFROMIP,
221
MSR_IA32_LASTBRANCHTOIP,
222
MSR_IA32_LASTINTFROMIP,
223
MSR_IA32_LASTINTTOIP,
224
225
MSR_K7_PERFCTR0,
226
MSR_K7_PERFCTR1,
227
MSR_K7_PERFCTR2,
228
MSR_K7_PERFCTR3,
229
MSR_F15H_PERF_CTR0,
230
MSR_F15H_PERF_CTR1,
231
MSR_F15H_PERF_CTR2,
232
MSR_F15H_PERF_CTR3,
233
MSR_F15H_PERF_CTR4,
234
MSR_F15H_PERF_CTR5,
235
236
MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
237
MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
238
MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
239
MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET,
240
};
241
int i, j;
242
243
for (i = 0; i < ARRAY_SIZE(merge_msrs); i++) {
244
int bit_nr = svm_msrpm_bit_nr(merge_msrs[i]);
245
u32 offset;
246
247
if (WARN_ON(bit_nr < 0))
248
return -EIO;
249
250
/*
251
* Merging is done in chunks to reduce the number of accesses
252
* to L1's bitmap.
253
*/
254
offset = bit_nr / BITS_PER_BYTE / sizeof(nsvm_msrpm_merge_t);
255
256
for (j = 0; j < nested_svm_nr_msrpm_merge_offsets; j++) {
257
if (nested_svm_msrpm_merge_offsets[j] == offset)
258
break;
259
}
260
261
if (j < nested_svm_nr_msrpm_merge_offsets)
262
continue;
263
264
if (WARN_ON(j >= ARRAY_SIZE(nested_svm_msrpm_merge_offsets)))
265
return -EIO;
266
267
nested_svm_msrpm_merge_offsets[j] = offset;
268
nested_svm_nr_msrpm_merge_offsets++;
269
}
270
271
return 0;
272
}
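/*
 * Worked example (assuming the usual SVM MSRPM layout of three 2 KiB
 * ranges with two bits per MSR, and sizeof(nsvm_msrpm_merge_t) == 8 on
 * x86_64): MSR_STAR (0xc0000081) lives in the second range, so its read
 * bit is roughly bit_nr = 0x800 * 8 + 0x81 * 2 = 16642, which maps to
 * offset = 16642 / 8 / 8 = 260.  Nearby MSRs such as MSR_LSTAR and
 * MSR_CSTAR land in the same 64-bit chunk, which is why there are far
 * fewer offsets than MSRs in merge_msrs[].
 */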
273
274
/*
275
* Merge L0's (KVM) and L1's (Nested VMCB) MSR permission bitmaps. The function
276
* is optimized in that it only merges the parts where KVM's MSR permission bitmap
277
* may contain zero bits.
278
*/
279
static bool nested_svm_merge_msrpm(struct kvm_vcpu *vcpu)
280
{
281
struct vcpu_svm *svm = to_svm(vcpu);
282
nsvm_msrpm_merge_t *msrpm02 = svm->nested.msrpm;
283
nsvm_msrpm_merge_t *msrpm01 = svm->msrpm;
284
int i;
285
286
/*
287
* MSR bitmap update can be skipped when:
288
* - MSR bitmap for L1 hasn't changed.
289
* - Nested hypervisor (L1) is attempting to launch the same L2 as
290
* before.
291
* - Nested hypervisor (L1) is using Hyper-V emulation interface and
292
* tells KVM (L0) there were no changes in MSR bitmap for L2.
293
*/
294
#ifdef CONFIG_KVM_HYPERV
295
if (!svm->nested.force_msr_bitmap_recalc) {
296
struct hv_vmcb_enlightenments *hve = &svm->nested.ctl.hv_enlightenments;
297
298
if (kvm_hv_hypercall_enabled(vcpu) &&
299
hve->hv_enlightenments_control.msr_bitmap &&
300
(svm->nested.ctl.clean & BIT(HV_VMCB_NESTED_ENLIGHTENMENTS)))
301
goto set_msrpm_base_pa;
302
}
303
#endif
304
305
if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
306
return true;
307
308
for (i = 0; i < nested_svm_nr_msrpm_merge_offsets; i++) {
309
const int p = nested_svm_msrpm_merge_offsets[i];
310
nsvm_msrpm_merge_t l1_val;
311
gpa_t gpa;
312
313
gpa = svm->nested.ctl.msrpm_base_pa + (p * sizeof(l1_val));
314
315
if (kvm_vcpu_read_guest(vcpu, gpa, &l1_val, sizeof(l1_val)))
316
return false;
317
318
msrpm02[p] = msrpm01[p] | l1_val;
319
}
320
321
svm->nested.force_msr_bitmap_recalc = false;
322
323
#ifdef CONFIG_KVM_HYPERV
324
set_msrpm_base_pa:
325
#endif
326
svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));
327
328
return true;
329
}
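/*
 * Note on the merge above: each chunk of the resulting bitmap is
 * msrpm01 | l1_val, i.e. an MSR access by L2 exits to L0 if either KVM
 * or L1 intercepts it.  The OR guarantees that L1 can never relax an
 * interception that KVM itself relies on.
 */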
330
331
/*
332
* Bits 11:0 of bitmap address are ignored by hardware
333
*/
334
static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size)
335
{
336
u64 addr = PAGE_ALIGN(pa);
337
338
return kvm_vcpu_is_legal_gpa(vcpu, addr) &&
339
kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1);
340
}
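/*
 * Because the low 12 bits are ignored, only the page-aligned base and the
 * last byte of the bitmap are validated against the guest's maximum
 * physical address; e.g. an msrpm_base_pa placed at the very top of the
 * guest physical address space fails because base + MSRPM_SIZE - 1 is not
 * a legal GPA.
 */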
341
342
static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
343
struct vmcb_ctrl_area_cached *control)
344
{
345
if (CC(!vmcb12_is_intercept(control, INTERCEPT_VMRUN)))
346
return false;
347
348
if (CC(control->asid == 0))
349
return false;
350
351
if (CC((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && !npt_enabled))
352
return false;
353
354
if (CC(!nested_svm_check_bitmap_pa(vcpu, control->msrpm_base_pa,
355
MSRPM_SIZE)))
356
return false;
357
if (CC(!nested_svm_check_bitmap_pa(vcpu, control->iopm_base_pa,
358
IOPM_SIZE)))
359
return false;
360
361
if (CC((control->int_ctl & V_NMI_ENABLE_MASK) &&
362
!vmcb12_is_intercept(control, INTERCEPT_NMI))) {
363
return false;
364
}
365
366
return true;
367
}
368
369
/* Common checks that apply to both L1 and L2 state. */
370
static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu,
371
struct vmcb_save_area_cached *save)
372
{
373
if (CC(!(save->efer & EFER_SVME)))
374
return false;
375
376
if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) ||
377
CC(save->cr0 & ~0xffffffffULL))
378
return false;
379
380
if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7)))
381
return false;
382
383
/*
384
* These checks are also performed by KVM_SET_SREGS,
385
* except that EFER.LMA is not checked by SVM against
386
* CR0.PG && EFER.LME.
387
*/
388
if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) {
389
if (CC(!(save->cr4 & X86_CR4_PAE)) ||
390
CC(!(save->cr0 & X86_CR0_PE)) ||
391
CC(!kvm_vcpu_is_legal_cr3(vcpu, save->cr3)))
392
return false;
393
}
394
395
/* Note, SVM doesn't have any additional restrictions on CR4. */
396
if (CC(!__kvm_is_valid_cr4(vcpu, save->cr4)))
397
return false;
398
399
if (CC(!kvm_valid_efer(vcpu, save->efer)))
400
return false;
401
402
return true;
403
}
404
405
static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu)
406
{
407
struct vcpu_svm *svm = to_svm(vcpu);
408
struct vmcb_save_area_cached *save = &svm->nested.save;
409
410
return __nested_vmcb_check_save(vcpu, save);
411
}
412
413
static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu)
414
{
415
struct vcpu_svm *svm = to_svm(vcpu);
416
struct vmcb_ctrl_area_cached *ctl = &svm->nested.ctl;
417
418
return __nested_vmcb_check_controls(vcpu, ctl);
419
}
420
421
/*
422
* If a feature is not advertised to L1, clear the corresponding vmcb12
423
* intercept.
424
*/
425
#define __nested_svm_sanitize_intercept(__vcpu, __control, fname, iname) \
426
do { \
427
if (!guest_cpu_cap_has(__vcpu, X86_FEATURE_##fname)) \
428
vmcb12_clr_intercept(__control, INTERCEPT_##iname); \
429
} while (0)
430
431
#define nested_svm_sanitize_intercept(__vcpu, __control, name) \
432
__nested_svm_sanitize_intercept(__vcpu, __control, name, name)
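/*
 * For instance, nested_svm_sanitize_intercept(vcpu, to, INVPCID) expands
 * to clearing INTERCEPT_INVPCID from the cached vmcb12 controls when
 * X86_FEATURE_INVPCID is not exposed to L1, so a vmcb12 that sets an
 * intercept for an unadvertised feature behaves as if the bit were clear.
 */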
433
434
static
435
void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
436
struct vmcb_ctrl_area_cached *to,
437
struct vmcb_control_area *from)
438
{
439
unsigned int i;
440
441
for (i = 0; i < MAX_INTERCEPT; i++)
442
to->intercepts[i] = from->intercepts[i];
443
444
__nested_svm_sanitize_intercept(vcpu, to, XSAVE, XSETBV);
445
nested_svm_sanitize_intercept(vcpu, to, INVPCID);
446
nested_svm_sanitize_intercept(vcpu, to, RDTSCP);
447
nested_svm_sanitize_intercept(vcpu, to, SKINIT);
448
nested_svm_sanitize_intercept(vcpu, to, RDPRU);
449
450
to->iopm_base_pa = from->iopm_base_pa;
451
to->msrpm_base_pa = from->msrpm_base_pa;
452
to->tsc_offset = from->tsc_offset;
453
to->tlb_ctl = from->tlb_ctl;
454
to->erap_ctl = from->erap_ctl;
455
to->int_ctl = from->int_ctl;
456
to->int_vector = from->int_vector;
457
to->int_state = from->int_state;
458
to->exit_code = from->exit_code;
459
to->exit_info_1 = from->exit_info_1;
460
to->exit_info_2 = from->exit_info_2;
461
to->exit_int_info = from->exit_int_info;
462
to->exit_int_info_err = from->exit_int_info_err;
463
to->nested_ctl = from->nested_ctl;
464
to->event_inj = from->event_inj;
465
to->event_inj_err = from->event_inj_err;
466
to->next_rip = from->next_rip;
467
to->nested_cr3 = from->nested_cr3;
468
to->virt_ext = from->virt_ext;
469
to->pause_filter_count = from->pause_filter_count;
470
to->pause_filter_thresh = from->pause_filter_thresh;
471
472
/* Copy asid here because nested_vmcb_check_controls will check it. */
473
to->asid = from->asid;
474
to->msrpm_base_pa &= ~0x0fffULL;
475
to->iopm_base_pa &= ~0x0fffULL;
476
477
#ifdef CONFIG_KVM_HYPERV
478
/* Hyper-V extensions (Enlightened VMCB) */
479
if (kvm_hv_hypercall_enabled(vcpu)) {
480
to->clean = from->clean;
481
memcpy(&to->hv_enlightenments, &from->hv_enlightenments,
482
sizeof(to->hv_enlightenments));
483
}
484
#endif
485
}
486
487
void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
488
struct vmcb_control_area *control)
489
{
490
__nested_copy_vmcb_control_to_cache(&svm->vcpu, &svm->nested.ctl, control);
491
}
492
493
static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to,
494
struct vmcb_save_area *from)
495
{
496
/*
497
* Copy only fields that are validated, as we need them
498
* to avoid TOCTOU (time-of-check to time-of-use) races.
499
*/
500
to->efer = from->efer;
501
to->cr0 = from->cr0;
502
to->cr3 = from->cr3;
503
to->cr4 = from->cr4;
504
505
to->dr6 = from->dr6;
506
to->dr7 = from->dr7;
507
}
508
509
void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
510
struct vmcb_save_area *save)
511
{
512
__nested_copy_vmcb_save_to_cache(&svm->nested.save, save);
513
}
514
515
/*
516
* Synchronize fields that are written by the processor, so that
517
* they can be copied back into the vmcb12.
518
*/
519
void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
520
{
521
u32 mask;
522
svm->nested.ctl.event_inj = svm->vmcb->control.event_inj;
523
svm->nested.ctl.event_inj_err = svm->vmcb->control.event_inj_err;
524
525
/* Only a few fields of int_ctl are written by the processor. */
526
mask = V_IRQ_MASK | V_TPR_MASK;
527
/*
528
* Don't sync vmcb02 V_IRQ back to vmcb12 if KVM (L0) is intercepting
529
* virtual interrupts in order to request an interrupt window, as KVM
530
* has usurped vmcb02's int_ctl. If an interrupt window opens before
531
* the next VM-Exit, svm_clear_vintr() will restore vmcb12's int_ctl.
532
* If no window opens, V_IRQ will be correctly preserved in vmcb12's
533
* int_ctl (because it was never recognized while L2 was running).
534
*/
535
if (svm_is_intercept(svm, INTERCEPT_VINTR) &&
536
!test_bit(INTERCEPT_VINTR, (unsigned long *)svm->nested.ctl.intercepts))
537
mask &= ~V_IRQ_MASK;
538
539
if (nested_vgif_enabled(svm))
540
mask |= V_GIF_MASK;
541
542
if (nested_vnmi_enabled(svm))
543
mask |= V_NMI_BLOCKING_MASK | V_NMI_PENDING_MASK;
544
545
svm->nested.ctl.int_ctl &= ~mask;
546
svm->nested.ctl.int_ctl |= svm->vmcb->control.int_ctl & mask;
547
}
548
549
/*
550
* Transfer any event that L0 or L1 wanted to inject into L2 to
551
* EXIT_INT_INFO.
552
*/
553
static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
554
struct vmcb *vmcb12)
555
{
556
struct kvm_vcpu *vcpu = &svm->vcpu;
557
u32 exit_int_info = 0;
558
unsigned int nr;
559
560
if (vcpu->arch.exception.injected) {
561
nr = vcpu->arch.exception.vector;
562
exit_int_info = nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT;
563
564
if (vcpu->arch.exception.has_error_code) {
565
exit_int_info |= SVM_EVTINJ_VALID_ERR;
566
vmcb12->control.exit_int_info_err =
567
vcpu->arch.exception.error_code;
568
}
569
570
} else if (vcpu->arch.nmi_injected) {
571
exit_int_info = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
572
573
} else if (vcpu->arch.interrupt.injected) {
574
nr = vcpu->arch.interrupt.nr;
575
exit_int_info = nr | SVM_EVTINJ_VALID;
576
577
if (vcpu->arch.interrupt.soft)
578
exit_int_info |= SVM_EVTINJ_TYPE_SOFT;
579
else
580
exit_int_info |= SVM_EVTINJ_TYPE_INTR;
581
}
582
583
vmcb12->control.exit_int_info = exit_int_info;
584
}
585
586
static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu)
587
{
588
/* Handle pending Hyper-V TLB flush requests */
589
kvm_hv_nested_transtion_tlb_flush(vcpu, npt_enabled);
590
591
/*
592
* TODO: optimize unconditional TLB flush/MMU sync. A partial list of
593
* things to fix before this can be conditional:
594
*
595
* - Flush TLBs for both L1 and L2 remote TLB flush
596
* - Honor L1's request to flush an ASID on nested VMRUN
597
* - Sync nested NPT MMU on VMRUN that flushes L2's ASID[*]
598
* - Don't crush a pending TLB flush in vmcb02 on nested VMRUN
599
* - Flush L1's ASID on KVM_REQ_TLB_FLUSH_GUEST
600
*
601
* [*] Unlike nested EPT, SVM's ASID management can invalidate nested
602
* NPT guest-physical mappings on VMRUN.
603
*/
604
kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
605
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
606
}
607
608
/*
609
* Load guest's/host's cr3 on nested vmentry or vmexit. @nested_npt is true
610
* if we are emulating VM-Entry into a guest with NPT enabled.
611
*/
612
static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
613
bool nested_npt, bool reload_pdptrs)
614
{
615
if (CC(!kvm_vcpu_is_legal_cr3(vcpu, cr3)))
616
return -EINVAL;
617
618
if (reload_pdptrs && !nested_npt && is_pae_paging(vcpu) &&
619
CC(!load_pdptrs(vcpu, cr3)))
620
return -EINVAL;
621
622
vcpu->arch.cr3 = cr3;
623
624
/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
625
kvm_init_mmu(vcpu);
626
627
if (!nested_npt)
628
kvm_mmu_new_pgd(vcpu, cr3);
629
630
return 0;
631
}
632
633
void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm)
634
{
635
if (!svm->nested.vmcb02.ptr)
636
return;
637
638
/* FIXME: merge g_pat from vmcb01 and vmcb12. */
639
svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat;
640
}
641
642
static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
643
{
644
bool new_vmcb12 = false;
645
struct vmcb *vmcb01 = svm->vmcb01.ptr;
646
struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
647
struct kvm_vcpu *vcpu = &svm->vcpu;
648
649
nested_vmcb02_compute_g_pat(svm);
650
vmcb_mark_dirty(vmcb02, VMCB_NPT);
651
652
/* Load the nested guest state */
653
if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) {
654
new_vmcb12 = true;
655
svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa;
656
svm->nested.force_msr_bitmap_recalc = true;
657
}
658
659
if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) {
660
vmcb02->save.es = vmcb12->save.es;
661
vmcb02->save.cs = vmcb12->save.cs;
662
vmcb02->save.ss = vmcb12->save.ss;
663
vmcb02->save.ds = vmcb12->save.ds;
664
vmcb02->save.cpl = vmcb12->save.cpl;
665
vmcb_mark_dirty(vmcb02, VMCB_SEG);
666
}
667
668
if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) {
669
vmcb02->save.gdtr = vmcb12->save.gdtr;
670
vmcb02->save.idtr = vmcb12->save.idtr;
671
vmcb_mark_dirty(vmcb02, VMCB_DT);
672
}
673
674
if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK) &&
675
(unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_CET)))) {
676
vmcb02->save.s_cet = vmcb12->save.s_cet;
677
vmcb02->save.isst_addr = vmcb12->save.isst_addr;
678
vmcb02->save.ssp = vmcb12->save.ssp;
679
vmcb_mark_dirty(vmcb02, VMCB_CET);
680
}
681
682
kvm_set_rflags(vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
683
684
svm_set_efer(vcpu, svm->nested.save.efer);
685
686
svm_set_cr0(vcpu, svm->nested.save.cr0);
687
svm_set_cr4(vcpu, svm->nested.save.cr4);
688
689
svm->vcpu.arch.cr2 = vmcb12->save.cr2;
690
691
kvm_rax_write(vcpu, vmcb12->save.rax);
692
kvm_rsp_write(vcpu, vmcb12->save.rsp);
693
kvm_rip_write(vcpu, vmcb12->save.rip);
694
695
/* In case we don't even reach vcpu_run, the fields are not updated */
696
vmcb02->save.rax = vmcb12->save.rax;
697
vmcb02->save.rsp = vmcb12->save.rsp;
698
vmcb02->save.rip = vmcb12->save.rip;
699
700
if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) {
701
vmcb02->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1;
702
svm->vcpu.arch.dr6 = svm->nested.save.dr6 | DR6_ACTIVE_LOW;
703
vmcb_mark_dirty(vmcb02, VMCB_DR);
704
}
705
706
if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
707
(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
708
/*
709
* Reserved bits of DEBUGCTL are ignored. Be consistent with
710
* svm_set_msr's definition of reserved bits.
711
*/
712
svm_copy_lbrs(vmcb02, vmcb12);
713
vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS;
714
} else {
715
svm_copy_lbrs(vmcb02, vmcb01);
716
}
717
svm_update_lbrv(&svm->vcpu);
718
}
719
720
static inline bool is_evtinj_soft(u32 evtinj)
721
{
722
u32 type = evtinj & SVM_EVTINJ_TYPE_MASK;
723
u8 vector = evtinj & SVM_EVTINJ_VEC_MASK;
724
725
if (!(evtinj & SVM_EVTINJ_VALID))
726
return false;
727
728
if (type == SVM_EVTINJ_TYPE_SOFT)
729
return true;
730
731
return type == SVM_EVTINJ_TYPE_EXEPT && kvm_exception_is_soft(vector);
732
}
733
734
static bool is_evtinj_nmi(u32 evtinj)
735
{
736
u32 type = evtinj & SVM_EVTINJ_TYPE_MASK;
737
738
if (!(evtinj & SVM_EVTINJ_VALID))
739
return false;
740
741
return type == SVM_EVTINJ_TYPE_NMI;
742
}
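/*
 * Both helpers above decode the EVENTINJ format: vector in bits 7:0, type
 * in bits 10:8, error-code-valid in bit 11 and valid in bit 31.  E.g. the
 * (hypothetical) value 0x80000b0e is a valid injection of vector 14 (#PF)
 * as a hardware exception with an error code, so is_evtinj_soft() and
 * is_evtinj_nmi() both return false for it.
 */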
743
744
static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
745
unsigned long vmcb12_rip,
746
unsigned long vmcb12_csbase)
747
{
748
u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK;
749
u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK;
750
751
struct kvm_vcpu *vcpu = &svm->vcpu;
752
struct vmcb *vmcb01 = svm->vmcb01.ptr;
753
struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
754
u32 pause_count12;
755
u32 pause_thresh12;
756
757
nested_svm_transition_tlb_flush(vcpu);
758
759
/* Enter Guest-Mode */
760
enter_guest_mode(vcpu);
761
762
/*
763
* Filled at exit: exit_code, exit_info_1, exit_info_2, exit_int_info,
764
* exit_int_info_err, next_rip, insn_len, insn_bytes.
765
*/
766
767
if (guest_cpu_cap_has(vcpu, X86_FEATURE_VGIF) &&
768
(svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK))
769
int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
770
else
771
int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
772
773
if (vnmi) {
774
if (vmcb01->control.int_ctl & V_NMI_PENDING_MASK) {
775
svm->vcpu.arch.nmi_pending++;
776
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
777
}
778
if (nested_vnmi_enabled(svm))
779
int_ctl_vmcb12_bits |= (V_NMI_PENDING_MASK |
780
V_NMI_ENABLE_MASK |
781
V_NMI_BLOCKING_MASK);
782
}
783
784
/* Copied from vmcb01. msrpm_base can be overwritten later. */
785
vmcb02->control.nested_ctl = vmcb01->control.nested_ctl;
786
vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
787
vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
788
vmcb_mark_dirty(vmcb02, VMCB_PERM_MAP);
789
790
/*
791
* Stash vmcb02's counter if the guest hasn't moved past the guilty
792
* instruction; otherwise, reset the counter to '0'.
793
*
794
* In order to detect if L2 has made forward progress or not, track the
795
* RIP at which a bus lock has occurred on a per-vmcb12 basis. If the RIP
796
* has changed, the guest has clearly made forward progress even though
797
* bus_lock_counter still reads '1', so reset bus_lock_counter to '0'. E.g. in
798
* the scenario where a bus lock happened in L1 before VMRUN, the bus lock
799
* firmly happened on an instruction in the past. Even if vmcb01's
800
* counter is still '1', (because the guilty instruction got patched),
801
* the vCPU has clearly made forward progress and so KVM should reset
802
* vmcb02's counter to '0'.
803
*
804
* If the RIP hasn't changed, stash the bus lock counter at nested VMRUN
805
* to prevent the same guilty instruction from triggering a VM-Exit. Eg.
806
* if userspace rate-limits the vCPU, then it's entirely possible that
807
* L1's tick interrupt is pending by the time userspace re-runs the
808
* vCPU. If KVM unconditionally clears the counter on VMRUN, then when
809
* L1 re-enters L2, the same instruction will trigger a VM-Exit and the
810
* entire cycle will start over.
811
*/
812
if (vmcb02->save.rip && (svm->nested.ctl.bus_lock_rip == vmcb02->save.rip))
813
vmcb02->control.bus_lock_counter = 1;
814
else
815
vmcb02->control.bus_lock_counter = 0;
816
817
/* Done at vmrun: asid. */
818
819
/* Also overwritten later if necessary. */
820
vmcb02->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
821
822
/* nested_cr3. */
823
if (nested_npt_enabled(svm))
824
nested_svm_init_mmu_context(vcpu);
825
826
vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
827
vcpu->arch.l1_tsc_offset,
828
svm->nested.ctl.tsc_offset,
829
svm->tsc_ratio_msr);
830
831
vmcb02->control.tsc_offset = vcpu->arch.tsc_offset;
832
833
if (guest_cpu_cap_has(vcpu, X86_FEATURE_TSCRATEMSR) &&
834
svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio)
835
nested_svm_update_tsc_ratio_msr(vcpu);
836
837
vmcb02->control.int_ctl =
838
(svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) |
839
(vmcb01->control.int_ctl & int_ctl_vmcb01_bits);
840
841
vmcb02->control.int_vector = svm->nested.ctl.int_vector;
842
vmcb02->control.int_state = svm->nested.ctl.int_state;
843
vmcb02->control.event_inj = svm->nested.ctl.event_inj;
844
vmcb02->control.event_inj_err = svm->nested.ctl.event_inj_err;
845
846
/*
847
* next_rip is consumed on VMRUN as the return address pushed on the
848
* stack for injected soft exceptions/interrupts. If nrips is exposed
849
* to L1, take it verbatim from vmcb12. If nrips is supported in
850
* hardware but not exposed to L1, stuff the actual L2 RIP to emulate
851
* what a nrips=0 CPU would do (L1 is responsible for advancing RIP
852
* prior to injecting the event).
853
*/
854
if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
855
vmcb02->control.next_rip = svm->nested.ctl.next_rip;
856
else if (boot_cpu_has(X86_FEATURE_NRIPS))
857
vmcb02->control.next_rip = vmcb12_rip;
858
859
svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj);
860
if (is_evtinj_soft(vmcb02->control.event_inj)) {
861
svm->soft_int_injected = true;
862
svm->soft_int_csbase = vmcb12_csbase;
863
svm->soft_int_old_rip = vmcb12_rip;
864
if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
865
svm->soft_int_next_rip = svm->nested.ctl.next_rip;
866
else
867
svm->soft_int_next_rip = vmcb12_rip;
868
}
869
870
/* LBR_CTL_ENABLE_MASK is controlled by svm_update_lbrv() */
871
872
if (!nested_vmcb_needs_vls_intercept(svm))
873
vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
874
875
if (guest_cpu_cap_has(vcpu, X86_FEATURE_PAUSEFILTER))
876
pause_count12 = svm->nested.ctl.pause_filter_count;
877
else
878
pause_count12 = 0;
879
if (guest_cpu_cap_has(vcpu, X86_FEATURE_PFTHRESHOLD))
880
pause_thresh12 = svm->nested.ctl.pause_filter_thresh;
881
else
882
pause_thresh12 = 0;
883
if (kvm_pause_in_guest(svm->vcpu.kvm)) {
884
/* use guest values since host doesn't intercept PAUSE */
885
vmcb02->control.pause_filter_count = pause_count12;
886
vmcb02->control.pause_filter_thresh = pause_thresh12;
887
888
} else {
889
/* start from host values otherwise */
890
vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count;
891
vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh;
892
893
/* ... but ensure filtering is disabled if so requested. */
894
if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) {
895
if (!pause_count12)
896
vmcb02->control.pause_filter_count = 0;
897
if (!pause_thresh12)
898
vmcb02->control.pause_filter_thresh = 0;
899
}
900
}
901
902
/*
903
* Take ALLOW_LARGER_RAP from vmcb12 even though it should be safe to
904
* let L2 use a larger RAP since KVM will emulate the necessary clears,
905
* as it's possible L1 deliberately wants to restrict L2 to the legacy
906
* RAP size. Unconditionally clear the RAP on nested VMRUN, as KVM is
907
* responsible for emulating the host vs. guest tags (L1 is the "host",
908
* L2 is the "guest").
909
*/
910
if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS))
911
vmcb02->control.erap_ctl = (svm->nested.ctl.erap_ctl &
912
ERAP_CONTROL_ALLOW_LARGER_RAP) |
913
ERAP_CONTROL_CLEAR_RAP;
914
915
/*
916
* Merge guest and host intercepts - must be called with vcpu in
917
* guest-mode to take effect.
918
*/
919
recalc_intercepts(svm);
920
}
921
922
static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
923
{
924
/*
925
* Some VMCB state is shared between L1 and L2 and thus has to be
926
* moved at the time of nested vmrun and vmexit.
927
*
928
* VMLOAD/VMSAVE state would also belong in this category, but KVM
929
* always performs VMLOAD and VMSAVE from the VMCB01.
930
*/
931
to_vmcb->save.spec_ctrl = from_vmcb->save.spec_ctrl;
932
}
933
934
int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
935
struct vmcb *vmcb12, bool from_vmrun)
936
{
937
struct vcpu_svm *svm = to_svm(vcpu);
938
int ret;
939
940
trace_kvm_nested_vmenter(svm->vmcb->save.rip,
941
vmcb12_gpa,
942
vmcb12->save.rip,
943
vmcb12->control.int_ctl,
944
vmcb12->control.event_inj,
945
vmcb12->control.nested_ctl,
946
vmcb12->control.nested_cr3,
947
vmcb12->save.cr3,
948
KVM_ISA_SVM);
949
950
trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff,
951
vmcb12->control.intercepts[INTERCEPT_CR] >> 16,
952
vmcb12->control.intercepts[INTERCEPT_EXCEPTION],
953
vmcb12->control.intercepts[INTERCEPT_WORD3],
954
vmcb12->control.intercepts[INTERCEPT_WORD4],
955
vmcb12->control.intercepts[INTERCEPT_WORD5]);
956
957
958
svm->nested.vmcb12_gpa = vmcb12_gpa;
959
960
WARN_ON(svm->vmcb == svm->nested.vmcb02.ptr);
961
962
nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr);
963
964
svm_switch_vmcb(svm, &svm->nested.vmcb02);
965
nested_vmcb02_prepare_control(svm, vmcb12->save.rip, vmcb12->save.cs.base);
966
nested_vmcb02_prepare_save(svm, vmcb12);
967
968
ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3,
969
nested_npt_enabled(svm), from_vmrun);
970
if (ret)
971
return ret;
972
973
if (!from_vmrun)
974
kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
975
976
svm_set_gif(svm, true);
977
978
if (kvm_vcpu_apicv_active(vcpu))
979
kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
980
981
nested_svm_hv_update_vm_vp_ids(vcpu);
982
983
return 0;
984
}
985
986
int nested_svm_vmrun(struct kvm_vcpu *vcpu)
987
{
988
struct vcpu_svm *svm = to_svm(vcpu);
989
int ret;
990
struct vmcb *vmcb12;
991
struct kvm_host_map map;
992
u64 vmcb12_gpa;
993
struct vmcb *vmcb01 = svm->vmcb01.ptr;
994
995
if (!svm->nested.hsave_msr) {
996
kvm_inject_gp(vcpu, 0);
997
return 1;
998
}
999
1000
if (is_smm(vcpu)) {
1001
kvm_queue_exception(vcpu, UD_VECTOR);
1002
return 1;
1003
}
1004
1005
/* This fails when VP assist page is enabled but the supplied GPA is bogus */
1006
ret = kvm_hv_verify_vp_assist(vcpu);
1007
if (ret) {
1008
kvm_inject_gp(vcpu, 0);
1009
return ret;
1010
}
1011
1012
vmcb12_gpa = svm->vmcb->save.rax;
1013
ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map);
1014
if (ret == -EINVAL) {
1015
kvm_inject_gp(vcpu, 0);
1016
return 1;
1017
} else if (ret) {
1018
return kvm_skip_emulated_instruction(vcpu);
1019
}
1020
1021
ret = kvm_skip_emulated_instruction(vcpu);
1022
1023
vmcb12 = map.hva;
1024
1025
if (WARN_ON_ONCE(!svm->nested.initialized))
1026
return -EINVAL;
1027
1028
nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
1029
nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
1030
1031
if (!nested_vmcb_check_save(vcpu) ||
1032
!nested_vmcb_check_controls(vcpu)) {
1033
vmcb12->control.exit_code = SVM_EXIT_ERR;
1034
vmcb12->control.exit_info_1 = 0;
1035
vmcb12->control.exit_info_2 = 0;
1036
goto out;
1037
}
1038
1039
/*
1040
* Since vmcb01 is not in use, we can use it to store some of the L1
1041
* state.
1042
*/
1043
vmcb01->save.efer = vcpu->arch.efer;
1044
vmcb01->save.cr0 = kvm_read_cr0(vcpu);
1045
vmcb01->save.cr4 = vcpu->arch.cr4;
1046
vmcb01->save.rflags = kvm_get_rflags(vcpu);
1047
vmcb01->save.rip = kvm_rip_read(vcpu);
1048
1049
if (!npt_enabled)
1050
vmcb01->save.cr3 = kvm_read_cr3(vcpu);
1051
1052
svm->nested.nested_run_pending = 1;
1053
1054
if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, true))
1055
goto out_exit_err;
1056
1057
if (nested_svm_merge_msrpm(vcpu))
1058
goto out;
1059
1060
out_exit_err:
1061
svm->nested.nested_run_pending = 0;
1062
svm->nmi_l1_to_l2 = false;
1063
svm->soft_int_injected = false;
1064
1065
svm->vmcb->control.exit_code = SVM_EXIT_ERR;
1066
svm->vmcb->control.exit_info_1 = 0;
1067
svm->vmcb->control.exit_info_2 = 0;
1068
1069
nested_svm_vmexit(svm);
1070
1071
out:
1072
kvm_vcpu_unmap(vcpu, &map);
1073
1074
return ret;
1075
}
1076
1077
/* Copy state save area fields which are handled by VMRUN */
1078
void svm_copy_vmrun_state(struct vmcb_save_area *to_save,
1079
struct vmcb_save_area *from_save)
1080
{
1081
to_save->es = from_save->es;
1082
to_save->cs = from_save->cs;
1083
to_save->ss = from_save->ss;
1084
to_save->ds = from_save->ds;
1085
to_save->gdtr = from_save->gdtr;
1086
to_save->idtr = from_save->idtr;
1087
to_save->rflags = from_save->rflags | X86_EFLAGS_FIXED;
1088
to_save->efer = from_save->efer;
1089
to_save->cr0 = from_save->cr0;
1090
to_save->cr3 = from_save->cr3;
1091
to_save->cr4 = from_save->cr4;
1092
to_save->rax = from_save->rax;
1093
to_save->rsp = from_save->rsp;
1094
to_save->rip = from_save->rip;
1095
to_save->cpl = 0;
1096
1097
if (kvm_cpu_cap_has(X86_FEATURE_SHSTK)) {
1098
to_save->s_cet = from_save->s_cet;
1099
to_save->isst_addr = from_save->isst_addr;
1100
to_save->ssp = from_save->ssp;
1101
}
1102
}
1103
1104
void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
1105
{
1106
to_vmcb->save.fs = from_vmcb->save.fs;
1107
to_vmcb->save.gs = from_vmcb->save.gs;
1108
to_vmcb->save.tr = from_vmcb->save.tr;
1109
to_vmcb->save.ldtr = from_vmcb->save.ldtr;
1110
to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
1111
to_vmcb->save.star = from_vmcb->save.star;
1112
to_vmcb->save.lstar = from_vmcb->save.lstar;
1113
to_vmcb->save.cstar = from_vmcb->save.cstar;
1114
to_vmcb->save.sfmask = from_vmcb->save.sfmask;
1115
to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
1116
to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
1117
to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
1118
}
1119
1120
int nested_svm_vmexit(struct vcpu_svm *svm)
1121
{
1122
struct kvm_vcpu *vcpu = &svm->vcpu;
1123
struct vmcb *vmcb01 = svm->vmcb01.ptr;
1124
struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
1125
struct vmcb *vmcb12;
1126
struct kvm_host_map map;
1127
int rc;
1128
1129
rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
1130
if (rc) {
1131
if (rc == -EINVAL)
1132
kvm_inject_gp(vcpu, 0);
1133
return 1;
1134
}
1135
1136
vmcb12 = map.hva;
1137
1138
/* Exit Guest-Mode */
1139
leave_guest_mode(vcpu);
1140
svm->nested.vmcb12_gpa = 0;
1141
WARN_ON_ONCE(svm->nested.nested_run_pending);
1142
1143
kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
1144
1145
/* in case we halted in L2 */
1146
kvm_set_mp_state(vcpu, KVM_MP_STATE_RUNNABLE);
1147
1148
/* Give the current vmcb to the guest */
1149
1150
vmcb12->save.es = vmcb02->save.es;
1151
vmcb12->save.cs = vmcb02->save.cs;
1152
vmcb12->save.ss = vmcb02->save.ss;
1153
vmcb12->save.ds = vmcb02->save.ds;
1154
vmcb12->save.gdtr = vmcb02->save.gdtr;
1155
vmcb12->save.idtr = vmcb02->save.idtr;
1156
vmcb12->save.efer = svm->vcpu.arch.efer;
1157
vmcb12->save.cr0 = kvm_read_cr0(vcpu);
1158
vmcb12->save.cr3 = kvm_read_cr3(vcpu);
1159
vmcb12->save.cr2 = vmcb02->save.cr2;
1160
vmcb12->save.cr4 = svm->vcpu.arch.cr4;
1161
vmcb12->save.rflags = kvm_get_rflags(vcpu);
1162
vmcb12->save.rip = kvm_rip_read(vcpu);
1163
vmcb12->save.rsp = kvm_rsp_read(vcpu);
1164
vmcb12->save.rax = kvm_rax_read(vcpu);
1165
vmcb12->save.dr7 = vmcb02->save.dr7;
1166
vmcb12->save.dr6 = svm->vcpu.arch.dr6;
1167
vmcb12->save.cpl = vmcb02->save.cpl;
1168
1169
if (guest_cpu_cap_has(vcpu, X86_FEATURE_SHSTK)) {
1170
vmcb12->save.s_cet = vmcb02->save.s_cet;
1171
vmcb12->save.isst_addr = vmcb02->save.isst_addr;
1172
vmcb12->save.ssp = vmcb02->save.ssp;
1173
}
1174
1175
vmcb12->control.int_state = vmcb02->control.int_state;
1176
vmcb12->control.exit_code = vmcb02->control.exit_code;
1177
vmcb12->control.exit_info_1 = vmcb02->control.exit_info_1;
1178
vmcb12->control.exit_info_2 = vmcb02->control.exit_info_2;
1179
1180
if (!svm_is_vmrun_failure(vmcb12->control.exit_code))
1181
nested_save_pending_event_to_vmcb12(svm, vmcb12);
1182
1183
if (guest_cpu_cap_has(vcpu, X86_FEATURE_NRIPS))
1184
vmcb12->control.next_rip = vmcb02->control.next_rip;
1185
1186
vmcb12->control.int_ctl = svm->nested.ctl.int_ctl;
1187
vmcb12->control.event_inj = svm->nested.ctl.event_inj;
1188
vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err;
1189
1190
if (!kvm_pause_in_guest(vcpu->kvm)) {
1191
vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count;
1192
vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
1193
1194
}
1195
1196
/*
1197
* Invalidate bus_lock_rip unless KVM is still waiting for the guest
1198
* to make forward progress before re-enabling bus lock detection.
1199
*/
1200
if (!vmcb02->control.bus_lock_counter)
1201
svm->nested.ctl.bus_lock_rip = INVALID_GPA;
1202
1203
nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
1204
1205
kvm_nested_vmexit_handle_ibrs(vcpu);
1206
1207
if (guest_cpu_cap_has(vcpu, X86_FEATURE_ERAPS))
1208
vmcb01->control.erap_ctl |= ERAP_CONTROL_CLEAR_RAP;
1209
1210
svm_switch_vmcb(svm, &svm->vmcb01);
1211
1212
/*
1213
* Rules for synchronizing int_ctl bits from vmcb02 to vmcb01:
1214
*
1215
* V_IRQ, V_IRQ_VECTOR, V_INTR_PRIO_MASK, V_IGN_TPR: If L1 doesn't
1216
* intercept interrupts, then KVM will use vmcb02's V_IRQ (and related
1217
* flags) to detect interrupt windows for L1 IRQs (even if L1 uses
1218
* virtual interrupt masking). Raise KVM_REQ_EVENT to ensure that
1219
* KVM re-requests an interrupt window if necessary, which implicitly
1220
* copies these bits from vmcb02 to vmcb01.
1221
*
1222
* V_TPR: If L1 doesn't use virtual interrupt masking, then L1's vTPR
1223
* is stored in vmcb02, but its value doesn't need to be copied from/to
1224
* vmcb01 because it is copied from/to the virtual APIC's TPR register
1225
* on each VM entry/exit.
1226
*
1227
* V_GIF: If nested vGIF is not used, KVM uses vmcb02's V_GIF for L1's
1228
* V_GIF. However, GIF is architecturally clear on each VM exit, thus
1229
* there is no need to copy V_GIF from vmcb02 to vmcb01.
1230
*/
1231
if (!nested_exit_on_intr(svm))
1232
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
1233
1234
if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
1235
(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)))
1236
svm_copy_lbrs(vmcb12, vmcb02);
1237
else
1238
svm_copy_lbrs(vmcb01, vmcb02);
1239
1240
svm_update_lbrv(vcpu);
1241
1242
if (vnmi) {
1243
if (vmcb02->control.int_ctl & V_NMI_BLOCKING_MASK)
1244
vmcb01->control.int_ctl |= V_NMI_BLOCKING_MASK;
1245
else
1246
vmcb01->control.int_ctl &= ~V_NMI_BLOCKING_MASK;
1247
1248
if (vcpu->arch.nmi_pending) {
1249
vcpu->arch.nmi_pending--;
1250
vmcb01->control.int_ctl |= V_NMI_PENDING_MASK;
1251
} else {
1252
vmcb01->control.int_ctl &= ~V_NMI_PENDING_MASK;
1253
}
1254
}
1255
1256
/*
1257
* On vmexit the GIF is set to false and
1258
* no event can be injected in L1.
1259
*/
1260
svm_set_gif(svm, false);
1261
vmcb01->control.exit_int_info = 0;
1262
1263
svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset;
1264
if (vmcb01->control.tsc_offset != svm->vcpu.arch.tsc_offset) {
1265
vmcb01->control.tsc_offset = svm->vcpu.arch.tsc_offset;
1266
vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
1267
}
1268
1269
if (kvm_caps.has_tsc_control &&
1270
vcpu->arch.tsc_scaling_ratio != vcpu->arch.l1_tsc_scaling_ratio) {
1271
vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
1272
svm_write_tsc_multiplier(vcpu);
1273
}
1274
1275
svm->nested.ctl.nested_cr3 = 0;
1276
1277
/*
1278
* Restore processor state that had been saved in vmcb01
1279
*/
1280
kvm_set_rflags(vcpu, vmcb01->save.rflags);
1281
svm_set_efer(vcpu, vmcb01->save.efer);
1282
svm_set_cr0(vcpu, vmcb01->save.cr0 | X86_CR0_PE);
1283
svm_set_cr4(vcpu, vmcb01->save.cr4);
1284
kvm_rax_write(vcpu, vmcb01->save.rax);
1285
kvm_rsp_write(vcpu, vmcb01->save.rsp);
1286
kvm_rip_write(vcpu, vmcb01->save.rip);
1287
1288
svm->vcpu.arch.dr7 = DR7_FIXED_1;
1289
kvm_update_dr7(&svm->vcpu);
1290
1291
trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
1292
vmcb12->control.exit_info_1,
1293
vmcb12->control.exit_info_2,
1294
vmcb12->control.exit_int_info,
1295
vmcb12->control.exit_int_info_err,
1296
KVM_ISA_SVM);
1297
1298
kvm_vcpu_unmap(vcpu, &map);
1299
1300
nested_svm_transition_tlb_flush(vcpu);
1301
1302
nested_svm_uninit_mmu_context(vcpu);
1303
1304
rc = nested_svm_load_cr3(vcpu, vmcb01->save.cr3, false, true);
1305
if (rc)
1306
return 1;
1307
1308
/*
1309
* Drop what we picked up for L2 via svm_complete_interrupts() so it
1310
* doesn't end up in L1.
1311
*/
1312
svm->vcpu.arch.nmi_injected = false;
1313
kvm_clear_exception_queue(vcpu);
1314
kvm_clear_interrupt_queue(vcpu);
1315
1316
/*
1317
* If we are here following the completion of a VMRUN that
1318
* is being single-stepped, queue the pending #DB intercept
1319
* right now so that it can be accounted for before we execute
1320
* L1's next instruction.
1321
*/
1322
if (unlikely(vmcb01->save.rflags & X86_EFLAGS_TF))
1323
kvm_queue_exception(&(svm->vcpu), DB_VECTOR);
1324
1325
/*
1326
* Un-inhibit the AVIC right away, so that other vCPUs can start
1327
* to benefit from it without delay.
1328
*/
1329
if (kvm_apicv_activated(vcpu->kvm))
1330
__kvm_vcpu_update_apicv(vcpu);
1331
1332
return 0;
1333
}
1334
1335
static void nested_svm_triple_fault(struct kvm_vcpu *vcpu)
1336
{
1337
struct vcpu_svm *svm = to_svm(vcpu);
1338
1339
if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SHUTDOWN))
1340
return;
1341
1342
kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
1343
nested_svm_simple_vmexit(to_svm(vcpu), SVM_EXIT_SHUTDOWN);
1344
}
1345
1346
int svm_allocate_nested(struct vcpu_svm *svm)
1347
{
1348
struct page *vmcb02_page;
1349
1350
if (svm->nested.initialized)
1351
return 0;
1352
1353
vmcb02_page = snp_safe_alloc_page();
1354
if (!vmcb02_page)
1355
return -ENOMEM;
1356
svm->nested.vmcb02.ptr = page_address(vmcb02_page);
1357
svm->nested.vmcb02.pa = __sme_set(page_to_pfn(vmcb02_page) << PAGE_SHIFT);
1358
1359
svm->nested.msrpm = svm_vcpu_alloc_msrpm();
1360
if (!svm->nested.msrpm)
1361
goto err_free_vmcb02;
1362
1363
svm->nested.initialized = true;
1364
return 0;
1365
1366
err_free_vmcb02:
1367
__free_page(vmcb02_page);
1368
return -ENOMEM;
1369
}
1370
1371
void svm_free_nested(struct vcpu_svm *svm)
1372
{
1373
if (!svm->nested.initialized)
1374
return;
1375
1376
if (WARN_ON_ONCE(svm->vmcb != svm->vmcb01.ptr))
1377
svm_switch_vmcb(svm, &svm->vmcb01);
1378
1379
svm_vcpu_free_msrpm(svm->nested.msrpm);
1380
svm->nested.msrpm = NULL;
1381
1382
__free_page(virt_to_page(svm->nested.vmcb02.ptr));
1383
svm->nested.vmcb02.ptr = NULL;
1384
1385
/*
1386
* When last_vmcb12_gpa matches the current vmcb12 gpa,
1387
* some vmcb12 fields are not loaded if they are marked clean
1388
* in the vmcb12, since in this case they are up to date already.
1389
*
1390
* When the vmcb02 is freed, this optimization becomes invalid.
1391
*/
1392
svm->nested.last_vmcb12_gpa = INVALID_GPA;
1393
1394
svm->nested.initialized = false;
1395
}
1396
1397
void svm_leave_nested(struct kvm_vcpu *vcpu)
1398
{
1399
struct vcpu_svm *svm = to_svm(vcpu);
1400
1401
if (is_guest_mode(vcpu)) {
1402
svm->nested.nested_run_pending = 0;
1403
svm->nested.vmcb12_gpa = INVALID_GPA;
1404
1405
leave_guest_mode(vcpu);
1406
1407
svm_switch_vmcb(svm, &svm->vmcb01);
1408
1409
nested_svm_uninit_mmu_context(vcpu);
1410
vmcb_mark_all_dirty(svm->vmcb);
1411
1412
svm_set_gif(svm, true);
1413
1414
if (kvm_apicv_activated(vcpu->kvm))
1415
kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
1416
}
1417
1418
kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
1419
}
1420
1421
static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
1422
{
1423
gpa_t base = svm->nested.ctl.msrpm_base_pa;
1424
int write, bit_nr;
1425
u8 value, mask;
1426
u32 msr;
1427
1428
if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
1429
return NESTED_EXIT_HOST;
1430
1431
msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1432
bit_nr = svm_msrpm_bit_nr(msr);
1433
write = svm->vmcb->control.exit_info_1 & 1;
1434
1435
if (bit_nr < 0)
1436
return NESTED_EXIT_DONE;
1437
1438
if (kvm_vcpu_read_guest(&svm->vcpu, base + bit_nr / BITS_PER_BYTE,
1439
&value, sizeof(value)))
1440
return NESTED_EXIT_DONE;
1441
1442
mask = BIT(write) << (bit_nr & (BITS_PER_BYTE - 1));
1443
return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
1444
}
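/*
 * Worked example (hypothetical numbers): each MSR has a read bit at
 * bit_nr and a write bit at bit_nr + 1.  For a WRMSR exit with
 * bit_nr = 16642, the byte fetched from L1's bitmap sits at offset
 * 16642 / 8 = 2080 and the mask is BIT(1) << (16642 & 7) = 0x08, i.e.
 * the write bit of that MSR within the byte.
 */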
1445
1446
static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
1447
{
1448
unsigned port, size, iopm_len;
1449
u16 val, mask;
1450
u8 start_bit;
1451
u64 gpa;
1452
1453
if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT)))
1454
return NESTED_EXIT_HOST;
1455
1456
port = svm->vmcb->control.exit_info_1 >> 16;
1457
size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
1458
SVM_IOIO_SIZE_SHIFT;
1459
gpa = svm->nested.ctl.iopm_base_pa + (port / 8);
1460
start_bit = port % 8;
1461
iopm_len = (start_bit + size > 8) ? 2 : 1;
1462
mask = (0xf >> (4 - size)) << start_bit;
1463
val = 0;
1464
1465
if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
1466
return NESTED_EXIT_DONE;
1467
1468
return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
1469
}
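/*
 * Worked example (hypothetical port): an OUT to port 0x3f8 with size 2
 * checks L1's IOPM at byte 0x3f8 / 8 = 0x7f with start_bit = 0 and
 * mask = (0xf >> (4 - 2)) << 0 = 0x3; because start_bit + size <= 8,
 * only one byte (iopm_len = 1) needs to be read from guest memory.
 */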
1470
1471
static int nested_svm_intercept(struct vcpu_svm *svm)
1472
{
1473
u64 exit_code = svm->vmcb->control.exit_code;
1474
int vmexit = NESTED_EXIT_HOST;
1475
1476
if (svm_is_vmrun_failure(exit_code))
1477
return NESTED_EXIT_DONE;
1478
1479
switch (exit_code) {
1480
case SVM_EXIT_MSR:
1481
vmexit = nested_svm_exit_handled_msr(svm);
1482
break;
1483
case SVM_EXIT_IOIO:
1484
vmexit = nested_svm_intercept_ioio(svm);
1485
break;
1486
case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f:
1487
/*
1488
* Host-intercepted exceptions have been checked already in
1489
* nested_svm_exit_special. There is nothing to do here;
1490
* the vmexit is injected by svm_check_nested_events.
1491
*/
1492
vmexit = NESTED_EXIT_DONE;
1493
break;
1494
default:
1495
if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
1496
vmexit = NESTED_EXIT_DONE;
1497
break;
1498
}
1499
1500
return vmexit;
1501
}
1502
1503
int nested_svm_exit_handled(struct vcpu_svm *svm)
1504
{
1505
int vmexit;
1506
1507
vmexit = nested_svm_intercept(svm);
1508
1509
if (vmexit == NESTED_EXIT_DONE)
1510
nested_svm_vmexit(svm);
1511
1512
return vmexit;
1513
}
1514
1515
int nested_svm_check_permissions(struct kvm_vcpu *vcpu)
1516
{
1517
if (!(vcpu->arch.efer & EFER_SVME) || !is_paging(vcpu)) {
1518
kvm_queue_exception(vcpu, UD_VECTOR);
1519
return 1;
1520
}
1521
1522
if (to_svm(vcpu)->vmcb->save.cpl) {
1523
kvm_inject_gp(vcpu, 0);
1524
return 1;
1525
}
1526
1527
return 0;
1528
}
1529
1530
static bool nested_svm_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
1531
u32 error_code)
1532
{
1533
struct vcpu_svm *svm = to_svm(vcpu);
1534
1535
return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(vector));
1536
}
1537
1538
static void nested_svm_inject_exception_vmexit(struct kvm_vcpu *vcpu)
1539
{
1540
struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
1541
struct vcpu_svm *svm = to_svm(vcpu);
1542
struct vmcb *vmcb = svm->vmcb;
1543
1544
vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + ex->vector;
1545
1546
if (ex->has_error_code)
1547
vmcb->control.exit_info_1 = ex->error_code;
1548
1549
/*
1550
* EXITINFO2 is undefined for all exception intercepts other
1551
* than #PF.
1552
*/
1553
if (ex->vector == PF_VECTOR) {
1554
if (ex->has_payload)
1555
vmcb->control.exit_info_2 = ex->payload;
1556
else
1557
vmcb->control.exit_info_2 = vcpu->arch.cr2;
1558
} else if (ex->vector == DB_VECTOR) {
1559
/* See kvm_check_and_inject_events(). */
1560
kvm_deliver_exception_payload(vcpu, ex);
1561
1562
if (vcpu->arch.dr7 & DR7_GD) {
1563
vcpu->arch.dr7 &= ~DR7_GD;
1564
kvm_update_dr7(vcpu);
1565
}
1566
} else {
1567
WARN_ON(ex->has_payload);
1568
}
1569
1570
nested_svm_vmexit(svm);
1571
}
1572
1573
static inline bool nested_exit_on_init(struct vcpu_svm *svm)
1574
{
1575
return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
1576
}
1577
1578
static int svm_check_nested_events(struct kvm_vcpu *vcpu)
1579
{
1580
struct kvm_lapic *apic = vcpu->arch.apic;
1581
struct vcpu_svm *svm = to_svm(vcpu);
1582
/*
1583
* Only a pending nested run blocks a pending exception. If there is a
1584
* previously injected event, the pending exception occurred while said
1585
* event was being delivered and thus needs to be handled.
1586
*/
1587
bool block_nested_exceptions = svm->nested.nested_run_pending;
1588
/*
1589
* New events (not exceptions) are only recognized at instruction
1590
* boundaries. If an event needs reinjection, then KVM is handling a
1591
* VM-Exit that occurred _during_ instruction execution; new events are
1592
* blocked until the instruction completes.
1593
*/
1594
bool block_nested_events = block_nested_exceptions ||
1595
kvm_event_needs_reinjection(vcpu);
1596
1597
if (lapic_in_kernel(vcpu) &&
1598
test_bit(KVM_APIC_INIT, &apic->pending_events)) {
1599
if (block_nested_events)
1600
return -EBUSY;
1601
if (!nested_exit_on_init(svm))
1602
return 0;
1603
nested_svm_simple_vmexit(svm, SVM_EXIT_INIT);
1604
return 0;
1605
}
1606
1607
if (vcpu->arch.exception_vmexit.pending) {
1608
if (block_nested_exceptions)
1609
return -EBUSY;
1610
nested_svm_inject_exception_vmexit(vcpu);
1611
return 0;
1612
}
1613
1614
if (vcpu->arch.exception.pending) {
1615
if (block_nested_exceptions)
1616
return -EBUSY;
1617
return 0;
1618
}
1619
1620
#ifdef CONFIG_KVM_SMM
1621
if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) {
1622
if (block_nested_events)
1623
return -EBUSY;
1624
if (!nested_exit_on_smi(svm))
1625
return 0;
1626
nested_svm_simple_vmexit(svm, SVM_EXIT_SMI);
1627
return 0;
1628
}
1629
#endif
1630
1631
if (vcpu->arch.nmi_pending && !svm_nmi_blocked(vcpu)) {
1632
if (block_nested_events)
1633
return -EBUSY;
1634
if (!nested_exit_on_nmi(svm))
1635
return 0;
1636
nested_svm_simple_vmexit(svm, SVM_EXIT_NMI);
1637
return 0;
1638
}
1639
1640
if (kvm_cpu_has_interrupt(vcpu) && !svm_interrupt_blocked(vcpu)) {
1641
if (block_nested_events)
1642
return -EBUSY;
1643
if (!nested_exit_on_intr(svm))
1644
return 0;
1645
trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
1646
nested_svm_simple_vmexit(svm, SVM_EXIT_INTR);
1647
return 0;
1648
}
1649
1650
return 0;
1651
}
1652
1653
int nested_svm_exit_special(struct vcpu_svm *svm)
1654
{
1655
u32 exit_code = svm->vmcb->control.exit_code;
1656
struct kvm_vcpu *vcpu = &svm->vcpu;
1657
1658
switch (exit_code) {
1659
case SVM_EXIT_INTR:
1660
case SVM_EXIT_NMI:
1661
case SVM_EXIT_NPF:
1662
return NESTED_EXIT_HOST;
1663
case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
1664
u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
1665
1666
if (svm->vmcb01.ptr->control.intercepts[INTERCEPT_EXCEPTION] &
1667
excp_bits)
1668
return NESTED_EXIT_HOST;
1669
else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR &&
1670
svm->vcpu.arch.apf.host_apf_flags)
1671
/* Trap async PF even if not shadowing */
1672
return NESTED_EXIT_HOST;
1673
break;
1674
}
1675
case SVM_EXIT_VMMCALL:
1676
/* Hyper-V L2 TLB flush hypercall is handled by L0 */
1677
if (guest_hv_cpuid_has_l2_tlb_flush(vcpu) &&
1678
nested_svm_l2_tlb_flush_enabled(vcpu) &&
1679
kvm_hv_is_tlb_flush_hcall(vcpu))
1680
return NESTED_EXIT_HOST;
1681
break;
1682
default:
1683
break;
1684
}
1685
1686
return NESTED_EXIT_CONTINUE;
1687
}
1688
1689
void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu)
1690
{
1691
struct vcpu_svm *svm = to_svm(vcpu);
1692
1693
vcpu->arch.tsc_scaling_ratio =
1694
kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio,
1695
svm->tsc_ratio_msr);
1696
svm_write_tsc_multiplier(vcpu);
1697
}
1698
1699
/* Inverse operation of nested_copy_vmcb_control_to_cache(). asid is copied too. */
1700
static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst,
1701
struct vmcb_ctrl_area_cached *from)
1702
{
1703
unsigned int i;
1704
1705
memset(dst, 0, sizeof(struct vmcb_control_area));
1706
1707
for (i = 0; i < MAX_INTERCEPT; i++)
1708
dst->intercepts[i] = from->intercepts[i];
1709
1710
dst->iopm_base_pa = from->iopm_base_pa;
1711
dst->msrpm_base_pa = from->msrpm_base_pa;
1712
dst->tsc_offset = from->tsc_offset;
1713
dst->asid = from->asid;
1714
dst->tlb_ctl = from->tlb_ctl;
1715
dst->erap_ctl = from->erap_ctl;
1716
dst->int_ctl = from->int_ctl;
1717
dst->int_vector = from->int_vector;
1718
dst->int_state = from->int_state;
1719
dst->exit_code = from->exit_code;
1720
dst->exit_info_1 = from->exit_info_1;
1721
dst->exit_info_2 = from->exit_info_2;
1722
dst->exit_int_info = from->exit_int_info;
1723
dst->exit_int_info_err = from->exit_int_info_err;
1724
dst->nested_ctl = from->nested_ctl;
1725
dst->event_inj = from->event_inj;
1726
dst->event_inj_err = from->event_inj_err;
1727
dst->next_rip = from->next_rip;
1728
dst->nested_cr3 = from->nested_cr3;
1729
dst->virt_ext = from->virt_ext;
1730
dst->pause_filter_count = from->pause_filter_count;
1731
dst->pause_filter_thresh = from->pause_filter_thresh;
1732
/* 'clean' and 'hv_enlightenments' are not changed by KVM */
1733
}
1734
1735
static int svm_get_nested_state(struct kvm_vcpu *vcpu,
1736
struct kvm_nested_state __user *user_kvm_nested_state,
1737
u32 user_data_size)
1738
{
1739
struct vcpu_svm *svm;
1740
struct vmcb_control_area *ctl;
1741
unsigned long r;
1742
struct kvm_nested_state kvm_state = {
1743
.flags = 0,
1744
.format = KVM_STATE_NESTED_FORMAT_SVM,
1745
.size = sizeof(kvm_state),
1746
};
1747
struct vmcb __user *user_vmcb = (struct vmcb __user *)
1748
&user_kvm_nested_state->data.svm[0];
1749
1750
if (!vcpu)
1751
return kvm_state.size + KVM_STATE_NESTED_SVM_VMCB_SIZE;
1752
1753
svm = to_svm(vcpu);
1754
1755
if (user_data_size < kvm_state.size)
1756
goto out;
1757
1758
/* First fill in the header and copy it out. */
1759
if (is_guest_mode(vcpu)) {
1760
kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb12_gpa;
1761
kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE;
1762
kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
1763
1764
if (svm->nested.nested_run_pending)
1765
kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
1766
}
1767
1768
if (gif_set(svm))
1769
kvm_state.flags |= KVM_STATE_NESTED_GIF_SET;
1770
1771
if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
1772
return -EFAULT;
1773
1774
if (!is_guest_mode(vcpu))
1775
goto out;
1776
1777
/*
1778
* Copy over the full size of the VMCB rather than just the size
1779
* of the structs.
1780
*/
1781
if (clear_user(user_vmcb, KVM_STATE_NESTED_SVM_VMCB_SIZE))
1782
return -EFAULT;
1783
1784
ctl = kzalloc(sizeof(*ctl), GFP_KERNEL);
1785
if (!ctl)
1786
return -ENOMEM;
1787
1788
nested_copy_vmcb_cache_to_control(ctl, &svm->nested.ctl);
1789
r = copy_to_user(&user_vmcb->control, ctl,
1790
sizeof(user_vmcb->control));
1791
kfree(ctl);
1792
if (r)
1793
return -EFAULT;
1794
1795
if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save,
1796
sizeof(user_vmcb->save)))
1797
return -EFAULT;
1798
out:
1799
return kvm_state.size;
1800
}
1801
1802
static int svm_set_nested_state(struct kvm_vcpu *vcpu,
1803
struct kvm_nested_state __user *user_kvm_nested_state,
1804
struct kvm_nested_state *kvm_state)
1805
{
1806
struct vcpu_svm *svm = to_svm(vcpu);
1807
struct vmcb __user *user_vmcb = (struct vmcb __user *)
1808
&user_kvm_nested_state->data.svm[0];
1809
struct vmcb_control_area *ctl;
1810
struct vmcb_save_area *save;
1811
struct vmcb_save_area_cached save_cached;
1812
struct vmcb_ctrl_area_cached ctl_cached;
1813
unsigned long cr0;
1814
int ret;
1815
1816
BUILD_BUG_ON(sizeof(struct vmcb_control_area) + sizeof(struct vmcb_save_area) >
1817
KVM_STATE_NESTED_SVM_VMCB_SIZE);
1818
1819
if (kvm_state->format != KVM_STATE_NESTED_FORMAT_SVM)
1820
return -EINVAL;
1821
1822
if (kvm_state->flags & ~(KVM_STATE_NESTED_GUEST_MODE |
1823
KVM_STATE_NESTED_RUN_PENDING |
1824
KVM_STATE_NESTED_GIF_SET))
1825
return -EINVAL;
1826
1827
/*
1828
* If in guest mode, vcpu->arch.efer actually refers to the L2 guest's
1829
* EFER.SVME, but EFER.SVME still has to be 1 for VMRUN to succeed.
1830
* If SVME is disabled, the only valid states are "none" and GIF=1
1831
* (clearing SVME does NOT set GIF, i.e. GIF=0 is allowed).
1832
*/
1833
if (!(vcpu->arch.efer & EFER_SVME) && kvm_state->flags &&
1834
kvm_state->flags != KVM_STATE_NESTED_GIF_SET)
1835
return -EINVAL;
1836
1837
/* SMM temporarily disables SVM, so we cannot be in guest mode. */
1838
if (is_smm(vcpu) && (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
1839
return -EINVAL;
1840
1841
if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) {
1842
svm_leave_nested(vcpu);
1843
svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
1844
return 0;
1845
}
1846
1847
if (!page_address_valid(vcpu, kvm_state->hdr.svm.vmcb_pa))
1848
return -EINVAL;
1849
if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE)
1850
return -EINVAL;
1851
1852
ctl = memdup_user(&user_vmcb->control, sizeof(*ctl));
1853
if (IS_ERR(ctl))
1854
return PTR_ERR(ctl);
1855
1856
save = memdup_user(&user_vmcb->save, sizeof(*save));
1857
if (IS_ERR(save)) {
1858
kfree(ctl);
1859
return PTR_ERR(save);
1860
}
1861
1862
ret = -EINVAL;
1863
__nested_copy_vmcb_control_to_cache(vcpu, &ctl_cached, ctl);
1864
if (!__nested_vmcb_check_controls(vcpu, &ctl_cached))
1865
goto out_free;
1866
1867
/*
1868
* Processor state contains L2 state. Check that it is
1869
* valid for guest mode (see nested_vmcb_check_save).
1870
*/
1871
cr0 = kvm_read_cr0(vcpu);
1872
if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
1873
goto out_free;
1874
1875
/*
1876
* Validate host state saved from before VMRUN (see
1877
* nested_svm_check_permissions).
1878
*/
1879
__nested_copy_vmcb_save_to_cache(&save_cached, save);
1880
if (!(save->cr0 & X86_CR0_PG) ||
1881
!(save->cr0 & X86_CR0_PE) ||
1882
(save->rflags & X86_EFLAGS_VM) ||
1883
!__nested_vmcb_check_save(vcpu, &save_cached))
1884
goto out_free;
1885
1886
1887
/*
1888
* All checks done, we can enter guest mode. Userspace provides
1889
* vmcb12.control, which will be combined with L1 and stored into
1890
* vmcb02, and the L1 save state which we store in vmcb01.
1891
* L2 registers if needed are moved from the current VMCB to VMCB02.
1892
*/
1893
1894
if (is_guest_mode(vcpu))
1895
svm_leave_nested(vcpu);
1896
else
1897
svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
1898
1899
svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
1900
1901
svm->nested.nested_run_pending =
1902
!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
1903
1904
svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;
1905
1906
svm_copy_vmrun_state(&svm->vmcb01.ptr->save, save);
1907
nested_copy_vmcb_control_to_cache(svm, ctl);
1908
1909
svm_switch_vmcb(svm, &svm->nested.vmcb02);
1910
nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip, svm->vmcb->save.cs.base);
1911
1912
/*
1913
* While the nested guest CR3 is already checked and set by
1914
* KVM_SET_SREGS, it was set before the nested state was loaded,
1915
* thus the MMU might not be initialized correctly.
1916
* Set it again to fix this.
1917
*/
1918
ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3,
1919
nested_npt_enabled(svm), false);
1920
if (ret)
1921
goto out_free;
1922
1923
svm->nested.force_msr_bitmap_recalc = true;
1924
1925
kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
1926
ret = 0;
1927
out_free:
1928
kfree(save);
1929
kfree(ctl);
1930
1931
return ret;
1932
}
1933
1934
static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
1935
{
1936
if (WARN_ON(!is_guest_mode(vcpu)))
1937
return true;
1938
1939
if (!vcpu->arch.pdptrs_from_userspace &&
1940
!nested_npt_enabled(to_svm(vcpu)) && is_pae_paging(vcpu))
1941
/*
1942
* Reload the guest's PDPTRs since after a migration
1943
* the guest CR3 might be restored prior to setting the nested
1944
* state which can lead to a load of wrong PDPTRs.
1945
*/
1946
if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
1947
return false;
1948
1949
if (!nested_svm_merge_msrpm(vcpu)) {
1950
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
1951
vcpu->run->internal.suberror =
1952
KVM_INTERNAL_ERROR_EMULATION;
1953
vcpu->run->internal.ndata = 0;
1954
return false;
1955
}
1956
1957
if (kvm_hv_verify_vp_assist(vcpu))
1958
return false;
1959
1960
return true;
1961
}
1962
1963
struct kvm_x86_nested_ops svm_nested_ops = {
1964
.leave_nested = svm_leave_nested,
1965
.is_exception_vmexit = nested_svm_is_exception_vmexit,
1966
.check_events = svm_check_nested_events,
1967
.triple_fault = nested_svm_triple_fault,
1968
.get_nested_state_pages = svm_get_nested_state_pages,
1969
.get_state = svm_get_nested_state,
1970
.set_state = svm_set_nested_state,
1971
.hv_inject_synthetic_vmexit_post_tlb_flush = svm_hv_inject_synthetic_vmexit_post_tlb_flush,
1972
};
1973
1974