
Commit 0e5ba55

Merge tag 'kvm-x86-fixes-6.18-rc5' of https://github.com/kvm-x86/linux into HEAD
KVM x86 fixes for 6.18:

- Inject #UD if the guest attempts to execute SEAMCALL or TDCALL, as KVM
  doesn't support virtualizing the instructions, but the instructions are
  gated only by VMXON, i.e. they VM-Exit instead of taking a #UD and thus
  result in KVM exiting to userspace with an emulation error (a rough
  guest-side sketch follows below).

- Unload the "FPU" when emulating INIT of XSTATE features if and only if the
  FPU is actually loaded, instead of trying to predict when KVM will emulate
  an INIT (CET support missed the MP_STATE path). Add sanity checks to detect
  and harden against similar bugs in the future.

- Unregister KVM's GALog notifier (for AVIC) when kvm-amd.ko is unloaded.

- Use a raw spinlock for svm->ir_list_lock, as the lock is taken during
  schedule(), and "normal" spinlocks are sleepable locks when PREEMPT_RT=y.

- Remove guest_memfd bindings on memslot deletion when a gmem file is dying,
  to fix a use-after-free race found by syzkaller.

- Fix a goof in the EPT Violation handler where KVM checks the wrong variable
  when determining if the reported GVA is valid.
2 parents 36567f1 + d0164c1 commit 0e5ba55
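
As a rough guest-side illustration of the SEAMCALL/TDCALL fix (not part of this commit; the raw opcode bytes below are an assumption taken from the TDX spec): on TDX-capable hardware these instructions unconditionally VM-Exit even from a regular, non-TDX guest, and KVM now reflects a #UD into the guest instead of exiting to userspace with an emulation error.

/* Hypothetical guest-side probe; the opcode bytes for TDCALL are assumed. */
static void probe_tdcall(void)
{
        /*
         * TDCALL (66 0F 01 CC). In a non-TDX guest this should now raise #UD
         * inside the guest rather than forcing KVM out to userspace with an
         * emulation error.
         */
        asm volatile(".byte 0x66, 0x0f, 0x01, 0xcc");
}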

9 files changed (+106, -51 lines)


arch/x86/include/uapi/asm/vmx.h

Lines changed: 1 addition & 0 deletions
@@ -93,6 +93,7 @@
 #define EXIT_REASON_TPAUSE              68
 #define EXIT_REASON_BUS_LOCK            74
 #define EXIT_REASON_NOTIFY              75
+#define EXIT_REASON_SEAMCALL            76
 #define EXIT_REASON_TDCALL              77
 #define EXIT_REASON_MSR_READ_IMM        84
 #define EXIT_REASON_MSR_WRITE_IMM       85

arch/x86/kvm/svm/avic.c

Lines changed: 15 additions & 9 deletions
@@ -216,7 +216,7 @@ static void avic_deactivate_vmcb(struct vcpu_svm *svm)
  * This function is called from IOMMU driver to notify
  * SVM to schedule in a particular vCPU of a particular VM.
  */
-int avic_ga_log_notifier(u32 ga_tag)
+static int avic_ga_log_notifier(u32 ga_tag)
 {
         unsigned long flags;
         struct kvm_svm *kvm_svm;
@@ -788,7 +788,7 @@ int avic_init_vcpu(struct vcpu_svm *svm)
         struct kvm_vcpu *vcpu = &svm->vcpu;
 
         INIT_LIST_HEAD(&svm->ir_list);
-        spin_lock_init(&svm->ir_list_lock);
+        raw_spin_lock_init(&svm->ir_list_lock);
 
         if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
                 return 0;
@@ -816,9 +816,9 @@ static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd)
         if (!vcpu)
                 return;
 
-        spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags);
+        raw_spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags);
         list_del(&irqfd->vcpu_list);
-        spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags);
+        raw_spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags);
 }
 
 int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
@@ -855,7 +855,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
          * list of IRQs being posted to the vCPU, to ensure the IRTE
          * isn't programmed with stale pCPU/IsRunning information.
          */
-        guard(spinlock_irqsave)(&svm->ir_list_lock);
+        guard(raw_spinlock_irqsave)(&svm->ir_list_lock);
 
         /*
          * Update the target pCPU for IOMMU doorbells if the vCPU is
@@ -972,7 +972,7 @@ static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu,
          * up-to-date entry information, or that this task will wait until
          * svm_ir_list_add() completes to set the new target pCPU.
          */
-        spin_lock_irqsave(&svm->ir_list_lock, flags);
+        raw_spin_lock_irqsave(&svm->ir_list_lock, flags);
 
         entry = svm->avic_physical_id_entry;
         WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
@@ -997,7 +997,7 @@
 
         avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action);
 
-        spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+        raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags);
 }
 
 void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -1035,7 +1035,7 @@ static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action)
          * or that this task will wait until svm_ir_list_add() completes to
          * mark the vCPU as not running.
          */
-        spin_lock_irqsave(&svm->ir_list_lock, flags);
+        raw_spin_lock_irqsave(&svm->ir_list_lock, flags);
 
         avic_update_iommu_vcpu_affinity(vcpu, -1, action);
 
@@ -1059,7 +1059,7 @@ static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action)
 
         svm->avic_physical_id_entry = entry;
 
-        spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+        raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags);
 }
 
 void avic_vcpu_put(struct kvm_vcpu *vcpu)
@@ -1243,3 +1243,9 @@ bool __init avic_hardware_setup(void)
 
         return true;
 }
+
+void avic_hardware_unsetup(void)
+{
+        if (avic)
+                amd_iommu_register_ga_log_notifier(NULL);
+}
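
For context on the spinlock_t -> raw_spinlock_t conversion above, a minimal sketch of the locking pattern, with a made-up struct and helper standing in for the real vCPU load/put paths: under PREEMPT_RT an ordinary spinlock_t becomes a sleeping lock, which is illegal in the schedule()-time context where ir_list_lock is taken, while a raw_spinlock_t remains a true spinning lock in every configuration.

#include <linux/list.h>
#include <linux/spinlock.h>

/* Hypothetical stand-in for struct vcpu_svm; only the list and lock matter here. */
struct demo_vcpu {
        struct list_head ir_list;
        raw_spinlock_t ir_list_lock;    /* raw: taken while scheduling in/out */
};

/* Mirrors the pattern in __avic_vcpu_load()/__avic_vcpu_put(); names invented. */
static void demo_refresh_posted_irqs(struct demo_vcpu *v)
{
        unsigned long flags;

        /* Must not sleep: the caller runs with preemption disabled. */
        raw_spin_lock_irqsave(&v->ir_list_lock, flags);
        /* ... walk v->ir_list and refresh IRTE pCPU/IsRunning state ... */
        raw_spin_unlock_irqrestore(&v->ir_list_lock, flags);
}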

arch/x86/kvm/svm/svm.c

Lines changed: 9 additions & 6 deletions
@@ -921,6 +921,8 @@ static void svm_hardware_unsetup(void)
 {
         int cpu;
 
+        avic_hardware_unsetup();
+
         sev_hardware_unsetup();
 
         for_each_possible_cpu(cpu)
@@ -5386,12 +5388,6 @@
 
         svm_hv_hardware_setup();
 
-        for_each_possible_cpu(cpu) {
-                r = svm_cpu_init(cpu);
-                if (r)
-                        goto err;
-        }
-
         enable_apicv = avic_hardware_setup();
         if (!enable_apicv) {
                 enable_ipiv = false;
@@ -5435,6 +5431,13 @@
         svm_set_cpu_caps();
 
         kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED;
+
+        for_each_possible_cpu(cpu) {
+                r = svm_cpu_init(cpu);
+                if (r)
+                        goto err;
+        }
+
         return 0;
 
 err:
arch/x86/kvm/svm/svm.h

Lines changed: 2 additions & 2 deletions
@@ -329,7 +329,7 @@ struct vcpu_svm {
          * back into remapped mode).
          */
         struct list_head ir_list;
-        spinlock_t ir_list_lock;
+        raw_spinlock_t ir_list_lock;
 
         struct vcpu_sev_es_state sev_es;
 
@@ -805,7 +805,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
 )
 
 bool __init avic_hardware_setup(void);
-int avic_ga_log_notifier(u32 ga_tag);
+void avic_hardware_unsetup(void);
 void avic_vm_destroy(struct kvm *kvm);
 int avic_vm_init(struct kvm *kvm);
 void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb);

arch/x86/kvm/vmx/common.h

Lines changed: 1 addition & 1 deletion
@@ -98,7 +98,7 @@ static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa,
         error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK)
                       ? PFERR_PRESENT_MASK : 0;
 
-        if (error_code & EPT_VIOLATION_GVA_IS_VALID)
+        if (exit_qualification & EPT_VIOLATION_GVA_IS_VALID)
                 error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ?
                               PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
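
The one-line fix above is easy to misread, so here is an illustrative, self-contained decoder (bit positions are an assumption based on the SDM's EPT-violation exit qualification): the GVA-valid and GVA-translated flags live in the raw exit qualification, not in the page-fault error code the handler is building, so they must be tested against exit_qualification.

#include <stdbool.h>
#include <stdint.h>

/* Exit-qualification bits for an EPT violation (positions assumed). */
#define DEMO_EPT_GVA_IS_VALID    (1ULL << 7)
#define DEMO_EPT_GVA_TRANSLATED  (1ULL << 8)

/* True if the exit reported a valid guest virtual address. */
static bool demo_gva_is_valid(uint64_t exit_qualification)
{
        return exit_qualification & DEMO_EPT_GVA_IS_VALID;
}

/* True if the fault happened while walking the guest's paging structures. */
static bool demo_fault_on_page_walk(uint64_t exit_qualification)
{
        return demo_gva_is_valid(exit_qualification) &&
               !(exit_qualification & DEMO_EPT_GVA_TRANSLATED);
}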

arch/x86/kvm/vmx/nested.c

Lines changed: 8 additions & 0 deletions
@@ -6728,6 +6728,14 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
         case EXIT_REASON_NOTIFY:
                 /* Notify VM exit is not exposed to L1 */
                 return false;
+        case EXIT_REASON_SEAMCALL:
+        case EXIT_REASON_TDCALL:
+                /*
+                 * SEAMCALL and TDCALL unconditionally VM-Exit, but aren't
+                 * virtualized by KVM for L1 hypervisors, i.e. L1 should
+                 * never want or expect such an exit.
+                 */
+                return false;
         default:
                 return true;
         }

arch/x86/kvm/vmx/vmx.c

Lines changed: 8 additions & 0 deletions
@@ -6032,6 +6032,12 @@ static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
         return 1;
 }
 
+static int handle_tdx_instruction(struct kvm_vcpu *vcpu)
+{
+        kvm_queue_exception(vcpu, UD_VECTOR);
+        return 1;
+}
+
 #ifndef CONFIG_X86_SGX_KVM
 static int handle_encls(struct kvm_vcpu *vcpu)
 {
@@ -6157,6 +6163,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
         [EXIT_REASON_ENCLS]             = handle_encls,
         [EXIT_REASON_BUS_LOCK]          = handle_bus_lock_vmexit,
         [EXIT_REASON_NOTIFY]            = handle_notify,
+        [EXIT_REASON_SEAMCALL]          = handle_tdx_instruction,
+        [EXIT_REASON_TDCALL]            = handle_tdx_instruction,
         [EXIT_REASON_MSR_READ_IMM]      = handle_rdmsr_imm,
         [EXIT_REASON_MSR_WRITE_IMM]     = handle_wrmsr_imm,
 };

arch/x86/kvm/x86.c

Lines changed: 30 additions & 20 deletions
@@ -3874,15 +3874,9 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
 
 /*
  * Returns true if the MSR in question is managed via XSTATE, i.e. is context
- * switched with the rest of guest FPU state. Note! S_CET is _not_ context
- * switched via XSTATE even though it _is_ saved/restored via XSAVES/XRSTORS.
- * Because S_CET is loaded on VM-Enter and VM-Exit via dedicated VMCS fields,
- * the value saved/restored via XSTATE is always the host's value. That detail
- * is _extremely_ important, as the guest's S_CET must _never_ be resident in
- * hardware while executing in the host. Loading guest values for U_CET and
- * PL[0-3]_SSP while executing in the kernel is safe, as U_CET is specific to
- * userspace, and PL[0-3]_SSP are only consumed when transitioning to lower
- * privilege levels, i.e. are effectively only consumed by userspace as well.
+ * switched with the rest of guest FPU state.
+ *
+ * Note, S_CET is _not_ saved/restored via XSAVES/XRSTORS.
  */
 static bool is_xstate_managed_msr(struct kvm_vcpu *vcpu, u32 msr)
 {
@@ -3905,6 +3899,11 @@ static bool is_xstate_managed_msr(struct kvm_vcpu *vcpu, u32 msr)
  * MSR that is managed via XSTATE. Note, the caller is responsible for doing
  * the initial FPU load, this helper only ensures that guest state is resident
  * in hardware (the kernel can load its FPU state in IRQ context).
+ *
+ * Note, loading guest values for U_CET and PL[0-3]_SSP while executing in the
+ * kernel is safe, as U_CET is specific to userspace, and PL[0-3]_SSP are only
+ * consumed when transitioning to lower privilege levels, i.e. are effectively
+ * only consumed by userspace as well.
  */
 static __always_inline void kvm_access_xstate_msr(struct kvm_vcpu *vcpu,
                                                   struct msr_data *msr_info,
@@ -11807,6 +11806,9 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
 /* Swap (qemu) user FPU context for the guest FPU context. */
 static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 {
+        if (KVM_BUG_ON(vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm))
+                return;
+
         /* Exclude PKRU, it's restored separately immediately after VM-Exit. */
         fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
         trace_kvm_fpu(1);
@@ -11815,6 +11817,9 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 /* When vcpu_run ends, restore user space FPU context. */
 static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 {
+        if (KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm))
+                return;
+
         fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false);
         ++vcpu->stat.fpu_reload;
         trace_kvm_fpu(0);
@@ -12137,9 +12142,6 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
         int r;
 
         vcpu_load(vcpu);
-        if (kvm_mpx_supported())
-                kvm_load_guest_fpu(vcpu);
-
         kvm_vcpu_srcu_read_lock(vcpu);
 
         r = kvm_apic_accept_events(vcpu);
@@ -12156,9 +12158,6 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 
 out:
         kvm_vcpu_srcu_read_unlock(vcpu);
-
-        if (kvm_mpx_supported())
-                kvm_put_guest_fpu(vcpu);
         vcpu_put(vcpu);
         return r;
 }
@@ -12788,6 +12787,7 @@ static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event)
 {
         struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;
         u64 xfeatures_mask;
+        bool fpu_in_use;
         int i;
 
         /*
@@ -12811,13 +12811,23 @@ static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event)
         BUILD_BUG_ON(sizeof(xfeatures_mask) * BITS_PER_BYTE <= XFEATURE_MAX);
 
         /*
-         * All paths that lead to INIT are required to load the guest's FPU
-         * state (because most paths are buried in KVM_RUN).
-         */
-        kvm_put_guest_fpu(vcpu);
+         * Unload guest FPU state (if necessary) before zeroing XSTATE fields
+         * as the kernel can only modify the state when its resident in memory,
+         * i.e. when it's not loaded into hardware.
+         *
+         * WARN if the vCPU's desire to run, i.e. whether or not its in KVM_RUN,
+         * doesn't match the loaded/in-use state of the FPU, as KVM_RUN is the
+         * only path that can trigger INIT emulation _and_ loads FPU state, and
+         * KVM_RUN should _always_ load FPU state.
+         */
+        WARN_ON_ONCE(vcpu->wants_to_run != fpstate->in_use);
+        fpu_in_use = fpstate->in_use;
+        if (fpu_in_use)
+                kvm_put_guest_fpu(vcpu);
         for_each_set_bit(i, (unsigned long *)&xfeatures_mask, XFEATURE_MAX)
                 fpstate_clear_xstate_component(fpstate, i);
-        kvm_load_guest_fpu(vcpu);
+        if (fpu_in_use)
+                kvm_load_guest_fpu(vcpu);
 }
 
 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
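
The kvm_xstate_reset() change boils down to a simple rule: XSTATE can only be edited while it is resident in memory, so guest FPU state is spilled out of hardware only if it was actually loaded, and reloaded only in that same case. A self-contained sketch of that rule, with hypothetical helpers standing in for kvm_put_guest_fpu()/kvm_load_guest_fpu():

#include <stdbool.h>

/* Hypothetical stand-ins; only the ordering and the "if loaded" guards matter. */
struct demo_fpstate { bool in_use; };

static void demo_put_guest_fpu(struct demo_fpstate *fps)  { fps->in_use = false; }
static void demo_load_guest_fpu(struct demo_fpstate *fps) { fps->in_use = true;  }
static void demo_clear_component(struct demo_fpstate *fps, int xfeature) { (void)fps; (void)xfeature; }

static void demo_xstate_reset(struct demo_fpstate *fps, int xfeature)
{
        bool fpu_in_use = fps->in_use;

        /* The state must be in memory, not loaded in hardware, before editing it. */
        if (fpu_in_use)
                demo_put_guest_fpu(fps);

        demo_clear_component(fps, xfeature);

        /* Reload only if guest state was loaded to begin with. */
        if (fpu_in_use)
                demo_load_guest_fpu(fps);
}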

virt/kvm/guest_memfd.c

Lines changed: 32 additions & 13 deletions
@@ -623,31 +623,50 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
         return r;
 }
 
-void kvm_gmem_unbind(struct kvm_memory_slot *slot)
+static void __kvm_gmem_unbind(struct kvm_memory_slot *slot, struct kvm_gmem *gmem)
 {
         unsigned long start = slot->gmem.pgoff;
         unsigned long end = start + slot->npages;
-        struct kvm_gmem *gmem;
+
+        xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
+
+        /*
+         * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn()
+         * cannot see this memslot.
+         */
+        WRITE_ONCE(slot->gmem.file, NULL);
+}
+
+void kvm_gmem_unbind(struct kvm_memory_slot *slot)
+{
         struct file *file;
 
         /*
-         * Nothing to do if the underlying file was already closed (or is being
-         * closed right now), kvm_gmem_release() invalidates all bindings.
+         * Nothing to do if the underlying file was _already_ closed, as
+         * kvm_gmem_release() invalidates and nullifies all bindings.
          */
-        file = kvm_gmem_get_file(slot);
-        if (!file)
+        if (!slot->gmem.file)
                 return;
 
-        gmem = file->private_data;
-
-        filemap_invalidate_lock(file->f_mapping);
-        xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
+        file = kvm_gmem_get_file(slot);
 
         /*
-         * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn()
-         * cannot see this memslot.
+         * However, if the file is _being_ closed, then the bindings need to be
+         * removed as kvm_gmem_release() might not run until after the memslot
+         * is freed. Note, modifying the bindings is safe even though the file
+         * is dying as kvm_gmem_release() nullifies slot->gmem.file under
+         * slots_lock, and only puts its reference to KVM after destroying all
+         * bindings. I.e. reaching this point means kvm_gmem_release() hasn't
+         * yet destroyed the bindings or freed the gmem_file, and can't do so
+         * until the caller drops slots_lock.
         */
-        WRITE_ONCE(slot->gmem.file, NULL);
+        if (!file) {
+                __kvm_gmem_unbind(slot, slot->gmem.file->private_data);
+                return;
+        }
+
+        filemap_invalidate_lock(file->f_mapping);
+        __kvm_gmem_unbind(slot, file->private_data);
         filemap_invalidate_unlock(file->f_mapping);
 
         fput(file);
