From d222af072380c4470295c07d84ecb15f4937e365 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 6 Sep 2017 15:20:55 +1000 Subject: [PATCH 01/28] KVM: PPC: Book3S HV: Don't access XIVE PIPR register using byte accesses The XIVE interrupt controller on POWER9 machines doesn't support byte accesses to any register in the thread management area other than the CPPR (current processor priority register). In particular, when reading the PIPR (pending interrupt priority register), we need to do a 32-bit or 64-bit load. Cc: stable@vger.kernel.org # v4.13 Fixes: 2c4fb78f78b6 ("KVM: PPC: Book3S HV: Workaround POWER9 DD1.0 bug causing IPB bit loss") Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rm_xive.c | 1 - arch/powerpc/kvm/book3s_xive.c | 1 - arch/powerpc/kvm/book3s_xive_template.c | 7 ++++--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rm_xive.c b/arch/powerpc/kvm/book3s_hv_rm_xive.c index abf5f01b6eb1..5b81a807d742 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xive.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xive.c @@ -38,7 +38,6 @@ static inline void __iomem *get_tima_phys(void) #define __x_tima get_tima_phys() #define __x_eoi_page(xd) ((void __iomem *)((xd)->eoi_page)) #define __x_trig_page(xd) ((void __iomem *)((xd)->trig_page)) -#define __x_readb __raw_rm_readb #define __x_writeb __raw_rm_writeb #define __x_readw __raw_rm_readw #define __x_readq __raw_rm_readq diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 08b200a0bbce..13304622ab1c 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -48,7 +48,6 @@ #define __x_tima xive_tima #define __x_eoi_page(xd) ((void __iomem *)((xd)->eoi_mmio)) #define __x_trig_page(xd) ((void __iomem *)((xd)->trig_mmio)) -#define __x_readb __raw_readb #define __x_writeb __raw_writeb #define __x_readw __raw_readw #define __x_readq __raw_readq diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c index d1ed2c41b5d2..c7a5deadd1cc 100644 --- a/arch/powerpc/kvm/book3s_xive_template.c +++ b/arch/powerpc/kvm/book3s_xive_template.c @@ -28,7 +28,8 @@ static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc) * bit. */ if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { - u8 pipr = __x_readb(__x_tima + TM_QW1_OS + TM_PIPR); + __be64 qw1 = __x_readq(__x_tima + TM_QW1_OS); + u8 pipr = be64_to_cpu(qw1) & 0xff; if (pipr >= xc->hw_cppr) return; } @@ -336,7 +337,6 @@ X_STATIC unsigned long GLUE(X_PFX,h_ipoll)(struct kvm_vcpu *vcpu, unsigned long struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; u8 pending = xc->pending; u32 hirq; - u8 pipr; pr_devel("H_IPOLL(server=%ld)\n", server); @@ -353,7 +353,8 @@ X_STATIC unsigned long GLUE(X_PFX,h_ipoll)(struct kvm_vcpu *vcpu, unsigned long pending = 0xff; } else { /* Grab pending interrupt if any */ - pipr = __x_readb(__x_tima + TM_QW1_OS + TM_PIPR); + __be64 qw1 = __x_readq(__x_tima + TM_QW1_OS); + u8 pipr = be64_to_cpu(qw1) & 0xff; if (pipr < 8) pending |= 1 << pipr; } From cf5f6f3125241853462334b1bc696f3c3c492178 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 11 Sep 2017 16:05:30 +1000 Subject: [PATCH 02/28] KVM: PPC: Book3S HV: Hold kvm->lock around call to kvmppc_update_lpcr Commit 468808bd35c4 ("KVM: PPC: Book3S HV: Set process table for HPT guests on POWER9", 2017-01-30) added a call to kvmppc_update_lpcr() which doesn't hold the kvm->lock mutex around the call, as required. This adds the lock/unlock pair, and for good measure, includes the kvmppc_setup_partition_table() call in the locked region, since it is altering global state of the VM. This error appears not to have any fatal consequences for the host; the consequences would be that the VCPUs could end up running with different LPCR values, or an update to the LPCR value by userspace using the one_reg interface could get overwritten, or the update done by kvmhv_configure_mmu() could get overwritten. Cc: stable@vger.kernel.org # v4.10+ Fixes: 468808bd35c4 ("KVM: PPC: Book3S HV: Set process table for HPT guests on POWER9") Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 18e974a34fce..a7177c284f9b 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4212,11 +4212,13 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) if ((cfg->process_table & PRTS_MASK) > 24) return -EINVAL; + mutex_lock(&kvm->lock); kvm->arch.process_table = cfg->process_table; kvmppc_setup_partition_table(kvm); lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0; kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE); + mutex_unlock(&kvm->lock); return 0; } From 67f8a8c1151c9ef3d1285905d1e66ebb769ecdf7 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 12 Sep 2017 13:47:23 +1000 Subject: [PATCH 03/28] KVM: PPC: Book3S HV: Fix bug causing host SLB to be restored incorrectly Aneesh Kumar reported seeing host crashes when running recent kernels on POWER8. The symptom was an oops like this: Unable to handle kernel paging request for data at address 0xf00000000786c620 Faulting instruction address: 0xc00000000030e1e4 Oops: Kernel access of bad area, sig: 11 [#1] LE SMP NR_CPUS=2048 NUMA PowerNV Modules linked in: powernv_op_panel CPU: 24 PID: 6663 Comm: qemu-system-ppc Tainted: G W 4.13.0-rc7-43932-gfc36c59 #2 task: c000000fdeadfe80 task.stack: c000000fdeb68000 NIP: c00000000030e1e4 LR: c00000000030de6c CTR: c000000000103620 REGS: c000000fdeb6b450 TRAP: 0300 Tainted: G W (4.13.0-rc7-43932-gfc36c59) MSR: 9000000000009033 CR: 24044428 XER: 20000000 CFAR: c00000000030e134 DAR: f00000000786c620 DSISR: 40000000 SOFTE: 0 GPR00: 0000000000000000 c000000fdeb6b6d0 c0000000010bd000 000000000000e1b0 GPR04: c00000000115e168 c000001fffa6e4b0 c00000000115d000 c000001e1b180386 GPR08: f000000000000000 c000000f9a8913e0 f00000000786c600 00007fff587d0000 GPR12: c000000fdeb68000 c00000000fb0f000 0000000000000001 00007fff587cffff GPR16: 0000000000000000 c000000000000000 00000000003fffff c000000fdebfe1f8 GPR20: 0000000000000004 c000000fdeb6b8a8 0000000000000001 0008000000000040 GPR24: 07000000000000c0 00007fff587cffff c000000fdec20bf8 00007fff587d0000 GPR28: c000000fdeca9ac0 00007fff587d0000 00007fff587c0000 00007fff587d0000 NIP [c00000000030e1e4] __get_user_pages_fast+0x434/0x1070 LR [c00000000030de6c] __get_user_pages_fast+0xbc/0x1070 Call Trace: [c000000fdeb6b6d0] [c00000000139dab8] lock_classes+0x0/0x35fe50 (unreliable) [c000000fdeb6b7e0] [c00000000030ef38] get_user_pages_fast+0xf8/0x120 [c000000fdeb6b830] [c000000000112318] kvmppc_book3s_hv_page_fault+0x308/0xf30 [c000000fdeb6b960] [c00000000010e10c] kvmppc_vcpu_run_hv+0xfdc/0x1f00 [c000000fdeb6bb20] [c0000000000e915c] kvmppc_vcpu_run+0x2c/0x40 [c000000fdeb6bb40] [c0000000000e5650] kvm_arch_vcpu_ioctl_run+0x110/0x300 [c000000fdeb6bbe0] [c0000000000d6468] kvm_vcpu_ioctl+0x528/0x900 [c000000fdeb6bd40] [c0000000003bc04c] do_vfs_ioctl+0xcc/0x950 [c000000fdeb6bde0] [c0000000003bc930] SyS_ioctl+0x60/0x100 [c000000fdeb6be30] [c00000000000b96c] system_call+0x58/0x6c Instruction dump: 7ca81a14 2fa50000 41de0010 7cc8182a 68c60002 78c6ffe2 0b060000 3cc2000a 794a3664 390610d8 e9080000 7d485214 7d435378 790507e1 408202f0 ---[ end trace fad4a342d0414aa2 ]--- It turns out that what has happened is that the SLB entry for the vmmemap region hasn't been reloaded on exit from a guest, and it has the wrong page size. Then, when the host next accesses the vmemmap region, it gets a page fault. Commit a25bd72badfa ("powerpc/mm/radix: Workaround prefetch issue with KVM", 2017-07-24) modified the guest exit code so that it now only clears out the SLB for hash guest. The code tests the radix flag and puts the result in a non-volatile CR field, CR2, and later branches based on CR2. Unfortunately, the kvmppc_save_tm function, which gets called between those two points, modifies all the user-visible registers in the case where the guest was in transactional or suspended state, except for a few which it restores (namely r1, r2, r9 and r13). Thus the hash/radix indication in CR2 gets corrupted. This fixes the problem by re-doing the comparison just before the result is needed. For good measure, this also adds comments next to the call sites of kvmppc_save_tm and kvmppc_restore_tm pointing out that non-volatile register state will be lost. Cc: stable@vger.kernel.org # v4.13 Fixes: a25bd72badfa ("powerpc/mm/radix: Workaround prefetch issue with KVM") Tested-by: Aneesh Kumar K.V Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 663a4a861e7f..17936f82d3c7 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -771,6 +771,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION + /* + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + */ bl kvmppc_restore_tm END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif @@ -1630,6 +1633,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION + /* + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + */ bl kvmppc_save_tm END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif @@ -1749,7 +1755,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) /* * Are we running hash or radix ? */ - beq cr2,3f + ld r5, VCPU_KVM(r9) + lbz r0, KVM_RADIX(r5) + cmpwi cr2, r0, 0 + beq cr2, 3f /* Radix: Handle the case where the guest used an illegal PID */ LOAD_REG_ADDR(r4, mmu_base_pid) @@ -2466,6 +2475,9 @@ _GLOBAL(kvmppc_h_cede) /* r3 = vcpu pointer, r11 = msr, r13 = paca */ #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION + /* + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + */ ld r9, HSTATE_KVM_VCPU(r13) bl kvmppc_save_tm END_FTR_SECTION_IFSET(CPU_FTR_TM) @@ -2578,6 +2590,9 @@ kvm_end_cede: #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION + /* + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + */ bl kvmppc_restore_tm END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif From 98152b83e065d5593c261b0cc58197666f94a34f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 28 Aug 2017 16:38:35 +0200 Subject: [PATCH 04/28] KVM: x86: Remove .get_pkru() from kvm_x86_ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 9dd21e104bc ('KVM: x86: simplify handling of PKRU') removed all users and providers of that call-back, but didn't remove it. Remove it now. Signed-off-by: Joerg Roedel Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8844eee290b2..bbb802382a3e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -951,7 +951,6 @@ struct kvm_x86_ops { void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); - u32 (*get_pkru)(struct kvm_vcpu *vcpu); void (*tlb_flush)(struct kvm_vcpu *vcpu); From 49a8afca386ee1775519a4aa80f8e121bd227dd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= Date: Tue, 5 Sep 2017 23:58:44 +0200 Subject: [PATCH 05/28] KVM: SVM: Add a missing 'break' statement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Jan H. Schönherr Fixes: f6511935f424 ("KVM: SVM: Add checks for IO instructions") Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/svm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2c1cfe68a9af..af54327b9017 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5302,6 +5302,7 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, */ if (info->rep_prefix != REPE_PREFIX) goto out; + break; case SVM_EXIT_IOIO: { u64 exit_info; u32 bytes; From a05950009f50ca971a1d616655d01628177bd2e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= Date: Wed, 6 Sep 2017 00:27:19 +0200 Subject: [PATCH 06/28] KVM: x86: Fix handling of pending signal on uninitialized AP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit KVM API says that KVM_RUN will return with -EINTR when a signal is pending. However, if a vCPU is in KVM_MP_STATE_UNINITIALIZED, then the return value is unconditionally -EAGAIN. Copy over some code from vcpu_run(), so that the case of a pending signal results in the expected return value. Signed-off-by: Jan H. Schönherr Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6069af86da3b..b27f7f0020e3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7235,6 +7235,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_apic_accept_events(vcpu); kvm_clear_request(KVM_REQ_UNHALT, vcpu); r = -EAGAIN; + if (signal_pending(current)) { + r = -EINTR; + vcpu->run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.signal_exits; + } goto out; } From 2f173d2688559a6f85643d38a2ad6f45eb420c42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= Date: Wed, 6 Sep 2017 18:34:06 +0200 Subject: [PATCH 07/28] KVM: x86: Fix immediate_exit handling for uninitialized AP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When user space sets kvm_run->immediate_exit, KVM is supposed to return quickly. However, when a vCPU is in KVM_MP_STATE_UNINITIALIZED, the value is not considered and the vCPU blocks. Fix that oversight. Fixes: 460df4c1fc7c008 ("KVM: race-free exit from KVM_RUN without POSIX signals") Signed-off-by: Jan H. Schönherr Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b27f7f0020e3..c777a8f681a8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7231,6 +7231,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { + if (kvm_run->immediate_exit) { + r = -EINTR; + goto out; + } kvm_vcpu_block(vcpu); kvm_apic_accept_events(vcpu); kvm_clear_request(KVM_REQ_UNHALT, vcpu); From 021086e383fa408a219f6c6541b37f495f59d576 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 13 Sep 2017 14:17:22 +0200 Subject: [PATCH 08/28] KVM: fix rcu warning on VM_CREATE errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 3898da947bba ("KVM: avoid using rcu_dereference_protected") can trigger the following lockdep/rcu splat if the VM_CREATE ioctl fails, for example if kvm_arch_init_vm fails: WARNING: suspicious RCU usage 4.13.0+ #105 Not tainted ----------------------------- ./include/linux/kvm_host.h:481 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 no locks held by qemu-system-s39/79. stack backtrace: CPU: 0 PID: 79 Comm: qemu-system-s39 Not tainted 4.13.0+ #105 Hardware name: IBM 2964 NC9 704 (KVM/Linux) Call Trace: ([<00000000001140b2>] show_stack+0xea/0xf0) [<00000000008a68a4>] dump_stack+0x94/0xd8 [<0000000000134c12>] kvm_dev_ioctl+0x372/0x7a0 [<000000000038f940>] do_vfs_ioctl+0xa8/0x6c8 [<0000000000390004>] SyS_ioctl+0xa4/0xb8 [<00000000008c7a8c>] system_call+0xc4/0x27c no locks held by qemu-system-s39/79. We have to reset the just created users_count back to 0 to tell the check to not trigger. Reported-by: Stefan Haberland Signed-off-by: Christian Borntraeger Fixes: 3898da947bba ("KVM: avoid using rcu_dereference_protected") Cc: Paolo Bonzini Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář --- virt/kvm/kvm_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6ed1c2021198..2d7df5cc955b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -674,6 +674,7 @@ static struct kvm *kvm_create_vm(unsigned long type) out_err_no_srcu: hardware_disable_all(); out_err_no_disable: + refcount_set(&kvm->users_count, 0); for (i = 0; i < KVM_NR_BUSES; i++) kfree(kvm_get_bus(kvm, i)); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) From 5153723388c8691b886eb9ad28dde92c01f9b191 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20Kr=C4=8Dm=C3=A1=C5=99?= Date: Wed, 13 Sep 2017 15:13:46 +0200 Subject: [PATCH 09/28] KVM: x86: fix clang build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clang resolves __builtin_constant_p() to false even if the expression is constant in the end. The only purpose of that expression was to differentiate a case where the following expression couldn't be checked at compile-time, so we can just remove the check. Clang handles the following two correctly. Turn it into BUG_ON if there are any more problems with this. Fixes: d6321d493319 ("KVM: x86: generalize guest_cpuid_has_ helpers") Reported-by: Dmitry Vyukov Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/cpuid.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 1ea3c0e1e3a9..0bc5c1315708 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -59,7 +59,6 @@ static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature) { unsigned x86_leaf = x86_feature / 32; - BUILD_BUG_ON(!__builtin_constant_p(x86_leaf)); BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid)); BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0); From dfa20099e26e35cd9e83d4d43172615c73c4e9f5 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 12 Sep 2017 10:42:40 -0500 Subject: [PATCH 10/28] KVM: SVM: Refactor AVIC vcpu initialization into avic_init_vcpu() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Preparing the base code for subsequent changes. This does not change existing logic. Signed-off-by: Suravee Suthikulpanit Signed-off-by: Radim Krčmář --- arch/x86/kvm/svm.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index af54327b9017..cab020acb509 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1600,6 +1600,23 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); } +static int avic_init_vcpu(struct vcpu_svm *svm) +{ + int ret; + + if (!avic) + return 0; + + ret = avic_init_backing_page(&svm->vcpu); + if (ret) + return ret; + + INIT_LIST_HEAD(&svm->ir_list); + spin_lock_init(&svm->ir_list_lock); + + return ret; +} + static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) { struct vcpu_svm *svm; @@ -1636,14 +1653,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) if (!hsave_page) goto free_page3; - if (avic) { - err = avic_init_backing_page(&svm->vcpu); - if (err) - goto free_page4; - - INIT_LIST_HEAD(&svm->ir_list); - spin_lock_init(&svm->ir_list_lock); - } + err = avic_init_vcpu(svm); + if (err) + goto free_page4; /* We initialize this flag to true to make sure that the is_running * bit would be set the first time the vcpu is loaded. From b2a05feff274e007abd7cda0d2cdae7fdf5cbb0a Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 12 Sep 2017 10:42:41 -0500 Subject: [PATCH 11/28] KVM: Add struct kvm_vcpu pointer parameter to get_enable_apicv() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Modify struct kvm_x86_ops.arch.apicv_active() to take struct kvm_vcpu pointer as parameter in preparation to subsequent changes. Signed-off-by: Suravee Suthikulpanit Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index bbb802382a3e..c73e493adf07 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -972,7 +972,7 @@ struct kvm_x86_ops { void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); - bool (*get_enable_apicv)(void); + bool (*get_enable_apicv)(struct kvm_vcpu *vcpu); void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index cab020acb509..76f559f5a3dd 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4407,7 +4407,7 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) return; } -static bool svm_get_enable_apicv(void) +static bool svm_get_enable_apicv(struct kvm_vcpu *vcpu) { return avic; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4253adef9044..09204993a739 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5012,7 +5012,7 @@ static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_activ } } -static bool vmx_get_enable_apicv(void) +static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu) { return enable_apicv; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c777a8f681a8..f4a978e36030 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7980,7 +7980,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) BUG_ON(vcpu->kvm == NULL); kvm = vcpu->kvm; - vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(); + vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); vcpu->arch.pv.pv_unhalted = false; vcpu->arch.emulate_ctxt.ops = &emulate_ops; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu)) From 67034bb9dd5ecc0171bdacfbd31ebf7c2634df98 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 12 Sep 2017 10:42:42 -0500 Subject: [PATCH 12/28] KVM: SVM: Add irqchip_split() checks before enabling AVIC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SVM AVIC hardware accelerates guest write to APIC_EOI register (for edge-trigger interrupt), which means it does not trap to KVM. So, only enable SVM AVIC only in split irqchip mode. (e.g. launching qemu w/ option '-machine kernel_irqchip=split'). Suggested-by: Paolo Bonzini Signed-off-by: Suravee Suthikulpanit Fixes: 44a95dae1d22 ("KVM: x86: Detect and Initialize AVIC support") [Removed pr_debug - Radim.] Signed-off-by: Radim Krčmář --- arch/x86/kvm/svm.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 76f559f5a3dd..0e68f0b3cbf7 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1200,7 +1200,6 @@ static void avic_init_vmcb(struct vcpu_svm *svm) vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT; vmcb->control.int_ctl |= AVIC_ENABLE_MASK; - svm->vcpu.arch.apicv_active = true; } static void init_vmcb(struct vcpu_svm *svm) @@ -1316,7 +1315,7 @@ static void init_vmcb(struct vcpu_svm *svm) set_intercept(svm, INTERCEPT_PAUSE); } - if (avic) + if (kvm_vcpu_apicv_active(&svm->vcpu)) avic_init_vmcb(svm); /* @@ -1604,7 +1603,7 @@ static int avic_init_vcpu(struct vcpu_svm *svm) { int ret; - if (!avic) + if (!kvm_vcpu_apicv_active(&svm->vcpu)) return 0; ret = avic_init_backing_page(&svm->vcpu); @@ -4409,7 +4408,7 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) static bool svm_get_enable_apicv(struct kvm_vcpu *vcpu) { - return avic; + return avic && irqchip_split(vcpu->kvm); } static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) @@ -4426,7 +4425,7 @@ static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - if (!avic) + if (!kvm_vcpu_apicv_active(&svm->vcpu)) return; vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; From a5f01f8e9756b40a56e351f98b3fae6d235b834f Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 13 Sep 2017 04:04:01 -0700 Subject: [PATCH 13/28] KVM: X86: Don't block vCPU if there is pending exception MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Don't block vCPU if there is pending exception. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f4a978e36030..bfda79fb102c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8461,6 +8461,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (vcpu->arch.pv.pv_unhalted) return true; + if (vcpu->arch.exception.pending) + return true; + if (kvm_test_request(KVM_REQ_NMI, vcpu) || (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu))) From 9a6e7c39810e4a8bc7fc95056cefb40583fe07ef Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 14 Sep 2017 03:54:16 -0700 Subject: [PATCH 14/28] KVM: async_pf: Fix #DF due to inject "Page not Present" and "Page Ready" exceptions simultaneously MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit qemu-system-x86-8600 [004] d..1 7205.687530: kvm_entry: vcpu 2 qemu-system-x86-8600 [004] .... 7205.687532: kvm_exit: reason EXCEPTION_NMI rip 0xffffffffa921297d info ffffeb2c0e44e018 80000b0e qemu-system-x86-8600 [004] .... 7205.687532: kvm_page_fault: address ffffeb2c0e44e018 error_code 0 qemu-system-x86-8600 [004] .... 7205.687620: kvm_try_async_get_page: gva = 0xffffeb2c0e44e018, gfn = 0x427e4e qemu-system-x86-8600 [004] .N.. 7205.687628: kvm_async_pf_not_present: token 0x8b002 gva 0xffffeb2c0e44e018 kworker/4:2-7814 [004] .... 7205.687655: kvm_async_pf_completed: gva 0xffffeb2c0e44e018 address 0x7fcc30c4e000 qemu-system-x86-8600 [004] .... 7205.687703: kvm_async_pf_ready: token 0x8b002 gva 0xffffeb2c0e44e018 qemu-system-x86-8600 [004] d..1 7205.687711: kvm_entry: vcpu 2 After running some memory intensive workload in guest, I catch the kworker which completes the GUP too quickly, and queues an "Page Ready" #PF exception after the "Page not Present" exception before the next vmentry as the above trace which will result in #DF injected to guest. This patch fixes it by clearing the queue for "Page not Present" if "Page Ready" occurs before the next vmentry since the GUP has already got the required page and shadow page table has already been fixed by "Page Ready" handler. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Fixes: 7c90705bf2a3 ("KVM: Inject asynchronous page fault into a PV guest if page is swapped out.") [Changed indentation and added clearing of injected. - Radim] Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bfda79fb102c..cd17b7d9a107 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8631,6 +8631,13 @@ static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) sizeof(val)); } +static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val) +{ + + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val, + sizeof(u32)); +} + void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { @@ -8658,6 +8665,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { struct x86_exception fault; + u32 val; if (work->wakeup_all) work->arch.token = ~0; /* broadcast wakeup */ @@ -8665,15 +8673,26 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, kvm_del_async_pf_gfn(vcpu, work->arch.gfn); trace_kvm_async_pf_ready(work->arch.token, work->gva); - if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) && - !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { - fault.vector = PF_VECTOR; - fault.error_code_valid = true; - fault.error_code = 0; - fault.nested_page_fault = false; - fault.address = work->arch.token; - fault.async_page_fault = true; - kvm_inject_page_fault(vcpu, &fault); + if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED && + !apf_get_user(vcpu, &val)) { + if (val == KVM_PV_REASON_PAGE_NOT_PRESENT && + vcpu->arch.exception.pending && + vcpu->arch.exception.nr == PF_VECTOR && + !apf_put_user(vcpu, 0)) { + vcpu->arch.exception.injected = false; + vcpu->arch.exception.pending = false; + vcpu->arch.exception.nr = 0; + vcpu->arch.exception.has_error_code = false; + vcpu->arch.exception.error_code = 0; + } else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { + fault.vector = PF_VECTOR; + fault.error_code_valid = true; + fault.error_code = 0; + fault.nested_page_fault = false; + fault.address = work->arch.token; + fault.async_page_fault = true; + kvm_inject_page_fault(vcpu, &fault); + } } vcpu->arch.apf.halted = false; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; From 488e32f1981e506976d666b3772d6ccc8b726fee Mon Sep 17 00:00:00 2001 From: Ladi Prosek Date: Thu, 14 Sep 2017 11:50:07 +0200 Subject: [PATCH 15/28] KVM: trace events: update list of exit reasons MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adding entries for exit reasons 23 - 27: KVM_EXIT_EPR KVM_EXIT_SYSTEM_EVENT KVM_EXIT_S390_STSI KVM_EXIT_IOAPIC_EOI KVM_EXIT_HYPERV Signed-off-by: Ladi Prosek Reviewed-by: Cornelia Huck Signed-off-by: Radim Krčmář --- include/trace/events/kvm.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 8ade3eb6c640..dcffedfac431 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -14,7 +14,9 @@ ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR), \ ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\ ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI), ERSN(PAPR_HCALL), \ - ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH) + ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH), ERSN(EPR),\ + ERSN(SYSTEM_EVENT), ERSN(S390_STSI), ERSN(IOAPIC_EOI), \ + ERSN(HYPERV) TRACE_EVENT(kvm_userspace_exit, TP_PROTO(__u32 reason, int errno), From 51aa68e7d57e3217192d88ce90fd5b8ef29ec94f Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 12 Sep 2017 13:02:54 -0700 Subject: [PATCH 16/28] kvm: nVMX: Don't allow L2 to access the hardware CR8 If L1 does not specify the "use TPR shadow" VM-execution control in vmcs12, then L0 must specify the "CR8-load exiting" and "CR8-store exiting" VM-execution controls in vmcs02. Failure to do so will give the L2 VM unrestricted read/write access to the hardware CR8. This fixes CVE-2017-12154. Signed-off-by: Jim Mattson Reviewed-by: David Hildenbrand Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 09204993a739..8e1ae716f938 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10525,6 +10525,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (exec_control & CPU_BASED_TPR_SHADOW) { vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull); vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); + } else { +#ifdef CONFIG_X86_64 + exec_control |= CPU_BASED_CR8_LOAD_EXITING | + CPU_BASED_CR8_STORE_EXITING; +#endif } /* From 36ae3c0a36b7456432fedce38ae2f7bd3e01a563 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= Date: Thu, 7 Sep 2017 19:02:48 +0100 Subject: [PATCH 17/28] KVM: Don't accept obviously wrong gsi values via KVM_IRQFD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We cannot add routes for gsi values >= KVM_MAX_IRQ_ROUTES -- see kvm_set_irq_routing(). Hence, there is no sense in accepting them via KVM_IRQFD. Prevent them from entering the system in the first place. Signed-off-by: Jan H. Schönherr Signed-off-by: Paolo Bonzini --- virt/kvm/eventfd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index f2ac53ab8243..c608ab495282 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -565,6 +565,8 @@ kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) { if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE)) return -EINVAL; + if (args->gsi >= KVM_MAX_IRQ_ROUTES) + return -EINVAL; if (args->flags & KVM_IRQFD_FLAG_DEASSIGN) return kvm_irqfd_deassign(kvm, args); From 3a8b0677fc6180a467e26cc32ce6b0c09a32f9bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= Date: Thu, 7 Sep 2017 19:02:30 +0100 Subject: [PATCH 18/28] KVM: VMX: Do not BUG() on out-of-bounds guest IRQ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The value of the guest_irq argument to vmx_update_pi_irte() is ultimately coming from a KVM_IRQFD API call. Do not BUG() in vmx_update_pi_irte() if the value is out-of bounds. (Especially, since KVM as a whole seems to hang after that.) Instead, print a message only once if we find that we don't have a route for a certain IRQ (which can be out-of-bounds or within the array). This fixes CVE-2017-1000252. Fixes: efc644048ecde54 ("KVM: x86: Update IRTE for posted-interrupts") Signed-off-by: Jan H. Schönherr Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8e1ae716f938..0b15b43ef45d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11834,7 +11834,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, struct kvm_lapic_irq irq; struct kvm_vcpu *vcpu; struct vcpu_data vcpu_info; - int idx, ret = -EINVAL; + int idx, ret = 0; if (!kvm_arch_has_assigned_device(kvm) || !irq_remapping_cap(IRQ_POSTING_CAP) || @@ -11843,7 +11843,12 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, idx = srcu_read_lock(&kvm->irq_srcu); irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - BUG_ON(guest_irq >= irq_rt->nr_rt_entries); + if (guest_irq >= irq_rt->nr_rt_entries || + hlist_empty(&irq_rt->map[guest_irq])) { + pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", + guest_irq, irq_rt->nr_rt_entries); + goto out; + } hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { if (e->type != KVM_IRQ_ROUTING_MSI) From 8cd641e3c7cbf86c7cbd2a17a160dd137d86c860 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 13 Sep 2017 13:08:18 -0700 Subject: [PATCH 19/28] sched/wait: Add swq_has_sleeper() Which is the equivalent of what we have in regular waitqueues. I'm not crazy about the name, but this also helps us get both apis closer -- which iirc comes originally from the -net folks. We also duplicate the comments for the lockless swait_active(), from wait.h. Future users will make use of this interface. Signed-off-by: Davidlohr Bueso Signed-off-by: Paolo Bonzini --- include/linux/swait.h | 58 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/include/linux/swait.h b/include/linux/swait.h index 4a4e180d0a35..73e97a08d3d0 100644 --- a/include/linux/swait.h +++ b/include/linux/swait.h @@ -79,9 +79,63 @@ extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name DECLARE_SWAIT_QUEUE_HEAD(name) #endif -static inline int swait_active(struct swait_queue_head *q) +/** + * swait_active -- locklessly test for waiters on the queue + * @wq: the waitqueue to test for waiters + * + * returns true if the wait list is not empty + * + * NOTE: this function is lockless and requires care, incorrect usage _will_ + * lead to sporadic and non-obvious failure. + * + * NOTE2: this function has the same above implications as regular waitqueues. + * + * Use either while holding swait_queue_head::lock or when used for wakeups + * with an extra smp_mb() like: + * + * CPU0 - waker CPU1 - waiter + * + * for (;;) { + * @cond = true; prepare_to_swait(&wq_head, &wait, state); + * smp_mb(); // smp_mb() from set_current_state() + * if (swait_active(wq_head)) if (@cond) + * wake_up(wq_head); break; + * schedule(); + * } + * finish_swait(&wq_head, &wait); + * + * Because without the explicit smp_mb() it's possible for the + * swait_active() load to get hoisted over the @cond store such that we'll + * observe an empty wait list while the waiter might not observe @cond. + * This, in turn, can trigger missing wakeups. + * + * Also note that this 'optimization' trades a spin_lock() for an smp_mb(), + * which (when the lock is uncontended) are of roughly equal cost. + */ +static inline int swait_active(struct swait_queue_head *wq) { - return !list_empty(&q->task_list); + return !list_empty(&wq->task_list); +} + +/** + * swq_has_sleeper - check if there are any waiting processes + * @wq: the waitqueue to test for waiters + * + * Returns true if @wq has waiting processes + * + * Please refer to the comment for swait_active. + */ +static inline bool swq_has_sleeper(struct swait_queue_head *wq) +{ + /* + * We need to be sure we are in sync with the list_add() + * modifications to the wait queue (task_list). + * + * This memory barrier should be paired with one on the + * waiting side. + */ + smp_mb(); + return swait_active(wq); } extern void swake_up(struct swait_queue_head *q); From b9f67a420b3d76991592558af06e9cf1b8953b3d Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 13 Sep 2017 13:08:19 -0700 Subject: [PATCH 20/28] kvm,async_pf: Use swq_has_sleeper() ... as we've got the new helper now. This caller already does the right thing, hence no changes in semantics. Signed-off-by: Davidlohr Bueso Signed-off-by: Paolo Bonzini --- virt/kvm/async_pf.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index bb298a200cd3..57bcb27dcf30 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -106,11 +106,7 @@ static void async_pf_execute(struct work_struct *work) trace_kvm_async_pf_completed(addr, gva); - /* - * This memory barrier pairs with prepare_to_wait's set_current_state() - */ - smp_mb(); - if (swait_active(&vcpu->wq)) + if (swq_has_sleeper(&vcpu->wq)) swake_up(&vcpu->wq); mmput(mm); From cc1b46803a671047be22f7832ef4a2bb3f63dd94 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 13 Sep 2017 13:08:20 -0700 Subject: [PATCH 21/28] kvm,lapic: Justify use of swait_active() A comment might serve future readers. Signed-off-by: Davidlohr Bueso Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index aaf10b6f5380..69c5612be786 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1324,6 +1324,10 @@ static void apic_timer_expired(struct kvm_lapic *apic) atomic_inc(&apic->lapic_timer.pending); kvm_set_pending_timer(vcpu); + /* + * For x86, the atomic_inc() is serialized, thus + * using swait_active() is safe. + */ if (swait_active(q)) swake_up(q); From a0cff57bb2a41cb9cbf13d3203097b4156d8c0ae Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 13 Sep 2017 13:08:21 -0700 Subject: [PATCH 22/28] kvm,x86: Fix apf_task_wake_one() wq serialization During code inspection, the following potential race was seen: CPU0 CPU1 kvm_async_pf_task_wait apf_task_wake_one [L] swait_active(&n->wq) [S] prepare_to_swait(&n.wq) [L] if (!hlist_unhahed(&n.link)) schedule() [S] hlist_del_init(&n->link); Properly serialize swait_active() checks such that a wakeup is not missed. Signed-off-by: Davidlohr Bueso Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 874827b0d7ca..aa60a08b65b1 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -180,7 +180,7 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n) hlist_del_init(&n->link); if (n->halted) smp_send_reschedule(n->cpu); - else if (swait_active(&n->wq)) + else if (swq_has_sleeper(&n->wq)) swake_up(&n->wq); } From 5e0018b3e39e9b44dbfb380b83026e55d2f65b91 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 13 Sep 2017 13:08:22 -0700 Subject: [PATCH 23/28] kvm: Serialize wq active checks in kvm_vcpu_wake_up() This is a generic call and can be suceptible to races in reading the wq task_list while another task is adding itself to the list. Add a full barrier by using the swq_has_sleeper() helper. Signed-off-by: Davidlohr Bueso Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2d7df5cc955b..9deb5a245b83 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2187,7 +2187,7 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) struct swait_queue_head *wqp; wqp = kvm_arch_vcpu_wq(vcpu); - if (swait_active(wqp)) { + if (swq_has_sleeper(wqp)) { swake_up(wqp); ++vcpu->stat.halt_wakeup; return true; From 267ad7bc2d3f69af536035b6a3e4a9a2b6ae11dc Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 13 Sep 2017 13:08:23 -0700 Subject: [PATCH 24/28] kvm,powerpc: Serialize wq active checks in ops->vcpu_kick Particularly because kvmppc_fast_vcpu_kick_hv() is a callback, ensure that we properly serialize wq active checks in order to avoid potentially missing a wakeup due to racing with the waiter side. Signed-off-by: Davidlohr Bueso Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s_hv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index a7177c284f9b..73bf1ebfa78f 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -181,7 +181,7 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) struct swait_queue_head *wqp; wqp = kvm_arch_vcpu_wq(vcpu); - if (swait_active(wqp)) { + if (swq_has_sleeper(wqp)) { swake_up(wqp); ++vcpu->stat.halt_wakeup; } From 4c0b4bc60f95de4741c89b41174760258343f091 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 13 Sep 2017 13:08:24 -0700 Subject: [PATCH 25/28] kvm,mips: Fix potential swait_active() races For example, the following could occur, making us miss a wakeup: CPU0 CPU1 kvm_vcpu_block kvm_mips_comparecount_func [L] swait_active(&vcpu->wq) [S] prepare_to_swait(&vcpu->wq) [L] if (!kvm_vcpu_has_pending_timer(vcpu)) schedule() [S] queue_timer_int(vcpu) Ensure that the swait_active() check is not hoisted over the interrupt. Signed-off-by: Davidlohr Bueso Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index bce2a6431430..d535edc01434 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -514,7 +514,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, dvcpu->arch.wait = 0; - if (swait_active(&dvcpu->wq)) + if (swq_has_sleeper(&dvcpu->wq)) swake_up(&dvcpu->wq); return 0; @@ -1179,7 +1179,7 @@ static void kvm_mips_comparecount_func(unsigned long data) kvm_mips_callbacks->queue_timer_int(vcpu); vcpu->arch.wait = 0; - if (swait_active(&vcpu->wq)) + if (swq_has_sleeper(&vcpu->wq)) swake_up(&vcpu->wq); } From 7881f96cac4d420c94e62a4e1eea243899a7052e Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 14 Sep 2017 16:31:40 -0700 Subject: [PATCH 26/28] kvm: nVMX: Remove nested_vmx_succeed after successful VM-entry After a successful VM-entry, RFLAGS is cleared, with the exception of bit 1, which is always set. This is handled by load_vmcs12_host_state. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0b15b43ef45d..29f85ed5a329 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11491,16 +11491,18 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, */ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - /* - * Exiting from L2 to L1, we're now back to L1 which thinks it just - * finished a VMLAUNCH or VMRESUME instruction, so we need to set the - * success or failure flag accordingly. - */ if (unlikely(vmx->fail)) { + /* + * After an early L2 VM-entry failure, we're now back + * in L1 which thinks it just finished a VMLAUNCH or + * VMRESUME instruction, so we need to set the failure + * flag and the VM-instruction error field of the VMCS + * accordingly. + */ vmx->fail = 0; nested_vmx_failValid(vcpu, vm_inst_error); - } else - nested_vmx_succeed(vcpu); + } + if (enable_shadow_vmcs) vmx->nested.sync_shadow_vmcs = true; From b060ca3b2e9e72ef005e2042476f95ee0b8839e9 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 14 Sep 2017 16:31:42 -0700 Subject: [PATCH 27/28] kvm: vmx: Handle VMLAUNCH/VMRESUME failure properly On an early VMLAUNCH/VMRESUME failure (i.e. one which sets the VM-instruction error field of the current VMCS), the launch state of the current VMCS is not set to "launched," and the VM-exit information fields of the current VMCS (including IDT-vectoring information and exit reason) are stale. On a late VMLAUNCH/VMRESUME failure (i.e. one which sets the high bit of the exit reason field), the launch state of the current VMCS is not set to "launched," and only two of the VM-exit information fields of the current VMCS are modified (exit reason and exit qualification). The remaining VM-exit information fields of the current VMCS (including IDT-vectoring information, in particular) are stale. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 29f85ed5a329..a0a78f09b22d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9424,12 +9424,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) | (1 << VCPU_EXREG_CR3)); vcpu->arch.regs_dirty = 0; - vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - - vmx->loaded_vmcs->launched = 1; - - vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); - /* * eager fpu is enabled if PKEY is supported and CR4 is switched * back on host, so it is safe to read guest PKRU from current @@ -9451,6 +9445,14 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); vmx->nested.nested_run_pending = 0; + vmx->idt_vectoring_info = 0; + + vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON); + if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) + return; + + vmx->loaded_vmcs->launched = 1; + vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); vmx_complete_atomic_exit(vmx); vmx_recover_nmi_blocking(vmx); From 4f350c6dbcb9000e18907515ec8a7b205ac33c69 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Thu, 14 Sep 2017 16:31:44 -0700 Subject: [PATCH 28/28] kvm: nVMX: Handle deferred early VMLAUNCH/VMRESUME failure properly When emulating a nested VM-entry from L1 to L2, several control field validation checks are deferred to the hardware. Should one of these validation checks fail, vcpu_vmx_run will set the vmx->fail flag. When this happens, the L2 guest state is not loaded (even in part), and execution should continue in L1 with the next instruction after the VMLAUNCH/VMRESUME. The VMCS12 is not modified (except for the VM-instruction error field), the VMCS12 MSR save/load lists are not processed, and the CPU state is not loaded from the VMCS12 host area. Moreover, the vmcs02 exit reason is stale, so it should not be consulted for any reason. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 136 +++++++++++++++++++++++++-------------------- 1 file changed, 76 insertions(+), 60 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a0a78f09b22d..a406afbb6d21 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8344,12 +8344,14 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, - vmcs_readl(EXIT_QUALIFICATION), - vmx->idt_vectoring_info, - intr_info, - vmcs_read32(VM_EXIT_INTR_ERROR_CODE), - KVM_ISA_VMX); + if (vmx->nested.nested_run_pending) + return false; + + if (unlikely(vmx->fail)) { + pr_info_ratelimited("%s failed vm entry %x\n", __func__, + vmcs_read32(VM_INSTRUCTION_ERROR)); + return true; + } /* * The host physical addresses of some pages of guest memory @@ -8363,14 +8365,12 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) */ nested_mark_vmcs12_pages_dirty(vcpu); - if (vmx->nested.nested_run_pending) - return false; - - if (unlikely(vmx->fail)) { - pr_info_ratelimited("%s failed vm entry %x\n", __func__, - vmcs_read32(VM_INSTRUCTION_ERROR)); - return true; - } + trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, + vmcs_readl(EXIT_QUALIFICATION), + vmx->idt_vectoring_info, + intr_info, + vmcs_read32(VM_EXIT_INTR_ERROR_CODE), + KVM_ISA_VMX); switch (exit_reason) { case EXIT_REASON_EXCEPTION_NMI: @@ -11395,46 +11395,30 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - u32 vm_inst_error = 0; /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); - leave_guest_mode(vcpu); - prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, - exit_qualification); - - if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, - vmcs12->vm_exit_msr_store_count)) - nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); - - if (unlikely(vmx->fail)) - vm_inst_error = vmcs_read32(VM_INSTRUCTION_ERROR); - - vmx_switch_vmcs(vcpu, &vmx->vmcs01); - /* - * TODO: SDM says that with acknowledge interrupt on exit, bit 31 of - * the VM-exit interrupt information (valid interrupt) is always set to - * 1 on EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't need - * kvm_cpu_has_interrupt(). See the commit message for details. + * The only expected VM-instruction error is "VM entry with + * invalid control field(s)." Anything else indicates a + * problem with L0. */ - if (nested_exit_intr_ack_set(vcpu) && - exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && - kvm_cpu_has_interrupt(vcpu)) { - int irq = kvm_cpu_get_interrupt(vcpu); - WARN_ON(irq < 0); - vmcs12->vm_exit_intr_info = irq | - INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; + WARN_ON_ONCE(vmx->fail && (vmcs_read32(VM_INSTRUCTION_ERROR) != + VMXERR_ENTRY_INVALID_CONTROL_FIELD)); + + leave_guest_mode(vcpu); + + if (likely(!vmx->fail)) { + prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, + exit_qualification); + + if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, + vmcs12->vm_exit_msr_store_count)) + nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); } - trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, - vmcs12->exit_qualification, - vmcs12->idt_vectoring_info_field, - vmcs12->vm_exit_intr_info, - vmcs12->vm_exit_intr_error_code, - KVM_ISA_VMX); - + vmx_switch_vmcs(vcpu, &vmx->vmcs01); vm_entry_controls_reset_shadow(vmx); vm_exit_controls_reset_shadow(vmx); vmx_segment_cache_clear(vmx); @@ -11443,8 +11427,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, if (VMCS02_POOL_SIZE == 0) nested_free_vmcs02(vmx, vmx->nested.current_vmptr); - load_vmcs12_host_state(vcpu, vmcs12); - /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); @@ -11493,23 +11475,57 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, */ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - if (unlikely(vmx->fail)) { - /* - * After an early L2 VM-entry failure, we're now back - * in L1 which thinks it just finished a VMLAUNCH or - * VMRESUME instruction, so we need to set the failure - * flag and the VM-instruction error field of the VMCS - * accordingly. - */ - vmx->fail = 0; - nested_vmx_failValid(vcpu, vm_inst_error); - } - if (enable_shadow_vmcs) vmx->nested.sync_shadow_vmcs = true; /* in case we halted in L2 */ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + + if (likely(!vmx->fail)) { + /* + * TODO: SDM says that with acknowledge interrupt on + * exit, bit 31 of the VM-exit interrupt information + * (valid interrupt) is always set to 1 on + * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't + * need kvm_cpu_has_interrupt(). See the commit + * message for details. + */ + if (nested_exit_intr_ack_set(vcpu) && + exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && + kvm_cpu_has_interrupt(vcpu)) { + int irq = kvm_cpu_get_interrupt(vcpu); + WARN_ON(irq < 0); + vmcs12->vm_exit_intr_info = irq | + INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; + } + + trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, + vmcs12->exit_qualification, + vmcs12->idt_vectoring_info_field, + vmcs12->vm_exit_intr_info, + vmcs12->vm_exit_intr_error_code, + KVM_ISA_VMX); + + load_vmcs12_host_state(vcpu, vmcs12); + + return; + } + + /* + * After an early L2 VM-entry failure, we're now back + * in L1 which thinks it just finished a VMLAUNCH or + * VMRESUME instruction, so we need to set the failure + * flag and the VM-instruction error field of the VMCS + * accordingly. + */ + nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + /* + * The emulated instruction was already skipped in + * nested_vmx_run, but the updated RIP was never + * written back to the vmcs01. + */ + skip_emulated_instruction(vcpu); + vmx->fail = 0; } /*