Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu
[firefly-linux-kernel-4.4.55.git] / arch / x86 / kvm / svm.c
index 6c79a14a3b6f8c784c6f6118ece14fd4e2bdd6b3..1d9b33843c80ef521dc059697285cfed06cfd7d7 100644 (file)
@@ -46,6 +46,7 @@ MODULE_LICENSE("GPL");
 #define SVM_FEATURE_NPT  (1 << 0)
 #define SVM_FEATURE_LBRV (1 << 1)
 #define SVM_FEATURE_SVML (1 << 2)
+#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
 
 #define NESTED_EXIT_HOST       0       /* Exit handled on host level */
 #define NESTED_EXIT_DONE       1       /* Exit caused nested vmexit  */
@@ -53,15 +54,6 @@ MODULE_LICENSE("GPL");
 
 #define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
 
-/* Turn on to get debugging output*/
-/* #define NESTED_DEBUG */
-
-#ifdef NESTED_DEBUG
-#define nsvm_printk(fmt, args...) printk(KERN_INFO fmt, ## args)
-#else
-#define nsvm_printk(fmt, args...) do {} while(0)
-#endif
-
 static const u32 host_save_user_msrs[] = {
 #ifdef CONFIG_X86_64
        MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
@@ -85,6 +77,9 @@ struct nested_state {
        /* gpa pointers to the real vectors */
        u64 vmcb_msrpm;
 
+       /* A VMEXIT is required but not yet emulated */
+       bool exit_required;
+
        /* cache for intercepts of the guest */
        u16 intercept_cr_read;
        u16 intercept_cr_write;
@@ -112,6 +107,8 @@ struct vcpu_svm {
        u32 *msrpm;
 
        struct nested_state nested;
+
+       bool nmi_singlestep;
 };
 
 /* enable NPT for AMD64 and X86 with PAE */
@@ -286,7 +283,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
        struct vcpu_svm *svm = to_svm(vcpu);
 
        if (!svm->next_rip) {
-               if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) !=
+               if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
                                EMULATE_DONE)
                        printk(KERN_DEBUG "%s: NOP\n", __func__);
                return;
@@ -316,7 +313,7 @@ static void svm_hardware_disable(void *garbage)
        cpu_svm_disable();
 }
 
-static void svm_hardware_enable(void *garbage)
+static int svm_hardware_enable(void *garbage)
 {
 
        struct svm_cpu_data *sd;
@@ -325,16 +322,21 @@ static void svm_hardware_enable(void *garbage)
        struct desc_struct *gdt;
        int me = raw_smp_processor_id();
 
+       rdmsrl(MSR_EFER, efer);
+       if (efer & EFER_SVME)
+               return -EBUSY;
+
        if (!has_svm()) {
-               printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me);
-               return;
+               printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n",
+                      me);
+               return -EINVAL;
        }
        sd = per_cpu(svm_data, me);
 
        if (!sd) {
-               printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n",
+               printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n",
                       me);
-               return;
+               return -EINVAL;
        }
 
        sd->asid_generation = 1;
@@ -345,11 +347,11 @@ static void svm_hardware_enable(void *garbage)
        gdt = (struct desc_struct *)gdt_descr.base;
        sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
 
-       rdmsrl(MSR_EFER, efer);
        wrmsrl(MSR_EFER, efer | EFER_SVME);
 
-       wrmsrl(MSR_VM_HSAVE_PA,
-              page_to_pfn(sd->save_area) << PAGE_SHIFT);
+       wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
+
+       return 0;
 }
 
 static void svm_cpu_uninit(int cpu)
@@ -475,7 +477,7 @@ static __init int svm_hardware_setup(void)
                kvm_enable_efer_bits(EFER_SVME);
        }
 
-       for_each_online_cpu(cpu) {
+       for_each_possible_cpu(cpu) {
                r = svm_cpu_init(cpu);
                if (r)
                        goto err;
@@ -509,7 +511,7 @@ static __exit void svm_hardware_unsetup(void)
 {
        int cpu;
 
-       for_each_online_cpu(cpu)
+       for_each_possible_cpu(cpu)
                svm_cpu_uninit(cpu);
 
        __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
@@ -624,11 +626,12 @@ static void init_vmcb(struct vcpu_svm *svm)
        save->rip = 0x0000fff0;
        svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
 
-       /*
-        * cr0 val on cpu init should be 0x60000010, we enable cpu
-        * cache by default. the orderly way is to enable cache in bios.
+       /* This is the guest-visible cr0 value.
+        * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
         */
-       save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP;
+       svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
+       kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
+
        save->cr4 = X86_CR4_PAE;
        /* rdx = ?? */
 
@@ -643,8 +646,6 @@ static void init_vmcb(struct vcpu_svm *svm)
                control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK|
                                                 INTERCEPT_CR3_MASK);
                save->g_pat = 0x0007040600070406ULL;
-               /* enable caching because the QEMU Bios doesn't enable it */
-               save->cr0 = X86_CR0_ET;
                save->cr3 = 0;
                save->cr4 = 0;
        }
@@ -653,6 +654,11 @@ static void init_vmcb(struct vcpu_svm *svm)
        svm->nested.vmcb = 0;
        svm->vcpu.arch.hflags = 0;
 
+       if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
+               control->pause_filter_count = 3000;
+               control->intercept |= (1ULL << INTERCEPT_PAUSE);
+       }
+
        enable_gif(svm);
 }
 
@@ -757,15 +763,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
        int i;
 
        if (unlikely(cpu != vcpu->cpu)) {
-               u64 tsc_this, delta;
+               u64 delta;
 
                /*
                 * Make sure that the guest sees a monotonically
                 * increasing TSC.
                 */
-               rdtscll(tsc_this);
-               delta = vcpu->arch.host_tsc - tsc_this;
+               delta = vcpu->arch.host_tsc - native_read_tsc();
                svm->vmcb->control.tsc_offset += delta;
+               if (is_nested(svm))
+                       svm->nested.hsave->control.tsc_offset += delta;
                vcpu->cpu = cpu;
                kvm_migrate_timers(vcpu);
                svm->asid_generation = 0;
@@ -784,7 +791,7 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
        for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
                wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
 
-       rdtscll(vcpu->arch.host_tsc);
+       vcpu->arch.host_tsc = native_read_tsc();
 }
 
 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -1042,7 +1049,7 @@ static void update_db_intercept(struct kvm_vcpu *vcpu)
        svm->vmcb->control.intercept_exceptions &=
                ~((1 << DB_VECTOR) | (1 << BP_VECTOR));
 
-       if (vcpu->arch.singlestep)
+       if (svm->nmi_singlestep)
                svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
 
        if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
@@ -1057,26 +1064,16 @@ static void update_db_intercept(struct kvm_vcpu *vcpu)
                vcpu->guest_debug = 0;
 }
 
-static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
+static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
 {
-       int old_debug = vcpu->guest_debug;
        struct vcpu_svm *svm = to_svm(vcpu);
 
-       vcpu->guest_debug = dbg->control;
-
-       update_db_intercept(vcpu);
-
        if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
                svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
        else
                svm->vmcb->save.dr7 = vcpu->arch.dr7;
 
-       if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
-               svm->vmcb->save.rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
-       else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
-               svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
-
-       return 0;
+       update_db_intercept(vcpu);
 }
 
 static void load_host_msrs(struct kvm_vcpu *vcpu)
@@ -1177,7 +1174,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
        }
 }
 
-static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int pf_interception(struct vcpu_svm *svm)
 {
        u64 fault_address;
        u32 error_code;
@@ -1191,17 +1188,19 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
 }
 
-static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int db_interception(struct vcpu_svm *svm)
 {
+       struct kvm_run *kvm_run = svm->vcpu.run;
+
        if (!(svm->vcpu.guest_debug &
              (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
-               !svm->vcpu.arch.singlestep) {
+               !svm->nmi_singlestep) {
                kvm_queue_exception(&svm->vcpu, DB_VECTOR);
                return 1;
        }
 
-       if (svm->vcpu.arch.singlestep) {
-               svm->vcpu.arch.singlestep = false;
+       if (svm->nmi_singlestep) {
+               svm->nmi_singlestep = false;
                if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
                        svm->vmcb->save.rflags &=
                                ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
@@ -1220,25 +1219,27 @@ static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        return 1;
 }
 
-static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int bp_interception(struct vcpu_svm *svm)
 {
+       struct kvm_run *kvm_run = svm->vcpu.run;
+
        kvm_run->exit_reason = KVM_EXIT_DEBUG;
        kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
        kvm_run->debug.arch.exception = BP_VECTOR;
        return 0;
 }
 
-static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int ud_interception(struct vcpu_svm *svm)
 {
        int er;
 
-       er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD);
+       er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD);
        if (er != EMULATE_DONE)
                kvm_queue_exception(&svm->vcpu, UD_VECTOR);
        return 1;
 }
 
-static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int nm_interception(struct vcpu_svm *svm)
 {
        svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
        if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
@@ -1248,7 +1249,7 @@ static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        return 1;
 }
 
-static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int mc_interception(struct vcpu_svm *svm)
 {
        /*
         * On an #MC intercept the MCE handler is not called automatically in
@@ -1261,8 +1262,10 @@ static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        return 1;
 }
 
-static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int shutdown_interception(struct vcpu_svm *svm)
 {
+       struct kvm_run *kvm_run = svm->vcpu.run;
+
        /*
         * VMCB is undefined after a SHUTDOWN intercept
         * so reinitialize it.
@@ -1274,7 +1277,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        return 0;
 }
 
-static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int io_interception(struct vcpu_svm *svm)
 {
        u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
        int size, in, string;
@@ -1288,7 +1291,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 
        if (string) {
                if (emulate_instruction(&svm->vcpu,
-                                       kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
+                                       0, 0, 0) == EMULATE_DO_MMIO)
                        return 0;
                return 1;
        }
@@ -1298,33 +1301,33 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
 
        skip_emulated_instruction(&svm->vcpu);
-       return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
+       return kvm_emulate_pio(&svm->vcpu, in, size, port);
 }
 
-static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int nmi_interception(struct vcpu_svm *svm)
 {
        return 1;
 }
 
-static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int intr_interception(struct vcpu_svm *svm)
 {
        ++svm->vcpu.stat.irq_exits;
        return 1;
 }
 
-static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int nop_on_interception(struct vcpu_svm *svm)
 {
        return 1;
 }
 
-static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int halt_interception(struct vcpu_svm *svm)
 {
        svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
        skip_emulated_instruction(&svm->vcpu);
        return kvm_emulate_halt(&svm->vcpu);
 }
 
-static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int vmmcall_interception(struct vcpu_svm *svm)
 {
        svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
        skip_emulated_instruction(&svm->vcpu);
@@ -1375,8 +1378,15 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
 
        svm->vmcb->control.exit_code = SVM_EXIT_INTR;
 
-       if (nested_svm_exit_handled(svm)) {
-               nsvm_printk("VMexit -> INTR\n");
+       if (svm->nested.intercept & 1ULL) {
+               /*
+                * The #vmexit can't be emulated here directly because this
+                * code path runs with irqs and preemtion disabled. A
+                * #vmexit emulation might sleep. Only signal request for
+                * the #vmexit here.
+                */
+               svm->nested.exit_required = true;
+               trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
                return 1;
        }
 
@@ -1387,10 +1397,7 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx)
 {
        struct page *page;
 
-       down_read(&current->mm->mmap_sem);
        page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
-       up_read(&current->mm->mmap_sem);
-
        if (is_error_page(page))
                goto error;
 
@@ -1529,14 +1536,12 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
        }
        default: {
                u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
-               nsvm_printk("exit code: 0x%x\n", exit_code);
                if (svm->nested.intercept & exit_bits)
                        vmexit = NESTED_EXIT_DONE;
        }
        }
 
        if (vmexit == NESTED_EXIT_DONE) {
-               nsvm_printk("#VMEXIT reason=%04x\n", exit_code);
                nested_svm_vmexit(svm);
        }
 
@@ -1581,6 +1586,12 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
        struct vmcb *hsave = svm->nested.hsave;
        struct vmcb *vmcb = svm->vmcb;
 
+       trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
+                                      vmcb->control.exit_info_1,
+                                      vmcb->control.exit_info_2,
+                                      vmcb->control.exit_int_info,
+                                      vmcb->control.exit_int_info_err);
+
        nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0);
        if (!nested_vmcb)
                return 1;
@@ -1614,6 +1625,22 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
        nested_vmcb->control.exit_info_2       = vmcb->control.exit_info_2;
        nested_vmcb->control.exit_int_info     = vmcb->control.exit_int_info;
        nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
+
+       /*
+        * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
+        * to make sure that we do not lose injected events. So check event_inj
+        * here and copy it to exit_int_info if it is valid.
+        * Exit_int_info and event_inj can't be both valid because the case
+        * below only happens on a VMRUN instruction intercept which has
+        * no valid exit_int_info set.
+        */
+       if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
+               struct vmcb_control_area *nc = &nested_vmcb->control;
+
+               nc->exit_int_info     = vmcb->control.event_inj;
+               nc->exit_int_info_err = vmcb->control.event_inj_err;
+       }
+
        nested_vmcb->control.tlb_ctl           = 0;
        nested_vmcb->control.event_inj         = 0;
        nested_vmcb->control.event_inj_err     = 0;
@@ -1625,10 +1652,6 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
        /* Restore the original control entries */
        copy_vmcb_control_area(vmcb, hsave);
 
-       /* Kill any pending exceptions */
-       if (svm->vcpu.arch.exception.pending == true)
-               nsvm_printk("WARNING: Pending Exception\n");
-
        kvm_clear_exception_queue(&svm->vcpu);
        kvm_clear_interrupt_queue(&svm->vcpu);
 
@@ -1699,6 +1722,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
        /* nested_vmcb is our indicator if nested SVM is activated */
        svm->nested.vmcb = svm->vmcb->save.rax;
 
+       trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb,
+                              nested_vmcb->save.rip,
+                              nested_vmcb->control.int_ctl,
+                              nested_vmcb->control.event_inj,
+                              nested_vmcb->control.nested_ctl);
+
        /* Clear internal status */
        kvm_clear_exception_queue(&svm->vcpu);
        kvm_clear_interrupt_queue(&svm->vcpu);
@@ -1786,28 +1815,15 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
        svm->nested.intercept            = nested_vmcb->control.intercept;
 
        force_new_asid(&svm->vcpu);
-       svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info;
-       svm->vmcb->control.exit_int_info_err = nested_vmcb->control.exit_int_info_err;
        svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
-       if (nested_vmcb->control.int_ctl & V_IRQ_MASK) {
-               nsvm_printk("nSVM Injecting Interrupt: 0x%x\n",
-                               nested_vmcb->control.int_ctl);
-       }
        if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
                svm->vcpu.arch.hflags |= HF_VINTR_MASK;
        else
                svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
 
-       nsvm_printk("nSVM exit_int_info: 0x%x | int_state: 0x%x\n",
-                       nested_vmcb->control.exit_int_info,
-                       nested_vmcb->control.int_state);
-
        svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
        svm->vmcb->control.int_state = nested_vmcb->control.int_state;
        svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
-       if (nested_vmcb->control.event_inj & SVM_EVTINJ_VALID)
-               nsvm_printk("Injecting Event: 0x%x\n",
-                               nested_vmcb->control.event_inj);
        svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
        svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
 
@@ -1834,7 +1850,7 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
        to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
 }
 
-static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int vmload_interception(struct vcpu_svm *svm)
 {
        struct vmcb *nested_vmcb;
 
@@ -1854,7 +1870,7 @@ static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        return 1;
 }
 
-static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int vmsave_interception(struct vcpu_svm *svm)
 {
        struct vmcb *nested_vmcb;
 
@@ -1874,10 +1890,8 @@ static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        return 1;
 }
 
-static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int vmrun_interception(struct vcpu_svm *svm)
 {
-       nsvm_printk("VMrun\n");
-
        if (nested_svm_check_permissions(svm))
                return 1;
 
@@ -1904,7 +1918,7 @@ failed:
        return 1;
 }
 
-static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int stgi_interception(struct vcpu_svm *svm)
 {
        if (nested_svm_check_permissions(svm))
                return 1;
@@ -1917,7 +1931,7 @@ static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        return 1;
 }
 
-static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int clgi_interception(struct vcpu_svm *svm)
 {
        if (nested_svm_check_permissions(svm))
                return 1;
@@ -1934,10 +1948,12 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        return 1;
 }
 
-static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int invlpga_interception(struct vcpu_svm *svm)
 {
        struct kvm_vcpu *vcpu = &svm->vcpu;
-       nsvm_printk("INVLPGA\n");
+
+       trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX],
+                         vcpu->arch.regs[VCPU_REGS_RAX]);
 
        /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
        kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
@@ -1947,15 +1963,21 @@ static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        return 1;
 }
 
-static int invalid_op_interception(struct vcpu_svm *svm,
-                                  struct kvm_run *kvm_run)
+static int skinit_interception(struct vcpu_svm *svm)
 {
+       trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]);
+
        kvm_queue_exception(&svm->vcpu, UD_VECTOR);
        return 1;
 }
 
-static int task_switch_interception(struct vcpu_svm *svm,
-                                   struct kvm_run *kvm_run)
+static int invalid_op_interception(struct vcpu_svm *svm)
+{
+       kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+       return 1;
+}
+
+static int task_switch_interception(struct vcpu_svm *svm)
 {
        u16 tss_selector;
        int reason;
@@ -2005,14 +2027,14 @@ static int task_switch_interception(struct vcpu_svm *svm,
        return kvm_task_switch(&svm->vcpu, tss_selector, reason);
 }
 
-static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int cpuid_interception(struct vcpu_svm *svm)
 {
        svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
        kvm_emulate_cpuid(&svm->vcpu);
        return 1;
 }
 
-static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int iret_interception(struct vcpu_svm *svm)
 {
        ++svm->vcpu.stat.nmi_window_exits;
        svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
@@ -2020,26 +2042,27 @@ static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        return 1;
 }
 
-static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int invlpg_interception(struct vcpu_svm *svm)
 {
-       if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
+       if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
                pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
        return 1;
 }
 
-static int emulate_on_interception(struct vcpu_svm *svm,
-                                  struct kvm_run *kvm_run)
+static int emulate_on_interception(struct vcpu_svm *svm)
 {
-       if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
+       if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
                pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
        return 1;
 }
 
-static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int cr8_write_interception(struct vcpu_svm *svm)
 {
+       struct kvm_run *kvm_run = svm->vcpu.run;
+
        u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
        /* instruction emulation calls kvm_set_cr8() */
-       emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
+       emulate_instruction(&svm->vcpu, 0, 0, 0);
        if (irqchip_in_kernel(svm->vcpu.kvm)) {
                svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
                return 1;
@@ -2056,10 +2079,14 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
 
        switch (ecx) {
        case MSR_IA32_TSC: {
-               u64 tsc;
+               u64 tsc_offset;
 
-               rdtscll(tsc);
-               *data = svm->vmcb->control.tsc_offset + tsc;
+               if (is_nested(svm))
+                       tsc_offset = svm->nested.hsave->control.tsc_offset;
+               else
+                       tsc_offset = svm->vmcb->control.tsc_offset;
+
+               *data = tsc_offset + native_read_tsc();
                break;
        }
        case MSR_K6_STAR:
@@ -2121,7 +2148,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
        return 0;
 }
 
-static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int rdmsr_interception(struct vcpu_svm *svm)
 {
        u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
        u64 data;
@@ -2145,10 +2172,17 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
 
        switch (ecx) {
        case MSR_IA32_TSC: {
-               u64 tsc;
+               u64 tsc_offset = data - native_read_tsc();
+               u64 g_tsc_offset = 0;
+
+               if (is_nested(svm)) {
+                       g_tsc_offset = svm->vmcb->control.tsc_offset -
+                                      svm->nested.hsave->control.tsc_offset;
+                       svm->nested.hsave->control.tsc_offset = tsc_offset;
+               }
+
+               svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset;
 
-               rdtscll(tsc);
-               svm->vmcb->control.tsc_offset = data - tsc;
                break;
        }
        case MSR_K6_STAR:
@@ -2207,7 +2241,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
        return 0;
 }
 
-static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int wrmsr_interception(struct vcpu_svm *svm)
 {
        u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
        u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
@@ -2223,17 +2257,18 @@ static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
        return 1;
 }
 
-static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+static int msr_interception(struct vcpu_svm *svm)
 {
        if (svm->vmcb->control.exit_info_1)
-               return wrmsr_interception(svm, kvm_run);
+               return wrmsr_interception(svm);
        else
-               return rdmsr_interception(svm, kvm_run);
+               return rdmsr_interception(svm);
 }
 
-static int interrupt_window_interception(struct vcpu_svm *svm,
-                                  struct kvm_run *kvm_run)
+static int interrupt_window_interception(struct vcpu_svm *svm)
 {
+       struct kvm_run *kvm_run = svm->vcpu.run;
+
        svm_clear_vintr(svm);
        svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
        /*
@@ -2251,8 +2286,13 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
        return 1;
 }
 
-static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
-                                     struct kvm_run *kvm_run) = {
+static int pause_interception(struct vcpu_svm *svm)
+{
+       kvm_vcpu_on_spin(&(svm->vcpu));
+       return 1;
+}
+
+static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
        [SVM_EXIT_READ_CR0]                     = emulate_on_interception,
        [SVM_EXIT_READ_CR3]                     = emulate_on_interception,
        [SVM_EXIT_READ_CR4]                     = emulate_on_interception,
@@ -2287,6 +2327,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
        [SVM_EXIT_CPUID]                        = cpuid_interception,
        [SVM_EXIT_IRET]                         = iret_interception,
        [SVM_EXIT_INVD]                         = emulate_on_interception,
+       [SVM_EXIT_PAUSE]                        = pause_interception,
        [SVM_EXIT_HLT]                          = halt_interception,
        [SVM_EXIT_INVLPG]                       = invlpg_interception,
        [SVM_EXIT_INVLPGA]                      = invlpga_interception,
@@ -2300,26 +2341,36 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
        [SVM_EXIT_VMSAVE]                       = vmsave_interception,
        [SVM_EXIT_STGI]                         = stgi_interception,
        [SVM_EXIT_CLGI]                         = clgi_interception,
-       [SVM_EXIT_SKINIT]                       = invalid_op_interception,
+       [SVM_EXIT_SKINIT]                       = skinit_interception,
        [SVM_EXIT_WBINVD]                       = emulate_on_interception,
        [SVM_EXIT_MONITOR]                      = invalid_op_interception,
        [SVM_EXIT_MWAIT]                        = invalid_op_interception,
        [SVM_EXIT_NPF]                          = pf_interception,
 };
 
-static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+static int handle_exit(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
+       struct kvm_run *kvm_run = vcpu->run;
        u32 exit_code = svm->vmcb->control.exit_code;
 
        trace_kvm_exit(exit_code, svm->vmcb->save.rip);
 
+       if (unlikely(svm->nested.exit_required)) {
+               nested_svm_vmexit(svm);
+               svm->nested.exit_required = false;
+
+               return 1;
+       }
+
        if (is_nested(svm)) {
                int vmexit;
 
-               nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n",
-                           exit_code, svm->vmcb->control.exit_info_1,
-                           svm->vmcb->control.exit_info_2, svm->vmcb->save.rip);
+               trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
+                                       svm->vmcb->control.exit_info_1,
+                                       svm->vmcb->control.exit_info_2,
+                                       svm->vmcb->control.exit_int_info,
+                                       svm->vmcb->control.exit_int_info_err);
 
                vmexit = nested_svm_exit_special(svm);
 
@@ -2369,7 +2420,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
                return 0;
        }
 
-       return svm_exit_handlers[exit_code](svm, kvm_run);
+       return svm_exit_handlers[exit_code](svm);
 }
 
 static void reload_tss(struct kvm_vcpu *vcpu)
@@ -2446,20 +2497,47 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
                !(svm->vcpu.arch.hflags & HF_NMI_MASK);
 }
 
+static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
+}
+
+static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
+{
+       struct vcpu_svm *svm = to_svm(vcpu);
+
+       if (masked) {
+               svm->vcpu.arch.hflags |= HF_NMI_MASK;
+               svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET);
+       } else {
+               svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
+               svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
+       }
+}
+
 static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
        struct vmcb *vmcb = svm->vmcb;
-       return (vmcb->save.rflags & X86_EFLAGS_IF) &&
-               !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
-               gif_set(svm) &&
-               !(is_nested(svm) && (svm->vcpu.arch.hflags & HF_VINTR_MASK));
+       int ret;
+
+       if (!gif_set(svm) ||
+            (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
+               return 0;
+
+       ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
+
+       if (is_nested(svm))
+               return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
+
+       return ret;
 }
 
 static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
-       nsvm_printk("Trying to open IRQ window\n");
 
        nested_svm_intr(svm);
 
@@ -2484,7 +2562,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
        /* Something prevents NMI from been injected. Single step over
           possible problem (IRET or exception injection or interrupt
           shadow) */
-       vcpu->arch.singlestep = true;
+       svm->nmi_singlestep = true;
        svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
        update_db_intercept(vcpu);
 }
@@ -2574,13 +2652,20 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
 #define R "e"
 #endif
 
-static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 {
        struct vcpu_svm *svm = to_svm(vcpu);
        u16 fs_selector;
        u16 gs_selector;
        u16 ldt_selector;
 
+       /*
+        * A vmexit emulation is required before the vcpu can be executed
+        * again.
+        */
+       if (unlikely(svm->nested.exit_required))
+               return;
+
        svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
        svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
        svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
@@ -2879,6 +2964,8 @@ static struct kvm_x86_ops svm_x86_ops = {
        .queue_exception = svm_queue_exception,
        .interrupt_allowed = svm_interrupt_allowed,
        .nmi_allowed = svm_nmi_allowed,
+       .get_nmi_mask = svm_get_nmi_mask,
+       .set_nmi_mask = svm_set_nmi_mask,
        .enable_nmi_window = enable_nmi_window,
        .enable_irq_window = enable_irq_window,
        .update_cr8_intercept = update_cr8_intercept,