kvm: x86: vmx: NULL out hwapic_isr_update() in case of !enable_apicv
[firefly-linux-kernel-4.4.55.git] arch/x86/kvm/vmx.c
index 3e556c68351b7a1356c05e7fe94c6c0895d9755d..6e71fac27d4eea41cf3f580b3b5f721ab948ff66 100644
@@ -99,13 +99,15 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
 static bool __read_mostly nested = 0;
 module_param(nested, bool, S_IRUGO);
 
+static u64 __read_mostly host_xss;
+
 #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
 #define KVM_VM_CR0_ALWAYS_ON                                           \
        (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
 #define KVM_CR4_GUEST_OWNED_BITS                                     \
        (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR      \
-        | X86_CR4_OSXMMEXCPT)
+        | X86_CR4_OSXMMEXCPT | X86_CR4_TSD)
 
 #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
 #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
@@ -214,6 +216,7 @@ struct __packed vmcs12 {
        u64 virtual_apic_page_addr;
        u64 apic_access_addr;
        u64 ept_pointer;
+       u64 xss_exit_bitmap;
        u64 guest_physical_address;
        u64 vmcs_link_pointer;
        u64 guest_ia32_debugctl;
@@ -616,6 +619,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
        FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
        FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
        FIELD64(EPT_POINTER, ept_pointer),
+       FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
        FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
        FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
        FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
@@ -720,12 +724,15 @@ static const unsigned short vmcs_field_to_offset_table[] = {
        FIELD(HOST_RSP, host_rsp),
        FIELD(HOST_RIP, host_rip),
 };
-static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table);
 
 static inline short vmcs_field_to_offset(unsigned long field)
 {
-       if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0)
-               return -1;
+       BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
+
+       if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) ||
+           vmcs_field_to_offset_table[field] == 0)
+               return -ENOENT;
+
        return vmcs_field_to_offset_table[field];
 }
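
The lookup now reports an unknown field as a negative errno instead of -1, and
the BUILD_BUG_ON guarantees every valid offset fits in the short return type.
A minimal user-space sketch of that contract, with a made-up table (the field
numbers and offsets are illustrative, not real VMCS encodings):

#include <errno.h>
#include <limits.h>
#include <stdio.h>

static const unsigned short field_to_offset[] = {
        [0x02] = 0x10,  /* hypothetical field number -> byte offset */
        [0x04] = 0x18,
};

#define TABLE_SIZE (sizeof(field_to_offset) / sizeof(field_to_offset[0]))

static short lookup(unsigned long field)
{
        _Static_assert(TABLE_SIZE <= SHRT_MAX, "offsets must fit in a short");

        if (field >= TABLE_SIZE || field_to_offset[field] == 0)
                return -ENOENT;         /* 0 doubles as "unsupported field" */
        return field_to_offset[field];
}

int main(void)
{
        printf("%d %d\n", lookup(0x02), lookup(0x03));  /* prints "16 -2" */
        return 0;
}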
 
@@ -758,6 +765,7 @@ static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
 static bool vmx_mpx_supported(void);
+static bool vmx_xsaves_supported(void);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
                            struct kvm_segment *var, int seg);
@@ -1098,6 +1106,12 @@ static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
        return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
 }
 
+static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
+{
+       return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES) &&
+               vmx_xsaves_supported();
+}
+
 static inline bool is_exception(u32 intr_info)
 {
        return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -1659,12 +1673,20 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
        vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
 
        clear_atomic_switch_msr(vmx, MSR_EFER);
-       /* On ept, can't emulate nx, and must switch nx atomically */
-       if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) {
+
+       /*
+        * On EPT, we can't emulate NX, so we must switch EFER atomically.
+        * On CPUs that support "load IA32_EFER", always switch EFER
+        * atomically, since it's faster than switching it manually.
+        */
+       if (cpu_has_load_ia32_efer ||
+           (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
                guest_efer = vmx->vcpu.arch.efer;
                if (!(guest_efer & EFER_LMA))
                        guest_efer &= ~EFER_LME;
-               add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer);
+               if (guest_efer != host_efer)
+                       add_atomic_switch_msr(vmx, MSR_EFER,
+                                             guest_efer, host_efer);
                return false;
        }
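
The comment above reduces to a small predicate for choosing the atomic-switch
path over the shared-MSR path.  A user-space sketch, with cpu_has_load_ia32_efer
and enable_ept standing in for the kernel's configuration state and EFER_NX at
its architectural bit position:

#include <stdbool.h>
#include <stdint.h>

#define EFER_NX (1ull << 11)    /* IA32_EFER.NXE */

static bool switch_efer_atomically(bool cpu_has_load_ia32_efer, bool enable_ept,
                                   uint64_t guest_efer, uint64_t host_efer)
{
        /* "load IA32_EFER" controls available, or EPT with an NX mismatch */
        return cpu_has_load_ia32_efer ||
               (enable_ept && ((guest_efer ^ host_efer) & EFER_NX));
}

int main(void)
{
        /* EPT enabled and NX differs between guest and host: atomic path */
        return switch_efer_atomically(false, true, EFER_NX, 0) ? 0 : 1;
}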
 
@@ -2377,12 +2399,13 @@ static __init void nested_vmx_setup_ctls_msrs(void)
        nested_vmx_secondary_ctls_low = 0;
        nested_vmx_secondary_ctls_high &=
                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
-               SECONDARY_EXEC_UNRESTRICTED_GUEST |
-               SECONDARY_EXEC_WBINVD_EXITING;
+               SECONDARY_EXEC_WBINVD_EXITING |
+               SECONDARY_EXEC_XSAVES;
 
        if (enable_ept) {
                /* nested EPT: emulate EPT also to L1 */
-               nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
+               nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT |
+                       SECONDARY_EXEC_UNRESTRICTED_GUEST;
                nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
                         VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
                         VMX_EPT_INVEPT_BIT;
@@ -2558,6 +2581,11 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
                if (!nested_vmx_allowed(vcpu))
                        return 1;
                return vmx_get_vmx_msr(vcpu, msr_index, pdata);
+       case MSR_IA32_XSS:
+               if (!vmx_xsaves_supported())
+                       return 1;
+               data = vcpu->arch.ia32_xss;
+               break;
        case MSR_TSC_AUX:
                if (!to_vmx(vcpu)->rdtscp_enabled)
                        return 1;
@@ -2649,6 +2677,22 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                break;
        case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
                return 1; /* they are read-only */
+       case MSR_IA32_XSS:
+               if (!vmx_xsaves_supported())
+                       return 1;
+               /*
+                * The only supported bit as of Skylake is bit 8, but
+                * it is not supported on KVM.
+                */
+               if (data != 0)
+                       return 1;
+               vcpu->arch.ia32_xss = data;
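+               /*
+                * Keep the guest value loaded across entry/exit only when it
+                * actually differs from the host's, so an unchanged XSS does
+                * not cost an extra atomic MSR switch.
+                */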
+               if (vcpu->arch.ia32_xss != host_xss)
+                       add_atomic_switch_msr(vmx, MSR_IA32_XSS,
+                               vcpu->arch.ia32_xss, host_xss);
+               else
+                       clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
+               break;
        case MSR_TSC_AUX:
                if (!vmx->rdtscp_enabled)
                        return 1;
@@ -2884,7 +2928,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                        SECONDARY_EXEC_ENABLE_INVPCID |
                        SECONDARY_EXEC_APIC_REGISTER_VIRT |
                        SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
-                       SECONDARY_EXEC_SHADOW_VMCS;
+                       SECONDARY_EXEC_SHADOW_VMCS |
+                       SECONDARY_EXEC_XSAVES;
                if (adjust_vmx_controls(min2, opt2,
                                        MSR_IA32_VMX_PROCBASED_CTLS2,
                                        &_cpu_based_2nd_exec_control) < 0)
@@ -3007,6 +3052,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                }
        }
 
+       if (cpu_has_xsaves)
+               rdmsrl(MSR_IA32_XSS, host_xss);
+
        return 0;
 }
 
@@ -3110,76 +3158,6 @@ static __init int alloc_kvm_area(void)
        return 0;
 }
 
-static __init int hardware_setup(void)
-{
-       if (setup_vmcs_config(&vmcs_config) < 0)
-               return -EIO;
-
-       if (boot_cpu_has(X86_FEATURE_NX))
-               kvm_enable_efer_bits(EFER_NX);
-
-       if (!cpu_has_vmx_vpid())
-               enable_vpid = 0;
-       if (!cpu_has_vmx_shadow_vmcs())
-               enable_shadow_vmcs = 0;
-       if (enable_shadow_vmcs)
-               init_vmcs_shadow_fields();
-
-       if (!cpu_has_vmx_ept() ||
-           !cpu_has_vmx_ept_4levels()) {
-               enable_ept = 0;
-               enable_unrestricted_guest = 0;
-               enable_ept_ad_bits = 0;
-       }
-
-       if (!cpu_has_vmx_ept_ad_bits())
-               enable_ept_ad_bits = 0;
-
-       if (!cpu_has_vmx_unrestricted_guest())
-               enable_unrestricted_guest = 0;
-
-       if (!cpu_has_vmx_flexpriority()) {
-               flexpriority_enabled = 0;
-
-               /*
-                * set_apic_access_page_addr() is used to reload apic access
-                * page upon invalidation.  No need to do anything if the
-                * processor does not have the APIC_ACCESS_ADDR VMCS field.
-                */
-               kvm_x86_ops->set_apic_access_page_addr = NULL;
-       }
-
-       if (!cpu_has_vmx_tpr_shadow())
-               kvm_x86_ops->update_cr8_intercept = NULL;
-
-       if (enable_ept && !cpu_has_vmx_ept_2m_page())
-               kvm_disable_largepages();
-
-       if (!cpu_has_vmx_ple())
-               ple_gap = 0;
-
-       if (!cpu_has_vmx_apicv())
-               enable_apicv = 0;
-
-       if (enable_apicv)
-               kvm_x86_ops->update_cr8_intercept = NULL;
-       else {
-               kvm_x86_ops->hwapic_irr_update = NULL;
-               kvm_x86_ops->deliver_posted_interrupt = NULL;
-               kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
-       }
-
-       if (nested)
-               nested_vmx_setup_ctls_msrs();
-
-       return alloc_kvm_area();
-}
-
-static __exit void hardware_unsetup(void)
-{
-       free_kvm_area();
-}
-
 static bool emulation_required(struct kvm_vcpu *vcpu)
 {
        return emulate_invalid_guest_state && !guest_state_valid(vcpu);
@@ -4396,6 +4374,7 @@ static void ept_set_mmio_spte_mask(void)
        kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull);
 }
 
+#define VMX_XSS_EXIT_BITMAP 0
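+/*
+ * A bitmap of zero means XSAVES/XRSTORS executed by the guest never cause
+ * a VM exit.
+ */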
 /*
  * Sets up the vmcs for emulated real mode.
  */
@@ -4505,6 +4484,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
        vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
        set_cr4_guest_host_mask(vmx);
 
+       if (vmx_xsaves_supported())
+               vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
+
        return 0;
 }
 
@@ -5163,13 +5145,20 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 static int handle_dr(struct kvm_vcpu *vcpu)
 {
        unsigned long exit_qualification;
-       int dr, reg;
+       int dr, dr7, reg;
+
+       exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+       dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
+
+       /* First, if DR does not exist, trigger UD */
+       if (!kvm_require_dr(vcpu, dr))
+               return 1;
 
        /* Do not handle if the CPL > 0, will trigger GP on re-entry */
        if (!kvm_require_cpl(vcpu, 0))
                return 1;
-       dr = vmcs_readl(GUEST_DR7);
-       if (dr & DR7_GD) {
+       dr7 = vmcs_readl(GUEST_DR7);
+       if (dr7 & DR7_GD) {
                /*
                 * As the vm-exit takes precedence over the debug trap, we
                 * need to emulate the latter, either for the host or the
@@ -5177,17 +5166,14 @@ static int handle_dr(struct kvm_vcpu *vcpu)
                 */
                if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
                        vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
-                       vcpu->run->debug.arch.dr7 = dr;
-                       vcpu->run->debug.arch.pc =
-                               vmcs_readl(GUEST_CS_BASE) +
-                               vmcs_readl(GUEST_RIP);
+                       vcpu->run->debug.arch.dr7 = dr7;
+                       vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
                        vcpu->run->debug.arch.exception = DB_VECTOR;
                        vcpu->run->exit_reason = KVM_EXIT_DEBUG;
                        return 0;
                } else {
-                       vcpu->arch.dr7 &= ~DR7_GD;
+                       vcpu->arch.dr6 &= ~15;
                        vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
-                       vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
                        kvm_queue_exception(vcpu, DB_VECTOR);
                        return 1;
                }
@@ -5209,8 +5195,6 @@ static int handle_dr(struct kvm_vcpu *vcpu)
                return 1;
        }
 
-       exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-       dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
        reg = DEBUG_REG_ACCESS_REG(exit_qualification);
        if (exit_qualification & TYPE_MOV_FROM_DR) {
                unsigned long val;
@@ -5391,6 +5375,20 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+static int handle_xsaves(struct kvm_vcpu *vcpu)
+{
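+       /*
+        * The XSS-exiting bitmap is kept at zero and the guest cannot set
+        * IA32_XSS to a non-zero value, so this exit should never occur.
+        */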
+       skip_emulated_instruction(vcpu);
+       WARN(1, "this should never happen\n");
+       return 1;
+}
+
+static int handle_xrstors(struct kvm_vcpu *vcpu)
+{
+       skip_emulated_instruction(vcpu);
+       WARN(1, "this should never happen\n");
+       return 1;
+}
+
 static int handle_apic_access(struct kvm_vcpu *vcpu)
 {
        if (likely(fasteoi)) {
@@ -5492,7 +5490,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
        }
 
        /* clear all local breakpoint enable flags */
-       vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x55);
+       vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x155);
 
        /*
         * TODO: What about debug traps on tss switch?
@@ -5539,11 +5537,11 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
        trace_kvm_page_fault(gpa, exit_qualification);
 
        /* It is a write fault? */
-       error_code = exit_qualification & (1U << 1);
+       error_code = exit_qualification & PFERR_WRITE_MASK;
        /* It is a fetch fault? */
-       error_code |= (exit_qualification & (1U << 2)) << 2;
+       error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK;
        /* ept page table is present? */
-       error_code |= (exit_qualification >> 3) & 0x1;
+       error_code |= (exit_qualification >> 3) & PFERR_PRESENT_MASK;
 
        vcpu->arch.exit_qualification = exit_qualification;
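
The three shifts above map EPT-violation exit-qualification bits onto page-fault
error-code bits.  A stand-alone sketch with the PFERR_* constants written out at
their x86 bit positions (present = bit 0, write = bit 1, fetch = bit 4;
exit-qualification bit 3 reports whether the guest-physical address was at
least readable):

#include <stdint.h>
#include <stdio.h>

#define PFERR_PRESENT_MASK (1u << 0)
#define PFERR_WRITE_MASK   (1u << 1)
#define PFERR_FETCH_MASK   (1u << 4)

static uint32_t ept_qual_to_error_code(uint64_t qual)
{
        uint32_t error_code;

        error_code  = qual & PFERR_WRITE_MASK;          /* qual bit 1 -> bit 1 */
        error_code |= (qual << 2) & PFERR_FETCH_MASK;   /* qual bit 2 -> bit 4 */
        error_code |= (qual >> 3) & PFERR_PRESENT_MASK; /* qual bit 3 -> bit 0 */
        return error_code;
}

int main(void)
{
        /* write access to a present translation: qualification bits 1 and 3 */
        printf("0x%x\n", ept_qual_to_error_code((1u << 1) | (1u << 3)));
        return 0;
}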
 
@@ -5785,129 +5783,328 @@ static void update_ple_window_actual_max(void)
                                            ple_window_grow, INT_MIN);
 }
 
-/*
- * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
- * exiting, so only get here on cpu with PAUSE-Loop-Exiting.
- */
-static int handle_pause(struct kvm_vcpu *vcpu)
+static __init int hardware_setup(void)
 {
-       if (ple_gap)
-               grow_ple_window(vcpu);
+       int r = -ENOMEM, i, msr;
 
-       skip_emulated_instruction(vcpu);
-       kvm_vcpu_on_spin(vcpu);
+       rdmsrl_safe(MSR_EFER, &host_efer);
 
-       return 1;
-}
+       for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
+               kvm_define_shared_msr(i, vmx_msr_index[i]);
 
-static int handle_nop(struct kvm_vcpu *vcpu)
-{
-       skip_emulated_instruction(vcpu);
-       return 1;
-}
+       vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
+       if (!vmx_io_bitmap_a)
+               return r;
 
-static int handle_mwait(struct kvm_vcpu *vcpu)
-{
-       printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
-       return handle_nop(vcpu);
-}
+       vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
+       if (!vmx_io_bitmap_b)
+               goto out;
 
-static int handle_monitor(struct kvm_vcpu *vcpu)
-{
-       printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
-       return handle_nop(vcpu);
-}
+       vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
+       if (!vmx_msr_bitmap_legacy)
+               goto out1;
 
-/*
- * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
- * We could reuse a single VMCS for all the L2 guests, but we also want the
- * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
- * allows keeping them loaded on the processor, and in the future will allow
- * optimizations where prepare_vmcs02 doesn't need to set all the fields on
- * every entry if they never change.
- * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
- * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
- *
- * The following functions allocate and free a vmcs02 in this pool.
- */
+       vmx_msr_bitmap_legacy_x2apic =
+                               (unsigned long *)__get_free_page(GFP_KERNEL);
+       if (!vmx_msr_bitmap_legacy_x2apic)
+               goto out2;
 
-/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
-static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
-{
-       struct vmcs02_list *item;
-       list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
-               if (item->vmptr == vmx->nested.current_vmptr) {
-                       list_move(&item->list, &vmx->nested.vmcs02_pool);
-                       return &item->vmcs02;
-               }
+       vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
+       if (!vmx_msr_bitmap_longmode)
+               goto out3;
 
-       if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
-               /* Recycle the least recently used VMCS. */
-               item = list_entry(vmx->nested.vmcs02_pool.prev,
-                       struct vmcs02_list, list);
-               item->vmptr = vmx->nested.current_vmptr;
-               list_move(&item->list, &vmx->nested.vmcs02_pool);
-               return &item->vmcs02;
-       }
+       vmx_msr_bitmap_longmode_x2apic =
+                               (unsigned long *)__get_free_page(GFP_KERNEL);
+       if (!vmx_msr_bitmap_longmode_x2apic)
+               goto out4;
+       vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+       if (!vmx_vmread_bitmap)
+               goto out5;
 
-       /* Create a new VMCS */
-       item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
-       if (!item)
-               return NULL;
-       item->vmcs02.vmcs = alloc_vmcs();
-       if (!item->vmcs02.vmcs) {
-               kfree(item);
-               return NULL;
-       }
-       loaded_vmcs_init(&item->vmcs02);
-       item->vmptr = vmx->nested.current_vmptr;
-       list_add(&(item->list), &(vmx->nested.vmcs02_pool));
-       vmx->nested.vmcs02_num++;
-       return &item->vmcs02;
-}
+       vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+       if (!vmx_vmwrite_bitmap)
+               goto out6;
 
-/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
-static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
-{
-       struct vmcs02_list *item;
-       list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
-               if (item->vmptr == vmptr) {
-                       free_loaded_vmcs(&item->vmcs02);
-                       list_del(&item->list);
-                       kfree(item);
-                       vmx->nested.vmcs02_num--;
-                       return;
-               }
-}
+       memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
+       memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
 
-/*
- * Free all VMCSs saved for this vcpu, except the one pointed by
- * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
- * must be &vmx->vmcs01.
- */
-static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
-{
-       struct vmcs02_list *item, *n;
+       /*
+        * Allow direct access to the PC debug port (it is often used for I/O
+        * delays, but the vmexits simply slow things down).
+        */
+       memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
+       clear_bit(0x80, vmx_io_bitmap_a);
 
-       WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
-       list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
-               /*
-                * Something will leak if the above WARN triggers.  Better than
-                * a use-after-free.
-                */
-               if (vmx->loaded_vmcs == &item->vmcs02)
-                       continue;
+       memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
 
-               free_loaded_vmcs(&item->vmcs02);
-               list_del(&item->list);
-               kfree(item);
-               vmx->nested.vmcs02_num--;
+       memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
+       memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
+
+       if (setup_vmcs_config(&vmcs_config) < 0) {
+               r = -EIO;
+               goto out7;
        }
-}
 
-/*
- * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
- * set the success or error code of an emulated VMX instruction, as specified
+       if (boot_cpu_has(X86_FEATURE_NX))
+               kvm_enable_efer_bits(EFER_NX);
+
+       if (!cpu_has_vmx_vpid())
+               enable_vpid = 0;
+       if (!cpu_has_vmx_shadow_vmcs())
+               enable_shadow_vmcs = 0;
+       if (enable_shadow_vmcs)
+               init_vmcs_shadow_fields();
+
+       if (!cpu_has_vmx_ept() ||
+           !cpu_has_vmx_ept_4levels()) {
+               enable_ept = 0;
+               enable_unrestricted_guest = 0;
+               enable_ept_ad_bits = 0;
+       }
+
+       if (!cpu_has_vmx_ept_ad_bits())
+               enable_ept_ad_bits = 0;
+
+       if (!cpu_has_vmx_unrestricted_guest())
+               enable_unrestricted_guest = 0;
+
+       if (!cpu_has_vmx_flexpriority()) {
+               flexpriority_enabled = 0;
+
+               /*
+                * set_apic_access_page_addr() is used to reload apic access
+                * page upon invalidation.  No need to do anything if the
+                * processor does not have the APIC_ACCESS_ADDR VMCS field.
+                */
+               kvm_x86_ops->set_apic_access_page_addr = NULL;
+       }
+
+       if (!cpu_has_vmx_tpr_shadow())
+               kvm_x86_ops->update_cr8_intercept = NULL;
+
+       if (enable_ept && !cpu_has_vmx_ept_2m_page())
+               kvm_disable_largepages();
+
+       if (!cpu_has_vmx_ple())
+               ple_gap = 0;
+
+       if (!cpu_has_vmx_apicv())
+               enable_apicv = 0;
+
+       if (enable_apicv)
+               kvm_x86_ops->update_cr8_intercept = NULL;
+       else {
+               kvm_x86_ops->hwapic_irr_update = NULL;
+               kvm_x86_ops->hwapic_isr_update = NULL;
+               kvm_x86_ops->deliver_posted_interrupt = NULL;
+               kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
+       }
+
+       if (nested)
+               nested_vmx_setup_ctls_msrs();
+
+       vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
+       vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
+       vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
+       vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
+       vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
+       vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
+       vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true);
+
+       memcpy(vmx_msr_bitmap_legacy_x2apic,
+                       vmx_msr_bitmap_legacy, PAGE_SIZE);
+       memcpy(vmx_msr_bitmap_longmode_x2apic,
+                       vmx_msr_bitmap_longmode, PAGE_SIZE);
+
+       if (enable_apicv) {
+               for (msr = 0x800; msr <= 0x8ff; msr++)
+                       vmx_disable_intercept_msr_read_x2apic(msr);
+
+               /*
+                * According to the SDM, in x2apic mode the whole ID register
+                * is used, but KVM only uses the highest eight bits, so reads
+                * of the ID register must still be intercepted.
+                */
+               vmx_enable_intercept_msr_read_x2apic(0x802);
+               /* TMCCT */
+               vmx_enable_intercept_msr_read_x2apic(0x839);
+               /* TPR */
+               vmx_disable_intercept_msr_write_x2apic(0x808);
+               /* EOI */
+               vmx_disable_intercept_msr_write_x2apic(0x80b);
+               /* SELF-IPI */
+               vmx_disable_intercept_msr_write_x2apic(0x83f);
+       }
+
+       if (enable_ept) {
+               kvm_mmu_set_mask_ptes(0ull,
+                       (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
+                       (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
+                       0ull, VMX_EPT_EXECUTABLE_MASK);
+               ept_set_mmio_spte_mask();
+               kvm_enable_tdp();
+       } else
+               kvm_disable_tdp();
+
+       update_ple_window_actual_max();
+
+       return alloc_kvm_area();
+
+out7:
+       free_page((unsigned long)vmx_vmwrite_bitmap);
+out6:
+       free_page((unsigned long)vmx_vmread_bitmap);
+out5:
+       free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
+out4:
+       free_page((unsigned long)vmx_msr_bitmap_longmode);
+out3:
+       free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
+out2:
+       free_page((unsigned long)vmx_msr_bitmap_legacy);
+out1:
+       free_page((unsigned long)vmx_io_bitmap_b);
+out:
+       free_page((unsigned long)vmx_io_bitmap_a);
+
+       return r;
+}
+
+static __exit void hardware_unsetup(void)
+{
+       free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
+       free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
+       free_page((unsigned long)vmx_msr_bitmap_legacy);
+       free_page((unsigned long)vmx_msr_bitmap_longmode);
+       free_page((unsigned long)vmx_io_bitmap_b);
+       free_page((unsigned long)vmx_io_bitmap_a);
+       free_page((unsigned long)vmx_vmwrite_bitmap);
+       free_page((unsigned long)vmx_vmread_bitmap);
+
+       free_kvm_area();
+}
+
+/*
+ * Indicate a busy-waiting vcpu in a spinlock.  We do not enable PAUSE
+ * exiting, so we only get here on CPUs with PAUSE-loop exiting.
+ */
+static int handle_pause(struct kvm_vcpu *vcpu)
+{
+       if (ple_gap)
+               grow_ple_window(vcpu);
+
+       skip_emulated_instruction(vcpu);
+       kvm_vcpu_on_spin(vcpu);
+
+       return 1;
+}
+
+static int handle_nop(struct kvm_vcpu *vcpu)
+{
+       skip_emulated_instruction(vcpu);
+       return 1;
+}
+
+static int handle_mwait(struct kvm_vcpu *vcpu)
+{
+       printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
+       return handle_nop(vcpu);
+}
+
+static int handle_monitor(struct kvm_vcpu *vcpu)
+{
+       printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
+       return handle_nop(vcpu);
+}
+
+/*
+ * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
+ * We could reuse a single VMCS for all the L2 guests, but we also want the
+ * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
+ * allows keeping them loaded on the processor, and in the future will allow
+ * optimizations where prepare_vmcs02 doesn't need to set all the fields on
+ * every entry if they never change.
+ * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
+ * (>=0) with a vmcs02 for each recently loaded vmcs12, most recent first.
+ *
+ * The following functions allocate and free a vmcs02 in this pool.
+ */
+
+/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
+static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
+{
+       struct vmcs02_list *item;
+       list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
+               if (item->vmptr == vmx->nested.current_vmptr) {
+                       list_move(&item->list, &vmx->nested.vmcs02_pool);
+                       return &item->vmcs02;
+               }
+
+       if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
+               /* Recycle the least recently used VMCS. */
+               item = list_entry(vmx->nested.vmcs02_pool.prev,
+                       struct vmcs02_list, list);
+               item->vmptr = vmx->nested.current_vmptr;
+               list_move(&item->list, &vmx->nested.vmcs02_pool);
+               return &item->vmcs02;
+       }
+
+       /* Create a new VMCS */
+       item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
+       if (!item)
+               return NULL;
+       item->vmcs02.vmcs = alloc_vmcs();
+       if (!item->vmcs02.vmcs) {
+               kfree(item);
+               return NULL;
+       }
+       loaded_vmcs_init(&item->vmcs02);
+       item->vmptr = vmx->nested.current_vmptr;
+       list_add(&(item->list), &(vmx->nested.vmcs02_pool));
+       vmx->nested.vmcs02_num++;
+       return &item->vmcs02;
+}
+
+/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
+static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
+{
+       struct vmcs02_list *item;
+       list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
+               if (item->vmptr == vmptr) {
+                       free_loaded_vmcs(&item->vmcs02);
+                       list_del(&item->list);
+                       kfree(item);
+                       vmx->nested.vmcs02_num--;
+                       return;
+               }
+}
+
+/*
+ * Free all VMCSs saved for this vcpu, except the one pointed by
+ * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
+ * must be &vmx->vmcs01.
+ */
+static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
+{
+       struct vmcs02_list *item, *n;
+
+       WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
+       list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
+               /*
+                * Something will leak if the above WARN triggers.  Better than
+                * a use-after-free.
+                */
+               if (vmx->loaded_vmcs == &item->vmcs02)
+                       continue;
+
+               free_loaded_vmcs(&item->vmcs02);
+               list_del(&item->list);
+               kfree(item);
+               vmx->nested.vmcs02_num--;
+       }
+}
+
+/*
+ * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
+ * set the success or error code of an emulated VMX instruction, as specified
  * by Vol 2B, VMX Instruction Reference, "Conventions".
  */
 static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
@@ -5947,6 +6144,13 @@ static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
         */
 }
 
+static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
+{
+       /* TODO: not to reset guest simply here. */
+       kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+       pr_warn("kvm: nested vmx abort, indicator %d\n", indicator);
+}
+
 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
 {
        struct vcpu_vmx *vmx =
@@ -6361,58 +6565,60 @@ static inline int vmcs_field_readonly(unsigned long field)
  * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
  * 64-bit fields are to be returned).
  */
-static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu,
-                                       unsigned long field, u64 *ret)
+static inline int vmcs12_read_any(struct kvm_vcpu *vcpu,
+                                 unsigned long field, u64 *ret)
 {
        short offset = vmcs_field_to_offset(field);
        char *p;
 
        if (offset < 0)
-               return 0;
+               return offset;
 
        p = ((char *)(get_vmcs12(vcpu))) + offset;
 
        switch (vmcs_field_type(field)) {
        case VMCS_FIELD_TYPE_NATURAL_WIDTH:
                *ret = *((natural_width *)p);
-               return 1;
+               return 0;
        case VMCS_FIELD_TYPE_U16:
                *ret = *((u16 *)p);
-               return 1;
+               return 0;
        case VMCS_FIELD_TYPE_U32:
                *ret = *((u32 *)p);
-               return 1;
+               return 0;
        case VMCS_FIELD_TYPE_U64:
                *ret = *((u64 *)p);
-               return 1;
+               return 0;
        default:
-               return 0; /* can never happen. */
+               WARN_ON(1);
+               return -ENOENT;
        }
 }
 
 
-static inline bool vmcs12_write_any(struct kvm_vcpu *vcpu,
-                                   unsigned long field, u64 field_value){
+static inline int vmcs12_write_any(struct kvm_vcpu *vcpu,
+                                  unsigned long field, u64 field_value){
        short offset = vmcs_field_to_offset(field);
        char *p = ((char *) get_vmcs12(vcpu)) + offset;
        if (offset < 0)
-               return false;
+               return offset;
 
        switch (vmcs_field_type(field)) {
        case VMCS_FIELD_TYPE_U16:
                *(u16 *)p = field_value;
-               return true;
+               return 0;
        case VMCS_FIELD_TYPE_U32:
                *(u32 *)p = field_value;
-               return true;
+               return 0;
        case VMCS_FIELD_TYPE_U64:
                *(u64 *)p = field_value;
-               return true;
+               return 0;
        case VMCS_FIELD_TYPE_NATURAL_WIDTH:
                *(natural_width *)p = field_value;
-               return true;
+               return 0;
        default:
-               return false; /* can never happen. */
+               WARN_ON(1);
+               return -ENOENT;
        }
 
 }
@@ -6445,6 +6651,9 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
                case VMCS_FIELD_TYPE_NATURAL_WIDTH:
                        field_value = vmcs_readl(field);
                        break;
+               default:
+                       WARN_ON(1);
+                       continue;
                }
                vmcs12_write_any(&vmx->vcpu, field, field_value);
        }
@@ -6490,6 +6699,9 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
                        case VMCS_FIELD_TYPE_NATURAL_WIDTH:
                                vmcs_writel(field, (long)field_value);
                                break;
+                       default:
+                               WARN_ON(1);
+                               break;
                        }
                }
        }
@@ -6528,7 +6740,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
        /* Decode instruction info and find the field to read */
        field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
        /* Read the field, zero-extended to a u64 field_value */
-       if (!vmcs12_read_any(vcpu, field, &field_value)) {
+       if (vmcs12_read_any(vcpu, field, &field_value) < 0) {
                nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
                skip_emulated_instruction(vcpu);
                return 1;
@@ -6598,7 +6810,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
                return 1;
        }
 
-       if (!vmcs12_write_any(vcpu, field, field_value)) {
+       if (vmcs12_write_any(vcpu, field, field_value) < 0) {
                nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
                skip_emulated_instruction(vcpu);
                return 1;
@@ -6802,6 +7014,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_MONITOR_INSTRUCTION]     = handle_monitor,
        [EXIT_REASON_INVEPT]                  = handle_invept,
        [EXIT_REASON_INVVPID]                 = handle_invvpid,
+       [EXIT_REASON_XSAVES]                  = handle_xsaves,
+       [EXIT_REASON_XRSTORS]                 = handle_xrstors,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -7089,6 +7303,14 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
        case EXIT_REASON_XSETBV:
                return 1;
+       case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
+               /*
+                * This should never happen, since it is not possible to
+                * set XSS to a non-zero value---neither in L1 nor in L2.
+                * If it were, XSS would have to be checked against
+                * the XSS exit bitmap in vmcs12.
+                */
+               return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
        default:
                return 1;
        }
@@ -7257,9 +7479,6 @@ static void vmx_hwapic_isr_update(struct kvm *kvm, int isr)
        u16 status;
        u8 old;
 
-       if (!vmx_vm_has_apicv(kvm))
-               return;
-
        if (isr == -1)
                isr = 0;
 
@@ -7277,6 +7496,9 @@ static void vmx_set_rvi(int vector)
        u16 status;
        u8 old;
 
+       if (vector == -1)
+               vector = 0;
+
        status = vmcs_read16(GUEST_INTR_STATUS);
        old = (u8)status & 0xff;
        if ((u8)vector != old) {
@@ -7288,22 +7510,23 @@ static void vmx_set_rvi(int vector)
 
 static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
 {
+       if (!is_guest_mode(vcpu)) {
+               vmx_set_rvi(max_irr);
+               return;
+       }
+
        if (max_irr == -1)
                return;
 
        /*
-        * If a vmexit is needed, vmx_check_nested_events handles it.
+        * In guest mode.  If a vmexit is needed, vmx_check_nested_events
+        * handles it.
         */
-       if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+       if (nested_exit_on_intr(vcpu))
                return;
 
-       if (!is_guest_mode(vcpu)) {
-               vmx_set_rvi(max_irr);
-               return;
-       }
-
        /*
-        * Fall back to pre-APICv interrupt injection since L2
+        * Else, fall back to pre-APICv interrupt injection since L2
         * is run without virtual interrupt delivery.
         */
        if (!kvm_event_needs_reinjection(vcpu) &&
@@ -7400,6 +7623,12 @@ static bool vmx_mpx_supported(void)
                (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS);
 }
 
+static bool vmx_xsaves_supported(void)
+{
+       return vmcs_config.cpu_based_2nd_exec_ctrl &
+               SECONDARY_EXEC_XSAVES;
+}
+
 static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 {
        u32 exit_intr_info;
@@ -7975,6 +8204,18 @@ static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
        vcpu->arch.walk_mmu = &vcpu->arch.mmu;
 }
 
+static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
+                                           u16 error_code)
+{
+       bool inequality, bit;
+
+       bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
+       inequality =
+               (error_code & vmcs12->page_fault_error_code_mask) !=
+                vmcs12->page_fault_error_code_match;
+       return inequality ^ bit;
+}
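
The XOR above encodes the PFEC_MASK/PFEC_MATCH rule from the SDM: with EB.PF
set, a #PF is reflected to L1 only when the masked error code equals
PFEC_MATCH; with EB.PF clear, only when it differs.  A small user-space sketch
(PF_VECTOR is 14; the arguments stand in for the vmcs12 fields):

#include <stdbool.h>
#include <stdint.h>

#define PF_VECTOR 14

static bool pf_causes_vmexit(uint32_t exception_bitmap, uint32_t pfec_mask,
                             uint32_t pfec_match, uint16_t error_code)
{
        bool bit = (exception_bitmap & (1u << PF_VECTOR)) != 0;
        bool inequality = (error_code & pfec_mask) != pfec_match;

        return inequality ^ bit;
}

int main(void)
{
        /* EB.PF set with mask/match of 0/0: every #PF is reflected to L1 */
        return pf_causes_vmexit(1u << PF_VECTOR, 0, 0, 0x2) ? 0 : 1;
}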
+
 static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
                struct x86_exception *fault)
 {
@@ -7982,8 +8223,7 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
 
        WARN_ON(!is_guest_mode(vcpu));
 
-       /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */
-       if (vmcs12->exception_bitmap & (1u << PF_VECTOR))
+       if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code))
                nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
                                  vmcs_read32(VM_EXIT_INTR_INFO),
                                  vmcs_readl(EXIT_QUALIFICATION));
@@ -8062,6 +8302,162 @@ static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
                      ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
 }
 
+static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
+                                      unsigned long count_field,
+                                      unsigned long addr_field,
+                                      int maxphyaddr)
+{
+       u64 count, addr;
+
+       if (vmcs12_read_any(vcpu, count_field, &count) ||
+           vmcs12_read_any(vcpu, addr_field, &addr)) {
+               WARN_ON(1);
+               return -EINVAL;
+       }
+       if (count == 0)
+               return 0;
+       if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
+           (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
+               pr_warn_ratelimited(
+                       "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
+                       addr_field, maxphyaddr, count, addr);
+               return -EINVAL;
+       }
+       return 0;
+}
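
The check above enforces the architectural constraints on a VM-entry/exit MSR
switch area: 16-byte alignment, and both the first and the last byte below the
guest's physical-address width.  A user-space sketch of the same arithmetic
(the struct layout matches the 16-byte VMX MSR-entry format; the values in
main() are illustrative):

#include <stdbool.h>
#include <stdint.h>

struct vmx_msr_entry {
        uint32_t index;
        uint32_t reserved;
        uint64_t value;
};

static bool msr_switch_area_valid(uint64_t addr, uint64_t count, int maxphyaddr)
{
        uint64_t last = addr + count * sizeof(struct vmx_msr_entry) - 1;

        if (count == 0)
                return true;    /* nothing to switch */
        return addr % 16 == 0 &&
               (addr >> maxphyaddr) == 0 &&
               (last >> maxphyaddr) == 0;
}

int main(void)
{
        /* two aligned entries, entirely below a 36-bit guest physical width */
        return msr_switch_area_valid(0x10000, 2, 36) ? 0 : 1;
}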
+
+static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
+                                               struct vmcs12 *vmcs12)
+{
+       int maxphyaddr;
+
+       if (vmcs12->vm_exit_msr_load_count == 0 &&
+           vmcs12->vm_exit_msr_store_count == 0 &&
+           vmcs12->vm_entry_msr_load_count == 0)
+               return 0; /* Fast path */
+       maxphyaddr = cpuid_maxphyaddr(vcpu);
+       if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
+                                       VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) ||
+           nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
+                                       VM_EXIT_MSR_STORE_ADDR, maxphyaddr) ||
+           nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
+                                       VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr))
+               return -EINVAL;
+       return 0;
+}
+
+static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
+                                      struct vmx_msr_entry *e)
+{
+       /* x2APIC MSR accesses are not allowed */
+       if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8)
+               return -EINVAL;
+       if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
+           e->index == MSR_IA32_UCODE_REV)
+               return -EINVAL;
+       if (e->reserved != 0)
+               return -EINVAL;
+       return 0;
+}
+
+static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
+                                    struct vmx_msr_entry *e)
+{
+       if (e->index == MSR_FS_BASE ||
+           e->index == MSR_GS_BASE ||
+           e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */
+           nested_vmx_msr_check_common(vcpu, e))
+               return -EINVAL;
+       return 0;
+}
+
+static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
+                                     struct vmx_msr_entry *e)
+{
+       if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */
+           nested_vmx_msr_check_common(vcpu, e))
+               return -EINVAL;
+       return 0;
+}
+
+/*
+ * Load guest's/host's msr at nested entry/exit.
+ * return 0 for success, entry index for failure.
+ */
+static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
+{
+       u32 i;
+       struct vmx_msr_entry e;
+       struct msr_data msr;
+
+       msr.host_initiated = false;
+       for (i = 0; i < count; i++) {
+               if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e),
+                                  &e, sizeof(e))) {
+                       pr_warn_ratelimited(
+                               "%s cannot read MSR entry (%u, 0x%08llx)\n",
+                               __func__, i, gpa + i * sizeof(e));
+                       goto fail;
+               }
+               if (nested_vmx_load_msr_check(vcpu, &e)) {
+                       pr_warn_ratelimited(
+                               "%s check failed (%u, 0x%x, 0x%x)\n",
+                               __func__, i, e.index, e.reserved);
+                       goto fail;
+               }
+               msr.index = e.index;
+               msr.data = e.value;
+               if (kvm_set_msr(vcpu, &msr)) {
+                       pr_warn_ratelimited(
+                               "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
+                               __func__, i, e.index, e.value);
+                       goto fail;
+               }
+       }
+       return 0;
+fail:
+       return i + 1;
+}
+
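+/*
+ * Copy the guest's current MSR values into the VM-exit MSR-store area.
+ * Returns 0 on success, -EINVAL if an entry is malformed or inaccessible.
+ */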
+static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
+{
+       u32 i;
+       struct vmx_msr_entry e;
+
+       for (i = 0; i < count; i++) {
+               if (kvm_read_guest(vcpu->kvm,
+                                  gpa + i * sizeof(e),
+                                  &e, 2 * sizeof(u32))) {
+                       pr_warn_ratelimited(
+                               "%s cannot read MSR entry (%u, 0x%08llx)\n",
+                               __func__, i, gpa + i * sizeof(e));
+                       return -EINVAL;
+               }
+               if (nested_vmx_store_msr_check(vcpu, &e)) {
+                       pr_warn_ratelimited(
+                               "%s check failed (%u, 0x%x, 0x%x)\n",
+                               __func__, i, e.index, e.reserved);
+                       return -EINVAL;
+               }
+               if (kvm_get_msr(vcpu, e.index, &e.value)) {
+                       pr_warn_ratelimited(
+                               "%s cannot read MSR (%u, 0x%x)\n",
+                               __func__, i, e.index);
+                       return -EINVAL;
+               }
+               if (kvm_write_guest(vcpu->kvm,
+                                   gpa + i * sizeof(e) +
+                                       offsetof(struct vmx_msr_entry, value),
+                                   &e.value, sizeof(e.value))) {
+                       pr_warn_ratelimited(
+                               "%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
+                               __func__, i, e.index, e.value);
+                       return -EINVAL;
+               }
+       }
+       return 0;
+}
+
 /*
  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
@@ -8135,6 +8531,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
        vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
        vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
 
+       if (nested_cpu_has_xsaves(vmcs12))
+               vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
        vmcs_write64(VMCS_LINK_POINTER, -1ull);
 
        exec_control = vmcs12->pin_based_vm_exec_control;
@@ -8356,6 +8754,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
        int cpu;
        struct loaded_vmcs *vmcs02;
        bool ia32e;
+       u32 msr_entry_idx;
 
        if (!nested_vmx_check_permission(vcpu) ||
            !nested_vmx_check_vmcs12(vcpu))
@@ -8403,11 +8802,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
                return 1;
        }
 
-       if (vmcs12->vm_entry_msr_load_count > 0 ||
-           vmcs12->vm_exit_msr_load_count > 0 ||
-           vmcs12->vm_exit_msr_store_count > 0) {
-               pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n",
-                                   __func__);
+       if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) {
                nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
                return 1;
        }
@@ -8513,10 +8908,21 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 
        vmx_segment_cache_clear(vmx);
 
-       vmcs12->launch_state = 1;
-
        prepare_vmcs02(vcpu, vmcs12);
 
+       msr_entry_idx = nested_vmx_load_msr(vcpu,
+                                           vmcs12->vm_entry_msr_load_addr,
+                                           vmcs12->vm_entry_msr_load_count);
+       if (msr_entry_idx) {
+               leave_guest_mode(vcpu);
+               vmx_load_vmcs01(vcpu);
+               nested_vmx_entry_failure(vcpu, vmcs12,
+                               EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
+               return 1;
+       }
+
+       vmcs12->launch_state = 1;
+
        if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
                return kvm_emulate_halt(vcpu);
 
@@ -8775,6 +9181,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
        if (vmx_mpx_supported())
                vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
+       if (nested_cpu_has_xsaves(vmcs12))
+               vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP);
 
        /* update exit information fields: */
 
@@ -8944,6 +9352,10 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
 
        kvm_set_dr(vcpu, 7, 0x400);
        vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+
+       if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
+                               vmcs12->vm_exit_msr_load_count))
+               nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
 }
 
 /*
@@ -8965,6 +9377,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
        prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
                       exit_qualification);
 
+       if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
+                                vmcs12->vm_exit_msr_store_count))
+               nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
+
        vmx_load_vmcs01(vcpu);
 
        if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
@@ -9176,6 +9592,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .check_intercept = vmx_check_intercept,
        .handle_external_intr = vmx_handle_external_intr,
        .mpx_supported = vmx_mpx_supported,
+       .xsaves_supported = vmx_xsaves_supported,
 
        .check_nested_events = vmx_check_nested_events,
 
@@ -9184,150 +9601,21 @@ static struct kvm_x86_ops vmx_x86_ops = {
 
 static int __init vmx_init(void)
 {
-       int r, i, msr;
-
-       rdmsrl_safe(MSR_EFER, &host_efer);
-
-       for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
-               kvm_define_shared_msr(i, vmx_msr_index[i]);
-
-       vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
-       if (!vmx_io_bitmap_a)
-               return -ENOMEM;
-
-       r = -ENOMEM;
-
-       vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
-       if (!vmx_io_bitmap_b)
-               goto out;
-
-       vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
-       if (!vmx_msr_bitmap_legacy)
-               goto out1;
-
-       vmx_msr_bitmap_legacy_x2apic =
-                               (unsigned long *)__get_free_page(GFP_KERNEL);
-       if (!vmx_msr_bitmap_legacy_x2apic)
-               goto out2;
-
-       vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
-       if (!vmx_msr_bitmap_longmode)
-               goto out3;
-
-       vmx_msr_bitmap_longmode_x2apic =
-                               (unsigned long *)__get_free_page(GFP_KERNEL);
-       if (!vmx_msr_bitmap_longmode_x2apic)
-               goto out4;
-       vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
-       if (!vmx_vmread_bitmap)
-               goto out5;
-
-       vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
-       if (!vmx_vmwrite_bitmap)
-               goto out6;
-
-       memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
-       memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
-
-       /*
-        * Allow direct access to the PC debug port (it is often used for I/O
-        * delays, but the vmexits simply slow things down).
-        */
-       memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
-       clear_bit(0x80, vmx_io_bitmap_a);
-
-       memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
-
-       memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
-       memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
-
-       set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
-
-       r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
-                    __alignof__(struct vcpu_vmx), THIS_MODULE);
+       int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
+                     __alignof__(struct vcpu_vmx), THIS_MODULE);
        if (r)
-               goto out7;
+               return r;
 
 #ifdef CONFIG_KEXEC
        rcu_assign_pointer(crash_vmclear_loaded_vmcss,
                           crash_vmclear_local_loaded_vmcss);
 #endif
 
-       vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
-       vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
-       vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
-       vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
-       vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
-       vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
-       vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true);
-
-       memcpy(vmx_msr_bitmap_legacy_x2apic,
-                       vmx_msr_bitmap_legacy, PAGE_SIZE);
-       memcpy(vmx_msr_bitmap_longmode_x2apic,
-                       vmx_msr_bitmap_longmode, PAGE_SIZE);
-
-       if (enable_apicv) {
-               for (msr = 0x800; msr <= 0x8ff; msr++)
-                       vmx_disable_intercept_msr_read_x2apic(msr);
-
-               /* According SDM, in x2apic mode, the whole id reg is used.
-                * But in KVM, it only use the highest eight bits. Need to
-                * intercept it */
-               vmx_enable_intercept_msr_read_x2apic(0x802);
-               /* TMCCT */
-               vmx_enable_intercept_msr_read_x2apic(0x839);
-               /* TPR */
-               vmx_disable_intercept_msr_write_x2apic(0x808);
-               /* EOI */
-               vmx_disable_intercept_msr_write_x2apic(0x80b);
-               /* SELF-IPI */
-               vmx_disable_intercept_msr_write_x2apic(0x83f);
-       }
-
-       if (enable_ept) {
-               kvm_mmu_set_mask_ptes(0ull,
-                       (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
-                       (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
-                       0ull, VMX_EPT_EXECUTABLE_MASK);
-               ept_set_mmio_spte_mask();
-               kvm_enable_tdp();
-       } else
-               kvm_disable_tdp();
-
-       update_ple_window_actual_max();
-
        return 0;
-
-out7:
-       free_page((unsigned long)vmx_vmwrite_bitmap);
-out6:
-       free_page((unsigned long)vmx_vmread_bitmap);
-out5:
-       free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
-out4:
-       free_page((unsigned long)vmx_msr_bitmap_longmode);
-out3:
-       free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
-out2:
-       free_page((unsigned long)vmx_msr_bitmap_legacy);
-out1:
-       free_page((unsigned long)vmx_io_bitmap_b);
-out:
-       free_page((unsigned long)vmx_io_bitmap_a);
-       return r;
 }
 
 static void __exit vmx_exit(void)
 {
-       free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
-       free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
-       free_page((unsigned long)vmx_msr_bitmap_legacy);
-       free_page((unsigned long)vmx_msr_bitmap_longmode);
-       free_page((unsigned long)vmx_io_bitmap_b);
-       free_page((unsigned long)vmx_io_bitmap_a);
-       free_page((unsigned long)vmx_vmwrite_bitmap);
-       free_page((unsigned long)vmx_vmread_bitmap);
-
 #ifdef CONFIG_KEXEC
        RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
        synchronize_rcu();