KVM: MMU: Reinstate pte prefetch on invlpg
author Avi Kivity <avi@redhat.com>
Mon, 15 Mar 2010 11:59:57 +0000 (13:59 +0200)
committer Avi Kivity <avi@redhat.com>
Mon, 17 May 2010 09:15:43 +0000 (12:15 +0300)
Commit fb341f57 removed the pte prefetch on guest invlpg, citing guest races.
However, the SDM is adamant that prefetch is allowed:

  "The processor may create entries in paging-structure caches for
   translations required for prefetches and for accesses that are a
   result of speculative execution that would never actually occur
   in the executed code path."

And, in fact, there was a race in the prefetch code: we picked up the pte
without the mmu lock held, so an older invlpg could install the pte over
a newer invlpg.

Reinstate the prefetch logic, but this time use a counter to detect whether
another invlpg has executed in the meantime.  If a race occurred, do not
install the pte.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
arch/x86/include/asm/kvm_host.h
arch/x86/kvm/mmu.c
arch/x86/kvm/paging_tmpl.h

index ea1b6c615f9f986cc9dea5af763e2f02873c4372..28826c82d1e212654e931eb3536f3ba546d434f3 100644 (file)
@@ -389,6 +389,7 @@ struct kvm_arch {
        unsigned int n_free_mmu_pages;
        unsigned int n_requested_mmu_pages;
        unsigned int n_alloc_mmu_pages;
+       atomic_t invlpg_counter;
        struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
        /*
         * Hash table of struct kvm_mmu_page.
index 91f8b171c825b0516f208090314574433a383b5e..064c3efb49dc6e6c6c83ea327b531e5b3f94d30d 100644 (file)
@@ -2613,20 +2613,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
        int flooded = 0;
        int npte;
        int r;
+       int invlpg_counter;
 
        pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
 
-       switch (bytes) {
-       case 4:
-               gentry = *(const u32 *)new;
-               break;
-       case 8:
-               gentry = *(const u64 *)new;
-               break;
-       default:
-               gentry = 0;
-               break;
-       }
+       invlpg_counter = atomic_read(&vcpu->kvm->arch.invlpg_counter);
 
        /*
         * Assume that the pte write on a page table of the same type
@@ -2634,16 +2625,34 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
         * (might be false while changing modes).  Note it is verified later
         * by update_pte().
         */
-       if (is_pae(vcpu) && bytes == 4) {
+       if ((is_pae(vcpu) && bytes == 4) || !new) {
                /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
-               gpa &= ~(gpa_t)7;
-               r = kvm_read_guest(vcpu->kvm, gpa, &gentry, 8);
+               if (is_pae(vcpu)) {
+                       gpa &= ~(gpa_t)7;
+                       bytes = 8;
+               }
+               r = kvm_read_guest(vcpu->kvm, gpa, &gentry, min(bytes, 8));
                if (r)
                        gentry = 0;
+               new = (const u8 *)&gentry;
+       }
+
+       switch (bytes) {
+       case 4:
+               gentry = *(const u32 *)new;
+               break;
+       case 8:
+               gentry = *(const u64 *)new;
+               break;
+       default:
+               gentry = 0;
+               break;
        }
 
        mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
        spin_lock(&vcpu->kvm->mmu_lock);
+       if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
+               gentry = 0;
        kvm_mmu_access_page(vcpu, gfn);
        kvm_mmu_free_some_pages(vcpu);
        ++vcpu->kvm->stat.mmu_pte_write;
index 4b37e1acd3752446ad12b39da97c0b794c3c20c7..067797a72768b3f1ec2024fbd271d64afbca68a5 100644 (file)
@@ -463,6 +463,7 @@ out_unlock:
 static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 {
        struct kvm_shadow_walk_iterator iterator;
+       gpa_t pte_gpa = -1;
        int level;
        u64 *sptep;
        int need_flush = 0;
@@ -476,6 +477,10 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
                if (level == PT_PAGE_TABLE_LEVEL  ||
                    ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
                    ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
+                       struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+                       pte_gpa = (sp->gfn << PAGE_SHIFT);
+                       pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
 
                        if (is_shadow_present_pte(*sptep)) {
                                rmap_remove(vcpu->kvm, sptep);
@@ -493,7 +498,17 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 
        if (need_flush)
                kvm_flush_remote_tlbs(vcpu->kvm);
+
+       atomic_inc(&vcpu->kvm->arch.invlpg_counter);
+
        spin_unlock(&vcpu->kvm->mmu_lock);
+
+       if (pte_gpa == -1)
+               return;
+
+       if (mmu_topup_memory_caches(vcpu))
+               return;
+       kvm_mmu_pte_write(vcpu, pte_gpa, NULL, sizeof(pt_element_t), 0);
 }
 
 static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,