thp: kvm mmu transparent hugepage support
This should work for both hugetlbfs and transparent hugepages. [akpm@linux-foundation.org: bring forward PageTransCompound() addition for bisectability] Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Cc: Avi Kivity <avi@redhat.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
parent
47ad8475c0
commit
936a5fe6e6
4 changed files with 125 additions and 19 deletions
|
@ -554,14 +554,18 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
|
|||
return ret;
|
||||
}
|
||||
|
||||
static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
|
||||
static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
|
||||
{
|
||||
struct kvm_memory_slot *slot;
|
||||
int host_level, level, max_level;
|
||||
|
||||
slot = gfn_to_memslot(vcpu->kvm, large_gfn);
|
||||
if (slot && slot->dirty_bitmap)
|
||||
return PT_PAGE_TABLE_LEVEL;
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
|
||||
{
|
||||
int host_level, level, max_level;
|
||||
|
||||
host_level = host_mapping_level(vcpu->kvm, large_gfn);
|
||||
|
||||
|
@ -2281,6 +2285,48 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
|
|||
return 1;
|
||||
}
|
||||
|
||||
static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
|
||||
gfn_t *gfnp, pfn_t *pfnp, int *levelp)
|
||||
{
|
||||
pfn_t pfn = *pfnp;
|
||||
gfn_t gfn = *gfnp;
|
||||
int level = *levelp;
|
||||
|
||||
/*
|
||||
* Check if it's a transparent hugepage. If this would be an
|
||||
* hugetlbfs page, level wouldn't be set to
|
||||
* PT_PAGE_TABLE_LEVEL and there would be no adjustment done
|
||||
* here.
|
||||
*/
|
||||
if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
|
||||
level == PT_PAGE_TABLE_LEVEL &&
|
||||
PageTransCompound(pfn_to_page(pfn)) &&
|
||||
!has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
|
||||
unsigned long mask;
|
||||
/*
|
||||
* mmu_notifier_retry was successful and we hold the
|
||||
* mmu_lock here, so the pmd can't become splitting
|
||||
* from under us, and in turn
|
||||
* __split_huge_page_refcount() can't run from under
|
||||
* us and we can safely transfer the refcount from
|
||||
* PG_tail to PG_head as we switch the pfn to tail to
|
||||
* head.
|
||||
*/
|
||||
*levelp = level = PT_DIRECTORY_LEVEL;
|
||||
mask = KVM_PAGES_PER_HPAGE(level) - 1;
|
||||
VM_BUG_ON((gfn & mask) != (pfn & mask));
|
||||
if (pfn & mask) {
|
||||
gfn &= ~mask;
|
||||
*gfnp = gfn;
|
||||
kvm_release_pfn_clean(pfn);
|
||||
pfn &= ~mask;
|
||||
if (!get_page_unless_zero(pfn_to_page(pfn)))
|
||||
BUG();
|
||||
*pfnp = pfn;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
|
||||
gva_t gva, pfn_t *pfn, bool write, bool *writable);
|
||||
|
||||
|
@ -2289,20 +2335,25 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
|
|||
{
|
||||
int r;
|
||||
int level;
|
||||
int force_pt_level;
|
||||
pfn_t pfn;
|
||||
unsigned long mmu_seq;
|
||||
bool map_writable;
|
||||
|
||||
force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
|
||||
if (likely(!force_pt_level)) {
|
||||
level = mapping_level(vcpu, gfn);
|
||||
|
||||
/*
|
||||
* This path builds a PAE pagetable - so we can map 2mb pages at
|
||||
* maximum. Therefore check if the level is larger than that.
|
||||
* This path builds a PAE pagetable - so we can map
|
||||
* 2mb pages at maximum. Therefore check if the level
|
||||
* is larger than that.
|
||||
*/
|
||||
if (level > PT_DIRECTORY_LEVEL)
|
||||
level = PT_DIRECTORY_LEVEL;
|
||||
|
||||
gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
|
||||
} else
|
||||
level = PT_PAGE_TABLE_LEVEL;
|
||||
|
||||
mmu_seq = vcpu->kvm->mmu_notifier_seq;
|
||||
smp_rmb();
|
||||
|
@ -2318,6 +2369,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
|
|||
if (mmu_notifier_retry(vcpu, mmu_seq))
|
||||
goto out_unlock;
|
||||
kvm_mmu_free_some_pages(vcpu);
|
||||
if (likely(!force_pt_level))
|
||||
transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
|
||||
r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
|
||||
prefault);
|
||||
spin_unlock(&vcpu->kvm->mmu_lock);
|
||||
|
@ -2655,6 +2708,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
|
|||
pfn_t pfn;
|
||||
int r;
|
||||
int level;
|
||||
int force_pt_level;
|
||||
gfn_t gfn = gpa >> PAGE_SHIFT;
|
||||
unsigned long mmu_seq;
|
||||
int write = error_code & PFERR_WRITE_MASK;
|
||||
|
@ -2667,9 +2721,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
|
|||
if (r)
|
||||
return r;
|
||||
|
||||
force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
|
||||
if (likely(!force_pt_level)) {
|
||||
level = mapping_level(vcpu, gfn);
|
||||
|
||||
gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
|
||||
} else
|
||||
level = PT_PAGE_TABLE_LEVEL;
|
||||
|
||||
mmu_seq = vcpu->kvm->mmu_notifier_seq;
|
||||
smp_rmb();
|
||||
|
@ -2684,6 +2741,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
|
|||
if (mmu_notifier_retry(vcpu, mmu_seq))
|
||||
goto out_unlock;
|
||||
kvm_mmu_free_some_pages(vcpu);
|
||||
if (likely(!force_pt_level))
|
||||
transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
|
||||
r = __direct_map(vcpu, gpa, write, map_writable,
|
||||
level, gfn, pfn, prefault);
|
||||
spin_unlock(&vcpu->kvm->mmu_lock);
|
||||
|
|
|
@ -550,6 +550,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
|
|||
int r;
|
||||
pfn_t pfn;
|
||||
int level = PT_PAGE_TABLE_LEVEL;
|
||||
int force_pt_level;
|
||||
unsigned long mmu_seq;
|
||||
bool map_writable;
|
||||
|
||||
|
@ -577,7 +578,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
|
|||
return 0;
|
||||
}
|
||||
|
||||
if (walker.level >= PT_DIRECTORY_LEVEL) {
|
||||
if (walker.level >= PT_DIRECTORY_LEVEL)
|
||||
force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
|
||||
else
|
||||
force_pt_level = 1;
|
||||
if (!force_pt_level) {
|
||||
level = min(walker.level, mapping_level(vcpu, walker.gfn));
|
||||
walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
|
||||
}
|
||||
|
@ -599,6 +604,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
|
|||
|
||||
trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
|
||||
kvm_mmu_free_some_pages(vcpu);
|
||||
if (!force_pt_level)
|
||||
transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
|
||||
sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
|
||||
level, &write_pt, pfn, map_writable, prefault);
|
||||
(void)sptep;
|
||||
|
|
|
@ -409,6 +409,18 @@ static inline void ClearPageCompound(struct page *page)
|
|||
|
||||
#endif /* !PAGEFLAGS_EXTENDED */
|
||||
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
static inline int PageTransCompound(struct page *page)
|
||||
{
|
||||
return PageCompound(page);
|
||||
}
|
||||
#else
|
||||
static inline int PageTransCompound(struct page *page)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_MMU
|
||||
#define __PG_MLOCKED (1 << PG_mlocked)
|
||||
#else
|
||||
|
|
|
@ -104,8 +104,36 @@ static pfn_t fault_pfn;
|
|||
inline int kvm_is_mmio_pfn(pfn_t pfn)
|
||||
{
|
||||
if (pfn_valid(pfn)) {
|
||||
struct page *page = compound_head(pfn_to_page(pfn));
|
||||
return PageReserved(page);
|
||||
struct page *head;
|
||||
struct page *tail = pfn_to_page(pfn);
|
||||
head = compound_head(tail);
|
||||
if (head != tail) {
|
||||
smp_rmb();
|
||||
/*
|
||||
* head may be a dangling pointer.
|
||||
* __split_huge_page_refcount clears PageTail
|
||||
* before overwriting first_page, so if
|
||||
* PageTail is still there it means the head
|
||||
* pointer isn't dangling.
|
||||
*/
|
||||
if (PageTail(tail)) {
|
||||
/*
|
||||
* the "head" is not a dangling
|
||||
* pointer but the hugepage may have
|
||||
* been splitted from under us (and we
|
||||
* may not hold a reference count on
|
||||
* the head page so it can be reused
|
||||
* before we run PageReferenced), so
|
||||
* we've to recheck PageTail before
|
||||
* returning what we just read.
|
||||
*/
|
||||
int reserved = PageReserved(head);
|
||||
smp_rmb();
|
||||
if (PageTail(tail))
|
||||
return reserved;
|
||||
}
|
||||
}
|
||||
return PageReserved(tail);
|
||||
}
|
||||
|
||||
return true;
|
||||
|
|
Loading…
Reference in a new issue