[PATCH] hugetlb: demand fault handler
Below is a patch to implement demand faulting for huge pages.  The main
motivation for changing from prefaulting to demand faulting is so that
huge page memory areas can be allocated according to NUMA policy.

Thanks to consolidated hugetlb code, switching the behavior requires
changing only one fault handler.  The bulk of the patch just moves the
logic from hugetlb_prefault() into hugetlb_fault() and the new
find_lock_huge_page() helper.

Signed-off-by: Adam Litke <agl@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
commit 4c88726597
parent 551110a94a
2 files changed, 95 insertions(+), 88 deletions(-)
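For illustration only (not part of the patch): a minimal userspace sketch of why demand faulting matters here. With prefaulting, every huge page backing a mapping was allocated inside mmap() itself, so NUMA placement could only be decided once, up front, in the mmap()ing task's context. With demand faulting, each huge page is allocated at first touch, so placement can be decided per page at fault time (given policy-aware huge page allocation in the kernel). The hugetlbfs mount point, file name, mapping size, and node mask below are made-up example values; link with -lnuma for the mbind() wrapper.

/*
 * Illustrative sketch only -- not part of the patch.  Map a hugetlbfs file,
 * bind the range to one NUMA node, then touch it so each huge page is
 * faulted in (and therefore allocated) after the policy is in place.
 */
#include <fcntl.h>
#include <numaif.h>		/* mbind(), MPOL_BIND; link with -lnuma */
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define MAP_BYTES	(16UL << 20)	/* a few huge pages; example size */

int main(void)
{
	unsigned long nodemask = 1UL << 0;	/* node 0 -- example value */
	char *addr;
	int fd;

	/* "/mnt/huge" is an assumed hugetlbfs mount point. */
	fd = open("/mnt/huge/example", O_CREAT | O_RDWR, 0644);
	if (fd < 0)
		return 1;

	addr = mmap(NULL, MAP_BYTES, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (addr == MAP_FAILED)
		return 1;

	/*
	 * With the old prefaulting behaviour the huge pages were already
	 * allocated by mmap() above, so a policy set here came too late.
	 * With demand faulting, no huge page has been allocated yet.
	 */
	mbind(addr, MAP_BYTES, MPOL_BIND, &nodemask, sizeof(nodemask) * 8, 0);

	/*
	 * First touch: each huge page is faulted in now, so its allocation
	 * can honour the policy on the range (with policy-aware allocation).
	 */
	memset(addr, 0, MAP_BYTES);

	munmap(addr, MAP_BYTES);
	close(fd);
	return 0;
}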
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -48,7 +48,6 @@ int sysctl_hugetlb_shm_group;
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file->f_dentry->d_inode;
-	struct address_space *mapping = inode->i_mapping;
 	loff_t len, vma_len;
 	int ret;
 
@@ -79,10 +78,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
 		goto out;
 
-	ret = hugetlb_prefault(mapping, vma);
-	if (ret)
-		goto out;
-
+	ret = 0;
+	hugetlb_prefault_arch_hook(vma->vm_mm);
 	if (inode->i_size < len)
 		inode->i_size = len;
 out:
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -322,9 +322,6 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	for (address = start; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
 		if (!ptep)
-			/* This can happen on truncate, or if an
-			 * mmap() is aborted due to an error before
-			 * the prefault */
 			continue;
 
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
@@ -340,81 +337,92 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	flush_tlb_range(vma, start, end);
 }
 
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+static struct page *find_lock_huge_page(struct address_space *mapping,
+				unsigned long idx)
 {
-	struct mm_struct *mm = current->mm;
-	unsigned long addr;
-	int ret = 0;
-
-	WARN_ON(!is_vm_hugetlb_page(vma));
-	BUG_ON(vma->vm_start & ~HPAGE_MASK);
-	BUG_ON(vma->vm_end & ~HPAGE_MASK);
-
-	hugetlb_prefault_arch_hook(mm);
-
-	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
-		unsigned long idx;
-		pte_t *pte = huge_pte_alloc(mm, addr);
-		struct page *page;
-
-		if (!pte) {
-			ret = -ENOMEM;
-			goto out;
-		}
-
-		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
-			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-		page = find_get_page(mapping, idx);
-		if (!page) {
-			/* charge the fs quota first */
-			if (hugetlb_get_quota(mapping)) {
-				ret = -ENOMEM;
-				goto out;
-			}
-			page = alloc_huge_page();
-			if (!page) {
-				hugetlb_put_quota(mapping);
-				ret = -ENOMEM;
-				goto out;
-			}
-			ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
-			if (! ret) {
-				unlock_page(page);
-			} else {
-				hugetlb_put_quota(mapping);
-				free_huge_page(page);
-				goto out;
-			}
-		}
-		spin_lock(&mm->page_table_lock);
-		add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
-		set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
-		spin_unlock(&mm->page_table_lock);
-	}
-out:
-	return ret;
-}
+	struct page *page;
+	int err;
+	struct inode *inode = mapping->host;
+	unsigned long size;
+
+retry:
+	page = find_lock_page(mapping, idx);
+	if (page)
+		goto out;
+
+	/* Check to make sure the mapping hasn't been truncated */
+	size = i_size_read(inode) >> HPAGE_SHIFT;
+	if (idx >= size)
+		goto out;
+
+	if (hugetlb_get_quota(mapping))
+		goto out;
+	page = alloc_huge_page();
+	if (!page) {
+		hugetlb_put_quota(mapping);
+		goto out;
+	}
+
+	err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+	if (err) {
+		put_page(page);
+		hugetlb_put_quota(mapping);
+		if (err == -EEXIST)
+			goto retry;
+		page = NULL;
+	}
+out:
+	return page;
+}
 
-/*
- * On ia64 at least, it is possible to receive a hugetlb fault from a
- * stale zero entry left in the TLB from earlier hardware prefetching.
- * Low-level arch code should already have flushed the stale entry as
- * part of its fault handling, but we do need to accept this minor fault
- * and return successfully.  Whereas the "normal" case is that this is
- * an access to a hugetlb page which has been truncated off since mmap.
- */
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, int write_access)
 {
 	int ret = VM_FAULT_SIGBUS;
+	unsigned long idx;
+	unsigned long size;
 	pte_t *pte;
+	struct page *page;
+	struct address_space *mapping;
+
+	pte = huge_pte_alloc(mm, address);
+	if (!pte)
+		goto out;
+
+	mapping = vma->vm_file->f_mapping;
+	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
+		+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+	/*
+	 * Use page lock to guard against racing truncation
+	 * before we get page_table_lock.
+	 */
+	page = find_lock_huge_page(mapping, idx);
+	if (!page)
+		goto out;
 
 	spin_lock(&mm->page_table_lock);
-	pte = huge_pte_offset(mm, address);
-	if (pte && !pte_none(*pte))
-		ret = VM_FAULT_MINOR;
+	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
+	if (idx >= size)
+		goto backout;
+
+	ret = VM_FAULT_MINOR;
+	if (!pte_none(*pte))
+		goto backout;
+
+	add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
+	set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
 	spin_unlock(&mm->page_table_lock);
+	unlock_page(page);
+out:
 	return ret;
+
+backout:
+	spin_unlock(&mm->page_table_lock);
+	hugetlb_put_quota(mapping);
+	unlock_page(page);
+	put_page(page);
+	goto out;
 }
 
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -424,34 +432,36 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long vpfn, vaddr = *position;
 	int remainder = *length;
 
-	BUG_ON(!is_vm_hugetlb_page(vma));
-
 	vpfn = vaddr/PAGE_SIZE;
 	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
+		pte_t *pte;
+		struct page *page;
 
-		if (pages) {
-			pte_t *pte;
-			struct page *page;
-
-			/* Some archs (sparc64, sh*) have multiple
-			 * pte_ts to each hugepage.  We have to make
-			 * sure we get the first, for the page
-			 * indexing below to work. */
-			pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
-
-			/* the hugetlb file might have been truncated */
-			if (!pte || pte_none(*pte)) {
-				remainder = 0;
-				if (!i)
-					i = -EFAULT;
-				break;
-			}
+		/*
+		 * Some archs (sparc64, sh*) have multiple pte_ts to
+		 * each hugepage.  We have to make sure we get the
+		 * first, for the page indexing below to work.
+		 */
+		pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
+
+		if (!pte || pte_none(*pte)) {
+			int ret;
+
+			spin_unlock(&mm->page_table_lock);
+			ret = hugetlb_fault(mm, vma, vaddr, 0);
+			spin_lock(&mm->page_table_lock);
+			if (ret == VM_FAULT_MINOR)
+				continue;
 
+			remainder = 0;
+			if (!i)
+				i = -EFAULT;
+			break;
+		}
+
+		if (pages) {
 			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
-
-			WARN_ON(!PageCompound(page));
-
 			get_page(page);
 			pages[i] = page;
 		}