b5810039a5
Remove PageReserved() calls from core code by tightening VM_RESERVED handling in mm/ to cover PageReserved functionality. PageReserved special casing is removed from get_page and put_page. All setting and clearing of PageReserved is retained, and it is now flagged in the page_alloc checks to help ensure we don't introduce any refcount based freeing of Reserved pages. MAP_PRIVATE, PROT_WRITE of VM_RESERVED regions is tentatively being deprecated. We never completely handled it correctly anyway, and it may be reintroduced in future if required (Hugh has a proof of concept). Once PageReserved() calls are removed from kernel/power/swsusp.c, and all arch/ and driver code, the Set and Clear calls, and the PG_reserved bit can be trivially removed. Last real user of PageReserved is swsusp, which uses PageReserved to determine whether a struct page points to valid memory or not. This still needs to be addressed (a generic page_is_ram() should work). A last caveat: the ZERO_PAGE is now refcounted and managed with rmap (and thus mapcounted and count towards shared rss). These writes to the struct page could cause excessive cacheline bouncing on big systems. There are a number of ways this could be addressed if it is an issue. Signed-off-by: Nick Piggin <npiggin@suse.de> Refcount bug fix for filemap_xip.c Signed-off-by: Carsten Otte <cotte@de.ibm.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
231 lines
5.5 KiB
C
231 lines
5.5 KiB
C
/*
|
|
* linux/mm/msync.c
|
|
*
|
|
* Copyright (C) 1994-1999 Linus Torvalds
|
|
*/
|
|
|
|
/*
|
|
* The msync() system call.
|
|
*/
|
|
#include <linux/slab.h>
|
|
#include <linux/pagemap.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/mman.h>
|
|
#include <linux/hugetlb.h>
|
|
#include <linux/syscalls.h>
|
|
|
|
#include <asm/pgtable.h>
|
|
#include <asm/tlbflush.h>
|
|
|
|
/*
|
|
* Called with mm->page_table_lock held to protect against other
|
|
* threads/the swapper from ripping pte's out from under us.
|
|
*/
|
|
|
|
static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
|
|
unsigned long addr, unsigned long end)
|
|
{
|
|
struct mm_struct *mm = vma->vm_mm;
|
|
pte_t *pte;
|
|
int progress = 0;
|
|
|
|
again:
|
|
pte = pte_offset_map(pmd, addr);
|
|
do {
|
|
unsigned long pfn;
|
|
struct page *page;
|
|
|
|
if (progress >= 64) {
|
|
progress = 0;
|
|
if (need_resched() ||
|
|
need_lockbreak(&mm->page_table_lock))
|
|
break;
|
|
}
|
|
progress++;
|
|
if (!pte_present(*pte))
|
|
continue;
|
|
if (!pte_maybe_dirty(*pte))
|
|
continue;
|
|
pfn = pte_pfn(*pte);
|
|
if (unlikely(!pfn_valid(pfn))) {
|
|
print_bad_pte(vma, *pte, addr);
|
|
continue;
|
|
}
|
|
page = pfn_to_page(pfn);
|
|
|
|
if (ptep_clear_flush_dirty(vma, addr, pte) ||
|
|
page_test_and_clear_dirty(page))
|
|
set_page_dirty(page);
|
|
progress += 3;
|
|
} while (pte++, addr += PAGE_SIZE, addr != end);
|
|
pte_unmap(pte - 1);
|
|
cond_resched_lock(&mm->page_table_lock);
|
|
if (addr != end)
|
|
goto again;
|
|
}
|
|
|
|
static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
|
|
unsigned long addr, unsigned long end)
|
|
{
|
|
pmd_t *pmd;
|
|
unsigned long next;
|
|
|
|
pmd = pmd_offset(pud, addr);
|
|
do {
|
|
next = pmd_addr_end(addr, end);
|
|
if (pmd_none_or_clear_bad(pmd))
|
|
continue;
|
|
msync_pte_range(vma, pmd, addr, next);
|
|
} while (pmd++, addr = next, addr != end);
|
|
}
|
|
|
|
static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
|
|
unsigned long addr, unsigned long end)
|
|
{
|
|
pud_t *pud;
|
|
unsigned long next;
|
|
|
|
pud = pud_offset(pgd, addr);
|
|
do {
|
|
next = pud_addr_end(addr, end);
|
|
if (pud_none_or_clear_bad(pud))
|
|
continue;
|
|
msync_pmd_range(vma, pud, addr, next);
|
|
} while (pud++, addr = next, addr != end);
|
|
}
|
|
|
|
static void msync_page_range(struct vm_area_struct *vma,
|
|
unsigned long addr, unsigned long end)
|
|
{
|
|
struct mm_struct *mm = vma->vm_mm;
|
|
pgd_t *pgd;
|
|
unsigned long next;
|
|
|
|
/* For hugepages we can't go walking the page table normally,
|
|
* but that's ok, hugetlbfs is memory based, so we don't need
|
|
* to do anything more on an msync().
|
|
* Can't do anything with VM_RESERVED regions either.
|
|
*/
|
|
if (vma->vm_flags & (VM_HUGETLB|VM_RESERVED))
|
|
return;
|
|
|
|
BUG_ON(addr >= end);
|
|
pgd = pgd_offset(mm, addr);
|
|
flush_cache_range(vma, addr, end);
|
|
spin_lock(&mm->page_table_lock);
|
|
do {
|
|
next = pgd_addr_end(addr, end);
|
|
if (pgd_none_or_clear_bad(pgd))
|
|
continue;
|
|
msync_pud_range(vma, pgd, addr, next);
|
|
} while (pgd++, addr = next, addr != end);
|
|
spin_unlock(&mm->page_table_lock);
|
|
}
|
|
|
|
/*
|
|
* MS_SYNC syncs the entire file - including mappings.
|
|
*
|
|
* MS_ASYNC does not start I/O (it used to, up to 2.5.67). Instead, it just
|
|
* marks the relevant pages dirty. The application may now run fsync() to
|
|
* write out the dirty pages and wait on the writeout and check the result.
|
|
* Or the application may run fadvise(FADV_DONTNEED) against the fd to start
|
|
* async writeout immediately.
|
|
* So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
|
|
* applications.
|
|
*/
|
|
static int msync_interval(struct vm_area_struct *vma,
|
|
unsigned long addr, unsigned long end, int flags)
|
|
{
|
|
int ret = 0;
|
|
struct file *file = vma->vm_file;
|
|
|
|
if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED))
|
|
return -EBUSY;
|
|
|
|
if (file && (vma->vm_flags & VM_SHARED)) {
|
|
msync_page_range(vma, addr, end);
|
|
|
|
if (flags & MS_SYNC) {
|
|
struct address_space *mapping = file->f_mapping;
|
|
int err;
|
|
|
|
ret = filemap_fdatawrite(mapping);
|
|
if (file->f_op && file->f_op->fsync) {
|
|
/*
|
|
* We don't take i_sem here because mmap_sem
|
|
* is already held.
|
|
*/
|
|
err = file->f_op->fsync(file,file->f_dentry,1);
|
|
if (err && !ret)
|
|
ret = err;
|
|
}
|
|
err = filemap_fdatawait(mapping);
|
|
if (!ret)
|
|
ret = err;
|
|
}
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
|
|
{
|
|
unsigned long end;
|
|
struct vm_area_struct *vma;
|
|
int unmapped_error, error = -EINVAL;
|
|
|
|
if (flags & MS_SYNC)
|
|
current->flags |= PF_SYNCWRITE;
|
|
|
|
down_read(¤t->mm->mmap_sem);
|
|
if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
|
|
goto out;
|
|
if (start & ~PAGE_MASK)
|
|
goto out;
|
|
if ((flags & MS_ASYNC) && (flags & MS_SYNC))
|
|
goto out;
|
|
error = -ENOMEM;
|
|
len = (len + ~PAGE_MASK) & PAGE_MASK;
|
|
end = start + len;
|
|
if (end < start)
|
|
goto out;
|
|
error = 0;
|
|
if (end == start)
|
|
goto out;
|
|
/*
|
|
* If the interval [start,end) covers some unmapped address ranges,
|
|
* just ignore them, but return -ENOMEM at the end.
|
|
*/
|
|
vma = find_vma(current->mm, start);
|
|
unmapped_error = 0;
|
|
for (;;) {
|
|
/* Still start < end. */
|
|
error = -ENOMEM;
|
|
if (!vma)
|
|
goto out;
|
|
/* Here start < vma->vm_end. */
|
|
if (start < vma->vm_start) {
|
|
unmapped_error = -ENOMEM;
|
|
start = vma->vm_start;
|
|
}
|
|
/* Here vma->vm_start <= start < vma->vm_end. */
|
|
if (end <= vma->vm_end) {
|
|
if (start < end) {
|
|
error = msync_interval(vma, start, end, flags);
|
|
if (error)
|
|
goto out;
|
|
}
|
|
error = unmapped_error;
|
|
goto out;
|
|
}
|
|
/* Here vma->vm_start <= start < vma->vm_end < end. */
|
|
error = msync_interval(vma, start, vma->vm_end, flags);
|
|
if (error)
|
|
goto out;
|
|
start = vma->vm_end;
|
|
vma = vma->vm_next;
|
|
}
|
|
out:
|
|
up_read(¤t->mm->mmap_sem);
|
|
current->flags &= ~PF_SYNCWRITE;
|
|
return error;
|
|
}
|