/*
 *  linux/arch/i386/mm/pgtable.c
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/nmi.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
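
/*
 * Print a summary of page usage across all online nodes: totals of
 * RAM, highmem, reserved, shared and swap-cached pages, plus the
 * global dirty/writeback/mapped/slab/pagetable counters.
 */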
void show_mem(void)
{
	int total = 0, reserved = 0;
	int shared = 0, cached = 0;
	int highmem = 0;
	struct page *page;
	pg_data_t *pgdat;
	unsigned long i;
	unsigned long flags;

	printk(KERN_INFO "Mem-info:\n");
	show_free_areas();
	for_each_online_pgdat(pgdat) {
		pgdat_resize_lock(pgdat, &flags);
		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
			if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
				touch_nmi_watchdog();
			page = pgdat_page_nr(pgdat, i);
			total++;
			if (PageHighMem(page))
				highmem++;
			if (PageReserved(page))
				reserved++;
			else if (PageSwapCache(page))
				cached++;
			else if (page_count(page))
				shared += page_count(page) - 1;
		}
		pgdat_resize_unlock(pgdat, &flags);
	}
	printk(KERN_INFO "%d pages of RAM\n", total);
	printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
	printk(KERN_INFO "%d reserved pages\n", reserved);
	printk(KERN_INFO "%d pages shared\n", shared);
	printk(KERN_INFO "%d pages swap cached\n", cached);

	printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
	printk(KERN_INFO "%lu pages writeback\n",
		global_page_state(NR_WRITEBACK));
	printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
	printk(KERN_INFO "%lu pages slab\n",
		global_page_state(NR_SLAB_RECLAIMABLE) +
		global_page_state(NR_SLAB_UNRECLAIMABLE));
	printk(KERN_INFO "%lu pages pagetables\n",
		global_page_state(NR_PAGETABLE));
}

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = swapper_pg_dir + pgd_index(vaddr);
	if (pgd_none(*pgd)) {
		BUG();
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		BUG();
		return;
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		BUG();
		return;
	}
	pte = pte_offset_kernel(pmd, vaddr);
	if (pgprot_val(flags))
		set_pte_present(&init_mm, vaddr, pte, pfn_pte(pfn, flags));
	else
		pte_clear(&init_mm, vaddr, pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

/*
 * Associate a large virtual page frame with a given physical page frame
 * and protection flags for that frame.  pfn is for the base of the page,
 * vaddr is what the page gets mapped to - both must be properly aligned.
 * The pmd must already be instantiated.  Assumes PAE mode.
 */
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	if (vaddr & (PMD_SIZE-1)) {		/* vaddr is misaligned */
		printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
		return; /* BUG(); */
	}
	if (pfn & (PTRS_PER_PTE-1)) {		/* pfn is misaligned */
		printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
		return; /* BUG(); */
	}
	pgd = swapper_pg_dir + pgd_index(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
		return; /* BUG(); */
	}
	pud = pud_offset(pgd, vaddr);
	pmd = pmd_offset(pud, vaddr);
	set_pmd(pmd, pfn_pmd(pfn, flags));
	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}
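
/*
 * Number of fixmap entries established so far, and the (movable) top
 * of the fixmap area.  reserve_top_address() may lower __FIXADDR_TOP,
 * but only while no fixmap has been set up yet.
 */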
static int fixmaps;
unsigned long __FIXADDR_TOP = 0xfffff000;
EXPORT_SYMBOL(__FIXADDR_TOP);
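
/*
 * Install a fixmap entry: map the fixed virtual address for @idx to
 * the physical address @phys with protection @flags.
 */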
void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
{
	unsigned long address = __fix_to_virt(idx);

	if (idx >= __end_of_fixed_addresses) {
		BUG();
		return;
	}
	set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
	fixmaps++;
}

/**
 * reserve_top_address - reserves a hole in the top of kernel address space
 * @reserve: size of hole to reserve
 *
 * Can be used to relocate the fixmap area and poke a hole in the top
 * of kernel address space to make room for a hypervisor.
 */
void reserve_top_address(unsigned long reserve)
{
	BUG_ON(fixmaps > 0);
	printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
	       (int)-reserve);
	__FIXADDR_TOP = -reserve - PAGE_SIZE;
	__VMALLOC_RESERVE += reserve;
}
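
/*
 * Allocate a zeroed page for use as a kernel pagetable page.
 */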
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
	return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
}
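
/*
 * Allocate a zeroed page (from highmem if CONFIG_HIGHPTE) for use as
 * a user pagetable page, and initialize its struct page state via
 * pgtable_page_ctor().
 */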
pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
	struct page *pte;

#ifdef CONFIG_HIGHPTE
	pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
#else
	pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
#endif
	if (pte)
		pgtable_page_ctor(pte);
	return pte;
}

/*
 * List of all pgd's needed for non-PAE so it can invalidate entries
 * in both cached and uncached pgd's; not needed for PAE since the
 * kernel pmd is shared.  If PAE were not to share the pmd a similar
 * tactic would be needed.  This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * -- wli
 */
static inline void pgd_list_add(pgd_t *pgd)
{
	struct page *page = virt_to_page(pgd);

	list_add(&page->lru, &pgd_list);
}

static inline void pgd_list_del(pgd_t *pgd)
{
	struct page *page = virt_to_page(pgd);

	list_del(&page->lru);
}
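
/*
 * With a shared kernel pmd only the usermode pgd entries are
 * per-pagetable; otherwise every entry must be managed individually.
 */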
#define UNSHARED_PTRS_PER_PGD				\
	(SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
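
/*
 * Set up a freshly allocated pgd: clear the usermode entries, copy
 * the kernel mappings from swapper_pg_dir, and, when the kernel pmd
 * is unshared, put the pgd on pgd_list so kernel mapping updates can
 * be propagated to it.
 */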
static void pgd_ctor(void *p)
{
	pgd_t *pgd = p;
	unsigned long flags;

	/* Clear usermode parts of PGD */
	memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));

	spin_lock_irqsave(&pgd_lock, flags);

	/* If the pgd points to a shared pagetable level (either the
	   ptes in non-PAE, or shared PMD in PAE), then just copy the
	   references from swapper_pg_dir. */
	if (PAGETABLE_LEVELS == 2 ||
	    (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
		clone_pgd_range(pgd + USER_PTRS_PER_PGD,
				swapper_pg_dir + USER_PTRS_PER_PGD,
				KERNEL_PGD_PTRS);
		paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
					__pa(swapper_pg_dir) >> PAGE_SHIFT,
					USER_PTRS_PER_PGD,
					KERNEL_PGD_PTRS);
	}

	/* list required to sync kernel mapping updates */
	if (!SHARED_KERNEL_PMD)
		pgd_list_add(pgd);

	spin_unlock_irqrestore(&pgd_lock, flags);
}
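
/*
 * Tear down a pgd: nothing to do when the kernel pmd is shared; with
 * an unshared kernel pmd, take the pgd back off pgd_list.
 */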
static void pgd_dtor(void *pgd)
{
	unsigned long flags; /* can be called from interrupt context */

	if (SHARED_KERNEL_PMD)
		return;

	spin_lock_irqsave(&pgd_lock, flags);
	pgd_list_del(pgd);
	spin_unlock_irqrestore(&pgd_lock, flags);
}

#ifdef CONFIG_X86_PAE
/*
 * Mop up any pmd pages which may still be attached to the pgd.
 * Normally they will be freed by munmap/exit_mmap, but any pmd we
 * preallocate which never got a corresponding vma will need to be
 * freed manually.
 */
static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
	int i;

	for (i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
		pgd_t pgd = pgdp[i];

		if (pgd_val(pgd) != 0) {
			pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);

			pgdp[i] = native_make_pgd(0);

			paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
			pmd_free(mm, pmd);
		}
	}
}

/*
 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
 * updating the top-level pagetable entries to guarantee the
 * processor notices the update.  Since this is expensive, and
 * all 4 top-level entries are used almost immediately in a
 * new process's life, we just pre-populate them here.
 *
 * Also, if we're in a paravirt environment where the kernel pmd is
 * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
 * and initialize the kernel pmds here.
 */
static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
{
	pud_t *pud;
	unsigned long addr;
	int i;

	pud = pud_offset(pgd, 0);
	for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
	     i++, pud++, addr += PUD_SIZE) {
		pmd_t *pmd = pmd_alloc_one(mm, addr);

		if (!pmd) {
			pgd_mop_up_pmds(mm, pgd);
			return 0;
		}

		if (i >= USER_PTRS_PER_PGD)
			memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
			       sizeof(pmd_t) * PTRS_PER_PMD);

		pud_populate(mm, pud, pmd);
	}

	return 1;
}
#else /* !CONFIG_X86_PAE */
/* No need to prepopulate any pagetable entries in non-PAE modes. */
static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
{
	return 1;
}

static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
{
}
#endif /* CONFIG_X86_PAE */
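
/*
 * Allocate and initialize a new top-level pagetable, pre-populating
 * the kernel pmds when required (PAE).
 */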
pgd_t *pgd_alloc(struct mm_struct *mm)
{
	pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);

	/* so that alloc_pd can use it */
	mm->pgd = pgd;
	if (pgd)
		pgd_ctor(pgd);

	if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
		pgd_dtor(pgd);
		free_page((unsigned long)pgd);
		pgd = NULL;
	}

	return pgd;
}
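
/*
 * Free a top-level pagetable, releasing any preallocated pmd pages
 * first.
 */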
void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	pgd_mop_up_pmds(mm, pgd);
	pgd_dtor(pgd);
	free_page((unsigned long)pgd);
}
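
/*
 * Free a user pagetable page via the mmu_gather, undoing
 * pgtable_page_ctor() and notifying paravirt first.
 */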
void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
	pgtable_page_dtor(pte);
	paravirt_release_pt(page_to_pfn(pte));
	tlb_remove_page(tlb, pte);
}

#ifdef CONFIG_X86_PAE
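
/*
 * Free a pmd page via the mmu_gather, notifying paravirt first.
 */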
void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
	paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
	tlb_remove_page(tlb, virt_to_page(pmd));
}

#endif
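
/*
 * Warn once if the v1 and v2 pmd_bad() checks ever disagree; the v1
 * result remains authoritative.
 */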
int pmd_bad(pmd_t pmd)
{
	WARN_ON_ONCE(pmd_bad_v1(pmd) != pmd_bad_v2(pmd));

	return pmd_bad_v1(pmd);
}
|