diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c index 47d075bdfb95..2040a831d5b3 100644 --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -132,6 +132,7 @@ struct paravirt_ops paravirt_ops = { .name = "bare hardware", .paravirt_enabled = 0, .kernel_rpl = 0, + .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ .patch = native_patch, .banner = default_banner, diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c index c6a0a06258e6..f534c29e80b2 100644 --- a/arch/i386/mm/fault.c +++ b/arch/i386/mm/fault.c @@ -603,7 +603,6 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs, force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); } -#ifndef CONFIG_X86_PAE void vmalloc_sync_all(void) { /* @@ -616,6 +615,9 @@ void vmalloc_sync_all(void) static unsigned long start = TASK_SIZE; unsigned long address; + if (SHARED_KERNEL_PMD) + return; + BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { if (!test_bit(pgd_index(address), insync)) { @@ -638,4 +640,3 @@ void vmalloc_sync_all(void) start = address + PGDIR_SIZE; } } -#endif diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index e8545dcf06c5..dbe16f63a566 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -745,6 +745,8 @@ struct kmem_cache *pmd_cache; void __init pgtable_cache_init(void) { + size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t); + if (PTRS_PER_PMD > 1) { pmd_cache = kmem_cache_create("pmd", PTRS_PER_PMD*sizeof(pmd_t), @@ -754,13 +756,23 @@ void __init pgtable_cache_init(void) NULL); if (!pmd_cache) panic("pgtable_cache_init(): cannot create pmd cache"); + + if (!SHARED_KERNEL_PMD) { + /* If we're in PAE mode and have a non-shared + kernel pmd, then the pgd size must be a + page size. This is because the pgd_list + links through the page structure, so there + can only be one pgd per page for this to + work. */ + pgd_size = PAGE_SIZE; + } } pgd_cache = kmem_cache_create("pgd", - PTRS_PER_PGD*sizeof(pgd_t), - PTRS_PER_PGD*sizeof(pgd_t), + pgd_size, + pgd_size, 0, pgd_ctor, - PTRS_PER_PMD == 1 ? pgd_dtor : NULL); + (!SHARED_KERNEL_PMD) ? pgd_dtor : NULL); if (!pgd_cache) panic("pgtable_cache_init(): Cannot create pgd cache"); } diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c index ea6b6d4a0a2a..47bd477c8ecc 100644 --- a/arch/i386/mm/pageattr.c +++ b/arch/i386/mm/pageattr.c @@ -91,7 +91,7 @@ static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) unsigned long flags; set_pte_atomic(kpte, pte); /* change init_mm */ - if (PTRS_PER_PMD > 1) + if (SHARED_KERNEL_PMD) return; spin_lock_irqsave(&pgd_lock, flags); diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index 99c09edc3dbb..9a96c1647428 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c @@ -232,42 +232,92 @@ static inline void pgd_list_del(pgd_t *pgd) set_page_private(next, (unsigned long)pprev); } +#if (PTRS_PER_PMD == 1) +/* Non-PAE pgd constructor */ void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused) { unsigned long flags; - if (PTRS_PER_PMD == 1) { - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); - spin_lock_irqsave(&pgd_lock, flags); - } + /* !PAE, no pagetable sharing */ + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); + spin_lock_irqsave(&pgd_lock, flags); + + /* must happen under lock */ clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, KERNEL_PGD_PTRS); - - if (PTRS_PER_PMD > 1) - return; - - /* must happen under lock */ paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, - __pa(swapper_pg_dir) >> PAGE_SHIFT, - USER_PTRS_PER_PGD, PTRS_PER_PGD - USER_PTRS_PER_PGD); - + __pa(swapper_pg_dir) >> PAGE_SHIFT, + USER_PTRS_PER_PGD, + KERNEL_PGD_PTRS); pgd_list_add(pgd); spin_unlock_irqrestore(&pgd_lock, flags); } +#else /* PTRS_PER_PMD > 1 */ +/* PAE pgd constructor */ +void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused) +{ + /* PAE, kernel PMD may be shared */ + + if (SHARED_KERNEL_PMD) { + clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + KERNEL_PGD_PTRS); + } else { + unsigned long flags; + + memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); + spin_lock_irqsave(&pgd_lock, flags); + pgd_list_add(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); + } +} +#endif /* PTRS_PER_PMD */ -/* never called when PTRS_PER_PMD > 1 */ void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused) { unsigned long flags; /* can be called from interrupt context */ + BUG_ON(SHARED_KERNEL_PMD); + paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT); spin_lock_irqsave(&pgd_lock, flags); pgd_list_del(pgd); spin_unlock_irqrestore(&pgd_lock, flags); } +#define UNSHARED_PTRS_PER_PGD \ + (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) + +/* If we allocate a pmd for part of the kernel address space, then + make sure its initialized with the appropriate kernel mappings. + Otherwise use a cached zeroed pmd. */ +static pmd_t *pmd_cache_alloc(int idx) +{ + pmd_t *pmd; + + if (idx >= USER_PTRS_PER_PGD) { + pmd = (pmd_t *)__get_free_page(GFP_KERNEL); + + if (pmd) + memcpy(pmd, + (void *)pgd_page_vaddr(swapper_pg_dir[idx]), + sizeof(pmd_t) * PTRS_PER_PMD); + } else + pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); + + return pmd; +} + +static void pmd_cache_free(pmd_t *pmd, int idx) +{ + if (idx >= USER_PTRS_PER_PGD) + free_page((unsigned long)pmd); + else + kmem_cache_free(pmd_cache, pmd); +} + pgd_t *pgd_alloc(struct mm_struct *mm) { int i; @@ -276,10 +326,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm) if (PTRS_PER_PMD == 1 || !pgd) return pgd; - for (i = 0; i < USER_PTRS_PER_PGD; ++i) { - pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); + for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) { + pmd_t *pmd = pmd_cache_alloc(i); + if (!pmd) goto out_oom; + paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT); set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); } @@ -290,7 +342,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) pgd_t pgdent = pgd[i]; void* pmd = (void *)__va(pgd_val(pgdent)-1); paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); - kmem_cache_free(pmd_cache, pmd); + pmd_cache_free(pmd, i); } kmem_cache_free(pgd_cache, pgd); return NULL; @@ -302,11 +354,11 @@ void pgd_free(pgd_t *pgd) /* in the PAE case user pgd entries are overwritten before usage */ if (PTRS_PER_PMD > 1) - for (i = 0; i < USER_PTRS_PER_PGD; ++i) { + for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) { pgd_t pgdent = pgd[i]; void* pmd = (void *)__va(pgd_val(pgdent)-1); paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); - kmem_cache_free(pmd_cache, pmd); + pmd_cache_free(pmd, i); } /* in the non-PAE case, free_pgtables() clears user pgd entries */ kmem_cache_free(pgd_cache, pgd); diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h index c49b44cdd8ee..f93599dc7756 100644 --- a/include/asm-i386/paravirt.h +++ b/include/asm-i386/paravirt.h @@ -35,6 +35,7 @@ struct desc_struct; struct paravirt_ops { unsigned int kernel_rpl; + int shared_kernel_pmd; int paravirt_enabled; const char *name; diff --git a/include/asm-i386/pgtable-2level-defs.h b/include/asm-i386/pgtable-2level-defs.h index 02518079f816..0f71c9f13da4 100644 --- a/include/asm-i386/pgtable-2level-defs.h +++ b/include/asm-i386/pgtable-2level-defs.h @@ -1,6 +1,8 @@ #ifndef _I386_PGTABLE_2LEVEL_DEFS_H #define _I386_PGTABLE_2LEVEL_DEFS_H +#define SHARED_KERNEL_PMD 0 + /* * traditional i386 two-level paging structure: */ diff --git a/include/asm-i386/pgtable-2level.h b/include/asm-i386/pgtable-2level.h index 043a2bcfa86a..781fe4bcc962 100644 --- a/include/asm-i386/pgtable-2level.h +++ b/include/asm-i386/pgtable-2level.h @@ -82,6 +82,4 @@ static inline int pte_exec_kernel(pte_t pte) #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) -void vmalloc_sync_all(void); - #endif /* _I386_PGTABLE_2LEVEL_H */ diff --git a/include/asm-i386/pgtable-3level-defs.h b/include/asm-i386/pgtable-3level-defs.h index eb3a1ea88671..c0df89f66e8b 100644 --- a/include/asm-i386/pgtable-3level-defs.h +++ b/include/asm-i386/pgtable-3level-defs.h @@ -1,6 +1,12 @@ #ifndef _I386_PGTABLE_3LEVEL_DEFS_H #define _I386_PGTABLE_3LEVEL_DEFS_H +#ifdef CONFIG_PARAVIRT +#define SHARED_KERNEL_PMD (paravirt_ops.shared_kernel_pmd) +#else +#define SHARED_KERNEL_PMD 1 +#endif + /* * PGDIR_SHIFT determines what a top-level page table entry can map */ diff --git a/include/asm-i386/pgtable-3level.h b/include/asm-i386/pgtable-3level.h index be6017f37a91..664bfee5a2f2 100644 --- a/include/asm-i386/pgtable-3level.h +++ b/include/asm-i386/pgtable-3level.h @@ -200,6 +200,4 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) #define __pmd_free_tlb(tlb, x) do { } while (0) -#define vmalloc_sync_all() ((void)0) - #endif /* _I386_PGTABLE_3LEVEL_H */ diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index 0790ad6ed440..5b88a6a1278e 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -243,6 +243,8 @@ static inline pte_t pte_mkyoung(pte_t pte) { (pte).pte_low |= _PAGE_ACCESSED; re static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return pte; } static inline pte_t pte_mkhuge(pte_t pte) { (pte).pte_low |= _PAGE_PSE; return pte; } +extern void vmalloc_sync_all(void); + #ifdef CONFIG_X86_PAE # include #else