Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Ingo Molnar:
 "The main x86 MM changes in this cycle were:

   - continued native kernel PCID support preparation patches to the TLB
     flushing code (Andy Lutomirski)

   - various fixes related to 32-bit compat syscall returning address
     over 4Gb in applications, launched from 64-bit binaries - motivated
     by C/R frameworks such as Virtuozzo. (Dmitry Safonov)

   - continued Intel 5-level paging enablement: in particular the
     conversion of x86 GUP to the generic GUP code. (Kirill A. Shutemov)

   - x86/mpx ABI corner case fixes/enhancements (Joerg Roedel)

   - ... plus misc updates, fixes and cleanups"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (62 commits)
  mm, zone_device: Replace {get, put}_zone_device_page() with a single reference to fix pmem crash
  x86/mm: Fix flush_tlb_page() on Xen
  x86/mm: Make flush_tlb_mm_range() more predictable
  x86/mm: Remove flush_tlb() and flush_tlb_current_task()
  x86/vm86/32: Switch to flush_tlb_mm_range() in mark_screen_rdonly()
  x86/mm/64: Fix crash in remove_pagetable()
  Revert "x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation"
  x86/boot/e820: Remove a redundant self assignment
  x86/mm: Fix dump pagetables for 4 levels of page tables
  x86/mpx, selftests: Only check bounds-vs-shadow when we keep shadow
  x86/mpx: Correctly report do_mpx_bt_fault() failures to user-space
  Revert "x86/mm/numa: Remove numa_nodemask_from_meminfo()"
  x86/espfix: Add support for 5-level paging
  x86/kasan: Extend KASAN to support 5-level paging
  x86/mm: Add basic defines/helpers for CONFIG_X86_5LEVEL=y
  x86/paravirt: Add 5-level support to the paravirt code
  x86/mm: Define virtual memory map for 5-level paging
  x86/asm: Remove __VIRTUAL_MASK_SHIFT==47 assert
  x86/boot: Detect 5-level paging support
  x86/mm/numa: Remove numa_nodemask_from_meminfo()
  ...
This commit is contained in:
Linus Torvalds 2017-05-01 23:54:56 -07:00
commit d3b5d35290
93 changed files with 1851 additions and 717 deletions

View file

@ -4,7 +4,7 @@
Virtual memory map with 4 level page tables:
0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
hole caused by [48:63] sign extension
hole caused by [47:63] sign extension
ffff800000000000 - ffff87ffffffffff (=43 bits) guard hole, reserved for hypervisor
ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole
@ -19,16 +19,43 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
... unused hole ...
ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable)
ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
Virtual memory map with 5 level page tables:
0000000000000000 - 00ffffffffffffff (=56 bits) user space, different per mm
hole caused by [56:63] sign extension
ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
ff90000000000000 - ff91ffffffffffff (=49 bits) hole
ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space
ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
... unused hole ...
ffd8000000000000 - fff7ffffffffffff (=53 bits) kasan shadow memory (8PB)
... unused hole ...
ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
... unused hole ...
ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
... unused hole ...
ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0
ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space
ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
Architecture defines a 64-bit virtual address. Implementations can support
less. Currently supported are 48- and 57-bit virtual addresses. Bits 63
through to the most-significant implemented bit are set to either all ones
or all zero. This causes hole between user space and kernel addresses.
The direct mapping covers all memory in the system up to the highest
memory address (this means in some cases it can also include PCI memory
holes).
vmalloc space is lazily synchronized into the different PML4 pages of
the processes using the page fault handler, with init_level4_pgt as
vmalloc space is lazily synchronized into the different PML4/PML5 pages of
the processes using the page fault handler, with init_top_pgt as
reference.
Current X86-64 implementations support up to 46 bits of address space (64 TB),
@ -39,6 +66,9 @@ memory window (this size is arbitrary, it can be raised later if needed).
The mappings are not part of any other kernel PGD and are only available
during EFI runtime calls.
The module mapping space size changes based on the CONFIG requirements for the
following fixmap section.
Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all
physical memory, vmalloc/ioremap space and virtual memory map are randomized.
Their order is preserved but their base will be offset early at boot time.

View file

@ -700,6 +700,13 @@ config ARCH_MMAP_RND_COMPAT_BITS
This value can be changed after boot using the
/proc/sys/vm/mmap_rnd_compat_bits tunable
config HAVE_ARCH_COMPAT_MMAP_BASES
bool
help
This allows 64bit applications to invoke 32-bit mmap() syscall
and vice-versa 32-bit applications to call 64-bit mmap().
Required for applications doing different bitness syscalls.
config HAVE_COPY_THREAD_TLS
bool
help

View file

@ -163,11 +163,5 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
/* by default, allow everything */
return true;
}
static inline bool arch_pte_access_permitted(pte_t pte, bool write)
{
/* by default, allow everything */
return true;
}
#endif /* __KERNEL__ */
#endif /* __ASM_POWERPC_MMU_CONTEXT_H */

View file

@ -156,10 +156,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
/* by default, allow everything */
return true;
}
static inline bool arch_pte_access_permitted(pte_t pte, bool write)
{
/* by default, allow everything */
return true;
}
#endif /* __S390_MMU_CONTEXT_H */

View file

@ -37,12 +37,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
return true;
}
static inline bool arch_pte_access_permitted(pte_t pte, bool write)
{
/* by default, allow everything */
return true;
}
/*
* end asm-generic/mm_hooks.h functions
*/

View file

@ -103,10 +103,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
/* by default, allow everything */
return true;
}
static inline bool arch_pte_access_permitted(pte_t pte, bool write)
{
/* by default, allow everything */
return true;
}
#endif

View file

@ -105,6 +105,7 @@ config X86
select HAVE_ARCH_KMEMCHECK
select HAVE_ARCH_MMAP_RND_BITS if MMU
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT
select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
@ -289,6 +290,7 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC
config KASAN_SHADOW_OFFSET
hex
depends on KASAN
default 0xdff8000000000000 if X86_5LEVEL
default 0xdffffc0000000000
config HAVE_INTEL_TXT

View file

@ -44,6 +44,15 @@ static const u32 req_flags[NCAPINTS] =
0, /* REQUIRED_MASK5 not implemented in this file */
REQUIRED_MASK6,
0, /* REQUIRED_MASK7 not implemented in this file */
0, /* REQUIRED_MASK8 not implemented in this file */
0, /* REQUIRED_MASK9 not implemented in this file */
0, /* REQUIRED_MASK10 not implemented in this file */
0, /* REQUIRED_MASK11 not implemented in this file */
0, /* REQUIRED_MASK12 not implemented in this file */
0, /* REQUIRED_MASK13 not implemented in this file */
0, /* REQUIRED_MASK14 not implemented in this file */
0, /* REQUIRED_MASK15 not implemented in this file */
REQUIRED_MASK16,
};
#define A32(a, b, c, d) (((d) << 24)+((c) << 16)+((b) << 8)+(a))

View file

@ -70,16 +70,19 @@ int has_eflag(unsigned long mask)
# define EBX_REG "=b"
#endif
static inline void cpuid(u32 id, u32 *a, u32 *b, u32 *c, u32 *d)
static inline void cpuid_count(u32 id, u32 count,
u32 *a, u32 *b, u32 *c, u32 *d)
{
asm volatile(".ifnc %%ebx,%3 ; movl %%ebx,%3 ; .endif \n\t"
"cpuid \n\t"
".ifnc %%ebx,%3 ; xchgl %%ebx,%3 ; .endif \n\t"
: "=a" (*a), "=c" (*c), "=d" (*d), EBX_REG (*b)
: "a" (id)
: "a" (id), "c" (count)
);
}
#define cpuid(id, a, b, c, d) cpuid_count(id, 0, a, b, c, d)
void get_cpuflags(void)
{
u32 max_intel_level, max_amd_level;
@ -108,6 +111,11 @@ void get_cpuflags(void)
cpu.model += ((tfms >> 16) & 0xf) << 4;
}
if (max_intel_level >= 0x00000007) {
cpuid_count(0x00000007, 0, &ignored, &ignored,
&cpu.flags[16], &ignored);
}
cpuid(0x80000000, &max_amd_level, &ignored, &ignored,
&ignored);

View file

@ -265,12 +265,9 @@ return_from_SYSCALL_64:
*
* If width of "canonical tail" ever becomes variable, this will need
* to be updated to remain correct on both old and new CPUs.
*
* Change top 16 bits to be the sign-extension of 47th bit
*/
.ifne __VIRTUAL_MASK_SHIFT - 47
.error "virtual address width changed -- SYSRET checks need update"
.endif
/* Change top 16 bits to be the sign-extension of 47th bit */
shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx

View file

@ -361,7 +361,7 @@ static void vgetcpu_cpu_init(void *arg)
d.p = 1; /* Present */
d.d = 1; /* 32-bit */
write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}
static int vgetcpu_online(unsigned int cpu)

View file

@ -4,6 +4,7 @@
#include <asm/desc_defs.h>
#include <asm/ldt.h>
#include <asm/mmu.h>
#include <asm/fixmap.h>
#include <linux/smp.h>
#include <linux/percpu.h>
@ -45,11 +46,43 @@ struct gdt_page {
DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
/* Provide the original GDT */
static inline struct desc_struct *get_cpu_gdt_rw(unsigned int cpu)
{
return per_cpu(gdt_page, cpu).gdt;
}
/* Provide the current original GDT */
static inline struct desc_struct *get_current_gdt_rw(void)
{
return this_cpu_ptr(&gdt_page)->gdt;
}
/* Get the fixmap index for a specific processor */
static inline unsigned int get_cpu_gdt_ro_index(int cpu)
{
return FIX_GDT_REMAP_BEGIN + cpu;
}
/* Provide the fixmap address of the remapped GDT */
static inline struct desc_struct *get_cpu_gdt_ro(int cpu)
{
unsigned int idx = get_cpu_gdt_ro_index(cpu);
return (struct desc_struct *)__fix_to_virt(idx);
}
/* Provide the current read-only GDT */
static inline struct desc_struct *get_current_gdt_ro(void)
{
return get_cpu_gdt_ro(smp_processor_id());
}
/* Provide the physical address of the GDT page. */
static inline phys_addr_t get_cpu_gdt_paddr(unsigned int cpu)
{
return per_cpu_ptr_to_phys(get_cpu_gdt_rw(cpu));
}
#ifdef CONFIG_X86_64
static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
@ -174,7 +207,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned t
static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
{
struct desc_struct *d = get_cpu_gdt_table(cpu);
struct desc_struct *d = get_cpu_gdt_rw(cpu);
tss_desc tss;
set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
@ -194,22 +227,90 @@ static inline void native_set_ldt(const void *addr, unsigned int entries)
set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT,
entries * LDT_ENTRY_SIZE - 1);
write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_LDT,
&ldt, DESC_LDT);
asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
}
}
static inline void native_load_gdt(const struct desc_ptr *dtr)
{
asm volatile("lgdt %0"::"m" (*dtr));
}
static inline void native_load_idt(const struct desc_ptr *dtr)
{
asm volatile("lidt %0"::"m" (*dtr));
}
static inline void native_store_gdt(struct desc_ptr *dtr)
{
asm volatile("sgdt %0":"=m" (*dtr));
}
static inline void native_store_idt(struct desc_ptr *dtr)
{
asm volatile("sidt %0":"=m" (*dtr));
}
/*
* The LTR instruction marks the TSS GDT entry as busy. On 64-bit, the GDT is
* a read-only remapping. To prevent a page fault, the GDT is switched to the
* original writeable version when needed.
*/
#ifdef CONFIG_X86_64
static inline void native_load_tr_desc(void)
{
struct desc_ptr gdt;
int cpu = raw_smp_processor_id();
bool restore = 0;
struct desc_struct *fixmap_gdt;
native_store_gdt(&gdt);
fixmap_gdt = get_cpu_gdt_ro(cpu);
/*
* If the current GDT is the read-only fixmap, swap to the original
* writeable version. Swap back at the end.
*/
if (gdt.address == (unsigned long)fixmap_gdt) {
load_direct_gdt(cpu);
restore = 1;
}
asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
if (restore)
load_fixmap_gdt(cpu);
}
#else
static inline void native_load_tr_desc(void)
{
asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
}
#endif
static inline unsigned long native_store_tr(void)
{
unsigned long tr;
asm volatile("str %0":"=r" (tr));
return tr;
}
static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
{
struct desc_struct *gdt = get_cpu_gdt_rw(cpu);
unsigned int i;
for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
}
DECLARE_PER_CPU(bool, __tss_limit_invalid);
static inline void force_reload_TR(void)
{
struct desc_struct *d = get_cpu_gdt_table(smp_processor_id());
struct desc_struct *d = get_current_gdt_rw();
tss_desc tss;
memcpy(&tss, &d[GDT_ENTRY_TSS], sizeof(tss_desc));
@ -257,44 +358,6 @@ static inline void invalidate_tss_limit(void)
this_cpu_write(__tss_limit_invalid, true);
}
static inline void native_load_gdt(const struct desc_ptr *dtr)
{
asm volatile("lgdt %0"::"m" (*dtr));
}
static inline void native_load_idt(const struct desc_ptr *dtr)
{
asm volatile("lidt %0"::"m" (*dtr));
}
static inline void native_store_gdt(struct desc_ptr *dtr)
{
asm volatile("sgdt %0":"=m" (*dtr));
}
static inline void native_store_idt(struct desc_ptr *dtr)
{
asm volatile("sidt %0":"=m" (*dtr));
}
static inline unsigned long native_store_tr(void)
{
unsigned long tr;
asm volatile("str %0":"=r" (tr));
return tr;
}
static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
{
struct desc_struct *gdt = get_cpu_gdt_table(cpu);
unsigned int i;
for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
}
/* This intentionally ignores lm, since 32-bit apps don't have that field. */
#define LDT_empty(info) \
((info)->base_addr == 0 && \

View file

@ -36,6 +36,12 @@
# define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE & 31))
#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
#ifdef CONFIG_X86_5LEVEL
# define DISABLE_LA57 0
#else
# define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31))
#endif
/*
* Make sure to add features to the correct mask
*/
@ -55,7 +61,7 @@
#define DISABLED_MASK13 0
#define DISABLED_MASK14 0
#define DISABLED_MASK15 0
#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE)
#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57)
#define DISABLED_MASK17 0
#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)

View file

@ -293,8 +293,23 @@ do { \
} \
} while (0)
/*
* True on X86_32 or when emulating IA32 on X86_64
*/
static inline int mmap_is_ia32(void)
{
return IS_ENABLED(CONFIG_X86_32) ||
(IS_ENABLED(CONFIG_COMPAT) &&
test_thread_flag(TIF_ADDR32));
}
extern unsigned long tasksize_32bit(void);
extern unsigned long tasksize_64bit(void);
extern unsigned long get_mmap_base(int is_legacy);
#ifdef CONFIG_X86_32
#define __STACK_RND_MASK(is32bit) (0x7ff)
#define STACK_RND_MASK (0x7ff)
#define ARCH_DLINFO ARCH_DLINFO_IA32
@ -304,7 +319,8 @@ do { \
#else /* CONFIG_X86_32 */
/* 1GB for 64bit, 8MB for 32bit */
#define STACK_RND_MASK (test_thread_flag(TIF_ADDR32) ? 0x7ff : 0x3fffff)
#define __STACK_RND_MASK(is32bit) ((is32bit) ? 0x7ff : 0x3fffff)
#define STACK_RND_MASK __STACK_RND_MASK(mmap_is_ia32())
#define ARCH_DLINFO \
do { \
@ -348,16 +364,6 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
int uses_interp);
#define compat_arch_setup_additional_pages compat_arch_setup_additional_pages
/*
* True on X86_32 or when emulating IA32 on X86_64
*/
static inline int mmap_is_ia32(void)
{
return IS_ENABLED(CONFIG_X86_32) ||
(IS_ENABLED(CONFIG_COMPAT) &&
test_thread_flag(TIF_ADDR32));
}
/* Do not change the values. See get_align_mask() */
enum align_flags {
ALIGN_VA_32 = BIT(0),

View file

@ -100,6 +100,10 @@ enum fixed_addresses {
#ifdef CONFIG_X86_INTEL_MID
FIX_LNW_VRTC,
#endif
/* Fixmap entries to remap the GDTs, one per processor. */
FIX_GDT_REMAP_BEGIN,
FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1,
__end_of_permanent_fixed_addresses,
/*

View file

@ -11,9 +11,12 @@
* 'kernel address space start' >> KASAN_SHADOW_SCALE_SHIFT
*/
#define KASAN_SHADOW_START (KASAN_SHADOW_OFFSET + \
(0xffff800000000000ULL >> 3))
/* 47 bits for kernel address -> (47 - 3) bits for shadow */
#define KASAN_SHADOW_END (KASAN_SHADOW_START + (1ULL << (47 - 3)))
((-1UL << __VIRTUAL_MASK_SHIFT) >> 3))
/*
* 47 bits for kernel address -> (47 - 3) bits for shadow
* 56 bits for kernel address -> (56 - 3) bits for shadow
*/
#define KASAN_SHADOW_END (KASAN_SHADOW_START + (1ULL << (__VIRTUAL_MASK_SHIFT - 3)))
#ifndef __ASSEMBLY__

View file

@ -164,6 +164,7 @@ struct kimage_arch {
};
#else
struct kimage_arch {
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;

View file

@ -268,8 +268,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
return __pkru_allows_pkey(vma_pkey(vma), write);
}
static inline bool arch_pte_access_permitted(pte_t pte, bool write)
{
return __pkru_allows_pkey(pte_flags_pkey(pte_flags(pte)), write);
}
#endif /* _ASM_X86_MMU_CONTEXT_H */

View file

@ -36,7 +36,12 @@
* hypervisor to fit. Choosing 16 slots here is arbitrary, but it's
* what Xen requires.
*/
#ifdef CONFIG_X86_5LEVEL
#define __PAGE_OFFSET_BASE _AC(0xff10000000000000, UL)
#else
#define __PAGE_OFFSET_BASE _AC(0xffff880000000000, UL)
#endif
#ifdef CONFIG_RANDOMIZE_MEMORY
#define __PAGE_OFFSET page_offset_base
#else
@ -46,8 +51,13 @@
#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
#ifdef CONFIG_X86_5LEVEL
#define __PHYSICAL_MASK_SHIFT 52
#define __VIRTUAL_MASK_SHIFT 56
#else
#define __PHYSICAL_MASK_SHIFT 46
#define __VIRTUAL_MASK_SHIFT 47
#endif
/*
* Kernel image size is limited to 1GiB due to the fixmap living in the

View file

@ -357,6 +357,16 @@ static inline void paravirt_release_pud(unsigned long pfn)
PVOP_VCALL1(pv_mmu_ops.release_pud, pfn);
}
static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn)
{
PVOP_VCALL2(pv_mmu_ops.alloc_p4d, mm, pfn);
}
static inline void paravirt_release_p4d(unsigned long pfn)
{
PVOP_VCALL1(pv_mmu_ops.release_p4d, pfn);
}
static inline void pte_update(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
@ -536,7 +546,7 @@ static inline void set_pud(pud_t *pudp, pud_t pud)
PVOP_VCALL2(pv_mmu_ops.set_pud, pudp,
val);
}
#if CONFIG_PGTABLE_LEVELS == 4
#if CONFIG_PGTABLE_LEVELS >= 4
static inline pud_t __pud(pudval_t val)
{
pudval_t ret;
@ -565,16 +575,42 @@ static inline pudval_t pud_val(pud_t pud)
return ret;
}
static inline void pud_clear(pud_t *pudp)
{
set_pud(pudp, __pud(0));
}
static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
{
p4dval_t val = native_p4d_val(p4d);
if (sizeof(p4dval_t) > sizeof(long))
PVOP_VCALL3(pv_mmu_ops.set_p4d, p4dp,
val, (u64)val >> 32);
else
PVOP_VCALL2(pv_mmu_ops.set_p4d, p4dp,
val);
}
#if CONFIG_PGTABLE_LEVELS >= 5
static inline p4d_t __p4d(p4dval_t val)
{
p4dval_t ret = PVOP_CALLEE1(p4dval_t, pv_mmu_ops.make_p4d, val);
return (p4d_t) { ret };
}
static inline p4dval_t p4d_val(p4d_t p4d)
{
return PVOP_CALLEE1(p4dval_t, pv_mmu_ops.p4d_val, p4d.p4d);
}
static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
{
pgdval_t val = native_pgd_val(pgd);
if (sizeof(pgdval_t) > sizeof(long))
PVOP_VCALL3(pv_mmu_ops.set_pgd, pgdp,
val, (u64)val >> 32);
else
PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp,
val);
PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, val);
}
static inline void pgd_clear(pgd_t *pgdp)
@ -582,9 +618,11 @@ static inline void pgd_clear(pgd_t *pgdp)
set_pgd(pgdp, __pgd(0));
}
static inline void pud_clear(pud_t *pudp)
#endif /* CONFIG_PGTABLE_LEVELS == 5 */
static inline void p4d_clear(p4d_t *p4dp)
{
set_pud(pudp, __pud(0));
set_p4d(p4dp, __p4d(0));
}
#endif /* CONFIG_PGTABLE_LEVELS == 4 */

View file

@ -238,9 +238,11 @@ struct pv_mmu_ops {
void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn);
void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn);
void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn);
void (*alloc_p4d)(struct mm_struct *mm, unsigned long pfn);
void (*release_pte)(unsigned long pfn);
void (*release_pmd)(unsigned long pfn);
void (*release_pud)(unsigned long pfn);
void (*release_p4d)(unsigned long pfn);
/* Pagetable manipulation functions */
void (*set_pte)(pte_t *ptep, pte_t pteval);
@ -279,12 +281,21 @@ struct pv_mmu_ops {
struct paravirt_callee_save pmd_val;
struct paravirt_callee_save make_pmd;
#if CONFIG_PGTABLE_LEVELS == 4
#if CONFIG_PGTABLE_LEVELS >= 4
struct paravirt_callee_save pud_val;
struct paravirt_callee_save make_pud;
void (*set_pgd)(pgd_t *pudp, pgd_t pgdval);
#endif /* CONFIG_PGTABLE_LEVELS == 4 */
void (*set_p4d)(p4d_t *p4dp, p4d_t p4dval);
#if CONFIG_PGTABLE_LEVELS >= 5
struct paravirt_callee_save p4d_val;
struct paravirt_callee_save make_p4d;
void (*set_pgd)(pgd_t *pgdp, pgd_t pgdval);
#endif /* CONFIG_PGTABLE_LEVELS >= 5 */
#endif /* CONFIG_PGTABLE_LEVELS >= 4 */
#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
struct pv_lazy_ops lazy_mode;

View file

@ -17,9 +17,11 @@ static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {
static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
unsigned long start, unsigned long count) {}
static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {}
static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn) {}
static inline void paravirt_release_pte(unsigned long pfn) {}
static inline void paravirt_release_pmd(unsigned long pfn) {}
static inline void paravirt_release_pud(unsigned long pfn) {}
static inline void paravirt_release_p4d(unsigned long pfn) {}
#endif
/*
@ -121,10 +123,10 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
#endif /* CONFIG_X86_PAE */
#if CONFIG_PGTABLE_LEVELS > 3
static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
{
paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
set_p4d(p4d, __p4d(_PAGE_TABLE | __pa(pud)));
}
static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
@ -150,6 +152,37 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
___pud_free_tlb(tlb, pud);
}
#if CONFIG_PGTABLE_LEVELS > 4
static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
{
paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d)));
}
static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr)
{
gfp_t gfp = GFP_KERNEL_ACCOUNT;
if (mm == &init_mm)
gfp &= ~__GFP_ACCOUNT;
return (p4d_t *)get_zeroed_page(gfp);
}
static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d)
{
BUG_ON((unsigned long)p4d & (PAGE_SIZE-1));
free_page((unsigned long)p4d);
}
extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d);
static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
unsigned long address)
{
___p4d_free_tlb(tlb, p4d);
}
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
#endif /* CONFIG_PGTABLE_LEVELS > 2 */

View file

@ -7,6 +7,7 @@
typedef unsigned long pteval_t;
typedef unsigned long pmdval_t;
typedef unsigned long pudval_t;
typedef unsigned long p4dval_t;
typedef unsigned long pgdval_t;
typedef unsigned long pgprotval_t;

View file

@ -7,6 +7,7 @@
typedef u64 pteval_t;
typedef u64 pmdval_t;
typedef u64 pudval_t;
typedef u64 p4dval_t;
typedef u64 pgdval_t;
typedef u64 pgprotval_t;

View file

@ -51,11 +51,19 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
#define set_pmd(pmdp, pmd) native_set_pmd(pmdp, pmd)
#ifndef __PAGETABLE_PUD_FOLDED
#ifndef __PAGETABLE_P4D_FOLDED
#define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd)
#define pgd_clear(pgd) native_pgd_clear(pgd)
#endif
#ifndef set_p4d
# define set_p4d(p4dp, p4d) native_set_p4d(p4dp, p4d)
#endif
#ifndef __PAGETABLE_PUD_FOLDED
#define p4d_clear(p4d) native_p4d_clear(p4d)
#endif
#ifndef set_pud
# define set_pud(pudp, pud) native_set_pud(pudp, pud)
#endif
@ -72,6 +80,11 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
#define pgd_val(x) native_pgd_val(x)
#define __pgd(x) native_make_pgd(x)
#ifndef __PAGETABLE_P4D_FOLDED
#define p4d_val(x) native_p4d_val(x)
#define __p4d(x) native_make_p4d(x)
#endif
#ifndef __PAGETABLE_PUD_FOLDED
#define pud_val(x) native_pud_val(x)
#define __pud(x) native_make_pud(x)
@ -177,6 +190,17 @@ static inline unsigned long pud_pfn(pud_t pud)
return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT;
}
static inline unsigned long p4d_pfn(p4d_t p4d)
{
return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT;
}
static inline int p4d_large(p4d_t p4d)
{
/* No 512 GiB pages yet */
return 0;
}
#define pte_page(pte) pfn_to_page(pte_pfn(pte))
static inline int pmd_large(pmd_t pte)
@ -536,6 +560,7 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
#define pte_pgprot(x) __pgprot(pte_flags(x))
#define pmd_pgprot(x) __pgprot(pmd_flags(x))
#define pud_pgprot(x) __pgprot(pud_flags(x))
#define p4d_pgprot(x) __pgprot(p4d_flags(x))
#define canon_pgprot(p) __pgprot(massage_pgprot(p))
@ -585,6 +610,7 @@ pte_t *populate_extra_pte(unsigned long vaddr);
#include <linux/mm_types.h>
#include <linux/mmdebug.h>
#include <linux/log2.h>
#include <asm/fixmap.h>
static inline int pte_none(pte_t pte)
{
@ -768,7 +794,52 @@ static inline int pud_large(pud_t pud)
}
#endif /* CONFIG_PGTABLE_LEVELS > 2 */
static inline unsigned long pud_index(unsigned long address)
{
return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
}
#if CONFIG_PGTABLE_LEVELS > 3
static inline int p4d_none(p4d_t p4d)
{
return (native_p4d_val(p4d) & ~(_PAGE_KNL_ERRATUM_MASK)) == 0;
}
static inline int p4d_present(p4d_t p4d)
{
return p4d_flags(p4d) & _PAGE_PRESENT;
}
static inline unsigned long p4d_page_vaddr(p4d_t p4d)
{
return (unsigned long)__va(p4d_val(p4d) & p4d_pfn_mask(p4d));
}
/*
* Currently stuck as a macro due to indirect forward reference to
* linux/mmzone.h's __section_mem_map_addr() definition:
*/
#define p4d_page(p4d) \
pfn_to_page((p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT)
/* Find an entry in the third-level page table.. */
static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
{
return (pud_t *)p4d_page_vaddr(*p4d) + pud_index(address);
}
static inline int p4d_bad(p4d_t p4d)
{
return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
}
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
static inline unsigned long p4d_index(unsigned long address)
{
return (address >> P4D_SHIFT) & (PTRS_PER_P4D - 1);
}
#if CONFIG_PGTABLE_LEVELS > 4
static inline int pgd_present(pgd_t pgd)
{
return pgd_flags(pgd) & _PAGE_PRESENT;
@ -786,14 +857,9 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd)
#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT)
/* to find an entry in a page-table-directory. */
static inline unsigned long pud_index(unsigned long address)
static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
{
return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
}
static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
{
return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(address);
return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address);
}
static inline int pgd_bad(pgd_t pgd)
@ -811,7 +877,7 @@ static inline int pgd_none(pgd_t pgd)
*/
return !native_pgd_val(pgd);
}
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* __ASSEMBLY__ */

View file

@ -14,7 +14,6 @@
*/
#ifndef __ASSEMBLY__
#include <asm/processor.h>
#include <asm/fixmap.h>
#include <linux/threads.h>
#include <asm/paravirt.h>

View file

@ -35,15 +35,22 @@ extern void paging_init(void);
#define pud_ERROR(e) \
pr_err("%s:%d: bad pud %p(%016lx)\n", \
__FILE__, __LINE__, &(e), pud_val(e))
#if CONFIG_PGTABLE_LEVELS >= 5
#define p4d_ERROR(e) \
pr_err("%s:%d: bad p4d %p(%016lx)\n", \
__FILE__, __LINE__, &(e), p4d_val(e))
#endif
#define pgd_ERROR(e) \
pr_err("%s:%d: bad pgd %p(%016lx)\n", \
__FILE__, __LINE__, &(e), pgd_val(e))
struct mm_struct;
void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte);
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
@ -121,6 +128,20 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp)
#endif
}
static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
{
*p4dp = p4d;
}
static inline void native_p4d_clear(p4d_t *p4d)
{
#ifdef CONFIG_X86_5LEVEL
native_set_p4d(p4d, native_make_p4d(0));
#else
native_set_p4d(p4d, (p4d_t) { .pgd = native_make_pgd(0)});
#endif
}
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
*pgdp = pgd;

View file

@ -13,6 +13,7 @@
typedef unsigned long pteval_t;
typedef unsigned long pmdval_t;
typedef unsigned long pudval_t;
typedef unsigned long p4dval_t;
typedef unsigned long pgdval_t;
typedef unsigned long pgprotval_t;
@ -22,12 +23,32 @@ typedef struct { pteval_t pte; } pte_t;
#define SHARED_KERNEL_PMD 0
#ifdef CONFIG_X86_5LEVEL
/*
* PGDIR_SHIFT determines what a top-level page table entry can map
*/
#define PGDIR_SHIFT 48
#define PTRS_PER_PGD 512
/*
* 4th level page in 5-level paging case
*/
#define P4D_SHIFT 39
#define PTRS_PER_P4D 512
#define P4D_SIZE (_AC(1, UL) << P4D_SHIFT)
#define P4D_MASK (~(P4D_SIZE - 1))
#else /* CONFIG_X86_5LEVEL */
/*
* PGDIR_SHIFT determines what a top-level page table entry can map
*/
#define PGDIR_SHIFT 39
#define PTRS_PER_PGD 512
#endif /* CONFIG_X86_5LEVEL */
/*
* 3rd level page
*/
@ -55,9 +76,15 @@ typedef struct { pteval_t pte; } pte_t;
/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
#ifdef CONFIG_X86_5LEVEL
#define VMALLOC_SIZE_TB _AC(16384, UL)
#define __VMALLOC_BASE _AC(0xff92000000000000, UL)
#define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
#else
#define VMALLOC_SIZE_TB _AC(32, UL)
#define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
#endif
#ifdef CONFIG_RANDOMIZE_MEMORY
#define VMALLOC_START vmalloc_base
#define VMEMMAP_START vmemmap_base
@ -67,10 +94,11 @@ typedef struct { pteval_t pte; } pte_t;
#endif /* CONFIG_RANDOMIZE_MEMORY */
#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL))
#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE)
#define MODULES_END _AC(0xffffffffff000000, UL)
/* The module sections ends with the start of the fixmap */
#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1)
#define MODULES_LEN (MODULES_END - MODULES_VADDR)
#define ESPFIX_PGD_ENTRY _AC(-2, UL)
#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT)
#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT)
#define EFI_VA_START ( -4 * (_AC(1, UL) << 30))
#define EFI_VA_END (-68 * (_AC(1, UL) << 30))

View file

@ -272,9 +272,28 @@ static inline pgdval_t pgd_flags(pgd_t pgd)
return native_pgd_val(pgd) & PTE_FLAGS_MASK;
}
#if CONFIG_PGTABLE_LEVELS > 3
#include <asm-generic/5level-fixup.h>
#if CONFIG_PGTABLE_LEVELS > 4
typedef struct { p4dval_t p4d; } p4d_t;
static inline p4d_t native_make_p4d(pudval_t val)
{
return (p4d_t) { val };
}
static inline p4dval_t native_p4d_val(p4d_t p4d)
{
return p4d.p4d;
}
#else
#include <asm-generic/pgtable-nop4d.h>
static inline p4dval_t native_p4d_val(p4d_t p4d)
{
return native_pgd_val(p4d.pgd);
}
#endif
#if CONFIG_PGTABLE_LEVELS > 3
typedef struct { pudval_t pud; } pud_t;
static inline pud_t native_make_pud(pmdval_t val)
@ -287,12 +306,11 @@ static inline pudval_t native_pud_val(pud_t pud)
return pud.pud;
}
#else
#define __ARCH_USE_5LEVEL_HACK
#include <asm-generic/pgtable-nopud.h>
static inline pudval_t native_pud_val(pud_t pud)
{
return native_pgd_val(pud.pgd);
return native_pgd_val(pud.p4d.pgd);
}
#endif
@ -309,15 +327,30 @@ static inline pmdval_t native_pmd_val(pmd_t pmd)
return pmd.pmd;
}
#else
#define __ARCH_USE_5LEVEL_HACK
#include <asm-generic/pgtable-nopmd.h>
static inline pmdval_t native_pmd_val(pmd_t pmd)
{
return native_pgd_val(pmd.pud.pgd);
return native_pgd_val(pmd.pud.p4d.pgd);
}
#endif
static inline p4dval_t p4d_pfn_mask(p4d_t p4d)
{
/* No 512 GiB huge pages yet */
return PTE_PFN_MASK;
}
static inline p4dval_t p4d_flags_mask(p4d_t p4d)
{
return ~p4d_pfn_mask(p4d);
}
static inline p4dval_t p4d_flags(p4d_t p4d)
{
return native_p4d_val(p4d) & p4d_flags_mask(p4d);
}
static inline pudval_t pud_pfn_mask(pud_t pud)
{
if (native_pud_val(pud) & _PAGE_PSE)
@ -461,6 +494,7 @@ enum pg_level {
PG_LEVEL_4K,
PG_LEVEL_2M,
PG_LEVEL_1G,
PG_LEVEL_512G,
PG_LEVEL_NUM
};

View file

@ -709,6 +709,8 @@ extern struct desc_ptr early_gdt_descr;
extern void cpu_set_gdt(int);
extern void switch_to_new_gdt(int);
extern void load_direct_gdt(int);
extern void load_fixmap_gdt(int);
extern void load_percpu_segment(int);
extern void cpu_init(void);
@ -790,6 +792,7 @@ static inline void spin_lock_prefetch(const void *x)
/*
* User space process size: 3GB (default).
*/
#define IA32_PAGE_OFFSET PAGE_OFFSET
#define TASK_SIZE PAGE_OFFSET
#define TASK_SIZE_MAX TASK_SIZE
#define STACK_TOP TASK_SIZE
@ -866,7 +869,8 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
* This decides where the kernel will search for a free chunk of vm
* space during mmap's.
*/
#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
#define __TASK_UNMAPPED_BASE(task_size) (PAGE_ALIGN(task_size / 3))
#define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE)
#define KSTK_EIP(task) (task_pt_regs(task)->ip)

View file

@ -53,6 +53,12 @@
# define NEED_MOVBE 0
#endif
#ifdef CONFIG_X86_5LEVEL
# define NEED_LA57 (1<<(X86_FEATURE_LA57 & 31))
#else
# define NEED_LA57 0
#endif
#ifdef CONFIG_X86_64
#ifdef CONFIG_PARAVIRT
/* Paravirtualized systems may not have PSE or PGE available */
@ -98,7 +104,7 @@
#define REQUIRED_MASK13 0
#define REQUIRED_MASK14 0
#define REQUIRED_MASK15 0
#define REQUIRED_MASK16 0
#define REQUIRED_MASK16 (NEED_LA57)
#define REQUIRED_MASK17 0
#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)

View file

@ -26,8 +26,13 @@
# endif
#else /* CONFIG_X86_32 */
# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */
# define MAX_PHYSADDR_BITS 44
# define MAX_PHYSMEM_BITS 46
# ifdef CONFIG_X86_5LEVEL
# define MAX_PHYSADDR_BITS 52
# define MAX_PHYSMEM_BITS 52
# else
# define MAX_PHYSADDR_BITS 44
# define MAX_PHYSMEM_BITS 46
# endif
#endif
#endif /* CONFIG_SPARSEMEM */

View file

@ -87,7 +87,7 @@ static inline void setup_stack_canary_segment(int cpu)
{
#ifdef CONFIG_X86_32
unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu);
struct desc_struct *gdt_table = get_cpu_gdt_table(cpu);
struct desc_struct *gdt_table = get_cpu_gdt_rw(cpu);
struct desc_struct desc;
desc = gdt_table[GDT_ENTRY_STACK_CANARY];

View file

@ -215,7 +215,6 @@ static inline void __flush_tlb_one(unsigned long addr)
/*
* TLB flushing:
*
* - flush_tlb() flushes the current mm struct TLBs
* - flush_tlb_all() flushes all processes TLBs
* - flush_tlb_mm(mm) flushes the specified mm context TLB's
* - flush_tlb_page(vma, vmaddr) flushes one page
@ -247,11 +246,6 @@ static inline void flush_tlb_all(void)
__flush_tlb_all();
}
static inline void flush_tlb(void)
{
__flush_tlb_up();
}
static inline void local_flush_tlb(void)
{
__flush_tlb_up();
@ -313,14 +307,11 @@ static inline void flush_tlb_kernel_range(unsigned long start,
flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
extern void flush_tlb_all(void);
extern void flush_tlb_current_task(void);
extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag);
extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
#define flush_tlb() flush_tlb_current_task()
void native_flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm,
unsigned long start, unsigned long end);

View file

@ -280,13 +280,17 @@ static inline pte_t __pte_ma(pteval_t x)
#define pmd_val_ma(v) ((v).pmd)
#ifdef __PAGETABLE_PUD_FOLDED
#define pud_val_ma(v) ((v).pgd.pgd)
#define pud_val_ma(v) ((v).p4d.pgd.pgd)
#else
#define pud_val_ma(v) ((v).pud)
#endif
#define __pmd_ma(x) ((pmd_t) { (x) } )
#define pgd_val_ma(x) ((x).pgd)
#ifdef __PAGETABLE_P4D_FOLDED
#define p4d_val_ma(x) ((x).pgd.pgd)
#else
#define p4d_val_ma(x) ((x).p4d)
#endif
void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid);

View file

@ -101,7 +101,7 @@ int x86_acpi_suspend_lowlevel(void)
#ifdef CONFIG_SMP
initial_stack = (unsigned long)temp_stack + sizeof(temp_stack);
early_gdt_descr.address =
(unsigned long)get_cpu_gdt_table(smp_processor_id());
(unsigned long)get_cpu_gdt_rw(smp_processor_id());
initial_gs = per_cpu_offset(smp_processor_id());
#endif
initial_code = (unsigned long)wakeup_long64;

View file

@ -609,7 +609,7 @@ static long __apm_bios_call(void *_call)
cpu = get_cpu();
BUG_ON(cpu != 0);
gdt = get_cpu_gdt_table(cpu);
gdt = get_cpu_gdt_rw(cpu);
save_desc_40 = gdt[0x40 / 8];
gdt[0x40 / 8] = bad_bios_desc;
@ -685,7 +685,7 @@ static long __apm_bios_call_simple(void *_call)
cpu = get_cpu();
BUG_ON(cpu != 0);
gdt = get_cpu_gdt_table(cpu);
gdt = get_cpu_gdt_rw(cpu);
save_desc_40 = gdt[0x40 / 8];
gdt[0x40 / 8] = bad_bios_desc;
@ -2352,7 +2352,7 @@ static int __init apm_init(void)
* Note we only set APM segments on CPU zero, since we pin the APM
* code to that CPU.
*/
gdt = get_cpu_gdt_table(0);
gdt = get_cpu_gdt_rw(0);
set_desc_base(&gdt[APM_CS >> 3],
(unsigned long)__va((unsigned long)apm_info.bios.cseg << 4));
set_desc_base(&gdt[APM_CS_16 >> 3],

View file

@ -448,19 +448,60 @@ void load_percpu_segment(int cpu)
load_stack_canary_segment();
}
/* Setup the fixmap mapping only once per-processor */
static inline void setup_fixmap_gdt(int cpu)
{
#ifdef CONFIG_X86_64
/* On 64-bit systems, we use a read-only fixmap GDT. */
pgprot_t prot = PAGE_KERNEL_RO;
#else
/*
* On native 32-bit systems, the GDT cannot be read-only because
* our double fault handler uses a task gate, and entering through
* a task gate needs to change an available TSS to busy. If the GDT
* is read-only, that will triple fault.
*
* On Xen PV, the GDT must be read-only because the hypervisor requires
* it.
*/
pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ?
PAGE_KERNEL_RO : PAGE_KERNEL;
#endif
__set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot);
}
/* Load the original GDT from the per-cpu structure */
void load_direct_gdt(int cpu)
{
struct desc_ptr gdt_descr;
gdt_descr.address = (long)get_cpu_gdt_rw(cpu);
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);
}
EXPORT_SYMBOL_GPL(load_direct_gdt);
/* Load a fixmap remapping of the per-cpu GDT */
void load_fixmap_gdt(int cpu)
{
struct desc_ptr gdt_descr;
gdt_descr.address = (long)get_cpu_gdt_ro(cpu);
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);
}
EXPORT_SYMBOL_GPL(load_fixmap_gdt);
/*
* Current gdt points %fs at the "master" per-cpu area: after this,
* it's on the real one.
*/
void switch_to_new_gdt(int cpu)
{
struct desc_ptr gdt_descr;
gdt_descr.address = (long)get_cpu_gdt_table(cpu);
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);
/* Load the original GDT */
load_direct_gdt(cpu);
/* Reload the per-cpu base */
load_percpu_segment(cpu);
}
@ -1526,6 +1567,9 @@ void cpu_init(void)
if (is_uv_system())
uv_cpu_init();
setup_fixmap_gdt(cpu);
load_fixmap_gdt(cpu);
}
#else
@ -1581,6 +1625,9 @@ void cpu_init(void)
dbg_restore_debug_regs();
fpu__init_cpu();
setup_fixmap_gdt(cpu);
load_fixmap_gdt(cpu);
}
#endif

View file

@ -270,7 +270,6 @@ int __init e820__update_table(struct e820_table *table)
if (table->nr_entries < 2)
return -1;
table->nr_entries = table->nr_entries;
BUG_ON(table->nr_entries > max_nr_entries);
/* Bail out if we find any unreasonable addresses in the map: */

View file

@ -50,11 +50,11 @@
#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE)
/* There is address space for how many espfix pages? */
#define ESPFIX_PAGE_SPACE (1UL << (PGDIR_SHIFT-PAGE_SHIFT-16))
#define ESPFIX_PAGE_SPACE (1UL << (P4D_SHIFT-PAGE_SHIFT-16))
#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE)
#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
# error "Need more than one PGD for the ESPFIX hack"
# error "Need more virtual address space for the ESPFIX hack"
#endif
#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
@ -121,11 +121,13 @@ static void init_espfix_random(void)
void __init init_espfix_bsp(void)
{
pgd_t *pgd_p;
pgd_t *pgd;
p4d_t *p4d;
/* Install the espfix pud into the kernel page directory */
pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
pgd = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
p4d_populate(&init_mm, p4d, espfix_pud_page);
/* Randomize the locations */
init_espfix_random();

View file

@ -103,6 +103,7 @@ static void machine_kexec_page_table_set_one(
pgd_t *pgd, pmd_t *pmd, pte_t *pte,
unsigned long vaddr, unsigned long paddr)
{
p4d_t *p4d;
pud_t *pud;
pgd += pgd_index(vaddr);
@ -110,7 +111,8 @@ static void machine_kexec_page_table_set_one(
if (!(pgd_val(*pgd) & _PAGE_PRESENT))
set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT));
#endif
pud = pud_offset(pgd, vaddr);
p4d = p4d_offset(pgd, vaddr);
pud = pud_offset(p4d, vaddr);
pmd = pmd_offset(pud, vaddr);
if (!(pmd_val(*pmd) & _PAGE_PRESENT))
set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));

View file

@ -36,6 +36,7 @@ static struct kexec_file_ops *kexec_file_loaders[] = {
static void free_transition_pgtable(struct kimage *image)
{
free_page((unsigned long)image->arch.p4d);
free_page((unsigned long)image->arch.pud);
free_page((unsigned long)image->arch.pmd);
free_page((unsigned long)image->arch.pte);
@ -43,6 +44,7 @@ static void free_transition_pgtable(struct kimage *image)
static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
{
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@ -53,13 +55,21 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
pgd += pgd_index(vaddr);
if (!pgd_present(*pgd)) {
p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
if (!p4d)
goto err;
image->arch.p4d = p4d;
set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
}
p4d = p4d_offset(pgd, vaddr);
if (!p4d_present(*p4d)) {
pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
if (!pud)
goto err;
image->arch.pud = pud;
set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
}
pud = pud_offset(pgd, vaddr);
pud = pud_offset(p4d, vaddr);
if (!pud_present(*pud)) {
pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
if (!pmd)

View file

@ -405,9 +405,11 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
.alloc_pte = paravirt_nop,
.alloc_pmd = paravirt_nop,
.alloc_pud = paravirt_nop,
.alloc_p4d = paravirt_nop,
.release_pte = paravirt_nop,
.release_pmd = paravirt_nop,
.release_pud = paravirt_nop,
.release_p4d = paravirt_nop,
.set_pte = native_set_pte,
.set_pte_at = native_set_pte_at,
@ -430,12 +432,19 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
.pmd_val = PTE_IDENT,
.make_pmd = PTE_IDENT,
#if CONFIG_PGTABLE_LEVELS == 4
#if CONFIG_PGTABLE_LEVELS >= 4
.pud_val = PTE_IDENT,
.make_pud = PTE_IDENT,
.set_p4d = native_set_p4d,
#if CONFIG_PGTABLE_LEVELS >= 5
.p4d_val = PTE_IDENT,
.make_p4d = PTE_IDENT,
.set_pgd = native_set_pgd,
#endif
#endif /* CONFIG_PGTABLE_LEVELS >= 5 */
#endif /* CONFIG_PGTABLE_LEVELS >= 4 */
#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
.pte_val = PTE_IDENT,

View file

@ -53,6 +53,11 @@
#include <asm/xen/hypervisor.h>
#include <asm/vdso.h>
#include <asm/intel_rdt.h>
#include <asm/unistd.h>
#ifdef CONFIG_IA32_EMULATION
/* Not included via unistd.h */
#include <asm/unistd_32_ia32.h>
#endif
__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
@ -494,6 +499,8 @@ void set_personality_64bit(void)
clear_thread_flag(TIF_IA32);
clear_thread_flag(TIF_ADDR32);
clear_thread_flag(TIF_X32);
/* Pretend that this comes from a 64bit execve */
task_pt_regs(current)->orig_ax = __NR_execve;
/* Ensure the corresponding mm is not marked. */
if (current->mm)
@ -506,32 +513,50 @@ void set_personality_64bit(void)
current->personality &= ~READ_IMPLIES_EXEC;
}
static void __set_personality_x32(void)
{
#ifdef CONFIG_X86_X32
clear_thread_flag(TIF_IA32);
set_thread_flag(TIF_X32);
if (current->mm)
current->mm->context.ia32_compat = TIF_X32;
current->personality &= ~READ_IMPLIES_EXEC;
/*
* in_compat_syscall() uses the presence of the x32 syscall bit
* flag to determine compat status. The x86 mmap() code relies on
* the syscall bitness so set x32 syscall bit right here to make
* in_compat_syscall() work during exec().
*
* Pretend to come from a x32 execve.
*/
task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
current->thread.status &= ~TS_COMPAT;
#endif
}
static void __set_personality_ia32(void)
{
#ifdef CONFIG_IA32_EMULATION
set_thread_flag(TIF_IA32);
clear_thread_flag(TIF_X32);
if (current->mm)
current->mm->context.ia32_compat = TIF_IA32;
current->personality |= force_personality32;
/* Prepare the first "return" to user space */
task_pt_regs(current)->orig_ax = __NR_ia32_execve;
current->thread.status |= TS_COMPAT;
#endif
}
void set_personality_ia32(bool x32)
{
/* inherit personality from parent */
/* Make sure to be in 32bit mode */
set_thread_flag(TIF_ADDR32);
/* Mark the associated mm as containing 32-bit tasks. */
if (x32) {
clear_thread_flag(TIF_IA32);
set_thread_flag(TIF_X32);
if (current->mm)
current->mm->context.ia32_compat = TIF_X32;
current->personality &= ~READ_IMPLIES_EXEC;
/* in_compat_syscall() uses the presence of the x32
syscall bit flag to determine compat status */
current->thread.status &= ~TS_COMPAT;
} else {
set_thread_flag(TIF_IA32);
clear_thread_flag(TIF_X32);
if (current->mm)
current->mm->context.ia32_compat = TIF_IA32;
current->personality |= force_personality32;
/* Prepare the first "return" to user space */
current->thread.status |= TS_COMPAT;
}
if (x32)
__set_personality_x32();
else
__set_personality_ia32();
}
EXPORT_SYMBOL_GPL(set_personality_ia32);

View file

@ -1225,21 +1225,6 @@ void __init setup_arch(char **cmdline_p)
kasan_init();
#ifdef CONFIG_X86_32
/* sync back kernel address range */
clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
swapper_pg_dir + KERNEL_PGD_BOUNDARY,
KERNEL_PGD_PTRS);
/*
* sync back low identity map too. It is used for example
* in the 32-bit EFI stub.
*/
clone_pgd_range(initial_page_table,
swapper_pg_dir + KERNEL_PGD_BOUNDARY,
min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
#endif
tboot_probe();
map_vsyscall();

View file

@ -160,7 +160,7 @@ static inline void setup_percpu_segment(int cpu)
pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
0x2 | DESCTYPE_S, 0x8);
gdt.s = 1;
write_gdt_entry(get_cpu_gdt_table(cpu),
write_gdt_entry(get_cpu_gdt_rw(cpu),
GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
#endif
}
@ -288,4 +288,25 @@ void __init setup_per_cpu_areas(void)
/* Setup cpu initialized, callin, callout masks */
setup_cpu_local_masks();
#ifdef CONFIG_X86_32
/*
* Sync back kernel address range. We want to make sure that
* all kernel mappings, including percpu mappings, are available
* in the smpboot asm. We can't reliably pick up percpu
* mappings using vmalloc_fault(), because exception dispatch
* needs percpu data.
*/
clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
swapper_pg_dir + KERNEL_PGD_BOUNDARY,
KERNEL_PGD_PTRS);
/*
* sync back low identity map too. It is used for example
* in the 32-bit EFI stub.
*/
clone_pgd_range(initial_page_table,
swapper_pg_dir + KERNEL_PGD_BOUNDARY,
min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
#endif
}

View file

@ -983,7 +983,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
unsigned long timeout;
idle->thread.sp = (unsigned long)task_pt_regs(idle);
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
initial_code = (unsigned long)start_secondary;
initial_stack = idle->thread.sp;

View file

@ -17,6 +17,8 @@
#include <linux/uaccess.h>
#include <linux/elf.h>
#include <asm/elf.h>
#include <asm/compat.h>
#include <asm/ia32.h>
#include <asm/syscalls.h>
@ -101,7 +103,7 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
static void find_start_end(unsigned long flags, unsigned long *begin,
unsigned long *end)
{
if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) {
if (!in_compat_syscall() && (flags & MAP_32BIT)) {
/* This is usually used needed to map code in small
model, so it needs to be in the first 31bit. Limit
it to that. This means we need to move the
@ -114,10 +116,11 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
if (current->flags & PF_RANDOMIZE) {
*begin = randomize_page(*begin, 0x02000000);
}
} else {
*begin = current->mm->mmap_legacy_base;
*end = TASK_SIZE;
return;
}
*begin = get_mmap_base(1);
*end = in_compat_syscall() ? tasksize_32bit() : tasksize_64bit();
}
unsigned long
@ -176,7 +179,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
return addr;
/* for MAP_32BIT mappings we force the legacy mmap base */
if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT))
if (!in_compat_syscall() && (flags & MAP_32BIT))
goto bottomup;
/* requesting a specific address */
@ -191,7 +194,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
info.low_limit = PAGE_SIZE;
info.high_limit = mm->mmap_base;
info.high_limit = get_mmap_base(0);
info.align_mask = 0;
info.align_offset = pgoff << PAGE_SHIFT;
if (filp) {

View file

@ -118,12 +118,16 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
pgprot_t prot)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
pgd = pgd_offset(&tboot_mm, vaddr);
pud = pud_alloc(&tboot_mm, pgd, vaddr);
p4d = p4d_alloc(&tboot_mm, pgd, vaddr);
if (!p4d)
return -1;
pud = pud_alloc(&tboot_mm, p4d, vaddr);
if (!pud)
return -1;
pmd = pmd_alloc(&tboot_mm, pud, vaddr);

View file

@ -92,10 +92,17 @@ static void set_tls_desc(struct task_struct *p, int idx,
cpu = get_cpu();
while (n-- > 0) {
if (LDT_empty(info) || LDT_zero(info))
if (LDT_empty(info) || LDT_zero(info)) {
desc->a = desc->b = 0;
else
} else {
fill_ldt(desc, info);
/*
* Always set the accessed bit so that the CPU
* doesn't try to write to the (read-only) GDT.
*/
desc->type |= 1;
}
++info;
++desc;
}

View file

@ -164,6 +164,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
struct vm_area_struct *vma;
spinlock_t *ptl;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@ -173,7 +174,10 @@ static void mark_screen_rdonly(struct mm_struct *mm)
pgd = pgd_offset(mm, 0xA0000);
if (pgd_none_or_clear_bad(pgd))
goto out;
pud = pud_offset(pgd, 0xA0000);
p4d = p4d_offset(pgd, 0xA0000);
if (p4d_none_or_clear_bad(p4d))
goto out;
pud = pud_offset(p4d, 0xA0000);
if (pud_none_or_clear_bad(pud))
goto out;
pmd = pmd_offset(pud, 0xA0000);
@ -193,7 +197,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
pte_unmap_unlock(pte, ptl);
out:
up_write(&mm->mmap_sem);
flush_tlb();
flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, 0UL);
}

View file

@ -741,7 +741,6 @@ static int svm_hardware_enable(void)
struct svm_cpu_data *sd;
uint64_t efer;
struct desc_ptr gdt_descr;
struct desc_struct *gdt;
int me = raw_smp_processor_id();
@ -763,8 +762,7 @@ static int svm_hardware_enable(void)
sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
sd->next_asid = sd->max_asid + 1;
native_store_gdt(&gdt_descr);
gdt = (struct desc_struct *)gdt_descr.address;
gdt = get_current_gdt_rw();
sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
wrmsrl(MSR_EFER, efer | EFER_SVME);

View file

@ -935,7 +935,6 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
* when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
*/
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
/*
* We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
@ -2057,14 +2056,13 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
*/
static unsigned long segment_base(u16 selector)
{
struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
struct desc_struct *table;
unsigned long v;
if (!(selector & ~SEGMENT_RPL_MASK))
return 0;
table = (struct desc_struct *)gdt->address;
table = get_current_gdt_ro();
if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
u16 ldt_selector = kvm_read_ldt();
@ -2169,7 +2167,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
#endif
if (vmx->host_state.msr_host_bndcfgs)
wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
load_gdt(this_cpu_ptr(&host_gdt));
load_fixmap_gdt(raw_smp_processor_id());
}
static void vmx_load_host_state(struct vcpu_vmx *vmx)
@ -2271,7 +2269,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
}
if (!already_loaded) {
struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
void *gdt = get_current_gdt_ro();
unsigned long sysenter_esp;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
@ -2282,7 +2280,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
*/
vmcs_writel(HOST_TR_BASE,
(unsigned long)this_cpu_ptr(&cpu_tss));
vmcs_writel(HOST_GDTR_BASE, gdt->address);
vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
/*
* VM exits change the host TR limit to 0x67 after a VM
@ -3471,8 +3469,6 @@ static int hardware_enable(void)
ept_sync_global();
}
native_store_gdt(this_cpu_ptr(&host_gdt));
return 0;
}

View file

@ -110,7 +110,8 @@ static struct addr_marker address_markers[] = {
#define PTE_LEVEL_MULT (PAGE_SIZE)
#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
#define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
#define PGD_LEVEL_MULT (PTRS_PER_P4D * P4D_LEVEL_MULT)
#define pt_dump_seq_printf(m, to_dmesg, fmt, args...) \
({ \
@ -286,14 +287,13 @@ static void note_page(struct seq_file *m, struct pg_state *st,
}
}
static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
unsigned long P)
static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, unsigned long P)
{
int i;
pte_t *start;
pgprotval_t prot;
start = (pte_t *) pmd_page_vaddr(addr);
start = (pte_t *)pmd_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PTE; i++) {
prot = pte_flags(*start);
st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
@ -304,14 +304,13 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
#if PTRS_PER_PMD > 1
static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
unsigned long P)
static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P)
{
int i;
pmd_t *start;
pgprotval_t prot;
start = (pmd_t *) pud_page_vaddr(addr);
start = (pmd_t *)pud_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PMD; i++) {
st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
if (!pmd_none(*start)) {
@ -347,15 +346,14 @@ static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx)
return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud));
}
static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
unsigned long P)
static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P)
{
int i;
pud_t *start;
pgprotval_t prot;
pud_t *prev_pud = NULL;
start = (pud_t *) pgd_page_vaddr(addr);
start = (pud_t *)p4d_page_vaddr(addr);
for (i = 0; i < PTRS_PER_PUD; i++) {
st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
@ -377,9 +375,42 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
}
#else
#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p)
#define pgd_large(a) pud_large(__pud(pgd_val(a)))
#define pgd_none(a) pud_none(__pud(pgd_val(a)))
#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(p4d_val(a)),p)
#define p4d_large(a) pud_large(__pud(p4d_val(a)))
#define p4d_none(a) pud_none(__pud(p4d_val(a)))
#endif
#if PTRS_PER_P4D > 1
static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P)
{
int i;
p4d_t *start;
pgprotval_t prot;
start = (p4d_t *)pgd_page_vaddr(addr);
for (i = 0; i < PTRS_PER_P4D; i++) {
st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
if (!p4d_none(*start)) {
if (p4d_large(*start) || !p4d_present(*start)) {
prot = p4d_flags(*start);
note_page(m, st, __pgprot(prot), 2);
} else {
walk_pud_level(m, st, *start,
P + i * P4D_LEVEL_MULT);
}
} else
note_page(m, st, __pgprot(0), 2);
start++;
}
}
#else
#define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p)
#define pgd_large(a) p4d_large(__p4d(pgd_val(a)))
#define pgd_none(a) p4d_none(__p4d(pgd_val(a)))
#endif
static inline bool is_hypervisor_range(int idx)
@ -424,7 +455,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
prot = pgd_flags(*start);
note_page(m, &st, __pgprot(prot), 1);
} else {
walk_pud_level(m, &st, *start,
walk_p4d_level(m, &st, *start,
i * PGD_LEVEL_MULT);
}
} else

View file

@ -253,6 +253,7 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
{
unsigned index = pgd_index(address);
pgd_t *pgd_k;
p4d_t *p4d, *p4d_k;
pud_t *pud, *pud_k;
pmd_t *pmd, *pmd_k;
@ -265,10 +266,15 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
/*
* set_pgd(pgd, *pgd_k); here would be useless on PAE
* and redundant with the set_pmd() on non-PAE. As would
* set_pud.
* set_p4d/set_pud.
*/
pud = pud_offset(pgd, address);
pud_k = pud_offset(pgd_k, address);
p4d = p4d_offset(pgd, address);
p4d_k = p4d_offset(pgd_k, address);
if (!p4d_present(*p4d_k))
return NULL;
pud = pud_offset(p4d, address);
pud_k = pud_offset(p4d_k, address);
if (!pud_present(*pud_k))
return NULL;
@ -384,6 +390,8 @@ static void dump_pagetable(unsigned long address)
{
pgd_t *base = __va(read_cr3());
pgd_t *pgd = &base[pgd_index(address)];
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@ -392,7 +400,9 @@ static void dump_pagetable(unsigned long address)
if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
goto out;
#endif
pmd = pmd_offset(pud_offset(pgd, address), address);
p4d = p4d_offset(pgd, address);
pud = pud_offset(p4d, address);
pmd = pmd_offset(pud, address);
printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
/*
@ -425,6 +435,7 @@ void vmalloc_sync_all(void)
static noinline int vmalloc_fault(unsigned long address)
{
pgd_t *pgd, *pgd_ref;
p4d_t *p4d, *p4d_ref;
pud_t *pud, *pud_ref;
pmd_t *pmd, *pmd_ref;
pte_t *pte, *pte_ref;
@ -448,17 +459,37 @@ static noinline int vmalloc_fault(unsigned long address)
if (pgd_none(*pgd)) {
set_pgd(pgd, *pgd_ref);
arch_flush_lazy_mmu_mode();
} else {
} else if (CONFIG_PGTABLE_LEVELS > 4) {
/*
* With folded p4d, pgd_none() is always false, so the pgd may
* point to an empty page table entry and pgd_page_vaddr()
* will return garbage.
*
* We will do the correct sanity check on the p4d level.
*/
BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
}
/* With 4-level paging, copying happens on the p4d level. */
p4d = p4d_offset(pgd, address);
p4d_ref = p4d_offset(pgd_ref, address);
if (p4d_none(*p4d_ref))
return -1;
if (p4d_none(*p4d)) {
set_p4d(p4d, *p4d_ref);
arch_flush_lazy_mmu_mode();
} else {
BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_ref));
}
/*
* Below here mismatches are bugs because these lower tables
* are shared:
*/
pud = pud_offset(pgd, address);
pud_ref = pud_offset(pgd_ref, address);
pud = pud_offset(p4d, address);
pud_ref = pud_offset(p4d_ref, address);
if (pud_none(*pud_ref))
return -1;
@ -526,6 +557,7 @@ static void dump_pagetable(unsigned long address)
{
pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
pgd_t *pgd = base + pgd_index(address);
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@ -538,7 +570,15 @@ static void dump_pagetable(unsigned long address)
if (!pgd_present(*pgd))
goto out;
pud = pud_offset(pgd, address);
p4d = p4d_offset(pgd, address);
if (bad_address(p4d))
goto bad;
printk("P4D %lx ", p4d_val(*p4d));
if (!p4d_present(*p4d) || p4d_large(*p4d))
goto out;
pud = pud_offset(p4d, address);
if (bad_address(pud))
goto bad;
@ -1082,6 +1122,7 @@ static noinline int
spurious_fault(unsigned long error_code, unsigned long address)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@ -1104,7 +1145,14 @@ spurious_fault(unsigned long error_code, unsigned long address)
if (!pgd_present(*pgd))
return 0;
pud = pud_offset(pgd, address);
p4d = p4d_offset(pgd, address);
if (!p4d_present(*p4d))
return 0;
if (p4d_large(*p4d))
return spurious_fault_check(error_code, (pte_t *) p4d);
pud = pud_offset(p4d, address);
if (!pud_present(*pud))
return 0;

View file

@ -76,9 +76,9 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
}
/*
* 'pteval' can come from a pte, pmd or pud. We only check
* 'pteval' can come from a pte, pmd, pud or p4d. We only check
* _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
* same value on all 3 types.
* same value on all 4 types.
*/
static inline int pte_allows_gup(unsigned long pteval, int write)
{
@ -295,13 +295,13 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
return 1;
}
static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
{
unsigned long next;
pud_t *pudp;
pudp = pud_offset(&pgd, addr);
pudp = pud_offset(&p4d, addr);
do {
pud_t pud = *pudp;
@ -320,6 +320,27 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
return 1;
}
static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
{
unsigned long next;
p4d_t *p4dp;
p4dp = p4d_offset(&pgd, addr);
do {
p4d_t p4d = *p4dp;
next = p4d_addr_end(addr, end);
if (p4d_none(p4d))
return 0;
BUILD_BUG_ON(p4d_large(p4d));
if (!gup_pud_range(p4d, addr, next, write, pages, nr))
return 0;
} while (p4dp++, addr = next, addr != end);
return 1;
}
/*
* Like get_user_pages_fast() except its IRQ-safe in that it won't fall
* back to the regular GUP.
@ -368,7 +389,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
break;
if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
break;
} while (pgdp++, addr = next, addr != end);
local_irq_restore(flags);
@ -440,7 +461,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
goto slow;
if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
goto slow;
} while (pgdp++, addr = next, addr != end);
local_irq_enable();

View file

@ -12,10 +12,12 @@
#include <linux/pagemap.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <linux/compat.h>
#include <asm/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
#include <asm/elf.h>
#if 0 /* This is just for testing */
struct page *
@ -82,8 +84,9 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
info.flags = 0;
info.length = len;
info.low_limit = current->mm->mmap_legacy_base;
info.high_limit = TASK_SIZE;
info.low_limit = get_mmap_base(1);
info.high_limit = in_compat_syscall() ?
tasksize_32bit() : tasksize_64bit();
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
return vm_unmapped_area(&info);
@ -100,7 +103,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
info.low_limit = PAGE_SIZE;
info.high_limit = current->mm->mmap_base;
info.high_limit = get_mmap_base(0);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
addr = vm_unmapped_area(&info);

View file

@ -45,6 +45,34 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
return 0;
}
static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
unsigned long addr, unsigned long end)
{
unsigned long next;
for (; addr < end; addr = next) {
p4d_t *p4d = p4d_page + p4d_index(addr);
pud_t *pud;
next = (addr & P4D_MASK) + P4D_SIZE;
if (next > end)
next = end;
if (p4d_present(*p4d)) {
pud = pud_offset(p4d, 0);
ident_pud_init(info, pud, addr, next);
continue;
}
pud = (pud_t *)info->alloc_pgt_page(info->context);
if (!pud)
return -ENOMEM;
ident_pud_init(info, pud, addr, next);
set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
}
return 0;
}
int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
unsigned long pstart, unsigned long pend)
{
@ -55,27 +83,36 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
for (; addr < end; addr = next) {
pgd_t *pgd = pgd_page + pgd_index(addr);
pud_t *pud;
p4d_t *p4d;
next = (addr & PGDIR_MASK) + PGDIR_SIZE;
if (next > end)
next = end;
if (pgd_present(*pgd)) {
pud = pud_offset(pgd, 0);
result = ident_pud_init(info, pud, addr, next);
p4d = p4d_offset(pgd, 0);
result = ident_p4d_init(info, p4d, addr, next);
if (result)
return result;
continue;
}
pud = (pud_t *)info->alloc_pgt_page(info->context);
if (!pud)
p4d = (p4d_t *)info->alloc_pgt_page(info->context);
if (!p4d)
return -ENOMEM;
result = ident_pud_init(info, pud, addr, next);
result = ident_p4d_init(info, p4d, addr, next);
if (result)
return result;
set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
} else {
/*
* With p4d folded, pgd is equal to p4d.
* The pgd entry has to point to the pud page table in this case.
*/
pud_t *pud = pud_offset(p4d, 0);
set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
}
}
return 0;

View file

@ -56,8 +56,6 @@
unsigned long highstart_pfn, highend_pfn;
static noinline int do_test_wp_bit(void);
bool __read_mostly __vmalloc_start_set = false;
/*
@ -67,6 +65,7 @@ bool __read_mostly __vmalloc_start_set = false;
*/
static pmd_t * __init one_md_table_init(pgd_t *pgd)
{
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd_table;
@ -75,13 +74,15 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
pmd_table = (pmd_t *)alloc_low_page();
paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
pud = pud_offset(pgd, 0);
p4d = p4d_offset(pgd, 0);
pud = pud_offset(p4d, 0);
BUG_ON(pmd_table != pmd_offset(pud, 0));
return pmd_table;
}
#endif
pud = pud_offset(pgd, 0);
p4d = p4d_offset(pgd, 0);
pud = pud_offset(p4d, 0);
pmd_table = pmd_offset(pud, 0);
return pmd_table;
@ -390,8 +391,11 @@ pte_t *kmap_pte;
static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr)
{
return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr),
vaddr), vaddr), vaddr);
pgd_t *pgd = pgd_offset_k(vaddr);
p4d_t *p4d = p4d_offset(pgd, vaddr);
pud_t *pud = pud_offset(p4d, vaddr);
pmd_t *pmd = pmd_offset(pud, vaddr);
return pte_offset_kernel(pmd, vaddr);
}
static void __init kmap_init(void)
@ -410,6 +414,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
{
unsigned long vaddr;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@ -418,7 +423,8 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base);
pgd = swapper_pg_dir + pgd_index(vaddr);
pud = pud_offset(pgd, vaddr);
p4d = p4d_offset(pgd, vaddr);
pud = pud_offset(p4d, vaddr);
pmd = pmd_offset(pud, vaddr);
pte = pte_offset_kernel(pmd, vaddr);
pkmap_page_table = pte;
@ -450,6 +456,7 @@ void __init native_pagetable_init(void)
{
unsigned long pfn, va;
pgd_t *pgd, *base = swapper_pg_dir;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@ -469,7 +476,8 @@ void __init native_pagetable_init(void)
if (!pgd_present(*pgd))
break;
pud = pud_offset(pgd, va);
p4d = p4d_offset(pgd, va);
pud = pud_offset(p4d, va);
pmd = pmd_offset(pud, va);
if (!pmd_present(*pmd))
break;
@ -716,22 +724,20 @@ void __init paging_init(void)
*/
static void __init test_wp_bit(void)
{
int wp_works_ok;
char z = 0;
printk(KERN_INFO
"Checking if this processor honours the WP bit even in supervisor mode...");
printk(KERN_INFO "Checking if this processor honours the WP bit even in supervisor mode...");
/* Any page-aligned address will do, the test is non-destructive */
__set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_KERNEL_RO);
wp_works_ok = do_test_wp_bit();
clear_fixmap(FIX_WP_TEST);
__set_fixmap(FIX_WP_TEST, __pa_symbol(empty_zero_page), PAGE_KERNEL_RO);
if (!wp_works_ok) {
printk(KERN_CONT "No.\n");
panic("Linux doesn't support CPUs with broken WP.");
} else {
if (probe_kernel_write((char *)fix_to_virt(FIX_WP_TEST), &z, 1)) {
clear_fixmap(FIX_WP_TEST);
printk(KERN_CONT "Ok.\n");
return;
}
printk(KERN_CONT "No.\n");
panic("Linux doesn't support CPUs with broken WP.");
}
void __init mem_init(void)
@ -841,30 +847,6 @@ int arch_remove_memory(u64 start, u64 size)
#endif
#endif
/*
* This function cannot be __init, since exceptions don't work in that
* section. Put this after the callers, so that it cannot be inlined.
*/
static noinline int do_test_wp_bit(void)
{
char tmp_reg;
int flag;
__asm__ __volatile__(
" movb %0, %1 \n"
"1: movb %1, %0 \n"
" xorl %2, %2 \n"
"2: \n"
_ASM_EXTABLE(1b,2b)
:"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
"=q" (tmp_reg),
"=r" (flag)
:"2" (1)
:"memory");
return flag;
}
int kernel_set_to_readonly __read_mostly;
void set_kernel_text_rw(void)

View file

@ -97,28 +97,38 @@ void sync_global_pgds(unsigned long start, unsigned long end)
unsigned long address;
for (address = start; address <= end; address += PGDIR_SIZE) {
const pgd_t *pgd_ref = pgd_offset_k(address);
pgd_t *pgd_ref = pgd_offset_k(address);
const p4d_t *p4d_ref;
struct page *page;
if (pgd_none(*pgd_ref))
/*
* With folded p4d, pgd_none() is always false, we need to
* handle synchonization on p4d level.
*/
BUILD_BUG_ON(pgd_none(*pgd_ref));
p4d_ref = p4d_offset(pgd_ref, address);
if (p4d_none(*p4d_ref))
continue;
spin_lock(&pgd_lock);
list_for_each_entry(page, &pgd_list, lru) {
pgd_t *pgd;
p4d_t *p4d;
spinlock_t *pgt_lock;
pgd = (pgd_t *)page_address(page) + pgd_index(address);
p4d = p4d_offset(pgd, address);
/* the pgt_lock only for Xen */
pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
spin_lock(pgt_lock);
if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
BUG_ON(pgd_page_vaddr(*pgd)
!= pgd_page_vaddr(*pgd_ref));
if (!p4d_none(*p4d_ref) && !p4d_none(*p4d))
BUG_ON(p4d_page_vaddr(*p4d)
!= p4d_page_vaddr(*p4d_ref));
if (pgd_none(*pgd))
set_pgd(pgd, *pgd_ref);
if (p4d_none(*p4d))
set_p4d(p4d, *p4d_ref);
spin_unlock(pgt_lock);
}
@ -149,16 +159,28 @@ static __ref void *spp_getpage(void)
return ptr;
}
static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
static p4d_t *fill_p4d(pgd_t *pgd, unsigned long vaddr)
{
if (pgd_none(*pgd)) {
pud_t *pud = (pud_t *)spp_getpage();
pgd_populate(&init_mm, pgd, pud);
if (pud != pud_offset(pgd, 0))
p4d_t *p4d = (p4d_t *)spp_getpage();
pgd_populate(&init_mm, pgd, p4d);
if (p4d != p4d_offset(pgd, 0))
printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
pud, pud_offset(pgd, 0));
p4d, p4d_offset(pgd, 0));
}
return pud_offset(pgd, vaddr);
return p4d_offset(pgd, vaddr);
}
static pud_t *fill_pud(p4d_t *p4d, unsigned long vaddr)
{
if (p4d_none(*p4d)) {
pud_t *pud = (pud_t *)spp_getpage();
p4d_populate(&init_mm, p4d, pud);
if (pud != pud_offset(p4d, 0))
printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
pud, pud_offset(p4d, 0));
}
return pud_offset(p4d, vaddr);
}
static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
@ -167,7 +189,7 @@ static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
pmd_t *pmd = (pmd_t *) spp_getpage();
pud_populate(&init_mm, pud, pmd);
if (pmd != pmd_offset(pud, 0))
printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
printk(KERN_ERR "PAGETABLE BUG #02! %p <-> %p\n",
pmd, pmd_offset(pud, 0));
}
return pmd_offset(pud, vaddr);
@ -179,20 +201,15 @@ static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
pte_t *pte = (pte_t *) spp_getpage();
pmd_populate_kernel(&init_mm, pmd, pte);
if (pte != pte_offset_kernel(pmd, 0))
printk(KERN_ERR "PAGETABLE BUG #02!\n");
printk(KERN_ERR "PAGETABLE BUG #03!\n");
}
return pte_offset_kernel(pmd, vaddr);
}
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte)
{
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
pud = pud_page + pud_index(vaddr);
pmd = fill_pmd(pud, vaddr);
pte = fill_pte(pmd, vaddr);
pmd_t *pmd = fill_pmd(pud, vaddr);
pte_t *pte = fill_pte(pmd, vaddr);
set_pte(pte, new_pte);
@ -203,10 +220,25 @@ void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
__flush_tlb_one(vaddr);
}
void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte)
{
p4d_t *p4d = p4d_page + p4d_index(vaddr);
pud_t *pud = fill_pud(p4d, vaddr);
__set_pte_vaddr(pud, vaddr, new_pte);
}
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
pud_t *pud = pud_page + pud_index(vaddr);
__set_pte_vaddr(pud, vaddr, new_pte);
}
void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
pgd_t *pgd;
pud_t *pud_page;
p4d_t *p4d_page;
pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));
@ -216,17 +248,20 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
"PGD FIXMAP MISSING, it should be setup in head.S!\n");
return;
}
pud_page = (pud_t*)pgd_page_vaddr(*pgd);
set_pte_vaddr_pud(pud_page, vaddr, pteval);
p4d_page = p4d_offset(pgd, 0);
set_pte_vaddr_p4d(p4d_page, vaddr, pteval);
}
pmd_t * __init populate_extra_pmd(unsigned long vaddr)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pgd = pgd_offset_k(vaddr);
pud = fill_pud(pgd, vaddr);
p4d = fill_p4d(pgd, vaddr);
pud = fill_pud(p4d, vaddr);
return fill_pmd(pud, vaddr);
}
@ -245,6 +280,7 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
enum page_cache_mode cache)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pgprot_t prot;
@ -255,11 +291,17 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
pgd = pgd_offset_k((unsigned long)__va(phys));
if (pgd_none(*pgd)) {
pud = (pud_t *) spp_getpage();
set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
p4d = (p4d_t *) spp_getpage();
set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE |
_PAGE_USER));
}
pud = pud_offset(pgd, (unsigned long)__va(phys));
p4d = p4d_offset(pgd, (unsigned long)__va(phys));
if (p4d_none(*p4d)) {
pud = (pud_t *) spp_getpage();
set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE |
_PAGE_USER));
}
pud = pud_offset(p4d, (unsigned long)__va(phys));
if (pud_none(*pud)) {
pmd = (pmd_t *) spp_getpage();
set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
@ -563,12 +605,15 @@ kernel_physical_mapping_init(unsigned long paddr_start,
for (; vaddr < vaddr_end; vaddr = vaddr_next) {
pgd_t *pgd = pgd_offset_k(vaddr);
p4d_t *p4d;
pud_t *pud;
vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;
if (pgd_val(*pgd)) {
pud = (pud_t *)pgd_page_vaddr(*pgd);
BUILD_BUG_ON(pgd_none(*pgd));
p4d = p4d_offset(pgd, vaddr);
if (p4d_val(*p4d)) {
pud = (pud_t *)p4d_page_vaddr(*p4d);
paddr_last = phys_pud_init(pud, __pa(vaddr),
__pa(vaddr_end),
page_size_mask);
@ -580,7 +625,7 @@ kernel_physical_mapping_init(unsigned long paddr_start,
page_size_mask);
spin_lock(&init_mm.page_table_lock);
pgd_populate(&init_mm, pgd, pud);
p4d_populate(&init_mm, p4d, pud);
spin_unlock(&init_mm.page_table_lock);
pgd_changed = true;
}
@ -726,6 +771,24 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
spin_unlock(&init_mm.page_table_lock);
}
static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
{
pud_t *pud;
int i;
for (i = 0; i < PTRS_PER_PUD; i++) {
pud = pud_start + i;
if (!pud_none(*pud))
return;
}
/* free a pud talbe */
free_pagetable(p4d_page(*p4d), 0);
spin_lock(&init_mm.page_table_lock);
p4d_clear(p4d);
spin_unlock(&init_mm.page_table_lock);
}
static void __meminit
remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
bool direct)
@ -899,7 +962,7 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
continue;
}
pmd_base = (pmd_t *)pud_page_vaddr(*pud);
pmd_base = pmd_offset(pud, 0);
remove_pmd_table(pmd_base, addr, next, direct);
free_pmd_table(pmd_base, pud);
}
@ -908,6 +971,32 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end,
update_page_count(PG_LEVEL_1G, -pages);
}
static void __meminit
remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end,
bool direct)
{
unsigned long next, pages = 0;
pud_t *pud_base;
p4d_t *p4d;
p4d = p4d_start + p4d_index(addr);
for (; addr < end; addr = next, p4d++) {
next = p4d_addr_end(addr, end);
if (!p4d_present(*p4d))
continue;
BUILD_BUG_ON(p4d_large(*p4d));
pud_base = pud_offset(p4d, 0);
remove_pud_table(pud_base, addr, next, direct);
free_pud_table(pud_base, p4d);
}
if (direct)
update_page_count(PG_LEVEL_512G, -pages);
}
/* start and end are both virtual address. */
static void __meminit
remove_pagetable(unsigned long start, unsigned long end, bool direct)
@ -915,7 +1004,7 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct)
unsigned long next;
unsigned long addr;
pgd_t *pgd;
pud_t *pud;
p4d_t *p4d;
for (addr = start; addr < end; addr = next) {
next = pgd_addr_end(addr, end);
@ -924,8 +1013,8 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct)
if (!pgd_present(*pgd))
continue;
pud = (pud_t *)pgd_page_vaddr(*pgd);
remove_pud_table(pud, addr, next, direct);
p4d = p4d_offset(pgd, 0);
remove_p4d_table(p4d, addr, next, direct);
}
flush_tlb_all();
@ -1090,6 +1179,7 @@ int kern_addr_valid(unsigned long addr)
{
unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@ -1101,7 +1191,11 @@ int kern_addr_valid(unsigned long addr)
if (pgd_none(*pgd))
return 0;
pud = pud_offset(pgd, addr);
p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d))
return 0;
pud = pud_offset(p4d, addr);
if (pud_none(*pud))
return 0;
@ -1158,6 +1252,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
unsigned long addr;
unsigned long next;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
@ -1168,7 +1263,11 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
if (!pgd)
return -ENOMEM;
pud = vmemmap_pud_populate(pgd, addr, node);
p4d = vmemmap_p4d_populate(pgd, addr, node);
if (!p4d)
return -ENOMEM;
pud = vmemmap_pud_populate(p4d, addr, node);
if (!pud)
return -ENOMEM;
@ -1236,6 +1335,7 @@ void register_page_bootmem_memmap(unsigned long section_nr,
unsigned long end = (unsigned long)(start_page + size);
unsigned long next;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
unsigned int nr_pages;
@ -1251,7 +1351,14 @@ void register_page_bootmem_memmap(unsigned long section_nr,
}
get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO);
pud = pud_offset(pgd, addr);
p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d)) {
next = (addr + PAGE_SIZE) & PAGE_MASK;
continue;
}
get_page_bootmem(section_nr, p4d_page(*p4d), MIX_SECTION_INFO);
pud = pud_offset(p4d, addr);
if (pud_none(*pud)) {
next = (addr + PAGE_SIZE) & PAGE_MASK;
continue;

View file

@ -426,7 +426,8 @@ static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
/* Don't assume we're using swapper_pg_dir at this point */
pgd_t *base = __va(read_cr3());
pgd_t *pgd = &base[pgd_index(addr)];
pud_t *pud = pud_offset(pgd, addr);
p4d_t *p4d = p4d_offset(pgd, addr);
pud_t *pud = pud_offset(p4d, addr);
pmd_t *pmd = pmd_offset(pud, addr);
return pmd;

View file

@ -34,8 +34,19 @@ static int __init map_range(struct range *range)
static void __init clear_pgds(unsigned long start,
unsigned long end)
{
for (; start < end; start += PGDIR_SIZE)
pgd_clear(pgd_offset_k(start));
pgd_t *pgd;
for (; start < end; start += PGDIR_SIZE) {
pgd = pgd_offset_k(start);
/*
* With folded p4d, pgd_clear() is nop, use p4d_clear()
* instead.
*/
if (CONFIG_PGTABLE_LEVELS < 5)
p4d_clear(p4d_offset(pgd, start));
else
pgd_clear(pgd);
}
}
static void __init kasan_map_early_shadow(pgd_t *pgd)
@ -45,8 +56,18 @@ static void __init kasan_map_early_shadow(pgd_t *pgd)
unsigned long end = KASAN_SHADOW_END;
for (i = pgd_index(start); start < end; i++) {
pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud)
| _KERNPG_TABLE);
switch (CONFIG_PGTABLE_LEVELS) {
case 4:
pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) |
_KERNPG_TABLE);
break;
case 5:
pgd[i] = __pgd(__pa_nodebug(kasan_zero_p4d) |
_KERNPG_TABLE);
break;
default:
BUILD_BUG();
}
start += PGDIR_SIZE;
}
}
@ -74,6 +95,7 @@ void __init kasan_early_init(void)
pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL;
pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE;
pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE;
p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE;
for (i = 0; i < PTRS_PER_PTE; i++)
kasan_zero_pte[i] = __pte(pte_val);
@ -84,6 +106,9 @@ void __init kasan_early_init(void)
for (i = 0; i < PTRS_PER_PUD; i++)
kasan_zero_pud[i] = __pud(pud_val);
for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++)
kasan_zero_p4d[i] = __p4d(p4d_val);
kasan_map_early_shadow(early_level4_pgt);
kasan_map_early_shadow(init_level4_pgt);
}

View file

@ -30,30 +30,44 @@
#include <linux/limits.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/compat.h>
#include <asm/elf.h>
struct va_alignment __read_mostly va_align = {
.flags = -1,
};
static unsigned long stack_maxrandom_size(void)
unsigned long tasksize_32bit(void)
{
return IA32_PAGE_OFFSET;
}
unsigned long tasksize_64bit(void)
{
return TASK_SIZE_MAX;
}
static unsigned long stack_maxrandom_size(unsigned long task_size)
{
unsigned long max = 0;
if ((current->flags & PF_RANDOMIZE) &&
!(current->personality & ADDR_NO_RANDOMIZE)) {
max = ((-1UL) & STACK_RND_MASK) << PAGE_SHIFT;
max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit());
max <<= PAGE_SHIFT;
}
return max;
}
/*
* Top of mmap area (just below the process stack).
*
* Leave an at least ~128 MB hole with possible stack randomization.
*/
#define MIN_GAP (128*1024*1024UL + stack_maxrandom_size())
#define MAX_GAP (TASK_SIZE/6*5)
#ifdef CONFIG_COMPAT
# define mmap32_rnd_bits mmap_rnd_compat_bits
# define mmap64_rnd_bits mmap_rnd_bits
#else
# define mmap32_rnd_bits mmap_rnd_bits
# define mmap64_rnd_bits mmap_rnd_bits
#endif
#define SIZE_128M (128 * 1024 * 1024UL)
static int mmap_is_legacy(void)
{
@ -66,54 +80,91 @@ static int mmap_is_legacy(void)
return sysctl_legacy_va_layout;
}
unsigned long arch_mmap_rnd(void)
static unsigned long arch_rnd(unsigned int rndbits)
{
unsigned long rnd;
if (mmap_is_ia32())
#ifdef CONFIG_COMPAT
rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
#else
rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
#endif
else
rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
return rnd << PAGE_SHIFT;
return (get_random_long() & ((1UL << rndbits) - 1)) << PAGE_SHIFT;
}
static unsigned long mmap_base(unsigned long rnd)
unsigned long arch_mmap_rnd(void)
{
if (!(current->flags & PF_RANDOMIZE))
return 0;
return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits);
}
static unsigned long mmap_base(unsigned long rnd, unsigned long task_size)
{
unsigned long gap = rlimit(RLIMIT_STACK);
unsigned long gap_min, gap_max;
if (gap < MIN_GAP)
gap = MIN_GAP;
else if (gap > MAX_GAP)
gap = MAX_GAP;
/*
* Top of mmap area (just below the process stack).
* Leave an at least ~128 MB hole with possible stack randomization.
*/
gap_min = SIZE_128M + stack_maxrandom_size(task_size);
gap_max = (task_size / 6) * 5;
return PAGE_ALIGN(TASK_SIZE - gap - rnd);
if (gap < gap_min)
gap = gap_min;
else if (gap > gap_max)
gap = gap_max;
return PAGE_ALIGN(task_size - gap - rnd);
}
static unsigned long mmap_legacy_base(unsigned long rnd,
unsigned long task_size)
{
return __TASK_UNMAPPED_BASE(task_size) + rnd;
}
/*
* This function, called very early during the creation of a new
* process VM image, sets up which VM layout function to use:
*/
static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base,
unsigned long random_factor, unsigned long task_size)
{
*legacy_base = mmap_legacy_base(random_factor, task_size);
if (mmap_is_legacy())
*base = *legacy_base;
else
*base = mmap_base(random_factor, task_size);
}
void arch_pick_mmap_layout(struct mm_struct *mm)
{
unsigned long random_factor = 0UL;
if (current->flags & PF_RANDOMIZE)
random_factor = arch_mmap_rnd();
mm->mmap_legacy_base = TASK_UNMAPPED_BASE + random_factor;
if (mmap_is_legacy()) {
mm->mmap_base = mm->mmap_legacy_base;
if (mmap_is_legacy())
mm->get_unmapped_area = arch_get_unmapped_area;
} else {
mm->mmap_base = mmap_base(random_factor);
else
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base,
arch_rnd(mmap64_rnd_bits), tasksize_64bit());
#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
/*
* The mmap syscall mapping base decision depends solely on the
* syscall type (64-bit or compat). This applies for 64bit
* applications and 32bit applications. The 64bit syscall uses
* mmap_base, the compat syscall uses mmap_compat_base.
*/
arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base,
arch_rnd(mmap32_rnd_bits), tasksize_32bit());
#endif
}
unsigned long get_mmap_base(int is_legacy)
{
struct mm_struct *mm = current->mm;
#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
if (in_compat_syscall()) {
return is_legacy ? mm->mmap_compat_legacy_base
: mm->mmap_compat_base;
}
#endif
return is_legacy ? mm->mmap_legacy_base : mm->mmap_base;
}
const char *arch_vma_name(struct vm_area_struct *vma)

View file

@ -526,15 +526,7 @@ int mpx_handle_bd_fault(void)
if (!kernel_managing_mpx_tables(current->mm))
return -EINVAL;
if (do_mpx_bt_fault()) {
force_sig(SIGSEGV, current);
/*
* The force_sig() is essentially "handling" this
* exception, so we do not pass up the error
* from do_mpx_bt_fault().
*/
}
return 0;
return do_mpx_bt_fault();
}
/*

View file

@ -201,7 +201,7 @@ static void __init alloc_node_data(int nid)
nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES,
MEMBLOCK_ALLOC_ACCESSIBLE);
if (!nd_pa) {
pr_err("Cannot find %zu bytes in node %d\n",
pr_err("Cannot find %zu bytes in any node (initial node: %d)\n",
nd_size, nid);
return;
}
@ -225,7 +225,7 @@ static void __init alloc_node_data(int nid)
* numa_cleanup_meminfo - Cleanup a numa_meminfo
* @mi: numa_meminfo to clean up
*
* Sanitize @mi by merging and removing unncessary memblks. Also check for
* Sanitize @mi by merging and removing unnecessary memblks. Also check for
* conflicts and clear unused memblks.
*
* RETURNS:

View file

@ -346,6 +346,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
unsigned int *level)
{
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
@ -354,7 +355,15 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
if (pgd_none(*pgd))
return NULL;
pud = pud_offset(pgd, address);
p4d = p4d_offset(pgd, address);
if (p4d_none(*p4d))
return NULL;
*level = PG_LEVEL_512G;
if (p4d_large(*p4d) || !p4d_present(*p4d))
return (pte_t *)p4d;
pud = pud_offset(p4d, address);
if (pud_none(*pud))
return NULL;
@ -406,13 +415,18 @@ static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
pmd_t *lookup_pmd_address(unsigned long address)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pgd = pgd_offset_k(address);
if (pgd_none(*pgd))
return NULL;
pud = pud_offset(pgd, address);
p4d = p4d_offset(pgd, address);
if (p4d_none(*p4d) || p4d_large(*p4d) || !p4d_present(*p4d))
return NULL;
pud = pud_offset(p4d, address);
if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud))
return NULL;
@ -477,11 +491,13 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
list_for_each_entry(page, &pgd_list, lru) {
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pgd = (pgd_t *)page_address(page) + pgd_index(address);
pud = pud_offset(pgd, address);
p4d = p4d_offset(pgd, address);
pud = pud_offset(p4d, address);
pmd = pmd_offset(pud, address);
set_pte_atomic((pte_t *)pmd, pte);
}
@ -836,9 +852,9 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
pud_clear(pud);
}
static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
{
pud_t *pud = pud_offset(pgd, start);
pud_t *pud = pud_offset(p4d, start);
/*
* Not on a GB page boundary?
@ -1004,8 +1020,8 @@ static long populate_pmd(struct cpa_data *cpa,
return num_pages;
}
static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
pgprot_t pgprot)
static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
pgprot_t pgprot)
{
pud_t *pud;
unsigned long end;
@ -1026,7 +1042,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
cur_pages = (pre_end - start) >> PAGE_SHIFT;
cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
pud = pud_offset(pgd, start);
pud = pud_offset(p4d, start);
/*
* Need a PMD page?
@ -1047,7 +1063,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
if (cpa->numpages == cur_pages)
return cur_pages;
pud = pud_offset(pgd, start);
pud = pud_offset(p4d, start);
pud_pgprot = pgprot_4k_2_large(pgprot);
/*
@ -1067,7 +1083,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
if (start < end) {
long tmp;
pud = pud_offset(pgd, start);
pud = pud_offset(p4d, start);
if (pud_none(*pud))
if (alloc_pmd_page(pud))
return -1;
@ -1090,33 +1106,43 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
{
pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
pud_t *pud = NULL; /* shut up gcc */
p4d_t *p4d;
pgd_t *pgd_entry;
long ret;
pgd_entry = cpa->pgd + pgd_index(addr);
if (pgd_none(*pgd_entry)) {
p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
if (!p4d)
return -1;
set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE));
}
/*
* Allocate a PUD page and hand it down for mapping.
*/
if (pgd_none(*pgd_entry)) {
p4d = p4d_offset(pgd_entry, addr);
if (p4d_none(*p4d)) {
pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
if (!pud)
return -1;
set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE));
set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
}
pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr);
pgprot_val(pgprot) |= pgprot_val(cpa->mask_set);
ret = populate_pud(cpa, addr, pgd_entry, pgprot);
ret = populate_pud(cpa, addr, p4d, pgprot);
if (ret < 0) {
/*
* Leave the PUD page in place in case some other CPU or thread
* already found it, but remove any useless entries we just
* added to it.
*/
unmap_pud_range(pgd_entry, addr,
unmap_pud_range(p4d, addr,
addr + (cpa->numpages << PAGE_SHIFT));
return ret;
}

View file

@ -81,6 +81,14 @@ void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
tlb_remove_page(tlb, virt_to_page(pud));
}
#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
tlb_remove_page(tlb, virt_to_page(p4d));
}
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
#endif /* CONFIG_PGTABLE_LEVELS > 2 */
@ -120,7 +128,7 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
references from swapper_pg_dir. */
if (CONFIG_PGTABLE_LEVELS == 2 ||
(CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
CONFIG_PGTABLE_LEVELS == 4) {
CONFIG_PGTABLE_LEVELS >= 4) {
clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
swapper_pg_dir + KERNEL_PGD_BOUNDARY,
KERNEL_PGD_PTRS);
@ -261,13 +269,15 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
{
p4d_t *p4d;
pud_t *pud;
int i;
if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
return;
pud = pud_offset(pgd, 0);
p4d = p4d_offset(pgd, 0);
pud = pud_offset(p4d, 0);
for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) {
pmd_t *pmd = pmds[i];
@ -580,6 +590,28 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
}
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
#ifdef CONFIG_X86_5LEVEL
/**
* p4d_set_huge - setup kernel P4D mapping
*
* No 512GB pages yet -- always return 0
*/
int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
return 0;
}
/**
* p4d_clear_huge - clear kernel P4D mapping when it is set
*
* No 512GB pages yet -- always return 0
*/
int p4d_clear_huge(p4d_t *p4d)
{
return 0;
}
#endif
/**
* pud_set_huge - setup kernel PUD mapping
*

View file

@ -26,6 +26,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20;
void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@ -35,7 +36,12 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
BUG();
return;
}
pud = pud_offset(pgd, vaddr);
p4d = p4d_offset(pgd, vaddr);
if (p4d_none(*p4d)) {
BUG();
return;
}
pud = pud_offset(p4d, vaddr);
if (pud_none(*pud)) {
BUG();
return;

View file

@ -263,8 +263,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
{
struct flush_tlb_info info;
if (end == 0)
end = start + PAGE_SIZE;
info.flush_mm = mm;
info.flush_start = start;
info.flush_end = end;
@ -289,23 +287,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
}
void flush_tlb_current_task(void)
{
struct mm_struct *mm = current->mm;
preempt_disable();
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
/* This is an implicit full barrier that synchronizes with switch_mm. */
local_flush_tlb();
trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
preempt_enable();
}
/*
* See Documentation/x86/tlb.txt for details. We choose 33
* because it is large enough to cover the vast majority (at
@ -326,6 +307,12 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
preempt_disable();
if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
base_pages_to_flush = (end - start) >> PAGE_SHIFT;
if (base_pages_to_flush > tlb_single_page_flush_ceiling)
base_pages_to_flush = TLB_FLUSH_ALL;
if (current->active_mm != mm) {
/* Synchronize with switch_mm. */
smp_mb();
@ -342,15 +329,11 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
goto out;
}
if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
base_pages_to_flush = (end - start) >> PAGE_SHIFT;
/*
* Both branches below are implicit full barriers (MOV to CR or
* INVLPG) that synchronize with switch_mm.
*/
if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
base_pages_to_flush = TLB_FLUSH_ALL;
if (base_pages_to_flush == TLB_FLUSH_ALL) {
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
local_flush_tlb();
} else {
@ -393,7 +376,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
}
if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
flush_tlb_others(mm_cpumask(mm), mm, start, 0UL);
flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE);
preempt_enable();
}

View file

@ -68,7 +68,7 @@ pgd_t * __init efi_call_phys_prolog(void)
load_cr3(initial_page_table);
__flush_tlb_all();
gdt_descr.address = __pa(get_cpu_gdt_table(0));
gdt_descr.address = get_cpu_gdt_paddr(0);
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);
@ -79,7 +79,7 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd)
{
struct desc_ptr gdt_descr;
gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
gdt_descr.address = (unsigned long)get_cpu_gdt_rw(0);
gdt_descr.size = GDT_SIZE - 1;
load_gdt(&gdt_descr);

View file

@ -135,6 +135,7 @@ static pgd_t *efi_pgd;
int __init efi_alloc_page_tables(void)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
gfp_t gfp_mask;
@ -147,14 +148,19 @@ int __init efi_alloc_page_tables(void)
return -ENOMEM;
pgd = efi_pgd + pgd_index(EFI_VA_END);
pud = pud_alloc_one(NULL, 0);
if (!pud) {
p4d = p4d_alloc(&init_mm, pgd, EFI_VA_END);
if (!p4d) {
free_page((unsigned long)efi_pgd);
return -ENOMEM;
}
pgd_populate(NULL, pgd, pud);
pud = pud_alloc(&init_mm, p4d, EFI_VA_END);
if (!pud) {
if (CONFIG_PGTABLE_LEVELS > 4)
free_page((unsigned long) pgd_page_vaddr(*pgd));
free_page((unsigned long)efi_pgd);
return -ENOMEM;
}
return 0;
}
@ -166,6 +172,7 @@ void efi_sync_low_kernel_mappings(void)
{
unsigned num_entries;
pgd_t *pgd_k, *pgd_efi;
p4d_t *p4d_k, *p4d_efi;
pud_t *pud_k, *pud_efi;
if (efi_enabled(EFI_OLD_MEMMAP))
@ -189,6 +196,21 @@ void efi_sync_low_kernel_mappings(void)
num_entries = pgd_index(EFI_VA_END) - pgd_index(PAGE_OFFSET);
memcpy(pgd_efi, pgd_k, sizeof(pgd_t) * num_entries);
/*
* As with PGDs, we share all P4D entries apart from the one entry
* that covers the EFI runtime mapping space.
*/
BUILD_BUG_ON(p4d_index(EFI_VA_END) != p4d_index(MODULES_END));
BUILD_BUG_ON((EFI_VA_START & P4D_MASK) != (EFI_VA_END & P4D_MASK));
pgd_efi = efi_pgd + pgd_index(EFI_VA_END);
pgd_k = pgd_offset_k(EFI_VA_END);
p4d_efi = p4d_offset(pgd_efi, 0);
p4d_k = p4d_offset(pgd_k, 0);
num_entries = p4d_index(EFI_VA_END);
memcpy(p4d_efi, p4d_k, sizeof(p4d_t) * num_entries);
/*
* We share all the PUD entries apart from those that map the
* EFI regions. Copy around them.
@ -196,17 +218,16 @@ void efi_sync_low_kernel_mappings(void)
BUILD_BUG_ON((EFI_VA_START & ~PUD_MASK) != 0);
BUILD_BUG_ON((EFI_VA_END & ~PUD_MASK) != 0);
pgd_efi = efi_pgd + pgd_index(EFI_VA_END);
pud_efi = pud_offset(pgd_efi, 0);
pgd_k = pgd_offset_k(EFI_VA_END);
pud_k = pud_offset(pgd_k, 0);
p4d_efi = p4d_offset(pgd_efi, EFI_VA_END);
p4d_k = p4d_offset(pgd_k, EFI_VA_END);
pud_efi = pud_offset(p4d_efi, 0);
pud_k = pud_offset(p4d_k, 0);
num_entries = pud_index(EFI_VA_END);
memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries);
pud_efi = pud_offset(pgd_efi, EFI_VA_START);
pud_k = pud_offset(pgd_k, EFI_VA_START);
pud_efi = pud_offset(p4d_efi, EFI_VA_START);
pud_k = pud_offset(p4d_k, EFI_VA_START);
num_entries = PTRS_PER_PUD - pud_index(EFI_VA_START);
memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries);

View file

@ -95,7 +95,7 @@ static void __save_processor_state(struct saved_context *ctxt)
* 'pmode_gdt' in wakeup_start.
*/
ctxt->gdt_desc.size = GDT_SIZE - 1;
ctxt->gdt_desc.address = (unsigned long)get_cpu_gdt_table(smp_processor_id());
ctxt->gdt_desc.address = (unsigned long)get_cpu_gdt_rw(smp_processor_id());
store_tr(ctxt->tr);
@ -162,7 +162,7 @@ static void fix_processor_context(void)
int cpu = smp_processor_id();
struct tss_struct *t = &per_cpu(cpu_tss, cpu);
#ifdef CONFIG_X86_64
struct desc_struct *desc = get_cpu_gdt_table(cpu);
struct desc_struct *desc = get_cpu_gdt_rw(cpu);
tss_desc tss;
#endif
set_tss_desc(cpu, t); /*
@ -183,6 +183,9 @@ static void fix_processor_context(void)
load_mm_ldt(current->active_mm); /* This does lldt */
fpu__resume_cpu();
/* The processor is back on the direct GDT, load back the fixmap */
load_fixmap_gdt(cpu);
}
/**

View file

@ -32,6 +32,7 @@ pgd_t *resume_pg_dir;
*/
static pmd_t *resume_one_md_table_init(pgd_t *pgd)
{
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd_table;
@ -41,11 +42,13 @@ static pmd_t *resume_one_md_table_init(pgd_t *pgd)
return NULL;
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
pud = pud_offset(pgd, 0);
p4d = p4d_offset(pgd, 0);
pud = pud_offset(p4d, 0);
BUG_ON(pmd_table != pmd_offset(pud, 0));
#else
pud = pud_offset(pgd, 0);
p4d = p4d_offset(pgd, 0);
pud = pud_offset(p4d, 0);
pmd_table = pmd_offset(pud, 0);
#endif

View file

@ -50,6 +50,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
{
pmd_t *pmd;
pud_t *pud;
p4d_t *p4d;
/*
* The new mapping only has to cover the page containing the image
@ -64,6 +65,13 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
* the virtual address space after switching over to the original page
* tables used by the image kernel.
*/
if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
p4d = (p4d_t *)get_safe_page(GFP_ATOMIC);
if (!p4d)
return -ENOMEM;
}
pud = (pud_t *)get_safe_page(GFP_ATOMIC);
if (!pud)
return -ENOMEM;
@ -76,8 +84,13 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
__pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC));
set_pud(pud + pud_index(restore_jump_address),
__pud(__pa(pmd) | _KERNPG_TABLE));
set_pgd(pgd + pgd_index(restore_jump_address),
__pgd(__pa(pud) | _KERNPG_TABLE));
if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
set_p4d(p4d + p4d_index(restore_jump_address), __p4d(__pa(pud) | _KERNPG_TABLE));
set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(p4d) | _KERNPG_TABLE));
} else {
/* No p4d for 4-level paging: point the pgd to the pud page table */
set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(pud) | _KERNPG_TABLE));
}
return 0;
}
@ -125,7 +138,10 @@ static int set_up_temporary_mappings(void)
static int relocate_restore_code(void)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
relocated_restore_code = get_safe_page(GFP_ATOMIC);
if (!relocated_restore_code)
@ -135,22 +151,25 @@ static int relocate_restore_code(void)
/* Make the page containing the relocated code executable */
pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code);
pud = pud_offset(pgd, relocated_restore_code);
p4d = p4d_offset(pgd, relocated_restore_code);
if (p4d_large(*p4d)) {
set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));
goto out;
}
pud = pud_offset(p4d, relocated_restore_code);
if (pud_large(*pud)) {
set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX));
} else {
pmd_t *pmd = pmd_offset(pud, relocated_restore_code);
if (pmd_large(*pmd)) {
set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX));
} else {
pte_t *pte = pte_offset_kernel(pmd, relocated_restore_code);
set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX));
}
goto out;
}
pmd = pmd_offset(pud, relocated_restore_code);
if (pmd_large(*pmd)) {
set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX));
goto out;
}
pte = pte_offset_kernel(pmd, relocated_restore_code);
set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX));
out:
__flush_tlb_all();
return 0;
}

View file

@ -711,7 +711,7 @@ static void load_TLS_descriptor(struct thread_struct *t,
*shadow = t->tls_array[i];
gdt = get_cpu_gdt_table(cpu);
gdt = get_cpu_gdt_rw(cpu);
maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
mc = __xen_mc_entry(0);

View file

@ -535,40 +535,41 @@ static pgd_t *xen_get_user_pgd(pgd_t *pgd)
return user_ptr;
}
static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
static void __xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
{
struct mmu_update u;
u.ptr = virt_to_machine(ptr).maddr;
u.val = pgd_val_ma(val);
u.val = p4d_val_ma(val);
xen_extend_mmu_update(&u);
}
/*
* Raw hypercall-based set_pgd, intended for in early boot before
* Raw hypercall-based set_p4d, intended for in early boot before
* there's a page structure. This implies:
* 1. The only existing pagetable is the kernel's
* 2. It is always pinned
* 3. It has no user pagetable attached to it
*/
static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
{
preempt_disable();
xen_mc_batch();
__xen_set_pgd_hyper(ptr, val);
__xen_set_p4d_hyper(ptr, val);
xen_mc_issue(PARAVIRT_LAZY_MMU);
preempt_enable();
}
static void xen_set_pgd(pgd_t *ptr, pgd_t val)
static void xen_set_p4d(p4d_t *ptr, p4d_t val)
{
pgd_t *user_ptr = xen_get_user_pgd(ptr);
pgd_t *user_ptr = xen_get_user_pgd((pgd_t *)ptr);
pgd_t pgd_val;
trace_xen_mmu_set_pgd(ptr, user_ptr, val);
trace_xen_mmu_set_p4d(ptr, (p4d_t *)user_ptr, val);
/* If page is not pinned, we can just update the entry
directly */
@ -576,7 +577,8 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val)
*ptr = val;
if (user_ptr) {
WARN_ON(xen_page_pinned(user_ptr));
*user_ptr = val;
pgd_val.pgd = p4d_val_ma(val);
*user_ptr = pgd_val;
}
return;
}
@ -585,14 +587,72 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val)
user updates together. */
xen_mc_batch();
__xen_set_pgd_hyper(ptr, val);
__xen_set_p4d_hyper(ptr, val);
if (user_ptr)
__xen_set_pgd_hyper(user_ptr, val);
__xen_set_p4d_hyper((p4d_t *)user_ptr, val);
xen_mc_issue(PARAVIRT_LAZY_MMU);
}
#endif /* CONFIG_PGTABLE_LEVELS == 4 */
static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
bool last, unsigned long limit)
{
int i, nr, flush = 0;
nr = last ? pmd_index(limit) + 1 : PTRS_PER_PMD;
for (i = 0; i < nr; i++) {
if (!pmd_none(pmd[i]))
flush |= (*func)(mm, pmd_page(pmd[i]), PT_PTE);
}
return flush;
}
static int xen_pud_walk(struct mm_struct *mm, pud_t *pud,
int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
bool last, unsigned long limit)
{
int i, nr, flush = 0;
nr = last ? pud_index(limit) + 1 : PTRS_PER_PUD;
for (i = 0; i < nr; i++) {
pmd_t *pmd;
if (pud_none(pud[i]))
continue;
pmd = pmd_offset(&pud[i], 0);
if (PTRS_PER_PMD > 1)
flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
flush |= xen_pmd_walk(mm, pmd, func,
last && i == nr - 1, limit);
}
return flush;
}
static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
bool last, unsigned long limit)
{
int i, nr, flush = 0;
nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D;
for (i = 0; i < nr; i++) {
pud_t *pud;
if (p4d_none(p4d[i]))
continue;
pud = pud_offset(&p4d[i], 0);
if (PTRS_PER_PUD > 1)
flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
flush |= xen_pud_walk(mm, pud, func,
last && i == nr - 1, limit);
}
return flush;
}
/*
* (Yet another) pagetable walker. This one is intended for pinning a
* pagetable. This means that it walks a pagetable and calls the
@ -613,10 +673,8 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
enum pt_level),
unsigned long limit)
{
int flush = 0;
int i, nr, flush = 0;
unsigned hole_low, hole_high;
unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
unsigned pgdidx, pudidx, pmdidx;
/* The limit is the last byte to be touched */
limit--;
@ -633,65 +691,22 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
hole_low = pgd_index(USER_LIMIT);
hole_high = pgd_index(PAGE_OFFSET);
pgdidx_limit = pgd_index(limit);
#if PTRS_PER_PUD > 1
pudidx_limit = pud_index(limit);
#else
pudidx_limit = 0;
#endif
#if PTRS_PER_PMD > 1
pmdidx_limit = pmd_index(limit);
#else
pmdidx_limit = 0;
#endif
nr = pgd_index(limit) + 1;
for (i = 0; i < nr; i++) {
p4d_t *p4d;
for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
pud_t *pud;
if (pgdidx >= hole_low && pgdidx < hole_high)
if (i >= hole_low && i < hole_high)
continue;
if (!pgd_val(pgd[pgdidx]))
if (pgd_none(pgd[i]))
continue;
pud = pud_offset(&pgd[pgdidx], 0);
if (PTRS_PER_PUD > 1) /* not folded */
flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
pmd_t *pmd;
if (pgdidx == pgdidx_limit &&
pudidx > pudidx_limit)
goto out;
if (pud_none(pud[pudidx]))
continue;
pmd = pmd_offset(&pud[pudidx], 0);
if (PTRS_PER_PMD > 1) /* not folded */
flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
struct page *pte;
if (pgdidx == pgdidx_limit &&
pudidx == pudidx_limit &&
pmdidx > pmdidx_limit)
goto out;
if (pmd_none(pmd[pmdidx]))
continue;
pte = pmd_page(pmd[pmdidx]);
flush |= (*func)(mm, pte, PT_PTE);
}
}
p4d = p4d_offset(&pgd[i], 0);
if (PTRS_PER_P4D > 1)
flush |= (*func)(mm, virt_to_page(p4d), PT_P4D);
flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
}
out:
/* Do the top level last, so that the callbacks can use it as
a cue to do final things like tlb flushes. */
flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
@ -1150,57 +1165,97 @@ static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
xen_free_ro_pages(pa, PAGE_SIZE);
}
static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin)
{
unsigned long pa;
pte_t *pte_tbl;
int i;
if (pmd_large(*pmd)) {
pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
xen_free_ro_pages(pa, PMD_SIZE);
return;
}
pte_tbl = pte_offset_kernel(pmd, 0);
for (i = 0; i < PTRS_PER_PTE; i++) {
if (pte_none(pte_tbl[i]))
continue;
pa = pte_pfn(pte_tbl[i]) << PAGE_SHIFT;
xen_free_ro_pages(pa, PAGE_SIZE);
}
set_pmd(pmd, __pmd(0));
xen_cleanmfnmap_free_pgtbl(pte_tbl, unpin);
}
static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin)
{
unsigned long pa;
pmd_t *pmd_tbl;
int i;
if (pud_large(*pud)) {
pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
xen_free_ro_pages(pa, PUD_SIZE);
return;
}
pmd_tbl = pmd_offset(pud, 0);
for (i = 0; i < PTRS_PER_PMD; i++) {
if (pmd_none(pmd_tbl[i]))
continue;
xen_cleanmfnmap_pmd(pmd_tbl + i, unpin);
}
set_pud(pud, __pud(0));
xen_cleanmfnmap_free_pgtbl(pmd_tbl, unpin);
}
static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin)
{
unsigned long pa;
pud_t *pud_tbl;
int i;
if (p4d_large(*p4d)) {
pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK;
xen_free_ro_pages(pa, P4D_SIZE);
return;
}
pud_tbl = pud_offset(p4d, 0);
for (i = 0; i < PTRS_PER_PUD; i++) {
if (pud_none(pud_tbl[i]))
continue;
xen_cleanmfnmap_pud(pud_tbl + i, unpin);
}
set_p4d(p4d, __p4d(0));
xen_cleanmfnmap_free_pgtbl(pud_tbl, unpin);
}
/*
* Since it is well isolated we can (and since it is perhaps large we should)
* also free the page tables mapping the initial P->M table.
*/
static void __init xen_cleanmfnmap(unsigned long vaddr)
{
unsigned long va = vaddr & PMD_MASK;
unsigned long pa;
pgd_t *pgd = pgd_offset_k(va);
pud_t *pud_page = pud_offset(pgd, 0);
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
pgd_t *pgd;
p4d_t *p4d;
unsigned int i;
bool unpin;
unpin = (vaddr == 2 * PGDIR_SIZE);
set_pgd(pgd, __pgd(0));
do {
pud = pud_page + pud_index(va);
if (pud_none(*pud)) {
va += PUD_SIZE;
} else if (pud_large(*pud)) {
pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
xen_free_ro_pages(pa, PUD_SIZE);
va += PUD_SIZE;
} else {
pmd = pmd_offset(pud, va);
if (pmd_large(*pmd)) {
pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
xen_free_ro_pages(pa, PMD_SIZE);
} else if (!pmd_none(*pmd)) {
pte = pte_offset_kernel(pmd, va);
set_pmd(pmd, __pmd(0));
for (i = 0; i < PTRS_PER_PTE; ++i) {
if (pte_none(pte[i]))
break;
pa = pte_pfn(pte[i]) << PAGE_SHIFT;
xen_free_ro_pages(pa, PAGE_SIZE);
}
xen_cleanmfnmap_free_pgtbl(pte, unpin);
}
va += PMD_SIZE;
if (pmd_index(va))
continue;
set_pud(pud, __pud(0));
xen_cleanmfnmap_free_pgtbl(pmd, unpin);
}
} while (pud_index(va) || pmd_index(va));
xen_cleanmfnmap_free_pgtbl(pud_page, unpin);
vaddr &= PMD_MASK;
pgd = pgd_offset_k(vaddr);
p4d = p4d_offset(pgd, 0);
for (i = 0; i < PTRS_PER_P4D; i++) {
if (p4d_none(p4d[i]))
continue;
xen_cleanmfnmap_p4d(p4d + i, unpin);
}
if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
set_pgd(pgd, __pgd(0));
xen_cleanmfnmap_free_pgtbl(p4d, unpin);
}
}
static void __init xen_pagetable_p2m_free(void)
@ -1538,7 +1593,6 @@ static int xen_pgd_alloc(struct mm_struct *mm)
BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
}
#endif
return ret;
}
@ -1730,7 +1784,7 @@ static void xen_release_pmd(unsigned long pfn)
xen_release_ptpage(pfn, PT_PMD);
}
#if CONFIG_PGTABLE_LEVELS == 4
#if CONFIG_PGTABLE_LEVELS >= 4
static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
{
xen_alloc_ptpage(mm, pfn, PT_PUD);
@ -2071,21 +2125,27 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
*/
void __init xen_relocate_p2m(void)
{
phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys;
phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys;
unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud;
int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d;
pte_t *pt;
pmd_t *pmd;
pud_t *pud;
p4d_t *p4d = NULL;
pgd_t *pgd;
unsigned long *new_p2m;
int save_pud;
size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT;
n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
n_pud = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
n_frames = n_pte + n_pt + n_pmd + n_pud;
n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT;
if (PTRS_PER_P4D > 1)
n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
else
n_p4d = 0;
n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d;
new_area = xen_find_free_area(PFN_PHYS(n_frames));
if (!new_area) {
@ -2101,55 +2161,76 @@ void __init xen_relocate_p2m(void)
* To avoid any possible virtual address collision, just use
* 2 * PUD_SIZE for the new area.
*/
pud_phys = new_area;
p4d_phys = new_area;
pud_phys = p4d_phys + PFN_PHYS(n_p4d);
pmd_phys = pud_phys + PFN_PHYS(n_pud);
pt_phys = pmd_phys + PFN_PHYS(n_pmd);
p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
pgd = __va(read_cr3());
new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
pud = early_memremap(pud_phys, PAGE_SIZE);
clear_page(pud);
for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
idx_pmd++) {
pmd = early_memremap(pmd_phys, PAGE_SIZE);
clear_page(pmd);
for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
idx_pt++) {
pt = early_memremap(pt_phys, PAGE_SIZE);
clear_page(pt);
for (idx_pte = 0;
idx_pte < min(n_pte, PTRS_PER_PTE);
idx_pte++) {
set_pte(pt + idx_pte,
pfn_pte(p2m_pfn, PAGE_KERNEL));
p2m_pfn++;
}
n_pte -= PTRS_PER_PTE;
early_memunmap(pt, PAGE_SIZE);
make_lowmem_page_readonly(__va(pt_phys));
pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
PFN_DOWN(pt_phys));
set_pmd(pmd + idx_pt,
__pmd(_PAGE_TABLE | pt_phys));
pt_phys += PAGE_SIZE;
}
n_pt -= PTRS_PER_PMD;
early_memunmap(pmd, PAGE_SIZE);
make_lowmem_page_readonly(__va(pmd_phys));
pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
PFN_DOWN(pmd_phys));
set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
pmd_phys += PAGE_SIZE;
idx_p4d = 0;
save_pud = n_pud;
do {
if (n_p4d > 0) {
p4d = early_memremap(p4d_phys, PAGE_SIZE);
clear_page(p4d);
n_pud = min(save_pud, PTRS_PER_P4D);
}
n_pmd -= PTRS_PER_PUD;
early_memunmap(pud, PAGE_SIZE);
make_lowmem_page_readonly(__va(pud_phys));
pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
pud_phys += PAGE_SIZE;
}
for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
pud = early_memremap(pud_phys, PAGE_SIZE);
clear_page(pud);
for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
idx_pmd++) {
pmd = early_memremap(pmd_phys, PAGE_SIZE);
clear_page(pmd);
for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
idx_pt++) {
pt = early_memremap(pt_phys, PAGE_SIZE);
clear_page(pt);
for (idx_pte = 0;
idx_pte < min(n_pte, PTRS_PER_PTE);
idx_pte++) {
set_pte(pt + idx_pte,
pfn_pte(p2m_pfn, PAGE_KERNEL));
p2m_pfn++;
}
n_pte -= PTRS_PER_PTE;
early_memunmap(pt, PAGE_SIZE);
make_lowmem_page_readonly(__va(pt_phys));
pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
PFN_DOWN(pt_phys));
set_pmd(pmd + idx_pt,
__pmd(_PAGE_TABLE | pt_phys));
pt_phys += PAGE_SIZE;
}
n_pt -= PTRS_PER_PMD;
early_memunmap(pmd, PAGE_SIZE);
make_lowmem_page_readonly(__va(pmd_phys));
pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
PFN_DOWN(pmd_phys));
set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
pmd_phys += PAGE_SIZE;
}
n_pmd -= PTRS_PER_PUD;
early_memunmap(pud, PAGE_SIZE);
make_lowmem_page_readonly(__va(pud_phys));
pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
if (n_p4d > 0)
set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE | pud_phys));
else
set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
pud_phys += PAGE_SIZE;
}
if (n_p4d > 0) {
save_pud -= PTRS_PER_P4D;
early_memunmap(p4d, PAGE_SIZE);
make_lowmem_page_readonly(__va(p4d_phys));
pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys));
set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE | p4d_phys));
p4d_phys += PAGE_SIZE;
}
} while (++idx_p4d < n_p4d);
/* Now copy the old p2m info to the new area. */
memcpy(new_p2m, xen_p2m_addr, size);
@ -2326,6 +2407,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#endif
case FIX_TEXT_POKE0:
case FIX_TEXT_POKE1:
case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END:
/* All local page mappings */
pte = pfn_pte(phys, prot);
break;
@ -2378,8 +2460,8 @@ static void __init xen_post_allocator_init(void)
pv_mmu_ops.set_pte = xen_set_pte;
pv_mmu_ops.set_pmd = xen_set_pmd;
pv_mmu_ops.set_pud = xen_set_pud;
#if CONFIG_PGTABLE_LEVELS == 4
pv_mmu_ops.set_pgd = xen_set_pgd;
#if CONFIG_PGTABLE_LEVELS >= 4
pv_mmu_ops.set_p4d = xen_set_p4d;
#endif
/* This will work as long as patching hasn't happened yet
@ -2388,7 +2470,7 @@ static void __init xen_post_allocator_init(void)
pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
pv_mmu_ops.release_pte = xen_release_pte;
pv_mmu_ops.release_pmd = xen_release_pmd;
#if CONFIG_PGTABLE_LEVELS == 4
#if CONFIG_PGTABLE_LEVELS >= 4
pv_mmu_ops.alloc_pud = xen_alloc_pud;
pv_mmu_ops.release_pud = xen_release_pud;
#endif
@ -2454,10 +2536,10 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
#if CONFIG_PGTABLE_LEVELS == 4
#if CONFIG_PGTABLE_LEVELS >= 4
.pud_val = PV_CALLEE_SAVE(xen_pud_val),
.make_pud = PV_CALLEE_SAVE(xen_make_pud),
.set_pgd = xen_set_pgd_hyper,
.set_p4d = xen_set_p4d_hyper,
.alloc_pud = xen_alloc_pmd_init,
.release_pud = xen_release_pmd_init,

View file

@ -5,6 +5,7 @@
enum pt_level {
PT_PGD,
PT_P4D,
PT_PUD,
PT_PMD,
PT_PTE

View file

@ -392,7 +392,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
if (ctxt == NULL)
return -ENOMEM;
gdt = get_cpu_gdt_table(cpu);
gdt = get_cpu_gdt_rw(cpu);
#ifdef CONFIG_X86_32
ctxt->user_regs.fs = __KERNEL_PERCPU;

View file

@ -43,6 +43,7 @@ static void dax_pmem_percpu_exit(void *data)
struct dax_pmem *dax_pmem = to_dax_pmem(ref);
dev_dbg(dax_pmem->dev, "%s\n", __func__);
wait_for_completion(&dax_pmem->cmp);
percpu_ref_exit(ref);
}
@ -53,7 +54,6 @@ static void dax_pmem_percpu_kill(void *data)
dev_dbg(dax_pmem->dev, "%s\n", __func__);
percpu_ref_kill(ref);
wait_for_completion(&dax_pmem->cmp);
}
static int dax_pmem_probe(struct device *dev)

View file

@ -504,7 +504,7 @@ void __init lguest_arch_host_init(void)
* byte, not the size, hence the "-1").
*/
state->host_gdt_desc.size = GDT_SIZE-1;
state->host_gdt_desc.address = (long)get_cpu_gdt_table(i);
state->host_gdt_desc.address = (long)get_cpu_gdt_rw(i);
/*
* All CPUs on the Host use the same Interrupt Descriptor
@ -554,8 +554,8 @@ void __init lguest_arch_host_init(void)
* The Host needs to be able to use the LGUEST segments on this
* CPU, too, so put them in the Host GDT.
*/
get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
}
/*

View file

@ -25,6 +25,7 @@
#include <linux/badblocks.h>
#include <linux/memremap.h>
#include <linux/vmalloc.h>
#include <linux/blk-mq.h>
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/pmem.h>
@ -231,6 +232,11 @@ static void pmem_release_queue(void *q)
blk_cleanup_queue(q);
}
static void pmem_freeze_queue(void *q)
{
blk_freeze_queue_start(q);
}
static void pmem_release_disk(void *disk)
{
del_gendisk(disk);
@ -284,6 +290,9 @@ static int pmem_attach_disk(struct device *dev,
if (!q)
return -ENOMEM;
if (devm_add_action_or_reset(dev, pmem_release_queue, q))
return -ENOMEM;
pmem->pfn_flags = PFN_DEV;
if (is_nd_pfn(dev)) {
addr = devm_memremap_pages(dev, &pfn_res, &q->q_usage_counter,
@ -303,10 +312,10 @@ static int pmem_attach_disk(struct device *dev,
pmem->size, ARCH_MEMREMAP_PMEM);
/*
* At release time the queue must be dead before
* At release time the queue must be frozen before
* devm_memremap_pages is unwound
*/
if (devm_add_action_or_reset(dev, pmem_release_queue, q))
if (devm_add_action_or_reset(dev, pmem_freeze_queue, q))
return -ENOMEM;
if (IS_ERR(addr))

View file

@ -54,7 +54,7 @@ __asm__(".text \n"
#define Q2_SET_SEL(cpu, selname, address, size) \
do { \
struct desc_struct *gdt = get_cpu_gdt_table((cpu)); \
struct desc_struct *gdt = get_cpu_gdt_rw((cpu)); \
set_desc_base(&gdt[(selname) >> 3], (u32)(address)); \
set_desc_limit(&gdt[(selname) >> 3], (size) - 1); \
} while(0)
@ -95,8 +95,8 @@ static inline u16 call_pnp_bios(u16 func, u16 arg1, u16 arg2, u16 arg3,
return PNP_FUNCTION_NOT_SUPPORTED;
cpu = get_cpu();
save_desc_40 = get_cpu_gdt_table(cpu)[0x40 / 8];
get_cpu_gdt_table(cpu)[0x40 / 8] = bad_bios_desc;
save_desc_40 = get_cpu_gdt_rw(cpu)[0x40 / 8];
get_cpu_gdt_rw(cpu)[0x40 / 8] = bad_bios_desc;
/* On some boxes IRQ's during PnP BIOS calls are deadly. */
spin_lock_irqsave(&pnp_bios_lock, flags);
@ -134,7 +134,7 @@ static inline u16 call_pnp_bios(u16 func, u16 arg1, u16 arg2, u16 arg3,
:"memory");
spin_unlock_irqrestore(&pnp_bios_lock, flags);
get_cpu_gdt_table(cpu)[0x40 / 8] = save_desc_40;
get_cpu_gdt_rw(cpu)[0x40 / 8] = save_desc_40;
put_cpu();
/* If we get here and this is set then the PnP BIOS faulted on us. */
@ -477,7 +477,7 @@ void pnpbios_calls_init(union pnp_bios_install_struct *header)
pnp_bios_callpoint.segment = PNP_CS16;
for_each_possible_cpu(i) {
struct desc_struct *gdt = get_cpu_gdt_table(i);
struct desc_struct *gdt = get_cpu_gdt_rw(i);
if (!gdt)
continue;
set_desc_base(&gdt[GDT_ENTRY_PNPBIOS_CS32],

View file

@ -32,10 +32,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
/* by default, allow everything */
return true;
}
static inline bool arch_pte_access_permitted(pte_t pte, bool write)
{
/* by default, allow everything */
return true;
}
#endif /* _ASM_GENERIC_MM_HOOKS_H */

View file

@ -341,6 +341,31 @@ static inline int pte_unused(pte_t pte)
}
#endif
#ifndef pte_access_permitted
#define pte_access_permitted(pte, write) \
(pte_present(pte) && (!(write) || pte_write(pte)))
#endif
#ifndef pmd_access_permitted
#define pmd_access_permitted(pmd, write) \
(pmd_present(pmd) && (!(write) || pmd_write(pmd)))
#endif
#ifndef pud_access_permitted
#define pud_access_permitted(pud, write) \
(pud_present(pud) && (!(write) || pud_write(pud)))
#endif
#ifndef p4d_access_permitted
#define p4d_access_permitted(p4d, write) \
(p4d_present(p4d) && (!(write) || p4d_write(p4d)))
#endif
#ifndef pgd_access_permitted
#define pgd_access_permitted(pgd, write) \
(pgd_present(pgd) && (!(write) || pgd_write(pgd)))
#endif
#ifndef __HAVE_ARCH_PMD_SAME
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)

View file

@ -432,6 +432,10 @@ static inline int pud_devmap(pud_t pud)
{
return 0;
}
static inline int pgd_devmap(pgd_t pgd)
{
return 0;
}
#endif
/*
@ -758,19 +762,11 @@ static inline enum zone_type page_zonenum(const struct page *page)
}
#ifdef CONFIG_ZONE_DEVICE
void get_zone_device_page(struct page *page);
void put_zone_device_page(struct page *page);
static inline bool is_zone_device_page(const struct page *page)
{
return page_zonenum(page) == ZONE_DEVICE;
}
#else
static inline void get_zone_device_page(struct page *page)
{
}
static inline void put_zone_device_page(struct page *page)
{
}
static inline bool is_zone_device_page(const struct page *page)
{
return false;
@ -786,9 +782,6 @@ static inline void get_page(struct page *page)
*/
VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
page_ref_inc(page);
if (unlikely(is_zone_device_page(page)))
get_zone_device_page(page);
}
static inline void put_page(struct page *page)
@ -797,9 +790,6 @@ static inline void put_page(struct page *page)
if (put_page_testzero(page))
__put_page(page);
if (unlikely(is_zone_device_page(page)))
put_zone_device_page(page);
}
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)

View file

@ -367,6 +367,11 @@ struct mm_struct {
#endif
unsigned long mmap_base; /* base of mmap area */
unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */
#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
/* Base adresses for compatible mmap() */
unsigned long mmap_compat_base;
unsigned long mmap_compat_legacy_base;
#endif
unsigned long task_size; /* size of task vm space */
unsigned long highest_vm_end; /* highest vma end address */
pgd_t * pgd;

View file

@ -148,7 +148,7 @@ static inline int page_cache_get_speculative(struct page *page)
#ifdef CONFIG_TINY_RCU
# ifdef CONFIG_PREEMPT_COUNT
VM_BUG_ON(!in_atomic());
VM_BUG_ON(!in_atomic() && !irqs_disabled());
# endif
/*
* Preempt must be disabled here - we rely on rcu_read_lock doing
@ -186,7 +186,7 @@ static inline int page_cache_add_speculative(struct page *page, int count)
#if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU)
# ifdef CONFIG_PREEMPT_COUNT
VM_BUG_ON(!in_atomic());
VM_BUG_ON(!in_atomic() && !irqs_disabled());
# endif
VM_BUG_ON_PAGE(page_count(page) == 0, page);
page_ref_add(page, count);

View file

@ -241,21 +241,21 @@ TRACE_EVENT(xen_mmu_set_pud,
(int)sizeof(pudval_t) * 2, (unsigned long long)__entry->pudval)
);
TRACE_EVENT(xen_mmu_set_pgd,
TP_PROTO(pgd_t *pgdp, pgd_t *user_pgdp, pgd_t pgdval),
TP_ARGS(pgdp, user_pgdp, pgdval),
TRACE_EVENT(xen_mmu_set_p4d,
TP_PROTO(p4d_t *p4dp, p4d_t *user_p4dp, p4d_t p4dval),
TP_ARGS(p4dp, user_p4dp, p4dval),
TP_STRUCT__entry(
__field(pgd_t *, pgdp)
__field(pgd_t *, user_pgdp)
__field(pgdval_t, pgdval)
__field(p4d_t *, p4dp)
__field(p4d_t *, user_p4dp)
__field(p4dval_t, p4dval)
),
TP_fast_assign(__entry->pgdp = pgdp;
__entry->user_pgdp = user_pgdp;
__entry->pgdval = pgdval.pgd),
TP_printk("pgdp %p user_pgdp %p pgdval %0*llx (raw %0*llx)",
__entry->pgdp, __entry->user_pgdp,
(int)sizeof(pgdval_t) * 2, (unsigned long long)pgd_val(native_make_pgd(__entry->pgdval)),
(int)sizeof(pgdval_t) * 2, (unsigned long long)__entry->pgdval)
TP_fast_assign(__entry->p4dp = p4dp;
__entry->user_p4dp = user_p4dp;
__entry->p4dval = p4d_val(p4dval)),
TP_printk("p4dp %p user_p4dp %p p4dval %0*llx (raw %0*llx)",
__entry->p4dp, __entry->user_p4dp,
(int)sizeof(p4dval_t) * 2, (unsigned long long)pgd_val(native_make_pgd(__entry->p4dval)),
(int)sizeof(p4dval_t) * 2, (unsigned long long)__entry->p4dval)
);
TRACE_EVENT(xen_mmu_pud_clear,

View file

@ -182,18 +182,6 @@ struct page_map {
struct vmem_altmap altmap;
};
void get_zone_device_page(struct page *page)
{
percpu_ref_get(page->pgmap->ref);
}
EXPORT_SYMBOL(get_zone_device_page);
void put_zone_device_page(struct page *page)
{
put_dev_pagemap(page->pgmap);
}
EXPORT_SYMBOL(put_zone_device_page);
static void pgmap_radix_release(struct resource *res)
{
resource_size_t key, align_start, align_size, align_end;
@ -237,6 +225,10 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
struct resource *res = &page_map->res;
resource_size_t align_start, align_size;
struct dev_pagemap *pgmap = &page_map->pgmap;
unsigned long pfn;
for_each_device_pfn(pfn, page_map)
put_page(pfn_to_page(pfn));
if (percpu_ref_tryget_live(pgmap->ref)) {
dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
@ -277,7 +269,10 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
*
* Notes:
* 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time
* (or devm release event).
* (or devm release event). The expected order of events is that @ref has
* been through percpu_ref_kill() before devm_memremap_pages_release(). The
* wait for the completion of all references being dropped and
* percpu_ref_exit() must occur after devm_memremap_pages_release().
*
* 2/ @res is expected to be a host memory range that could feasibly be
* treated as a "System RAM" range, i.e. not a device mmio range, but
@ -379,6 +374,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
*/
list_del(&page->lru);
page->pgmap = pgmap;
percpu_ref_get(ref);
}
devres_add(dev, page_map);
return __va(res->start);

148
mm/gup.c
View file

@ -1189,34 +1189,57 @@ struct page *get_dump_page(unsigned long addr)
*/
#ifdef CONFIG_HAVE_GENERIC_RCU_GUP
#ifndef gup_get_pte
/*
* We assume that the PTE can be read atomically. If this is not the case for
* your architecture, please provide the helper.
*/
static inline pte_t gup_get_pte(pte_t *ptep)
{
return READ_ONCE(*ptep);
}
#endif
static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
{
while ((*nr) - nr_start) {
struct page *page = pages[--(*nr)];
ClearPageReferenced(page);
put_page(page);
}
}
#ifdef __HAVE_ARCH_PTE_SPECIAL
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
int nr_start = *nr, ret = 0;
pte_t *ptep, *ptem;
int ret = 0;
ptem = ptep = pte_offset_map(&pmd, addr);
do {
/*
* In the line below we are assuming that the pte can be read
* atomically. If this is not the case for your architecture,
* please wrap this in a helper function!
*
* for an example see gup_get_pte in arch/x86/mm/gup.c
*/
pte_t pte = READ_ONCE(*ptep);
pte_t pte = gup_get_pte(ptep);
struct page *head, *page;
/*
* Similar to the PMD case below, NUMA hinting must take slow
* path using the pte_protnone check.
*/
if (!pte_present(pte) || pte_special(pte) ||
pte_protnone(pte) || (write && !pte_write(pte)))
if (pte_protnone(pte))
goto pte_unmap;
if (!arch_pte_access_permitted(pte, write))
if (!pte_access_permitted(pte, write))
goto pte_unmap;
if (pte_devmap(pte)) {
pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
if (unlikely(!pgmap)) {
undo_dev_pagemap(nr, nr_start, pages);
goto pte_unmap;
}
} else if (pte_special(pte))
goto pte_unmap;
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
@ -1232,6 +1255,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
}
VM_BUG_ON_PAGE(compound_head(page) != head, page);
put_dev_pagemap(pgmap);
SetPageReferenced(page);
pages[*nr] = page;
(*nr)++;
@ -1261,15 +1287,76 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
}
#endif /* __HAVE_ARCH_PTE_SPECIAL */
#ifdef __HAVE_ARCH_PTE_DEVMAP
static int __gup_device_huge(unsigned long pfn, unsigned long addr,
unsigned long end, struct page **pages, int *nr)
{
int nr_start = *nr;
struct dev_pagemap *pgmap = NULL;
do {
struct page *page = pfn_to_page(pfn);
pgmap = get_dev_pagemap(pfn, pgmap);
if (unlikely(!pgmap)) {
undo_dev_pagemap(nr, nr_start, pages);
return 0;
}
SetPageReferenced(page);
pages[*nr] = page;
get_page(page);
put_dev_pagemap(pgmap);
(*nr)++;
pfn++;
} while (addr += PAGE_SIZE, addr != end);
return 1;
}
static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
unsigned long end, struct page **pages, int *nr)
{
unsigned long fault_pfn;
fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
return __gup_device_huge(fault_pfn, addr, end, pages, nr);
}
static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
unsigned long end, struct page **pages, int *nr)
{
unsigned long fault_pfn;
fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
return __gup_device_huge(fault_pfn, addr, end, pages, nr);
}
#else
static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
unsigned long end, struct page **pages, int *nr)
{
BUILD_BUG();
return 0;
}
static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
unsigned long end, struct page **pages, int *nr)
{
BUILD_BUG();
return 0;
}
#endif
static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
struct page *head, *page;
int refs;
if (write && !pmd_write(orig))
if (!pmd_access_permitted(orig, write))
return 0;
if (pmd_devmap(orig))
return __gup_device_huge_pmd(orig, addr, end, pages, nr);
refs = 0;
head = pmd_page(orig);
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
@ -1293,6 +1380,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return 0;
}
SetPageReferenced(head);
return 1;
}
@ -1302,9 +1390,12 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
struct page *head, *page;
int refs;
if (write && !pud_write(orig))
if (!pud_access_permitted(orig, write))
return 0;
if (pud_devmap(orig))
return __gup_device_huge_pud(orig, addr, end, pages, nr);
refs = 0;
head = pud_page(orig);
page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
@ -1328,6 +1419,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return 0;
}
SetPageReferenced(head);
return 1;
}
@ -1338,9 +1430,10 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
int refs;
struct page *head, *page;
if (write && !pgd_write(orig))
if (!pgd_access_permitted(orig, write))
return 0;
BUILD_BUG_ON(pgd_devmap(orig));
refs = 0;
head = pgd_page(orig);
page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
@ -1364,6 +1457,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
return 0;
}
SetPageReferenced(head);
return 1;
}
@ -1520,6 +1614,21 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
return nr;
}
#ifndef gup_fast_permitted
/*
* Check if it's allowed to use __get_user_pages_fast() for the range, or
* we need to fall back to the slow version:
*/
bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
{
unsigned long len, end;
len = (unsigned long) nr_pages << PAGE_SHIFT;
end = start + len;
return end >= start;
}
#endif
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
@ -1539,11 +1648,14 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
int nr, ret;
int nr = 0, ret = 0;
start &= PAGE_MASK;
nr = __get_user_pages_fast(start, nr_pages, write, pages);
ret = nr;
if (gup_fast_permitted(start, nr_pages, write)) {
nr = __get_user_pages_fast(start, nr_pages, write, pages);
ret = nr;
}
if (nr < nr_pages) {
/* Try to get the remaining pages with get_user_pages */

View file

@ -97,6 +97,16 @@ static void __put_compound_page(struct page *page)
void __put_page(struct page *page)
{
if (is_zone_device_page(page)) {
put_dev_pagemap(page->pgmap);
/*
* The page belongs to the device that created pgmap. Do
* not return it to page allocator.
*/
return;
}
if (unlikely(PageCompound(page)))
__put_compound_page(page);
else

View file

@ -409,6 +409,51 @@ static void *threadproc(void *ctx)
}
}
#ifdef __i386__
#ifndef SA_RESTORE
#define SA_RESTORER 0x04000000
#endif
/*
* The UAPI header calls this 'struct sigaction', which conflicts with
* glibc. Sigh.
*/
struct fake_ksigaction {
void *handler; /* the real type is nasty */
unsigned long sa_flags;
void (*sa_restorer)(void);
unsigned char sigset[8];
};
static void fix_sa_restorer(int sig)
{
struct fake_ksigaction ksa;
if (syscall(SYS_rt_sigaction, sig, NULL, &ksa, 8) == 0) {
/*
* glibc has a nasty bug: it sometimes writes garbage to
* sa_restorer. This interacts quite badly with anything
* that fiddles with SS because it can trigger legacy
* stack switching. Patch it up. See:
*
* https://sourceware.org/bugzilla/show_bug.cgi?id=21269
*/
if (!(ksa.sa_flags & SA_RESTORER) && ksa.sa_restorer) {
ksa.sa_restorer = NULL;
if (syscall(SYS_rt_sigaction, sig, &ksa, NULL,
sizeof(ksa.sigset)) != 0)
err(1, "rt_sigaction");
}
}
}
#else
static void fix_sa_restorer(int sig)
{
/* 64-bit glibc works fine. */
}
#endif
static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
int flags)
{
@ -420,6 +465,7 @@ static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
if (sigaction(sig, &sa, 0))
err(1, "sigaction");
fix_sa_restorer(sig);
}
static jmp_buf jmpbuf;

View file

@ -404,8 +404,6 @@ void handler(int signum, siginfo_t *si, void *vucontext)
dprintf2("info->si_lower: %p\n", __si_bounds_lower(si));
dprintf2("info->si_upper: %p\n", __si_bounds_upper(si));
check_siginfo_vs_shadow(si);
for (i = 0; i < 8; i++)
dprintf3("[%d]: %p\n", i, si_addr_ptr[i]);
switch (br_reason) {
@ -416,6 +414,9 @@ void handler(int signum, siginfo_t *si, void *vucontext)
exit(5);
case 1: /* #BR MPX bounds exception */
/* these are normal and we expect to see them */
check_siginfo_vs_shadow(si);
dprintf1("bounds exception (normal): status 0x%jx at %p si_addr: %p\n",
status, (void *)ip, si->si_addr);
num_bnd_chk++;