x86, amd: Avoid cache aliasing penalties on AMD family 15h
This patch provides performance tuning for the "Bulldozer" CPU. With its shared instruction cache there is a chance of generating an excessive number of cache cross-invalidates when running specific workloads on the cores of a compute module. This excessive amount of cross-invalidations can be observed if cache lines backed by shared physical memory alias in bits [14:12] of their virtual addresses, as those bits are used for the index generation. This patch addresses the issue by clearing all the bits in the [14:12] slice of the file mapping's virtual address at generation time, thus forcing those bits the same for all mappings of a single shared library across processes and, in doing so, avoids instruction cache aliases. It also adds the command line option "align_va_addr=(32|64|on|off)" with which virtual address alignment can be enabled for 32-bit or 64-bit x86 individually, or both, or be completely disabled. This change leaves virtual region address allocation on other families and/or vendors unaffected. Signed-off-by: Borislav Petkov <borislav.petkov@amd.com> Link: http://lkml.kernel.org/r/1312550110-24160-2-git-send-email-bp@amd64.org Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
This commit is contained in:
parent
13f9a3737c
commit
dfb09f9b7a
6 changed files with 144 additions and 18 deletions
|
@ -299,6 +299,19 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|||
behaviour to be specified. Bit 0 enables warnings,
|
||||
bit 1 enables fixups, and bit 2 sends a segfault.
|
||||
|
||||
align_va_addr= [X86-64]
|
||||
Align virtual addresses by clearing slice [14:12] when
|
||||
allocating a VMA at process creation time. This option
|
||||
gives you up to 3% performance improvement on AMD F15h
|
||||
machines (where it is enabled by default) for a
|
||||
CPU-intensive style benchmark, and it can vary highly in
|
||||
a microbenchmark depending on workload and compiler.
|
||||
|
||||
32: only for 32-bit processes
|
||||
64: only for 64-bit processes
|
||||
on: enable for both 32- and 64-bit processes
|
||||
off: disable for both 32- and 64-bit processes
|
||||
|
||||
amd_iommu= [HW,X86-64]
|
||||
Pass parameters to the AMD IOMMU driver in the system.
|
||||
Possible values are:
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
/*
|
||||
* ELF register definitions..
|
||||
*/
|
||||
#include <linux/thread_info.h>
|
||||
|
||||
#include <asm/ptrace.h>
|
||||
#include <asm/user.h>
|
||||
|
@ -320,4 +321,34 @@ extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
|
|||
extern unsigned long arch_randomize_brk(struct mm_struct *mm);
|
||||
#define arch_randomize_brk arch_randomize_brk
|
||||
|
||||
/*
 * mmap_is_ia32 - true on X86_32, or when emulating IA32 on X86_64.
 *
 * Returns 1 unconditionally on CONFIG_X86_32 builds. On builds with
 * CONFIG_IA32_EMULATION it returns 1 only if the current thread has
 * TIF_IA32 set. Returns 0 in every other case.
 */
static inline int mmap_is_ia32(void)
{
#if defined(CONFIG_X86_32)
	return 1;
#elif defined(CONFIG_IA32_EMULATION)
	return test_thread_flag(TIF_IA32) ? 1 : 0;
#else
	return 0;
#endif
}
|
||||
|
||||
/*
 * Flags describing when and how a virtual address gets aligned.
 *
 * The first two values are special, do not change. See align_addr(),
 * which picks between them arithmetically and therefore relies on
 * ALIGN_VA_32 being bit 0 and ALIGN_VA_64 being bit 1.
 */
|
||||
enum align_flags {
|
||||
ALIGN_VA_32 = BIT(0), /* alignment enabled for 32-bit processes */
|
||||
ALIGN_VA_64 = BIT(1), /* alignment enabled for 64-bit processes */
|
||||
ALIGN_VDSO = BIT(2), /* caller is placing the vDSO: align even without a file */
|
||||
ALIGN_TOPDOWN = BIT(3), /* top-down search: result must not exceed the input */
|
||||
};
|
||||
|
||||
/*
 * Global virtual address alignment settings, consulted by align_addr().
 */
struct va_alignment {
|
||||
int flags; /* ALIGN_VA_32/ALIGN_VA_64 bits; negative means not enabled on this CPU */
|
||||
unsigned long mask; /* address bits that align_addr() clears */
|
||||
} ____cacheline_aligned;
|
||||
|
||||
extern struct va_alignment va_align;
|
||||
extern unsigned long align_addr(unsigned long, struct file *, enum align_flags);
|
||||
#endif /* _ASM_X86_ELF_H */
|
||||
|
|
|
@ -458,6 +458,19 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
|
|||
"with P0 frequency!\n");
|
||||
}
|
||||
}
|
||||
|
||||
if (c->x86 == 0x15) {
|
||||
unsigned long upperbit;
|
||||
u32 cpuid, assoc;
|
||||
|
||||
cpuid = cpuid_edx(0x80000005);
|
||||
assoc = cpuid >> 16 & 0xff;
|
||||
upperbit = ((cpuid >> 24) << 10) / assoc;
|
||||
|
||||
va_align.mask = (upperbit - 1) & PAGE_MASK;
|
||||
va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
static void __cpuinit init_amd(struct cpuinfo_x86 *c)
|
||||
|
|
|
@ -18,6 +18,72 @@
|
|||
#include <asm/ia32.h>
|
||||
#include <asm/syscalls.h>
|
||||
|
||||
/*
 * Alignment state shared with align_addr(). flags starts out negative,
 * which align_addr() and control_va_addr_alignment() both treat as
 * "feature not enabled"; early_init_amd() overwrites it on family 0x15.
 */
struct __read_mostly va_alignment va_align = {
|
||||
.flags = -1,
|
||||
};
|
||||
|
||||
/*
|
||||
* Align a virtual address to avoid aliasing in the I$ on AMD F15h.
|
||||
*
|
||||
* @flags denotes the allocation direction - bottomup or topdown -
|
||||
* or vDSO; see call sites below.
|
||||
*/
|
||||
unsigned long align_addr(unsigned long addr, struct file *filp,
|
||||
enum align_flags flags)
|
||||
{
|
||||
unsigned long tmp_addr;
|
||||
|
||||
/* handle 32- and 64-bit case with a single conditional */
|
||||
if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32())))
|
||||
return addr;
|
||||
|
||||
if (!(current->flags & PF_RANDOMIZE))
|
||||
return addr;
|
||||
|
||||
if (!((flags & ALIGN_VDSO) || filp))
|
||||
return addr;
|
||||
|
||||
tmp_addr = addr;
|
||||
|
||||
/*
|
||||
* We need an address which is <= than the original
|
||||
* one only when in topdown direction.
|
||||
*/
|
||||
if (!(flags & ALIGN_TOPDOWN))
|
||||
tmp_addr += va_align.mask;
|
||||
|
||||
tmp_addr &= ~va_align.mask;
|
||||
|
||||
return tmp_addr;
|
||||
}
|
||||
|
||||
static int __init control_va_addr_alignment(char *str)
|
||||
{
|
||||
/* guard against enabling this on other CPU families */
|
||||
if (va_align.flags < 0)
|
||||
return 1;
|
||||
|
||||
if (*str == 0)
|
||||
return 1;
|
||||
|
||||
if (*str == '=')
|
||||
str++;
|
||||
|
||||
if (!strcmp(str, "32"))
|
||||
va_align.flags = ALIGN_VA_32;
|
||||
else if (!strcmp(str, "64"))
|
||||
va_align.flags = ALIGN_VA_64;
|
||||
else if (!strcmp(str, "off"))
|
||||
va_align.flags = 0;
|
||||
else if (!strcmp(str, "on"))
|
||||
va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
|
||||
else
|
||||
return 0;
|
||||
|
||||
return 1;
|
||||
}
|
||||
__setup("align_va_addr", control_va_addr_alignment);
|
||||
|
||||
SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
|
||||
unsigned long, prot, unsigned long, flags,
|
||||
unsigned long, fd, unsigned long, off)
|
||||
|
@ -92,6 +158,9 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
|
|||
start_addr = addr;
|
||||
|
||||
full_search:
|
||||
|
||||
addr = align_addr(addr, filp, 0);
|
||||
|
||||
for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
|
||||
/* At this point: (!vma || addr < vma->vm_end). */
|
||||
if (end - len < addr) {
|
||||
|
@ -117,6 +186,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
|
|||
mm->cached_hole_size = vma->vm_start - addr;
|
||||
|
||||
addr = vma->vm_end;
|
||||
addr = align_addr(addr, filp, 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -161,10 +231,13 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
|
|||
|
||||
/* make sure it can fit in the remaining address space */
|
||||
if (addr > len) {
|
||||
vma = find_vma(mm, addr-len);
|
||||
if (!vma || addr <= vma->vm_start)
|
||||
unsigned long tmp_addr = align_addr(addr - len, filp,
|
||||
ALIGN_TOPDOWN);
|
||||
|
||||
vma = find_vma(mm, tmp_addr);
|
||||
if (!vma || tmp_addr + len <= vma->vm_start)
|
||||
/* remember the address as a hint for next time */
|
||||
return mm->free_area_cache = addr-len;
|
||||
return mm->free_area_cache = tmp_addr;
|
||||
}
|
||||
|
||||
if (mm->mmap_base < len)
|
||||
|
@ -173,6 +246,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
|
|||
addr = mm->mmap_base-len;
|
||||
|
||||
do {
|
||||
addr = align_addr(addr, filp, ALIGN_TOPDOWN);
|
||||
|
||||
/*
|
||||
* Lookup failure means no vma is above this address,
|
||||
* else if new region fits below vma->vm_start,
|
||||
|
|
|
@ -51,21 +51,6 @@ static unsigned int stack_maxrandom_size(void)
|
|||
#define MIN_GAP (128*1024*1024UL + stack_maxrandom_size())
|
||||
#define MAX_GAP (TASK_SIZE/6*5)
|
||||
|
||||
/*
 * True on X86_32 or when emulating IA32 on X86_64.
 *
 * Returns 1 on CONFIG_X86_32 builds, 1 on CONFIG_IA32_EMULATION builds
 * when the current thread has TIF_IA32 set, and 0 otherwise.
 */
static int mmap_is_ia32(void)
{
	int ia32 = 0;

#ifdef CONFIG_X86_32
	ia32 = 1;
#endif
#ifdef CONFIG_IA32_EMULATION
	/* Only consult the thread flag if the answer is not already known. */
	if (!ia32 && test_thread_flag(TIF_IA32))
		ia32 = 1;
#endif
	return ia32;
}
|
||||
|
||||
static int mmap_is_legacy(void)
|
||||
{
|
||||
if (current->personality & ADDR_COMPAT_LAYOUT)
|
||||
|
|
|
@ -69,6 +69,15 @@ static unsigned long vdso_addr(unsigned long start, unsigned len)
|
|||
addr = start + (offset << PAGE_SHIFT);
|
||||
if (addr >= end)
|
||||
addr = end;
|
||||
|
||||
/*
|
||||
* page-align it here so that get_unmapped_area doesn't
|
||||
* align it wrongfully again to the next page. addr can come in 4K
|
||||
* unaligned here as a result of stack start randomization.
|
||||
*/
|
||||
addr = PAGE_ALIGN(addr);
|
||||
addr = align_addr(addr, NULL, ALIGN_VDSO);
|
||||
|
||||
return addr;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue