917d9290af
Currently the per cpu counter's batch size for memory accounting is configured as twice the number of cpus in the system. However, for system with very large memory, it is more appropriate to make it proportional to the memory size per cpu in the system. For example, for a x86_64 system with 64 cpus and 128 GB of memory, the batch size is only 2*64 pages (0.5 MB). So any memory accounting changes of more than 0.5MB will overflow the per cpu counter into the global counter. Instead, for the new scheme, the batch size is configured to be 0.4% of the memory/cpu = 8MB (128 GB/64 /256), which is more inline with the memory size. I've done a repeated brk test of 800KB (from will-it-scale test suite) with 80 concurrent processes on a 4 socket Westmere machine with a total of 40 cores. Without the patch, about 80% of cpu is spent on spin-lock contention within the vm_committed_as counter. With the patch, there's a 73x speedup on the benchmark and the lock contention drops off almost entirely. [akpm@linux-foundation.org: fix section mismatch] Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com> Cc: Tejun Heo <tj@kernel.org> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
90 lines
2.2 KiB
C
90 lines
2.2 KiB
C
#ifndef _LINUX_MMAN_H
|
|
#define _LINUX_MMAN_H
|
|
|
|
#include <linux/mm.h>
|
|
#include <linux/percpu_counter.h>
|
|
|
|
#include <linux/atomic.h>
|
|
#include <uapi/linux/mman.h>
|
|
|
|
extern int sysctl_overcommit_memory;
|
|
extern int sysctl_overcommit_ratio;
|
|
extern struct percpu_counter vm_committed_as;
|
|
|
|
#ifdef CONFIG_SMP
|
|
extern s32 vm_committed_as_batch;
|
|
#else
|
|
#define vm_committed_as_batch 0
|
|
#endif
|
|
|
|
unsigned long vm_memory_committed(void);
|
|
|
|
static inline void vm_acct_memory(long pages)
|
|
{
|
|
__percpu_counter_add(&vm_committed_as, pages, vm_committed_as_batch);
|
|
}
|
|
|
|
static inline void vm_unacct_memory(long pages)
|
|
{
|
|
vm_acct_memory(-pages);
|
|
}
|
|
|
|
/*
|
|
* Allow architectures to handle additional protection bits
|
|
*/
|
|
|
|
#ifndef arch_calc_vm_prot_bits
|
|
#define arch_calc_vm_prot_bits(prot) 0
|
|
#endif
|
|
|
|
#ifndef arch_vm_get_page_prot
|
|
#define arch_vm_get_page_prot(vm_flags) __pgprot(0)
|
|
#endif
|
|
|
|
#ifndef arch_validate_prot
|
|
/*
|
|
* This is called from mprotect(). PROT_GROWSDOWN and PROT_GROWSUP have
|
|
* already been masked out.
|
|
*
|
|
* Returns true if the prot flags are valid
|
|
*/
|
|
static inline int arch_validate_prot(unsigned long prot)
|
|
{
|
|
return (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM)) == 0;
|
|
}
|
|
#define arch_validate_prot arch_validate_prot
|
|
#endif
|
|
|
|
/*
|
|
* Optimisation macro. It is equivalent to:
|
|
* (x & bit1) ? bit2 : 0
|
|
* but this version is faster.
|
|
* ("bit1" and "bit2" must be single bits)
|
|
*/
|
|
#define _calc_vm_trans(x, bit1, bit2) \
|
|
((bit1) <= (bit2) ? ((x) & (bit1)) * ((bit2) / (bit1)) \
|
|
: ((x) & (bit1)) / ((bit1) / (bit2)))
|
|
|
|
/*
|
|
* Combine the mmap "prot" argument into "vm_flags" used internally.
|
|
*/
|
|
static inline unsigned long
|
|
calc_vm_prot_bits(unsigned long prot)
|
|
{
|
|
return _calc_vm_trans(prot, PROT_READ, VM_READ ) |
|
|
_calc_vm_trans(prot, PROT_WRITE, VM_WRITE) |
|
|
_calc_vm_trans(prot, PROT_EXEC, VM_EXEC) |
|
|
arch_calc_vm_prot_bits(prot);
|
|
}
|
|
|
|
/*
|
|
* Combine the mmap "flags" argument into "vm_flags" used internally.
|
|
*/
|
|
static inline unsigned long
|
|
calc_vm_flag_bits(unsigned long flags)
|
|
{
|
|
return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
|
|
_calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) |
|
|
_calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED );
|
|
}
|
|
#endif /* _LINUX_MMAN_H */
|