BKL: revert back to the old spinlock implementation
The generic semaphore rewrite had a huge performance regression on AIM7
(and potentially other BKL-heavy benchmarks) because the generic
semaphores had been rewritten to be simple to understand and fair. The
latter, in particular, turns a semaphore-based BKL implementation into a
mess of scheduling.
The attempt to fix the performance regression failed miserably (see the
previous commit 00b41ec261
'Revert
"semaphore: fix"'), and so for now the simple and sane approach is to
instead just go back to the old spinlock-based BKL implementation that
never had any issues like this.
This patch also has the advantage of being reported to fix the
regression completely according to Yanmin Zhang, unlike the semaphore
hack which still left a couple percentage point regression.
As a spinlock, the BKL obviously has the potential to be a latency
issue, but it's not really any different from any other spinlock in that
respect. We do want to get rid of the BKL asap, but that has been the
plan for several years.
These days, the biggest users are in the tty layer (open/release in
particular) and Alan holds out some hope:
"tty release is probably a few months away from getting cured - I'm
afraid it will almost certainly be the very last user of the BKL in
tty to get fixed as it depends on everything else being sanely locked."
so while we're not there yet, we do have a plan of action.
Tested-by: Yanmin Zhang <yanmin_zhang@linux.intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Alexander Viro <viro@ftp.linux.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
parent
00b41ec261
commit
8e3e076c5a
4 changed files with 98 additions and 84 deletions
|
@ -186,17 +186,6 @@ config PREEMPT
|
||||||
Say Y here if you are building a kernel for a desktop, embedded
|
Say Y here if you are building a kernel for a desktop, embedded
|
||||||
or real-time system. Say N if you are unsure.
|
or real-time system. Say N if you are unsure.
|
||||||
|
|
||||||
config PREEMPT_BKL
|
|
||||||
bool "Preempt The Big Kernel Lock"
|
|
||||||
depends on PREEMPT
|
|
||||||
default y
|
|
||||||
help
|
|
||||||
This option reduces the latency of the kernel by making the
|
|
||||||
big kernel lock preemptible.
|
|
||||||
|
|
||||||
Say Y here if you are building a kernel for a desktop system.
|
|
||||||
Say N if you are unsure.
|
|
||||||
|
|
||||||
config MN10300_CURRENT_IN_E2
|
config MN10300_CURRENT_IN_E2
|
||||||
bool "Hold current task address in E2 register"
|
bool "Hold current task address in E2 register"
|
||||||
default y
|
default y
|
||||||
|
|
|
@ -72,6 +72,14 @@
|
||||||
#define in_softirq() (softirq_count())
|
#define in_softirq() (softirq_count())
|
||||||
#define in_interrupt() (irq_count())
|
#define in_interrupt() (irq_count())
|
||||||
|
|
||||||
|
#if defined(CONFIG_PREEMPT)
|
||||||
|
# define PREEMPT_INATOMIC_BASE kernel_locked()
|
||||||
|
# define PREEMPT_CHECK_OFFSET 1
|
||||||
|
#else
|
||||||
|
# define PREEMPT_INATOMIC_BASE 0
|
||||||
|
# define PREEMPT_CHECK_OFFSET 0
|
||||||
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Are we running in atomic context? WARNING: this macro cannot
|
* Are we running in atomic context? WARNING: this macro cannot
|
||||||
* always detect atomic context; in particular, it cannot know about
|
* always detect atomic context; in particular, it cannot know about
|
||||||
|
@ -79,17 +87,11 @@
|
||||||
* used in the general case to determine whether sleeping is possible.
|
* used in the general case to determine whether sleeping is possible.
|
||||||
* Do not use in_atomic() in driver code.
|
* Do not use in_atomic() in driver code.
|
||||||
*/
|
*/
|
||||||
#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0)
|
#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_INATOMIC_BASE)
|
||||||
|
|
||||||
#ifdef CONFIG_PREEMPT
|
|
||||||
# define PREEMPT_CHECK_OFFSET 1
|
|
||||||
#else
|
|
||||||
# define PREEMPT_CHECK_OFFSET 0
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Check whether we were atomic before we did preempt_disable():
|
* Check whether we were atomic before we did preempt_disable():
|
||||||
* (used by the scheduler)
|
* (used by the scheduler, *after* releasing the kernel lock)
|
||||||
*/
|
*/
|
||||||
#define in_atomic_preempt_off() \
|
#define in_atomic_preempt_off() \
|
||||||
((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
|
((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
|
||||||
|
|
|
@ -4567,8 +4567,6 @@ EXPORT_SYMBOL(schedule);
|
||||||
asmlinkage void __sched preempt_schedule(void)
|
asmlinkage void __sched preempt_schedule(void)
|
||||||
{
|
{
|
||||||
struct thread_info *ti = current_thread_info();
|
struct thread_info *ti = current_thread_info();
|
||||||
struct task_struct *task = current;
|
|
||||||
int saved_lock_depth;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If there is a non-zero preempt_count or interrupts are disabled,
|
* If there is a non-zero preempt_count or interrupts are disabled,
|
||||||
|
@ -4579,16 +4577,7 @@ asmlinkage void __sched preempt_schedule(void)
|
||||||
|
|
||||||
do {
|
do {
|
||||||
add_preempt_count(PREEMPT_ACTIVE);
|
add_preempt_count(PREEMPT_ACTIVE);
|
||||||
|
|
||||||
/*
|
|
||||||
* We keep the big kernel semaphore locked, but we
|
|
||||||
* clear ->lock_depth so that schedule() doesnt
|
|
||||||
* auto-release the semaphore:
|
|
||||||
*/
|
|
||||||
saved_lock_depth = task->lock_depth;
|
|
||||||
task->lock_depth = -1;
|
|
||||||
schedule();
|
schedule();
|
||||||
task->lock_depth = saved_lock_depth;
|
|
||||||
sub_preempt_count(PREEMPT_ACTIVE);
|
sub_preempt_count(PREEMPT_ACTIVE);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -4609,26 +4598,15 @@ EXPORT_SYMBOL(preempt_schedule);
|
||||||
asmlinkage void __sched preempt_schedule_irq(void)
|
asmlinkage void __sched preempt_schedule_irq(void)
|
||||||
{
|
{
|
||||||
struct thread_info *ti = current_thread_info();
|
struct thread_info *ti = current_thread_info();
|
||||||
struct task_struct *task = current;
|
|
||||||
int saved_lock_depth;
|
|
||||||
|
|
||||||
/* Catch callers which need to be fixed */
|
/* Catch callers which need to be fixed */
|
||||||
BUG_ON(ti->preempt_count || !irqs_disabled());
|
BUG_ON(ti->preempt_count || !irqs_disabled());
|
||||||
|
|
||||||
do {
|
do {
|
||||||
add_preempt_count(PREEMPT_ACTIVE);
|
add_preempt_count(PREEMPT_ACTIVE);
|
||||||
|
|
||||||
/*
|
|
||||||
* We keep the big kernel semaphore locked, but we
|
|
||||||
* clear ->lock_depth so that schedule() doesnt
|
|
||||||
* auto-release the semaphore:
|
|
||||||
*/
|
|
||||||
saved_lock_depth = task->lock_depth;
|
|
||||||
task->lock_depth = -1;
|
|
||||||
local_irq_enable();
|
local_irq_enable();
|
||||||
schedule();
|
schedule();
|
||||||
local_irq_disable();
|
local_irq_disable();
|
||||||
task->lock_depth = saved_lock_depth;
|
|
||||||
sub_preempt_count(PREEMPT_ACTIVE);
|
sub_preempt_count(PREEMPT_ACTIVE);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -5853,8 +5831,11 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
|
||||||
spin_unlock_irqrestore(&rq->lock, flags);
|
spin_unlock_irqrestore(&rq->lock, flags);
|
||||||
|
|
||||||
/* Set the preempt count _outside_ the spinlocks! */
|
/* Set the preempt count _outside_ the spinlocks! */
|
||||||
|
#if defined(CONFIG_PREEMPT)
|
||||||
|
task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
|
||||||
|
#else
|
||||||
task_thread_info(idle)->preempt_count = 0;
|
task_thread_info(idle)->preempt_count = 0;
|
||||||
|
#endif
|
||||||
/*
|
/*
|
||||||
* The idle tasks have their own, simple scheduling class:
|
* The idle tasks have their own, simple scheduling class:
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -11,79 +11,121 @@
|
||||||
#include <linux/semaphore.h>
|
#include <linux/semaphore.h>
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The 'big kernel semaphore'
|
* The 'big kernel lock'
|
||||||
*
|
*
|
||||||
* This mutex is taken and released recursively by lock_kernel()
|
* This spinlock is taken and released recursively by lock_kernel()
|
||||||
* and unlock_kernel(). It is transparently dropped and reacquired
|
* and unlock_kernel(). It is transparently dropped and reacquired
|
||||||
* over schedule(). It is used to protect legacy code that hasn't
|
* over schedule(). It is used to protect legacy code that hasn't
|
||||||
* been migrated to a proper locking design yet.
|
* been migrated to a proper locking design yet.
|
||||||
*
|
*
|
||||||
* Note: code locked by this semaphore will only be serialized against
|
|
||||||
* other code using the same locking facility. The code guarantees that
|
|
||||||
* the task remains on the same CPU.
|
|
||||||
*
|
|
||||||
* Don't use in new code.
|
* Don't use in new code.
|
||||||
*/
|
*/
|
||||||
static DECLARE_MUTEX(kernel_sem);
|
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(kernel_flag);
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Re-acquire the kernel semaphore.
|
* Acquire/release the underlying lock from the scheduler.
|
||||||
*
|
*
|
||||||
* This function is called with preemption off.
|
* This is called with preemption disabled, and should
|
||||||
|
* return an error value if it cannot get the lock and
|
||||||
|
* TIF_NEED_RESCHED gets set.
|
||||||
*
|
*
|
||||||
* We are executing in schedule() so the code must be extremely careful
|
* If it successfully gets the lock, it should increment
|
||||||
* about recursion, both due to the down() and due to the enabling of
|
* the preemption count like any spinlock does.
|
||||||
* preemption. schedule() will re-check the preemption flag after
|
*
|
||||||
* reacquiring the semaphore.
|
* (This works on UP too - _raw_spin_trylock will never
|
||||||
|
* return false in that case)
|
||||||
*/
|
*/
|
||||||
int __lockfunc __reacquire_kernel_lock(void)
|
int __lockfunc __reacquire_kernel_lock(void)
|
||||||
{
|
{
|
||||||
struct task_struct *task = current;
|
while (!_raw_spin_trylock(&kernel_flag)) {
|
||||||
int saved_lock_depth = task->lock_depth;
|
if (test_thread_flag(TIF_NEED_RESCHED))
|
||||||
|
return -EAGAIN;
|
||||||
BUG_ON(saved_lock_depth < 0);
|
cpu_relax();
|
||||||
|
}
|
||||||
task->lock_depth = -1;
|
|
||||||
preempt_enable_no_resched();
|
|
||||||
|
|
||||||
down(&kernel_sem);
|
|
||||||
|
|
||||||
preempt_disable();
|
preempt_disable();
|
||||||
task->lock_depth = saved_lock_depth;
|
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void __lockfunc __release_kernel_lock(void)
|
void __lockfunc __release_kernel_lock(void)
|
||||||
{
|
{
|
||||||
up(&kernel_sem);
|
_raw_spin_unlock(&kernel_flag);
|
||||||
|
preempt_enable_no_resched();
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Getting the big kernel semaphore.
|
* These are the BKL spinlocks - we try to be polite about preemption.
|
||||||
|
* If SMP is not on (ie UP preemption), this all goes away because the
|
||||||
|
* _raw_spin_trylock() will always succeed.
|
||||||
|
*/
|
||||||
|
#ifdef CONFIG_PREEMPT
|
||||||
|
static inline void __lock_kernel(void)
|
||||||
|
{
|
||||||
|
preempt_disable();
|
||||||
|
if (unlikely(!_raw_spin_trylock(&kernel_flag))) {
|
||||||
|
/*
|
||||||
|
* If preemption was disabled even before this
|
||||||
|
* was called, there's nothing we can be polite
|
||||||
|
* about - just spin.
|
||||||
|
*/
|
||||||
|
if (preempt_count() > 1) {
|
||||||
|
_raw_spin_lock(&kernel_flag);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Otherwise, let's wait for the kernel lock
|
||||||
|
* with preemption enabled..
|
||||||
|
*/
|
||||||
|
do {
|
||||||
|
preempt_enable();
|
||||||
|
while (spin_is_locked(&kernel_flag))
|
||||||
|
cpu_relax();
|
||||||
|
preempt_disable();
|
||||||
|
} while (!_raw_spin_trylock(&kernel_flag));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Non-preemption case - just get the spinlock
|
||||||
|
*/
|
||||||
|
static inline void __lock_kernel(void)
|
||||||
|
{
|
||||||
|
_raw_spin_lock(&kernel_flag);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
static inline void __unlock_kernel(void)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* the BKL is not covered by lockdep, so we open-code the
|
||||||
|
* unlocking sequence (and thus avoid the dep-chain ops):
|
||||||
|
*/
|
||||||
|
_raw_spin_unlock(&kernel_flag);
|
||||||
|
preempt_enable();
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Getting the big kernel lock.
|
||||||
|
*
|
||||||
|
* This cannot happen asynchronously, so we only need to
|
||||||
|
* worry about other CPU's.
|
||||||
*/
|
*/
|
||||||
void __lockfunc lock_kernel(void)
|
void __lockfunc lock_kernel(void)
|
||||||
{
|
{
|
||||||
struct task_struct *task = current;
|
int depth = current->lock_depth+1;
|
||||||
int depth = task->lock_depth + 1;
|
|
||||||
|
|
||||||
if (likely(!depth))
|
if (likely(!depth))
|
||||||
/*
|
__lock_kernel();
|
||||||
* No recursion worries - we set up lock_depth _after_
|
current->lock_depth = depth;
|
||||||
*/
|
|
||||||
down(&kernel_sem);
|
|
||||||
|
|
||||||
task->lock_depth = depth;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void __lockfunc unlock_kernel(void)
|
void __lockfunc unlock_kernel(void)
|
||||||
{
|
{
|
||||||
struct task_struct *task = current;
|
BUG_ON(current->lock_depth < 0);
|
||||||
|
if (likely(--current->lock_depth < 0))
|
||||||
BUG_ON(task->lock_depth < 0);
|
__unlock_kernel();
|
||||||
|
|
||||||
if (likely(--task->lock_depth < 0))
|
|
||||||
up(&kernel_sem);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
EXPORT_SYMBOL(lock_kernel);
|
EXPORT_SYMBOL(lock_kernel);
|
||||||
|
|
Loading…
Reference in a new issue