sched: fix the theoretical signal_wake_up() vs schedule() race
This is only theoretical, but after try_to_wake_up(p) was changed to check p->state under p->pi_lock, code like __set_current_state(TASK_INTERRUPTIBLE); schedule(); can miss a signal. This is a special case of wait-for-condition: it relies on the try_to_wake_up/schedule interaction and thus does not need mb() between __set_current_state() and if(signal_pending). However, this __set_current_state() can move into the critical section protected by rq->lock; now that try_to_wake_up() takes another lock, we need to ensure that it can't be reordered with the "if (signal_pending(current))" check inside that section. The patch is actually a one-liner: it simply adds smp_wmb() before spin_lock_irq(rq->lock). This is what try_to_wake_up() already does for the same reason. We turn this wmb() into the new helper, smp_mb__before_spinlock(), for better documentation and to allow the architectures to change the default implementation. While at it, kill smp_mb__after_lock(); it has no callers. Perhaps we can also add smp_mb__before/after_spinunlock() for prepare_to_wait(). Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
parent
584d88b2cd
commit
e0acd0a68e
3 changed files with 24 additions and 8 deletions
|
@ -233,8 +233,4 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
|
||||||
#define arch_read_relax(lock) cpu_relax()
|
#define arch_read_relax(lock) cpu_relax()
|
||||||
#define arch_write_relax(lock) cpu_relax()
|
#define arch_write_relax(lock) cpu_relax()
|
||||||
|
|
||||||
/* The {read|write|spin}_lock() on x86 are full memory barriers. */
|
|
||||||
static inline void smp_mb__after_lock(void) { }
|
|
||||||
#define ARCH_HAS_SMP_MB_AFTER_LOCK
|
|
||||||
|
|
||||||
#endif /* _ASM_X86_SPINLOCK_H */
|
#endif /* _ASM_X86_SPINLOCK_H */
|
||||||
|
|
|
@ -117,9 +117,17 @@ do { \
|
||||||
#endif /*arch_spin_is_contended*/
|
#endif /*arch_spin_is_contended*/
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* The lock does not imply full memory barrier. */
|
/*
|
||||||
#ifndef ARCH_HAS_SMP_MB_AFTER_LOCK
|
 * Despite its name it doesn't necessarily have to be a full barrier.
|
||||||
static inline void smp_mb__after_lock(void) { smp_mb(); }
|
* It should only guarantee that a STORE before the critical section
|
||||||
|
* can not be reordered with a LOAD inside this section.
|
||||||
|
 * spin_lock() is a one-way barrier, so this LOAD can not escape out
|
||||||
|
* of the region. So the default implementation simply ensures that
|
||||||
|
* a STORE can not move into the critical section, smp_wmb() should
|
||||||
|
* serialize it with another STORE done by spin_lock().
|
||||||
|
*/
|
||||||
|
#ifndef smp_mb__before_spinlock
|
||||||
|
#define smp_mb__before_spinlock() smp_wmb()
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -1491,7 +1491,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
|
||||||
unsigned long flags;
|
unsigned long flags;
|
||||||
int cpu, success = 0;
|
int cpu, success = 0;
|
||||||
|
|
||||||
smp_wmb();
|
/*
|
||||||
|
* If we are going to wake up a thread waiting for CONDITION we
|
||||||
|
* need to ensure that CONDITION=1 done by the caller can not be
|
||||||
|
* reordered with p->state check below. This pairs with mb() in
|
||||||
|
* set_current_state() the waiting thread does.
|
||||||
|
*/
|
||||||
|
smp_mb__before_spinlock();
|
||||||
raw_spin_lock_irqsave(&p->pi_lock, flags);
|
raw_spin_lock_irqsave(&p->pi_lock, flags);
|
||||||
if (!(p->state & state))
|
if (!(p->state & state))
|
||||||
goto out;
|
goto out;
|
||||||
|
@ -2394,6 +2400,12 @@ static void __sched __schedule(void)
|
||||||
if (sched_feat(HRTICK))
|
if (sched_feat(HRTICK))
|
||||||
hrtick_clear(rq);
|
hrtick_clear(rq);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Make sure that signal_pending_state()->signal_pending() below
|
||||||
|
* can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
|
||||||
|
* done by the caller to avoid the race with signal_wake_up().
|
||||||
|
*/
|
||||||
|
smp_mb__before_spinlock();
|
||||||
raw_spin_lock_irq(&rq->lock);
|
raw_spin_lock_irq(&rq->lock);
|
||||||
|
|
||||||
switch_count = &prev->nivcsw;
|
switch_count = &prev->nivcsw;
|
||||||
|
|
Loading…
Reference in a new issue