[PATCH] pi-futex: scheduler support for pi
Add a framework to boost/unboost the priority of RT tasks. This consists of:

 - caching the 'normal' priority in ->normal_prio
 - providing functions to set/get the priority of the task
 - making sched_setscheduler() aware of boosting

The effective_prio() cleanups also fix a priority-calculation bug in
set_user_nice(), pointed out by Andrey Gelman.

has_rt_policy() fix: Peter Williams <pwil3058@bigpond.net.au>

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Andrey Gelman <agelman@012.net.il>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
commit b29739f902
parent 77ba89c5cf

3 changed files with 181 additions and 31 deletions
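
Editorial note (not part of the commit): the sketch below is a minimal userspace model of the bookkeeping this patch introduces, assuming the kernel-internal priority scale (0..139, lower value means higher priority, MAX_RT_PRIO = 100). The struct and the toy_* helpers are illustrative stand-ins for task_struct, normal_prio() and rt_mutex_setprio(), not kernel code: ->normal_prio caches the un-boosted priority, a boost only overrides ->prio, and unboosting restores the cached value.

#include <stdio.h>

#define MAX_RT_PRIO 100

struct toy_task {
        int rt_priority;   /* user-visible RT priority, 0 if SCHED_NORMAL */
        int static_prio;   /* nice-derived priority, 100..139 */
        int normal_prio;   /* priority without PI boosting */
        int prio;          /* effective priority seen by the scheduler */
};

/* kernel-internal priority ignoring any PI boost */
static int toy_normal_prio(const struct toy_task *p)
{
        if (p->rt_priority)                      /* has an RT policy */
                return MAX_RT_PRIO - 1 - p->rt_priority;
        return p->static_prio;                   /* SCHED_NORMAL/SCHED_BATCH */
}

/* what rt_mutex_setprio() does: change only the effective priority */
static void toy_setprio(struct toy_task *p, int prio)
{
        p->prio = prio;
}

int main(void)
{
        /* a nice-0 SCHED_NORMAL task: static_prio = 120 */
        struct toy_task t = { .rt_priority = 0, .static_prio = 120 };

        t.normal_prio = t.prio = toy_normal_prio(&t);
        printf("before boost: prio=%d normal_prio=%d\n", t.prio, t.normal_prio);

        /* an RT waiter with rt_priority 50 blocks on a PI mutex t holds:
         * boost t to the waiter's effective priority (99 - 50 = 49) */
        toy_setprio(&t, MAX_RT_PRIO - 1 - 50);
        printf("boosted:      prio=%d normal_prio=%d\n", t.prio, t.normal_prio);

        /* unboost: restore the cached normal priority */
        toy_setprio(&t, t.normal_prio);
        printf("unboosted:    prio=%d normal_prio=%d\n", t.prio, t.normal_prio);
        return 0;
}
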
include/linux/init_task.h

@@ -87,6 +87,7 @@ extern struct group_info init_groups;
         .lock_depth     = -1,                           \
         .prio           = MAX_PRIO-20,                  \
         .static_prio    = MAX_PRIO-20,                  \
+        .normal_prio    = MAX_PRIO-20,                  \
         .policy         = SCHED_NORMAL,                 \
         .cpus_allowed   = CPU_MASK_ALL,                 \
         .mm             = NULL,                         \
@@ -122,6 +123,7 @@ extern struct group_info init_groups;
         .journal_info   = NULL,                         \
         .cpu_timers     = INIT_CPU_TIMERS(tsk.cpu_timers),      \
         .fs_excl        = ATOMIC_INIT(0),               \
+        .pi_lock        = SPIN_LOCK_UNLOCKED,           \
 }
include/linux/sched.h

@@ -495,8 +495,11 @@ struct signal_struct {
 
 #define MAX_PRIO                (MAX_RT_PRIO + 40)
 
-#define rt_task(p)              (unlikely((p)->prio < MAX_RT_PRIO))
+#define rt_prio(prio)           unlikely((prio) < MAX_RT_PRIO)
+#define rt_task(p)              rt_prio((p)->prio)
 #define batch_task(p)           (unlikely((p)->policy == SCHED_BATCH))
+#define has_rt_policy(p) \
+        unlikely((p)->policy != SCHED_NORMAL && (p)->policy != SCHED_BATCH)
 
 /*
  * Some day this will be a full-fledged user tracking system..
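
The point of splitting the old rt_task() into rt_prio()/rt_task() and adding has_rt_policy() is that a PI-boosted SCHED_NORMAL task temporarily has an RT priority but must still be treated according to its policy; later hunks switch set_load_weight() and set_user_nice() from rt_task() to has_rt_policy() for exactly this reason. A small userspace model (the mini_task struct and the policy constants are simplified stand-ins, not the kernel's definitions):

#include <stdio.h>
#include <stdbool.h>

#define MAX_RT_PRIO   100
#define SCHED_NORMAL  0
#define SCHED_BATCH   3

struct mini_task { int prio; int policy; };

static bool rt_prio(int prio)                   { return prio < MAX_RT_PRIO; }
static bool rt_task(const struct mini_task *p)  { return rt_prio(p->prio); }
static bool has_rt_policy(const struct mini_task *p)
{
        return p->policy != SCHED_NORMAL && p->policy != SCHED_BATCH;
}

int main(void)
{
        /* a SCHED_NORMAL task PI-boosted into the RT priority range */
        struct mini_task boosted = { .prio = 49, .policy = SCHED_NORMAL };

        /* prints "rt_task: 1  has_rt_policy: 0" */
        printf("rt_task: %d  has_rt_policy: %d\n",
               rt_task(&boosted), has_rt_policy(&boosted));
        return 0;
}
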
@@ -725,7 +728,7 @@ struct task_struct {
 #endif
 #endif
         int load_weight;        /* for niceness load balancing purposes */
-        int prio, static_prio;
+        int prio, static_prio, normal_prio;
         struct list_head run_list;
         prio_array_t *array;
@@ -852,6 +855,9 @@ struct task_struct {
         /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
         spinlock_t alloc_lock;
 
+        /* Protection of the PI data structures: */
+        spinlock_t pi_lock;
+
 #ifdef CONFIG_DEBUG_MUTEXES
         /* mutex deadlock detection */
         struct mutex_waiter *blocked_on;
@@ -1018,6 +1024,17 @@ static inline void idle_task_exit(void) {}
 #endif
 
 extern void sched_idle_next(void);
 
+#ifdef CONFIG_RT_MUTEXES
+extern int rt_mutex_getprio(task_t *p);
+extern void rt_mutex_setprio(task_t *p, int prio);
+#else
+static inline int rt_mutex_getprio(task_t *p)
+{
+        return p->normal_prio;
+}
+#endif
+
 extern void set_user_nice(task_t *p, long nice);
 extern int task_prio(const task_t *p);
 extern int task_nice(const task_t *p);
kernel/sched.c (189 lines changed)

@@ -354,6 +354,25 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
 }
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
+/*
+ * __task_rq_lock - lock the runqueue a given task resides on.
+ * Must be called interrupts disabled.
+ */
+static inline runqueue_t *__task_rq_lock(task_t *p)
+        __acquires(rq->lock)
+{
+        struct runqueue *rq;
+
+repeat_lock_task:
+        rq = task_rq(p);
+        spin_lock(&rq->lock);
+        if (unlikely(rq != task_rq(p))) {
+                spin_unlock(&rq->lock);
+                goto repeat_lock_task;
+        }
+        return rq;
+}
+
 /*
  * task_rq_lock - lock the runqueue a given task resides on and disable
  * interrupts. Note the ordering: we can safely lookup the task_rq without
@@ -375,6 +394,12 @@ static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
         return rq;
 }
 
+static inline void __task_rq_unlock(runqueue_t *rq)
+        __releases(rq->lock)
+{
+        spin_unlock(&rq->lock);
+}
+
 static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
         __releases(rq->lock)
 {
@@ -638,7 +663,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
 }
 
 /*
- * effective_prio - return the priority that is based on the static
+ * __normal_prio - return the priority that is based on the static
  * priority but is modified by bonuses/penalties.
  *
  * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
@@ -651,13 +676,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
  *
  * Both properties are important to certain workloads.
  */
-static int effective_prio(task_t *p)
+
+static inline int __normal_prio(task_t *p)
 {
         int bonus, prio;
 
-        if (rt_task(p))
-                return p->prio;
-
         bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
 
         prio = p->static_prio - bonus;
@@ -692,7 +715,7 @@ static int effective_prio(task_t *p)
 
 static void set_load_weight(task_t *p)
 {
-        if (rt_task(p)) {
+        if (has_rt_policy(p)) {
 #ifdef CONFIG_SMP
                 if (p == task_rq(p)->migration_thread)
                         /*
@@ -730,6 +753,44 @@ static inline void dec_nr_running(task_t *p, runqueue_t *rq)
         dec_raw_weighted_load(rq, p);
 }
 
+/*
+ * Calculate the expected normal priority: i.e. priority
+ * without taking RT-inheritance into account. Might be
+ * boosted by interactivity modifiers. Changes upon fork,
+ * setprio syscalls, and whenever the interactivity
+ * estimator recalculates.
+ */
+static inline int normal_prio(task_t *p)
+{
+        int prio;
+
+        if (has_rt_policy(p))
+                prio = MAX_RT_PRIO-1 - p->rt_priority;
+        else
+                prio = __normal_prio(p);
+        return prio;
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks, or might be boosted by
+ * interactivity modifiers. Will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
+ */
+static int effective_prio(task_t *p)
+{
+        p->normal_prio = normal_prio(p);
+        /*
+         * If we are RT tasks or we were boosted to RT priority,
+         * keep the priority unchanged. Otherwise, update priority
+         * to the normal priority:
+         */
+        if (!rt_prio(p->prio))
+                return p->normal_prio;
+        return p->prio;
+}
+
 /*
  * __activate_task - move a task to the runqueue.
  */
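
A hedged worked example of the priority arithmetic used by normal_prio() above: an RT task's kernel-internal priority is MAX_RT_PRIO-1 - rt_priority, while a SCHED_NORMAL/SCHED_BATCH task starts from its nice-derived static priority (mirroring the kernel's NICE_TO_PRIO()) and is then adjusted by the interactivity bonus in __normal_prio(), which this snippet leaves out:

#include <stdio.h>

#define MAX_RT_PRIO 100

int main(void)
{
        int rt_priority = 50;                              /* SCHED_FIFO, RT prio 50 */
        int nice        = -5;                              /* SCHED_NORMAL task      */

        int rt_normal   = MAX_RT_PRIO - 1 - rt_priority;   /* 99 - 50 = 49           */
        int nice_static = 120 + nice;                      /* NICE_TO_PRIO(-5) = 115 */

        printf("RT task  normal_prio = %d\n", rt_normal);
        printf("nice -5  static_prio = %d (then +/- interactivity bonus)\n",
               nice_static);
        return 0;
}
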
@@ -752,6 +813,10 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
         inc_nr_running(p, rq);
 }
 
+/*
+ * Recalculate p->normal_prio and p->prio after having slept,
+ * updating the sleep-average too:
+ */
 static int recalc_task_prio(task_t *p, unsigned long long now)
 {
         /* Caller must always ensure 'now >= p->timestamp' */
@@ -1448,6 +1513,12 @@ void fastcall sched_fork(task_t *p, int clone_flags)
          * event cannot wake it up and insert it on the runqueue either.
          */
         p->state = TASK_RUNNING;
+
+        /*
+         * Make sure we do not leak PI boosting priority to the child:
+         */
+        p->prio = current->normal_prio;
+
         INIT_LIST_HEAD(&p->run_list);
         p->array = NULL;
 #ifdef CONFIG_SCHEDSTATS
@@ -1527,6 +1598,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
                 __activate_task(p, rq);
         else {
                 p->prio = current->prio;
+                p->normal_prio = current->normal_prio;
                 list_add_tail(&p->run_list, &current->run_list);
                 p->array = current->array;
                 p->array->nr_active++;
@@ -3668,12 +3740,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
 
 EXPORT_SYMBOL(sleep_on_timeout);
 
+#ifdef CONFIG_RT_MUTEXES
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance logic.
+ */
+void rt_mutex_setprio(task_t *p, int prio)
+{
+        unsigned long flags;
+        prio_array_t *array;
+        runqueue_t *rq;
+        int oldprio;
+
+        BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+        rq = task_rq_lock(p, &flags);
+
+        oldprio = p->prio;
+        array = p->array;
+        if (array)
+                dequeue_task(p, array);
+        p->prio = prio;
+
+        if (array) {
+                /*
+                 * If changing to an RT priority then queue it
+                 * in the active array!
+                 */
+                if (rt_task(p))
+                        array = rq->active;
+                enqueue_task(p, array);
+                /*
+                 * Reschedule if we are currently running on this runqueue and
+                 * our priority decreased, or if we are not currently running on
+                 * this runqueue and our priority is higher than the current's
+                 */
+                if (task_running(rq, p)) {
+                        if (p->prio > oldprio)
+                                resched_task(rq->curr);
+                } else if (TASK_PREEMPTS_CURR(p, rq))
+                        resched_task(rq->curr);
+        }
+        task_rq_unlock(rq, &flags);
+}
+
+#endif
+
 void set_user_nice(task_t *p, long nice)
 {
         unsigned long flags;
         prio_array_t *array;
         runqueue_t *rq;
-        int old_prio, new_prio, delta;
+        int old_prio, delta;
 
         if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
                 return;
@@ -3688,7 +3813,7 @@ void set_user_nice(task_t *p, long nice)
          * it wont have any effect on scheduling until the task is
          * not SCHED_NORMAL/SCHED_BATCH:
          */
-        if (rt_task(p)) {
+        if (has_rt_policy(p)) {
                 p->static_prio = NICE_TO_PRIO(nice);
                 goto out_unlock;
         }
@@ -3698,12 +3823,11 @@ void set_user_nice(task_t *p, long nice)
                 dec_raw_weighted_load(rq, p);
         }
 
-        old_prio = p->prio;
-        new_prio = NICE_TO_PRIO(nice);
-        delta = new_prio - old_prio;
         p->static_prio = NICE_TO_PRIO(nice);
         set_load_weight(p);
-        p->prio += delta;
+        old_prio = p->prio;
+        p->prio = effective_prio(p);
+        delta = p->prio - old_prio;
 
         if (array) {
                 enqueue_task(p, array);
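
After this hunk, set_user_nice() recomputes the effective priority from scratch via effective_prio() instead of shifting it by a delta, so a priority that was PI-boosted into the RT range is left untouched and the delta is only used for the reschedule decision. A small model of the new rule (the helper condenses the effective_prio() logic added earlier in this patch and ignores the interactivity bonus; the numbers are illustrative):

#include <stdio.h>

#define MAX_RT_PRIO         100
#define NICE_TO_PRIO(nice)  (MAX_RT_PRIO + (nice) + 20)

/* the rule effective_prio() now applies: keep an RT boost, otherwise
 * follow the recomputed normal priority */
static int new_prio(int cur_prio, int normal_prio)
{
        return cur_prio < MAX_RT_PRIO ? cur_prio : normal_prio;
}

int main(void)
{
        int boosted_prio = 49;                  /* PI-boosted into the RT range */
        int plain_prio   = 120;                 /* ordinary nice-0 task         */
        int renice       = NICE_TO_PRIO(10);    /* new normal priority: 130     */

        printf("boosted task keeps prio %d\n", new_prio(boosted_prio, renice));
        printf("plain task moves to prio %d\n", new_prio(plain_prio, renice));
        return 0;
}
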
@@ -3718,7 +3842,6 @@ void set_user_nice(task_t *p, long nice)
 out_unlock:
         task_rq_unlock(rq, &flags);
 }
-
 EXPORT_SYMBOL(set_user_nice);
 
 /*
@@ -3833,16 +3956,14 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
         BUG_ON(p->array);
         p->policy = policy;
         p->rt_priority = prio;
-        if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
-                p->prio = MAX_RT_PRIO-1 - p->rt_priority;
-        } else {
-                p->prio = p->static_prio;
-                /*
-                 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
-                 */
-                if (policy == SCHED_BATCH)
-                        p->sleep_avg = 0;
-        }
+        p->normal_prio = normal_prio(p);
+        /* we are holding p->pi_lock already */
+        p->prio = rt_mutex_getprio(p);
+        /*
+         * SCHED_BATCH tasks are treated as perpetual CPU hogs:
+         */
+        if (policy == SCHED_BATCH)
+                p->sleep_avg = 0;
         set_load_weight(p);
 }
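
Likewise, __setscheduler() now derives ->normal_prio from the new policy/nice values but takes ->prio from rt_mutex_getprio(), so an active PI boost survives a sched_setscheduler() call. A hedged model of that behaviour (rt_getprio_model() is a stand-in for rt_mutex_getprio(); in the real kernel that function consults the task's top PI waiter under p->pi_lock):

#include <stdio.h>

struct toy_task { int normal_prio; int boost_prio; /* -1 if not boosted */ };

static int rt_getprio_model(const struct toy_task *p)
{
        if (p->boost_prio >= 0 && p->boost_prio < p->normal_prio)
                return p->boost_prio;           /* keep the higher priority */
        return p->normal_prio;
}

int main(void)
{
        /* task boosted to RT prio 49 while holding a PI mutex */
        struct toy_task t = { .normal_prio = 120, .boost_prio = 49 };

        /* sched_setscheduler(SCHED_NORMAL, nice 5): normal_prio changes ... */
        t.normal_prio = 125;
        /* ... but the effective priority still reflects the boost (49) */
        printf("prio after setscheduler: %d\n", rt_getprio_model(&t));
        return 0;
}
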
@@ -3911,15 +4032,21 @@ int sched_setscheduler(struct task_struct *p, int policy,
         retval = security_task_setscheduler(p, policy, param);
         if (retval)
                 return retval;
+        /*
+         * make sure no PI-waiters arrive (or leave) while we are
+         * changing the priority of the task:
+         */
+        spin_lock_irqsave(&p->pi_lock, flags);
         /*
          * To be able to change p->policy safely, the apropriate
          * runqueue lock must be held.
          */
-        rq = task_rq_lock(p, &flags);
+        rq = __task_rq_lock(p);
         /* recheck policy now with rq lock held */
         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                 policy = oldpolicy = -1;
-                task_rq_unlock(rq, &flags);
+                __task_rq_unlock(rq);
+                spin_unlock_irqrestore(&p->pi_lock, flags);
                 goto recheck;
         }
         array = p->array;
@@ -3940,7 +4067,9 @@ int sched_setscheduler(struct task_struct *p, int policy,
                 } else if (TASK_PREEMPTS_CURR(p, rq))
                         resched_task(rq->curr);
         }
-        task_rq_unlock(rq, &flags);
+        __task_rq_unlock(rq);
+        spin_unlock_irqrestore(&p->pi_lock, flags);
 
         return 0;
 }
 EXPORT_SYMBOL_GPL(sched_setscheduler);
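
The switch from task_rq_lock()/task_rq_unlock() to the new __task_rq_lock()/__task_rq_unlock() helpers here (and in normalize_rt_tasks() below) exists because p->pi_lock is now taken first with spin_lock_irqsave(), so interrupts are already disabled when the runqueue lock is acquired, and the two locks are released in reverse order. A userspace sketch of that ordering, with pthread spinlocks standing in for the kernel spinlocks (build with -pthread):

#include <pthread.h>
#include <stdio.h>

static pthread_spinlock_t pi_lock, rq_lock;

static void change_priority(void)
{
        pthread_spin_lock(&pi_lock);    /* spin_lock_irqsave(&p->pi_lock, flags)   */
        pthread_spin_lock(&rq_lock);    /* __task_rq_lock(p)                       */

        /* ... recompute normal_prio / prio, requeue the task ...                  */

        pthread_spin_unlock(&rq_lock);  /* __task_rq_unlock(rq)                    */
        pthread_spin_unlock(&pi_lock);  /* spin_unlock_irqrestore(&p->pi_lock, fl) */
}

int main(void)
{
        pthread_spin_init(&pi_lock, PTHREAD_PROCESS_PRIVATE);
        pthread_spin_init(&rq_lock, PTHREAD_PROCESS_PRIVATE);
        change_priority();
        puts("priority change done under pi_lock -> rq_lock ordering");
        return 0;
}
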
@@ -4575,7 +4704,7 @@ void __devinit init_idle(task_t *idle, int cpu)
         idle->timestamp = sched_clock();
         idle->sleep_avg = 0;
         idle->array = NULL;
-        idle->prio = MAX_PRIO;
+        idle->prio = idle->normal_prio = MAX_PRIO;
         idle->state = TASK_RUNNING;
         idle->cpus_allowed = cpumask_of_cpu(cpu);
         set_task_cpu(idle, cpu);
@@ -6582,7 +6711,8 @@ void normalize_rt_tasks(void)
                 if (!rt_task(p))
                         continue;
 
-                rq = task_rq_lock(p, &flags);
+                spin_lock_irqsave(&p->pi_lock, flags);
+                rq = __task_rq_lock(p);
 
                 array = p->array;
                 if (array)
@@ -6593,7 +6723,8 @@ void normalize_rt_tasks(void)
                         resched_task(rq->curr);
                 }
 
-                task_rq_unlock(rq, &flags);
+                __task_rq_unlock(rq);
+                spin_unlock_irqrestore(&p->pi_lock, flags);
         }
         read_unlock_irq(&tasklist_lock);
 }