Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Thomas Gleixner:

 - The hopefully final fix for the reported race problems in
   kthread_parkme(). The previous attempt still left a hole and was
   partially wrong.

 - Plug a race in the remote tick mechanism which triggers a warning
   about updates not being done correctly. That's a false positive if
   the race condition is hit as the remote CPU is idle. Plug it by
   checking the condition again when holding the run queue lock.

 - Fix a bug in the utilization estimation of a run queue which causes
   the estimation to be 0 when a run queue is throttled.

 - Advance the global expiration of the period timer when the timer is
   restarted after an idle period. Otherwise the expiry time is stale
   and the timer fires prematurely.

 - Cure the drift between the bandwidth timer and the runqueue
   accounting, which leads to bogus throttling of runqueues.

 - Place the call to cpufreq_update_util() correctly so the function
   will observe the correct number of running RT tasks and not a stale
   one.

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  kthread, sched/core: Fix kthread_parkme() (again...)
  sched/util_est: Fix util_est_dequeue() for throttled cfs_rq
  sched/fair: Advance global expiration when period timer is restarted
  sched/fair: Fix bandwidth timer clock drift condition
  sched/rt: Fix call to cpufreq_update_util()
  sched/nohz: Skip remote tick on idle task entirely
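The remote-tick bullet above spells out a common pattern: a cheap lockless test is only an early-out, and the condition is checked again once the run queue lock is held before acting on it. A minimal userspace sketch of that pattern (pthreads; all names and the printf stand-in are illustrative, this is not the kernel code):

/*
 * Sketch of "check again under the lock": the lockless test may race with a
 * concurrent state change, so it is only an early-out; the authoritative
 * test is repeated with the lock held.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;
static bool cpu_idle;   /* stands in for "the remote CPU runs only the idle task" */

static void remote_tick(void)
{
        if (cpu_idle)                   /* lockless early-out; may be stale */
                return;

        pthread_mutex_lock(&rq_lock);
        if (cpu_idle) {                 /* re-check: the state can no longer change */
                pthread_mutex_unlock(&rq_lock);
                return;
        }
        printf("accounting a tick for a busy CPU\n");
        pthread_mutex_unlock(&rq_lock);
}

int main(void)
{
        remote_tick();
        return 0;
}

Skipping the warning/accounting when the re-check fails is exactly what turns the reported false positive into a no-op.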
commit 6fb2489d7f

8 changed files with 99 additions and 75 deletions
@@ -62,7 +62,6 @@ void *kthread_probe_data(struct task_struct *k);
 int kthread_park(struct task_struct *k);
 void kthread_unpark(struct task_struct *k);
 void kthread_parkme(void);
-void kthread_park_complete(struct task_struct *k);
 
 int kthreadd(void *unused);
 extern struct task_struct *kthreadd_task;
@@ -118,7 +118,7 @@ struct task_group;
  * the comment with set_special_state().
  */
 #define is_special_task_state(state)                                  \
-        ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_DEAD))
+        ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD))
 
 #define __set_current_state(state_value)                              \
         do {                                                          \
@@ -177,9 +177,20 @@ void *kthread_probe_data(struct task_struct *task)
 static void __kthread_parkme(struct kthread *self)
 {
         for (;;) {
-                set_current_state(TASK_PARKED);
+                /*
+                 * TASK_PARKED is a special state; we must serialize against
+                 * possible pending wakeups to avoid store-store collisions on
+                 * task->state.
+                 *
+                 * Such a collision might possibly result in the task state
+                 * changing from TASK_PARKED and us failing the
+                 * wait_task_inactive() in kthread_park().
+                 */
+                set_special_state(TASK_PARKED);
                 if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
                         break;
+
+                complete_all(&self->parked);
                 schedule();
         }
         __set_current_state(TASK_RUNNING);
@@ -191,11 +202,6 @@ void kthread_parkme(void)
 }
 EXPORT_SYMBOL_GPL(kthread_parkme);
 
-void kthread_park_complete(struct task_struct *k)
-{
-        complete_all(&to_kthread(k)->parked);
-}
-
 static int kthread(void *_create)
 {
         /* Copy data: it's on kthread's stack */
@@ -461,6 +467,9 @@ void kthread_unpark(struct task_struct *k)
 
         reinit_completion(&kthread->parked);
         clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+        /*
+         * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup.
+         */
         wake_up_state(k, TASK_PARKED);
 }
 EXPORT_SYMBOL_GPL(kthread_unpark);
@@ -487,7 +496,16 @@ int kthread_park(struct task_struct *k)
         set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
         if (k != current) {
                 wake_up_process(k);
+                /*
+                 * Wait for __kthread_parkme() to complete(), this means we
+                 * _will_ have TASK_PARKED and are about to call schedule().
+                 */
                 wait_for_completion(&kthread->parked);
+                /*
+                 * Now wait for that schedule() to complete and the task to
+                 * get scheduled out.
+                 */
+                WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED));
         }
 
         return 0;
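A rough userspace analogue of the handshake established by the kthread hunks above, using a pthread mutex/condvar pair in place of the kernel's completion and set_special_state(); every name here is made up for illustration. The ordering is the point: the parkee announces "parked" immediately before it blocks, and the parker waits for that announcement before proceeding, while holding the mutex across the state change plays the serializing role set_special_state() plays against concurrent wakeups. There is no userspace stand-in for wait_task_inactive(); in the kernel that extra step is what guarantees the parked task has actually scheduled out.

#include <pthread.h>
#include <stdbool.h>

/* Hypothetical stand-in for struct kthread and its "parked" completion. */
struct parker {
        pthread_mutex_t lock;
        pthread_cond_t  cond;
        bool should_park;
        bool parked;
};

#define PARKER_INIT { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, false, false }

/* Runs in the thread being parked, loosely mirroring __kthread_parkme(). */
void parkme(struct parker *p)
{
        pthread_mutex_lock(&p->lock);
        while (p->should_park) {
                p->parked = true;                       /* ~ complete_all(&self->parked)... */
                pthread_cond_broadcast(&p->cond);
                pthread_cond_wait(&p->cond, &p->lock);  /* ...announced before blocking (~ schedule()) */
        }
        p->parked = false;
        pthread_mutex_unlock(&p->lock);
}

/* Runs in the parking thread, loosely mirroring kthread_park(). */
void park(struct parker *p)
{
        pthread_mutex_lock(&p->lock);
        p->should_park = true;
        pthread_cond_broadcast(&p->cond);               /* ~ wake_up_process(k) */
        while (!p->parked)                              /* ~ wait_for_completion(&kthread->parked) */
                pthread_cond_wait(&p->cond, &p->lock);
        pthread_mutex_unlock(&p->lock);
}

/* Clears the request and wakes the blocked thread (~ kthread_unpark()). */
void unpark(struct parker *p)
{
        pthread_mutex_lock(&p->lock);
        p->should_park = false;
        pthread_cond_broadcast(&p->cond);
        pthread_mutex_unlock(&p->lock);
}

Usage assumes a shared `struct parker p = PARKER_INIT;` and a worker loop that calls parkme() when it notices the park request, just as a kthread's main loop calls kthread_parkme().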
@@ -7,7 +7,6 @@
  */
 #include "sched.h"
 
-#include <linux/kthread.h>
 #include <linux/nospec.h>
 
 #include <linux/kcov.h>
@@ -2724,28 +2723,20 @@ static struct rq *finish_task_switch(struct task_struct *prev)
                 membarrier_mm_sync_core_before_usermode(mm);
                 mmdrop(mm);
         }
-        if (unlikely(prev_state & (TASK_DEAD|TASK_PARKED))) {
-                switch (prev_state) {
-                case TASK_DEAD:
-                        if (prev->sched_class->task_dead)
-                                prev->sched_class->task_dead(prev);
+        if (unlikely(prev_state == TASK_DEAD)) {
+                if (prev->sched_class->task_dead)
+                        prev->sched_class->task_dead(prev);
 
-                        /*
-                         * Remove function-return probe instances associated with this
-                         * task and put them back on the free list.
-                         */
-                        kprobe_flush_task(prev);
+                /*
+                 * Remove function-return probe instances associated with this
+                 * task and put them back on the free list.
+                 */
+                kprobe_flush_task(prev);
 
-                        /* Task is done with its stack. */
-                        put_task_stack(prev);
+                /* Task is done with its stack. */
+                put_task_stack(prev);
 
-                        put_task_struct(prev);
-                        break;
-
-                case TASK_PARKED:
-                        kthread_park_complete(prev);
-                        break;
-                }
+                put_task_struct(prev);
         }
 
         tick_nohz_task_switch();
@@ -3113,7 +3104,9 @@ static void sched_tick_remote(struct work_struct *work)
         struct tick_work *twork = container_of(dwork, struct tick_work, work);
         int cpu = twork->cpu;
         struct rq *rq = cpu_rq(cpu);
+        struct task_struct *curr;
         struct rq_flags rf;
+        u64 delta;
 
         /*
          * Handle the tick only if it appears the remote CPU is running in full
@@ -3122,24 +3115,28 @@ static void sched_tick_remote(struct work_struct *work)
          * statistics and checks timeslices in a time-independent way, regardless
          * of when exactly it is running.
          */
-        if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
-                struct task_struct *curr;
-                u64 delta;
+        if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
+                goto out_requeue;
 
-                rq_lock_irq(rq, &rf);
-                update_rq_clock(rq);
-                curr = rq->curr;
-                delta = rq_clock_task(rq) - curr->se.exec_start;
+        rq_lock_irq(rq, &rf);
+        curr = rq->curr;
+        if (is_idle_task(curr))
+                goto out_unlock;
 
-                /*
-                 * Make sure the next tick runs within a reasonable
-                 * amount of time.
-                 */
-                WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
-                curr->sched_class->task_tick(rq, curr, 0);
-                rq_unlock_irq(rq, &rf);
-        }
-
+        update_rq_clock(rq);
+        delta = rq_clock_task(rq) - curr->se.exec_start;
+
+        /*
+         * Make sure the next tick runs within a reasonable
+         * amount of time.
+         */
+        WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+        curr->sched_class->task_tick(rq, curr, 0);
+
+out_unlock:
+        rq_unlock_irq(rq, &rf);
+
+out_requeue:
         /*
          * Run the remote tick once per second (1Hz). This arbitrary
          * frequency is large enough to avoid overload but short enough
@@ -192,7 +192,7 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
 {
         struct rq *rq = cpu_rq(sg_cpu->cpu);
 
-        if (rq->rt.rt_nr_running)
+        if (rt_rq_is_runnable(&rq->rt))
                 return sg_cpu->max;
 
         /*
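For reference, rt_rq_is_runnable() is the helper added further down in this series (see the last hunk of the diff): it requires both rt_queued and rt_nr_running, so schedutil stops requesting the maximum frequency for an RT runqueue whose tasks are accounted but not actually enqueued, e.g. while throttled. A trimmed-down sketch with only the two fields that matter (not the real struct rt_rq):

#include <stdbool.h>

/* Illustrative subset of struct rt_rq; the real structure is far larger. */
struct rt_rq_view {
        int          rt_queued;      /* this rt_rq is itself enqueued on the rq */
        unsigned int rt_nr_running;  /* RT tasks accounted to this rt_rq */
};

/* Same condition as the new rt_rq_is_runnable(): both must hold before the
 * cpufreq governor is pushed to the maximum frequency. */
static inline bool rt_rq_is_runnable_sketch(const struct rt_rq_view *rt_rq)
{
        return rt_rq->rt_queued && rt_rq->rt_nr_running;
}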
@@ -3982,18 +3982,10 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
         if (!sched_feat(UTIL_EST))
                 return;
 
-        /*
-         * Update root cfs_rq's estimated utilization
-         *
-         * If *p is the last task then the root cfs_rq's estimated utilization
-         * of a CPU is 0 by definition.
-         */
-        ue.enqueued = 0;
-        if (cfs_rq->nr_running) {
-                ue.enqueued = cfs_rq->avg.util_est.enqueued;
-                ue.enqueued -= min_t(unsigned int, ue.enqueued,
-                                     (_task_util_est(p) | UTIL_AVG_UNCHANGED));
-        }
+        /* Update root cfs_rq's estimated utilization */
+        ue.enqueued  = cfs_rq->avg.util_est.enqueued;
+        ue.enqueued -= min_t(unsigned int, ue.enqueued,
+                             (_task_util_est(p) | UTIL_AVG_UNCHANGED));
         WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
 
         /*
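The effect of the hunk above: the root cfs_rq's estimate is no longer forced to 0 whenever nr_running is 0 (which is also the case while the cfs_rq is throttled); instead the dequeued task's estimated utilization is always subtracted, clamped at zero. A standalone sketch of that clamped subtraction, with illustrative values only:

#include <stdio.h>

/* Remove 'dec' from 'enqueued' without underflowing, the way the rewritten
 * util_est_dequeue() now does unconditionally: at most 'enqueued' is removed,
 * so the result saturates at 0 instead of being reset to 0 whenever the
 * runqueue merely looks empty. */
static unsigned int util_est_sub(unsigned int enqueued, unsigned int dec)
{
        unsigned int delta = dec < enqueued ? dec : enqueued;

        return enqueued - delta;
}

int main(void)
{
        printf("%u\n", util_est_sub(300, 120));  /* 180 */
        printf("%u\n", util_est_sub(100, 120));  /* clamped to 0, no wraparound */
        return 0;
}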
@@ -4590,6 +4582,7 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
         now = sched_clock_cpu(smp_processor_id());
         cfs_b->runtime = cfs_b->quota;
         cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
+        cfs_b->expires_seq++;
 }
 
 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4612,6 +4605,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
         struct task_group *tg = cfs_rq->tg;
         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
         u64 amount = 0, min_amount, expires;
+        int expires_seq;
 
         /* note: this is a positive sum as runtime_remaining <= 0 */
         min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -4628,6 +4622,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
                         cfs_b->idle = 0;
                 }
         }
+        expires_seq = cfs_b->expires_seq;
         expires = cfs_b->runtime_expires;
         raw_spin_unlock(&cfs_b->lock);
 
@@ -4637,8 +4632,10 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
          * spread between our sched_clock and the one on which runtime was
          * issued.
          */
-        if ((s64)(expires - cfs_rq->runtime_expires) > 0)
+        if (cfs_rq->expires_seq != expires_seq) {
+                cfs_rq->expires_seq = expires_seq;
                 cfs_rq->runtime_expires = expires;
+        }
 
         return cfs_rq->runtime_remaining > 0;
 }
@@ -4664,12 +4661,9 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
          * has not truly expired.
          *
          * Fortunately we can check determine whether this the case by checking
-         * whether the global deadline has advanced. It is valid to compare
-         * cfs_b->runtime_expires without any locks since we only care about
-         * exact equality, so a partial write will still work.
+         * whether the global deadline(cfs_b->expires_seq) has advanced.
          */
-
-        if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
+        if (cfs_rq->expires_seq == cfs_b->expires_seq) {
                 /* extend local deadline, drift is bounded above by 2 ticks */
                 cfs_rq->runtime_expires += TICK_NSEC;
         } else {
@@ -5202,13 +5196,18 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 
 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
+        u64 overrun;
+
         lockdep_assert_held(&cfs_b->lock);
 
-        if (!cfs_b->period_active) {
-                cfs_b->period_active = 1;
-                hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
-                hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
-        }
+        if (cfs_b->period_active)
+                return;
+
+        cfs_b->period_active = 1;
+        overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
+        cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
+        cfs_b->expires_seq++;
+        hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
 }
 
 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
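The bandwidth hunks above replace a raw timestamp comparison with a sequence count: __refill_cfs_bandwidth_runtime() and start_cfs_bandwidth() bump cfs_b->expires_seq whenever a new period (and hence a new global expiry) begins, assign_cfs_rq_runtime() snapshots it, and expire_cfs_rq_runtime() treats local runtime as truly expired only when the snapshot no longer matches; otherwise the local deadline passing is attributed to sched_clock drift and merely extended. A minimal standalone sketch of that idea, with made-up names and unitless times (not the kernel code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Global pool: the sequence number advances whenever a new period starts. */
struct pool {
        int      expires_seq;
        uint64_t runtime_expires;
};

/* Per-runqueue snapshot taken when runtime is handed out. */
struct local {
        int      expires_seq;
        uint64_t runtime_expires;
        int64_t  runtime_remaining;
};

/* New period: refresh the global expiry and advance the sequence. */
static void pool_new_period(struct pool *p, uint64_t now, uint64_t period)
{
        p->runtime_expires = now + period;
        p->expires_seq++;
}

/* Hand out runtime: refresh the snapshot only when the pool moved on. */
static void local_assign(struct local *l, const struct pool *p, int64_t amount)
{
        l->runtime_remaining += amount;
        if (l->expires_seq != p->expires_seq) {
                l->expires_seq = p->expires_seq;
                l->runtime_expires = p->runtime_expires;
        }
}

/* Local deadline passed: expired only if a new period began meanwhile;
 * otherwise treat it as clock drift and push the local deadline out. */
static void local_expire(struct local *l, const struct pool *p, uint64_t now)
{
        if (now < l->runtime_expires || l->runtime_remaining <= 0)
                return;
        if (l->expires_seq == p->expires_seq)
                l->runtime_expires += 1;   /* ~ TICK_NSEC: drift is bounded */
        else
                l->runtime_remaining = 0;  /* global moved on: truly expired */
}

int main(void)
{
        struct pool  p = { .expires_seq = 0, .runtime_expires = 0 };
        struct local l = { 0 };

        pool_new_period(&p, 100, 50);      /* global deadline at t = 150 */
        local_assign(&l, &p, 10);
        local_expire(&l, &p, 151);         /* same seq: drift, deadline extended */
        pool_new_period(&p, 150, 50);      /* new period, sequence advances */
        local_expire(&l, &p, 152);         /* seq differs: runtime expires */
        printf("%lld\n", (long long)l.runtime_remaining);   /* prints 0 */
        return 0;
}

Comparing a small integer instead of two timestamps taken from different CPUs' sched_clocks is what removes the drift sensitivity the pull message calls "bogus throttling".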
@@ -508,8 +508,11 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 
         rt_se = rt_rq->tg->rt_se[cpu];
 
-        if (!rt_se)
+        if (!rt_se) {
                 dequeue_top_rt_rq(rt_rq);
+                /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
+                cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
+        }
         else if (on_rt_rq(rt_se))
                 dequeue_rt_entity(rt_se, 0);
 }
@@ -1001,8 +1004,6 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq)
         sub_nr_running(rq, rt_rq->rt_nr_running);
         rt_rq->rt_queued = 0;
 
-        /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
-        cpufreq_update_util(rq, 0);
 }
 
 static void
@@ -1014,11 +1015,14 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq)
 
         if (rt_rq->rt_queued)
                 return;
-        if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
+
+        if (rt_rq_throttled(rt_rq))
                 return;
 
-        add_nr_running(rq, rt_rq->rt_nr_running);
-        rt_rq->rt_queued = 1;
+        if (rt_rq->rt_nr_running) {
+                add_nr_running(rq, rt_rq->rt_nr_running);
+                rt_rq->rt_queued = 1;
+        }
 
         /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
         cpufreq_update_util(rq, 0);
@@ -334,9 +334,10 @@ struct cfs_bandwidth {
         u64                     runtime;
         s64                     hierarchical_quota;
         u64                     runtime_expires;
+        int                     expires_seq;
 
-        int                     idle;
-        int                     period_active;
+        short                   idle;
+        short                   period_active;
         struct hrtimer          period_timer;
         struct hrtimer          slack_timer;
         struct list_head        throttled_cfs_rq;
@@ -551,6 +552,7 @@ struct cfs_rq {
 
 #ifdef CONFIG_CFS_BANDWIDTH
         int                     runtime_enabled;
+        int                     expires_seq;
         u64                     runtime_expires;
         s64                     runtime_remaining;
 
@@ -609,6 +611,11 @@ struct rt_rq {
 #endif
 };
 
+static inline bool rt_rq_is_runnable(struct rt_rq *rt_rq)
+{
+        return rt_rq->rt_queued && rt_rq->rt_nr_running;
+}
+
 /* Deadline class' related fields in a runqueue */
 struct dl_rq {
         /* runqueue is an rbtree, ordered by deadline */