Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Thomas Gleixner:

 - The hopefully final fix for the reported race problems in
   kthread_parkme(). The previous attempt still left a hole and was
   partially wrong.

 - Plug a race in the remote tick mechanism which triggers a warning
   about updates not being done correctly. That's a false positive if
   the race condition is hit as the remote CPU is idle. Plug it by
   checking the condition again when holding run queue lock.

 - Fix a bug in the utilization estimation of a run queue which causes
   the estimation to be 0 when a run queue is throttled.

 - Advance the global expiration of the period timer when the timer is
   restarted after a idle period. Otherwise the expiry time is stale and
   the timer fires prematurely.

 - Cure the drift between the bandwidth timer and the runqueue
   accounting, which leads to bogus throttling of runqueues

 - Place the call to cpufreq_update_util() correctly so the function
   will observe the correct number of running RT tasks and not a stale
   one.

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  kthread, sched/core: Fix kthread_parkme() (again...)
  sched/util_est: Fix util_est_dequeue() for throttled cfs_rq
  sched/fair: Advance global expiration when period timer is restarted
  sched/fair: Fix bandwidth timer clock drift condition
  sched/rt: Fix call to cpufreq_update_util()
  sched/nohz: Skip remote tick on idle task entirely
This commit is contained in:
Linus Torvalds 2018-07-08 12:41:23 -07:00
commit 6fb2489d7f
8 changed files with 99 additions and 75 deletions

View file

@ -62,7 +62,6 @@ void *kthread_probe_data(struct task_struct *k);
int kthread_park(struct task_struct *k);
void kthread_unpark(struct task_struct *k);
void kthread_parkme(void);
void kthread_park_complete(struct task_struct *k);
int kthreadd(void *unused);
extern struct task_struct *kthreadd_task;

View file

@ -118,7 +118,7 @@ struct task_group;
* the comment with set_special_state().
*/
#define is_special_task_state(state) \
((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_DEAD))
((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD))
#define __set_current_state(state_value) \
do { \

View file

@ -177,9 +177,20 @@ void *kthread_probe_data(struct task_struct *task)
static void __kthread_parkme(struct kthread *self)
{
for (;;) {
set_current_state(TASK_PARKED);
/*
* TASK_PARKED is a special state; we must serialize against
* possible pending wakeups to avoid store-store collisions on
* task->state.
*
* Such a collision might possibly result in the task state
* changin from TASK_PARKED and us failing the
* wait_task_inactive() in kthread_park().
*/
set_special_state(TASK_PARKED);
if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
break;
complete_all(&self->parked);
schedule();
}
__set_current_state(TASK_RUNNING);
@ -191,11 +202,6 @@ void kthread_parkme(void)
}
EXPORT_SYMBOL_GPL(kthread_parkme);
void kthread_park_complete(struct task_struct *k)
{
complete_all(&to_kthread(k)->parked);
}
static int kthread(void *_create)
{
/* Copy data: it's on kthread's stack */
@ -461,6 +467,9 @@ void kthread_unpark(struct task_struct *k)
reinit_completion(&kthread->parked);
clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
/*
* __kthread_parkme() will either see !SHOULD_PARK or get the wakeup.
*/
wake_up_state(k, TASK_PARKED);
}
EXPORT_SYMBOL_GPL(kthread_unpark);
@ -487,7 +496,16 @@ int kthread_park(struct task_struct *k)
set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
if (k != current) {
wake_up_process(k);
/*
* Wait for __kthread_parkme() to complete(), this means we
* _will_ have TASK_PARKED and are about to call schedule().
*/
wait_for_completion(&kthread->parked);
/*
* Now wait for that schedule() to complete and the task to
* get scheduled out.
*/
WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED));
}
return 0;

View file

@ -7,7 +7,6 @@
*/
#include "sched.h"
#include <linux/kthread.h>
#include <linux/nospec.h>
#include <linux/kcov.h>
@ -2724,28 +2723,20 @@ static struct rq *finish_task_switch(struct task_struct *prev)
membarrier_mm_sync_core_before_usermode(mm);
mmdrop(mm);
}
if (unlikely(prev_state & (TASK_DEAD|TASK_PARKED))) {
switch (prev_state) {
case TASK_DEAD:
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
if (unlikely(prev_state == TASK_DEAD)) {
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
/*
* Remove function-return probe instances associated with this
* task and put them back on the free list.
*/
kprobe_flush_task(prev);
/*
* Remove function-return probe instances associated with this
* task and put them back on the free list.
*/
kprobe_flush_task(prev);
/* Task is done with its stack. */
put_task_stack(prev);
/* Task is done with its stack. */
put_task_stack(prev);
put_task_struct(prev);
break;
case TASK_PARKED:
kthread_park_complete(prev);
break;
}
put_task_struct(prev);
}
tick_nohz_task_switch();
@ -3113,7 +3104,9 @@ static void sched_tick_remote(struct work_struct *work)
struct tick_work *twork = container_of(dwork, struct tick_work, work);
int cpu = twork->cpu;
struct rq *rq = cpu_rq(cpu);
struct task_struct *curr;
struct rq_flags rf;
u64 delta;
/*
* Handle the tick only if it appears the remote CPU is running in full
@ -3122,24 +3115,28 @@ static void sched_tick_remote(struct work_struct *work)
* statistics and checks timeslices in a time-independent way, regardless
* of when exactly it is running.
*/
if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
struct task_struct *curr;
u64 delta;
if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
goto out_requeue;
rq_lock_irq(rq, &rf);
update_rq_clock(rq);
curr = rq->curr;
delta = rq_clock_task(rq) - curr->se.exec_start;
rq_lock_irq(rq, &rf);
curr = rq->curr;
if (is_idle_task(curr))
goto out_unlock;
/*
* Make sure the next tick runs within a reasonable
* amount of time.
*/
WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
curr->sched_class->task_tick(rq, curr, 0);
rq_unlock_irq(rq, &rf);
}
update_rq_clock(rq);
delta = rq_clock_task(rq) - curr->se.exec_start;
/*
* Make sure the next tick runs within a reasonable
* amount of time.
*/
WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
curr->sched_class->task_tick(rq, curr, 0);
out_unlock:
rq_unlock_irq(rq, &rf);
out_requeue:
/*
* Run the remote tick once per second (1Hz). This arbitrary
* frequency is large enough to avoid overload but short enough

View file

@ -192,7 +192,7 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
if (rq->rt.rt_nr_running)
if (rt_rq_is_runnable(&rq->rt))
return sg_cpu->max;
/*

View file

@ -3982,18 +3982,10 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
if (!sched_feat(UTIL_EST))
return;
/*
* Update root cfs_rq's estimated utilization
*
* If *p is the last task then the root cfs_rq's estimated utilization
* of a CPU is 0 by definition.
*/
ue.enqueued = 0;
if (cfs_rq->nr_running) {
ue.enqueued = cfs_rq->avg.util_est.enqueued;
ue.enqueued -= min_t(unsigned int, ue.enqueued,
(_task_util_est(p) | UTIL_AVG_UNCHANGED));
}
/* Update root cfs_rq's estimated utilization */
ue.enqueued = cfs_rq->avg.util_est.enqueued;
ue.enqueued -= min_t(unsigned int, ue.enqueued,
(_task_util_est(p) | UTIL_AVG_UNCHANGED));
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
/*
@ -4590,6 +4582,7 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
now = sched_clock_cpu(smp_processor_id());
cfs_b->runtime = cfs_b->quota;
cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
cfs_b->expires_seq++;
}
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@ -4612,6 +4605,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
struct task_group *tg = cfs_rq->tg;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
u64 amount = 0, min_amount, expires;
int expires_seq;
/* note: this is a positive sum as runtime_remaining <= 0 */
min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@ -4628,6 +4622,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_b->idle = 0;
}
}
expires_seq = cfs_b->expires_seq;
expires = cfs_b->runtime_expires;
raw_spin_unlock(&cfs_b->lock);
@ -4637,8 +4632,10 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
* spread between our sched_clock and the one on which runtime was
* issued.
*/
if ((s64)(expires - cfs_rq->runtime_expires) > 0)
if (cfs_rq->expires_seq != expires_seq) {
cfs_rq->expires_seq = expires_seq;
cfs_rq->runtime_expires = expires;
}
return cfs_rq->runtime_remaining > 0;
}
@ -4664,12 +4661,9 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
* has not truly expired.
*
* Fortunately we can check determine whether this the case by checking
* whether the global deadline has advanced. It is valid to compare
* cfs_b->runtime_expires without any locks since we only care about
* exact equality, so a partial write will still work.
* whether the global deadline(cfs_b->expires_seq) has advanced.
*/
if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
if (cfs_rq->expires_seq == cfs_b->expires_seq) {
/* extend local deadline, drift is bounded above by 2 ticks */
cfs_rq->runtime_expires += TICK_NSEC;
} else {
@ -5202,13 +5196,18 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
u64 overrun;
lockdep_assert_held(&cfs_b->lock);
if (!cfs_b->period_active) {
cfs_b->period_active = 1;
hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
}
if (cfs_b->period_active)
return;
cfs_b->period_active = 1;
overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
cfs_b->expires_seq++;
hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
}
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)

View file

@ -508,8 +508,11 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
rt_se = rt_rq->tg->rt_se[cpu];
if (!rt_se)
if (!rt_se) {
dequeue_top_rt_rq(rt_rq);
/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
}
else if (on_rt_rq(rt_se))
dequeue_rt_entity(rt_se, 0);
}
@ -1001,8 +1004,6 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq)
sub_nr_running(rq, rt_rq->rt_nr_running);
rt_rq->rt_queued = 0;
/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq, 0);
}
static void
@ -1014,11 +1015,14 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq)
if (rt_rq->rt_queued)
return;
if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
if (rt_rq_throttled(rt_rq))
return;
add_nr_running(rq, rt_rq->rt_nr_running);
rt_rq->rt_queued = 1;
if (rt_rq->rt_nr_running) {
add_nr_running(rq, rt_rq->rt_nr_running);
rt_rq->rt_queued = 1;
}
/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq, 0);

View file

@ -334,9 +334,10 @@ struct cfs_bandwidth {
u64 runtime;
s64 hierarchical_quota;
u64 runtime_expires;
int expires_seq;
int idle;
int period_active;
short idle;
short period_active;
struct hrtimer period_timer;
struct hrtimer slack_timer;
struct list_head throttled_cfs_rq;
@ -551,6 +552,7 @@ struct cfs_rq {
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
int expires_seq;
u64 runtime_expires;
s64 runtime_remaining;
@ -609,6 +611,11 @@ struct rt_rq {
#endif
};
static inline bool rt_rq_is_runnable(struct rt_rq *rt_rq)
{
return rt_rq->rt_queued && rt_rq->rt_nr_running;
}
/* Deadline class' related fields in a runqueue */
struct dl_rq {
/* runqueue is an rbtree, ordered by deadline */