2a070382c9
psi has provisions to shut off the periodic aggregation worker when there is a period of no task activity - and thus no data that needs aggregating. However, while developing psi monitoring, Suren noticed that the aggregation clock currently won't stay shut off for good. Debugging this revealed a flaw in the idle design: an aggregation run will see no task activity and decide to go to sleep; shortly thereafter, the kworker thread that executed the aggregation will go idle and cause a scheduling change, during which the psi callback will kick the !pending worker again. This will ping-pong forever, and is equivalent to having no shut-off logic at all (but with more code!) Fix this by exempting aggregation workers from psi's clock waking logic when the state change is them going to sleep. To do this, tag workers with the last work function they executed, and if in psi we see a worker going to sleep after aggregating psi data, we will not reschedule the aggregation work item. What if the worker is also executing other items before or after? Any psi state times that were incurred by work items preceding the aggregation work will have been collected from the per-cpu buckets during the aggregation itself. If there are work items following the aggregation work, the worker's last_func tag will be overwritten and the aggregator will be kept alive to process this genuine new activity. If the aggregation work is the last thing the worker does, and we decide to go idle, the brief period of non-idle time incurred between the aggregation run and the kworker's dequeue will be stranded in the per-cpu buckets until the clock is woken by later activity. But that should not be a problem. The buckets can hold 4s worth of time, and future activity will wake the clock with a 2s delay, giving us 2s worth of data we can leave behind when disabling aggregation. If it takes a worker more than two seconds to go idle after it finishes its last work item, we likely have bigger problems in the system, and won't notice one sample that was averaged with a bogus per-CPU weight. Link: http://lkml.kernel.org/r/20190116193501.1910-1-hannes@cmpxchg.org Fixes: eb414681d5a0 ("psi: pressure stall information for CPU, memory, and IO") Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Reported-by: Suren Baghdasaryan <surenb@google.com> Acked-by: Tejun Heo <tj@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Lai Jiangshan <jiangshanlai@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> (cherry picked from commit 1b69ac6b40ebd85eed73e4dbccde2a36961ab990) Bug: 127712811 Test: lmkd in PSI mode Change-Id: I2877fec3d381b1006b8bd1261895fdfd68bd21db Signed-off-by: Suren Baghdasaryan <surenb@google.com>
79 lines
2.4 KiB
C
79 lines
2.4 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
/*
|
|
* kernel/workqueue_internal.h
|
|
*
|
|
* Workqueue internal header file. Only to be included by workqueue and
|
|
* core kernel subsystems.
|
|
*/
|
|
#ifndef _KERNEL_WORKQUEUE_INTERNAL_H
|
|
#define _KERNEL_WORKQUEUE_INTERNAL_H
|
|
|
|
#include <linux/workqueue.h>
|
|
#include <linux/kthread.h>
|
|
#include <linux/preempt.h>
|
|
|
|
struct worker_pool;
|
|
|
|
/*
|
|
* The poor guys doing the actual heavy lifting. All on-duty workers are
|
|
* either serving the manager role, on idle list or on busy hash. For
|
|
* details on the locking annotation (L, I, X...), refer to workqueue.c.
|
|
*
|
|
* Only to be used in workqueue and async.
|
|
*/
|
|
struct worker {
|
|
/* on idle list while idle, on busy hash table while busy */
|
|
union {
|
|
struct list_head entry; /* L: while idle */
|
|
struct hlist_node hentry; /* L: while busy */
|
|
};
|
|
|
|
struct work_struct *current_work; /* L: work being processed */
|
|
work_func_t current_func; /* L: current_work's fn */
|
|
struct pool_workqueue *current_pwq; /* L: current_work's pwq */
|
|
struct list_head scheduled; /* L: scheduled works */
|
|
|
|
/* 64 bytes boundary on 64bit, 32 on 32bit */
|
|
|
|
struct task_struct *task; /* I: worker task */
|
|
struct worker_pool *pool; /* A: the associated pool */
|
|
/* L: for rescuers */
|
|
struct list_head node; /* A: anchored at pool->workers */
|
|
/* A: runs through worker->node */
|
|
|
|
unsigned long last_active; /* L: last active timestamp */
|
|
unsigned int flags; /* X: flags */
|
|
int id; /* I: worker id */
|
|
|
|
/*
|
|
* Opaque string set with work_set_desc(). Printed out with task
|
|
* dump for debugging - WARN, BUG, panic or sysrq.
|
|
*/
|
|
char desc[WORKER_DESC_LEN];
|
|
|
|
/* used only by rescuers to point to the target workqueue */
|
|
struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */
|
|
|
|
/* used by the scheduler to determine a worker's last known identity */
|
|
work_func_t last_func;
|
|
};
|
|
|
|
/**
|
|
* current_wq_worker - return struct worker if %current is a workqueue worker
|
|
*/
|
|
static inline struct worker *current_wq_worker(void)
|
|
{
|
|
if (in_task() && (current->flags & PF_WQ_WORKER))
|
|
return kthread_data(current);
|
|
return NULL;
|
|
}
|
|
|
|
/*
|
|
* Scheduler hooks for concurrency managed workqueue. Only to be used from
|
|
* sched/ and workqueue.c.
|
|
*/
|
|
void wq_worker_waking_up(struct task_struct *task, int cpu);
|
|
struct task_struct *wq_worker_sleeping(struct task_struct *task);
|
|
work_func_t wq_worker_last_func(struct task_struct *task);
|
|
|
|
#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
|