mm/oom-kill: Run oom-killer if ULMK is stuck
ULMK has been observed to get stuck waiting for mutexes held by processes in direct reclaim for many seconds. rwsem_down_read_failed_killable+0x20 down_read_killable+0xa8 __access_remote_vm+0x54 access_remote_vm+0x48 proc_pid_cmdline_read+0xe0 __vfs_read+0x54 vfs_read+0xa4 __arm64_sys_pread64+0x80 el0_svc_common+0xac el0_svc_handler+0x7c el0_svc+0x8 To resolve these deadlocks, allow the oom-killer to run when a low memory situation is detected. One side effect of running the oom-killer is that it may make it more difficult to detect issues with ULMK. Therefore on debug builds, attempt to detect whether ULMK is stuck using a watchdog timer mechanism, as opposed to running, but deciding not to issue a kill. Change-Id: If1d629e1553c3562b3d23442abffc2faedb31ba2 Signed-off-by: Patrick Daly <pdaly@codeaurora.org>
This commit is contained in:
parent
5ec07046cd
commit
2aacc29713
6 changed files with 127 additions and 36 deletions
|
@ -123,14 +123,18 @@ extern void dump_tasks(struct mem_cgroup *memcg,
|
||||||
const nodemask_t *nodemask);
|
const nodemask_t *nodemask);
|
||||||
|
|
||||||
#ifdef CONFIG_HAVE_USERSPACE_LOW_MEMORY_KILLER
|
#ifdef CONFIG_HAVE_USERSPACE_LOW_MEMORY_KILLER
|
||||||
extern bool should_ulmk_retry(void);
|
extern bool should_ulmk_retry(gfp_t gfp);
|
||||||
extern void ulmk_update_last_kill(void);
|
extern void ulmk_update_last_kill(void);
|
||||||
|
extern void ulmk_watchdog_fn(struct timer_list *t);
|
||||||
|
extern void ulmk_watchdog_pet(struct timer_list *t);
|
||||||
#else
|
#else
|
||||||
static inline bool should_ulmk_retry(void)
|
static inline bool should_ulmk_retry(gfp_t gfp)
|
||||||
{
|
{
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
static inline void ulmk_update_last_kill(void) {}
|
static inline void ulmk_update_last_kill(void) {}
|
||||||
|
static inline void ulmk_watchdog_fn(struct timer_list *t) {}
|
||||||
|
static inline void ulmk_watchdog_pet(struct timer_list *t) {}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* sysctls */
|
/* sysctls */
|
||||||
|
|
|
@ -24,6 +24,7 @@ void psi_memstall_leave(unsigned long *flags);
|
||||||
int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
|
int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
|
||||||
|
|
||||||
void psi_emergency_trigger(void);
|
void psi_emergency_trigger(void);
|
||||||
|
bool psi_is_trigger_active(void);
|
||||||
|
|
||||||
#ifdef CONFIG_CGROUPS
|
#ifdef CONFIG_CGROUPS
|
||||||
int psi_cgroup_alloc(struct cgroup *cgrp);
|
int psi_cgroup_alloc(struct cgroup *cgrp);
|
||||||
|
@ -46,6 +47,10 @@ static inline void psi_memstall_enter(unsigned long *flags) {}
|
||||||
static inline void psi_memstall_leave(unsigned long *flags) {}
|
static inline void psi_memstall_leave(unsigned long *flags) {}
|
||||||
|
|
||||||
static inline void psi_emergency_trigger(void){}
|
static inline void psi_emergency_trigger(void){}
|
||||||
|
static inline bool psi_is_trigger_active(void)
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef CONFIG_CGROUPS
|
#ifdef CONFIG_CGROUPS
|
||||||
static inline int psi_cgroup_alloc(struct cgroup *cgrp)
|
static inline int psi_cgroup_alloc(struct cgroup *cgrp)
|
||||||
|
|
|
@ -6,6 +6,7 @@
|
||||||
#include <linux/types.h>
|
#include <linux/types.h>
|
||||||
#include <linux/kref.h>
|
#include <linux/kref.h>
|
||||||
#include <linux/wait.h>
|
#include <linux/wait.h>
|
||||||
|
#include <linux/timer.h>
|
||||||
|
|
||||||
#ifdef CONFIG_PSI
|
#ifdef CONFIG_PSI
|
||||||
|
|
||||||
|
@ -126,6 +127,7 @@ struct psi_trigger {
|
||||||
|
|
||||||
/* Task that created the trigger */
|
/* Task that created the trigger */
|
||||||
char comm[TASK_COMM_LEN];
|
char comm[TASK_COMM_LEN];
|
||||||
|
struct timer_list wdog_timer;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct psi_group {
|
struct psi_group {
|
||||||
|
|
|
@ -575,8 +575,12 @@ static u64 update_triggers(struct psi_group *group, u64 now)
|
||||||
trace_psi_event(t->state, t->threshold);
|
trace_psi_event(t->state, t->threshold);
|
||||||
|
|
||||||
/* Generate an event */
|
/* Generate an event */
|
||||||
if (cmpxchg(&t->event, 0, 1) == 0)
|
if (cmpxchg(&t->event, 0, 1) == 0) {
|
||||||
|
if (!strcmp(t->comm, ULMK_MAGIC))
|
||||||
|
mod_timer(&t->wdog_timer, jiffies +
|
||||||
|
nsecs_to_jiffies(2 * t->win.size));
|
||||||
wake_up_interruptible(&t->event_wait);
|
wake_up_interruptible(&t->event_wait);
|
||||||
|
}
|
||||||
t->last_event_time = now;
|
t->last_event_time = now;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -588,10 +592,14 @@ static u64 update_triggers(struct psi_group *group, u64 now)
|
||||||
return now + group->poll_min_period;
|
return now + group->poll_min_period;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Allows sending more than one event per window.
|
||||||
|
*/
|
||||||
void psi_emergency_trigger(void)
|
void psi_emergency_trigger(void)
|
||||||
{
|
{
|
||||||
struct psi_group *group = &psi_system;
|
struct psi_group *group = &psi_system;
|
||||||
struct psi_trigger *t;
|
struct psi_trigger *t;
|
||||||
|
u64 now;
|
||||||
|
|
||||||
if (static_branch_likely(&psi_disabled))
|
if (static_branch_likely(&psi_disabled))
|
||||||
return;
|
return;
|
||||||
|
@ -603,18 +611,54 @@ void psi_emergency_trigger(void)
|
||||||
if (!mutex_trylock(&group->trigger_lock))
|
if (!mutex_trylock(&group->trigger_lock))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
now = sched_clock();
|
||||||
list_for_each_entry(t, &group->triggers, node) {
|
list_for_each_entry(t, &group->triggers, node) {
|
||||||
if (strcmp(t->comm, ULMK_MAGIC))
|
if (strcmp(t->comm, ULMK_MAGIC))
|
||||||
continue;
|
continue;
|
||||||
trace_psi_event(t->state, t->threshold);
|
trace_psi_event(t->state, t->threshold);
|
||||||
|
|
||||||
/* Generate an event */
|
/* Generate an event */
|
||||||
if (cmpxchg(&t->event, 0, 1) == 0)
|
if (cmpxchg(&t->event, 0, 1) == 0) {
|
||||||
|
mod_timer(&t->wdog_timer, (unsigned long)t->win.size);
|
||||||
wake_up_interruptible(&t->event_wait);
|
wake_up_interruptible(&t->event_wait);
|
||||||
|
}
|
||||||
|
t->last_event_time = now;
|
||||||
}
|
}
|
||||||
mutex_unlock(&group->trigger_lock);
|
mutex_unlock(&group->trigger_lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Return true if any trigger is active.
|
||||||
|
*/
|
||||||
|
bool psi_is_trigger_active(void)
|
||||||
|
{
|
||||||
|
struct psi_group *group = &psi_system;
|
||||||
|
struct psi_trigger *t;
|
||||||
|
bool trigger_active = false;
|
||||||
|
u64 now;
|
||||||
|
|
||||||
|
if (static_branch_likely(&psi_disabled))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* In unlikely case that OOM was triggered while adding/
|
||||||
|
* removing triggers.
|
||||||
|
*/
|
||||||
|
if (!mutex_trylock(&group->trigger_lock))
|
||||||
|
return true;
|
||||||
|
|
||||||
|
now = sched_clock();
|
||||||
|
list_for_each_entry(t, &group->triggers, node) {
|
||||||
|
if (strcmp(t->comm, ULMK_MAGIC))
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (now <= t->last_event_time + t->win.size)
|
||||||
|
trigger_active = true;
|
||||||
|
}
|
||||||
|
mutex_unlock(&group->trigger_lock);
|
||||||
|
return trigger_active;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Schedule polling if it's not already scheduled. It's safe to call even from
|
* Schedule polling if it's not already scheduled. It's safe to call even from
|
||||||
* hotpath because even though kthread_queue_delayed_work takes worker->lock
|
* hotpath because even though kthread_queue_delayed_work takes worker->lock
|
||||||
|
@ -1116,6 +1160,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
|
||||||
init_waitqueue_head(&t->event_wait);
|
init_waitqueue_head(&t->event_wait);
|
||||||
kref_init(&t->refcount);
|
kref_init(&t->refcount);
|
||||||
get_task_comm(t->comm, current);
|
get_task_comm(t->comm, current);
|
||||||
|
timer_setup(&t->wdog_timer, ulmk_watchdog_fn, TIMER_DEFERRABLE);
|
||||||
|
|
||||||
mutex_lock(&group->trigger_lock);
|
mutex_lock(&group->trigger_lock);
|
||||||
|
|
||||||
|
@ -1188,6 +1233,7 @@ static void psi_trigger_destroy(struct kref *ref)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
del_timer_sync(&t->wdog_timer);
|
||||||
mutex_unlock(&group->trigger_lock);
|
mutex_unlock(&group->trigger_lock);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1241,8 +1287,11 @@ __poll_t psi_trigger_poll(void **trigger_ptr,
|
||||||
|
|
||||||
poll_wait(file, &t->event_wait, wait);
|
poll_wait(file, &t->event_wait, wait);
|
||||||
|
|
||||||
if (cmpxchg(&t->event, 1, 0) == 1)
|
if (cmpxchg(&t->event, 1, 0) == 1) {
|
||||||
ret |= EPOLLPRI;
|
ret |= EPOLLPRI;
|
||||||
|
if (!strcmp(t->comm, ULMK_MAGIC))
|
||||||
|
ulmk_watchdog_pet(&t->wdog_timer);
|
||||||
|
}
|
||||||
|
|
||||||
kref_put(&t->refcount, psi_trigger_destroy);
|
kref_put(&t->refcount, psi_trigger_destroy);
|
||||||
|
|
||||||
|
|
|
@ -77,8 +77,14 @@ DEFINE_MUTEX(oom_lock);
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifdef CONFIG_HAVE_USERSPACE_LOW_MEMORY_KILLER
|
#ifdef CONFIG_HAVE_USERSPACE_LOW_MEMORY_KILLER
|
||||||
|
|
||||||
|
/* The maximum amount of time to loop in should_ulmk_retry() */
|
||||||
|
#define ULMK_TIMEOUT (20 * HZ)
|
||||||
|
|
||||||
|
static atomic64_t ulmk_wdog_expired = ATOMIC64_INIT(0);
|
||||||
static atomic64_t ulmk_kill_jiffies = ATOMIC64_INIT(INITIAL_JIFFIES);
|
static atomic64_t ulmk_kill_jiffies = ATOMIC64_INIT(INITIAL_JIFFIES);
|
||||||
static unsigned long psi_emergency_jiffies = INITIAL_JIFFIES;
|
static unsigned long psi_emergency_jiffies = INITIAL_JIFFIES;
|
||||||
|
/* Prevents contention on the mutex_trylock in psi_emergency_jiffies */
|
||||||
static DEFINE_MUTEX(ulmk_retry_lock);
|
static DEFINE_MUTEX(ulmk_retry_lock);
|
||||||
|
|
||||||
static bool ulmk_kill_possible(void)
|
static bool ulmk_kill_possible(void)
|
||||||
|
@ -105,50 +111,74 @@ static bool ulmk_kill_possible(void)
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* psi_emergency_jiffies represents the last ULMK emergency event.
|
* If CONFIG_DEBUG_PANIC_ON_OOM is enabled, attempt to determine *why*
|
||||||
* Give ULMK a 2 second window to handle this event.
|
* we are in this state.
|
||||||
* If ULMK has made some progress since then, send another.
|
* 1) No events were sent by PSI to userspace
|
||||||
* Repeat as necessary.
|
* 2) PSI sent an event to userspace, but userspace was not able to
|
||||||
|
* receive the event. Possible causes of this include waiting for a
|
||||||
|
* mutex which is held by a process in direct relcaim. Or the userspace
|
||||||
|
* component has crashed.
|
||||||
|
* 3) Userspace received the event, but decided not to kill anything.
|
||||||
*/
|
*/
|
||||||
bool should_ulmk_retry(void)
|
bool should_ulmk_retry(gfp_t gfp_mask)
|
||||||
{
|
{
|
||||||
unsigned long now, last_kill;
|
unsigned long now, last_kill;
|
||||||
bool ret = false;
|
bool ret = true;
|
||||||
|
bool wdog_expired, trigger_active;
|
||||||
|
|
||||||
|
struct oom_control oc = {
|
||||||
|
.zonelist = node_zonelist(first_memory_node, gfp_mask),
|
||||||
|
.nodemask = NULL,
|
||||||
|
.memcg = NULL,
|
||||||
|
.gfp_mask = gfp_mask,
|
||||||
|
.order = 0,
|
||||||
|
/* Also causes check_panic_on_oom not to panic */
|
||||||
|
.only_positive_adj = true,
|
||||||
|
};
|
||||||
|
|
||||||
|
if (!sysctl_panic_on_oom)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
/* Someone else is already checking. */
|
||||||
|
if (!mutex_trylock(&ulmk_retry_lock))
|
||||||
|
return true;
|
||||||
|
|
||||||
mutex_lock(&ulmk_retry_lock);
|
|
||||||
now = jiffies;
|
now = jiffies;
|
||||||
last_kill = atomic64_read(&ulmk_kill_jiffies);
|
last_kill = atomic64_read(&ulmk_kill_jiffies);
|
||||||
if (time_before(now, psi_emergency_jiffies + 2 * HZ)) {
|
wdog_expired = atomic64_read(&ulmk_wdog_expired);
|
||||||
ret = true;
|
trigger_active = psi_is_trigger_active();
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (time_after_eq(last_kill, psi_emergency_jiffies)) {
|
if (time_after(last_kill, psi_emergency_jiffies)) {
|
||||||
psi_emergency_jiffies = now;
|
psi_emergency_jiffies = now;
|
||||||
|
ret = true;
|
||||||
|
} else if (time_after(now, psi_emergency_jiffies + ULMK_TIMEOUT)) {
|
||||||
|
ret = false;
|
||||||
|
} else if (!trigger_active) {
|
||||||
psi_emergency_trigger();
|
psi_emergency_trigger();
|
||||||
ret = true;
|
ret = true;
|
||||||
goto out;
|
} else if (wdog_expired) {
|
||||||
|
mutex_lock(&oom_lock);
|
||||||
|
ret = out_of_memory(&oc);
|
||||||
|
mutex_unlock(&oom_lock);
|
||||||
|
} else if (!ulmk_kill_possible()) {
|
||||||
|
ret = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* We reached here means no kill have had happened since the last
|
|
||||||
* emergency trigger for 2*HZ window. We can't derive the status
|
|
||||||
* of the low memory killer here. So, before falling back to OOM,
|
|
||||||
* check for any +ve adj tasks left in the system in repeat for
|
|
||||||
* next 20*HZ. Indirectly the below logic also giving 20HZ window
|
|
||||||
* for the first emergency trigger.
|
|
||||||
*/
|
|
||||||
if (time_after(psi_emergency_jiffies + 20 * HZ, now) &&
|
|
||||||
ulmk_kill_possible()) {
|
|
||||||
ret = true;
|
|
||||||
goto out;
|
|
||||||
}
|
|
||||||
|
|
||||||
out:
|
|
||||||
mutex_unlock(&ulmk_retry_lock);
|
mutex_unlock(&ulmk_retry_lock);
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ulmk_watchdog_fn(struct timer_list *t)
|
||||||
|
{
|
||||||
|
atomic64_set(&ulmk_wdog_expired, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ulmk_watchdog_pet(struct timer_list *t)
|
||||||
|
{
|
||||||
|
del_timer_sync(t);
|
||||||
|
atomic64_set(&ulmk_wdog_expired, 0);
|
||||||
|
}
|
||||||
|
|
||||||
void ulmk_update_last_kill(void)
|
void ulmk_update_last_kill(void)
|
||||||
{
|
{
|
||||||
atomic64_set(&ulmk_kill_jiffies, jiffies);
|
atomic64_set(&ulmk_kill_jiffies, jiffies);
|
||||||
|
@ -1143,7 +1173,7 @@ static void check_panic_on_oom(struct oom_control *oc,
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
/* Do not panic for oom kills triggered by sysrq */
|
/* Do not panic for oom kills triggered by sysrq */
|
||||||
if (is_sysrq_oom(oc))
|
if (is_sysrq_oom(oc) || oc->only_positive_adj)
|
||||||
return;
|
return;
|
||||||
dump_header(oc, NULL);
|
dump_header(oc, NULL);
|
||||||
panic("Out of memory: %s panic_on_oom is enabled\n",
|
panic("Out of memory: %s panic_on_oom is enabled\n",
|
||||||
|
@ -1243,7 +1273,8 @@ bool out_of_memory(struct oom_control *oc)
|
||||||
* system level, we cannot survive this and will enter
|
* system level, we cannot survive this and will enter
|
||||||
* an endless loop in the allocator. Bail out now.
|
* an endless loop in the allocator. Bail out now.
|
||||||
*/
|
*/
|
||||||
if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
|
if (!is_sysrq_oom(oc) && !is_memcg_oom(oc) &&
|
||||||
|
!oc->only_positive_adj)
|
||||||
panic("System is deadlocked on memory\n");
|
panic("System is deadlocked on memory\n");
|
||||||
}
|
}
|
||||||
if (oc->chosen && oc->chosen != (void *)-1UL)
|
if (oc->chosen && oc->chosen != (void *)-1UL)
|
||||||
|
|
|
@ -4582,7 +4582,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
|
||||||
&compaction_retries))
|
&compaction_retries))
|
||||||
goto retry;
|
goto retry;
|
||||||
|
|
||||||
if (order <= PAGE_ALLOC_COSTLY_ORDER && should_ulmk_retry())
|
if (order <= PAGE_ALLOC_COSTLY_ORDER && should_ulmk_retry(gfp_mask))
|
||||||
goto retry;
|
goto retry;
|
||||||
|
|
||||||
/* Deal with possible cpuset update races before we start OOM killing */
|
/* Deal with possible cpuset update races before we start OOM killing */
|
||||||
|
|
Loading…
Reference in a new issue