mm/oom-kill: Run oom-killer if ULMK is stuck

ULMK has been observed to get stuck for many seconds, waiting on mutexes
held by processes in direct reclaim. One example, blocked on a victim's
mmap_sem while reading its cmdline:

rwsem_down_read_failed_killable+0x20
down_read_killable+0xa8
__access_remote_vm+0x54
access_remote_vm+0x48
proc_pid_cmdline_read+0xe0
__vfs_read+0x54
vfs_read+0xa4
__arm64_sys_pread64+0x80
el0_svc_common+0xac
el0_svc_handler+0x7c
el0_svc+0x8

To resolve these deadlocks, allow the oom-killer to run when a low memory
situation is detected.

One side effect of running the oom-killer is that it may make genuine ULMK
problems harder to detect. Therefore, on debug builds, attempt to detect
whether ULMK is actually stuck using a watchdog timer mechanism,
distinguishing that case from ULMK running but deciding not to issue a kill.

Change-Id: If1d629e1553c3562b3d23442abffc2faedb31ba2
Signed-off-by: Patrick Daly <pdaly@codeaurora.org>
Author: Patrick Daly 2019-10-24 14:25:38 -07:00
Commit: 2aacc29713 (parent 5ec07046cd)
6 changed files with 127 additions and 36 deletions
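
At its core the patch adds a watchdog around PSI event delivery: arm a timer
when an event is queued for ULMK, pet it when ULMK consumes the event, and
treat expiry as evidence that ULMK is stuck. A minimal sketch of that pattern
(simplified, assumed names; the patch's real versions are the ulmk_watchdog_*
functions in mm/oom_kill.c below, driven from kernel/sched/psi.c):

#include <linux/timer.h>
#include <linux/atomic.h>
#include <linux/jiffies.h>

static atomic64_t wdog_expired = ATOMIC64_INIT(0);
static struct timer_list wdog_timer;

/* Expiry: the event was handed to userspace but never consumed. */
static void wdog_fn(struct timer_list *t)
{
	atomic64_set(&wdog_expired, 1);
}

static void wdog_init(void)
{
	/* Deferrable: a stall check need not wake an idle CPU. */
	timer_setup(&wdog_timer, wdog_fn, TIMER_DEFERRABLE);
}

/* Arm when an event is queued for the userspace killer. */
static void wdog_arm(unsigned long window)
{
	mod_timer(&wdog_timer, jiffies + window);
}

/* Pet when userspace actually consumes the event. */
static void wdog_pet(void)
{
	del_timer_sync(&wdog_timer);
	atomic64_set(&wdog_expired, 0);
}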

include/linux/oom.h

@@ -123,14 +123,18 @@ extern void dump_tasks(struct mem_cgroup *memcg,
 			const nodemask_t *nodemask);

 #ifdef CONFIG_HAVE_USERSPACE_LOW_MEMORY_KILLER
-extern bool should_ulmk_retry(void);
+extern bool should_ulmk_retry(gfp_t gfp);
 extern void ulmk_update_last_kill(void);
+extern void ulmk_watchdog_fn(struct timer_list *t);
+extern void ulmk_watchdog_pet(struct timer_list *t);
 #else
-static inline bool should_ulmk_retry(void)
+static inline bool should_ulmk_retry(gfp_t gfp)
 {
 	return false;
 }
 static inline void ulmk_update_last_kill(void) {}
+static inline void ulmk_watchdog_fn(struct timer_list *t) {}
+static inline void ulmk_watchdog_pet(struct timer_list *t) {}
 #endif

 /* sysctls */

include/linux/psi.h

@@ -24,6 +24,7 @@ void psi_memstall_leave(unsigned long *flags);

 int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
 void psi_emergency_trigger(void);
+bool psi_is_trigger_active(void);

 #ifdef CONFIG_CGROUPS
 int psi_cgroup_alloc(struct cgroup *cgrp);
@@ -46,6 +47,10 @@ static inline void psi_memstall_enter(unsigned long *flags) {}
 static inline void psi_memstall_leave(unsigned long *flags) {}
 static inline void psi_emergency_trigger(void) {}
+static inline bool psi_is_trigger_active(void)
+{
+	return false;
+}

 #ifdef CONFIG_CGROUPS
 static inline int psi_cgroup_alloc(struct cgroup *cgrp)

include/linux/psi_types.h

@@ -6,6 +6,7 @@
 #include <linux/types.h>
 #include <linux/kref.h>
 #include <linux/wait.h>
+#include <linux/timer.h>

 #ifdef CONFIG_PSI
@@ -126,6 +127,7 @@ struct psi_trigger {
 	/* Task that created the trigger */
 	char comm[TASK_COMM_LEN];
+	struct timer_list wdog_timer;
 };

 struct psi_group {

kernel/sched/psi.c

@@ -575,8 +575,12 @@ static u64 update_triggers(struct psi_group *group, u64 now)
 			trace_psi_event(t->state, t->threshold);

 			/* Generate an event */
-			if (cmpxchg(&t->event, 0, 1) == 0)
+			if (cmpxchg(&t->event, 0, 1) == 0) {
+				if (!strcmp(t->comm, ULMK_MAGIC))
+					mod_timer(&t->wdog_timer, jiffies +
+						  nsecs_to_jiffies(2 * t->win.size));
 				wake_up_interruptible(&t->event_wait);
+			}
 			t->last_event_time = now;
 		}
@@ -588,10 +592,14 @@ static u64 update_triggers(struct psi_group *group, u64 now)
 	return now + group->poll_min_period;
 }

+/*
+ * Allows sending more than one event per window.
+ */
 void psi_emergency_trigger(void)
 {
 	struct psi_group *group = &psi_system;
 	struct psi_trigger *t;
+	u64 now;

 	if (static_branch_likely(&psi_disabled))
 		return;
@@ -603,18 +611,54 @@ void psi_emergency_trigger(void)
 	if (!mutex_trylock(&group->trigger_lock))
 		return;

+	now = sched_clock();
 	list_for_each_entry(t, &group->triggers, node) {
 		if (strcmp(t->comm, ULMK_MAGIC))
 			continue;

 		trace_psi_event(t->state, t->threshold);

 		/* Generate an event */
-		if (cmpxchg(&t->event, 0, 1) == 0)
+		if (cmpxchg(&t->event, 0, 1) == 0) {
+			mod_timer(&t->wdog_timer, (unsigned long)t->win.size);
 			wake_up_interruptible(&t->event_wait);
+		}
+		t->last_event_time = now;
 	}
 	mutex_unlock(&group->trigger_lock);
 }

+/*
+ * Return true if any trigger is active.
+ */
+bool psi_is_trigger_active(void)
+{
+	struct psi_group *group = &psi_system;
+	struct psi_trigger *t;
+	bool trigger_active = false;
+	u64 now;
+
+	if (static_branch_likely(&psi_disabled))
+		return false;
+
+	/*
+	 * In the unlikely case that OOM was triggered while adding/
+	 * removing triggers.
+	 */
+	if (!mutex_trylock(&group->trigger_lock))
+		return true;
+
+	now = sched_clock();
+	list_for_each_entry(t, &group->triggers, node) {
+		if (strcmp(t->comm, ULMK_MAGIC))
+			continue;
+
+		if (now <= t->last_event_time + t->win.size)
+			trigger_active = true;
+	}
+	mutex_unlock(&group->trigger_lock);
+
+	return trigger_active;
+}
+
 /*
  * Schedule polling if it's not already scheduled. It's safe to call even from
  * hotpath because even though kthread_queue_delayed_work takes worker->lock
@@ -1116,6 +1160,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
 	init_waitqueue_head(&t->event_wait);
 	kref_init(&t->refcount);
 	get_task_comm(t->comm, current);
+	timer_setup(&t->wdog_timer, ulmk_watchdog_fn, TIMER_DEFERRABLE);

 	mutex_lock(&group->trigger_lock);
@@ -1188,6 +1233,7 @@ static void psi_trigger_destroy(struct kref *ref)
 		}
 	}

+	del_timer_sync(&t->wdog_timer);
 	mutex_unlock(&group->trigger_lock);

 	/*
@@ -1241,8 +1287,11 @@ __poll_t psi_trigger_poll(void **trigger_ptr,

 	poll_wait(file, &t->event_wait, wait);

-	if (cmpxchg(&t->event, 1, 0) == 1)
+	if (cmpxchg(&t->event, 1, 0) == 1) {
 		ret |= EPOLLPRI;
+		if (!strcmp(t->comm, ULMK_MAGIC))
+			ulmk_watchdog_pet(&t->wdog_timer);
+	}

 	kref_put(&t->refcount, psi_trigger_destroy);
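
For context, the userspace side being watched looks roughly like this: a ULMK
daemon creates a PSI trigger by writing to /proc/pressure/memory, then polls
the fd for POLLPRI. With the psi_trigger_poll() hunk above, each event the
daemon consumes pets the watchdog, so a daemon that stops reaching poll() lets
the timer expire. A minimal sketch using the standard PSI trigger API (not
code from this patch; the 70ms/1s threshold is illustrative, and the daemon's
comm must match ULMK_MAGIC for the watchdog wiring to engage):

#include <fcntl.h>
#include <poll.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* "some 70000 1000000": 70ms of memory stall per 1s window. */
	const char trig[] = "some 70000 1000000";
	struct pollfd pfd;
	int fd;

	fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
	if (fd < 0 || write(fd, trig, strlen(trig) + 1) < 0)
		return 1;

	pfd.fd = fd;
	pfd.events = POLLPRI;
	for (;;) {
		if (poll(&pfd, 1, -1) < 0)
			break;
		if (pfd.revents & POLLPRI) {
			/* Memory pressure event: pick and kill a victim. */
		}
	}
	close(fd);
	return 0;
}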

mm/oom_kill.c

@@ -77,8 +77,14 @@ DEFINE_MUTEX(oom_lock);
  */

 #ifdef CONFIG_HAVE_USERSPACE_LOW_MEMORY_KILLER
+/* The maximum amount of time to loop in should_ulmk_retry() */
+#define ULMK_TIMEOUT (20 * HZ)
+
+static atomic64_t ulmk_wdog_expired = ATOMIC64_INIT(0);
 static atomic64_t ulmk_kill_jiffies = ATOMIC64_INIT(INITIAL_JIFFIES);
 static unsigned long psi_emergency_jiffies = INITIAL_JIFFIES;
+/* Prevents contention on the mutex_trylock in psi_emergency_trigger() */
 static DEFINE_MUTEX(ulmk_retry_lock);

 static bool ulmk_kill_possible(void)
@@ -105,50 +111,74 @@ static bool ulmk_kill_possible(void)
 }

 /*
- * psi_emergency_jiffies represents the last ULMK emergency event.
- * Give ULMK a 2 second window to handle this event.
- * If ULMK has made some progress since then, send another.
- * Repeat as necessary.
+ * If CONFIG_DEBUG_PANIC_ON_OOM is enabled, attempt to determine *why*
+ * we are in this state.
+ * 1) No events were sent by PSI to userspace.
+ * 2) PSI sent an event to userspace, but userspace was not able to
+ *    receive the event. Possible causes of this include waiting for a
+ *    mutex which is held by a process in direct reclaim, or a crash of
+ *    the userspace component.
+ * 3) Userspace received the event, but decided not to kill anything.
  */
-bool should_ulmk_retry(void)
+bool should_ulmk_retry(gfp_t gfp_mask)
 {
 	unsigned long now, last_kill;
-	bool ret = false;
+	bool ret = true;
+	bool wdog_expired, trigger_active;
+	struct oom_control oc = {
+		.zonelist = node_zonelist(first_memory_node, gfp_mask),
+		.nodemask = NULL,
+		.memcg = NULL,
+		.gfp_mask = gfp_mask,
+		.order = 0,
+		/* Also causes check_panic_on_oom not to panic */
+		.only_positive_adj = true,
+	};

-	/* Someone else is already checking. */
-	if (!mutex_trylock(&ulmk_retry_lock))
-		return true;
+	if (!sysctl_panic_on_oom)
+		return false;

+	mutex_lock(&ulmk_retry_lock);
 	now = jiffies;
 	last_kill = atomic64_read(&ulmk_kill_jiffies);
-	if (time_before(now, psi_emergency_jiffies + 2 * HZ)) {
-		ret = true;
-		goto out;
-	}
+	wdog_expired = atomic64_read(&ulmk_wdog_expired);
+	trigger_active = psi_is_trigger_active();

-	if (time_after_eq(last_kill, psi_emergency_jiffies)) {
+	if (time_after(last_kill, psi_emergency_jiffies)) {
 		psi_emergency_jiffies = now;
+		ret = true;
+	} else if (time_after(now, psi_emergency_jiffies + ULMK_TIMEOUT)) {
+		ret = false;
+	} else if (!trigger_active) {
 		psi_emergency_trigger();
 		ret = true;
-		goto out;
+	} else if (wdog_expired) {
+		mutex_lock(&oom_lock);
+		ret = out_of_memory(&oc);
+		mutex_unlock(&oom_lock);
+	} else if (!ulmk_kill_possible()) {
+		ret = false;
 	}

-	/*
-	 * We reached here means no kill have had happened since the last
-	 * emergency trigger for 2*HZ window. We can't derive the status
-	 * of the low memory killer here. So, before falling back to OOM,
-	 * check for any +ve adj tasks left in the system in repeat for
-	 * next 20*HZ. Indirectly the below logic also giving 20HZ window
-	 * for the first emergency trigger.
-	 */
-	if (time_after(psi_emergency_jiffies + 20 * HZ, now) &&
-	    ulmk_kill_possible()) {
-		ret = true;
-		goto out;
-	}
-
-out:
 	mutex_unlock(&ulmk_retry_lock);
 	return ret;
 }

+void ulmk_watchdog_fn(struct timer_list *t)
+{
+	atomic64_set(&ulmk_wdog_expired, 1);
+}
+
+void ulmk_watchdog_pet(struct timer_list *t)
+{
+	del_timer_sync(t);
+	atomic64_set(&ulmk_wdog_expired, 0);
+}
+
 void ulmk_update_last_kill(void)
 {
 	atomic64_set(&ulmk_kill_jiffies, jiffies);
@@ -1143,7 +1173,7 @@ static void check_panic_on_oom(struct oom_control *oc,
 		return;
 	}
 	/* Do not panic for oom kills triggered by sysrq */
-	if (is_sysrq_oom(oc))
+	if (is_sysrq_oom(oc) || oc->only_positive_adj)
 		return;
 	dump_header(oc, NULL);
 	panic("Out of memory: %s panic_on_oom is enabled\n",
@@ -1243,7 +1273,8 @@ bool out_of_memory(struct oom_control *oc)
 		 * system level, we cannot survive this and will enter
 		 * an endless loop in the allocator. Bail out now.
 		 */
-		if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
+		if (!is_sysrq_oom(oc) && !is_memcg_oom(oc) &&
+		    !oc->only_positive_adj)
 			panic("System is deadlocked on memory\n");
 	}
 	if (oc->chosen && oc->chosen != (void *)-1UL)
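
In the wdog_expired branch above, out_of_memory() runs with only_positive_adj
set, which (per the field's naming and the positive-adj check in
ulmk_kill_possible()) appears to confine victim selection to processes that
userspace already marked expendable, while the two hunks above keep such kills
from tripping the panic paths. For illustration only, this is how a process
enters that pool via the standard procfs knob (the value 500 is arbitrary):

#include <stdio.h>

int main(void)
{
	/*
	 * oom_score_adj ranges from -1000 (never kill) to 1000 (kill
	 * first); any value > 0 makes this task a positive-adj victim.
	 */
	FILE *f = fopen("/proc/self/oom_score_adj", "w");

	if (!f)
		return 1;
	fprintf(f, "500\n");
	fclose(f);
	/* ... proceed with expendable work ... */
	return 0;
}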

mm/page_alloc.c

@@ -4582,7 +4582,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 					&compaction_retries))
 		goto retry;

-	if (order <= PAGE_ALLOC_COSTLY_ORDER && should_ulmk_retry())
+	if (order <= PAGE_ALLOC_COSTLY_ORDER && should_ulmk_retry(gfp_mask))
 		goto retry;

 	/* Deal with possible cpuset update races before we start OOM killing */