mm/oom-kill: Run oom-killer if ULMK is stuck

ULMK has been observed to get stuck waiting for mutexes held by processes
in direct reclaim for many seconds.

rwsem_down_read_failed_killable+0x20
down_read_killable+0xa8
__access_remote_vm+0x54
access_remote_vm+0x48
proc_pid_cmdline_read+0xe0
__vfs_read+0x54
vfs_read+0xa4
__arm64_sys_pread64+0x80
el0_svc_common+0xac
el0_svc_handler+0x7c
el0_svc+0x8

To resolve these deadlocks, allow the oom-killer to run when a low memory
situation is detected.

One side effect of running the oom-killer is that it may make it more
difficult to detect issues with ULMK. Therefore on debug builds,
attempt to detect whether ULMK is stuck using a watchdog timer mechanism,
as opposed to running, but deciding not to issue a kill.

Change-Id: If1d629e1553c3562b3d23442abffc2faedb31ba2
Signed-off-by: Patrick Daly <pdaly@codeaurora.org>
This commit is contained in:
Patrick Daly 2019-10-24 14:25:38 -07:00
parent 5ec07046cd
commit 2aacc29713
6 changed files with 127 additions and 36 deletions

View file

@ -123,14 +123,18 @@ extern void dump_tasks(struct mem_cgroup *memcg,
const nodemask_t *nodemask);
#ifdef CONFIG_HAVE_USERSPACE_LOW_MEMORY_KILLER
extern bool should_ulmk_retry(void);
extern bool should_ulmk_retry(gfp_t gfp);
extern void ulmk_update_last_kill(void);
extern void ulmk_watchdog_fn(struct timer_list *t);
extern void ulmk_watchdog_pet(struct timer_list *t);
#else
static inline bool should_ulmk_retry(void)
static inline bool should_ulmk_retry(gfp_t gfp)
{
return false;
}
static inline void ulmk_update_last_kill(void) {}
static inline void ulmk_watchdog_fn(struct timer_list *t) {}
static inline void ulmk_watchdog_pet(struct timer_list *t) {}
#endif
/* sysctls */

View file

@ -24,6 +24,7 @@ void psi_memstall_leave(unsigned long *flags);
int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
void psi_emergency_trigger(void);
bool psi_is_trigger_active(void);
#ifdef CONFIG_CGROUPS
int psi_cgroup_alloc(struct cgroup *cgrp);
@ -46,6 +47,10 @@ static inline void psi_memstall_enter(unsigned long *flags) {}
static inline void psi_memstall_leave(unsigned long *flags) {}
static inline void psi_emergency_trigger(void){}
static inline bool psi_is_trigger_active(void)
{
return false;
}
#ifdef CONFIG_CGROUPS
static inline int psi_cgroup_alloc(struct cgroup *cgrp)

View file

@ -6,6 +6,7 @@
#include <linux/types.h>
#include <linux/kref.h>
#include <linux/wait.h>
#include <linux/timer.h>
#ifdef CONFIG_PSI
@ -126,6 +127,7 @@ struct psi_trigger {
/* Task that created the trigger */
char comm[TASK_COMM_LEN];
struct timer_list wdog_timer;
};
struct psi_group {

View file

@ -575,8 +575,12 @@ static u64 update_triggers(struct psi_group *group, u64 now)
trace_psi_event(t->state, t->threshold);
/* Generate an event */
if (cmpxchg(&t->event, 0, 1) == 0)
if (cmpxchg(&t->event, 0, 1) == 0) {
if (!strcmp(t->comm, ULMK_MAGIC))
mod_timer(&t->wdog_timer, jiffies +
nsecs_to_jiffies(2 * t->win.size));
wake_up_interruptible(&t->event_wait);
}
t->last_event_time = now;
}
@ -588,10 +592,14 @@ static u64 update_triggers(struct psi_group *group, u64 now)
return now + group->poll_min_period;
}
/*
* Allows sending more than one event per window.
*/
void psi_emergency_trigger(void)
{
struct psi_group *group = &psi_system;
struct psi_trigger *t;
u64 now;
if (static_branch_likely(&psi_disabled))
return;
@ -603,18 +611,54 @@ void psi_emergency_trigger(void)
if (!mutex_trylock(&group->trigger_lock))
return;
now = sched_clock();
list_for_each_entry(t, &group->triggers, node) {
if (strcmp(t->comm, ULMK_MAGIC))
continue;
trace_psi_event(t->state, t->threshold);
/* Generate an event */
if (cmpxchg(&t->event, 0, 1) == 0)
if (cmpxchg(&t->event, 0, 1) == 0) {
mod_timer(&t->wdog_timer, (unsigned long)t->win.size);
wake_up_interruptible(&t->event_wait);
}
t->last_event_time = now;
}
mutex_unlock(&group->trigger_lock);
}
/*
* Return true if any trigger is active.
*/
bool psi_is_trigger_active(void)
{
struct psi_group *group = &psi_system;
struct psi_trigger *t;
bool trigger_active = false;
u64 now;
if (static_branch_likely(&psi_disabled))
return false;
/*
* In the unlikely case that OOM is triggered while triggers are
* being added or removed.
*/
if (!mutex_trylock(&group->trigger_lock))
return true;
now = sched_clock();
list_for_each_entry(t, &group->triggers, node) {
if (strcmp(t->comm, ULMK_MAGIC))
continue;
if (now <= t->last_event_time + t->win.size)
trigger_active = true;
}
mutex_unlock(&group->trigger_lock);
return trigger_active;
}
/*
* Schedule polling if it's not already scheduled. It's safe to call even from
* hotpath because even though kthread_queue_delayed_work takes worker->lock
@ -1116,6 +1160,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
init_waitqueue_head(&t->event_wait);
kref_init(&t->refcount);
get_task_comm(t->comm, current);
timer_setup(&t->wdog_timer, ulmk_watchdog_fn, TIMER_DEFERRABLE);
mutex_lock(&group->trigger_lock);
@ -1188,6 +1233,7 @@ static void psi_trigger_destroy(struct kref *ref)
}
}
del_timer_sync(&t->wdog_timer);
mutex_unlock(&group->trigger_lock);
/*
@ -1241,8 +1287,11 @@ __poll_t psi_trigger_poll(void **trigger_ptr,
poll_wait(file, &t->event_wait, wait);
if (cmpxchg(&t->event, 1, 0) == 1)
if (cmpxchg(&t->event, 1, 0) == 1) {
ret |= EPOLLPRI;
if (!strcmp(t->comm, ULMK_MAGIC))
ulmk_watchdog_pet(&t->wdog_timer);
}
kref_put(&t->refcount, psi_trigger_destroy);

View file

@ -77,8 +77,14 @@ DEFINE_MUTEX(oom_lock);
*/
#ifdef CONFIG_HAVE_USERSPACE_LOW_MEMORY_KILLER
/* The maximum amount of time to loop in should_ulmk_retry() */
#define ULMK_TIMEOUT (20 * HZ)
static atomic64_t ulmk_wdog_expired = ATOMIC64_INIT(0);
static atomic64_t ulmk_kill_jiffies = ATOMIC64_INIT(INITIAL_JIFFIES);
static unsigned long psi_emergency_jiffies = INITIAL_JIFFIES;
/* Prevents contention on the mutex_trylock in psi_emergency_jiffies */
static DEFINE_MUTEX(ulmk_retry_lock);
static bool ulmk_kill_possible(void)
@ -105,50 +111,74 @@ static bool ulmk_kill_possible(void)
}
/*
* psi_emergency_jiffies represents the last ULMK emergency event.
* Give ULMK a 2 second window to handle this event.
* If ULMK has made some progress since then, send another.
* Repeat as necessary.
* If CONFIG_DEBUG_PANIC_ON_OOM is enabled, attempt to determine *why*
* we are in this state.
* 1) No events were sent by PSI to userspace
* 2) PSI sent an event to userspace, but userspace was not able to
* receive the event. Possible causes of this include waiting for a
* mutex which is held by a process in direct reclaim. Or the userspace
* component has crashed.
* 3) Userspace received the event, but decided not to kill anything.
*/
bool should_ulmk_retry(void)
bool should_ulmk_retry(gfp_t gfp_mask)
{
unsigned long now, last_kill;
bool ret = false;
bool ret = true;
bool wdog_expired, trigger_active;
struct oom_control oc = {
.zonelist = node_zonelist(first_memory_node, gfp_mask),
.nodemask = NULL,
.memcg = NULL,
.gfp_mask = gfp_mask,
.order = 0,
/* Also causes check_panic_on_oom not to panic */
.only_positive_adj = true,
};
if (!sysctl_panic_on_oom)
return false;
/* Someone else is already checking. */
if (!mutex_trylock(&ulmk_retry_lock))
return true;
mutex_lock(&ulmk_retry_lock);
now = jiffies;
last_kill = atomic64_read(&ulmk_kill_jiffies);
if (time_before(now, psi_emergency_jiffies + 2 * HZ)) {
ret = true;
goto out;
}
wdog_expired = atomic64_read(&ulmk_wdog_expired);
trigger_active = psi_is_trigger_active();
if (time_after_eq(last_kill, psi_emergency_jiffies)) {
if (time_after(last_kill, psi_emergency_jiffies)) {
psi_emergency_jiffies = now;
ret = true;
} else if (time_after(now, psi_emergency_jiffies + ULMK_TIMEOUT)) {
ret = false;
} else if (!trigger_active) {
psi_emergency_trigger();
ret = true;
goto out;
} else if (wdog_expired) {
mutex_lock(&oom_lock);
ret = out_of_memory(&oc);
mutex_unlock(&oom_lock);
} else if (!ulmk_kill_possible()) {
ret = false;
}
/*
* Reaching this point means no kill has happened since the last
* emergency trigger within the 2*HZ window. We can't determine the
* status of the low memory killer here. So, before falling back to
* OOM, check whether any positive-adj tasks are left in the system,
* repeating for the next 20*HZ. Indirectly, the logic below also
* gives a 20*HZ window to the first emergency trigger.
*/
if (time_after(psi_emergency_jiffies + 20 * HZ, now) &&
ulmk_kill_possible()) {
ret = true;
goto out;
}
out:
mutex_unlock(&ulmk_retry_lock);
return ret;
}
void ulmk_watchdog_fn(struct timer_list *t)
{
atomic64_set(&ulmk_wdog_expired, 1);
}
void ulmk_watchdog_pet(struct timer_list *t)
{
del_timer_sync(t);
atomic64_set(&ulmk_wdog_expired, 0);
}
void ulmk_update_last_kill(void)
{
atomic64_set(&ulmk_kill_jiffies, jiffies);
@ -1143,7 +1173,7 @@ static void check_panic_on_oom(struct oom_control *oc,
return;
}
/* Do not panic for oom kills triggered by sysrq */
if (is_sysrq_oom(oc))
if (is_sysrq_oom(oc) || oc->only_positive_adj)
return;
dump_header(oc, NULL);
panic("Out of memory: %s panic_on_oom is enabled\n",
@ -1243,7 +1273,8 @@ bool out_of_memory(struct oom_control *oc)
* system level, we cannot survive this and will enter
* an endless loop in the allocator. Bail out now.
*/
if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
if (!is_sysrq_oom(oc) && !is_memcg_oom(oc) &&
!oc->only_positive_adj)
panic("System is deadlocked on memory\n");
}
if (oc->chosen && oc->chosen != (void *)-1UL)

View file

@ -4582,7 +4582,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
&compaction_retries))
goto retry;
if (order <= PAGE_ALLOC_COSTLY_ORDER && should_ulmk_retry())
if (order <= PAGE_ALLOC_COSTLY_ORDER && should_ulmk_retry(gfp_mask))
goto retry;
/* Deal with possible cpuset update races before we start OOM killing */