mm/oom-kill: Run oom-killer if ULMK is stuck
ULMK has been observed to get stuck for many seconds waiting for mutexes held by processes in direct reclaim, for example:

rwsem_down_read_failed_killable+0x20
down_read_killable+0xa8
__access_remote_vm+0x54
access_remote_vm+0x48
proc_pid_cmdline_read+0xe0
__vfs_read+0x54
vfs_read+0xa4
__arm64_sys_pread64+0x80
el0_svc_common+0xac
el0_svc_handler+0x7c
el0_svc+0x8

To resolve these deadlocks, allow the oom-killer to run when a low memory situation is detected. One side effect of running the oom-killer is that it may make it harder to detect issues with ULMK itself. Therefore, on debug builds, use a watchdog timer mechanism to detect whether ULMK is actually stuck, as opposed to running but deciding not to issue a kill.

Change-Id: If1d629e1553c3562b3d23442abffc2faedb31ba2
Signed-off-by: Patrick Daly <pdaly@codeaurora.org>
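For context: "ULMK" is a userspace low-memory killer (on Android typically lmkd) that registers a PSI memory-pressure trigger and polls it for events; the kernel changes below arm a watchdog timer when such an event is generated for a trigger owned by that process, and clear it only when userspace actually consumes the event. The sketch below shows, in broad strokes, what that userspace side looks like. It is illustrative only: the threshold values are arbitrary and the kill step is a placeholder, neither is taken from this patch.

/*
 * Minimal sketch of the userspace side this patch is defending: a
 * low-memory killer that registers a PSI memory trigger and waits for
 * pressure events.
 */
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
    /* Ask for an event when memory stall time exceeds 70ms per 1s window. */
    const char trig[] = "some 70000 1000000";
    int fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);

    if (fd < 0 || write(fd, trig, strlen(trig) + 1) < 0) {
        perror("psi trigger setup");
        return 1;
    }

    for (;;) {
        struct pollfd pfd = { .fd = fd, .events = POLLPRI };

        /* Block until the kernel reports memory pressure (POLLPRI). */
        if (poll(&pfd, 1, -1) < 0) {
            if (errno == EINTR)
                continue;
            perror("poll");
            return 1;
        }
        if (pfd.revents & POLLERR) {
            fprintf(stderr, "trigger went away\n");
            return 1;
        }
        if (pfd.revents & POLLPRI) {
            /*
             * Pick and kill a victim here. If this thread instead blocks
             * on a lock held by a task in direct reclaim (the scenario in
             * the commit message), no kill ever happens -- which is what
             * the new kernel watchdog detects.
             */
            printf("memory pressure event received\n");
        }
    }
}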
parent 5ec07046cd
commit 2aacc29713

6 changed files with 127 additions and 36 deletions
@@ -123,14 +123,18 @@ extern void dump_tasks(struct mem_cgroup *memcg,
            const nodemask_t *nodemask);

#ifdef CONFIG_HAVE_USERSPACE_LOW_MEMORY_KILLER
extern bool should_ulmk_retry(void);
extern bool should_ulmk_retry(gfp_t gfp);
extern void ulmk_update_last_kill(void);
extern void ulmk_watchdog_fn(struct timer_list *t);
extern void ulmk_watchdog_pet(struct timer_list *t);
#else
static inline bool should_ulmk_retry(void)
static inline bool should_ulmk_retry(gfp_t gfp)
{
    return false;
}
static inline void ulmk_update_last_kill(void) {}
static inline void ulmk_watchdog_fn(struct timer_list *t) {}
static inline void ulmk_watchdog_pet(struct timer_list *t) {}
#endif

/* sysctls */
@@ -24,6 +24,7 @@ void psi_memstall_leave(unsigned long *flags);
int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);

void psi_emergency_trigger(void);
bool psi_is_trigger_active(void);

#ifdef CONFIG_CGROUPS
int psi_cgroup_alloc(struct cgroup *cgrp);

@@ -46,6 +47,10 @@ static inline void psi_memstall_enter(unsigned long *flags) {}
static inline void psi_memstall_leave(unsigned long *flags) {}

static inline void psi_emergency_trigger(void){}
static inline bool psi_is_trigger_active(void)
{
    return false;
}

#ifdef CONFIG_CGROUPS
static inline int psi_cgroup_alloc(struct cgroup *cgrp)
@@ -6,6 +6,7 @@
#include <linux/types.h>
#include <linux/kref.h>
#include <linux/wait.h>
#include <linux/timer.h>

#ifdef CONFIG_PSI

@@ -126,6 +127,7 @@ struct psi_trigger {

    /* Task that created the trigger */
    char comm[TASK_COMM_LEN];
    struct timer_list wdog_timer;
};

struct psi_group {
@@ -575,8 +575,12 @@ static u64 update_triggers(struct psi_group *group, u64 now)
        trace_psi_event(t->state, t->threshold);

        /* Generate an event */
        if (cmpxchg(&t->event, 0, 1) == 0)
        if (cmpxchg(&t->event, 0, 1) == 0) {
            if (!strcmp(t->comm, ULMK_MAGIC))
                mod_timer(&t->wdog_timer, jiffies +
                    nsecs_to_jiffies(2 * t->win.size));
            wake_up_interruptible(&t->event_wait);
        }
        t->last_event_time = now;
    }

@@ -588,10 +592,14 @@ static u64 update_triggers(struct psi_group *group, u64 now)
    return now + group->poll_min_period;
}

/*
 * Allows sending more than one event per window.
 */
void psi_emergency_trigger(void)
{
    struct psi_group *group = &psi_system;
    struct psi_trigger *t;
    u64 now;

    if (static_branch_likely(&psi_disabled))
        return;

@@ -603,18 +611,54 @@ void psi_emergency_trigger(void)
    if (!mutex_trylock(&group->trigger_lock))
        return;

    now = sched_clock();
    list_for_each_entry(t, &group->triggers, node) {
        if (strcmp(t->comm, ULMK_MAGIC))
            continue;
        trace_psi_event(t->state, t->threshold);

        /* Generate an event */
        if (cmpxchg(&t->event, 0, 1) == 0)
        if (cmpxchg(&t->event, 0, 1) == 0) {
            mod_timer(&t->wdog_timer, (unsigned long)t->win.size);
            wake_up_interruptible(&t->event_wait);
        }
        t->last_event_time = now;
    }
    mutex_unlock(&group->trigger_lock);
}

/*
 * Return true if any trigger is active.
 */
bool psi_is_trigger_active(void)
{
    struct psi_group *group = &psi_system;
    struct psi_trigger *t;
    bool trigger_active = false;
    u64 now;

    if (static_branch_likely(&psi_disabled))
        return false;

    /*
     * In unlikely case that OOM was triggered while adding/
     * removing triggers.
     */
    if (!mutex_trylock(&group->trigger_lock))
        return true;

    now = sched_clock();
    list_for_each_entry(t, &group->triggers, node) {
        if (strcmp(t->comm, ULMK_MAGIC))
            continue;

        if (now <= t->last_event_time + t->win.size)
            trigger_active = true;
    }
    mutex_unlock(&group->trigger_lock);
    return trigger_active;
}

/*
 * Schedule polling if it's not already scheduled. It's safe to call even from
 * hotpath because even though kthread_queue_delayed_work takes worker->lock

@@ -1116,6 +1160,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
    init_waitqueue_head(&t->event_wait);
    kref_init(&t->refcount);
    get_task_comm(t->comm, current);
    timer_setup(&t->wdog_timer, ulmk_watchdog_fn, TIMER_DEFERRABLE);

    mutex_lock(&group->trigger_lock);

@@ -1188,6 +1233,7 @@ static void psi_trigger_destroy(struct kref *ref)
        }
    }

    del_timer_sync(&t->wdog_timer);
    mutex_unlock(&group->trigger_lock);

    /*

@@ -1241,8 +1287,11 @@ __poll_t psi_trigger_poll(void **trigger_ptr,

    poll_wait(file, &t->event_wait, wait);

    if (cmpxchg(&t->event, 1, 0) == 1)
    if (cmpxchg(&t->event, 1, 0) == 1) {
        ret |= EPOLLPRI;
        if (!strcmp(t->comm, ULMK_MAGIC))
            ulmk_watchdog_pet(&t->wdog_timer);
    }

    kref_put(&t->refcount, psi_trigger_destroy);
@@ -77,8 +77,14 @@ DEFINE_MUTEX(oom_lock);
 */

#ifdef CONFIG_HAVE_USERSPACE_LOW_MEMORY_KILLER

/* The maximum amount of time to loop in should_ulmk_retry() */
#define ULMK_TIMEOUT (20 * HZ)

static atomic64_t ulmk_wdog_expired = ATOMIC64_INIT(0);
static atomic64_t ulmk_kill_jiffies = ATOMIC64_INIT(INITIAL_JIFFIES);
static unsigned long psi_emergency_jiffies = INITIAL_JIFFIES;
/* Prevents contention on the mutex_trylock in psi_emergency_jiffies */
static DEFINE_MUTEX(ulmk_retry_lock);

static bool ulmk_kill_possible(void)

@@ -105,50 +111,74 @@ static bool ulmk_kill_possible(void)
}

/*
 * psi_emergency_jiffies represents the last ULMK emergency event.
 * Give ULMK a 2 second window to handle this event.
 * If ULMK has made some progress since then, send another.
 * Repeat as necessary.
 * If CONFIG_DEBUG_PANIC_ON_OOM is enabled, attempt to determine *why*
 * we are in this state.
 * 1) No events were sent by PSI to userspace
 * 2) PSI sent an event to userspace, but userspace was not able to
 * receive the event. Possible causes of this include waiting for a
 * mutex which is held by a process in direct relcaim. Or the userspace
 * component has crashed.
 * 3) Userspace received the event, but decided not to kill anything.
 */
bool should_ulmk_retry(void)
bool should_ulmk_retry(gfp_t gfp_mask)
{
    unsigned long now, last_kill;
    bool ret = false;
    bool ret = true;
    bool wdog_expired, trigger_active;
    struct oom_control oc = {
        .zonelist = node_zonelist(first_memory_node, gfp_mask),
        .nodemask = NULL,
        .memcg = NULL,
        .gfp_mask = gfp_mask,
        .order = 0,
        /* Also causes check_panic_on_oom not to panic */
        .only_positive_adj = true,
    };

    if (!sysctl_panic_on_oom)
        return false;

    /* Someone else is already checking. */
    if (!mutex_trylock(&ulmk_retry_lock))
        return true;

    mutex_lock(&ulmk_retry_lock);
    now = jiffies;
    last_kill = atomic64_read(&ulmk_kill_jiffies);
    if (time_before(now, psi_emergency_jiffies + 2 * HZ)) {
        ret = true;
        goto out;
    }
    wdog_expired = atomic64_read(&ulmk_wdog_expired);
    trigger_active = psi_is_trigger_active();

    if (time_after_eq(last_kill, psi_emergency_jiffies)) {
    if (time_after(last_kill, psi_emergency_jiffies)) {
        psi_emergency_jiffies = now;
        ret = true;
    } else if (time_after(now, psi_emergency_jiffies + ULMK_TIMEOUT)) {
        ret = false;
    } else if (!trigger_active) {
        psi_emergency_trigger();
        ret = true;
        goto out;
    } else if (wdog_expired) {
        mutex_lock(&oom_lock);
        ret = out_of_memory(&oc);
        mutex_unlock(&oom_lock);
    } else if (!ulmk_kill_possible()) {
        ret = false;
    }

    /*
     * We reached here means no kill have had happened since the last
     * emergency trigger for 2*HZ window. We can't derive the status
     * of the low memory killer here. So, before falling back to OOM,
     * check for any +ve adj tasks left in the system in repeat for
     * next 20*HZ. Indirectly the below logic also giving 20HZ window
     * for the first emergency trigger.
     */
    if (time_after(psi_emergency_jiffies + 20 * HZ, now) &&
            ulmk_kill_possible()) {
        ret = true;
        goto out;
    }

out:
    mutex_unlock(&ulmk_retry_lock);
    return ret;
}

void ulmk_watchdog_fn(struct timer_list *t)
{
    atomic64_set(&ulmk_wdog_expired, 1);
}

void ulmk_watchdog_pet(struct timer_list *t)
{
    del_timer_sync(t);
    atomic64_set(&ulmk_wdog_expired, 0);
}

void ulmk_update_last_kill(void)
{
    atomic64_set(&ulmk_kill_jiffies, jiffies);

@@ -1143,7 +1173,7 @@ static void check_panic_on_oom(struct oom_control *oc,
        return;
    }
    /* Do not panic for oom kills triggered by sysrq */
    if (is_sysrq_oom(oc))
    if (is_sysrq_oom(oc) || oc->only_positive_adj)
        return;
    dump_header(oc, NULL);
    panic("Out of memory: %s panic_on_oom is enabled\n",

@@ -1243,7 +1273,8 @@ bool out_of_memory(struct oom_control *oc)
         * system level, we cannot survive this and will enter
         * an endless loop in the allocator. Bail out now.
         */
        if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
        if (!is_sysrq_oom(oc) && !is_memcg_oom(oc) &&
                !oc->only_positive_adj)
            panic("System is deadlocked on memory\n");
    }
    if (oc->chosen && oc->chosen != (void *)-1UL)
@@ -4582,7 +4582,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                &compaction_retries))
        goto retry;

    if (order <= PAGE_ALLOC_COSTLY_ORDER && should_ulmk_retry())
    if (order <= PAGE_ALLOC_COSTLY_ORDER && should_ulmk_retry(gfp_mask))
        goto retry;

    /* Deal with possible cpuset update races before we start OOM killing */
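Taken together, the mm/oom_kill.c changes implement a retry ladder for the allocator slowpath. The model below restates that ladder, as far as it can be read from the diff above, as a small self-contained program so the policy is easier to follow. It is a paraphrase, not kernel code: the locking, the re-sending of the emergency trigger, and the update of psi_emergency_jiffies are omitted, and the HZ value is assumed purely for illustration.

#include <stdbool.h>
#include <stdio.h>

/* One sample of the state should_ulmk_retry() inspects, as plain values. */
struct ulmk_state {
    bool panic_on_oom;   /* sysctl_panic_on_oom */
    long now;            /* jiffies */
    long last_kill;      /* ulmk_kill_jiffies: last kill reported by ULMK */
    long last_emergency; /* psi_emergency_jiffies: last emergency trigger */
    bool trigger_active; /* psi_is_trigger_active() */
    bool wdog_expired;   /* ulmk_wdog_expired: event generated but never read */
    bool kill_possible;  /* ulmk_kill_possible(): a positive-adj victim exists */
};

#define ULMK_TIMEOUT (20 * 100)  /* 20 * HZ, assuming HZ == 100 for illustration */

/* true: keep looping in the allocator; false: fall through to the OOM path. */
static bool should_ulmk_retry_model(const struct ulmk_state *s)
{
    if (!s->panic_on_oom)
        return false;             /* policy disabled */
    if (s->last_kill > s->last_emergency)
        return true;              /* ULMK made progress since the last nudge */
    if (s->now > s->last_emergency + ULMK_TIMEOUT)
        return false;             /* stuck too long: give up on ULMK */
    if (!s->trigger_active)
        return true;              /* no event pending: re-send an emergency trigger */
    if (s->wdog_expired)
        return s->kill_possible;  /* event never consumed: kernel runs out_of_memory()
                                     restricted to positive-adj tasks */
    return s->kill_possible;      /* otherwise retry only while victims remain */
}

int main(void)
{
    /* An event was generated 5 seconds ago but ULMK never read it. */
    struct ulmk_state stuck = {
        .panic_on_oom = true,
        .now = 1500, .last_kill = 0, .last_emergency = 1000,
        .trigger_active = true, .wdog_expired = true, .kill_possible = true,
    };

    printf("keep retrying: %s\n", should_ulmk_retry_model(&stuck) ? "yes" : "no");
    return 0;
}

In the kernel, the wdog_expired case calls out_of_memory() with only_positive_adj set, which is why check_panic_on_oom() and the "System is deadlocked on memory" panic are also taught to skip these calls: the fallback kill is limited to positive oom_score_adj victims and must never escalate into a panic on its own.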