2005-04-16 16:20:36 -06:00
|
|
|
#ifndef _LINUX_CPUSET_H
|
|
|
|
#define _LINUX_CPUSET_H
|
|
|
|
/*
|
|
|
|
* cpuset interface
|
|
|
|
*
|
|
|
|
* Copyright (C) 2003 BULL SA
|
|
|
|
* Copyright (C) 2004 Silicon Graphics, Inc.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/sched.h>
|
|
|
|
#include <linux/cpumask.h>
|
|
|
|
#include <linux/nodemask.h>
|
|
|
|
|
|
|
|
#ifdef CONFIG_CPUSETS
|
|
|
|
|
2006-01-08 02:01:57 -07:00
|
|
|
extern int number_of_cpusets; /* How many cpusets are defined in system? */
|
|
|
|
|
2006-01-08 02:02:01 -07:00
|
|
|
extern int cpuset_init_early(void);
|
2005-04-16 16:20:36 -06:00
|
|
|
extern int cpuset_init(void);
|
|
|
|
extern void cpuset_init_smp(void);
|
|
|
|
extern void cpuset_fork(struct task_struct *p);
|
|
|
|
extern void cpuset_exit(struct task_struct *p);
|
2006-01-08 02:01:55 -07:00
|
|
|
extern cpumask_t cpuset_cpus_allowed(struct task_struct *p);
|
|
|
|
extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
|
2005-04-16 16:20:36 -06:00
|
|
|
void cpuset_init_current_mems_allowed(void);
|
2006-01-08 02:01:54 -07:00
|
|
|
void cpuset_update_task_memory_state(void);
|
2006-01-08 02:01:47 -07:00
|
|
|
/*
 * Evaluates nodes_subset() on the given node mask and the current task's
 * mems_allowed mask, i.e. nodes_subset((nodes), current->mems_allowed).
 * The argument is parenthesized and expanded exactly once.
 */
#define cpuset_nodes_subset_current_mems_allowed(nodes) \
		nodes_subset((nodes), current->mems_allowed)
|
2005-04-16 16:20:36 -06:00
|
|
|
int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
|
2006-01-08 02:01:57 -07:00
|
|
|
|
|
|
|
extern int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask);
|
|
|
|
/*
 * Fast-path wrapper around __cpuset_zone_allowed().
 *
 * Returns 1 without calling __cpuset_zone_allowed() when number_of_cpusets
 * is at most 1; otherwise returns 1 if __cpuset_zone_allowed(z, gfp_mask)
 * is nonzero and 0 if it is zero.
 */
static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
	return number_of_cpusets <= 1 || __cpuset_zone_allowed(z, gfp_mask);
}
|
|
|
|
|
2005-09-06 16:18:13 -06:00
|
|
|
extern int cpuset_excl_nodes_overlap(const struct task_struct *p);
|
[PATCH] cpuset: memory pressure meter
Provide a simple per-cpuset metric of memory pressure, tracking the -rate-
that the tasks in a cpuset call try_to_free_pages(), the synchronous
(direct) memory reclaim code.
This enables batch managers monitoring jobs running in dedicated cpusets to
efficiently detect what level of memory pressure that job is causing.
This is useful both on tightly managed systems running a wide mix of
submitted jobs, which may choose to terminate or reprioritize jobs that are
trying to use more memory than allowed on the nodes assigned them, and with
tightly coupled, long running, massively parallel scientific computing jobs
that will dramatically fail to meet required performance goals if they
start to use more memory than allowed to them.
This patch just provides a very economical way for the batch manager to
monitor a cpuset for signs of memory pressure. It's up to the batch
manager or other user code to decide what to do about it and take action.
==> Unless this feature is enabled by writing "1" to the special file
/dev/cpuset/memory_pressure_enabled, the hook in the rebalance
code of __alloc_pages() for this metric reduces to simply noticing
that the cpuset_memory_pressure_enabled flag is zero. So only
systems that enable this feature will compute the metric.
Why a per-cpuset, running average:
Because this meter is per-cpuset, rather than per-task or mm, the
system load imposed by a batch scheduler monitoring this metric is
sharply reduced on large systems, because a scan of the tasklist can be
avoided on each set of queries.
Because this meter is a running average, instead of an accumulating
counter, a batch scheduler can detect memory pressure with a single
read, instead of having to read and accumulate results for a period of
time.
Because this meter is per-cpuset rather than per-task or mm, the
batch scheduler can obtain the key information, memory pressure in a
cpuset, with a single read, rather than having to query and accumulate
results over all the (dynamically changing) set of tasks in the cpuset.
A per-cpuset simple digital filter (requires a spinlock and 3 words of data
per-cpuset) is kept, and updated by any task attached to that cpuset, if it
enters the synchronous (direct) page reclaim code.
A per-cpuset file provides an integer number representing the recent
(half-life of 10 seconds) rate of direct page reclaims caused by the tasks
in the cpuset, in units of reclaims attempted per second, times 1000.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 02:01:49 -07:00
|
|
|
|
|
|
|
/*
 * Calls __cpuset_memory_pressure_bump() only when the
 * cpuset_memory_pressure_enabled flag is nonzero; otherwise the only work
 * done is the test of that flag.  Wrapped in do { } while (0) so the macro
 * behaves as a single statement.
 */
#define cpuset_memory_pressure_bump() 				\
	do {							\
		if (cpuset_memory_pressure_enabled)		\
			__cpuset_memory_pressure_bump();	\
	} while (0)
|
|
|
|
extern int cpuset_memory_pressure_enabled;
|
|
|
|
extern void __cpuset_memory_pressure_bump(void);
|
|
|
|
|
2005-04-16 16:20:36 -06:00
|
|
|
extern struct file_operations proc_cpuset_operations;
|
|
|
|
extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer);
|
|
|
|
|
2006-01-14 14:21:06 -07:00
|
|
|
extern void cpuset_lock(void);
|
|
|
|
extern void cpuset_unlock(void);
|
|
|
|
|
2005-04-16 16:20:36 -06:00
|
|
|
#else /* !CONFIG_CPUSETS */
|
|
|
|
|
2006-01-08 02:02:01 -07:00
|
|
|
/* !CONFIG_CPUSETS stub: does nothing and always returns 0. */
static inline int cpuset_init_early(void)
{
	return 0;
}
|
2005-04-16 16:20:36 -06:00
|
|
|
/* !CONFIG_CPUSETS stub: does nothing and always returns 0. */
static inline int cpuset_init(void)
{
	return 0;
}
|
|
|
|
/* !CONFIG_CPUSETS stub: intentionally empty. */
static inline void cpuset_init_smp(void)
{
}
|
|
|
|
/* !CONFIG_CPUSETS stub: intentionally empty; @p is not used. */
static inline void cpuset_fork(struct task_struct *p)
{
}
|
|
|
|
/* !CONFIG_CPUSETS stub: intentionally empty; @p is not used. */
static inline void cpuset_exit(struct task_struct *p)
{
}
|
|
|
|
|
|
|
|
static inline cpumask_t cpuset_cpus_allowed(struct task_struct *p)
|
|
|
|
{
|
|
|
|
return cpu_possible_map;
|
|
|
|
}
|
|
|
|
|
2006-01-08 02:01:55 -07:00
|
|
|
static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
|
|
|
|
{
|
|
|
|
return node_possible_map;
|
|
|
|
}
|
|
|
|
|
2005-04-16 16:20:36 -06:00
|
|
|
/* !CONFIG_CPUSETS stub: intentionally empty. */
static inline void cpuset_init_current_mems_allowed(void)
{
}
|
2006-01-08 02:01:54 -07:00
|
|
|
/* !CONFIG_CPUSETS stub: intentionally empty. */
static inline void cpuset_update_task_memory_state(void)
{
}
|
2006-01-08 02:01:47 -07:00
|
|
|
/*
 * !CONFIG_CPUSETS variant: expands to the constant (1).  The @nodes
 * argument is discarded by the preprocessor and never evaluated.
 */
#define cpuset_nodes_subset_current_mems_allowed(nodes) (1)
|
2005-04-16 16:20:36 -06:00
|
|
|
|
|
|
|
/* !CONFIG_CPUSETS stub: ignores @zl and always returns 1. */
static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
{
	return 1;
}
|
|
|
|
|
2005-10-07 00:46:04 -06:00
|
|
|
/* !CONFIG_CPUSETS stub: ignores @z and @gfp_mask and always returns 1. */
static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
	return 1;
}
|
|
|
|
|
2005-09-06 16:18:13 -06:00
|
|
|
/* !CONFIG_CPUSETS stub: ignores @p and always returns 1. */
static inline int cpuset_excl_nodes_overlap(const struct task_struct *p)
{
	return 1;
}
|
|
|
|
|
[PATCH] cpuset: memory pressure meter
Provide a simple per-cpuset metric of memory pressure, tracking the -rate-
that the tasks in a cpuset call try_to_free_pages(), the synchronous
(direct) memory reclaim code.
This enables batch managers monitoring jobs running in dedicated cpusets to
efficiently detect what level of memory pressure that job is causing.
This is useful both on tightly managed systems running a wide mix of
submitted jobs, which may choose to terminate or reprioritize jobs that are
trying to use more memory than allowed on the nodes assigned them, and with
tightly coupled, long running, massively parallel scientific computing jobs
that will dramatically fail to meet required performance goals if they
start to use more memory than allowed to them.
This patch just provides a very economical way for the batch manager to
monitor a cpuset for signs of memory pressure. It's up to the batch
manager or other user code to decide what to do about it and take action.
==> Unless this feature is enabled by writing "1" to the special file
/dev/cpuset/memory_pressure_enabled, the hook in the rebalance
code of __alloc_pages() for this metric reduces to simply noticing
that the cpuset_memory_pressure_enabled flag is zero. So only
systems that enable this feature will compute the metric.
Why a per-cpuset, running average:
Because this meter is per-cpuset, rather than per-task or mm, the
system load imposed by a batch scheduler monitoring this metric is
sharply reduced on large systems, because a scan of the tasklist can be
avoided on each set of queries.
Because this meter is a running average, instead of an accumulating
counter, a batch scheduler can detect memory pressure with a single
read, instead of having to read and accumulate results for a period of
time.
Because this meter is per-cpuset rather than per-task or mm, the
batch scheduler can obtain the key information, memory pressure in a
cpuset, with a single read, rather than having to query and accumulate
results over all the (dynamically changing) set of tasks in the cpuset.
A per-cpuset simple digital filter (requires a spinlock and 3 words of data
per-cpuset) is kept, and updated by any task attached to that cpuset, if it
enters the synchronous (direct) page reclaim code.
A per-cpuset file provides an integer number representing the recent
(half-life of 10 seconds) rate of direct page reclaims caused by the tasks
in the cpuset, in units of reclaims attempted per second, times 1000.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-01-08 02:01:49 -07:00
|
|
|
/* !CONFIG_CPUSETS stub: intentionally empty. */
static inline void cpuset_memory_pressure_bump(void)
{
}
|
|
|
|
|
2005-04-16 16:20:36 -06:00
|
|
|
/*
 * !CONFIG_CPUSETS stub: writes nothing and returns @buffer unchanged;
 * @task is not used.
 */
static inline char *cpuset_task_status_allowed(struct task_struct *task,
							char *buffer)
{
	return buffer;
}
|
|
|
|
|
2006-01-14 14:21:06 -07:00
|
|
|
/* !CONFIG_CPUSETS stub: intentionally empty. */
static inline void cpuset_lock(void)
{
}
|
|
|
|
/* !CONFIG_CPUSETS stub: intentionally empty. */
static inline void cpuset_unlock(void)
{
}
|
|
|
|
|
2005-04-16 16:20:36 -06:00
|
|
|
#endif /* !CONFIG_CPUSETS */
|
|
|
|
|
|
|
|
#endif /* _LINUX_CPUSET_H */
|