sched/numa: Rewrite the CONFIG_NUMA sched domain support
The current code groups up to 16 nodes in a level and then puts an
ALLNODES domain spanning the entire tree on top of that. This doesn't
reflect the NUMA topology, and especially for the smaller,
not-fully-connected machines out there today it can make a real
difference.

Therefore, build a proper NUMA topology based on node_distance().

Since there are no fixed NUMA layers anymore, the static SD_NODE_INIT
and SD_ALLNODES_INIT initializers aren't usable anymore; the new code
constructs something similar, scaling some values by the number of
CPUs in the domain and/or by the node_distance() ratio.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Anton Blanchard <anton@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: David Howells <dhowells@redhat.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: linux-alpha@vger.kernel.org
Cc: linux-ia64@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: linux-mips@linux-mips.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-sh@vger.kernel.org
Cc: Matt Turner <mattst88@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: sparclinux@vger.kernel.org
Cc: Tony Luck <tony.luck@intel.com>
Cc: x86@kernel.org
Cc: Dimitri Sivanich <sivanich@sgi.com>
Cc: Greg Pearson <greg.pearson@hp.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: bob.picco@oracle.com
Cc: chris.mason@oracle.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Link: http://lkml.kernel.org/n/tip-r74n3n8hhuc2ynbrnp3vt954@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
commit cb83b629ba (parent bd939f45da)
9 changed files with 185 additions and 318 deletions
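The rewrite keys everything off node_distance(), which on most platforms
comes from the ACPI SLIT: 10 means local, larger means further away. As a
minimal userspace sketch of the core idea (the unique distances, excluding
the local one, become the NUMA scheduling levels), assuming a made-up
4-node distance table that is not taken from the patch:

/*
 * Extract the NUMA levels from a SLIT-style distance table, mirroring
 * the deduplicating selection loop sched_init_numa() adds below.
 */
#include <stdio.h>

#define NR_NODES 4

static const int dist[NR_NODES][NR_NODES] = {	/* made-up example table */
	{ 10, 20, 20, 30 },
	{ 20, 10, 30, 20 },
	{ 20, 30, 10, 20 },
	{ 30, 20, 20, 10 },
};

int main(void)
{
	int curr = dist[0][0];	/* identity distance, excluded */
	int levels = 0;

	for (;;) {
		int next = curr;
		int j;

		/* smallest distance strictly greater than 'curr' in row 0 */
		for (j = 0; j < NR_NODES; j++) {
			int d = dist[0][j];
			if (d > curr && (d < next || next == curr))
				next = d;
		}
		if (next == curr)
			break;
		printf("level %d: distance %d\n", levels++, next);
		curr = next;
	}
	printf("%d NUMA level(s)\n", levels);	/* prints 2 for this table */
	return 0;
}

For this table the levels are distance 20 and distance 30; a fully-connected
machine, where all remote distances are equal, collapses to a single level.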
arch/ia64/include/asm/topology.h
@@ -70,31 +70,6 @@ void build_cpu_to_node_map(void);
 	.nr_balance_failed	= 0,			\
 }
 
-/* sched_domains SD_NODE_INIT for IA64 NUMA machines */
-#define SD_NODE_INIT (struct sched_domain) {		\
-	.parent			= NULL,			\
-	.child			= NULL,			\
-	.groups			= NULL,			\
-	.min_interval		= 8,			\
-	.max_interval		= 8*(min(num_online_cpus(), 32U)), \
-	.busy_factor		= 64,			\
-	.imbalance_pct		= 125,			\
-	.cache_nice_tries	= 2,			\
-	.busy_idx		= 3,			\
-	.idle_idx		= 2,			\
-	.newidle_idx		= 0,			\
-	.wake_idx		= 0,			\
-	.forkexec_idx		= 0,			\
-	.flags			= SD_LOAD_BALANCE	\
-				| SD_BALANCE_NEWIDLE	\
-				| SD_BALANCE_EXEC	\
-				| SD_BALANCE_FORK	\
-				| SD_SERIALIZE,		\
-	.last_balance		= jiffies,		\
-	.balance_interval	= 64,			\
-	.nr_balance_failed	= 0,			\
-}
-
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_SMP
arch/mips/include/asm/mach-ip27/topology.h
@@ -36,23 +36,6 @@ extern unsigned char __node_distances[MAX_COMPACT_NODES][MAX_COMPACT_NODES];
 
 #define node_distance(from, to)	(__node_distances[(from)][(to)])
 
-/* sched_domains SD_NODE_INIT for SGI IP27 machines */
-#define SD_NODE_INIT (struct sched_domain) {		\
-	.parent			= NULL,			\
-	.child			= NULL,			\
-	.groups			= NULL,			\
-	.min_interval		= 8,			\
-	.max_interval		= 32,			\
-	.busy_factor		= 32,			\
-	.imbalance_pct		= 125,			\
-	.cache_nice_tries	= 1,			\
-	.flags			= SD_LOAD_BALANCE |	\
-				  SD_BALANCE_EXEC,	\
-	.last_balance		= jiffies,		\
-	.balance_interval	= 1,			\
-	.nr_balance_failed	= 0,			\
-}
-
 #include <asm-generic/topology.h>
 
 #endif /* _ASM_MACH_TOPOLOGY_H */
arch/powerpc/include/asm/topology.h
@@ -18,12 +18,6 @@ struct device_node;
  */
 #define RECLAIM_DISTANCE 10
 
-/*
- * Avoid creating an extra level of balancing (SD_ALLNODES) on the largest
- * POWER7 boxes which have a maximum of 32 nodes.
- */
-#define SD_NODES_PER_DOMAIN 32
-
 #include <asm/mmzone.h>
 
 static inline int cpu_to_node(int cpu)
@@ -51,36 +45,6 @@ static inline int pcibus_to_node(struct pci_bus *bus)
 				 cpu_all_mask :			\
 				 cpumask_of_node(pcibus_to_node(bus)))
 
-/* sched_domains SD_NODE_INIT for PPC64 machines */
-#define SD_NODE_INIT (struct sched_domain) {			\
-	.min_interval		= 8,				\
-	.max_interval		= 32,				\
-	.busy_factor		= 32,				\
-	.imbalance_pct		= 125,				\
-	.cache_nice_tries	= 1,				\
-	.busy_idx		= 3,				\
-	.idle_idx		= 1,				\
-	.newidle_idx		= 0,				\
-	.wake_idx		= 0,				\
-	.forkexec_idx		= 0,				\
-								\
-	.flags			= 1*SD_LOAD_BALANCE		\
-				| 0*SD_BALANCE_NEWIDLE		\
-				| 1*SD_BALANCE_EXEC		\
-				| 1*SD_BALANCE_FORK		\
-				| 0*SD_BALANCE_WAKE		\
-				| 1*SD_WAKE_AFFINE		\
-				| 0*SD_PREFER_LOCAL		\
-				| 0*SD_SHARE_CPUPOWER		\
-				| 0*SD_POWERSAVINGS_BALANCE	\
-				| 0*SD_SHARE_PKG_RESOURCES	\
-				| 1*SD_SERIALIZE		\
-				| 0*SD_PREFER_SIBLING		\
-				,				\
-	.last_balance		= jiffies,			\
-	.balance_interval	= 1,				\
-}
-
 extern int __node_distance(int, int);
 #define node_distance(a, b) __node_distance(a, b)
 
arch/sh/include/asm/topology.h
@@ -3,31 +3,6 @@
 
 #ifdef CONFIG_NUMA
 
-/* sched_domains SD_NODE_INIT for sh machines */
-#define SD_NODE_INIT (struct sched_domain) {		\
-	.parent			= NULL,			\
-	.child			= NULL,			\
-	.groups			= NULL,			\
-	.min_interval		= 8,			\
-	.max_interval		= 32,			\
-	.busy_factor		= 32,			\
-	.imbalance_pct		= 125,			\
-	.cache_nice_tries	= 2,			\
-	.busy_idx		= 3,			\
-	.idle_idx		= 2,			\
-	.newidle_idx		= 0,			\
-	.wake_idx		= 0,			\
-	.forkexec_idx		= 0,			\
-	.flags			= SD_LOAD_BALANCE	\
-				| SD_BALANCE_FORK	\
-				| SD_BALANCE_EXEC	\
-				| SD_BALANCE_NEWIDLE	\
-				| SD_SERIALIZE,		\
-	.last_balance		= jiffies,		\
-	.balance_interval	= 1,			\
-	.nr_balance_failed	= 0,			\
-}
-
 #define cpu_to_node(cpu)	((void)(cpu),0)
 #define parent_node(node)	((void)(node),0)
arch/sparc/include/asm/topology_64.h
@@ -31,25 +31,6 @@ static inline int pcibus_to_node(struct pci_bus *pbus)
 				 cpu_all_mask :			\
 				 cpumask_of_node(pcibus_to_node(bus)))
 
-#define SD_NODE_INIT (struct sched_domain) {		\
-	.min_interval		= 8,			\
-	.max_interval		= 32,			\
-	.busy_factor		= 32,			\
-	.imbalance_pct		= 125,			\
-	.cache_nice_tries	= 2,			\
-	.busy_idx		= 3,			\
-	.idle_idx		= 2,			\
-	.newidle_idx		= 0,			\
-	.wake_idx		= 0,			\
-	.forkexec_idx		= 0,			\
-	.flags			= SD_LOAD_BALANCE	\
-				| SD_BALANCE_FORK	\
-				| SD_BALANCE_EXEC	\
-				| SD_SERIALIZE,		\
-	.last_balance		= jiffies,		\
-	.balance_interval	= 1,			\
-}
-
 #else /* CONFIG_NUMA */
 
 #include <asm-generic/topology.h>
arch/tile/include/asm/topology.h
@@ -78,32 +78,6 @@ static inline const struct cpumask *cpumask_of_node(int node)
 	.balance_interval	= 32,			\
 }
 
-/* sched_domains SD_NODE_INIT for TILE architecture */
-#define SD_NODE_INIT (struct sched_domain) {		\
-	.min_interval		= 16,			\
-	.max_interval		= 512,			\
-	.busy_factor		= 32,			\
-	.imbalance_pct		= 125,			\
-	.cache_nice_tries	= 1,			\
-	.busy_idx		= 3,			\
-	.idle_idx		= 1,			\
-	.newidle_idx		= 2,			\
-	.wake_idx		= 1,			\
-	.flags			= 1*SD_LOAD_BALANCE	\
-				| 1*SD_BALANCE_NEWIDLE	\
-				| 1*SD_BALANCE_EXEC	\
-				| 1*SD_BALANCE_FORK	\
-				| 0*SD_BALANCE_WAKE	\
-				| 0*SD_WAKE_AFFINE	\
-				| 0*SD_PREFER_LOCAL	\
-				| 0*SD_SHARE_CPUPOWER	\
-				| 0*SD_SHARE_PKG_RESOURCES \
-				| 1*SD_SERIALIZE	\
-				,			\
-	.last_balance		= jiffies,		\
-	.balance_interval	= 128,			\
-}
-
 /* By definition, we create nodes based on online memory. */
 #define node_has_online_mem(nid) 1
 
arch/x86/include/asm/topology.h
@@ -92,44 +92,6 @@ extern void setup_node_to_cpumask_map(void);
 
 #define pcibus_to_node(bus) __pcibus_to_node(bus)
 
-#ifdef CONFIG_X86_32
-# define SD_CACHE_NICE_TRIES	1
-# define SD_IDLE_IDX		1
-#else
-# define SD_CACHE_NICE_TRIES	2
-# define SD_IDLE_IDX		2
-#endif
-
-/* sched_domains SD_NODE_INIT for NUMA machines */
-#define SD_NODE_INIT (struct sched_domain) {			\
-	.min_interval		= 8,				\
-	.max_interval		= 32,				\
-	.busy_factor		= 32,				\
-	.imbalance_pct		= 125,				\
-	.cache_nice_tries	= SD_CACHE_NICE_TRIES,		\
-	.busy_idx		= 3,				\
-	.idle_idx		= SD_IDLE_IDX,			\
-	.newidle_idx		= 0,				\
-	.wake_idx		= 0,				\
-	.forkexec_idx		= 0,				\
-								\
-	.flags			= 1*SD_LOAD_BALANCE		\
-				| 1*SD_BALANCE_NEWIDLE		\
-				| 1*SD_BALANCE_EXEC		\
-				| 1*SD_BALANCE_FORK		\
-				| 0*SD_BALANCE_WAKE		\
-				| 1*SD_WAKE_AFFINE		\
-				| 0*SD_PREFER_LOCAL		\
-				| 0*SD_SHARE_CPUPOWER		\
-				| 0*SD_POWERSAVINGS_BALANCE	\
-				| 0*SD_SHARE_PKG_RESOURCES	\
-				| 1*SD_SERIALIZE		\
-				| 0*SD_PREFER_SIBLING		\
-				,				\
-	.last_balance		= jiffies,			\
-	.balance_interval	= 1,				\
-}
-
 extern int __node_distance(int, int);
 #define node_distance(a, b) __node_distance(a, b)
 
include/linux/topology.h
@@ -70,7 +70,6 @@ int arch_update_cpu_topology(void);
  * Below are the 3 major initializers used in building sched_domains:
  * SD_SIBLING_INIT, for SMT domains
  * SD_CPU_INIT, for SMP domains
- * SD_NODE_INIT, for NUMA domains
  *
  * Any architecture that cares to do any tuning to these values should do so
  * by defining their own arch-specific initializer in include/asm/topology.h.
@@ -176,48 +175,12 @@ int arch_update_cpu_topology(void);
 }
 #endif
 
-/* sched_domains SD_ALLNODES_INIT for NUMA machines */
-#define SD_ALLNODES_INIT (struct sched_domain) {	\
-	.min_interval		= 64,			\
-	.max_interval		= 64*num_online_cpus(),	\
-	.busy_factor		= 128,			\
-	.imbalance_pct		= 133,			\
-	.cache_nice_tries	= 1,			\
-	.busy_idx		= 3,			\
-	.idle_idx		= 3,			\
-	.flags			= 1*SD_LOAD_BALANCE	\
-				| 1*SD_BALANCE_NEWIDLE	\
-				| 0*SD_BALANCE_EXEC	\
-				| 0*SD_BALANCE_FORK	\
-				| 0*SD_BALANCE_WAKE	\
-				| 0*SD_WAKE_AFFINE	\
-				| 0*SD_SHARE_CPUPOWER	\
-				| 0*SD_POWERSAVINGS_BALANCE \
-				| 0*SD_SHARE_PKG_RESOURCES \
-				| 1*SD_SERIALIZE	\
-				| 0*SD_PREFER_SIBLING	\
-				,			\
-	.last_balance		= jiffies,		\
-	.balance_interval	= 64,			\
-}
-
-#ifndef SD_NODES_PER_DOMAIN
-#define SD_NODES_PER_DOMAIN 16
-#endif
-
 #ifdef CONFIG_SCHED_BOOK
 #ifndef SD_BOOK_INIT
 #error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!!
 #endif
 #endif /* CONFIG_SCHED_BOOK */
 
-#ifdef CONFIG_NUMA
-#ifndef SD_NODE_INIT
-#error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
-#endif
-#endif /* CONFIG_NUMA */
-
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DECLARE_PER_CPU(int, numa_node);
kernel/sched/core.c
@@ -5560,7 +5560,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 			break;
 		}
 
-		if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
+		if (!(sd->flags & SD_OVERLAP) &&
+		    cpumask_intersects(groupmask, sched_group_cpus(group))) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: repeated CPUs\n");
 			break;
@@ -5898,92 +5899,6 @@ static int __init isolated_cpu_setup(char *str)
 
 __setup("isolcpus=", isolated_cpu_setup);
 
-#ifdef CONFIG_NUMA
-
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain. Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int find_next_best_node(int node, nodemask_t *used_nodes)
-{
-	int i, n, val, min_val, best_node = -1;
-
-	min_val = INT_MAX;
-
-	for (i = 0; i < nr_node_ids; i++) {
-		/* Start at @node */
-		n = (node + i) % nr_node_ids;
-
-		if (!nr_cpus_node(n))
-			continue;
-
-		/* Skip already used nodes */
-		if (node_isset(n, *used_nodes))
-			continue;
-
-		/* Simple min distance search */
-		val = node_distance(node, n);
-
-		if (val < min_val) {
-			min_val = val;
-			best_node = n;
-		}
-	}
-
-	if (best_node != -1)
-		node_set(best_node, *used_nodes);
-	return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @span: resulting cpumask
- *
- * Given a node, construct a good cpumask for its sched_domain to span. It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-static void sched_domain_node_span(int node, struct cpumask *span)
-{
-	nodemask_t used_nodes;
-	int i;
-
-	cpumask_clear(span);
-	nodes_clear(used_nodes);
-
-	cpumask_or(span, span, cpumask_of_node(node));
-	node_set(node, used_nodes);
-
-	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
-		int next_node = find_next_best_node(node, &used_nodes);
-		if (next_node < 0)
-			break;
-		cpumask_or(span, span, cpumask_of_node(next_node));
-	}
-}
-
-static const struct cpumask *cpu_node_mask(int cpu)
-{
-	lockdep_assert_held(&sched_domains_mutex);
-
-	sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
-
-	return sched_domains_tmpmask;
-}
-
-static const struct cpumask *cpu_allnodes_mask(int cpu)
-{
-	return cpu_possible_mask;
-}
-#endif /* CONFIG_NUMA */
-
 static const struct cpumask *cpu_cpu_mask(int cpu)
 {
 	return cpumask_of_node(cpu_to_node(cpu));
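For contrast, a userspace rendering of the greedy span construction deleted
above, under the same made-up 4-node table as before: it keeps ORing in the
nearest unused node until SD_NODES_PER_DOMAIN is reached, so on a small
machine every node's domain quickly spans the whole box regardless of the
actual distances.

#include <limits.h>
#include <stdio.h>

#define NR_NODES		4
#define SD_NODES_PER_DOMAIN	16	/* the old generic default */

static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 30, 20 },
	{ 20, 30, 10, 20 },
	{ 30, 20, 20, 10 },
};

/* closest node not yet in *used, as in the removed find_next_best_node() */
static int find_next_best_node(int node, unsigned *used)
{
	int i, best = -1, min_val = INT_MAX;

	for (i = 0; i < NR_NODES; i++) {
		int n = (node + i) % NR_NODES;	/* start at @node */

		if (*used & (1u << n))
			continue;
		if (dist[node][n] < min_val) {
			min_val = dist[node][n];
			best = n;
		}
	}
	if (best != -1)
		*used |= 1u << best;
	return best;
}

int main(void)
{
	unsigned span = 1u << 0, used = 1u << 0;	/* build node 0's span */
	int i;

	for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
		int next = find_next_best_node(0, &used);
		if (next < 0)
			break;
		span |= 1u << next;
	}
	printf("node 0 span: 0x%x\n", span);	/* prints 0xf: all nodes */
	return 0;
}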
@@ -6020,6 +5935,7 @@ struct sched_domain_topology_level {
 	sched_domain_init_f init;
 	sched_domain_mask_f mask;
 	int		    flags;
+	int		    numa_level;
 	struct sd_data      data;
 };
 
@@ -6213,10 +6129,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
 }
 
 SD_INIT_FUNC(CPU)
-#ifdef CONFIG_NUMA
- SD_INIT_FUNC(ALLNODES)
- SD_INIT_FUNC(NODE)
-#endif
 #ifdef CONFIG_SCHED_SMT
  SD_INIT_FUNC(SIBLING)
 #endif
@@ -6338,15 +6250,191 @@ static struct sched_domain_topology_level default_topology[] = {
 	{ sd_init_BOOK, cpu_book_mask, },
 #endif
 	{ sd_init_CPU, cpu_cpu_mask, },
-#ifdef CONFIG_NUMA
-	{ sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
-	{ sd_init_ALLNODES, cpu_allnodes_mask, },
-#endif
 	{ NULL, },
 };
 
 static struct sched_domain_topology_level *sched_domain_topology = default_topology;
 
+#ifdef CONFIG_NUMA
+
+static int sched_domains_numa_levels;
+static int sched_domains_numa_scale;
+static int *sched_domains_numa_distance;
+static struct cpumask ***sched_domains_numa_masks;
+static int sched_domains_curr_level;
+
+static inline unsigned long numa_scale(unsigned long x, int level)
+{
+	return x * sched_domains_numa_distance[level] / sched_domains_numa_scale;
+}
+
+static inline int sd_local_flags(int level)
+{
+	if (sched_domains_numa_distance[level] > REMOTE_DISTANCE)
+		return 0;
+
+	return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
+}
+
+static struct sched_domain *
+sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+{
+	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
+	int level = tl->numa_level;
+	int sd_weight = cpumask_weight(
+			sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+
+	*sd = (struct sched_domain){
+		.min_interval		= sd_weight,
+		.max_interval		= 2*sd_weight,
+		.busy_factor		= 32,
+		.imbalance_pct		= 100 + numa_scale(25, level),
+		.cache_nice_tries	= 2,
+		.busy_idx		= 3,
+		.idle_idx		= 2,
+		.newidle_idx		= 0,
+		.wake_idx		= 0,
+		.forkexec_idx		= 0,
+
+		.flags			= 1*SD_LOAD_BALANCE
+					| 1*SD_BALANCE_NEWIDLE
+					| 0*SD_BALANCE_EXEC
+					| 0*SD_BALANCE_FORK
+					| 0*SD_BALANCE_WAKE
+					| 0*SD_WAKE_AFFINE
+					| 0*SD_PREFER_LOCAL
+					| 0*SD_SHARE_CPUPOWER
+					| 0*SD_POWERSAVINGS_BALANCE
+					| 0*SD_SHARE_PKG_RESOURCES
+					| 1*SD_SERIALIZE
+					| 0*SD_PREFER_SIBLING
+					| sd_local_flags(level)
+					,
+		.last_balance		= jiffies,
+		.balance_interval	= sd_weight,
+	};
+	SD_INIT_NAME(sd, NUMA);
+	sd->private = &tl->data;
+
+	/*
+	 * Ugly hack to pass state to sd_numa_mask()...
+	 */
+	sched_domains_curr_level = tl->numa_level;
+
+	return sd;
+}
+
+static const struct cpumask *sd_numa_mask(int cpu)
+{
+	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+}
+
+static void sched_init_numa(void)
+{
+	int next_distance, curr_distance = node_distance(0, 0);
+	struct sched_domain_topology_level *tl;
+	int level = 0;
+	int i, j, k;
+
+	sched_domains_numa_scale = curr_distance;
+	sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+	if (!sched_domains_numa_distance)
+		return;
+
+	/*
+	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+	 * unique distances in the node_distance() table.
+	 *
+	 * Assumes node_distance(0,j) includes all distances in
+	 * node_distance(i,j) in order to avoid cubic time.
+	 *
+	 * XXX: could be optimized to O(n log n) by using sort()
+	 */
+	next_distance = curr_distance;
+	for (i = 0; i < nr_node_ids; i++) {
+		for (j = 0; j < nr_node_ids; j++) {
+			int distance = node_distance(0, j);
+			if (distance > curr_distance &&
+					(distance < next_distance ||
+					 next_distance == curr_distance))
+				next_distance = distance;
+		}
+		if (next_distance != curr_distance) {
+			sched_domains_numa_distance[level++] = next_distance;
+			sched_domains_numa_levels = level;
+			curr_distance = next_distance;
+		} else break;
+	}
+	/*
+	 * 'level' contains the number of unique distances, excluding the
+	 * identity distance node_distance(i,i).
+	 *
+	 * The sched_domains_numa_distance[] array includes the actual distance
+	 * numbers.
+	 */
+
+	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+	if (!sched_domains_numa_masks)
+		return;
+
+	/*
+	 * Now for each level, construct a mask per node which contains all
+	 * cpus of nodes that are that many hops away from us.
+	 */
+	for (i = 0; i < level; i++) {
+		sched_domains_numa_masks[i] =
+			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+		if (!sched_domains_numa_masks[i])
+			return;
+
+		for (j = 0; j < nr_node_ids; j++) {
+			struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j);
+			if (!mask)
+				return;
+
+			sched_domains_numa_masks[i][j] = mask;
+
+			for (k = 0; k < nr_node_ids; k++) {
+				if (node_distance(cpu_to_node(j), k) >
+						sched_domains_numa_distance[i])
+					continue;
+
+				cpumask_or(mask, mask, cpumask_of_node(k));
+			}
+		}
+	}
+
+	tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+	if (!tl)
+		return;
+
+	/*
+	 * Copy the default topology bits..
+	 */
+	for (i = 0; default_topology[i].init; i++)
+		tl[i] = default_topology[i];
+
+	/*
+	 * .. and append 'j' levels of NUMA goodness.
+	 */
+	for (j = 0; j < level; i++, j++) {
+		tl[i] = (struct sched_domain_topology_level){
+			.init = sd_numa_init,
+			.mask = sd_numa_mask,
+			.flags = SDTL_OVERLAP,
+			.numa_level = j,
+		};
+	}
+
+	sched_domain_topology = tl;
+}
+#else
+static inline void sched_init_numa(void)
+{
+}
+#endif /* CONFIG_NUMA */
+
 static int __sdt_alloc(const struct cpumask *cpu_map)
 {
 	struct sched_domain_topology_level *tl;
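A userspace sketch of the mask construction that the new sched_init_numa()
above performs: for each level l and node n, collect every node whose
distance from n is at most sched_domains_numa_distance[l]. Plain node
bitmasks stand in for the kernel's per-node cpumasks, and the table is the
same made-up 4-node example as before.

#include <stdio.h>

#define NR_NODES 4

static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 30, 20 },
	{ 20, 30, 10, 20 },
	{ 30, 20, 20, 10 },
};
static const int numa_distance[] = { 20, 30 };	/* levels found earlier */

int main(void)
{
	int l, n, k;

	for (l = 0; l < 2; l++) {
		for (n = 0; n < NR_NODES; n++) {
			unsigned mask = 0;

			for (k = 0; k < NR_NODES; k++)
				if (dist[n][k] <= numa_distance[l])
					mask |= 1u << k;
			printf("level %d node %d: 0x%x\n", l, n, mask);
		}
	}
	return 0;
}

Level 0 yields 0x7, 0xb, 0xd and 0xe for nodes 0..3: each node sees a
different, overlapping set of neighbours, which is exactly why the new
levels are registered with SDTL_OVERLAP rather than as disjoint groups.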
@@ -6840,6 +6928,8 @@ void __init sched_init_smp(void)
 	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
 	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 
+	sched_init_numa();
+
 	get_online_cpus();
 	mutex_lock(&sched_domains_mutex);
 	init_sched_domains(cpu_active_mask);
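Finally, a small arithmetic sketch of how numa_scale() stretches the balance
parameters with the distance ratio; the numbers assume the same toy table
(local distance 10, level distances 20 and 30) and are illustrative only.

#include <stdio.h>

static const int numa_distance[] = { 20, 30 };
static const int numa_scale_base = 10;	/* node_distance(0, 0) */

/* same formula as the kernel's numa_scale() helper added above */
static unsigned long numa_scale(unsigned long x, int level)
{
	return x * numa_distance[level] / numa_scale_base;
}

int main(void)
{
	int l;

	for (l = 0; l < 2; l++)	/* prints 150, then 175 */
		printf("level %d: imbalance_pct = %lu\n",
		       l, 100 + numa_scale(25, l));
	return 0;
}

So the further a domain reaches, the larger the imbalance it tolerates
before moving tasks, where the old SD_NODE_INIT pinned imbalance_pct at a
flat 125 for every NUMA level.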