From 0271f91003d3703675be13b8865618359a6caa1f Mon Sep 17 00:00:00 2001 From: Haicheng Li Date: Thu, 4 Feb 2010 19:06:33 +0800 Subject: [PATCH 1/4] x86, acpi: Map hotadded cpu to correct node. When hotadd new cpu to system, if its affinitive node is online, should map the cpu to its own node. Otherwise, let kernel select one online node for the new cpu later. Signed-off-by: Haicheng Li LKML-Reference: <4B6AAA39.6000300@linux.intel.com> Tested-by: Thomas Renninger Signed-off-by: H. Peter Anvin --- arch/x86/kernel/acpi/boot.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 036d28adf59d..7db15e161aa0 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -49,6 +49,7 @@ EXPORT_SYMBOL(acpi_disabled); #ifdef CONFIG_X86_64 # include +# include #endif /* X86 */ #define BAD_MADT_ENTRY(entry, end) ( \ @@ -482,6 +483,25 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) */ #ifdef CONFIG_ACPI_HOTPLUG_CPU +static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +{ +#ifdef CONFIG_ACPI_NUMA + int nid; + + nid = acpi_get_node(handle); + if (nid == -1 || !node_online(nid)) + return; +#ifdef CONFIG_X86_64 + apicid_to_node[physid] = nid; + numa_set_node(cpu, nid); +#else /* CONFIG_X86_32 */ + apicid_2_node[physid] = nid; + cpu_to_node_map[cpu] = nid; +#endif + +#endif +} + static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) { struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; @@ -540,6 +560,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) } cpu = cpumask_first(new_map); + acpi_map_cpu2node(handle, cpu, physid); *pcpu = cpu; retval = 0; From 68fd111e02b979876359c7b471a8bcbca0628b75 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 15 Feb 2010 13:43:25 -0800 Subject: [PATCH 2/4] x86, numa: Fix numa emulation calculation of big nodes numa=fake=N uses split_nodes_interleave() to partition the system into N fake nodes. Each node size must have be a multiple of FAKE_NODE_MIN_SIZE, otherwise it is possible to get strange alignments. Because of this, the remaining memory from each node when rounded to FAKE_NODE_MIN_SIZE is consolidated into a number of "big nodes" that are bigger than the rest. The calculation of the number of big nodes is incorrect since it is using a logical AND operator when it should be multiplying the rounded-off portion of each node with N. Signed-off-by: David Rientjes LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/mm/numa_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 83bbc70d11bb..2ecbe0ca0dfc 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -427,7 +427,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, * Calculate the number of big nodes that can be allocated as a result * of consolidating the remainder. */ - big = ((size & ~FAKE_NODE_MIN_HASH_MASK) & nr_nodes) / + big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) / FAKE_NODE_MIN_SIZE; size &= FAKE_NODE_MIN_HASH_MASK; From 8df5bb34defd685fe86f60746bbf3d47d1c6f033 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 15 Feb 2010 13:43:30 -0800 Subject: [PATCH 3/4] x86, numa: Add fixed node size option for numa emulation numa=fake=N specifies the number of fake nodes, N, to partition the system into and then allocates them by interleaving over physical nodes. This requires knowledge of the system capacity when attempting to allocate nodes of a certain size: either very large nodes to benchmark scalability of code that operates on individual nodes, or very small nodes to find bugs in the VM. This patch introduces numa=fake=[MG] so it is possible to specify the size of each node to allocate. When used, nodes of the size specified will be allocated and interleaved over the set of physical nodes. FAKE_NODE_MIN_SIZE was also moved to the more-appropriate include/asm/numa_64.h. Signed-off-by: David Rientjes LKML-Reference: Signed-off-by: H. Peter Anvin --- Documentation/x86/x86_64/boot-options.txt | 4 + arch/x86/include/asm/mmzone_64.h | 6 -- arch/x86/include/asm/numa_64.h | 5 + arch/x86/mm/numa_64.c | 117 ++++++++++++++++++++-- 4 files changed, 118 insertions(+), 14 deletions(-) diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 29a6ff8bc7d3..01150c64aa73 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -166,6 +166,10 @@ NUMA numa=noacpi Don't parse the SRAT table for NUMA setup + numa=fake=[MG] + If given as a memory unit, fills all system RAM with nodes of + size interleaved over physical nodes. + numa=fake=CMDLINE If a number, fakes CMDLINE nodes and ignores NUMA setup of the actual machine. Otherwise, system memory is configured diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h index a29f48c2a322..288b96f815a6 100644 --- a/arch/x86/include/asm/mmzone_64.h +++ b/arch/x86/include/asm/mmzone_64.h @@ -39,11 +39,5 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ NODE_DATA(nid)->node_spanned_pages) - -#ifdef CONFIG_NUMA_EMU -#define FAKE_NODE_MIN_SIZE (64 * 1024 * 1024) -#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) -#endif - #endif #endif /* _ASM_X86_MMZONE_64_H */ diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index c4ae822e415f..823e070e7c26 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -36,6 +36,11 @@ extern void __cpuinit numa_set_node(int cpu, int node); extern void __cpuinit numa_clear_node(int cpu); extern void __cpuinit numa_add_cpu(int cpu); extern void __cpuinit numa_remove_cpu(int cpu); + +#ifdef CONFIG_NUMA_EMU +#define FAKE_NODE_MIN_SIZE ((u64)64 << 20) +#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) +#endif /* CONFIG_NUMA_EMU */ #else static inline void init_cpu_to_node(void) { } static inline void numa_set_node(int cpu, int node) { } diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 2ecbe0ca0dfc..c47c78ba3aca 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -501,6 +501,102 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, return ret; } +/* + * Returns the end address of a node so that there is at least `size' amount of + * non-reserved memory or `max_addr' is reached. + */ +static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) +{ + u64 end = start + size; + + while (end - start - e820_hole_size(start, end) < size) { + end += FAKE_NODE_MIN_SIZE; + if (end > max_addr) { + end = max_addr; + break; + } + } + return end; +} + +/* + * Sets up fake nodes of `size' interleaved over physical nodes ranging from + * `addr' to `max_addr'. The return value is the number of nodes allocated. + */ +static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) +{ + nodemask_t physnode_mask = NODE_MASK_NONE; + u64 min_size; + int ret = 0; + int i; + + if (!size) + return -1; + /* + * The limit on emulated nodes is MAX_NUMNODES, so the size per node is + * increased accordingly if the requested size is too small. This + * creates a uniform distribution of node sizes across the entire + * machine (but not necessarily over physical nodes). + */ + min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) / + MAX_NUMNODES; + min_size = max(min_size, FAKE_NODE_MIN_SIZE); + if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) + min_size = (min_size + FAKE_NODE_MIN_SIZE) & + FAKE_NODE_MIN_HASH_MASK; + if (size < min_size) { + pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", + size >> 20, min_size >> 20); + size = min_size; + } + size &= FAKE_NODE_MIN_HASH_MASK; + + for (i = 0; i < MAX_NUMNODES; i++) + if (physnodes[i].start != physnodes[i].end) + node_set(i, physnode_mask); + /* + * Fill physical nodes with fake nodes of size until there is no memory + * left on any of them. + */ + while (nodes_weight(physnode_mask)) { + for_each_node_mask(i, physnode_mask) { + u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; + u64 end; + + end = find_end_of_node(physnodes[i].start, + physnodes[i].end, size); + /* + * If there won't be at least FAKE_NODE_MIN_SIZE of + * non-reserved memory in ZONE_DMA32 for the next node, + * this one must extend to the boundary. + */ + if (end < dma32_end && dma32_end - end - + e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + end = dma32_end; + + /* + * If there won't be enough non-reserved memory for the + * next node, this one must extend to the end of the + * physical node. + */ + if (physnodes[i].end - end - + e820_hole_size(end, physnodes[i].end) < size) + end = physnodes[i].end; + + /* + * Setup the fake node that will be allocated as bootmem + * later. If setup_node_range() returns non-zero, there + * is no more memory available on this physical node. + */ + if (setup_node_range(ret++, &physnodes[i].start, + end - physnodes[i].start, + physnodes[i].end) < 0) + node_clear(i, physnode_mask); + } + } + return ret; +} + /* * Splits num_nodes nodes up equally starting at node_start. The return value * is the number of nodes split up and addr is adjusted to be at the end of the @@ -546,14 +642,7 @@ static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start, if (i == num_nodes + node_start - 1) end = max_addr; else - while (end - *addr - e820_hole_size(*addr, end) < - size) { - end += FAKE_NODE_MIN_SIZE; - if (end > max_addr) { - end = max_addr; - break; - } - } + end = find_end_of_node(*addr, max_addr, size); if (setup_node_range(i, addr, end - *addr, max_addr) < 0) break; } @@ -588,6 +677,18 @@ static int __init numa_emulation(unsigned long start_pfn, int num_phys_nodes; num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); + /* + * If the numa=fake command-line contains a 'M' or 'G', it represents + * the fixed node size. + */ + if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) { + size = memparse(cmdline, &cmdline); + num_nodes = split_nodes_size_interleave(addr, max_addr, size); + if (num_nodes < 0) + return num_nodes; + goto out; + } + /* * If the numa=fake command-line is just a single number N, split the * system RAM into N fake nodes. From ca2107c9d6cf44fb915402d6f12b9d9ff3925cd7 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 15 Feb 2010 13:43:33 -0800 Subject: [PATCH 4/4] x86, numa: Remove configurable node size support for numa emulation Now that numa=fake=[MG] is implemented, it is possible to remove configurable node size support. The command-line parsing was already broken (numa=fake=*128, for example, would not work) and since fake nodes are now interleaved over physical nodes, this support is no longer required. Signed-off-by: David Rientjes LKML-Reference: Signed-off-by: H. Peter Anvin --- Documentation/x86/x86_64/boot-options.txt | 16 +-- arch/x86/mm/numa_64.c | 162 ++-------------------- 2 files changed, 17 insertions(+), 161 deletions(-) diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 01150c64aa73..7fbbaf85f5b7 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -170,19 +170,9 @@ NUMA If given as a memory unit, fills all system RAM with nodes of size interleaved over physical nodes. - numa=fake=CMDLINE - If a number, fakes CMDLINE nodes and ignores NUMA setup of the - actual machine. Otherwise, system memory is configured - depending on the sizes and coefficients listed. For example: - numa=fake=2*512,1024,4*256,*128 - gives two 512M nodes, a 1024M node, four 256M nodes, and the - rest split into 128M chunks. If the last character of CMDLINE - is a *, the remaining memory is divided up equally among its - coefficient: - numa=fake=2*512,2* - gives two 512M nodes and the rest split into two nodes. - Otherwise, the remaining system RAM is allocated to an - additional node. + numa=fake= + If given as an integer, fills all system RAM with N fake nodes + interleaved over physical nodes. ACPI diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index c47c78ba3aca..3307ea8bd43a 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -597,73 +597,6 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) return ret; } -/* - * Splits num_nodes nodes up equally starting at node_start. The return value - * is the number of nodes split up and addr is adjusted to be at the end of the - * last node allocated. - */ -static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start, - int num_nodes) -{ - unsigned int big; - u64 size; - int i; - - if (num_nodes <= 0) - return -1; - if (num_nodes > MAX_NUMNODES) - num_nodes = MAX_NUMNODES; - size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) / - num_nodes; - /* - * Calculate the number of big nodes that can be allocated as a result - * of consolidating the leftovers. - */ - big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) / - FAKE_NODE_MIN_SIZE; - - /* Round down to nearest FAKE_NODE_MIN_SIZE. */ - size &= FAKE_NODE_MIN_HASH_MASK; - if (!size) { - printk(KERN_ERR "Not enough memory for each node. " - "NUMA emulation disabled.\n"); - return -1; - } - - for (i = node_start; i < num_nodes + node_start; i++) { - u64 end = *addr + size; - - if (i < big) - end += FAKE_NODE_MIN_SIZE; - /* - * The final node can have the remaining system RAM. Other - * nodes receive roughly the same amount of available pages. - */ - if (i == num_nodes + node_start - 1) - end = max_addr; - else - end = find_end_of_node(*addr, max_addr, size); - if (setup_node_range(i, addr, end - *addr, max_addr) < 0) - break; - } - return i - node_start + 1; -} - -/* - * Splits the remaining system RAM into chunks of size. The remaining memory is - * always assigned to a final node and can be asymmetric. Returns the number of - * nodes split. - */ -static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start, - u64 size) -{ - int i = node_start; - size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; - while (!setup_node_range(i++, addr, size, max_addr)) - ; - return i - node_start; -} - /* * Sets up the system RAM area from start_pfn to last_pfn according to the * numa=fake command-line option. @@ -671,99 +604,32 @@ static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start, static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn, int acpi, int k8) { - u64 size, addr = start_pfn << PAGE_SHIFT; + u64 addr = start_pfn << PAGE_SHIFT; u64 max_addr = last_pfn << PAGE_SHIFT; - int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; int num_phys_nodes; + int num_nodes; + int i; num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); /* * If the numa=fake command-line contains a 'M' or 'G', it represents - * the fixed node size. + * the fixed node size. Otherwise, if it is just a single number N, + * split the system RAM into N fake nodes. */ if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) { + u64 size; + size = memparse(cmdline, &cmdline); num_nodes = split_nodes_size_interleave(addr, max_addr, size); - if (num_nodes < 0) - return num_nodes; - goto out; + } else { + unsigned long n; + + n = simple_strtoul(cmdline, NULL, 0); + num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n); } - /* - * If the numa=fake command-line is just a single number N, split the - * system RAM into N fake nodes. - */ - if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { - long n = simple_strtol(cmdline, NULL, 0); - - num_nodes = split_nodes_interleave(addr, max_addr, - num_phys_nodes, n); - if (num_nodes < 0) - return num_nodes; - goto out; - } - - /* Parse the command line. */ - for (coeff_flag = 0; ; cmdline++) { - if (*cmdline && isdigit(*cmdline)) { - num = num * 10 + *cmdline - '0'; - continue; - } - if (*cmdline == '*') { - if (num > 0) - coeff = num; - coeff_flag = 1; - } - if (!*cmdline || *cmdline == ',') { - if (!coeff_flag) - coeff = 1; - /* - * Round down to the nearest FAKE_NODE_MIN_SIZE. - * Command-line coefficients are in megabytes. - */ - size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; - if (size) - for (i = 0; i < coeff; i++, num_nodes++) - if (setup_node_range(num_nodes, &addr, - size, max_addr) < 0) - goto done; - if (!*cmdline) - break; - coeff_flag = 0; - coeff = -1; - } - num = 0; - } -done: - if (!num_nodes) - return -1; - /* Fill remainder of system RAM, if appropriate. */ - if (addr < max_addr) { - if (coeff_flag && coeff < 0) { - /* Split remaining nodes into num-sized chunks */ - num_nodes += split_nodes_by_size(&addr, max_addr, - num_nodes, num); - goto out; - } - switch (*(cmdline - 1)) { - case '*': - /* Split remaining nodes into coeff chunks */ - if (coeff <= 0) - break; - num_nodes += split_nodes_equally(&addr, max_addr, - num_nodes, coeff); - break; - case ',': - /* Do not allocate remaining system RAM */ - break; - default: - /* Give one final node */ - setup_node_range(num_nodes, &addr, max_addr - addr, - max_addr); - num_nodes++; - } - } -out: + if (num_nodes < 0) + return num_nodes; memnode_shift = compute_hash_shift(nodes, num_nodes, NULL); if (memnode_shift < 0) { memnode_shift = 0;