052936080c
phys_to_nid() maps a physical address to a NUMA node id. This is
implemented by building a perfect hash in compute_hash_shift() during
initialization.
However, with the SPARSE memory model, the nid is encoded in page flags.
The perfect hash implementation was for the DISCONTIG memory model, which
was removed years ago by b263295dbf
(x86: 64-bit, make sparsemem vmemmap the only memory model).
So, the perfect hash ends up being used only during initialization
when the core SPARSE code already provides perfectly acceptable
generic early_pfn_to_nid() implementation.
Drop phys_to_nid() and use the generic early_pfn_to_nid() instead.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Christoph Lameter <cl@linux-foundation.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
546 lines
14 KiB
C
546 lines
14 KiB
C
/*
|
|
* Generic VM initialization for x86-64 NUMA setups.
|
|
* Copyright 2002,2003 Andi Kleen, SuSE Labs.
|
|
*/
|
|
#include <linux/kernel.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/string.h>
|
|
#include <linux/init.h>
|
|
#include <linux/bootmem.h>
|
|
#include <linux/memblock.h>
|
|
#include <linux/mmzone.h>
|
|
#include <linux/ctype.h>
|
|
#include <linux/module.h>
|
|
#include <linux/nodemask.h>
|
|
#include <linux/sched.h>
|
|
#include <linux/acpi.h>
|
|
|
|
#include <asm/e820.h>
|
|
#include <asm/proto.h>
|
|
#include <asm/dma.h>
|
|
#include <asm/acpi.h>
|
|
#include <asm/amd_nb.h>
|
|
|
|
#include "numa_internal.h"
|
|
|
|
/*
 * Per-node pglist_data pointers, indexed by node id.  Entries are filled
 * in by setup_node_bootmem().
 */
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
|
|
EXPORT_SYMBOL(node_data);
|
|
|
|
/*
 * Node mask cleared at the start of every numa_init() attempt and then
 * populated by the init method (dummy_numa_init() sets node 0).  It seeds
 * node_possible_map and the sizing of the distance table.
 */
nodemask_t numa_nodes_parsed __initdata;
|
|
|
|
/*
 * Default memblk table that numa_add_memblk() appends to; zeroed by
 * numa_init() before each init attempt.
 */
static struct numa_meminfo numa_meminfo __initdata;
|
|
/*
 * Distance table: numa_distance_cnt * numa_distance_cnt u8 entries,
 * addressed as numa_distance[from * numa_distance_cnt + to].
 * numa_distance is NULL while no table exists and (void *)1 after a
 * failed allocation; numa_distance_cnt is 0 in both of those cases.
 */
static int numa_distance_cnt;
|
|
static u8 *numa_distance;
|
|
|
|
/*
 * Find @size bytes (aligned to @align) of early memory for node @nodeid.
 *
 * The node-local search window [@start, @end) is first pushed upwards:
 * @start is raised to the MAX_DMA_PFN boundary, and further to the
 * MAX_DMA32_PFN boundary when the window straddles it.  If the node-local
 * search fails, the whole range from the MAX_DMA_PFN boundary up to
 * max_pfn_mapped is searched instead.
 *
 * Returns the kernel virtual address of the area found, or NULL (after
 * logging an error) when both searches fail.  Nothing is reserved here.
 */
static void * __init early_node_mem(int nodeid, unsigned long start,
				    unsigned long end, unsigned long size,
				    unsigned long align)
{
	const unsigned long dma_limit = MAX_DMA_PFN << PAGE_SHIFT;
	const unsigned long dma32_limit = MAX_DMA32_PFN << PAGE_SHIFT;
	unsigned long phys;

	/* Place it as high as possible; other things go next to NODE_DATA. */
	if (start < dma_limit)
		start = dma_limit;
	if (start < dma32_limit && dma32_limit < end)
		start = dma32_limit;

	phys = memblock_x86_find_in_range_node(nodeid, start, end, size, align);
	if (phys == MEMBLOCK_ERROR) {
		/* Node-local search failed: widen the search scope. */
		end = max_pfn_mapped << PAGE_SHIFT;
		start = dma_limit;
		phys = memblock_find_in_range(start, end, size, align);
	}

	if (phys == MEMBLOCK_ERROR) {
		printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
		       size, nodeid);
		return NULL;
	}

	return __va(phys);
}
|
|
|
|
/*
 * Append the range [@start, @end) for node @nid to @mi.
 *
 * Zero-length ranges are silently ignored.  Inverted ranges and node ids
 * outside [0, MAX_NUMNODES) are warned about and ignored.  Both cases
 * return 0.
 *
 * Returns 0 on success or when the range was ignored, -EINVAL when @mi
 * already holds NR_NODE_MEMBLKS entries.
 */
static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
				     struct numa_meminfo *mi)
{
	struct numa_memblk *slot;

	/* Nothing to record for an empty range. */
	if (start == end)
		return 0;

	/* Complain about, but otherwise ignore, malformed requests. */
	if (end < start || nid < 0 || nid >= MAX_NUMNODES) {
		pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
			   nid, start, end);
		return 0;
	}

	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("NUMA: too many memblk ranges\n");
		return -EINVAL;
	}

	slot = &mi->blk[mi->nr_blks];
	slot->start = start;
	slot->end = end;
	slot->nid = nid;
	mi->nr_blks++;

	return 0;
}
|
|
|
|
/**
|
|
* numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
|
|
* @idx: Index of memblk to remove
|
|
* @mi: numa_meminfo to remove memblk from
|
|
*
|
|
* Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
|
|
* decrementing @mi->nr_blks.
|
|
*/
|
|
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
|
|
{
|
|
mi->nr_blks--;
|
|
memmove(&mi->blk[idx], &mi->blk[idx + 1],
|
|
(mi->nr_blks - idx) * sizeof(mi->blk[0]));
|
|
}
|
|
|
|
/**
|
|
* numa_add_memblk - Add one numa_memblk to numa_meminfo
|
|
* @nid: NUMA node ID of the new memblk
|
|
* @start: Start address of the new memblk
|
|
* @end: End address of the new memblk
|
|
*
|
|
* Add a new memblk to the default numa_meminfo.
|
|
*
|
|
* RETURNS:
|
|
* 0 on success, -errno on failure.
|
|
*/
|
|
int __init numa_add_memblk(int nid, u64 start, u64 end)
|
|
{
|
|
return numa_add_memblk_to(nid, start, end, &numa_meminfo);
|
|
}
|
|
|
|
/* Initialize bootmem allocator for a node */
|
|
void __init
|
|
setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
|
|
{
|
|
unsigned long start_pfn, last_pfn, nodedata_phys;
|
|
const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
|
|
int nid;
|
|
|
|
if (!end)
|
|
return;
|
|
|
|
/*
|
|
* Don't confuse VM with a node that doesn't have the
|
|
* minimum amount of memory:
|
|
*/
|
|
if (end && (end - start) < NODE_MIN_SIZE)
|
|
return;
|
|
|
|
start = roundup(start, ZONE_ALIGN);
|
|
|
|
printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
|
|
start, end);
|
|
|
|
start_pfn = start >> PAGE_SHIFT;
|
|
last_pfn = end >> PAGE_SHIFT;
|
|
|
|
node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
|
|
SMP_CACHE_BYTES);
|
|
if (node_data[nodeid] == NULL)
|
|
return;
|
|
nodedata_phys = __pa(node_data[nodeid]);
|
|
memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
|
|
printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
|
|
nodedata_phys + pgdat_size - 1);
|
|
nid = early_pfn_to_nid(nodedata_phys >> PAGE_SHIFT);
|
|
if (nid != nodeid)
|
|
printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
|
|
|
|
memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
|
|
NODE_DATA(nodeid)->node_id = nodeid;
|
|
NODE_DATA(nodeid)->node_start_pfn = start_pfn;
|
|
NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
|
|
|
|
node_set_online(nodeid);
|
|
}
|
|
|
|
/**
|
|
* numa_cleanup_meminfo - Cleanup a numa_meminfo
|
|
* @mi: numa_meminfo to clean up
|
|
*
|
|
* Sanitize @mi by merging and removing unncessary memblks. Also check for
|
|
* conflicts and clear unused memblks.
|
|
*
|
|
* RETURNS:
|
|
* 0 on success, -errno on failure.
|
|
*/
|
|
int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
|
|
{
|
|
const u64 low = 0;
|
|
const u64 high = (u64)max_pfn << PAGE_SHIFT;
|
|
int i, j, k;
|
|
|
|
for (i = 0; i < mi->nr_blks; i++) {
|
|
struct numa_memblk *bi = &mi->blk[i];
|
|
|
|
/* make sure all blocks are inside the limits */
|
|
bi->start = max(bi->start, low);
|
|
bi->end = min(bi->end, high);
|
|
|
|
/* and there's no empty block */
|
|
if (bi->start == bi->end) {
|
|
numa_remove_memblk_from(i--, mi);
|
|
continue;
|
|
}
|
|
|
|
for (j = i + 1; j < mi->nr_blks; j++) {
|
|
struct numa_memblk *bj = &mi->blk[j];
|
|
unsigned long start, end;
|
|
|
|
/*
|
|
* See whether there are overlapping blocks. Whine
|
|
* about but allow overlaps of the same nid. They
|
|
* will be merged below.
|
|
*/
|
|
if (bi->end > bj->start && bi->start < bj->end) {
|
|
if (bi->nid != bj->nid) {
|
|
pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
|
|
bi->nid, bi->start, bi->end,
|
|
bj->nid, bj->start, bj->end);
|
|
return -EINVAL;
|
|
}
|
|
pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
|
|
bi->nid, bi->start, bi->end,
|
|
bj->start, bj->end);
|
|
}
|
|
|
|
/*
|
|
* Join together blocks on the same node, holes
|
|
* between which don't overlap with memory on other
|
|
* nodes.
|
|
*/
|
|
if (bi->nid != bj->nid)
|
|
continue;
|
|
start = max(min(bi->start, bj->start), low);
|
|
end = min(max(bi->end, bj->end), high);
|
|
for (k = 0; k < mi->nr_blks; k++) {
|
|
struct numa_memblk *bk = &mi->blk[k];
|
|
|
|
if (bi->nid == bk->nid)
|
|
continue;
|
|
if (start < bk->end && end > bk->start)
|
|
break;
|
|
}
|
|
if (k < mi->nr_blks)
|
|
continue;
|
|
printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
|
|
bi->nid, bi->start, bi->end, bj->start, bj->end,
|
|
start, end);
|
|
bi->start = start;
|
|
bi->end = end;
|
|
numa_remove_memblk_from(j--, mi);
|
|
}
|
|
}
|
|
|
|
for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
|
|
mi->blk[i].start = mi->blk[i].end = 0;
|
|
mi->blk[i].nid = NUMA_NO_NODE;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Set nodes, which have memory in @mi, in *@nodemask.
|
|
*/
|
|
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
|
|
const struct numa_meminfo *mi)
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
|
|
if (mi->blk[i].start != mi->blk[i].end &&
|
|
mi->blk[i].nid != NUMA_NO_NODE)
|
|
node_set(mi->blk[i].nid, *nodemask);
|
|
}
|
|
|
|
/**
|
|
* numa_reset_distance - Reset NUMA distance table
|
|
*
|
|
* The current table is freed. The next numa_set_distance() call will
|
|
* create a new one.
|
|
*/
|
|
void __init numa_reset_distance(void)
|
|
{
|
|
size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
|
|
|
|
/* numa_distance could be 1LU marking allocation failure, test cnt */
|
|
if (numa_distance_cnt)
|
|
memblock_x86_free_range(__pa(numa_distance),
|
|
__pa(numa_distance) + size);
|
|
numa_distance_cnt = 0;
|
|
numa_distance = NULL; /* enable table creation */
|
|
}
|
|
|
|
/*
 * Allocate and initialise the NUMA distance table.
 *
 * The table is sized from the highest node id found in numa_nodes_parsed
 * combined with the nodes that have memory in numa_meminfo.  The memory is
 * found below max_pfn_mapped, reserved as "NUMA DIST", and every entry is
 * preset to LOCAL_DISTANCE on the diagonal and REMOTE_DISTANCE elsewhere.
 *
 * Returns 0 on success.  On allocation failure numa_distance is set to the
 * (void *)1 marker, so that no retry happens until numa_reset_distance(),
 * and -ENOMEM is returned.
 */
static int __init numa_alloc_distance(void)
{
	nodemask_t known_nodes;
	int nid, row, col;
	int highest = 0;
	int cnt;
	size_t bytes;
	u64 phys;

	/* size the new table and allocate it */
	known_nodes = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&known_nodes, &numa_meminfo);

	/* the iteration is ascending, so the last id seen is the highest */
	for_each_node_mask(nid, known_nodes)
		highest = nid;
	cnt = highest + 1;
	bytes = cnt * cnt * sizeof(numa_distance[0]);

	phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT,
				      bytes, PAGE_SIZE);
	if (phys == MEMBLOCK_ERROR) {
		pr_warning("NUMA: Warning: can't allocate distance table!\n");
		/* don't retry until explicitly reset */
		numa_distance = (void *)1LU;
		return -ENOMEM;
	}
	memblock_x86_reserve_range(phys, phys + bytes, "NUMA DIST");

	numa_distance = __va(phys);
	numa_distance_cnt = cnt;

	/* fill with the default distances */
	for (row = 0; row < cnt; row++) {
		u8 *line = &numa_distance[row * cnt];

		for (col = 0; col < cnt; col++) {
			if (row == col)
				line[col] = LOCAL_DISTANCE;
			else
				line[col] = REMOTE_DISTANCE;
		}
	}
	printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);

	return 0;
}
|
|
|
|
/**
|
|
* numa_set_distance - Set NUMA distance from one NUMA to another
|
|
* @from: the 'from' node to set distance
|
|
* @to: the 'to' node to set distance
|
|
* @distance: NUMA distance
|
|
*
|
|
* Set the distance from node @from to @to to @distance. If distance table
|
|
* doesn't exist, one which is large enough to accommodate all the currently
|
|
* known nodes will be created.
|
|
*
|
|
* If such table cannot be allocated, a warning is printed and further
|
|
* calls are ignored until the distance table is reset with
|
|
* numa_reset_distance().
|
|
*
|
|
* If @from or @to is higher than the highest known node at the time of
|
|
* table creation or @distance doesn't make sense, the call is ignored.
|
|
* This is to allow simplification of specific NUMA config implementations.
|
|
*/
|
|
void __init numa_set_distance(int from, int to, int distance)
|
|
{
|
|
if (!numa_distance && numa_alloc_distance() < 0)
|
|
return;
|
|
|
|
if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
|
|
printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
|
|
from, to, distance);
|
|
return;
|
|
}
|
|
|
|
if ((u8)distance != distance ||
|
|
(from == to && distance != LOCAL_DISTANCE)) {
|
|
pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
|
|
from, to, distance);
|
|
return;
|
|
}
|
|
|
|
numa_distance[from * numa_distance_cnt + to] = distance;
|
|
}
|
|
|
|
int __node_distance(int from, int to)
|
|
{
|
|
if (from >= numa_distance_cnt || to >= numa_distance_cnt)
|
|
return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
|
|
return numa_distance[from * numa_distance_cnt + to];
|
|
}
|
|
EXPORT_SYMBOL(__node_distance);
|
|
|
|
/*
|
|
* Sanity check to catch more bad NUMA configurations (they are amazingly
|
|
* common). Make sure the nodes cover all memory.
|
|
*/
|
|
static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
|
|
{
|
|
unsigned long numaram, e820ram;
|
|
int i;
|
|
|
|
numaram = 0;
|
|
for (i = 0; i < mi->nr_blks; i++) {
|
|
unsigned long s = mi->blk[i].start >> PAGE_SHIFT;
|
|
unsigned long e = mi->blk[i].end >> PAGE_SHIFT;
|
|
numaram += e - s;
|
|
numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
|
|
if ((long)numaram < 0)
|
|
numaram = 0;
|
|
}
|
|
|
|
e820ram = max_pfn - (memblock_x86_hole_size(0,
|
|
max_pfn << PAGE_SHIFT) >> PAGE_SHIFT);
|
|
/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
|
|
if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
|
|
printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n",
|
|
(numaram << PAGE_SHIFT) >> 20,
|
|
(e820ram << PAGE_SHIFT) >> 20);
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/*
 * Register the memblks of @mi with the memory management code and set up
 * every node that has memory.
 *
 * node_possible_map is built from numa_nodes_parsed plus the nodes with
 * memory in @mi.  Each memblk is registered as an active region, the
 * coverage check is run, and setup_node_bootmem() is called for every
 * possible node with the lowest start and highest end found among its
 * memblks.
 *
 * Returns 0 on success, -EINVAL when no node is possible or the memblks do
 * not cover memory well enough.
 */
static int __init numa_register_memblks(struct numa_meminfo *mi)
{
	int idx, nid;

	/* Account for nodes with cpus and no memory */
	node_possible_map = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&node_possible_map, mi);
	if (WARN_ON(nodes_empty(node_possible_map)))
		return -EINVAL;

	for (idx = 0; idx < mi->nr_blks; idx++) {
		const struct numa_memblk *blk = &mi->blk[idx];

		memblock_x86_register_active_regions(blk->nid,
						     blk->start >> PAGE_SHIFT,
						     blk->end >> PAGE_SHIFT);
	}

	/* for out of order entries */
	sort_node_map();
	if (!numa_meminfo_cover_memory(mi))
		return -EINVAL;

	/* Finally register nodes. */
	for_each_node_mask(nid, node_possible_map) {
		u64 lowest = (u64)max_pfn << PAGE_SHIFT;
		u64 highest = 0;

		for (idx = 0; idx < mi->nr_blks; idx++) {
			const struct numa_memblk *blk = &mi->blk[idx];

			if (blk->nid != nid)
				continue;
			if (blk->start < lowest)
				lowest = blk->start;
			if (blk->end > highest)
				highest = blk->end;
		}

		/* nodes without any memblk keep lowest >= highest */
		if (lowest >= highest)
			continue;
		setup_node_bootmem(nid, lowest, highest);
	}

	return 0;
}
|
|
|
|
/**
|
|
* dummy_numma_init - Fallback dummy NUMA init
|
|
*
|
|
* Used if there's no underlying NUMA architecture, NUMA initialization
|
|
* fails, or NUMA is disabled on the command line.
|
|
*
|
|
* Must online at least one node and add memory blocks that cover all
|
|
* allowed memory. This function must not fail.
|
|
*/
|
|
static int __init dummy_numa_init(void)
|
|
{
|
|
printk(KERN_INFO "%s\n",
|
|
numa_off ? "NUMA turned off" : "No NUMA configuration found");
|
|
printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
|
|
0LU, max_pfn << PAGE_SHIFT);
|
|
|
|
node_set(0, numa_nodes_parsed);
|
|
numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
 * Run one NUMA initialisation attempt using @init_func.
 *
 * All state a previous attempt may have left behind is reset first: the
 * apicid-to-node mappings, the parsed/possible/online node masks,
 * numa_meminfo, the active ranges and the distance table.  @init_func then
 * populates the configuration, which is cleaned up, passed through
 * numa_emulation() and registered.  Finally, CPUs mapped to a node that did
 * not come online have their node mapping cleared.
 *
 * Returns 0 on success, or the negative value returned by the step that
 * failed.
 */
static int __init numa_init(int (*init_func)(void))
{
	int apicid, cpu;
	int err;

	for (apicid = 0; apicid < MAX_LOCAL_APIC; apicid++)
		set_apicid_to_node(apicid, NUMA_NO_NODE);

	nodes_clear(numa_nodes_parsed);
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
	remove_all_active_ranges();
	numa_reset_distance();

	err = init_func();
	if (err >= 0)
		err = numa_cleanup_meminfo(&numa_meminfo);
	if (err < 0)
		return err;

	numa_emulation(&numa_meminfo, numa_distance_cnt);

	err = numa_register_memblks(&numa_meminfo);
	if (err < 0)
		return err;

	for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
		int nid = early_cpu_to_node(cpu);

		/* drop mappings that point at a node which is not online */
		if (nid != NUMA_NO_NODE && !node_online(nid))
			numa_clear_node(cpu);
	}
	numa_init_array();
	return 0;
}
|
|
|
|
/*
 * Pick a NUMA configuration source.
 *
 * Unless NUMA is turned off, the ACPI and then the AMD init methods are
 * tried (each only when configured in) and the first one that succeeds
 * wins.  Otherwise, or when all of them fail, the dummy single-node setup
 * is used.
 */
void __init initmem_init(void)
{
	if (!numa_off) {
#ifdef CONFIG_ACPI_NUMA
		if (numa_init(x86_acpi_numa_init) == 0)
			return;
#endif
#ifdef CONFIG_AMD_NUMA
		if (numa_init(amd_numa_init) == 0)
			return;
#endif
	}

	numa_init(dummy_numa_init);
}
|
|
|
|
/*
 * Call free_all_bootmem_node() for every online node, then
 * free_all_memory_core_early(MAX_NUMNODES), and return the sum of the
 * values they return.
 */
unsigned long __init numa_free_all_bootmem(void)
{
	unsigned long total = 0;
	int nid;

	for_each_online_node(nid) {
		pg_data_t *pgdat = NODE_DATA(nid);

		total += free_all_bootmem_node(pgdat);
	}

	total += free_all_memory_core_early(MAX_NUMNODES);

	return total;
}
|
|
|
|
/*
 * Return the node recorded in __apicid_to_node[] for @cpu's APIC id, or
 * NUMA_NO_NODE when the CPU's APIC id is BAD_APICID.
 */
int __cpuinit numa_cpu_node(int cpu)
{
	int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);

	if (apicid == BAD_APICID)
		return NUMA_NO_NODE;

	return __apicid_to_node[apicid];
}
|