d98d053efa
commit 2c91f8fc6c999fe10185d8ad99fda1759f662f70 upstream. -- snip -- Only contextual issues: - Unrelated check_and_unmap_cpu_on_node() changes are missing. - Unrelated walk_memory_blocks() has not been moved/refactored yet. -- snip -- try_offline_node() is pretty much broken right now: - The node span is updated when onlining memory, not when adding it. We ignore memory that was mever onlined. Bad. - We touch possible garbage memmaps. The pfn_to_nid(pfn) can easily trigger a kernel panic. Bad for memory that is offline but also bad for subsection hotadd with ZONE_DEVICE, whereby the memmap of the first PFN of a section might contain garbage. - Sections belonging to mixed nodes are not properly considered. As memory blocks might belong to multiple nodes, we would have to walk all pageblocks (or at least subsections) within present sections. However, we don't have a way to identify whether a memmap that is not online was initialized (relevant for ZONE_DEVICE). This makes things more complicated. Luckily, we can piggy pack on the node span and the nid stored in memory blocks. Currently, the node span is grown when calling move_pfn_range_to_zone() - e.g., when onlining memory, and shrunk when removing memory, before calling try_offline_node(). Sysfs links are created via link_mem_sections(), e.g., during boot or when adding memory. If the node still spans memory or if any memory block belongs to the nid, we don't set the node offline. As memory blocks that span multiple nodes cannot get offlined, the nid stored in memory blocks is reliable enough (for such online memory blocks, the node still spans the memory). Introduce for_each_memory_block() to efficiently walk all memory blocks. Note: We will soon stop shrinking the ZONE_DEVICE zone and the node span when removing ZONE_DEVICE memory to fix similar issues (access of garbage memmaps) - until we have a reliable way to identify whether these memmaps were properly initialized. This implies later, that once a node had ZONE_DEVICE memory, we won't be able to set a node offline - which should be acceptable. Since commitf1dd2cd13c
("mm, memory_hotplug: do not associate hotadded memory to zones until online") memory that is added is not assoziated with a zone/node (memmap not initialized). The introducing commit60a5a19e74
("memory-hotplug: remove sysfs file of node") already missed that we could have multiple nodes for a section and that the zone/node span is updated when onlining pages, not when adding them. I tested this by hotplugging two DIMMs to a memory-less and cpu-less NUMA node. The node is properly onlined when adding the DIMMs. When removing the DIMMs, the node is properly offlined. Masayoshi Mizuma reported: : Without this patch, memory hotplug fails as panic: : : BUG: kernel NULL pointer dereference, address: 0000000000000000 : ... : Call Trace: : remove_memory_block_devices+0x81/0xc0 : try_remove_memory+0xb4/0x130 : __remove_memory+0xa/0x20 : acpi_memory_device_remove+0x84/0x100 : acpi_bus_trim+0x57/0x90 : acpi_bus_trim+0x2e/0x90 : acpi_device_hotplug+0x2b2/0x4d0 : acpi_hotplug_work_fn+0x1a/0x30 : process_one_work+0x171/0x380 : worker_thread+0x49/0x3f0 : kthread+0xf8/0x130 : ret_from_fork+0x35/0x40 [david@redhat.com: v3] Link: http://lkml.kernel.org/r/20191102120221.7553-1-david@redhat.com Link: http://lkml.kernel.org/r/20191028105458.28320-1-david@redhat.com Fixes:60a5a19e74
("memory-hotplug: remove sysfs file of node") Fixes:f1dd2cd13c
("mm, memory_hotplug: do not associate hotadded memory to zones until online") # visiable afterd0dc12e86b
Signed-off-by: David Hildenbrand <david@redhat.com> Tested-by: Masayoshi Mizuma <m.mizuma@jp.fujitsu.com> Cc: Tang Chen <tangchen@cn.fujitsu.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: "Rafael J. Wysocki" <rafael@kernel.org> Cc: Keith Busch <keith.busch@intel.com> Cc: Jiri Olsa <jolsa@kernel.org> Cc: "Peter Zijlstra (Intel)" <peterz@infradead.org> Cc: Jani Nikula <jani.nikula@intel.com> Cc: Nayna Jain <nayna@linux.ibm.com> Cc: Michal Hocko <mhocko@suse.com> Cc: Oscar Salvador <osalvador@suse.de> Cc: Stephen Rothwell <sfr@canb.auug.org.au> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Pavel Tatashin <pasha.tatashin@soleen.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: David Hildenbrand <david@redhat.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
148 lines
4.8 KiB
C
148 lines
4.8 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
/*
|
|
* include/linux/memory.h - generic memory definition
|
|
*
|
|
* This is mainly for topological representation. We define the
|
|
* basic "struct memory_block" here, which can be embedded in per-arch
|
|
* definitions or NUMA information.
|
|
*
|
|
* Basic handling of the devices is done in drivers/base/memory.c
|
|
* and system devices are handled in drivers/base/sys.c.
|
|
*
|
|
* Memory block are exported via sysfs in the class/memory/devices/
|
|
* directory.
|
|
*
|
|
*/
|
|
#ifndef _LINUX_MEMORY_H_
|
|
#define _LINUX_MEMORY_H_
|
|
|
|
#include <linux/node.h>
|
|
#include <linux/compiler.h>
|
|
#include <linux/mutex.h>
|
|
#include <linux/notifier.h>
|
|
|
|
#define MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS)
|
|
|
|
struct memory_block {
|
|
unsigned long start_section_nr;
|
|
unsigned long end_section_nr;
|
|
unsigned long state; /* serialized by the dev->lock */
|
|
int section_count; /* serialized by mem_sysfs_mutex */
|
|
int online_type; /* for passing data to online routine */
|
|
int phys_device; /* to which fru does this belong? */
|
|
void *hw; /* optional pointer to fw/hw data */
|
|
int (*phys_callback)(struct memory_block *);
|
|
struct device dev;
|
|
int nid; /* NID for this memory block */
|
|
};
|
|
|
|
int arch_get_memory_phys_device(unsigned long start_pfn);
|
|
unsigned long memory_block_size_bytes(void);
|
|
int set_memory_block_size_order(unsigned int order);
|
|
|
|
/* These states are exposed to userspace as text strings in sysfs */
|
|
#define MEM_ONLINE (1<<0) /* exposed to userspace */
|
|
#define MEM_GOING_OFFLINE (1<<1) /* exposed to userspace */
|
|
#define MEM_OFFLINE (1<<2) /* exposed to userspace */
|
|
#define MEM_GOING_ONLINE (1<<3)
|
|
#define MEM_CANCEL_ONLINE (1<<4)
|
|
#define MEM_CANCEL_OFFLINE (1<<5)
|
|
|
|
struct memory_notify {
|
|
unsigned long start_pfn;
|
|
unsigned long nr_pages;
|
|
int status_change_nid_normal;
|
|
int status_change_nid_high;
|
|
int status_change_nid;
|
|
};
|
|
|
|
/*
|
|
* During pageblock isolation, count the number of pages within the
|
|
* range [start_pfn, start_pfn + nr_pages) which are owned by code
|
|
* in the notifier chain.
|
|
*/
|
|
#define MEM_ISOLATE_COUNT (1<<0)
|
|
|
|
struct memory_isolate_notify {
|
|
unsigned long start_pfn; /* Start of range to check */
|
|
unsigned int nr_pages; /* # pages in range to check */
|
|
unsigned int pages_found; /* # pages owned found by callbacks */
|
|
};
|
|
|
|
struct notifier_block;
|
|
struct mem_section;
|
|
|
|
/*
|
|
* Priorities for the hotplug memory callback routines (stored in decreasing
|
|
* order in the callback chain)
|
|
*/
|
|
#define SLAB_CALLBACK_PRI 1
|
|
#define IPC_CALLBACK_PRI 10
|
|
|
|
#ifndef CONFIG_MEMORY_HOTPLUG_SPARSE
|
|
static inline int memory_dev_init(void)
|
|
{
|
|
return 0;
|
|
}
|
|
static inline int register_memory_notifier(struct notifier_block *nb)
|
|
{
|
|
return 0;
|
|
}
|
|
static inline void unregister_memory_notifier(struct notifier_block *nb)
|
|
{
|
|
}
|
|
static inline int memory_notify(unsigned long val, void *v)
|
|
{
|
|
return 0;
|
|
}
|
|
static inline int register_memory_isolate_notifier(struct notifier_block *nb)
|
|
{
|
|
return 0;
|
|
}
|
|
static inline void unregister_memory_isolate_notifier(struct notifier_block *nb)
|
|
{
|
|
}
|
|
static inline int memory_isolate_notify(unsigned long val, void *v)
|
|
{
|
|
return 0;
|
|
}
|
|
#else
|
|
extern int register_memory_notifier(struct notifier_block *nb);
|
|
extern void unregister_memory_notifier(struct notifier_block *nb);
|
|
extern int register_memory_isolate_notifier(struct notifier_block *nb);
|
|
extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
|
|
int create_memory_block_devices(unsigned long start, unsigned long size);
|
|
void remove_memory_block_devices(unsigned long start, unsigned long size);
|
|
extern int memory_dev_init(void);
|
|
extern int memory_notify(unsigned long val, void *v);
|
|
extern int memory_isolate_notify(unsigned long val, void *v);
|
|
extern struct memory_block *find_memory_block_hinted(struct mem_section *,
|
|
struct memory_block *);
|
|
extern struct memory_block *find_memory_block(struct mem_section *);
|
|
typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *);
|
|
extern int for_each_memory_block(void *arg, walk_memory_blocks_func_t func);
|
|
#define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<<PAGE_SHIFT)
|
|
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
|
|
|
|
#ifdef CONFIG_MEMORY_HOTPLUG
|
|
#define hotplug_memory_notifier(fn, pri) ({ \
|
|
static __meminitdata struct notifier_block fn##_mem_nb =\
|
|
{ .notifier_call = fn, .priority = pri };\
|
|
register_memory_notifier(&fn##_mem_nb); \
|
|
})
|
|
#define register_hotmemory_notifier(nb) register_memory_notifier(nb)
|
|
#define unregister_hotmemory_notifier(nb) unregister_memory_notifier(nb)
|
|
#else
|
|
#define hotplug_memory_notifier(fn, pri) ({ 0; })
|
|
/* These aren't inline functions due to a GCC bug. */
|
|
#define register_hotmemory_notifier(nb) ({ (void)(nb); 0; })
|
|
#define unregister_hotmemory_notifier(nb) ({ (void)(nb); })
|
|
#endif
|
|
|
|
/*
|
|
* Kernel text modification mutex, used for code patching. Users of this lock
|
|
* can sleep.
|
|
*/
|
|
extern struct mutex text_mutex;
|
|
|
|
#endif /* _LINUX_MEMORY_H_ */
|