86834898d5
commit feee6b2989165631b17ac6d4ccdbf6759254e85a upstream. -- snip -- - Missing arm64 hot(un)plug support - Missing some vmem_altmap_offset() cleanups - Missing sub-section hotadd support - Missing unification of mm/hmm.c and kernel/memremap.c -- snip -- We currently try to shrink a single zone when removing memory. We use the zone of the first page of the memory we are removing. If that memmap was never initialized (e.g., memory was never onlined), we will read garbage and can trigger kernel BUGs (due to a stale pointer): BUG: unable to handle page fault for address: 000000000000353d #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 0 P4D 0 Oops: 0002 [#1] SMP PTI CPU: 1 PID: 7 Comm: kworker/u8:0 Not tainted 5.3.0-rc5-next-20190820+ #317 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.4 Workqueue: kacpi_hotplug acpi_hotplug_work_fn RIP: 0010:clear_zone_contiguous+0x5/0x10 Code: 48 89 c6 48 89 c3 e8 2a fe ff ff 48 85 c0 75 cf 5b 5d c3 c6 85 fd 05 00 00 01 5b 5d c3 0f 1f 840 RSP: 0018:ffffad2400043c98 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000200000000 RCX: 0000000000000000 RDX: 0000000000200000 RSI: 0000000000140000 RDI: 0000000000002f40 RBP: 0000000140000000 R08: 0000000000000000 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000140000 R13: 0000000000140000 R14: 0000000000002f40 R15: ffff9e3e7aff3680 FS: 0000000000000000(0000) GS:ffff9e3e7bb00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000000353d CR3: 0000000058610000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __remove_pages+0x4b/0x640 arch_remove_memory+0x63/0x8d try_remove_memory+0xdb/0x130 __remove_memory+0xa/0x11 acpi_memory_device_remove+0x70/0x100 acpi_bus_trim+0x55/0x90 acpi_device_hotplug+0x227/0x3a0 acpi_hotplug_work_fn+0x1a/0x30 process_one_work+0x221/0x550 worker_thread+0x50/0x3b0 kthread+0x105/0x140 ret_from_fork+0x3a/0x50 Modules linked in: CR2: 000000000000353d Instead, shrink the zones when offlining memory or when onlining failed. Introduce and use remove_pfn_range_from_zone(() for that. We now properly shrink the zones, even if we have DIMMs whereby - Some memory blocks fall into no zone (never onlined) - Some memory blocks fall into multiple zones (offlined+re-onlined) - Multiple memory blocks that fall into different zones Drop the zone parameter (with a potential dubious value) from __remove_pages() and __remove_section(). Link: http://lkml.kernel.org/r/20191006085646.5768-6-david@redhat.com Fixes:f1dd2cd13c
("mm, memory_hotplug: do not associate hotadded memory to zones until online") [visible afterd0dc12e86b
] Signed-off-by: David Hildenbrand <david@redhat.com> Reviewed-by: Oscar Salvador <osalvador@suse.de> Cc: Michal Hocko <mhocko@suse.com> Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org> Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com> Cc: Pavel Tatashin <pasha.tatashin@soleen.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Logan Gunthorpe <logang@deltatee.com> Cc: <stable@vger.kernel.org> [5.0+] Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: David Hildenbrand <david@redhat.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
349 lines
11 KiB
C
349 lines
11 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef __LINUX_MEMORY_HOTPLUG_H
|
|
#define __LINUX_MEMORY_HOTPLUG_H
|
|
|
|
#include <linux/mmzone.h>
|
|
#include <linux/spinlock.h>
|
|
#include <linux/notifier.h>
|
|
#include <linux/bug.h>
|
|
|
|
struct page;
|
|
struct zone;
|
|
struct pglist_data;
|
|
struct mem_section;
|
|
struct memory_block;
|
|
struct resource;
|
|
struct vmem_altmap;
|
|
|
|
#ifdef CONFIG_MEMORY_HOTPLUG
|
|
/*
|
|
* Return page for the valid pfn only if the page is online. All pfn
|
|
* walkers which rely on the fully initialized page->flags and others
|
|
* should use this rather than pfn_valid && pfn_to_page
|
|
*/
|
|
#define pfn_to_online_page(pfn) \
|
|
({ \
|
|
struct page *___page = NULL; \
|
|
unsigned long ___pfn = pfn; \
|
|
unsigned long ___nr = pfn_to_section_nr(___pfn); \
|
|
\
|
|
if (___nr < NR_MEM_SECTIONS && online_section_nr(___nr) && \
|
|
pfn_valid_within(___pfn)) \
|
|
___page = pfn_to_page(___pfn); \
|
|
___page; \
|
|
})
|
|
|
|
/*
|
|
* Types for free bootmem stored in page->lru.next. These have to be in
|
|
* some random range in unsigned long space for debugging purposes.
|
|
*/
|
|
enum {
|
|
MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 12,
|
|
SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE,
|
|
MIX_SECTION_INFO,
|
|
NODE_INFO,
|
|
MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO,
|
|
};
|
|
|
|
/* Types for control the zone type of onlined and offlined memory */
|
|
enum {
|
|
MMOP_OFFLINE = -1,
|
|
MMOP_ONLINE_KEEP,
|
|
MMOP_ONLINE_KERNEL,
|
|
MMOP_ONLINE_MOVABLE,
|
|
};
|
|
|
|
/*
|
|
* Zone resizing functions
|
|
*
|
|
* Note: any attempt to resize a zone should has pgdat_resize_lock()
|
|
* zone_span_writelock() both held. This ensure the size of a zone
|
|
* can't be changed while pgdat_resize_lock() held.
|
|
*/
|
|
static inline unsigned zone_span_seqbegin(struct zone *zone)
|
|
{
|
|
return read_seqbegin(&zone->span_seqlock);
|
|
}
|
|
static inline int zone_span_seqretry(struct zone *zone, unsigned iv)
|
|
{
|
|
return read_seqretry(&zone->span_seqlock, iv);
|
|
}
|
|
static inline void zone_span_writelock(struct zone *zone)
|
|
{
|
|
write_seqlock(&zone->span_seqlock);
|
|
}
|
|
static inline void zone_span_writeunlock(struct zone *zone)
|
|
{
|
|
write_sequnlock(&zone->span_seqlock);
|
|
}
|
|
static inline void zone_seqlock_init(struct zone *zone)
|
|
{
|
|
seqlock_init(&zone->span_seqlock);
|
|
}
|
|
extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages);
|
|
extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
|
|
extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
|
|
/* VM interface that may be used by firmware interface */
|
|
extern int online_pages(unsigned long, unsigned long, int);
|
|
extern int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
|
|
unsigned long *valid_start, unsigned long *valid_end);
|
|
extern void __offline_isolated_pages(unsigned long, unsigned long);
|
|
|
|
typedef void (*online_page_callback_t)(struct page *page);
|
|
|
|
extern int set_online_page_callback(online_page_callback_t callback);
|
|
extern int restore_online_page_callback(online_page_callback_t callback);
|
|
|
|
extern void __online_page_set_limits(struct page *page);
|
|
extern void __online_page_increment_counters(struct page *page);
|
|
extern void __online_page_free(struct page *page);
|
|
|
|
extern int try_online_node(int nid);
|
|
|
|
extern bool memhp_auto_online;
|
|
/* If movable_node boot option specified */
|
|
extern bool movable_node_enabled;
|
|
static inline bool movable_node_is_enabled(void)
|
|
{
|
|
return movable_node_enabled;
|
|
}
|
|
|
|
extern void arch_remove_memory(int nid, u64 start, u64 size,
|
|
struct vmem_altmap *altmap);
|
|
extern void __remove_pages(unsigned long start_pfn, unsigned long nr_pages,
|
|
struct vmem_altmap *altmap);
|
|
|
|
/* reasonably generic interface to expand the physical pages */
|
|
extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
|
|
struct vmem_altmap *altmap, bool want_memblock);
|
|
|
|
#ifndef CONFIG_ARCH_HAS_ADD_PAGES
|
|
static inline int add_pages(int nid, unsigned long start_pfn,
|
|
unsigned long nr_pages, struct vmem_altmap *altmap,
|
|
bool want_memblock)
|
|
{
|
|
return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
|
|
}
|
|
#else /* ARCH_HAS_ADD_PAGES */
|
|
int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
|
|
struct vmem_altmap *altmap, bool want_memblock);
|
|
#endif /* ARCH_HAS_ADD_PAGES */
|
|
|
|
#ifdef CONFIG_NUMA
|
|
extern int memory_add_physaddr_to_nid(u64 start);
|
|
#else
|
|
static inline int memory_add_physaddr_to_nid(u64 start)
|
|
{
|
|
return 0;
|
|
}
|
|
#endif
|
|
|
|
#ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION
|
|
/*
|
|
* For supporting node-hotadd, we have to allocate a new pgdat.
|
|
*
|
|
* If an arch has generic style NODE_DATA(),
|
|
* node_data[nid] = kzalloc() works well. But it depends on the architecture.
|
|
*
|
|
* In general, generic_alloc_nodedata() is used.
|
|
* Now, arch_free_nodedata() is just defined for error path of node_hot_add.
|
|
*
|
|
*/
|
|
extern pg_data_t *arch_alloc_nodedata(int nid);
|
|
extern void arch_free_nodedata(pg_data_t *pgdat);
|
|
extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat);
|
|
|
|
#else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
|
|
|
|
#define arch_alloc_nodedata(nid) generic_alloc_nodedata(nid)
|
|
#define arch_free_nodedata(pgdat) generic_free_nodedata(pgdat)
|
|
|
|
#ifdef CONFIG_NUMA
|
|
/*
|
|
* If ARCH_HAS_NODEDATA_EXTENSION=n, this func is used to allocate pgdat.
|
|
* XXX: kmalloc_node() can't work well to get new node's memory at this time.
|
|
* Because, pgdat for the new node is not allocated/initialized yet itself.
|
|
* To use new node's memory, more consideration will be necessary.
|
|
*/
|
|
#define generic_alloc_nodedata(nid) \
|
|
({ \
|
|
kzalloc(sizeof(pg_data_t), GFP_KERNEL); \
|
|
})
|
|
/*
|
|
* This definition is just for error path in node hotadd.
|
|
* For node hotremove, we have to replace this.
|
|
*/
|
|
#define generic_free_nodedata(pgdat) kfree(pgdat)
|
|
|
|
extern pg_data_t *node_data[];
|
|
static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
|
|
{
|
|
node_data[nid] = pgdat;
|
|
}
|
|
|
|
#else /* !CONFIG_NUMA */
|
|
|
|
/* never called */
|
|
static inline pg_data_t *generic_alloc_nodedata(int nid)
|
|
{
|
|
BUG();
|
|
return NULL;
|
|
}
|
|
static inline void generic_free_nodedata(pg_data_t *pgdat)
|
|
{
|
|
}
|
|
static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
|
|
{
|
|
}
|
|
#endif /* CONFIG_NUMA */
|
|
#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
|
|
|
|
#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
|
|
extern void __init register_page_bootmem_info_node(struct pglist_data *pgdat);
|
|
#else
|
|
static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
|
|
{
|
|
}
|
|
#endif
|
|
extern void put_page_bootmem(struct page *page);
|
|
extern void get_page_bootmem(unsigned long ingo, struct page *page,
|
|
unsigned long type);
|
|
|
|
void get_online_mems(void);
|
|
void put_online_mems(void);
|
|
|
|
void mem_hotplug_begin(void);
|
|
void mem_hotplug_done(void);
|
|
|
|
extern void set_zone_contiguous(struct zone *zone);
|
|
extern void clear_zone_contiguous(struct zone *zone);
|
|
|
|
#else /* ! CONFIG_MEMORY_HOTPLUG */
|
|
#define pfn_to_online_page(pfn) \
|
|
({ \
|
|
struct page *___page = NULL; \
|
|
if (pfn_valid(pfn)) \
|
|
___page = pfn_to_page(pfn); \
|
|
___page; \
|
|
})
|
|
|
|
static inline unsigned zone_span_seqbegin(struct zone *zone)
|
|
{
|
|
return 0;
|
|
}
|
|
static inline int zone_span_seqretry(struct zone *zone, unsigned iv)
|
|
{
|
|
return 0;
|
|
}
|
|
static inline void zone_span_writelock(struct zone *zone) {}
|
|
static inline void zone_span_writeunlock(struct zone *zone) {}
|
|
static inline void zone_seqlock_init(struct zone *zone) {}
|
|
|
|
static inline int mhp_notimplemented(const char *func)
|
|
{
|
|
printk(KERN_WARNING "%s() called, with CONFIG_MEMORY_HOTPLUG disabled\n", func);
|
|
dump_stack();
|
|
return -ENOSYS;
|
|
}
|
|
|
|
static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
|
|
{
|
|
}
|
|
|
|
static inline int try_online_node(int nid)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
static inline void get_online_mems(void) {}
|
|
static inline void put_online_mems(void) {}
|
|
|
|
static inline void mem_hotplug_begin(void) {}
|
|
static inline void mem_hotplug_done(void) {}
|
|
|
|
static inline bool movable_node_is_enabled(void)
|
|
{
|
|
return false;
|
|
}
|
|
#endif /* ! CONFIG_MEMORY_HOTPLUG */
|
|
|
|
#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
|
|
/*
|
|
* pgdat resizing functions
|
|
*/
|
|
static inline
|
|
void pgdat_resize_lock(struct pglist_data *pgdat, unsigned long *flags)
|
|
{
|
|
spin_lock_irqsave(&pgdat->node_size_lock, *flags);
|
|
}
|
|
static inline
|
|
void pgdat_resize_unlock(struct pglist_data *pgdat, unsigned long *flags)
|
|
{
|
|
spin_unlock_irqrestore(&pgdat->node_size_lock, *flags);
|
|
}
|
|
static inline
|
|
void pgdat_resize_init(struct pglist_data *pgdat)
|
|
{
|
|
spin_lock_init(&pgdat->node_size_lock);
|
|
}
|
|
#else /* !(CONFIG_MEMORY_HOTPLUG || CONFIG_DEFERRED_STRUCT_PAGE_INIT) */
|
|
/*
|
|
* Stub functions for when hotplug is off
|
|
*/
|
|
static inline void pgdat_resize_lock(struct pglist_data *p, unsigned long *f) {}
|
|
static inline void pgdat_resize_unlock(struct pglist_data *p, unsigned long *f) {}
|
|
static inline void pgdat_resize_init(struct pglist_data *pgdat) {}
|
|
#endif /* !(CONFIG_MEMORY_HOTPLUG || CONFIG_DEFERRED_STRUCT_PAGE_INIT) */
|
|
|
|
#ifdef CONFIG_MEMORY_HOTREMOVE
|
|
|
|
extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
|
|
extern void try_offline_node(int nid);
|
|
extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
|
|
extern void remove_memory(int nid, u64 start, u64 size);
|
|
extern void __remove_memory(int nid, u64 start, u64 size);
|
|
|
|
#else
|
|
static inline bool is_mem_section_removable(unsigned long pfn,
|
|
unsigned long nr_pages)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
static inline void try_offline_node(int nid) {}
|
|
|
|
static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
|
|
{
|
|
return -EINVAL;
|
|
}
|
|
|
|
static inline void remove_memory(int nid, u64 start, u64 size) {}
|
|
static inline void __remove_memory(int nid, u64 start, u64 size) {}
|
|
#endif /* CONFIG_MEMORY_HOTREMOVE */
|
|
|
|
extern void __ref free_area_init_core_hotplug(int nid);
|
|
extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
|
|
void *arg, int (*func)(struct memory_block *, void *));
|
|
extern int __add_memory(int nid, u64 start, u64 size);
|
|
extern int add_memory(int nid, u64 start, u64 size);
|
|
extern int add_memory_resource(int nid, struct resource *resource, bool online);
|
|
extern int arch_add_memory(int nid, u64 start, u64 size,
|
|
struct vmem_altmap *altmap, bool want_memblock);
|
|
extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
|
|
unsigned long nr_pages, struct vmem_altmap *altmap);
|
|
extern void remove_pfn_range_from_zone(struct zone *zone,
|
|
unsigned long start_pfn,
|
|
unsigned long nr_pages);
|
|
extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
|
|
extern bool is_memblock_offlined(struct memory_block *mem);
|
|
extern int sparse_add_one_section(int nid, unsigned long start_pfn,
|
|
struct vmem_altmap *altmap);
|
|
extern void sparse_remove_one_section(struct mem_section *ms,
|
|
unsigned long map_offset, struct vmem_altmap *altmap);
|
|
extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
|
|
unsigned long pnum);
|
|
extern bool allow_online_pfn_range(int nid, unsigned long pfn, unsigned long nr_pages,
|
|
int online_type);
|
|
extern struct zone *zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
|
|
unsigned long nr_pages);
|
|
#endif /* __LINUX_MEMORY_HOTPLUG_H */
|