da1b13ccfb
Wanpeng Li reported a race between soft_offline_page() and unpoison_memory(), which causes the following kernel panic: BUG: Bad page state in process bash pfn:97000 page:ffffea00025c0000 count:0 mapcount:1 mapping: (null) index:0x7f4fdbe00 flags: 0x1fffff80080048(uptodate|active|swapbacked) page dumped because: PAGE_FLAGS_CHECK_AT_FREE flag(s) set bad because of flags: flags: 0x40(active) Modules linked in: snd_hda_codec_hdmi i915 rpcsec_gss_krb5 nfsv4 dns_resolver bnep rfcomm nfsd bluetooth auth_rpcgss nfs_acl nfs rfkill lockd grace sunrpc i2c_algo_bit drm_kms_helper snd_hda_codec_realtek snd_hda_codec_generic drm snd_hda_intel fscache snd_hda_codec x86_pkg_temp_thermal coretemp kvm_intel snd_hda_core snd_hwdep kvm snd_pcm snd_seq_dummy snd_seq_oss crct10dif_pclmul snd_seq_midi crc32_pclmul snd_seq_midi_event ghash_clmulni_intel snd_rawmidi aesni_intel lrw gf128mul snd_seq glue_helper ablk_helper snd_seq_device cryptd fuse snd_timer dcdbas serio_raw mei_me parport_pc snd mei ppdev i2c_core video lp soundcore parport lpc_ich shpchp mfd_core ext4 mbcache jbd2 sd_mod e1000e ahci ptp libahci crc32c_intel libata pps_core CPU: 3 PID: 2211 Comm: bash Not tainted 4.2.0-rc5-mm1+ #45 Hardware name: Dell Inc. OptiPlex 7020/0F5C5X, BIOS A03 01/08/2015 Call Trace: dump_stack+0x48/0x5c bad_page+0xe6/0x140 free_pages_prepare+0x2f9/0x320 ? uncharge_list+0xdd/0x100 free_hot_cold_page+0x40/0x170 __put_single_page+0x20/0x30 put_page+0x25/0x40 unmap_and_move+0x1a6/0x1f0 migrate_pages+0x100/0x1d0 ? kill_procs+0x100/0x100 ? 
unlock_page+0x6f/0x90 __soft_offline_page+0x127/0x2a0 soft_offline_page+0xa6/0x200 This race is explained below: CPU0 CPU1 soft_offline_page __soft_offline_page TestSetPageHWPoison unpoison_memory PageHWPoison check (true) TestClearPageHWPoison put_page -> release refcount held by get_hwpoison_page in unpoison_memory put_page -> release refcount held by isolate_lru_page in __soft_offline_page migrate_pages The second put_page() releases the refcount held by isolate_lru_page(), which leads unmap_and_move() to release the last refcount of the page while its mapcount is still 1, since try_to_unmap() is not called if only one user maps the page. In any case, the page refcount and mapcount will still be inconsistent if the page is mapped by multiple users. This race was introduced by commit 4491f71260
("mm/memory-failure: set PageHWPoison before migrate_pages()"), which focuses on preventing the reuse of successfully migrated pages. Before that commit, the reuse was prevented by changing the migratetype to MIGRATE_ISOLATE during soft offlining, which has the following problems, so simply reverting the commit is not the best option: 1) it doesn't eliminate the reuse completely, because set_migratetype_isolate() can fail to set MIGRATE_ISOLATE on the target page if the pageblock of the page contains one or more unmovable pages (i.e. has_unmovable_pages() returns true). 2) the original code changes the migratetype to MIGRATE_ISOLATE forcibly, and sets it to MIGRATE_MOVABLE forcibly after soft offline, regardless of the original migratetype state, which could impact other subsystems like memory hotplug or compaction. This patch moves PageSetHWPoison to just after put_page() in unmap_and_move(), which closes the reported race window and minimizes another race window between SetPageHWPoison and reallocation (which causes the reuse of a soft-offlined page). The latter race window still exists, but keeping it open is acceptable: it is rare and, even if it happens, it is effectively the same as the ordinary "containment failure" case. Fixes: 4491f71260
("mm/memory-failure: set PageHWPoison before migrate_pages()") Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com> Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Reported-by: Wanpeng Li <wanpeng.li@hotmail.com> Tested-by: Wanpeng Li <wanpeng.li@hotmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
242 lines
5.9 KiB
C
242 lines
5.9 KiB
C
#ifndef _LINUX_SWAPOPS_H
|
|
#define _LINUX_SWAPOPS_H
|
|
|
|
#include <linux/radix-tree.h>
|
|
#include <linux/bug.h>
|
|
|
|
/*
 * swapcache pages are stored in the swapper_space radix tree.  We want to
 * get good packing density in that tree, so the index should be dense in
 * the low-order bits.
 *
 * We arrange the `type' and `offset' fields so that `type' is at the seven
 * high-order bits of the swp_entry_t and `offset' is right-aligned in the
 * remaining bits.  Although `type' itself needs only five bits, we allow for
 * shmem/tmpfs to shift it all up a further two bits: see swp_to_radix_entry().
 *
 * swp_entry_t's are *never* stored anywhere in their arch-dependent format.
 *
 * SWP_TYPE_SHIFT(e):  bit position at which the `type' field of `e' starts.
 * SWP_OFFSET_MASK(e): mask selecting the `offset' bits below the `type' field.
 *
 * The argument is parenthesized before member access so that any expression
 * naming a swp_entry_t (e.g. `*ptr') can be passed, not only a plain
 * identifier.  Only sizeof is applied to it, so it is never evaluated.
 */
#define SWP_TYPE_SHIFT(e)	((sizeof((e).val) * 8) - \
			(MAX_SWAPFILES_SHIFT + RADIX_TREE_EXCEPTIONAL_SHIFT))
#define SWP_OFFSET_MASK(e)	((1UL << SWP_TYPE_SHIFT(e)) - 1)
|
|
|
|
/*
|
|
* Store a type+offset into a swp_entry_t in an arch-independent format
|
|
*/
|
|
static inline swp_entry_t swp_entry(unsigned long type, pgoff_t offset)
|
|
{
|
|
swp_entry_t ret;
|
|
|
|
ret.val = (type << SWP_TYPE_SHIFT(ret)) |
|
|
(offset & SWP_OFFSET_MASK(ret));
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* Extract the `type' field from a swp_entry_t. The swp_entry_t is in
|
|
* arch-independent format
|
|
*/
|
|
static inline unsigned swp_type(swp_entry_t entry)
|
|
{
|
|
return (entry.val >> SWP_TYPE_SHIFT(entry));
|
|
}
|
|
|
|
/*
|
|
* Extract the `offset' field from a swp_entry_t. The swp_entry_t is in
|
|
* arch-independent format
|
|
*/
|
|
static inline pgoff_t swp_offset(swp_entry_t entry)
|
|
{
|
|
return entry.val & SWP_OFFSET_MASK(entry);
|
|
}
|
|
|
|
#ifdef CONFIG_MMU
|
|
/* check whether a pte points to a swap entry */
|
|
static inline int is_swap_pte(pte_t pte)
|
|
{
|
|
return !pte_none(pte) && !pte_present(pte);
|
|
}
|
|
#endif
|
|
|
|
/*
|
|
* Convert the arch-dependent pte representation of a swp_entry_t into an
|
|
* arch-independent swp_entry_t.
|
|
*/
|
|
static inline swp_entry_t pte_to_swp_entry(pte_t pte)
|
|
{
|
|
swp_entry_t arch_entry;
|
|
|
|
if (pte_swp_soft_dirty(pte))
|
|
pte = pte_swp_clear_soft_dirty(pte);
|
|
arch_entry = __pte_to_swp_entry(pte);
|
|
return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
|
|
}
|
|
|
|
/*
|
|
* Convert the arch-independent representation of a swp_entry_t into the
|
|
* arch-dependent pte representation.
|
|
*/
|
|
static inline pte_t swp_entry_to_pte(swp_entry_t entry)
|
|
{
|
|
swp_entry_t arch_entry;
|
|
|
|
arch_entry = __swp_entry(swp_type(entry), swp_offset(entry));
|
|
return __swp_entry_to_pte(arch_entry);
|
|
}
|
|
|
|
static inline swp_entry_t radix_to_swp_entry(void *arg)
|
|
{
|
|
swp_entry_t entry;
|
|
|
|
entry.val = (unsigned long)arg >> RADIX_TREE_EXCEPTIONAL_SHIFT;
|
|
return entry;
|
|
}
|
|
|
|
/*
 * Encode a swp_entry_t as a radix-tree slot value: shift the entry up by
 * RADIX_TREE_EXCEPTIONAL_SHIFT and set RADIX_TREE_EXCEPTIONAL_ENTRY.
 */
static inline void *swp_to_radix_entry(swp_entry_t entry)
{
	unsigned long encoded = entry.val << RADIX_TREE_EXCEPTIONAL_SHIFT;

	encoded |= RADIX_TREE_EXCEPTIONAL_ENTRY;
	return (void *)encoded;
}
|
|
|
|
#ifdef CONFIG_MIGRATION
|
|
static inline swp_entry_t make_migration_entry(struct page *page, int write)
|
|
{
|
|
BUG_ON(!PageLocked(page));
|
|
return swp_entry(write ? SWP_MIGRATION_WRITE : SWP_MIGRATION_READ,
|
|
page_to_pfn(page));
|
|
}
|
|
|
|
/*
 * Return non-zero when @entry has one of the two migration types
 * (SWP_MIGRATION_READ or SWP_MIGRATION_WRITE).  Hinted as unlikely.
 */
static inline int is_migration_entry(swp_entry_t entry)
{
	const unsigned type = swp_type(entry);

	return unlikely(type == SWP_MIGRATION_READ ||
			type == SWP_MIGRATION_WRITE);
}
|
|
|
|
/*
 * Return non-zero when @entry has type SWP_MIGRATION_WRITE.
 * Hinted as unlikely.
 */
static inline int is_write_migration_entry(swp_entry_t entry)
{
	const unsigned type = swp_type(entry);

	return unlikely(type == SWP_MIGRATION_WRITE);
}
|
|
|
|
/*
 * Map a migration entry back to its struct page, treating the entry's
 * offset as a pfn.
 */
static inline struct page *migration_entry_to_page(swp_entry_t entry)
{
	const pgoff_t offset = swp_offset(entry);
	struct page *page = pfn_to_page(offset);

	/*
	 * Migration entries may only be used while the page they refer to
	 * is locked.
	 */
	BUG_ON(!PageLocked(page));

	return page;
}
|
|
|
|
/*
 * Rewrite *entry in place as a SWP_MIGRATION_READ entry, keeping its
 * offset.
 */
static inline void make_migration_entry_read(swp_entry_t *entry)
{
	const pgoff_t offset = swp_offset(*entry);

	*entry = swp_entry(SWP_MIGRATION_READ, offset);
}
|
|
|
|
extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
|
|
spinlock_t *ptl);
|
|
extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
|
|
unsigned long address);
|
|
extern void migration_entry_wait_huge(struct vm_area_struct *vma,
|
|
struct mm_struct *mm, pte_t *pte);
|
|
#else
|
|
|
|
/* !CONFIG_MIGRATION stub: both arguments are ignored; expands to swp_entry(0, 0). */
#define make_migration_entry(page, write) swp_entry(0, 0)
|
|
/* !CONFIG_MIGRATION stub: no entry is ever reported as a migration entry. */
static inline int is_migration_entry(swp_entry_t swp)
{
	return 0;
}
|
|
/* !CONFIG_MIGRATION stub: the argument is ignored; always expands to NULL. */
#define migration_entry_to_page(swp) NULL
|
|
/* !CONFIG_MIGRATION stub: leaves *entryp untouched. */
static inline void make_migration_entry_read(swp_entry_t *entryp)
{
}
|
|
static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
|
|
spinlock_t *ptl) { }
|
|
/* !CONFIG_MIGRATION stub: does nothing and returns immediately. */
static inline void migration_entry_wait(struct mm_struct *mm,
					pmd_t *pmd,
					unsigned long address)
{
}
|
|
/* !CONFIG_MIGRATION stub: does nothing and returns immediately. */
static inline void migration_entry_wait_huge(struct vm_area_struct *vma,
					     struct mm_struct *mm,
					     pte_t *pte)
{
}
|
|
/* !CONFIG_MIGRATION stub: no entry is ever reported as a write migration entry. */
static inline int is_write_migration_entry(swp_entry_t entry)
{
	return 0;
}
|
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_MEMORY_FAILURE
|
|
|
|
/*
 * Counter adjusted by the num_poisoned_pages_{inc,dec,add,sub}() helpers in
 * this header; the definition lives outside this file.
 */
extern atomic_long_t num_poisoned_pages __read_mostly;
|
|
|
|
/*
|
|
* Support for hardware poisoned pages
|
|
*/
|
|
static inline swp_entry_t make_hwpoison_entry(struct page *page)
|
|
{
|
|
BUG_ON(!PageLocked(page));
|
|
return swp_entry(SWP_HWPOISON, page_to_pfn(page));
|
|
}
|
|
|
|
/* Return non-zero when @entry has type SWP_HWPOISON. */
static inline int is_hwpoison_entry(swp_entry_t entry)
{
	const unsigned type = swp_type(entry);

	return type == SWP_HWPOISON;
}
|
|
|
|
/*
 * Call TestSetPageHWPoison() on @page and return its result as a bool.
 * NOTE(review): presumably a test-and-set of the HWPoison page flag,
 * judging by the name -- confirm against the page-flags definitions.
 */
static inline bool test_set_page_hwpoison(struct page *page)
{
	const bool result = TestSetPageHWPoison(page);

	return result;
}
|
|
|
|
static inline void num_poisoned_pages_inc(void)
|
|
{
|
|
atomic_long_inc(&num_poisoned_pages);
|
|
}
|
|
|
|
static inline void num_poisoned_pages_dec(void)
|
|
{
|
|
atomic_long_dec(&num_poisoned_pages);
|
|
}
|
|
|
|
static inline void num_poisoned_pages_add(long num)
|
|
{
|
|
atomic_long_add(num, &num_poisoned_pages);
|
|
}
|
|
|
|
static inline void num_poisoned_pages_sub(long num)
|
|
{
|
|
atomic_long_sub(num, &num_poisoned_pages);
|
|
}
|
|
#else
|
|
|
|
/* !CONFIG_MEMORY_FAILURE stub: @page is ignored; returns swp_entry(0, 0). */
static inline swp_entry_t make_hwpoison_entry(struct page *page)
{
	swp_entry_t empty = swp_entry(0, 0);

	return empty;
}
|
|
|
|
/* !CONFIG_MEMORY_FAILURE stub: no entry is ever reported as hwpoison. */
static inline int is_hwpoison_entry(swp_entry_t swp)
{
	return 0;
}
|
|
|
|
static inline bool test_set_page_hwpoison(struct page *page)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
/*
 * !CONFIG_MEMORY_FAILURE stubs for the poisoned-page counter helpers.
 *
 * The CONFIG_MEMORY_FAILURE branch provides inc/dec/add/sub; the same four
 * names are provided here as no-ops so callers build in both configurations
 * without their own #ifdef.
 */
static inline void num_poisoned_pages_inc(void)
{
}

static inline void num_poisoned_pages_dec(void)
{
}

static inline void num_poisoned_pages_add(long num)
{
}

static inline void num_poisoned_pages_sub(long num)
{
}
|
|
#endif
|
|
|
|
#if defined(CONFIG_MEMORY_FAILURE) || defined(CONFIG_MIGRATION)
|
|
/* Return non-zero when the type of @entry is MAX_SWAPFILES or above. */
static inline int non_swap_entry(swp_entry_t entry)
{
	const unsigned type = swp_type(entry);

	return type >= MAX_SWAPFILES;
}
|
|
#else
|
|
/*
 * Stub used when neither CONFIG_MEMORY_FAILURE nor CONFIG_MIGRATION is
 * set: always returns 0.
 */
static inline int non_swap_entry(swp_entry_t entry)
{
	return 0;
}
|
|
#endif
|
|
|
|
#endif /* _LINUX_SWAPOPS_H */
|