diff --git a/include/linux/plist.h b/include/linux/plist.h
index 97883604a3c5..0ea3e1bc7ccc 100644
--- a/include/linux/plist.h
+++ b/include/linux/plist.h
@@ -266,6 +266,9 @@ static inline int plist_node_empty(const struct plist_node *node)
 #define plist_next(pos) \
 	list_next_entry(pos, node_list)
 
+#define plist_next_entry(pos, type, member) \
+	container_of(plist_next(pos), type, member)
+
 /**
  * plist_prev - get the prev entry in list
  * @pos:	the type * to cursor
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 8e2c11e692ba..475003604959 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -178,6 +178,7 @@ enum {
 
 #define SWAP_CLUSTER_MAX 32UL
 #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
+#define SWAPFILE_CLUSTER 256
 
 #define SWAP_MAP_MAX	0x3e	/* Max duplication count, in first swap_map */
 #define SWAP_MAP_BAD	0x3f	/* Note pageblock is bad, in first swap_map */
@@ -273,6 +274,8 @@ struct swap_info_struct {
 					 */
 	struct work_struct discard_work; /* discard worker */
 	struct swap_cluster_list discard_clusters; /* discard clusters list */
+	unsigned int write_pending;
+	unsigned int max_writes;
 };
 
 #ifdef CONFIG_64BIT
@@ -355,6 +358,8 @@ extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
 						unsigned long *nr_scanned);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
+extern int sysctl_swap_ratio;
+extern int sysctl_swap_ratio_enable;
 extern int remove_mapping(struct address_space *mapping, struct page *page);
 extern unsigned long vm_total_pages;
 
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
index e06febf62978..d7973074192f 100644
--- a/include/linux/swapfile.h
+++ b/include/linux/swapfile.h
@@ -8,9 +8,14 @@
  */
 extern spinlock_t swap_lock;
 extern struct plist_head swap_active_head;
+extern spinlock_t swap_avail_lock;
+extern struct plist_head *swap_avail_heads;
 extern struct swap_info_struct *swap_info[];
 extern int try_to_unuse(unsigned int, bool, unsigned long);
 extern unsigned long generic_max_swapfile_size(void);
 extern unsigned long max_swapfile_size(void);
+extern int swap_ratio(struct swap_info_struct **si, int node);
+extern void setup_swap_ratio(struct swap_info_struct *p, int prio);
+extern bool is_swap_ratio_group(int prio);
 
 #endif /* _LINUX_SWAPFILE_H */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8c7635ecb752..c23e0b039069 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1672,6 +1672,24 @@ static struct ctl_table vm_table[] = {
 		.extra1		= (void *)&mmap_rnd_compat_bits_min,
 		.extra2		= (void *)&mmap_rnd_compat_bits_max,
 	},
+#endif
+#ifdef CONFIG_SWAP
+	{
+		.procname	= "swap_ratio",
+		.data		= &sysctl_swap_ratio,
+		.maxlen		= sizeof(sysctl_swap_ratio),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
+	{
+		.procname	= "swap_ratio_enable",
+		.data		= &sysctl_swap_ratio_enable,
+		.maxlen		= sizeof(sysctl_swap_ratio_enable),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+	},
 #endif
 	{ }
 };
diff --git a/mm/Makefile b/mm/Makefile
index 7332f8908f9e..8aa143d57eb9 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -54,7 +54,7 @@ ifdef CONFIG_MMU
 endif
 obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
 
-obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o swap_slots.o
+obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o swap_slots.o swap_ratio.o
 obj-$(CONFIG_FRONTSWAP)	+= frontswap.o
 obj-$(CONFIG_ZSWAP)	+= zswap.o
 obj-$(CONFIG_HAS_DMA)	+= dmapool.o
diff --git a/mm/swap_ratio.c b/mm/swap_ratio.c
new file mode 100644
index 000000000000..577b8c1034e3
--- /dev/null
+++ b/mm/swap_ratio.c
@@ -0,0 +1,228 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2015-2018, The Linux Foundation. All rights reserved.
+ */
+
+#include <linux/mm_types.h>
+#include <linux/swapfile.h>
+#include <linux/swap.h>
+
+#define SWAP_RATIO_GROUP_START (SWAP_FLAG_PRIO_MASK - 9) /* 32758 */
+#define SWAP_RATIO_GROUP_END (SWAP_FLAG_PRIO_MASK) /* 32767 */
+#define SWAP_FAST_WRITES (SWAPFILE_CLUSTER * (SWAP_CLUSTER_MAX / 8))
+#define SWAP_SLOW_WRITES SWAPFILE_CLUSTER
+
+/*
+ * The fast/slow swap write ratio.
+ * 100 indicates that all writes should
+ * go to fast swap device.
+ */
+int sysctl_swap_ratio = 100;
+
+/* Enable the swap ratio feature */
+int sysctl_swap_ratio_enable;
+
+/*
+ * Two devices form a ratio pair only while the feature is enabled, @a is
+ * in the swap ratio priority range and both have the same priority.
+ */
+static bool is_same_group(struct swap_info_struct *a,
+		struct swap_info_struct *b)
+{
+	if (!sysctl_swap_ratio_enable)
+		return false;
+
+	if (!is_swap_ratio_group(a->prio))
+		return false;
+
+	return a->prio == b->prio;
+}
+
+/*
+ * Refill the write budgets of the fast device @si and the slow device @n
+ * from sysctl_swap_ratio.
+ *
+ * Returns 0 on success, -EINVAL if the ratio is outside 0..100, or -ENODEV
+ * if @si is not SWP_SYNCHRONOUS_IO, @n is, or they are not in one group.
+ * A ratio of 0 leaves @si with an empty budget and a ratio of 100 leaves
+ * @n with an empty budget, so check write_pending before decrementing it.
+ *
+ * Caller must hold swap_avail_lock
+ */
+static int calculate_write_pending(struct swap_info_struct *si,
+			struct swap_info_struct *n)
+{
+	int ratio = sysctl_swap_ratio;
+
+	if ((ratio < 0) || (ratio > 100))
+		return -EINVAL;
+
+	if (WARN_ON(!(si->flags & SWP_SYNCHRONOUS_IO)))
+		return -ENODEV;
+
+	if ((n->flags & SWP_SYNCHRONOUS_IO) || !is_same_group(si, n))
+		return -ENODEV;
+
+	si->max_writes = ratio ? SWAP_FAST_WRITES : 0;
+	n->max_writes = ratio ? (SWAP_FAST_WRITES * 100) /
+			ratio - SWAP_FAST_WRITES : SWAP_SLOW_WRITES;
+
+	si->write_pending = si->max_writes;
+	n->write_pending = n->max_writes;
+
+	return 0;
+}
+
+/*
+ * Hand the current write over to @n: requeue *@si after its same-priority
+ * siblings, charge one write to @n and return @n through *@si.
+ *
+ * Caller must hold both device locks and swap_avail_lock. The lock of the
+ * old *@si is dropped here; @n->lock stays held for the caller to drop.
+ */
+static void swap_ratio_switch(struct swap_info_struct **si,
+		struct swap_info_struct *n, int node)
+{
+	plist_requeue(&(*si)->avail_lists[node], &swap_avail_heads[node]);
+	n->write_pending--;
+	spin_unlock(&(*si)->lock);
+	*si = n;
+}
+
+/*
+ * Choose the device that takes the next swap write and charge one write
+ * against its budget. *@si is replaced when the write is redirected to the
+ * next device on the avail list of @node. No lock is held on return.
+ *
+ * Returns 0 on success, or -ENODEV when *@si is last on the avail list, the
+ * two devices are not a fast/slow pair of one swap ratio group, or the
+ * budgets cannot be recomputed.
+ */
+static int swap_ratio_slow(struct swap_info_struct **si, int node)
+{
+	struct swap_info_struct *n = NULL;
+	int ret = 0;
+
+	spin_lock(&(*si)->lock);
+	spin_lock(&swap_avail_lock);
+	if (&(*si)->avail_lists[node] == plist_last(&swap_avail_heads[node])) {
+		/* just to make skip work */
+		n = *si;
+		ret = -ENODEV;
+		goto skip;
+	}
+	n = plist_next_entry(&(*si)->avail_lists[node],
+			struct swap_info_struct,
+			avail_lists[node]);
+	/*
+	 * NOTE(review): swap_avail_lock is dropped while taking n->lock and
+	 * n is not looked up again afterwards - confirm n cannot leave the
+	 * avail list in between.
+	 */
+	spin_unlock(&swap_avail_lock);
+	spin_lock(&n->lock);
+	spin_lock(&swap_avail_lock);
+
+	if ((*si)->flags & SWP_SYNCHRONOUS_IO) {
+		if ((*si)->write_pending) {
+			(*si)->write_pending--;
+		} else if ((n->flags & SWP_SYNCHRONOUS_IO) ||
+				!is_same_group(*si, n)) {
+			/* Should never happen */
+			ret = -ENODEV;
+		} else if (n->write_pending) {
+			/*
+			 * Requeue fast device, since there are pending
+			 * writes for slow device.
+			 */
+			swap_ratio_switch(si, n, node);
+			goto skip;
+		} else if (calculate_write_pending(*si, n) < 0) {
+			ret = -ENODEV;
+		} else if ((*si)->write_pending) {
+			/* Restart from fast device */
+			(*si)->write_pending--;
+		} else {
+			/*
+			 * Ratio 0 leaves the fast budget empty and
+			 * decrementing it would wrap around, so the
+			 * write goes to the slow device.
+			 */
+			swap_ratio_switch(si, n, node);
+			goto skip;
+		}
+	} else {
+		if (!(n->flags & SWP_SYNCHRONOUS_IO) ||
+				!is_same_group(*si, n)) {
+			/* Should never happen */
+			ret = -ENODEV;
+		} else if (n->write_pending) {
+			/*
+			 * Pending writes for fast device.
+			 * We reach here when slow device is swapped on first,
+			 * before fast device.
+			 */
+			/* requeue slow device to the end */
+			swap_ratio_switch(si, n, node);
+			goto skip;
+		} else if ((*si)->write_pending) {
+			(*si)->write_pending--;
+		} else if (calculate_write_pending(n, *si) < 0) {
+			ret = -ENODEV;
+		} else if (n->write_pending) {
+			/* Restart from fast device */
+			swap_ratio_switch(si, n, node);
+			goto skip;
+		} else {
+			/*
+			 * Ratio 0 leaves the fast budget empty and
+			 * decrementing it would wrap around, so the
+			 * write stays on the slow device.
+			 */
+			(*si)->write_pending--;
+		}
+	}
+	spin_unlock(&(*si)->lock);
+skip:
+	spin_unlock(&swap_avail_lock);
+	/* n and si would have got interchanged */
+	spin_unlock(&n->lock);
+	return ret;
+}
+
+/* Tell whether @prio is in the priority range reserved for swap ratio. */
+bool is_swap_ratio_group(int prio)
+{
+	return (prio >= SWAP_RATIO_GROUP_START) &&
+		(prio <= SWAP_RATIO_GROUP_END);
+}
+
+/*
+ * Seed the write budget of @p when @prio is in the swap ratio range:
+ * SWAP_FAST_WRITES for a SWP_SYNCHRONOUS_IO device, SWAP_SLOW_WRITES
+ * otherwise. Called from swapon before the device is enabled.
+ */
+void setup_swap_ratio(struct swap_info_struct *p, int prio)
+{
+	/* Used only if sysctl_swap_ratio_enable is set */
+	if (is_swap_ratio_group(prio)) {
+		if (p->flags & SWP_SYNCHRONOUS_IO)
+			p->write_pending = SWAP_FAST_WRITES;
+		else
+			p->write_pending = SWAP_SLOW_WRITES;
+		p->max_writes = p->write_pending;
+	}
+}
+
+/*
+ * Entry point for get_swap_pages(). May replace *@si with the device that
+ * takes the next write. Returns 0 on success, or -ENODEV when swap ratio
+ * does not apply and the caller falls back to its regular selection.
+ */
+int swap_ratio(struct swap_info_struct **si, int node)
+{
+	if (!is_swap_ratio_group((*si)->prio))
+		return -ENODEV;
+
+	return swap_ratio_slow(si, node);
+}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index d954b71c4f9c..b9ad9c36246b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -85,8 +85,8 @@ PLIST_HEAD(swap_active_head);
  * is held and the locking order requires swap_lock to be taken
  * before any swap_info_struct->lock.
  */
-static struct plist_head *swap_avail_heads;
-static DEFINE_SPINLOCK(swap_avail_lock);
+struct plist_head *swap_avail_heads;
+DEFINE_SPINLOCK(swap_avail_lock);
 
 struct swap_info_struct *swap_info[MAX_SWAPFILES];
 
@@ -947,6 +947,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
 	long avail_pgs;
 	int n_ret = 0;
 	int node;
+	int swap_ratio_off = 0;
 
 	/* Only single cluster request supported */
 	WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
@@ -963,14 +964,34 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
 
 	atomic_long_sub(n_goal * size, &nr_swap_pages);
 
+lock_and_start:
 	spin_lock(&swap_avail_lock);
 
 start_over:
 	node = numa_node_id();
 	plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
+
+		if (sysctl_swap_ratio && !swap_ratio_off) {
+			int ret;
+
+			spin_unlock(&swap_avail_lock);
+			ret = swap_ratio(&si, node);
+			if (ret < 0) {
+				/*
+				 * Error. Start again with swap
+				 * ratio disabled.
+				 */
+				swap_ratio_off = 1;
+				goto lock_and_start;
+			} else {
+				goto start;
+			}
+		}
+
 		/* requeue si to after same-priority siblings */
 		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
 		spin_unlock(&swap_avail_lock);
+start:
 		spin_lock(&si->lock);
 		if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
 			spin_lock(&swap_avail_lock);
@@ -3270,9 +3291,11 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 
 	mutex_lock(&swapon_mutex);
 	prio = -1;
-	if (swap_flags & SWAP_FLAG_PREFER)
+	if (swap_flags & SWAP_FLAG_PREFER) {
 		prio =
 		  (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
+		setup_swap_ratio(p, prio);
+	}
 	enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
 
 	pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",