449dd6984d
Previously, page cache radix tree nodes were freed after reclaim emptied out their page pointers. But now reclaim stores shadow entries in their place, which are only reclaimed when the inodes themselves are reclaimed. This is problematic for bigger files that are still in use after they have a significant amount of their cache reclaimed, without any of those pages actually refaulting. The shadow entries will just sit there and waste memory. In the worst case, the shadow entries will accumulate until the machine runs out of memory. To get this under control, the VM will track radix tree nodes exclusively containing shadow entries on a per-NUMA node list. Per-NUMA rather than global because we expect the radix tree nodes themselves to be allocated node-locally and we want to reduce cross-node references of otherwise independent cache workloads. A simple shrinker will then reclaim these nodes on memory pressure. A few things need to be stored in the radix tree node to implement the shadow node LRU and allow tree deletions coming from the list: 1. There is no index available that would describe the reverse path from the node up to the tree root, which is needed to perform a deletion. To solve this, encode in each node its offset inside the parent. This can be stored in the unused upper bits of the same member that stores the node's height at no extra space cost. 2. The number of shadow entries needs to be counted in addition to the regular entries, to quickly detect when the node is ready to go to the shadow node LRU list. The current entry count is an unsigned int but the maximum number of entries is 64, so a shadow counter can easily be stored in the unused upper bits. 3. Tree modification needs tree lock and tree root, which are located in the address space, so store an address_space backpointer in the node. The parent pointer of the node is in a union with the 2-word rcu_head, so the backpointer comes at no extra cost as well. 4. 
The node needs to be linked to an LRU list, which requires a list head inside the node. This does increase the size of the node, but it does not change the number of objects that fit into a slab page. [akpm@linux-foundation.org: export the right function] Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Reviewed-by: Rik van Riel <riel@redhat.com> Reviewed-by: Minchan Kim <minchan@kernel.org> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Bob Liu <bob.liu@oracle.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: Dave Chinner <david@fromorbit.com> Cc: Greg Thelen <gthelen@google.com> Cc: Hugh Dickins <hughd@google.com> Cc: Jan Kara <jack@suse.cz> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Luigi Semenzato <semenzato@google.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Metin Doslu <metin@citusdata.com> Cc: Michel Lespinasse <walken@google.com> Cc: Ozgun Erdogan <ozgun@citusdata.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Roman Gushchin <klamm@yandex-team.ru> Cc: Ryan Mallon <rmallon@gmail.com> Cc: Tejun Heo <tj@kernel.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
137 lines
4.5 KiB
C
137 lines
4.5 KiB
C
/*
 * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved.
 * Authors: David Chinner and Glauber Costa
 *
 * Generic LRU infrastructure
 */
|
|
#ifndef _LRU_LIST_H
|
|
#define _LRU_LIST_H
|
|
|
|
#include <linux/list.h>
|
|
#include <linux/nodemask.h>
|
|
|
|
/*
 * Verdict returned by a list_lru_walk_cb callback; the callback has to
 * always return one of these values.
 */
enum lru_status {
	/* item removed from list */
	LRU_REMOVED,
	/* item removed, but lock has been dropped and reacquired */
	LRU_REMOVED_RETRY,
	/* item referenced, give another pass */
	LRU_ROTATE,
	/* item cannot be locked, skip */
	LRU_SKIP,
	/*
	 * item not freeable. May drop the lock internally, but has to
	 * return locked.
	 */
	LRU_RETRY,
};
|
|
|
|
struct list_lru_node {
|
|
spinlock_t lock;
|
|
struct list_head list;
|
|
/* kept as signed so we can catch imbalance bugs */
|
|
long nr_items;
|
|
} ____cacheline_aligned_in_smp;
|
|
|
|
struct list_lru {
|
|
struct list_lru_node *node;
|
|
nodemask_t active_nodes;
|
|
};
|
|
|
|
/*
 * list_lru_destroy: tear down @lru.
 *
 * NOTE(review): presumably the counterpart of list_lru_init() /
 * list_lru_init_key(); the implementation is not visible from this
 * header, so confirm what state @lru must be in before calling.
 */
void list_lru_destroy(struct list_lru *lru);
|
|
/*
 * Only ever used through a pointer here.  Declaring the tag up front keeps
 * the prototype below well-formed regardless of include order.
 */
struct lock_class_key;

/**
 * list_lru_init_key: initialise an lru, optionally with a lock class key
 * @lru: the lru pointer
 * @key: lock class key, or NULL (which is what list_lru_init() passes)
 *
 * Return value: an int status.  NOTE(review): presumably 0 on success and a
 * negative errno on failure; the implementation is not visible here.
 */
int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key);

/**
 * list_lru_init: initialise an lru without a lock class key
 * @lru: the lru pointer
 *
 * Shorthand for list_lru_init_key(@lru, NULL).
 *
 * Return value: whatever list_lru_init_key() returns.
 */
static inline int list_lru_init(struct list_lru *lru)
{
	return list_lru_init_key(lru, NULL);
}
|
|
|
|
/**
 * list_lru_add: add an element to the lru list's tail
 * @lru: the lru pointer
 * @item: the item to be added.
 *
 * If the element is already part of a list, this function returns doing
 * nothing. Therefore the caller does not need to keep state about whether or
 * not the element already belongs in the list and is allowed to lazy update
 * it. Note however that this is valid for *a* list, not *this* list. If
 * the caller organizes itself in a way that elements can be in more than
 * one type of list, it is up to the caller to fully remove the item from
 * the previous list (with list_lru_del() for instance) before moving it
 * to @lru.
 *
 * Return value: true if the list was updated, false otherwise
 */
bool list_lru_add(struct list_lru *lru, struct list_head *item);
|
|
|
|
/**
 * list_lru_del: delete an element from the lru list
 * @lru: the lru pointer
 * @item: the item to be deleted.
 *
 * This function works analogously to list_lru_add() in terms of list
 * manipulation. The comments about an element already pertaining to
 * a list are also valid for list_lru_del().
 *
 * Return value: true if the list was updated, false otherwise
 */
bool list_lru_del(struct list_lru *lru, struct list_head *item);
|
|
|
|
/**
 * list_lru_count_node: return the number of objects currently held by @lru
 * @lru: the lru pointer.
 * @nid: the node id to count from.
 *
 * Always returns a non-negative number, 0 for empty lists. There is no
 * guarantee that the list is not updated while the count is being computed.
 * Callers that want such a guarantee need to provide an outer lock.
 */
unsigned long list_lru_count_node(struct list_lru *lru, int nid);
|
|
static inline unsigned long list_lru_count(struct list_lru *lru)
|
|
{
|
|
long count = 0;
|
|
int nid;
|
|
|
|
for_each_node_mask(nid, lru->active_nodes)
|
|
count += list_lru_count_node(lru, nid);
|
|
|
|
return count;
|
|
}
|
|
|
|
typedef enum lru_status
|
|
(*list_lru_walk_cb)(struct list_head *item, spinlock_t *lock, void *cb_arg);
|
|
/**
|
|
* list_lru_walk_node: walk a list_lru, isolating and disposing freeable items.
|
|
* @lru: the lru pointer.
|
|
* @nid: the node id to scan from.
|
|
* @isolate: callback function that is resposible for deciding what to do with
|
|
* the item currently being scanned
|
|
* @cb_arg: opaque type that will be passed to @isolate
|
|
* @nr_to_walk: how many items to scan.
|
|
*
|
|
* This function will scan all elements in a particular list_lru, calling the
|
|
* @isolate callback for each of those items, along with the current list
|
|
* spinlock and a caller-provided opaque. The @isolate callback can choose to
|
|
* drop the lock internally, but *must* return with the lock held. The callback
|
|
* will return an enum lru_status telling the list_lru infrastructure what to
|
|
* do with the object being scanned.
|
|
*
|
|
* Please note that nr_to_walk does not mean how many objects will be freed,
|
|
* just how many objects will be scanned.
|
|
*
|
|
* Return value: the number of objects effectively removed from the LRU.
|
|
*/
|
|
unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
|
|
list_lru_walk_cb isolate, void *cb_arg,
|
|
unsigned long *nr_to_walk);
|
|
|
|
static inline unsigned long
|
|
list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate,
|
|
void *cb_arg, unsigned long nr_to_walk)
|
|
{
|
|
long isolated = 0;
|
|
int nid;
|
|
|
|
for_each_node_mask(nid, lru->active_nodes) {
|
|
isolated += list_lru_walk_node(lru, nid, isolate,
|
|
cb_arg, &nr_to_walk);
|
|
if (nr_to_walk <= 0)
|
|
break;
|
|
}
|
|
return isolated;
|
|
}
|
|
#endif /* _LRU_LIST_H */
|