ext4: limit number of scanned extents in status tree shrinker

Currently we scan extent status trees of inodes until we reclaim nr_to_scan
extents. This can however require a lot of scanning when there are lots
of delayed extents (as those cannot be reclaimed).

Change the shrinker to work as shrinkers are supposed to and *scan* only
nr_to_scan extents, regardless of how many extents we actually
reclaim. We however need to be careful and avoid scanning each status
tree from the beginning - that could lead to a situation where we would
not be able to reclaim anything at all when the first nr_to_scan extents in
the tree are always unreclaimable. So for each inode we remember the offset
where we stopped scanning and continue from there when we next come
across the inode.

Note that we also need to update the places calling __es_shrink() manually
so that they pass a reasonable nr_to_scan, and not just 1, to have a chance
of reclaiming anything.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
This commit is contained in:
Jan Kara 2014-11-25 11:51:23 -05:00 committed by Theodore Ts'o
parent b0dea4c165
commit dd47592551
3 changed files with 66 additions and 35 deletions

View file

@ -881,6 +881,9 @@ struct ext4_inode_info {
struct list_head i_es_list; struct list_head i_es_list;
unsigned int i_es_all_nr; /* protected by i_es_lock */ unsigned int i_es_all_nr; /* protected by i_es_lock */
unsigned int i_es_shk_nr; /* protected by i_es_lock */ unsigned int i_es_shk_nr; /* protected by i_es_lock */
ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for
extents to shrink. Protected by
i_es_lock */
/* ialloc */ /* ialloc */
ext4_group_t i_last_alloc_group; ext4_group_t i_last_alloc_group;
@ -1321,7 +1324,7 @@ struct ext4_sb_info {
/* Reclaim extents from extent status tree */ /* Reclaim extents from extent status tree */
struct shrinker s_es_shrinker; struct shrinker s_es_shrinker;
struct list_head s_es_list; struct list_head s_es_list; /* List of inodes with reclaimable extents */
long s_es_nr_inode; long s_es_nr_inode;
struct ext4_es_stats s_es_stats; struct ext4_es_stats s_es_stats;
struct mb_cache *s_mb_cache; struct mb_cache *s_mb_cache;

View file

@ -147,8 +147,7 @@ static struct kmem_cache *ext4_es_cachep;
static int __es_insert_extent(struct inode *inode, struct extent_status *newes); static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t end); ext4_lblk_t end);
static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
int nr_to_scan);
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
struct ext4_inode_info *locked_ei); struct ext4_inode_info *locked_ei);
@ -716,7 +715,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
retry: retry:
err = __es_insert_extent(inode, &newes); err = __es_insert_extent(inode, &newes);
if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb), if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
1, EXT4_I(inode))) 128, EXT4_I(inode)))
goto retry; goto retry;
if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
err = 0; err = 0;
@ -874,7 +873,7 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
es->es_len = orig_es.es_len; es->es_len = orig_es.es_len;
if ((err == -ENOMEM) && if ((err == -ENOMEM) &&
__es_shrink(EXT4_SB(inode->i_sb), __es_shrink(EXT4_SB(inode->i_sb),
1, EXT4_I(inode))) 128, EXT4_I(inode)))
goto retry; goto retry;
goto out; goto out;
} }
@ -976,8 +975,6 @@ static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
spin_lock(&sbi->s_es_lock); spin_lock(&sbi->s_es_lock);
nr_to_walk = sbi->s_es_nr_inode; nr_to_walk = sbi->s_es_nr_inode;
while (nr_to_walk-- > 0) { while (nr_to_walk-- > 0) {
int shrunk;
if (list_empty(&sbi->s_es_list)) { if (list_empty(&sbi->s_es_list)) {
spin_unlock(&sbi->s_es_lock); spin_unlock(&sbi->s_es_lock);
goto out; goto out;
@ -985,7 +982,7 @@ static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info, ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
i_es_list); i_es_list);
/* Move the inode to the tail */ /* Move the inode to the tail */
list_move(&ei->i_es_list, sbi->s_es_list.prev); list_move_tail(&ei->i_es_list, &sbi->s_es_list);
/* /*
* Normally we try hard to avoid shrinking precached inodes, * Normally we try hard to avoid shrinking precached inodes,
@ -1007,13 +1004,10 @@ static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
*/ */
spin_unlock(&sbi->s_es_lock); spin_unlock(&sbi->s_es_lock);
shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); nr_shrunk += es_reclaim_extents(ei, &nr_to_scan);
write_unlock(&ei->i_es_lock); write_unlock(&ei->i_es_lock);
nr_shrunk += shrunk; if (nr_to_scan <= 0)
nr_to_scan -= shrunk;
if (nr_to_scan == 0)
goto out; goto out;
spin_lock(&sbi->s_es_lock); spin_lock(&sbi->s_es_lock);
} }
@ -1029,7 +1023,7 @@ static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
} }
if (locked_ei && nr_shrunk == 0) if (locked_ei && nr_shrunk == 0)
nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan);
out: out:
scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
@ -1224,14 +1218,59 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
unregister_shrinker(&sbi->s_es_shrinker); unregister_shrinker(&sbi->s_es_shrinker);
} }
static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, /*
int nr_to_scan) * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at
* most *nr_to_scan extents, update *nr_to_scan accordingly.
*
* Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan.
* Increment *nr_shrunk by the number of reclaimed extents. Also update
* ei->i_es_shrink_lblk to where we should continue scanning.
*/
static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
int *nr_to_scan, int *nr_shrunk)
{ {
struct inode *inode = &ei->vfs_inode; struct inode *inode = &ei->vfs_inode;
struct ext4_es_tree *tree = &ei->i_es_tree; struct ext4_es_tree *tree = &ei->i_es_tree;
struct rb_node *node;
struct extent_status *es; struct extent_status *es;
unsigned long nr_shrunk = 0; struct rb_node *node;
es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
if (!es)
goto out_wrap;
node = &es->rb_node;
while (*nr_to_scan > 0) {
if (es->es_lblk > end) {
ei->i_es_shrink_lblk = end + 1;
return 0;
}
(*nr_to_scan)--;
node = rb_next(&es->rb_node);
/*
* We can't reclaim delayed extent from status tree because
* fiemap, bigalloc, and seek_data/hole need to use it.
*/
if (!ext4_es_is_delayed(es)) {
rb_erase(&es->rb_node, &tree->root);
ext4_es_free_extent(inode, es);
(*nr_shrunk)++;
}
if (!node)
goto out_wrap;
es = rb_entry(node, struct extent_status, rb_node);
}
ei->i_es_shrink_lblk = es->es_lblk;
return 1;
out_wrap:
ei->i_es_shrink_lblk = 0;
return 0;
}
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
{
struct inode *inode = &ei->vfs_inode;
int nr_shrunk = 0;
ext4_lblk_t start = ei->i_es_shrink_lblk;
static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST); DEFAULT_RATELIMIT_BURST);
@ -1242,22 +1281,10 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
__ratelimit(&_rs)) __ratelimit(&_rs))
ext4_warning(inode->i_sb, "forced shrink of precached extents"); ext4_warning(inode->i_sb, "forced shrink of precached extents");
node = rb_first(&tree->root); if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) &&
while (node != NULL) { start != 0)
es = rb_entry(node, struct extent_status, rb_node); es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk);
node = rb_next(&es->rb_node);
/* ei->i_es_tree.cache_es = NULL;
* We can't reclaim delayed extent from status tree because
* fiemap, bigalloc, and seek_data/hole need to use it.
*/
if (!ext4_es_is_delayed(es)) {
rb_erase(&es->rb_node, &tree->root);
ext4_es_free_extent(inode, es);
nr_shrunk++;
if (--nr_to_scan == 0)
break;
}
}
tree->cache_es = NULL;
return nr_shrunk; return nr_shrunk;
} }

View file

@ -874,6 +874,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
INIT_LIST_HEAD(&ei->i_es_list); INIT_LIST_HEAD(&ei->i_es_list);
ei->i_es_all_nr = 0; ei->i_es_all_nr = 0;
ei->i_es_shk_nr = 0; ei->i_es_shk_nr = 0;
ei->i_es_shrink_lblk = 0;
ei->i_reserved_data_blocks = 0; ei->i_reserved_data_blocks = 0;
ei->i_reserved_meta_blocks = 0; ei->i_reserved_meta_blocks = 0;
ei->i_allocated_meta_blocks = 0; ei->i_allocated_meta_blocks = 0;