ext4: track extent status tree shrinker delay statictics

This commit adds some statictics in extent status tree shrinker.  The
purpose to add these is that we want to collect more details when we
encounter a stall caused by extent status tree shrinker.  Here we count
the following statictics:
  stats:
    the number of all objects on all extent status trees
    the number of reclaimable objects on lru list
    cache hits/misses
    the last sorted interval
    the number of inodes on lru list
  average:
    scan time for shrinking some objects
    the number of shrunk objects
  maximum:
    the inode that has max nr. of objects on lru list
    the maximum scan time for shrinking some objects

The output looks like below:
  $ cat /proc/fs/ext4/sda1/es_shrinker_info
  stats:
    28228 objects
    6341 reclaimable objects
    5281/631 cache hits/misses
    586 ms last sorted interval
    250 inodes on lru list
  average:
    153 us scan time
    128 shrunk objects
  maximum:
    255 inode (255 objects, 198 reclaimable)
    125723 us max scan time

If the lru list has never been sorted, the following line will not be
printed:
    586ms last sorted interval
If there is an empty lru list, the following lines also will not be
printed:
    250 inodes on lru list
  ...
  maximum:
    255 inode (255 objects, 198 reclaimable)
    0 us max scan time

Meanwhile in this commit a new trace point is defined to print some
details in __ext4_es_shrink().

Cc: Andreas Dilger <adilger.kernel@dilger.ca>
Cc: Jan Kara <jack@suse.cz>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
This commit is contained in:
Zheng Liu 2014-09-01 22:26:49 -04:00 committed by Theodore Ts'o
parent e963bb1de4
commit eb68d0e2fc
5 changed files with 224 additions and 21 deletions

View file

@ -891,6 +891,7 @@ struct ext4_inode_info {
struct ext4_es_tree i_es_tree;
rwlock_t i_es_lock;
struct list_head i_es_lru;
unsigned int i_es_all_nr; /* protected by i_es_lock */
unsigned int i_es_lru_nr; /* protected by i_es_lock */
unsigned long i_touch_when; /* jiffies of last accessing */
@ -1331,8 +1332,7 @@ struct ext4_sb_info {
/* Reclaim extents from extent status tree */
struct shrinker s_es_shrinker;
struct list_head s_es_lru;
unsigned long s_es_last_sorted;
struct percpu_counter s_extent_cache_cnt;
struct ext4_es_stats s_es_stats;
struct mb_cache *s_mb_cache;
spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;

View file

@ -11,6 +11,8 @@
*/
#include <linux/rbtree.h>
#include <linux/list_sort.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include "ext4.h"
#include "extents_status.h"
@ -313,19 +315,27 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
*/
if (!ext4_es_is_delayed(es)) {
EXT4_I(inode)->i_es_lru_nr++;
percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
percpu_counter_inc(&EXT4_SB(inode->i_sb)->
s_es_stats.es_stats_lru_cnt);
}
EXT4_I(inode)->i_es_all_nr++;
percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
return es;
}
static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
{
EXT4_I(inode)->i_es_all_nr--;
percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
/* Decrease the lru counter when this es is not delayed */
if (!ext4_es_is_delayed(es)) {
BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0);
EXT4_I(inode)->i_es_lru_nr--;
percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt);
percpu_counter_dec(&EXT4_SB(inode->i_sb)->
s_es_stats.es_stats_lru_cnt);
}
kmem_cache_free(ext4_es_cachep, es);
@ -729,6 +739,7 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status *es)
{
struct ext4_es_tree *tree;
struct ext4_es_stats *stats;
struct extent_status *es1 = NULL;
struct rb_node *node;
int found = 0;
@ -765,11 +776,15 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
}
out:
stats = &EXT4_SB(inode->i_sb)->s_es_stats;
if (found) {
BUG_ON(!es1);
es->es_lblk = es1->es_lblk;
es->es_len = es1->es_len;
es->es_pblk = es1->es_pblk;
stats->es_stats_cache_hits++;
} else {
stats->es_stats_cache_misses++;
}
read_unlock(&EXT4_I(inode)->i_es_lock);
@ -931,11 +946,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
struct ext4_inode_info *locked_ei)
{
struct ext4_inode_info *ei;
struct ext4_es_stats *es_stats;
struct list_head *cur, *tmp;
LIST_HEAD(skipped);
ktime_t start_time;
u64 scan_time;
int nr_shrunk = 0;
int retried = 0, skip_precached = 1, nr_skipped = 0;
es_stats = &sbi->s_es_stats;
start_time = ktime_get();
spin_lock(&sbi->s_es_lru_lock);
retry:
@ -946,7 +966,8 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
* If we have already reclaimed all extents from extent
* status tree, just stop the loop immediately.
*/
if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
if (percpu_counter_read_positive(
&es_stats->es_stats_lru_cnt) == 0)
break;
ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
@ -956,7 +977,7 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
* time. Normally we try hard to avoid shrinking
* precached inodes, but we will as a last resort.
*/
if ((sbi->s_es_last_sorted < ei->i_touch_when) ||
if ((es_stats->es_stats_last_sorted < ei->i_touch_when) ||
(skip_precached && ext4_test_inode_state(&ei->vfs_inode,
EXT4_STATE_EXT_PRECACHED))) {
nr_skipped++;
@ -990,7 +1011,7 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
if ((nr_shrunk == 0) && nr_skipped && !retried) {
retried++;
list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
sbi->s_es_last_sorted = jiffies;
es_stats->es_stats_last_sorted = jiffies;
ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
i_es_lru);
/*
@ -1008,6 +1029,22 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
if (locked_ei && nr_shrunk == 0)
nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
if (likely(es_stats->es_stats_scan_time))
es_stats->es_stats_scan_time = (scan_time +
es_stats->es_stats_scan_time*3) / 4;
else
es_stats->es_stats_scan_time = scan_time;
if (scan_time > es_stats->es_stats_max_scan_time)
es_stats->es_stats_max_scan_time = scan_time;
if (likely(es_stats->es_stats_shrunk))
es_stats->es_stats_shrunk = (nr_shrunk +
es_stats->es_stats_shrunk*3) / 4;
else
es_stats->es_stats_shrunk = nr_shrunk;
trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, skip_precached,
nr_skipped, retried);
return nr_shrunk;
}
@ -1018,7 +1055,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink,
struct ext4_sb_info *sbi;
sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr);
return nr;
}
@ -1031,7 +1068,7 @@ static unsigned long ext4_es_scan(struct shrinker *shrink,
int nr_to_scan = sc->nr_to_scan;
int ret, nr_shrunk;
ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt);
trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret);
if (!nr_to_scan)
@ -1043,19 +1080,148 @@ static unsigned long ext4_es_scan(struct shrinker *shrink,
return nr_shrunk;
}
void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos)
{
return *pos ? NULL : SEQ_START_TOKEN;
}
static void *
ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos)
{
return NULL;
}
static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
{
struct ext4_sb_info *sbi = seq->private;
struct ext4_es_stats *es_stats = &sbi->s_es_stats;
struct ext4_inode_info *ei, *max = NULL;
unsigned int inode_cnt = 0;
if (v != SEQ_START_TOKEN)
return 0;
/* here we just find an inode that has the max nr. of objects */
spin_lock(&sbi->s_es_lru_lock);
list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) {
inode_cnt++;
if (max && max->i_es_all_nr < ei->i_es_all_nr)
max = ei;
else if (!max)
max = ei;
}
spin_unlock(&sbi->s_es_lru_lock);
seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n",
percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt));
seq_printf(seq, " %lu/%lu cache hits/misses\n",
es_stats->es_stats_cache_hits,
es_stats->es_stats_cache_misses);
if (es_stats->es_stats_last_sorted != 0)
seq_printf(seq, " %u ms last sorted interval\n",
jiffies_to_msecs(jiffies -
es_stats->es_stats_last_sorted));
if (inode_cnt)
seq_printf(seq, " %d inodes on lru list\n", inode_cnt);
seq_printf(seq, "average:\n %llu us scan time\n",
div_u64(es_stats->es_stats_scan_time, 1000));
seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk);
if (inode_cnt)
seq_printf(seq,
"maximum:\n %lu inode (%u objects, %u reclaimable)\n"
" %llu us max scan time\n",
max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr,
div_u64(es_stats->es_stats_max_scan_time, 1000));
return 0;
}
static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v)
{
}
static const struct seq_operations ext4_es_seq_shrinker_info_ops = {
.start = ext4_es_seq_shrinker_info_start,
.next = ext4_es_seq_shrinker_info_next,
.stop = ext4_es_seq_shrinker_info_stop,
.show = ext4_es_seq_shrinker_info_show,
};
static int
ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file)
{
int ret;
ret = seq_open(file, &ext4_es_seq_shrinker_info_ops);
if (!ret) {
struct seq_file *m = file->private_data;
m->private = PDE_DATA(inode);
}
return ret;
}
static int
ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file)
{
return seq_release(inode, file);
}
static const struct file_operations ext4_es_seq_shrinker_info_fops = {
.owner = THIS_MODULE,
.open = ext4_es_seq_shrinker_info_open,
.read = seq_read,
.llseek = seq_lseek,
.release = ext4_es_seq_shrinker_info_release,
};
int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
{
int err;
INIT_LIST_HEAD(&sbi->s_es_lru);
spin_lock_init(&sbi->s_es_lru_lock);
sbi->s_es_last_sorted = 0;
sbi->s_es_stats.es_stats_last_sorted = 0;
sbi->s_es_stats.es_stats_shrunk = 0;
sbi->s_es_stats.es_stats_cache_hits = 0;
sbi->s_es_stats.es_stats_cache_misses = 0;
sbi->s_es_stats.es_stats_scan_time = 0;
sbi->s_es_stats.es_stats_max_scan_time = 0;
err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0);
if (err)
return err;
err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, 0);
if (err)
goto err1;
sbi->s_es_shrinker.scan_objects = ext4_es_scan;
sbi->s_es_shrinker.count_objects = ext4_es_count;
sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
register_shrinker(&sbi->s_es_shrinker);
err = register_shrinker(&sbi->s_es_shrinker);
if (err)
goto err2;
if (sbi->s_proc)
proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc,
&ext4_es_seq_shrinker_info_fops, sbi);
return 0;
err2:
percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
err1:
percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
return err;
}
void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
{
if (sbi->s_proc)
remove_proc_entry("es_shrinker_info", sbi->s_proc);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt);
unregister_shrinker(&sbi->s_es_shrinker);
}

View file

@ -64,6 +64,17 @@ struct ext4_es_tree {
struct extent_status *cache_es; /* recently accessed extent */
};
struct ext4_es_stats {
unsigned long es_stats_last_sorted;
unsigned long es_stats_shrunk;
unsigned long es_stats_cache_hits;
unsigned long es_stats_cache_misses;
u64 es_stats_scan_time;
u64 es_stats_max_scan_time;
struct percpu_counter es_stats_all_cnt;
struct percpu_counter es_stats_lru_cnt;
};
extern int __init ext4_init_es(void);
extern void ext4_exit_es(void);
extern void ext4_es_init_tree(struct ext4_es_tree *tree);
@ -138,7 +149,7 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es,
(pb & ~ES_MASK));
}
extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
extern void ext4_es_lru_add(struct inode *inode);
extern void ext4_es_lru_del(struct inode *inode);

View file

@ -820,7 +820,6 @@ static void ext4_put_super(struct super_block *sb)
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
percpu_counter_destroy(&sbi->s_extent_cache_cnt);
brelse(sbi->s_sbh);
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
@ -885,6 +884,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ext4_es_init_tree(&ei->i_es_tree);
rwlock_init(&ei->i_es_lock);
INIT_LIST_HEAD(&ei->i_es_lru);
ei->i_es_all_nr = 0;
ei->i_es_lru_nr = 0;
ei->i_touch_when = 0;
ei->i_reserved_data_blocks = 0;
@ -3890,12 +3890,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_err_report.data = (unsigned long) sb;
/* Register extent status tree shrinker */
ext4_es_register_shrinker(sbi);
if ((err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0)) != 0) {
ext4_msg(sb, KERN_ERR, "insufficient memory");
if (ext4_es_register_shrinker(sbi))
goto failed_mount3;
}
sbi->s_stripe = ext4_get_stripe_size(sbi);
sbi->s_extent_max_zeroout_kb = 32;
@ -4225,10 +4221,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
jbd2_journal_destroy(sbi->s_journal);
sbi->s_journal = NULL;
}
failed_mount3:
ext4_es_unregister_shrinker(sbi);
failed_mount3:
del_timer_sync(&sbi->s_err_report);
percpu_counter_destroy(&sbi->s_extent_cache_cnt);
if (sbi->s_mmp_tsk)
kthread_stop(sbi->s_mmp_tsk);
failed_mount2:

View file

@ -2450,6 +2450,37 @@ TRACE_EVENT(ext4_collapse_range,
__entry->offset, __entry->len)
);
TRACE_EVENT(ext4_es_shrink,
TP_PROTO(struct super_block *sb, int nr_shrunk, u64 scan_time,
int skip_precached, int nr_skipped, int retried),
TP_ARGS(sb, nr_shrunk, scan_time, skip_precached, nr_skipped, retried),
TP_STRUCT__entry(
__field( dev_t, dev )
__field( int, nr_shrunk )
__field( unsigned long long, scan_time )
__field( int, skip_precached )
__field( int, nr_skipped )
__field( int, retried )
),
TP_fast_assign(
__entry->dev = sb->s_dev;
__entry->nr_shrunk = nr_shrunk;
__entry->scan_time = div_u64(scan_time, 1000);
__entry->skip_precached = skip_precached;
__entry->nr_skipped = nr_skipped;
__entry->retried = retried;
),
TP_printk("dev %d,%d nr_shrunk %d, scan_time %llu skip_precached %d "
"nr_skipped %d retried %d",
MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_shrunk,
__entry->scan_time, __entry->skip_precached,
__entry->nr_skipped, __entry->retried)
);
#endif /* _TRACE_EXT4_H */
/* This part must be outside protection */