fs: use fast counters for vfs caches

The percpu_counter library generates quite nasty code, so unless you need
to dynamically allocate counters or read a fast approximate value, a
simple set of per-CPU counters is much better.

The percpu_counter can never be made to work as well, because it has an
indirection from a pointer to percpu memory, and it can't use the direct
this_cpu_inc interfaces because it doesn't use static PER_CPU data, so
the generated code will always be worse.

In the fastpath, it is the difference between this:

        incl %gs:nr_dentry      # nr_dentry

and this:

        movl    percpu_counter_batch(%rip), %edx        # percpu_counter_batch,
        movl    $1, %esi        #,
        movq    $nr_dentry, %rdi        #,
        call    __percpu_counter_add    # (plus I clobber registers)

__percpu_counter_add:
        pushq   %rbp    #
        movq    %rsp, %rbp      #,
        subq    $32, %rsp       #,
        movq    %rbx, -24(%rbp) #,
        movq    %r12, -16(%rbp) #,
        movq    %r13, -8(%rbp)  #,
        movq    %rdi, %rbx      # fbc, fbc
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
        movq %gs:kernel_stack,%rax      #, pfo_ret__
# 0 "" 2
#NO_APP
        incl    -8124(%rax)     # <variable>.preempt_count
        movq    32(%rdi), %r12  # <variable>.counters, tcp_ptr__
#APP
# 78 "lib/percpu_counter.c" 1
        add %gs:this_cpu_off, %r12      # this_cpu_off, tcp_ptr__
# 0 "" 2
#NO_APP
        movslq  (%r12),%r13     #* tcp_ptr__, tmp73
        movslq  %edx,%rax       # batch, batch
        addq    %rsi, %r13      # amount, count
        cmpq    %rax, %r13      # batch, count
        jge     .L27    #,
        negl    %edx    # tmp76
        movslq  %edx,%rdx       # tmp76, tmp77
        cmpq    %rdx, %r13      # tmp77, count
        jg      .L28    #,
.L27:
        movq    %rbx, %rdi      # fbc,
        call    _raw_spin_lock  #
        addq    %r13, 8(%rbx)   # count, <variable>.count
        movq    %rbx, %rdi      # fbc,
        movl    $0, (%r12)      #,* tcp_ptr__
        call    _raw_spin_unlock        #
.L29:
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
        movq %gs:kernel_stack,%rax      #, pfo_ret__
# 0 "" 2
#NO_APP
        decl    -8124(%rax)     # <variable>.preempt_count
        movq    -8136(%rax), %rax       #, D.14625
        testb   $8, %al #, D.14625
        jne     .L32    #,
.L31:
        movq    -24(%rbp), %rbx #,
        movq    -16(%rbp), %r12 #,
        movq    -8(%rbp), %r13  #,
        leave
        ret
        .p2align 4,,10
        .p2align 3
.L28:
        movl    %r13d, (%r12)   # count,*
        jmp     .L29    #
.L32:
        call    preempt_schedule        #
        .p2align 4,,6
        jmp     .L31    #
        .size   __percpu_counter_add, .-__percpu_counter_add
        .p2align 4,,15

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
This commit is contained in:
Nick Piggin 2011-01-07 17:49:19 +11:00
parent 86c8749ede
commit 3e880fb5e4
2 changed files with 23 additions and 13 deletions

View file

@ -67,13 +67,22 @@ struct dentry_stat_t dentry_stat = {
.age_limit = 45,
};
static struct percpu_counter nr_dentry __cacheline_aligned_in_smp;
static DEFINE_PER_CPU(unsigned int, nr_dentry);
#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
static int get_nr_dentry(void)
{
int i;
int sum = 0;
for_each_possible_cpu(i)
sum += per_cpu(nr_dentry, i);
return sum < 0 ? 0 : sum;
}
int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
size_t *lenp, loff_t *ppos)
{
dentry_stat.nr_dentry = percpu_counter_sum_positive(&nr_dentry);
dentry_stat.nr_dentry = get_nr_dentry();
return proc_dointvec(table, write, buffer, lenp, ppos);
}
#endif
@ -93,7 +102,7 @@ static void __d_free(struct rcu_head *head)
*/
static void d_free(struct dentry *dentry)
{
percpu_counter_dec(&nr_dentry);
this_cpu_dec(nr_dentry);
if (dentry->d_op && dentry->d_op->d_release)
dentry->d_op->d_release(dentry);
@ -981,7 +990,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
list_add(&dentry->d_u.d_child, &parent->d_subdirs);
spin_unlock(&dcache_lock);
percpu_counter_inc(&nr_dentry);
this_cpu_inc(nr_dentry);
return dentry;
}
@ -2418,8 +2427,6 @@ static void __init dcache_init(void)
{
int loop;
percpu_counter_init(&nr_dentry, 0);
/*
* A constructor could be added for stable state like the lists,
* but it is probably not worth it because of the cache nature

View file

@ -102,13 +102,17 @@ static DECLARE_RWSEM(iprune_sem);
*/
struct inodes_stat_t inodes_stat;
static struct percpu_counter nr_inodes __cacheline_aligned_in_smp;
static DEFINE_PER_CPU(unsigned int, nr_inodes);
static struct kmem_cache *inode_cachep __read_mostly;
static inline int get_nr_inodes(void)
static int get_nr_inodes(void)
{
return percpu_counter_sum_positive(&nr_inodes);
int i;
int sum = 0;
for_each_possible_cpu(i)
sum += per_cpu(nr_inodes, i);
return sum < 0 ? 0 : sum;
}
static inline int get_nr_inodes_unused(void)
@ -118,9 +122,9 @@ static inline int get_nr_inodes_unused(void)
/*
 * Estimate derived from the inode counters: the total reported by
 * get_nr_inodes() minus the count reported by get_nr_inodes_unused().
 *
 * Returns that difference, or 0 when it is not positive.
 */
int get_nr_dirty_inodes(void)
{
	/* not actually dirty inodes, but a wild approximation */
	int estimate = get_nr_inodes() - get_nr_inodes_unused();

	if (estimate <= 0)
		return 0;
	return estimate;
}
/*
@ -222,7 +226,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_fsnotify_mask = 0;
#endif
percpu_counter_inc(&nr_inodes);
this_cpu_inc(nr_inodes);
return 0;
out:
@ -264,7 +268,7 @@ void __destroy_inode(struct inode *inode)
if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
posix_acl_release(inode->i_default_acl);
#endif
percpu_counter_dec(&nr_inodes);
this_cpu_dec(nr_inodes);
}
EXPORT_SYMBOL(__destroy_inode);
@ -1646,7 +1650,6 @@ void __init inode_init(void)
SLAB_MEM_SPREAD),
init_once);
register_shrinker(&icache_shrinker);
percpu_counter_init(&nr_inodes, 0);
/* Hash may have been set up in inode_init_early */
if (!hashdist)