tmpfs: make tmpfs scalable with percpu_counter for used blocks

The current implementation of tmpfs is not scalable.  We found that
stat_lock is contended by multiple threads when we need to get a new page,
leading to useless spinning inside this spin lock.

This patch makes use of the percpu_counter library to maintain a local count
of used blocks to speed up getting and returning of pages.  Acquiring
stat_lock is therefore unnecessary for getting and returning blocks,
improving the performance of tmpfs on systems with a large number of CPUs.
On a 4-socket, 32-core NHM-EX system, we saw an improvement of 270%.

The implementation below has a slight chance of a race between threads,
causing a slight overshoot of the maximum configured blocks.  However, any
overshoot is small, and is bounded by the number of CPUs.  This happens
when the number of used blocks is slightly below the maximum configured
blocks when a thread checks the used block count, and another thread
allocates the last block before the current thread does.  This should not
be a problem for tmpfs, as the overshoot is most likely to be a few blocks
and bounded.  If a strict limit is really desired, then configure the max
blocks to be the limit less the number of CPUs in the system.

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Tim Chen 2010-08-09 17:19:05 -07:00 committed by Linus Torvalds
parent 27f5e0f694
commit 7e496299d4
2 changed files with 19 additions and 24 deletions

View file

@ -3,6 +3,7 @@
#include <linux/swap.h> #include <linux/swap.h>
#include <linux/mempolicy.h> #include <linux/mempolicy.h>
#include <linux/percpu_counter.h>
/* inode in-kernel data */ /* inode in-kernel data */
@ -23,7 +24,7 @@ struct shmem_inode_info {
struct shmem_sb_info { struct shmem_sb_info {
unsigned long max_blocks; /* How many blocks are allowed */ unsigned long max_blocks; /* How many blocks are allowed */
unsigned long free_blocks; /* How many are left for allocation */ struct percpu_counter used_blocks; /* How many are allocated */
unsigned long max_inodes; /* How many inodes are allowed */ unsigned long max_inodes; /* How many inodes are allowed */
unsigned long free_inodes; /* How many are left for allocation */ unsigned long free_inodes; /* How many are left for allocation */
spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ spinlock_t stat_lock; /* Serialize shmem_sb_info changes */

View file

@ -28,6 +28,7 @@
#include <linux/file.h> #include <linux/file.h>
#include <linux/mm.h> #include <linux/mm.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/percpu_counter.h>
#include <linux/swap.h> #include <linux/swap.h>
static struct vfsmount *shm_mnt; static struct vfsmount *shm_mnt;
@ -233,10 +234,10 @@ static void shmem_free_blocks(struct inode *inode, long pages)
{ {
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
if (sbinfo->max_blocks) { if (sbinfo->max_blocks) {
spin_lock(&sbinfo->stat_lock); percpu_counter_add(&sbinfo->used_blocks, -pages);
sbinfo->free_blocks += pages; spin_lock(&inode->i_lock);
inode->i_blocks -= pages*BLOCKS_PER_PAGE; inode->i_blocks -= pages*BLOCKS_PER_PAGE;
spin_unlock(&sbinfo->stat_lock); spin_unlock(&inode->i_lock);
} }
} }
@ -416,19 +417,17 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
if (sgp == SGP_READ) if (sgp == SGP_READ)
return shmem_swp_map(ZERO_PAGE(0)); return shmem_swp_map(ZERO_PAGE(0));
/* /*
* Test free_blocks against 1 not 0, since we have 1 data * Test used_blocks against 1 less max_blocks, since we have 1 data
* page (and perhaps indirect index pages) yet to allocate: * page (and perhaps indirect index pages) yet to allocate:
* a waste to allocate index if we cannot allocate data. * a waste to allocate index if we cannot allocate data.
*/ */
if (sbinfo->max_blocks) { if (sbinfo->max_blocks) {
spin_lock(&sbinfo->stat_lock); if (percpu_counter_compare(&sbinfo->used_blocks, (sbinfo->max_blocks - 1)) > 0)
if (sbinfo->free_blocks <= 1) {
spin_unlock(&sbinfo->stat_lock);
return ERR_PTR(-ENOSPC); return ERR_PTR(-ENOSPC);
} percpu_counter_inc(&sbinfo->used_blocks);
sbinfo->free_blocks--; spin_lock(&inode->i_lock);
inode->i_blocks += BLOCKS_PER_PAGE; inode->i_blocks += BLOCKS_PER_PAGE;
spin_unlock(&sbinfo->stat_lock); spin_unlock(&inode->i_lock);
} }
spin_unlock(&info->lock); spin_unlock(&info->lock);
@ -1387,17 +1386,16 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
shmem_swp_unmap(entry); shmem_swp_unmap(entry);
sbinfo = SHMEM_SB(inode->i_sb); sbinfo = SHMEM_SB(inode->i_sb);
if (sbinfo->max_blocks) { if (sbinfo->max_blocks) {
spin_lock(&sbinfo->stat_lock); if ((percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks) > 0) ||
if (sbinfo->free_blocks == 0 ||
shmem_acct_block(info->flags)) { shmem_acct_block(info->flags)) {
spin_unlock(&sbinfo->stat_lock);
spin_unlock(&info->lock); spin_unlock(&info->lock);
error = -ENOSPC; error = -ENOSPC;
goto failed; goto failed;
} }
sbinfo->free_blocks--; percpu_counter_inc(&sbinfo->used_blocks);
spin_lock(&inode->i_lock);
inode->i_blocks += BLOCKS_PER_PAGE; inode->i_blocks += BLOCKS_PER_PAGE;
spin_unlock(&sbinfo->stat_lock); spin_unlock(&inode->i_lock);
} else if (shmem_acct_block(info->flags)) { } else if (shmem_acct_block(info->flags)) {
spin_unlock(&info->lock); spin_unlock(&info->lock);
error = -ENOSPC; error = -ENOSPC;
@ -1791,17 +1789,16 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_type = TMPFS_MAGIC; buf->f_type = TMPFS_MAGIC;
buf->f_bsize = PAGE_CACHE_SIZE; buf->f_bsize = PAGE_CACHE_SIZE;
buf->f_namelen = NAME_MAX; buf->f_namelen = NAME_MAX;
spin_lock(&sbinfo->stat_lock);
if (sbinfo->max_blocks) { if (sbinfo->max_blocks) {
buf->f_blocks = sbinfo->max_blocks; buf->f_blocks = sbinfo->max_blocks;
buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; buf->f_bavail = buf->f_bfree =
sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks);
} }
if (sbinfo->max_inodes) { if (sbinfo->max_inodes) {
buf->f_files = sbinfo->max_inodes; buf->f_files = sbinfo->max_inodes;
buf->f_ffree = sbinfo->free_inodes; buf->f_ffree = sbinfo->free_inodes;
} }
/* else leave those fields 0 like simple_statfs */ /* else leave those fields 0 like simple_statfs */
spin_unlock(&sbinfo->stat_lock);
return 0; return 0;
} }
@ -2242,7 +2239,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
{ {
struct shmem_sb_info *sbinfo = SHMEM_SB(sb); struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
struct shmem_sb_info config = *sbinfo; struct shmem_sb_info config = *sbinfo;
unsigned long blocks;
unsigned long inodes; unsigned long inodes;
int error = -EINVAL; int error = -EINVAL;
@ -2250,9 +2246,8 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
return error; return error;
spin_lock(&sbinfo->stat_lock); spin_lock(&sbinfo->stat_lock);
blocks = sbinfo->max_blocks - sbinfo->free_blocks;
inodes = sbinfo->max_inodes - sbinfo->free_inodes; inodes = sbinfo->max_inodes - sbinfo->free_inodes;
if (config.max_blocks < blocks) if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
goto out; goto out;
if (config.max_inodes < inodes) if (config.max_inodes < inodes)
goto out; goto out;
@ -2269,7 +2264,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
error = 0; error = 0;
sbinfo->max_blocks = config.max_blocks; sbinfo->max_blocks = config.max_blocks;
sbinfo->free_blocks = config.max_blocks - blocks;
sbinfo->max_inodes = config.max_inodes; sbinfo->max_inodes = config.max_inodes;
sbinfo->free_inodes = config.max_inodes - inodes; sbinfo->free_inodes = config.max_inodes - inodes;
@ -2344,7 +2338,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
#endif #endif
spin_lock_init(&sbinfo->stat_lock); spin_lock_init(&sbinfo->stat_lock);
sbinfo->free_blocks = sbinfo->max_blocks; percpu_counter_init(&sbinfo->used_blocks, 0);
sbinfo->free_inodes = sbinfo->max_inodes; sbinfo->free_inodes = sbinfo->max_inodes;
sb->s_maxbytes = SHMEM_MAX_BYTES; sb->s_maxbytes = SHMEM_MAX_BYTES;