fs: allow for more than 2^31 files

Andrew,

Could you please review this patch? You are probably the right person to
take it, because it crosses the fs and net trees.

Note: /proc/sys/fs/file-nr is a read-only file, so this patch doesn't
depend on the previous patch (sysctl: fix min/max handling in
__do_proc_doulongvec_minmax()).

Thanks !

[PATCH V4] fs: allow for more than 2^31 files

Robin Holt tried to boot a 16TB system and found af_unix was overflowing
a 32-bit value:

<quote>

We were seeing a failure which prevented boot.  The kernel was incapable
of creating either a named pipe or unix domain socket.  This comes down
to a common kernel function called unix_create1() which does:

        atomic_inc(&unix_nr_socks);
        if (atomic_read(&unix_nr_socks) > 2 * get_max_files())
                goto out;

The function get_max_files() is a simple return of files_stat.max_files.
files_stat.max_files is a signed integer and is computed in
fs/file_table.c's files_init().

        n = (mempages * (PAGE_SIZE / 1024)) / 10;
        files_stat.max_files = n;

In our case, mempages (total_ram_pages) is approx 3,758,096,384
(0xe0000000).  That leaves max_files at approximately 1,503,238,553.
This causes 2 * get_max_files() to integer overflow.

</quote>

The fix is to let /proc/sys/fs/file-nr & /proc/sys/fs/file-max use long
integers, and change af_unix to use an atomic_long_t instead of
atomic_t.

get_max_files() is changed to return an unsigned long.
get_nr_files() is changed to return a long.

unix_nr_socks is changed from atomic_t to atomic_long_t, although this is
not strictly needed to address Robin's problem.

Before patch (on a 64-bit kernel):
# echo 2147483648 >/proc/sys/fs/file-max
# cat /proc/sys/fs/file-max
-18446744071562067968

After patch:
# echo 2147483648 >/proc/sys/fs/file-max
# cat /proc/sys/fs/file-max
2147483648
# cat /proc/sys/fs/file-nr
704     0       2147483648

Reported-by: Robin Holt <holt@sgi.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: David Miller <davem@davemloft.net>
Reviewed-by: Robin Holt <holt@sgi.com>
Tested-by: Robin Holt <holt@sgi.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
This commit is contained in:
Eric Dumazet 2010-10-05 09:32:55 +02:00 committed by Al Viro
parent fde214d414
commit 7e360c38ab
4 changed files with 21 additions and 24 deletions

View file

@ -60,7 +60,7 @@ static inline void file_free(struct file *f)
/* /*
* Return the total number of open files in the system * Return the total number of open files in the system
*/ */
static int get_nr_files(void) static long get_nr_files(void)
{ {
return percpu_counter_read_positive(&nr_files); return percpu_counter_read_positive(&nr_files);
} }
@ -68,7 +68,7 @@ static int get_nr_files(void)
/* /*
* Return the maximum number of open files in the system * Return the maximum number of open files in the system
*/ */
int get_max_files(void) unsigned long get_max_files(void)
{ {
return files_stat.max_files; return files_stat.max_files;
} }
@ -82,7 +82,7 @@ int proc_nr_files(ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos) void __user *buffer, size_t *lenp, loff_t *ppos)
{ {
files_stat.nr_files = get_nr_files(); files_stat.nr_files = get_nr_files();
return proc_dointvec(table, write, buffer, lenp, ppos); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
} }
#else #else
int proc_nr_files(ctl_table *table, int write, int proc_nr_files(ctl_table *table, int write,
@ -105,7 +105,7 @@ int proc_nr_files(ctl_table *table, int write,
struct file *get_empty_filp(void) struct file *get_empty_filp(void)
{ {
const struct cred *cred = current_cred(); const struct cred *cred = current_cred();
static int old_max; static long old_max;
struct file * f; struct file * f;
/* /*
@ -140,8 +140,7 @@ struct file *get_empty_filp(void)
over: over:
/* Ran out of filps - report that */ /* Ran out of filps - report that */
if (get_nr_files() > old_max) { if (get_nr_files() > old_max) {
printk(KERN_INFO "VFS: file-max limit %d reached\n", pr_info("VFS: file-max limit %lu reached\n", get_max_files());
get_max_files());
old_max = get_nr_files(); old_max = get_nr_files();
} }
goto fail; goto fail;
@ -487,7 +486,7 @@ void mark_files_ro(struct super_block *sb)
void __init files_init(unsigned long mempages) void __init files_init(unsigned long mempages)
{ {
int n; unsigned long n;
filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
@ -498,9 +497,7 @@ void __init files_init(unsigned long mempages)
*/ */
n = (mempages * (PAGE_SIZE / 1024)) / 10; n = (mempages * (PAGE_SIZE / 1024)) / 10;
files_stat.max_files = n; files_stat.max_files = max_t(unsigned long, n, NR_FILE);
if (files_stat.max_files < NR_FILE)
files_stat.max_files = NR_FILE;
files_defer_init(); files_defer_init();
lg_lock_init(files_lglock); lg_lock_init(files_lglock);
percpu_counter_init(&nr_files, 0); percpu_counter_init(&nr_files, 0);

View file

@ -34,9 +34,9 @@
/* And dynamically-tunable limits and defaults: */ /* And dynamically-tunable limits and defaults: */
struct files_stat_struct { struct files_stat_struct {
int nr_files; /* read only */ unsigned long nr_files; /* read only */
int nr_free_files; /* read only */ unsigned long nr_free_files; /* read only */
int max_files; /* tunable */ unsigned long max_files; /* tunable */
}; };
struct inodes_stat_t { struct inodes_stat_t {
@ -400,7 +400,7 @@ extern void __init inode_init_early(void);
extern void __init files_init(unsigned long); extern void __init files_init(unsigned long);
extern struct files_stat_struct files_stat; extern struct files_stat_struct files_stat;
extern int get_max_files(void); extern unsigned long get_max_files(void);
extern int sysctl_nr_open; extern int sysctl_nr_open;
extern struct inodes_stat_t inodes_stat; extern struct inodes_stat_t inodes_stat;
extern int leases_enable, lease_break_time; extern int leases_enable, lease_break_time;

View file

@ -1352,16 +1352,16 @@ static struct ctl_table fs_table[] = {
{ {
.procname = "file-nr", .procname = "file-nr",
.data = &files_stat, .data = &files_stat,
.maxlen = 3*sizeof(int), .maxlen = sizeof(files_stat),
.mode = 0444, .mode = 0444,
.proc_handler = proc_nr_files, .proc_handler = proc_nr_files,
}, },
{ {
.procname = "file-max", .procname = "file-max",
.data = &files_stat.max_files, .data = &files_stat.max_files,
.maxlen = sizeof(int), .maxlen = sizeof(files_stat.max_files),
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec, .proc_handler = proc_doulongvec_minmax,
}, },
{ {
.procname = "nr_open", .procname = "nr_open",

View file

@ -117,7 +117,7 @@
static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
static DEFINE_SPINLOCK(unix_table_lock); static DEFINE_SPINLOCK(unix_table_lock);
static atomic_t unix_nr_socks = ATOMIC_INIT(0); static atomic_long_t unix_nr_socks;
#define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE]) #define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE])
@ -360,13 +360,13 @@ static void unix_sock_destructor(struct sock *sk)
if (u->addr) if (u->addr)
unix_release_addr(u->addr); unix_release_addr(u->addr);
atomic_dec(&unix_nr_socks); atomic_long_dec(&unix_nr_socks);
local_bh_disable(); local_bh_disable();
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
local_bh_enable(); local_bh_enable();
#ifdef UNIX_REFCNT_DEBUG #ifdef UNIX_REFCNT_DEBUG
printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk,
atomic_read(&unix_nr_socks)); atomic_long_read(&unix_nr_socks));
#endif #endif
} }
@ -606,8 +606,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock)
struct sock *sk = NULL; struct sock *sk = NULL;
struct unix_sock *u; struct unix_sock *u;
atomic_inc(&unix_nr_socks); atomic_long_inc(&unix_nr_socks);
if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
goto out; goto out;
sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto); sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
@ -632,7 +632,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock)
unix_insert_socket(unix_sockets_unbound, sk); unix_insert_socket(unix_sockets_unbound, sk);
out: out:
if (sk == NULL) if (sk == NULL)
atomic_dec(&unix_nr_socks); atomic_long_dec(&unix_nr_socks);
else { else {
local_bh_disable(); local_bh_disable();
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);