/*
 *
 * Definitions for the mount interface. This describes the in-kernel
 * linked list of mounted filesystems.
 *
 * Author: Marco van Wieringen <mvw@planets.elm.net>
 *
 */
#ifndef _LINUX_MOUNT_H
#define _LINUX_MOUNT_H
#include <linux/types.h>
#include <linux/list.h>
#include <linux/nodemask.h>
#include <linux/spinlock.h>
#include <asm/atomic.h>
struct super_block;
struct vfsmount;
struct dentry;
struct mnt_namespace;
#define MNT_NOSUID	0x01
#define MNT_NODEV	0x02
#define MNT_NOEXEC	0x04
#define MNT_NOATIME	0x08
#define MNT_NODIRATIME	0x10
#define MNT_RELATIME	0x20
#define MNT_READONLY	0x40	/* does the user want this to be r/o? */

#define MNT_SHRINKABLE	0x100
#define MNT_IMBALANCED_WRITE_COUNT	0x200 /* just for debugging */
#define MNT_SHARED	0x1000	/* if the vfsmount is a shared mount */
#define MNT_UNBINDABLE	0x2000	/* if the vfsmount is an unbindable mount */
#define MNT_PNODE_MASK	0x3000	/* propagation flag mask */
struct vfsmount {
	struct list_head mnt_hash;
	struct vfsmount *mnt_parent;	/* fs we are mounted on */
	struct dentry *mnt_mountpoint;	/* dentry of mountpoint */
	struct dentry *mnt_root;	/* root of the mounted tree */
	struct super_block *mnt_sb;	/* pointer to superblock */
	struct list_head mnt_mounts;	/* list of children, anchored here */
	struct list_head mnt_child;	/* and going through their mnt_child */
	int mnt_flags;
	/* 4-byte hole on 64-bit arches */
	const char *mnt_devname;	/* Name of device e.g. /dev/dsk/hda1 */
	struct list_head mnt_list;
	struct list_head mnt_expire;	/* link in fs-specific expiry list */
	struct list_head mnt_share;	/* circular list of shared mounts */
	struct list_head mnt_slave_list;/* list of slave mounts */
	struct list_head mnt_slave;	/* slave list entry */
	struct vfsmount *mnt_master;	/* slave is on master->mnt_slave_list */
	struct mnt_namespace *mnt_ns;	/* containing namespace */
	int mnt_id;			/* mount identifier */
	int mnt_group_id;		/* peer group identifier */
	/*
	 * We put mnt_count & mnt_expiry_mark at the end of struct vfsmount
	 * to keep these frequently modified fields in a separate cache line
	 * (so that reads of mnt_flags won't ping-pong on SMP machines).
	 */
	atomic_t mnt_count;
	int mnt_expiry_mark;		/* true if marked for expiry */
	int mnt_pinned;
	int mnt_ghosts;
	/*
	 * This value is not stable unless all of the mnt_writers[] spinlocks
	 * are held, and all mnt_writers[] on this mount have 0 as their ->count.
	 */
	atomic_t __mnt_writers;
};
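
/*
 * Illustrative sketch, not part of the original header: the MNT_* bits
 * defined above are tested against ->mnt_flags. This helper is
 * hypothetical and only demonstrates the flag convention.
 */
static inline int example_mnt_allows_exec(struct vfsmount *mnt)
{
	return !(mnt->mnt_flags & MNT_NOEXEC);
}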
static inline struct vfsmount *mntget(struct vfsmount *mnt)
{
	if (mnt)
		atomic_inc(&mnt->mnt_count);
	return mnt;
}
extern int mnt_want_write(struct vfsmount *mnt);
extern void mnt_drop_write(struct vfsmount *mnt);
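
/*
 * Sketch of the intended pairing (this example function is hypothetical,
 * not part of the header): take a write reference before modifying
 * anything on the mount and drop it afterwards, so that a remount to
 * read-only can reliably count outstanding writers.
 */
static inline int example_modify_on_mount(struct vfsmount *mnt)
{
	int err = mnt_want_write(mnt);	/* fails on a read-only mount */
	if (err)
		return err;
	/* ... modify a file on @mnt here ... */
	mnt_drop_write(mnt);		/* balance the mnt_want_write() */
	return 0;
}
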
extern void mntput_no_expire(struct vfsmount *mnt);
extern void mnt_pin(struct vfsmount *mnt);
extern void mnt_unpin(struct vfsmount *mnt);
extern int __mnt_is_readonly(struct vfsmount *mnt);
static inline void mntput(struct vfsmount *mnt)
{
	if (mnt) {
		mnt->mnt_expiry_mark = 0;
		mntput_no_expire(mnt);
	}
}
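
/*
 * Reference counting, sketched with a hypothetical helper (only
 * mntget()/mntput() come from this header): every mntget() must be
 * balanced by exactly one mntput().
 */
static inline void example_use_mnt(struct vfsmount *mnt)
{
	struct vfsmount *ref = mntget(mnt);	/* take a reference */
	/* ... safely use @ref here ... */
	mntput(ref);				/* drop the reference */
}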
extern struct vfsmount *do_kern_mount(const char *fstype, int flags,
				      const char *name, void *data);
struct file_system_type;
extern struct vfsmount *vfs_kern_mount(struct file_system_type *type,
				       int flags, const char *name,
				       void *data);
struct nameidata;
struct path;
extern int do_add_mount(struct vfsmount *newmnt, struct path *path,
			int mnt_flags, struct list_head *fslist);
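
/*
 * Hedged sketch combining the helpers above; the wrapper is hypothetical
 * and assumes IS_ERR()/PTR_ERR() from <linux/err.h>: create a mount with
 * do_kern_mount() and graft it at @path with do_add_mount(). A NULL
 * @fslist keeps the new mount off any expiry list; on failure,
 * do_add_mount() is assumed to drop the mount reference itself, as in
 * this era's fs/namespace.c.
 */
static inline int example_mount_at(const char *fstype, struct path *path)
{
	struct vfsmount *mnt = do_kern_mount(fstype, 0, fstype, NULL);

	if (IS_ERR(mnt))
		return PTR_ERR(mnt);
	return do_add_mount(mnt, path, 0, NULL);
}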
extern void mark_mounts_for_expiry(struct list_head *mounts);
extern spinlock_t vfsmount_lock;
extern dev_t name_to_dev_t(char *name);
#endif /* _LINUX_MOUNT_H */