From 6c0f3af72cb1622a66962a1180c36ef8c41be8e2 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 16 Nov 2010 11:14:34 -0800 Subject: [PATCH] ceph: add dir_layout to inode Add a ceph_dir_layout to the inode, and calculate dentry hash values based on the parent directory's specified dir_hash function. This is needed because the old default Linux dcache hash function is extremely weak and leads to a poor distribution of files among dir fragments. Signed-off-by: Sage Weil --- fs/ceph/dir.c | 20 ++++++++++++++++++++ fs/ceph/export.c | 2 +- fs/ceph/inode.c | 2 ++ fs/ceph/super.h | 2 ++ include/linux/ceph/ceph_fs.h | 16 +++++++++++++--- net/ceph/ceph_hash.c | 3 +++ 6 files changed, 41 insertions(+), 4 deletions(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index d902948a90d8..562f9884a4d9 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1216,6 +1216,26 @@ void ceph_dentry_lru_del(struct dentry *dn) } } +/* + * Return name hash for a given dentry. This is dependent on + * the parent directory's hash function. 
+ */ +unsigned ceph_dentry_hash(struct dentry *dn) +{ + struct inode *dir = dn->d_parent->d_inode; + struct ceph_inode_info *dci = ceph_inode(dir); + + switch (dci->i_dir_layout.dl_dir_hash) { + case 0: /* for backward compat */ + case CEPH_STR_HASH_LINUX: + return dn->d_name.hash; + + default: + return ceph_str_hash(dci->i_dir_layout.dl_dir_hash, + dn->d_name.name, dn->d_name.len); + } +} + const struct file_operations ceph_dir_fops = { .read = ceph_read_dir, .readdir = ceph_readdir, diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 2297d9426992..e41056174bf8 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -59,7 +59,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, dout("encode_fh %p connectable\n", dentry); cfh->ino = ceph_ino(dentry->d_inode); cfh->parent_ino = ceph_ino(parent->d_inode); - cfh->parent_name_hash = parent->d_name.hash; + cfh->parent_name_hash = ceph_dentry_hash(parent); *max_len = connected_handle_length; type = 2; } else if (*max_len >= handle_length) { diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index bf1286588f26..045283ce4413 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -297,6 +297,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_release_count = 0; ci->i_symlink = NULL; + memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); + ci->i_fragtree = RB_ROOT; mutex_init(&ci->i_fragtree_mutex); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 7f01728a4657..6e0826695112 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -239,6 +239,7 @@ struct ceph_inode_info { unsigned i_ceph_flags; unsigned long i_release_count; + struct ceph_dir_layout i_dir_layout; struct ceph_file_layout i_layout; char *i_symlink; @@ -768,6 +769,7 @@ extern void ceph_dentry_lru_add(struct dentry *dn); extern void ceph_dentry_lru_touch(struct dentry *dn); extern void ceph_dentry_lru_del(struct dentry *dn); extern void ceph_invalidate_dentry_lease(struct dentry *dentry); +extern unsigned 
ceph_dentry_hash(struct dentry *dn); /* * our d_ops vary depending on whether the inode is live, diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index c3c74aef289d..09dcc0c2ffd5 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -43,6 +43,10 @@ #define CEPH_FEATURE_NOSRCADDR (1<<1) #define CEPH_FEATURE_MONCLOCKCHECK (1<<2) #define CEPH_FEATURE_FLOCK (1<<3) +#define CEPH_FEATURE_SUBSCRIBE2 (1<<4) +#define CEPH_FEATURE_MONNAMES (1<<5) +#define CEPH_FEATURE_RECONNECT_SEQ (1<<6) +#define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) /* @@ -55,10 +59,10 @@ struct ceph_file_layout { __le32 fl_stripe_count; /* over this many objects */ __le32 fl_object_size; /* until objects are this big, then move to new objects */ - __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */ + __le32 fl_cas_hash; /* UNUSED. 0 = none; 1 = sha256 */ /* pg -> disk layout */ - __le32 fl_object_stripe_unit; /* for per-object parity, if any */ + __le32 fl_object_stripe_unit; /* UNUSED. 
for per-object parity, if any */ /* object -> pg layout */ __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */ @@ -69,6 +73,12 @@ struct ceph_file_layout { int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); +struct ceph_dir_layout { + __u8 dl_dir_hash; /* see ceph_hash.h for ids */ + __u8 dl_unused1; + __u16 dl_unused2; + __u32 dl_unused3; +} __attribute__ ((packed)); /* crypto algorithms */ #define CEPH_CRYPTO_NONE 0x0 @@ -457,7 +467,7 @@ struct ceph_mds_reply_inode { struct ceph_timespec rctime; struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */ } __attribute__ ((packed)); -/* followed by frag array, then symlink string, then xattr blob */ +/* followed by frag array, symlink string, dir layout, xattr blob */ /* reply_lease follows dname, and reply_inode */ struct ceph_mds_reply_lease { diff --git a/net/ceph/ceph_hash.c b/net/ceph/ceph_hash.c index 815ef8826796..0a1b53bce76d 100644 --- a/net/ceph/ceph_hash.c +++ b/net/ceph/ceph_hash.c @@ -1,5 +1,6 @@ #include <linux/ceph/types.h> +#include <linux/module.h> /* * Robert Jenkin's hash function. @@ -104,6 +105,7 @@ unsigned ceph_str_hash(int type, const char *s, unsigned len) return -1; } } +EXPORT_SYMBOL(ceph_str_hash); const char *ceph_str_hash_name(int type) { @@ -116,3 +118,4 @@ const char *ceph_str_hash_name(int type) return "unknown"; } } +EXPORT_SYMBOL(ceph_str_hash_name);