From 6c0f3af72cb1622a66962a1180c36ef8c41be8e2 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 16 Nov 2010 11:14:34 -0800 Subject: [PATCH 1/9] ceph: add dir_layout to inode Add a ceph_dir_layout to the inode, and calculate dentry hash values based on the parent directory's specified dir_hash function. This is needed because the old default Linux dcache hash function is extremely weak and leads to a poor distribution of files among dir fragments. Signed-off-by: Sage Weil --- fs/ceph/dir.c | 20 ++++++++++++++++++++ fs/ceph/export.c | 2 +- fs/ceph/inode.c | 2 ++ fs/ceph/super.h | 2 ++ include/linux/ceph/ceph_fs.h | 16 +++++++++++++--- net/ceph/ceph_hash.c | 3 +++ 6 files changed, 41 insertions(+), 4 deletions(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index d902948a90d8..562f9884a4d9 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1216,6 +1216,26 @@ void ceph_dentry_lru_del(struct dentry *dn) } } +/* + * Return name hash for a given dentry. This is dependent on + * the parent directory's hash function. 
+ */ +unsigned ceph_dentry_hash(struct dentry *dn) +{ + struct inode *dir = dn->d_parent->d_inode; + struct ceph_inode_info *dci = ceph_inode(dir); + + switch (dci->i_dir_layout.dl_dir_hash) { + case 0: /* for backward compat */ + case CEPH_STR_HASH_LINUX: + return dn->d_name.hash; + + default: + return ceph_str_hash(dci->i_dir_layout.dl_dir_hash, + dn->d_name.name, dn->d_name.len); + } +} + const struct file_operations ceph_dir_fops = { .read = ceph_read_dir, .readdir = ceph_readdir, diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 2297d9426992..e41056174bf8 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -59,7 +59,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len, dout("encode_fh %p connectable\n", dentry); cfh->ino = ceph_ino(dentry->d_inode); cfh->parent_ino = ceph_ino(parent->d_inode); - cfh->parent_name_hash = parent->d_name.hash; + cfh->parent_name_hash = ceph_dentry_hash(parent); *max_len = connected_handle_length; type = 2; } else if (*max_len >= handle_length) { diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index bf1286588f26..045283ce4413 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -297,6 +297,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_release_count = 0; ci->i_symlink = NULL; + memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); + ci->i_fragtree = RB_ROOT; mutex_init(&ci->i_fragtree_mutex); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 7f01728a4657..6e0826695112 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -239,6 +239,7 @@ struct ceph_inode_info { unsigned i_ceph_flags; unsigned long i_release_count; + struct ceph_dir_layout i_dir_layout; struct ceph_file_layout i_layout; char *i_symlink; @@ -768,6 +769,7 @@ extern void ceph_dentry_lru_add(struct dentry *dn); extern void ceph_dentry_lru_touch(struct dentry *dn); extern void ceph_dentry_lru_del(struct dentry *dn); extern void ceph_invalidate_dentry_lease(struct dentry *dentry); +extern unsigned 
ceph_dentry_hash(struct dentry *dn); /* * our d_ops vary depending on whether the inode is live, diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index c3c74aef289d..09dcc0c2ffd5 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -43,6 +43,10 @@ #define CEPH_FEATURE_NOSRCADDR (1<<1) #define CEPH_FEATURE_MONCLOCKCHECK (1<<2) #define CEPH_FEATURE_FLOCK (1<<3) +#define CEPH_FEATURE_SUBSCRIBE2 (1<<4) +#define CEPH_FEATURE_MONNAMES (1<<5) +#define CEPH_FEATURE_RECONNECT_SEQ (1<<6) +#define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) /* @@ -55,10 +59,10 @@ struct ceph_file_layout { __le32 fl_stripe_count; /* over this many objects */ __le32 fl_object_size; /* until objects are this big, then move to new objects */ - __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */ + __le32 fl_cas_hash; /* UNUSED. 0 = none; 1 = sha256 */ /* pg -> disk layout */ - __le32 fl_object_stripe_unit; /* for per-object parity, if any */ + __le32 fl_object_stripe_unit; /* UNUSED. 
for per-object parity, if any */ /* object -> pg layout */ __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */ @@ -69,6 +73,12 @@ struct ceph_file_layout { int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); +struct ceph_dir_layout { + __u8 dl_dir_hash; /* see ceph_hash.h for ids */ + __u8 dl_unused1; + __u16 dl_unused2; + __u32 dl_unused3; +} __attribute__ ((packed)); /* crypto algorithms */ #define CEPH_CRYPTO_NONE 0x0 @@ -457,7 +467,7 @@ struct ceph_mds_reply_inode { struct ceph_timespec rctime; struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */ } __attribute__ ((packed)); -/* followed by frag array, then symlink string, then xattr blob */ +/* followed by frag array, symlink string, dir layout, xattr blob */ /* reply_lease follows dname, and reply_inode */ struct ceph_mds_reply_lease { diff --git a/net/ceph/ceph_hash.c b/net/ceph/ceph_hash.c index 815ef8826796..0a1b53bce76d 100644 --- a/net/ceph/ceph_hash.c +++ b/net/ceph/ceph_hash.c @@ -1,5 +1,6 @@ #include +#include /* * Robert Jenkin's hash function. @@ -104,6 +105,7 @@ unsigned ceph_str_hash(int type, const char *s, unsigned len) return -1; } } +EXPORT_SYMBOL(ceph_str_hash); const char *ceph_str_hash_name(int type) { @@ -116,3 +118,4 @@ const char *ceph_str_hash_name(int type) return "unknown"; } } +EXPORT_SYMBOL(ceph_str_hash_name); From 14303d20f3ae3e6ab626c77a4aac202b3bafd377 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 14 Dec 2010 17:37:52 -0800 Subject: [PATCH 2/9] ceph: implement DIRLAYOUTHASH feature to get dir layout from MDS This implements the DIRLAYOUTHASH protocol feature, which passes the dir layout over the wire from the MDS. This gives the client knowledge of the correct hash function to use for mapping dentries among dir fragments. Note that if this feature is _not_ present on the client but is on the MDS, the client may misdirect requests. This will result in a forward and degrade performance. 
It may also result in inaccurate NFS filehandle generation, which will prevent fh resolution when the inode is not present in the client cache and the parent directories have been fragmented. Signed-off-by: Sage Weil --- fs/ceph/inode.c | 2 ++ fs/ceph/mds_client.c | 42 +++++++++++++++++++++++++++--------------- fs/ceph/mds_client.h | 1 + fs/ceph/super.c | 3 ++- 4 files changed, 32 insertions(+), 16 deletions(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 045283ce4413..e791fa34b23d 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -682,6 +682,8 @@ static int fill_inode(struct inode *inode, inode->i_op = &ceph_dir_iops; inode->i_fop = &ceph_dir_fops; + ci->i_dir_layout = iinfo->dir_layout; + ci->i_files = le64_to_cpu(info->files); ci->i_subdirs = le64_to_cpu(info->subdirs); ci->i_rbytes = le64_to_cpu(info->rbytes); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 38800eaa81d0..9be29b06a2d9 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -60,7 +60,8 @@ static const struct ceph_connection_operations mds_con_ops; * parse individual inode info */ static int parse_reply_info_in(void **p, void *end, - struct ceph_mds_reply_info_in *info) + struct ceph_mds_reply_info_in *info, + int features) { int err = -EIO; @@ -74,6 +75,12 @@ static int parse_reply_info_in(void **p, void *end, info->symlink = *p; *p += info->symlink_len; + if (features & CEPH_FEATURE_DIRLAYOUTHASH) + ceph_decode_copy_safe(p, end, &info->dir_layout, + sizeof(info->dir_layout), bad); + else + memset(&info->dir_layout, 0, sizeof(info->dir_layout)); + ceph_decode_32_safe(p, end, info->xattr_len, bad); ceph_decode_need(p, end, info->xattr_len, bad); info->xattr_data = *p; @@ -88,12 +95,13 @@ static int parse_reply_info_in(void **p, void *end, * target inode. 
*/ static int parse_reply_info_trace(void **p, void *end, - struct ceph_mds_reply_info_parsed *info) + struct ceph_mds_reply_info_parsed *info, + int features) { int err; if (info->head->is_dentry) { - err = parse_reply_info_in(p, end, &info->diri); + err = parse_reply_info_in(p, end, &info->diri, features); if (err < 0) goto out_bad; @@ -114,7 +122,7 @@ static int parse_reply_info_trace(void **p, void *end, } if (info->head->is_target) { - err = parse_reply_info_in(p, end, &info->targeti); + err = parse_reply_info_in(p, end, &info->targeti, features); if (err < 0) goto out_bad; } @@ -134,7 +142,8 @@ static int parse_reply_info_trace(void **p, void *end, * parse readdir results */ static int parse_reply_info_dir(void **p, void *end, - struct ceph_mds_reply_info_parsed *info) + struct ceph_mds_reply_info_parsed *info, + int features) { u32 num, i = 0; int err; @@ -182,7 +191,7 @@ static int parse_reply_info_dir(void **p, void *end, *p += sizeof(struct ceph_mds_reply_lease); /* inode */ - err = parse_reply_info_in(p, end, &info->dir_in[i]); + err = parse_reply_info_in(p, end, &info->dir_in[i], features); if (err < 0) goto out_bad; i++; @@ -205,7 +214,8 @@ static int parse_reply_info_dir(void **p, void *end, * parse fcntl F_GETLK results */ static int parse_reply_info_filelock(void **p, void *end, - struct ceph_mds_reply_info_parsed *info) + struct ceph_mds_reply_info_parsed *info, + int features) { if (*p + sizeof(*info->filelock_reply) > end) goto bad; @@ -225,19 +235,21 @@ static int parse_reply_info_filelock(void **p, void *end, * parse extra results */ static int parse_reply_info_extra(void **p, void *end, - struct ceph_mds_reply_info_parsed *info) + struct ceph_mds_reply_info_parsed *info, + int features) { if (info->head->op == CEPH_MDS_OP_GETFILELOCK) - return parse_reply_info_filelock(p, end, info); + return parse_reply_info_filelock(p, end, info, features); else - return parse_reply_info_dir(p, end, info); + return parse_reply_info_dir(p, end, info, 
features); } /* * parse entire mds reply */ static int parse_reply_info(struct ceph_msg *msg, - struct ceph_mds_reply_info_parsed *info) + struct ceph_mds_reply_info_parsed *info, + int features) { void *p, *end; u32 len; @@ -250,7 +262,7 @@ static int parse_reply_info(struct ceph_msg *msg, /* trace */ ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { - err = parse_reply_info_trace(&p, p+len, info); + err = parse_reply_info_trace(&p, p+len, info, features); if (err < 0) goto out_bad; } @@ -258,7 +270,7 @@ static int parse_reply_info(struct ceph_msg *msg, /* extra */ ceph_decode_32_safe(&p, end, len, bad); if (len > 0) { - err = parse_reply_info_extra(&p, p+len, info); + err = parse_reply_info_extra(&p, p+len, info, features); if (err < 0) goto out_bad; } @@ -654,7 +666,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, } else { /* dir + name */ inode = dir; - hash = req->r_dentry->d_name.hash; + hash = ceph_dentry_hash(req->r_dentry); is_hash = true; } } @@ -2101,7 +2113,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) dout("handle_reply tid %lld result %d\n", tid, result); rinfo = &req->r_reply_info; - err = parse_reply_info(msg, rinfo); + err = parse_reply_info(msg, rinfo, session->s_con.peer_features); mutex_unlock(&mdsc->mutex); mutex_lock(&session->s_mutex); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index aabe563b54db..f8f27f6eaa90 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -35,6 +35,7 @@ struct ceph_cap; */ struct ceph_mds_reply_info_in { struct ceph_mds_reply_inode *in; + struct ceph_dir_layout dir_layout; u32 symlink_len; char *symlink; u32 xattr_len; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 08b460ae0539..1417f3f3e246 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -428,7 +428,8 @@ struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, goto fail; } fsc->client->extra_mon_dispatch = extra_mon_dispatch; - fsc->client->supported_features 
|= CEPH_FEATURE_FLOCK; + fsc->client->supported_features |= CEPH_FEATURE_FLOCK | + CEPH_FEATURE_DIRLAYOUTHASH; fsc->client->monc.want_mdsmap = 1; fsc->mount_options = fsopt; From 4af25fdda6943f311a63034f80933e4d6d6e3a19 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 2 Nov 2010 13:41:47 -0700 Subject: [PATCH 3/9] ceph: drop redundant r_mds field The r_mds field is redundant, since we can find the same information at r_session->s_mds, and when r_session is NULL then r_mds is meaningless. Signed-off-by: Sage Weil --- fs/ceph/debugfs.c | 9 ++++++--- fs/ceph/mds_client.c | 8 +++++--- fs/ceph/mds_client.h | 1 - 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 7ae1b3d55b58..08f65faac112 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -60,10 +60,13 @@ static int mdsc_show(struct seq_file *s, void *p) for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) { req = rb_entry(rp, struct ceph_mds_request, r_node); - if (req->r_request) - seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds); - else + if (req->r_request && req->r_session) + seq_printf(s, "%lld\tmds%d\t", req->r_tid, + req->r_session->s_mds); + else if (!req->r_request) seq_printf(s, "%lld\t(no request)\t", req->r_tid); + else + seq_printf(s, "%lld\t(no session)\t", req->r_tid); seq_printf(s, "%s", ceph_mds_op_name(req->r_op)); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 9be29b06a2d9..e22e8b41d572 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1705,7 +1705,6 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, struct ceph_msg *msg; int flags = 0; - req->r_mds = mds; req->r_attempts++; if (req->r_inode) { struct ceph_cap *cap = @@ -2068,8 +2067,11 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) goto out; } else { struct ceph_inode_info *ci = ceph_inode(req->r_inode); - struct ceph_cap *cap = - ceph_get_cap_for_mds(ci, req->r_mds);; + struct ceph_cap 
*cap = NULL; + + if (req->r_session) + cap = ceph_get_cap_for_mds(ci, + req->r_session->s_mds); dout("already using auth"); if ((!cap || cap != ci->i_auth_cap) || diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index f8f27f6eaa90..4e3a9cc0bba6 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -166,7 +166,6 @@ struct ceph_mds_request { struct ceph_mds_client *r_mdsc; int r_op; /* mds op code */ - int r_mds; /* operation on what? */ struct inode *r_inode; /* arg1 */ From dc69e2e9fcd7c613eb744ea3b9c4ee9ca554e822 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 2 Nov 2010 13:49:00 -0700 Subject: [PATCH 4/9] ceph: associate requests with opening sessions Associate requests with sessions that aren't yet open. This makes the debugfs mdsc request list more informative. Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index e22e8b41d572..509339ceef72 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1791,6 +1791,8 @@ static int __do_request(struct ceph_mds_client *mdsc, goto finish; } + put_request_session(req); + mds = __choose_mds(mdsc, req); if (mds < 0 || ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { @@ -1808,6 +1810,8 @@ static int __do_request(struct ceph_mds_client *mdsc, goto finish; } } + req->r_session = get_session(session); + dout("do_request mds%d session %p state %s\n", mds, session, session_state_name(session->s_state)); if (session->s_state != CEPH_MDS_SESSION_OPEN && @@ -1820,7 +1824,6 @@ static int __do_request(struct ceph_mds_client *mdsc, } /* send request */ - req->r_session = get_session(session); req->r_resend_mds = -1; /* forget any previous mds hint */ if (req->r_request_started == 0) /* note request start time */ @@ -1874,7 +1877,6 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds) if (req->r_session && req->r_session->s_mds == mds) { dout(" kicking tid 
%llu\n", req->r_tid); - put_request_session(req); __do_request(mdsc, req); } } From 582c86e69045f37da8be445c265f72a7a73b18c6 Mon Sep 17 00:00:00 2001 From: Tracey Dent Date: Tue, 14 Dec 2010 19:32:37 -0500 Subject: [PATCH 5/9] ceph: Makefile: Remove unnecessary code Remove the if and else conditional because the code is in mainline and there is no need for it to be there. Also, changed Makefile to use -y instead of -objs because -objs is deprecated and not mentioned in Documentation/kbuild/makefiles.txt. Signed-off-by: Tracey Dent Signed-off-by: Sage Weil --- fs/ceph/Makefile | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index 9e6c4f2e8ff1..bd352125e829 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -2,31 +2,10 @@ # Makefile for CEPH filesystem. # -ifneq ($(KERNELRELEASE),) - obj-$(CONFIG_CEPH_FS) += ceph.o -ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ +ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ export.o caps.o snap.o xattr.o \ mds_client.o mdsmap.o strings.o ceph_frag.o \ debugfs.o -else -#Otherwise we were called directly from the command -# line; invoke the kernel build system. - -KERNELDIR ?= /lib/modules/$(shell uname -r)/build -PWD := $(shell pwd) - -default: all - -all: - $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules - -modules_install: - $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install - -clean: - $(MAKE) -C $(KERNELDIR) M=$(PWD) clean - -endif From b0aee3516d84c05240065a53f238ba7a718f56b9 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Fri, 24 Dec 2010 23:01:12 +0100 Subject: [PATCH 6/9] ceph: Always free allocated memory in osdmap_decode() Always free memory allocated to 'pi' in net/ceph/osdmap.c::osdmap_decode(). 
Signed-off-by: Jesper Juhl Signed-off-by: Sage Weil --- net/ceph/osdmap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index d73f3f6efa36..71603ac3dff5 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -605,8 +605,10 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) goto bad; } err = __decode_pool(p, end, pi); - if (err < 0) + if (err < 0) { + kfree(pi); goto bad; + } __insert_pg_pool(&map->pg_pools, pi); } From 01e6acc4ea4c284c44bfb3d46c76f4ae580c6435 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 3 Jan 2011 14:49:45 +0100 Subject: [PATCH 7/9] ceph: fsc->*_wq's aren't used in memory reclaim path fsc->*_wq's aren't depended upon during memory reclaim. Convert to alloc_workqueue() w/o WQ_MEM_RECLAIM. Signed-off-by: Tejun Heo Cc: Sage Weil Cc: ceph-devel@vger.kernel.org Signed-off-by: Sage Weil --- fs/ceph/super.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 1417f3f3e246..bf6f0f34082a 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -444,13 +444,17 @@ struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, goto fail_client; err = -ENOMEM; - fsc->wb_wq = create_workqueue("ceph-writeback"); + /* + * The number of concurrent works can be high but they don't need + * to be processed in parallel, limit concurrency. 
+ */ + fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1); if (fsc->wb_wq == NULL) goto fail_bdi; - fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid"); + fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1); if (fsc->pg_inv_wq == NULL) goto fail_wb_wq; - fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc"); + fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1); if (fsc->trunc_wq == NULL) goto fail_pg_inv_wq; From f363e45fd1184219b472ea549cb7e192e24ef4d2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 3 Jan 2011 14:49:46 +0100 Subject: [PATCH 8/9] net/ceph: make ceph_msgr_wq non-reentrant ceph messenger code does a rather complex dance around multithread workqueue to make sure the same work item isn't executed concurrently on different CPUs. This restriction can be provided by workqueue with WQ_NON_REENTRANT. Make ceph_msgr_wq non-reentrant workqueue with the default concurrency level and remove the QUEUED/BUSY logic. * This removes backoff handling in con_work() but it couldn't reliably block execution of con_work() to begin with - queue_con() can be called after the work started but before BUSY is set. It seems that it was an optimization for a rather cold path and can be safely removed. * The number of concurrent work items is bound by the number of connections and connections are independent from each other. With the default concurrency level, different connections will be executed independently. 
Signed-off-by: Tejun Heo Cc: Sage Weil Cc: ceph-devel@vger.kernel.org Signed-off-by: Sage Weil --- include/linux/ceph/messenger.h | 5 ---- net/ceph/messenger.c | 46 ++-------------------------------- 2 files changed, 2 insertions(+), 49 deletions(-) diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index a108b425fee2..c3011beac30d 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -110,17 +110,12 @@ struct ceph_msg_pos { /* * ceph_connection state bit flags - * - * QUEUED and BUSY are used together to ensure that only a single - * thread is currently opening, reading or writing data to the socket. */ #define LOSSYTX 0 /* we can close channel or drop messages on errors */ #define CONNECTING 1 #define NEGOTIATING 2 #define KEEPALIVE_PENDING 3 #define WRITE_PENDING 4 /* we have data ready to send */ -#define QUEUED 5 /* there is work queued on this connection */ -#define BUSY 6 /* work is being done */ #define STANDBY 8 /* no outgoing messages, socket closed. we keep * the ceph_connection around to maintain shared * state with the peer. */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index b6ff4a1519ab..dff633d62e5b 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -96,7 +96,7 @@ struct workqueue_struct *ceph_msgr_wq; int ceph_msgr_init(void) { - ceph_msgr_wq = create_workqueue("ceph-msgr"); + ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0); if (!ceph_msgr_wq) { pr_err("msgr_init failed to create workqueue\n"); return -ENOMEM; @@ -1920,20 +1920,6 @@ static int try_read(struct ceph_connection *con) /* * Atomically queue work on a connection. Bump @con reference to * avoid races with connection teardown. - * - * There is some trickery going on with QUEUED and BUSY because we - * only want a _single_ thread operating on each connection at any - * point in time, but we want to use all available CPUs. - * - * The worker thread only proceeds if it can atomically set BUSY. 
It - * clears QUEUED and does it's thing. When it thinks it's done, it - * clears BUSY, then rechecks QUEUED.. if it's set again, it loops - * (tries again to set BUSY). - * - * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we - * try to queue work. If that fails (work is already queued, or BUSY) - * we give up (work also already being done or is queued) but leave QUEUED - * set so that the worker thread will loop if necessary. */ static void queue_con(struct ceph_connection *con) { @@ -1948,11 +1934,7 @@ static void queue_con(struct ceph_connection *con) return; } - set_bit(QUEUED, &con->state); - if (test_bit(BUSY, &con->state)) { - dout("queue_con %p - already BUSY\n", con); - con->ops->put(con); - } else if (!queue_work(ceph_msgr_wq, &con->work.work)) { + if (!queue_delayed_work(ceph_msgr_wq, &con->work, 0)) { dout("queue_con %p - already queued\n", con); con->ops->put(con); } else { @@ -1967,15 +1949,6 @@ static void con_work(struct work_struct *work) { struct ceph_connection *con = container_of(work, struct ceph_connection, work.work); - int backoff = 0; - -more: - if (test_and_set_bit(BUSY, &con->state) != 0) { - dout("con_work %p BUSY already set\n", con); - goto out; - } - dout("con_work %p start, clearing QUEUED\n", con); - clear_bit(QUEUED, &con->state); mutex_lock(&con->mutex); @@ -1994,28 +1967,13 @@ static void con_work(struct work_struct *work) try_read(con) < 0 || try_write(con) < 0) { mutex_unlock(&con->mutex); - backoff = 1; ceph_fault(con); /* error/fault path */ goto done_unlocked; } done: mutex_unlock(&con->mutex); - done_unlocked: - clear_bit(BUSY, &con->state); - dout("con->state=%lu\n", con->state); - if (test_bit(QUEUED, &con->state)) { - if (!backoff || test_bit(OPENING, &con->state)) { - dout("con_work %p QUEUED reset, looping\n", con); - goto more; - } - dout("con_work %p QUEUED reset, but just faulted\n", con); - clear_bit(QUEUED, &con->state); - } - dout("con_work %p done\n", con); - -out: con->ops->put(con); } From 
766fc43973b16f9becb6b7402b3e052dbb84adee Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Fri, 7 Jan 2011 14:58:42 -0800 Subject: [PATCH 9/9] rbd: fix cleanup when trying to mount inexistent image Previously we didn't clean up the sysfs entry that was just created. Signed-off-by: Yehuda Sadeh Signed-off-by: Sage Weil --- drivers/block/rbd.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 008d4a00b50d..e1e38b11f48a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1790,18 +1790,29 @@ static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count) rc = rbd_bus_add_dev(rbd_dev); if (rc) - goto err_out_disk; + goto err_out_blkdev; + /* set up and announce blkdev mapping */ rc = rbd_init_disk(rbd_dev); if (rc) - goto err_out_blkdev; + goto err_out_bus; return count; +err_out_bus: + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + list_del_init(&rbd_dev->node); + mutex_unlock(&ctl_mutex); + + /* this will also clean up rest of rbd_dev stuff */ + + rbd_bus_del_dev(rbd_dev); + kfree(options); + kfree(mon_dev_name); + return rc; + err_out_blkdev: unregister_blkdev(rbd_dev->major, rbd_dev->name); -err_out_disk: - rbd_free_disk(rbd_dev); err_out_client: rbd_put_client(rbd_dev); mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);