From 46f8a64bae11f5c9b15b4401f6e9863281999b66 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 22 Nov 2007 13:54:18 -0500 Subject: [PATCH 001/100] nfsd4: probe callback channel only once Our callback code doesn't actually handle concurrent attempts to probe the callback channel. Some rethinking of the locking may be required. However, we can also just move the callback probing to this case. Since this is the only time a client is "confirmed" (and since that can only happen once in the lifetime of a client), this ensures we only probe once. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4callback.c | 3 +-- fs/nfsd/nfs4state.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 9d536a8cb379..a9735a672963 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -395,8 +395,7 @@ nfsd4_probe_callback(struct nfs4_client *clp) }; struct task_struct *t; - if (atomic_read(&cb->cb_set)) - return; + BUG_ON(atomic_read(&clp->cl_callback.cb_set)); /* Initialize address */ memset(&addr, 0, sizeof(addr)); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 31673cd251c3..9d81c7117ae6 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -948,6 +948,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, } move_to_confirmed(unconf); conf = unconf; + nfsd4_probe_callback(conf); status = nfs_ok; } } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm))) @@ -965,8 +966,6 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, status = nfserr_clid_inuse; } out: - if (!status) - nfsd4_probe_callback(conf); nfs4_unlock_state(); return status; } From 63c86716ea34ad94d52e5b0abbda152574dc42b5 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 25 Oct 2007 19:00:26 -0400 Subject: [PATCH 002/100] nfsd: move callback rpc_client creation into separate thread The whole reason to move this callback-channel probe into a separate thread was because (for now) we don't have an easy way to create the rpc_client asynchronously. But I forgot to move the rpc_create() to the spawned thread. Doh! Fix that. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4callback.c | 78 +++++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index a9735a672963..6eb5cd2381ab 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -350,30 +350,6 @@ static struct rpc_version * nfs_cb_version[] = { static int do_probe_callback(void *data) { struct nfs4_client *clp = data; - struct nfs4_callback *cb = &clp->cl_callback; - struct rpc_message msg = { - .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], - .rpc_argp = clp, - }; - int status; - - status = rpc_call_sync(cb->cb_client, &msg, RPC_TASK_SOFT); - - if (status) { - rpc_shutdown_client(cb->cb_client); - cb->cb_client = NULL; - } else - atomic_set(&cb->cb_set, 1); - put_nfs4_client(clp); - return 0; -} - -/* - * Set up the callback client and put a NFSPROC4_CB_NULL on the wire... - */ -void -nfsd4_probe_callback(struct nfs4_client *clp) -{ struct sockaddr_in addr; struct nfs4_callback *cb = &clp->cl_callback; struct rpc_timeout timeparms = { @@ -390,12 +366,15 @@ nfsd4_probe_callback(struct nfs4_client *clp) .timeout = &timeparms, .program = program, .version = nfs_cb_version[1]->number, - .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */ + .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... 
*/ .flags = (RPC_CLNT_CREATE_NOPING), }; - struct task_struct *t; - - BUG_ON(atomic_read(&clp->cl_callback.cb_set)); + struct rpc_message msg = { + .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL], + .rpc_argp = clp, + }; + struct rpc_clnt *client; + int status; /* Initialize address */ memset(&addr, 0, sizeof(addr)); @@ -415,29 +394,50 @@ nfsd4_probe_callback(struct nfs4_client *clp) program->stats->program = program; /* Create RPC client */ - cb->cb_client = rpc_create(&args); - if (IS_ERR(cb->cb_client)) { + client = rpc_create(&args); + if (IS_ERR(client)) { dprintk("NFSD: couldn't create callback client\n"); + status = PTR_ERR(client); goto out_err; } + status = rpc_call_sync(client, &msg, RPC_TASK_SOFT); + + if (status) + goto out_release_client; + + cb->cb_client = client; + atomic_set(&cb->cb_set, 1); + put_nfs4_client(clp); + return 0; +out_release_client: + rpc_shutdown_client(client); +out_err: + put_nfs4_client(clp); + dprintk("NFSD: warning: no callback path to client %.*s\n", + (int)clp->cl_name.len, clp->cl_name.data); + return status; +} + +/* + * Set up the callback client and put a NFSPROC4_CB_NULL on the wire... + */ +void +nfsd4_probe_callback(struct nfs4_client *clp) +{ + struct task_struct *t; + + BUG_ON(atomic_read(&clp->cl_callback.cb_set)); + /* the task holds a reference to the nfs4_client struct */ atomic_inc(&clp->cl_count); t = kthread_run(do_probe_callback, clp, "nfs4_cb_probe"); if (IS_ERR(t)) - goto out_release_clp; + atomic_dec(&clp->cl_count); return; - -out_release_clp: - atomic_dec(&clp->cl_count); - rpc_shutdown_client(cb->cb_client); -out_err: - cb->cb_client = NULL; - dprintk("NFSD: warning: no callback path to client %.*s\n", - (int)clp->cl_name.len, clp->cl_name.data); } /* From aefa89d178e6dd83889b66d4e800d4d77363900b Mon Sep 17 00:00:00 2001 From: Prasad P Date: Wed, 24 Oct 2007 15:14:32 -0500 Subject: [PATCH 003/100] nfsd: Fix inconsistent assignment Dereferenced pointer "dentry" without checking and assigned to inode in the declaration. (We could just delete the NULL checks that follow instead, as we never get to the encode function in this particular case. But it takes a little detective work to verify that fact, so it's probably safer to leave the checks in place.) Cc: Steve French Signed-off-by: Prasad V Potluri Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs2acl.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 0e5fa11e6b44..1c3b7654e966 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -221,12 +221,17 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_getaclres *resp) { struct dentry *dentry = resp->fh.fh_dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode; struct kvec *head = rqstp->rq_res.head; unsigned int base; int n; int w; + /* + * Since this is version 2, the check for nfserr in + * nfsd_dispatch actually ensures the following cannot happen. + * However, it seems fragile to depend on that. + */ if (dentry == NULL || dentry->d_inode == NULL) return 0; inode = dentry->d_inode; From d4395e03fec0895d01451904b8a2276ceda663c9 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 26 Oct 2007 13:32:50 -0400 Subject: [PATCH 004/100] knfsd: fix broken length check in nfs4idmap.c Obviously at some point we thought "error" represented the length when positive. This appears to be a long-standing typo. Thanks to Prasad Potluri for finding the problem and proposing an earlier version of this patch. 
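
For reference, the failure mode fixed below is the classic wrong-variable bounds check: the parse helper returns a length, but the guard tests a stale error value that is already negative and so never trips, leaving the length unchecked before the copy. A minimal standalone sketch of the pattern follows; parse_token(), store_token(), and TOKSZ are illustrative stand-ins, not the real idmap helpers, and the buffer sizing is assumed for the example:

    #include <errno.h>
    #include <string.h>

    #define TOKSZ 128

    /* Illustrative stand-in for the parse helper: copies one token
     * and returns its length, or a negative errno on failure.
     * Assumes buf fits in the caller's tok buffer. */
    static int parse_token(const char *buf, char *tok)
    {
            size_t n = strlen(buf);

            if (n == 0)
                    return -EINVAL;
            memcpy(tok, buf, n);
            return (int)n;
    }

    static int store_token(const char *buf, char name[TOKSZ])
    {
            char tok[TOKSZ * 2];
            int error, len;

            len = parse_token(buf, tok);
            if (len < 0)
                    return len;
            error = -EINVAL;
            /* Buggy form: "if (error >= TOKSZ)" can never fire,
             * since error still holds -EINVAL; the token length
             * goes unbounded.  The fix is to test len itself: */
            if (len >= TOKSZ)
                    return error;
            memcpy(name, tok, len);
            name[len] = '\0';
            return 0;
    }

With the check on len itself, an oversized token is rejected before the copy, which is the behavior the patch below restores.
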
Cc: Steve French Cc: Prasad V Potluri Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4idmap.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 4c0c683ce07a..5b56c77c15c5 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -255,13 +255,10 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen) goto out; if (len == 0) set_bit(CACHE_NEGATIVE, &ent.h.flags); - else { - if (error >= IDMAP_NAMESZ) { - error = -EINVAL; - goto out; - } + else if (len >= IDMAP_NAMESZ) + goto out; + else memcpy(ent.name, buf1, sizeof(ent.name)); - } error = -ENOMEM; res = idtoname_update(&ent, res); if (res == NULL) From 01b2969a8528b926f5e4d98161ae37053234475c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 26 Oct 2007 13:31:20 -0400 Subject: [PATCH 005/100] SUNRPC: Prevent length underflow in read_flush() Make sure we compare an unsigned length to an unsigned count in read_flush(). Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/cache.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 73f053d0cc7a..d27bbe0ee907 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1244,18 +1244,18 @@ static ssize_t read_flush(struct file *file, char __user *buf, struct cache_detail *cd = PDE(file->f_path.dentry->d_inode)->data; char tbuf[20]; unsigned long p = *ppos; - int len; + size_t len; sprintf(tbuf, "%lu\n", cd->flush_time); len = strlen(tbuf); if (p >= len) return 0; len -= p; - if (len > count) len = count; + if (len > count) + len = count; if (copy_to_user(buf, (void*)(tbuf+p), len)) - len = -EFAULT; - else - *ppos += len; + return -EFAULT; + *ppos += len; return len; } From e5cff482c78a35b9f149a06aa777a1bd693864fb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 1 Nov 2007 16:56:47 -0400 Subject: [PATCH 006/100] SUNRPC: Use unsigned string lengths in xdr_decode_string_inplace XDR strings, opaques, and net objects should all use unsigned lengths. To wit, RFC 4506 says: 4.2. Unsigned Integer An XDR unsigned integer is a 32-bit datum that encodes a non-negative integer in the range [0,4294967295]. ... 4.11. String The standard defines a string of n (numbered 0 through n-1) ASCII bytes to be the number n encoded as an unsigned integer (as described above), and followed by the n bytes of the string. After this patch, xdr_decode_string_inplace now matches the other XDR string and array helpers that take a string length argument. See: xdr_encode_opaque_fixed, xdr_encode_opaque, xdr_encode_array Signed-off-by: Chuck Lever Acked-By: NeilBrown Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/xdr.h | 3 ++- net/sunrpc/xdr.c | 8 +++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 0751c9464d0f..e4057d729f03 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -112,7 +112,8 @@ struct xdr_buf { __be32 *xdr_encode_opaque_fixed(__be32 *p, const void *ptr, unsigned int len); __be32 *xdr_encode_opaque(__be32 *p, const void *ptr, unsigned int len); __be32 *xdr_encode_string(__be32 *p, const char *s); -__be32 *xdr_decode_string_inplace(__be32 *p, char **sp, int *lenp, int maxlen); +__be32 *xdr_decode_string_inplace(__be32 *p, char **sp, unsigned int *lenp, + unsigned int maxlen); __be32 *xdr_encode_netobj(__be32 *p, const struct xdr_netobj *); __be32 *xdr_decode_netobj(__be32 *p, struct xdr_netobj *); diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 54264062ea69..995c3fdc16c2 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -96,11 +96,13 @@ xdr_encode_string(__be32 *p, const char *string) EXPORT_SYMBOL(xdr_encode_string); __be32 * -xdr_decode_string_inplace(__be32 *p, char **sp, int *lenp, int maxlen) +xdr_decode_string_inplace(__be32 *p, char **sp, + unsigned int *lenp, unsigned int maxlen) { - unsigned int len; + u32 len; - if ((len = ntohl(*p++)) > maxlen) + len = ntohl(*p++); + if (len > maxlen) return NULL; *lenp = len; *sp = (char *) p; From 48df020aa17ac95a012ff765b0086ede5996b320 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 1 Nov 2007 16:56:53 -0400 Subject: [PATCH 007/100] NLM: Fix sign of length of NLM variable length strings According to The Open Group's NLM specification, NLM callers are variable length strings. XDR variable length strings use an unsigned 32 bit length. And internally, negative string lengths are not meaningful for the Linux NLM implementation. Clean up: Make nlm_lock.len and nlm_reboot.len unsigned integers. This makes the sign of NLM string lengths consistent with the sign of xdr_netobj lengths. Signed-off-by: Chuck Lever Acked-By: NeilBrown Signed-off-by: J. 
Bruce Fields --- fs/lockd/host.c | 19 +++++++++++-------- include/linux/lockd/lockd.h | 9 ++++++--- include/linux/lockd/xdr.h | 4 ++-- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 572601e98dcd..ebec0098efbf 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -34,10 +34,10 @@ static DEFINE_MUTEX(nlm_host_mutex); static void nlm_gc_hosts(void); static struct nsm_handle * __nsm_find(const struct sockaddr_in *, - const char *, int, int); + const char *, unsigned int, int); static struct nsm_handle * nsm_find(const struct sockaddr_in *sin, const char *hostname, - int hostname_len); + unsigned int hostname_len); /* * Common host lookup routine for server & client @@ -45,7 +45,8 @@ static struct nsm_handle * nsm_find(const struct sockaddr_in *sin, static struct nlm_host * nlm_lookup_host(int server, const struct sockaddr_in *sin, int proto, int version, const char *hostname, - int hostname_len, const struct sockaddr_in *ssin) + unsigned int hostname_len, + const struct sockaddr_in *ssin) { struct hlist_head *chain; struct hlist_node *pos; @@ -176,7 +177,7 @@ nlm_destroy_host(struct nlm_host *host) */ struct nlm_host * nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version, - const char *hostname, int hostname_len) + const char *hostname, unsigned int hostname_len) { struct sockaddr_in ssin = {0}; @@ -189,7 +190,7 @@ nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version, */ struct nlm_host * nlmsvc_lookup_host(struct svc_rqst *rqstp, - const char *hostname, int hostname_len) + const char *hostname, unsigned int hostname_len) { struct sockaddr_in ssin = {0}; @@ -307,7 +308,8 @@ void nlm_release_host(struct nlm_host *host) * Release all resources held by that peer. 
*/ void nlm_host_rebooted(const struct sockaddr_in *sin, - const char *hostname, int hostname_len, + const char *hostname, + unsigned int hostname_len, u32 new_state) { struct hlist_head *chain; @@ -449,7 +451,7 @@ static DEFINE_MUTEX(nsm_mutex); static struct nsm_handle * __nsm_find(const struct sockaddr_in *sin, - const char *hostname, int hostname_len, + const char *hostname, unsigned int hostname_len, int create) { struct nsm_handle *nsm = NULL; @@ -503,7 +505,8 @@ __nsm_find(const struct sockaddr_in *sin, } static struct nsm_handle * -nsm_find(const struct sockaddr_in *sin, const char *hostname, int hostname_len) +nsm_find(const struct sockaddr_in *sin, const char *hostname, + unsigned int hostname_len) { return __nsm_find(sin, hostname, hostname_len, 1); } diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index e2d1ce36b367..4babb2a129ac 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -173,14 +173,17 @@ void nlmclnt_next_cookie(struct nlm_cookie *); /* * Host cache */ -struct nlm_host * nlmclnt_lookup_host(const struct sockaddr_in *, int, int, const char *, int); -struct nlm_host * nlmsvc_lookup_host(struct svc_rqst *, const char *, int); +struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *, int, int, + const char *, unsigned int); +struct nlm_host *nlmsvc_lookup_host(struct svc_rqst *, const char *, + unsigned int); struct rpc_clnt * nlm_bind_host(struct nlm_host *); void nlm_rebind_host(struct nlm_host *); struct nlm_host * nlm_get_host(struct nlm_host *); void nlm_release_host(struct nlm_host *); void nlm_shutdown_hosts(void); -extern void nlm_host_rebooted(const struct sockaddr_in *, const char *, int, u32); +extern void nlm_host_rebooted(const struct sockaddr_in *, const char *, + unsigned int, u32); void nsm_release(struct nsm_handle *); diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h index 83a1f9f6237b..df18fa053bcd 100644 --- a/include/linux/lockd/xdr.h +++ b/include/linux/lockd/xdr.h @@ -29,7 +29,7 @@ struct svc_rqst; /* Lock info passed via NLM */ struct nlm_lock { char * caller; - int len; /* length of "caller" */ + unsigned int len; /* length of "caller" */ struct nfs_fh fh; struct xdr_netobj oh; u32 svid; @@ -78,7 +78,7 @@ struct nlm_res { */ struct nlm_reboot { char * mon; - int len; + unsigned int len; u32 state; __be32 addr; __be32 vers; From ee1a95b3b3fccf3c825bd95f89a8e006901b03ed Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 1 Nov 2007 16:56:58 -0400 Subject: [PATCH 008/100] NFSD: Use unsigned length argument for decode_filename Clean up: file name lengths are unsigned on the wire, negative lengths are not meaningful natively either. Signed-off-by: Chuck Lever Acked-By: NeilBrown Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs3xdr.c | 4 ++-- fs/nfsd/nfsxdr.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index f917fd25858a..c02b8d69297d 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -88,10 +88,10 @@ encode_fh(__be32 *p, struct svc_fh *fhp) * no slashes or null bytes. 
*/ static __be32 * -decode_filename(__be32 *p, char **namp, int *lenp) +decode_filename(__be32 *p, char **namp, unsigned int *lenp) { char *name; - int i; + unsigned int i; if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS3_MAXNAMLEN)) != NULL) { for (i = 0, name = *namp; i < *lenp; i++, name++) { diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index b86e3658a0af..50bd6187edfc 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -62,10 +62,10 @@ encode_fh(__be32 *p, struct svc_fh *fhp) * no slashes or null bytes. */ static __be32 * -decode_filename(__be32 *p, char **namp, int *lenp) +decode_filename(__be32 *p, char **namp, unsigned int *lenp) { char *name; - int i; + unsigned int i; if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXNAMLEN)) != NULL) { for (i = 0, name = *namp; i < *lenp; i++, name++) { From 29d5e5553826d05b8ecda51c21787ce85efdef06 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 1 Nov 2007 16:57:04 -0400 Subject: [PATCH 009/100] NFSD: File name length signage in nfsd request argument structures Clean up: For consistency, store the length of file name strings in nfsd argument structures as unsigned integers. This matches the XDR routines and client argument structures for the same operation types. Signed-off-by: Chuck Lever Acked-By: NeilBrown Signed-off-by: J. Bruce Fields --- include/linux/nfsd/xdr.h | 12 ++++++------ include/linux/nfsd/xdr3.h | 14 +++++++------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/include/linux/nfsd/xdr.h b/include/linux/nfsd/xdr.h index 67885d5e6e50..49ddf7903b92 100644 --- a/include/linux/nfsd/xdr.h +++ b/include/linux/nfsd/xdr.h @@ -23,7 +23,7 @@ struct nfsd_sattrargs { struct nfsd_diropargs { struct svc_fh fh; char * name; - int len; + unsigned int len; }; struct nfsd_readargs { @@ -43,17 +43,17 @@ struct nfsd_writeargs { struct nfsd_createargs { struct svc_fh fh; char * name; - int len; + unsigned int len; struct iattr attrs; }; struct nfsd_renameargs { struct svc_fh ffh; char * fname; - int flen; + unsigned int flen; struct svc_fh tfh; char * tname; - int tlen; + unsigned int tlen; }; struct nfsd_readlinkargs { @@ -65,13 +65,13 @@ struct nfsd_linkargs { struct svc_fh ffh; struct svc_fh tfh; char * tname; - int tlen; + unsigned int tlen; }; struct nfsd_symlinkargs { struct svc_fh ffh; char * fname; - int flen; + unsigned int flen; char * tname; int tlen; struct iattr attrs; diff --git a/include/linux/nfsd/xdr3.h b/include/linux/nfsd/xdr3.h index 89d9d6061a62..6fdb958043f0 100644 --- a/include/linux/nfsd/xdr3.h +++ b/include/linux/nfsd/xdr3.h @@ -21,7 +21,7 @@ struct nfsd3_sattrargs { struct nfsd3_diropargs { struct svc_fh fh; char * name; - int len; + unsigned int len; }; struct nfsd3_accessargs { @@ -48,7 +48,7 @@ struct nfsd3_writeargs { struct nfsd3_createargs { struct svc_fh fh; char * name; - int len; + unsigned int len; int createmode; struct iattr attrs; __be32 * verf; @@ -57,7 +57,7 @@ struct nfsd3_createargs { struct nfsd3_mknodargs { struct svc_fh fh; char * name; - int len; + unsigned int len; __u32 ftype; __u32 major, minor; struct iattr attrs; @@ -66,10 +66,10 @@ struct nfsd3_mknodargs { struct nfsd3_renameargs { struct svc_fh ffh; char * fname; - int flen; + unsigned int flen; struct svc_fh tfh; char * tname; - int tlen; + unsigned int tlen; }; struct nfsd3_readlinkargs { @@ -81,13 +81,13 @@ struct nfsd3_linkargs { struct svc_fh ffh; struct svc_fh tfh; char * tname; - int tlen; + unsigned int tlen; }; struct nfsd3_symlinkargs { struct svc_fh ffh; char * fname; - int flen; + unsigned int 
flen; char * tname; int tlen; struct iattr attrs; From 5a022fc8700cadbac373766cf1b5c746ffec7164 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 1 Nov 2007 16:57:09 -0400 Subject: [PATCH 010/100] NFSD: Adjust filename length argument of nfsd_lookup Clean up: adjust the sign of the length argument of nfsd_lookup and nfsd_lookup_dentry, for consistency with recent changes. NFSD version 4 callers already pass an unsigned file name length. Signed-off-by: Chuck Lever Acked-By: NeilBrown Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 4 ++-- include/linux/nfsd/nfsd.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index d0199189924c..755ba43c13e1 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -132,7 +132,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, __be32 nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, - const char *name, int len, + const char *name, unsigned int len, struct svc_export **exp_ret, struct dentry **dentry_ret) { struct svc_export *exp; @@ -226,7 +226,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, */ __be32 nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, - int len, struct svc_fh *resfh) + unsigned int len, struct svc_fh *resfh) { struct svc_export *exp; struct dentry *dentry; diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index 604a0d786bc6..a51a30f30cee 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -70,9 +70,9 @@ void nfsd_racache_shutdown(void); int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, struct svc_export **expp); __be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *, - const char *, int, struct svc_fh *); + const char *, unsigned int, struct svc_fh *); __be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *, - const char *, int, + const char *, unsigned int, struct svc_export **, struct dentry **); __be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *, struct iattr *, int, time_t); From 9c7544d3a195cde33b3d1e46639b23c221f901db Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 1 Nov 2007 16:57:14 -0400 Subject: [PATCH 011/100] NFSD: Use unsigned length argument for decode_pathname Clean up: path name lengths are unsigned on the wire, negative lengths are not meaningful natively either. Signed-off-by: Chuck Lever Acked-By: NeilBrown Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsxdr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 50bd6187edfc..7003c313272f 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -78,10 +78,10 @@ decode_filename(__be32 *p, char **namp, unsigned int *lenp) } static __be32 * -decode_pathname(__be32 *p, char **namp, int *lenp) +decode_pathname(__be32 *p, char **namp, unsigned int *lenp) { char *name; - int i; + unsigned int i; if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXPATHLEN)) != NULL) { for (i = 0, name = *namp; i < *lenp; i++, name++) { From a628f6675861d979405f751418e924c4ec7d457d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 1 Nov 2007 16:57:20 -0400 Subject: [PATCH 012/100] NFSD: Fix mixed sign comparison in nfs3svc_decode_symlinkargs Signed-off-by: Chuck Lever Acked-By: NeilBrown Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs3xdr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index c02b8d69297d..be515c5a8154 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -452,8 +452,7 @@ int nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_symlinkargs *args) { - unsigned int len; - int avail; + unsigned int len, avail; char *old, *new; struct kvec *vec; @@ -486,7 +485,8 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p, /* now copy next page if there is one */ if (len && !avail && rqstp->rq_arg.page_len) { avail = rqstp->rq_arg.page_len; - if (avail > PAGE_SIZE) avail = PAGE_SIZE; + if (avail > PAGE_SIZE) + avail = PAGE_SIZE; old = page_address(rqstp->rq_arg.pages[0]); } while (len && avail && *old) { From 48b4ba3fdd7af319e90ade395162430934ee7b87 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 1 Nov 2007 16:57:25 -0400 Subject: [PATCH 013/100] NFSD: Path name length signage in nfsd request argument structures Clean up: For consistency, store the length of path name strings in nfsd argument structures as unsigned integers. Signed-off-by: Chuck Lever Acked-By: NeilBrown Signed-off-by: J. Bruce Fields --- include/linux/nfsd/xdr.h | 2 +- include/linux/nfsd/xdr3.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/nfsd/xdr.h b/include/linux/nfsd/xdr.h index 49ddf7903b92..a0132ef58f21 100644 --- a/include/linux/nfsd/xdr.h +++ b/include/linux/nfsd/xdr.h @@ -73,7 +73,7 @@ struct nfsd_symlinkargs { char * fname; unsigned int flen; char * tname; - int tlen; + unsigned int tlen; struct iattr attrs; }; diff --git a/include/linux/nfsd/xdr3.h b/include/linux/nfsd/xdr3.h index 6fdb958043f0..421eddd65a25 100644 --- a/include/linux/nfsd/xdr3.h +++ b/include/linux/nfsd/xdr3.h @@ -89,7 +89,7 @@ struct nfsd3_symlinkargs { char * fname; unsigned int flen; char * tname; - int tlen; + unsigned int tlen; struct iattr attrs; }; From a490c681cbcf65d548138c377bb691c85824d323 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 6 Nov 2007 14:15:19 -0500 Subject: [PATCH 014/100] knfsd: fix cache.c comment The path here must be left over from some earlier draft; fix it. And do some more minor cleanup while we're there. Signed-off-by: J. Bruce Fields --- net/sunrpc/cache.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index d27bbe0ee907..3b11277d27b1 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -634,13 +634,13 @@ void cache_clean_deferred(void *owner) /* * communicate with user-space * - * We have a magic /proc file - /proc/sunrpc/cache - * On read, you get a full request, or block - * On write, an update request is processed - * Poll works if anything to read, and always allows write + * We have a magic /proc file - /proc/sunrpc//channel. + * On read, you get a full request, or block. + * On write, an update request is processed. + * Poll works if anything to read, and always allows write. * * Implemented by linked list of requests. Each open file has - * a ->private that also exists in this list. New request are added + * a ->private that also exists in this list. New requests are added * to the end and may wakeup and preceding readers. * New readers are added to the head. If, on read, an item is found with * CACHE_UPCALLING clear, we free it from the list. From ca2a05aa7c72309ee65164c78fa2be7a5038215e Mon Sep 17 00:00:00 2001 From: "J. 
Bruce Fields" Date: Sun, 11 Nov 2007 15:43:12 -0500 Subject: [PATCH 015/100] nfsd: Fix handling of negative lengths in read_buf() The length "nbytes" passed into read_buf should never be negative, but we check only for too-large values of "nbytes", not for too-small values. Make nbytes unsigned, so it's clear that the former tests are sufficient. (Despite this read_buf() currently correctly returns an xdr error in the case of a negative length, thanks to an unsigned comparison with size_of() and bounds-checking in kmalloc(). This seems very fragile, though.) Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 57333944af7f..bf1e792a65a0 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -148,12 +148,12 @@ xdr_error: \ } \ } while (0) -static __be32 *read_buf(struct nfsd4_compoundargs *argp, int nbytes) +static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) { /* We want more bytes than seem to be available. * Maybe we need a new page, maybe we have just run out */ - int avail = (char*)argp->end - (char*)argp->p; + unsigned int avail = (char *)argp->end - (char *)argp->p; __be32 *p; if (avail + argp->pagelen < nbytes) return NULL; @@ -169,6 +169,11 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, int nbytes) return NULL; } + /* + * The following memcpy is safe because read_buf is always + * called with nbytes > avail, and the two cases above both + * guarantee p points to at least nbytes bytes. + */ memcpy(p, argp->p, avail); /* step to next page */ argp->p = page_address(argp->pagelist[0]); From 46b25895767c606c630a97b03a895934a7a36a70 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 9 Nov 2007 12:31:55 -0500 Subject: [PATCH 016/100] knfsd: cleanup nfsd4 properly on module init failure We forgot to shut down the nfs4 state and idmapping code in this case. Acked-by: NeilBrown Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsctl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 77dc9893b7ba..d8d50a773a5b 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -695,12 +695,14 @@ static int __init init_nfsd(void) } retval = register_filesystem(&nfsd_fs_type); if (retval) { + nfsd_idmap_shutdown(); nfsd_export_shutdown(); nfsd_cache_shutdown(); remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); nfsd_stat_shutdown(); nfsd_lockd_shutdown(); + nfsd4_free_slabs(); } return retval; } From 26808d3f10b1213bbb6e27d441be40e20ab84611 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 9 Nov 2007 13:44:06 -0500 Subject: [PATCH 017/100] nfsd: cleanup nfsd module initialization cleanup Handle the failure case here with something closer to the standard kernel style. Doesn't really matter for now, but I'd like to add a few more failure cases, and then this'll help. Acked-by: NeilBrown Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfsctl.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index d8d50a773a5b..ecf377944286 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -694,16 +694,18 @@ static int __init init_nfsd(void) entry->proc_fops = &exports_operations; } retval = register_filesystem(&nfsd_fs_type); - if (retval) { - nfsd_idmap_shutdown(); - nfsd_export_shutdown(); - nfsd_cache_shutdown(); - remove_proc_entry("fs/nfs/exports", NULL); - remove_proc_entry("fs/nfs", NULL); - nfsd_stat_shutdown(); - nfsd_lockd_shutdown(); - nfsd4_free_slabs(); - } + if (retval) + goto out_free_all; + return 0; +out_free_all: + nfsd_idmap_shutdown(); + nfsd_export_shutdown(); + nfsd_cache_shutdown(); + remove_proc_entry("fs/nfs/exports", NULL); + remove_proc_entry("fs/nfs", NULL); + nfsd_stat_shutdown(); + nfsd_lockd_shutdown(); + nfsd4_free_slabs(); return retval; } From d5c3428b2cb26d605fddc4878f4fcc03c23df89f Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 9 Nov 2007 14:10:56 -0500 Subject: [PATCH 018/100] nfsd: fail module init on reply cache init failure If the reply cache initialization fails due to a kmalloc failure, currently we try to soldier on with a reduced (or nonexistant) reply cache. Better to just fail immediately: the failure is then much easier to understand and debug, and it could save us complexity in some later code. (But actually, it doesn't help currently because the cache is also turned off in some odd failure cases; we should probably find a better way to handle those failure cases some day.) Fix some minor style problems while we're at it, and rename nfsd_cache_init() to remove the need for a comment describing it. Acked-by: NeilBrown Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfscache.c | 28 ++++++++++++---------------- fs/nfsd/nfsctl.c | 11 +++++++---- include/linux/nfsd/cache.h | 4 ++-- 3 files changed, 21 insertions(+), 22 deletions(-) diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 578f2c9d56be..5bfc2ac60d54 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -44,17 +44,17 @@ static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec); */ static DEFINE_SPINLOCK(cache_lock); -void -nfsd_cache_init(void) +int nfsd_reply_cache_init(void) { struct svc_cacherep *rp; int i; INIT_LIST_HEAD(&lru_head); i = CACHESIZE; - while(i) { + while (i) { rp = kmalloc(sizeof(*rp), GFP_KERNEL); - if (!rp) break; + if (!rp) + goto out_nomem; list_add(&rp->c_lru, &lru_head); rp->c_state = RC_UNUSED; rp->c_type = RC_NOCACHE; @@ -62,23 +62,19 @@ nfsd_cache_init(void) i--; } - if (i) - printk (KERN_ERR "nfsd: cannot allocate all %d cache entries, only got %d\n", - CACHESIZE, CACHESIZE-i); - hash_list = kcalloc (HASHSIZE, sizeof(struct hlist_head), GFP_KERNEL); - if (!hash_list) { - nfsd_cache_shutdown(); - printk (KERN_ERR "nfsd: cannot allocate %Zd bytes for hash list\n", - HASHSIZE * sizeof(struct hlist_head)); - return; - } + if (!hash_list) + goto out_nomem; cache_disabled = 0; + return 0; +out_nomem: + printk(KERN_ERR "nfsd: failed to allocate reply cache\n"); + nfsd_reply_cache_shutdown(); + return -ENOMEM; } -void -nfsd_cache_shutdown(void) +void nfsd_reply_cache_shutdown(void) { struct svc_cacherep *rp; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index ecf377944286..2bfda9b8f504 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -683,7 +683,9 @@ static int __init init_nfsd(void) if (retval) return retval; nfsd_stat_init(); /* Statistics */ - nfsd_cache_init(); /* RPC reply cache */ + retval = nfsd_reply_cache_init(); + if (retval) + goto out_free_stat; nfsd_export_init(); /* Exports table */ nfsd_lockd_init(); /* lockd->nfsd callbacks */ nfsd_idmap_init(); /* Name to ID mapping */ @@ -700,11 +702,12 @@ static int __init init_nfsd(void) out_free_all: nfsd_idmap_shutdown(); nfsd_export_shutdown(); - nfsd_cache_shutdown(); + nfsd_reply_cache_shutdown(); remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); - nfsd_stat_shutdown(); nfsd_lockd_shutdown(); +out_free_stat: + nfsd_stat_shutdown(); nfsd4_free_slabs(); return retval; } @@ -712,7 +715,7 @@ static int __init init_nfsd(void) static void __exit exit_nfsd(void) { nfsd_export_shutdown(); - nfsd_cache_shutdown(); + nfsd_reply_cache_shutdown(); remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); nfsd_stat_shutdown(); diff --git a/include/linux/nfsd/cache.h b/include/linux/nfsd/cache.h index 007480cd6a60..7b5d784cc858 100644 --- a/include/linux/nfsd/cache.h +++ b/include/linux/nfsd/cache.h @@ -72,8 +72,8 @@ enum { */ #define RC_DELAY (HZ/5) -void nfsd_cache_init(void); -void nfsd_cache_shutdown(void); +int nfsd_reply_cache_init(void); +void nfsd_reply_cache_shutdown(void); int nfsd_cache_lookup(struct svc_rqst *, int); void nfsd_cache_update(struct svc_rqst *, int, __be32 *); From df95a9d4fb91d819d3fb55dd437056df59e7f15e Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 8 Nov 2007 16:09:59 -0500 Subject: [PATCH 019/100] knfsd: cache unregistration needn't return error There's really nothing much the caller can do if cache unregistration fails. And indeed, all any caller does in this case is print an error and continue. So just return void and move the printk's inside cache_unregister. 
Acked-by: NeilBrown Signed-off-by: J. Bruce Fields --- fs/nfsd/export.c | 6 ++---- fs/nfsd/nfs4idmap.c | 6 ++---- include/linux/sunrpc/cache.h | 2 +- net/sunrpc/auth_gss/svcauth_gss.c | 6 ++---- net/sunrpc/cache.c | 8 +++++--- net/sunrpc/sunrpc_syms.c | 6 ++---- 6 files changed, 14 insertions(+), 20 deletions(-) diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 66d0aeb32a47..d29b70a28f2b 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1670,10 +1670,8 @@ nfsd_export_shutdown(void) exp_writelock(); - if (cache_unregister(&svc_expkey_cache)) - printk(KERN_ERR "nfsd: failed to unregister expkey cache\n"); - if (cache_unregister(&svc_export_cache)) - printk(KERN_ERR "nfsd: failed to unregister export cache\n"); + cache_unregister(&svc_expkey_cache); + cache_unregister(&svc_export_cache); svcauth_unix_purge(); exp_writeunlock(); diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 5b56c77c15c5..ef22179c49ad 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -474,10 +474,8 @@ nfsd_idmap_init(void) void nfsd_idmap_shutdown(void) { - if (cache_unregister(&idtoname_cache)) - printk(KERN_ERR "nfsd: failed to unregister idtoname cache\n"); - if (cache_unregister(&nametoid_cache)) - printk(KERN_ERR "nfsd: failed to unregister nametoid cache\n"); + cache_unregister(&idtoname_cache); + cache_unregister(&nametoid_cache); } /* diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index bd7a6b0a87af..b683b5ddeea9 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -170,7 +170,7 @@ extern void cache_flush(void); extern void cache_purge(struct cache_detail *detail); #define NEVER (0x7FFFFFFF) extern void cache_register(struct cache_detail *cd); -extern int cache_unregister(struct cache_detail *cd); +extern void cache_unregister(struct cache_detail *cd); extern void qword_add(char **bpp, int *lp, char *str); extern void qword_addhex(char **bpp, int *lp, char *buf, int blen); diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 73940df6c460..d329a12500aa 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1396,9 +1396,7 @@ gss_svc_init(void) void gss_svc_shutdown(void) { - if (cache_unregister(&rsc_cache)) - printk(KERN_ERR "auth_rpcgss: failed to unregister rsc cache\n"); - if (cache_unregister(&rsi_cache)) - printk(KERN_ERR "auth_rpcgss: failed to unregister rsi cache\n"); + cache_unregister(&rsc_cache); + cache_unregister(&rsi_cache); svc_auth_unregister(RPC_AUTH_GSS); } diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 3b11277d27b1..365586a999ea 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -343,7 +343,7 @@ void cache_register(struct cache_detail *cd) schedule_delayed_work(&cache_cleaner, 0); } -int cache_unregister(struct cache_detail *cd) +void cache_unregister(struct cache_detail *cd) { cache_purge(cd); spin_lock(&cache_list_lock); @@ -351,7 +351,7 @@ int cache_unregister(struct cache_detail *cd) if (cd->entries || atomic_read(&cd->inuse)) { write_unlock(&cd->hash_lock); spin_unlock(&cache_list_lock); - return -EBUSY; + goto out; } if (current_detail == cd) current_detail = NULL; @@ -373,7 +373,9 @@ int cache_unregister(struct cache_detail *cd) /* module must be being unloaded so its safe to kill the worker */ cancel_delayed_work_sync(&cache_cleaner); } - return 0; + return; +out: + printk(KERN_ERR "nfsd: failed to unregister %s cache\n", cd->name); } /* clean cache tries to find something to clean diff --git 
a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 1a7e309d008b..ef7dc78e2c7b 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -98,10 +98,8 @@ cleanup_sunrpc(void) cleanup_socket_xprt(); unregister_rpc_pipefs(); rpc_destroy_mempool(); - if (cache_unregister(&ip_map_cache)) - printk(KERN_ERR "sunrpc: failed to unregister ip_map cache\n"); - if (cache_unregister(&unix_gid_cache)) - printk(KERN_ERR "sunrpc: failed to unregister unix_gid cache\n"); + cache_unregister(&ip_map_cache); + cache_unregister(&unix_gid_cache); #ifdef RPC_DEBUG rpc_unregister_sysctl(); #endif From 440bcc592052e42c7050a51489c65e18df4a0636 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 12 Nov 2007 17:09:49 -0500 Subject: [PATCH 020/100] nfsd: select CONFIG_PROC_FS in nfsv4 and gss server cases The server depends on upcalls under /proc to support nfsv4 and gss. Acked-by: NeilBrown Signed-off-by: J. Bruce Fields --- fs/Kconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/Kconfig b/fs/Kconfig index 219ec06a8c7e..987b5d7cb21a 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1674,6 +1674,8 @@ config NFSD select CRYPTO_MD5 if NFSD_V4 select CRYPTO if NFSD_V4 select FS_POSIX_ACL if NFSD_V4 + select PROC_FS if NFSD_V4 + select PROC_FS if SUNRPC_GSS help If you want your Linux box to act as an NFS *server*, so that other computers on your local network which support NFS can access certain From e331f606a85a2a9e84e9c63c94d43c0517136139 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 12 Nov 2007 17:32:21 -0500 Subject: [PATCH 021/100] nfsd: fail init on /proc/fs/nfs/exports creation failure I assume the reason failure of creation was ignored here was just to continue support embedded systems that want nfsd but not proc. However, in cases where proc is supported it would be clearer to fail entirely than to come up with some features disabled. Acked-by: NeilBrown Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfsctl.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 2bfda9b8f504..2b95597aa4a5 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -674,6 +674,27 @@ static struct file_system_type nfsd_fs_type = { .kill_sb = kill_litter_super, }; +#ifdef CONFIG_PROC_FS +static int create_proc_exports_entry(void) +{ + struct proc_dir_entry *entry; + + entry = proc_mkdir("fs/nfs", NULL); + if (!entry) + return -ENOMEM; + entry = create_proc_entry("fs/nfs/exports", 0, NULL); + if (!entry) + return -ENOMEM; + entry->proc_fops = &exports_operations; + return 0; +} +#else /* CONFIG_PROC_FS */ +static int create_proc_exports_entry(void) +{ + return 0; +} +#endif + static int __init init_nfsd(void) { int retval; @@ -689,23 +710,21 @@ static int __init init_nfsd(void) nfsd_export_init(); /* Exports table */ nfsd_lockd_init(); /* lockd->nfsd callbacks */ nfsd_idmap_init(); /* Name to ID mapping */ - if (proc_mkdir("fs/nfs", NULL)) { - struct proc_dir_entry *entry; - entry = create_proc_entry("fs/nfs/exports", 0, NULL); - if (entry) - entry->proc_fops = &exports_operations; - } + retval = create_proc_exports_entry(); + if (retval) + goto out_free_idmap; retval = register_filesystem(&nfsd_fs_type); if (retval) goto out_free_all; return 0; out_free_all: - nfsd_idmap_shutdown(); - nfsd_export_shutdown(); - nfsd_reply_cache_shutdown(); remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); + nfsd_idmap_shutdown(); +out_free_idmap: nfsd_lockd_shutdown(); + nfsd_export_shutdown(); + nfsd_reply_cache_shutdown(); out_free_stat: nfsd_stat_shutdown(); nfsd4_free_slabs(); From ffe9386b6e08e7132cb7730025d0ea310e08a182 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 12 Nov 2007 17:04:29 -0500 Subject: [PATCH 022/100] nfsd: move cache proc (un)registration to separate function Just some minor cleanup. Also I don't see much point in trying to register further proc entries if initial entries fail; so just stop trying in that case. Acked-by: NeilBrown Signed-off-by: J. 
Bruce Fields --- net/sunrpc/cache.c | 103 ++++++++++++++++++++++++--------------------- 1 file changed, 56 insertions(+), 47 deletions(-) diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 365586a999ea..f41a7cc4cf62 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -290,44 +290,63 @@ static const struct file_operations cache_flush_operations; static void do_cache_clean(struct work_struct *work); static DECLARE_DELAYED_WORK(cache_cleaner, do_cache_clean); +static void remove_cache_proc_entries(struct cache_detail *cd) +{ + if (cd->proc_ent == NULL) + return; + if (cd->flush_ent) + remove_proc_entry("flush", cd->proc_ent); + if (cd->channel_ent) + remove_proc_entry("channel", cd->proc_ent); + if (cd->content_ent) + remove_proc_entry("content", cd->proc_ent); + cd->proc_ent = NULL; + remove_proc_entry(cd->name, proc_net_rpc); +} + +static void create_cache_proc_entries(struct cache_detail *cd) +{ + struct proc_dir_entry *p; + + cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc); + if (cd->proc_ent == NULL) + return; + cd->proc_ent->owner = cd->owner; + cd->channel_ent = cd->content_ent = NULL; + + p = create_proc_entry("flush", S_IFREG|S_IRUSR|S_IWUSR, cd->proc_ent); + cd->flush_ent = p; + if (p == NULL) + return; + p->proc_fops = &cache_flush_operations; + p->owner = cd->owner; + p->data = cd; + + if (cd->cache_request || cd->cache_parse) { + p = create_proc_entry("channel", S_IFREG|S_IRUSR|S_IWUSR, + cd->proc_ent); + cd->channel_ent = p; + if (p == NULL) + return; + p->proc_fops = &cache_file_operations; + p->owner = cd->owner; + p->data = cd; + } + if (cd->cache_show) { + p = create_proc_entry("content", S_IFREG|S_IRUSR|S_IWUSR, + cd->proc_ent); + cd->content_ent = p; + if (p == NULL) + return; + p->proc_fops = &content_file_operations; + p->owner = cd->owner; + p->data = cd; + } +} + void cache_register(struct cache_detail *cd) { - cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc); - if (cd->proc_ent) { - struct proc_dir_entry *p; - cd->proc_ent->owner = cd->owner; - cd->channel_ent = cd->content_ent = NULL; - - p = create_proc_entry("flush", S_IFREG|S_IRUSR|S_IWUSR, - cd->proc_ent); - cd->flush_ent = p; - if (p) { - p->proc_fops = &cache_flush_operations; - p->owner = cd->owner; - p->data = cd; - } - - if (cd->cache_request || cd->cache_parse) { - p = create_proc_entry("channel", S_IFREG|S_IRUSR|S_IWUSR, - cd->proc_ent); - cd->channel_ent = p; - if (p) { - p->proc_fops = &cache_file_operations; - p->owner = cd->owner; - p->data = cd; - } - } - if (cd->cache_show) { - p = create_proc_entry("content", S_IFREG|S_IRUSR|S_IWUSR, - cd->proc_ent); - cd->content_ent = p; - if (p) { - p->proc_fops = &content_file_operations; - p->owner = cd->owner; - p->data = cd; - } - } - } + create_cache_proc_entries(cd); rwlock_init(&cd->hash_lock); INIT_LIST_HEAD(&cd->queue); spin_lock(&cache_list_lock); @@ -358,17 +377,7 @@ void cache_unregister(struct cache_detail *cd) list_del_init(&cd->others); write_unlock(&cd->hash_lock); spin_unlock(&cache_list_lock); - if (cd->proc_ent) { - if (cd->flush_ent) - remove_proc_entry("flush", cd->proc_ent); - if (cd->channel_ent) - remove_proc_entry("channel", cd->proc_ent); - if (cd->content_ent) - remove_proc_entry("content", cd->proc_ent); - - cd->proc_ent = NULL; - remove_proc_entry(cd->name, proc_net_rpc); - } + remove_cache_proc_entries(cd); if (list_empty(&cache_list)) { /* module must be being unloaded so its safe to kill the worker */ cancel_delayed_work_sync(&cache_cleaner); From dbf847ecb6318d3a22c6758fe39696d00f39063a Mon Sep 17 00:00:00 2001 
From: "J. Bruce Fields" Date: Thu, 8 Nov 2007 17:20:34 -0500 Subject: [PATCH 023/100] knfsd: allow cache_register to return error on failure Newer server features such as nfsv4 and gss depend on proc to work, so a failure to initialize the proc files they need should be treated as fatal. Thanks to Andrew Morton for style fix and compile fix in case where CONFIG_NFSD_V4 is undefined. Cc: Andrew Morton Acked-by: NeilBrown Signed-off-by: J. Bruce Fields --- fs/nfsd/export.c | 12 +++++++++--- fs/nfsd/nfs4idmap.c | 13 ++++++++++--- fs/nfsd/nfsctl.c | 12 +++++++++--- include/linux/nfsd/export.h | 2 +- include/linux/nfsd_idmap.h | 11 ++++++++--- include/linux/sunrpc/cache.h | 2 +- net/sunrpc/auth_gss/svcauth_gss.c | 17 ++++++++++++---- net/sunrpc/cache.c | 32 +++++++++++++++++++++++-------- 8 files changed, 75 insertions(+), 26 deletions(-) diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index d29b70a28f2b..cbbc594ef592 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1637,13 +1637,19 @@ exp_verify_string(char *cp, int max) /* * Initialize the exports module. */ -void +int nfsd_export_init(void) { + int rv; dprintk("nfsd: initializing export module.\n"); - cache_register(&svc_export_cache); - cache_register(&svc_expkey_cache); + rv = cache_register(&svc_export_cache); + if (rv) + return rv; + rv = cache_register(&svc_expkey_cache); + if (rv) + cache_unregister(&svc_export_cache); + return rv; } diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index ef22179c49ad..996bd88b75ba 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -464,11 +464,18 @@ nametoid_update(struct ent *new, struct ent *old) * Exported API */ -void +int nfsd_idmap_init(void) { - cache_register(&idtoname_cache); - cache_register(&nametoid_cache); + int rv; + + rv = cache_register(&idtoname_cache); + if (rv) + return rv; + rv = cache_register(&nametoid_cache); + if (rv) + cache_unregister(&idtoname_cache); + return rv; } void diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 2b95597aa4a5..4aba92698581 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -707,9 +707,13 @@ static int __init init_nfsd(void) retval = nfsd_reply_cache_init(); if (retval) goto out_free_stat; - nfsd_export_init(); /* Exports table */ + retval = nfsd_export_init(); + if (retval) + goto out_free_cache; nfsd_lockd_init(); /* lockd->nfsd callbacks */ - nfsd_idmap_init(); /* Name to ID mapping */ + retval = nfsd_idmap_init(); + if (retval) + goto out_free_lockd; retval = create_proc_exports_entry(); if (retval) goto out_free_idmap; @@ -720,10 +724,12 @@ static int __init init_nfsd(void) out_free_all: remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); - nfsd_idmap_shutdown(); out_free_idmap: + nfsd_idmap_shutdown(); +out_free_lockd: nfsd_lockd_shutdown(); nfsd_export_shutdown(); +out_free_cache: nfsd_reply_cache_shutdown(); out_free_stat: nfsd_stat_shutdown(); diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h index bcb7abafbca9..3a1687251367 100644 --- a/include/linux/nfsd/export.h +++ b/include/linux/nfsd/export.h @@ -122,7 +122,7 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp); /* * Function declarations */ -void nfsd_export_init(void); +int nfsd_export_init(void); void nfsd_export_shutdown(void); void nfsd_export_flush(void); void exp_readlock(void); diff --git a/include/linux/nfsd_idmap.h b/include/linux/nfsd_idmap.h index e82746fcad14..d4a2ac18bd4c 100644 --- a/include/linux/nfsd_idmap.h +++ b/include/linux/nfsd_idmap.h @@ -44,11 +44,16 @@ #define 
IDMAP_NAMESZ 128 #ifdef CONFIG_NFSD_V4 -void nfsd_idmap_init(void); +int nfsd_idmap_init(void); void nfsd_idmap_shutdown(void); #else -static inline void nfsd_idmap_init(void) {}; -static inline void nfsd_idmap_shutdown(void) {}; +static inline int nfsd_idmap_init(void) +{ + return 0; +} +static inline void nfsd_idmap_shutdown(void) +{ +} #endif int nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *); diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h index b683b5ddeea9..03547d6abee5 100644 --- a/include/linux/sunrpc/cache.h +++ b/include/linux/sunrpc/cache.h @@ -169,7 +169,7 @@ extern int cache_check(struct cache_detail *detail, extern void cache_flush(void); extern void cache_purge(struct cache_detail *detail); #define NEVER (0x7FFFFFFF) -extern void cache_register(struct cache_detail *cd); +extern int cache_register(struct cache_detail *cd); extern void cache_unregister(struct cache_detail *cd); extern void qword_add(char **bpp, int *lp, char *str); diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index d329a12500aa..aa790bb4f7a1 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1386,10 +1386,19 @@ int gss_svc_init(void) { int rv = svc_auth_register(RPC_AUTH_GSS, &svcauthops_gss); - if (rv == 0) { - cache_register(&rsc_cache); - cache_register(&rsi_cache); - } + if (rv) + return rv; + rv = cache_register(&rsc_cache); + if (rv) + goto out1; + rv = cache_register(&rsi_cache); + if (rv) + goto out2; + return 0; +out2: + cache_unregister(&rsc_cache); +out1: + svc_auth_unregister(RPC_AUTH_GSS); return rv; } diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index f41a7cc4cf62..50b1a8b441fe 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -304,20 +304,21 @@ static void remove_cache_proc_entries(struct cache_detail *cd) remove_proc_entry(cd->name, proc_net_rpc); } -static void create_cache_proc_entries(struct cache_detail *cd) +#ifdef CONFIG_PROC_FS +static int create_cache_proc_entries(struct cache_detail *cd) { struct proc_dir_entry *p; cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc); if (cd->proc_ent == NULL) - return; + goto out_nomem; cd->proc_ent->owner = cd->owner; cd->channel_ent = cd->content_ent = NULL; p = create_proc_entry("flush", S_IFREG|S_IRUSR|S_IWUSR, cd->proc_ent); cd->flush_ent = p; if (p == NULL) - return; + goto out_nomem; p->proc_fops = &cache_flush_operations; p->owner = cd->owner; p->data = cd; @@ -327,7 +328,7 @@ static void create_cache_proc_entries(struct cache_detail *cd) cd->proc_ent); cd->channel_ent = p; if (p == NULL) - return; + goto out_nomem; p->proc_fops = &cache_file_operations; p->owner = cd->owner; p->data = cd; @@ -337,16 +338,30 @@ static void create_cache_proc_entries(struct cache_detail *cd) cd->proc_ent); cd->content_ent = p; if (p == NULL) - return; + goto out_nomem; p->proc_fops = &content_file_operations; p->owner = cd->owner; p->data = cd; } + return 0; +out_nomem: + remove_cache_proc_entries(cd); + return -ENOMEM; } - -void cache_register(struct cache_detail *cd) +#else /* CONFIG_PROC_FS */ +static int create_cache_proc_entries(struct cache_detail *cd) { - create_cache_proc_entries(cd); + return 0; +} +#endif + +int cache_register(struct cache_detail *cd) +{ + int ret; + + ret = create_cache_proc_entries(cd); + if (ret) + return ret; rwlock_init(&cd->hash_lock); INIT_LIST_HEAD(&cd->queue); spin_lock(&cache_list_lock); @@ -360,6 +375,7 @@ void cache_register(struct cache_detail *cd) /* start the cleaning process */ 
schedule_delayed_work(&cache_cleaner, 0); + return 0; } void cache_unregister(struct cache_detail *cd) From 2e8138a274d81d87591db0803b1e81f4284ff935 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 15 Nov 2007 17:05:43 -0500 Subject: [PATCH 024/100] nfsd: move nfsd/auth.h into fs/nfsd This header is used only in a few places in fs/nfsd, so there seems to be little point to having it in include/. (Thanks to Robert Day for pointing this out.) Cc: Robert P. J. Day Acked-by: NeilBrown Signed-off-by: J. Bruce Fields --- {include/linux => fs}/nfsd/auth.h | 0 fs/nfsd/nfs3xdr.c | 1 + fs/nfsd/nfsfh.c | 1 + fs/nfsd/nfsxdr.c | 1 + include/linux/nfsd/Kbuild | 1 - include/linux/nfsd/nfsd.h | 1 - include/linux/nfsd/syscall.h | 1 - 7 files changed, 3 insertions(+), 3 deletions(-) rename {include/linux => fs}/nfsd/auth.h (100%) diff --git a/include/linux/nfsd/auth.h b/fs/nfsd/auth.h similarity index 100% rename from include/linux/nfsd/auth.h rename to fs/nfsd/auth.h diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index be515c5a8154..4b1ffe3be7e2 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -21,6 +21,7 @@ #include #include #include +#include "auth.h" #define NFSDDBG_FACILITY NFSDDBG_XDR diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 468f17a78441..8fbd2dc08a92 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -22,6 +22,7 @@ #include #include #include +#include "auth.h" #define NFSDDBG_FACILITY NFSDDBG_FH diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 7003c313272f..61ad61743d94 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -15,6 +15,7 @@ #include #include #include +#include "auth.h" #define NFSDDBG_FACILITY NFSDDBG_XDR diff --git a/include/linux/nfsd/Kbuild b/include/linux/nfsd/Kbuild index d9c5455808e5..e726fc3a4375 100644 --- a/include/linux/nfsd/Kbuild +++ b/include/linux/nfsd/Kbuild @@ -4,4 +4,3 @@ unifdef-y += stats.h unifdef-y += syscall.h unifdef-y += nfsfh.h unifdef-y += debug.h -unifdef-y += auth.h diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index a51a30f30cee..8caf4c4f64e6 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -20,7 +20,6 @@ #include #include #include -#include #include /* * nfsd version diff --git a/include/linux/nfsd/syscall.h b/include/linux/nfsd/syscall.h index 8bcddccb6c42..4e439765b705 100644 --- a/include/linux/nfsd/syscall.h +++ b/include/linux/nfsd/syscall.h @@ -18,7 +18,6 @@ #include #include #include -#include /* * Version of the syscall interface From 1f69f172c73a2bf0bf55da9346da8dccea9035cf Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 15 Nov 2007 17:06:58 -0500 Subject: [PATCH 025/100] nfsd: minor fs/nfsd/auth.h cleanup While we're here, let's remove the redundant (and now wrong) pathname in the comment, and the #ifdef __KERNEL__'s. Acked-by: NeilBrown Signed-off-by: J. Bruce Fields --- fs/nfsd/auth.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/nfsd/auth.h b/fs/nfsd/auth.h index 0fb9f7212195..78b3c0e93822 100644 --- a/fs/nfsd/auth.h +++ b/fs/nfsd/auth.h @@ -1,6 +1,4 @@ /* - * include/linux/nfsd/auth.h - * * nfsd-specific authentication stuff. * uid/gid mapping not yet implemented. 
* @@ -10,8 +8,6 @@ #ifndef LINUX_NFSD_AUTH_H #define LINUX_NFSD_AUTH_H -#ifdef __KERNEL__ - #define nfsd_luid(rq, uid) ((u32)(uid)) #define nfsd_lgid(rq, gid) ((u32)(gid)) #define nfsd_ruid(rq, uid) ((u32)(uid)) @@ -23,5 +19,4 @@ */ int nfsd_setuser(struct svc_rqst *, struct svc_export *); -#endif /* __KERNEL__ */ #endif /* LINUX_NFSD_AUTH_H */ From a186e767473bd329122f0229b91573b9b6fa43c1 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 20 Nov 2007 16:11:27 -0500 Subject: [PATCH 026/100] nfsd4: kill some unneeded setclientid comments Most of these comments just summarize the code. The matching of code to the cases described in the RFC may still be useful, though; add specific section references to make that easier to follow. Also update references to the outdated RFC 3010. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 136 +++++++++++++------------------------------- 1 file changed, 40 insertions(+), 96 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9d81c7117ae6..242fee7c1018 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -683,39 +683,6 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se) return; } -/* - * RFC 3010 has a complex implmentation description of processing a - * SETCLIENTID request consisting of 5 bullets, labeled as - * CASE0 - CASE4 below. - * - * NOTES: - * callback information will be processed in a future patch - * - * an unconfirmed record is added when: - * NORMAL (part of CASE 4): there is no confirmed nor unconfirmed record. - * CASE 1: confirmed record found with matching name, principal, - * verifier, and clientid. - * CASE 2: confirmed record found with matching name, principal, - * and there is no unconfirmed record with matching - * name and principal - * - * an unconfirmed record is replaced when: - * CASE 3: confirmed record found with matching name, principal, - * and an unconfirmed record is found with matching - * name, principal, and with clientid and - * confirm that does not match the confirmed record. - * CASE 4: there is no confirmed record with matching name and - * principal. there is an unconfirmed record with - * matching name, principal. - * - * an unconfirmed record is deleted when: - * CASE 1: an unconfirmed record that matches input name, verifier, - * and confirmed clientid. - * CASE 4: any unconfirmed records with matching name and principal - * that exist after an unconfirmed record has been replaced - * as described above. - * - */ __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setclientid *setclid) @@ -748,11 +715,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_lock_state(); conf = find_confirmed_client_by_str(dname, strhashval); if (conf) { - /* - * CASE 0: - * clname match, confirmed, different principal - * or different ip_address - */ + /* RFC 3530 14.2.33 CASE 0: */ status = nfserr_clid_inuse; if (!same_creds(&conf->cl_cred, &rqstp->rq_cred) || conf->cl_addr != sin->sin_addr.s_addr) { @@ -761,12 +724,17 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } } + /* + * section 14.2.33 of RFC 3530 (under the heading "IMPLEMENTATION") + * has a description of SETCLIENTID request processing consisting + * of 5 bullet points, labeled as CASE0 - CASE4 below. + */ unconf = find_unconfirmed_client_by_str(dname, strhashval); status = nfserr_resource; if (!conf) { - /* - * CASE 4: - * placed first, because it is the normal case. 
+ /* + * RFC 3530 14.2.33 CASE 4: + * placed first, because it is the normal case */ if (unconf) expire_client(unconf); @@ -776,17 +744,8 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, gen_clid(new); } else if (same_verf(&conf->cl_verifier, &clverifier)) { /* - * CASE 1: - * cl_name match, confirmed, principal match - * verifier match: probable callback update - * - * remove any unconfirmed nfs4_client with - * matching cl_name, cl_verifier, and cl_clientid - * - * create and insert an unconfirmed nfs4_client with same - * cl_name, cl_verifier, and cl_clientid as existing - * nfs4_client, but with the new callback info and a - * new cl_confirm + * RFC 3530 14.2.33 CASE 1: + * probable callback update */ if (unconf) { /* Note this is removing unconfirmed {*x***}, @@ -802,32 +761,19 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, copy_clid(new, conf); } else if (!unconf) { /* - * CASE 2: - * clname match, confirmed, principal match - * verfier does not match - * no unconfirmed. create a new unconfirmed nfs4_client - * using input clverifier, clname, and callback info - * and generate a new cl_clientid and cl_confirm. + * RFC 3530 14.2.33 CASE 2: + * probable client reboot; state will be removed if + * confirmed. */ new = create_client(clname, dname); if (new == NULL) goto out; gen_clid(new); } else if (!same_verf(&conf->cl_confirm, &unconf->cl_confirm)) { - /* - * CASE3: - * confirmed found (name, principal match) - * confirmed verifier does not match input clverifier - * - * unconfirmed found (name match) - * confirmed->cl_confirm != unconfirmed->cl_confirm - * - * remove unconfirmed. - * - * create an unconfirmed nfs4_client - * with same cl_name as existing confirmed nfs4_client, - * but with new callback info, new cl_clientid, - * new cl_verifier and a new cl_confirm + /* + * RFC 3530 14.2.33 CASE 3: + * probable client reboot; state will be removed if + * confirmed. */ expire_client(unconf); new = create_client(clname, dname); @@ -857,11 +803,9 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* - * RFC 3010 has a complex implmentation description of processing a - * SETCLIENTID_CONFIRM request consisting of 4 bullets describing - * processing on a DRC miss, labeled as CASE1 - CASE4 below. - * - * NOTE: callback information will be processed here in a future patch + * Section 14.2.34 of RFC 3530 (under the heading "IMPLEMENTATION") has + * a description of SETCLIENTID_CONFIRM request processing consisting of 4 + * bullets, labeled as CASE1 - CASE4 below. */ __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp, @@ -892,16 +836,20 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, if (unconf && unconf->cl_addr != sin->sin_addr.s_addr) goto out; + /* + * section 14.2.34 of RFC 3530 has a description of + * SETCLIENTID_CONFIRM request processing consisting + * of 4 bullet points, labeled as CASE1 - CASE4 below. + */ if ((conf && unconf) && (same_verf(&unconf->cl_confirm, &confirm)) && (same_verf(&conf->cl_verifier, &unconf->cl_verifier)) && (same_name(conf->cl_recdir,unconf->cl_recdir)) && (!same_verf(&conf->cl_confirm, &unconf->cl_confirm))) { - /* CASE 1: - * unconf record that matches input clientid and input confirm. - * conf record that matches input clientid. 
- * conf and unconf records match names, verifiers - */ + /* + * RFC 3530 14.2.34 CASE 1: + * callback update + */ if (!same_creds(&conf->cl_cred, &unconf->cl_cred)) status = nfserr_clid_inuse; else { @@ -918,11 +866,10 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, ((conf && unconf) && (!same_verf(&conf->cl_verifier, &unconf->cl_verifier) || !same_name(conf->cl_recdir, unconf->cl_recdir)))) { - /* CASE 2: - * conf record that matches input clientid. - * if unconf record matches input clientid, then - * unconf->cl_name or unconf->cl_verifier don't match the - * conf record. + /* + * RFC 3530 14.2.34 CASE 2: + * probable retransmitted request; play it safe and + * do nothing. */ if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) status = nfserr_clid_inuse; @@ -930,10 +877,9 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, status = nfs_ok; } else if (!conf && unconf && same_verf(&unconf->cl_confirm, &confirm)) { - /* CASE 3: - * conf record not found. - * unconf record found. - * unconf->cl_confirm matches input confirm + /* + * RFC 3530 14.2.34 CASE 3: + * Normal case; new or rebooted client: */ if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred)) { status = nfserr_clid_inuse; @@ -954,11 +900,9 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm))) && (!unconf || (unconf && !same_verf(&unconf->cl_confirm, &confirm)))) { - /* CASE 4: - * conf record not found, or if conf, conf->cl_confirm does not - * match input confirm. - * unconf record not found, or if unconf, unconf->cl_confirm - * does not match input confirm. + /* + * RFC 3530 14.2.34 CASE 4: + * Client probably hasn't noticed that we rebooted yet. */ status = nfserr_stale_clientid; } else { From 49ba87811f34a0219dc7a373cd24aa68450f2058 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 19 Nov 2007 19:09:50 -0500 Subject: [PATCH 027/100] nfsd: eliminate final bogus case from setclientid logic We're supposed to generate a different cl_confirm verifier for each new client, so these two cl_confirm values should never be the same. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 242fee7c1018..035e70a01027 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -769,7 +769,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (new == NULL) goto out; gen_clid(new); - } else if (!same_verf(&conf->cl_confirm, &unconf->cl_confirm)) { + } else { /* * RFC 3530 14.2.33 CASE 3: * probable client reboot; state will be removed if * confirmed. */ @@ -780,11 +780,6 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (new == NULL) goto out; gen_clid(new); - } else { - /* No cases hit !!! */ - status = nfserr_inval; - goto out; - } copy_verf(new, &clverifier); new->cl_addr = sin->sin_addr.s_addr; From deda2faa8e71474c828d8eefc8bc0f19d02062ef Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 19 Nov 2007 20:31:04 -0500 Subject: [PATCH 028/100] nfsd: uniquify cl_confirm values Using a counter instead of the nanoseconds value seems more likely to produce a unique cl_confirm. Signed-off-by: J.
Bruce Fields --- fs/nfsd/nfs4state.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 035e70a01027..9f6322e830fa 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -491,15 +491,14 @@ gen_clid(struct nfs4_client *clp) { clp->cl_clientid.cl_id = current_clientid++; } -static void -gen_confirm(struct nfs4_client *clp) { - struct timespec tv; - u32 * p; +static void gen_confirm(struct nfs4_client *clp) +{ + static u32 i; + u32 *p; - tv = CURRENT_TIME; p = (u32 *)clp->cl_confirm.data; - *p++ = tv.tv_sec; - *p++ = tv.tv_nsec; + *p++ = get_seconds(); + *p++ = i++; } static int From f394baad139f8a67a40b4246d53d3b818af2eb88 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 20 Nov 2007 15:39:07 -0500 Subject: [PATCH 029/100] nfsd4: kill unnecessary same_name() in setclientid_confirm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If conf and unconf are both found in the lookup by cl_clientid, then they share the same cl_clientid. We always create a unique new cl_clientid field when creating a new client--the only exception is the "probable callback update" case in setclientid, where we copy the old cl_clientid from another clientid with the same name. Therefore two clients with the same cl_clientid field also always share the same cl_name field, and a couple of the checks here are redundant. Thanks to Simon Holm Thøgersen for a compile fix. Signed-off-by: J. Bruce Fields Cc: Simon Holm Thøgersen --- fs/nfsd/nfs4state.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9f6322e830fa..df3e7a7ad31e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -838,7 +836,6 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, if ((conf && unconf) && (same_verf(&unconf->cl_confirm, &confirm)) && (same_verf(&conf->cl_verifier, &unconf->cl_verifier)) && - (same_name(conf->cl_recdir,unconf->cl_recdir)) && (!same_verf(&conf->cl_confirm, &unconf->cl_confirm))) { /* * RFC 3530 14.2.34 CASE 1: @@ -858,8 +857,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, } } else if ((conf && !unconf) || ((conf && unconf) && - (!same_verf(&conf->cl_verifier, &unconf->cl_verifier) || - !same_name(conf->cl_recdir, unconf->cl_recdir)))) { + !same_verf(&conf->cl_verifier, &unconf->cl_verifier))) { /* * RFC 3530 14.2.34 CASE 2: * probable retransmitted request; play it safe and From f3aba4e5a1b963c8bd43394cb15fb9fb6a229cd2 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 20 Nov 2007 16:52:07 -0500 Subject: [PATCH 030/100] nfsd4: remove unnecessary cl_verifier check from setclientid_confirm Again, the only way conf and unconf can have the same clientid is if they were created in the "probable callback update" case of setclientid, in which case we already know that the cl_verifier fields must agree. Signed-off-by: J.
Bruce Fields --- fs/nfsd/nfs4state.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index df3e7a7ad31e..23b5fc71f9fb 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -837,7 +837,6 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, */ if ((conf && unconf) && (same_verf(&unconf->cl_confirm, &confirm)) && - (same_verf(&conf->cl_verifier, &unconf->cl_verifier)) && (!same_verf(&conf->cl_confirm, &unconf->cl_confirm))) { /* * RFC 3530 14.2.34 CASE 1: @@ -855,9 +854,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, status = nfs_ok; } - } else if ((conf && !unconf) || - ((conf && unconf) && - !same_verf(&conf->cl_verifier, &unconf->cl_verifier))) { + } else if (conf && !unconf) { /* * RFC 3530 14.2.34 CASE 2: * probable retransmitted request; play it safe and From 366e0c1d9116ed03320779ecf9c162204f4c712e Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 20 Nov 2007 15:54:10 -0500 Subject: [PATCH 031/100] nfsd4: kill unneeded cl_confirm check We generate a unique cl_confirm for every new client; so if we've already checked that this cl_confirm agrees with the cl_confirm of unconf, then we already know that it does not agree with the cl_confirm of conf. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 23b5fc71f9fb..60cc937b7076 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -835,9 +835,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, * SETCLIENTID_CONFIRM request processing consisting * of 4 bullet points, labeled as CASE1 - CASE4 below. */ - if ((conf && unconf) && - (same_verf(&unconf->cl_confirm, &confirm)) && - (!same_verf(&conf->cl_confirm, &unconf->cl_confirm))) { + if (conf && unconf && same_verf(&confirm, &unconf->cl_confirm)) { /* * RFC 3530 14.2.34 CASE 1: * callback update From 99d965eda736b839a63fe85438ee03a0f660053c Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 21 Nov 2007 14:10:07 -0500 Subject: [PATCH 032/100] nfsd: fix encode_entryplus_baggage() indentation Fix bizarre indentation. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs3xdr.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 4b1ffe3be7e2..d7647f70e02b 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -817,11 +817,11 @@ static __be32 * encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, struct svc_fh *fhp) { - p = encode_post_op_attr(cd->rqstp, p, fhp); - *p++ = xdr_one; /* yes, a file handle follows */ - p = encode_fh(p, fhp); - fh_put(fhp); - return p; + p = encode_post_op_attr(cd->rqstp, p, fhp); + *p++ = xdr_one; /* yes, a file handle follows */ + p = encode_fh(p, fhp); + fh_put(fhp); + return p; } static int From 5ec7b46c2f4a6f5e136188d598a3f9912ca922e9 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 21 Nov 2007 21:58:56 -0500 Subject: [PATCH 033/100] nfsd4: make current_clientid local Declare this variable in the one function where it's used, and clean up some minor style problems. Signed-off-by: J. 
Bruce Fields --- fs/nfsd/nfs4state.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 60cc937b7076..78b9139cdd0f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -61,7 +61,6 @@ static time_t lease_time = 90; /* default lease time */ static time_t user_lease_time = 90; static time_t boot_time; static int in_grace = 1; -static u32 current_clientid = 1; static u32 current_ownerid = 1; static u32 current_fileid = 1; static u32 current_delegid = 1; @@ -485,8 +484,10 @@ same_creds(struct svc_cred *cr1, struct svc_cred *cr2) return cr1->cr_uid == cr2->cr_uid; } -static void -gen_clid(struct nfs4_client *clp) { +static void gen_clid(struct nfs4_client *clp) +{ + static u32 current_clientid = 1; + clp->cl_clientid.cl_boot = boot_time; clp->cl_clientid.cl_id = current_clientid++; } From 35bba9a37e68c68a820a1a772f016255c0838f79 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 21 Nov 2007 22:07:08 -0500 Subject: [PATCH 034/100] nfsd4: miscellaneous nfs4state.c style fixes Fix various minor style violations. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 55 +++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 29 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 78b9139cdd0f..b9d395856b3a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -339,21 +339,20 @@ STALE_CLIENTID(clientid_t *clid) * This type of memory management is somewhat inefficient, but we use it * anyway since SETCLIENTID is not a common operation. */ -static inline struct nfs4_client * -alloc_client(struct xdr_netobj name) +static struct nfs4_client *alloc_client(struct xdr_netobj name) { struct nfs4_client *clp; - if ((clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL))!= NULL) { - if ((clp->cl_name.data = kmalloc(name.len, GFP_KERNEL)) != NULL) { - memcpy(clp->cl_name.data, name.data, name.len); - clp->cl_name.len = name.len; - } - else { - kfree(clp); - clp = NULL; - } + clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL); + if (clp == NULL) + return NULL; + clp->cl_name.data = kmalloc(name.len, GFP_KERNEL); + if (clp->cl_name.data == NULL) { + kfree(clp); + return NULL; } + memcpy(clp->cl_name.data, name.data, name.len); + clp->cl_name.len = name.len; return clp; } @@ -421,12 +420,13 @@ expire_client(struct nfs4_client *clp) put_nfs4_client(clp); } -static struct nfs4_client * -create_client(struct xdr_netobj name, char *recdir) { +static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir) +{ struct nfs4_client *clp; - if (!(clp = alloc_client(name))) - goto out; + clp = alloc_client(name); + if (clp == NULL) + return NULL; memcpy(clp->cl_recdir, recdir, HEXDIR_LEN); atomic_set(&clp->cl_count, 1); atomic_set(&clp->cl_callback.cb_set, 0); @@ -435,32 +435,30 @@ create_client(struct xdr_netobj name, char *recdir) { INIT_LIST_HEAD(&clp->cl_openowners); INIT_LIST_HEAD(&clp->cl_delegations); INIT_LIST_HEAD(&clp->cl_lru); -out: return clp; } -static void -copy_verf(struct nfs4_client *target, nfs4_verifier *source) { - memcpy(target->cl_verifier.data, source->data, sizeof(target->cl_verifier.data)); +static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) +{ + memcpy(target->cl_verifier.data, source->data, + sizeof(target->cl_verifier.data)); } -static void -copy_clid(struct nfs4_client *target, struct nfs4_client *source) { +static void copy_clid(struct nfs4_client *target, struct nfs4_client *source) +{ target->cl_clientid.cl_boot = 
source->cl_clientid.cl_boot; target->cl_clientid.cl_id = source->cl_clientid.cl_id; } -static void -copy_cred(struct svc_cred *target, struct svc_cred *source) { - +static void copy_cred(struct svc_cred *target, struct svc_cred *source) +{ target->cr_uid = source->cr_uid; target->cr_gid = source->cr_gid; target->cr_group_info = source->cr_group_info; get_group_info(target->cr_group_info); } -static inline int -same_name(const char *n1, const char *n2) +static int same_name(const char *n1, const char *n2) { return 0 == memcmp(n1, n2, HEXDIR_LEN); } @@ -502,9 +500,8 @@ static void gen_confirm(struct nfs4_client *clp) *p++ = i++; } -static int -check_name(struct xdr_netobj name) { - +static int check_name(struct xdr_netobj name) +{ if (name.len == 0) return 0; if (name.len > NFS4_OPAQUE_LIMIT) { From 404ec117be5d36e1a4c4582d0c518594333e32df Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 23 Nov 2007 22:26:18 -0500 Subject: [PATCH 035/100] nfsd4: recognize callback channel failure earlier When the callback channel fails, we inform the client of that by returning a cb_path_down error the next time it tries to renew its lease. If we wait most of a lease period before deciding that a callback has failed and that the callback channel is down, then we decrease the chances that the client will find out in time to do anything about it. So, mark the channel down as soon as we recognize that an rpc has failed. However, continue trying to recall delegations anyway, in hopes it will come back up. This will prevent more delegations from being given out, and ensure cb_path_down is returned to renew calls earlier, while still making the best effort to deliver recalls of existing delegations. Also fix a couple comments and remove a dprintk that doesn't seem likely to be useful. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4callback.c | 13 +++++-------- fs/nfsd/nfs4state.c | 5 ++++- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 6eb5cd2381ab..aae2b29ae2c9 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -457,9 +457,6 @@ nfsd4_cb_recall(struct nfs4_delegation *dp) int retries = 1; int status = 0; - if ((!atomic_read(&clp->cl_callback.cb_set)) || !clnt) - return; - cbr->cbr_trunc = 0; /* XXX need to implement truncate optimization */ cbr->cbr_dp = dp; @@ -468,6 +465,7 @@ nfsd4_cb_recall(struct nfs4_delegation *dp) switch (status) { case -EIO: /* Network partition? */ + atomic_set(&clp->cl_callback.cb_set, 0); case -EBADHANDLE: case -NFS4ERR_BAD_STATEID: /* Race: client probably got cb_recall @@ -480,11 +478,10 @@ nfsd4_cb_recall(struct nfs4_delegation *dp) status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT); } out_put_cred: - if (status == -EIO) - atomic_set(&clp->cl_callback.cb_set, 0); - /* Success or failure, now we're either waiting for lease expiration - * or deleg_return. */ - dprintk("NFSD: nfs4_cb_recall: dp %p dl_flock %p dl_count %d\n",dp, dp->dl_flock, atomic_read(&dp->dl_count)); + /* + * Success or failure, now we're either waiting for lease expiration + * or deleg_return.
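 * [Editor's note, not part of the patch] Note the ordering the hunk above
 * establishes: on -EIO the channel is marked down immediately
 * (cb_set = 0) and the case then falls through into the single retry, so
 * an existing delegation still gets a best-effort recall while RENEWs
 * begin seeing cb_path_down.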
+ */ put_nfs4_client(clp); nfs4_put_delegation(dp); return; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b9d395856b3a..11aa4b6b4fa2 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -361,8 +361,11 @@ shutdown_callback_client(struct nfs4_client *clp) { struct rpc_clnt *clnt = clp->cl_callback.cb_client; - /* shutdown rpc client, ending any outstanding recall rpcs */ if (clnt) { + /* + * Callback threads take a reference on the client, so there + * should be no outstanding callbacks at this point. + */ clp->cl_callback.cb_client = NULL; rpc_shutdown_client(clnt); } From b7e6b86948df8d08d420558212e09eb449be9bfa Mon Sep 17 00:00:00 2001 From: Oleg Drokin Date: Mon, 26 Nov 2007 13:35:11 -0500 Subject: [PATCH 036/100] lockd: fix reference count leaks in async locking case In a number of places where we wish only to translate nlm_drop_reply to rpc_drop_reply errors we instead return early with rpc_drop_reply, skipping some important end-of-function cleanup. This results in reference count leaks when lockd is doing posix locking on GFS2. Signed-off-by: Oleg Drokin Signed-off-by: J. Bruce Fields --- fs/lockd/svc4proc.c | 20 ++++++++++++-------- fs/lockd/svcproc.c | 22 +++++++++++++--------- 2 files changed, 25 insertions(+), 17 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index bf27b6c6cb6b..385437e3387d 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -84,6 +84,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, { struct nlm_host *host; struct nlm_file *file; + int rc = rpc_success; dprintk("lockd: TEST4 called\n"); resp->cookie = argp->cookie; @@ -91,7 +92,7 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, /* Don't accept test requests during grace period */ if (nlmsvc_grace_period) { resp->status = nlm_lck_denied_grace_period; - return rpc_success; + return rc; } /* Obtain client and file */ @@ -101,12 +102,13 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, /* Now check for conflicting locks */ resp->status = nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie); if (resp->status == nlm_drop_reply) - return rpc_drop_reply; + rc = rpc_drop_reply; + else + dprintk("lockd: TEST4 status %d\n", ntohl(resp->status)); - dprintk("lockd: TEST4 status %d\n", ntohl(resp->status)); nlm_release_host(host); nlm_release_file(file); - return rpc_success; + return rc; } static __be32 @@ -115,6 +117,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, { struct nlm_host *host; struct nlm_file *file; + int rc = rpc_success; dprintk("lockd: LOCK called\n"); @@ -123,7 +126,7 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, /* Don't accept new lock requests during grace period */ if (nlmsvc_grace_period && !argp->reclaim) { resp->status = nlm_lck_denied_grace_period; - return rpc_success; + return rc; } /* Obtain client and file */ @@ -146,12 +149,13 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->status = nlmsvc_lock(rqstp, file, &argp->lock, argp->block, &argp->cookie); if (resp->status == nlm_drop_reply) - return rpc_drop_reply; + rc = rpc_drop_reply; + else + dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); - dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); nlm_release_host(host); nlm_release_file(file); - return rpc_success; + return rc; } static __be32 diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 9cd5c8b37593..88379cc6e0b1 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ 
-113,6 +113,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, { struct nlm_host *host; struct nlm_file *file; + int rc = rpc_success; dprintk("lockd: TEST called\n"); resp->cookie = argp->cookie; @@ -120,7 +121,7 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, /* Don't accept test requests during grace period */ if (nlmsvc_grace_period) { resp->status = nlm_lck_denied_grace_period; - return rpc_success; + return rc; } /* Obtain client and file */ @@ -130,13 +131,14 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, /* Now check for conflicting locks */ resp->status = cast_status(nlmsvc_testlock(rqstp, file, &argp->lock, &resp->lock, &resp->cookie)); if (resp->status == nlm_drop_reply) - return rpc_drop_reply; + rc = rpc_drop_reply; + else + dprintk("lockd: TEST status %d vers %d\n", + ntohl(resp->status), rqstp->rq_vers); - dprintk("lockd: TEST status %d vers %d\n", - ntohl(resp->status), rqstp->rq_vers); nlm_release_host(host); nlm_release_file(file); - return rpc_success; + return rc; } static __be32 @@ -145,6 +147,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, { struct nlm_host *host; struct nlm_file *file; + int rc = rpc_success; dprintk("lockd: LOCK called\n"); @@ -153,7 +156,7 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, /* Don't accept new lock requests during grace period */ if (nlmsvc_grace_period && !argp->reclaim) { resp->status = nlm_lck_denied_grace_period; - return rpc_success; + return rc; } /* Obtain client and file */ @@ -176,12 +179,13 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->status = cast_status(nlmsvc_lock(rqstp, file, &argp->lock, argp->block, &argp->cookie)); if (resp->status == nlm_drop_reply) - return rpc_drop_reply; + rc = rpc_drop_reply; + else + dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); - dprintk("lockd: LOCK status %d\n", ntohl(resp->status)); nlm_release_host(host); nlm_release_file(file); - return rpc_success; + return rc; } static __be32 From 39325bd03fc16d903f1e0f51104436d939899c8c Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 26 Nov 2007 17:06:39 -0500 Subject: [PATCH 037/100] nfsd4: fix bad seqid on lock request incompatible with open mode The failure to return a stateowner from nfs4_preprocess_seqid_op() means, in the case where a lock request is of a type incompatible with an open (due to, e.g., an application attempting a write lock on a file open for read), that fs/nfsd/nfs4xdr.c:ENCODE_SEQID_OP_TAIL() never bumps the seqid as it should. The client, attempting to close the file afterwards, then gets an (incorrect) bad sequence id error. Worse, this prevents the open file from ever being closed, so we leak state. Thanks to Benny Halevy and Trond Myklebust for analysis, and to Steven Wilton for the report and extensive data-gathering. Cc: Benny Halevy Cc: Steven Wilton Cc: Trond Myklebust Signed-off-by: J.
Bruce Fields --- fs/nfsd/nfs4state.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 11aa4b6b4fa2..c4b10a1e6c30 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2093,8 +2093,10 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei goto check_replay; } + *stpp = stp; + *sopp = sop = stp->st_stateowner; + if (lock) { - struct nfs4_stateowner *sop = stp->st_stateowner; clientid_t *lockclid = &lock->v.new.clientid; struct nfs4_client *clp = sop->so_client; int lkflg = 0; @@ -2124,9 +2126,6 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei return nfserr_bad_stateid; } - *stpp = stp; - *sopp = sop = stp->st_stateowner; - /* * We now validate the seqid and stateid generation numbers. * For the moment, we ignore the possibility of From 406a7ea97d9dc1a9348ba92c4cd0e7c678185c4c Mon Sep 17 00:00:00 2001 From: Frank Filz Date: Tue, 27 Nov 2007 11:34:05 -0800 Subject: [PATCH 038/100] nfsd: Allow AIX client to read dir containing mountpoints This patch addresses a compatibility issue with a Linux NFS server and AIX NFS client. I have exported /export as fsid=0 with sec=krb5:krb5i I have mount --bind /home onto /export/home I have exported /export/home with sec=krb5i The AIX client mounts / -o sec=krb5:krb5i onto /mnt If I do an ls /mnt, the AIX client gets a permission error. Looking at the network trace, we see a READDIR looking for attributes FATTR4_RDATTR_ERROR and FATTR4_MOUNTED_ON_FILEID. The response gives a NFS4ERR_WRONGSEC which the AIX client is not expecting. Since the AIX client is only asking for an attribute that is an attribute of the parent file system (pseudo root in my example), it seems reasonable that there should not be an error. In discussing this issue with Bruce Fields, I initially proposed ignoring the error in nfsd4_encode_dirent_fattr() if all that was being asked for was FATTR4_RDATTR_ERROR and FATTR4_MOUNTED_ON_FILEID, however, Bruce suggested that we avoid calling cross_mnt() if only these attributes are requested. The following patch implements bypassing cross_mnt() if only FATTR4_RDATTR_ERROR and FATTR4_MOUNTED_ON_FILEID are requested. Since there is some complexity in the code in nfsd4_encode_fattr(), I didn't want to duplicate code (and introduce a maintenance nightmare), so I added a parameter to nfsd4_encode_fattr() that indicates whether it should ignore cross mounts and simply fill in the attribute using the passed in dentry as opposed to its parent. Signed-off-by: Frank Filz Signed-off-by: J.
Bruce Fields --- fs/nfsd/nfs4proc.c | 2 +- fs/nfsd/nfs4xdr.c | 27 ++++++++++++++++++++++----- include/linux/nfsd/xdr4.h | 2 +- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 18ead1790bb3..c593db047d8b 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -750,7 +750,7 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, cstate->current_fh.fh_export, cstate->current_fh.fh_dentry, buf, &count, verify->ve_bmval, - rqstp); + rqstp, 0); /* this means that nfsd4_encode_fattr() ran out of space */ if (status == nfserr_resource && count == 0) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index bf1e792a65a0..b0592e7c378d 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1453,7 +1453,7 @@ static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err) __be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, __be32 *buffer, int *countp, u32 *bmval, - struct svc_rqst *rqstp) + struct svc_rqst *rqstp, int ignore_crossmnt) { u32 bmval0 = bmval[0]; u32 bmval1 = bmval[1]; @@ -1833,7 +1833,12 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { if ((buflen -= 8) < 0) goto out_resource; - if (exp->ex_mnt->mnt_root->d_inode == dentry->d_inode) { + /* + * Get parent's attributes if not ignoring crossmount + * and this is the root of a cross-mounted filesystem. + */ + if (ignore_crossmnt == 0 && + exp->ex_mnt->mnt_root->d_inode == dentry->d_inode) { err = vfs_getattr(exp->ex_mnt->mnt_parent, exp->ex_mnt->mnt_mountpoint, &stat); if (err) @@ -1869,13 +1874,25 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd, struct svc_export *exp = cd->rd_fhp->fh_export; struct dentry *dentry; __be32 nfserr; + int ignore_crossmnt = 0; dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen); if (IS_ERR(dentry)) return nfserrno(PTR_ERR(dentry)); exp_get(exp); - if (d_mountpoint(dentry)) { + /* + * In the case of a mountpoint, the client may be asking for + * attributes that are only properties of the underlying filesystem + * as opposed to the cross-mounted file system. In such a case, + * we will not follow the cross mount and will fill the attribtutes + * directly from the mountpoint dentry. 
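 * [Editor's note, not part of the patch] Concretely: the test added just
 * below masks FATTR4_WORD0_RDATTR_ERROR and FATTR4_WORD1_MOUNTED_ON_FILEID
 * out of the two request bitmaps and skips the cross-mount handling only
 * when nothing else remains set, i.e. when the client asked for nothing
 * that depends on the crossed-into filesystem.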
+ */ + if (d_mountpoint(dentry) && + (cd->rd_bmval[0] & ~FATTR4_WORD0_RDATTR_ERROR) == 0 && + (cd->rd_bmval[1] & ~FATTR4_WORD1_MOUNTED_ON_FILEID) == 0) + ignore_crossmnt = 1; + else if (d_mountpoint(dentry)) { int err; /* @@ -1894,7 +1911,7 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd, } nfserr = nfsd4_encode_fattr(NULL, exp, dentry, p, buflen, cd->rd_bmval, - cd->rd_rqstp); + cd->rd_rqstp, ignore_crossmnt); out_put: dput(dentry); exp_put(exp); @@ -2048,7 +2065,7 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 buflen = resp->end - resp->p - (COMPOUND_ERR_SLACK_SPACE >> 2); nfserr = nfsd4_encode_fattr(fhp, fhp->fh_export, fhp->fh_dentry, resp->p, &buflen, getattr->ga_bmval, - resp->rqstp); + resp->rqstp, 0); if (!nfserr) resp->p += buflen; return nfserr; diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h index b0ddfb41c790..27bd3e38ec5a 100644 --- a/include/linux/nfsd/xdr4.h +++ b/include/linux/nfsd/xdr4.h @@ -441,7 +441,7 @@ void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *); void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op); __be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, __be32 *buffer, int *countp, - u32 *bmval, struct svc_rqst *); + u32 *bmval, struct svc_rqst *, int ignore_crossmnt); extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_setclientid *setclid); From 29dbf546159f5701e11de26fa2da5c4a962e0f83 Mon Sep 17 00:00:00 2001 From: Oleg Drokin Date: Thu, 29 Nov 2007 14:02:21 -0500 Subject: [PATCH 039/100] lockd: fix a leak in nlmsvc_testlock asynchronous request handling Without the patch, there is a leak of the nlm_block structure refcount, which holds a reference to the nlm_file structure, which in turn holds a reference to struct file, when async GETFL is used (-EINPROGRESS return from file_ops->lock()), and also in some error cases. Fix up a style nit while we're here. Signed-off-by: Oleg Drokin Signed-off-by: J. Bruce Fields --- fs/lockd/svclock.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index d120ec39bcb0..84c4d5e04ebb 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -501,25 +501,29 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, block, block->b_flags, block->b_fl); if (block->b_flags & B_TIMED_OUT) { nlmsvc_unlink_block(block); - return nlm_lck_denied; + ret = nlm_lck_denied; + goto out; } if (block->b_flags & B_GOT_CALLBACK) { if (block->b_fl != NULL && block->b_fl->fl_type != F_UNLCK) { lock->fl = *block->b_fl; goto conf_lock; - } - else { + } else { nlmsvc_unlink_block(block); - return nlm_granted; + ret = nlm_granted; + goto out; } } - return nlm_drop_reply; + ret = nlm_drop_reply; + goto out; } error = vfs_test_lock(file->f_file, &lock->fl); - if (error == -EINPROGRESS) - return nlmsvc_defer_lock_rqst(rqstp, block); + if (error == -EINPROGRESS) { + ret = nlmsvc_defer_lock_rqst(rqstp, block); + goto out; + } if (error) { ret = nlm_lck_denied_nolocks; goto out; From 5c002b3bb294a637312cab7ad92a0deafa05a758 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 30 Nov 2007 16:55:23 -0500 Subject: [PATCH 040/100] nfsd: allow root to set uid and gid on create The server silently ignores attempts to set the uid and gid on create. Based on the comment, this appears to have been done to prevent some overly-clever IRIX client from causing itself problems.
Perhaps we should remove that hack completely. For now, at least, it makes sense to allow root (when no_root_squash is set) to set uid and gid. While we're there, since nfsd_create and nfsd_create_v3 share the same logic, pull that out into a separate function. And spell out the individual modifications of ia_valid instead of doing them both at once inside a conditional. Thanks to Roger Willcocks for the bug report and original patch on which this is based. Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 47 ++++++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 755ba43c13e1..cc75e4fcd02b 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1151,6 +1151,26 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, } #endif /* CONFIG_NFSD_V3 */ +__be32 +nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp, + struct iattr *iap) +{ + /* + * Mode has already been set earlier in create: + */ + iap->ia_valid &= ~ATTR_MODE; + /* + * Setting uid/gid works only for root. Irix appears to + * send along the gid on create when it tries to implement + * setgid directories via NFS: + */ + if (current->fsuid != 0) + iap->ia_valid &= ~(ATTR_UID|ATTR_GID); + if (iap->ia_valid) + return nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); + return 0; +} + /* * Create a file (regular, directory, device, fifo); UNIX sockets * not yet implemented. @@ -1167,6 +1187,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, struct dentry *dentry, *dchild = NULL; struct inode *dirp; __be32 err; + __be32 err2; int host_err; err = nfserr_perm; @@ -1257,16 +1278,9 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, } - /* Set file attributes. Mode has already been set and - * setting uid/gid works only for root. Irix appears to - * send along the gid when it tries to implement setgid - * directories via NFS. - */ - if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) { - __be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); - if (err2) - err = err2; - } + err2 = nfsd_create_setattr(rqstp, resfhp, iap); + if (err2) + err = err2; /* * Update the file handle to get the new inode info. */ @@ -1295,6 +1309,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, struct dentry *dentry, *dchild = NULL; struct inode *dirp; __be32 err; + __be32 err2; int host_err; __u32 v_mtime=0, v_atime=0; @@ -1399,16 +1414,10 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, iap->ia_atime.tv_nsec = 0; } - /* Set file attributes. - * Irix appears to send along the gid when it tries to - * implement setgid directories via NFS. Clear out all that cruft. - */ set_attr: - if ((iap->ia_valid &= ~(ATTR_UID|ATTR_GID|ATTR_MODE)) != 0) { - __be32 err2 = nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0); - if (err2) - err = err2; - } + err2 = nfsd_create_setattr(rqstp, resfhp, iap); + if (err2) + err = err2; /* * Update the filehandle to get the new inode info. From 980e5a40a44400edc3f75b7931b8e75fcc3c21a3 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 12 Dec 2007 18:21:17 -0500 Subject: [PATCH 041/100] nfsd: fix rsi_cache reference count leak For some reason we haven't been put()'ing the reference count here. Signed-off-by: J. 
Bruce Fields --- net/sunrpc/auth_gss/svcauth_gss.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index aa790bb4f7a1..688cc31040f3 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -975,6 +975,7 @@ static int svcauth_gss_handle_init(struct svc_rqst *rqstp, struct kvec *resv = &rqstp->rq_res.head[0]; struct xdr_netobj tmpobj; struct rsi *rsip, rsikey; + int ret; /* Read the verifier; should be NULL: */ *authp = rpc_autherr_badverf; @@ -1014,23 +1015,27 @@ static int svcauth_gss_handle_init(struct svc_rqst *rqstp, /* No upcall result: */ return SVC_DROP; case 0: + ret = SVC_DROP; /* Got an answer to the upcall; use it: */ if (gss_write_init_verf(rqstp, rsip)) - return SVC_DROP; + goto out; if (resv->iov_len + 4 > PAGE_SIZE) - return SVC_DROP; + goto out; svc_putnl(resv, RPC_SUCCESS); if (svc_safe_putnetobj(resv, &rsip->out_handle)) - return SVC_DROP; + goto out; if (resv->iov_len + 3 * 4 > PAGE_SIZE) - return SVC_DROP; + goto out; svc_putnl(resv, rsip->major_status); svc_putnl(resv, rsip->minor_status); svc_putnl(resv, GSS_SEQ_WIN); if (svc_safe_putnetobj(resv, &rsip->out_token)) - return SVC_DROP; + goto out; } - return SVC_COMPLETE; + ret = SVC_COMPLETE; +out: + cache_put(&rsip->h, &rsi_cache); + return ret; } /* From 16141c0288f6a6ce2de51a1c39cc3407c841a535 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Tue, 11 Dec 2007 16:16:12 -0800 Subject: [PATCH 042/100] knfsd: change mailing list for nfsd in MAINTAINERS nfs@lists.sourceforge.net is being decommissioned. Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: J. Bruce Fields --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 91082e60d289..6cae13718925 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2247,7 +2247,7 @@ P: J. Bruce Fields M: bfields@fieldses.org P: Neil Brown M: neilb@suse.de -L: nfs@lists.sourceforge.net +L: linux-nfs@vger.kernel.org W: http://nfs.sourceforge.net/ S: Supported From b39c18fce003bb2d5a51a4734d8fdd2c81fa1a78 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sun, 6 Jan 2008 21:32:37 -0500 Subject: [PATCH 043/100] sunrpc: gss: simplify rsi_parse logic Make an obvious simplification that removes a few lines and some unnecessary indentation; no change in behavior. Signed-off-by: J. 
Bruce Fields --- net/sunrpc/auth_gss/svcauth_gss.c | 52 ++++++++++++++----------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 688cc31040f3..e8ed848ecd67 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -224,38 +224,34 @@ static int rsi_parse(struct cache_detail *cd, /* major/minor */ len = qword_get(&mesg, buf, mlen); + if (len <= 0) + goto out; + rsii.major_status = simple_strtoul(buf, &ep, 10); + if (*ep) + goto out; + len = qword_get(&mesg, buf, mlen); + if (len <= 0) + goto out; + rsii.minor_status = simple_strtoul(buf, &ep, 10); + if (*ep) + goto out; + + /* out_handle */ + len = qword_get(&mesg, buf, mlen); if (len < 0) goto out; - if (len == 0) { + status = -ENOMEM; + if (dup_to_netobj(&rsii.out_handle, buf, len)) goto out; - } else { - rsii.major_status = simple_strtoul(buf, &ep, 10); - if (*ep) - goto out; - len = qword_get(&mesg, buf, mlen); - if (len <= 0) - goto out; - rsii.minor_status = simple_strtoul(buf, &ep, 10); - if (*ep) - goto out; - /* out_handle */ - len = qword_get(&mesg, buf, mlen); - if (len < 0) - goto out; - status = -ENOMEM; - if (dup_to_netobj(&rsii.out_handle, buf, len)) - goto out; - - /* out_token */ - len = qword_get(&mesg, buf, mlen); - status = -EINVAL; - if (len < 0) - goto out; - status = -ENOMEM; - if (dup_to_netobj(&rsii.out_token, buf, len)) - goto out; - } + /* out_token */ + len = qword_get(&mesg, buf, mlen); + status = -EINVAL; + if (len < 0) + goto out; + status = -ENOMEM; + if (dup_to_netobj(&rsii.out_token, buf, len)) + goto out; rsii.h.expiry_time = expiry; rsip = rsi_update(&rsii, rsip); status = 0; From 8838dc43d6544570e8969a74ddc4a0d21abffde6 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 14 Jan 2008 13:12:19 -0500 Subject: [PATCH 044/100] nfsd4: clean up access_valid, deny_valid checks. Document these checks a little better and inline, as suggested by Neil Brown (note both functions have two callers). Remove an obviously bogus check while we're there (checking whether unsigned value is negative). Signed-off-by: J. Bruce Fields Cc: Neil Brown --- fs/nfsd/nfs4state.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c4b10a1e6c30..f6744bc03dae 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1157,14 +1157,19 @@ find_file(struct inode *ino) return NULL; } -static int access_valid(u32 x) +static inline int access_valid(u32 x) { - return (x > 0 && x < 4); + if (x < NFS4_SHARE_ACCESS_READ) + return 0; + if (x > NFS4_SHARE_ACCESS_BOTH) + return 0; + return 1; } -static int deny_valid(u32 x) +static inline int deny_valid(u32 x) { - return (x >= 0 && x < 5); + /* Note: unlike access bits, deny bits may be zero. */ + return x <= NFS4_SHARE_DENY_BOTH; } static void From 54ca95eb362d6988a577965ffb77c08702adb890 Mon Sep 17 00:00:00 2001 From: Oleg Drokin Date: Fri, 11 Jan 2008 21:57:35 -0500 Subject: [PATCH 045/100] Leak in nlmsvc_testlock for async GETFL case Fix nlm_block leak for the case of supplied blocking lock info. Signed-off-by: Oleg Drokin Signed-off-by: J. 
Bruce Fields --- fs/lockd/svclock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 84c4d5e04ebb..2f4d8fa66689 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -505,12 +505,12 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } if (block->b_flags & B_GOT_CALLBACK) { + nlmsvc_unlink_block(block); if (block->b_fl != NULL && block->b_fl->fl_type != F_UNLCK) { lock->fl = *block->b_fl; goto conf_lock; } else { - nlmsvc_unlink_block(block); ret = nlm_granted; goto out; } From cb5c7d668e1af269a9409721268f027b86abf29c Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 14 Jan 2008 16:05:07 -0500 Subject: [PATCH 046/100] svcrpc: ensure gss DESTROY tokens free contexts from cache If we don't do this then we'll end up with a pointless unusable context sitting in the cache until the time the original context would have expired. Signed-off-by: J. Bruce Fields --- net/sunrpc/auth_gss/svcauth_gss.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index e8ed848ecd67..481f984e9a22 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1126,6 +1126,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) case RPC_GSS_PROC_DESTROY: if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) goto auth_err; + rsci->h.expiry_time = get_seconds(); set_bit(CACHE_NEGATIVE, &rsci->h.flags); if (resv->iov_len + 4 > PAGE_SIZE) goto drop; From 1d8206b97a09e7ff2fbef17d8d1ea008d764eeaa Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:07:15 -0600 Subject: [PATCH 047/100] svc: Add an svc transport class The transport class (svc_xprt_class) represents a type of transport, e.g. udp, tcp, rdma. A transport class has a unique name and a set of transport operations kept in the svc_xprt_ops structure. A transport class can be dynamically registered and unregistered. The svc_xprt_class represents the module that implements the transport type and keeps reference counts on the module to avoid unloading while there are active users. The endpoint (svc_xprt) is a generic, transport independent endpoint that can be used to send and receive data for an RPC service. It inherits its operations from the transport class. A transport driver module registers and unregisters itself with svc sunrpc by calling svc_reg_xprt_class, and svc_unreg_xprt_class respectively. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J.
Bruce Fields --- include/linux/sunrpc/debug.h | 1 + include/linux/sunrpc/svc_xprt.h | 31 ++++++++++++ net/sunrpc/Makefile | 3 +- net/sunrpc/svc_xprt.c | 83 +++++++++++++++++++++++++++++++++ 4 files changed, 117 insertions(+), 1 deletion(-) create mode 100644 include/linux/sunrpc/svc_xprt.h create mode 100644 net/sunrpc/svc_xprt.c diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index 3912cf16361e..092fcfa9ceb9 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -21,6 +21,7 @@ #define RPCDBG_SCHED 0x0040 #define RPCDBG_TRANS 0x0080 #define RPCDBG_SVCSOCK 0x0100 +#define RPCDBG_SVCXPRT 0x0100 #define RPCDBG_SVCDSP 0x0200 #define RPCDBG_MISC 0x0400 #define RPCDBG_CACHE 0x0800 diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h new file mode 100644 index 000000000000..fe8e787bb95d --- /dev/null +++ b/include/linux/sunrpc/svc_xprt.h @@ -0,0 +1,31 @@ +/* + * linux/include/linux/sunrpc/svc_xprt.h + * + * RPC server transport I/O + */ + +#ifndef SUNRPC_SVC_XPRT_H +#define SUNRPC_SVC_XPRT_H + +#include + +struct svc_xprt_ops { +}; + +struct svc_xprt_class { + const char *xcl_name; + struct module *xcl_owner; + struct svc_xprt_ops *xcl_ops; + struct list_head xcl_list; +}; + +struct svc_xprt { + struct svc_xprt_class *xpt_class; + struct svc_xprt_ops *xpt_ops; +}; + +int svc_reg_xprt_class(struct svc_xprt_class *); +void svc_unreg_xprt_class(struct svc_xprt_class *); +void svc_xprt_init(struct svc_xprt_class *, struct svc_xprt *); + +#endif /* SUNRPC_SVC_XPRT_H */ diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile index 5c69a725e530..92e1dbe50947 100644 --- a/net/sunrpc/Makefile +++ b/net/sunrpc/Makefile @@ -11,6 +11,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ auth.o auth_null.o auth_unix.o \ svc.o svcsock.o svcauth.o svcauth_unix.o \ rpcb_clnt.o timer.o xdr.o \ - sunrpc_syms.o cache.o rpc_pipe.o + sunrpc_syms.o cache.o rpc_pipe.o \ + svc_xprt.o sunrpc-$(CONFIG_PROC_FS) += stats.o sunrpc-$(CONFIG_SYSCTL) += sysctl.o diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c new file mode 100644 index 000000000000..fe5270fae336 --- /dev/null +++ b/net/sunrpc/svc_xprt.c @@ -0,0 +1,83 @@ +/* + * linux/net/sunrpc/svc_xprt.c + * + * Author: Tom Tucker + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +/* List of registered transport classes */ +static DEFINE_SPINLOCK(svc_xprt_class_lock); +static LIST_HEAD(svc_xprt_class_list); + +int svc_reg_xprt_class(struct svc_xprt_class *xcl) +{ + struct svc_xprt_class *cl; + int res = -EEXIST; + + dprintk("svc: Adding svc transport class '%s'\n", xcl->xcl_name); + + INIT_LIST_HEAD(&xcl->xcl_list); + spin_lock(&svc_xprt_class_lock); + /* Make sure there isn't already a class with the same name */ + list_for_each_entry(cl, &svc_xprt_class_list, xcl_list) { + if (strcmp(xcl->xcl_name, cl->xcl_name) == 0) + goto out; + } + list_add_tail(&xcl->xcl_list, &svc_xprt_class_list); + res = 0; +out: + spin_unlock(&svc_xprt_class_lock); + return res; +} +EXPORT_SYMBOL_GPL(svc_reg_xprt_class); + +void svc_unreg_xprt_class(struct svc_xprt_class *xcl) +{ + dprintk("svc: Removing svc transport class '%s'\n", xcl->xcl_name); + spin_lock(&svc_xprt_class_lock); + list_del_init(&xcl->xcl_list); + 
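/* [Editor's note, not part of the patch] list_del_init() rather than
 * list_del() here leaves xcl_list pointing at itself, so an unregistered
 * class is left in a state where svc_reg_xprt_class() can safely be
 * called on it again. */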
spin_unlock(&svc_xprt_class_lock); +} +EXPORT_SYMBOL_GPL(svc_unreg_xprt_class); + +/* + * Called by transport drivers to initialize the transport independent + * portion of the transport instance. + */ +void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt) +{ + memset(xprt, 0, sizeof(*xprt)); + xprt->xpt_class = xcl; + xprt->xpt_ops = xcl->xcl_ops; +} +EXPORT_SYMBOL_GPL(svc_xprt_init); From 360d873864c8903a650b227758b49dd50e6ecc9f Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:07:17 -0600 Subject: [PATCH 048/100] svc: Make svc_sock the tcp/udp transport Make TCP and UDP svc_sock transports, and register them with the svc transport core. A transport type (svc_sock) has an svc_xprt as its first member, and calls svc_xprt_init to initialize this field. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/debug.h | 1 - include/linux/sunrpc/svcsock.h | 4 ++++ net/sunrpc/sunrpc_syms.c | 4 +++- net/sunrpc/svcsock.c | 32 +++++++++++++++++++++++++++++++- 4 files changed, 38 insertions(+), 3 deletions(-) diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index 092fcfa9ceb9..10709cbe96fd 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -20,7 +20,6 @@ #define RPCDBG_BIND 0x0020 #define RPCDBG_SCHED 0x0040 #define RPCDBG_TRANS 0x0080 -#define RPCDBG_SVCSOCK 0x0100 #define RPCDBG_SVCXPRT 0x0100 #define RPCDBG_SVCDSP 0x0200 #define RPCDBG_MISC 0x0400 diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index a53e0fa855d2..1878cbe1aa4f 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -10,11 +10,13 @@ #define SUNRPC_SVCSOCK_H #include +#include /* * RPC server socket. 
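 * [Editor's note, not part of the patch] The svc_xprt added as the first
 * member of svc_sock just below is the point of this patch: the socket
 * code initializes it via svc_xprt_init(), and later patches in the
 * series let generic code treat a pointer to the svc_sock's embedded
 * sk_xprt as the transport itself.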
*/ struct svc_sock { + struct svc_xprt sk_xprt; struct list_head sk_ready; /* list of ready sockets */ struct list_head sk_list; /* list of all sockets */ struct socket * sk_sock; /* berkeley socket layer */ @@ -78,6 +80,8 @@ int svc_addsock(struct svc_serv *serv, int fd, char *name_return, int *proto); +void svc_init_xprt_sock(void); +void svc_cleanup_xprt_sock(void); /* * svc_makesock socket characteristics diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index ef7dc78e2c7b..11b309817b8f 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -85,7 +85,8 @@ init_sunrpc(void) #endif cache_register(&ip_map_cache); cache_register(&unix_gid_cache); - init_socket_xprt(); + svc_init_xprt_sock(); /* svc sock transport */ + init_socket_xprt(); /* clnt sock transport */ rpcauth_init_module(); out: return err; @@ -96,6 +97,7 @@ cleanup_sunrpc(void) { rpcauth_remove_module(); cleanup_socket_xprt(); + svc_cleanup_xprt_sock(); unregister_rpc_pipefs(); rpc_destroy_mempool(); cache_unregister(&ip_map_cache); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index c75bffeb89eb..54f1b3d993a6 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -75,7 +75,7 @@ * */ -#define RPCDBG_FACILITY RPCDBG_SVCSOCK +#define RPCDBG_FACILITY RPCDBG_SVCXPRT static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, @@ -900,12 +900,21 @@ svc_udp_sendto(struct svc_rqst *rqstp) return error; } +static struct svc_xprt_ops svc_udp_ops = { +}; + +static struct svc_xprt_class svc_udp_class = { + .xcl_name = "udp", + .xcl_ops = &svc_udp_ops, +}; + static void svc_udp_init(struct svc_sock *svsk) { int one = 1; mm_segment_t oldfs; + svc_xprt_init(&svc_udp_class, &svsk->sk_xprt); svsk->sk_sk->sk_data_ready = svc_udp_data_ready; svsk->sk_sk->sk_write_space = svc_write_space; @@ -1344,12 +1353,33 @@ svc_tcp_sendto(struct svc_rqst *rqstp) return sent; } +static struct svc_xprt_ops svc_tcp_ops = { +}; + +static struct svc_xprt_class svc_tcp_class = { + .xcl_name = "tcp", + .xcl_ops = &svc_tcp_ops, +}; + +void svc_init_xprt_sock(void) +{ + svc_reg_xprt_class(&svc_tcp_class); + svc_reg_xprt_class(&svc_udp_class); +} + +void svc_cleanup_xprt_sock(void) +{ + svc_unreg_xprt_class(&svc_tcp_class); + svc_unreg_xprt_class(&svc_udp_class); +} + static void svc_tcp_init(struct svc_sock *svsk) { struct sock *sk = svsk->sk_sk; struct tcp_sock *tp = tcp_sk(sk); + svc_xprt_init(&svc_tcp_class, &svsk->sk_xprt); svsk->sk_recvfrom = svc_tcp_recvfrom; svsk->sk_sendto = svc_tcp_sendto; From 9f29868b491beee706931e0cf875a60cb4688754 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:07:19 -0600 Subject: [PATCH 049/100] svc: Change the svc_sock in the rqstp structure to a transport The rqstp structure contains a pointer to the transport for the RPC request. This functionally trivial patch adds an unnamed union with pointers to both svc_sock and svc_xprt. Ultimately the union will be removed and only the rq_xprt field will remain. This allows incrementally extracting transport independent interfaces without one gigundo patch. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J.
Bruce Fields --- include/linux/sunrpc/svc.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 8531a70da73d..37f7448c997e 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -204,7 +204,10 @@ union svc_addr_u { struct svc_rqst { struct list_head rq_list; /* idle list */ struct list_head rq_all; /* all threads list */ - struct svc_sock * rq_sock; /* socket */ + union { + struct svc_xprt * rq_xprt; /* transport ptr */ + struct svc_sock * rq_sock; /* socket ptr */ + }; struct sockaddr_storage rq_addr; /* peer address */ size_t rq_addrlen; From 490231558e058547da4ffab6d8ce8e28771749cc Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:07:21 -0600 Subject: [PATCH 050/100] svc: Add a max payload value to the transport The svc_max_payload function currently looks at the socket type to determine the max payload. Add a max payload value to svc_xprt_class so it can be returned directly. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 1 + net/sunrpc/svc.c | 4 +--- net/sunrpc/svcsock.c | 2 ++ 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index fe8e787bb95d..187dc4e2e202 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -17,6 +17,7 @@ struct svc_xprt_class { struct module *xcl_owner; struct svc_xprt_ops *xcl_ops; struct list_head xcl_list; + u32 xcl_max_payload; }; struct svc_xprt { diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 4ad5fbbb18b4..94eed9b80038 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1055,10 +1055,8 @@ svc_process(struct svc_rqst *rqstp) */ u32 svc_max_payload(const struct svc_rqst *rqstp) { - int max = RPCSVC_MAXPAYLOAD_TCP; + u32 max = rqstp->rq_xprt->xpt_class->xcl_max_payload; - if (rqstp->rq_sock->sk_sock->type == SOCK_DGRAM) - max = RPCSVC_MAXPAYLOAD_UDP; if (rqstp->rq_server->sv_max_payload < max) max = rqstp->rq_server->sv_max_payload; return max; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 54f1b3d993a6..c507f6f8ee54 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -906,6 +906,7 @@ static struct svc_xprt_ops svc_udp_ops = { static struct svc_xprt_class svc_udp_class = { .xcl_name = "udp", .xcl_ops = &svc_udp_ops, + .xcl_max_payload = RPCSVC_MAXPAYLOAD_UDP, }; static void @@ -1359,6 +1360,7 @@ static struct svc_xprt_ops svc_tcp_ops = { static struct svc_xprt_class svc_tcp_class = { .xcl_name = "tcp", .xcl_ops = &svc_tcp_ops, + .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP, }; void svc_init_xprt_sock(void) From 5d137990f5860451a6e0428e0903f62933d05287 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:07:23 -0600 Subject: [PATCH 051/100] svc: Move sk_sendto and sk_recvfrom to svc_xprt_class The sk_sendto and sk_recvfrom are function pointers that allow svc_sock to be used for both UDP and TCP. Move these function pointers to the svc_xprt_ops structure. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 2 ++ include/linux/sunrpc/svcsock.h | 3 --- net/sunrpc/svcsock.c | 12 ++++++------ 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 187dc4e2e202..7ae6c857b05d 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -10,6 +10,8 @@ #include struct svc_xprt_ops { + int (*xpo_recvfrom)(struct svc_rqst *); + int (*xpo_sendto)(struct svc_rqst *); }; struct svc_xprt_class { diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index 1878cbe1aa4f..08e78d0a364f 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -45,9 +45,6 @@ struct svc_sock { * be revisted */ struct mutex sk_mutex; /* to serialize sending data */ - int (*sk_recvfrom)(struct svc_rqst *rqstp); - int (*sk_sendto)(struct svc_rqst *rqstp); - /* We keep the old state_change and data_ready CB's here */ void (*sk_ostate)(struct sock *); void (*sk_odata)(struct sock *, int bytes); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index c507f6f8ee54..7817c7eea753 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -901,6 +901,8 @@ svc_udp_sendto(struct svc_rqst *rqstp) } static struct svc_xprt_ops svc_udp_ops = { + .xpo_recvfrom = svc_udp_recvfrom, + .xpo_sendto = svc_udp_sendto, }; static struct svc_xprt_class svc_udp_class = { @@ -918,8 +920,6 @@ svc_udp_init(struct svc_sock *svsk) svc_xprt_init(&svc_udp_class, &svsk->sk_xprt); svsk->sk_sk->sk_data_ready = svc_udp_data_ready; svsk->sk_sk->sk_write_space = svc_write_space; - svsk->sk_recvfrom = svc_udp_recvfrom; - svsk->sk_sendto = svc_udp_sendto; /* initialise setting must have enough space to * receive and respond to one request. @@ -1355,6 +1355,8 @@ svc_tcp_sendto(struct svc_rqst *rqstp) } static struct svc_xprt_ops svc_tcp_ops = { + .xpo_recvfrom = svc_tcp_recvfrom, + .xpo_sendto = svc_tcp_sendto, }; static struct svc_xprt_class svc_tcp_class = { @@ -1382,8 +1384,6 @@ svc_tcp_init(struct svc_sock *svsk) struct tcp_sock *tp = tcp_sk(sk); svc_xprt_init(&svc_tcp_class, &svsk->sk_xprt); - svsk->sk_recvfrom = svc_tcp_recvfrom; - svsk->sk_sendto = svc_tcp_sendto; if (sk->sk_state == TCP_LISTEN) { dprintk("setting up TCP socket for listening\n"); @@ -1531,7 +1531,7 @@ svc_recv(struct svc_rqst *rqstp, long timeout) dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n", rqstp, pool->sp_id, svsk, atomic_read(&svsk->sk_inuse)); - len = svsk->sk_recvfrom(rqstp); + len = svsk->sk_xprt.xpt_ops->xpo_recvfrom(rqstp); dprintk("svc: got len=%d\n", len); /* No data, incomplete (TCP) read, or accept() */ @@ -1591,7 +1591,7 @@ svc_send(struct svc_rqst *rqstp) if (test_bit(SK_DEAD, &svsk->sk_flags)) len = -ENOTCONN; else - len = svsk->sk_sendto(rqstp); + len = svsk->sk_xprt.xpt_ops->xpo_sendto(rqstp); mutex_unlock(&svsk->sk_mutex); svc_sock_release(rqstp); From 5148bf4ebc1f59dc6a0ec43a220c55ff0771246e Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:07:25 -0600 Subject: [PATCH 052/100] svc: Add transport specific xpo_release function The svc_sock_release function releases pages allocated to a thread. For UDP this frees the receive skb. For RDMA it will post a receive WR and bump the client credit count. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc.h | 2 +- include/linux/sunrpc/svc_xprt.h | 1 + net/sunrpc/svcsock.c | 17 +++++++++-------- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 37f7448c997e..cfb2652f6f8f 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -217,7 +217,7 @@ struct svc_rqst { struct auth_ops * rq_authop; /* authentication flavour */ u32 rq_flavor; /* pseudoflavor */ struct svc_cred rq_cred; /* auth info */ - struct sk_buff * rq_skbuff; /* fast recv inet buffer */ + void * rq_xprt_ctxt; /* transport specific context ptr */ struct svc_deferred_req*rq_deferred; /* deferred request we are replaying */ struct xdr_buf rq_arg; diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 7ae6c857b05d..01ee7bc2c374 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -12,6 +12,7 @@ struct svc_xprt_ops { int (*xpo_recvfrom)(struct svc_rqst *); int (*xpo_sendto)(struct svc_rqst *); + void (*xpo_release_rqst)(struct svc_rqst *); }; struct svc_xprt_class { diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 7817c7eea753..d46abc89b99a 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -185,14 +185,13 @@ svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp) /* * Release an skbuff after use */ -static inline void -svc_release_skb(struct svc_rqst *rqstp) +static void svc_release_skb(struct svc_rqst *rqstp) { - struct sk_buff *skb = rqstp->rq_skbuff; + struct sk_buff *skb = rqstp->rq_xprt_ctxt; struct svc_deferred_req *dr = rqstp->rq_deferred; if (skb) { - rqstp->rq_skbuff = NULL; + rqstp->rq_xprt_ctxt = NULL; dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); skb_free_datagram(rqstp->rq_sock->sk_sk, skb); @@ -395,7 +394,7 @@ svc_sock_release(struct svc_rqst *rqstp) { struct svc_sock *svsk = rqstp->rq_sock; - svc_release_skb(rqstp); + rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp); svc_free_res_pages(rqstp); rqstp->rq_res.page_len = 0; @@ -867,7 +866,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) skb_free_datagram(svsk->sk_sk, skb); return 0; } - rqstp->rq_skbuff = skb; + rqstp->rq_xprt_ctxt = skb; } rqstp->rq_arg.page_base = 0; @@ -903,6 +902,7 @@ svc_udp_sendto(struct svc_rqst *rqstp) static struct svc_xprt_ops svc_udp_ops = { .xpo_recvfrom = svc_udp_recvfrom, .xpo_sendto = svc_udp_sendto, + .xpo_release_rqst = svc_release_skb, }; static struct svc_xprt_class svc_udp_class = { @@ -1291,7 +1291,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; } - rqstp->rq_skbuff = NULL; + rqstp->rq_xprt_ctxt = NULL; rqstp->rq_prot = IPPROTO_TCP; /* Reset TCP read info */ @@ -1357,6 +1357,7 @@ svc_tcp_sendto(struct svc_rqst *rqstp) static struct svc_xprt_ops svc_tcp_ops = { .xpo_recvfrom = svc_tcp_recvfrom, .xpo_sendto = svc_tcp_sendto, + .xpo_release_rqst = svc_release_skb, }; static struct svc_xprt_class svc_tcp_class = { @@ -1578,7 +1579,7 @@ svc_send(struct svc_rqst *rqstp) } /* release the receive skb before sending the reply */ - svc_release_skb(rqstp); + rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp); /* calculate over-all length */ xb = & rqstp->rq_res; From 755cceaba7555027e61dfa79f1e55bdfc6906633 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:07:27 -0600 Subject: [PATCH 053/100] svc: Add per-transport delete functions Add transport specific xpo_detach and xpo_free functions. 
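To make the split concrete before reading the diff, here is a minimal stand-alone model in plain C; the demo_* names are invented for illustration and are not part of this series. It only mirrors the pattern the patch introduces: detach disables event delivery, free releases memory once no references remain.

#include <stdlib.h>

/* Minimal stand-ins for svc_xprt/svc_xprt_ops (illustrative only). */
struct demo_xprt;

struct demo_xprt_ops {
        void (*xpo_detach)(struct demo_xprt *); /* stop data-ready events */
        void (*xpo_free)(struct demo_xprt *);   /* release all resources */
};

struct demo_xprt {
        struct demo_xprt_ops *xpt_ops;
        int events_enabled;
};

static void demo_detach(struct demo_xprt *xprt)
{
        /* Detach only unhooks the event source; the structure itself
         * survives until the last reference is dropped. */
        xprt->events_enabled = 0;
}

static void demo_free(struct demo_xprt *xprt)
{
        /* Free runs exactly once, after detach, when the reference
         * count reaches zero. */
        free(xprt);
}

static struct demo_xprt_ops demo_ops = {
        .xpo_detach = demo_detach,
        .xpo_free   = demo_free,
};

int main(void)
{
        struct demo_xprt *xprt = calloc(1, sizeof(*xprt));

        if (!xprt)
                return 1;
        xprt->xpt_ops = &demo_ops;
        xprt->events_enabled = 1;
        xprt->xpt_ops->xpo_detach(xprt); /* teardown step 1 */
        xprt->xpt_ops->xpo_free(xprt);   /* teardown step 2 */
        return 0;
}

Splitting teardown in two keeps the transport quiescent while outstanding requests drain, which is why the kernel code below restores the old socket callbacks in xpo_detach and defers kfree to the xpo_free method.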
The xpo_detach function causes the transport to stop delivering data-ready events and enqueuing the transport for I/O. The xpo_free function frees all resources associated with the particular transport instance. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 2 ++ net/sunrpc/svcsock.c | 56 +++++++++++++++++++++++++-------- 2 files changed, 45 insertions(+), 13 deletions(-) diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 01ee7bc2c374..f032fb6b32e9 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -13,6 +13,8 @@ struct svc_xprt_ops { int (*xpo_recvfrom)(struct svc_rqst *); int (*xpo_sendto)(struct svc_rqst *); void (*xpo_release_rqst)(struct svc_rqst *); + void (*xpo_detach)(struct svc_xprt *); + void (*xpo_free)(struct svc_xprt *); }; struct svc_xprt_class { diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index d46abc89b99a..44a729d6efea 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -85,6 +85,8 @@ static void svc_udp_data_ready(struct sock *, int); static int svc_udp_recvfrom(struct svc_rqst *); static int svc_udp_sendto(struct svc_rqst *); static void svc_close_socket(struct svc_sock *svsk); +static void svc_sock_detach(struct svc_xprt *); +static void svc_sock_free(struct svc_xprt *); static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk); static int svc_deferred_recv(struct svc_rqst *rqstp); @@ -376,16 +378,8 @@ static inline void svc_sock_put(struct svc_sock *svsk) { if (atomic_dec_and_test(&svsk->sk_inuse)) { - BUG_ON(! test_bit(SK_DEAD, &svsk->sk_flags)); - - dprintk("svc: releasing dead socket\n"); - if (svsk->sk_sock->file) - sockfd_put(svsk->sk_sock); - else - sock_release(svsk->sk_sock); - if (svsk->sk_info_authunix != NULL) - svcauth_unix_info_release(svsk->sk_info_authunix); - kfree(svsk); + BUG_ON(!test_bit(SK_DEAD, &svsk->sk_flags)); + svsk->sk_xprt.xpt_ops->xpo_free(&svsk->sk_xprt); } } @@ -903,6 +897,8 @@ static struct svc_xprt_ops svc_udp_ops = { .xpo_recvfrom = svc_udp_recvfrom, .xpo_sendto = svc_udp_sendto, .xpo_release_rqst = svc_release_skb, + .xpo_detach = svc_sock_detach, + .xpo_free = svc_sock_free, }; static struct svc_xprt_class svc_udp_class = { @@ -1358,6 +1354,8 @@ static struct svc_xprt_ops svc_tcp_ops = { .xpo_recvfrom = svc_tcp_recvfrom, .xpo_sendto = svc_tcp_sendto, .xpo_release_rqst = svc_release_skb, + .xpo_detach = svc_sock_detach, + .xpo_free = svc_sock_free, }; static struct svc_xprt_class svc_tcp_class = { @@ -1814,6 +1812,40 @@ static int svc_create_socket(struct svc_serv *serv, int protocol, return error; } +/* + * Detach the svc_sock from the socket so that no + * more callbacks occur. + */ +static void svc_sock_detach(struct svc_xprt *xprt) +{ + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct sock *sk = svsk->sk_sk; + + dprintk("svc: svc_sock_detach(%p)\n", svsk); + + /* put back the old socket callbacks */ + sk->sk_state_change = svsk->sk_ostate; + sk->sk_data_ready = svsk->sk_odata; + sk->sk_write_space = svsk->sk_owspace; +} + +/* + * Free the svc_sock's socket resources and the svc_sock itself.
+ */ +static void svc_sock_free(struct svc_xprt *xprt) +{ + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + dprintk("svc: svc_sock_free(%p)\n", svsk); + + if (svsk->sk_info_authunix != NULL) + svcauth_unix_info_release(svsk->sk_info_authunix); + if (svsk->sk_sock->file) + sockfd_put(svsk->sk_sock); + else + sock_release(svsk->sk_sock); + kfree(svsk); +} + /* * Remove a dead socket */ @@ -1828,9 +1860,7 @@ svc_delete_socket(struct svc_sock *svsk) serv = svsk->sk_server; sk = svsk->sk_sk; - sk->sk_state_change = svsk->sk_ostate; - sk->sk_data_ready = svsk->sk_odata; - sk->sk_write_space = svsk->sk_owspace; + svsk->sk_xprt.xpt_ops->xpo_detach(&svsk->sk_xprt); spin_lock_bh(&serv->sv_lock); From e831fe65b10199e1e301a7316c66d6ced133712d Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:07:29 -0600 Subject: [PATCH 054/100] svc: Add xpo_prep_reply_hdr Some transports add fields to the RPC header for replies, e.g. the TCP record length. This function is called when preparing the reply header to allow each transport to add whatever fields it requires. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 1 + net/sunrpc/svc.c | 6 +++--- net/sunrpc/svcsock.c | 17 +++++++++++++++++ 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index f032fb6b32e9..199cfcb9860b 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -11,6 +11,7 @@ struct svc_xprt_ops { int (*xpo_recvfrom)(struct svc_rqst *); + void (*xpo_prep_reply_hdr)(struct svc_rqst *); int (*xpo_sendto)(struct svc_rqst *); void (*xpo_release_rqst)(struct svc_rqst *); void (*xpo_detach)(struct svc_xprt *); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 94eed9b80038..8281a0402652 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -840,9 +840,9 @@ svc_process(struct svc_rqst *rqstp) rqstp->rq_res.tail[0].iov_len = 0; /* Will be turned off only in gss privacy case: */ rqstp->rq_splice_ok = 1; - /* tcp needs a space for the record length... */ - if (rqstp->rq_prot == IPPROTO_TCP) - svc_putnl(resv, 0); + + /* Setup reply header */ + rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp); rqstp->rq_xid = svc_getu32(argv); svc_putu32(resv, rqstp->rq_xid); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 44a729d6efea..492a1dc544f3 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -893,12 +893,17 @@ svc_udp_sendto(struct svc_rqst *rqstp) return error; } +static void svc_udp_prep_reply_hdr(struct svc_rqst *rqstp) +{ +} + static struct svc_xprt_ops svc_udp_ops = { .xpo_recvfrom = svc_udp_recvfrom, .xpo_sendto = svc_udp_sendto, .xpo_release_rqst = svc_release_skb, .xpo_detach = svc_sock_detach, .xpo_free = svc_sock_free, + .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr, }; static struct svc_xprt_class svc_udp_class = { @@ -1350,12 +1355,24 @@ svc_tcp_sendto(struct svc_rqst *rqstp) return sent; } +/* + * Setup response header. TCP has a 4B record length field. + */ +static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp) +{ + struct kvec *resv = &rqstp->rq_res.head[0]; + + /* tcp needs a space for the record length... 
*/ + svc_putnl(resv, 0); +} + static struct svc_xprt_ops svc_tcp_ops = { .xpo_recvfrom = svc_tcp_recvfrom, .xpo_sendto = svc_tcp_sendto, .xpo_release_rqst = svc_release_skb, .xpo_detach = svc_sock_detach, .xpo_free = svc_sock_free, + .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr, }; static struct svc_xprt_class svc_tcp_class = { From 323bee32e9bef14c6dd943ecc8e8cd373a9c94d9 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:07:31 -0600 Subject: [PATCH 055/100] svc: Add a transport function that checks for write space In order to avoid blocking a service thread, the receive side checks to see if there is sufficient write space to reply to the request. Each transport has a different mechanism for determining if there is enough write space to reply. The code that checked for write space was coupled with code that checked for CLOSE and CONN. These checks have been broken out into separate statements to make the code easier to read. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 1 + net/sunrpc/svcsock.c | 82 ++++++++++++++++++++++----------- 2 files changed, 57 insertions(+), 26 deletions(-) diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 199cfcb9860b..85df97acc2d0 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -10,6 +10,7 @@ #include struct svc_xprt_ops { + int (*xpo_has_wspace)(struct svc_xprt *); int (*xpo_recvfrom)(struct svc_rqst *); void (*xpo_prep_reply_hdr)(struct svc_rqst *); int (*xpo_sendto)(struct svc_rqst *); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 492a1dc544f3..2007881a5b26 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -204,22 +204,6 @@ static void svc_release_skb(struct svc_rqst *rqstp) } } -/* - * Any space to write? - */ -static inline unsigned long -svc_sock_wspace(struct svc_sock *svsk) -{ - int wspace; - - if (svsk->sk_sock->type == SOCK_STREAM) - wspace = sk_stream_wspace(svsk->sk_sk); - else - wspace = sock_wspace(svsk->sk_sk); - - return wspace; -} - /* * Queue up a socket with data pending. If there are idle nfsd * processes, wake 'em up. 
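The write-space heuristic that the following hunks move behind xpo_has_wspace can be modeled in a few lines. The demo_* names below are invented and the numbers are arbitrary, but the reserve-doubling test has the same shape as the svc_udp_has_wspace and svc_tcp_has_wspace implementations added further down.

#include <stdio.h>

/* Sketch of the reserve-doubling check (invented names): only
 * enqueue more work if twice the worst-case reply still fits,
 * leaving room for replies already promised to other requests. */
static int demo_has_wspace(long reserved, long max_mesg, long wspace)
{
        long required = reserved + max_mesg;

        return required * 2 <= wspace;
}

int main(void)
{
        /* 64KB reserved, 32KB max message, against two buffer sizes */
        printf("%d\n", demo_has_wspace(65536, 32768, 262144)); /* prints 1 */
        printf("%d\n", demo_has_wspace(65536, 32768, 131072)); /* prints 0 */
        return 0;
}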
@@ -269,22 +253,24 @@ svc_sock_enqueue(struct svc_sock *svsk) BUG_ON(svsk->sk_pool != NULL); svsk->sk_pool = pool; - set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); - if (((atomic_read(&svsk->sk_reserved) + serv->sv_max_mesg)*2 - > svc_sock_wspace(svsk)) - && !test_bit(SK_CLOSE, &svsk->sk_flags) - && !test_bit(SK_CONN, &svsk->sk_flags)) { + /* Handle pending connection */ + if (test_bit(SK_CONN, &svsk->sk_flags)) + goto process; + + /* Handle close in-progress */ + if (test_bit(SK_CLOSE, &svsk->sk_flags)) + goto process; + + /* Check if we have space to reply to a request */ + if (!svsk->sk_xprt.xpt_ops->xpo_has_wspace(&svsk->sk_xprt)) { /* Don't enqueue while not enough space for reply */ - dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n", - svsk->sk_sk, atomic_read(&svsk->sk_reserved)+serv->sv_max_mesg, - svc_sock_wspace(svsk)); + dprintk("svc: no write space, socket %p not enqueued\n", svsk); svsk->sk_pool = NULL; clear_bit(SK_BUSY, &svsk->sk_flags); goto out_unlock; } - clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); - + process: if (!list_empty(&pool->sp_threads)) { rqstp = list_entry(pool->sp_threads.next, struct svc_rqst, @@ -897,6 +883,24 @@ static void svc_udp_prep_reply_hdr(struct svc_rqst *rqstp) { } +static int svc_udp_has_wspace(struct svc_xprt *xprt) +{ + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct svc_serv *serv = svsk->sk_server; + unsigned long required; + + /* + * Set the SOCK_NOSPACE flag before checking the available + * sock space. + */ + set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + required = atomic_read(&svsk->sk_reserved) + serv->sv_max_mesg; + if (required*2 > sock_wspace(svsk->sk_sk)) + return 0; + clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + return 1; +} + static struct svc_xprt_ops svc_udp_ops = { .xpo_recvfrom = svc_udp_recvfrom, .xpo_sendto = svc_udp_sendto, @@ -904,6 +908,7 @@ static struct svc_xprt_ops svc_udp_ops = { .xpo_detach = svc_sock_detach, .xpo_free = svc_sock_free, .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr, + .xpo_has_wspace = svc_udp_has_wspace, }; static struct svc_xprt_class svc_udp_class = { @@ -1366,6 +1371,30 @@ static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp) svc_putnl(resv, 0); } +static int svc_tcp_has_wspace(struct svc_xprt *xprt) +{ + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct svc_serv *serv = svsk->sk_server; + int required; + int wspace; + + /* + * Set the SOCK_NOSPACE flag before checking the available + * sock space. + */ + set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + required = atomic_read(&svsk->sk_reserved) + serv->sv_max_mesg; + wspace = sk_stream_wspace(svsk->sk_sk); + + if (wspace < sk_stream_min_wspace(svsk->sk_sk)) + return 0; + if (required * 2 > wspace) + return 0; + + clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + return 1; +} + static struct svc_xprt_ops svc_tcp_ops = { .xpo_recvfrom = svc_tcp_recvfrom, .xpo_sendto = svc_tcp_sendto, @@ -1373,6 +1402,7 @@ static struct svc_xprt_ops svc_tcp_ops = { .xpo_detach = svc_sock_detach, .xpo_free = svc_sock_free, .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr, + .xpo_has_wspace = svc_tcp_has_wspace, }; static struct svc_xprt_class svc_tcp_class = { From d7979ae4a050a45b78af51832475001b68263d2a Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:07:34 -0600 Subject: [PATCH 056/100] svc: Move close processing to a single place Close handling was duplicated in the UDP and TCP recvfrom methods. 
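The dispatch pattern the patch moves to is sketched below in plain C with invented demo_* names; it is not the patch itself, just the shape: test the close flag once in the generic receive path, and only call the transport's recvfrom when the transport is still live.

#include <stdio.h>

/* Sketch (invented names): one close check in the generic code,
 * replacing a copy in every transport's recvfrom method. */
#define DEMO_XPT_CLOSE 0x1

struct demo_xprt {
        unsigned long flags;
        int (*recvfrom)(struct demo_xprt *);
};

static int demo_svc_recv(struct demo_xprt *xprt)
{
        if (xprt->flags & DEMO_XPT_CLOSE) {
                /* dead or dying: delete it here, in exactly one place */
                printf("deleting transport\n");
                return 0;
        }
        return xprt->recvfrom(xprt); /* transport-specific read */
}

static int demo_tcp_recvfrom(struct demo_xprt *xprt)
{
        (void)xprt;
        return 42; /* pretend a 42-byte record arrived */
}

int main(void)
{
        struct demo_xprt x = { .flags = 0, .recvfrom = demo_tcp_recvfrom };

        printf("len=%d\n", demo_svc_recv(&x));  /* len=42 */
        x.flags |= DEMO_XPT_CLOSE;
        printf("len=%d\n", demo_svc_recv(&x));  /* len=0 */
        return 0;
}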
This code has been moved to the transport independent svc_recv function. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 2007881a5b26..603db98b8fca 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -776,11 +776,6 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) return svc_deferred_recv(rqstp); } - if (test_bit(SK_CLOSE, &svsk->sk_flags)) { - svc_delete_socket(svsk); - return 0; - } - clear_bit(SK_DATA, &svsk->sk_flags); skb = NULL; err = kernel_recvmsg(svsk->sk_sock, &msg, NULL, @@ -1181,11 +1176,6 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) return svc_deferred_recv(rqstp); } - if (test_bit(SK_CLOSE, &svsk->sk_flags)) { - svc_delete_socket(svsk); - return 0; - } - if (svsk->sk_sk->sk_state == TCP_LISTEN) { svc_tcp_accept(svsk); svc_sock_received(svsk); @@ -1311,7 +1301,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) return len; err_delete: - svc_delete_socket(svsk); + set_bit(SK_CLOSE, &svsk->sk_flags); return -EAGAIN; error: @@ -1575,10 +1565,16 @@ svc_recv(struct svc_rqst *rqstp, long timeout) } spin_unlock_bh(&pool->sp_lock); - dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n", - rqstp, pool->sp_id, svsk, atomic_read(&svsk->sk_inuse)); - len = svsk->sk_xprt.xpt_ops->xpo_recvfrom(rqstp); - dprintk("svc: got len=%d\n", len); + len = 0; + if (test_bit(SK_CLOSE, &svsk->sk_flags)) { + dprintk("svc_recv: found SK_CLOSE\n"); + svc_delete_socket(svsk); + } else { + dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n", + rqstp, pool->sp_id, svsk, atomic_read(&svsk->sk_inuse)); + len = svsk->sk_xprt.xpt_ops->xpo_recvfrom(rqstp); + dprintk("svc: got len=%d\n", len); + } /* No data, incomplete (TCP) read, or accept() */ if (len == 0 || len == -EAGAIN) { From 38a417cc993f4535548e47207f9894e7c27e05e4 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:07:36 -0600 Subject: [PATCH 057/100] svc: Add xpo_accept transport function Previously, the accept logic looked into the socket state to determine whether to call accept or recv when data-ready was indicated on an endpoint. Since some transports don't use sockets, this logic now uses a flag bit (SK_LISTENER) to identify listening endpoints. A transport function (xpo_accept) allows each transport to define its own accept processing. A transport's initialization logic is responsible for setting the SK_LISTENER bit. I didn't see any way to do this in transport independent logic since the passive side of a UDP connection doesn't listen and always recv's. In the svc_recv function, if the SK_LISTENER bit is set, the transport xpo_accept function is called to handle accept processing. Note that all functions are defined even if they don't make sense for a given transport. For example, accept doesn't mean anything for UDP. The function is defined anyway and bug checks if called. The UDP transport should never set the SK_LISTENER bit. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J.
Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 1 + include/linux/sunrpc/svcsock.h | 1 + net/sunrpc/svcsock.c | 31 +++++++++++++++++++------------ 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 85df97acc2d0..31a44f441fd1 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -10,6 +10,7 @@ #include struct svc_xprt_ops { + struct svc_xprt *(*xpo_accept)(struct svc_xprt *); int (*xpo_has_wspace)(struct svc_xprt *); int (*xpo_recvfrom)(struct svc_rqst *); void (*xpo_prep_reply_hdr)(struct svc_rqst *); diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index 08e78d0a364f..9882ce0ee33c 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -36,6 +36,7 @@ struct svc_sock { #define SK_DEFERRED 8 /* request on sk_deferred */ #define SK_OLD 9 /* used for temp socket aging mark+sweep */ #define SK_DETACHED 10 /* detached from tempsocks list */ +#define SK_LISTENER 11 /* listening endpoint */ atomic_t sk_reserved; /* space on outq that is reserved */ diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 603db98b8fca..41d1f815fbbd 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -896,6 +896,12 @@ static int svc_udp_has_wspace(struct svc_xprt *xprt) return 1; } +static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt) +{ + BUG(); + return NULL; +} + static struct svc_xprt_ops svc_udp_ops = { .xpo_recvfrom = svc_udp_recvfrom, .xpo_sendto = svc_udp_sendto, @@ -904,6 +910,7 @@ static struct svc_xprt_ops svc_udp_ops = { .xpo_free = svc_sock_free, .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr, .xpo_has_wspace = svc_udp_has_wspace, + .xpo_accept = svc_udp_accept, }; static struct svc_xprt_class svc_udp_class = { @@ -1028,9 +1035,9 @@ static inline int svc_port_is_privileged(struct sockaddr *sin) /* * Accept a TCP connection */ -static void -svc_tcp_accept(struct svc_sock *svsk) +static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) { + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); struct sockaddr_storage addr; struct sockaddr *sin = (struct sockaddr *) &addr; struct svc_serv *serv = svsk->sk_server; @@ -1042,7 +1049,7 @@ svc_tcp_accept(struct svc_sock *svsk) dprintk("svc: tcp_accept %p sock %p\n", svsk, sock); if (!sock) - return; + return NULL; clear_bit(SK_CONN, &svsk->sk_flags); err = kernel_accept(sock, &newsock, O_NONBLOCK); @@ -1053,7 +1060,7 @@ svc_tcp_accept(struct svc_sock *svsk) else if (err != -EAGAIN && net_ratelimit()) printk(KERN_WARNING "%s: accept failed (err %d)!\n", serv->sv_name, -err); - return; + return NULL; } set_bit(SK_CONN, &svsk->sk_flags); @@ -1147,11 +1154,11 @@ svc_tcp_accept(struct svc_sock *svsk) if (serv->sv_stats) serv->sv_stats->nettcpconn++; - return; + return &newsvsk->sk_xprt; failed: sock_release(newsock); - return; + return NULL; } /* @@ -1176,12 +1183,6 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) return svc_deferred_recv(rqstp); } - if (svsk->sk_sk->sk_state == TCP_LISTEN) { - svc_tcp_accept(svsk); - svc_sock_received(svsk); - return 0; - } - if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) /* sndbuf needs to have room for one request * per thread, otherwise we can stall even when the @@ -1393,6 +1394,7 @@ static struct svc_xprt_ops svc_tcp_ops = { .xpo_free = svc_sock_free, .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr, .xpo_has_wspace = svc_tcp_has_wspace, + .xpo_accept = svc_tcp_accept, }; static struct svc_xprt_class 
svc_tcp_class = { @@ -1423,6 +1425,7 @@ svc_tcp_init(struct svc_sock *svsk) if (sk->sk_state == TCP_LISTEN) { dprintk("setting up TCP socket for listening\n"); + set_bit(SK_LISTENER, &svsk->sk_flags); sk->sk_data_ready = svc_tcp_listen_data_ready; set_bit(SK_CONN, &svsk->sk_flags); } else { @@ -1569,6 +1572,10 @@ svc_recv(struct svc_rqst *rqstp, long timeout) if (test_bit(SK_CLOSE, &svsk->sk_flags)) { dprintk("svc_recv: found SK_CLOSE\n"); svc_delete_socket(svsk); + } else if (test_bit(SK_LISTENER, &svsk->sk_flags)) { + struct svc_xprt *newxpt; + newxpt = svsk->sk_xprt.xpt_ops->xpo_accept(&svsk->sk_xprt); + svc_sock_received(svsk); } else { dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n", rqstp, pool->sp_id, svsk, atomic_read(&svsk->sk_inuse)); From 44a6995b32eb9b021ee71b279edb84728c9f5160 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:07:38 -0600 Subject: [PATCH 058/100] svc: Remove unnecessary call to svc_sock_enqueue The svc_tcp_accept function calls svc_sock_enqueue after setting the SK_CONN bit. This doesn't actually do anything because the SK_BUSY bit is still set. The call is unnecessary anyway because the generic code in svc_recv calls svc_sock_received after calling the accept function. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 41d1f815fbbd..962dbf43a728 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1064,7 +1064,6 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) } set_bit(SK_CONN, &svsk->sk_flags); - svc_sock_enqueue(svsk); err = kernel_getpeername(newsock, sin, &slen); if (err < 0) { From f9f3cc4fae04c87c815a4b473fb577cf74ef27da Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:07:40 -0600 Subject: [PATCH 059/100] svc: Move connection limit checking to its own function Move the code that poaches connections when the connection limit is hit to a subroutine to make the accept logic path easier to follow. Since this is in the new connection path, it should not be a performance issue. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 57 ++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 962dbf43a728..6e9dc8f96495 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1105,17 +1105,30 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) svc_sock_received(newsvsk); - /* make sure that we don't have too many active connections. - * If we have, something must be dropped. - * - * There's no point in trying to do random drop here for - * DoS prevention. The NFS clients does 1 reconnect in 15 - * seconds. An attacker can easily beat that. - * - * The only somewhat efficient mechanism would be if drop - * old connections from the same IP first. But right now - * we don't even record the client IP in svc_sock. - */ + if (serv->sv_stats) + serv->sv_stats->nettcpconn++; + + return &newsvsk->sk_xprt; + +failed: + sock_release(newsock); + return NULL; +} + +/* + * Make sure that we don't have too many active connections. If we + * have, something must be dropped. + * + * There's no point in trying to do random drop here for DoS + * prevention. 
The NFS clients does 1 reconnect in 15 seconds. An + attacker can easily beat that. + * + * The only somewhat efficient mechanism would be if drop old + * connections from the same IP first. But right now we don't even + * record the client IP in svc_sock. + */ +static void svc_check_conn_limits(struct svc_serv *serv) +{ if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) { struct svc_sock *svsk = NULL; spin_lock_bh(&serv->sv_lock); @@ -1123,13 +1136,9 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) if (net_ratelimit()) { /* Try to help the admin */ printk(KERN_NOTICE "%s: too many open TCP " - "sockets, consider increasing the " - "number of nfsd threads\n", - serv->sv_name); - printk(KERN_NOTICE - "%s: last TCP connect from %s\n", - serv->sv_name, __svc_print_addr(sin, - buf, sizeof(buf))); + "sockets, consider increasing the " + "number of nfsd threads\n", + serv->sv_name); } /* * Always select the oldest socket. It's not fair, * but so is life @@ -1147,17 +1156,7 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) svc_sock_enqueue(svsk); svc_sock_put(svsk); } - } - - if (serv->sv_stats) - serv->sv_stats->nettcpconn++; - - return &newsvsk->sk_xprt; - -failed: - sock_release(newsock); - return NULL; } /* @@ -1574,6 +1573,8 @@ svc_recv(struct svc_rqst *rqstp, long timeout) } else if (test_bit(SK_LISTENER, &svsk->sk_flags)) { struct svc_xprt *newxpt; newxpt = svsk->sk_xprt.xpt_ops->xpo_accept(&svsk->sk_xprt); + if (newxpt) + svc_check_conn_limits(svsk->sk_server); svc_sock_received(svsk); } else { dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n", From b700cbb11fced2a0e953fdd19eac07ffaad86598 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:07:42 -0600 Subject: [PATCH 060/100] svc: Add a generic transport svc_create_xprt function The svc_create_xprt function is a transport independent version of the svc_makesock function. Since transport instance creation contains transport dependent and independent components, add an xpo_create transport function. The transport implementation of this function allocates the memory for the endpoint, implements the transport dependent initialization logic, and calls svc_xprt_init to initialize the transport independent field (svc_xprt) in its data structure. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J.
Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 4 +++ net/sunrpc/svc_xprt.c | 37 +++++++++++++++++++++ net/sunrpc/svcsock.c | 59 ++++++++++++++++++++++++--------- 3 files changed, 85 insertions(+), 15 deletions(-) diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 31a44f441fd1..986a5a07044b 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -10,6 +10,9 @@ #include struct svc_xprt_ops { + struct svc_xprt *(*xpo_create)(struct svc_serv *, + struct sockaddr *, int, + int); struct svc_xprt *(*xpo_accept)(struct svc_xprt *); int (*xpo_has_wspace)(struct svc_xprt *); int (*xpo_recvfrom)(struct svc_rqst *); @@ -36,5 +39,6 @@ struct svc_xprt { int svc_reg_xprt_class(struct svc_xprt_class *); void svc_unreg_xprt_class(struct svc_xprt_class *); void svc_xprt_init(struct svc_xprt_class *, struct svc_xprt *); +int svc_create_xprt(struct svc_serv *, char *, unsigned short, int); #endif /* SUNRPC_SVC_XPRT_H */ diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index fe5270fae336..6ff5ca71602f 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -81,3 +81,40 @@ void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt) xprt->xpt_ops = xcl->xcl_ops; } EXPORT_SYMBOL_GPL(svc_xprt_init); + +int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, + int flags) +{ + struct svc_xprt_class *xcl; + int ret = -ENOENT; + struct sockaddr_in sin = { + .sin_family = AF_INET, + .sin_addr.s_addr = INADDR_ANY, + .sin_port = htons(port), + }; + dprintk("svc: creating transport %s[%d]\n", xprt_name, port); + spin_lock(&svc_xprt_class_lock); + list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) { + if (strcmp(xprt_name, xcl->xcl_name) == 0) { + spin_unlock(&svc_xprt_class_lock); + if (try_module_get(xcl->xcl_owner)) { + struct svc_xprt *newxprt; + ret = 0; + newxprt = xcl->xcl_ops->xpo_create + (serv, + (struct sockaddr *)&sin, sizeof(sin), + flags); + if (IS_ERR(newxprt)) { + module_put(xcl->xcl_owner); + ret = PTR_ERR(newxprt); + } + } + goto out; + } + } + spin_unlock(&svc_xprt_class_lock); + dprintk("svc: transport %s not found\n", xprt_name); + out: + return ret; +} +EXPORT_SYMBOL_GPL(svc_create_xprt); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 6e9dc8f96495..9f0f6d088969 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -91,6 +91,8 @@ static void svc_sock_free(struct svc_xprt *); static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk); static int svc_deferred_recv(struct svc_rqst *rqstp); static struct cache_deferred_req *svc_defer(struct cache_req *req); +static struct svc_xprt *svc_create_socket(struct svc_serv *, int, + struct sockaddr *, int, int); /* apparently the "standard" is that clients close * idle connections after 5 minutes, servers after @@ -365,6 +367,7 @@ svc_sock_put(struct svc_sock *svsk) { if (atomic_dec_and_test(&svsk->sk_inuse)) { BUG_ON(!test_bit(SK_DEAD, &svsk->sk_flags)); + module_put(svsk->sk_xprt.xpt_class->xcl_owner); svsk->sk_xprt.xpt_ops->xpo_free(&svsk->sk_xprt); } } @@ -902,7 +905,15 @@ static struct svc_xprt *svc_udp_accept(struct svc_xprt *xprt) return NULL; } +static struct svc_xprt *svc_udp_create(struct svc_serv *serv, + struct sockaddr *sa, int salen, + int flags) +{ + return svc_create_socket(serv, IPPROTO_UDP, sa, salen, flags); +} + static struct svc_xprt_ops svc_udp_ops = { + .xpo_create = svc_udp_create, .xpo_recvfrom = svc_udp_recvfrom, .xpo_sendto = svc_udp_sendto, .xpo_release_rqst = 
svc_release_skb, @@ -915,6 +926,7 @@ static struct svc_xprt_ops svc_udp_ops = { static struct svc_xprt_class svc_udp_class = { .xcl_name = "udp", + .xcl_owner = THIS_MODULE, .xcl_ops = &svc_udp_ops, .xcl_max_payload = RPCSVC_MAXPAYLOAD_UDP, }; @@ -1384,7 +1396,15 @@ static int svc_tcp_has_wspace(struct svc_xprt *xprt) return 1; } +static struct svc_xprt *svc_tcp_create(struct svc_serv *serv, + struct sockaddr *sa, int salen, + int flags) +{ + return svc_create_socket(serv, IPPROTO_TCP, sa, salen, flags); +} + static struct svc_xprt_ops svc_tcp_ops = { + .xpo_create = svc_tcp_create, .xpo_recvfrom = svc_tcp_recvfrom, .xpo_sendto = svc_tcp_sendto, .xpo_release_rqst = svc_release_skb, @@ -1397,6 +1417,7 @@ static struct svc_xprt_ops svc_tcp_ops = { static struct svc_xprt_class svc_tcp_class = { .xcl_name = "tcp", + .xcl_owner = THIS_MODULE, .xcl_ops = &svc_tcp_ops, .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP, }; @@ -1573,8 +1594,14 @@ svc_recv(struct svc_rqst *rqstp, long timeout) } else if (test_bit(SK_LISTENER, &svsk->sk_flags)) { struct svc_xprt *newxpt; newxpt = svsk->sk_xprt.xpt_ops->xpo_accept(&svsk->sk_xprt); - if (newxpt) + if (newxpt) { + /* + * We know this module_get will succeed because the + * listener holds a reference too + */ + __module_get(newxpt->xpt_class->xcl_owner); svc_check_conn_limits(svsk->sk_server); + } svc_sock_received(svsk); } else { dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n", @@ -1814,8 +1841,10 @@ EXPORT_SYMBOL_GPL(svc_addsock); /* * Create socket for RPC service. */ -static int svc_create_socket(struct svc_serv *serv, int protocol, - struct sockaddr *sin, int len, int flags) +static struct svc_xprt *svc_create_socket(struct svc_serv *serv, + int protocol, + struct sockaddr *sin, int len, + int flags) { struct svc_sock *svsk; struct socket *sock; @@ -1830,13 +1859,13 @@ static int svc_create_socket(struct svc_serv *serv, int protocol, if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) { printk(KERN_WARNING "svc: only UDP and TCP " "sockets supported\n"); - return -EINVAL; + return ERR_PTR(-EINVAL); } type = (protocol == IPPROTO_UDP)? 
SOCK_DGRAM : SOCK_STREAM; error = sock_create_kern(sin->sa_family, type, protocol, &sock); if (error < 0) - return error; + return ERR_PTR(error); svc_reclassify_socket(sock); @@ -1853,13 +1882,13 @@ static int svc_create_socket(struct svc_serv *serv, int protocol, if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) { svc_sock_received(svsk); - return ntohs(inet_sk(svsk->sk_sk)->sport); + return (struct svc_xprt *)svsk; } bummer: dprintk("svc: svc_create_socket error = %d\n", -error); sock_release(sock); - return error; + return ERR_PTR(error); } /* @@ -1970,15 +1999,15 @@ void svc_force_close_socket(struct svc_sock *svsk) int svc_makesock(struct svc_serv *serv, int protocol, unsigned short port, int flags) { - struct sockaddr_in sin = { - .sin_family = AF_INET, - .sin_addr.s_addr = INADDR_ANY, - .sin_port = htons(port), - }; - dprintk("svc: creating socket proto = %d\n", protocol); - return svc_create_socket(serv, protocol, (struct sockaddr *) &sin, - sizeof(sin), flags); + switch (protocol) { + case IPPROTO_TCP: + return svc_create_xprt(serv, "tcp", port, flags); + case IPPROTO_UDP: + return svc_create_xprt(serv, "udp", port, flags); + default: + return -EINVAL; + } } /* From d7c9f1ed972b4a468dd24a2457721704dfe9ca70 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:07:44 -0600 Subject: [PATCH 061/100] svc: Change services to use new svc_create_xprt service Modify the various kernel RPC svcs to use the svc_create_xprt service. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. Bruce Fields --- fs/lockd/svc.c | 17 ++++++++--------- fs/nfs/callback.c | 4 ++-- fs/nfsd/nfssvc.c | 4 ++-- include/linux/sunrpc/svcsock.h | 1 - net/sunrpc/sunrpc_syms.c | 1 - net/sunrpc/svcsock.c | 22 ---------------------- 6 files changed, 12 insertions(+), 37 deletions(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 82e2192a0d5c..868691535115 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -219,13 +219,12 @@ lockd(struct svc_rqst *rqstp) module_put_and_exit(0); } - -static int find_socket(struct svc_serv *serv, int proto) +static int find_xprt(struct svc_serv *serv, char *proto) { struct svc_sock *svsk; int found = 0; list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) - if (svsk->sk_sk->sk_protocol == proto) { + if (strcmp(svsk->sk_xprt.xpt_class->xcl_name, proto) == 0) { found = 1; break; } @@ -243,13 +242,13 @@ static int make_socks(struct svc_serv *serv, int proto) int err = 0; if (proto == IPPROTO_UDP || nlm_udpport) - if (!find_socket(serv, IPPROTO_UDP)) - err = svc_makesock(serv, IPPROTO_UDP, nlm_udpport, - SVC_SOCK_DEFAULTS); + if (!find_xprt(serv, "udp")) + err = svc_create_xprt(serv, "udp", nlm_udpport, + SVC_SOCK_DEFAULTS); if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) - if (!find_socket(serv, IPPROTO_TCP)) - err = svc_makesock(serv, IPPROTO_TCP, nlm_tcpport, - SVC_SOCK_DEFAULTS); + if (!find_xprt(serv, "tcp")) + err = svc_create_xprt(serv, "tcp", nlm_tcpport, + SVC_SOCK_DEFAULTS); if (err >= 0) { warned = 0; diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 9b6bbf1b9787..bd185a572a23 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -119,8 +119,8 @@ int nfs_callback_up(void) if (!serv) goto out_err; - ret = svc_makesock(serv, IPPROTO_TCP, nfs_callback_set_tcpport, - SVC_SOCK_ANONYMOUS); + ret = svc_create_xprt(serv, "tcp", nfs_callback_set_tcpport, + SVC_SOCK_ANONYMOUS); if (ret <= 0) goto out_destroy; nfs_callback_tcpport = ret; diff --git a/fs/nfsd/nfssvc.c 
b/fs/nfsd/nfssvc.c index 1190aeaa92be..a828b0b0fb67 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -236,7 +236,7 @@ static int nfsd_init_socks(int port) error = lockd_up(IPPROTO_UDP); if (error >= 0) { - error = svc_makesock(nfsd_serv, IPPROTO_UDP, port, + error = svc_create_xprt(nfsd_serv, "udp", port, SVC_SOCK_DEFAULTS); if (error < 0) lockd_down(); @@ -247,7 +247,7 @@ static int nfsd_init_socks(int port) #ifdef CONFIG_NFSD_TCP error = lockd_up(IPPROTO_TCP); if (error >= 0) { - error = svc_makesock(nfsd_serv, IPPROTO_TCP, port, + error = svc_create_xprt(nfsd_serv, "tcp", port, SVC_SOCK_DEFAULTS); if (error < 0) lockd_down(); diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index 9882ce0ee33c..3181d9d4caa0 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -67,7 +67,6 @@ struct svc_sock { /* * Function prototypes. */ -int svc_makesock(struct svc_serv *, int, unsigned short, int flags); void svc_force_close_socket(struct svc_sock *); int svc_recv(struct svc_rqst *, long); int svc_send(struct svc_rqst *); diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 11b309817b8f..ab8a7362e890 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -33,7 +33,6 @@ EXPORT_SYMBOL(svc_drop); EXPORT_SYMBOL(svc_process); EXPORT_SYMBOL(svc_recv); EXPORT_SYMBOL(svc_wake_up); -EXPORT_SYMBOL(svc_makesock); EXPORT_SYMBOL(svc_reserve); EXPORT_SYMBOL(svc_auth_register); EXPORT_SYMBOL(auth_domain_lookup); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 9f0f6d088969..e6bb1b0563ec 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1988,28 +1988,6 @@ void svc_force_close_socket(struct svc_sock *svsk) svc_close_socket(svsk); } -/** - * svc_makesock - Make a socket for nfsd and lockd - * @serv: RPC server structure - * @protocol: transport protocol to use - * @port: port to use - * @flags: requested socket characteristics - * - */ -int svc_makesock(struct svc_serv *serv, int protocol, unsigned short port, - int flags) -{ - dprintk("svc: creating socket proto = %d\n", protocol); - switch (protocol) { - case IPPROTO_TCP: - return svc_create_xprt(serv, "tcp", port, flags); - case IPPROTO_UDP: - return svc_create_xprt(serv, "udp", port, flags); - default: - return -EINVAL; - } -} - /* * Handle defer and revisit of requests */ From e1b3157f9710622bad6c7747d3b08ed3d2394cf6 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:07:46 -0600 Subject: [PATCH 062/100] svc: Change sk_inuse to a kref Change the atomic_t reference count to a kref and move it to the transport independent svc_xprt structure. Change the reference count wrapper names to be generic. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J.
Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 8 ++ include/linux/sunrpc/svcsock.h | 1 - net/sunrpc/svc_xprt.c | 16 ++++ net/sunrpc/svcsock.c | 138 +++++++++++++++----------------- 4 files changed, 87 insertions(+), 76 deletions(-) diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 986a5a07044b..6374805887a6 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -8,6 +8,7 @@ #define SUNRPC_SVC_XPRT_H #include +#include struct svc_xprt_ops { struct svc_xprt *(*xpo_create)(struct svc_serv *, @@ -34,11 +35,18 @@ struct svc_xprt_class { struct svc_xprt { struct svc_xprt_class *xpt_class; struct svc_xprt_ops *xpt_ops; + struct kref xpt_ref; }; int svc_reg_xprt_class(struct svc_xprt_class *); void svc_unreg_xprt_class(struct svc_xprt_class *); void svc_xprt_init(struct svc_xprt_class *, struct svc_xprt *); int svc_create_xprt(struct svc_serv *, char *, unsigned short, int); +void svc_xprt_put(struct svc_xprt *xprt); + +static inline void svc_xprt_get(struct svc_xprt *xprt) +{ + kref_get(&xprt->xpt_ref); +} #endif /* SUNRPC_SVC_XPRT_H */ diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index 3181d9d4caa0..ba07d50cb11b 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -24,7 +24,6 @@ struct svc_sock { struct svc_pool * sk_pool; /* current pool iff queued */ struct svc_serv * sk_server; /* service for this socket */ - atomic_t sk_inuse; /* use count */ unsigned long sk_flags; #define SK_BUSY 0 /* enqueued/receiving */ #define SK_CONN 1 /* conn pending */ diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 6ff5ca71602f..31853bfcd88d 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -70,6 +70,21 @@ void svc_unreg_xprt_class(struct svc_xprt_class *xcl) } EXPORT_SYMBOL_GPL(svc_unreg_xprt_class); +static void svc_xprt_free(struct kref *kref) +{ + struct svc_xprt *xprt = + container_of(kref, struct svc_xprt, xpt_ref); + struct module *owner = xprt->xpt_class->xcl_owner; + xprt->xpt_ops->xpo_free(xprt); + module_put(owner); +} + +void svc_xprt_put(struct svc_xprt *xprt) +{ + kref_put(&xprt->xpt_ref, svc_xprt_free); +} +EXPORT_SYMBOL_GPL(svc_xprt_put); + /* * Called by transport drivers to initialize the transport independent * portion of the transport instance. @@ -79,6 +94,7 @@ void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt) memset(xprt, 0, sizeof(*xprt)); xprt->xpt_class = xcl; xprt->xpt_ops = xcl->xcl_ops; + kref_init(&xprt->xpt_ref); } EXPORT_SYMBOL_GPL(svc_xprt_init); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index e6bb1b0563ec..db589d187170 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -66,8 +66,8 @@ * after a clear, the socket must be read/accepted * if this succeeds, it must be set again. * SK_CLOSE can set at any time. It is never cleared. - * sk_inuse contains a bias of '1' until SK_DEAD is set. - * so when sk_inuse hits zero, we know the socket is dead + * xpt_ref contains a bias of '1' until SK_DEAD is set. + * so when xprt_ref hits zero, we know the transport is dead * and no-one is using it. 
* SK_DEAD can only be set while SK_BUSY is held which ensures * no other thread will be using the socket or will try to @@ -285,7 +285,7 @@ svc_sock_enqueue(struct svc_sock *svsk) "svc_sock_enqueue: server %p, rq_sock=%p!\n", rqstp, rqstp->rq_sock); rqstp->rq_sock = svsk; - atomic_inc(&svsk->sk_inuse); + svc_xprt_get(&svsk->sk_xprt); rqstp->rq_reserved = serv->sv_max_mesg; atomic_add(rqstp->rq_reserved, &svsk->sk_reserved); BUG_ON(svsk->sk_pool != pool); @@ -316,7 +316,7 @@ svc_sock_dequeue(struct svc_pool *pool) list_del_init(&svsk->sk_ready); dprintk("svc: socket %p dequeued, inuse=%d\n", - svsk->sk_sk, atomic_read(&svsk->sk_inuse)); + svsk->sk_sk, atomic_read(&svsk->sk_xprt.xpt_ref.refcount)); return svsk; } @@ -359,19 +359,6 @@ void svc_reserve(struct svc_rqst *rqstp, int space) } } -/* - * Release a socket after use. - */ -static inline void -svc_sock_put(struct svc_sock *svsk) -{ - if (atomic_dec_and_test(&svsk->sk_inuse)) { - BUG_ON(!test_bit(SK_DEAD, &svsk->sk_flags)); - module_put(svsk->sk_xprt.xpt_class->xcl_owner); - svsk->sk_xprt.xpt_ops->xpo_free(&svsk->sk_xprt); - } -} - static void svc_sock_release(struct svc_rqst *rqstp) { @@ -398,7 +385,7 @@ svc_sock_release(struct svc_rqst *rqstp) svc_reserve(rqstp, 0); rqstp->rq_sock = NULL; - svc_sock_put(svsk); + svc_xprt_put(&svsk->sk_xprt); } /* @@ -1127,50 +1114,6 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) return NULL; } -/* - * Make sure that we don't have too many active connections. If we - * have, something must be dropped. - * - * There's no point in trying to do random drop here for DoS - * prevention. The NFS clients does 1 reconnect in 15 seconds. An - * attacker can easily beat that. - * - * The only somewhat efficient mechanism would be if drop old - * connections from the same IP first. But right now we don't even - * record the client IP in svc_sock. - */ -static void svc_check_conn_limits(struct svc_serv *serv) -{ - if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) { - struct svc_sock *svsk = NULL; - spin_lock_bh(&serv->sv_lock); - if (!list_empty(&serv->sv_tempsocks)) { - if (net_ratelimit()) { - /* Try to help the admin */ - printk(KERN_NOTICE "%s: too many open TCP " - "sockets, consider increasing the " - "number of nfsd threads\n", - serv->sv_name); - } - /* - * Always select the oldest socket. It's not fair, - * but so is life - */ - svsk = list_entry(serv->sv_tempsocks.prev, - struct svc_sock, - sk_list); - set_bit(SK_CLOSE, &svsk->sk_flags); - atomic_inc(&svsk->sk_inuse); - } - spin_unlock_bh(&serv->sv_lock); - - if (svsk) { - svc_sock_enqueue(svsk); - svc_sock_put(svsk); - } - } -} - /* * Receive data from a TCP socket. */ @@ -1496,6 +1439,50 @@ svc_sock_update_bufs(struct svc_serv *serv) spin_unlock_bh(&serv->sv_lock); } +/* + * Make sure that we don't have too many active connections. If we + * have, something must be dropped. + * + * There's no point in trying to do random drop here for DoS + * prevention. The NFS clients does 1 reconnect in 15 seconds. An + * attacker can easily beat that. + * + * The only somewhat efficient mechanism would be if drop old + * connections from the same IP first. But right now we don't even + * record the client IP in svc_sock. 
+ */ +static void svc_check_conn_limits(struct svc_serv *serv) +{ + if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) { + struct svc_sock *svsk = NULL; + spin_lock_bh(&serv->sv_lock); + if (!list_empty(&serv->sv_tempsocks)) { + if (net_ratelimit()) { + /* Try to help the admin */ + printk(KERN_NOTICE "%s: too many open TCP " + "sockets, consider increasing the " + "number of nfsd threads\n", + serv->sv_name); + } + /* + * Always select the oldest socket. It's not fair, + * but so is life + */ + svsk = list_entry(serv->sv_tempsocks.prev, + struct svc_sock, + sk_list); + set_bit(SK_CLOSE, &svsk->sk_flags); + svc_xprt_get(&svsk->sk_xprt); + } + spin_unlock_bh(&serv->sv_lock); + + if (svsk) { + svc_sock_enqueue(svsk); + svc_xprt_put(&svsk->sk_xprt); + } + } +} + /* * Receive the next request on any socket. This code is carefully * organised not to touch any cachelines in the shared svc_serv @@ -1556,7 +1543,7 @@ svc_recv(struct svc_rqst *rqstp, long timeout) spin_lock_bh(&pool->sp_lock); if ((svsk = svc_sock_dequeue(pool)) != NULL) { rqstp->rq_sock = svsk; - atomic_inc(&svsk->sk_inuse); + svc_xprt_get(&svsk->sk_xprt); rqstp->rq_reserved = serv->sv_max_mesg; atomic_add(rqstp->rq_reserved, &svsk->sk_reserved); } else { @@ -1605,7 +1592,8 @@ svc_recv(struct svc_rqst *rqstp, long timeout) svc_sock_received(svsk); } else { dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n", - rqstp, pool->sp_id, svsk, atomic_read(&svsk->sk_inuse)); + rqstp, pool->sp_id, svsk, + atomic_read(&svsk->sk_xprt.xpt_ref.refcount)); len = svsk->sk_xprt.xpt_ops->xpo_recvfrom(rqstp); dprintk("svc: got len=%d\n", len); } @@ -1702,9 +1690,10 @@ svc_age_temp_sockets(unsigned long closure) if (!test_and_set_bit(SK_OLD, &svsk->sk_flags)) continue; - if (atomic_read(&svsk->sk_inuse) > 1 || test_bit(SK_BUSY, &svsk->sk_flags)) + if (atomic_read(&svsk->sk_xprt.xpt_ref.refcount) > 1 + || test_bit(SK_BUSY, &svsk->sk_flags)) continue; - atomic_inc(&svsk->sk_inuse); + svc_xprt_get(&svsk->sk_xprt); list_move(le, &to_be_aged); set_bit(SK_CLOSE, &svsk->sk_flags); set_bit(SK_DETACHED, &svsk->sk_flags); @@ -1722,7 +1711,7 @@ svc_age_temp_sockets(unsigned long closure) /* a thread will dequeue and close it soon */ svc_sock_enqueue(svsk); - svc_sock_put(svsk); + svc_xprt_put(&svsk->sk_xprt); } mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); @@ -1767,7 +1756,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, svsk->sk_odata = inet->sk_data_ready; svsk->sk_owspace = inet->sk_write_space; svsk->sk_server = serv; - atomic_set(&svsk->sk_inuse, 1); svsk->sk_lastrecv = get_seconds(); spin_lock_init(&svsk->sk_lock); INIT_LIST_HEAD(&svsk->sk_deferred); @@ -1953,10 +1941,10 @@ svc_delete_socket(struct svc_sock *svsk) * is about to be destroyed (in svc_destroy). 
*/ if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags)) { - BUG_ON(atomic_read(&svsk->sk_inuse)<2); - atomic_dec(&svsk->sk_inuse); + BUG_ON(atomic_read(&svsk->sk_xprt.xpt_ref.refcount) < 2); if (test_bit(SK_TEMP, &svsk->sk_flags)) serv->sv_tmpcnt--; + svc_xprt_put(&svsk->sk_xprt); } spin_unlock_bh(&serv->sv_lock); @@ -1969,10 +1957,10 @@ static void svc_close_socket(struct svc_sock *svsk) /* someone else will have to effect the close */ return; - atomic_inc(&svsk->sk_inuse); + svc_xprt_get(&svsk->sk_xprt); svc_delete_socket(svsk); clear_bit(SK_BUSY, &svsk->sk_flags); - svc_sock_put(svsk); + svc_xprt_put(&svsk->sk_xprt); } void svc_force_close_socket(struct svc_sock *svsk) @@ -1998,7 +1986,7 @@ static void svc_revisit(struct cache_deferred_req *dreq, int too_many) struct svc_sock *svsk; if (too_many) { - svc_sock_put(dr->svsk); + svc_xprt_put(&dr->svsk->sk_xprt); kfree(dr); return; } @@ -2010,7 +1998,7 @@ static void svc_revisit(struct cache_deferred_req *dreq, int too_many) spin_unlock(&svsk->sk_lock); set_bit(SK_DEFERRED, &svsk->sk_flags); svc_sock_enqueue(svsk); - svc_sock_put(svsk); + svc_xprt_put(&svsk->sk_xprt); } static struct cache_deferred_req * @@ -2040,7 +2028,7 @@ svc_defer(struct cache_req *req) dr->argslen = rqstp->rq_arg.len >> 2; memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2); } - atomic_inc(&rqstp->rq_sock->sk_inuse); + svc_xprt_get(rqstp->rq_xprt); dr->svsk = rqstp->rq_sock; dr->handle.revisit = svc_revisit; From 02fc6c36188be0ad19502cfd39266150ffab7603 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:07:48 -0600 Subject: [PATCH 063/100] svc: Move sk_flags to the svc_xprt structure This functionally trivial change moves the transport independent sk_flags field to the transport independent svc_xprt structure. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 12 +++ include/linux/sunrpc/svcsock.h | 13 --- net/sunrpc/svcsock.c | 151 ++++++++++++++++---------------- 3 files changed, 88 insertions(+), 88 deletions(-) diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 6374805887a6..2edeb0d13ac2 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -36,6 +36,18 @@ struct svc_xprt { struct svc_xprt_class *xpt_class; struct svc_xprt_ops *xpt_ops; struct kref xpt_ref; + unsigned long xpt_flags; +#define XPT_BUSY 0 /* enqueued/receiving */ +#define XPT_CONN 1 /* conn pending */ +#define XPT_CLOSE 2 /* dead or dying */ +#define XPT_DATA 3 /* data pending */ +#define XPT_TEMP 4 /* connected transport */ +#define XPT_DEAD 6 /* transport closed */ +#define XPT_CHNGBUF 7 /* need to change snd/rcv buf sizes */ +#define XPT_DEFERRED 8 /* deferred request pending */ +#define XPT_OLD 9 /* used for xprt aging mark+sweep */ +#define XPT_DETACHED 10 /* detached from tempsocks list */ +#define XPT_LISTENER 11 /* listening endpoint */ }; int svc_reg_xprt_class(struct svc_xprt_class *); diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index ba07d50cb11b..b8a8496dfc7c 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -24,19 +24,6 @@ struct svc_sock { struct svc_pool * sk_pool; /* current pool iff queued */ struct svc_serv * sk_server; /* service for this socket */ - unsigned long sk_flags; -#define SK_BUSY 0 /* enqueued/receiving */ -#define SK_CONN 1 /* conn pending */ -#define SK_CLOSE 2 /* dead or dying */ -#define SK_DATA 3 /* data pending */ -#define SK_TEMP 4 /* temp (TCP) socket */ -#define SK_DEAD 6 /* socket closed */ -#define SK_CHNGBUF 7 /* need to change snd/rcv buffer sizes */ -#define SK_DEFERRED 8 /* request on sk_deferred */ -#define SK_OLD 9 /* used for temp socket aging mark+sweep */ -#define SK_DETACHED 10 /* detached from tempsocks list */ -#define SK_LISTENER 11 /* listening endpoint */ - atomic_t sk_reserved; /* space on outq that is reserved */ spinlock_t sk_lock; /* protects sk_deferred and diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index db589d187170..0a7125271d44 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -56,22 +56,23 @@ * BKL protects svc_serv->sv_nrthread. * svc_sock->sk_lock protects the svc_sock->sk_deferred list * and the ->sk_info_authunix cache. - * svc_sock->sk_flags.SK_BUSY prevents a svc_sock being enqueued multiply. + * svc_sock->sk_xprt.xpt_flags.XPT_BUSY prevents a svc_sock being + * enqueued multiply. * * Some flags can be set to certain values at any time * providing that certain rules are followed: * - * SK_CONN, SK_DATA, can be set or cleared at any time. + * XPT_CONN, XPT_DATA, can be set or cleared at any time. * after a set, svc_sock_enqueue must be called. * after a clear, the socket must be read/accepted * if this succeeds, it must be set again. - * SK_CLOSE can set at any time. It is never cleared. - * xpt_ref contains a bias of '1' until SK_DEAD is set. + * XPT_CLOSE can set at any time. It is never cleared. + * xpt_ref contains a bias of '1' until XPT_DEAD is set. * so when xprt_ref hits zero, we know the transport is dead * and no-one is using it. - * SK_DEAD can only be set while SK_BUSY is held which ensures + * XPT_DEAD can only be set while XPT_BUSY is held which ensures * no other thread will be using the socket or will try to - * set SK_DEAD. + * set XPT_DEAD. 
* */ @@ -219,10 +220,10 @@ svc_sock_enqueue(struct svc_sock *svsk) struct svc_rqst *rqstp; int cpu; - if (!(svsk->sk_flags & - ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) )) + if (!(svsk->sk_xprt.xpt_flags & + ((1<<XPT_CONN)|(1<<XPT_DATA)|(1<<XPT_CLOSE)|(1<<XPT_DEFERRED)))) return; - if (test_bit(SK_DEAD, &svsk->sk_flags)) + if (test_bit(XPT_DEAD, &svsk->sk_xprt.xpt_flags)) return; cpu = get_cpu(); @@ -236,7 +237,7 @@ svc_sock_enqueue(struct svc_sock *svsk) printk(KERN_ERR "svc_sock_enqueue: threads and sockets both waiting??\n"); - if (test_bit(SK_DEAD, &svsk->sk_flags)) { + if (test_bit(XPT_DEAD, &svsk->sk_xprt.xpt_flags)) { /* Don't enqueue dead sockets */ dprintk("svc: socket %p is dead, not enqueued\n", svsk->sk_sk); goto out_unlock; @@ -244,10 +245,10 @@ /* Mark socket as busy. It will remain in this state until the * server has processed all pending data and put the socket back - * on the idle list. We update SK_BUSY atomically because + * on the idle list. We update XPT_BUSY atomically because * it also guards against trying to enqueue the svc_sock twice. */ - if (test_and_set_bit(SK_BUSY, &svsk->sk_flags)) { + if (test_and_set_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)) { /* Don't enqueue socket while already enqueued */ dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk); goto out_unlock; @@ -256,11 +257,11 @@ svsk->sk_pool = pool; /* Handle pending connection */ - if (test_bit(SK_CONN, &svsk->sk_flags)) + if (test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags)) goto process; /* Handle close in-progress */ - if (test_bit(SK_CLOSE, &svsk->sk_flags)) + if (test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)) goto process; /* Check if we have space to reply to a request */ @@ -268,7 +269,7 @@ /* Don't enqueue while not enough space for reply */ dprintk("svc: no write space, socket %p not enqueued\n", svsk); svsk->sk_pool = NULL; - clear_bit(SK_BUSY, &svsk->sk_flags); + clear_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags); goto out_unlock; } @@ -324,14 +325,14 @@ /* * Having read something from a socket, check whether it * needs to be re-enqueued. - * Note: SK_DATA only gets cleared when a read-attempt finds + * Note: XPT_DATA only gets cleared when a read-attempt finds * no (or insufficient) data. */ static inline void svc_sock_received(struct svc_sock *svsk) { svsk->sk_pool = NULL; - clear_bit(SK_BUSY, &svsk->sk_flags); + clear_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags); svc_sock_enqueue(svsk); } @@ -680,8 +681,9 @@ svc_udp_data_ready(struct sock *sk, int count) if (svsk) { dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n", - svsk, sk, count, test_bit(SK_BUSY, &svsk->sk_flags)); - set_bit(SK_DATA, &svsk->sk_flags); + svsk, sk, count, + test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); svc_sock_enqueue(svsk); } if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) @@ -698,7 +700,7 @@ svc_write_space(struct sock *sk) if (svsk) { dprintk("svc: socket %p(inet %p), write_space busy=%d\n", - svsk, sk, test_bit(SK_BUSY, &svsk->sk_flags)); + svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); svc_sock_enqueue(svsk); } @@ -748,7 +750,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) .msg_flags = MSG_DONTWAIT, }; - if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) + if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags)) /* udp sockets need large rcvbuf as all pending * requests are still in that buffer.
sndbuf must * also be large enough that there is enough space @@ -766,7 +768,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) return svc_deferred_recv(rqstp); } - clear_bit(SK_DATA, &svsk->sk_flags); + clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); skb = NULL; err = kernel_recvmsg(svsk->sk_sock, &msg, NULL, 0, 0, MSG_PEEK | MSG_DONTWAIT); @@ -777,7 +779,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) if (err != -EAGAIN) { /* possibly an icmp error */ dprintk("svc: recvfrom returned error %d\n", -err); - set_bit(SK_DATA, &svsk->sk_flags); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); } svc_sock_received(svsk); return -EAGAIN; @@ -789,7 +791,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) need that much accuracy */ } svsk->sk_sk->sk_stamp = skb->tstamp; - set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */ + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */ /* * Maybe more packets - kick another thread ASAP. @@ -936,8 +938,8 @@ svc_udp_init(struct svc_sock *svsk) 3 * svsk->sk_server->sv_max_mesg, 3 * svsk->sk_server->sv_max_mesg); - set_bit(SK_DATA, &svsk->sk_flags); /* might have come in before data_ready set up */ - set_bit(SK_CHNGBUF, &svsk->sk_flags); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* might have come in before data_ready set up */ + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); oldfs = get_fs(); set_fs(KERNEL_DS); @@ -971,7 +973,7 @@ svc_tcp_listen_data_ready(struct sock *sk, int count_unused) */ if (sk->sk_state == TCP_LISTEN) { if (svsk) { - set_bit(SK_CONN, &svsk->sk_flags); + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); svc_sock_enqueue(svsk); } else printk("svc: socket %p: no user data\n", sk); @@ -995,7 +997,7 @@ svc_tcp_state_change(struct sock *sk) if (!svsk) printk("svc: socket %p: no user data\n", sk); else { - set_bit(SK_CLOSE, &svsk->sk_flags); + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); svc_sock_enqueue(svsk); } if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) @@ -1010,7 +1012,7 @@ svc_tcp_data_ready(struct sock *sk, int count) dprintk("svc: socket %p TCP data ready (svsk %p)\n", sk, sk->sk_user_data); if (svsk) { - set_bit(SK_DATA, &svsk->sk_flags); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); svc_sock_enqueue(svsk); } if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) @@ -1050,7 +1052,7 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) if (!sock) return NULL; - clear_bit(SK_CONN, &svsk->sk_flags); + clear_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); err = kernel_accept(sock, &newsock, O_NONBLOCK); if (err < 0) { if (err == -ENOMEM) @@ -1061,8 +1063,7 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) serv->sv_name, -err); return NULL; } - - set_bit(SK_CONN, &svsk->sk_flags); + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); err = kernel_getpeername(newsock, sin, &slen); if (err < 0) { @@ -1127,16 +1128,16 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) int pnum, vlen; dprintk("svc: tcp_recv %p data %d conn %d close %d\n", - svsk, test_bit(SK_DATA, &svsk->sk_flags), - test_bit(SK_CONN, &svsk->sk_flags), - test_bit(SK_CLOSE, &svsk->sk_flags)); + svsk, test_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags), + test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags), + test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)); if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { svc_sock_received(svsk); return svc_deferred_recv(rqstp); } - if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) + if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags)) /* sndbuf needs to have room for one request * per thread, otherwise 
we can stall even when the * network isn't a bottleneck. @@ -1153,7 +1154,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) (serv->sv_nrthreads+3) * serv->sv_max_mesg, 3 * serv->sv_max_mesg); - clear_bit(SK_DATA, &svsk->sk_flags); + clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* Receive data. If we haven't got the record length yet, get * the next four bytes. Otherwise try to gobble up as much as @@ -1212,7 +1213,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) return -EAGAIN; /* record not complete */ } len = svsk->sk_reclen; - set_bit(SK_DATA, &svsk->sk_flags); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); vec = rqstp->rq_vec; vec[0] = rqstp->rq_arg.head[0]; @@ -1255,7 +1256,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) return len; err_delete: - set_bit(SK_CLOSE, &svsk->sk_flags); + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); return -EAGAIN; error: @@ -1288,7 +1289,7 @@ svc_tcp_sendto(struct svc_rqst *rqstp) reclen = htonl(0x80000000|((xbufp->len ) - 4)); memcpy(xbufp->head[0].iov_base, &reclen, 4); - if (test_bit(SK_DEAD, &rqstp->rq_sock->sk_flags)) + if (test_bit(XPT_DEAD, &rqstp->rq_sock->sk_xprt.xpt_flags)) return -ENOTCONN; sent = svc_sendto(rqstp, &rqstp->rq_res); @@ -1297,7 +1298,7 @@ svc_tcp_sendto(struct svc_rqst *rqstp) rqstp->rq_sock->sk_server->sv_name, (sent<0)?"got error":"sent only", sent, xbufp->len); - set_bit(SK_CLOSE, &rqstp->rq_sock->sk_flags); + set_bit(XPT_CLOSE, &rqstp->rq_sock->sk_xprt.xpt_flags); svc_sock_enqueue(rqstp->rq_sock); sent = -EAGAIN; } @@ -1387,9 +1388,9 @@ svc_tcp_init(struct svc_sock *svsk) if (sk->sk_state == TCP_LISTEN) { dprintk("setting up TCP socket for listening\n"); - set_bit(SK_LISTENER, &svsk->sk_flags); + set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); sk->sk_data_ready = svc_tcp_listen_data_ready; - set_bit(SK_CONN, &svsk->sk_flags); + set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); } else { dprintk("setting up TCP socket for reading\n"); sk->sk_state_change = svc_tcp_state_change; @@ -1409,10 +1410,10 @@ svc_tcp_init(struct svc_sock *svsk) 3 * svsk->sk_server->sv_max_mesg, 3 * svsk->sk_server->sv_max_mesg); - set_bit(SK_CHNGBUF, &svsk->sk_flags); - set_bit(SK_DATA, &svsk->sk_flags); + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); if (sk->sk_state != TCP_ESTABLISHED) - set_bit(SK_CLOSE, &svsk->sk_flags); + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); } } @@ -1429,12 +1430,12 @@ svc_sock_update_bufs(struct svc_serv *serv) list_for_each(le, &serv->sv_permsocks) { struct svc_sock *svsk = list_entry(le, struct svc_sock, sk_list); - set_bit(SK_CHNGBUF, &svsk->sk_flags); + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); } list_for_each(le, &serv->sv_tempsocks) { struct svc_sock *svsk = list_entry(le, struct svc_sock, sk_list); - set_bit(SK_CHNGBUF, &svsk->sk_flags); + set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); } spin_unlock_bh(&serv->sv_lock); } @@ -1471,7 +1472,7 @@ static void svc_check_conn_limits(struct svc_serv *serv) svsk = list_entry(serv->sv_tempsocks.prev, struct svc_sock, sk_list); - set_bit(SK_CLOSE, &svsk->sk_flags); + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); svc_xprt_get(&svsk->sk_xprt); } spin_unlock_bh(&serv->sv_lock); @@ -1575,10 +1576,10 @@ svc_recv(struct svc_rqst *rqstp, long timeout) spin_unlock_bh(&pool->sp_lock); len = 0; - if (test_bit(SK_CLOSE, &svsk->sk_flags)) { - dprintk("svc_recv: found SK_CLOSE\n"); + if (test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)) { + dprintk("svc_recv: found XPT_CLOSE\n"); svc_delete_socket(svsk); - } else if (test_bit(SK_LISTENER, 
&svsk->sk_flags)) { + } else if (test_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags)) { struct svc_xprt *newxpt; newxpt = svsk->sk_xprt.xpt_ops->xpo_accept(&svsk->sk_xprt); if (newxpt) { @@ -1605,7 +1606,7 @@ svc_recv(struct svc_rqst *rqstp, long timeout) return -EAGAIN; } svsk->sk_lastrecv = get_seconds(); - clear_bit(SK_OLD, &svsk->sk_flags); + clear_bit(XPT_OLD, &svsk->sk_xprt.xpt_flags); rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp)); rqstp->rq_chandle.defer = svc_defer; @@ -1652,7 +1653,7 @@ svc_send(struct svc_rqst *rqstp) /* Grab svsk->sk_mutex to serialize outgoing data. */ mutex_lock(&svsk->sk_mutex); - if (test_bit(SK_DEAD, &svsk->sk_flags)) + if (test_bit(XPT_DEAD, &svsk->sk_xprt.xpt_flags)) len = -ENOTCONN; else len = svsk->sk_xprt.xpt_ops->xpo_sendto(rqstp); @@ -1688,21 +1689,21 @@ svc_age_temp_sockets(unsigned long closure) list_for_each_safe(le, next, &serv->sv_tempsocks) { svsk = list_entry(le, struct svc_sock, sk_list); - if (!test_and_set_bit(SK_OLD, &svsk->sk_flags)) + if (!test_and_set_bit(XPT_OLD, &svsk->sk_xprt.xpt_flags)) continue; if (atomic_read(&svsk->sk_xprt.xpt_ref.refcount) > 1 - || test_bit(SK_BUSY, &svsk->sk_flags)) + || test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)) continue; svc_xprt_get(&svsk->sk_xprt); list_move(le, &to_be_aged); - set_bit(SK_CLOSE, &svsk->sk_flags); - set_bit(SK_DETACHED, &svsk->sk_flags); + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_DETACHED, &svsk->sk_xprt.xpt_flags); } spin_unlock_bh(&serv->sv_lock); while (!list_empty(&to_be_aged)) { le = to_be_aged.next; - /* fiddling the sk_list node is safe 'cos we're SK_DETACHED */ + /* fiddling the sk_list node is safe 'cos we're XPT_DETACHED */ list_del_init(le); svsk = list_entry(le, struct svc_sock, sk_list); @@ -1748,7 +1749,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, return NULL; } - set_bit(SK_BUSY, &svsk->sk_flags); + set_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags); inet->sk_user_data = svsk; svsk->sk_sock = sock; svsk->sk_sk = inet; @@ -1770,7 +1771,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, spin_lock_bh(&serv->sv_lock); if (is_temporary) { - set_bit(SK_TEMP, &svsk->sk_flags); + set_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags); list_add(&svsk->sk_list, &serv->sv_tempsocks); serv->sv_tmpcnt++; if (serv->sv_temptimer.function == NULL) { @@ -1781,7 +1782,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, jiffies + svc_conn_age_period * HZ); } } else { - clear_bit(SK_TEMP, &svsk->sk_flags); + clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags); list_add(&svsk->sk_list, &serv->sv_permsocks); } spin_unlock_bh(&serv->sv_lock); @@ -1931,7 +1932,7 @@ svc_delete_socket(struct svc_sock *svsk) spin_lock_bh(&serv->sv_lock); - if (!test_and_set_bit(SK_DETACHED, &svsk->sk_flags)) + if (!test_and_set_bit(XPT_DETACHED, &svsk->sk_xprt.xpt_flags)) list_del_init(&svsk->sk_list); /* * We used to delete the svc_sock from whichever list @@ -1940,9 +1941,9 @@ svc_delete_socket(struct svc_sock *svsk) * while still attached to a queue, the queue itself * is about to be destroyed (in svc_destroy). 
*/ - if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags)) { + if (!test_and_set_bit(XPT_DEAD, &svsk->sk_xprt.xpt_flags)) { BUG_ON(atomic_read(&svsk->sk_xprt.xpt_ref.refcount) < 2); - if (test_bit(SK_TEMP, &svsk->sk_flags)) + if (test_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags)) serv->sv_tmpcnt--; svc_xprt_put(&svsk->sk_xprt); } @@ -1952,26 +1953,26 @@ svc_delete_socket(struct svc_sock *svsk) static void svc_close_socket(struct svc_sock *svsk) { - set_bit(SK_CLOSE, &svsk->sk_flags); - if (test_and_set_bit(SK_BUSY, &svsk->sk_flags)) + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); + if (test_and_set_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)) /* someone else will have to effect the close */ return; svc_xprt_get(&svsk->sk_xprt); svc_delete_socket(svsk); - clear_bit(SK_BUSY, &svsk->sk_flags); + clear_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags); svc_xprt_put(&svsk->sk_xprt); } void svc_force_close_socket(struct svc_sock *svsk) { - set_bit(SK_CLOSE, &svsk->sk_flags); - if (test_bit(SK_BUSY, &svsk->sk_flags)) { + set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); + if (test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)) { /* Waiting to be processed, but no threads left, * So just remove it from the waiting list */ list_del_init(&svsk->sk_ready); - clear_bit(SK_BUSY, &svsk->sk_flags); + clear_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags); } svc_close_socket(svsk); } @@ -1996,7 +1997,7 @@ static void svc_revisit(struct cache_deferred_req *dreq, int too_many) spin_lock(&svsk->sk_lock); list_add(&dr->handle.recent, &svsk->sk_deferred); spin_unlock(&svsk->sk_lock); - set_bit(SK_DEFERRED, &svsk->sk_flags); + set_bit(XPT_DEFERRED, &svsk->sk_xprt.xpt_flags); svc_sock_enqueue(svsk); svc_xprt_put(&svsk->sk_xprt); } @@ -2059,16 +2060,16 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk) { struct svc_deferred_req *dr = NULL; - if (!test_bit(SK_DEFERRED, &svsk->sk_flags)) + if (!test_bit(XPT_DEFERRED, &svsk->sk_xprt.xpt_flags)) return NULL; spin_lock(&svsk->sk_lock); - clear_bit(SK_DEFERRED, &svsk->sk_flags); + clear_bit(XPT_DEFERRED, &svsk->sk_xprt.xpt_flags); if (!list_empty(&svsk->sk_deferred)) { dr = list_entry(svsk->sk_deferred.next, struct svc_deferred_req, handle.recent); list_del_init(&dr->handle.recent); - set_bit(SK_DEFERRED, &svsk->sk_flags); + set_bit(XPT_DEFERRED, &svsk->sk_xprt.xpt_flags); } spin_unlock(&svsk->sk_lock); return dr; From bb5cf160b282644c4491afbf76fbc66f5dc35030 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:07:50 -0600 Subject: [PATCH 064/100] svc: Move sk_server and sk_pool to svc_xprt This is another incremental change that moves transport independent fields from svc_sock to the svc_xprt structure. The changes should be functionally null. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 6 +++- include/linux/sunrpc/svcsock.h | 2 -- net/sunrpc/svc_xprt.c | 4 ++- net/sunrpc/svcsock.c | 57 ++++++++++++++++----------------- 4 files changed, 35 insertions(+), 34 deletions(-) diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 2edeb0d13ac2..4042cdfec4fe 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -48,11 +48,15 @@ struct svc_xprt { #define XPT_OLD 9 /* used for xprt aging mark+sweep */ #define XPT_DETACHED 10 /* detached from tempsocks list */ #define XPT_LISTENER 11 /* listening endpoint */ + + struct svc_pool *xpt_pool; /* current pool iff queued */ + struct svc_serv *xpt_server; /* service for transport */ }; int svc_reg_xprt_class(struct svc_xprt_class *); void svc_unreg_xprt_class(struct svc_xprt_class *); -void svc_xprt_init(struct svc_xprt_class *, struct svc_xprt *); +void svc_xprt_init(struct svc_xprt_class *, struct svc_xprt *, + struct svc_serv *); int svc_create_xprt(struct svc_serv *, char *, unsigned short, int); void svc_xprt_put(struct svc_xprt *xprt); diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index b8a8496dfc7c..92d4cc99c60d 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -22,8 +22,6 @@ struct svc_sock { struct socket * sk_sock; /* berkeley socket layer */ struct sock * sk_sk; /* INET layer */ - struct svc_pool * sk_pool; /* current pool iff queued */ - struct svc_serv * sk_server; /* service for this socket */ atomic_t sk_reserved; /* space on outq that is reserved */ spinlock_t sk_lock; /* protects sk_deferred and diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 31853bfcd88d..ea17b533db74 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -89,12 +89,14 @@ EXPORT_SYMBOL_GPL(svc_xprt_put); * Called by transport drivers to initialize the transport independent * portion of the transport instance. 
*/ -void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt) +void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt, + struct svc_serv *serv) { memset(xprt, 0, sizeof(*xprt)); xprt->xpt_class = xcl; xprt->xpt_ops = xcl->xcl_ops; kref_init(&xprt->xpt_ref); + xprt->xpt_server = serv; } EXPORT_SYMBOL_GPL(svc_xprt_init); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 0a7125271d44..f86538e1dec6 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -215,7 +215,7 @@ static void svc_release_skb(struct svc_rqst *rqstp) static void svc_sock_enqueue(struct svc_sock *svsk) { - struct svc_serv *serv = svsk->sk_server; + struct svc_serv *serv = svsk->sk_xprt.xpt_server; struct svc_pool *pool; struct svc_rqst *rqstp; int cpu; @@ -227,7 +227,7 @@ svc_sock_enqueue(struct svc_sock *svsk) return; cpu = get_cpu(); - pool = svc_pool_for_cpu(svsk->sk_server, cpu); + pool = svc_pool_for_cpu(svsk->sk_xprt.xpt_server, cpu); put_cpu(); spin_lock_bh(&pool->sp_lock); @@ -253,8 +253,8 @@ svc_sock_enqueue(struct svc_sock *svsk) dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk); goto out_unlock; } - BUG_ON(svsk->sk_pool != NULL); - svsk->sk_pool = pool; + BUG_ON(svsk->sk_xprt.xpt_pool != NULL); + svsk->sk_xprt.xpt_pool = pool; /* Handle pending connection */ if (test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags)) @@ -268,7 +268,7 @@ svc_sock_enqueue(struct svc_sock *svsk) if (!svsk->sk_xprt.xpt_ops->xpo_has_wspace(&svsk->sk_xprt)) { /* Don't enqueue while not enough space for reply */ dprintk("svc: no write space, socket %p not enqueued\n", svsk); - svsk->sk_pool = NULL; + svsk->sk_xprt.xpt_pool = NULL; clear_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags); goto out_unlock; } @@ -289,12 +289,12 @@ svc_sock_enqueue(struct svc_sock *svsk) svc_xprt_get(&svsk->sk_xprt); rqstp->rq_reserved = serv->sv_max_mesg; atomic_add(rqstp->rq_reserved, &svsk->sk_reserved); - BUG_ON(svsk->sk_pool != pool); + BUG_ON(svsk->sk_xprt.xpt_pool != pool); wake_up(&rqstp->rq_wait); } else { dprintk("svc: socket %p put into queue\n", svsk->sk_sk); list_add_tail(&svsk->sk_ready, &pool->sp_sockets); - BUG_ON(svsk->sk_pool != pool); + BUG_ON(svsk->sk_xprt.xpt_pool != pool); } out_unlock: @@ -331,7 +331,7 @@ svc_sock_dequeue(struct svc_pool *pool) static inline void svc_sock_received(struct svc_sock *svsk) { - svsk->sk_pool = NULL; + svsk->sk_xprt.xpt_pool = NULL; clear_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags); svc_sock_enqueue(svsk); } @@ -735,7 +735,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) { struct svc_sock *svsk = rqstp->rq_sock; - struct svc_serv *serv = svsk->sk_server; + struct svc_serv *serv = svsk->sk_xprt.xpt_server; struct sk_buff *skb; union { struct cmsghdr hdr; @@ -873,7 +873,7 @@ static void svc_udp_prep_reply_hdr(struct svc_rqst *rqstp) static int svc_udp_has_wspace(struct svc_xprt *xprt) { struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); - struct svc_serv *serv = svsk->sk_server; + struct svc_serv *serv = xprt->xpt_server; unsigned long required; /* @@ -920,13 +920,12 @@ static struct svc_xprt_class svc_udp_class = { .xcl_max_payload = RPCSVC_MAXPAYLOAD_UDP, }; -static void -svc_udp_init(struct svc_sock *svsk) +static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) { int one = 1; mm_segment_t oldfs; - svc_xprt_init(&svc_udp_class, &svsk->sk_xprt); + svc_xprt_init(&svc_udp_class, &svsk->sk_xprt, serv); svsk->sk_sk->sk_data_ready = svc_udp_data_ready; svsk->sk_sk->sk_write_space = svc_write_space; @@ -935,8 +934,8 @@ svc_udp_init(struct 
svc_sock *svsk) * svc_udp_recvfrom will re-adjust if necessary */ svc_sock_setbufsize(svsk->sk_sock, - 3 * svsk->sk_server->sv_max_mesg, - 3 * svsk->sk_server->sv_max_mesg); + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg, + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* might have come in before data_ready set up */ set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); @@ -1041,7 +1040,7 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); struct sockaddr_storage addr; struct sockaddr *sin = (struct sockaddr *) &addr; - struct svc_serv *serv = svsk->sk_server; + struct svc_serv *serv = svsk->sk_xprt.xpt_server; struct socket *sock = svsk->sk_sock; struct socket *newsock; struct svc_sock *newsvsk; @@ -1122,7 +1121,7 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) { struct svc_sock *svsk = rqstp->rq_sock; - struct svc_serv *serv = svsk->sk_server; + struct svc_serv *serv = svsk->sk_xprt.xpt_server; int len; struct kvec *vec; int pnum, vlen; @@ -1265,7 +1264,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) svc_sock_received(svsk); } else { printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", - svsk->sk_server->sv_name, -len); + svsk->sk_xprt.xpt_server->sv_name, -len); goto err_delete; } @@ -1295,7 +1294,7 @@ svc_tcp_sendto(struct svc_rqst *rqstp) sent = svc_sendto(rqstp, &rqstp->rq_res); if (sent != xbufp->len) { printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", - rqstp->rq_sock->sk_server->sv_name, + rqstp->rq_sock->sk_xprt.xpt_server->sv_name, (sent<0)?"got error":"sent only", sent, xbufp->len); set_bit(XPT_CLOSE, &rqstp->rq_sock->sk_xprt.xpt_flags); @@ -1319,7 +1318,7 @@ static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp) static int svc_tcp_has_wspace(struct svc_xprt *xprt) { struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); - struct svc_serv *serv = svsk->sk_server; + struct svc_serv *serv = svsk->sk_xprt.xpt_server; int required; int wspace; @@ -1378,13 +1377,12 @@ void svc_cleanup_xprt_sock(void) svc_unreg_xprt_class(&svc_udp_class); } -static void -svc_tcp_init(struct svc_sock *svsk) +static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) { struct sock *sk = svsk->sk_sk; struct tcp_sock *tp = tcp_sk(sk); - svc_xprt_init(&svc_tcp_class, &svsk->sk_xprt); + svc_xprt_init(&svc_tcp_class, &svsk->sk_xprt, serv); if (sk->sk_state == TCP_LISTEN) { dprintk("setting up TCP socket for listening\n"); @@ -1407,8 +1405,8 @@ svc_tcp_init(struct svc_sock *svsk) * svc_tcp_recvfrom will re-adjust if necessary */ svc_sock_setbufsize(svsk->sk_sock, - 3 * svsk->sk_server->sv_max_mesg, - 3 * svsk->sk_server->sv_max_mesg); + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg, + 3 * svsk->sk_xprt.xpt_server->sv_max_mesg); set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); @@ -1588,7 +1586,7 @@ svc_recv(struct svc_rqst *rqstp, long timeout) * listener holds a reference too */ __module_get(newxpt->xpt_class->xcl_owner); - svc_check_conn_limits(svsk->sk_server); + svc_check_conn_limits(svsk->sk_xprt.xpt_server); } svc_sock_received(svsk); } else { @@ -1756,7 +1754,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, svsk->sk_ostate = inet->sk_state_change; svsk->sk_odata = inet->sk_data_ready; svsk->sk_owspace = inet->sk_write_space; - svsk->sk_server = serv; svsk->sk_lastrecv = get_seconds(); spin_lock_init(&svsk->sk_lock); 
INIT_LIST_HEAD(&svsk->sk_deferred); @@ -1765,9 +1762,9 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, /* Initialize the socket */ if (sock->type == SOCK_DGRAM) - svc_udp_init(svsk); + svc_udp_init(svsk, serv); else - svc_tcp_init(svsk); + svc_tcp_init(svsk, serv); spin_lock_bh(&serv->sv_lock); if (is_temporary) { @@ -1925,7 +1922,7 @@ svc_delete_socket(struct svc_sock *svsk) dprintk("svc: svc_delete_socket(%p)\n", svsk); - serv = svsk->sk_server; + serv = svsk->sk_xprt.xpt_server; sk = svsk->sk_sk; svsk->sk_xprt.xpt_ops->xpo_detach(&svsk->sk_xprt); From 7a18208383ab3f3ce4a1f4e0536acc9372523d81 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:07:53 -0600 Subject: [PATCH 065/100] svc: Make close transport independent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move sk_list and sk_ready to svc_xprt. This involves close because these lists are walked by svcs when closing all their transports. So I combined the moving of these lists to svc_xprt with making close transport independent. The svc_force_sock_close has been changed to svc_close_all and takes a list as an argument. This removes some svc internals knowledge from the svcs. This code races with module removal and transport addition. Thanks to Simon Holm Thøgersen for a compile fix. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. Bruce Fields Cc: Simon Holm Thøgersen --- fs/lockd/svc.c | 6 +- fs/nfsd/nfssvc.c | 4 +- include/linux/sunrpc/svc_xprt.h | 2 + include/linux/sunrpc/svcsock.h | 4 +- net/sunrpc/svc.c | 9 +-- net/sunrpc/svc_xprt.c | 2 + net/sunrpc/svcsock.c | 104 +++++++++++++++----------------- 7 files changed, 62 insertions(+), 69 deletions(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 868691535115..a8e79a907202 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -221,10 +221,10 @@ lockd(struct svc_rqst *rqstp) static int find_xprt(struct svc_serv *serv, char *proto) { - struct svc_sock *svsk; + struct svc_xprt *xprt; int found = 0; - list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) - if (strcmp(svsk->sk_xprt.xpt_class->xcl_name, proto) == 0) { + list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) + if (strcmp(xprt->xpt_class->xcl_name, proto) == 0) { found = 1; break; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index a828b0b0fb67..9647b0f7bc0c 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -155,8 +155,8 @@ static int killsig; /* signal that was used to kill last nfsd */ static void nfsd_last_thread(struct svc_serv *serv) { /* When last nfsd thread exits we need to do some clean-up */ - struct svc_sock *svsk; - list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) + struct svc_xprt *xprt; + list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) lockd_down(); nfsd_serv = NULL; nfsd_racache_shutdown(); diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 4042cdfec4fe..0a3e09b42a83 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -36,6 +36,8 @@ struct svc_xprt { struct svc_xprt_class *xpt_class; struct svc_xprt_ops *xpt_ops; struct kref xpt_ref; + struct list_head xpt_list; + struct list_head xpt_ready; unsigned long xpt_flags; #define XPT_BUSY 0 /* enqueued/receiving */ #define XPT_CONN 1 /* conn pending */ diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index 92d4cc99c60d..060508ba358b 100644 --- a/include/linux/sunrpc/svcsock.h +++ 
b/include/linux/sunrpc/svcsock.h @@ -17,8 +17,6 @@ */ struct svc_sock { struct svc_xprt sk_xprt; - struct list_head sk_ready; /* list of ready sockets */ - struct list_head sk_list; /* list of all sockets */ struct socket * sk_sock; /* berkeley socket layer */ struct sock * sk_sk; /* INET layer */ @@ -51,7 +49,7 @@ struct svc_sock { /* * Function prototypes. */ -void svc_force_close_socket(struct svc_sock *); +void svc_close_all(struct list_head *); int svc_recv(struct svc_rqst *, long); int svc_send(struct svc_rqst *); void svc_drop(struct svc_rqst *); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 8281a0402652..a5eb2d6b14ae 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -459,9 +459,6 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize, void svc_destroy(struct svc_serv *serv) { - struct svc_sock *svsk; - struct svc_sock *tmp; - dprintk("svc: svc_destroy(%s, %d)\n", serv->sv_program->pg_name, serv->sv_nrthreads); @@ -476,14 +473,12 @@ svc_destroy(struct svc_serv *serv) del_timer_sync(&serv->sv_temptimer); - list_for_each_entry_safe(svsk, tmp, &serv->sv_tempsocks, sk_list) - svc_force_close_socket(svsk); + svc_close_all(&serv->sv_tempsocks); if (serv->sv_shutdown) serv->sv_shutdown(serv); - list_for_each_entry_safe(svsk, tmp, &serv->sv_permsocks, sk_list) - svc_force_close_socket(svsk); + svc_close_all(&serv->sv_permsocks); BUG_ON(!list_empty(&serv->sv_permsocks)); BUG_ON(!list_empty(&serv->sv_tempsocks)); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index ea17b533db74..95186b548099 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -97,6 +97,8 @@ void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt, xprt->xpt_ops = xcl->xcl_ops; kref_init(&xprt->xpt_ref); xprt->xpt_server = serv; + INIT_LIST_HEAD(&xprt->xpt_list); + INIT_LIST_HEAD(&xprt->xpt_ready); } EXPORT_SYMBOL_GPL(svc_xprt_init); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index f86538e1dec6..6f63a5ca6a91 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -81,11 +81,11 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, int *errp, int flags); -static void svc_delete_socket(struct svc_sock *svsk); +static void svc_delete_xprt(struct svc_xprt *xprt); static void svc_udp_data_ready(struct sock *, int); static int svc_udp_recvfrom(struct svc_rqst *); static int svc_udp_sendto(struct svc_rqst *); -static void svc_close_socket(struct svc_sock *svsk); +static void svc_close_xprt(struct svc_xprt *xprt); static void svc_sock_detach(struct svc_xprt *); static void svc_sock_free(struct svc_xprt *); @@ -293,7 +293,7 @@ svc_sock_enqueue(struct svc_sock *svsk) wake_up(&rqstp->rq_wait); } else { dprintk("svc: socket %p put into queue\n", svsk->sk_sk); - list_add_tail(&svsk->sk_ready, &pool->sp_sockets); + list_add_tail(&svsk->sk_xprt.xpt_ready, &pool->sp_sockets); BUG_ON(svsk->sk_xprt.xpt_pool != pool); } @@ -313,8 +313,8 @@ svc_sock_dequeue(struct svc_pool *pool) return NULL; svsk = list_entry(pool->sp_sockets.next, - struct svc_sock, sk_ready); - list_del_init(&svsk->sk_ready); + struct svc_sock, sk_xprt.xpt_ready); + list_del_init(&svsk->sk_xprt.xpt_ready); dprintk("svc: socket %p dequeued, inuse=%d\n", svsk->sk_sk, atomic_read(&svsk->sk_xprt.xpt_ref.refcount)); @@ -572,7 +572,7 @@ svc_sock_names(char *buf, struct svc_serv *serv, char *toclose) if (!serv) return 0; spin_lock_bh(&serv->sv_lock); - list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) { + list_for_each_entry(svsk, &serv->sv_permsocks, 
sk_xprt.xpt_list) { int onelen = one_sock_name(buf+len, svsk); if (toclose && strcmp(toclose, buf+len) == 0) closesk = svsk; @@ -584,7 +584,7 @@ svc_sock_names(char *buf, struct svc_serv *serv, char *toclose) /* Should unregister with portmap, but you cannot * unregister just one protocol... */ - svc_close_socket(closesk); + svc_close_xprt(&closesk->sk_xprt); else if (toclose) return -ENOENT; return len; @@ -1427,12 +1427,12 @@ svc_sock_update_bufs(struct svc_serv *serv) spin_lock_bh(&serv->sv_lock); list_for_each(le, &serv->sv_permsocks) { struct svc_sock *svsk = - list_entry(le, struct svc_sock, sk_list); + list_entry(le, struct svc_sock, sk_xprt.xpt_list); set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); } list_for_each(le, &serv->sv_tempsocks) { struct svc_sock *svsk = - list_entry(le, struct svc_sock, sk_list); + list_entry(le, struct svc_sock, sk_xprt.xpt_list); set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); } spin_unlock_bh(&serv->sv_lock); @@ -1469,7 +1469,7 @@ static void svc_check_conn_limits(struct svc_serv *serv) */ svsk = list_entry(serv->sv_tempsocks.prev, struct svc_sock, - sk_list); + sk_xprt.xpt_list); set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); svc_xprt_get(&svsk->sk_xprt); } @@ -1576,7 +1576,7 @@ svc_recv(struct svc_rqst *rqstp, long timeout) len = 0; if (test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)) { dprintk("svc_recv: found XPT_CLOSE\n"); - svc_delete_socket(svsk); + svc_delete_xprt(&svsk->sk_xprt); } else if (test_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags)) { struct svc_xprt *newxpt; newxpt = svsk->sk_xprt.xpt_ops->xpo_accept(&svsk->sk_xprt); @@ -1685,7 +1685,7 @@ svc_age_temp_sockets(unsigned long closure) } list_for_each_safe(le, next, &serv->sv_tempsocks) { - svsk = list_entry(le, struct svc_sock, sk_list); + svsk = list_entry(le, struct svc_sock, sk_xprt.xpt_list); if (!test_and_set_bit(XPT_OLD, &svsk->sk_xprt.xpt_flags)) continue; @@ -1701,9 +1701,9 @@ svc_age_temp_sockets(unsigned long closure) while (!list_empty(&to_be_aged)) { le = to_be_aged.next; - /* fiddling the sk_list node is safe 'cos we're XPT_DETACHED */ + /* fiddling the sk_xprt.xpt_list node is safe 'cos we're XPT_DETACHED */ list_del_init(le); - svsk = list_entry(le, struct svc_sock, sk_list); + svsk = list_entry(le, struct svc_sock, sk_xprt.xpt_list); dprintk("queuing svsk %p for closing, %lu seconds old\n", svsk, get_seconds() - svsk->sk_lastrecv); @@ -1757,7 +1757,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, svsk->sk_lastrecv = get_seconds(); spin_lock_init(&svsk->sk_lock); INIT_LIST_HEAD(&svsk->sk_deferred); - INIT_LIST_HEAD(&svsk->sk_ready); mutex_init(&svsk->sk_mutex); /* Initialize the socket */ @@ -1769,7 +1768,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, spin_lock_bh(&serv->sv_lock); if (is_temporary) { set_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags); - list_add(&svsk->sk_list, &serv->sv_tempsocks); + list_add(&svsk->sk_xprt.xpt_list, &serv->sv_tempsocks); serv->sv_tmpcnt++; if (serv->sv_temptimer.function == NULL) { /* setup timer to age temp sockets */ @@ -1780,7 +1779,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, } } else { clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags); - list_add(&svsk->sk_list, &serv->sv_permsocks); + list_add(&svsk->sk_xprt.xpt_list, &serv->sv_permsocks); } spin_unlock_bh(&serv->sv_lock); @@ -1912,66 +1911,63 @@ static void svc_sock_free(struct svc_xprt *xprt) } /* - * Remove a dead socket + * Remove a dead transport */ -static void -svc_delete_socket(struct svc_sock *svsk) +static void 
svc_delete_xprt(struct svc_xprt *xprt) { - struct svc_serv *serv; - struct sock *sk; + struct svc_serv *serv = xprt->xpt_server; - dprintk("svc: svc_delete_socket(%p)\n", svsk); - - serv = svsk->sk_xprt.xpt_server; - sk = svsk->sk_sk; - - svsk->sk_xprt.xpt_ops->xpo_detach(&svsk->sk_xprt); + dprintk("svc: svc_delete_xprt(%p)\n", xprt); + xprt->xpt_ops->xpo_detach(xprt); spin_lock_bh(&serv->sv_lock); - - if (!test_and_set_bit(XPT_DETACHED, &svsk->sk_xprt.xpt_flags)) - list_del_init(&svsk->sk_list); + if (!test_and_set_bit(XPT_DETACHED, &xprt->xpt_flags)) + list_del_init(&xprt->xpt_list); /* - * We used to delete the svc_sock from whichever list - * it's sk_ready node was on, but we don't actually + * We used to delete the transport from whichever list + * it's sk_xprt.xpt_ready node was on, but we don't actually * need to. This is because the only time we're called * while still attached to a queue, the queue itself * is about to be destroyed (in svc_destroy). */ - if (!test_and_set_bit(XPT_DEAD, &svsk->sk_xprt.xpt_flags)) { - BUG_ON(atomic_read(&svsk->sk_xprt.xpt_ref.refcount) < 2); - if (test_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags)) + if (!test_and_set_bit(XPT_DEAD, &xprt->xpt_flags)) { + BUG_ON(atomic_read(&xprt->xpt_ref.refcount) < 2); + if (test_bit(XPT_TEMP, &xprt->xpt_flags)) serv->sv_tmpcnt--; - svc_xprt_put(&svsk->sk_xprt); + svc_xprt_put(xprt); } - spin_unlock_bh(&serv->sv_lock); } -static void svc_close_socket(struct svc_sock *svsk) +static void svc_close_xprt(struct svc_xprt *xprt) { - set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); - if (test_and_set_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)) + set_bit(XPT_CLOSE, &xprt->xpt_flags); + if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) /* someone else will have to effect the close */ return; - svc_xprt_get(&svsk->sk_xprt); - svc_delete_socket(svsk); - clear_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags); - svc_xprt_put(&svsk->sk_xprt); + svc_xprt_get(xprt); + svc_delete_xprt(xprt); + clear_bit(XPT_BUSY, &xprt->xpt_flags); + svc_xprt_put(xprt); } -void svc_force_close_socket(struct svc_sock *svsk) +void svc_close_all(struct list_head *xprt_list) { - set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); - if (test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)) { - /* Waiting to be processed, but no threads left, - * So just remove it from the waiting list - */ - list_del_init(&svsk->sk_ready); - clear_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags); + struct svc_xprt *xprt; + struct svc_xprt *tmp; + + list_for_each_entry_safe(xprt, tmp, xprt_list, xpt_list) { + set_bit(XPT_CLOSE, &xprt->xpt_flags); + if (test_bit(XPT_BUSY, &xprt->xpt_flags)) { + /* Waiting to be processed, but no threads left, + * So just remove it from the waiting list + */ + list_del_init(&xprt->xpt_ready); + clear_bit(XPT_BUSY, &xprt->xpt_flags); + } + svc_close_xprt(xprt); } - svc_close_socket(svsk); } /* From 7a90e8cc21ad80529b3a3371dc97acc8832cc592 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:07:55 -0600 Subject: [PATCH 066/100] svc: Move sk_reserved to svc_xprt This functionally trivial patch moves the sk_reserved field to the transport independent svc_xprt structure. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 1 + include/linux/sunrpc/svcsock.h | 2 -- net/sunrpc/svcsock.c | 10 +++++----- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 0a3e09b42a83..0b8ee06f99c0 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -53,6 +53,7 @@ struct svc_xprt { struct svc_pool *xpt_pool; /* current pool iff queued */ struct svc_serv *xpt_server; /* service for transport */ + atomic_t xpt_reserved; /* space on outq that is rsvd */ }; int svc_reg_xprt_class(struct svc_xprt_class *); diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index 060508ba358b..ba41f11788f2 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -20,8 +20,6 @@ struct svc_sock { struct socket * sk_sock; /* berkeley socket layer */ struct sock * sk_sk; /* INET layer */ - atomic_t sk_reserved; /* space on outq that is reserved */ - spinlock_t sk_lock; /* protects sk_deferred and * sk_info_authunix */ struct list_head sk_deferred; /* deferred requests that need to diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 6f63a5ca6a91..c47bede754ea 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -288,7 +288,7 @@ svc_sock_enqueue(struct svc_sock *svsk) rqstp->rq_sock = svsk; svc_xprt_get(&svsk->sk_xprt); rqstp->rq_reserved = serv->sv_max_mesg; - atomic_add(rqstp->rq_reserved, &svsk->sk_reserved); + atomic_add(rqstp->rq_reserved, &svsk->sk_xprt.xpt_reserved); BUG_ON(svsk->sk_xprt.xpt_pool != pool); wake_up(&rqstp->rq_wait); } else { @@ -353,7 +353,7 @@ void svc_reserve(struct svc_rqst *rqstp, int space) if (space < rqstp->rq_reserved) { struct svc_sock *svsk = rqstp->rq_sock; - atomic_sub((rqstp->rq_reserved - space), &svsk->sk_reserved); + atomic_sub((rqstp->rq_reserved - space), &svsk->sk_xprt.xpt_reserved); rqstp->rq_reserved = space; svc_sock_enqueue(svsk); @@ -881,7 +881,7 @@ static int svc_udp_has_wspace(struct svc_xprt *xprt) * sock space. */ set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); - required = atomic_read(&svsk->sk_reserved) + serv->sv_max_mesg; + required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg; if (required*2 > sock_wspace(svsk->sk_sk)) return 0; clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); @@ -1327,7 +1327,7 @@ static int svc_tcp_has_wspace(struct svc_xprt *xprt) * sock space. */ set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); - required = atomic_read(&svsk->sk_reserved) + serv->sv_max_mesg; + required = atomic_read(&svsk->sk_xprt.xpt_reserved) + serv->sv_max_mesg; wspace = sk_stream_wspace(svsk->sk_sk); if (wspace < sk_stream_min_wspace(svsk->sk_sk)) @@ -1544,7 +1544,7 @@ svc_recv(struct svc_rqst *rqstp, long timeout) rqstp->rq_sock = svsk; svc_xprt_get(&svsk->sk_xprt); rqstp->rq_reserved = serv->sv_max_mesg; - atomic_add(rqstp->rq_reserved, &svsk->sk_reserved); + atomic_add(rqstp->rq_reserved, &svsk->sk_xprt.xpt_reserved); } else { /* No data pending. Go to sleep */ svc_thread_enqueue(pool, rqstp); From f6150c3cab6e788afacb07470be3c6b4a722f1ed Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:07:57 -0600 Subject: [PATCH 067/100] svc: Make the enqueue service transport neutral and export it. The svc_sock_enqueue function is now transport independent since all of the fields it touches have been moved to the transport independent svc_xprt structure. 
Change the function to use the svc_xprt structure directly instead of the transport specific svc_sock structure. Transport specific data-ready handlers need to call this function, so export it. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 94 ++++++++++++++++++++++---------------------- 1 file changed, 48 insertions(+), 46 deletions(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index c47bede754ea..d95a0c894d4f 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -5,7 +5,7 @@ * * The server scheduling algorithm does not always distribute the load * evenly when servicing a single client. May need to modify the - * svc_sock_enqueue procedure... + * svc_xprt_enqueue procedure... * * TCP support is largely untested and may be a little slow. The problem * is that we currently do two separate recvfrom's, one for the 4-byte @@ -63,7 +63,7 @@ * providing that certain rules are followed: * * XPT_CONN, XPT_DATA, can be set or cleared at any time. - * after a set, svc_sock_enqueue must be called. + * after a set, svc_xprt_enqueue must be called. * after a clear, the socket must be read/accepted * if this succeeds, it must be set again. * XPT_CLOSE can set at any time. It is never cleared. @@ -212,22 +212,21 @@ static void svc_release_skb(struct svc_rqst *rqstp) * processes, wake 'em up. * */ -static void -svc_sock_enqueue(struct svc_sock *svsk) +void svc_xprt_enqueue(struct svc_xprt *xprt) { - struct svc_serv *serv = svsk->sk_xprt.xpt_server; + struct svc_serv *serv = xprt->xpt_server; struct svc_pool *pool; struct svc_rqst *rqstp; int cpu; - if (!(svsk->sk_xprt.xpt_flags & + if (!(xprt->xpt_flags & ((1<<XPT_CONN)|(1<<XPT_DATA)|(1<<XPT_CLOSE)|(1<<XPT_DEFERRED)))) return; - if (test_bit(XPT_DEAD, &svsk->sk_xprt.xpt_flags)) + if (test_bit(XPT_DEAD, &xprt->xpt_flags)) return; cpu = get_cpu(); - pool = svc_pool_for_cpu(svsk->sk_xprt.xpt_server, cpu); + pool = svc_pool_for_cpu(xprt->xpt_server, cpu); put_cpu(); spin_lock_bh(&pool->sp_lock); @@ -235,11 +234,12 @@ svc_sock_enqueue(struct svc_sock *svsk) if (!list_empty(&pool->sp_threads) && !list_empty(&pool->sp_sockets)) printk(KERN_ERR - "svc_sock_enqueue: threads and sockets both waiting??\n"); + "svc_xprt_enqueue: " + "threads and transports both waiting??\n"); - if (test_bit(XPT_DEAD, &svsk->sk_xprt.xpt_flags)) { + if (test_bit(XPT_DEAD, &xprt->xpt_flags)) { /* Don't enqueue dead sockets */ - dprintk("svc: socket %p is dead, not enqueued\n", svsk->sk_sk); + dprintk("svc: transport %p is dead, not enqueued\n", xprt); goto out_unlock; } @@ -248,28 +248,29 @@ /* Mark socket as busy. It will remain in this state until the * server has processed all pending data and put the socket back * on the idle list. We update XPT_BUSY atomically because * it also guards against trying to enqueue the svc_sock twice.
*/ - if (test_and_set_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)) { + if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) { /* Don't enqueue socket while already enqueued */ - dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk); + dprintk("svc: transport %p busy, not enqueued\n", xprt); goto out_unlock; } - BUG_ON(svsk->sk_xprt.xpt_pool != NULL); - svsk->sk_xprt.xpt_pool = pool; + BUG_ON(xprt->xpt_pool != NULL); + xprt->xpt_pool = pool; /* Handle pending connection */ - if (test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags)) + if (test_bit(XPT_CONN, &xprt->xpt_flags)) goto process; /* Handle close in-progress */ - if (test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)) + if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) goto process; /* Check if we have space to reply to a request */ - if (!svsk->sk_xprt.xpt_ops->xpo_has_wspace(&svsk->sk_xprt)) { + if (!xprt->xpt_ops->xpo_has_wspace(xprt)) { /* Don't enqueue while not enough space for reply */ - dprintk("svc: no write space, socket %p not enqueued\n", svsk); - svsk->sk_xprt.xpt_pool = NULL; - clear_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags); + dprintk("svc: no write space, transport %p not enqueued\n", + xprt); + xprt->xpt_pool = NULL; + clear_bit(XPT_BUSY, &xprt->xpt_flags); goto out_unlock; } @@ -278,28 +279,29 @@ svc_sock_enqueue(struct svc_sock *svsk) rqstp = list_entry(pool->sp_threads.next, struct svc_rqst, rq_list); - dprintk("svc: socket %p served by daemon %p\n", - svsk->sk_sk, rqstp); + dprintk("svc: transport %p served by daemon %p\n", + xprt, rqstp); svc_thread_dequeue(pool, rqstp); - if (rqstp->rq_sock) + if (rqstp->rq_xprt) printk(KERN_ERR - "svc_sock_enqueue: server %p, rq_sock=%p!\n", - rqstp, rqstp->rq_sock); - rqstp->rq_sock = svsk; - svc_xprt_get(&svsk->sk_xprt); + "svc_xprt_enqueue: server %p, rq_xprt=%p!\n", + rqstp, rqstp->rq_xprt); + rqstp->rq_xprt = xprt; + svc_xprt_get(xprt); rqstp->rq_reserved = serv->sv_max_mesg; - atomic_add(rqstp->rq_reserved, &svsk->sk_xprt.xpt_reserved); - BUG_ON(svsk->sk_xprt.xpt_pool != pool); + atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); + BUG_ON(xprt->xpt_pool != pool); wake_up(&rqstp->rq_wait); } else { - dprintk("svc: socket %p put into queue\n", svsk->sk_sk); - list_add_tail(&svsk->sk_xprt.xpt_ready, &pool->sp_sockets); - BUG_ON(svsk->sk_xprt.xpt_pool != pool); + dprintk("svc: transport %p put into queue\n", xprt); + list_add_tail(&xprt->xpt_ready, &pool->sp_sockets); + BUG_ON(xprt->xpt_pool != pool); } out_unlock: spin_unlock_bh(&pool->sp_lock); } +EXPORT_SYMBOL_GPL(svc_xprt_enqueue); /* * Dequeue the first socket. Must be called with the pool->sp_lock held. 
@@ -333,7 +335,7 @@ svc_sock_received(struct svc_sock *svsk) { svsk->sk_xprt.xpt_pool = NULL; clear_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags); - svc_sock_enqueue(svsk); + svc_xprt_enqueue(&svsk->sk_xprt); } @@ -352,11 +354,11 @@ void svc_reserve(struct svc_rqst *rqstp, int space) space += rqstp->rq_res.head[0].iov_len; if (space < rqstp->rq_reserved) { - struct svc_sock *svsk = rqstp->rq_sock; - atomic_sub((rqstp->rq_reserved - space), &svsk->sk_xprt.xpt_reserved); + struct svc_xprt *xprt = rqstp->rq_xprt; + atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved); rqstp->rq_reserved = space; - svc_sock_enqueue(svsk); + svc_xprt_enqueue(xprt); } } @@ -684,7 +686,7 @@ svc_udp_data_ready(struct sock *sk, int count) svsk, sk, count, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - svc_sock_enqueue(svsk); + svc_xprt_enqueue(&svsk->sk_xprt); } if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) wake_up_interruptible(sk->sk_sleep); @@ -701,7 +703,7 @@ svc_write_space(struct sock *sk) if (svsk) { dprintk("svc: socket %p(inet %p), write_space busy=%d\n", svsk, sk, test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)); - svc_sock_enqueue(svsk); + svc_xprt_enqueue(&svsk->sk_xprt); } if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) { @@ -973,7 +975,7 @@ svc_tcp_listen_data_ready(struct sock *sk, int count_unused) if (sk->sk_state == TCP_LISTEN) { if (svsk) { set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); - svc_sock_enqueue(svsk); + svc_xprt_enqueue(&svsk->sk_xprt); } else printk("svc: socket %p: no user data\n", sk); } @@ -997,7 +999,7 @@ svc_tcp_state_change(struct sock *sk) printk("svc: socket %p: no user data\n", sk); else { set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); - svc_sock_enqueue(svsk); + svc_xprt_enqueue(&svsk->sk_xprt); } if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) wake_up_interruptible_all(sk->sk_sleep); @@ -1012,7 +1014,7 @@ svc_tcp_data_ready(struct sock *sk, int count) sk, sk->sk_user_data); if (svsk) { set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); - svc_sock_enqueue(svsk); + svc_xprt_enqueue(&svsk->sk_xprt); } if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) wake_up_interruptible(sk->sk_sleep); @@ -1298,7 +1300,7 @@ svc_tcp_sendto(struct svc_rqst *rqstp) (sent<0)?"got error":"sent only", sent, xbufp->len); set_bit(XPT_CLOSE, &rqstp->rq_sock->sk_xprt.xpt_flags); - svc_sock_enqueue(rqstp->rq_sock); + svc_xprt_enqueue(rqstp->rq_xprt); sent = -EAGAIN; } return sent; @@ -1476,7 +1478,7 @@ static void svc_check_conn_limits(struct svc_serv *serv) spin_unlock_bh(&serv->sv_lock); if (svsk) { - svc_sock_enqueue(svsk); + svc_xprt_enqueue(&svsk->sk_xprt); svc_xprt_put(&svsk->sk_xprt); } } @@ -1709,7 +1711,7 @@ svc_age_temp_sockets(unsigned long closure) svsk, get_seconds() - svsk->sk_lastrecv); /* a thread will dequeue and close it soon */ - svc_sock_enqueue(svsk); + svc_xprt_enqueue(&svsk->sk_xprt); svc_xprt_put(&svsk->sk_xprt); } @@ -1991,7 +1993,7 @@ static void svc_revisit(struct cache_deferred_req *dreq, int too_many) list_add(&dr->handle.recent, &svsk->sk_deferred); spin_unlock(&svsk->sk_lock); set_bit(XPT_DEFERRED, &svsk->sk_xprt.xpt_flags); - svc_sock_enqueue(svsk); + svc_xprt_enqueue(&svsk->sk_xprt); svc_xprt_put(&svsk->sk_xprt); } From a50fea26b9d2aa7b66fdd6d9579de10827ec086a Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:07:59 -0600 Subject: [PATCH 068/100] svc: Make svc_send transport neutral Move the sk_mutex field to the transport independent svc_xprt structure. 
Now all the fields that svc_send touches are transport neutral. Change the svc_send function to use the transport independent svc_xprt directly instead of the transport dependent svc_sock structure. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 1 + include/linux/sunrpc/svcsock.h | 1 - net/sunrpc/svc_xprt.c | 1 + net/sunrpc/svcsock.c | 19 ++++++++----------- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 0b8ee06f99c0..1b8c596b1177 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -54,6 +54,7 @@ struct svc_xprt { struct svc_pool *xpt_pool; /* current pool iff queued */ struct svc_serv *xpt_server; /* service for transport */ atomic_t xpt_reserved; /* space on outq that is rsvd */ + struct mutex xpt_mutex; /* to serialize sending data */ }; int svc_reg_xprt_class(struct svc_xprt_class *); diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index ba41f11788f2..41c2dfaf7371 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -24,7 +24,6 @@ struct svc_sock { * sk_info_authunix */ struct list_head sk_deferred; /* deferred requests that need to * be revisted */ - struct mutex sk_mutex; /* to serialize sending data */ /* We keep the old state_change and data_ready CB's here */ void (*sk_ostate)(struct sock *); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 95186b548099..3e6a1c81d4ce 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -99,6 +99,7 @@ void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt, xprt->xpt_server = serv; INIT_LIST_HEAD(&xprt->xpt_list); INIT_LIST_HEAD(&xprt->xpt_ready); + mutex_init(&xprt->xpt_mutex); } EXPORT_SYMBOL_GPL(svc_xprt_init); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index d95a0c894d4f..2016d9c63f88 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1632,15 +1632,13 @@ svc_drop(struct svc_rqst *rqstp) int svc_send(struct svc_rqst *rqstp) { - struct svc_sock *svsk; + struct svc_xprt *xprt; int len; struct xdr_buf *xb; - if ((svsk = rqstp->rq_sock) == NULL) { - printk(KERN_WARNING "NULL socket pointer in %s:%d\n", - __FILE__, __LINE__); + xprt = rqstp->rq_xprt; + if (!xprt) return -EFAULT; - } /* release the receive skb before sending the reply */ rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp); @@ -1651,13 +1649,13 @@ svc_send(struct svc_rqst *rqstp) xb->page_len + xb->tail[0].iov_len; - /* Grab svsk->sk_mutex to serialize outgoing data. */ - mutex_lock(&svsk->sk_mutex); - if (test_bit(XPT_DEAD, &svsk->sk_xprt.xpt_flags)) + /* Grab mutex to serialize outgoing data. 
*/ + mutex_lock(&xprt->xpt_mutex); + if (test_bit(XPT_DEAD, &xprt->xpt_flags)) len = -ENOTCONN; else - len = svsk->sk_xprt.xpt_ops->xpo_sendto(rqstp); - mutex_unlock(&svsk->sk_mutex); + len = xprt->xpt_ops->xpo_sendto(rqstp); + mutex_unlock(&xprt->xpt_mutex); svc_sock_release(rqstp); if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN) @@ -1759,7 +1757,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, svsk->sk_lastrecv = get_seconds(); spin_lock_init(&svsk->sk_lock); INIT_LIST_HEAD(&svsk->sk_deferred); - mutex_init(&svsk->sk_mutex); /* Initialize the socket */ if (sock->type == SOCK_DGRAM) From a6046f71f2b598af241e7496a8ead90f2979224b Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:08:01 -0600 Subject: [PATCH 069/100] svc: Change svc_sock_received to svc_xprt_received and export it All fields touched by svc_sock_received are now transport independent. Change it to use svc_xprt directly. This function is called from transport dependent code, so export it. Update the comment to clearly state the rules for calling this function. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 2 +- net/sunrpc/svcsock.c | 46 +++++++++++++++++---------------- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 1b8c596b1177..936e0dc52fcc 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -62,8 +62,8 @@ void svc_unreg_xprt_class(struct svc_xprt_class *); void svc_xprt_init(struct svc_xprt_class *, struct svc_xprt *, struct svc_serv *); int svc_create_xprt(struct svc_serv *, char *, unsigned short, int); +void svc_xprt_received(struct svc_xprt *); void svc_xprt_put(struct svc_xprt *xprt); - static inline void svc_xprt_get(struct svc_xprt *xprt) { kref_get(&xprt->xpt_ref); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 2016d9c63f88..5fb537e4d63b 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -325,19 +325,21 @@ svc_sock_dequeue(struct svc_pool *pool) } /* - * Having read something from a socket, check whether it - * needs to be re-enqueued. - * Note: XPT_DATA only gets cleared when a read-attempt finds - * no (or insufficient) data. + * svc_xprt_received conditionally queues the transport for processing + * by another thread. The caller must hold the XPT_BUSY bit and must + * not thereafter touch transport data. + * + * Note: XPT_DATA only gets cleared when a read-attempt finds no (or + * insufficient) data. */ -static inline void -svc_sock_received(struct svc_sock *svsk) +void svc_xprt_received(struct svc_xprt *xprt) { - svsk->sk_xprt.xpt_pool = NULL; - clear_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags); - svc_xprt_enqueue(&svsk->sk_xprt); + BUG_ON(!test_bit(XPT_BUSY, &xprt->xpt_flags)); + xprt->xpt_pool = NULL; + clear_bit(XPT_BUSY, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); } - +EXPORT_SYMBOL_GPL(svc_xprt_received); /** * svc_reserve - change the space reserved for the reply to a request. 
@@ -766,7 +768,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) (serv->sv_nrthreads+3) * serv->sv_max_mesg); if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); return svc_deferred_recv(rqstp); } @@ -783,7 +785,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) dprintk("svc: recvfrom returned error %d\n", -err); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); } - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); return -EAGAIN; } rqstp->rq_addrlen = sizeof(rqstp->rq_addr); @@ -798,7 +800,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) /* * Maybe more packets - kick another thread ASAP. */ - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); len = skb->len - sizeof(struct udphdr); rqstp->rq_arg.len = len; @@ -1104,7 +1106,7 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) } memcpy(&newsvsk->sk_local, sin, slen); - svc_sock_received(newsvsk); + svc_xprt_received(&newsvsk->sk_xprt); if (serv->sv_stats) serv->sv_stats->nettcpconn++; @@ -1134,7 +1136,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)); if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); return svc_deferred_recv(rqstp); } @@ -1174,7 +1176,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) if (len < want) { dprintk("svc: short recvfrom while reading record length (%d of %lu)\n", len, want); - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); return -EAGAIN; /* record header not complete */ } @@ -1210,7 +1212,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) if (len < svsk->sk_reclen) { dprintk("svc: incomplete TCP record (%d of %d)\n", len, svsk->sk_reclen); - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); return -EAGAIN; /* record not complete */ } len = svsk->sk_reclen; @@ -1250,7 +1252,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) svsk->sk_reclen = 0; svsk->sk_tcplen = 0; - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); if (serv->sv_stats) serv->sv_stats->nettcpcnt++; @@ -1263,7 +1265,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) error: if (len == -EAGAIN) { dprintk("RPC: TCP recvfrom got EAGAIN\n"); - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); } else { printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", svsk->sk_xprt.xpt_server->sv_name, -len); @@ -1590,7 +1592,7 @@ svc_recv(struct svc_rqst *rqstp, long timeout) __module_get(newxpt->xpt_class->xcl_owner); svc_check_conn_limits(svsk->sk_xprt.xpt_server); } - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); } else { dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n", rqstp, pool->sp_id, svsk, @@ -1809,7 +1811,7 @@ int svc_addsock(struct svc_serv *serv, else { svsk = svc_setup_socket(serv, so, &err, SVC_SOCK_DEFAULTS); if (svsk) { - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); err = 0; } } @@ -1865,7 +1867,7 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, } if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) { - svc_sock_received(svsk); + svc_xprt_received(&svsk->sk_xprt); return (struct svc_xprt *)svsk; } From 6bc5ab1367e43e59671d3fd19e6b0d2e4c138323 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:08:03 -0600 Subject: [PATCH 070/100] svc: Move accept call to svc_xprt_received to common code Now that the svc_xprt_received function handles transports, the call to svc_xprt_received in the xpo_tcp_accept function can be moved to common 
code. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 5fb537e4d63b..73975547c385 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1106,8 +1106,6 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) } memcpy(&newsvsk->sk_local, sin, slen); - svc_xprt_received(&newsvsk->sk_xprt); - if (serv->sv_stats) serv->sv_stats->nettcpconn++; @@ -1591,6 +1589,7 @@ svc_recv(struct svc_rqst *rqstp, long timeout) */ __module_get(newxpt->xpt_class->xcl_owner); svc_check_conn_limits(svsk->sk_xprt.xpt_server); + svc_xprt_received(newxpt); } svc_xprt_received(&svsk->sk_xprt); } else { From 4bc6c497b26a7984cac842a09e2e8f8c46242782 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:08:05 -0600 Subject: [PATCH 071/100] svc: Remove sk_lastrecv With the implementation of the new mark and sweep algorithm for shutting down old connections, the sk_lastrecv field is no longer needed. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svcsock.h | 1 - net/sunrpc/svcsock.c | 5 +---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index 41c2dfaf7371..406d0031b989 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -33,7 +33,6 @@ struct svc_sock { /* private TCP part */ int sk_reclen; /* length of record */ int sk_tcplen; /* current read length */ - time_t sk_lastrecv; /* time of last received request */ /* cache of various info for TCP sockets */ void *sk_info_authunix; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 73975547c385..2390286e1827 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1606,7 +1606,6 @@ svc_recv(struct svc_rqst *rqstp, long timeout) svc_sock_release(rqstp); return -EAGAIN; } - svsk->sk_lastrecv = get_seconds(); clear_bit(XPT_OLD, &svsk->sk_xprt.xpt_flags); rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp)); @@ -1706,8 +1705,7 @@ svc_age_temp_sockets(unsigned long closure) list_del_init(le); svsk = list_entry(le, struct svc_sock, sk_xprt.xpt_list); - dprintk("queuing svsk %p for closing, %lu seconds old\n", - svsk, get_seconds() - svsk->sk_lastrecv); + dprintk("queuing svsk %p for closing\n", svsk); /* a thread will dequeue and close it soon */ svc_xprt_enqueue(&svsk->sk_xprt); @@ -1755,7 +1753,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, svsk->sk_ostate = inet->sk_state_change; svsk->sk_odata = inet->sk_data_ready; svsk->sk_owspace = inet->sk_write_space; - svsk->sk_lastrecv = get_seconds(); spin_lock_init(&svsk->sk_lock); INIT_LIST_HEAD(&svsk->sk_deferred); From def13d7401e9b95bbd34c20057ebeb2972708b1b Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:08:08 -0600 Subject: [PATCH 072/100] svc: Move the authinfo cache to svc_xprt. Move the authinfo cache to svc_xprt. This allows both the TCP and RDMA transports to share this logic. A flag bit is used to determine if auth information is to be cached or not. Previously, this code looked at the transport protocol. I've also changed the spin_lock/unlock logic so that a lock is not taken for transports that are not caching auth info. 
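As an illustrative sketch only (not part of the diff below), the resulting pattern is: a connection-oriented transport opts in when it initializes its svc_xprt, and the cached-get path takes xpt_lock only when the flag is set, so UDP never pays for the lock:

	/* provider opts in to auth caching at xprt init time */
	svc_xprt_init(&svc_tcp_class, &svsk->sk_xprt, serv);
	set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags);

	/* consumers test the flag before touching the cache */
	if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) {
		spin_lock(&xprt->xpt_lock);
		/* look up or install xprt->xpt_auth_cache here */
		spin_unlock(&xprt->xpt_lock);
	}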
Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 4 +++ include/linux/sunrpc/svcsock.h | 5 --- net/sunrpc/svc_xprt.c | 4 +++ net/sunrpc/svcauth_unix.c | 54 ++++++++++++++++++--------------- net/sunrpc/svcsock.c | 22 ++++++++------ 5 files changed, 49 insertions(+), 40 deletions(-) diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 936e0dc52fcc..1b5da39bb461 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -50,11 +50,15 @@ struct svc_xprt { #define XPT_OLD 9 /* used for xprt aging mark+sweep */ #define XPT_DETACHED 10 /* detached from tempsocks list */ #define XPT_LISTENER 11 /* listening endpoint */ +#define XPT_CACHE_AUTH 12 /* cache auth info */ struct svc_pool *xpt_pool; /* current pool iff queued */ struct svc_serv *xpt_server; /* service for transport */ atomic_t xpt_reserved; /* space on outq that is rsvd */ struct mutex xpt_mutex; /* to serialize sending data */ + spinlock_t xpt_lock; /* protects sk_deferred + * and xpt_auth_cache */ + void *xpt_auth_cache;/* auth cache */ }; int svc_reg_xprt_class(struct svc_xprt_class *); diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index 406d0031b989..f2ed6a25a7aa 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -20,8 +20,6 @@ struct svc_sock { struct socket * sk_sock; /* berkeley socket layer */ struct sock * sk_sk; /* INET layer */ - spinlock_t sk_lock; /* protects sk_deferred and - * sk_info_authunix */ struct list_head sk_deferred; /* deferred requests that need to * be revisted */ @@ -34,9 +32,6 @@ struct svc_sock { int sk_reclen; /* length of record */ int sk_tcplen; /* current read length */ - /* cache of various info for TCP sockets */ - void *sk_info_authunix; - struct sockaddr_storage sk_local; /* local address */ struct sockaddr_storage sk_remote; /* remote peer's address */ int sk_remotelen; /* length of address */ diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 3e6a1c81d4ce..d2ac130b9040 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -75,6 +75,9 @@ static void svc_xprt_free(struct kref *kref) struct svc_xprt *xprt = container_of(kref, struct svc_xprt, xpt_ref); struct module *owner = xprt->xpt_class->xcl_owner; + if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags) + && xprt->xpt_auth_cache != NULL) + svcauth_unix_info_release(xprt->xpt_auth_cache); xprt->xpt_ops->xpo_free(xprt); module_put(owner); } @@ -100,6 +103,7 @@ void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt, INIT_LIST_HEAD(&xprt->xpt_list); INIT_LIST_HEAD(&xprt->xpt_ready); mutex_init(&xprt->xpt_mutex); + spin_lock_init(&xprt->xpt_lock); } EXPORT_SYMBOL_GPL(svc_xprt_init); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 411479411b21..6815157bd65c 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -384,41 +384,45 @@ void svcauth_unix_purge(void) static inline struct ip_map * ip_map_cached_get(struct svc_rqst *rqstp) { - struct ip_map *ipm; - struct svc_sock *svsk = rqstp->rq_sock; - spin_lock(&svsk->sk_lock); - ipm = svsk->sk_info_authunix; - if (ipm != NULL) { - if (!cache_valid(&ipm->h)) { - /* - * The entry has been invalidated since it was - * remembered, e.g. by a second mount from the - * same IP address. 
- */ - svsk->sk_info_authunix = NULL; - spin_unlock(&svsk->sk_lock); - cache_put(&ipm->h, &ip_map_cache); - return NULL; + struct ip_map *ipm = NULL; + struct svc_xprt *xprt = rqstp->rq_xprt; + + if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) { + spin_lock(&xprt->xpt_lock); + ipm = xprt->xpt_auth_cache; + if (ipm != NULL) { + if (!cache_valid(&ipm->h)) { + /* + * The entry has been invalidated since it was + * remembered, e.g. by a second mount from the + * same IP address. + */ + xprt->xpt_auth_cache = NULL; + spin_unlock(&xprt->xpt_lock); + cache_put(&ipm->h, &ip_map_cache); + return NULL; + } + cache_get(&ipm->h); } - cache_get(&ipm->h); + spin_unlock(&xprt->xpt_lock); } - spin_unlock(&svsk->sk_lock); return ipm; } static inline void ip_map_cached_put(struct svc_rqst *rqstp, struct ip_map *ipm) { - struct svc_sock *svsk = rqstp->rq_sock; + struct svc_xprt *xprt = rqstp->rq_xprt; - spin_lock(&svsk->sk_lock); - if (svsk->sk_sock->type == SOCK_STREAM && - svsk->sk_info_authunix == NULL) { - /* newly cached, keep the reference */ - svsk->sk_info_authunix = ipm; - ipm = NULL; + if (test_bit(XPT_CACHE_AUTH, &xprt->xpt_flags)) { + spin_lock(&xprt->xpt_lock); + if (xprt->xpt_auth_cache == NULL) { + /* newly cached, keep the reference */ + xprt->xpt_auth_cache = ipm; + ipm = NULL; + } + spin_unlock(&xprt->xpt_lock); } - spin_unlock(&svsk->sk_lock); if (ipm) cache_put(&ipm->h, &ip_map_cache); } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 2390286e1827..5c9422c9a980 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -113,12 +113,16 @@ static inline void svc_reclassify_socket(struct socket *sock) switch (sk->sk_family) { case AF_INET: sock_lock_init_class_and_name(sk, "slock-AF_INET-NFSD", - &svc_slock_key[0], "sk_lock-AF_INET-NFSD", &svc_key[0]); + &svc_slock_key[0], + "sk_xprt.xpt_lock-AF_INET-NFSD", + &svc_key[0]); break; case AF_INET6: sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFSD", - &svc_slock_key[1], "sk_lock-AF_INET6-NFSD", &svc_key[1]); + &svc_slock_key[1], + "sk_xprt.xpt_lock-AF_INET6-NFSD", + &svc_key[1]); break; default: @@ -930,6 +934,7 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) mm_segment_t oldfs; svc_xprt_init(&svc_udp_class, &svsk->sk_xprt, serv); + clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); svsk->sk_sk->sk_data_ready = svc_udp_data_ready; svsk->sk_sk->sk_write_space = svc_write_space; @@ -1385,7 +1390,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) struct tcp_sock *tp = tcp_sk(sk); svc_xprt_init(&svc_tcp_class, &svsk->sk_xprt, serv); - + set_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); if (sk->sk_state == TCP_LISTEN) { dprintk("setting up TCP socket for listening\n"); set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); @@ -1753,7 +1758,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, svsk->sk_ostate = inet->sk_state_change; svsk->sk_odata = inet->sk_data_ready; svsk->sk_owspace = inet->sk_write_space; - spin_lock_init(&svsk->sk_lock); INIT_LIST_HEAD(&svsk->sk_deferred); /* Initialize the socket */ @@ -1898,8 +1902,6 @@ static void svc_sock_free(struct svc_xprt *xprt) struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); dprintk("svc: svc_sock_free(%p)\n", svsk); - if (svsk->sk_info_authunix != NULL) - svcauth_unix_info_release(svsk->sk_info_authunix); if (svsk->sk_sock->file) sockfd_put(svsk->sk_sock); else @@ -1984,9 +1986,9 @@ static void svc_revisit(struct cache_deferred_req *dreq, int too_many) dprintk("revisit queued\n"); svsk = 
dr->svsk; dr->svsk = NULL; - spin_lock(&svsk->sk_lock); + spin_lock(&svsk->sk_xprt.xpt_lock); list_add(&dr->handle.recent, &svsk->sk_deferred); - spin_unlock(&svsk->sk_lock); + spin_unlock(&svsk->sk_xprt.xpt_lock); set_bit(XPT_DEFERRED, &svsk->sk_xprt.xpt_flags); svc_xprt_enqueue(&svsk->sk_xprt); svc_xprt_put(&svsk->sk_xprt); @@ -2052,7 +2054,7 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk) if (!test_bit(XPT_DEFERRED, &svsk->sk_xprt.xpt_flags)) return NULL; - spin_lock(&svsk->sk_lock); + spin_lock(&svsk->sk_xprt.xpt_lock); clear_bit(XPT_DEFERRED, &svsk->sk_xprt.xpt_flags); if (!list_empty(&svsk->sk_deferred)) { dr = list_entry(svsk->sk_deferred.next, @@ -2061,6 +2063,6 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk) list_del_init(&dr->handle.recent); set_bit(XPT_DEFERRED, &svsk->sk_xprt.xpt_flags); } - spin_unlock(&svsk->sk_lock); + spin_unlock(&svsk->sk_xprt.xpt_lock); return dr; } From 8c7b0172a1db8120d25ecb4eff69664c52ee7639 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:08:10 -0600 Subject: [PATCH 073/100] svc: Make deferral processing xprt independent This patch moves the transport independent sk_deferred list to the svc_xprt structure and updates the svc_deferred_req structure to keep pointers to svc_xprt's directly. The deferral processing code is also moved out of the transport dependent recvfrom functions and into the generic svc_recv path. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc.h | 2 +- include/linux/sunrpc/svc_xprt.h | 2 ++ include/linux/sunrpc/svcsock.h | 3 -- net/sunrpc/svc_xprt.c | 1 + net/sunrpc/svcsock.c | 57 +++++++++++++++------------------ 5 files changed, 29 insertions(+), 36 deletions(-) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index cfb2652f6f8f..40adc9d75a6d 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -320,7 +320,7 @@ static inline void svc_free_res_pages(struct svc_rqst *rqstp) struct svc_deferred_req { u32 prot; /* protocol (UDP or TCP) */ - struct svc_sock *svsk; + struct svc_xprt *xprt; struct sockaddr_storage addr; /* where reply must go */ size_t addrlen; union svc_addr_u daddr; /* where reply must come from */ diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 1b5da39bb461..6a8445b9dfd9 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -59,6 +59,8 @@ struct svc_xprt { spinlock_t xpt_lock; /* protects sk_deferred * and xpt_auth_cache */ void *xpt_auth_cache;/* auth cache */ + struct list_head xpt_deferred; /* deferred requests that need + * to be revisted */ }; int svc_reg_xprt_class(struct svc_xprt_class *); diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index f2ed6a25a7aa..96a229e6b9c9 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -20,9 +20,6 @@ struct svc_sock { struct socket * sk_sock; /* berkeley socket layer */ struct sock * sk_sk; /* INET layer */ - struct list_head sk_deferred; /* deferred requests that need to - * be revisted */ - /* We keep the old state_change and data_ready CB's here */ void (*sk_ostate)(struct sock *); void (*sk_odata)(struct sock *, int bytes); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index d2ac130b9040..023aeb0ecfa9 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -102,6 +102,7 @@ void 
svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt, xprt->xpt_server = serv; INIT_LIST_HEAD(&xprt->xpt_list); INIT_LIST_HEAD(&xprt->xpt_ready); + INIT_LIST_HEAD(&xprt->xpt_deferred); mutex_init(&xprt->xpt_mutex); spin_lock_init(&xprt->xpt_lock); } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 5c9422c9a980..9d0a9e6c0e10 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -89,7 +89,7 @@ static void svc_close_xprt(struct svc_xprt *xprt); static void svc_sock_detach(struct svc_xprt *); static void svc_sock_free(struct svc_xprt *); -static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk); +static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt); static int svc_deferred_recv(struct svc_rqst *rqstp); static struct cache_deferred_req *svc_defer(struct cache_req *req); static struct svc_xprt *svc_create_socket(struct svc_serv *, int, @@ -771,11 +771,6 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) (serv->sv_nrthreads+3) * serv->sv_max_mesg, (serv->sv_nrthreads+3) * serv->sv_max_mesg); - if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { - svc_xprt_received(&svsk->sk_xprt); - return svc_deferred_recv(rqstp); - } - clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); skb = NULL; err = kernel_recvmsg(svsk->sk_sock, &msg, NULL, @@ -1138,11 +1133,6 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags), test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)); - if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { - svc_xprt_received(&svsk->sk_xprt); - return svc_deferred_recv(rqstp); - } - if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags)) /* sndbuf needs to have room for one request * per thread, otherwise we can stall even when the @@ -1601,7 +1591,12 @@ svc_recv(struct svc_rqst *rqstp, long timeout) dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n", rqstp, pool->sp_id, svsk, atomic_read(&svsk->sk_xprt.xpt_ref.refcount)); - len = svsk->sk_xprt.xpt_ops->xpo_recvfrom(rqstp); + rqstp->rq_deferred = svc_deferred_dequeue(&svsk->sk_xprt); + if (rqstp->rq_deferred) { + svc_xprt_received(&svsk->sk_xprt); + len = svc_deferred_recv(rqstp); + } else + len = svsk->sk_xprt.xpt_ops->xpo_recvfrom(rqstp); dprintk("svc: got len=%d\n", len); } @@ -1758,7 +1753,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, svsk->sk_ostate = inet->sk_state_change; svsk->sk_odata = inet->sk_data_ready; svsk->sk_owspace = inet->sk_write_space; - INIT_LIST_HEAD(&svsk->sk_deferred); /* Initialize the socket */ if (sock->type == SOCK_DGRAM) @@ -1976,22 +1970,21 @@ void svc_close_all(struct list_head *xprt_list) static void svc_revisit(struct cache_deferred_req *dreq, int too_many) { struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle); - struct svc_sock *svsk; + struct svc_xprt *xprt = dr->xprt; if (too_many) { - svc_xprt_put(&dr->svsk->sk_xprt); + svc_xprt_put(xprt); kfree(dr); return; } dprintk("revisit queued\n"); - svsk = dr->svsk; - dr->svsk = NULL; - spin_lock(&svsk->sk_xprt.xpt_lock); - list_add(&dr->handle.recent, &svsk->sk_deferred); - spin_unlock(&svsk->sk_xprt.xpt_lock); - set_bit(XPT_DEFERRED, &svsk->sk_xprt.xpt_flags); - svc_xprt_enqueue(&svsk->sk_xprt); - svc_xprt_put(&svsk->sk_xprt); + dr->xprt = NULL; + spin_lock(&xprt->xpt_lock); + list_add(&dr->handle.recent, &xprt->xpt_deferred); + spin_unlock(&xprt->xpt_lock); + set_bit(XPT_DEFERRED, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); } static struct cache_deferred_req * @@ -2022,7 
+2015,7 @@ svc_defer(struct cache_req *req) memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2); } svc_xprt_get(rqstp->rq_xprt); - dr->svsk = rqstp->rq_sock; + dr->xprt = rqstp->rq_xprt; dr->handle.revisit = svc_revisit; return &dr->handle; @@ -2048,21 +2041,21 @@ static int svc_deferred_recv(struct svc_rqst *rqstp) } -static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk) +static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) { struct svc_deferred_req *dr = NULL; - if (!test_bit(XPT_DEFERRED, &svsk->sk_xprt.xpt_flags)) + if (!test_bit(XPT_DEFERRED, &xprt->xpt_flags)) return NULL; - spin_lock(&svsk->sk_xprt.xpt_lock); - clear_bit(XPT_DEFERRED, &svsk->sk_xprt.xpt_flags); - if (!list_empty(&svsk->sk_deferred)) { - dr = list_entry(svsk->sk_deferred.next, + spin_lock(&xprt->xpt_lock); + clear_bit(XPT_DEFERRED, &xprt->xpt_flags); + if (!list_empty(&xprt->xpt_deferred)) { + dr = list_entry(xprt->xpt_deferred.next, struct svc_deferred_req, handle.recent); list_del_init(&dr->handle.recent); - set_bit(XPT_DEFERRED, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_DEFERRED, &xprt->xpt_flags); } - spin_unlock(&svsk->sk_xprt.xpt_lock); + spin_unlock(&xprt->xpt_lock); return dr; } From 9dbc240f199c16c3c0859c255ad52a663d8ee51d Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:08:12 -0600 Subject: [PATCH 074/100] svc: Move the sockaddr information to svc_xprt This patch moves the transport sockaddr to the svc_xprt structure. Convenience functions are added to set and get the local and remote addresses of a transport from the transport provider as well as determine the length of a sockaddr. A transport is responsible for setting the xpt_local and xpt_remote addresses in the svc_xprt structure as part of transport creation and xpo_accept processing. This cannot be done in a generic way and in fact varies between TCP, UDP and RDMA. A set of xpo_ functions (e.g. getlocalname, getremotename) could have been added but this would have resulted in additional caching and copying of the addresses around. Note that the xpt_local address should also be set on listening endpoints; for TCP/RDMA this is done as part of endpoint creation. For connected transports like TCP and RDMA, the addresses never change and can be set once and copied into the rqstp structure for each request. For UDP, however, the local and remote addresses may change for each request. In this case, the address information is obtained from the UDP recvmsg info and copied into the rqstp structure from there. A svc_xprt_local_port function was also added that returns the local port given a transport. This is used by svc_create_xprt when returning the port associated with a newly created transport, and later when creating a generic find transport service to check if a service is already listening on a given port. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 51 ++++++++++++++++++++++++++++++ include/linux/sunrpc/svcsock.h | 4 --- net/sunrpc/svc_xprt.c | 31 ++++++++++++++++-- net/sunrpc/svcsock.c | 56 ++++++++++++++++++--------------- 4 files changed, 110 insertions(+), 32 deletions(-) diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 6a8445b9dfd9..09de12b63c1d 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -61,6 +61,10 @@ struct svc_xprt { void *xpt_auth_cache;/* auth cache */ struct list_head xpt_deferred; /* deferred requests that need * to be revisted */ + struct sockaddr_storage xpt_local; /* local address */ + size_t xpt_locallen; /* length of address */ + struct sockaddr_storage xpt_remote; /* remote peer's address */ + size_t xpt_remotelen; /* length of address */ }; int svc_reg_xprt_class(struct svc_xprt_class *); @@ -70,9 +74,56 @@ void svc_xprt_init(struct svc_xprt_class *, struct svc_xprt *, int svc_create_xprt(struct svc_serv *, char *, unsigned short, int); void svc_xprt_received(struct svc_xprt *); void svc_xprt_put(struct svc_xprt *xprt); +void svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt); static inline void svc_xprt_get(struct svc_xprt *xprt) { kref_get(&xprt->xpt_ref); } +static inline void svc_xprt_set_local(struct svc_xprt *xprt, + struct sockaddr *sa, int salen) +{ + memcpy(&xprt->xpt_local, sa, salen); + xprt->xpt_locallen = salen; +} +static inline void svc_xprt_set_remote(struct svc_xprt *xprt, + struct sockaddr *sa, int salen) +{ + memcpy(&xprt->xpt_remote, sa, salen); + xprt->xpt_remotelen = salen; +} +static inline unsigned short svc_addr_port(struct sockaddr *sa) +{ + unsigned short ret = 0; + switch (sa->sa_family) { + case AF_INET: + ret = ntohs(((struct sockaddr_in *)sa)->sin_port); + break; + case AF_INET6: + ret = ntohs(((struct sockaddr_in6 *)sa)->sin6_port); + break; + } + return ret; +} + +static inline size_t svc_addr_len(struct sockaddr *sa) +{ + switch (sa->sa_family) { + case AF_INET: + return sizeof(struct sockaddr_in); + case AF_INET6: + return sizeof(struct sockaddr_in6); + } + return -EAFNOSUPPORT; +} + +static inline unsigned short svc_xprt_local_port(struct svc_xprt *xprt) +{ + return svc_addr_port((struct sockaddr *)&xprt->xpt_local); +} + +static inline unsigned short svc_xprt_remote_port(struct svc_xprt *xprt) +{ + return svc_addr_port((struct sockaddr *)&xprt->xpt_remote); +} #endif /* SUNRPC_SVC_XPRT_H */ diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index 96a229e6b9c9..206f092ad4c7 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -28,10 +28,6 @@ struct svc_sock { /* private TCP part */ int sk_reclen; /* length of record */ int sk_tcplen; /* current read length */ - - struct sockaddr_storage sk_local; /* local address */ - struct sockaddr_storage sk_remote; /* remote peer's address */ - int sk_remotelen; /* length of address */ }; /* diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 023aeb0ecfa9..eb650af50c49 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -125,7 +125,6 @@ int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, spin_unlock(&svc_xprt_class_lock); if (try_module_get(xcl->xcl_owner)) { struct svc_xprt *newxprt; - ret = 0; newxprt = xcl->xcl_ops->xpo_create (serv, (struct sockaddr *)&sin, sizeof(sin), @@ -133,7 +132,8 @@ int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, if 
(IS_ERR(newxprt)) { module_put(xcl->xcl_owner); ret = PTR_ERR(newxprt); - } + } else + ret = svc_xprt_local_port(newxprt); } goto out; } @@ -144,3 +144,30 @@ int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, return ret; } EXPORT_SYMBOL_GPL(svc_create_xprt); + +/* + * Copy the local and remote xprt addresses to the rqstp structure + */ +void svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt) +{ + struct sockaddr *sin; + + memcpy(&rqstp->rq_addr, &xprt->xpt_remote, xprt->xpt_remotelen); + rqstp->rq_addrlen = xprt->xpt_remotelen; + + /* + * Destination address in request is needed for binding the + * source address in RPC replies/callbacks later. + */ + sin = (struct sockaddr *)&xprt->xpt_local; + switch (sin->sa_family) { + case AF_INET: + rqstp->rq_daddr.addr = ((struct sockaddr_in *)sin)->sin_addr; + break; + case AF_INET6: + rqstp->rq_daddr.addr6 = ((struct sockaddr_in6 *)sin)->sin6_addr; + break; + } +} +EXPORT_SYMBOL_GPL(svc_xprt_copy_addrs); + diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 9d0a9e6c0e10..9564d2e9520e 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -623,33 +623,13 @@ svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, int buflen) struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; - struct sockaddr *sin; int len; len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen, msg.msg_flags); - /* sock_recvmsg doesn't fill in the name/namelen, so we must.. - */ - memcpy(&rqstp->rq_addr, &svsk->sk_remote, svsk->sk_remotelen); - rqstp->rq_addrlen = svsk->sk_remotelen; - - /* Destination address in request is needed for binding the - * source address in RPC callbacks later. - */ - sin = (struct sockaddr *)&svsk->sk_local; - switch (sin->sa_family) { - case AF_INET: - rqstp->rq_daddr.addr = ((struct sockaddr_in *)sin)->sin_addr; - break; - case AF_INET6: - rqstp->rq_daddr.addr6 = ((struct sockaddr_in6 *)sin)->sin6_addr; - break; - } - dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n", svsk, iov[0].iov_base, iov[0].iov_len, len); - return len; } @@ -719,8 +699,15 @@ svc_write_space(struct sock *sk) } } -static inline void svc_udp_get_dest_address(struct svc_rqst *rqstp, - struct cmsghdr *cmh) +/* + * Copy the UDP datagram's destination address to the rqstp structure. + * The 'destination' address in this case is the address to which the + * peer sent the datagram, i.e. our local address. For multihomed + * hosts, this can change from msg to msg. Note that only the IP + * address changes, the port number should remain the same. 
+ */ +static void svc_udp_get_dest_address(struct svc_rqst *rqstp, + struct cmsghdr *cmh) { switch (rqstp->rq_sock->sk_sk->sk_family) { case AF_INET: { @@ -787,7 +774,10 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) svc_xprt_received(&svsk->sk_xprt); return -EAGAIN; } - rqstp->rq_addrlen = sizeof(rqstp->rq_addr); + len = svc_addr_len(svc_addr(rqstp)); + if (len < 0) + return len; + rqstp->rq_addrlen = len; if (skb->tstamp.tv64 == 0) { skb->tstamp = ktime_get_real(); /* Don't enable netstamp, sunrpc doesn't @@ -1097,14 +1087,13 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) if (!(newsvsk = svc_setup_socket(serv, newsock, &err, (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY)))) goto failed; - memcpy(&newsvsk->sk_remote, sin, slen); - newsvsk->sk_remotelen = slen; + svc_xprt_set_remote(&newsvsk->sk_xprt, sin, slen); err = kernel_getsockname(newsock, sin, &slen); if (unlikely(err < 0)) { dprintk("svc_tcp_accept: kernel_getsockname error %d\n", -err); slen = offsetof(struct sockaddr, sa_data); } - memcpy(&newsvsk->sk_local, sin, slen); + svc_xprt_set_local(&newsvsk->sk_xprt, sin, slen); if (serv->sv_stats) serv->sv_stats->nettcpconn++; @@ -1245,6 +1234,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) svsk->sk_reclen = 0; svsk->sk_tcplen = 0; + svc_xprt_copy_addrs(rqstp, &svsk->sk_xprt); svc_xprt_received(&svsk->sk_xprt); if (serv->sv_stats) serv->sv_stats->nettcpcnt++; @@ -1805,6 +1795,11 @@ int svc_addsock(struct svc_serv *serv, else { svsk = svc_setup_socket(serv, so, &err, SVC_SOCK_DEFAULTS); if (svsk) { + struct sockaddr_storage addr; + struct sockaddr *sin = (struct sockaddr *)&addr; + int salen; + if (kernel_getsockname(svsk->sk_sock, sin, &salen) == 0) + svc_xprt_set_local(&svsk->sk_xprt, sin, salen); svc_xprt_received(&svsk->sk_xprt); err = 0; } @@ -1831,6 +1826,9 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, int error; int type; char buf[RPC_MAX_ADDRBUFLEN]; + struct sockaddr_storage addr; + struct sockaddr *newsin = (struct sockaddr *)&addr; + int newlen; dprintk("svc: svc_create_socket(%s, %d, %s)\n", serv->sv_program->pg_name, protocol, @@ -1855,12 +1853,18 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, if (error < 0) goto bummer; + newlen = len; + error = kernel_getsockname(sock, newsin, &newlen); + if (error < 0) + goto bummer; + if (protocol == IPPROTO_TCP) { if ((error = kernel_listen(sock, 64)) < 0) goto bummer; } if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) { + svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen); svc_xprt_received(&svsk->sk_xprt); return (struct svc_xprt *)svsk; } From eab996d4aca7a9d8621d2b98c00ce420df85eaed Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:08:14 -0600 Subject: [PATCH 075/100] svc: Make svc_sock_release svc_xprt_release The svc_sock_release function only touches transport independent fields. Change the function to manipulate svc_xprt directly instead of the transport dependent svc_sock structure. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. 
Bruce Fields --- net/sunrpc/svcsock.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 9564d2e9520e..355ab8da54fe 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -368,10 +368,9 @@ void svc_reserve(struct svc_rqst *rqstp, int space) } } -static void -svc_sock_release(struct svc_rqst *rqstp) +static void svc_xprt_release(struct svc_rqst *rqstp) { - struct svc_sock *svsk = rqstp->rq_sock; + struct svc_xprt *xprt = rqstp->rq_xprt; rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp); @@ -379,7 +378,6 @@ svc_sock_release(struct svc_rqst *rqstp) rqstp->rq_res.page_len = 0; rqstp->rq_res.page_base = 0; - /* Reset response buffer and release * the reservation. * But first, check that enough space was reserved */ @@ -392,9 +390,9 @@ svc_sock_release(struct svc_rqst *rqstp) rqstp->rq_res.head[0].iov_len = 0; svc_reserve(rqstp, 0); - rqstp->rq_sock = NULL; + rqstp->rq_xprt = NULL; - svc_xprt_put(&svsk->sk_xprt); + svc_xprt_put(xprt); } /* @@ -1593,7 +1591,7 @@ svc_recv(struct svc_rqst *rqstp, long timeout) /* No data, incomplete (TCP) read, or accept() */ if (len == 0 || len == -EAGAIN) { rqstp->rq_res.len = 0; - svc_sock_release(rqstp); + svc_xprt_release(rqstp); return -EAGAIN; } clear_bit(XPT_OLD, &svsk->sk_xprt.xpt_flags); @@ -1613,7 +1611,7 @@ void svc_drop(struct svc_rqst *rqstp) { dprintk("svc: socket %p dropped request\n", rqstp->rq_sock); - svc_sock_release(rqstp); + svc_xprt_release(rqstp); } /* @@ -1646,7 +1644,7 @@ svc_send(struct svc_rqst *rqstp) else len = xprt->xpt_ops->xpo_sendto(rqstp); mutex_unlock(&xprt->xpt_mutex); - svc_sock_release(rqstp); + svc_xprt_release(rqstp); if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN) return 0; From c36adb2a7f9132b37d4b669b2e2c04e46d5188b2 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:08:16 -0600 Subject: [PATCH 076/100] svc: Make svc_recv transport neutral All of the transport fields and functions used by svc_recv are now transport independent. Change the svc_recv function to use the svc_xprt structure directly instead of the transport specific svc_sock structure. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 63 ++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 355ab8da54fe..fa57e9bc68e4 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -310,22 +310,21 @@ svc_recv(struct svc_rqst *rqstp, long timeout) /* * Dequeue the first socket. Must be called with the pool->sp_lock held.
*/ -static inline struct svc_sock * -svc_sock_dequeue(struct svc_pool *pool) +static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool) { - struct svc_sock *svsk; + struct svc_xprt *xprt; if (list_empty(&pool->sp_sockets)) return NULL; - svsk = list_entry(pool->sp_sockets.next, - struct svc_sock, sk_xprt.xpt_ready); - list_del_init(&svsk->sk_xprt.xpt_ready); + xprt = list_entry(pool->sp_sockets.next, + struct svc_xprt, xpt_ready); + list_del_init(&xprt->xpt_ready); - dprintk("svc: socket %p dequeued, inuse=%d\n", - svsk->sk_sk, atomic_read(&svsk->sk_xprt.xpt_ref.refcount)); + dprintk("svc: transport %p dequeued, inuse=%d\n", + xprt, atomic_read(&xprt->xpt_ref.refcount)); - return svsk; + return xprt; } /* @@ -1475,20 +1474,20 @@ static void svc_check_conn_limits(struct svc_serv *serv) int svc_recv(struct svc_rqst *rqstp, long timeout) { - struct svc_sock *svsk = NULL; + struct svc_xprt *xprt = NULL; struct svc_serv *serv = rqstp->rq_server; struct svc_pool *pool = rqstp->rq_pool; int len, i; - int pages; + int pages; struct xdr_buf *arg; DECLARE_WAITQUEUE(wait, current); dprintk("svc: server %p waiting for data (to = %ld)\n", rqstp, timeout); - if (rqstp->rq_sock) + if (rqstp->rq_xprt) printk(KERN_ERR - "svc_recv: service %p, socket not NULL!\n", + "svc_recv: service %p, transport not NULL!\n", rqstp); if (waitqueue_active(&rqstp->rq_wait)) printk(KERN_ERR @@ -1525,11 +1524,12 @@ svc_recv(struct svc_rqst *rqstp, long timeout) return -EINTR; spin_lock_bh(&pool->sp_lock); - if ((svsk = svc_sock_dequeue(pool)) != NULL) { - rqstp->rq_sock = svsk; - svc_xprt_get(&svsk->sk_xprt); + xprt = svc_xprt_dequeue(pool); + if (xprt) { + rqstp->rq_xprt = xprt; + svc_xprt_get(xprt); rqstp->rq_reserved = serv->sv_max_mesg; - atomic_add(rqstp->rq_reserved, &svsk->sk_xprt.xpt_reserved); + atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); } else { /* No data pending. 
Go to sleep */ svc_thread_enqueue(pool, rqstp); @@ -1549,7 +1549,8 @@ svc_recv(struct svc_rqst *rqstp, long timeout) spin_lock_bh(&pool->sp_lock); remove_wait_queue(&rqstp->rq_wait, &wait); - if (!(svsk = rqstp->rq_sock)) { + xprt = rqstp->rq_xprt; + if (!xprt) { svc_thread_dequeue(pool, rqstp); spin_unlock_bh(&pool->sp_lock); dprintk("svc: server %p, no data yet\n", rqstp); @@ -1559,32 +1560,32 @@ spin_unlock_bh(&pool->sp_lock); len = 0; - if (test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)) { + if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) { dprintk("svc_recv: found XPT_CLOSE\n"); - svc_delete_xprt(&svsk->sk_xprt); - } else if (test_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags)) { + svc_delete_xprt(xprt); + } else if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) { struct svc_xprt *newxpt; - newxpt = svsk->sk_xprt.xpt_ops->xpo_accept(&svsk->sk_xprt); + newxpt = xprt->xpt_ops->xpo_accept(xprt); if (newxpt) { /* * We know this module_get will succeed because the * listener holds a reference too */ __module_get(newxpt->xpt_class->xcl_owner); - svc_check_conn_limits(svsk->sk_xprt.xpt_server); + svc_check_conn_limits(xprt->xpt_server); svc_xprt_received(newxpt); } - svc_xprt_received(&svsk->sk_xprt); + svc_xprt_received(xprt); } else { - dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n", - rqstp, pool->sp_id, svsk, - atomic_read(&svsk->sk_xprt.xpt_ref.refcount)); - rqstp->rq_deferred = svc_deferred_dequeue(&svsk->sk_xprt); + dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", + rqstp, pool->sp_id, xprt, + atomic_read(&xprt->xpt_ref.refcount)); + rqstp->rq_deferred = svc_deferred_dequeue(xprt); if (rqstp->rq_deferred) { - svc_xprt_received(&svsk->sk_xprt); + svc_xprt_received(xprt); len = svc_deferred_recv(rqstp); } else - len = svsk->sk_xprt.xpt_ops->xpo_recvfrom(rqstp); + len = xprt->xpt_ops->xpo_recvfrom(rqstp); dprintk("svc: got len=%d\n", len); } @@ -1594,7 +1595,7 @@ svc_recv(struct svc_rqst *rqstp, long timeout) svc_xprt_release(rqstp); return -EAGAIN; } - clear_bit(XPT_OLD, &svsk->sk_xprt.xpt_flags); + clear_bit(XPT_OLD, &xprt->xpt_flags); rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp)); rqstp->rq_chandle.defer = svc_defer; From 9f8bfae693c724120ffc8fb77564f6deb508daf3 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:08:18 -0600 Subject: [PATCH 077/100] svc: Make svc_age_temp_sockets svc_age_temp_transports This function is transport independent. Change it to use svc_xprt directly and change its name to reflect this. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index fa57e9bc68e4..e43412bb07ae 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1656,49 +1656,50 @@ svc_send(struct svc_rqst *rqstp) * Timer function to close old temporary sockets, using * a mark-and-sweep algorithm.
*/ -static void -svc_age_temp_sockets(unsigned long closure) +static void svc_age_temp_xprts(unsigned long closure) { struct svc_serv *serv = (struct svc_serv *)closure; - struct svc_sock *svsk; + struct svc_xprt *xprt; struct list_head *le, *next; LIST_HEAD(to_be_aged); - dprintk("svc_age_temp_sockets\n"); + dprintk("svc_age_temp_xprts\n"); if (!spin_trylock_bh(&serv->sv_lock)) { /* busy, try again 1 sec later */ - dprintk("svc_age_temp_sockets: busy\n"); + dprintk("svc_age_temp_xprts: busy\n"); mod_timer(&serv->sv_temptimer, jiffies + HZ); return; } list_for_each_safe(le, next, &serv->sv_tempsocks) { - svsk = list_entry(le, struct svc_sock, sk_xprt.xpt_list); + xprt = list_entry(le, struct svc_xprt, xpt_list); - if (!test_and_set_bit(XPT_OLD, &svsk->sk_xprt.xpt_flags)) + /* First time through, just mark it OLD. Second time + * through, close it. */ + if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags)) continue; - if (atomic_read(&svsk->sk_xprt.xpt_ref.refcount) > 1 - || test_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags)) + if (atomic_read(&xprt->xpt_ref.refcount) > 1 + || test_bit(XPT_BUSY, &xprt->xpt_flags)) continue; - svc_xprt_get(&svsk->sk_xprt); + svc_xprt_get(xprt); list_move(le, &to_be_aged); - set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); - set_bit(XPT_DETACHED, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + set_bit(XPT_DETACHED, &xprt->xpt_flags); } spin_unlock_bh(&serv->sv_lock); while (!list_empty(&to_be_aged)) { le = to_be_aged.next; - /* fiddling the sk_xprt.xpt_list node is safe 'cos we're XPT_DETACHED */ + /* fiddling the xpt_list node is safe 'cos we're XPT_DETACHED */ list_del_init(le); - svsk = list_entry(le, struct svc_sock, sk_xprt.xpt_list); + xprt = list_entry(le, struct svc_xprt, xpt_list); - dprintk("queuing svsk %p for closing\n", svsk); + dprintk("queuing xprt %p for closing\n", xprt); /* a thread will dequeue and close it soon */ - svc_xprt_enqueue(&svsk->sk_xprt); - svc_xprt_put(&svsk->sk_xprt); + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); } mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); @@ -1756,7 +1757,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, serv->sv_tmpcnt++; if (serv->sv_temptimer.function == NULL) { /* setup timer to age temp sockets */ - setup_timer(&serv->sv_temptimer, svc_age_temp_sockets, + setup_timer(&serv->sv_temptimer, svc_age_temp_xprts, (unsigned long)serv); mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); From 4e5caaa5f24b3df1fe01097e1e7576461e70d491 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:08:20 -0600 Subject: [PATCH 078/100] svc: Move create logic to common code Move the svc transport list logic into common transport creation code. Refactor this code path to make the flow of control easier to read. Move the setting and clearing of the BUSY_BIT during transport creation to common code. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. 
Bruce Fields --- net/sunrpc/svc_xprt.c | 43 +++++++++++++++++++++++++------------------ net/sunrpc/svcsock.c | 39 ++++++++++++++++++--------------------- 2 files changed, 43 insertions(+), 39 deletions(-) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index eb650af50c49..271467c5138d 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -105,6 +105,7 @@ void svc_xprt_init(struct svc_xprt_class *xcl, struct svc_xprt *xprt, INIT_LIST_HEAD(&xprt->xpt_deferred); mutex_init(&xprt->xpt_mutex); spin_lock_init(&xprt->xpt_lock); + set_bit(XPT_BUSY, &xprt->xpt_flags); } EXPORT_SYMBOL_GPL(svc_xprt_init); @@ -112,7 +113,6 @@ int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, int flags) { struct svc_xprt_class *xcl; - int ret = -ENOENT; struct sockaddr_in sin = { .sin_family = AF_INET, .sin_addr.s_addr = INADDR_ANY, @@ -121,27 +121,34 @@ int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, dprintk("svc: creating transport %s[%d]\n", xprt_name, port); spin_lock(&svc_xprt_class_lock); list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) { - if (strcmp(xprt_name, xcl->xcl_name) == 0) { - spin_unlock(&svc_xprt_class_lock); - if (try_module_get(xcl->xcl_owner)) { - struct svc_xprt *newxprt; - newxprt = xcl->xcl_ops->xpo_create - (serv, - (struct sockaddr *)&sin, sizeof(sin), - flags); - if (IS_ERR(newxprt)) { - module_put(xcl->xcl_owner); - ret = PTR_ERR(newxprt); - } else - ret = svc_xprt_local_port(newxprt); - } - goto out; + struct svc_xprt *newxprt; + + if (strcmp(xprt_name, xcl->xcl_name)) + continue; + + if (!try_module_get(xcl->xcl_owner)) + goto err; + + spin_unlock(&svc_xprt_class_lock); + newxprt = xcl->xcl_ops-> + xpo_create(serv, (struct sockaddr *)&sin, sizeof(sin), + flags); + if (IS_ERR(newxprt)) { + module_put(xcl->xcl_owner); + return PTR_ERR(newxprt); } + + clear_bit(XPT_TEMP, &newxprt->xpt_flags); + spin_lock_bh(&serv->sv_lock); + list_add(&newxprt->xpt_list, &serv->sv_permsocks); + spin_unlock_bh(&serv->sv_lock); + clear_bit(XPT_BUSY, &newxprt->xpt_flags); + return svc_xprt_local_port(newxprt); } + err: spin_unlock(&svc_xprt_class_lock); dprintk("svc: transport %s not found\n", xprt_name); - out: - return ret; + return -ENOENT; } EXPORT_SYMBOL_GPL(svc_create_xprt); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index e43412bb07ae..b1cb97bbbd34 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -94,6 +94,7 @@ static int svc_deferred_recv(struct svc_rqst *rqstp); static struct cache_deferred_req *svc_defer(struct cache_req *req); static struct svc_xprt *svc_create_socket(struct svc_serv *, int, struct sockaddr *, int, int); +static void svc_age_temp_xprts(unsigned long closure); /* apparently the "standard" is that clients close * idle connections after 5 minutes, servers after @@ -1573,6 +1574,19 @@ svc_recv(struct svc_rqst *rqstp, long timeout) */ __module_get(newxpt->xpt_class->xcl_owner); svc_check_conn_limits(xprt->xpt_server); + spin_lock_bh(&serv->sv_lock); + set_bit(XPT_TEMP, &newxpt->xpt_flags); + list_add(&newxpt->xpt_list, &serv->sv_tempsocks); + serv->sv_tmpcnt++; + if (serv->sv_temptimer.function == NULL) { + /* setup timer to age temp sockets */ + setup_timer(&serv->sv_temptimer, + svc_age_temp_xprts, + (unsigned long)serv); + mod_timer(&serv->sv_temptimer, + jiffies + svc_conn_age_period * HZ); + } + spin_unlock_bh(&serv->sv_lock); svc_xprt_received(newxpt); } svc_xprt_received(xprt); @@ -1716,7 +1730,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv 
*serv, struct svc_sock *svsk; struct sock *inet; int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); - int is_temporary = flags & SVC_SOCK_TEMPORARY; dprintk("svc: svc_setup_socket %p\n", sock); if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) { @@ -1736,7 +1749,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, return NULL; } - set_bit(XPT_BUSY, &svsk->sk_xprt.xpt_flags); inet->sk_user_data = svsk; svsk->sk_sock = sock; svsk->sk_sk = inet; @@ -1750,24 +1762,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, else svc_tcp_init(svsk, serv); - spin_lock_bh(&serv->sv_lock); - if (is_temporary) { - set_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags); - list_add(&svsk->sk_xprt.xpt_list, &serv->sv_tempsocks); - serv->sv_tmpcnt++; - if (serv->sv_temptimer.function == NULL) { - /* setup timer to age temp sockets */ - setup_timer(&serv->sv_temptimer, svc_age_temp_xprts, - (unsigned long)serv); - mod_timer(&serv->sv_temptimer, - jiffies + svc_conn_age_period * HZ); - } - } else { - clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags); - list_add(&svsk->sk_xprt.xpt_list, &serv->sv_permsocks); - } - spin_unlock_bh(&serv->sv_lock); - dprintk("svc: svc_setup_socket created %p (inet %p)\n", svsk, svsk->sk_sk); @@ -1800,6 +1794,10 @@ int svc_addsock(struct svc_serv *serv, int salen; if (kernel_getsockname(svsk->sk_sock, sin, &salen) == 0) svc_xprt_set_local(&svsk->sk_xprt, sin, salen); + clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags); + spin_lock_bh(&serv->sv_lock); + list_add(&svsk->sk_xprt.xpt_list, &serv->sv_permsocks); + spin_unlock_bh(&serv->sv_lock); svc_xprt_received(&svsk->sk_xprt); err = 0; } @@ -1865,7 +1863,6 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) { svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen); - svc_xprt_received(&svsk->sk_xprt); return (struct svc_xprt *)svsk; } From 57b1d3babaafea1c395c932914e38c2ff9493001 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:08:22 -0600 Subject: [PATCH 079/100] svc: Removing remaining references to rq_sock in rqstp This functionally empty patch removes rq_sock and the unnamed union from the rqstp structure. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J.
Bruce Fields --- include/linux/sunrpc/svc.h | 5 +---- net/sunrpc/svcsock.c | 38 ++++++++++++++++++++++++-------------- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 40adc9d75a6d..04eb20e2c344 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -204,10 +204,7 @@ union svc_addr_u { struct svc_rqst { struct list_head rq_list; /* idle list */ struct list_head rq_all; /* all threads list */ - union { - struct svc_xprt * rq_xprt; /* transport ptr */ - struct svc_sock * rq_sock; /* socket ptr */ - }; + struct svc_xprt * rq_xprt; /* transport ptr */ struct sockaddr_storage rq_addr; /* peer address */ size_t rq_addrlen; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index b1cb97bbbd34..157de9979cfe 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -201,10 +201,12 @@ static void svc_release_skb(struct svc_rqst *rqstp) struct svc_deferred_req *dr = rqstp->rq_deferred; if (skb) { + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); rqstp->rq_xprt_ctxt = NULL; dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); - skb_free_datagram(rqstp->rq_sock->sk_sk, skb); + skb_free_datagram(svsk->sk_sk, skb); } if (dr) { rqstp->rq_deferred = NULL; @@ -418,7 +420,7 @@ svc_wake_up(struct svc_serv *serv) dprintk("svc: daemon %p woken up.\n", rqstp); /* svc_thread_dequeue(pool, rqstp); - rqstp->rq_sock = NULL; + rqstp->rq_xprt = NULL; */ wake_up(&rqstp->rq_wait); } @@ -435,7 +437,9 @@ union svc_pktinfo_u { static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh) { - switch (rqstp->rq_sock->sk_sk->sk_family) { + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + switch (svsk->sk_sk->sk_family) { case AF_INET: { struct in_pktinfo *pki = CMSG_DATA(cmh); @@ -468,7 +472,8 @@ static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh) static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) { - struct svc_sock *svsk = rqstp->rq_sock; + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); struct socket *sock = svsk->sk_sock; int slen; union { @@ -541,7 +546,7 @@ svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) } out: dprintk("svc: socket %p sendto([%p %Zu... 
], %d) = %d (addr %s)\n", - rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len, + svsk, xdr->head[0].iov_base, xdr->head[0].iov_len, xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf))); return len; @@ -617,7 +622,8 @@ svc_recv_available(struct svc_sock *svsk) static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, int buflen) { - struct svc_sock *svsk = rqstp->rq_sock; + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; @@ -707,7 +713,9 @@ svc_write_space(struct sock *sk) static void svc_udp_get_dest_address(struct svc_rqst *rqstp, struct cmsghdr *cmh) { - switch (rqstp->rq_sock->sk_sk->sk_family) { + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); + switch (svsk->sk_sk->sk_family) { case AF_INET: { struct in_pktinfo *pki = CMSG_DATA(cmh); rqstp->rq_daddr.addr.s_addr = pki->ipi_spec_dst.s_addr; @@ -727,7 +735,8 @@ static void svc_udp_get_dest_address(struct svc_rqst *rqstp, static int svc_udp_recvfrom(struct svc_rqst *rqstp) { - struct svc_sock *svsk = rqstp->rq_sock; + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); struct svc_serv *serv = svsk->sk_xprt.xpt_server; struct sk_buff *skb; union { @@ -1109,7 +1118,8 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) static int svc_tcp_recvfrom(struct svc_rqst *rqstp) { - struct svc_sock *svsk = rqstp->rq_sock; + struct svc_sock *svsk = + container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); struct svc_serv *serv = svsk->sk_xprt.xpt_server; int len; struct kvec *vec; @@ -1273,16 +1283,16 @@ svc_tcp_sendto(struct svc_rqst *rqstp) reclen = htonl(0x80000000|((xbufp->len ) - 4)); memcpy(xbufp->head[0].iov_base, &reclen, 4); - if (test_bit(XPT_DEAD, &rqstp->rq_sock->sk_xprt.xpt_flags)) + if (test_bit(XPT_DEAD, &rqstp->rq_xprt->xpt_flags)) return -ENOTCONN; sent = svc_sendto(rqstp, &rqstp->rq_res); if (sent != xbufp->len) { printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", - rqstp->rq_sock->sk_xprt.xpt_server->sv_name, + rqstp->rq_xprt->xpt_server->sv_name, (sent<0)?"got error":"sent only", sent, xbufp->len); - set_bit(XPT_CLOSE, &rqstp->rq_sock->sk_xprt.xpt_flags); + set_bit(XPT_CLOSE, &rqstp->rq_xprt->xpt_flags); svc_xprt_enqueue(rqstp->rq_xprt); sent = -EAGAIN; } @@ -1302,7 +1312,7 @@ static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp) static int svc_tcp_has_wspace(struct svc_xprt *xprt) { - struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); struct svc_serv *serv = svsk->sk_xprt.xpt_server; int required; int wspace; @@ -1625,7 +1635,7 @@ svc_recv(struct svc_rqst *rqstp, long timeout) void svc_drop(struct svc_rqst *rqstp) { - dprintk("svc: socket %p dropped request\n", rqstp->rq_sock); + dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt); svc_xprt_release(rqstp); } From 18d19f949d5a9c927b2b88402630c5137971b619 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:08:25 -0600 Subject: [PATCH 080/100] svc: Make svc_check_conn_limits xprt independent The svc_check_conn_limits function only manipulates xprt fields. Change references to svc_sock->sk_xprt to svc_xprt directly. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. 
Bruce Fields --- net/sunrpc/svcsock.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 157de9979cfe..0814a78ad7ad 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1448,31 +1448,31 @@ svc_sock_update_bufs(struct svc_serv *serv) static void svc_check_conn_limits(struct svc_serv *serv) { if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) { - struct svc_sock *svsk = NULL; + struct svc_xprt *xprt = NULL; spin_lock_bh(&serv->sv_lock); if (!list_empty(&serv->sv_tempsocks)) { if (net_ratelimit()) { /* Try to help the admin */ - printk(KERN_NOTICE "%s: too many open TCP " - "sockets, consider increasing the " + printk(KERN_NOTICE "%s: too many open " + "connections, consider increasing the " "number of nfsd threads\n", serv->sv_name); } /* - * Always select the oldest socket. It's not fair, + * Always select the oldest connection. It's not fair, * but so is life */ - svsk = list_entry(serv->sv_tempsocks.prev, - struct svc_sock, - sk_xprt.xpt_list); - set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); - svc_xprt_get(&svsk->sk_xprt); + xprt = list_entry(serv->sv_tempsocks.prev, + struct svc_xprt, + xpt_list); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_get(xprt); } spin_unlock_bh(&serv->sv_lock); - if (svsk) { - svc_xprt_enqueue(&svsk->sk_xprt); - svc_xprt_put(&svsk->sk_xprt); + if (xprt) { + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); } } } From 0f0257eaa5d29b80f6ab2c40ed21aa65bb4527f6 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:08:27 -0600 Subject: [PATCH 081/100] svc: Move the xprt independent code to the svc_xprt.c file This functionally trivial patch moves all of the transport independent functions from the svcsock.c file to the transport independent svc_xprt.c file. In addition the following formatting changes were made: - White space cleanup - Function signatures on single line - The inline directive was removed - Lines over 80 columns were reformatted - The term 'socket' was changed to 'transport' in comments - The SMP comment was moved and updated. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 27 ++ net/sunrpc/svc_xprt.c | 753 ++++++++++++++++++++++++++++ net/sunrpc/svcsock.c | 834 +------------------------------- 3 files changed, 804 insertions(+), 810 deletions(-) diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 09de12b63c1d..405281e745d1 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -72,9 +72,14 @@ void svc_unreg_xprt_class(struct svc_xprt_class *); void svc_xprt_init(struct svc_xprt_class *, struct svc_xprt *, struct svc_serv *); int svc_create_xprt(struct svc_serv *, char *, unsigned short, int); +void svc_xprt_enqueue(struct svc_xprt *xprt); void svc_xprt_received(struct svc_xprt *); void svc_xprt_put(struct svc_xprt *xprt); void svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt); +void svc_close_xprt(struct svc_xprt *xprt); +void svc_delete_xprt(struct svc_xprt *xprt); +int svc_port_is_privileged(struct sockaddr *sin); + static inline void svc_xprt_get(struct svc_xprt *xprt) { kref_get(&xprt->xpt_ref); @@ -126,4 +131,26 @@ static inline unsigned short svc_xprt_remote_port(struct svc_xprt *xprt) return svc_addr_port((struct sockaddr *)&xprt->xpt_remote); } +static inline char *__svc_print_addr(struct sockaddr *addr, + char *buf, size_t len) +{ + switch (addr->sa_family) { + case AF_INET: + snprintf(buf, len, "%u.%u.%u.%u, port=%u", + NIPQUAD(((struct sockaddr_in *) addr)->sin_addr), + ntohs(((struct sockaddr_in *) addr)->sin_port)); + break; + + case AF_INET6: + snprintf(buf, len, "%x:%x:%x:%x:%x:%x:%x:%x, port=%u", + NIP6(((struct sockaddr_in6 *) addr)->sin6_addr), + ntohs(((struct sockaddr_in6 *) addr)->sin6_port)); + break; + + default: + snprintf(buf, len, "unknown address type: %d", addr->sa_family); + break; + } + return buf; +} #endif /* SUNRPC_SVC_XPRT_H */ diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 271467c5138d..23165aef59d9 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -35,10 +35,53 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT +static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt); +static int svc_deferred_recv(struct svc_rqst *rqstp); +static struct cache_deferred_req *svc_defer(struct cache_req *req); +static void svc_age_temp_xprts(unsigned long closure); + +/* apparently the "standard" is that clients close + * idle connections after 5 minutes, servers after + * 6 minutes + * http://www.connectathon.org/talks96/nfstcp.pdf + */ +static int svc_conn_age_period = 6*60; + /* List of registered transport classes */ static DEFINE_SPINLOCK(svc_xprt_class_lock); static LIST_HEAD(svc_xprt_class_list); +/* SMP locking strategy: + * + * svc_pool->sp_lock protects most of the fields of that pool. + * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt. + * when both need to be taken (rare), svc_serv->sv_lock is first. + * BKL protects svc_serv->sv_nrthread. + * svc_sock->sk_lock protects the svc_sock->sk_deferred list + * and the ->sk_info_authunix cache. + * + * The XPT_BUSY bit in xprt->xpt_flags prevents a transport being + * enqueued multiply. During normal transport processing this bit + * is set by svc_xprt_enqueue and cleared by svc_xprt_received. + * Providers should not manipulate this bit directly. + * + * Some flags can be set to certain values at any time + * providing that certain rules are followed: + * + * XPT_CONN, XPT_DATA: + * - Can be set or cleared at any time. 
+ * - After a set, svc_xprt_enqueue must be called to enqueue + * the transport for processing. + * - After a clear, the transport must be read/accepted. + * If this succeeds, it must be set again. + * XPT_CLOSE: + * - Can be set at any time. It is never cleared. + * XPT_DEAD: + * - Can only be set while XPT_BUSY is held which ensures + * that no other thread will be using the transport or will + * try to set XPT_DEAD. + */ + int svc_reg_xprt_class(struct svc_xprt_class *xcl) { struct svc_xprt_class *cl; @@ -178,3 +221,713 @@ void svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt) } EXPORT_SYMBOL_GPL(svc_xprt_copy_addrs); +/** + * svc_print_addr - Format rq_addr field for printing + * @rqstp: svc_rqst struct containing address to print + * @buf: target buffer for formatted address + * @len: length of target buffer + * + */ +char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len) +{ + return __svc_print_addr(svc_addr(rqstp), buf, len); +} +EXPORT_SYMBOL_GPL(svc_print_addr); + +/* + * Queue up an idle server thread. Must have pool->sp_lock held. + * Note: this is really a stack rather than a queue, so that we only + * use as many different threads as we need, and the rest don't pollute + * the cache. + */ +static void svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp) +{ + list_add(&rqstp->rq_list, &pool->sp_threads); +} + +/* + * Dequeue an nfsd thread. Must have pool->sp_lock held. + */ +static void svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp) +{ + list_del(&rqstp->rq_list); +} + +/* + * Queue up a transport with data pending. If there are idle nfsd + * processes, wake 'em up. + * + */ +void svc_xprt_enqueue(struct svc_xprt *xprt) +{ + struct svc_serv *serv = xprt->xpt_server; + struct svc_pool *pool; + struct svc_rqst *rqstp; + int cpu; + + if (!(xprt->xpt_flags & + ((1<<XPT_CONN)|(1<<XPT_DATA)|(1<<XPT_CLOSE)|(1<<XPT_DEFERRED)))) + return; + if (test_bit(XPT_DEAD, &xprt->xpt_flags)) + return; + + cpu = get_cpu(); + pool = svc_pool_for_cpu(xprt->xpt_server, cpu); + put_cpu(); + + spin_lock_bh(&pool->sp_lock); + + if (!list_empty(&pool->sp_threads) && + !list_empty(&pool->sp_sockets)) + printk(KERN_ERR + "svc_xprt_enqueue: " + "threads and transports both waiting??\n"); + + if (test_bit(XPT_DEAD, &xprt->xpt_flags)) { + /* Don't enqueue dead transports */ + dprintk("svc: transport %p is dead, not enqueued\n", xprt); + goto out_unlock; + } + + /* Mark transport as busy. It will remain in this state until + * the provider calls svc_xprt_received. We update XPT_BUSY + * atomically because it also guards against trying to enqueue + * the transport twice.
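+ * (Concretely, whoever wins the test_and_set_bit below owns the + * enqueue; a loser just backs off, and svc_xprt_received clears + * XPT_BUSY and re-runs svc_xprt_enqueue so nothing that arrived in + * the meantime is lost.)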
+ */ + if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) { + /* Don't enqueue transport while already enqueued */ + dprintk("svc: transport %p busy, not enqueued\n", xprt); + goto out_unlock; + } + BUG_ON(xprt->xpt_pool != NULL); + xprt->xpt_pool = pool; + + /* Handle pending connection */ + if (test_bit(XPT_CONN, &xprt->xpt_flags)) + goto process; + + /* Handle close in-progress */ + if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) + goto process; + + /* Check if we have space to reply to a request */ + if (!xprt->xpt_ops->xpo_has_wspace(xprt)) { + /* Don't enqueue while not enough space for reply */ + dprintk("svc: no write space, transport %p not enqueued\n", + xprt); + xprt->xpt_pool = NULL; + clear_bit(XPT_BUSY, &xprt->xpt_flags); + goto out_unlock; + } + + process: + if (!list_empty(&pool->sp_threads)) { + rqstp = list_entry(pool->sp_threads.next, + struct svc_rqst, + rq_list); + dprintk("svc: transport %p served by daemon %p\n", + xprt, rqstp); + svc_thread_dequeue(pool, rqstp); + if (rqstp->rq_xprt) + printk(KERN_ERR + "svc_xprt_enqueue: server %p, rq_xprt=%p!\n", + rqstp, rqstp->rq_xprt); + rqstp->rq_xprt = xprt; + svc_xprt_get(xprt); + rqstp->rq_reserved = serv->sv_max_mesg; + atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); + BUG_ON(xprt->xpt_pool != pool); + wake_up(&rqstp->rq_wait); + } else { + dprintk("svc: transport %p put into queue\n", xprt); + list_add_tail(&xprt->xpt_ready, &pool->sp_sockets); + BUG_ON(xprt->xpt_pool != pool); + } + +out_unlock: + spin_unlock_bh(&pool->sp_lock); +} +EXPORT_SYMBOL_GPL(svc_xprt_enqueue); + +/* + * Dequeue the first transport. Must be called with the pool->sp_lock held. + */ +static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool) +{ + struct svc_xprt *xprt; + + if (list_empty(&pool->sp_sockets)) + return NULL; + + xprt = list_entry(pool->sp_sockets.next, + struct svc_xprt, xpt_ready); + list_del_init(&xprt->xpt_ready); + + dprintk("svc: transport %p dequeued, inuse=%d\n", + xprt, atomic_read(&xprt->xpt_ref.refcount)); + + return xprt; +} + +/* + * svc_xprt_received conditionally queues the transport for processing + * by another thread. The caller must hold the XPT_BUSY bit and must + * not thereafter touch transport data. + * + * Note: XPT_DATA only gets cleared when a read-attempt finds no (or + * insufficient) data. + */ +void svc_xprt_received(struct svc_xprt *xprt) +{ + BUG_ON(!test_bit(XPT_BUSY, &xprt->xpt_flags)); + xprt->xpt_pool = NULL; + clear_bit(XPT_BUSY, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); +} +EXPORT_SYMBOL_GPL(svc_xprt_received); + +/** + * svc_reserve - change the space reserved for the reply to a request. + * @rqstp: The request in question + * @space: new max space to reserve + * + * Each request reserves some space on the output queue of the transport + * to make sure the reply fits. This function reduces that reserved + * space to be the amount of space used already, plus @space. + * + */ +void svc_reserve(struct svc_rqst *rqstp, int space) +{ + space += rqstp->rq_res.head[0].iov_len; + + if (space < rqstp->rq_reserved) { + struct svc_xprt *xprt = rqstp->rq_xprt; + atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved); + rqstp->rq_reserved = space; + + svc_xprt_enqueue(xprt); + } +} + +static void svc_xprt_release(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt = rqstp->rq_xprt; + + rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp); + + svc_free_res_pages(rqstp); + rqstp->rq_res.page_len = 0; + rqstp->rq_res.page_base = 0; + + /* Reset response buffer and release + * the reservation. 
+ * But first, check that enough space was reserved + * for the reply, otherwise we have a bug! + */ + if ((rqstp->rq_res.len) > rqstp->rq_reserved) + printk(KERN_ERR "RPC request reserved %d but used %d\n", + rqstp->rq_reserved, + rqstp->rq_res.len); + + rqstp->rq_res.head[0].iov_len = 0; + svc_reserve(rqstp, 0); + rqstp->rq_xprt = NULL; + + svc_xprt_put(xprt); +} + +/* + * External function to wake up a server waiting for data + * This really only makes sense for services like lockd + * which have exactly one thread anyway. + */ +void svc_wake_up(struct svc_serv *serv) +{ + struct svc_rqst *rqstp; + unsigned int i; + struct svc_pool *pool; + + for (i = 0; i < serv->sv_nrpools; i++) { + pool = &serv->sv_pools[i]; + + spin_lock_bh(&pool->sp_lock); + if (!list_empty(&pool->sp_threads)) { + rqstp = list_entry(pool->sp_threads.next, + struct svc_rqst, + rq_list); + dprintk("svc: daemon %p woken up.\n", rqstp); + /* + svc_thread_dequeue(pool, rqstp); + rqstp->rq_xprt = NULL; + */ + wake_up(&rqstp->rq_wait); + } + spin_unlock_bh(&pool->sp_lock); + } +} + +int svc_port_is_privileged(struct sockaddr *sin) +{ + switch (sin->sa_family) { + case AF_INET: + return ntohs(((struct sockaddr_in *)sin)->sin_port) + < PROT_SOCK; + case AF_INET6: + return ntohs(((struct sockaddr_in6 *)sin)->sin6_port) + < PROT_SOCK; + default: + return 0; + } +} + +/* + * Make sure that we don't have too many active connections. If we + * have, something must be dropped. + * + * There's no point in trying to do random drop here for DoS + * prevention. The NFS clients does 1 reconnect in 15 seconds. An + * attacker can easily beat that. + * + * The only somewhat efficient mechanism would be if drop old + * connections from the same IP first. But right now we don't even + * record the client IP in svc_sock. + */ +static void svc_check_conn_limits(struct svc_serv *serv) +{ + if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) { + struct svc_xprt *xprt = NULL; + spin_lock_bh(&serv->sv_lock); + if (!list_empty(&serv->sv_tempsocks)) { + if (net_ratelimit()) { + /* Try to help the admin */ + printk(KERN_NOTICE "%s: too many open " + "connections, consider increasing the " + "number of nfsd threads\n", + serv->sv_name); + } + /* + * Always select the oldest connection. It's not fair, + * but so is life + */ + xprt = list_entry(serv->sv_tempsocks.prev, + struct svc_xprt, + xpt_list); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_get(xprt); + } + spin_unlock_bh(&serv->sv_lock); + + if (xprt) { + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); + } + } +} + +/* + * Receive the next request on any transport. This code is carefully + * organised not to touch any cachelines in the shared svc_serv + * structure, only cachelines in the local svc_pool. + */ +int svc_recv(struct svc_rqst *rqstp, long timeout) +{ + struct svc_xprt *xprt = NULL; + struct svc_serv *serv = rqstp->rq_server; + struct svc_pool *pool = rqstp->rq_pool; + int len, i; + int pages; + struct xdr_buf *arg; + DECLARE_WAITQUEUE(wait, current); + + dprintk("svc: server %p waiting for data (to = %ld)\n", + rqstp, timeout); + + if (rqstp->rq_xprt) + printk(KERN_ERR + "svc_recv: service %p, transport not NULL!\n", + rqstp); + if (waitqueue_active(&rqstp->rq_wait)) + printk(KERN_ERR + "svc_recv: service %p, wait queue active!\n", + rqstp); + + /* now allocate needed pages. 
If we get a failure, sleep briefly */ + pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE; + for (i = 0; i < pages ; i++) + while (rqstp->rq_pages[i] == NULL) { + struct page *p = alloc_page(GFP_KERNEL); + if (!p) { + int j = msecs_to_jiffies(500); + schedule_timeout_uninterruptible(j); + } + rqstp->rq_pages[i] = p; + } + rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */ + BUG_ON(pages >= RPCSVC_MAXPAGES); + + /* Make arg->head point to first page and arg->pages point to rest */ + arg = &rqstp->rq_arg; + arg->head[0].iov_base = page_address(rqstp->rq_pages[0]); + arg->head[0].iov_len = PAGE_SIZE; + arg->pages = rqstp->rq_pages + 1; + arg->page_base = 0; + /* save at least one page for response */ + arg->page_len = (pages-2)*PAGE_SIZE; + arg->len = (pages-1)*PAGE_SIZE; + arg->tail[0].iov_len = 0; + + try_to_freeze(); + cond_resched(); + if (signalled()) + return -EINTR; + + spin_lock_bh(&pool->sp_lock); + xprt = svc_xprt_dequeue(pool); + if (xprt) { + rqstp->rq_xprt = xprt; + svc_xprt_get(xprt); + rqstp->rq_reserved = serv->sv_max_mesg; + atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); + } else { + /* No data pending. Go to sleep */ + svc_thread_enqueue(pool, rqstp); + + /* + * We have to be able to interrupt this wait + * to bring down the daemons ... + */ + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&rqstp->rq_wait, &wait); + spin_unlock_bh(&pool->sp_lock); + + schedule_timeout(timeout); + + try_to_freeze(); + + spin_lock_bh(&pool->sp_lock); + remove_wait_queue(&rqstp->rq_wait, &wait); + + xprt = rqstp->rq_xprt; + if (!xprt) { + svc_thread_dequeue(pool, rqstp); + spin_unlock_bh(&pool->sp_lock); + dprintk("svc: server %p, no data yet\n", rqstp); + return signalled()? -EINTR : -EAGAIN; + } + } + spin_unlock_bh(&pool->sp_lock); + + len = 0; + if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) { + dprintk("svc_recv: found XPT_CLOSE\n"); + svc_delete_xprt(xprt); + } else if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) { + struct svc_xprt *newxpt; + newxpt = xprt->xpt_ops->xpo_accept(xprt); + if (newxpt) { + /* + * We know this module_get will succeed because the + * listener holds a reference too + */ + __module_get(newxpt->xpt_class->xcl_owner); + svc_check_conn_limits(xprt->xpt_server); + spin_lock_bh(&serv->sv_lock); + set_bit(XPT_TEMP, &newxpt->xpt_flags); + list_add(&newxpt->xpt_list, &serv->sv_tempsocks); + serv->sv_tmpcnt++; + if (serv->sv_temptimer.function == NULL) { + /* setup timer to age temp transports */ + setup_timer(&serv->sv_temptimer, + svc_age_temp_xprts, + (unsigned long)serv); + mod_timer(&serv->sv_temptimer, + jiffies + svc_conn_age_period * HZ); + } + spin_unlock_bh(&serv->sv_lock); + svc_xprt_received(newxpt); + } + svc_xprt_received(xprt); + } else { + dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", + rqstp, pool->sp_id, xprt, + atomic_read(&xprt->xpt_ref.refcount)); + rqstp->rq_deferred = svc_deferred_dequeue(xprt); + if (rqstp->rq_deferred) { + svc_xprt_received(xprt); + len = svc_deferred_recv(rqstp); + } else + len = xprt->xpt_ops->xpo_recvfrom(rqstp); + dprintk("svc: got len=%d\n", len); + } + + /* No data, incomplete (TCP) read, or accept() */ + if (len == 0 || len == -EAGAIN) { + rqstp->rq_res.len = 0; + svc_xprt_release(rqstp); + return -EAGAIN; + } + clear_bit(XPT_OLD, &xprt->xpt_flags); + + rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp)); + rqstp->rq_chandle.defer = svc_defer; + + if (serv->sv_stats) + serv->sv_stats->netcnt++; + return len; +} + +/* + * Drop request + */ +void svc_drop(struct svc_rqst 
*rqstp) +{ + dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt); + svc_xprt_release(rqstp); +} + +/* + * Return reply to client. + */ +int svc_send(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt; + int len; + struct xdr_buf *xb; + + xprt = rqstp->rq_xprt; + if (!xprt) + return -EFAULT; + + /* release the receive skb before sending the reply */ + rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp); + + /* calculate over-all length */ + xb = &rqstp->rq_res; + xb->len = xb->head[0].iov_len + + xb->page_len + + xb->tail[0].iov_len; + + /* Grab mutex to serialize outgoing data. */ + mutex_lock(&xprt->xpt_mutex); + if (test_bit(XPT_DEAD, &xprt->xpt_flags)) + len = -ENOTCONN; + else + len = xprt->xpt_ops->xpo_sendto(rqstp); + mutex_unlock(&xprt->xpt_mutex); + svc_xprt_release(rqstp); + + if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN) + return 0; + return len; +} + +/* + * Timer function to close old temporary transports, using + * a mark-and-sweep algorithm. + */ +static void svc_age_temp_xprts(unsigned long closure) +{ + struct svc_serv *serv = (struct svc_serv *)closure; + struct svc_xprt *xprt; + struct list_head *le, *next; + LIST_HEAD(to_be_aged); + + dprintk("svc_age_temp_xprts\n"); + + if (!spin_trylock_bh(&serv->sv_lock)) { + /* busy, try again 1 sec later */ + dprintk("svc_age_temp_xprts: busy\n"); + mod_timer(&serv->sv_temptimer, jiffies + HZ); + return; + } + + list_for_each_safe(le, next, &serv->sv_tempsocks) { + xprt = list_entry(le, struct svc_xprt, xpt_list); + + /* First time through, just mark it OLD. Second time + * through, close it. */ + if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags)) + continue; + if (atomic_read(&xprt->xpt_ref.refcount) > 1 + || test_bit(XPT_BUSY, &xprt->xpt_flags)) + continue; + svc_xprt_get(xprt); + list_move(le, &to_be_aged); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + set_bit(XPT_DETACHED, &xprt->xpt_flags); + } + spin_unlock_bh(&serv->sv_lock); + + while (!list_empty(&to_be_aged)) { + le = to_be_aged.next; + /* fiddling the xpt_list node is safe 'cos we're XPT_DETACHED */ + list_del_init(le); + xprt = list_entry(le, struct svc_xprt, xpt_list); + + dprintk("queuing xprt %p for closing\n", xprt); + + /* a thread will dequeue and close it soon */ + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); + } + + mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); +} + +/* + * Remove a dead transport + */ +void svc_delete_xprt(struct svc_xprt *xprt) +{ + struct svc_serv *serv = xprt->xpt_server; + + dprintk("svc: svc_delete_xprt(%p)\n", xprt); + xprt->xpt_ops->xpo_detach(xprt); + + spin_lock_bh(&serv->sv_lock); + if (!test_and_set_bit(XPT_DETACHED, &xprt->xpt_flags)) + list_del_init(&xprt->xpt_list); + /* + * We used to delete the transport from whichever list + * it's sk_xprt.xpt_ready node was on, but we don't actually + * need to. This is because the only time we're called + * while still attached to a queue, the queue itself + * is about to be destroyed (in svc_destroy). 
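+ * (Note that the XPT_DETACHED test_and_set above also covers the + * case where svc_age_temp_xprts already detached this transport: + * the list_del_init is then skipped instead of being done twice.)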
+ */ + if (!test_and_set_bit(XPT_DEAD, &xprt->xpt_flags)) { + BUG_ON(atomic_read(&xprt->xpt_ref.refcount) < 2); + if (test_bit(XPT_TEMP, &xprt->xpt_flags)) + serv->sv_tmpcnt--; + svc_xprt_put(xprt); + } + spin_unlock_bh(&serv->sv_lock); +} + +void svc_close_xprt(struct svc_xprt *xprt) +{ + set_bit(XPT_CLOSE, &xprt->xpt_flags); + if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) + /* someone else will have to effect the close */ + return; + + svc_xprt_get(xprt); + svc_delete_xprt(xprt); + clear_bit(XPT_BUSY, &xprt->xpt_flags); + svc_xprt_put(xprt); +} + +void svc_close_all(struct list_head *xprt_list) +{ + struct svc_xprt *xprt; + struct svc_xprt *tmp; + + list_for_each_entry_safe(xprt, tmp, xprt_list, xpt_list) { + set_bit(XPT_CLOSE, &xprt->xpt_flags); + if (test_bit(XPT_BUSY, &xprt->xpt_flags)) { + /* Waiting to be processed, but no threads left, + * So just remove it from the waiting list + */ + list_del_init(&xprt->xpt_ready); + clear_bit(XPT_BUSY, &xprt->xpt_flags); + } + svc_close_xprt(xprt); + } +} + +/* + * Handle defer and revisit of requests + */ + +static void svc_revisit(struct cache_deferred_req *dreq, int too_many) +{ + struct svc_deferred_req *dr = + container_of(dreq, struct svc_deferred_req, handle); + struct svc_xprt *xprt = dr->xprt; + + if (too_many) { + svc_xprt_put(xprt); + kfree(dr); + return; + } + dprintk("revisit queued\n"); + dr->xprt = NULL; + spin_lock(&xprt->xpt_lock); + list_add(&dr->handle.recent, &xprt->xpt_deferred); + spin_unlock(&xprt->xpt_lock); + set_bit(XPT_DEFERRED, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); +} + +static struct cache_deferred_req *svc_defer(struct cache_req *req) +{ + struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle); + int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.len); + struct svc_deferred_req *dr; + + if (rqstp->rq_arg.page_len) + return NULL; /* if more than a page, give up FIXME */ + if (rqstp->rq_deferred) { + dr = rqstp->rq_deferred; + rqstp->rq_deferred = NULL; + } else { + int skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; + /* FIXME maybe discard if size too large */ + dr = kmalloc(size, GFP_KERNEL); + if (dr == NULL) + return NULL; + + dr->handle.owner = rqstp->rq_server; + dr->prot = rqstp->rq_prot; + memcpy(&dr->addr, &rqstp->rq_addr, rqstp->rq_addrlen); + dr->addrlen = rqstp->rq_addrlen; + dr->daddr = rqstp->rq_daddr; + dr->argslen = rqstp->rq_arg.len >> 2; + memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, + dr->argslen<<2); + } + svc_xprt_get(rqstp->rq_xprt); + dr->xprt = rqstp->rq_xprt; + + dr->handle.revisit = svc_revisit; + return &dr->handle; +} + +/* + * recv data from a deferred request into an active one + */ +static int svc_deferred_recv(struct svc_rqst *rqstp) +{ + struct svc_deferred_req *dr = rqstp->rq_deferred; + + rqstp->rq_arg.head[0].iov_base = dr->args; + rqstp->rq_arg.head[0].iov_len = dr->argslen<<2; + rqstp->rq_arg.page_len = 0; + rqstp->rq_arg.len = dr->argslen<<2; + rqstp->rq_prot = dr->prot; + memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen); + rqstp->rq_addrlen = dr->addrlen; + rqstp->rq_daddr = dr->daddr; + rqstp->rq_respages = rqstp->rq_pages; + return dr->argslen<<2; +} + + +static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) +{ + struct svc_deferred_req *dr = NULL; + + if (!test_bit(XPT_DEFERRED, &xprt->xpt_flags)) + return NULL; + spin_lock(&xprt->xpt_lock); + clear_bit(XPT_DEFERRED, &xprt->xpt_flags); + if (!list_empty(&xprt->xpt_deferred)) { + dr = list_entry(xprt->xpt_deferred.next, + 
struct svc_deferred_req, + handle.recent); + list_del_init(&dr->handle.recent); + set_bit(XPT_DEFERRED, &xprt->xpt_flags); + } + spin_unlock(&xprt->xpt_lock); + return dr; +} diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 0814a78ad7ad..343a85b700f0 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -48,66 +48,24 @@ #include #include -/* SMP locking strategy: - * - * svc_pool->sp_lock protects most of the fields of that pool. - * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt. - * when both need to be taken (rare), svc_serv->sv_lock is first. - * BKL protects svc_serv->sv_nrthread. - * svc_sock->sk_lock protects the svc_sock->sk_deferred list - * and the ->sk_info_authunix cache. - * svc_sock->sk_xprt.xpt_flags.XPT_BUSY prevents a svc_sock being - * enqueued multiply. - * - * Some flags can be set to certain values at any time - * providing that certain rules are followed: - * - * XPT_CONN, XPT_DATA, can be set or cleared at any time. - * after a set, svc_xprt_enqueue must be called. - * after a clear, the socket must be read/accepted - * if this succeeds, it must be set again. - * XPT_CLOSE can set at any time. It is never cleared. - * xpt_ref contains a bias of '1' until XPT_DEAD is set. - * so when xprt_ref hits zero, we know the transport is dead - * and no-one is using it. - * XPT_DEAD can only be set while XPT_BUSY is held which ensures - * no other thread will be using the socket or will try to - * set XPT_DEAD. - * - */ - #define RPCDBG_FACILITY RPCDBG_SVCXPRT static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, int *errp, int flags); -static void svc_delete_xprt(struct svc_xprt *xprt); static void svc_udp_data_ready(struct sock *, int); static int svc_udp_recvfrom(struct svc_rqst *); static int svc_udp_sendto(struct svc_rqst *); -static void svc_close_xprt(struct svc_xprt *xprt); static void svc_sock_detach(struct svc_xprt *); static void svc_sock_free(struct svc_xprt *); -static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt); -static int svc_deferred_recv(struct svc_rqst *rqstp); -static struct cache_deferred_req *svc_defer(struct cache_req *req); static struct svc_xprt *svc_create_socket(struct svc_serv *, int, struct sockaddr *, int, int); -static void svc_age_temp_xprts(unsigned long closure); - -/* apparently the "standard" is that clients close - * idle connections after 5 minutes, servers after - * 6 minutes - * http://www.connectathon.org/talks96/nfstcp.pdf - */ -static int svc_conn_age_period = 6*60; - #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key svc_key[2]; static struct lock_class_key svc_slock_key[2]; -static inline void svc_reclassify_socket(struct socket *sock) +static void svc_reclassify_socket(struct socket *sock) { struct sock *sk = sock->sk; BUG_ON(sock_owned_by_user(sk)); @@ -131,67 +89,11 @@ static inline void svc_reclassify_socket(struct socket *sock) } } #else -static inline void svc_reclassify_socket(struct socket *sock) +static void svc_reclassify_socket(struct socket *sock) { } #endif -static char *__svc_print_addr(struct sockaddr *addr, char *buf, size_t len) -{ - switch (addr->sa_family) { - case AF_INET: - snprintf(buf, len, "%u.%u.%u.%u, port=%u", - NIPQUAD(((struct sockaddr_in *) addr)->sin_addr), - ntohs(((struct sockaddr_in *) addr)->sin_port)); - break; - - case AF_INET6: - snprintf(buf, len, "%x:%x:%x:%x:%x:%x:%x:%x, port=%u", - NIP6(((struct sockaddr_in6 *) addr)->sin6_addr), - ntohs(((struct sockaddr_in6 *) addr)->sin6_port)); - break; - - 
default: - snprintf(buf, len, "unknown address type: %d", addr->sa_family); - break; - } - return buf; -} - -/** - * svc_print_addr - Format rq_addr field for printing - * @rqstp: svc_rqst struct containing address to print - * @buf: target buffer for formatted address - * @len: length of target buffer - * - */ -char *svc_print_addr(struct svc_rqst *rqstp, char *buf, size_t len) -{ - return __svc_print_addr(svc_addr(rqstp), buf, len); -} -EXPORT_SYMBOL_GPL(svc_print_addr); - -/* - * Queue up an idle server thread. Must have pool->sp_lock held. - * Note: this is really a stack rather than a queue, so that we only - * use as many different threads as we need, and the rest don't pollute - * the cache. - */ -static inline void -svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp) -{ - list_add(&rqstp->rq_list, &pool->sp_threads); -} - -/* - * Dequeue an nfsd thread. Must have pool->sp_lock held. - */ -static inline void -svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp) -{ - list_del(&rqstp->rq_list); -} - /* * Release an skbuff after use */ @@ -214,220 +116,6 @@ static void svc_release_skb(struct svc_rqst *rqstp) } } -/* - * Queue up a socket with data pending. If there are idle nfsd - * processes, wake 'em up. - * - */ -void svc_xprt_enqueue(struct svc_xprt *xprt) -{ - struct svc_serv *serv = xprt->xpt_server; - struct svc_pool *pool; - struct svc_rqst *rqstp; - int cpu; - - if (!(xprt->xpt_flags & - ((1<<XPT_CONN)|(1<<XPT_DATA)|(1<<XPT_CLOSE)|(1<<XPT_DEFERRED)))) - return; - if (test_bit(XPT_DEAD, &xprt->xpt_flags)) - return; - - cpu = get_cpu(); - pool = svc_pool_for_cpu(xprt->xpt_server, cpu); - put_cpu(); - - spin_lock_bh(&pool->sp_lock); - - if (!list_empty(&pool->sp_threads) && - !list_empty(&pool->sp_sockets)) - printk(KERN_ERR - "svc_xprt_enqueue: " - "threads and transports both waiting??\n"); - - if (test_bit(XPT_DEAD, &xprt->xpt_flags)) { - /* Don't enqueue dead sockets */ - dprintk("svc: transport %p is dead, not enqueued\n", xprt); - goto out_unlock; - } - - /* Mark socket as busy. It will remain in this state until the - * server has processed all pending data and put the socket back - * on the idle list. We update XPT_BUSY atomically because - * it also guards against trying to enqueue the svc_sock twice.
- */ - if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) { - /* Don't enqueue socket while already enqueued */ - dprintk("svc: transport %p busy, not enqueued\n", xprt); - goto out_unlock; - } - BUG_ON(xprt->xpt_pool != NULL); - xprt->xpt_pool = pool; - - /* Handle pending connection */ - if (test_bit(XPT_CONN, &xprt->xpt_flags)) - goto process; - - /* Handle close in-progress */ - if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) - goto process; - - /* Check if we have space to reply to a request */ - if (!xprt->xpt_ops->xpo_has_wspace(xprt)) { - /* Don't enqueue while not enough space for reply */ - dprintk("svc: no write space, transport %p not enqueued\n", - xprt); - xprt->xpt_pool = NULL; - clear_bit(XPT_BUSY, &xprt->xpt_flags); - goto out_unlock; - } - - process: - if (!list_empty(&pool->sp_threads)) { - rqstp = list_entry(pool->sp_threads.next, - struct svc_rqst, - rq_list); - dprintk("svc: transport %p served by daemon %p\n", - xprt, rqstp); - svc_thread_dequeue(pool, rqstp); - if (rqstp->rq_xprt) - printk(KERN_ERR - "svc_xprt_enqueue: server %p, rq_xprt=%p!\n", - rqstp, rqstp->rq_xprt); - rqstp->rq_xprt = xprt; - svc_xprt_get(xprt); - rqstp->rq_reserved = serv->sv_max_mesg; - atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); - BUG_ON(xprt->xpt_pool != pool); - wake_up(&rqstp->rq_wait); - } else { - dprintk("svc: transport %p put into queue\n", xprt); - list_add_tail(&xprt->xpt_ready, &pool->sp_sockets); - BUG_ON(xprt->xpt_pool != pool); - } - -out_unlock: - spin_unlock_bh(&pool->sp_lock); -} -EXPORT_SYMBOL_GPL(svc_xprt_enqueue); - -/* - * Dequeue the first socket. Must be called with the pool->sp_lock held. - */ -static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool) -{ - struct svc_xprt *xprt; - - if (list_empty(&pool->sp_sockets)) - return NULL; - - xprt = list_entry(pool->sp_sockets.next, - struct svc_xprt, xpt_ready); - list_del_init(&xprt->xpt_ready); - - dprintk("svc: transport %p dequeued, inuse=%d\n", - xprt, atomic_read(&xprt->xpt_ref.refcount)); - - return xprt; -} - -/* - * svc_xprt_received conditionally queues the transport for processing - * by another thread. The caller must hold the XPT_BUSY bit and must - * not thereafter touch transport data. - * - * Note: XPT_DATA only gets cleared when a read-attempt finds no (or - * insufficient) data. - */ -void svc_xprt_received(struct svc_xprt *xprt) -{ - BUG_ON(!test_bit(XPT_BUSY, &xprt->xpt_flags)); - xprt->xpt_pool = NULL; - clear_bit(XPT_BUSY, &xprt->xpt_flags); - svc_xprt_enqueue(xprt); -} -EXPORT_SYMBOL_GPL(svc_xprt_received); - -/** - * svc_reserve - change the space reserved for the reply to a request. - * @rqstp: The request in question - * @space: new max space to reserve - * - * Each request reserves some space on the output queue of the socket - * to make sure the reply fits. This function reduces that reserved - * space to be the amount of space used already, plus @space. - * - */ -void svc_reserve(struct svc_rqst *rqstp, int space) -{ - space += rqstp->rq_res.head[0].iov_len; - - if (space < rqstp->rq_reserved) { - struct svc_xprt *xprt = rqstp->rq_xprt; - atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved); - rqstp->rq_reserved = space; - - svc_xprt_enqueue(xprt); - } -} - -static void svc_xprt_release(struct svc_rqst *rqstp) -{ - struct svc_xprt *xprt = rqstp->rq_xprt; - - rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp); - - svc_free_res_pages(rqstp); - rqstp->rq_res.page_len = 0; - rqstp->rq_res.page_base = 0; - - /* Reset response buffer and release - * the reservation. 
- * But first, check that enough space was reserved - * for the reply, otherwise we have a bug! - */ - if ((rqstp->rq_res.len) > rqstp->rq_reserved) - printk(KERN_ERR "RPC request reserved %d but used %d\n", - rqstp->rq_reserved, - rqstp->rq_res.len); - - rqstp->rq_res.head[0].iov_len = 0; - svc_reserve(rqstp, 0); - rqstp->rq_xprt = NULL; - - svc_xprt_put(xprt); -} - -/* - * External function to wake up a server waiting for data - * This really only makes sense for services like lockd - * which have exactly one thread anyway. - */ -void -svc_wake_up(struct svc_serv *serv) -{ - struct svc_rqst *rqstp; - unsigned int i; - struct svc_pool *pool; - - for (i = 0; i < serv->sv_nrpools; i++) { - pool = &serv->sv_pools[i]; - - spin_lock_bh(&pool->sp_lock); - if (!list_empty(&pool->sp_threads)) { - rqstp = list_entry(pool->sp_threads.next, - struct svc_rqst, - rq_list); - dprintk("svc: daemon %p woken up.\n", rqstp); - /* - svc_thread_dequeue(pool, rqstp); - rqstp->rq_xprt = NULL; - */ - wake_up(&rqstp->rq_wait); - } - spin_unlock_bh(&pool->sp_lock); - } -} - union svc_pktinfo_u { struct in_pktinfo pkti; struct in6_pktinfo pkti6; @@ -469,8 +157,7 @@ static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh) /* * Generic sendto routine */ -static int -svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) +static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) { struct svc_sock *svsk = container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); @@ -605,8 +292,7 @@ EXPORT_SYMBOL(svc_sock_names); /* * Check input queue length */ -static int -svc_recv_available(struct svc_sock *svsk) +static int svc_recv_available(struct svc_sock *svsk) { struct socket *sock = svsk->sk_sock; int avail, err; @@ -619,8 +305,8 @@ svc_recv_available(struct svc_sock *svsk) /* * Generic recvfrom routine. */ -static int -svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, int buflen) +static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, + int buflen) { struct svc_sock *svsk = container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); @@ -640,8 +326,8 @@ svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, int buflen) /* * Set socket snd and rcv buffer lengths */ -static inline void -svc_sock_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv) +static void svc_sock_setbufsize(struct socket *sock, unsigned int snd, + unsigned int rcv) { #if 0 mm_segment_t oldfs; @@ -666,8 +352,7 @@ svc_sock_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv) /* * INET callback when data has been received on the socket. */ -static void -svc_udp_data_ready(struct sock *sk, int count) +static void svc_udp_data_ready(struct sock *sk, int count) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; @@ -685,8 +370,7 @@ svc_udp_data_ready(struct sock *sk, int count) /* * INET callback when space is newly available on the socket. */ -static void -svc_write_space(struct sock *sk) +static void svc_write_space(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); @@ -732,8 +416,7 @@ static void svc_udp_get_dest_address(struct svc_rqst *rqstp, /* * Receive a datagram from a UDP socket. 
*/ -static int -svc_udp_recvfrom(struct svc_rqst *rqstp) +static int svc_udp_recvfrom(struct svc_rqst *rqstp) { struct svc_sock *svsk = container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); @@ -827,7 +510,8 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) skb_free_datagram(svsk->sk_sk, skb); } else { /* we can use it in-place */ - rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr); + rqstp->rq_arg.head[0].iov_base = skb->data + + sizeof(struct udphdr); rqstp->rq_arg.head[0].iov_len = len; if (skb_checksum_complete(skb)) { skb_free_datagram(svsk->sk_sk, skb); @@ -938,7 +622,8 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) 3 * svsk->sk_xprt.xpt_server->sv_max_mesg, 3 * svsk->sk_xprt.xpt_server->sv_max_mesg); - set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* might have come in before data_ready set up */ + /* data might have come in before data_ready set up */ + set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); oldfs = get_fs(); @@ -953,8 +638,7 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) * A data_ready event on a listening socket means there's a connection * pending. Do not use state_change as a substitute for it. */ -static void -svc_tcp_listen_data_ready(struct sock *sk, int count_unused) +static void svc_tcp_listen_data_ready(struct sock *sk, int count_unused) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; @@ -986,8 +670,7 @@ svc_tcp_listen_data_ready(struct sock *sk, int count_unused) /* * A state change on a connected socket means it's dying or dead. */ -static void -svc_tcp_state_change(struct sock *sk) +static void svc_tcp_state_change(struct sock *sk) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; @@ -1004,8 +687,7 @@ svc_tcp_state_change(struct sock *sk) wake_up_interruptible_all(sk->sk_sleep); } -static void -svc_tcp_data_ready(struct sock *sk, int count) +static void svc_tcp_data_ready(struct sock *sk, int count) { struct svc_sock *svsk = (struct svc_sock *)sk->sk_user_data; @@ -1019,20 +701,6 @@ svc_tcp_data_ready(struct sock *sk, int count) wake_up_interruptible(sk->sk_sleep); } -static inline int svc_port_is_privileged(struct sockaddr *sin) -{ - switch (sin->sa_family) { - case AF_INET: - return ntohs(((struct sockaddr_in *)sin)->sin_port) - < PROT_SOCK; - case AF_INET6: - return ntohs(((struct sockaddr_in6 *)sin)->sin6_port) - < PROT_SOCK; - default: - return 0; - } -} - /* * Accept a TCP connection */ @@ -1115,8 +783,7 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) /* * Receive data from a TCP socket. */ -static int -svc_tcp_recvfrom(struct svc_rqst *rqstp) +static int svc_tcp_recvfrom(struct svc_rqst *rqstp) { struct svc_sock *svsk = container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); @@ -1269,8 +936,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) /* * Send out data on TCP socket. 
*/ -static int -svc_tcp_sendto(struct svc_rqst *rqstp) +static int svc_tcp_sendto(struct svc_rqst *rqstp) { struct xdr_buf *xbufp = &rqstp->rq_res; int sent; @@ -1288,7 +954,9 @@ svc_tcp_sendto(struct svc_rqst *rqstp) sent = svc_sendto(rqstp, &rqstp->rq_res); if (sent != xbufp->len) { - printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", + printk(KERN_NOTICE + "rpc-srv/tcp: %s: %s %d when sending %d bytes " + "- shutting down socket\n", rqstp->rq_xprt->xpt_server->sv_name, (sent<0)?"got error":"sent only", sent, xbufp->len); @@ -1410,8 +1078,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) } } -void -svc_sock_update_bufs(struct svc_serv *serv) +void svc_sock_update_bufs(struct svc_serv *serv) { /* * The number of server threads has changed. Update @@ -1433,302 +1100,6 @@ svc_sock_update_bufs(struct svc_serv *serv) spin_unlock_bh(&serv->sv_lock); } -/* - * Make sure that we don't have too many active connections. If we - * have, something must be dropped. - * - * There's no point in trying to do random drop here for DoS - * prevention. The NFS clients does 1 reconnect in 15 seconds. An - * attacker can easily beat that. - * - * The only somewhat efficient mechanism would be if drop old - * connections from the same IP first. But right now we don't even - * record the client IP in svc_sock. - */ -static void svc_check_conn_limits(struct svc_serv *serv) -{ - if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) { - struct svc_xprt *xprt = NULL; - spin_lock_bh(&serv->sv_lock); - if (!list_empty(&serv->sv_tempsocks)) { - if (net_ratelimit()) { - /* Try to help the admin */ - printk(KERN_NOTICE "%s: too many open " - "connections, consider increasing the " - "number of nfsd threads\n", - serv->sv_name); - } - /* - * Always select the oldest connection. It's not fair, - * but so is life - */ - xprt = list_entry(serv->sv_tempsocks.prev, - struct svc_xprt, - xpt_list); - set_bit(XPT_CLOSE, &xprt->xpt_flags); - svc_xprt_get(xprt); - } - spin_unlock_bh(&serv->sv_lock); - - if (xprt) { - svc_xprt_enqueue(xprt); - svc_xprt_put(xprt); - } - } -} - -/* - * Receive the next request on any socket. This code is carefully - * organised not to touch any cachelines in the shared svc_serv - * structure, only cachelines in the local svc_pool. - */ -int -svc_recv(struct svc_rqst *rqstp, long timeout) -{ - struct svc_xprt *xprt = NULL; - struct svc_serv *serv = rqstp->rq_server; - struct svc_pool *pool = rqstp->rq_pool; - int len, i; - int pages; - struct xdr_buf *arg; - DECLARE_WAITQUEUE(wait, current); - - dprintk("svc: server %p waiting for data (to = %ld)\n", - rqstp, timeout); - - if (rqstp->rq_xprt) - printk(KERN_ERR - "svc_recv: service %p, transport not NULL!\n", - rqstp); - if (waitqueue_active(&rqstp->rq_wait)) - printk(KERN_ERR - "svc_recv: service %p, wait queue active!\n", - rqstp); - - - /* now allocate needed pages. 
If we get a failure, sleep briefly */ - pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE; - for (i=0; i < pages ; i++) - while (rqstp->rq_pages[i] == NULL) { - struct page *p = alloc_page(GFP_KERNEL); - if (!p) - schedule_timeout_uninterruptible(msecs_to_jiffies(500)); - rqstp->rq_pages[i] = p; - } - rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */ - BUG_ON(pages >= RPCSVC_MAXPAGES); - - /* Make arg->head point to first page and arg->pages point to rest */ - arg = &rqstp->rq_arg; - arg->head[0].iov_base = page_address(rqstp->rq_pages[0]); - arg->head[0].iov_len = PAGE_SIZE; - arg->pages = rqstp->rq_pages + 1; - arg->page_base = 0; - /* save at least one page for response */ - arg->page_len = (pages-2)*PAGE_SIZE; - arg->len = (pages-1)*PAGE_SIZE; - arg->tail[0].iov_len = 0; - - try_to_freeze(); - cond_resched(); - if (signalled()) - return -EINTR; - - spin_lock_bh(&pool->sp_lock); - xprt = svc_xprt_dequeue(pool); - if (xprt) { - rqstp->rq_xprt = xprt; - svc_xprt_get(xprt); - rqstp->rq_reserved = serv->sv_max_mesg; - atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); - } else { - /* No data pending. Go to sleep */ - svc_thread_enqueue(pool, rqstp); - - /* - * We have to be able to interrupt this wait - * to bring down the daemons ... - */ - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&rqstp->rq_wait, &wait); - spin_unlock_bh(&pool->sp_lock); - - schedule_timeout(timeout); - - try_to_freeze(); - - spin_lock_bh(&pool->sp_lock); - remove_wait_queue(&rqstp->rq_wait, &wait); - - xprt = rqstp->rq_xprt; - if (!xprt) { - svc_thread_dequeue(pool, rqstp); - spin_unlock_bh(&pool->sp_lock); - dprintk("svc: server %p, no data yet\n", rqstp); - return signalled()? -EINTR : -EAGAIN; - } - } - spin_unlock_bh(&pool->sp_lock); - - len = 0; - if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) { - dprintk("svc_recv: found XPT_CLOSE\n"); - svc_delete_xprt(xprt); - } else if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) { - struct svc_xprt *newxpt; - newxpt = xprt->xpt_ops->xpo_accept(xprt); - if (newxpt) { - /* - * We know this module_get will succeed because the - * listener holds a reference too - */ - __module_get(newxpt->xpt_class->xcl_owner); - svc_check_conn_limits(xprt->xpt_server); - spin_lock_bh(&serv->sv_lock); - set_bit(XPT_TEMP, &newxpt->xpt_flags); - list_add(&newxpt->xpt_list, &serv->sv_tempsocks); - serv->sv_tmpcnt++; - if (serv->sv_temptimer.function == NULL) { - /* setup timer to age temp sockets */ - setup_timer(&serv->sv_temptimer, - svc_age_temp_xprts, - (unsigned long)serv); - mod_timer(&serv->sv_temptimer, - jiffies + svc_conn_age_period * HZ); - } - spin_unlock_bh(&serv->sv_lock); - svc_xprt_received(newxpt); - } - svc_xprt_received(xprt); - } else { - dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", - rqstp, pool->sp_id, xprt, - atomic_read(&xprt->xpt_ref.refcount)); - rqstp->rq_deferred = svc_deferred_dequeue(xprt); - if (rqstp->rq_deferred) { - svc_xprt_received(xprt); - len = svc_deferred_recv(rqstp); - } else - len = xprt->xpt_ops->xpo_recvfrom(rqstp); - dprintk("svc: got len=%d\n", len); - } - - /* No data, incomplete (TCP) read, or accept() */ - if (len == 0 || len == -EAGAIN) { - rqstp->rq_res.len = 0; - svc_xprt_release(rqstp); - return -EAGAIN; - } - clear_bit(XPT_OLD, &xprt->xpt_flags); - - rqstp->rq_secure = svc_port_is_privileged(svc_addr(rqstp)); - rqstp->rq_chandle.defer = svc_defer; - - if (serv->sv_stats) - serv->sv_stats->netcnt++; - return len; -} - -/* - * Drop request - */ -void -svc_drop(struct svc_rqst *rqstp) -{ - 
dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt); - svc_xprt_release(rqstp); -} - -/* - * Return reply to client. - */ -int -svc_send(struct svc_rqst *rqstp) -{ - struct svc_xprt *xprt; - int len; - struct xdr_buf *xb; - - xprt = rqstp->rq_xprt; - if (!xprt) - return -EFAULT; - - /* release the receive skb before sending the reply */ - rqstp->rq_xprt->xpt_ops->xpo_release_rqst(rqstp); - - /* calculate over-all length */ - xb = & rqstp->rq_res; - xb->len = xb->head[0].iov_len + - xb->page_len + - xb->tail[0].iov_len; - - /* Grab mutex to serialize outgoing data. */ - mutex_lock(&xprt->xpt_mutex); - if (test_bit(XPT_DEAD, &xprt->xpt_flags)) - len = -ENOTCONN; - else - len = xprt->xpt_ops->xpo_sendto(rqstp); - mutex_unlock(&xprt->xpt_mutex); - svc_xprt_release(rqstp); - - if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN) - return 0; - return len; -} - -/* - * Timer function to close old temporary sockets, using - * a mark-and-sweep algorithm. - */ -static void svc_age_temp_xprts(unsigned long closure) -{ - struct svc_serv *serv = (struct svc_serv *)closure; - struct svc_xprt *xprt; - struct list_head *le, *next; - LIST_HEAD(to_be_aged); - - dprintk("svc_age_temp_xprts\n"); - - if (!spin_trylock_bh(&serv->sv_lock)) { - /* busy, try again 1 sec later */ - dprintk("svc_age_temp_xprts: busy\n"); - mod_timer(&serv->sv_temptimer, jiffies + HZ); - return; - } - - list_for_each_safe(le, next, &serv->sv_tempsocks) { - xprt = list_entry(le, struct svc_xprt, xpt_list); - - /* First time through, just mark it OLD. Second time - * through, close it. */ - if (!test_and_set_bit(XPT_OLD, &xprt->xpt_flags)) - continue; - if (atomic_read(&xprt->xpt_ref.refcount) > 1 - || test_bit(XPT_BUSY, &xprt->xpt_flags)) - continue; - svc_xprt_get(xprt); - list_move(le, &to_be_aged); - set_bit(XPT_CLOSE, &xprt->xpt_flags); - set_bit(XPT_DETACHED, &xprt->xpt_flags); - } - spin_unlock_bh(&serv->sv_lock); - - while (!list_empty(&to_be_aged)) { - le = to_be_aged.next; - /* fiddling the xpt_list node is safe 'cos we're XPT_DETACHED */ - list_del_init(le); - xprt = list_entry(le, struct svc_xprt, xpt_list); - - dprintk("queuing xprt %p for closing\n", xprt); - - /* a thread will dequeue and close it soon */ - svc_xprt_enqueue(xprt); - svc_xprt_put(xprt); - } - - mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); -} - /* * Initialize socket for RPC use and create svc_sock struct * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF. @@ -1913,160 +1284,3 @@ static void svc_sock_free(struct svc_xprt *xprt) sock_release(svsk->sk_sock); kfree(svsk); } - -/* - * Remove a dead transport - */ -static void svc_delete_xprt(struct svc_xprt *xprt) -{ - struct svc_serv *serv = xprt->xpt_server; - - dprintk("svc: svc_delete_xprt(%p)\n", xprt); - xprt->xpt_ops->xpo_detach(xprt); - - spin_lock_bh(&serv->sv_lock); - if (!test_and_set_bit(XPT_DETACHED, &xprt->xpt_flags)) - list_del_init(&xprt->xpt_list); - /* - * We used to delete the transport from whichever list - * it's sk_xprt.xpt_ready node was on, but we don't actually - * need to. This is because the only time we're called - * while still attached to a queue, the queue itself - * is about to be destroyed (in svc_destroy). 
- */ - if (!test_and_set_bit(XPT_DEAD, &xprt->xpt_flags)) { - BUG_ON(atomic_read(&xprt->xpt_ref.refcount) < 2); - if (test_bit(XPT_TEMP, &xprt->xpt_flags)) - serv->sv_tmpcnt--; - svc_xprt_put(xprt); - } - spin_unlock_bh(&serv->sv_lock); -} - -static void svc_close_xprt(struct svc_xprt *xprt) -{ - set_bit(XPT_CLOSE, &xprt->xpt_flags); - if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) - /* someone else will have to effect the close */ - return; - - svc_xprt_get(xprt); - svc_delete_xprt(xprt); - clear_bit(XPT_BUSY, &xprt->xpt_flags); - svc_xprt_put(xprt); -} - -void svc_close_all(struct list_head *xprt_list) -{ - struct svc_xprt *xprt; - struct svc_xprt *tmp; - - list_for_each_entry_safe(xprt, tmp, xprt_list, xpt_list) { - set_bit(XPT_CLOSE, &xprt->xpt_flags); - if (test_bit(XPT_BUSY, &xprt->xpt_flags)) { - /* Waiting to be processed, but no threads left, - * So just remove it from the waiting list - */ - list_del_init(&xprt->xpt_ready); - clear_bit(XPT_BUSY, &xprt->xpt_flags); - } - svc_close_xprt(xprt); - } -} - -/* - * Handle defer and revisit of requests - */ - -static void svc_revisit(struct cache_deferred_req *dreq, int too_many) -{ - struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle); - struct svc_xprt *xprt = dr->xprt; - - if (too_many) { - svc_xprt_put(xprt); - kfree(dr); - return; - } - dprintk("revisit queued\n"); - dr->xprt = NULL; - spin_lock(&xprt->xpt_lock); - list_add(&dr->handle.recent, &xprt->xpt_deferred); - spin_unlock(&xprt->xpt_lock); - set_bit(XPT_DEFERRED, &xprt->xpt_flags); - svc_xprt_enqueue(xprt); - svc_xprt_put(xprt); -} - -static struct cache_deferred_req * -svc_defer(struct cache_req *req) -{ - struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle); - int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.len); - struct svc_deferred_req *dr; - - if (rqstp->rq_arg.page_len) - return NULL; /* if more than a page, give up FIXME */ - if (rqstp->rq_deferred) { - dr = rqstp->rq_deferred; - rqstp->rq_deferred = NULL; - } else { - int skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; - /* FIXME maybe discard if size too large */ - dr = kmalloc(size, GFP_KERNEL); - if (dr == NULL) - return NULL; - - dr->handle.owner = rqstp->rq_server; - dr->prot = rqstp->rq_prot; - memcpy(&dr->addr, &rqstp->rq_addr, rqstp->rq_addrlen); - dr->addrlen = rqstp->rq_addrlen; - dr->daddr = rqstp->rq_daddr; - dr->argslen = rqstp->rq_arg.len >> 2; - memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2); - } - svc_xprt_get(rqstp->rq_xprt); - dr->xprt = rqstp->rq_xprt; - - dr->handle.revisit = svc_revisit; - return &dr->handle; -} - -/* - * recv data from a deferred request into an active one - */ -static int svc_deferred_recv(struct svc_rqst *rqstp) -{ - struct svc_deferred_req *dr = rqstp->rq_deferred; - - rqstp->rq_arg.head[0].iov_base = dr->args; - rqstp->rq_arg.head[0].iov_len = dr->argslen<<2; - rqstp->rq_arg.page_len = 0; - rqstp->rq_arg.len = dr->argslen<<2; - rqstp->rq_prot = dr->prot; - memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen); - rqstp->rq_addrlen = dr->addrlen; - rqstp->rq_daddr = dr->daddr; - rqstp->rq_respages = rqstp->rq_pages; - return dr->argslen<<2; -} - - -static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) -{ - struct svc_deferred_req *dr = NULL; - - if (!test_bit(XPT_DEFERRED, &xprt->xpt_flags)) - return NULL; - spin_lock(&xprt->xpt_lock); - clear_bit(XPT_DEFERRED, &xprt->xpt_flags); - if (!list_empty(&xprt->xpt_deferred)) { - dr = list_entry(xprt->xpt_deferred.next, - 
struct svc_deferred_req, - handle.recent); - list_del_init(&dr->handle.recent); - set_bit(XPT_DEFERRED, &xprt->xpt_flags); - } - spin_unlock(&xprt->xpt_lock); - return dr; -} From 260c1d1298f6703d38fdccd3dd5a310766327340 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:08:29 -0600 Subject: [PATCH 082/100] svc: Add transport hdr size for defer/revisit Some transports have a header in front of the RPC header. The current defer/revisit processing considers only the iov_len and arg_len to determine how much to back up when saving the original request to revisit. Add a field to the rqstp structure to save the size of the transport header so svc_defer can correctly compute the start of a request. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc.h | 2 ++ net/sunrpc/svc_xprt.c | 36 +++++++++++++++++++++++++++--------- net/sunrpc/svcsock.c | 2 ++ 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 04eb20e2c344..742ab461d842 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -217,6 +217,7 @@ struct svc_rqst { void * rq_xprt_ctxt; /* transport specific context ptr */ struct svc_deferred_req*rq_deferred; /* deferred request we are replaying */ + size_t rq_xprt_hlen; /* xprt header len */ struct xdr_buf rq_arg; struct xdr_buf rq_res; struct page * rq_pages[RPCSVC_MAXPAGES]; @@ -322,6 +323,7 @@ struct svc_deferred_req { size_t addrlen; union svc_addr_u daddr; /* where reply must come from */ struct cache_deferred_req handle; + size_t xprt_hlen; int argslen; __be32 args[0]; }; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 23165aef59d9..000c7dc3b82c 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include @@ -859,10 +858,18 @@ static void svc_revisit(struct cache_deferred_req *dreq, int too_many) svc_xprt_put(xprt); } +/* + * Save the request off for later processing. The request buffer looks + * like this: + * + * <xprt-header><rpc-header><rpc-pagelist><rpc-tail> + * + * This code can only handle requests that consist of an xprt-header + * and rpc-header.
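+ * Requests carrying page data are never deferred in the first place + * (svc_defer returns NULL for them), so the head iovec is all that + * ever needs to be saved and replayed.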
+ */ static struct cache_deferred_req *svc_defer(struct cache_req *req) { struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle); - int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.len); struct svc_deferred_req *dr; if (rqstp->rq_arg.page_len) @@ -871,8 +878,10 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req) dr = rqstp->rq_deferred; rqstp->rq_deferred = NULL; } else { - int skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; + size_t skip; + size_t size; /* FIXME maybe discard if size too large */ + size = sizeof(struct svc_deferred_req) + rqstp->rq_arg.len; dr = kmalloc(size, GFP_KERNEL); if (dr == NULL) return NULL; @@ -883,8 +892,12 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req) dr->addrlen = rqstp->rq_addrlen; dr->daddr = rqstp->rq_daddr; dr->argslen = rqstp->rq_arg.len >> 2; - memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, - dr->argslen<<2); + dr->xprt_hlen = rqstp->rq_xprt_hlen; + + /* back up head to the start of the buffer and copy */ + skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; + memcpy(dr->args, rqstp->rq_arg.head[0].iov_base - skip, + dr->argslen << 2); } svc_xprt_get(rqstp->rq_xprt); dr->xprt = rqstp->rq_xprt; @@ -900,16 +913,21 @@ static int svc_deferred_recv(struct svc_rqst *rqstp) { struct svc_deferred_req *dr = rqstp->rq_deferred; - rqstp->rq_arg.head[0].iov_base = dr->args; - rqstp->rq_arg.head[0].iov_len = dr->argslen<<2; + /* setup iov_base past transport header */ + rqstp->rq_arg.head[0].iov_base = dr->args + (dr->xprt_hlen>>2); + /* The iov_len does not include the transport header bytes */ + rqstp->rq_arg.head[0].iov_len = (dr->argslen<<2) - dr->xprt_hlen; rqstp->rq_arg.page_len = 0; - rqstp->rq_arg.len = dr->argslen<<2; + /* The rq_arg.len includes the transport header bytes */ + rqstp->rq_arg.len = dr->argslen<<2; rqstp->rq_prot = dr->prot; memcpy(&rqstp->rq_addr, &dr->addr, dr->addrlen); rqstp->rq_addrlen = dr->addrlen; + /* Save off transport header len in case we get deferred again */ + rqstp->rq_xprt_hlen = dr->xprt_hlen; rqstp->rq_daddr = dr->daddr; rqstp->rq_respages = rqstp->rq_pages; - return dr->argslen<<2; + return (dr->argslen<<2) - dr->xprt_hlen; } diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 343a85b700f0..1d3e5fcc2cc4 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -315,6 +315,8 @@ static int svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, }; int len; + rqstp->rq_xprt_hlen = 0; + len = kernel_recvmsg(svsk->sk_sock, &msg, iov, nr, buflen, msg.msg_flags); From dc9a16e49dbba3dd042e6aec5d9a7929e099a89b Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:08:31 -0600 Subject: [PATCH 083/100] svc: Add /proc/sys/sunrpc/transport files Add a file that when read lists the set of registered svc transports. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc_xprt.h | 1 + net/sunrpc/svc_xprt.c | 28 ++++++++++++++++++++++++++++ net/sunrpc/sysctl.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 60 insertions(+) diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 405281e745d1..01e71b7a2e20 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -79,6 +79,7 @@ void svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt); void svc_close_xprt(struct svc_xprt *xprt); void svc_delete_xprt(struct svc_xprt *xprt); int svc_port_is_privileged(struct sockaddr *sin); +int svc_print_xprts(char *buf, int maxlen); static inline void svc_xprt_get(struct svc_xprt *xprt) { diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 000c7dc3b82c..2e5b92ae24ed 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -112,6 +112,34 @@ void svc_unreg_xprt_class(struct svc_xprt_class *xcl) } EXPORT_SYMBOL_GPL(svc_unreg_xprt_class); +/* + * Format the transport list for printing + */ +int svc_print_xprts(char *buf, int maxlen) +{ + struct list_head *le; + char tmpstr[80]; + int len = 0; + buf[0] = '\0'; + + spin_lock(&svc_xprt_class_lock); + list_for_each(le, &svc_xprt_class_list) { + int slen; + struct svc_xprt_class *xcl = + list_entry(le, struct svc_xprt_class, xcl_list); + + sprintf(tmpstr, "%s %d\n", xcl->xcl_name, xcl->xcl_max_payload); + slen = strlen(tmpstr); + if (len + slen > maxlen) + break; + len += slen; + strcat(buf, tmpstr); + } + spin_unlock(&svc_xprt_class_lock); + + return len; +} + static void svc_xprt_free(struct kref *kref) { struct svc_xprt *xprt = diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index bada7de0c2fc..0f8c439b848a 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -18,6 +18,7 @@ #include #include #include +#include /* * Declare the debug flags here @@ -55,6 +56,30 @@ rpc_unregister_sysctl(void) } } +static int proc_do_xprt(ctl_table *table, int write, struct file *file, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char tmpbuf[256]; + int len; + if ((*ppos && !write) || !*lenp) { + *lenp = 0; + return 0; + } + if (write) + return -EINVAL; + else { + len = svc_print_xprts(tmpbuf, sizeof(tmpbuf)); + if (!access_ok(VERIFY_WRITE, buffer, len)) + return -EFAULT; + + if (__copy_to_user(buffer, tmpbuf, len)) + return -EFAULT; + } + *lenp -= len; + *ppos += len; + return 0; +} + static int proc_dodebug(ctl_table *table, int write, struct file *file, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -147,6 +172,12 @@ static ctl_table debug_table[] = { .mode = 0644, .proc_handler = &proc_dodebug }, + { + .procname = "transports", + .maxlen = 256, + .mode = 0444, + .proc_handler = &proc_do_xprt, + }, { .ctl_name = 0 } }; From 7fcb98d58cb4d18af6386f71025fc5192f25fbca Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:08:33 -0600 Subject: [PATCH 084/100] svc: Add svc API that queries for a transport instance Add a new svc function that allows a service to query whether a transport instance has already been created. This is used in lockd to determine whether or not a transport needs to be created when a lockd instance is brought up. Specifying 0 for the address family or port is effectively a wild-card, and will result in matching the first transport in the service's list that has a matching class name. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. 
Bruce Fields --- fs/lockd/svc.c | 16 ++------------- include/linux/sunrpc/svc_xprt.h | 1 + net/sunrpc/svc_xprt.c | 35 +++++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 14 deletions(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index a8e79a907202..470af0138bb5 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -219,18 +219,6 @@ lockd(struct svc_rqst *rqstp) module_put_and_exit(0); } -static int find_xprt(struct svc_serv *serv, char *proto) -{ - struct svc_xprt *xprt; - int found = 0; - list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) - if (strcmp(xprt->xpt_class->xcl_name, proto) == 0) { - found = 1; - break; - } - return found; -} - /* * Make any sockets that are needed but not present. * If nlm_udpport or nlm_tcpport were set as module @@ -242,11 +230,11 @@ static int make_socks(struct svc_serv *serv, int proto) int err = 0; if (proto == IPPROTO_UDP || nlm_udpport) - if (!find_xprt(serv, "udp")) + if (!svc_find_xprt(serv, "udp", 0, 0)) err = svc_create_xprt(serv, "udp", nlm_udpport, SVC_SOCK_DEFAULTS); if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) - if (!find_xprt(serv, "tcp")) + if (!svc_find_xprt(serv, "tcp", 0, 0)) err = svc_create_xprt(serv, "tcp", nlm_tcpport, SVC_SOCK_DEFAULTS); diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 01e71b7a2e20..68862d51bb36 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -80,6 +80,7 @@ void svc_close_xprt(struct svc_xprt *xprt); void svc_delete_xprt(struct svc_xprt *xprt); int svc_port_is_privileged(struct sockaddr *sin); int svc_print_xprts(char *buf, int maxlen); +struct svc_xprt *svc_find_xprt(struct svc_serv *, char *, int, int); static inline void svc_xprt_get(struct svc_xprt *xprt) { diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 2e5b92ae24ed..512c10fc1a9f 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -977,3 +977,38 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt) spin_unlock(&xprt->xpt_lock); return dr; } + +/* + * Return the transport instance pointer for the endpoint accepting + * connections/peer traffic from the specified transport class, + * address family and port. + * + * Specifying 0 for the address family or port is effectively a + * wild-card, and will result in matching the first transport in the + * service's list that has a matching class name. + */ +struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name, + int af, int port) +{ + struct svc_xprt *xprt; + struct svc_xprt *found = NULL; + + /* Sanity check the args */ + if (!serv || !xcl_name) + return found; + + spin_lock_bh(&serv->sv_lock); + list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) { + if (strcmp(xprt->xpt_class->xcl_name, xcl_name)) + continue; + if (af != AF_UNSPEC && af != xprt->xpt_local.ss_family) + continue; + if (port && port != svc_xprt_local_port(xprt)) + continue; + found = xprt; + break; + } + spin_unlock_bh(&serv->sv_lock); + return found; +} +EXPORT_SYMBOL_GPL(svc_find_xprt); From a217813f9067b785241cb7f31956e51d2071703a Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:08:35 -0600 Subject: [PATCH 085/100] knfsd: Support adding transports by writing portlist file Update the write handler for the portlist file to allow creating new listening endpoints on a transport. 
The general form of the string is: <transport_name> <port number> For example: echo "tcp 2049" > /proc/fs/nfsd/portlist This is intended to support the creation of a listening endpoint for RDMA transports without adding #ifdef code to the nfssvc.c file. Transports can also be removed as follows: '-'<transport_name> <port number> For example: echo "-tcp 2049" > /proc/fs/nfsd/portlist Attempting to add a listener with an invalid transport string results in EPROTONOSUPPORT and a perror string of "Protocol not supported". Attempting to remove a non-existent listener (e.g. bad proto or port) results in ENOTCONN and a perror string of "Transport endpoint is not connected". Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. Bruce Fields --- fs/lockd/svc.c | 18 +++++++++++----- fs/nfsd/nfsctl.c | 49 ++++++++++++++++++++++++++++++++++++++++++- net/sunrpc/svc_xprt.c | 2 ++ 3 files changed, 63 insertions(+), 6 deletions(-) diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 470af0138bb5..08226464e563 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -227,17 +227,25 @@ lockd(struct svc_rqst *rqstp) static int make_socks(struct svc_serv *serv, int proto) { static int warned; + struct svc_xprt *xprt; int err = 0; - if (proto == IPPROTO_UDP || nlm_udpport) - if (!svc_find_xprt(serv, "udp", 0, 0)) + if (proto == IPPROTO_UDP || nlm_udpport) { + xprt = svc_find_xprt(serv, "udp", 0, 0); + if (!xprt) err = svc_create_xprt(serv, "udp", nlm_udpport, SVC_SOCK_DEFAULTS); - if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) - if (!svc_find_xprt(serv, "tcp", 0, 0)) + else + svc_xprt_put(xprt); + } + if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) { + xprt = svc_find_xprt(serv, "tcp", 0, 0); + if (!xprt) err = svc_create_xprt(serv, "tcp", nlm_tcpport, SVC_SOCK_DEFAULTS); - + else + svc_xprt_put(xprt); + } if (err >= 0) { warned = 0; err = 0; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 4aba92698581..eff6a6b4c2f6 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -540,7 +540,7 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size) } return err < 0 ? err : 0; } - if (buf[0] == '-') { + if (buf[0] == '-' && isdigit(buf[1])) { char *toclose = kstrdup(buf+1, GFP_KERNEL); int len = 0; if (!toclose) @@ -554,6 +554,53 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size) kfree(toclose); return len; } + /* + * Add a transport listener by writing its transport name + */ + if (isalpha(buf[0])) { + int err; + char transport[16]; + int port; + if (sscanf(buf, "%15s %4d", transport, &port) == 2) { + err = nfsd_create_serv(); + if (!err) { + err = svc_create_xprt(nfsd_serv, + transport, port, + SVC_SOCK_ANONYMOUS); + if (err == -ENOENT) + /* Give a reasonable perror msg for + * bad transport string */ + err = -EPROTONOSUPPORT; + } + return err < 0 ? err : 0; + } + } + /* + * Remove a transport by writing its transport name and port number + */ + if (buf[0] == '-' && isalpha(buf[1])) { + struct svc_xprt *xprt; + int err = -EINVAL; + char transport[16]; + int port; + if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) { + if (port == 0) + return -EINVAL; + lock_kernel(); + if (nfsd_serv) { + xprt = svc_find_xprt(nfsd_serv, transport, + AF_UNSPEC, port); + if (xprt) { + svc_close_xprt(xprt); + svc_xprt_put(xprt); + err = 0; + } else + err = -ENOTCONN; + } + unlock_kernel(); + return err < 0 ?
err : 0; + } + } return -EINVAL; } diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 512c10fc1a9f..783597343877 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -842,6 +842,7 @@ void svc_close_xprt(struct svc_xprt *xprt) clear_bit(XPT_BUSY, &xprt->xpt_flags); svc_xprt_put(xprt); } +EXPORT_SYMBOL_GPL(svc_close_xprt); void svc_close_all(struct list_head *xprt_list) { @@ -1006,6 +1007,7 @@ struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name, if (port && port != svc_xprt_local_port(xprt)) continue; found = xprt; + svc_xprt_get(xprt); break; } spin_unlock_bh(&serv->sv_lock); From 9571af18fa1e4a431dc6f6023ddbd87d1112fd5d Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Sun, 30 Dec 2007 21:08:37 -0600 Subject: [PATCH 086/100] svc: Add svc_xprt_names service to replace svc_sock_names Create a transport independent version of the svc_sock_names function. The toclose capability of the svc_sock_names service can be implemented using the svc_xprt_find and svc_xprt_close services. Signed-off-by: Tom Tucker Acked-by: Neil Brown Reviewed-by: Chuck Lever Reviewed-by: Greg Banks Signed-off-by: J. Bruce Fields --- fs/nfsd/nfsctl.c | 2 +- include/linux/sunrpc/svc_xprt.h | 1 + net/sunrpc/svc_xprt.c | 35 +++++++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index eff6a6b4c2f6..bc22e0b0343a 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -503,7 +503,7 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size) int len = 0; lock_kernel(); if (nfsd_serv) - len = svc_sock_names(buf, nfsd_serv, NULL); + len = svc_xprt_names(nfsd_serv, buf, 0); unlock_kernel(); return len; } diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h index 68862d51bb36..6fd7b016517f 100644 --- a/include/linux/sunrpc/svc_xprt.h +++ b/include/linux/sunrpc/svc_xprt.h @@ -81,6 +81,7 @@ void svc_delete_xprt(struct svc_xprt *xprt); int svc_port_is_privileged(struct sockaddr *sin); int svc_print_xprts(char *buf, int maxlen); struct svc_xprt *svc_find_xprt(struct svc_serv *, char *, int, int); +int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen); static inline void svc_xprt_get(struct svc_xprt *xprt) { diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 783597343877..c3fb36784f3b 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1014,3 +1014,38 @@ struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name, return found; } EXPORT_SYMBOL_GPL(svc_find_xprt); + +/* + * Format a buffer with a list of the active transports. A zero for + * the buflen parameter disables target buffer overflow checking. 
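+ * + * For example (illustrative output only, assuming tcp and udp listeners on the standard NFS port), the formatted buffer would contain "tcp 2049\nudp 2049\n".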
+ */ +int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen) +{ + struct svc_xprt *xprt; + char xprt_str[64]; + int totlen = 0; + int len; + + /* Sanity check args */ + if (!serv) + return 0; + + spin_lock_bh(&serv->sv_lock); + list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) { + len = snprintf(xprt_str, sizeof(xprt_str), + "%s %d\n", xprt->xpt_class->xcl_name, + svc_xprt_local_port(xprt)); + /* If the string was truncated, replace with error string */ + if (len >= sizeof(xprt_str)) + strcpy(xprt_str, "name-too-long\n"); + /* Don't overflow buffer */ + len = strlen(xprt_str); + if (buflen && (len + totlen >= buflen)) + break; + strcpy(buf+totlen, xprt_str); + totlen += len; + } + spin_unlock_bh(&serv->sv_lock); + return totlen; +} +EXPORT_SYMBOL_GPL(svc_xprt_names); From d21b05f101ae732d9bc322f13eea2f59c0aa60f5 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 12 Dec 2007 16:13:17 -0600 Subject: [PATCH 087/100] rdma: SVCRMDA Header File This file defines the data types used by the SVCRDMA transport module. The principle data structure is the transport specific extension to the svcxprt structure. Signed-off-by: Tom Tucker Acked-by: Neil Brown Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 262 ++++++++++++++++++++++++++++++++ 1 file changed, 262 insertions(+) create mode 100644 include/linux/sunrpc/svc_rdma.h diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h new file mode 100644 index 000000000000..c11bbcc081f9 --- /dev/null +++ b/include/linux/sunrpc/svc_rdma.h @@ -0,0 +1,262 @@ +/* + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Tom Tucker + */ + +#ifndef SVC_RDMA_H +#define SVC_RDMA_H +#include +#include +#include +#include +#include +#define SVCRDMA_DEBUG + +/* RPC/RDMA parameters and stats */ +extern unsigned int svcrdma_ord; +extern unsigned int svcrdma_max_requests; +extern unsigned int svcrdma_max_req_size; + +extern atomic_t rdma_stat_recv; +extern atomic_t rdma_stat_read; +extern atomic_t rdma_stat_write; +extern atomic_t rdma_stat_sq_starve; +extern atomic_t rdma_stat_rq_starve; +extern atomic_t rdma_stat_rq_poll; +extern atomic_t rdma_stat_rq_prod; +extern atomic_t rdma_stat_sq_poll; +extern atomic_t rdma_stat_sq_prod; + +#define RPCRDMA_VERSION 1 + +/* + * Contexts are built when an RDMA request is created and are a + * record of the resources that can be recovered when the request + * completes. + */ +struct svc_rdma_op_ctxt { + struct svc_rdma_op_ctxt *next; + struct xdr_buf arg; + struct list_head dto_q; + enum ib_wr_opcode wr_op; + enum ib_wc_status wc_status; + u32 byte_len; + struct svcxprt_rdma *xprt; + unsigned long flags; + enum dma_data_direction direction; + int count; + struct ib_sge sge[RPCSVC_MAXPAGES]; + struct page *pages[RPCSVC_MAXPAGES]; +}; + +#define RDMACTXT_F_READ_DONE 1 +#define RDMACTXT_F_LAST_CTXT 2 + +struct svcxprt_rdma { + struct svc_xprt sc_xprt; /* SVC transport structure */ + struct rdma_cm_id *sc_cm_id; /* RDMA connection id */ + struct list_head sc_accept_q; /* Conn. waiting accept */ + int sc_ord; /* RDMA read limit */ + wait_queue_head_t sc_read_wait; + int sc_max_sge; + + int sc_sq_depth; /* Depth of SQ */ + atomic_t sc_sq_count; /* Number of SQ WR on queue */ + + int sc_max_requests; /* Depth of RQ */ + int sc_max_req_size; /* Size of each RQ WR buf */ + + struct ib_pd *sc_pd; + + struct svc_rdma_op_ctxt *sc_ctxt_head; + int sc_ctxt_cnt; + int sc_ctxt_bump; + int sc_ctxt_max; + spinlock_t sc_ctxt_lock; + struct list_head sc_rq_dto_q; + spinlock_t sc_rq_dto_lock; + struct ib_qp *sc_qp; + struct ib_cq *sc_rq_cq; + struct ib_cq *sc_sq_cq; + struct ib_mr *sc_phys_mr; /* MR for server memory */ + + spinlock_t sc_lock; /* transport lock */ + + wait_queue_head_t sc_send_wait; /* SQ exhaustion waitlist */ + unsigned long sc_flags; + struct list_head sc_dto_q; /* DTO tasklet I/O pending Q */ + struct list_head sc_read_complete_q; + spinlock_t sc_read_complete_lock; +}; +/* sc_flags */ +#define RDMAXPRT_RQ_PENDING 1 +#define RDMAXPRT_SQ_PENDING 2 +#define RDMAXPRT_CONN_PENDING 3 + +#define RPCRDMA_LISTEN_BACKLOG 10 +/* The default ORD value is based on two outstanding full-size writes with a + * page size of 4k, or 32k * 2 ops / 4k = 16 outstanding RDMA_READ. 
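+ * That is, 2 writes * 32KB, split into 4KB pages, gives 16 concurrent RDMA_READs; the (64/4) expression below evaluates to the same value.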
*/ +#define RPCRDMA_ORD (64/4) +#define RPCRDMA_SQ_DEPTH_MULT 8 +#define RPCRDMA_MAX_THREADS 16 +#define RPCRDMA_MAX_REQUESTS 16 +#define RPCRDMA_MAX_REQ_SIZE 4096 + +/* svc_rdma_marshal.c */ +extern void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *, + int *, int *); +extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *); +extern int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *); +extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *, + struct rpcrdma_msg *, + enum rpcrdma_errcode, u32 *); +extern void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *, int); +extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int); +extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int, + u32, u64, u32); +extern void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *, + struct rpcrdma_msg *, + struct rpcrdma_msg *, + enum rpcrdma_proc); +extern int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *); + +/* svc_rdma_recvfrom.c */ +extern int svc_rdma_recvfrom(struct svc_rqst *); + +/* svc_rdma_sendto.c */ +extern int svc_rdma_sendto(struct svc_rqst *); + +/* svc_rdma_transport.c */ +extern int svc_rdma_send(struct svcxprt_rdma *, struct ib_send_wr *); +extern int svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *, + enum rpcrdma_errcode); +struct page *svc_rdma_get_page(void); +extern int svc_rdma_post_recv(struct svcxprt_rdma *); +extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); +extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *); +extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int); +extern void svc_sq_reap(struct svcxprt_rdma *); +extern void svc_rq_reap(struct svcxprt_rdma *); +extern struct svc_xprt_class svc_rdma_class; +extern void svc_rdma_prep_reply_hdr(struct svc_rqst *); + +/* svc_rdma.c */ +extern int svc_rdma_init(void); +extern void svc_rdma_cleanup(void); + +/* + * Returns the address of the first read chunk, or NULL if no read chunk is + * present + */ +static inline struct rpcrdma_read_chunk * +svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp) +{ + struct rpcrdma_read_chunk *ch = + (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; + + if (ch->rc_discrim == 0) + return NULL; + + return ch; +} + +/* + * Returns the address of the first write array element, or NULL if no + * write array list is present + */ +static inline struct rpcrdma_write_array * +svc_rdma_get_write_array(struct rpcrdma_msg *rmsgp) +{ + if (rmsgp->rm_body.rm_chunks[0] != 0 + || rmsgp->rm_body.rm_chunks[1] == 0) + return NULL; + + return (struct rpcrdma_write_array *)&rmsgp->rm_body.rm_chunks[1]; +} + +/* + * Returns the address of the first reply array element, or NULL if no + * reply array is present + */ +static inline struct rpcrdma_write_array * +svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp) +{ + struct rpcrdma_read_chunk *rch; + struct rpcrdma_write_array *wr_ary; + struct rpcrdma_write_array *rp_ary; + + /* XXX: Need to fix when reply list may occur with read-list and/or + * write list */ + if (rmsgp->rm_body.rm_chunks[0] != 0 || + rmsgp->rm_body.rm_chunks[1] != 0) + return NULL; + + rch = svc_rdma_get_read_chunk(rmsgp); + if (rch) { + while (rch->rc_discrim) + rch++; + + /* The reply list follows an empty write array located + * at 'rc_position' here. The reply array is at rc_target.
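+ * (Advancing rch until rc_discrim is zero leaves it on the read-list terminator; the reply array begins at that terminating entry's rc_target field.)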
+ */ + rp_ary = (struct rpcrdma_write_array *)&rch->rc_target; + + goto found_it; + } + + wr_ary = svc_rdma_get_write_array(rmsgp); + if (wr_ary) { + rp_ary = (struct rpcrdma_write_array *) + &wr_ary-> + wc_array[wr_ary->wc_nchunks].wc_target.rs_length; + + goto found_it; + } + + /* No read list, no write list */ + rp_ary = (struct rpcrdma_write_array *) + &rmsgp->rm_body.rm_chunks[2]; + + found_it: + if (rp_ary->wc_discrim == 0) + return NULL; + + return rp_ary; +} +#endif From ef7fbf59e6f780a0fa03536839e3c42e9ce40ad1 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 12 Dec 2007 16:13:19 -0600 Subject: [PATCH 088/100] rdma: SVCRDMA Transport Module This file implements the RDMA transport module initialization and termination logic and registers the transport sysctl variables. Signed-off-by: Tom Tucker Acked-by: Neil Brown Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma.c | 266 +++++++++++++++++++++++++++++++++ 1 file changed, 266 insertions(+) create mode 100644 net/sunrpc/xprtrdma/svc_rdma.c diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c new file mode 100644 index 000000000000..88c0ca20bb1e --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * Author: Tom Tucker + */ +#include +#include +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +/* RPC/RDMA parameters */ +unsigned int svcrdma_ord = RPCRDMA_ORD; +static unsigned int min_ord = 1; +static unsigned int max_ord = 4096; +unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS; +static unsigned int min_max_requests = 4; +static unsigned int max_max_requests = 16384; +unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE; +static unsigned int min_max_inline = 4096; +static unsigned int max_max_inline = 65536; + +atomic_t rdma_stat_recv; +atomic_t rdma_stat_read; +atomic_t rdma_stat_write; +atomic_t rdma_stat_sq_starve; +atomic_t rdma_stat_rq_starve; +atomic_t rdma_stat_rq_poll; +atomic_t rdma_stat_rq_prod; +atomic_t rdma_stat_sq_poll; +atomic_t rdma_stat_sq_prod; + +/* + * This function implements reading and resetting an atomic_t stat + * variable through read/write to a proc file. Any write to the file + * resets the associated statistic to zero. Any read returns it's + * current value. + */ +static int read_reset_stat(ctl_table *table, int write, + struct file *filp, void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + atomic_t *stat = (atomic_t *)table->data; + + if (!stat) + return -EINVAL; + + if (write) + atomic_set(stat, 0); + else { + char str_buf[32]; + char *data; + int len = snprintf(str_buf, 32, "%d\n", atomic_read(stat)); + if (len >= 32) + return -EFAULT; + len = strlen(str_buf); + if (*ppos > len) { + *lenp = 0; + return 0; + } + data = &str_buf[*ppos]; + len -= *ppos; + if (len > *lenp) + len = *lenp; + if (len && copy_to_user(buffer, str_buf, len)) + return -EFAULT; + *lenp = len; + *ppos += len; + } + return 0; +} + +static struct ctl_table_header *svcrdma_table_header; +static ctl_table svcrdma_parm_table[] = { + { + .procname = "max_requests", + .data = &svcrdma_max_requests, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_max_requests, + .extra2 = &max_max_requests + }, + { + .procname = "max_req_size", + .data = &svcrdma_max_req_size, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_max_inline, + .extra2 = &max_max_inline + }, + { + .procname = "max_outbound_read_requests", + .data = &svcrdma_ord, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_ord, + .extra2 = &max_ord, + }, + + { + .procname = "rdma_stat_read", + .data = &rdma_stat_read, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = &read_reset_stat, + }, + { + .procname = "rdma_stat_recv", + .data = &rdma_stat_recv, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = &read_reset_stat, + }, + { + .procname = "rdma_stat_write", + .data = &rdma_stat_write, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = &read_reset_stat, + }, + { + .procname = "rdma_stat_sq_starve", + .data = &rdma_stat_sq_starve, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = &read_reset_stat, + }, + { + .procname = "rdma_stat_rq_starve", + .data = &rdma_stat_rq_starve, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = &read_reset_stat, + }, + { + .procname = "rdma_stat_rq_poll", + .data = &rdma_stat_rq_poll, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = &read_reset_stat, + }, + { + .procname = "rdma_stat_rq_prod", + .data 
= &rdma_stat_rq_prod, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = &read_reset_stat, + }, + { + .procname = "rdma_stat_sq_poll", + .data = &rdma_stat_sq_poll, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = &read_reset_stat, + }, + { + .procname = "rdma_stat_sq_prod", + .data = &rdma_stat_sq_prod, + .maxlen = sizeof(atomic_t), + .mode = 0644, + .proc_handler = &read_reset_stat, + }, + { + .ctl_name = 0, + }, +}; + +static ctl_table svcrdma_table[] = { + { + .procname = "svc_rdma", + .mode = 0555, + .child = svcrdma_parm_table + }, + { + .ctl_name = 0, + }, +}; + +static ctl_table svcrdma_root_table[] = { + { + .ctl_name = CTL_SUNRPC, + .procname = "sunrpc", + .mode = 0555, + .child = svcrdma_table + }, + { + .ctl_name = 0, + }, +}; + +void svc_rdma_cleanup(void) +{ + dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n"); + if (svcrdma_table_header) { + unregister_sysctl_table(svcrdma_table_header); + svcrdma_table_header = NULL; + } + svc_unreg_xprt_class(&svc_rdma_class); +} + +int svc_rdma_init(void) +{ + dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); + dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); + dprintk("\tmax_requests : %d\n", svcrdma_max_requests); + dprintk("\tsq_depth : %d\n", + svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT); + dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); + if (!svcrdma_table_header) + svcrdma_table_header = + register_sysctl_table(svcrdma_root_table); + + /* Register RDMA with the SVC transport switch */ + svc_reg_xprt_class(&svc_rdma_class); + return 0; +} +MODULE_AUTHOR("Tom Tucker "); +MODULE_DESCRIPTION("SVC RDMA Transport"); +MODULE_LICENSE("Dual BSD/GPL"); +module_init(svc_rdma_init); +module_exit(svc_rdma_cleanup); From 377f9b2f4529e0ac702fd7b91e216afd0adc959e Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 12 Dec 2007 16:13:21 -0600 Subject: [PATCH 089/100] rdma: SVCRDMA Core Transport Services This file implements the core transport data management and I/O path. The I/O path for RDMA involves receiving callbacks on interrupt context. Since all the svc transport locks are _bh locks we enqueue the transport on a list, schedule a tasklet to dequeue data indications from the RDMA completion queue. The tasklet in turn takes _bh locks to enqueue receive data indications on a list for the transport. The svc_rdma_recvfrom transport function dequeues data from this list in an NFSD thread context. Signed-off-by: Tom Tucker Acked-by: Neil Brown Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 1080 ++++++++++++++++++++++ 1 file changed, 1080 insertions(+) create mode 100644 net/sunrpc/xprtrdma/svc_rdma_transport.c diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c new file mode 100644 index 000000000000..f09444c451bc --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -0,0 +1,1080 @@ +/* + * Copyright (c) 2005-2007 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Author: Tom Tucker + */ + +#include +#include +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, + struct sockaddr *sa, int salen, + int flags); +static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt); +static void svc_rdma_release_rqst(struct svc_rqst *); +static void rdma_destroy_xprt(struct svcxprt_rdma *xprt); +static void dto_tasklet_func(unsigned long data); +static void svc_rdma_detach(struct svc_xprt *xprt); +static void svc_rdma_free(struct svc_xprt *xprt); +static int svc_rdma_has_wspace(struct svc_xprt *xprt); +static void rq_cq_reap(struct svcxprt_rdma *xprt); +static void sq_cq_reap(struct svcxprt_rdma *xprt); + +DECLARE_TASKLET(dto_tasklet, dto_tasklet_func, 0UL); +static DEFINE_SPINLOCK(dto_lock); +static LIST_HEAD(dto_xprt_q); + +static struct svc_xprt_ops svc_rdma_ops = { + .xpo_create = svc_rdma_create, + .xpo_recvfrom = svc_rdma_recvfrom, + .xpo_sendto = svc_rdma_sendto, + .xpo_release_rqst = svc_rdma_release_rqst, + .xpo_detach = svc_rdma_detach, + .xpo_free = svc_rdma_free, + .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr, + .xpo_has_wspace = svc_rdma_has_wspace, + .xpo_accept = svc_rdma_accept, +}; + +struct svc_xprt_class svc_rdma_class = { + .xcl_name = "rdma", + .xcl_owner = THIS_MODULE, + .xcl_ops = &svc_rdma_ops, + .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP, +}; + +static int rdma_bump_context_cache(struct svcxprt_rdma *xprt) +{ + int target; + int at_least_one = 0; + struct svc_rdma_op_ctxt *ctxt; + + target = min(xprt->sc_ctxt_cnt + xprt->sc_ctxt_bump, + xprt->sc_ctxt_max); + + spin_lock_bh(&xprt->sc_ctxt_lock); + while (xprt->sc_ctxt_cnt < target) { + xprt->sc_ctxt_cnt++; + spin_unlock_bh(&xprt->sc_ctxt_lock); + + ctxt = 
kmalloc(sizeof(*ctxt), GFP_KERNEL); + + spin_lock_bh(&xprt->sc_ctxt_lock); + if (ctxt) { + at_least_one = 1; + ctxt->next = xprt->sc_ctxt_head; + xprt->sc_ctxt_head = ctxt; + } else { + /* kmalloc failed...give up for now */ + xprt->sc_ctxt_cnt--; + break; + } + } + spin_unlock_bh(&xprt->sc_ctxt_lock); + dprintk("svcrdma: sc_ctxt_max=%d, sc_ctxt_cnt=%d\n", + xprt->sc_ctxt_max, xprt->sc_ctxt_cnt); + return at_least_one; +} + +struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) +{ + struct svc_rdma_op_ctxt *ctxt; + + while (1) { + spin_lock_bh(&xprt->sc_ctxt_lock); + if (unlikely(xprt->sc_ctxt_head == NULL)) { + /* Try to bump my cache. */ + spin_unlock_bh(&xprt->sc_ctxt_lock); + + if (rdma_bump_context_cache(xprt)) + continue; + + printk(KERN_INFO "svcrdma: sleeping waiting for " + "context memory on xprt=%p\n", + xprt); + schedule_timeout_uninterruptible(msecs_to_jiffies(500)); + continue; + } + ctxt = xprt->sc_ctxt_head; + xprt->sc_ctxt_head = ctxt->next; + spin_unlock_bh(&xprt->sc_ctxt_lock); + ctxt->xprt = xprt; + INIT_LIST_HEAD(&ctxt->dto_q); + ctxt->count = 0; + break; + } + return ctxt; +} + +void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) +{ + struct svcxprt_rdma *xprt; + int i; + + BUG_ON(!ctxt); + xprt = ctxt->xprt; + if (free_pages) + for (i = 0; i < ctxt->count; i++) + put_page(ctxt->pages[i]); + + for (i = 0; i < ctxt->count; i++) + dma_unmap_single(xprt->sc_cm_id->device->dma_device, + ctxt->sge[i].addr, + ctxt->sge[i].length, + ctxt->direction); + spin_lock_bh(&xprt->sc_ctxt_lock); + ctxt->next = xprt->sc_ctxt_head; + xprt->sc_ctxt_head = ctxt; + spin_unlock_bh(&xprt->sc_ctxt_lock); +} + +/* ib_cq event handler */ +static void cq_event_handler(struct ib_event *event, void *context) +{ + struct svc_xprt *xprt = context; + dprintk("svcrdma: received CQ event id=%d, context=%p\n", + event->event, context); + set_bit(XPT_CLOSE, &xprt->xpt_flags); +} + +/* QP event handler */ +static void qp_event_handler(struct ib_event *event, void *context) +{ + struct svc_xprt *xprt = context; + + switch (event->event) { + /* These are considered benign events */ + case IB_EVENT_PATH_MIG: + case IB_EVENT_COMM_EST: + case IB_EVENT_SQ_DRAINED: + case IB_EVENT_QP_LAST_WQE_REACHED: + dprintk("svcrdma: QP event %d received for QP=%p\n", + event->event, event->element.qp); + break; + /* These are considered fatal events */ + case IB_EVENT_PATH_MIG_ERR: + case IB_EVENT_QP_FATAL: + case IB_EVENT_QP_REQ_ERR: + case IB_EVENT_QP_ACCESS_ERR: + case IB_EVENT_DEVICE_FATAL: + default: + dprintk("svcrdma: QP ERROR event %d received for QP=%p, " + "closing transport\n", + event->event, event->element.qp); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + break; + } +} + +/* + * Data Transfer Operation Tasklet + * + * Walks a list of transports with I/O pending, removing entries as + * they are added to the server's I/O pending list. Two bits indicate + * if SQ, RQ, or both have I/O pending. The dto_lock is an irqsave + * spinlock that serializes access to the transport list with the RQ + * and SQ interrupt handlers. 
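+ * + * The RQ and SQ completion handlers run in hard-irq context and may therefore take only the irqsave dto_lock; the tasklet runs in softirq context, where taking the _bh transport locks is safe.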
+ */ +static void dto_tasklet_func(unsigned long data) +{ + struct svcxprt_rdma *xprt; + unsigned long flags; + + spin_lock_irqsave(&dto_lock, flags); + while (!list_empty(&dto_xprt_q)) { + xprt = list_entry(dto_xprt_q.next, + struct svcxprt_rdma, sc_dto_q); + list_del_init(&xprt->sc_dto_q); + spin_unlock_irqrestore(&dto_lock, flags); + + if (test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags)) { + ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP); + rq_cq_reap(xprt); + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); + /* + * If data arrived before established event, + * don't enqueue. This defers RPC I/O until the + * RDMA connection is complete. + */ + if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags)) + svc_xprt_enqueue(&xprt->sc_xprt); + } + + if (test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) { + ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP); + sq_cq_reap(xprt); + } + + spin_lock_irqsave(&dto_lock, flags); + } + spin_unlock_irqrestore(&dto_lock, flags); +} + +/* + * Receive Queue Completion Handler + * + * Since an RQ completion handler is called on interrupt context, we + * need to defer the handling of the I/O to a tasklet + */ +static void rq_comp_handler(struct ib_cq *cq, void *cq_context) +{ + struct svcxprt_rdma *xprt = cq_context; + unsigned long flags; + + /* + * Set the bit regardless of whether or not it's on the list + * because it may be on the list already due to an SQ + * completion. + */ + set_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags); + + /* + * If this transport is not already on the DTO transport queue, + * add it + */ + spin_lock_irqsave(&dto_lock, flags); + if (list_empty(&xprt->sc_dto_q)) + list_add_tail(&xprt->sc_dto_q, &dto_xprt_q); + spin_unlock_irqrestore(&dto_lock, flags); + + /* Tasklet does all the work to avoid irqsave locks. */ + tasklet_schedule(&dto_tasklet); +} + +/* + * rq_cq_reap - Process the RQ CQ. + * + * Take all completing WC off the CQE and enqueue the associated DTO + * context on the dto_q for the transport. + */ +static void rq_cq_reap(struct svcxprt_rdma *xprt) +{ + int ret; + struct ib_wc wc; + struct svc_rdma_op_ctxt *ctxt = NULL; + + atomic_inc(&rdma_stat_rq_poll); + + spin_lock_bh(&xprt->sc_rq_dto_lock); + while ((ret = ib_poll_cq(xprt->sc_rq_cq, 1, &wc)) > 0) { + ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; + ctxt->wc_status = wc.status; + ctxt->byte_len = wc.byte_len; + if (wc.status != IB_WC_SUCCESS) { + /* Close the transport */ + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + svc_rdma_put_context(ctxt, 1); + continue; + } + list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q); + } + spin_unlock_bh(&xprt->sc_rq_dto_lock); + + if (ctxt) + atomic_inc(&rdma_stat_rq_prod); +} + +/* + * Send Queue Completion Handler - potentially called on interrupt context. 
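+ * + * Reaps completed work requests off the CQ, decrements the outstanding SQ WR count, and wakes any thread sleeping in svc_rdma_send() waiting for SQ space.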
+ */ +static void sq_cq_reap(struct svcxprt_rdma *xprt) +{ + struct svc_rdma_op_ctxt *ctxt = NULL; + struct ib_wc wc; + struct ib_cq *cq = xprt->sc_sq_cq; + int ret; + + atomic_inc(&rdma_stat_sq_poll); + while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { + ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; + xprt = ctxt->xprt; + + if (wc.status != IB_WC_SUCCESS) + /* Close the transport */ + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + + /* Decrement used SQ WR count */ + atomic_dec(&xprt->sc_sq_count); + wake_up(&xprt->sc_send_wait); + + switch (ctxt->wr_op) { + case IB_WR_SEND: + case IB_WR_RDMA_WRITE: + svc_rdma_put_context(ctxt, 1); + break; + + case IB_WR_RDMA_READ: + if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); + set_bit(RDMACTXT_F_READ_DONE, &ctxt->flags); + spin_lock_bh(&xprt->sc_read_complete_lock); + list_add_tail(&ctxt->dto_q, + &xprt->sc_read_complete_q); + spin_unlock_bh(&xprt->sc_read_complete_lock); + svc_xprt_enqueue(&xprt->sc_xprt); + } + break; + + default: + printk(KERN_ERR "svcrdma: unexpected completion type, " + "opcode=%d, status=%d\n", + wc.opcode, wc.status); + break; + } + } + + if (ctxt) + atomic_inc(&rdma_stat_sq_prod); +} + +static void sq_comp_handler(struct ib_cq *cq, void *cq_context) +{ + struct svcxprt_rdma *xprt = cq_context; + unsigned long flags; + + /* + * Set the bit regardless of whether or not it's on the list + * because it may be on the list already due to an RQ + * completion. + */ + set_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags); + + /* + * If this transport is not already on the DTO transport queue, + * add it + */ + spin_lock_irqsave(&dto_lock, flags); + if (list_empty(&xprt->sc_dto_q)) + list_add_tail(&xprt->sc_dto_q, &dto_xprt_q); + spin_unlock_irqrestore(&dto_lock, flags); + + /* Tasklet does all the work to avoid irqsave locks. 
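+ * Reaping here would require taking _bh locks from hard-irq context, which is not allowed; the handler only marks the transport pending and schedules the tasklet.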
*/ + tasklet_schedule(&dto_tasklet); +} + +static void create_context_cache(struct svcxprt_rdma *xprt, + int ctxt_count, int ctxt_bump, int ctxt_max) +{ + struct svc_rdma_op_ctxt *ctxt; + int i; + + xprt->sc_ctxt_max = ctxt_max; + xprt->sc_ctxt_bump = ctxt_bump; + xprt->sc_ctxt_cnt = 0; + xprt->sc_ctxt_head = NULL; + for (i = 0; i < ctxt_count; i++) { + ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); + if (ctxt) { + ctxt->next = xprt->sc_ctxt_head; + xprt->sc_ctxt_head = ctxt; + xprt->sc_ctxt_cnt++; + } + } +} + +static void destroy_context_cache(struct svc_rdma_op_ctxt *ctxt) +{ + struct svc_rdma_op_ctxt *next; + if (!ctxt) + return; + + do { + next = ctxt->next; + kfree(ctxt); + ctxt = next; + } while (next); +} + +static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, + int listener) +{ + struct svcxprt_rdma *cma_xprt = kzalloc(sizeof *cma_xprt, GFP_KERNEL); + + if (!cma_xprt) + return NULL; + svc_xprt_init(&svc_rdma_class, &cma_xprt->sc_xprt, serv); + INIT_LIST_HEAD(&cma_xprt->sc_accept_q); + INIT_LIST_HEAD(&cma_xprt->sc_dto_q); + INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); + INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); + init_waitqueue_head(&cma_xprt->sc_send_wait); + + spin_lock_init(&cma_xprt->sc_lock); + spin_lock_init(&cma_xprt->sc_read_complete_lock); + spin_lock_init(&cma_xprt->sc_ctxt_lock); + spin_lock_init(&cma_xprt->sc_rq_dto_lock); + + cma_xprt->sc_ord = svcrdma_ord; + + cma_xprt->sc_max_req_size = svcrdma_max_req_size; + cma_xprt->sc_max_requests = svcrdma_max_requests; + cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT; + atomic_set(&cma_xprt->sc_sq_count, 0); + + if (!listener) { + int reqs = cma_xprt->sc_max_requests; + create_context_cache(cma_xprt, + reqs << 1, /* starting size */ + reqs, /* bump amount */ + reqs + + cma_xprt->sc_sq_depth + + RPCRDMA_MAX_THREADS + 1); /* max */ + if (!cma_xprt->sc_ctxt_head) { + kfree(cma_xprt); + return NULL; + } + clear_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); + } else + set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); + + return cma_xprt; +} + +struct page *svc_rdma_get_page(void) +{ + struct page *page; + + while ((page = alloc_page(GFP_KERNEL)) == NULL) { + /* If we can't get memory, wait a bit and try again */ + printk(KERN_INFO "svcrdma: out of memory...retrying in 1000 " + "jiffies.\n"); + schedule_timeout_uninterruptible(msecs_to_jiffies(1000)); + } + return page; +} + +int svc_rdma_post_recv(struct svcxprt_rdma *xprt) +{ + struct ib_recv_wr recv_wr, *bad_recv_wr; + struct svc_rdma_op_ctxt *ctxt; + struct page *page; + unsigned long pa; + int sge_no; + int buflen; + int ret; + + ctxt = svc_rdma_get_context(xprt); + buflen = 0; + ctxt->direction = DMA_FROM_DEVICE; + for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) { + BUG_ON(sge_no >= xprt->sc_max_sge); + page = svc_rdma_get_page(); + ctxt->pages[sge_no] = page; + pa = ib_dma_map_page(xprt->sc_cm_id->device, + page, 0, PAGE_SIZE, + DMA_FROM_DEVICE); + ctxt->sge[sge_no].addr = pa; + ctxt->sge[sge_no].length = PAGE_SIZE; + ctxt->sge[sge_no].lkey = xprt->sc_phys_mr->lkey; + buflen += PAGE_SIZE; + } + ctxt->count = sge_no; + recv_wr.next = NULL; + recv_wr.sg_list = &ctxt->sge[0]; + recv_wr.num_sge = ctxt->count; + recv_wr.wr_id = (u64)(unsigned long)ctxt; + + ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr); + return ret; +} + +/* + * This function handles the CONNECT_REQUEST event on a listening + * endpoint. It is passed the cma_id for the _new_ connection. 
The context in + * this cma_id is inherited from the listening cma_id and is the svc_xprt + * structure for the listening endpoint. + * + * This function creates a new xprt for the new connection and enqueues it on + * the accept queue for the listening xprt. When the listen thread is kicked, it + * will call the recvfrom method on the listen xprt which will accept the new + * connection. + */ +static void handle_connect_req(struct rdma_cm_id *new_cma_id) +{ + struct svcxprt_rdma *listen_xprt = new_cma_id->context; + struct svcxprt_rdma *newxprt; + + /* Create a new transport */ + newxprt = rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, 0); + if (!newxprt) { + dprintk("svcrdma: failed to create new transport\n"); + return; + } + newxprt->sc_cm_id = new_cma_id; + new_cma_id->context = newxprt; + dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n", + newxprt, newxprt->sc_cm_id, listen_xprt); + + /* + * Enqueue the new transport on the accept queue of the listening + * transport + */ + spin_lock_bh(&listen_xprt->sc_lock); + list_add_tail(&newxprt->sc_accept_q, &listen_xprt->sc_accept_q); + spin_unlock_bh(&listen_xprt->sc_lock); + + /* + * Can't use svc_xprt_received here because we are not on a + * rqstp thread + */ + set_bit(XPT_CONN, &listen_xprt->sc_xprt.xpt_flags); + svc_xprt_enqueue(&listen_xprt->sc_xprt); +} + +/* + * Handles events generated on the listening endpoint. These events will + * either be incoming connect requests or adapter removal events. + */ +static int rdma_listen_handler(struct rdma_cm_id *cma_id, + struct rdma_cm_event *event) +{ + struct svcxprt_rdma *xprt = cma_id->context; + int ret = 0; + + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " + "event=%d\n", cma_id, cma_id->context, event->event); + handle_connect_req(cma_id); + break; + + case RDMA_CM_EVENT_ESTABLISHED: + /* Accept complete */ + dprintk("svcrdma: Connection completed on LISTEN xprt=%p, " + "cm_id=%p\n", xprt, cma_id); + break; + + case RDMA_CM_EVENT_DEVICE_REMOVAL: + dprintk("svcrdma: Device removal xprt=%p, cm_id=%p\n", + xprt, cma_id); + if (xprt) + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + break; + + default: + dprintk("svcrdma: Unexpected event on listening endpoint %p, " + "event=%d\n", cma_id, event->event); + break; + } + + return ret; +} + +static int rdma_cma_handler(struct rdma_cm_id *cma_id, + struct rdma_cm_event *event) +{ + struct svc_xprt *xprt = cma_id->context; + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + switch (event->event) { + case RDMA_CM_EVENT_ESTABLISHED: + /* Accept complete */ + dprintk("svcrdma: Connection completed on DTO xprt=%p, " + "cm_id=%p\n", xprt, cma_id); + clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags); + svc_xprt_enqueue(xprt); + break; + case RDMA_CM_EVENT_DISCONNECTED: + dprintk("svcrdma: Disconnect on DTO xprt=%p, cm_id=%p\n", + xprt, cma_id); + if (xprt) { + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + } + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, " + "event=%d\n", cma_id, xprt, event->event); + if (xprt) { + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + } + break; + default: + dprintk("svcrdma: Unexpected event on DTO endpoint %p, " + "event=%d\n", cma_id, event->event); + break; + } + return 0; +} + +/* + * Create a listening RDMA service endpoint.
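+ * + * The flow below mirrors a socket listen: rdma_create_id() stands in for socket(), rdma_bind_addr() for bind(), and rdma_listen() for listen(); incoming connect requests are then delivered to rdma_listen_handler().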
+ */ +static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, + struct sockaddr *sa, int salen, + int flags) +{ + struct rdma_cm_id *listen_id; + struct svcxprt_rdma *cma_xprt; + struct svc_xprt *xprt; + int ret; + + dprintk("svcrdma: Creating RDMA socket\n"); + + cma_xprt = rdma_create_xprt(serv, 1); + if (!cma_xprt) + return ERR_PTR(ENOMEM); + xprt = &cma_xprt->sc_xprt; + + listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP); + if (IS_ERR(listen_id)) { + rdma_destroy_xprt(cma_xprt); + dprintk("svcrdma: rdma_create_id failed = %ld\n", + PTR_ERR(listen_id)); + return (void *)listen_id; + } + ret = rdma_bind_addr(listen_id, sa); + if (ret) { + rdma_destroy_xprt(cma_xprt); + rdma_destroy_id(listen_id); + dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret); + return ERR_PTR(ret); + } + cma_xprt->sc_cm_id = listen_id; + + ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG); + if (ret) { + rdma_destroy_id(listen_id); + rdma_destroy_xprt(cma_xprt); + dprintk("svcrdma: rdma_listen failed = %d\n", ret); + } + + /* + * We need to use the address from the cm_id in case the + * caller specified 0 for the port number. + */ + sa = (struct sockaddr *)&cma_xprt->sc_cm_id->route.addr.src_addr; + svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen); + + return &cma_xprt->sc_xprt; +} + +/* + * This is the xpo_recvfrom function for listening endpoints. Its + * purpose is to accept incoming connections. The CMA callback handler + * has already created a new transport and attached it to the new CMA + * ID. + * + * There is a queue of pending connections hung on the listening + * transport. This queue contains the new svc_xprt structure. This + * function takes svc_xprt structures off the accept_q and completes + * the connection. + */ +static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *listen_rdma; + struct svcxprt_rdma *newxprt = NULL; + struct rdma_conn_param conn_param; + struct ib_qp_init_attr qp_attr; + struct ib_device_attr devattr; + struct sockaddr *sa; + int ret; + int i; + + listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); + clear_bit(XPT_CONN, &xprt->xpt_flags); + /* Get the next entry off the accept list */ + spin_lock_bh(&listen_rdma->sc_lock); + if (!list_empty(&listen_rdma->sc_accept_q)) { + newxprt = list_entry(listen_rdma->sc_accept_q.next, + struct svcxprt_rdma, sc_accept_q); + list_del_init(&newxprt->sc_accept_q); + } + if (!list_empty(&listen_rdma->sc_accept_q)) + set_bit(XPT_CONN, &listen_rdma->sc_xprt.xpt_flags); + spin_unlock_bh(&listen_rdma->sc_lock); + if (!newxprt) + return NULL; + + dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n", + newxprt, newxprt->sc_cm_id); + + ret = ib_query_device(newxprt->sc_cm_id->device, &devattr); + if (ret) { + dprintk("svcrdma: could not query device attributes on " + "device %p, rc=%d\n", newxprt->sc_cm_id->device, ret); + goto errout; + } + + /* Qualify the transport resource defaults with the + * capabilities of this particular device */ + newxprt->sc_max_sge = min((size_t)devattr.max_sge, + (size_t)RPCSVC_MAXPAGES); + newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr, + (size_t)svcrdma_max_requests); + newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests; + + newxprt->sc_ord = min((size_t)devattr.max_qp_rd_atom, + (size_t)svcrdma_ord); + + newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device); + if (IS_ERR(newxprt->sc_pd)) { + dprintk("svcrdma: error creating PD for connect request\n"); + goto errout; + } + newxprt->sc_sq_cq = 
ib_create_cq(newxprt->sc_cm_id->device, + sq_comp_handler, + cq_event_handler, + newxprt, + newxprt->sc_sq_depth, + 0); + if (IS_ERR(newxprt->sc_sq_cq)) { + dprintk("svcrdma: error creating SQ CQ for connect request\n"); + goto errout; + } + newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device, + rq_comp_handler, + cq_event_handler, + newxprt, + newxprt->sc_max_requests, + 0); + if (IS_ERR(newxprt->sc_rq_cq)) { + dprintk("svcrdma: error creating RQ CQ for connect request\n"); + goto errout; + } + + memset(&qp_attr, 0, sizeof qp_attr); + qp_attr.event_handler = qp_event_handler; + qp_attr.qp_context = &newxprt->sc_xprt; + qp_attr.cap.max_send_wr = newxprt->sc_sq_depth; + qp_attr.cap.max_recv_wr = newxprt->sc_max_requests; + qp_attr.cap.max_send_sge = newxprt->sc_max_sge; + qp_attr.cap.max_recv_sge = newxprt->sc_max_sge; + qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + qp_attr.qp_type = IB_QPT_RC; + qp_attr.send_cq = newxprt->sc_sq_cq; + qp_attr.recv_cq = newxprt->sc_rq_cq; + dprintk("svcrdma: newxprt->sc_cm_id=%p, newxprt->sc_pd=%p\n" + " cm_id->device=%p, sc_pd->device=%p\n" + " cap.max_send_wr = %d\n" + " cap.max_recv_wr = %d\n" + " cap.max_send_sge = %d\n" + " cap.max_recv_sge = %d\n", + newxprt->sc_cm_id, newxprt->sc_pd, + newxprt->sc_cm_id->device, newxprt->sc_pd->device, + qp_attr.cap.max_send_wr, + qp_attr.cap.max_recv_wr, + qp_attr.cap.max_send_sge, + qp_attr.cap.max_recv_sge); + + ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr); + if (ret) { + /* + * XXX: This is a hack. We need a xx_request_qp interface + * that will adjust the qp_attr's with a best-effort + * number + */ + qp_attr.cap.max_send_sge -= 2; + qp_attr.cap.max_recv_sge -= 2; + ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, + &qp_attr); + if (ret) { + dprintk("svcrdma: failed to create QP, ret=%d\n", ret); + goto errout; + } + newxprt->sc_max_sge = qp_attr.cap.max_send_sge; + newxprt->sc_max_sge = qp_attr.cap.max_recv_sge; + newxprt->sc_sq_depth = qp_attr.cap.max_send_wr; + newxprt->sc_max_requests = qp_attr.cap.max_recv_wr; + } + newxprt->sc_qp = newxprt->sc_cm_id->qp; + + /* Register all of physical memory */ + newxprt->sc_phys_mr = ib_get_dma_mr(newxprt->sc_pd, + IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE); + if (IS_ERR(newxprt->sc_phys_mr)) { + dprintk("svcrdma: Failed to create DMA MR ret=%d\n", ret); + goto errout; + } + + /* Post receive buffers */ + for (i = 0; i < newxprt->sc_max_requests; i++) { + ret = svc_rdma_post_recv(newxprt); + if (ret) { + dprintk("svcrdma: failure posting receive buffers\n"); + goto errout; + } + } + + /* Swap out the handler */ + newxprt->sc_cm_id->event_handler = rdma_cma_handler; + + /* Accept Connection */ + set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags); + memset(&conn_param, 0, sizeof conn_param); + conn_param.responder_resources = 0; + conn_param.initiator_depth = newxprt->sc_ord; + ret = rdma_accept(newxprt->sc_cm_id, &conn_param); + if (ret) { + dprintk("svcrdma: failed to accept new connection, ret=%d\n", + ret); + goto errout; + } + + dprintk("svcrdma: new connection %p accepted with the following " + "attributes:\n" + " local_ip : %d.%d.%d.%d\n" + " local_port : %d\n" + " remote_ip : %d.%d.%d.%d\n" + " remote_port : %d\n" + " max_sge : %d\n" + " sq_depth : %d\n" + " max_requests : %d\n" + " ord : %d\n", + newxprt, + NIPQUAD(((struct sockaddr_in *)&newxprt->sc_cm_id-> + route.addr.src_addr)->sin_addr.s_addr), + ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id-> + route.addr.src_addr)->sin_port), + NIPQUAD(((struct sockaddr_in 
*)&newxprt->sc_cm_id-> + route.addr.dst_addr)->sin_addr.s_addr), + ntohs(((struct sockaddr_in *)&newxprt->sc_cm_id-> + route.addr.dst_addr)->sin_port), + newxprt->sc_max_sge, + newxprt->sc_sq_depth, + newxprt->sc_max_requests, + newxprt->sc_ord); + + /* Set the local and remote addresses in the transport */ + sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; + svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa)); + sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; + svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa)); + + ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP); + ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP); + return &newxprt->sc_xprt; + + errout: + dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret); + rdma_destroy_id(newxprt->sc_cm_id); + rdma_destroy_xprt(newxprt); + return NULL; +} + +/* + * Post an RQ WQE to the RQ when the rqst is being released. This + * effectively returns an RQ credit to the client. The rq_xprt_ctxt + * will be null if the request is deferred due to an RDMA_READ or the + * transport had no data ready (EAGAIN). Note that an RPC deferred in + * svc_process will still return the credit, this is because the data + * is copied and no longer consume a WQE/WC. + */ +static void svc_rdma_release_rqst(struct svc_rqst *rqstp) +{ + int err; + struct svcxprt_rdma *rdma = + container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt); + if (rqstp->rq_xprt_ctxt) { + BUG_ON(rqstp->rq_xprt_ctxt != rdma); + err = svc_rdma_post_recv(rdma); + if (err) + dprintk("svcrdma: failed to post an RQ WQE error=%d\n", + err); + } + rqstp->rq_xprt_ctxt = NULL; +} + +/* Disable data ready events for this connection */ +static void svc_rdma_detach(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + unsigned long flags; + + dprintk("svc: svc_rdma_detach(%p)\n", xprt); + /* + * Shutdown the connection. This will ensure we don't get any + * more events from the provider. + */ + rdma_disconnect(rdma->sc_cm_id); + rdma_destroy_id(rdma->sc_cm_id); + + /* We may already be on the DTO list */ + spin_lock_irqsave(&dto_lock, flags); + if (!list_empty(&rdma->sc_dto_q)) + list_del_init(&rdma->sc_dto_q); + spin_unlock_irqrestore(&dto_lock, flags); +} + +static void svc_rdma_free(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *rdma = (struct svcxprt_rdma *)xprt; + dprintk("svcrdma: svc_rdma_free(%p)\n", rdma); + rdma_destroy_xprt(rdma); + kfree(rdma); +} + +static void rdma_destroy_xprt(struct svcxprt_rdma *xprt) +{ + if (xprt->sc_qp && !IS_ERR(xprt->sc_qp)) + ib_destroy_qp(xprt->sc_qp); + + if (xprt->sc_sq_cq && !IS_ERR(xprt->sc_sq_cq)) + ib_destroy_cq(xprt->sc_sq_cq); + + if (xprt->sc_rq_cq && !IS_ERR(xprt->sc_rq_cq)) + ib_destroy_cq(xprt->sc_rq_cq); + + if (xprt->sc_phys_mr && !IS_ERR(xprt->sc_phys_mr)) + ib_dereg_mr(xprt->sc_phys_mr); + + if (xprt->sc_pd && !IS_ERR(xprt->sc_pd)) + ib_dealloc_pd(xprt->sc_pd); + + destroy_context_cache(xprt->sc_ctxt_head); +} + +static int svc_rdma_has_wspace(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + + /* + * If there are fewer SQ WR available than required to send a + * simple response, return false. + */ + if ((rdma->sc_sq_depth - atomic_read(&rdma->sc_sq_count) < 3)) + return 0; + + /* + * ...or there are already waiters on the SQ, + * return false. + */ + if (waitqueue_active(&rdma->sc_send_wait)) + return 0; + + /* Otherwise return true. 
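+ * + * This is only a hint used when deciding whether to hand the transport to a thread; svc_rdma_send() itself sleeps on sc_send_wait if the SQ turns out to be full.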
+ */
+	return 1;
+}
+
+int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
+{
+	struct ib_send_wr *bad_wr;
+	int ret;
+
+	if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags))
+		return 0;
+
+	BUG_ON(wr->send_flags != IB_SEND_SIGNALED);
+	BUG_ON(((struct svc_rdma_op_ctxt *)(unsigned long)wr->wr_id)->wr_op !=
+		wr->opcode);
+	/* If the SQ is full, wait until an SQ entry is available */
+	while (1) {
+		spin_lock_bh(&xprt->sc_lock);
+		if (xprt->sc_sq_depth == atomic_read(&xprt->sc_sq_count)) {
+			spin_unlock_bh(&xprt->sc_lock);
+			atomic_inc(&rdma_stat_sq_starve);
+			/* See if we can reap some SQ WR */
+			sq_cq_reap(xprt);
+
+			/* Wait until SQ WR available if SQ still full */
+			wait_event(xprt->sc_send_wait,
+				   atomic_read(&xprt->sc_sq_count) <
+				   xprt->sc_sq_depth);
+			continue;
+		}
+		/* Bump the used SQ WR count and post */
+		ret = ib_post_send(xprt->sc_qp, wr, &bad_wr);
+		if (!ret)
+			atomic_inc(&xprt->sc_sq_count);
+		else
+			dprintk("svcrdma: failed to post SQ WR rc=%d, "
+				"sc_sq_count=%d, sc_sq_depth=%d\n",
+				ret, atomic_read(&xprt->sc_sq_count),
+				xprt->sc_sq_depth);
+		spin_unlock_bh(&xprt->sc_lock);
+		break;
+	}
+	return ret;
+}
+
+int svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
+			enum rpcrdma_errcode err)
+{
+	struct ib_send_wr err_wr;
+	struct ib_sge sge;
+	struct page *p;
+	struct svc_rdma_op_ctxt *ctxt;
+	u32 *va;
+	int length;
+	int ret;
+
+	p = svc_rdma_get_page();
+	va = page_address(p);
+
+	/* XDR encode error */
+	length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va);
+
+	/* Prepare SGE for local address */
+	sge.addr = ib_dma_map_page(xprt->sc_cm_id->device,
+				   p, 0, PAGE_SIZE, DMA_FROM_DEVICE);
+	sge.lkey = xprt->sc_phys_mr->lkey;
+	sge.length = length;
+
+	ctxt = svc_rdma_get_context(xprt);
+	ctxt->count = 1;
+	ctxt->pages[0] = p;
+
+	/* Prepare SEND WR */
+	memset(&err_wr, 0, sizeof err_wr);
+	ctxt->wr_op = IB_WR_SEND;
+	err_wr.wr_id = (unsigned long)ctxt;
+	err_wr.sg_list = &sge;
+	err_wr.num_sge = 1;
+	err_wr.opcode = IB_WR_SEND;
+	err_wr.send_flags = IB_SEND_SIGNALED;
+
+	/* Post It */
+	ret = svc_rdma_send(xprt, &err_wr);
+	if (ret) {
+		dprintk("svcrdma: Error posting send = %d\n", ret);
+		svc_rdma_put_context(ctxt, 1);
+	}
+
+	return ret;
+}

From d5b31be6823320d81570e0199acd60d3a3f75d85 Mon Sep 17 00:00:00 2001
From: Tom Tucker
Date: Wed, 12 Dec 2007 16:13:23 -0600
Subject: [PATCH 090/100] rdma: SVCRDMA recvfrom

This file implements the RDMA transport recvfrom function. The function
dequeues work request completion contexts from an I/O list that it shares
with the I/O tasklet in svc_rdma_transport.c. For ONCRPC RDMA, an RPC may
not be complete when it is received. Instead, the RDMA header that
precedes the RPC message informs the transport where to get the RPC data
from on the client and where to place it in the RPC message before it is
delivered to the server. The svc_rdma_recvfrom function therefore parses
this RDMA header and issues any necessary RDMA operations to fetch the
remainder of the RPC from the client.

Special handling is required when the request involves an RDMA_READ.
In this case, recvfrom submits the RDMA_READ requests to the underlying
transport driver and then returns 0. When the transport completes the
last RDMA_READ for the request, it enqueues it on a read completion queue
and enqueues the transport. The recvfrom code favors this queue over the
regular DTO queue when satisfying reads.

Signed-off-by: Tom Tucker
Acked-by: Neil Brown
Signed-off-by: J.
Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 586 ++++++++++++++++++++++++ 1 file changed, 586 insertions(+) create mode 100644 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c new file mode 100644 index 000000000000..ab54a736486e --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -0,0 +1,586 @@ +/* + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Author: Tom Tucker + */ + +#include +#include +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +/* + * Replace the pages in the rq_argpages array with the pages from the SGE in + * the RDMA_RECV completion. The SGL should contain full pages up until the + * last one. 
+ */ +static void rdma_build_arg_xdr(struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *ctxt, + u32 byte_count) +{ + struct page *page; + u32 bc; + int sge_no; + + /* Swap the page in the SGE with the page in argpages */ + page = ctxt->pages[0]; + put_page(rqstp->rq_pages[0]); + rqstp->rq_pages[0] = page; + + /* Set up the XDR head */ + rqstp->rq_arg.head[0].iov_base = page_address(page); + rqstp->rq_arg.head[0].iov_len = min(byte_count, ctxt->sge[0].length); + rqstp->rq_arg.len = byte_count; + rqstp->rq_arg.buflen = byte_count; + + /* Compute bytes past head in the SGL */ + bc = byte_count - rqstp->rq_arg.head[0].iov_len; + + /* If data remains, store it in the pagelist */ + rqstp->rq_arg.page_len = bc; + rqstp->rq_arg.page_base = 0; + rqstp->rq_arg.pages = &rqstp->rq_pages[1]; + sge_no = 1; + while (bc && sge_no < ctxt->count) { + page = ctxt->pages[sge_no]; + put_page(rqstp->rq_pages[sge_no]); + rqstp->rq_pages[sge_no] = page; + bc -= min(bc, ctxt->sge[sge_no].length); + rqstp->rq_arg.buflen += ctxt->sge[sge_no].length; + sge_no++; + } + rqstp->rq_respages = &rqstp->rq_pages[sge_no]; + + /* We should never run out of SGE because the limit is defined to + * support the max allowed RPC data length + */ + BUG_ON(bc && (sge_no == ctxt->count)); + BUG_ON((rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len) + != byte_count); + BUG_ON(rqstp->rq_arg.len != byte_count); + + /* If not all pages were used from the SGL, free the remaining ones */ + bc = sge_no; + while (sge_no < ctxt->count) { + page = ctxt->pages[sge_no++]; + put_page(page); + } + ctxt->count = bc; + + /* Set up tail */ + rqstp->rq_arg.tail[0].iov_base = NULL; + rqstp->rq_arg.tail[0].iov_len = 0; +} + +struct chunk_sge { + int start; /* sge no for this chunk */ + int count; /* sge count for this chunk */ +}; + +/* Encode a read-chunk-list as an array of IB SGE + * + * Assumptions: + * - chunk[0]->position points to pages[0] at an offset of 0 + * - pages[] is not physically or virtually contigous and consists of + * PAGE_SIZE elements. + * + * Output: + * - sge array pointing into pages[] array. + * - chunk_sge array specifying sge index and count for each + * chunk in the read list + * + */ +static int rdma_rcl_to_sge(struct svcxprt_rdma *xprt, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, + struct rpcrdma_msg *rmsgp, + struct ib_sge *sge, + struct chunk_sge *ch_sge_ary, + int ch_count, + int byte_count) +{ + int sge_no; + int sge_bytes; + int page_off; + int page_no; + int ch_bytes; + int ch_no; + struct rpcrdma_read_chunk *ch; + + sge_no = 0; + page_no = 0; + page_off = 0; + ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; + ch_no = 0; + ch_bytes = ch->rc_target.rs_length; + head->arg.head[0] = rqstp->rq_arg.head[0]; + head->arg.tail[0] = rqstp->rq_arg.tail[0]; + head->arg.pages = &head->pages[head->count]; + head->sge[0].length = head->count; /* save count of hdr pages */ + head->arg.page_base = 0; + head->arg.page_len = ch_bytes; + head->arg.len = rqstp->rq_arg.len + ch_bytes; + head->arg.buflen = rqstp->rq_arg.buflen + ch_bytes; + head->count++; + ch_sge_ary[0].start = 0; + while (byte_count) { + sge_bytes = min_t(int, PAGE_SIZE-page_off, ch_bytes); + sge[sge_no].addr = + ib_dma_map_page(xprt->sc_cm_id->device, + rqstp->rq_arg.pages[page_no], + page_off, sge_bytes, + DMA_FROM_DEVICE); + sge[sge_no].length = sge_bytes; + sge[sge_no].lkey = xprt->sc_phys_mr->lkey; + /* + * Don't bump head->count here because the same page + * may be used by multiple SGE. 
+ */ + head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no]; + rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1]; + + byte_count -= sge_bytes; + ch_bytes -= sge_bytes; + sge_no++; + /* + * If all bytes for this chunk have been mapped to an + * SGE, move to the next SGE + */ + if (ch_bytes == 0) { + ch_sge_ary[ch_no].count = + sge_no - ch_sge_ary[ch_no].start; + ch_no++; + ch++; + ch_sge_ary[ch_no].start = sge_no; + ch_bytes = ch->rc_target.rs_length; + /* If bytes remaining account for next chunk */ + if (byte_count) { + head->arg.page_len += ch_bytes; + head->arg.len += ch_bytes; + head->arg.buflen += ch_bytes; + } + } + /* + * If this SGE consumed all of the page, move to the + * next page + */ + if ((sge_bytes + page_off) == PAGE_SIZE) { + page_no++; + page_off = 0; + /* + * If there are still bytes left to map, bump + * the page count + */ + if (byte_count) + head->count++; + } else + page_off += sge_bytes; + } + BUG_ON(byte_count != 0); + return sge_no; +} + +static void rdma_set_ctxt_sge(struct svc_rdma_op_ctxt *ctxt, + struct ib_sge *sge, + u64 *sgl_offset, + int count) +{ + int i; + + ctxt->count = count; + for (i = 0; i < count; i++) { + ctxt->sge[i].addr = sge[i].addr; + ctxt->sge[i].length = sge[i].length; + *sgl_offset = *sgl_offset + sge[i].length; + } +} + +static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) +{ +#ifdef RDMA_TRANSPORT_IWARP + if ((RDMA_TRANSPORT_IWARP == + rdma_node_get_transport(xprt->sc_cm_id-> + device->node_type)) + && sge_count > 1) + return 1; + else +#endif + return min_t(int, sge_count, xprt->sc_max_sge); +} + +/* + * Use RDMA_READ to read data from the advertised client buffer into the + * XDR stream starting at rq_arg.head[0].iov_base. + * Each chunk in the array + * contains the following fields: + * discrim - '1', This isn't used for data placement + * position - The xdr stream offset (the same for every chunk) + * handle - RMR for client memory region + * length - data transfer length + * offset - 64 bit tagged offset in remote memory region + * + * On our side, we need to read into a pagelist. The first page immediately + * follows the RPC header. + * + * This function returns 1 to indicate success. The data is not yet in + * the pagelist and therefore the RPC request must be deferred. The + * I/O completion will enqueue the transport again and + * svc_rdma_recvfrom will complete the request. + * + * NOTE: The ctxt must not be touched after the last WR has been posted + * because the I/O completion processing may occur on another + * processor and free / modify the context. Ne touche pas! 
+ */ +static int rdma_read_xdr(struct svcxprt_rdma *xprt, + struct rpcrdma_msg *rmsgp, + struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *hdr_ctxt) +{ + struct ib_send_wr read_wr; + int err = 0; + int ch_no; + struct ib_sge *sge; + int ch_count; + int byte_count; + int sge_count; + u64 sgl_offset; + struct rpcrdma_read_chunk *ch; + struct svc_rdma_op_ctxt *ctxt = NULL; + struct svc_rdma_op_ctxt *head; + struct svc_rdma_op_ctxt *tmp_sge_ctxt; + struct svc_rdma_op_ctxt *tmp_ch_ctxt; + struct chunk_sge *ch_sge_ary; + + /* If no read list is present, return 0 */ + ch = svc_rdma_get_read_chunk(rmsgp); + if (!ch) + return 0; + + /* Allocate temporary contexts to keep SGE */ + BUG_ON(sizeof(struct ib_sge) < sizeof(struct chunk_sge)); + tmp_sge_ctxt = svc_rdma_get_context(xprt); + sge = tmp_sge_ctxt->sge; + tmp_ch_ctxt = svc_rdma_get_context(xprt); + ch_sge_ary = (struct chunk_sge *)tmp_ch_ctxt->sge; + + svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count); + sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp, + sge, ch_sge_ary, + ch_count, byte_count); + head = svc_rdma_get_context(xprt); + sgl_offset = 0; + ch_no = 0; + + for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; + ch->rc_discrim != 0; ch++, ch_no++) { +next_sge: + if (!ctxt) + ctxt = head; + else { + ctxt->next = svc_rdma_get_context(xprt); + ctxt = ctxt->next; + } + ctxt->next = NULL; + ctxt->direction = DMA_FROM_DEVICE; + clear_bit(RDMACTXT_F_READ_DONE, &ctxt->flags); + clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + if ((ch+1)->rc_discrim == 0) { + /* + * Checked in sq_cq_reap to see if we need to + * be enqueued + */ + set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); + ctxt->next = hdr_ctxt; + hdr_ctxt->next = head; + } + + /* Prepare READ WR */ + memset(&read_wr, 0, sizeof read_wr); + ctxt->wr_op = IB_WR_RDMA_READ; + read_wr.wr_id = (unsigned long)ctxt; + read_wr.opcode = IB_WR_RDMA_READ; + read_wr.send_flags = IB_SEND_SIGNALED; + read_wr.wr.rdma.rkey = ch->rc_target.rs_handle; + read_wr.wr.rdma.remote_addr = + get_unaligned(&(ch->rc_target.rs_offset)) + + sgl_offset; + read_wr.sg_list = &sge[ch_sge_ary[ch_no].start]; + read_wr.num_sge = + rdma_read_max_sge(xprt, ch_sge_ary[ch_no].count); + rdma_set_ctxt_sge(ctxt, &sge[ch_sge_ary[ch_no].start], + &sgl_offset, + read_wr.num_sge); + + /* Post the read */ + err = svc_rdma_send(xprt, &read_wr); + if (err) { + printk(KERN_ERR "svcrdma: Error posting send = %d\n", + err); + /* + * Break the circular list so free knows when + * to stop if the error happened to occur on + * the last read + */ + ctxt->next = NULL; + goto out; + } + atomic_inc(&rdma_stat_read); + + if (read_wr.num_sge < ch_sge_ary[ch_no].count) { + ch_sge_ary[ch_no].count -= read_wr.num_sge; + ch_sge_ary[ch_no].start += read_wr.num_sge; + goto next_sge; + } + sgl_offset = 0; + err = 0; + } + + out: + svc_rdma_put_context(tmp_sge_ctxt, 0); + svc_rdma_put_context(tmp_ch_ctxt, 0); + + /* Detach arg pages. svc_recv will replenish them */ + for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++) + rqstp->rq_pages[ch_no] = NULL; + + /* + * Detach res pages. svc_release must see a resused count of + * zero or it will attempt to put them. 
+ */ + while (rqstp->rq_resused) + rqstp->rq_respages[--rqstp->rq_resused] = NULL; + + if (err) { + printk(KERN_ERR "svcrdma : RDMA_READ error = %d\n", err); + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + /* Free the linked list of read contexts */ + while (head != NULL) { + ctxt = head->next; + svc_rdma_put_context(head, 1); + head = ctxt; + } + return 0; + } + + return 1; +} + +static int rdma_read_complete(struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *data) +{ + struct svc_rdma_op_ctxt *head = data->next; + int page_no; + int ret; + + BUG_ON(!head); + + /* Copy RPC pages */ + for (page_no = 0; page_no < head->count; page_no++) { + put_page(rqstp->rq_pages[page_no]); + rqstp->rq_pages[page_no] = head->pages[page_no]; + } + /* Point rq_arg.pages past header */ + rqstp->rq_arg.pages = &rqstp->rq_pages[head->sge[0].length]; + rqstp->rq_arg.page_len = head->arg.page_len; + rqstp->rq_arg.page_base = head->arg.page_base; + + /* rq_respages starts after the last arg page */ + rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; + rqstp->rq_resused = 0; + + /* Rebuild rq_arg head and tail. */ + rqstp->rq_arg.head[0] = head->arg.head[0]; + rqstp->rq_arg.tail[0] = head->arg.tail[0]; + rqstp->rq_arg.len = head->arg.len; + rqstp->rq_arg.buflen = head->arg.buflen; + + /* XXX: What should this be? */ + rqstp->rq_prot = IPPROTO_MAX; + + /* + * Free the contexts we used to build the RDMA_READ. We have + * to be careful here because the context list uses the same + * next pointer used to chain the contexts associated with the + * RDMA_READ + */ + data->next = NULL; /* terminate circular list */ + do { + data = head->next; + svc_rdma_put_context(head, 0); + head = data; + } while (head != NULL); + + ret = rqstp->rq_arg.head[0].iov_len + + rqstp->rq_arg.page_len + + rqstp->rq_arg.tail[0].iov_len; + dprintk("svcrdma: deferred read ret=%d, rq_arg.len =%d, " + "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n", + ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base, + rqstp->rq_arg.head[0].iov_len); + + /* Indicate that we've consumed an RQ credit */ + rqstp->rq_xprt_ctxt = rqstp->rq_xprt; + svc_xprt_received(rqstp->rq_xprt); + return ret; +} + +/* + * Set up the rqstp thread context to point to the RQ buffer. If + * necessary, pull additional data from the client with an RDMA_READ + * request. + */ +int svc_rdma_recvfrom(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt = rqstp->rq_xprt; + struct svcxprt_rdma *rdma_xprt = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + struct svc_rdma_op_ctxt *ctxt = NULL; + struct rpcrdma_msg *rmsgp; + int ret = 0; + int len; + + dprintk("svcrdma: rqstp=%p\n", rqstp); + + /* + * The rq_xprt_ctxt indicates if we've consumed an RQ credit + * or not. It is used in the rdma xpo_release_rqst function to + * determine whether or not to return an RQ WQE to the RQ. 
+ */
+	rqstp->rq_xprt_ctxt = NULL;
+
+	spin_lock_bh(&rdma_xprt->sc_read_complete_lock);
+	if (!list_empty(&rdma_xprt->sc_read_complete_q)) {
+		ctxt = list_entry(rdma_xprt->sc_read_complete_q.next,
+				  struct svc_rdma_op_ctxt,
+				  dto_q);
+		list_del_init(&ctxt->dto_q);
+	}
+	spin_unlock_bh(&rdma_xprt->sc_read_complete_lock);
+	if (ctxt)
+		return rdma_read_complete(rqstp, ctxt);
+
+	spin_lock_bh(&rdma_xprt->sc_rq_dto_lock);
+	if (!list_empty(&rdma_xprt->sc_rq_dto_q)) {
+		ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next,
+				  struct svc_rdma_op_ctxt,
+				  dto_q);
+		list_del_init(&ctxt->dto_q);
+	} else {
+		atomic_inc(&rdma_stat_rq_starve);
+		clear_bit(XPT_DATA, &xprt->xpt_flags);
+		ctxt = NULL;
+	}
+	spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock);
+	if (!ctxt) {
+		/* This is the EAGAIN path. The svc_recv routine will
+		 * return -EAGAIN, the nfsd thread will call into
+		 * svc_recv again and we shouldn't be on the active
+		 * transport list
+		 */
+		if (test_bit(XPT_CLOSE, &xprt->xpt_flags))
+			goto close_out;
+
+		BUG_ON(ret);
+		goto out;
+	}
+	dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n",
+		ctxt, rdma_xprt, rqstp, ctxt->wc_status);
+	BUG_ON(ctxt->wc_status != IB_WC_SUCCESS);
+	atomic_inc(&rdma_stat_recv);
+
+	/* Build up the XDR from the receive buffers. */
+	rdma_build_arg_xdr(rqstp, ctxt, ctxt->byte_len);
+
+	/* Decode the RDMA header. */
+	len = svc_rdma_xdr_decode_req(&rmsgp, rqstp);
+	rqstp->rq_xprt_hlen = len;
+
+	/* If the request is invalid, reply with an error */
+	if (len < 0) {
+		if (len == -ENOSYS)
+			(void)svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS);
+		goto close_out;
+	}
+
+	/* Read read-list data. If we would need to wait, defer
+	 * it. Note that in this case, we don't return the RQ credit
+	 * until after the read completes.
+	 */
+	if (rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt)) {
+		svc_xprt_received(xprt);
+		return 0;
+	}
+
+	/* Indicate we've consumed an RQ credit */
+	rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
+
+	ret = rqstp->rq_arg.head[0].iov_len +
+		rqstp->rq_arg.page_len +
+		rqstp->rq_arg.tail[0].iov_len;
+	svc_rdma_put_context(ctxt, 0);
+ out:
+	dprintk("svcrdma: ret = %d, rq_arg.len =%d, "
+		"rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len = %zd\n",
+		ret, rqstp->rq_arg.len,
+		rqstp->rq_arg.head[0].iov_base,
+		rqstp->rq_arg.head[0].iov_len);
+	rqstp->rq_prot = IPPROTO_MAX;
+	svc_xprt_copy_addrs(rqstp, xprt);
+	svc_xprt_received(xprt);
+	return ret;
+
+ close_out:
+	if (ctxt) {
+		svc_rdma_put_context(ctxt, 1);
+		/* Indicate we've consumed an RQ credit */
+		rqstp->rq_xprt_ctxt = rqstp->rq_xprt;
+	}
+	dprintk("svcrdma: transport %p is closing\n", xprt);
+	/*
+	 * Set the close bit and enqueue it. svc_recv will see the
+	 * close bit and call svc_xprt_delete
+	 */
+	set_bit(XPT_CLOSE, &xprt->xpt_flags);
+	svc_xprt_received(xprt);
+	return 0;
+}

From c06b540a54ad01d2fda8cfb5d8823b9b3d8d1cb2 Mon Sep 17 00:00:00 2001
From: Tom Tucker
Date: Wed, 12 Dec 2007 16:13:25 -0600
Subject: [PATCH 091/100] rdma: SVCRDMA sendto

This file implements the RDMA transport sendto function. An RPC reply
on an RDMA transport consists of some number of RDMA_WRITE requests
followed by an RDMA_SEND request. The sendto function parses the ONCRPC
RDMA reply header to determine how to send the reply back to the client.

The send queue is sized so as to be able to send complete replies for
requests in most cases. In the event that there are not enough SQ WR
slots to reply, e.g. big data, the send will block the NFSD thread.
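In outline (names as in the new svc_rdma_sendto.c below; error handling,
context setup, and header encoding elided), the reply path looks like:

	/* sketch only: a reply is some RDMA_WRITEs then one RDMA_SEND */
	inline_bytes  = rqstp->rq_res.len;
	/* write-chunk data goes out as RDMA_WRITEs; fill in the write list */
	inline_bytes -= send_write_chunks(rdma, rdma_argp, rdma_resp,
					  rqstp, sge, sge_count);
	/* likewise for any reply chunk */
	inline_bytes -= send_reply_chunks(rdma, rdma_argp, rdma_resp,
					  rqstp, sge, sge_count);
	/* RDMA_SEND carries whatever remains inline */
	ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt,
			 sge_count, inline_bytes);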
The I/O callback functions in svc_rdma_transport.c that reap WR
completions wake any waiters blocked on the SQ. In general, the goal is
not to block NFSD threads, and the has_wspace method stalls requests when
the SQ is nearly full.

Signed-off-by: Tom Tucker
Acked-by: Neil Brown
Signed-off-by: J. Bruce Fields
---
 net/sunrpc/xprtrdma/svc_rdma_sendto.c | 520 ++++++++++++++++++++++++++
 1 file changed, 520 insertions(+)
 create mode 100644 net/sunrpc/xprtrdma/svc_rdma_sendto.c

diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
new file mode 100644
index 000000000000..3e321949e1dc
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -0,0 +1,520 @@
+/*
+ * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the BSD-type
+ * license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * Neither the name of the Network Appliance, Inc. nor the names of
+ * its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Author: Tom Tucker
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define RPCDBG_FACILITY	RPCDBG_SVCXPRT
+
+/* Encode an XDR as an array of IB SGE
+ *
+ * Assumptions:
+ * - head[0] is physically contiguous.
+ * - tail[0] is physically contiguous.
+ * - pages[] is not physically or virtually contiguous and consists of
+ *   PAGE_SIZE elements.
+ *
+ * Output:
+ * SGE[0]              reserved for RPCRDMA header
+ * SGE[1]              data from xdr->head[]
+ * SGE[2..sge_count-2] data from xdr->pages[]
+ * SGE[sge_count-1]    data from xdr->tail.
+ * + */ +static struct ib_sge *xdr_to_sge(struct svcxprt_rdma *xprt, + struct xdr_buf *xdr, + struct ib_sge *sge, + int *sge_count) +{ + /* Max we need is the length of the XDR / pagesize + one for + * head + one for tail + one for RPCRDMA header + */ + int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3; + int sge_no; + u32 byte_count = xdr->len; + u32 sge_bytes; + u32 page_bytes; + int page_off; + int page_no; + + /* Skip the first sge, this is for the RPCRDMA header */ + sge_no = 1; + + /* Head SGE */ + sge[sge_no].addr = ib_dma_map_single(xprt->sc_cm_id->device, + xdr->head[0].iov_base, + xdr->head[0].iov_len, + DMA_TO_DEVICE); + sge_bytes = min_t(u32, byte_count, xdr->head[0].iov_len); + byte_count -= sge_bytes; + sge[sge_no].length = sge_bytes; + sge[sge_no].lkey = xprt->sc_phys_mr->lkey; + sge_no++; + + /* pages SGE */ + page_no = 0; + page_bytes = xdr->page_len; + page_off = xdr->page_base; + while (byte_count && page_bytes) { + sge_bytes = min_t(u32, byte_count, (PAGE_SIZE-page_off)); + sge[sge_no].addr = + ib_dma_map_page(xprt->sc_cm_id->device, + xdr->pages[page_no], page_off, + sge_bytes, DMA_TO_DEVICE); + sge_bytes = min(sge_bytes, page_bytes); + byte_count -= sge_bytes; + page_bytes -= sge_bytes; + sge[sge_no].length = sge_bytes; + sge[sge_no].lkey = xprt->sc_phys_mr->lkey; + + sge_no++; + page_no++; + page_off = 0; /* reset for next time through loop */ + } + + /* Tail SGE */ + if (byte_count && xdr->tail[0].iov_len) { + sge[sge_no].addr = + ib_dma_map_single(xprt->sc_cm_id->device, + xdr->tail[0].iov_base, + xdr->tail[0].iov_len, + DMA_TO_DEVICE); + sge_bytes = min_t(u32, byte_count, xdr->tail[0].iov_len); + byte_count -= sge_bytes; + sge[sge_no].length = sge_bytes; + sge[sge_no].lkey = xprt->sc_phys_mr->lkey; + sge_no++; + } + + BUG_ON(sge_no > sge_max); + BUG_ON(byte_count != 0); + + *sge_count = sge_no; + return sge; +} + + +/* Assumptions: + * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE + */ +static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, + u32 rmr, u64 to, + u32 xdr_off, int write_len, + struct ib_sge *xdr_sge, int sge_count) +{ + struct svc_rdma_op_ctxt *tmp_sge_ctxt; + struct ib_send_wr write_wr; + struct ib_sge *sge; + int xdr_sge_no; + int sge_no; + int sge_bytes; + int sge_off; + int bc; + struct svc_rdma_op_ctxt *ctxt; + int ret = 0; + + BUG_ON(sge_count >= 32); + dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, " + "write_len=%d, xdr_sge=%p, sge_count=%d\n", + rmr, to, xdr_off, write_len, xdr_sge, sge_count); + + ctxt = svc_rdma_get_context(xprt); + ctxt->count = 0; + tmp_sge_ctxt = svc_rdma_get_context(xprt); + sge = tmp_sge_ctxt->sge; + + /* Find the SGE associated with xdr_off */ + for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < sge_count; + xdr_sge_no++) { + if (xdr_sge[xdr_sge_no].length > bc) + break; + bc -= xdr_sge[xdr_sge_no].length; + } + + sge_off = bc; + bc = write_len; + sge_no = 0; + + /* Copy the remaining SGE */ + while (bc != 0 && xdr_sge_no < sge_count) { + sge[sge_no].addr = xdr_sge[xdr_sge_no].addr + sge_off; + sge[sge_no].lkey = xdr_sge[xdr_sge_no].lkey; + sge_bytes = min((size_t)bc, + (size_t)(xdr_sge[xdr_sge_no].length-sge_off)); + sge[sge_no].length = sge_bytes; + + sge_off = 0; + sge_no++; + xdr_sge_no++; + bc -= sge_bytes; + } + + BUG_ON(bc != 0); + BUG_ON(xdr_sge_no > sge_count); + + /* Prepare WRITE WR */ + memset(&write_wr, 0, sizeof write_wr); + ctxt->wr_op = IB_WR_RDMA_WRITE; + write_wr.wr_id = (unsigned long)ctxt; + write_wr.sg_list = &sge[0]; + write_wr.num_sge 
= sge_no; + write_wr.opcode = IB_WR_RDMA_WRITE; + write_wr.send_flags = IB_SEND_SIGNALED; + write_wr.wr.rdma.rkey = rmr; + write_wr.wr.rdma.remote_addr = to; + + /* Post It */ + atomic_inc(&rdma_stat_write); + if (svc_rdma_send(xprt, &write_wr)) { + svc_rdma_put_context(ctxt, 1); + /* Fatal error, close transport */ + ret = -EIO; + } + svc_rdma_put_context(tmp_sge_ctxt, 0); + return ret; +} + +static int send_write_chunks(struct svcxprt_rdma *xprt, + struct rpcrdma_msg *rdma_argp, + struct rpcrdma_msg *rdma_resp, + struct svc_rqst *rqstp, + struct ib_sge *sge, + int sge_count) +{ + u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len; + int write_len; + int max_write; + u32 xdr_off; + int chunk_off; + int chunk_no; + struct rpcrdma_write_array *arg_ary; + struct rpcrdma_write_array *res_ary; + int ret; + + arg_ary = svc_rdma_get_write_array(rdma_argp); + if (!arg_ary) + return 0; + res_ary = (struct rpcrdma_write_array *) + &rdma_resp->rm_body.rm_chunks[1]; + + max_write = xprt->sc_max_sge * PAGE_SIZE; + + /* Write chunks start at the pagelist */ + for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; + xfer_len && chunk_no < arg_ary->wc_nchunks; + chunk_no++) { + struct rpcrdma_segment *arg_ch; + u64 rs_offset; + + arg_ch = &arg_ary->wc_array[chunk_no].wc_target; + write_len = min(xfer_len, arg_ch->rs_length); + + /* Prepare the response chunk given the length actually + * written */ + rs_offset = get_unaligned(&(arg_ch->rs_offset)); + svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no, + arg_ch->rs_handle, + rs_offset, + write_len); + chunk_off = 0; + while (write_len) { + int this_write; + this_write = min(write_len, max_write); + ret = send_write(xprt, rqstp, + arg_ch->rs_handle, + rs_offset + chunk_off, + xdr_off, + this_write, + sge, + sge_count); + if (ret) { + dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", + ret); + return -EIO; + } + chunk_off += this_write; + xdr_off += this_write; + xfer_len -= this_write; + write_len -= this_write; + } + } + /* Update the req with the number of chunks actually used */ + svc_rdma_xdr_encode_write_list(rdma_resp, chunk_no); + + return rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len; +} + +static int send_reply_chunks(struct svcxprt_rdma *xprt, + struct rpcrdma_msg *rdma_argp, + struct rpcrdma_msg *rdma_resp, + struct svc_rqst *rqstp, + struct ib_sge *sge, + int sge_count) +{ + u32 xfer_len = rqstp->rq_res.len; + int write_len; + int max_write; + u32 xdr_off; + int chunk_no; + int chunk_off; + struct rpcrdma_segment *ch; + struct rpcrdma_write_array *arg_ary; + struct rpcrdma_write_array *res_ary; + int ret; + + arg_ary = svc_rdma_get_reply_array(rdma_argp); + if (!arg_ary) + return 0; + /* XXX: need to fix when reply lists occur with read-list and or + * write-list */ + res_ary = (struct rpcrdma_write_array *) + &rdma_resp->rm_body.rm_chunks[2]; + + max_write = xprt->sc_max_sge * PAGE_SIZE; + + /* xdr offset starts at RPC message */ + for (xdr_off = 0, chunk_no = 0; + xfer_len && chunk_no < arg_ary->wc_nchunks; + chunk_no++) { + u64 rs_offset; + ch = &arg_ary->wc_array[chunk_no].wc_target; + write_len = min(xfer_len, ch->rs_length); + + + /* Prepare the reply chunk given the length actually + * written */ + rs_offset = get_unaligned(&(ch->rs_offset)); + svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no, + ch->rs_handle, rs_offset, + write_len); + chunk_off = 0; + while (write_len) { + int this_write; + + this_write = min(write_len, max_write); + ret = send_write(xprt, rqstp, + ch->rs_handle, + rs_offset + chunk_off, 
+ xdr_off, + this_write, + sge, + sge_count); + if (ret) { + dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n", + ret); + return -EIO; + } + chunk_off += this_write; + xdr_off += this_write; + xfer_len -= this_write; + write_len -= this_write; + } + } + /* Update the req with the number of chunks actually used */ + svc_rdma_xdr_encode_reply_array(res_ary, chunk_no); + + return rqstp->rq_res.len; +} + +/* This function prepares the portion of the RPCRDMA message to be + * sent in the RDMA_SEND. This function is called after data sent via + * RDMA has already been transmitted. There are three cases: + * - The RPCRDMA header, RPC header, and payload are all sent in a + * single RDMA_SEND. This is the "inline" case. + * - The RPCRDMA header and some portion of the RPC header and data + * are sent via this RDMA_SEND and another portion of the data is + * sent via RDMA. + * - The RPCRDMA header [NOMSG] is sent in this RDMA_SEND and the RPC + * header and data are all transmitted via RDMA. + * In all three cases, this function prepares the RPCRDMA header in + * sge[0], the 'type' parameter indicates the type to place in the + * RPCRDMA header, and the 'byte_count' field indicates how much of + * the XDR to include in this RDMA_SEND. + */ +static int send_reply(struct svcxprt_rdma *rdma, + struct svc_rqst *rqstp, + struct page *page, + struct rpcrdma_msg *rdma_resp, + struct svc_rdma_op_ctxt *ctxt, + int sge_count, + int byte_count) +{ + struct ib_send_wr send_wr; + int sge_no; + int sge_bytes; + int page_no; + int ret; + + /* Prepare the context */ + ctxt->pages[0] = page; + ctxt->count = 1; + + /* Prepare the SGE for the RPCRDMA Header */ + ctxt->sge[0].addr = + ib_dma_map_page(rdma->sc_cm_id->device, + page, 0, PAGE_SIZE, DMA_TO_DEVICE); + ctxt->direction = DMA_TO_DEVICE; + ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); + ctxt->sge[0].lkey = rdma->sc_phys_mr->lkey; + + /* Determine how many of our SGE are to be transmitted */ + for (sge_no = 1; byte_count && sge_no < sge_count; sge_no++) { + sge_bytes = min((size_t)ctxt->sge[sge_no].length, + (size_t)byte_count); + byte_count -= sge_bytes; + } + BUG_ON(byte_count != 0); + + /* Save all respages in the ctxt and remove them from the + * respages array. They are our pages until the I/O + * completes. + */ + for (page_no = 0; page_no < rqstp->rq_resused; page_no++) { + ctxt->pages[page_no+1] = rqstp->rq_respages[page_no]; + ctxt->count++; + rqstp->rq_respages[page_no] = NULL; + } + + BUG_ON(sge_no > rdma->sc_max_sge); + memset(&send_wr, 0, sizeof send_wr); + ctxt->wr_op = IB_WR_SEND; + send_wr.wr_id = (unsigned long)ctxt; + send_wr.sg_list = ctxt->sge; + send_wr.num_sge = sge_no; + send_wr.opcode = IB_WR_SEND; + send_wr.send_flags = IB_SEND_SIGNALED; + + ret = svc_rdma_send(rdma, &send_wr); + if (ret) + svc_rdma_put_context(ctxt, 1); + + return ret; +} + +void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp) +{ +} + +/* + * Return the start of an xdr buffer. 
+ */ +static void *xdr_start(struct xdr_buf *xdr) +{ + return xdr->head[0].iov_base - + (xdr->len - + xdr->page_len - + xdr->tail[0].iov_len - + xdr->head[0].iov_len); +} + +int svc_rdma_sendto(struct svc_rqst *rqstp) +{ + struct svc_xprt *xprt = rqstp->rq_xprt; + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + struct rpcrdma_msg *rdma_argp; + struct rpcrdma_msg *rdma_resp; + struct rpcrdma_write_array *reply_ary; + enum rpcrdma_proc reply_type; + int ret; + int inline_bytes; + struct ib_sge *sge; + int sge_count = 0; + struct page *res_page; + struct svc_rdma_op_ctxt *ctxt; + + dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); + + /* Get the RDMA request header. */ + rdma_argp = xdr_start(&rqstp->rq_arg); + + /* Build an SGE for the XDR */ + ctxt = svc_rdma_get_context(rdma); + ctxt->direction = DMA_TO_DEVICE; + sge = xdr_to_sge(rdma, &rqstp->rq_res, ctxt->sge, &sge_count); + + inline_bytes = rqstp->rq_res.len; + + /* Create the RDMA response header */ + res_page = svc_rdma_get_page(); + rdma_resp = page_address(res_page); + reply_ary = svc_rdma_get_reply_array(rdma_argp); + if (reply_ary) + reply_type = RDMA_NOMSG; + else + reply_type = RDMA_MSG; + svc_rdma_xdr_encode_reply_header(rdma, rdma_argp, + rdma_resp, reply_type); + + /* Send any write-chunk data and build resp write-list */ + ret = send_write_chunks(rdma, rdma_argp, rdma_resp, + rqstp, sge, sge_count); + if (ret < 0) { + printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n", + ret); + goto error; + } + inline_bytes -= ret; + + /* Send any reply-list data and update resp reply-list */ + ret = send_reply_chunks(rdma, rdma_argp, rdma_resp, + rqstp, sge, sge_count); + if (ret < 0) { + printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n", + ret); + goto error; + } + inline_bytes -= ret; + + ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, sge_count, + inline_bytes); + dprintk("svcrdma: send_reply returns %d\n", ret); + return ret; + error: + svc_rdma_put_context(ctxt, 0); + put_page(res_page); + return ret; +} From ef1eac0a3fa86b06aa2d87021f157d13abc1903f Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 12 Dec 2007 16:13:27 -0600 Subject: [PATCH 092/100] rdma: ONCRPC RDMA protocol marshalling This logic parses the ONCRDMA protocol headers that precede the actual RPC header. It is placed in a separate file to keep all protocol aware code in a single place. Signed-off-by: Tom Tucker Acked-by: Neil Brown Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/svc_rdma_marshal.c | 412 +++++++++++++++++++++++++ 1 file changed, 412 insertions(+) create mode 100644 net/sunrpc/xprtrdma/svc_rdma_marshal.c diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c new file mode 100644 index 000000000000..9530ef2d40dc --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c @@ -0,0 +1,412 @@ +/* + * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the BSD-type + * license below: + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * Neither the name of the Network Appliance, Inc. nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Author: Tom Tucker + */ + +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +/* + * Decodes a read chunk list. The expected format is as follows: + * descrim : xdr_one + * position : u32 offset into XDR stream + * handle : u32 RKEY + * . . . + * end-of-list: xdr_zero + */ +static u32 *decode_read_list(u32 *va, u32 *vaend) +{ + struct rpcrdma_read_chunk *ch = (struct rpcrdma_read_chunk *)va; + + while (ch->rc_discrim != xdr_zero) { + u64 ch_offset; + + if (((unsigned long)ch + sizeof(struct rpcrdma_read_chunk)) > + (unsigned long)vaend) { + dprintk("svcrdma: vaend=%p, ch=%p\n", vaend, ch); + return NULL; + } + + ch->rc_discrim = ntohl(ch->rc_discrim); + ch->rc_position = ntohl(ch->rc_position); + ch->rc_target.rs_handle = ntohl(ch->rc_target.rs_handle); + ch->rc_target.rs_length = ntohl(ch->rc_target.rs_length); + va = (u32 *)&ch->rc_target.rs_offset; + xdr_decode_hyper(va, &ch_offset); + put_unaligned(ch_offset, (u64 *)va); + ch++; + } + return (u32 *)&ch->rc_position; +} + +/* + * Determine number of chunks and total bytes in chunk list. The chunk + * list has already been verified to fit within the RPCRDMA header. + */ +void svc_rdma_rcl_chunk_counts(struct rpcrdma_read_chunk *ch, + int *ch_count, int *byte_count) +{ + /* compute the number of bytes represented by read chunks */ + *byte_count = 0; + *ch_count = 0; + for (; ch->rc_discrim != 0; ch++) { + *byte_count = *byte_count + ch->rc_target.rs_length; + *ch_count = *ch_count + 1; + } +} + +/* + * Decodes a write chunk list. The expected format is as follows: + * descrim : xdr_one + * nchunks : + * handle : u32 RKEY ---+ + * length : u32 | + * offset : remove va + + * . . . 
| + * ---+ + */ +static u32 *decode_write_list(u32 *va, u32 *vaend) +{ + int ch_no; + struct rpcrdma_write_array *ary = + (struct rpcrdma_write_array *)va; + + /* Check for not write-array */ + if (ary->wc_discrim == xdr_zero) + return (u32 *)&ary->wc_nchunks; + + if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > + (unsigned long)vaend) { + dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend); + return NULL; + } + ary->wc_discrim = ntohl(ary->wc_discrim); + ary->wc_nchunks = ntohl(ary->wc_nchunks); + if (((unsigned long)&ary->wc_array[0] + + (sizeof(struct rpcrdma_write_chunk) * ary->wc_nchunks)) > + (unsigned long)vaend) { + dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n", + ary, ary->wc_nchunks, vaend); + return NULL; + } + for (ch_no = 0; ch_no < ary->wc_nchunks; ch_no++) { + u64 ch_offset; + + ary->wc_array[ch_no].wc_target.rs_handle = + ntohl(ary->wc_array[ch_no].wc_target.rs_handle); + ary->wc_array[ch_no].wc_target.rs_length = + ntohl(ary->wc_array[ch_no].wc_target.rs_length); + va = (u32 *)&ary->wc_array[ch_no].wc_target.rs_offset; + xdr_decode_hyper(va, &ch_offset); + put_unaligned(ch_offset, (u64 *)va); + } + + /* + * rs_length is the 2nd 4B field in wc_target and taking its + * address skips the list terminator + */ + return (u32 *)&ary->wc_array[ch_no].wc_target.rs_length; +} + +static u32 *decode_reply_array(u32 *va, u32 *vaend) +{ + int ch_no; + struct rpcrdma_write_array *ary = + (struct rpcrdma_write_array *)va; + + /* Check for no reply-array */ + if (ary->wc_discrim == xdr_zero) + return (u32 *)&ary->wc_nchunks; + + if ((unsigned long)ary + sizeof(struct rpcrdma_write_array) > + (unsigned long)vaend) { + dprintk("svcrdma: ary=%p, vaend=%p\n", ary, vaend); + return NULL; + } + ary->wc_discrim = ntohl(ary->wc_discrim); + ary->wc_nchunks = ntohl(ary->wc_nchunks); + if (((unsigned long)&ary->wc_array[0] + + (sizeof(struct rpcrdma_write_chunk) * ary->wc_nchunks)) > + (unsigned long)vaend) { + dprintk("svcrdma: ary=%p, wc_nchunks=%d, vaend=%p\n", + ary, ary->wc_nchunks, vaend); + return NULL; + } + for (ch_no = 0; ch_no < ary->wc_nchunks; ch_no++) { + u64 ch_offset; + + ary->wc_array[ch_no].wc_target.rs_handle = + ntohl(ary->wc_array[ch_no].wc_target.rs_handle); + ary->wc_array[ch_no].wc_target.rs_length = + ntohl(ary->wc_array[ch_no].wc_target.rs_length); + va = (u32 *)&ary->wc_array[ch_no].wc_target.rs_offset; + xdr_decode_hyper(va, &ch_offset); + put_unaligned(ch_offset, (u64 *)va); + } + + return (u32 *)&ary->wc_array[ch_no]; +} + +int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req, + struct svc_rqst *rqstp) +{ + struct rpcrdma_msg *rmsgp = NULL; + u32 *va; + u32 *vaend; + u32 hdr_len; + + rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; + + /* Verify that there's enough bytes for header + something */ + if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_MIN) { + dprintk("svcrdma: header too short = %d\n", + rqstp->rq_arg.len); + return -EINVAL; + } + + /* Decode the header */ + rmsgp->rm_xid = ntohl(rmsgp->rm_xid); + rmsgp->rm_vers = ntohl(rmsgp->rm_vers); + rmsgp->rm_credit = ntohl(rmsgp->rm_credit); + rmsgp->rm_type = ntohl(rmsgp->rm_type); + + if (rmsgp->rm_vers != RPCRDMA_VERSION) + return -ENOSYS; + + /* Pull in the extra for the padded case and bump our pointer */ + if (rmsgp->rm_type == RDMA_MSGP) { + int hdrlen; + rmsgp->rm_body.rm_padded.rm_align = + ntohl(rmsgp->rm_body.rm_padded.rm_align); + rmsgp->rm_body.rm_padded.rm_thresh = + ntohl(rmsgp->rm_body.rm_padded.rm_thresh); + + va = &rmsgp->rm_body.rm_padded.rm_pempty[4]; + 
rqstp->rq_arg.head[0].iov_base = va; + hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp); + rqstp->rq_arg.head[0].iov_len -= hdrlen; + if (hdrlen > rqstp->rq_arg.len) + return -EINVAL; + return hdrlen; + } + + /* The chunk list may contain either a read chunk list or a write + * chunk list and a reply chunk list. + */ + va = &rmsgp->rm_body.rm_chunks[0]; + vaend = (u32 *)((unsigned long)rmsgp + rqstp->rq_arg.len); + va = decode_read_list(va, vaend); + if (!va) + return -EINVAL; + va = decode_write_list(va, vaend); + if (!va) + return -EINVAL; + va = decode_reply_array(va, vaend); + if (!va) + return -EINVAL; + + rqstp->rq_arg.head[0].iov_base = va; + hdr_len = (unsigned long)va - (unsigned long)rmsgp; + rqstp->rq_arg.head[0].iov_len -= hdr_len; + + *rdma_req = rmsgp; + return hdr_len; +} + +int svc_rdma_xdr_decode_deferred_req(struct svc_rqst *rqstp) +{ + struct rpcrdma_msg *rmsgp = NULL; + struct rpcrdma_read_chunk *ch; + struct rpcrdma_write_array *ary; + u32 *va; + u32 hdrlen; + + dprintk("svcrdma: processing deferred RDMA header on rqstp=%p\n", + rqstp); + rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; + + /* Pull in the extra for the padded case and bump our pointer */ + if (rmsgp->rm_type == RDMA_MSGP) { + va = &rmsgp->rm_body.rm_padded.rm_pempty[4]; + rqstp->rq_arg.head[0].iov_base = va; + hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp); + rqstp->rq_arg.head[0].iov_len -= hdrlen; + return hdrlen; + } + + /* + * Skip all chunks to find RPC msg. These were previously processed + */ + va = &rmsgp->rm_body.rm_chunks[0]; + + /* Skip read-list */ + for (ch = (struct rpcrdma_read_chunk *)va; + ch->rc_discrim != xdr_zero; ch++); + va = (u32 *)&ch->rc_position; + + /* Skip write-list */ + ary = (struct rpcrdma_write_array *)va; + if (ary->wc_discrim == xdr_zero) + va = (u32 *)&ary->wc_nchunks; + else + /* + * rs_length is the 2nd 4B field in wc_target and taking its + * address skips the list terminator + */ + va = (u32 *)&ary->wc_array[ary->wc_nchunks].wc_target.rs_length; + + /* Skip reply-array */ + ary = (struct rpcrdma_write_array *)va; + if (ary->wc_discrim == xdr_zero) + va = (u32 *)&ary->wc_nchunks; + else + va = (u32 *)&ary->wc_array[ary->wc_nchunks]; + + rqstp->rq_arg.head[0].iov_base = va; + hdrlen = (unsigned long)va - (unsigned long)rmsgp; + rqstp->rq_arg.head[0].iov_len -= hdrlen; + + return hdrlen; +} + +int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, + struct rpcrdma_msg *rmsgp, + enum rpcrdma_errcode err, u32 *va) +{ + u32 *startp = va; + + *va++ = htonl(rmsgp->rm_xid); + *va++ = htonl(rmsgp->rm_vers); + *va++ = htonl(xprt->sc_max_requests); + *va++ = htonl(RDMA_ERROR); + *va++ = htonl(err); + if (err == ERR_VERS) { + *va++ = htonl(RPCRDMA_VERSION); + *va++ = htonl(RPCRDMA_VERSION); + } + + return (int)((unsigned long)va - (unsigned long)startp); +} + +int svc_rdma_xdr_get_reply_hdr_len(struct rpcrdma_msg *rmsgp) +{ + struct rpcrdma_write_array *wr_ary; + + /* There is no read-list in a reply */ + + /* skip write list */ + wr_ary = (struct rpcrdma_write_array *) + &rmsgp->rm_body.rm_chunks[1]; + if (wr_ary->wc_discrim) + wr_ary = (struct rpcrdma_write_array *) + &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)]. 
+ wc_target.rs_length; + else + wr_ary = (struct rpcrdma_write_array *) + &wr_ary->wc_nchunks; + + /* skip reply array */ + if (wr_ary->wc_discrim) + wr_ary = (struct rpcrdma_write_array *) + &wr_ary->wc_array[ntohl(wr_ary->wc_nchunks)]; + else + wr_ary = (struct rpcrdma_write_array *) + &wr_ary->wc_nchunks; + + return (unsigned long) wr_ary - (unsigned long) rmsgp; +} + +void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks) +{ + struct rpcrdma_write_array *ary; + + /* no read-list */ + rmsgp->rm_body.rm_chunks[0] = xdr_zero; + + /* write-array discrim */ + ary = (struct rpcrdma_write_array *) + &rmsgp->rm_body.rm_chunks[1]; + ary->wc_discrim = xdr_one; + ary->wc_nchunks = htonl(chunks); + + /* write-list terminator */ + ary->wc_array[chunks].wc_target.rs_handle = xdr_zero; + + /* reply-array discriminator */ + ary->wc_array[chunks].wc_target.rs_length = xdr_zero; +} + +void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary, + int chunks) +{ + ary->wc_discrim = xdr_one; + ary->wc_nchunks = htonl(chunks); +} + +void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary, + int chunk_no, + u32 rs_handle, u64 rs_offset, + u32 write_len) +{ + struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target; + seg->rs_handle = htonl(rs_handle); + seg->rs_length = htonl(write_len); + xdr_encode_hyper((u32 *) &seg->rs_offset, rs_offset); +} + +void svc_rdma_xdr_encode_reply_header(struct svcxprt_rdma *xprt, + struct rpcrdma_msg *rdma_argp, + struct rpcrdma_msg *rdma_resp, + enum rpcrdma_proc rdma_type) +{ + rdma_resp->rm_xid = htonl(rdma_argp->rm_xid); + rdma_resp->rm_vers = htonl(rdma_argp->rm_vers); + rdma_resp->rm_credit = htonl(xprt->sc_max_requests); + rdma_resp->rm_type = htonl(rdma_type); + + /* Encode chunks lists */ + rdma_resp->rm_body.rm_chunks[0] = xdr_zero; + rdma_resp->rm_body.rm_chunks[1] = xdr_zero; + rdma_resp->rm_body.rm_chunks[2] = xdr_zero; +} From 4b8449af75fa2e2d9736ec503a818be626a4e763 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 12 Dec 2007 16:13:30 -0600 Subject: [PATCH 093/100] rdma: makefile Add the svcrdma module to the xprtrdma makefile. Signed-off-by: Tom Tucker Acked-by: Neil Brown Signed-off-by: J. Bruce Fields --- net/sunrpc/xprtrdma/Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index 264f0feeb513..5a8f268bdd30 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -1,3 +1,8 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o xprtrdma-y := transport.o rpc_rdma.o verbs.o + +obj-$(CONFIG_SUNRPC_XPRT_RDMA) += svcrdma.o + +svcrdma-y := svc_rdma.o svc_rdma_transport.o \ + svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o From f7b8066f9ff68016489ff6f9fb358aa59bd14e1b Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 21 Jan 2008 12:20:45 -0500 Subject: [PATCH 094/100] knfsd: don't bother mapping putrootfh enoent to eperm Neither EPERM and ENOENT map to valid errors for PUTROOTFH according to rfc 3530, and, if anything, ENOENT is likely to be slightly more informative; so don't bother mapping ENOENT to EPERM. (Probably this was originally done because one likely cause was that there is an fsid=0 export but that it isn't permitted to this particular client. Now that we allow WRONGSEC returns, this is somewhat less likely.) 
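Concretely, exp_pseudoroot() is left doing the following (see the hunk
below; assuming the usual nfserrno() mapping, -ENOENT now surfaces to the
client as nfserr_noent rather than nfserr_perm):

	exp = rqst_exp_find(rqstp, FSID_NUM, fsidv);
	if (IS_ERR(exp))
		return nfserrno(PTR_ERR(exp));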
In the long term we should work to make this situation less likely, perhaps by turning off nfsv4 service entirely in the absence of the pseudofs root, or constructing a pseudofilesystem root ourselves in the kernel as necessary. Thanks to Benny Halevy for pointing out this problem. Signed-off-by: J. Bruce Fields Cc: Benny Halevy --- fs/nfsd/export.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index cbbc594ef592..79b4bf812960 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1357,8 +1357,6 @@ exp_pseudoroot(struct svc_rqst *rqstp, struct svc_fh *fhp) mk_fsid(FSID_NUM, fsidv, 0, 0, 0, NULL); exp = rqst_exp_find(rqstp, FSID_NUM, fsidv); - if (PTR_ERR(exp) == -ENOENT) - return nfserr_perm; if (IS_ERR(exp)) return nfserrno(PTR_ERR(exp)); rv = fh_compose(fhp, exp, exp->ex_dentry, NULL); From 50431d94e732ba71b66a83c5435890728e313095 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 31 Aug 2007 17:09:33 -0400 Subject: [PATCH 095/100] lockd: minor log message fix Wendy Cheng noticed that function name doesn't agree here. Signed-off-by: J. Bruce Fields Cc: Wendy Cheng --- fs/lockd/svcsubs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 84ebba33b98d..dbbefbcd6712 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -87,7 +87,7 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result, unsigned int hash; __be32 nfserr; - nlm_debug_print_fh("nlm_file_lookup", f); + nlm_debug_print_fh("nlm_lookup_file", f); hash = file_hash(f); From 87d26ea7771ad637035e6bd5a2700d81ee9162da Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 22 Jan 2008 17:40:42 -0500 Subject: [PATCH 096/100] nfsd: more careful input validation in nfsctl write methods Neil Brown points out that we're checking buf[size-1] in a couple places without first checking whether size is zero. Actually, given the implementation of simple_transaction_get(), buf[-1] is zero, so in both of these cases the subsequent check of the value of buf[size-1] will catch this case. But it seems fragile to depend on that, so add explicit checks for this case. Signed-off-by: J. Bruce Fields Acked-by: NeilBrown --- fs/nfsd/nfsctl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index bc22e0b0343a..8516137cdbb0 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -304,6 +304,9 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) struct auth_domain *dom; struct knfsd_fh fh; + if (size == 0) + return -EINVAL; + if (buf[size-1] != '\n') return -EINVAL; buf[size-1] = 0; @@ -663,7 +666,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) char *recdir; int len, status; - if (size > PATH_MAX || buf[size-1] != '\n') + if (size == 0 || size > PATH_MAX || buf[size-1] != '\n') return -EINVAL; buf[size-1] = 0; From 0113ab34644649aceaac37ef4b7e5c7d5c183be3 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 29 Jan 2008 10:30:54 -0500 Subject: [PATCH 097/100] SUNRPC: spin svc_rqst initialization to its own function Move the initialzation in __svc_create_thread that happens prior to thread creation to a new function. Export the function to allow services to have better control over the svc_rqst structs. Also rearrange the rqstp initialization to prevent NULL pointer dereferences in svc_exit_thread in case allocations fail. Signed-off-by: Jeff Layton Reviewed-by: NeilBrown Signed-off-by: J. 
Bruce Fields --- include/linux/sunrpc/svc.h | 2 ++ net/sunrpc/svc.c | 59 +++++++++++++++++++++++++++----------- 2 files changed, 44 insertions(+), 17 deletions(-) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 742ab461d842..64c771056187 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -384,6 +384,8 @@ struct svc_procedure { */ struct svc_serv * svc_create(struct svc_program *, unsigned int, void (*shutdown)(struct svc_serv*)); +struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, + struct svc_pool *pool); int svc_create_thread(svc_thread_fn, struct svc_serv *); void svc_exit_thread(struct svc_rqst *); struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index a5eb2d6b14ae..b76963d52657 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -531,6 +531,44 @@ svc_release_buffer(struct svc_rqst *rqstp) put_page(rqstp->rq_pages[i]); } +struct svc_rqst * +svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool) +{ + struct svc_rqst *rqstp; + + rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL); + if (!rqstp) + goto out_enomem; + + init_waitqueue_head(&rqstp->rq_wait); + + serv->sv_nrthreads++; + spin_lock_bh(&pool->sp_lock); + pool->sp_nrthreads++; + list_add(&rqstp->rq_all, &pool->sp_all_threads); + spin_unlock_bh(&pool->sp_lock); + rqstp->rq_server = serv; + rqstp->rq_pool = pool; + + rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL); + if (!rqstp->rq_argp) + goto out_thread; + + rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL); + if (!rqstp->rq_resp) + goto out_thread; + + if (!svc_init_buffer(rqstp, serv->sv_max_mesg)) + goto out_thread; + + return rqstp; +out_thread: + svc_exit_thread(rqstp); +out_enomem: + return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL(svc_prepare_thread); + /* * Create a thread in the given pool. Caller must hold BKL. * On a NUMA or SMP machine, with a multi-pool serv, the thread @@ -545,24 +583,11 @@ __svc_create_thread(svc_thread_fn func, struct svc_serv *serv, int have_oldmask = 0; cpumask_t oldmask; - rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL); - if (!rqstp) + rqstp = svc_prepare_thread(serv, pool); + if (IS_ERR(rqstp)) { + error = PTR_ERR(rqstp); goto out; - - init_waitqueue_head(&rqstp->rq_wait); - - if (!(rqstp->rq_argp = kmalloc(serv->sv_xdrsize, GFP_KERNEL)) - || !(rqstp->rq_resp = kmalloc(serv->sv_xdrsize, GFP_KERNEL)) - || !svc_init_buffer(rqstp, serv->sv_max_mesg)) - goto out_thread; - - serv->sv_nrthreads++; - spin_lock_bh(&pool->sp_lock); - pool->sp_nrthreads++; - list_add(&rqstp->rq_all, &pool->sp_all_threads); - spin_unlock_bh(&pool->sp_lock); - rqstp->rq_server = serv; - rqstp->rq_pool = pool; + } if (serv->sv_nrpools > 1) have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask); From d801b861681116ea23a7fb87a70bf463d29c8b9c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 29 Jan 2008 10:30:55 -0500 Subject: [PATCH 098/100] NLM: tear down RPC clients in nlm_shutdown_hosts It's possible for a RPC to outlive the lockd daemon that created it, so we need to make sure that all RPC's are killed when lockd is coming down. When nlm_shutdown_hosts is called, kill off all RPC tasks associated with the host. Since we need to wait until they have all gone away, we might as well just shut down the RPC client altogether. Signed-off-by: Jeff Layton Signed-off-by: J. 
From d801b861681116ea23a7fb87a70bf463d29c8b9c Mon Sep 17 00:00:00 2001
From: Jeff Layton
Date: Tue, 29 Jan 2008 10:30:55 -0500
Subject: [PATCH 098/100] NLM: tear down RPC clients in nlm_shutdown_hosts

It's possible for an RPC to outlive the lockd daemon that created it,
so we need to make sure that all RPCs are killed when lockd is coming
down.  When nlm_shutdown_hosts is called, kill off all RPC tasks
associated with the host.  Since we need to wait until they have all
gone away, we might as well just shut down the RPC client altogether.

Signed-off-by: Jeff Layton
Signed-off-by: J. Bruce Fields
---
 fs/lockd/host.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index ebec0098efbf..ca6b16fc3101 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -379,8 +379,13 @@ nlm_shutdown_hosts(void)
 	/* First, make all hosts eligible for gc */
 	dprintk("lockd: nuking all hosts...\n");
 	for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) {
-		hlist_for_each_entry(host, pos, chain, h_hash)
+		hlist_for_each_entry(host, pos, chain, h_hash) {
 			host->h_expires = jiffies - 1;
+			if (host->h_rpcclnt) {
+				rpc_shutdown_client(host->h_rpcclnt);
+				host->h_rpcclnt = NULL;
+			}
+		}
 	}
 
 	/* Then, perform a garbage collection pass */
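The rule here generalizes: an object that spawns asynchronous work must
shut that work down before it is torn down itself. A sketch with
illustrative stand-in types (struct client and struct host are not the
lockd/SUNRPC structures):

#include <stdlib.h>

struct client { int unused; };

static void client_shutdown(struct client *c)
{
	/* A real implementation would also kill and wait for in-flight
	 * calls, as rpc_shutdown_client() does; here we only release. */
	free(c);
}

struct host { struct client *rpc; };

static void host_shutdown(struct host *h)
{
	if (h->rpc) {
		client_shutdown(h->rpc);
		h->rpc = NULL;	/* no dangling pointer if h is seen again */
	}
}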
From d2f7e79e3bad31b3d52c405085b9e01e5f6c01e0 Mon Sep 17 00:00:00 2001
From: Trond Myklebust
Date: Sat, 14 Jul 2007 15:39:58 -0400
Subject: [PATCH 099/100] SUNRPC: Move exported symbol definitions after
 function declaration part 2

Do it for the server code...

Signed-off-by: Trond Myklebust
Signed-off-by: J. Bruce Fields
---
 net/sunrpc/cache.c        |  9 ++++++++-
 net/sunrpc/stats.c        |  3 +++
 net/sunrpc/sunrpc_syms.c  | 41 ---------------------------------------
 net/sunrpc/svc.c          |  7 +++++++
 net/sunrpc/svc_xprt.c     |  4 ++++
 net/sunrpc/svcauth.c      |  6 ++++++
 net/sunrpc/svcauth_unix.c |  5 +++++
 7 files changed, 33 insertions(+), 42 deletions(-)

diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 50b1a8b441fe..636c8e04e0be 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -245,6 +245,7 @@ int cache_check(struct cache_detail *detail,
 		cache_put(h, detail);
 	return rv;
 }
+EXPORT_SYMBOL(cache_check);
 
 /*
  * caches need to be periodically cleaned.
@@ -377,6 +378,7 @@ int cache_register(struct cache_detail *cd)
 	schedule_delayed_work(&cache_cleaner, 0);
 	return 0;
 }
+EXPORT_SYMBOL(cache_register);
 
 void cache_unregister(struct cache_detail *cd)
 {
@@ -402,6 +404,7 @@ void cache_unregister(struct cache_detail *cd)
 out:
 	printk(KERN_ERR "nfsd: failed to unregister %s cache\n", cd->name);
 }
+EXPORT_SYMBOL(cache_unregister);
 
 /* clean cache tries to find something to clean
  * and cleans it.
@@ -516,6 +519,7 @@ void cache_flush(void)
 	while (cache_clean() != -1)
 		cond_resched();
 }
+EXPORT_SYMBOL(cache_flush);
 
 void cache_purge(struct cache_detail *detail)
 {
@@ -524,7 +528,7 @@ void cache_purge(struct cache_detail *detail)
 	cache_flush();
 	detail->flush_time = 1;
 }
-
+EXPORT_SYMBOL(cache_purge);
 
 
 /*
@@ -990,6 +994,7 @@ void qword_add(char **bpp, int *lp, char *str)
 	*bpp = bp;
 	*lp = len;
 }
+EXPORT_SYMBOL(qword_add);
 
 void qword_addhex(char **bpp, int *lp, char *buf, int blen)
 {
@@ -1018,6 +1023,7 @@ void qword_addhex(char **bpp, int *lp, char *buf, int blen)
 	*bpp = bp;
 	*lp = len;
 }
+EXPORT_SYMBOL(qword_addhex);
 
 static void warn_no_listener(struct cache_detail *detail)
 {
@@ -1140,6 +1146,7 @@ int qword_get(char **bpp, char *dest, int bufsize)
 	*dest = '\0';
 	return len;
 }
+EXPORT_SYMBOL(qword_get);
 
 /*

diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index 74df2d358e61..be0d1009e593 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -106,6 +106,7 @@ void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) {
 		seq_putc(seq, '\n');
 	}
 }
+EXPORT_SYMBOL(svc_seq_show);
 
 /**
  * rpc_alloc_iostats - allocate an rpc_iostats structure
@@ -255,12 +256,14 @@ svc_proc_register(struct svc_stat *statp, const struct file_operations *fops)
 {
 	return do_register(statp->program->pg_name, statp, fops);
 }
+EXPORT_SYMBOL(svc_proc_register);
 
 void
 svc_proc_unregister(const char *name)
 {
 	remove_proc_entry(name, proc_net_rpc);
 }
+EXPORT_SYMBOL(svc_proc_unregister);
 
 void
 rpc_proc_init(void)

diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index ab8a7362e890..843629f55763 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -22,47 +22,6 @@
 #include
 #include
 
-/* RPC server stuff */
-EXPORT_SYMBOL(svc_create);
-EXPORT_SYMBOL(svc_create_thread);
-EXPORT_SYMBOL(svc_create_pooled);
-EXPORT_SYMBOL(svc_set_num_threads);
-EXPORT_SYMBOL(svc_exit_thread);
-EXPORT_SYMBOL(svc_destroy);
-EXPORT_SYMBOL(svc_drop);
-EXPORT_SYMBOL(svc_process);
-EXPORT_SYMBOL(svc_recv);
-EXPORT_SYMBOL(svc_wake_up);
-EXPORT_SYMBOL(svc_reserve);
-EXPORT_SYMBOL(svc_auth_register);
-EXPORT_SYMBOL(auth_domain_lookup);
-EXPORT_SYMBOL(svc_authenticate);
-EXPORT_SYMBOL(svc_set_client);
-
-/* RPC statistics */
-#ifdef CONFIG_PROC_FS
-EXPORT_SYMBOL(svc_proc_register);
-EXPORT_SYMBOL(svc_proc_unregister);
-EXPORT_SYMBOL(svc_seq_show);
-#endif
-
-/* caching... */
-EXPORT_SYMBOL(auth_domain_find);
-EXPORT_SYMBOL(auth_domain_put);
-EXPORT_SYMBOL(auth_unix_add_addr);
-EXPORT_SYMBOL(auth_unix_forget_old);
-EXPORT_SYMBOL(auth_unix_lookup);
-EXPORT_SYMBOL(cache_check);
-EXPORT_SYMBOL(cache_flush);
-EXPORT_SYMBOL(cache_purge);
-EXPORT_SYMBOL(cache_register);
-EXPORT_SYMBOL(cache_unregister);
-EXPORT_SYMBOL(qword_add);
-EXPORT_SYMBOL(qword_addhex);
-EXPORT_SYMBOL(qword_get);
-EXPORT_SYMBOL(svcauth_unix_purge);
-EXPORT_SYMBOL(unix_domain_find);
-
 extern struct cache_detail ip_map_cache, unix_gid_cache;
 
 static int __init
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index b76963d52657..7537af7c4a76 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -433,6 +433,7 @@ svc_create(struct svc_program *prog, unsigned int bufsize,
 {
 	return __svc_create(prog, bufsize, /*npools*/1, shutdown);
 }
+EXPORT_SYMBOL(svc_create);
 
 struct svc_serv *
 svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
@@ -452,6 +453,7 @@ svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
 
 	return serv;
 }
+EXPORT_SYMBOL(svc_create_pooled);
 
 /*
  * Destroy an RPC service.  Should be called with the BKL held
@@ -493,6 +495,7 @@ svc_destroy(struct svc_serv *serv)
 	kfree(serv->sv_pools);
 	kfree(serv);
 }
+EXPORT_SYMBOL(svc_destroy);
 
 /*
  * Allocate an RPC server's buffer space.
@@ -617,6 +620,7 @@ svc_create_thread(svc_thread_fn func, struct svc_serv *serv)
 {
 	return __svc_create_thread(func, serv, &serv->sv_pools[0]);
 }
+EXPORT_SYMBOL(svc_create_thread);
 
 /*
  * Choose a pool in which to create a new thread, for svc_set_num_threads
@@ -720,6 +724,7 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
 
 	return error;
 }
+EXPORT_SYMBOL(svc_set_num_threads);
 
 /*
  * Called from a server thread as it's exiting.  Caller must hold BKL.
@@ -746,6 +751,7 @@ svc_exit_thread(struct svc_rqst *rqstp)
 	if (serv)
 		svc_destroy(serv);
 }
+EXPORT_SYMBOL(svc_exit_thread);
 
 /*
  * Register an RPC service with the local portmapper.
@@ -1069,6 +1075,7 @@ svc_process(struct svc_rqst *rqstp)
 	svc_putnl(resv, ntohl(rpc_stat));
 	goto sendit;
 }
+EXPORT_SYMBOL(svc_process);
 
 /*
  * Return (transport-specific) limit on the rpc payload.

diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index c3fb36784f3b..ea377e06afae 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -435,6 +435,7 @@ void svc_reserve(struct svc_rqst *rqstp, int space)
 		svc_xprt_enqueue(xprt);
 	}
 }
+EXPORT_SYMBOL(svc_reserve);
 
 static void svc_xprt_release(struct svc_rqst *rqstp)
 {
@@ -492,6 +493,7 @@ void svc_wake_up(struct svc_serv *serv)
 		spin_unlock_bh(&pool->sp_lock);
 	}
 }
+EXPORT_SYMBOL(svc_wake_up);
 
 int svc_port_is_privileged(struct sockaddr *sin)
 {
@@ -702,6 +704,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
 	serv->sv_stats->netcnt++;
 	return len;
 }
+EXPORT_SYMBOL(svc_recv);
 
 /*
  * Drop request
@@ -711,6 +714,7 @@ void svc_drop(struct svc_rqst *rqstp)
 	dprintk("svc: xprt %p dropped request\n", rqstp->rq_xprt);
 	svc_xprt_release(rqstp);
 }
+EXPORT_SYMBOL(svc_drop);
 
 /*
  * Return reply to client.

diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index af7c5f05c6e1..8a73cbb16052 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -57,11 +57,13 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp)
 	rqstp->rq_authop = aops;
 	return aops->accept(rqstp, authp);
 }
+EXPORT_SYMBOL(svc_authenticate);
 
 int svc_set_client(struct svc_rqst *rqstp)
 {
 	return rqstp->rq_authop->set_client(rqstp);
 }
+EXPORT_SYMBOL(svc_set_client);
 
 /* A request, which was authenticated, has now executed.
  * Time to finalise the credentials and verifier
@@ -93,6 +95,7 @@ svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops)
 	spin_unlock(&authtab_lock);
 	return rv;
 }
+EXPORT_SYMBOL(svc_auth_register);
 
 void
 svc_auth_unregister(rpc_authflavor_t flavor)
@@ -129,6 +132,7 @@ void auth_domain_put(struct auth_domain *dom)
 		spin_unlock(&auth_domain_lock);
 	}
 }
+EXPORT_SYMBOL(auth_domain_put);
 
 struct auth_domain *
 auth_domain_lookup(char *name, struct auth_domain *new)
@@ -153,8 +157,10 @@ auth_domain_lookup(char *name, struct auth_domain *new)
 	spin_unlock(&auth_domain_lock);
 	return new;
 }
+EXPORT_SYMBOL(auth_domain_lookup);
 
 struct auth_domain *auth_domain_find(char *name)
 {
 	return auth_domain_lookup(name, NULL);
 }
+EXPORT_SYMBOL(auth_domain_find);
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 6815157bd65c..3c64051e4555 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -63,6 +63,7 @@ struct auth_domain *unix_domain_find(char *name)
 		rv = auth_domain_lookup(name, &new->h);
 	}
 }
+EXPORT_SYMBOL(unix_domain_find);
 
 static void svcauth_unix_domain_release(struct auth_domain *dom)
 {
@@ -340,6 +341,7 @@ int auth_unix_add_addr(struct in_addr addr, struct auth_domain *dom)
 	else
 		return -ENOMEM;
 }
+EXPORT_SYMBOL(auth_unix_add_addr);
 
 int auth_unix_forget_old(struct auth_domain *dom)
 {
@@ -351,6 +353,7 @@ int auth_unix_forget_old(struct auth_domain *dom)
 	udom->addr_changes++;
 	return 0;
 }
+EXPORT_SYMBOL(auth_unix_forget_old);
 
 struct auth_domain *auth_unix_lookup(struct in_addr addr)
 {
@@ -375,11 +378,13 @@ struct auth_domain *auth_unix_lookup(struct in_addr addr)
 	cache_put(&ipm->h, &ip_map_cache);
 	return rv;
 }
+EXPORT_SYMBOL(auth_unix_lookup);
 
 void svcauth_unix_purge(void)
 {
 	cache_purge(&ip_map_cache);
 }
+EXPORT_SYMBOL(svcauth_unix_purge);
 
 static inline struct ip_map *
 ip_map_cached_get(struct svc_rqst *rqstp)
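The convention this patch applies throughout, shown on a made-up
function (example_add is illustrative; EXPORT_SYMBOL itself is the real
kernel macro): the export marker sits immediately after the definition
it exports, rather than in a central sunrpc_syms.c list that can
silently drift out of sync with the code.

int example_add(int a, int b)
{
	return a + b;
}
EXPORT_SYMBOL(example_add);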
From ea339d46b93c7b16e067a29aad1812f7a389815a Mon Sep 17 00:00:00 2001
From: Chuck Lever
Date: Fri, 26 Oct 2007 13:32:56 -0400
Subject: [PATCH 100/100] SUNRPC: RPC program information is stored in unsigned
 integers

Clean up: when looping over RPC version and procedure numbers, use
unsigned index variables.

Signed-off-by: Chuck Lever
Signed-off-by: Trond Myklebust
Signed-off-by: J. Bruce Fields
---
 net/sunrpc/stats.c | 4 ++--
 net/sunrpc/svc.c   | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c
index be0d1009e593..5a16875f5ac8 100644
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -33,7 +33,7 @@ struct proc_dir_entry	*proc_net_rpc = NULL;
 static int rpc_proc_show(struct seq_file *seq, void *v) {
 	const struct rpc_stat	*statp = seq->private;
 	const struct rpc_program *prog = statp->program;
-	int		i, j;
+	unsigned int i, j;
 
 	seq_printf(seq,
 		"net %u %u %u %u\n",
@@ -81,7 +81,7 @@ void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) {
 	const struct svc_program *prog = statp->program;
 	const struct svc_procedure *proc;
 	const struct svc_version *vers;
-	int		i, j;
+	unsigned int i, j;
 
 	seq_printf(seq,
 		"net %u %u %u %u\n",

diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 7537af7c4a76..a290e1523297 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -364,7 +364,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
 	   void (*shutdown)(struct svc_serv *serv))
 {
 	struct svc_serv	*serv;
-	int vers;
+	unsigned int vers;
 	unsigned int xdrsize;
 	unsigned int i;
 
@@ -763,7 +763,8 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port)
 {
 	struct svc_program	*progp;
 	unsigned long		flags;
-	int			i, error = 0, dummy;
+	unsigned int		i;
+	int			error = 0, dummy;
 
 	if (!port)
 		clear_thread_flag(TIF_SIGPENDING);
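Restated as a standalone sketch (print_versions and its table are made
up, not the RPC program structures): an index compared against an
unsigned count is itself declared unsigned, which avoids
signed/unsigned comparison warnings and matches the type of the value
it counts up to.

#include <stdio.h>

static void print_versions(const unsigned int *vers, unsigned int nrvers)
{
	unsigned int i;	/* was: int i */

	for (i = 0; i < nrvers; i++)
		printf("version %u supported\n", vers[i]);
}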