Message ID | ac18892ea3f718c63f0a12e39aeaac812c081515.1694436263.git.lorenzo@kernel.org (mailing list archive) |
---|---|
State | Not Applicable |
Headers | show |
Series | add rpc_status netlink support for NFSD | expand |
Context | Check | Description |
---|---|---|
netdev/tree_selection | success | Not a local patch |
On Mon, 2023-09-11 at 14:49 +0200, Lorenzo Bianconi wrote: > Introduce rpc_status netlink support for NFSD in order to dump pending > RPC requests debugging information from userspace. > > Tested-by: Jeff Layton <jlayton@kernel.org> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org> > --- > fs/nfsd/nfsctl.c | 192 ++++++++++++++++++++++++++++++++++++- > fs/nfsd/nfsd.h | 16 ++++ > fs/nfsd/nfssvc.c | 15 +++ > fs/nfsd/state.h | 2 - > include/linux/sunrpc/svc.h | 1 + > 5 files changed, 222 insertions(+), 4 deletions(-) > > diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c > index 1be66088849c..b862a759ea15 100644 > --- a/fs/nfsd/nfsctl.c > +++ b/fs/nfsd/nfsctl.c > @@ -26,6 +26,7 @@ > #include "pnfs.h" > #include "filecache.h" > #include "trace.h" > +#include "nfs_netlink_gen.h" > > /* > * We have a single directory with several nodes in it. > @@ -1497,17 +1498,199 @@ unsigned int nfsd_net_id; > > int nfsd_server_nl_rpc_status_get_start(struct netlink_callback *cb) > { > - return 0; > + struct nfsd_net *nn = net_generic(sock_net(cb->skb->sk), nfsd_net_id); > + int ret = -ENODEV; > + > + mutex_lock(&nfsd_mutex); > + if (nn->nfsd_serv) { > + svc_get(nn->nfsd_serv); > + ret = 0; > + } > + mutex_unlock(&nfsd_mutex); > + > + return ret; > } > > -int nfsd_server_nl_rpc_status_get_done(struct netlink_callback *cb) > +static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb, > + struct netlink_callback *cb, > + struct nfsd_genl_rqstp *rqstp) > { > + void *hdr; > + int i; > + > + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, > + &nfsd_server_nl_family, NLM_F_MULTI, > + NFSD_CMD_RPC_STATUS_GET); > + if (!hdr) > + return -ENOBUFS; > + > + if (nla_put_be32(skb, NFSD_ATTR_RPC_STATUS_XID, rqstp->rq_xid) || > + nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_FLAGS, rqstp->rq_flags) || > + nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_PROG, rqstp->rq_prog) || > + nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_PROC, rqstp->rq_proc) || > + nla_put_u8(skb, 
NFSD_ATTR_RPC_STATUS_VERSION, rqstp->rq_vers) || > + nla_put_s64(skb, NFSD_ATTR_RPC_STATUS_SERVICE_TIME, > + ktime_to_us(rqstp->rq_stime), > + NFSD_ATTR_RPC_STATUS_PAD)) > + return -ENOBUFS; > + > + switch (rqstp->saddr.sa_family) { > + case AF_INET: { > + const struct sockaddr_in *s_in, *d_in; > + > + s_in = (const struct sockaddr_in *)&rqstp->saddr; > + d_in = (const struct sockaddr_in *)&rqstp->daddr; > + if (nla_put_in_addr(skb, NFSD_ATTR_RPC_STATUS_SADDR4, > + s_in->sin_addr.s_addr) || > + nla_put_in_addr(skb, NFSD_ATTR_RPC_STATUS_DADDR4, > + d_in->sin_addr.s_addr) || > + nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_SPORT, > + s_in->sin_port) || > + nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_DPORT, > + d_in->sin_port)) > + return -ENOBUFS; > + break; > + } > + case AF_INET6: { > + const struct sockaddr_in6 *s_in, *d_in; > + > + s_in = (const struct sockaddr_in6 *)&rqstp->saddr; > + d_in = (const struct sockaddr_in6 *)&rqstp->daddr; > + if (nla_put_in6_addr(skb, NFSD_ATTR_RPC_STATUS_SADDR6, > + &s_in->sin6_addr) || > + nla_put_in6_addr(skb, NFSD_ATTR_RPC_STATUS_DADDR6, > + &d_in->sin6_addr) || > + nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_SPORT, > + s_in->sin6_port) || > + nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_DPORT, > + d_in->sin6_port)) > + return -ENOBUFS; > + break; > + } > + default: > + break; > + } > + > + if (rqstp->opcnt) { It may be that we always have an opcount of 0 whenever we're running something besides NFSv4 COMPOUND, but it may be best not to count on that. I'd test for prog == NFS_PROGRAM, vers == 4, and that the proc == COMPOUND and then for the opcnt. 
> + struct nlattr *attr; > + > + attr = nla_nest_start(skb, NFSD_ATTR_RPC_STATUS_COMPOND_OP); > + if (!attr) > + return -ENOBUFS; > + > + for (i = 0; i < rqstp->opcnt; i++) { > + struct nlattr *op_attr; > + > + op_attr = nla_nest_start(skb, i); > + if (!op_attr) > + return -ENOBUFS; > + > + if (nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_COMP_OP, > + rqstp->opnum[i])) > + return -ENOBUFS; > + > + nla_nest_end(skb, op_attr); > + } > + > + nla_nest_end(skb, attr); > + } > + > + genlmsg_end(skb, hdr); > + > return 0; > } > > int nfsd_server_nl_rpc_status_get_dumpit(struct sk_buff *skb, > struct netlink_callback *cb) > { > + struct nfsd_net *nn = net_generic(sock_net(skb->sk), nfsd_net_id); > + int i, ret, rqstp_index; > + > + rcu_read_lock(); > + > + for (i = 0; i < nn->nfsd_serv->sv_nrpools; i++) { > + struct svc_rqst *rqstp; > + > + if (i < cb->args[0]) /* already consumed */ > + continue; > + > + rqstp_index = 0; > + list_for_each_entry_rcu(rqstp, > + &nn->nfsd_serv->sv_pools[i].sp_all_threads, > + rq_all) { > + struct nfsd_genl_rqstp genl_rqstp; > + unsigned int status_counter; > + > + if (rqstp_index++ < cb->args[1]) /* already consumed */ > + continue; > + /* > + * Acquire rq_status_counter before parsing the rqst > + * fields. rq_status_counter is set to an odd value in > + * order to notify the consumers the rqstp fields are > + * meaningful. 
> + */ > + status_counter = > + smp_load_acquire(&rqstp->rq_status_counter); > + if (!(status_counter & 1)) > + continue; > + > + genl_rqstp.rq_xid = rqstp->rq_xid; > + genl_rqstp.rq_flags = rqstp->rq_flags; > + genl_rqstp.rq_vers = rqstp->rq_vers; > + genl_rqstp.rq_prog = rqstp->rq_prog; > + genl_rqstp.rq_proc = rqstp->rq_proc; > + genl_rqstp.rq_stime = rqstp->rq_stime; > + genl_rqstp.opcnt = 0; > + memcpy(&genl_rqstp.daddr, svc_daddr(rqstp), > + sizeof(struct sockaddr)); > + memcpy(&genl_rqstp.saddr, svc_addr(rqstp), > + sizeof(struct sockaddr)); > + > +#ifdef CONFIG_NFSD_V4 > + if (rqstp->rq_vers == NFS4_VERSION && > + rqstp->rq_proc == NFSPROC4_COMPOUND) { > + /* NFSv4 compund */ > + struct nfsd4_compoundargs *args; > + int j; > + > + args = rqstp->rq_argp; > + genl_rqstp.opcnt = args->opcnt; > + for (j = 0; j < genl_rqstp.opcnt; j++) > + genl_rqstp.opnum[j] = > + args->ops[j].opnum; > + } > +#endif /* CONFIG_NFSD_V4 */ > + > + /* > + * Acquire rq_status_counter before reporting the rqst > + * fields to the user. 
> + */ > + if (smp_load_acquire(&rqstp->rq_status_counter) != > + status_counter) > + continue; > + > + ret = nfsd_genl_rpc_status_compose_msg(skb, cb, > + &genl_rqstp); > + if (ret) > + goto out; > + } > + } > + > + cb->args[0] = i; > + cb->args[1] = rqstp_index; > + ret = skb->len; > +out: > + rcu_read_unlock(); > + > + return ret; > +} > + > +int nfsd_server_nl_rpc_status_get_done(struct netlink_callback *cb) > +{ > + mutex_lock(&nfsd_mutex); > + nfsd_put(sock_net(cb->skb->sk)); > + mutex_unlock(&nfsd_mutex); > + > return 0; > } > > @@ -1605,6 +1788,10 @@ static int __init init_nfsd(void) > retval = register_filesystem(&nfsd_fs_type); > if (retval) > goto out_free_all; > + retval = genl_register_family(&nfsd_server_nl_family); > + if (retval) > + goto out_free_all; > + > return 0; > out_free_all: > nfsd4_destroy_laundry_wq(); > @@ -1629,6 +1816,7 @@ static int __init init_nfsd(void) > > static void __exit exit_nfsd(void) > { > + genl_unregister_family(&nfsd_server_nl_family); > unregister_filesystem(&nfsd_fs_type); > nfsd4_destroy_laundry_wq(); > unregister_cld_notifier(); > diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h > index 11c14faa6c67..d787bd38c053 100644 > --- a/fs/nfsd/nfsd.h > +++ b/fs/nfsd/nfsd.h > @@ -62,6 +62,22 @@ struct readdir_cd { > __be32 err; /* 0, nfserr, or nfserr_eof */ > }; > > +/* Maximum number of operations per session compound */ > +#define NFSD_MAX_OPS_PER_COMPOUND 50 > + > +struct nfsd_genl_rqstp { > + struct sockaddr daddr; > + struct sockaddr saddr; > + unsigned long rq_flags; > + ktime_t rq_stime; > + __be32 rq_xid; > + u32 rq_vers; > + u32 rq_prog; > + u32 rq_proc; > + /* NFSv4 compund */ > + u32 opnum[NFSD_MAX_OPS_PER_COMPOUND]; > + u16 opcnt; > +}; > Again, I'm wondering, is there a way to pass down some sort of context-specific value with netlink? It might be nice to make the two NFSv4-specific fields into a union if that can be done with netlink. 
> extern struct svc_program nfsd_program; > extern const struct svc_version nfsd_version2, nfsd_version3, nfsd_version4; > diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c > index 1582af33e204..fad34a7325b3 100644 > --- a/fs/nfsd/nfssvc.c > +++ b/fs/nfsd/nfssvc.c > @@ -998,6 +998,15 @@ int nfsd_dispatch(struct svc_rqst *rqstp) > if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream)) > goto out_decode_err; > > + /* > + * Release rq_status_counter setting it to an odd value after the rpc > + * request has been properly parsed. rq_status_counter is used to > + * notify the consumers if the rqstp fields are stable > + * (rq_status_counter is odd) or not meaningful (rq_status_counter > + * is even). > + */ > + smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter | 1); > + > rp = NULL; > switch (nfsd_cache_lookup(rqstp, &rp)) { > case RC_DOIT: > @@ -1015,6 +1024,12 @@ int nfsd_dispatch(struct svc_rqst *rqstp) > if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream)) > goto out_encode_err; > > + /* > + * Release rq_status_counter setting it to an even value after the rpc > + * request has been properly processed. > + */ > + smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1); > + > nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, statp + 1); > out_cached_reply: > return 1; > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h > index cbddcf484dba..41bdc913fa71 100644 > --- a/fs/nfsd/state.h > +++ b/fs/nfsd/state.h > @@ -174,8 +174,6 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s) > > /* Maximum number of slots per session. 
160 is useful for long haul TCP */ > #define NFSD_MAX_SLOTS_PER_SESSION 160 > -/* Maximum number of operations per session compound */ > -#define NFSD_MAX_OPS_PER_COMPOUND 50 > /* Maximum session per slot cache size */ > #define NFSD_SLOT_CACHE_SIZE 2048 > /* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */ > diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h > index dbf5b21feafe..caa20defd255 100644 > --- a/include/linux/sunrpc/svc.h > +++ b/include/linux/sunrpc/svc.h > @@ -251,6 +251,7 @@ struct svc_rqst { > * net namespace > */ > void ** rq_lease_breaker; /* The v4 client breaking a lease */ > + unsigned int rq_status_counter; /* RPC processing counter */ > }; > > /* bits for rq_flags */
On Mon, Sep 11, 2023 at 02:49:46PM +0200, Lorenzo Bianconi wrote: > Introduce rpc_status netlink support for NFSD in order to dump pending > RPC requests debugging information from userspace. > > Tested-by: Jeff Layton <jlayton@kernel.org> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org> o Hi Lorenzo, some minor feedback from my side. ... > int nfsd_server_nl_rpc_status_get_dumpit(struct sk_buff *skb, > struct netlink_callback *cb) > { > + struct nfsd_net *nn = net_generic(sock_net(skb->sk), nfsd_net_id); > + int i, ret, rqstp_index; > + > + rcu_read_lock(); > + > + for (i = 0; i < nn->nfsd_serv->sv_nrpools; i++) { > + struct svc_rqst *rqstp; > + > + if (i < cb->args[0]) /* already consumed */ > + continue; > + > + rqstp_index = 0; > + list_for_each_entry_rcu(rqstp, > + &nn->nfsd_serv->sv_pools[i].sp_all_threads, > + rq_all) { > + struct nfsd_genl_rqstp genl_rqstp; > + unsigned int status_counter; > + > + if (rqstp_index++ < cb->args[1]) /* already consumed */ > + continue; > + /* > + * Acquire rq_status_counter before parsing the rqst > + * fields. rq_status_counter is set to an odd value in > + * order to notify the consumers the rqstp fields are > + * meaningful. 
> + */ > + status_counter = > + smp_load_acquire(&rqstp->rq_status_counter); > + if (!(status_counter & 1)) > + continue; > + > + genl_rqstp.rq_xid = rqstp->rq_xid; > + genl_rqstp.rq_flags = rqstp->rq_flags; > + genl_rqstp.rq_vers = rqstp->rq_vers; > + genl_rqstp.rq_prog = rqstp->rq_prog; > + genl_rqstp.rq_proc = rqstp->rq_proc; > + genl_rqstp.rq_stime = rqstp->rq_stime; > + genl_rqstp.opcnt = 0; > + memcpy(&genl_rqstp.daddr, svc_daddr(rqstp), > + sizeof(struct sockaddr)); > + memcpy(&genl_rqstp.saddr, svc_addr(rqstp), > + sizeof(struct sockaddr)); > + > +#ifdef CONFIG_NFSD_V4 > + if (rqstp->rq_vers == NFS4_VERSION && > + rqstp->rq_proc == NFSPROC4_COMPOUND) { > + /* NFSv4 compund */ nit: compound > + struct nfsd4_compoundargs *args; > + int j; > + > + args = rqstp->rq_argp; > + genl_rqstp.opcnt = args->opcnt; > + for (j = 0; j < genl_rqstp.opcnt; j++) > + genl_rqstp.opnum[j] = > + args->ops[j].opnum; > + } > +#endif /* CONFIG_NFSD_V4 */ > + > + /* > + * Acquire rq_status_counter before reporting the rqst > + * fields to the user. > + */ > + if (smp_load_acquire(&rqstp->rq_status_counter) != > + status_counter) > + continue; > + > + ret = nfsd_genl_rpc_status_compose_msg(skb, cb, > + &genl_rqstp); > + if (ret) > + goto out; > + } > + } > + > + cb->args[0] = i; > + cb->args[1] = rqstp_index; I'm unsure if this is possible, but if the for loop above iterates zero times, or for all iterations (i < cb->args[0]), then rqstp_index will be used uninitialised here. Flagged by Smatch. > + ret = skb->len; > +out: > + rcu_read_unlock(); > + > + return ret; > +} ... 
> diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h > index 11c14faa6c67..d787bd38c053 100644 > --- a/fs/nfsd/nfsd.h > +++ b/fs/nfsd/nfsd.h > @@ -62,6 +62,22 @@ struct readdir_cd { > __be32 err; /* 0, nfserr, or nfserr_eof */ > }; > > +/* Maximum number of operations per session compound */ > +#define NFSD_MAX_OPS_PER_COMPOUND 50 > + > +struct nfsd_genl_rqstp { > + struct sockaddr daddr; > + struct sockaddr saddr; > + unsigned long rq_flags; > + ktime_t rq_stime; > + __be32 rq_xid; > + u32 rq_vers; > + u32 rq_prog; > + u32 rq_proc; > + /* NFSv4 compund */ nit: compound > + u32 opnum[NFSD_MAX_OPS_PER_COMPOUND]; > + u16 opcnt; > +}; > > extern struct svc_program nfsd_program; > extern const struct svc_version nfsd_version2, nfsd_version3, nfsd_version4; ...
[...] > > + > > +#ifdef CONFIG_NFSD_V4 > > + if (rqstp->rq_vers == NFS4_VERSION && > > + rqstp->rq_proc == NFSPROC4_COMPOUND) { > > + /* NFSv4 compund */ > > nit: compound ack, I will fix it. > > > + struct nfsd4_compoundargs *args; > > + int j; > > + > > + args = rqstp->rq_argp; > > + genl_rqstp.opcnt = args->opcnt; > > + for (j = 0; j < genl_rqstp.opcnt; j++) > > + genl_rqstp.opnum[j] = > > + args->ops[j].opnum; > > + } > > +#endif /* CONFIG_NFSD_V4 */ > > + > > + /* > > + * Acquire rq_status_counter before reporting the rqst > > + * fields to the user. > > + */ > > + if (smp_load_acquire(&rqstp->rq_status_counter) != > > + status_counter) > > + continue; > > + > > + ret = nfsd_genl_rpc_status_compose_msg(skb, cb, > > + &genl_rqstp); > > + if (ret) > > + goto out; > > + } > > + } > > + > > + cb->args[0] = i; > > + cb->args[1] = rqstp_index; > > I'm unsure if this is possible, but if the for loop above iterates zero > times, or for all iterations (i < cb->args[0]), then rqstp_index will > be used uninitialised here. ack, thx for spotting it, I will fix it. Regards, Lorenzo > > Flagged by Smatch. > > > + ret = skb->len; > > +out: > > + rcu_read_unlock(); > > + > > + return ret; > > +} > > ... 
> > > diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h > > index 11c14faa6c67..d787bd38c053 100644 > > --- a/fs/nfsd/nfsd.h > > +++ b/fs/nfsd/nfsd.h > > @@ -62,6 +62,22 @@ struct readdir_cd { > > __be32 err; /* 0, nfserr, or nfserr_eof */ > > }; > > > > +/* Maximum number of operations per session compound */ > > +#define NFSD_MAX_OPS_PER_COMPOUND 50 > > + > > +struct nfsd_genl_rqstp { > > + struct sockaddr daddr; > > + struct sockaddr saddr; > > + unsigned long rq_flags; > > + ktime_t rq_stime; > > + __be32 rq_xid; > > + u32 rq_vers; > > + u32 rq_prog; > > + u32 rq_proc; > > + /* NFSv4 compund */ > > nit: compound > > > + u32 opnum[NFSD_MAX_OPS_PER_COMPOUND]; > > + u16 opcnt; > > +}; > > > > extern struct svc_program nfsd_program; > > extern const struct svc_version nfsd_version2, nfsd_version3, nfsd_version4; > > ... >
On Mon, 11 Sep 2023 14:49:46 +0200 Lorenzo Bianconi wrote: > + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, > + &nfsd_server_nl_family, NLM_F_MULTI, > + NFSD_CMD_RPC_STATUS_GET); > + if (!hdr) > + return -ENOBUFS; Why NLM_F_MULTI? AFAIU that means "I'm splitting one object over multiple messages". 99% of the time the right thing to do is change what we consider to be "an object" rather than do F_MULTI. In theory user space should re-constitute all the NLM_F_MULTI messages into a single object, which none of YNL does today :(
> On Mon, 11 Sep 2023 14:49:46 +0200 Lorenzo Bianconi wrote: > > + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, > > + &nfsd_server_nl_family, NLM_F_MULTI, > > + NFSD_CMD_RPC_STATUS_GET); > > + if (!hdr) > > + return -ENOBUFS; > > Why NLM_F_MULTI? AFAIU that means "I'm splitting one object over > multiple messages". 99% of the time the right thing to do is change > what we consider to be "an object" rather than do F_MULTI. In theory > user space should re-constitute all the NLM_F_MULTI messages into as > single object, which none of YNL does today :( > ack, fine. I think we can get rid of it. @chuck: do you want me to send a patch or are you taking care of it? Regards, Lorenzo
> On Oct 4, 2023, at 6:14 AM, Lorenzo Bianconi <lorenzo.bianconi@redhat.com> wrote: > >> On Mon, 11 Sep 2023 14:49:46 +0200 Lorenzo Bianconi wrote: >>> + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, >>> + &nfsd_server_nl_family, NLM_F_MULTI, >>> + NFSD_CMD_RPC_STATUS_GET); >>> + if (!hdr) >>> + return -ENOBUFS; >> >> Why NLM_F_MULTI? AFAIU that means "I'm splitting one object over >> multiple messages". 99% of the time the right thing to do is change >> what we consider to be "an object" rather than do F_MULTI. In theory >> user space should re-constitute all the NLM_F_MULTI messages into as >> single object, which none of YNL does today :( >> > ack, fine. I think we can get rid of it. > @chuck: do you want me to send a patch or are you taking care of it? Send a (tested) patch and I can squash it into this one. -- Chuck Lever
> > > > On Oct 4, 2023, at 6:14 AM, Lorenzo Bianconi <lorenzo.bianconi@redhat.com> wrote: > > > >> On Mon, 11 Sep 2023 14:49:46 +0200 Lorenzo Bianconi wrote: > >>> + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, > >>> + &nfsd_server_nl_family, NLM_F_MULTI, > >>> + NFSD_CMD_RPC_STATUS_GET); > >>> + if (!hdr) > >>> + return -ENOBUFS; > >> > >> Why NLM_F_MULTI? AFAIU that means "I'm splitting one object over > >> multiple messages". 99% of the time the right thing to do is change > >> what we consider to be "an object" rather than do F_MULTI. In theory > >> user space should re-constitute all the NLM_F_MULTI messages into as > >> single object, which none of YNL does today :( > >> > > ack, fine. I think we can get rid of it. > > @chuck: do you want me to send a patch or are you taking care of it? > > Send a (tested) patch and I can squash it into this one. ack, I will do. Regards, Lorenzo > > > -- > Chuck Lever > >
On Mon, 2023-09-11 at 14:49 +0200, Lorenzo Bianconi wrote: > Introduce rpc_status netlink support for NFSD in order to dump pending > RPC requests debugging information from userspace. > > Tested-by: Jeff Layton <jlayton@kernel.org> > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org> > --- > fs/nfsd/nfsctl.c | 192 ++++++++++++++++++++++++++++++++++++- > fs/nfsd/nfsd.h | 16 ++++ > fs/nfsd/nfssvc.c | 15 +++ > fs/nfsd/state.h | 2 - > include/linux/sunrpc/svc.h | 1 + > 5 files changed, 222 insertions(+), 4 deletions(-) > > diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c > index 1be66088849c..b862a759ea15 100644 > --- a/fs/nfsd/nfsctl.c > +++ b/fs/nfsd/nfsctl.c > @@ -26,6 +26,7 @@ > #include "pnfs.h" > #include "filecache.h" > #include "trace.h" > +#include "nfs_netlink_gen.h" > > /* > * We have a single directory with several nodes in it. > @@ -1497,17 +1498,199 @@ unsigned int nfsd_net_id; > > int nfsd_server_nl_rpc_status_get_start(struct netlink_callback *cb) > { > - return 0; > + struct nfsd_net *nn = net_generic(sock_net(cb->skb->sk), nfsd_net_id); > + int ret = -ENODEV; > + > + mutex_lock(&nfsd_mutex); > + if (nn->nfsd_serv) { > + svc_get(nn->nfsd_serv); > + ret = 0; > + } > + mutex_unlock(&nfsd_mutex); > + > + return ret; > } I think there is a potential race above. Once you've dropped the nfsd_mutex, there is no guarantee that the nn->nfsd_serv will still be set when you come back to put the serv. That means that we could oops when we hit the _done method below. Is it possible to stash a pointer to the serv while we hold the reference? 
> > -int nfsd_server_nl_rpc_status_get_done(struct netlink_callback *cb) > +static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb, > + struct netlink_callback *cb, > + struct nfsd_genl_rqstp *rqstp) > { > + void *hdr; > + int i; > + > + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, > + &nfsd_server_nl_family, NLM_F_MULTI, > + NFSD_CMD_RPC_STATUS_GET); > + if (!hdr) > + return -ENOBUFS; > + > + if (nla_put_be32(skb, NFSD_ATTR_RPC_STATUS_XID, rqstp->rq_xid) || > + nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_FLAGS, rqstp->rq_flags) || > + nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_PROG, rqstp->rq_prog) || > + nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_PROC, rqstp->rq_proc) || > + nla_put_u8(skb, NFSD_ATTR_RPC_STATUS_VERSION, rqstp->rq_vers) || > + nla_put_s64(skb, NFSD_ATTR_RPC_STATUS_SERVICE_TIME, > + ktime_to_us(rqstp->rq_stime), > + NFSD_ATTR_RPC_STATUS_PAD)) > + return -ENOBUFS; > + > + switch (rqstp->saddr.sa_family) { > + case AF_INET: { > + const struct sockaddr_in *s_in, *d_in; > + > + s_in = (const struct sockaddr_in *)&rqstp->saddr; > + d_in = (const struct sockaddr_in *)&rqstp->daddr; > + if (nla_put_in_addr(skb, NFSD_ATTR_RPC_STATUS_SADDR4, > + s_in->sin_addr.s_addr) || > + nla_put_in_addr(skb, NFSD_ATTR_RPC_STATUS_DADDR4, > + d_in->sin_addr.s_addr) || > + nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_SPORT, > + s_in->sin_port) || > + nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_DPORT, > + d_in->sin_port)) > + return -ENOBUFS; > + break; > + } > + case AF_INET6: { > + const struct sockaddr_in6 *s_in, *d_in; > + > + s_in = (const struct sockaddr_in6 *)&rqstp->saddr; > + d_in = (const struct sockaddr_in6 *)&rqstp->daddr; > + if (nla_put_in6_addr(skb, NFSD_ATTR_RPC_STATUS_SADDR6, > + &s_in->sin6_addr) || > + nla_put_in6_addr(skb, NFSD_ATTR_RPC_STATUS_DADDR6, > + &d_in->sin6_addr) || > + nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_SPORT, > + s_in->sin6_port) || > + nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_DPORT, > + d_in->sin6_port)) > + return -ENOBUFS; > + 
break; > + } > + default: > + break; > + } > + > + if (rqstp->opcnt) { > + struct nlattr *attr; > + > + attr = nla_nest_start(skb, NFSD_ATTR_RPC_STATUS_COMPOND_OP); > + if (!attr) > + return -ENOBUFS; > + > + for (i = 0; i < rqstp->opcnt; i++) { > + struct nlattr *op_attr; > + > + op_attr = nla_nest_start(skb, i); > + if (!op_attr) > + return -ENOBUFS; > + > + if (nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_COMP_OP, > + rqstp->opnum[i])) > + return -ENOBUFS; > + > + nla_nest_end(skb, op_attr); > + } > + > + nla_nest_end(skb, attr); > + } > + > + genlmsg_end(skb, hdr); > + > return 0; > } > > int nfsd_server_nl_rpc_status_get_dumpit(struct sk_buff *skb, > struct netlink_callback *cb) > { > + struct nfsd_net *nn = net_generic(sock_net(skb->sk), nfsd_net_id); > + int i, ret, rqstp_index; > + > + rcu_read_lock(); > + > + for (i = 0; i < nn->nfsd_serv->sv_nrpools; i++) { > + struct svc_rqst *rqstp; > + > + if (i < cb->args[0]) /* already consumed */ > + continue; > + > + rqstp_index = 0; > + list_for_each_entry_rcu(rqstp, > + &nn->nfsd_serv->sv_pools[i].sp_all_threads, > + rq_all) { > + struct nfsd_genl_rqstp genl_rqstp; > + unsigned int status_counter; > + > + if (rqstp_index++ < cb->args[1]) /* already consumed */ > + continue; > + /* > + * Acquire rq_status_counter before parsing the rqst > + * fields. rq_status_counter is set to an odd value in > + * order to notify the consumers the rqstp fields are > + * meaningful. 
> + */ > + status_counter = > + smp_load_acquire(&rqstp->rq_status_counter); > + if (!(status_counter & 1)) > + continue; > + > + genl_rqstp.rq_xid = rqstp->rq_xid; > + genl_rqstp.rq_flags = rqstp->rq_flags; > + genl_rqstp.rq_vers = rqstp->rq_vers; > + genl_rqstp.rq_prog = rqstp->rq_prog; > + genl_rqstp.rq_proc = rqstp->rq_proc; > + genl_rqstp.rq_stime = rqstp->rq_stime; > + genl_rqstp.opcnt = 0; > + memcpy(&genl_rqstp.daddr, svc_daddr(rqstp), > + sizeof(struct sockaddr)); > + memcpy(&genl_rqstp.saddr, svc_addr(rqstp), > + sizeof(struct sockaddr)); > + > +#ifdef CONFIG_NFSD_V4 > + if (rqstp->rq_vers == NFS4_VERSION && > + rqstp->rq_proc == NFSPROC4_COMPOUND) { > + /* NFSv4 compund */ > + struct nfsd4_compoundargs *args; > + int j; > + > + args = rqstp->rq_argp; > + genl_rqstp.opcnt = args->opcnt; > + for (j = 0; j < genl_rqstp.opcnt; j++) > + genl_rqstp.opnum[j] = > + args->ops[j].opnum; > + } > +#endif /* CONFIG_NFSD_V4 */ > + > + /* > + * Acquire rq_status_counter before reporting the rqst > + * fields to the user. > + */ > + if (smp_load_acquire(&rqstp->rq_status_counter) != > + status_counter) > + continue; > + > + ret = nfsd_genl_rpc_status_compose_msg(skb, cb, > + &genl_rqstp); > + if (ret) > + goto out; > + } > + } > + > + cb->args[0] = i; > + cb->args[1] = rqstp_index; > + ret = skb->len; > +out: > + rcu_read_unlock(); > + > + return ret; > +} > + > +int nfsd_server_nl_rpc_status_get_done(struct netlink_callback *cb) > +{ > + mutex_lock(&nfsd_mutex); > + nfsd_put(sock_net(cb->skb->sk)); > + mutex_unlock(&nfsd_mutex); > + > return 0; > } > I think there is a potential race above. 
Once you've > @@ -1605,6 +1788,10 @@ static int __init init_nfsd(void) > retval = register_filesystem(&nfsd_fs_type); > if (retval) > goto out_free_all; > + retval = genl_register_family(&nfsd_server_nl_family); > + if (retval) > + goto out_free_all; > + > return 0; > out_free_all: > nfsd4_destroy_laundry_wq(); > @@ -1629,6 +1816,7 @@ static int __init init_nfsd(void) > > static void __exit exit_nfsd(void) > { > + genl_unregister_family(&nfsd_server_nl_family); > unregister_filesystem(&nfsd_fs_type); > nfsd4_destroy_laundry_wq(); > unregister_cld_notifier(); > diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h > index 11c14faa6c67..d787bd38c053 100644 > --- a/fs/nfsd/nfsd.h > +++ b/fs/nfsd/nfsd.h > @@ -62,6 +62,22 @@ struct readdir_cd { > __be32 err; /* 0, nfserr, or nfserr_eof */ > }; > > +/* Maximum number of operations per session compound */ > +#define NFSD_MAX_OPS_PER_COMPOUND 50 > + > +struct nfsd_genl_rqstp { > + struct sockaddr daddr; > + struct sockaddr saddr; > + unsigned long rq_flags; > + ktime_t rq_stime; > + __be32 rq_xid; > + u32 rq_vers; > + u32 rq_prog; > + u32 rq_proc; > + /* NFSv4 compund */ > + u32 opnum[NFSD_MAX_OPS_PER_COMPOUND]; > + u16 opcnt; > +}; > > extern struct svc_program nfsd_program; > extern const struct svc_version nfsd_version2, nfsd_version3, nfsd_version4; > diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c > index 1582af33e204..fad34a7325b3 100644 > --- a/fs/nfsd/nfssvc.c > +++ b/fs/nfsd/nfssvc.c > @@ -998,6 +998,15 @@ int nfsd_dispatch(struct svc_rqst *rqstp) > if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream)) > goto out_decode_err; > > + /* > + * Release rq_status_counter setting it to an odd value after the rpc > + * request has been properly parsed. rq_status_counter is used to > + * notify the consumers if the rqstp fields are stable > + * (rq_status_counter is odd) or not meaningful (rq_status_counter > + * is even). 
> + */ > + smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter | 1); > + > rp = NULL; > switch (nfsd_cache_lookup(rqstp, &rp)) { > case RC_DOIT: > @@ -1015,6 +1024,12 @@ int nfsd_dispatch(struct svc_rqst *rqstp) > if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream)) > goto out_encode_err; > > + /* > + * Release rq_status_counter setting it to an even value after the rpc > + * request has been properly processed. > + */ > + smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1); > + > nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, statp + 1); > out_cached_reply: > return 1; > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h > index cbddcf484dba..41bdc913fa71 100644 > --- a/fs/nfsd/state.h > +++ b/fs/nfsd/state.h > @@ -174,8 +174,6 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s) > > /* Maximum number of slots per session. 160 is useful for long haul TCP */ > #define NFSD_MAX_SLOTS_PER_SESSION 160 > -/* Maximum number of operations per session compound */ > -#define NFSD_MAX_OPS_PER_COMPOUND 50 > /* Maximum session per slot cache size */ > #define NFSD_SLOT_CACHE_SIZE 2048 > /* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */ > diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h > index dbf5b21feafe..caa20defd255 100644 > --- a/include/linux/sunrpc/svc.h > +++ b/include/linux/sunrpc/svc.h > @@ -251,6 +251,7 @@ struct svc_rqst { > * net namespace > */ > void ** rq_lease_breaker; /* The v4 client breaking a lease */ > + unsigned int rq_status_counter; /* RPC processing counter */ > }; > > /* bits for rq_flags */
On Mon, 2023-12-11 at 13:56 -0500, Jeff Layton wrote: > On Mon, 2023-09-11 at 14:49 +0200, Lorenzo Bianconi wrote: > > Introduce rpc_status netlink support for NFSD in order to dump pending > > RPC requests debugging information from userspace. > > > > Tested-by: Jeff Layton <jlayton@kernel.org> > > Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org> > > --- > > fs/nfsd/nfsctl.c | 192 ++++++++++++++++++++++++++++++++++++- > > fs/nfsd/nfsd.h | 16 ++++ > > fs/nfsd/nfssvc.c | 15 +++ > > fs/nfsd/state.h | 2 - > > include/linux/sunrpc/svc.h | 1 + > > 5 files changed, 222 insertions(+), 4 deletions(-) > > > > diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c > > index 1be66088849c..b862a759ea15 100644 > > --- a/fs/nfsd/nfsctl.c > > +++ b/fs/nfsd/nfsctl.c > > @@ -26,6 +26,7 @@ > > #include "pnfs.h" > > #include "filecache.h" > > #include "trace.h" > > +#include "nfs_netlink_gen.h" > > > > /* > > * We have a single directory with several nodes in it. > > @@ -1497,17 +1498,199 @@ unsigned int nfsd_net_id; > > > > int nfsd_server_nl_rpc_status_get_start(struct netlink_callback *cb) > > { > > - return 0; > > + struct nfsd_net *nn = net_generic(sock_net(cb->skb->sk), nfsd_net_id); > > + int ret = -ENODEV; > > + > > + mutex_lock(&nfsd_mutex); > > + if (nn->nfsd_serv) { > > + svc_get(nn->nfsd_serv); > > + ret = 0; > > + } > > + mutex_unlock(&nfsd_mutex); > > + > > + return ret; > > } > > I think there is a potential race above. Once you've dropped the > nfsd_mutex, there is no guarantee that the nn->nfsd_serv will still be > set when you come back to put the serv. That means that we could oops > when we hit the _done method below. > > Is it possible to stash a pointer to the serv while we hold the > reference? > Actually, it looks like Neil may have already fixed this in the series he sent on Oct 29th. See: [PATCH 3/5] nfsd: hold nfsd_mutex across entire netlink operation Another reason to go ahead and get that series in... 
> > > > -int nfsd_server_nl_rpc_status_get_done(struct netlink_callback *cb) > > +static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb, > > + struct netlink_callback *cb, > > + struct nfsd_genl_rqstp *rqstp) > > { > > + void *hdr; > > + int i; > > + > > + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, > > + &nfsd_server_nl_family, NLM_F_MULTI, > > + NFSD_CMD_RPC_STATUS_GET); > > + if (!hdr) > > + return -ENOBUFS; > > + > > + if (nla_put_be32(skb, NFSD_ATTR_RPC_STATUS_XID, rqstp->rq_xid) || > > + nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_FLAGS, rqstp->rq_flags) || > > + nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_PROG, rqstp->rq_prog) || > > + nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_PROC, rqstp->rq_proc) || > > + nla_put_u8(skb, NFSD_ATTR_RPC_STATUS_VERSION, rqstp->rq_vers) || > > + nla_put_s64(skb, NFSD_ATTR_RPC_STATUS_SERVICE_TIME, > > + ktime_to_us(rqstp->rq_stime), > > + NFSD_ATTR_RPC_STATUS_PAD)) > > + return -ENOBUFS; > > + > > + switch (rqstp->saddr.sa_family) { > > + case AF_INET: { > > + const struct sockaddr_in *s_in, *d_in; > > + > > + s_in = (const struct sockaddr_in *)&rqstp->saddr; > > + d_in = (const struct sockaddr_in *)&rqstp->daddr; > > + if (nla_put_in_addr(skb, NFSD_ATTR_RPC_STATUS_SADDR4, > > + s_in->sin_addr.s_addr) || > > + nla_put_in_addr(skb, NFSD_ATTR_RPC_STATUS_DADDR4, > > + d_in->sin_addr.s_addr) || > > + nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_SPORT, > > + s_in->sin_port) || > > + nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_DPORT, > > + d_in->sin_port)) > > + return -ENOBUFS; > > + break; > > + } > > + case AF_INET6: { > > + const struct sockaddr_in6 *s_in, *d_in; > > + > > + s_in = (const struct sockaddr_in6 *)&rqstp->saddr; > > + d_in = (const struct sockaddr_in6 *)&rqstp->daddr; > > + if (nla_put_in6_addr(skb, NFSD_ATTR_RPC_STATUS_SADDR6, > > + &s_in->sin6_addr) || > > + nla_put_in6_addr(skb, NFSD_ATTR_RPC_STATUS_DADDR6, > > + &d_in->sin6_addr) || > > + nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_SPORT, > > + 
s_in->sin6_port) || > > + nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_DPORT, > > + d_in->sin6_port)) > > + return -ENOBUFS; > > + break; > > + } > > + default: > > + break; > > + } > > + > > + if (rqstp->opcnt) { > > + struct nlattr *attr; > > + > > + attr = nla_nest_start(skb, NFSD_ATTR_RPC_STATUS_COMPOND_OP); > > + if (!attr) > > + return -ENOBUFS; > > + > > + for (i = 0; i < rqstp->opcnt; i++) { > > + struct nlattr *op_attr; > > + > > + op_attr = nla_nest_start(skb, i); > > + if (!op_attr) > > + return -ENOBUFS; > > + > > + if (nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_COMP_OP, > > + rqstp->opnum[i])) > > + return -ENOBUFS; > > + > > + nla_nest_end(skb, op_attr); > > + } > > + > > + nla_nest_end(skb, attr); > > + } > > + > > + genlmsg_end(skb, hdr); > > + > > return 0; > > } > > > > int nfsd_server_nl_rpc_status_get_dumpit(struct sk_buff *skb, > > struct netlink_callback *cb) > > { > > + struct nfsd_net *nn = net_generic(sock_net(skb->sk), nfsd_net_id); > > + int i, ret, rqstp_index; > > + > > + rcu_read_lock(); > > + > > + for (i = 0; i < nn->nfsd_serv->sv_nrpools; i++) { > > + struct svc_rqst *rqstp; > > + > > + if (i < cb->args[0]) /* already consumed */ > > + continue; > > + > > + rqstp_index = 0; > > + list_for_each_entry_rcu(rqstp, > > + &nn->nfsd_serv->sv_pools[i].sp_all_threads, > > + rq_all) { > > + struct nfsd_genl_rqstp genl_rqstp; > > + unsigned int status_counter; > > + > > + if (rqstp_index++ < cb->args[1]) /* already consumed */ > > + continue; > > + /* > > + * Acquire rq_status_counter before parsing the rqst > > + * fields. rq_status_counter is set to an odd value in > > + * order to notify the consumers the rqstp fields are > > + * meaningful. 
> > + */ > > + status_counter = > > + smp_load_acquire(&rqstp->rq_status_counter); > > + if (!(status_counter & 1)) > > + continue; > > + > > + genl_rqstp.rq_xid = rqstp->rq_xid; > > + genl_rqstp.rq_flags = rqstp->rq_flags; > > + genl_rqstp.rq_vers = rqstp->rq_vers; > > + genl_rqstp.rq_prog = rqstp->rq_prog; > > + genl_rqstp.rq_proc = rqstp->rq_proc; > > + genl_rqstp.rq_stime = rqstp->rq_stime; > > + genl_rqstp.opcnt = 0; > > + memcpy(&genl_rqstp.daddr, svc_daddr(rqstp), > > + sizeof(struct sockaddr)); > > + memcpy(&genl_rqstp.saddr, svc_addr(rqstp), > > + sizeof(struct sockaddr)); > > + > > +#ifdef CONFIG_NFSD_V4 > > + if (rqstp->rq_vers == NFS4_VERSION && > > + rqstp->rq_proc == NFSPROC4_COMPOUND) { > > + /* NFSv4 compund */ > > + struct nfsd4_compoundargs *args; > > + int j; > > + > > + args = rqstp->rq_argp; > > + genl_rqstp.opcnt = args->opcnt; > > + for (j = 0; j < genl_rqstp.opcnt; j++) > > + genl_rqstp.opnum[j] = > > + args->ops[j].opnum; > > + } > > +#endif /* CONFIG_NFSD_V4 */ > > + > > + /* > > + * Acquire rq_status_counter before reporting the rqst > > + * fields to the user. > > + */ > > + if (smp_load_acquire(&rqstp->rq_status_counter) != > > + status_counter) > > + continue; > > + > > + ret = nfsd_genl_rpc_status_compose_msg(skb, cb, > > + &genl_rqstp); > > + if (ret) > > + goto out; > > + } > > + } > > + > > + cb->args[0] = i; > > + cb->args[1] = rqstp_index; > > + ret = skb->len; > > +out: > > + rcu_read_unlock(); > > + > > + return ret; > > +} > > + > > +int nfsd_server_nl_rpc_status_get_done(struct netlink_callback *cb) > > +{ > > + mutex_lock(&nfsd_mutex); > > + nfsd_put(sock_net(cb->skb->sk)); > > + mutex_unlock(&nfsd_mutex); > > + > > return 0; > > } > > > > I think there is a potential race above. 
Once you've > > > > @@ -1605,6 +1788,10 @@ static int __init init_nfsd(void) > > retval = register_filesystem(&nfsd_fs_type); > > if (retval) > > goto out_free_all; > > + retval = genl_register_family(&nfsd_server_nl_family); > > + if (retval) > > + goto out_free_all; > > + > > return 0; > > out_free_all: > > nfsd4_destroy_laundry_wq(); > > @@ -1629,6 +1816,7 @@ static int __init init_nfsd(void) > > > > static void __exit exit_nfsd(void) > > { > > + genl_unregister_family(&nfsd_server_nl_family); > > unregister_filesystem(&nfsd_fs_type); > > nfsd4_destroy_laundry_wq(); > > unregister_cld_notifier(); > > diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h > > index 11c14faa6c67..d787bd38c053 100644 > > --- a/fs/nfsd/nfsd.h > > +++ b/fs/nfsd/nfsd.h > > @@ -62,6 +62,22 @@ struct readdir_cd { > > __be32 err; /* 0, nfserr, or nfserr_eof */ > > }; > > > > +/* Maximum number of operations per session compound */ > > +#define NFSD_MAX_OPS_PER_COMPOUND 50 > > + > > +struct nfsd_genl_rqstp { > > + struct sockaddr daddr; > > + struct sockaddr saddr; > > + unsigned long rq_flags; > > + ktime_t rq_stime; > > + __be32 rq_xid; > > + u32 rq_vers; > > + u32 rq_prog; > > + u32 rq_proc; > > + /* NFSv4 compund */ > > + u32 opnum[NFSD_MAX_OPS_PER_COMPOUND]; > > + u16 opcnt; > > +}; > > > > extern struct svc_program nfsd_program; > > extern const struct svc_version nfsd_version2, nfsd_version3, nfsd_version4; > > diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c > > index 1582af33e204..fad34a7325b3 100644 > > --- a/fs/nfsd/nfssvc.c > > +++ b/fs/nfsd/nfssvc.c > > @@ -998,6 +998,15 @@ int nfsd_dispatch(struct svc_rqst *rqstp) > > if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream)) > > goto out_decode_err; > > > > + /* > > + * Release rq_status_counter setting it to an odd value after the rpc > > + * request has been properly parsed. 
rq_status_counter is used to > > + * notify the consumers if the rqstp fields are stable > > + * (rq_status_counter is odd) or not meaningful (rq_status_counter > > + * is even). > > + */ > > + smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter | 1); > > + > > rp = NULL; > > switch (nfsd_cache_lookup(rqstp, &rp)) { > > case RC_DOIT: > > @@ -1015,6 +1024,12 @@ int nfsd_dispatch(struct svc_rqst *rqstp) > > if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream)) > > goto out_encode_err; > > > > + /* > > + * Release rq_status_counter setting it to an even value after the rpc > > + * request has been properly processed. > > + */ > > + smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1); > > + > > nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, statp + 1); > > out_cached_reply: > > return 1; > > diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h > > index cbddcf484dba..41bdc913fa71 100644 > > --- a/fs/nfsd/state.h > > +++ b/fs/nfsd/state.h > > @@ -174,8 +174,6 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s) > > > > /* Maximum number of slots per session. 160 is useful for long haul TCP */ > > #define NFSD_MAX_SLOTS_PER_SESSION 160 > > -/* Maximum number of operations per session compound */ > > -#define NFSD_MAX_OPS_PER_COMPOUND 50 > > /* Maximum session per slot cache size */ > > #define NFSD_SLOT_CACHE_SIZE 2048 > > /* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */ > > diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h > > index dbf5b21feafe..caa20defd255 100644 > > --- a/include/linux/sunrpc/svc.h > > +++ b/include/linux/sunrpc/svc.h > > @@ -251,6 +251,7 @@ struct svc_rqst { > > * net namespace > > */ > > void ** rq_lease_breaker; /* The v4 client breaking a lease */ > > + unsigned int rq_status_counter; /* RPC processing counter */ > > }; > > > > /* bits for rq_flags */ >
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 1be66088849c..b862a759ea15 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -26,6 +26,7 @@ #include "pnfs.h" #include "filecache.h" #include "trace.h" +#include "nfs_netlink_gen.h" /* * We have a single directory with several nodes in it. @@ -1497,17 +1498,199 @@ unsigned int nfsd_net_id; int nfsd_server_nl_rpc_status_get_start(struct netlink_callback *cb) { - return 0; + struct nfsd_net *nn = net_generic(sock_net(cb->skb->sk), nfsd_net_id); + int ret = -ENODEV; + + mutex_lock(&nfsd_mutex); + if (nn->nfsd_serv) { + svc_get(nn->nfsd_serv); + ret = 0; + } + mutex_unlock(&nfsd_mutex); + + return ret; } -int nfsd_server_nl_rpc_status_get_done(struct netlink_callback *cb) +static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb, + struct netlink_callback *cb, + struct nfsd_genl_rqstp *rqstp) { + void *hdr; + int i; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &nfsd_server_nl_family, NLM_F_MULTI, + NFSD_CMD_RPC_STATUS_GET); + if (!hdr) + return -ENOBUFS; + + if (nla_put_be32(skb, NFSD_ATTR_RPC_STATUS_XID, rqstp->rq_xid) || + nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_FLAGS, rqstp->rq_flags) || + nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_PROG, rqstp->rq_prog) || + nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_PROC, rqstp->rq_proc) || + nla_put_u8(skb, NFSD_ATTR_RPC_STATUS_VERSION, rqstp->rq_vers) || + nla_put_s64(skb, NFSD_ATTR_RPC_STATUS_SERVICE_TIME, + ktime_to_us(rqstp->rq_stime), + NFSD_ATTR_RPC_STATUS_PAD)) + return -ENOBUFS; + + switch (rqstp->saddr.sa_family) { + case AF_INET: { + const struct sockaddr_in *s_in, *d_in; + + s_in = (const struct sockaddr_in *)&rqstp->saddr; + d_in = (const struct sockaddr_in *)&rqstp->daddr; + if (nla_put_in_addr(skb, NFSD_ATTR_RPC_STATUS_SADDR4, + s_in->sin_addr.s_addr) || + nla_put_in_addr(skb, NFSD_ATTR_RPC_STATUS_DADDR4, + d_in->sin_addr.s_addr) || + nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_SPORT, + s_in->sin_port) || + nla_put_be16(skb, 
NFSD_ATTR_RPC_STATUS_DPORT, + d_in->sin_port)) + return -ENOBUFS; + break; + } + case AF_INET6: { + const struct sockaddr_in6 *s_in, *d_in; + + s_in = (const struct sockaddr_in6 *)&rqstp->saddr; + d_in = (const struct sockaddr_in6 *)&rqstp->daddr; + if (nla_put_in6_addr(skb, NFSD_ATTR_RPC_STATUS_SADDR6, + &s_in->sin6_addr) || + nla_put_in6_addr(skb, NFSD_ATTR_RPC_STATUS_DADDR6, + &d_in->sin6_addr) || + nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_SPORT, + s_in->sin6_port) || + nla_put_be16(skb, NFSD_ATTR_RPC_STATUS_DPORT, + d_in->sin6_port)) + return -ENOBUFS; + break; + } + default: + break; + } + + if (rqstp->opcnt) { + struct nlattr *attr; + + attr = nla_nest_start(skb, NFSD_ATTR_RPC_STATUS_COMPOND_OP); + if (!attr) + return -ENOBUFS; + + for (i = 0; i < rqstp->opcnt; i++) { + struct nlattr *op_attr; + + op_attr = nla_nest_start(skb, i); + if (!op_attr) + return -ENOBUFS; + + if (nla_put_u32(skb, NFSD_ATTR_RPC_STATUS_COMP_OP, + rqstp->opnum[i])) + return -ENOBUFS; + + nla_nest_end(skb, op_attr); + } + + nla_nest_end(skb, attr); + } + + genlmsg_end(skb, hdr); + return 0; } int nfsd_server_nl_rpc_status_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { + struct nfsd_net *nn = net_generic(sock_net(skb->sk), nfsd_net_id); + int i, ret, rqstp_index; + + rcu_read_lock(); + + for (i = 0; i < nn->nfsd_serv->sv_nrpools; i++) { + struct svc_rqst *rqstp; + + if (i < cb->args[0]) /* already consumed */ + continue; + + rqstp_index = 0; + list_for_each_entry_rcu(rqstp, + &nn->nfsd_serv->sv_pools[i].sp_all_threads, + rq_all) { + struct nfsd_genl_rqstp genl_rqstp; + unsigned int status_counter; + + if (rqstp_index++ < cb->args[1]) /* already consumed */ + continue; + /* + * Acquire rq_status_counter before parsing the rqst + * fields. rq_status_counter is set to an odd value in + * order to notify the consumers the rqstp fields are + * meaningful. 
+ */ + status_counter = + smp_load_acquire(&rqstp->rq_status_counter); + if (!(status_counter & 1)) + continue; + + genl_rqstp.rq_xid = rqstp->rq_xid; + genl_rqstp.rq_flags = rqstp->rq_flags; + genl_rqstp.rq_vers = rqstp->rq_vers; + genl_rqstp.rq_prog = rqstp->rq_prog; + genl_rqstp.rq_proc = rqstp->rq_proc; + genl_rqstp.rq_stime = rqstp->rq_stime; + genl_rqstp.opcnt = 0; + memcpy(&genl_rqstp.daddr, svc_daddr(rqstp), + sizeof(struct sockaddr)); + memcpy(&genl_rqstp.saddr, svc_addr(rqstp), + sizeof(struct sockaddr)); + +#ifdef CONFIG_NFSD_V4 + if (rqstp->rq_vers == NFS4_VERSION && + rqstp->rq_proc == NFSPROC4_COMPOUND) { + /* NFSv4 compund */ + struct nfsd4_compoundargs *args; + int j; + + args = rqstp->rq_argp; + genl_rqstp.opcnt = args->opcnt; + for (j = 0; j < genl_rqstp.opcnt; j++) + genl_rqstp.opnum[j] = + args->ops[j].opnum; + } +#endif /* CONFIG_NFSD_V4 */ + + /* + * Acquire rq_status_counter before reporting the rqst + * fields to the user. + */ + if (smp_load_acquire(&rqstp->rq_status_counter) != + status_counter) + continue; + + ret = nfsd_genl_rpc_status_compose_msg(skb, cb, + &genl_rqstp); + if (ret) + goto out; + } + } + + cb->args[0] = i; + cb->args[1] = rqstp_index; + ret = skb->len; +out: + rcu_read_unlock(); + + return ret; +} + +int nfsd_server_nl_rpc_status_get_done(struct netlink_callback *cb) +{ + mutex_lock(&nfsd_mutex); + nfsd_put(sock_net(cb->skb->sk)); + mutex_unlock(&nfsd_mutex); + return 0; } @@ -1605,6 +1788,10 @@ static int __init init_nfsd(void) retval = register_filesystem(&nfsd_fs_type); if (retval) goto out_free_all; + retval = genl_register_family(&nfsd_server_nl_family); + if (retval) + goto out_free_all; + return 0; out_free_all: nfsd4_destroy_laundry_wq(); @@ -1629,6 +1816,7 @@ static int __init init_nfsd(void) static void __exit exit_nfsd(void) { + genl_unregister_family(&nfsd_server_nl_family); unregister_filesystem(&nfsd_fs_type); nfsd4_destroy_laundry_wq(); unregister_cld_notifier(); diff --git a/fs/nfsd/nfsd.h 
b/fs/nfsd/nfsd.h index 11c14faa6c67..d787bd38c053 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -62,6 +62,22 @@ struct readdir_cd { __be32 err; /* 0, nfserr, or nfserr_eof */ }; +/* Maximum number of operations per session compound */ +#define NFSD_MAX_OPS_PER_COMPOUND 50 + +struct nfsd_genl_rqstp { + struct sockaddr daddr; + struct sockaddr saddr; + unsigned long rq_flags; + ktime_t rq_stime; + __be32 rq_xid; + u32 rq_vers; + u32 rq_prog; + u32 rq_proc; + /* NFSv4 compund */ + u32 opnum[NFSD_MAX_OPS_PER_COMPOUND]; + u16 opcnt; +}; extern struct svc_program nfsd_program; extern const struct svc_version nfsd_version2, nfsd_version3, nfsd_version4; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 1582af33e204..fad34a7325b3 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -998,6 +998,15 @@ int nfsd_dispatch(struct svc_rqst *rqstp) if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream)) goto out_decode_err; + /* + * Release rq_status_counter setting it to an odd value after the rpc + * request has been properly parsed. rq_status_counter is used to + * notify the consumers if the rqstp fields are stable + * (rq_status_counter is odd) or not meaningful (rq_status_counter + * is even). + */ + smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter | 1); + rp = NULL; switch (nfsd_cache_lookup(rqstp, &rp)) { case RC_DOIT: @@ -1015,6 +1024,12 @@ int nfsd_dispatch(struct svc_rqst *rqstp) if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream)) goto out_encode_err; + /* + * Release rq_status_counter setting it to an even value after the rpc + * request has been properly processed. 
+ */ + smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1); + nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, statp + 1); out_cached_reply: return 1; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index cbddcf484dba..41bdc913fa71 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -174,8 +174,6 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s) /* Maximum number of slots per session. 160 is useful for long haul TCP */ #define NFSD_MAX_SLOTS_PER_SESSION 160 -/* Maximum number of operations per session compound */ -#define NFSD_MAX_OPS_PER_COMPOUND 50 /* Maximum session per slot cache size */ #define NFSD_SLOT_CACHE_SIZE 2048 /* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */ diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index dbf5b21feafe..caa20defd255 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -251,6 +251,7 @@ struct svc_rqst { * net namespace */ void ** rq_lease_breaker; /* The v4 client breaking a lease */ + unsigned int rq_status_counter; /* RPC processing counter */ }; /* bits for rq_flags */