[2/2] net/9p: add a per-client fcall kmem_cache

Message ID: 1532943263-24378-2-git-send-email-asmadeus@codewreck.org
State: New, archived
Series: [1/2] net/9p: embed fcall in req to round down buffer allocs

Commit Message

Dominique Martinet July 30, 2018, 9:34 a.m. UTC
From: Dominique Martinet <dominique.martinet@cea.fr>

Having a specific cache for the fcall allocations helps speed up
allocations a bit, especially in case of non-"round" msizes.

The caches will automatically be merged if there are multiple caches
of items with the same size, so we do not need to try to share a cache
between different clients using the same msize.

Since the msize is negotiated with the server, only allocate the cache
after that negotiation has happened - previous allocations or
allocations of different sizes (e.g. zero-copy fcall) are made with
kmalloc directly.

Signed-off-by: Dominique Martinet <dominique.martinet@cea.fr>
---
 include/net/9p/client.h |  2 ++
 net/9p/client.c         | 40 ++++++++++++++++++++++++++++++++--------
 net/9p/trans_rdma.c     |  2 +-
 3 files changed, 35 insertions(+), 9 deletions(-)

Comments

piaojun July 31, 2018, 1:18 a.m. UTC | #1
Hi Dominique,

Could you paste some test results from before and after the patch is applied?
And I have a small suggestion in the comments below.

On 2018/7/30 17:34, Dominique Martinet wrote:
> From: Dominique Martinet <dominique.martinet@cea.fr>
> 
> Having a specific cache for the fcall allocations helps speed up
> allocations a bit, especially in case of non-"round" msizes.
> 
> The caches will automatically be merged if there are multiple caches
> of items with the same size, so we do not need to try to share a cache
> between different clients using the same msize.
> 
> Since the msize is negotiated with the server, only allocate the cache
> after that negotiation has happened - previous allocations or
> allocations of different sizes (e.g. zero-copy fcall) are made with
> kmalloc directly.
> 
> Signed-off-by: Dominique Martinet <dominique.martinet@cea.fr>
> ---
>  include/net/9p/client.h |  2 ++
>  net/9p/client.c         | 40 ++++++++++++++++++++++++++++++++--------
>  net/9p/trans_rdma.c     |  2 +-
>  3 files changed, 35 insertions(+), 9 deletions(-)
> 
> diff --git a/include/net/9p/client.h b/include/net/9p/client.h
> index 4b4ac1362ad5..8d9bc7402a42 100644
> --- a/include/net/9p/client.h
> +++ b/include/net/9p/client.h
> @@ -123,6 +123,7 @@ struct p9_client {
>  	struct p9_trans_module *trans_mod;
>  	enum p9_trans_status status;
>  	void *trans;
> +	struct kmem_cache *fcall_cache;
>  
>  	union {
>  		struct {
> @@ -230,6 +231,7 @@ int p9_client_mkdir_dotl(struct p9_fid *fid, const char *name, int mode,
>  				kgid_t gid, struct p9_qid *);
>  int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status);
>  int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *fl);
> +void p9_fcall_free(struct p9_client *c, struct p9_fcall *fc);
>  struct p9_req_t *p9_tag_lookup(struct p9_client *, u16);
>  void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status);
>  
> diff --git a/net/9p/client.c b/net/9p/client.c
> index ba99a94a12c9..215e3b1ed7b4 100644
> --- a/net/9p/client.c
> +++ b/net/9p/client.c
> @@ -231,15 +231,34 @@ static int parse_opts(char *opts, struct p9_client *clnt)
>  	return ret;
>  }
>  
> -static int p9_fcall_alloc(struct p9_fcall *fc, int alloc_msize)
> +static int p9_fcall_alloc(struct p9_client *c, struct p9_fcall *fc,
> +			  int alloc_msize)
>  {
> -	fc->sdata = kmalloc(alloc_msize, GFP_NOFS);
> +	if (c->fcall_cache && alloc_msize == c->msize)
> +		fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS);
> +	else
> +		fc->sdata = kmalloc(alloc_msize, GFP_NOFS);
>  	if (!fc->sdata)
>  		return -ENOMEM;
>  	fc->capacity = alloc_msize;
>  	return 0;
>  }
>  
> +void p9_fcall_free(struct p9_client *c, struct p9_fcall *fc)
> +{
> +	/* sdata can be NULL for interrupted requests in trans_rdma,
> +	 * and kmem_cache_free does not do NULL-check for us
> +	 */
> +	if (unlikely(!fc->sdata))
> +		return;
> +
> +	if (c->fcall_cache && fc->capacity == c->msize)
> +		kmem_cache_free(c->fcall_cache, fc->sdata);
> +	else
> +		kfree(fc->sdata);
> +}
> +EXPORT_SYMBOL(p9_fcall_free);
> +
>  static struct kmem_cache *p9_req_cache;
>  
>  /**
> @@ -261,9 +280,9 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size)
>  	if (!req)
>  		return NULL;
>  
> -	if (p9_fcall_alloc(&req->tc, alloc_msize))
> +	if (p9_fcall_alloc(c, &req->tc, alloc_msize))
>  		goto free;
> -	if (p9_fcall_alloc(&req->rc, alloc_msize))
> +	if (p9_fcall_alloc(c, &req->rc, alloc_msize))
>  		goto free;
>  
>  	p9pdu_reset(&req->tc);
> @@ -288,8 +307,8 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size)
>  	return req;
>  
>  free:
> -	kfree(req->tc.sdata);
> -	kfree(req->rc.sdata);
> +	p9_fcall_free(c, &req->tc);
> +	p9_fcall_free(c, &req->rc);
>  	kmem_cache_free(p9_req_cache, req);
>  	return ERR_PTR(-ENOMEM);
>  }
> @@ -333,8 +352,8 @@ static void p9_free_req(struct p9_client *c, struct p9_req_t *r)
>  	spin_lock_irqsave(&c->lock, flags);
>  	idr_remove(&c->reqs, tag);
>  	spin_unlock_irqrestore(&c->lock, flags);
> -	kfree(r->tc.sdata);
> -	kfree(r->rc.sdata);
> +	p9_fcall_free(c, &r->tc);
> +	p9_fcall_free(c, &r->rc);
>  	kmem_cache_free(p9_req_cache, r);
>  }
>  
> @@ -944,6 +963,7 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
>  
>  	clnt->trans_mod = NULL;
>  	clnt->trans = NULL;
> +	clnt->fcall_cache = NULL;
>  
>  	client_id = utsname()->nodename;
>  	memcpy(clnt->name, client_id, strlen(client_id) + 1);
> @@ -980,6 +1000,9 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
>  	if (err)
>  		goto close_trans;
>  
> +	clnt->fcall_cache = kmem_cache_create("9p-fcall-cache", clnt->msize,
> +					      0, 0, NULL);
> +
>  	return clnt;
>  
>  close_trans:
> @@ -1011,6 +1034,7 @@ void p9_client_destroy(struct p9_client *clnt)
>  
>  	p9_tag_cleanup(clnt);
>  
> +	kmem_cache_destroy(clnt->fcall_cache);

We could set fcall_cache to NULL here to guard against use-after-free.
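
That is, something like this (a sketch of the suggestion only):

	kmem_cache_destroy(clnt->fcall_cache);
	clnt->fcall_cache = NULL;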

>  	kfree(clnt);
>  }
>  EXPORT_SYMBOL(p9_client_destroy);
> diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
> index c5cac97df7f7..5e43f0a00b3a 100644
> --- a/net/9p/trans_rdma.c
> +++ b/net/9p/trans_rdma.c
> @@ -445,7 +445,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
>  	if (unlikely(atomic_read(&rdma->excess_rc) > 0)) {
>  		if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) {
>  			/* Got one! */
> -			kfree(req->rc.sdata);
> +			p9_fcall_free(client, &req->rc);
>  			req->rc.sdata = NULL;
>  			goto dont_need_post_recv;
>  		} else {
>
Dominique Martinet July 31, 2018, 1:35 a.m. UTC | #2
piaojun wrote on Tue, Jul 31, 2018:
> Could you paste some test results from before and after the patch is applied?

The only performance tests I did were sent to the list a couple of mails
earlier; you can find them here:
http://lkml.kernel.org/r/20180730093101.GA7894@nautica

In particular, the results of the small-write benchmark just before and
after this patch, without KASAN (these are the same numbers as in the
link; hardware/setup is described there):
 - no alloc (4.18-rc7 request cache): 65.4k req/s
 - non-power of two alloc, no patch: 61.6k req/s
 - power of two alloc, no patch: 62.2k req/s
 - non-power of two alloc, with patch: 64.7k req/s
 - power of two alloc, with patch: 65.1k req/s

I'm rather happy with the result; I didn't expect that using a dedicated
cache would bring this much back, but it's certainly worth it.

> > @@ -1011,6 +1034,7 @@ void p9_client_destroy(struct p9_client *clnt)
> >  
> >  	p9_tag_cleanup(clnt);
> >  
> > +	kmem_cache_destroy(clnt->fcall_cache);
> 
> We could set fcall_cache to NULL here to guard against use-after-free.
> 
> >  	kfree(clnt);

Hmm, I understand where this comes from, but I'm not sure I agree.
If someone tries to access the client while/after it is freed, things are
going to break anyway; I'd rather let things break as obviously as
possible than try to cover it up.
piaojun July 31, 2018, 1:45 a.m. UTC | #3
On 2018/7/31 9:35, Dominique Martinet wrote:
> piaojun wrote on Tue, Jul 31, 2018:
>> Could you paste some test results from before and after the patch is applied?
> 
> The only performance tests I did were sent to the list a couple of mails
> earlier; you can find them here:
> http://lkml.kernel.org/r/20180730093101.GA7894@nautica
> 
> In particular, the results of the small-write benchmark just before and
> after this patch, without KASAN (these are the same numbers as in the
> link; hardware/setup is described there):
>  - no alloc (4.18-rc7 request cache): 65.4k req/s
>  - non-power of two alloc, no patch: 61.6k req/s
>  - power of two alloc, no patch: 62.2k req/s
>  - non-power of two alloc, with patch: 64.7k req/s
>  - power of two alloc, with patch: 65.1k req/s
> 
> I'm rather happy with the result; I didn't expect that using a dedicated
> cache would bring this much back, but it's certainly worth it.
> 
It looks like a clear improvement.

>>> @@ -1011,6 +1034,7 @@ void p9_client_destroy(struct p9_client *clnt)
>>>  
>>>  	p9_tag_cleanup(clnt);
>>>  
>>> +	kmem_cache_destroy(clnt->fcall_cache);
>>
>> We could set fcall_cache to NULL here to guard against use-after-free.
>>
>>>  	kfree(clnt);
> 
> Hmm, I understand where this comes from, but I'm not sure I agree.
> If someone tries to access the client while/after it is freed, things are
> going to break anyway; I'd rather let things break as obviously as
> possible than try to cover it up.
> 
Setting it to NULL is not a big deal; I'd like to hear others' opinions.
Matthew Wilcox July 31, 2018, 2:46 a.m. UTC | #4
On Mon, Jul 30, 2018 at 11:34:23AM +0200, Dominique Martinet wrote:
> -static int p9_fcall_alloc(struct p9_fcall *fc, int alloc_msize)
> +static int p9_fcall_alloc(struct p9_client *c, struct p9_fcall *fc,
> +			  int alloc_msize)
>  {
> -	fc->sdata = kmalloc(alloc_msize, GFP_NOFS);
> +	if (c->fcall_cache && alloc_msize == c->msize)
> +		fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS);
> +	else
> +		fc->sdata = kmalloc(alloc_msize, GFP_NOFS);

Could you simplify this by initialising c->msize to 0, so that this
can simply be:

> +	if (alloc_msize == c->msize)
...

> +void p9_fcall_free(struct p9_client *c, struct p9_fcall *fc)
> +{
> +	/* sdata can be NULL for interrupted requests in trans_rdma,
> +	 * and kmem_cache_free does not do NULL-check for us
> +	 */
> +	if (unlikely(!fc->sdata))
> +		return;
> +
> +	if (c->fcall_cache && fc->capacity == c->msize)
> +		kmem_cache_free(c->fcall_cache, fc->sdata);
> +	else
> +		kfree(fc->sdata);
> +}

Is it possible for fcall_cache to be allocated before fcall_free is
called?  I'm concerned we might do this:

allocate message A
allocate message B
receive response A
allocate fcall_cache
receive response B

and then we'd call kmem_cache_free() for something allocated by kmalloc(),
which works with slab and slub, but doesn't work with slob (alas).
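
Concretely, the hazard would look something like this (a hypothetical
timeline, not code from the patch; the msize value is made up):

	/* before negotiation: c->fcall_cache == NULL, c->msize == 8192 */
	p9_fcall_alloc(c, &req->rc, 8192);	/* falls back to kmalloc() */

	/* negotiation completes; p9_client_create() creates fcall_cache */

	p9_fcall_free(c, &req->rc);
	/* c->fcall_cache != NULL and fc->capacity == c->msize now, so
	 * this calls kmem_cache_free(c->fcall_cache, fc->sdata) on memory
	 * that actually came from kmalloc(), which slab and slub tolerate
	 * but slob does not.
	 */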

> @@ -980,6 +1000,9 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
>  	if (err)
>  		goto close_trans;
>  
> +	clnt->fcall_cache = kmem_cache_create("9p-fcall-cache", clnt->msize,
> +					      0, 0, NULL);
> +

If we have slab merging turned off, or we have two mounts from servers
with different msizes, we'll end up with two slabs called 9p-fcall-cache.
I'm OK with that, but are you?
Dominique Martinet July 31, 2018, 4:17 a.m. UTC | #5
Matthew Wilcox wrote on Mon, Jul 30, 2018:
> On Mon, Jul 30, 2018 at 11:34:23AM +0200, Dominique Martinet wrote:
> > -static int p9_fcall_alloc(struct p9_fcall *fc, int alloc_msize)
> > +static int p9_fcall_alloc(struct p9_client *c, struct p9_fcall *fc,
> > +			  int alloc_msize)
> >  {
> > -	fc->sdata = kmalloc(alloc_msize, GFP_NOFS);
> > +	if (c->fcall_cache && alloc_msize == c->msize)
> > +		fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS);
> > +	else
> > +		fc->sdata = kmalloc(alloc_msize, GFP_NOFS);
> 
> Could you simplify this by initialising c->msize to 0, so that this
> can simply be:
> 
> > +	if (alloc_msize == c->msize)
> ...

Hmm, this is rather tricky with the current flow of things;
p9_client_version() has multiple uses for that msize field.

Basically what happens is:
 - init the client struct; set msize to the mount option value, clipped
to the transport-specific max
 - p9_client_version() uses the current c->msize to send a suggested value
to the server
 - p9_client_rpc() uses the current c->msize to allocate that first rpc;
this is pretty much hard-coded and making an exception for it would be
quite intrusive
 - p9_client_version() looks at the msize the server suggested and clips
c->msize if the reply's value is smaller


I kind of agree it would be nice to avoid doing that check all the
time just for startup's sake, but I don't see an easy way to do it with
the current code.

Making p9_client_version take an extra argument would be easy, but we'd
need to actually hardcode in p9_client_rpc that "if the message type is
TVERSION then use [page size or whatever] for allocation", and that kind
of kills the point... The alternative would be having p9_client_rpc take
the actual size as an argument itself, but that again is pretty intrusive
even if it could be done mechanically...

I'll think about this some more

> > +void p9_fcall_free(struct p9_client *c, struct p9_fcall *fc)
> > +{
> > +	/* sdata can be NULL for interrupted requests in trans_rdma,
> > +	 * and kmem_cache_free does not do NULL-check for us
> > +	 */
> > +	if (unlikely(!fc->sdata))
> > +		return;
> > +
> > +	if (c->fcall_cache && fc->capacity == c->msize)
> > +		kmem_cache_free(c->fcall_cache, fc->sdata);
> > +	else
> > +		kfree(fc->sdata);
> > +}
> 
> Is it possible for fcall_cache to be allocated before fcall_free is
> called?  I'm concerned we might do this:
> 
> allocate message A
> allocate message B
> receive response A
> allocate fcall_cache
> receive response B
> 
> and then we'd call kmem_cache_free() for something allocated by kmalloc(),
> which works with slab and slub, but doesn't work with slob (alas).

Bleh, I checked that this would work for slab but didn't really check
the others...

This cannot happen right now because we only return the client struct
from p9_client_create after the first message is done (and, right now,
freed), but once we start adding refcounting to requests it would become
possible to free the very first response after fcall_cache is allocated,
with a "bad" server (like syzkaller simulates) sending the version reply
before the request even came in.

I can't see any workaround for this other than storing how the fcall
was allocated in the struct itself, though...
I guess I might as well do that now, unless you have a better idea.
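
Something like this, maybe (an untested sketch; the "cache" field is a
name I'm making up here):

	struct p9_fcall {
		...
		struct kmem_cache *cache;  /* cache sdata came from, or NULL */
		u8 *sdata;
	};

	static int p9_fcall_alloc(struct p9_client *c, struct p9_fcall *fc,
				  int alloc_msize)
	{
		if (c->fcall_cache && alloc_msize == c->msize) {
			fc->cache = c->fcall_cache;
			fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS);
		} else {
			fc->cache = NULL;
			fc->sdata = kmalloc(alloc_msize, GFP_NOFS);
		}
		if (!fc->sdata)
			return -ENOMEM;
		fc->capacity = alloc_msize;
		return 0;
	}

The free side would then test fc->cache instead of comparing capacity
against c->msize, which also closes the race above.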


> > @@ -980,6 +1000,9 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
> >  	if (err)
> >  		goto close_trans;
> >  
> > +	clnt->fcall_cache = kmem_cache_create("9p-fcall-cache", clnt->msize,
> > +					      0, 0, NULL);
> > +
> 
> If we have slab merging turned off, or we have two mounts from servers
> with different msizes, we'll end up with two slabs called 9p-fcall-cache.
> I'm OK with that, but are you?

Yeah, the reason I didn't make it global like p9_req_cache is precisely
to get two separate caches if the msizes are different.

I actually considered adding the msize to the name with snprintf or
something, but anyone looking at it through slabinfo or similar tools
will see the sizes anyway, so I don't think it would bring anything.
Do you know of, or expect, any tools that will choke on multiple caches
with the same name?
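
(For reference, an untested sketch of what I mean by adding the msize to
the name:

	char name[32];

	snprintf(name, sizeof(name), "9p-fcall-%u", clnt->msize);
	clnt->fcall_cache = kmem_cache_create(name, clnt->msize, 0, 0, NULL);

but as said above I don't think it buys us much.)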


I'm not sure about slab merging being disabled, though; from the little I
understand, I don't see why anyone would do that except for debugging,
and I'm fine with that case.
Please let me know if I'm missing something!


Thanks for the review,
Greg Kurz Aug. 1, 2018, 2:28 p.m. UTC | #6
On Mon, 30 Jul 2018 11:34:23 +0200
Dominique Martinet <asmadeus@codewreck.org> wrote:

> From: Dominique Martinet <dominique.martinet@cea.fr>
> 
> Having a specific cache for the fcall allocations helps speed up
> allocations a bit, especially in case of non-"round" msizes.
> 
> The caches will automatically be merged if there are multiple caches
> of items with the same size, so we do not need to try to share a cache
> between different clients using the same msize.
> 
> Since the msize is negotiated with the server, only allocate the cache
> after that negotiation has happened - previous allocations or
> allocations of different sizes (e.g. zero-copy fcall) are made with
> kmalloc directly.
> 
> Signed-off-by: Dominique Martinet <dominique.martinet@cea.fr>
> ---

The patch looks good to me. It will need to be rebased once you have fixed
the potential kfree() of stale data in patch 1, either with an extra goto
label in p9_tag_alloc or by turning p9_fcall_alloc into p9_fcall_alloc_sdata;
both solutions are equivalent.

Just one suggestion, see below.

>  include/net/9p/client.h |  2 ++
>  net/9p/client.c         | 40 ++++++++++++++++++++++++++++++++--------
>  net/9p/trans_rdma.c     |  2 +-
>  3 files changed, 35 insertions(+), 9 deletions(-)
> 
> diff --git a/include/net/9p/client.h b/include/net/9p/client.h
> index 4b4ac1362ad5..8d9bc7402a42 100644
> --- a/include/net/9p/client.h
> +++ b/include/net/9p/client.h
> @@ -123,6 +123,7 @@ struct p9_client {
>  	struct p9_trans_module *trans_mod;
>  	enum p9_trans_status status;
>  	void *trans;
> +	struct kmem_cache *fcall_cache;
>  
>  	union {
>  		struct {
> @@ -230,6 +231,7 @@ int p9_client_mkdir_dotl(struct p9_fid *fid, const char *name, int mode,
>  				kgid_t gid, struct p9_qid *);
>  int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status);
>  int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *fl);
> +void p9_fcall_free(struct p9_client *c, struct p9_fcall *fc);
>  struct p9_req_t *p9_tag_lookup(struct p9_client *, u16);
>  void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status);
>  
> diff --git a/net/9p/client.c b/net/9p/client.c
> index ba99a94a12c9..215e3b1ed7b4 100644
> --- a/net/9p/client.c
> +++ b/net/9p/client.c
> @@ -231,15 +231,34 @@ static int parse_opts(char *opts, struct p9_client *clnt)
>  	return ret;
>  }
>  
> -static int p9_fcall_alloc(struct p9_fcall *fc, int alloc_msize)
> +static int p9_fcall_alloc(struct p9_client *c, struct p9_fcall *fc,
> +			  int alloc_msize)
>  {
> -	fc->sdata = kmalloc(alloc_msize, GFP_NOFS);
> +	if (c->fcall_cache && alloc_msize == c->msize)

This is presumably a hot path for any request but the initial TVERSION;
you probably want likely() here...

> +		fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS);
> +	else
> +		fc->sdata = kmalloc(alloc_msize, GFP_NOFS);
>  	if (!fc->sdata)
>  		return -ENOMEM;
>  	fc->capacity = alloc_msize;
>  	return 0;
>  }
>  
> +void p9_fcall_free(struct p9_client *c, struct p9_fcall *fc)
> +{
> +	/* sdata can be NULL for interrupted requests in trans_rdma,
> +	 * and kmem_cache_free does not do NULL-check for us
> +	 */
> +	if (unlikely(!fc->sdata))
> +		return;
> +
> +	if (c->fcall_cache && fc->capacity == c->msize)

... and here as well.

> +		kmem_cache_free(c->fcall_cache, fc->sdata);
> +	else
> +		kfree(fc->sdata);
> +}
> +EXPORT_SYMBOL(p9_fcall_free);
> +
>  static struct kmem_cache *p9_req_cache;
>  
>  /**
> @@ -261,9 +280,9 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size)
>  	if (!req)
>  		return NULL;
>  
> -	if (p9_fcall_alloc(&req->tc, alloc_msize))
> +	if (p9_fcall_alloc(c, &req->tc, alloc_msize))
>  		goto free;
> -	if (p9_fcall_alloc(&req->rc, alloc_msize))
> +	if (p9_fcall_alloc(c, &req->rc, alloc_msize))
>  		goto free;
>  
>  	p9pdu_reset(&req->tc);
> @@ -288,8 +307,8 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size)
>  	return req;
>  
>  free:
> -	kfree(req->tc.sdata);
> -	kfree(req->rc.sdata);
> +	p9_fcall_free(c, &req->tc);
> +	p9_fcall_free(c, &req->rc);
>  	kmem_cache_free(p9_req_cache, req);
>  	return ERR_PTR(-ENOMEM);
>  }
> @@ -333,8 +352,8 @@ static void p9_free_req(struct p9_client *c, struct p9_req_t *r)
>  	spin_lock_irqsave(&c->lock, flags);
>  	idr_remove(&c->reqs, tag);
>  	spin_unlock_irqrestore(&c->lock, flags);
> -	kfree(r->tc.sdata);
> -	kfree(r->rc.sdata);
> +	p9_fcall_free(c, &r->tc);
> +	p9_fcall_free(c, &r->rc);
>  	kmem_cache_free(p9_req_cache, r);
>  }
>  
> @@ -944,6 +963,7 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
>  
>  	clnt->trans_mod = NULL;
>  	clnt->trans = NULL;
> +	clnt->fcall_cache = NULL;
>  
>  	client_id = utsname()->nodename;
>  	memcpy(clnt->name, client_id, strlen(client_id) + 1);
> @@ -980,6 +1000,9 @@ struct p9_client *p9_client_create(const char *dev_name, char *options)
>  	if (err)
>  		goto close_trans;
>  
> +	clnt->fcall_cache = kmem_cache_create("9p-fcall-cache", clnt->msize,
> +					      0, 0, NULL);
> +
>  	return clnt;
>  
>  close_trans:
> @@ -1011,6 +1034,7 @@ void p9_client_destroy(struct p9_client *clnt)
>  
>  	p9_tag_cleanup(clnt);
>  
> +	kmem_cache_destroy(clnt->fcall_cache);
>  	kfree(clnt);
>  }
>  EXPORT_SYMBOL(p9_client_destroy);
> diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
> index c5cac97df7f7..5e43f0a00b3a 100644
> --- a/net/9p/trans_rdma.c
> +++ b/net/9p/trans_rdma.c
> @@ -445,7 +445,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
>  	if (unlikely(atomic_read(&rdma->excess_rc) > 0)) {
>  		if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) {
>  			/* Got one! */
> -			kfree(req->rc.sdata);
> +			p9_fcall_free(client, &req->rc);
>  			req->rc.sdata = NULL;
>  			goto dont_need_post_recv;
>  		} else {
Dominique Martinet Aug. 1, 2018, 3:22 p.m. UTC | #7
Greg Kurz wrote on Wed, Aug 01, 2018:
> > diff --git a/net/9p/client.c b/net/9p/client.c
> > index ba99a94a12c9..215e3b1ed7b4 100644
> > --- a/net/9p/client.c
> > +++ b/net/9p/client.c
> > @@ -231,15 +231,34 @@ static int parse_opts(char *opts, struct p9_client *clnt)
> >  	return ret;
> >  }
> >  
> > -static int p9_fcall_alloc(struct p9_fcall *fc, int alloc_msize)
> > +static int p9_fcall_alloc(struct p9_client *c, struct p9_fcall *fc,
> > +			  int alloc_msize)
> >  {
> > -	fc->sdata = kmalloc(alloc_msize, GFP_NOFS);
> > +	if (c->fcall_cache && alloc_msize == c->msize)
> 
> This is presumably a hot path for any request but the initial TVERSION;
> you probably want likely() here...

c->fcall_cache is indeed very likely, but alloc_msize == c->msize not so
much, as zero-copy requests will be quite common for virtio and those
are 4k in size.
Although with that cache I'm starting to wonder if we should always use
it... Speed-wise, if there is no memory pressure the cache is likely
going to be faster.
If there is pressure and the items are reclaimed, though, that will cause
a heavier slow-down as the allocator will need to find bigger contiguous
memory regions.

I'm not sure which path we should favor, tbh. I'll keep these separate
for now.


For the first part of the check, Matthew suggested trying to trick msize
into a different value to fail this check for the initial TVERSION call,
but even after thinking about it a bit I don't really see how to do that
cleanly. I can at least make -that- part likely()...
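
That is, something like this (sketch only):

	if (likely(c->fcall_cache) && alloc_msize == c->msize)
		fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS);
	else
		fc->sdata = kmalloc(alloc_msize, GFP_NOFS);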

> 
> > +		fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS);
> > +	else
> > +		fc->sdata = kmalloc(alloc_msize, GFP_NOFS);
> >  	if (!fc->sdata)
> >  		return -ENOMEM;
> >  	fc->capacity = alloc_msize;
> >  	return 0;
> >  }
> >  
> > +void p9_fcall_free(struct p9_client *c, struct p9_fcall *fc)
> > +{
> > +	/* sdata can be NULL for interrupted requests in trans_rdma,
> > +	 * and kmem_cache_free does not do NULL-check for us
> > +	 */
> > +	if (unlikely(!fc->sdata))
> > +		return;
> > +
> > +	if (c->fcall_cache && fc->capacity == c->msize)
> 
> ... and here as well.

For this one I'll unfortunately need to store in the fc how it was
allocated, as slob doesn't allow kmem_cache_free() on a buffer allocated
by kmalloc, and in anticipation of requests being refcounted in a hostile
world the initial TVERSION req could be freed after fcall_cache is
created :/

That's a bit of a burden, but at least it will reduce the checks to one here.
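
Along these lines, perhaps (an untested sketch, reusing the hypothetical
fc->cache field mentioned earlier in the thread):

	void p9_fcall_free(struct p9_client *c, struct p9_fcall *fc)
	{
		/* sdata can be NULL for interrupted requests in trans_rdma */
		if (unlikely(!fc->sdata))
			return;

		if (fc->cache)
			kmem_cache_free(fc->cache, fc->sdata);
		else
			kfree(fc->sdata);
	}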

> > +		kmem_cache_free(c->fcall_cache, fc->sdata);
> > +	else
> > +		kfree(fc->sdata);
> > +}
> > +EXPORT_SYMBOL(p9_fcall_free);
> > +
> >  static struct kmem_cache *p9_req_cache;
> >  
> >  /**


Anyway, I've had as many comments as I could hope for; thanks everyone
for the quick review.
I'll send a v2 of both patches tomorrow.

Patch

diff --git a/include/net/9p/client.h b/include/net/9p/client.h
index 4b4ac1362ad5..8d9bc7402a42 100644
--- a/include/net/9p/client.h
+++ b/include/net/9p/client.h
@@ -123,6 +123,7 @@  struct p9_client {
 	struct p9_trans_module *trans_mod;
 	enum p9_trans_status status;
 	void *trans;
+	struct kmem_cache *fcall_cache;
 
 	union {
 		struct {
@@ -230,6 +231,7 @@  int p9_client_mkdir_dotl(struct p9_fid *fid, const char *name, int mode,
 				kgid_t gid, struct p9_qid *);
 int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status);
 int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *fl);
+void p9_fcall_free(struct p9_client *c, struct p9_fcall *fc);
 struct p9_req_t *p9_tag_lookup(struct p9_client *, u16);
 void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status);
 
diff --git a/net/9p/client.c b/net/9p/client.c
index ba99a94a12c9..215e3b1ed7b4 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -231,15 +231,34 @@  static int parse_opts(char *opts, struct p9_client *clnt)
 	return ret;
 }
 
-static int p9_fcall_alloc(struct p9_fcall *fc, int alloc_msize)
+static int p9_fcall_alloc(struct p9_client *c, struct p9_fcall *fc,
+			  int alloc_msize)
 {
-	fc->sdata = kmalloc(alloc_msize, GFP_NOFS);
+	if (c->fcall_cache && alloc_msize == c->msize)
+		fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS);
+	else
+		fc->sdata = kmalloc(alloc_msize, GFP_NOFS);
 	if (!fc->sdata)
 		return -ENOMEM;
 	fc->capacity = alloc_msize;
 	return 0;
 }
 
+void p9_fcall_free(struct p9_client *c, struct p9_fcall *fc)
+{
+	/* sdata can be NULL for interrupted requests in trans_rdma,
+	 * and kmem_cache_free does not do NULL-check for us
+	 */
+	if (unlikely(!fc->sdata))
+		return;
+
+	if (c->fcall_cache && fc->capacity == c->msize)
+		kmem_cache_free(c->fcall_cache, fc->sdata);
+	else
+		kfree(fc->sdata);
+}
+EXPORT_SYMBOL(p9_fcall_free);
+
 static struct kmem_cache *p9_req_cache;
 
 /**
@@ -261,9 +280,9 @@  p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size)
 	if (!req)
 		return NULL;
 
-	if (p9_fcall_alloc(&req->tc, alloc_msize))
+	if (p9_fcall_alloc(c, &req->tc, alloc_msize))
 		goto free;
-	if (p9_fcall_alloc(&req->rc, alloc_msize))
+	if (p9_fcall_alloc(c, &req->rc, alloc_msize))
 		goto free;
 
 	p9pdu_reset(&req->tc);
@@ -288,8 +307,8 @@  p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size)
 	return req;
 
 free:
-	kfree(req->tc.sdata);
-	kfree(req->rc.sdata);
+	p9_fcall_free(c, &req->tc);
+	p9_fcall_free(c, &req->rc);
 	kmem_cache_free(p9_req_cache, req);
 	return ERR_PTR(-ENOMEM);
 }
@@ -333,8 +352,8 @@  static void p9_free_req(struct p9_client *c, struct p9_req_t *r)
 	spin_lock_irqsave(&c->lock, flags);
 	idr_remove(&c->reqs, tag);
 	spin_unlock_irqrestore(&c->lock, flags);
-	kfree(r->tc.sdata);
-	kfree(r->rc.sdata);
+	p9_fcall_free(c, &r->tc);
+	p9_fcall_free(c, &r->rc);
 	kmem_cache_free(p9_req_cache, r);
 }
 
@@ -944,6 +963,7 @@  struct p9_client *p9_client_create(const char *dev_name, char *options)
 
 	clnt->trans_mod = NULL;
 	clnt->trans = NULL;
+	clnt->fcall_cache = NULL;
 
 	client_id = utsname()->nodename;
 	memcpy(clnt->name, client_id, strlen(client_id) + 1);
@@ -980,6 +1000,9 @@  struct p9_client *p9_client_create(const char *dev_name, char *options)
 	if (err)
 		goto close_trans;
 
+	clnt->fcall_cache = kmem_cache_create("9p-fcall-cache", clnt->msize,
+					      0, 0, NULL);
+
 	return clnt;
 
 close_trans:
@@ -1011,6 +1034,7 @@  void p9_client_destroy(struct p9_client *clnt)
 
 	p9_tag_cleanup(clnt);
 
+	kmem_cache_destroy(clnt->fcall_cache);
 	kfree(clnt);
 }
 EXPORT_SYMBOL(p9_client_destroy);
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index c5cac97df7f7..5e43f0a00b3a 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -445,7 +445,7 @@  static int rdma_request(struct p9_client *client, struct p9_req_t *req)
 	if (unlikely(atomic_read(&rdma->excess_rc) > 0)) {
 		if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) {
 			/* Got one! */
-			kfree(req->rc.sdata);
+			p9_fcall_free(client, &req->rc);
 			req->rc.sdata = NULL;
 			goto dont_need_post_recv;
 		} else {