diff mbox

NFS: fix usage of mempools.

Message ID 87wpatvw6m.fsf@notabene.neil.brown.name (mailing list archive)
State New, archived
Headers show

Commit Message

NeilBrown April 10, 2017, 2:22 a.m. UTC
When passed GFP flags that allow sleeping (such as
GFP_NOIO), mempool_alloc() will never return NULL, it will
wait until memory is available.

This means that we don't need to handle failure, but that we
do need to ensure one thread doesn't call mempool_alloc()
twice on the one pool without queuing or freeing the first
allocation.  If multiple threads did this during times of
high memory pressure, the pool could be exhausted and a
deadlock could result.

pnfs_generic_alloc_ds_commits() attempts to allocate from
the nfs_commit_mempool while already holding an allocation
from that pool.  This is not safe.  So change
nfs_commitdata_alloc() to take a flag that indicates whether
failure is acceptable.

In pnfs_generic_alloc_ds_commits(), accept failure and
handle it as we currently do.  Else where, do not accept
failure, and do not handle it.

Even when failure is acceptable, we want to succeed if
possible.  That means both
 - using an entry from the pool if there is one
 - waiting for direct reclaim is there isn't.

We call mempool_alloc(GFP_NOWAIT) to achieve the first, then
kmem_cache_alloc(GFP_NOIO|__GFP_NORETRY) to achieve the
second.  Each of these can fail, but together they do the
best they can without blocking indefinitely.

Also, don't test for failure when allocating from
nfs_wdata_mempool.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 fs/nfs/pnfs_nfs.c      | 16 +++++-----------
 fs/nfs/write.c         | 35 +++++++++++++++++++++--------------
 include/linux/nfs_fs.h |  2 +-
 3 files changed, 27 insertions(+), 26 deletions(-)

Comments

Schumaker, Anna April 10, 2017, 8:22 p.m. UTC | #1
Hi Neil,

On 04/09/2017 10:22 PM, NeilBrown wrote:
> 
> When passed GFP flags that allow sleeping (such as
> GFP_NOIO), mempool_alloc() will never return NULL, it will
> wait until memory is available.
> 
> This means that we don't need to handle failure, but that we
> do need to ensure one thread doesn't call mempool_alloc()
> twice on the one pool without queuing or freeing the first
> allocation.  If multiple threads did this during times of
> high memory pressure, the pool could be exhausted and a
> deadlock could result.
> 
> pnfs_generic_alloc_ds_commits() attempts to allocate from
> the nfs_commit_mempool while already holding an allocation
> from that pool.  This is not safe.  So change
> nfs_commitdata_alloc() to take a flag that indicates whether
> failure is acceptable.
> 
> In pnfs_generic_alloc_ds_commits(), accept failure and
> handle it as we currently do.  Else where, do not accept
> failure, and do not handle it.
> 
> Even when failure is acceptable, we want to succeed if
> possible.  That means both
>  - using an entry from the pool if there is one
>  - waiting for direct reclaim is there isn't.
> 
> We call mempool_alloc(GFP_NOWAIT) to achieve the first, then
> kmem_cache_alloc(GFP_NOIO|__GFP_NORETRY) to achieve the
> second.  Each of these can fail, but together they do the
> best they can without blocking indefinitely.
> 
> Also, don't test for failure when allocating from
> nfs_wdata_mempool.
> 
> Signed-off-by: NeilBrown <neilb@suse.com>
> ---
>  fs/nfs/pnfs_nfs.c      | 16 +++++-----------
>  fs/nfs/write.c         | 35 +++++++++++++++++++++--------------
>  include/linux/nfs_fs.h |  2 +-
>  3 files changed, 27 insertions(+), 26 deletions(-)
> 
> diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
> index 7250b95549ec..1edf5b84aba5 100644
> --- a/fs/nfs/pnfs_nfs.c
> +++ b/fs/nfs/pnfs_nfs.c
> @@ -217,7 +217,7 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
>  	for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
>  		if (list_empty(&bucket->committing))
>  			continue;
> -		data = nfs_commitdata_alloc();
> +		data = nfs_commitdata_alloc(false);
>  		if (!data)
>  			break;
>  		data->ds_commit_index = i;
> @@ -283,16 +283,10 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
>  	unsigned int nreq = 0;
>  
>  	if (!list_empty(mds_pages)) {
> -		data = nfs_commitdata_alloc();
> -		if (data != NULL) {
> -			data->ds_commit_index = -1;
> -			list_add(&data->pages, &list);
> -			nreq++;
> -		} else {
> -			nfs_retry_commit(mds_pages, NULL, cinfo, 0);
> -			pnfs_generic_retry_commit(cinfo, 0);
> -			return -ENOMEM;
> -		}
> +		data = nfs_commitdata_alloc(true);
> +		data->ds_commit_index = -1;
> +		list_add(&data->pages, &list);
> +		nreq++;
>  	}
>  
>  	nreq += pnfs_generic_alloc_ds_commits(cinfo, &list);
> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
> index abb2c8a3be42..bdfe5a7c5874 100644
> --- a/fs/nfs/write.c
> +++ b/fs/nfs/write.c
> @@ -60,14 +60,28 @@ static mempool_t *nfs_wdata_mempool;
>  static struct kmem_cache *nfs_cdata_cachep;
>  static mempool_t *nfs_commit_mempool;
>  
> -struct nfs_commit_data *nfs_commitdata_alloc(void)
> +struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail)
>  {
> -	struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOIO);
> +	struct nfs_commit_data *p;
>  
> -	if (p) {
> -		memset(p, 0, sizeof(*p));
> -		INIT_LIST_HEAD(&p->pages);
> +	if (never_fail)
> +		p = mempool_alloc(nfs_commit_mempool, GFP_NOIO);
> +	else {
> +		/* It is OK to do some reclaim, not no safe to wait
> +		 * for anything to be returned to the pool.
> +		 * mempool_alloc() cannot handle that particular combination,
> +		 * so we need two separate attempts.
> +		 */
> +		p = mempool_alloc(nfs_commit_mempool, GFP_NOWAIT);
> +		if (!p)
> +			p = kmem_cache_alloc(nfs_cdata_cachep, GFP_NOIO |
> +					     __GFP_NOWARN | __GFP_NORETRY);

Do we need to add something to the nfs_commit_data structure to properly free a kmem_cache_alloc()-ed object?  Right now it looks like nfs_commit_free() calls mempool_free() unconditionally.

Thanks,
Anna

> +		if (!p)
> +			return NULL;
>  	}
> +
> +	memset(p, 0, sizeof(*p));
> +	INIT_LIST_HEAD(&p->pages);
>  	return p;
>  }
>  EXPORT_SYMBOL_GPL(nfs_commitdata_alloc);
> @@ -82,8 +96,7 @@ static struct nfs_pgio_header *nfs_writehdr_alloc(void)
>  {
>  	struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
>  
> -	if (p)
> -		memset(p, 0, sizeof(*p));
> +	memset(p, 0, sizeof(*p));
>  	return p;
>  }
>  
> @@ -1705,19 +1718,13 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
>  	if (list_empty(head))
>  		return 0;
>  
> -	data = nfs_commitdata_alloc();
> -
> -	if (!data)
> -		goto out_bad;
> +	data = nfs_commitdata_alloc(true);
>  
>  	/* Set up the argument struct */
>  	nfs_init_commit(data, head, NULL, cinfo);
>  	atomic_inc(&cinfo->mds->rpcs_out);
>  	return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode),
>  				   data->mds_ops, how, 0);
> - out_bad:
> -	nfs_retry_commit(head, NULL, cinfo, 0);
> -	return -ENOMEM;
>  }
>  
>  int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf)
> diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
> index 287f34161086..1b29915247b2 100644
> --- a/include/linux/nfs_fs.h
> +++ b/include/linux/nfs_fs.h
> @@ -502,7 +502,7 @@ extern int nfs_wb_all(struct inode *inode);
>  extern int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder);
>  extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
>  extern int  nfs_commit_inode(struct inode *, int);
> -extern struct nfs_commit_data *nfs_commitdata_alloc(void);
> +extern struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail);
>  extern void nfs_commit_free(struct nfs_commit_data *data);
>  
>  static inline int
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
NeilBrown April 10, 2017, 9:42 p.m. UTC | #2
On Mon, Apr 10 2017, Anna Schumaker wrote:

> Hi Neil,
>
> On 04/09/2017 10:22 PM, NeilBrown wrote:
>> 
>> +		/* It is OK to do some reclaim, not no safe to wait
>> +		 * for anything to be returned to the pool.
>> +		 * mempool_alloc() cannot handle that particular combination,
>> +		 * so we need two separate attempts.
>> +		 */
>> +		p = mempool_alloc(nfs_commit_mempool, GFP_NOWAIT);
>> +		if (!p)
>> +			p = kmem_cache_alloc(nfs_cdata_cachep, GFP_NOIO |
>> +					     __GFP_NOWARN | __GFP_NORETRY);
>
> Do we need to add something to the nfs_commit_data structure to
> properly free a kmem_cache_alloc()-ed object?  Right now it looks like
> nfs_commit_free() calls mempool_free() unconditionally. 

Good question.  I should have clarified that in the patch comment.
Can you add this to that comment please.

  The objects returned by kmem_cache_alloc() will still be freed
  by mempool_free().  This is safe as mempool_alloc() uses
  exactly the same function to allocate objects (since the mempool
  was created with mempool_create_slab_pool()).  The object returned
  by mempool_alloc() and kmem_cache_alloc() are indistinguishable
  so mempool_free() will handle both identically, either adding to the
  pool or calling kmem_cache_free().


Thanks,
NeilBrown
diff mbox

Patch

diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 7250b95549ec..1edf5b84aba5 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -217,7 +217,7 @@  pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
 	for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
 		if (list_empty(&bucket->committing))
 			continue;
-		data = nfs_commitdata_alloc();
+		data = nfs_commitdata_alloc(false);
 		if (!data)
 			break;
 		data->ds_commit_index = i;
@@ -283,16 +283,10 @@  pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
 	unsigned int nreq = 0;
 
 	if (!list_empty(mds_pages)) {
-		data = nfs_commitdata_alloc();
-		if (data != NULL) {
-			data->ds_commit_index = -1;
-			list_add(&data->pages, &list);
-			nreq++;
-		} else {
-			nfs_retry_commit(mds_pages, NULL, cinfo, 0);
-			pnfs_generic_retry_commit(cinfo, 0);
-			return -ENOMEM;
-		}
+		data = nfs_commitdata_alloc(true);
+		data->ds_commit_index = -1;
+		list_add(&data->pages, &list);
+		nreq++;
 	}
 
 	nreq += pnfs_generic_alloc_ds_commits(cinfo, &list);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index abb2c8a3be42..bdfe5a7c5874 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -60,14 +60,28 @@  static mempool_t *nfs_wdata_mempool;
 static struct kmem_cache *nfs_cdata_cachep;
 static mempool_t *nfs_commit_mempool;
 
-struct nfs_commit_data *nfs_commitdata_alloc(void)
+struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail)
 {
-	struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOIO);
+	struct nfs_commit_data *p;
 
-	if (p) {
-		memset(p, 0, sizeof(*p));
-		INIT_LIST_HEAD(&p->pages);
+	if (never_fail)
+		p = mempool_alloc(nfs_commit_mempool, GFP_NOIO);
+	else {
+		/* It is OK to do some reclaim, not no safe to wait
+		 * for anything to be returned to the pool.
+		 * mempool_alloc() cannot handle that particular combination,
+		 * so we need two separate attempts.
+		 */
+		p = mempool_alloc(nfs_commit_mempool, GFP_NOWAIT);
+		if (!p)
+			p = kmem_cache_alloc(nfs_cdata_cachep, GFP_NOIO |
+					     __GFP_NOWARN | __GFP_NORETRY);
+		if (!p)
+			return NULL;
 	}
+
+	memset(p, 0, sizeof(*p));
+	INIT_LIST_HEAD(&p->pages);
 	return p;
 }
 EXPORT_SYMBOL_GPL(nfs_commitdata_alloc);
@@ -82,8 +96,7 @@  static struct nfs_pgio_header *nfs_writehdr_alloc(void)
 {
 	struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
 
-	if (p)
-		memset(p, 0, sizeof(*p));
+	memset(p, 0, sizeof(*p));
 	return p;
 }
 
@@ -1705,19 +1718,13 @@  nfs_commit_list(struct inode *inode, struct list_head *head, int how,
 	if (list_empty(head))
 		return 0;
 
-	data = nfs_commitdata_alloc();
-
-	if (!data)
-		goto out_bad;
+	data = nfs_commitdata_alloc(true);
 
 	/* Set up the argument struct */
 	nfs_init_commit(data, head, NULL, cinfo);
 	atomic_inc(&cinfo->mds->rpcs_out);
 	return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode),
 				   data->mds_ops, how, 0);
- out_bad:
-	nfs_retry_commit(head, NULL, cinfo, 0);
-	return -ENOMEM;
 }
 
 int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf)
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 287f34161086..1b29915247b2 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -502,7 +502,7 @@  extern int nfs_wb_all(struct inode *inode);
 extern int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder);
 extern int nfs_wb_page_cancel(struct inode *inode, struct page* page);
 extern int  nfs_commit_inode(struct inode *, int);
-extern struct nfs_commit_data *nfs_commitdata_alloc(void);
+extern struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail);
 extern void nfs_commit_free(struct nfs_commit_data *data);
 
 static inline int