diff mbox series

[v2,1/4] nfs/blocklayout: Fix premature PR key unregistration

Message ID 20240621162227.215412-7-cel@kernel.org (mailing list archive)
State New
Headers show
Series Fixes for pNFS SCSI layout PR key registration | expand

Commit Message

Chuck Lever June 21, 2024, 4:22 p.m. UTC
From: Chuck Lever <chuck.lever@oracle.com>

During generic/069 runs with pNFS SCSI layouts, the NFS client emits
the following in the system journal:

kernel: pNFS: failed to open device /dev/disk/by-id/dm-uuid-mpath-0x6001405e3366f045b7949eb8e4540b51 (-2)
kernel: pNFS: using block device sdb (reservation key 0x666b60901e7b26b3)
kernel: pNFS: failed to open device /dev/disk/by-id/dm-uuid-mpath-0x6001405e3366f045b7949eb8e4540b51 (-2)
kernel: pNFS: using block device sdb (reservation key 0x666b60901e7b26b3)
kernel: sd 6:0:0:1: reservation conflict
kernel: sd 6:0:0:1: [sdb] tag#16 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
kernel: sd 6:0:0:1: [sdb] tag#16 CDB: Write(10) 2a 00 00 00 00 50 00 00 08 00
kernel: reservation conflict error, dev sdb, sector 80 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 2
kernel: sd 6:0:0:1: reservation conflict
kernel: sd 6:0:0:1: reservation conflict
kernel: sd 6:0:0:1: [sdb] tag#18 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
kernel: sd 6:0:0:1: [sdb] tag#17 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
kernel: sd 6:0:0:1: [sdb] tag#18 CDB: Write(10) 2a 00 00 00 00 60 00 00 08 00
kernel: sd 6:0:0:1: [sdb] tag#17 CDB: Write(10) 2a 00 00 00 00 58 00 00 08 00
kernel: reservation conflict error, dev sdb, sector 96 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 0
kernel: reservation conflict error, dev sdb, sector 88 op 0x1:(WRITE) flags 0x0 phys_seg 1 prio class 0
systemd[1]: fstests-generic-069.scope: Deactivated successfully.
systemd[1]: fstests-generic-069.scope: Consumed 5.092s CPU time.
systemd[1]: media-test.mount: Deactivated successfully.
systemd[1]: media-scratch.mount: Deactivated successfully.
kernel: sd 6:0:0:1: reservation conflict
kernel: failed to unregister PR key.

This appears to be due to a race. bl_alloc_lseg() calls this:

561 static struct nfs4_deviceid_node *
562 bl_find_get_deviceid(struct nfs_server *server,
563                 const struct nfs4_deviceid *id, const struct cred *cred,
564                 gfp_t gfp_mask)
565 {
566         struct nfs4_deviceid_node *node;
567         unsigned long start, end;
568
569 retry:
570         node = nfs4_find_get_deviceid(server, id, cred, gfp_mask);
571         if (!node)
572                 return ERR_PTR(-ENODEV);

nfs4_find_get_deviceid() does a lookup without the spin lock first.
If it can't find a matching deviceid, it creates a new device_info
(which calls bl_alloc_deviceid_node, and that registers the device's
PR key).

Then it takes the nfs4_deviceid_lock and looks up the deviceid again.
If it finds it this time, bl_find_get_deviceid() frees the spare
(new) device_info, which unregisters the PR key for the same device.

Any subsequent I/O from this client on that device gets EBADE.

The umount later unregisters the device's PR key again.

To prevent this problem, register the PR key after the deviceid_node
lookup.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 fs/nfs/blocklayout/blocklayout.c | 13 +++++++++--
 fs/nfs/blocklayout/blocklayout.h |  8 ++++++-
 fs/nfs/blocklayout/dev.c         | 39 +++++++++++++++++++++++---------
 3 files changed, 46 insertions(+), 14 deletions(-)

Comments

Christoph Hellwig June 22, 2024, 5:03 a.m. UTC | #1
On Fri, Jun 21, 2024 at 12:22:29PM -0400, cel@kernel.org wrote:
> @@ -367,14 +391,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
>  		goto out_blkdev_put;
>  	}
>  
> -	error = ops->pr_register(file_bdev(d->bdev_file), 0, d->pr_key, true);
> -	if (error) {
> -		pr_err("pNFS: failed to register key for block device %s.",
> -				file_bdev(d->bdev_file)->bd_disk->disk_name);
> -		goto out_blkdev_put;
> -	}
> -
> -	d->pr_registered = true;
> +	d->pr_register = bl_pr_register_scsi;

I think this will break complex (slice, concat, stripe) volumes,
as we'll never call ->pr_register for them at all.  We'll also need
a register callback for them, which then calls into underlying
volume, similar to how bl_parse_deviceid works.  That would also
do away with the need for the d->pr_register callback, we could
just do the swithc on the volume types which might be more
efficient.  (the same is actually true for the ->map callback,
but that's a separate cleanup).
Chuck Lever III June 22, 2024, 5:26 p.m. UTC | #2
On Sat, Jun 22, 2024 at 07:03:25AM +0200, Christoph Hellwig wrote:
> On Fri, Jun 21, 2024 at 12:22:29PM -0400, cel@kernel.org wrote:
> > @@ -367,14 +391,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
> >  		goto out_blkdev_put;
> >  	}
> >  
> > -	error = ops->pr_register(file_bdev(d->bdev_file), 0, d->pr_key, true);
> > -	if (error) {
> > -		pr_err("pNFS: failed to register key for block device %s.",
> > -				file_bdev(d->bdev_file)->bd_disk->disk_name);
> > -		goto out_blkdev_put;
> > -	}
> > -
> > -	d->pr_registered = true;
> > +	d->pr_register = bl_pr_register_scsi;
> 
> I think this will break complex (slice, concat, stripe) volumes,
> as we'll never call ->pr_register for them at all.  We'll also need
> a register callback for them, which then calls into underlying
> volume, similar to how bl_parse_deviceid works.

This patch currently adds the pr_reg callback to
bl_find_get_deviceid(), which has no visibility of the volume
hierarchy. Where should the registration be done instead? I'm
missing something.


> That would also
> do away with the need for the d->pr_register callback, we could
> just do the swithc on the volume types which might be more
> efficient.  (the same is actually true for the ->map callback,
> but that's a separate cleanup).
Christoph Hellwig June 23, 2024, 7:36 a.m. UTC | #3
On Sat, Jun 22, 2024 at 01:26:10PM -0400, Chuck Lever wrote:
> This patch currently adds the pr_reg callback to
> bl_find_get_deviceid(), which has no visibility of the volume
> hierarchy. Where should the registration be done instead? I'm
> missing something.

Something like the patch below (untested Sunday morning coding) should
do the trick:

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 947b2c52344097..6db54b215066e0 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -564,34 +564,32 @@ bl_find_get_deviceid(struct nfs_server *server,
 		gfp_t gfp_mask)
 {
 	struct nfs4_deviceid_node *node;
-	struct pnfs_block_dev *d;
-	unsigned long start, end;
+	int err = -ENODEV;
 
 retry:
 	node = nfs4_find_get_deviceid(server, id, cred, gfp_mask);
 	if (!node)
 		return ERR_PTR(-ENODEV);
 
-	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags))
-		goto transient;
+	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) {
+		unsigned long end = jiffies;
+		unsigned long start = end - PNFS_DEVICE_RETRY_TIMEOUT;
 
-	d = container_of(node, struct pnfs_block_dev, node);
-	if (d->pr_register)
-		if (!d->pr_register(d))
-			goto out_put;
-	return node;
-
-transient:
-	end = jiffies;
-	start = end - PNFS_DEVICE_RETRY_TIMEOUT;
-	if (!time_in_range(node->timestamp_unavailable, start, end)) {
-		nfs4_delete_deviceid(node->ld, node->nfs_client, id);
-		goto retry;
+		if (!time_in_range(node->timestamp_unavailable, start, end)) {
+			nfs4_delete_deviceid(node->ld, node->nfs_client, id);
+			goto retry;
+		}
+		goto out_put;
 	}
 
+	err = bl_register_dev(container_of(node, struct pnfs_block_dev, node));
+	if (err)
+		goto out_put;
+	return node;
+
 out_put:
 	nfs4_put_deviceid_node(node);
-	return ERR_PTR(-ENODEV);
+	return ERR_PTR(err);
 }
 
 static int
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index cc788e8ce90933..7efbef9d10dba8 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -104,6 +104,7 @@ struct pnfs_block_dev {
 	u64				start;
 	u64				len;
 
+	enum pnfs_block_volume_type	type;
 	u32				nr_children;
 	struct pnfs_block_dev		*children;
 	u64				chunk_size;
@@ -116,7 +117,6 @@ struct pnfs_block_dev {
 
 	bool (*map)(struct pnfs_block_dev *dev, u64 offset,
 			struct pnfs_block_dev_map *map);
-	bool (*pr_register)(struct pnfs_block_dev *dev);
 };
 
 /* pnfs_block_dev flag bits */
@@ -178,6 +178,7 @@ struct bl_msg_hdr {
 #define BL_DEVICE_REQUEST_ERR          0x2 /* User level process fails */
 
 /* dev.c */
+int bl_register_dev(struct pnfs_block_dev *d);
 struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
 		struct pnfs_device *pdev, gfp_t gfp_mask);
 void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index 16fb64d4af31db..72e061e87e145a 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -13,9 +13,74 @@
 
 #define NFSDBG_FACILITY		NFSDBG_PNFS_LD
 
+static void bl_unregister_scsi(struct pnfs_block_dev *dev)
+{
+	struct block_device *bdev = file_bdev(dev->bdev_file);
+	const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
+
+	if (!test_and_clear_bit(PNFS_BDEV_REGISTERED, &dev->flags))
+		return;
+
+	if (ops->pr_register(bdev, dev->pr_key, 0, false))
+		pr_err("failed to unregister PR key.\n");
+}
+
+static bool bl_register_scsi(struct pnfs_block_dev *dev)
+{
+	struct block_device *bdev = file_bdev(dev->bdev_file);
+	const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
+	int status;
+
+	if (test_and_set_bit(PNFS_BDEV_REGISTERED, &dev->flags))
+		return true;
+
+	status = ops->pr_register(bdev, 0, dev->pr_key, true);
+	if (status) {
+		pr_err("pNFS: failed to register key for block device %s.",
+		       bdev->bd_disk->disk_name);
+		return false;
+	}
+	return true;
+}
+
+static void bl_unregister_dev(struct pnfs_block_dev *dev)
+{
+	if (dev->nr_children) {
+		for (u32 i = 0; i < dev->nr_children; i++)
+			bl_unregister_dev(&dev->children[i]);
+		return;
+	}
+
+	if (dev->type == PNFS_BLOCK_VOLUME_SCSI)
+		bl_unregister_scsi(dev);
+}
+
+int bl_register_dev(struct pnfs_block_dev *dev)
+{
+	if (dev->nr_children) {
+		for (u32 i = 0; i < dev->nr_children; i++) {
+			int ret = bl_register_dev(&dev->children[i]);
+
+			if (ret) {
+				while (i > 0)
+					bl_unregister_dev(&dev->children[--i]);
+				return ret;
+			}
+		}
+
+		return 0;
+	}
+
+	if (dev->type == PNFS_BLOCK_VOLUME_SCSI)
+		return bl_register_scsi(dev);
+	return 0;
+}
+
 static void
 bl_free_device(struct pnfs_block_dev *dev)
 {
+	bl_unregister_dev(dev);
+
 	if (dev->nr_children) {
 		int i;
 
@@ -23,17 +88,6 @@ bl_free_device(struct pnfs_block_dev *dev)
 			bl_free_device(&dev->children[i]);
 		kfree(dev->children);
 	} else {
-		if (test_and_clear_bit(PNFS_BDEV_REGISTERED, &dev->flags)) {
-			struct block_device *bdev = file_bdev(dev->bdev_file);
-			const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
-			int error;
-
-			error = ops->pr_register(file_bdev(dev->bdev_file),
-				dev->pr_key, 0, false);
-			if (error)
-				pr_err("failed to unregister PR key.\n");
-		}
-
 		if (dev->bdev_file)
 			fput(dev->bdev_file);
 	}
@@ -226,30 +280,6 @@ static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
 	return true;
 }
 
-/**
- * bl_pr_register_scsi - Register a SCSI PR key for @d
- * @dev: pNFS block device, key to register is already in @d->pr_key
- *
- * Returns true if the device's PR key is registered, otherwise false.
- */
-static bool bl_pr_register_scsi(struct pnfs_block_dev *dev)
-{
-	struct block_device *bdev = file_bdev(dev->bdev_file);
-	const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
-	int status;
-
-	if (test_and_set_bit(PNFS_BDEV_REGISTERED, &dev->flags))
-		return true;
-
-	status = ops->pr_register(bdev, 0, dev->pr_key, true);
-	if (status) {
-		pr_err("pNFS: failed to register key for block device %s.",
-		       bdev->bd_disk->disk_name);
-		return false;
-	}
-	return true;
-}
-
 static int
 bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
 		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask);
@@ -392,7 +422,6 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
 		goto out_blkdev_put;
 	}
 
-	d->pr_register = bl_pr_register_scsi;
 	return 0;
 
 out_blkdev_put:
@@ -478,7 +507,9 @@ static int
 bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
 		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
 {
-	switch (volumes[idx].type) {
+	d->type = volumes[idx].type;
+
+	switch (d->type) {
 	case PNFS_BLOCK_VOLUME_SIMPLE:
 		return bl_parse_simple(server, d, volumes, idx, gfp_mask);
 	case PNFS_BLOCK_VOLUME_SLICE:
@@ -490,7 +521,7 @@ bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
 	case PNFS_BLOCK_VOLUME_SCSI:
 		return bl_parse_scsi(server, d, volumes, idx, gfp_mask);
 	default:
-		dprintk("unsupported volume type: %d\n", volumes[idx].type);
+		dprintk("unsupported volume type: %d\n", d->type);
 		return -EIO;
 	}
 }
Chuck Lever III June 24, 2024, 3:08 p.m. UTC | #4
On Sun, Jun 23, 2024 at 09:36:27AM +0200, Christoph Hellwig wrote:
> On Sat, Jun 22, 2024 at 01:26:10PM -0400, Chuck Lever wrote:
> > This patch currently adds the pr_reg callback to
> > bl_find_get_deviceid(), which has no visibility of the volume
> > hierarchy. Where should the registration be done instead? I'm
> > missing something.
> 
> Something like the patch below (untested Sunday morning coding) should
> do the trick:
> 
> diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
> index 947b2c52344097..6db54b215066e0 100644
> --- a/fs/nfs/blocklayout/blocklayout.c
> +++ b/fs/nfs/blocklayout/blocklayout.c
> @@ -564,34 +564,32 @@ bl_find_get_deviceid(struct nfs_server *server,
>  		gfp_t gfp_mask)
>  {
>  	struct nfs4_deviceid_node *node;
> -	struct pnfs_block_dev *d;
> -	unsigned long start, end;
> +	int err = -ENODEV;
>  
>  retry:
>  	node = nfs4_find_get_deviceid(server, id, cred, gfp_mask);
>  	if (!node)
>  		return ERR_PTR(-ENODEV);
>  
> -	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags))
> -		goto transient;
> +	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) {
> +		unsigned long end = jiffies;
> +		unsigned long start = end - PNFS_DEVICE_RETRY_TIMEOUT;
>  
> -	d = container_of(node, struct pnfs_block_dev, node);
> -	if (d->pr_register)
> -		if (!d->pr_register(d))
> -			goto out_put;
> -	return node;
> -
> -transient:
> -	end = jiffies;
> -	start = end - PNFS_DEVICE_RETRY_TIMEOUT;
> -	if (!time_in_range(node->timestamp_unavailable, start, end)) {
> -		nfs4_delete_deviceid(node->ld, node->nfs_client, id);
> -		goto retry;
> +		if (!time_in_range(node->timestamp_unavailable, start, end)) {
> +			nfs4_delete_deviceid(node->ld, node->nfs_client, id);
> +			goto retry;
> +		}
> +		goto out_put;
>  	}
>  
> +	err = bl_register_dev(container_of(node, struct pnfs_block_dev, node));
> +	if (err)
> +		goto out_put;
> +	return node;
> +
>  out_put:
>  	nfs4_put_deviceid_node(node);
> -	return ERR_PTR(-ENODEV);
> +	return ERR_PTR(err);
>  }
>  
>  static int
> diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
> index cc788e8ce90933..7efbef9d10dba8 100644
> --- a/fs/nfs/blocklayout/blocklayout.h
> +++ b/fs/nfs/blocklayout/blocklayout.h
> @@ -104,6 +104,7 @@ struct pnfs_block_dev {
>  	u64				start;
>  	u64				len;
>  
> +	enum pnfs_block_volume_type	type;
>  	u32				nr_children;
>  	struct pnfs_block_dev		*children;
>  	u64				chunk_size;
> @@ -116,7 +117,6 @@ struct pnfs_block_dev {
>  
>  	bool (*map)(struct pnfs_block_dev *dev, u64 offset,
>  			struct pnfs_block_dev_map *map);
> -	bool (*pr_register)(struct pnfs_block_dev *dev);
>  };
>  
>  /* pnfs_block_dev flag bits */
> @@ -178,6 +178,7 @@ struct bl_msg_hdr {
>  #define BL_DEVICE_REQUEST_ERR          0x2 /* User level process fails */
>  
>  /* dev.c */
> +int bl_register_dev(struct pnfs_block_dev *d);
>  struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
>  		struct pnfs_device *pdev, gfp_t gfp_mask);
>  void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
> diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
> index 16fb64d4af31db..72e061e87e145a 100644
> --- a/fs/nfs/blocklayout/dev.c
> +++ b/fs/nfs/blocklayout/dev.c
> @@ -13,9 +13,74 @@
>  
>  #define NFSDBG_FACILITY		NFSDBG_PNFS_LD
>  
> +static void bl_unregister_scsi(struct pnfs_block_dev *dev)
> +{
> +	struct block_device *bdev = file_bdev(dev->bdev_file);
> +	const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
> +
> +	if (!test_and_clear_bit(PNFS_BDEV_REGISTERED, &dev->flags))
> +		return;
> +
> +	if (ops->pr_register(bdev, dev->pr_key, 0, false))
> +		pr_err("failed to unregister PR key.\n");
> +}
> +
> +static bool bl_register_scsi(struct pnfs_block_dev *dev)
> +{
> +	struct block_device *bdev = file_bdev(dev->bdev_file);
> +	const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
> +	int status;
> +
> +	if (test_and_set_bit(PNFS_BDEV_REGISTERED, &dev->flags))
> +		return true;
> +
> +	status = ops->pr_register(bdev, 0, dev->pr_key, true);
> +	if (status) {
> +		pr_err("pNFS: failed to register key for block device %s.",
> +		       bdev->bd_disk->disk_name);
> +		return false;
> +	}
> +	return true;
> +}
> +
> +static void bl_unregister_dev(struct pnfs_block_dev *dev)
> +{
> +	if (dev->nr_children) {
> +		for (u32 i = 0; i < dev->nr_children; i++)
> +			bl_unregister_dev(&dev->children[i]);
> +		return;
> +	}
> +
> +	if (dev->type == PNFS_BLOCK_VOLUME_SCSI)
> +		bl_unregister_scsi(dev);
> +}
> +
> +int bl_register_dev(struct pnfs_block_dev *dev)
> +{
> +	if (dev->nr_children) {
> +		for (u32 i = 0; i < dev->nr_children; i++) {
> +			int ret = bl_register_dev(&dev->children[i]);
> +
> +			if (ret) {
> +				while (i > 0)
> +					bl_unregister_dev(&dev->children[--i]);
> +				return ret;
> +			}
> +		}
> +
> +		return 0;
> +	}
> +
> +	if (dev->type == PNFS_BLOCK_VOLUME_SCSI)
> +		return bl_register_scsi(dev);
> +	return 0;
> +}
> +
>  static void
>  bl_free_device(struct pnfs_block_dev *dev)
>  {
> +	bl_unregister_dev(dev);
> +
>  	if (dev->nr_children) {
>  		int i;
>  
> @@ -23,17 +88,6 @@ bl_free_device(struct pnfs_block_dev *dev)
>  			bl_free_device(&dev->children[i]);
>  		kfree(dev->children);
>  	} else {
> -		if (test_and_clear_bit(PNFS_BDEV_REGISTERED, &dev->flags)) {
> -			struct block_device *bdev = file_bdev(dev->bdev_file);
> -			const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
> -			int error;
> -
> -			error = ops->pr_register(file_bdev(dev->bdev_file),
> -				dev->pr_key, 0, false);
> -			if (error)
> -				pr_err("failed to unregister PR key.\n");
> -		}
> -
>  		if (dev->bdev_file)
>  			fput(dev->bdev_file);
>  	}
> @@ -226,30 +280,6 @@ static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
>  	return true;
>  }
>  
> -/**
> - * bl_pr_register_scsi - Register a SCSI PR key for @d
> - * @dev: pNFS block device, key to register is already in @d->pr_key
> - *
> - * Returns true if the device's PR key is registered, otherwise false.
> - */
> -static bool bl_pr_register_scsi(struct pnfs_block_dev *dev)
> -{
> -	struct block_device *bdev = file_bdev(dev->bdev_file);
> -	const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
> -	int status;
> -
> -	if (test_and_set_bit(PNFS_BDEV_REGISTERED, &dev->flags))
> -		return true;
> -
> -	status = ops->pr_register(bdev, 0, dev->pr_key, true);
> -	if (status) {
> -		pr_err("pNFS: failed to register key for block device %s.",
> -		       bdev->bd_disk->disk_name);
> -		return false;
> -	}
> -	return true;
> -}
> -
>  static int
>  bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
>  		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask);
> @@ -392,7 +422,6 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
>  		goto out_blkdev_put;
>  	}
>  
> -	d->pr_register = bl_pr_register_scsi;
>  	return 0;
>  
>  out_blkdev_put:
> @@ -478,7 +507,9 @@ static int
>  bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
>  		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
>  {
> -	switch (volumes[idx].type) {
> +	d->type = volumes[idx].type;
> +
> +	switch (d->type) {
>  	case PNFS_BLOCK_VOLUME_SIMPLE:
>  		return bl_parse_simple(server, d, volumes, idx, gfp_mask);
>  	case PNFS_BLOCK_VOLUME_SLICE:
> @@ -490,7 +521,7 @@ bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
>  	case PNFS_BLOCK_VOLUME_SCSI:
>  		return bl_parse_scsi(server, d, volumes, idx, gfp_mask);
>  	default:
> -		dprintk("unsupported volume type: %d\n", volumes[idx].type);
> +		dprintk("unsupported volume type: %d\n", d->type);
>  		return -EIO;
>  	}
>  }

Thanks. I've applied this as a separate patch (I can squash it into
1/4 once it passes testing). The first write I/O segfaults at L143
in do_add_page_to_bio() :

 142         if (!offset_in_map(disk_addr, map)) {                       
 143                 if (!dev->map(dev, disk_addr, map) || !offset_in_map(disk_addr, map))
 144                         return ERR_PTR(-EIO);

I'm looking into it.
diff mbox series

Patch

diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 6be13e0ec170..947b2c523440 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -564,6 +564,7 @@  bl_find_get_deviceid(struct nfs_server *server,
 		gfp_t gfp_mask)
 {
 	struct nfs4_deviceid_node *node;
+	struct pnfs_block_dev *d;
 	unsigned long start, end;
 
 retry:
@@ -571,9 +572,16 @@  bl_find_get_deviceid(struct nfs_server *server,
 	if (!node)
 		return ERR_PTR(-ENODEV);
 
-	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0)
-		return node;
+	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags))
+		goto transient;
 
+	d = container_of(node, struct pnfs_block_dev, node);
+	if (d->pr_register)
+		if (!d->pr_register(d))
+			goto out_put;
+	return node;
+
+transient:
 	end = jiffies;
 	start = end - PNFS_DEVICE_RETRY_TIMEOUT;
 	if (!time_in_range(node->timestamp_unavailable, start, end)) {
@@ -581,6 +589,7 @@  bl_find_get_deviceid(struct nfs_server *server,
 		goto retry;
 	}
 
+out_put:
 	nfs4_put_deviceid_node(node);
 	return ERR_PTR(-ENODEV);
 }
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index f1eeb4914199..cc788e8ce909 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -110,12 +110,18 @@  struct pnfs_block_dev {
 
 	struct file			*bdev_file;
 	u64				disk_offset;
+	unsigned long			flags;
 
 	u64				pr_key;
-	bool				pr_registered;
 
 	bool (*map)(struct pnfs_block_dev *dev, u64 offset,
 			struct pnfs_block_dev_map *map);
+	bool (*pr_register)(struct pnfs_block_dev *dev);
+};
+
+/* pnfs_block_dev flag bits */
+enum {
+	PNFS_BDEV_REGISTERED = 0,
 };
 
 /* sector_t fields are all in 512-byte sectors */
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index 519c310c745d..83753a08a19d 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -23,9 +23,9 @@  bl_free_device(struct pnfs_block_dev *dev)
 			bl_free_device(&dev->children[i]);
 		kfree(dev->children);
 	} else {
-		if (dev->pr_registered) {
-			const struct pr_ops *ops =
-				file_bdev(dev->bdev_file)->bd_disk->fops->pr_ops;
+		if (test_and_clear_bit(PNFS_BDEV_REGISTERED, &dev->flags)) {
+			struct block_device *bdev = file_bdev(dev->bdev_file);
+			const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
 			int error;
 
 			error = ops->pr_register(file_bdev(dev->bdev_file),
@@ -226,6 +226,30 @@  static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
 	return true;
 }
 
+/**
+ * bl_pr_register_scsi - Register a SCSI PR key for @d
+ * @dev: pNFS block device, key to register is already in @d->pr_key
+ *
+ * Returns true if the device's PR key is registered, otherwise false.
+ */
+static bool bl_pr_register_scsi(struct pnfs_block_dev *dev)
+{
+	struct block_device *bdev = file_bdev(dev->bdev_file);
+	const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
+	int status;
+
+	if (test_and_set_bit(PNFS_BDEV_REGISTERED, &dev->flags))
+		return true;
+
+	status = ops->pr_register(bdev, 0, dev->pr_key, true);
+	if (status) {
+		pr_err("pNFS: failed to register key for block device %s.",
+		       bdev->bd_disk->disk_name);
+		return false;
+	}
+	return true;
+}
+
 static int
 bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
 		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask);
@@ -367,14 +391,7 @@  bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
 		goto out_blkdev_put;
 	}
 
-	error = ops->pr_register(file_bdev(d->bdev_file), 0, d->pr_key, true);
-	if (error) {
-		pr_err("pNFS: failed to register key for block device %s.",
-				file_bdev(d->bdev_file)->bd_disk->disk_name);
-		goto out_blkdev_put;
-	}
-
-	d->pr_registered = true;
+	d->pr_register = bl_pr_register_scsi;
 	return 0;
 
 out_blkdev_put: