Message ID | 20211226143439.3985960-3-ruansy.fnst@fujitsu.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | fsdax: introduce fs query to support reflink | expand |
On Sun, Dec 26, 2021 at 10:34:31PM +0800, Shiyang Ruan wrote: > To easily track filesystem from a pmem device, we introduce a holder for > dax_device structure, and also its operation. This holder is used to > remember who is using this dax_device: > - When it is the backend of a filesystem, the holder will be the > instance of this filesystem. > - When this pmem device is one of the targets in a mapped device, the > holder will be this mapped device. In this case, the mapped device > has its own dax_device and it will follow the first rule. So that we > can finally track to the filesystem we needed. > > The holder and holder_ops will be set when filesystem is being mounted, > or an target device is being activated. > > Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com> > --- > drivers/dax/super.c | 62 +++++++++++++++++++++++++++++++++++++++++++++ > include/linux/dax.h | 29 +++++++++++++++++++++ > 2 files changed, 91 insertions(+) > > diff --git a/drivers/dax/super.c b/drivers/dax/super.c > index c46f56e33d40..94c51f2ee133 100644 > --- a/drivers/dax/super.c > +++ b/drivers/dax/super.c > @@ -20,15 +20,20 @@ > * @inode: core vfs > * @cdev: optional character interface for "device dax" > * @private: dax driver private data > + * @holder_data: holder of a dax_device: could be filesystem or mapped device > * @flags: state and boolean properties > + * @ops: operations for dax_device > + * @holder_ops: operations for the inner holder > */ > struct dax_device { > struct inode inode; > struct cdev cdev; > void *private; > struct percpu_rw_semaphore rwsem; > + void *holder_data; > unsigned long flags; > const struct dax_operations *ops; > + const struct dax_holder_operations *holder_ops; > }; > > static dev_t dax_devt; > @@ -192,6 +197,29 @@ int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, > } > EXPORT_SYMBOL_GPL(dax_zero_page_range); > > +int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, > + u64 len, int mf_flags) > +{ > + int rc; > + > 
+ dax_read_lock(dax_dev); > + if (!dax_alive(dax_dev)) { > + rc = -ENXIO; > + goto out; > + } > + > + if (!dax_dev->holder_ops) { > + rc = -EOPNOTSUPP; > + goto out; > + } > + > + rc = dax_dev->holder_ops->notify_failure(dax_dev, off, len, mf_flags); > +out: > + dax_read_unlock(dax_dev); > + return rc; > +} > +EXPORT_SYMBOL_GPL(dax_holder_notify_failure); > + > #ifdef CONFIG_ARCH_HAS_PMEM_API > void arch_wb_cache_pmem(void *addr, size_t size); > void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) > @@ -254,6 +282,10 @@ void kill_dax(struct dax_device *dax_dev) > return; > dax_write_lock(dax_dev); > clear_bit(DAXDEV_ALIVE, &dax_dev->flags); > + > + /* clear holder data */ > + dax_dev->holder_ops = NULL; > + dax_dev->holder_data = NULL; > dax_write_unlock(dax_dev); > } > EXPORT_SYMBOL_GPL(kill_dax); > @@ -401,6 +433,36 @@ void put_dax(struct dax_device *dax_dev) > } > EXPORT_SYMBOL_GPL(put_dax); > > +void dax_register_holder(struct dax_device *dax_dev, void *holder, > + const struct dax_holder_operations *ops) > +{ > + if (!dax_alive(dax_dev)) > + return; > + > + dax_dev->holder_data = holder; > + dax_dev->holder_ops = ops; Shouldn't this return an error code if the dax device is dead or if someone already registered a holder? I'm pretty sure XFS should not bind to a dax device if someone else already registered for it... ...unless you want to use a notifier chain for failure events so that there can be multiple consumers of dax failure events? 
--D > +} > +EXPORT_SYMBOL_GPL(dax_register_holder); > + > +void dax_unregister_holder(struct dax_device *dax_dev) > +{ > + if (!dax_alive(dax_dev)) > + return; > + > + dax_dev->holder_data = NULL; > + dax_dev->holder_ops = NULL; > +} > +EXPORT_SYMBOL_GPL(dax_unregister_holder); > + > +void *dax_get_holder(struct dax_device *dax_dev) > +{ > + if (!dax_alive(dax_dev)) > + return NULL; > + > + return dax_dev->holder_data; > +} > +EXPORT_SYMBOL_GPL(dax_get_holder); > + > /** > * inode_dax: convert a public inode into its dax_dev > * @inode: An inode with i_cdev pointing to a dax_dev > diff --git a/include/linux/dax.h b/include/linux/dax.h > index a146bfb80804..e16a9e0ee857 100644 > --- a/include/linux/dax.h > +++ b/include/linux/dax.h > @@ -44,6 +44,22 @@ struct dax_operations { > #if IS_ENABLED(CONFIG_DAX) > struct dax_device *alloc_dax(void *private, const struct dax_operations *ops, > unsigned long flags); > +struct dax_holder_operations { > + /* > + * notify_failure - notify memory failure into inner holder device > + * @dax_dev: the dax device which contains the holder > + * @offset: offset on this dax device where memory failure occurs > + * @len: length of this memory failure event > + * @flags: action flags for memory failure handler > + */ > + int (*notify_failure)(struct dax_device *dax_dev, u64 offset, > + u64 len, int mf_flags); > +}; > + > +void dax_register_holder(struct dax_device *dax_dev, void *holder, > + const struct dax_holder_operations *ops); > +void dax_unregister_holder(struct dax_device *dax_dev); > +void *dax_get_holder(struct dax_device *dax_dev); > void put_dax(struct dax_device *dax_dev); > void kill_dax(struct dax_device *dax_dev); > void dax_write_cache(struct dax_device *dax_dev, bool wc); > @@ -71,6 +87,17 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, > return dax_synchronous(dax_dev); > } > #else > +static inline void dax_register_holder(struct dax_device *dax_dev, void *holder, > + const struct 
dax_holder_operations *ops) > +{ > +} > +static inline void dax_unregister_holder(struct dax_device *dax_dev) > +{ > +} > +static inline void *dax_get_holder(struct dax_device *dax_dev) > +{ > + return NULL; > +} > static inline struct dax_device *alloc_dax(void *private, > const struct dax_operations *ops, unsigned long flags) > { > @@ -209,6 +236,8 @@ size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, > size_t bytes, struct iov_iter *i); > int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, > size_t nr_pages); > +int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, u64 len, > + int mf_flags); > void dax_flush(struct dax_device *dax_dev, void *addr, size_t size); > > ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, > -- > 2.34.1 > > >
On Wed, Jan 5, 2022 at 10:12 AM Darrick J. Wong <djwong@kernel.org> wrote: > > On Sun, Dec 26, 2021 at 10:34:31PM +0800, Shiyang Ruan wrote: > > To easily track filesystem from a pmem device, we introduce a holder for > > dax_device structure, and also its operation. This holder is used to > > remember who is using this dax_device: > > - When it is the backend of a filesystem, the holder will be the > > instance of this filesystem. > > - When this pmem device is one of the targets in a mapped device, the > > holder will be this mapped device. In this case, the mapped device > > has its own dax_device and it will follow the first rule. So that we > > can finally track to the filesystem we needed. > > > > The holder and holder_ops will be set when filesystem is being mounted, > > or an target device is being activated. > > > > Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com> > > --- > > drivers/dax/super.c | 62 +++++++++++++++++++++++++++++++++++++++++++++ > > include/linux/dax.h | 29 +++++++++++++++++++++ > > 2 files changed, 91 insertions(+) > > > > diff --git a/drivers/dax/super.c b/drivers/dax/super.c > > index c46f56e33d40..94c51f2ee133 100644 > > --- a/drivers/dax/super.c > > +++ b/drivers/dax/super.c > > @@ -20,15 +20,20 @@ > > * @inode: core vfs > > * @cdev: optional character interface for "device dax" > > * @private: dax driver private data > > + * @holder_data: holder of a dax_device: could be filesystem or mapped device > > * @flags: state and boolean properties > > + * @ops: operations for dax_device > > + * @holder_ops: operations for the inner holder > > */ > > struct dax_device { > > struct inode inode; > > struct cdev cdev; > > void *private; > > struct percpu_rw_semaphore rwsem; > > + void *holder_data; > > unsigned long flags; > > const struct dax_operations *ops; > > + const struct dax_holder_operations *holder_ops; > > }; > > > > static dev_t dax_devt; > > @@ -192,6 +197,29 @@ int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t 
pgoff, > > } > > EXPORT_SYMBOL_GPL(dax_zero_page_range); > > > > +int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, > > + u64 len, int mf_flags) > > +{ > > + int rc; > > + > > + dax_read_lock(dax_dev); > > + if (!dax_alive(dax_dev)) { > > + rc = -ENXIO; > > + goto out; > > + } > > + > > + if (!dax_dev->holder_ops) { > > + rc = -EOPNOTSUPP; > > + goto out; > > + } > > + > > + rc = dax_dev->holder_ops->notify_failure(dax_dev, off, len, mf_flags); > > +out: > > + dax_read_unlock(dax_dev); > > + return rc; > > +} > > +EXPORT_SYMBOL_GPL(dax_holder_notify_failure); > > + > > #ifdef CONFIG_ARCH_HAS_PMEM_API > > void arch_wb_cache_pmem(void *addr, size_t size); > > void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) > > @@ -254,6 +282,10 @@ void kill_dax(struct dax_device *dax_dev) > > return; > > dax_write_lock(dax_dev); > > clear_bit(DAXDEV_ALIVE, &dax_dev->flags); > > + > > + /* clear holder data */ > > + dax_dev->holder_ops = NULL; > > + dax_dev->holder_data = NULL; > > dax_write_unlock(dax_dev); > > } > > EXPORT_SYMBOL_GPL(kill_dax); > > @@ -401,6 +433,36 @@ void put_dax(struct dax_device *dax_dev) > > } > > EXPORT_SYMBOL_GPL(put_dax); > > > > +void dax_register_holder(struct dax_device *dax_dev, void *holder, > > + const struct dax_holder_operations *ops) > > +{ > > + if (!dax_alive(dax_dev)) > > + return; > > + > > + dax_dev->holder_data = holder; > > + dax_dev->holder_ops = ops; > > Shouldn't this return an error code if the dax device is dead or if > someone already registered a holder? I'm pretty sure XFS should not > bind to a dax device if someone else already registered for it... Agree, yes. > > ...unless you want to use a notifier chain for failure events so that > there can be multiple consumers of dax failure events? No, I would hope not. It should be 1:1 holders to dax-devices. Similar ownership semantics like bd_prepare_to_claim().
On Wed, Jan 05, 2022 at 10:23:08AM -0800, Dan Williams wrote: > On Wed, Jan 5, 2022 at 10:12 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > > On Sun, Dec 26, 2021 at 10:34:31PM +0800, Shiyang Ruan wrote: > > > To easily track filesystem from a pmem device, we introduce a holder for > > > dax_device structure, and also its operation. This holder is used to > > > remember who is using this dax_device: > > > - When it is the backend of a filesystem, the holder will be the > > > instance of this filesystem. > > > - When this pmem device is one of the targets in a mapped device, the > > > holder will be this mapped device. In this case, the mapped device > > > has its own dax_device and it will follow the first rule. So that we > > > can finally track to the filesystem we needed. > > > > > > The holder and holder_ops will be set when filesystem is being mounted, > > > or an target device is being activated. > > > > > > Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com> > > > --- > > > drivers/dax/super.c | 62 +++++++++++++++++++++++++++++++++++++++++++++ > > > include/linux/dax.h | 29 +++++++++++++++++++++ > > > 2 files changed, 91 insertions(+) > > > > > > diff --git a/drivers/dax/super.c b/drivers/dax/super.c > > > index c46f56e33d40..94c51f2ee133 100644 > > > --- a/drivers/dax/super.c > > > +++ b/drivers/dax/super.c > > > @@ -20,15 +20,20 @@ > > > * @inode: core vfs > > > * @cdev: optional character interface for "device dax" > > > * @private: dax driver private data > > > + * @holder_data: holder of a dax_device: could be filesystem or mapped device > > > * @flags: state and boolean properties > > > + * @ops: operations for dax_device > > > + * @holder_ops: operations for the inner holder > > > */ > > > struct dax_device { > > > struct inode inode; > > > struct cdev cdev; > > > void *private; > > > struct percpu_rw_semaphore rwsem; > > > + void *holder_data; > > > unsigned long flags; > > > const struct dax_operations *ops; > > > + const struct 
dax_holder_operations *holder_ops; > > > }; > > > > > > static dev_t dax_devt; > > > @@ -192,6 +197,29 @@ int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, > > > } > > > EXPORT_SYMBOL_GPL(dax_zero_page_range); > > > > > > +int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, > > > + u64 len, int mf_flags) > > > +{ > > > + int rc; > > > + > > > + dax_read_lock(dax_dev); > > > + if (!dax_alive(dax_dev)) { > > > + rc = -ENXIO; > > > + goto out; > > > + } > > > + > > > + if (!dax_dev->holder_ops) { > > > + rc = -EOPNOTSUPP; > > > + goto out; > > > + } > > > + > > > + rc = dax_dev->holder_ops->notify_failure(dax_dev, off, len, mf_flags); > > > +out: > > > + dax_read_unlock(dax_dev); > > > + return rc; > > > +} > > > +EXPORT_SYMBOL_GPL(dax_holder_notify_failure); > > > + > > > #ifdef CONFIG_ARCH_HAS_PMEM_API > > > void arch_wb_cache_pmem(void *addr, size_t size); > > > void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) > > > @@ -254,6 +282,10 @@ void kill_dax(struct dax_device *dax_dev) > > > return; > > > dax_write_lock(dax_dev); > > > clear_bit(DAXDEV_ALIVE, &dax_dev->flags); > > > + > > > + /* clear holder data */ > > > + dax_dev->holder_ops = NULL; > > > + dax_dev->holder_data = NULL; > > > dax_write_unlock(dax_dev); > > > } > > > EXPORT_SYMBOL_GPL(kill_dax); > > > @@ -401,6 +433,36 @@ void put_dax(struct dax_device *dax_dev) > > > } > > > EXPORT_SYMBOL_GPL(put_dax); > > > > > > +void dax_register_holder(struct dax_device *dax_dev, void *holder, > > > + const struct dax_holder_operations *ops) > > > +{ > > > + if (!dax_alive(dax_dev)) > > > + return; > > > + > > > + dax_dev->holder_data = holder; > > > + dax_dev->holder_ops = ops; > > > > Shouldn't this return an error code if the dax device is dead or if > > someone already registered a holder? I'm pretty sure XFS should not > > bind to a dax device if someone else already registered for it... > > Agree, yes. 
> > > > > ...unless you want to use a notifier chain for failure events so that > > there can be multiple consumers of dax failure events? > > No, I would hope not. It should be 1:1 holders to dax-devices. Similar > ownership semantics like bd_prepare_to_claim(). Does each partition on a pmem device still have its own dax_device? --D
On Wed, Jan 5, 2022 at 10:56 AM Darrick J. Wong <djwong@kernel.org> wrote: > > On Wed, Jan 05, 2022 at 10:23:08AM -0800, Dan Williams wrote: > > On Wed, Jan 5, 2022 at 10:12 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > On Sun, Dec 26, 2021 at 10:34:31PM +0800, Shiyang Ruan wrote: > > > > To easily track filesystem from a pmem device, we introduce a holder for > > > > dax_device structure, and also its operation. This holder is used to > > > > remember who is using this dax_device: > > > > - When it is the backend of a filesystem, the holder will be the > > > > instance of this filesystem. > > > > - When this pmem device is one of the targets in a mapped device, the > > > > holder will be this mapped device. In this case, the mapped device > > > > has its own dax_device and it will follow the first rule. So that we > > > > can finally track to the filesystem we needed. > > > > > > > > The holder and holder_ops will be set when filesystem is being mounted, > > > > or an target device is being activated. 
> > > > > > > > Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com> > > > > --- > > > > drivers/dax/super.c | 62 +++++++++++++++++++++++++++++++++++++++++++++ > > > > include/linux/dax.h | 29 +++++++++++++++++++++ > > > > 2 files changed, 91 insertions(+) > > > > > > > > diff --git a/drivers/dax/super.c b/drivers/dax/super.c > > > > index c46f56e33d40..94c51f2ee133 100644 > > > > --- a/drivers/dax/super.c > > > > +++ b/drivers/dax/super.c > > > > @@ -20,15 +20,20 @@ > > > > * @inode: core vfs > > > > * @cdev: optional character interface for "device dax" > > > > * @private: dax driver private data > > > > + * @holder_data: holder of a dax_device: could be filesystem or mapped device > > > > * @flags: state and boolean properties > > > > + * @ops: operations for dax_device > > > > + * @holder_ops: operations for the inner holder > > > > */ > > > > struct dax_device { > > > > struct inode inode; > > > > struct cdev cdev; > > > > void *private; > > > > struct percpu_rw_semaphore rwsem; > > > > + void *holder_data; > > > > unsigned long flags; > > > > const struct dax_operations *ops; > > > > + const struct dax_holder_operations *holder_ops; > > > > }; > > > > > > > > static dev_t dax_devt; > > > > @@ -192,6 +197,29 @@ int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, > > > > } > > > > EXPORT_SYMBOL_GPL(dax_zero_page_range); > > > > > > > > +int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, > > > > + u64 len, int mf_flags) > > > > +{ > > > > + int rc; > > > > + > > > > + dax_read_lock(dax_dev); > > > > + if (!dax_alive(dax_dev)) { > > > > + rc = -ENXIO; > > > > + goto out; > > > > + } > > > > + > > > > + if (!dax_dev->holder_ops) { > > > > + rc = -EOPNOTSUPP; > > > > + goto out; > > > > + } > > > > + > > > > + rc = dax_dev->holder_ops->notify_failure(dax_dev, off, len, mf_flags); > > > > +out: > > > > + dax_read_unlock(dax_dev); > > > > + return rc; > > > > +} > > > > +EXPORT_SYMBOL_GPL(dax_holder_notify_failure); > > > > + > > > > 
#ifdef CONFIG_ARCH_HAS_PMEM_API > > > > void arch_wb_cache_pmem(void *addr, size_t size); > > > > void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) > > > > @@ -254,6 +282,10 @@ void kill_dax(struct dax_device *dax_dev) > > > > return; > > > > dax_write_lock(dax_dev); > > > > clear_bit(DAXDEV_ALIVE, &dax_dev->flags); > > > > + > > > > + /* clear holder data */ > > > > + dax_dev->holder_ops = NULL; > > > > + dax_dev->holder_data = NULL; > > > > dax_write_unlock(dax_dev); > > > > } > > > > EXPORT_SYMBOL_GPL(kill_dax); > > > > @@ -401,6 +433,36 @@ void put_dax(struct dax_device *dax_dev) > > > > } > > > > EXPORT_SYMBOL_GPL(put_dax); > > > > > > > > +void dax_register_holder(struct dax_device *dax_dev, void *holder, > > > > + const struct dax_holder_operations *ops) > > > > +{ > > > > + if (!dax_alive(dax_dev)) > > > > + return; > > > > + > > > > + dax_dev->holder_data = holder; > > > > + dax_dev->holder_ops = ops; > > > > > > Shouldn't this return an error code if the dax device is dead or if > > > someone already registered a holder? I'm pretty sure XFS should not > > > bind to a dax device if someone else already registered for it... > > > > Agree, yes. > > > > > > > > ...unless you want to use a notifier chain for failure events so that > > > there can be multiple consumers of dax failure events? > > > > No, I would hope not. It should be 1:1 holders to dax-devices. Similar > > ownership semantics like bd_prepare_to_claim(). > > Does each partition on a pmem device still have its own dax_device? No, it never did... Just as before, each dax-device is still associated with a gendisk / whole-block_device. The recent change is that instead of needing that partition-block_device plumbed to convert a relative block number to its absolute whole-block_device offset the filesystem now handles that at iomap_begin() time. See: if (mapping_flags & IOMAP_DAX) iomap->addr += target->bt_dax_part_off; ...in xfs_bmbt_to_iomap() (in -next). I.e. 
bdev_dax_pgoff() is gone with the lead-in reworks.
On Wed, Jan 05, 2022 at 11:20:12AM -0800, Dan Williams wrote: > On Wed, Jan 5, 2022 at 10:56 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > > On Wed, Jan 05, 2022 at 10:23:08AM -0800, Dan Williams wrote: > > > On Wed, Jan 5, 2022 at 10:12 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > On Sun, Dec 26, 2021 at 10:34:31PM +0800, Shiyang Ruan wrote: > > > > > To easily track filesystem from a pmem device, we introduce a holder for > > > > > dax_device structure, and also its operation. This holder is used to > > > > > remember who is using this dax_device: > > > > > - When it is the backend of a filesystem, the holder will be the > > > > > instance of this filesystem. > > > > > - When this pmem device is one of the targets in a mapped device, the > > > > > holder will be this mapped device. In this case, the mapped device > > > > > has its own dax_device and it will follow the first rule. So that we > > > > > can finally track to the filesystem we needed. > > > > > > > > > > The holder and holder_ops will be set when filesystem is being mounted, > > > > > or an target device is being activated. 
> > > > > > > > > > Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com> > > > > > --- > > > > > drivers/dax/super.c | 62 +++++++++++++++++++++++++++++++++++++++++++++ > > > > > include/linux/dax.h | 29 +++++++++++++++++++++ > > > > > 2 files changed, 91 insertions(+) > > > > > > > > > > diff --git a/drivers/dax/super.c b/drivers/dax/super.c > > > > > index c46f56e33d40..94c51f2ee133 100644 > > > > > --- a/drivers/dax/super.c > > > > > +++ b/drivers/dax/super.c > > > > > @@ -20,15 +20,20 @@ > > > > > * @inode: core vfs > > > > > * @cdev: optional character interface for "device dax" > > > > > * @private: dax driver private data > > > > > + * @holder_data: holder of a dax_device: could be filesystem or mapped device > > > > > * @flags: state and boolean properties > > > > > + * @ops: operations for dax_device > > > > > + * @holder_ops: operations for the inner holder > > > > > */ > > > > > struct dax_device { > > > > > struct inode inode; > > > > > struct cdev cdev; > > > > > void *private; > > > > > struct percpu_rw_semaphore rwsem; > > > > > + void *holder_data; > > > > > unsigned long flags; > > > > > const struct dax_operations *ops; > > > > > + const struct dax_holder_operations *holder_ops; > > > > > }; > > > > > > > > > > static dev_t dax_devt; > > > > > @@ -192,6 +197,29 @@ int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, > > > > > } > > > > > EXPORT_SYMBOL_GPL(dax_zero_page_range); > > > > > > > > > > +int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, > > > > > + u64 len, int mf_flags) > > > > > +{ > > > > > + int rc; > > > > > + > > > > > + dax_read_lock(dax_dev); > > > > > + if (!dax_alive(dax_dev)) { > > > > > + rc = -ENXIO; > > > > > + goto out; > > > > > + } > > > > > + > > > > > + if (!dax_dev->holder_ops) { > > > > > + rc = -EOPNOTSUPP; > > > > > + goto out; > > > > > + } > > > > > + > > > > > + rc = dax_dev->holder_ops->notify_failure(dax_dev, off, len, mf_flags); > > > > > +out: > > > > > + 
dax_read_unlock(dax_dev); > > > > > + return rc; > > > > > +} > > > > > +EXPORT_SYMBOL_GPL(dax_holder_notify_failure); > > > > > + > > > > > #ifdef CONFIG_ARCH_HAS_PMEM_API > > > > > void arch_wb_cache_pmem(void *addr, size_t size); > > > > > void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) > > > > > @@ -254,6 +282,10 @@ void kill_dax(struct dax_device *dax_dev) > > > > > return; > > > > > dax_write_lock(dax_dev); > > > > > clear_bit(DAXDEV_ALIVE, &dax_dev->flags); > > > > > + > > > > > + /* clear holder data */ > > > > > + dax_dev->holder_ops = NULL; > > > > > + dax_dev->holder_data = NULL; > > > > > dax_write_unlock(dax_dev); > > > > > } > > > > > EXPORT_SYMBOL_GPL(kill_dax); > > > > > @@ -401,6 +433,36 @@ void put_dax(struct dax_device *dax_dev) > > > > > } > > > > > EXPORT_SYMBOL_GPL(put_dax); > > > > > > > > > > +void dax_register_holder(struct dax_device *dax_dev, void *holder, > > > > > + const struct dax_holder_operations *ops) > > > > > +{ > > > > > + if (!dax_alive(dax_dev)) > > > > > + return; > > > > > + > > > > > + dax_dev->holder_data = holder; > > > > > + dax_dev->holder_ops = ops; > > > > > > > > Shouldn't this return an error code if the dax device is dead or if > > > > someone already registered a holder? I'm pretty sure XFS should not > > > > bind to a dax device if someone else already registered for it... > > > > > > Agree, yes. > > > > > > > > > > > ...unless you want to use a notifier chain for failure events so that > > > > there can be multiple consumers of dax failure events? > > > > > > No, I would hope not. It should be 1:1 holders to dax-devices. Similar > > > ownership semantics like bd_prepare_to_claim(). > > > > Does each partition on a pmem device still have its own dax_device? > > No, it never did... > > Just as before, each dax-device is still associated with a gendisk / > whole-block_device. 
The recent change is that instead of needing that > partition-block_device plumbed to convert a relative block number to > its absolute whole-block_device offset the filesystem now handles that > at iomap_begin() time. See: > > if (mapping_flags & IOMAP_DAX) > iomap->addr += target->bt_dax_part_off; > > ...in xfs_bmbt_to_iomap() (in -next). I.e. bdev_dax_pgoff() is gone > with the lead-in reworks. OH. Crap, Dan's right: XFS (pmem0p1): ddev dax = 0xffff88800304ed00 bdev = 0xffff8880480d6180 XFS (pmem0p2): ddev dax = 0xffff88800304ed00 bdev = 0xffff8880480d4380 Unless you're planning to remove partition support too, this patch needs to be reworked so that multiple filesystems in separate partitions can each call dax_register_holder to receive memory_failure notifications for their partition. /methinks this sharing is all a little scary... --D
On Wed, Jan 5, 2022 at 2:47 PM Darrick J. Wong <djwong@kernel.org> wrote: > > On Wed, Jan 05, 2022 at 11:20:12AM -0800, Dan Williams wrote: > > On Wed, Jan 5, 2022 at 10:56 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > On Wed, Jan 05, 2022 at 10:23:08AM -0800, Dan Williams wrote: > > > > On Wed, Jan 5, 2022 at 10:12 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > > > On Sun, Dec 26, 2021 at 10:34:31PM +0800, Shiyang Ruan wrote: > > > > > > To easily track filesystem from a pmem device, we introduce a holder for > > > > > > dax_device structure, and also its operation. This holder is used to > > > > > > remember who is using this dax_device: > > > > > > - When it is the backend of a filesystem, the holder will be the > > > > > > instance of this filesystem. > > > > > > - When this pmem device is one of the targets in a mapped device, the > > > > > > holder will be this mapped device. In this case, the mapped device > > > > > > has its own dax_device and it will follow the first rule. So that we > > > > > > can finally track to the filesystem we needed. > > > > > > > > > > > > The holder and holder_ops will be set when filesystem is being mounted, > > > > > > or an target device is being activated. 
> > > > > > > > > > > > Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com> > > > > > > --- > > > > > > drivers/dax/super.c | 62 +++++++++++++++++++++++++++++++++++++++++++++ > > > > > > include/linux/dax.h | 29 +++++++++++++++++++++ > > > > > > 2 files changed, 91 insertions(+) > > > > > > > > > > > > diff --git a/drivers/dax/super.c b/drivers/dax/super.c > > > > > > index c46f56e33d40..94c51f2ee133 100644 > > > > > > --- a/drivers/dax/super.c > > > > > > +++ b/drivers/dax/super.c > > > > > > @@ -20,15 +20,20 @@ > > > > > > * @inode: core vfs > > > > > > * @cdev: optional character interface for "device dax" > > > > > > * @private: dax driver private data > > > > > > + * @holder_data: holder of a dax_device: could be filesystem or mapped device > > > > > > * @flags: state and boolean properties > > > > > > + * @ops: operations for dax_device > > > > > > + * @holder_ops: operations for the inner holder > > > > > > */ > > > > > > struct dax_device { > > > > > > struct inode inode; > > > > > > struct cdev cdev; > > > > > > void *private; > > > > > > struct percpu_rw_semaphore rwsem; > > > > > > + void *holder_data; > > > > > > unsigned long flags; > > > > > > const struct dax_operations *ops; > > > > > > + const struct dax_holder_operations *holder_ops; > > > > > > }; > > > > > > > > > > > > static dev_t dax_devt; > > > > > > @@ -192,6 +197,29 @@ int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, > > > > > > } > > > > > > EXPORT_SYMBOL_GPL(dax_zero_page_range); > > > > > > > > > > > > +int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, > > > > > > + u64 len, int mf_flags) > > > > > > +{ > > > > > > + int rc; > > > > > > + > > > > > > + dax_read_lock(dax_dev); > > > > > > + if (!dax_alive(dax_dev)) { > > > > > > + rc = -ENXIO; > > > > > > + goto out; > > > > > > + } > > > > > > + > > > > > > + if (!dax_dev->holder_ops) { > > > > > > + rc = -EOPNOTSUPP; > > > > > > + goto out; > > > > > > + } > > > > > > + > > > > > > + rc = 
dax_dev->holder_ops->notify_failure(dax_dev, off, len, mf_flags); > > > > > > +out: > > > > > > + dax_read_unlock(dax_dev); > > > > > > + return rc; > > > > > > +} > > > > > > +EXPORT_SYMBOL_GPL(dax_holder_notify_failure); > > > > > > + > > > > > > #ifdef CONFIG_ARCH_HAS_PMEM_API > > > > > > void arch_wb_cache_pmem(void *addr, size_t size); > > > > > > void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) > > > > > > @@ -254,6 +282,10 @@ void kill_dax(struct dax_device *dax_dev) > > > > > > return; > > > > > > dax_write_lock(dax_dev); > > > > > > clear_bit(DAXDEV_ALIVE, &dax_dev->flags); > > > > > > + > > > > > > + /* clear holder data */ > > > > > > + dax_dev->holder_ops = NULL; > > > > > > + dax_dev->holder_data = NULL; > > > > > > dax_write_unlock(dax_dev); > > > > > > } > > > > > > EXPORT_SYMBOL_GPL(kill_dax); > > > > > > @@ -401,6 +433,36 @@ void put_dax(struct dax_device *dax_dev) > > > > > > } > > > > > > EXPORT_SYMBOL_GPL(put_dax); > > > > > > > > > > > > +void dax_register_holder(struct dax_device *dax_dev, void *holder, > > > > > > + const struct dax_holder_operations *ops) > > > > > > +{ > > > > > > + if (!dax_alive(dax_dev)) > > > > > > + return; > > > > > > + > > > > > > + dax_dev->holder_data = holder; > > > > > > + dax_dev->holder_ops = ops; > > > > > > > > > > Shouldn't this return an error code if the dax device is dead or if > > > > > someone already registered a holder? I'm pretty sure XFS should not > > > > > bind to a dax device if someone else already registered for it... > > > > > > > > Agree, yes. > > > > > > > > > > > > > > ...unless you want to use a notifier chain for failure events so that > > > > > there can be multiple consumers of dax failure events? > > > > > > > > No, I would hope not. It should be 1:1 holders to dax-devices. Similar > > > > ownership semantics like bd_prepare_to_claim(). > > > > > > Does each partition on a pmem device still have its own dax_device? > > > > No, it never did... 
> > > > Just as before, each dax-device is still associated with a gendisk / > > whole-block_device. The recent change is that instead of needing that > > partition-block_device plumbed to convert a relative block number to > > its absolute whole-block_device offset the filesystem now handles that > > at iomap_begin() time. See: > > > > if (mapping_flags & IOMAP_DAX) > > iomap->addr += target->bt_dax_part_off; > > > > ...in xfs_bmbt_to_iomap() (in -next). I.e. bdev_dax_pgoff() is gone > > with the lead-in reworks. > > OH. Crap, Dan's right: > > XFS (pmem0p1): ddev dax = 0xffff88800304ed00 bdev = 0xffff8880480d6180 > XFS (pmem0p2): ddev dax = 0xffff88800304ed00 bdev = 0xffff8880480d4380 > > Unless you're planning to remove partition support too, this patch needs > to be reworked so that multiple filesystems in separate partitions can > each call dax_register_holder to receive memory_failure notifications > for their partition. Oh, crap indeed. I think this gets back to the slow tip-toeing away from dax + partition support. While FSDAX can continue to support "legacy/experimental" operation on partitions of DAX capable block devices, I think failure notification + reflink support is where we draw the line and say "DAX on partitions was a mistake, it's too late to undo that mistake, but going forward for new FSDAX features it requires switching from block-device partitions to pmem-namespaces if you need sub-division support and new DAX features." > /methinks this sharing is all a little scary... Yes, I think we should just fail the holder registration and DAX+reflink unless the FS being mounted on a whole device. I know Ted and others had reservations about moving filesystems to be mounted on dax-devices directly, but hopefully the whole-block_device requirement is a workable middle ground?
On Wed, Jan 05, 2022 at 03:01:22PM -0800, Dan Williams wrote: > On Wed, Jan 5, 2022 at 2:47 PM Darrick J. Wong <djwong@kernel.org> wrote: > > > > On Wed, Jan 05, 2022 at 11:20:12AM -0800, Dan Williams wrote: > > > On Wed, Jan 5, 2022 at 10:56 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > On Wed, Jan 05, 2022 at 10:23:08AM -0800, Dan Williams wrote: > > > > > On Wed, Jan 5, 2022 at 10:12 AM Darrick J. Wong <djwong@kernel.org> wrote: > > > > > > > > > > > > On Sun, Dec 26, 2021 at 10:34:31PM +0800, Shiyang Ruan wrote: > > > > > > > To easily track filesystem from a pmem device, we introduce a holder for > > > > > > > dax_device structure, and also its operation. This holder is used to > > > > > > > remember who is using this dax_device: > > > > > > > - When it is the backend of a filesystem, the holder will be the > > > > > > > instance of this filesystem. > > > > > > > - When this pmem device is one of the targets in a mapped device, the > > > > > > > holder will be this mapped device. In this case, the mapped device > > > > > > > has its own dax_device and it will follow the first rule. So that we > > > > > > > can finally track to the filesystem we needed. > > > > > > > > > > > > > > The holder and holder_ops will be set when filesystem is being mounted, > > > > > > > or an target device is being activated. 
> > > > > > > > > > > > > > Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com> > > > > > > > --- > > > > > > > drivers/dax/super.c | 62 +++++++++++++++++++++++++++++++++++++++++++++ > > > > > > > include/linux/dax.h | 29 +++++++++++++++++++++ > > > > > > > 2 files changed, 91 insertions(+) > > > > > > > > > > > > > > diff --git a/drivers/dax/super.c b/drivers/dax/super.c > > > > > > > index c46f56e33d40..94c51f2ee133 100644 > > > > > > > --- a/drivers/dax/super.c > > > > > > > +++ b/drivers/dax/super.c > > > > > > > @@ -20,15 +20,20 @@ > > > > > > > * @inode: core vfs > > > > > > > * @cdev: optional character interface for "device dax" > > > > > > > * @private: dax driver private data > > > > > > > + * @holder_data: holder of a dax_device: could be filesystem or mapped device > > > > > > > * @flags: state and boolean properties > > > > > > > + * @ops: operations for dax_device > > > > > > > + * @holder_ops: operations for the inner holder > > > > > > > */ > > > > > > > struct dax_device { > > > > > > > struct inode inode; > > > > > > > struct cdev cdev; > > > > > > > void *private; > > > > > > > struct percpu_rw_semaphore rwsem; > > > > > > > + void *holder_data; > > > > > > > unsigned long flags; > > > > > > > const struct dax_operations *ops; > > > > > > > + const struct dax_holder_operations *holder_ops; > > > > > > > }; > > > > > > > > > > > > > > static dev_t dax_devt; > > > > > > > @@ -192,6 +197,29 @@ int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, > > > > > > > } > > > > > > > EXPORT_SYMBOL_GPL(dax_zero_page_range); > > > > > > > > > > > > > > +int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, > > > > > > > + u64 len, int mf_flags) > > > > > > > +{ > > > > > > > + int rc; > > > > > > > + > > > > > > > + dax_read_lock(dax_dev); > > > > > > > + if (!dax_alive(dax_dev)) { > > > > > > > + rc = -ENXIO; > > > > > > > + goto out; > > > > > > > + } > > > > > > > + > > > > > > > + if (!dax_dev->holder_ops) { > > > > > > > + rc 
= -EOPNOTSUPP; > > > > > > > + goto out; > > > > > > > + } > > > > > > > + > > > > > > > + rc = dax_dev->holder_ops->notify_failure(dax_dev, off, len, mf_flags); > > > > > > > +out: > > > > > > > + dax_read_unlock(dax_dev); > > > > > > > + return rc; > > > > > > > +} > > > > > > > +EXPORT_SYMBOL_GPL(dax_holder_notify_failure); > > > > > > > + > > > > > > > #ifdef CONFIG_ARCH_HAS_PMEM_API > > > > > > > void arch_wb_cache_pmem(void *addr, size_t size); > > > > > > > void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) > > > > > > > @@ -254,6 +282,10 @@ void kill_dax(struct dax_device *dax_dev) > > > > > > > return; > > > > > > > dax_write_lock(dax_dev); > > > > > > > clear_bit(DAXDEV_ALIVE, &dax_dev->flags); > > > > > > > + > > > > > > > + /* clear holder data */ > > > > > > > + dax_dev->holder_ops = NULL; > > > > > > > + dax_dev->holder_data = NULL; > > > > > > > dax_write_unlock(dax_dev); > > > > > > > } > > > > > > > EXPORT_SYMBOL_GPL(kill_dax); > > > > > > > @@ -401,6 +433,36 @@ void put_dax(struct dax_device *dax_dev) > > > > > > > } > > > > > > > EXPORT_SYMBOL_GPL(put_dax); > > > > > > > > > > > > > > +void dax_register_holder(struct dax_device *dax_dev, void *holder, > > > > > > > + const struct dax_holder_operations *ops) > > > > > > > +{ > > > > > > > + if (!dax_alive(dax_dev)) > > > > > > > + return; > > > > > > > + > > > > > > > + dax_dev->holder_data = holder; > > > > > > > + dax_dev->holder_ops = ops; > > > > > > > > > > > > Shouldn't this return an error code if the dax device is dead or if > > > > > > someone already registered a holder? I'm pretty sure XFS should not > > > > > > bind to a dax device if someone else already registered for it... > > > > > > > > > > Agree, yes. > > > > > > > > > > > > > > > > > ...unless you want to use a notifier chain for failure events so that > > > > > > there can be multiple consumers of dax failure events? > > > > > > > > > > No, I would hope not. It should be 1:1 holders to dax-devices. 
Similar > > > > > ownership semantics like bd_prepare_to_claim(). > > > > > > > > Does each partition on a pmem device still have its own dax_device? > > > > > > No, it never did... > > > > > > Just as before, each dax-device is still associated with a gendisk / > > > whole-block_device. The recent change is that instead of needing that > > > partition-block_device plumbed to convert a relative block number to > > > its absolute whole-block_device offset the filesystem now handles that > > > at iomap_begin() time. See: > > > > > > if (mapping_flags & IOMAP_DAX) > > > iomap->addr += target->bt_dax_part_off; > > > > > > ...in xfs_bmbt_to_iomap() (in -next). I.e. bdev_dax_pgoff() is gone > > > with the lead-in reworks. > > > > OH. Crap, Dan's right: > > > > XFS (pmem0p1): ddev dax = 0xffff88800304ed00 bdev = 0xffff8880480d6180 > > XFS (pmem0p2): ddev dax = 0xffff88800304ed00 bdev = 0xffff8880480d4380 > > > > Unless you're planning to remove partition support too, this patch needs > > to be reworked so that multiple filesystems in separate partitions can > > each call dax_register_holder to receive memory_failure notifications > > for their partition. > > Oh, crap indeed. I think this gets back to the slow tip-toeing away > from dax + partition support. While FSDAX can continue to support > "legacy/experimental" operation on partitions of DAX capable block > devices, I think failure notification + reflink support is where we > draw the line and say "DAX on partitions was a mistake, it's too late > to undo that mistake, but going forward for new FSDAX features it > requires switching from block-device partitions to pmem-namespaces if > you need sub-division support and new DAX features." > > > /methinks this sharing is all a little scary... > > Yes, I think we should just fail the holder registration and > DAX+reflink unless the FS being mounted on a whole device. 
I know Ted > and others had reservations about moving filesystems to be mounted on > dax-devices directly, but hopefully the whole-block_device requirement > is a workable middle ground? I think you have to be /very/ careful about that kind of statement -- Take ext4 for example. It has a lot of statically allocated ondisk metadata. Someone could decide that it's a good idea to wire up a media failure notification so that we shut down the fs if (say) a giant hole opens up in the middle of the inode table. However, registering any kind of media failure handler brings along this requirement for not having partitions. This means that if ext4 finds a filesystem on a partition on a pmem device and someone else has already registered a media failure handler, it will have to choose between foregoing media failure notifications or failing the mount outright. Or you could support notification call chains... --D PS: I was about to say that ext4 lacks reverse mapping and reflink, but I forgot that ext4 *does* support reflink now. It just doesn't support copy on write.
On Wed, Jan 5, 2022 at 3:54 PM Darrick J. Wong <djwong@kernel.org> wrote: [..] > > Yes, I think we should just fail the holder registration and > > DAX+reflink unless the FS being mounted on a whole device. I know Ted > > and others had reservations about moving filesystems to be mounted on > > dax-devices directly, but hopefully the whole-block_device requirement > > is a workable middle ground? > > I think you have to be /very/ careful about that kind of statement -- > > Take ext4 for example. It has a lot of statically allocated ondisk > metadata. Someone could decide that it's a good idea to wire up a media > failure notification so that we shut down the fs if (say) a giant hole > opens up in the middle of the inode table. However, registering any > kind of media failure handler brings along this requirement for not > having partitions. > > This means that if ext4 finds a filesystem on a partition on a pmem > device and someone else has already registered a media failure handler, > it will have to choose between foregoing media failure notifications or > failing the mount outright. ...good example. > Or you could support notification call chains... We ended up with explicit callbacks after hch balked at a notifier call-chain, but I think we're back to that now. The partition mistake might be unfixable, but at least bdev_dax_pgoff() is dead. Notifier call chains have their own locking so, Ruan, this still does not need to touch dax_read_lock().
On Wed, Jan 05, 2022 at 04:12:04PM -0800, Dan Williams wrote: > We ended up with explicit callbacks after hch balked at a notifier > call-chain, but I think we're back to that now. The partition mistake > might be unfixable, but at least bdev_dax_pgoff() is dead. Notifier > call chains have their own locking so, Ruan, this still does not need > to touch dax_read_lock(). I think we have a few options here: (1) don't allow error notifications on partitions. An error return from the holder registration with proper error handling in the file system would give us that (2) extend the holder mechanism to cover a range (3) bite the bullet and create a new stacked dax_device for each partition I think (1) is the best option for now. If people really do need partitions we'll have to go for (3)
在 2022/1/20 16:46, Christoph Hellwig 写道: > On Wed, Jan 05, 2022 at 04:12:04PM -0800, Dan Williams wrote: >> We ended up with explicit callbacks after hch balked at a notifier >> call-chain, but I think we're back to that now. The partition mistake >> might be unfixable, but at least bdev_dax_pgoff() is dead. Notifier >> call chains have their own locking so, Ruan, this still does not need >> to touch dax_read_lock(). > > I think we have a few options here: > > (1) don't allow error notifications on partitions. And error return from > the holder registration with proper error handling in the file > system would give us that > (2) extent the holder mechanism to cover a range > (3) bite the bullet and create a new stacked dax_device for each > partition > > I think (1) is the best option for now. If people really do need > partitions we'll have to go for (3) Yes, I agree. I'm doing it the first way right now. I think that since we can use namespace to divide a big NVDIMM into multiple pmems, partition on a pmem seems not so meaningful. -- Thanks, Ruan.
On Fri, Jan 21, 2022 at 09:26:52AM +0800, Shiyang Ruan wrote: > > > 在 2022/1/20 16:46, Christoph Hellwig 写道: > > On Wed, Jan 05, 2022 at 04:12:04PM -0800, Dan Williams wrote: > > > We ended up with explicit callbacks after hch balked at a notifier > > > call-chain, but I think we're back to that now. The partition mistake > > > might be unfixable, but at least bdev_dax_pgoff() is dead. Notifier > > > call chains have their own locking so, Ruan, this still does not need > > > to touch dax_read_lock(). > > > > I think we have a few options here: > > > > (1) don't allow error notifications on partitions. And error return from > > the holder registration with proper error handling in the file > > system would give us that Hm, so that means XFS can only support dax+pmem when there aren't partitions in use? Ew. > > (2) extent the holder mechanism to cover a rangeo I don't think I was around for the part where "hch balked at a notifier call chain" -- what were the objections there, specifically? I would hope that pmem problems would be infrequent enough that the locking contention (or rcu expiration) wouldn't be an issue...? > > (3) bite the bullet and create a new stacked dax_device for each > > partition > > > > I think (1) is the best option for now. If people really do need > > partitions we'll have to go for (3) > > Yes, I agree. I'm doing it the first way right now. > > I think that since we can use namespace to divide a big NVDIMM into multiple > pmems, partition on a pmem seems not so meaningful. I'll try to find out what will happen if pmem suddenly stops supporting partitions... --D > > -- > Thanks, > Ruan. > >
On Thu, Jan 20, 2022 at 06:22:00PM -0800, Darrick J. Wong wrote: > Hm, so that means XFS can only support dax+pmem when there aren't > partitions in use? Ew. Yes. Or any sensible DAX usage going forward for that matter. > > > > (2) extend the holder mechanism to cover a range > > I don't think I was around for the part where "hch balked at a notifier > call chain" -- what were the objections there, specifically? I would > hope that pmem problems would be infrequent enough that the locking > contention (or rcu expiration) wouldn't be an issue...? notifiers are a nightmare untyped API leading to tons of boilerplate code. Open coding the notification is almost always a better idea.
On Thu, Jan 20, 2022 at 6:22 PM Darrick J. Wong <djwong@kernel.org> wrote: > > On Fri, Jan 21, 2022 at 09:26:52AM +0800, Shiyang Ruan wrote: > > > > > > 在 2022/1/20 16:46, Christoph Hellwig 写道: > > > On Wed, Jan 05, 2022 at 04:12:04PM -0800, Dan Williams wrote: > > > > We ended up with explicit callbacks after hch balked at a notifier > > > > call-chain, but I think we're back to that now. The partition mistake > > > > might be unfixable, but at least bdev_dax_pgoff() is dead. Notifier > > > > call chains have their own locking so, Ruan, this still does not need > > > > to touch dax_read_lock(). > > > > > > I think we have a few options here: > > > > > > (1) don't allow error notifications on partitions. And error return from > > > the holder registration with proper error handling in the file > > > system would give us that > > Hm, so that means XFS can only support dax+pmem when there aren't > partitions in use? Ew. > > > > (2) extent the holder mechanism to cover a rangeo > > I don't think I was around for the part where "hch balked at a notifier > call chain" -- what were the objections there, specifically? I would > hope that pmem problems would be infrequent enough that the locking > contention (or rcu expiration) wouldn't be an issue...? > > > > (3) bite the bullet and create a new stacked dax_device for each > > > partition > > > > > > I think (1) is the best option for now. If people really do need > > > partitions we'll have to go for (3) > > > > Yes, I agree. I'm doing it the first way right now. > > > > I think that since we can use namespace to divide a big NVDIMM into multiple > > pmems, partition on a pmem seems not so meaningful. > > I'll try to find out what will happen if pmem suddenly stops supporting > partitions... Finally catching up with this thread... Given that XFS already has the policy of disabling DAX rather than failing the mount in some cases, I think it is workable for XFS to fail a DAX mount if reflink is enabled on a partition. 
This should not regress anyone's current setup since the FS will not even mount with dax+reflink today. As to the specific concern about registering failure handlers for other purposes I expect that can be done by registering failure notification handlers on block devices, not dax devices. So it's not that pmem will suddenly stop supporting partitions, dax will simply never gain support for reflink in the presence of partitions.
diff --git a/drivers/dax/super.c b/drivers/dax/super.c index c46f56e33d40..94c51f2ee133 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -20,15 +20,20 @@ * @inode: core vfs * @cdev: optional character interface for "device dax" * @private: dax driver private data + * @holder_data: holder of a dax_device: could be filesystem or mapped device * @flags: state and boolean properties + * @ops: operations for dax_device + * @holder_ops: operations for the inner holder */ struct dax_device { struct inode inode; struct cdev cdev; void *private; struct percpu_rw_semaphore rwsem; + void *holder_data; unsigned long flags; const struct dax_operations *ops; + const struct dax_holder_operations *holder_ops; }; static dev_t dax_devt; @@ -192,6 +197,29 @@ int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, } EXPORT_SYMBOL_GPL(dax_zero_page_range); +int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, + u64 len, int mf_flags) +{ + int rc; + + dax_read_lock(dax_dev); + if (!dax_alive(dax_dev)) { + rc = -ENXIO; + goto out; + } + + if (!dax_dev->holder_ops) { + rc = -EOPNOTSUPP; + goto out; + } + + rc = dax_dev->holder_ops->notify_failure(dax_dev, off, len, mf_flags); +out: + dax_read_unlock(dax_dev); + return rc; +} +EXPORT_SYMBOL_GPL(dax_holder_notify_failure); + #ifdef CONFIG_ARCH_HAS_PMEM_API void arch_wb_cache_pmem(void *addr, size_t size); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) @@ -254,6 +282,10 @@ void kill_dax(struct dax_device *dax_dev) return; dax_write_lock(dax_dev); clear_bit(DAXDEV_ALIVE, &dax_dev->flags); + + /* clear holder data */ + dax_dev->holder_ops = NULL; + dax_dev->holder_data = NULL; dax_write_unlock(dax_dev); } EXPORT_SYMBOL_GPL(kill_dax); @@ -401,6 +433,36 @@ void put_dax(struct dax_device *dax_dev) } EXPORT_SYMBOL_GPL(put_dax); +void dax_register_holder(struct dax_device *dax_dev, void *holder, + const struct dax_holder_operations *ops) +{ + if (!dax_alive(dax_dev)) + return; + + 
dax_dev->holder_data = holder; + dax_dev->holder_ops = ops; +} +EXPORT_SYMBOL_GPL(dax_register_holder); + +void dax_unregister_holder(struct dax_device *dax_dev) +{ + if (!dax_alive(dax_dev)) + return; + + dax_dev->holder_data = NULL; + dax_dev->holder_ops = NULL; +} +EXPORT_SYMBOL_GPL(dax_unregister_holder); + +void *dax_get_holder(struct dax_device *dax_dev) +{ + if (!dax_alive(dax_dev)) + return NULL; + + return dax_dev->holder_data; +} +EXPORT_SYMBOL_GPL(dax_get_holder); + /** * inode_dax: convert a public inode into its dax_dev * @inode: An inode with i_cdev pointing to a dax_dev diff --git a/include/linux/dax.h b/include/linux/dax.h index a146bfb80804..e16a9e0ee857 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -44,6 +44,22 @@ struct dax_operations { #if IS_ENABLED(CONFIG_DAX) struct dax_device *alloc_dax(void *private, const struct dax_operations *ops, unsigned long flags); +struct dax_holder_operations { + /* + * notify_failure - notify memory failure into inner holder device + * @dax_dev: the dax device which contains the holder + * @offset: offset on this dax device where memory failure occurs + * @len: length of this memory failure event + * @flags: action flags for memory failure handler + */ + int (*notify_failure)(struct dax_device *dax_dev, u64 offset, + u64 len, int mf_flags); +}; + +void dax_register_holder(struct dax_device *dax_dev, void *holder, + const struct dax_holder_operations *ops); +void dax_unregister_holder(struct dax_device *dax_dev); +void *dax_get_holder(struct dax_device *dax_dev); void put_dax(struct dax_device *dax_dev); void kill_dax(struct dax_device *dax_dev); void dax_write_cache(struct dax_device *dax_dev, bool wc); @@ -71,6 +87,17 @@ static inline bool daxdev_mapping_supported(struct vm_area_struct *vma, return dax_synchronous(dax_dev); } #else +static inline void dax_register_holder(struct dax_device *dax_dev, void *holder, + const struct dax_holder_operations *ops) +{ +} +static inline void 
dax_unregister_holder(struct dax_device *dax_dev) +{ +} +static inline void *dax_get_holder(struct dax_device *dax_dev) +{ + return NULL; +} static inline struct dax_device *alloc_dax(void *private, const struct dax_operations *ops, unsigned long flags) { @@ -209,6 +236,8 @@ size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i); int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff, size_t nr_pages); +int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, u64 len, + int mf_flags); void dax_flush(struct dax_device *dax_dev, void *addr, size_t size); ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
To easily track the filesystem from a pmem device, we introduce a holder for the dax_device structure, and also its operations. This holder is used to remember who is using this dax_device: - When it is the backend of a filesystem, the holder will be the instance of this filesystem. - When this pmem device is one of the targets in a mapped device, the holder will be this mapped device. In this case, the mapped device has its own dax_device and it will follow the first rule. So that we can finally track to the filesystem we needed. The holder and holder_ops will be set when a filesystem is being mounted, or a target device is being activated. Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com> --- drivers/dax/super.c | 62 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/dax.h | 29 +++++++++++++++++++++ 2 files changed, 91 insertions(+)