Message ID | 20230509054714.1746956-2-jane.chu@oracle.com (mailing list archive)
---|---
State | Superseded
Series | dax: enable dax fault handler to report VM_FAULT_HWPOISON
Ping...  Is there any further concern?
-jane

On 5/8/2023 10:47 PM, Jane Chu wrote:
> When multiple processes mmap() a dax file and, at some point, one
> process issues a 'load' and consumes a hwpoison, that process
> receives a SIGBUS with si_code = BUS_MCEERR_AR and si_lsb set to
> the poison scope. Soon after, when any other process issues a
> 'load' to the poisoned page (which memory_failure() has unmapped
> on the kernel side), it receives a SIGBUS with si_code = BUS_ADRERR
> and without a valid si_lsb.
[..]
Jane Chu wrote:
[..]
> +static inline int dax_mem2blk_err(int err)
> +{
> +	return (err == -EHWPOISON) ? -EIO : err;
> +}

I think it is worth a comment on this function to indicate where this
helper is *not* used. I.e. it's easy to grep for where the error code is
converted for file I/O errors, but the subtlety of when the
dax_direct_access() return value is not translated for fault handling
deserves a comment when we wonder why this function exists in a few
years time.
On 6/8/2023 8:16 PM, Dan Williams wrote:
[..]
>> +static inline int dax_mem2blk_err(int err)
>> +{
>> +	return (err == -EHWPOISON) ? -EIO : err;
>> +}
>
> I think it is worth a comment on this function to indicate where this
> helper is *not* used. I.e. it's easy to grep for where the error code
> is converted for file I/O errors, but the subtlety of when the
> dax_direct_access() return value is not translated for fault handling
> deserves a comment when we wonder why this function exists in a few
> years time.

Good point!  v5 coming up next.

thanks!
-jane
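For illustration, the kind of comment being requested might read like the
sketch below; the wording here is hypothetical, and the actual text is
whatever lands in v5.

/*
 * Hypothetical sketch of the requested comment -- not the v5 wording.
 *
 * Translate a memory-error return from the dax backend into a block-I/O
 * errno.  Used on the file/block I/O paths (read, write, zero) so that a
 * poisoned range surfaces as -EIO, preserving the block API contract.
 * Deliberately not used on the mmap fault path, where the raw -EHWPOISON
 * from dax_direct_access() is kept so vmf_error() can report
 * VM_FAULT_HWPOISON and the process receives SIGBUS with
 * si_code = BUS_MCEERR_AR.
 */
static inline int dax_mem2blk_err(int err)
{
	return (err == -EHWPOISON) ? -EIO : err;
}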
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index c4c4728a36e4..0da9232ea175 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -203,6 +203,8 @@ size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
 int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
 			size_t nr_pages)
 {
+	int ret;
+
 	if (!dax_alive(dax_dev))
 		return -ENXIO;
 	/*
@@ -213,7 +215,8 @@ int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
 	if (nr_pages != 1)
 		return -EIO;
 
-	return dax_dev->ops->zero_page_range(dax_dev, pgoff, nr_pages);
+	ret = dax_dev->ops->zero_page_range(dax_dev, pgoff, nr_pages);
+	return dax_mem2blk_err(ret);
 }
 EXPORT_SYMBOL_GPL(dax_zero_page_range);
 
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index ceea55f621cc..46e094e56159 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -260,7 +260,7 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
 		long actual_nr;
 
 		if (mode != DAX_RECOVERY_WRITE)
-			return -EIO;
+			return -EHWPOISON;
 
 		/*
 		 * Set the recovery stride is set to kernel page size because
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index c09f2e053bf8..ee47ac520cd4 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -54,7 +54,8 @@ static int dcssblk_dax_zero_page_range(struct dax_device *dax_dev,
 	rc = dax_direct_access(dax_dev, pgoff, nr_pages, DAX_ACCESS,
 			&kaddr, NULL);
 	if (rc < 0)
-		return rc;
+		return dax_mem2blk_err(rc);
+
 	memset(kaddr, 0, nr_pages << PAGE_SHIFT);
 	dax_flush(dax_dev, kaddr, nr_pages << PAGE_SHIFT);
 	return 0;
diff --git a/fs/dax.c b/fs/dax.c
index 2ababb89918d..a26eb5abfdc0 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1148,7 +1148,7 @@ static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size,
 	if (!zero_edge) {
 		ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
 		if (ret)
-			return ret;
+			return dax_mem2blk_err(ret);
 	}
 
 	if (copy_all) {
@@ -1310,7 +1310,7 @@ static s64 dax_unshare_iter(struct iomap_iter *iter)
 
 out_unlock:
 	dax_read_unlock(id);
-	return ret;
+	return dax_mem2blk_err(ret);
 }
 
 int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
@@ -1342,7 +1342,8 @@ static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
 	ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr,
 				NULL);
 	if (ret < 0)
-		return ret;
+		return dax_mem2blk_err(ret);
+
 	memset(kaddr + offset, 0, size);
 	if (iomap->flags & IOMAP_F_SHARED)
 		ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap,
@@ -1498,7 +1499,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
 
 		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
 				DAX_ACCESS, &kaddr, NULL);
-		if (map_len == -EIO && iov_iter_rw(iter) == WRITE) {
+		if (map_len == -EHWPOISON && iov_iter_rw(iter) == WRITE) {
 			map_len = dax_direct_access(dax_dev, pgoff,
 					PHYS_PFN(size), DAX_RECOVERY_WRITE,
 					&kaddr, NULL);
@@ -1506,7 +1507,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
 				recovery = true;
 		}
 		if (map_len < 0) {
-			ret = map_len;
+			ret = dax_mem2blk_err(map_len);
 			break;
 		}
 
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 4d8d4f16c727..5f1be1da92ce 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -775,7 +775,8 @@ static int virtio_fs_zero_page_range(struct dax_device *dax_dev,
 	rc = dax_direct_access(dax_dev, pgoff, nr_pages, DAX_ACCESS, &kaddr,
 			       NULL);
 	if (rc < 0)
-		return rc;
+		return dax_mem2blk_err(rc);
+
 	memset(kaddr, 0, nr_pages << PAGE_SHIFT);
 	dax_flush(dax_dev, kaddr, nr_pages << PAGE_SHIFT);
 	return 0;
diff --git a/include/linux/dax.h b/include/linux/dax.h
index bf6258472e49..a4e97acf60f5 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -261,6 +261,11 @@ static inline bool dax_mapping(struct address_space *mapping)
 	return mapping->host && IS_DAX(mapping->host);
 }
 
+static inline int dax_mem2blk_err(int err)
+{
+	return (err == -EHWPOISON) ? -EIO : err;
+}
+
 #ifdef CONFIG_DEV_DAX_HMEM_DEVICES
 void hmem_register_resource(int target_nid, struct resource *r);
 #else
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1f79667824eb..e4c974587659 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3217,6 +3217,8 @@ static inline vm_fault_t vmf_error(int err)
 {
 	if (err == -ENOMEM)
 		return VM_FAULT_OOM;
+	else if (err == -EHWPOISON)
+		return VM_FAULT_HWPOISON;
 	return VM_FAULT_SIGBUS;
 }
 
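For context (not part of this diff), the fault path in fs/dax.c funnels the
errno from dax_direct_access() through vmf_error(), roughly as in the sketch
below; with the include/linux/mm.h hunk above, an -EHWPOISON from the dax
driver now surfaces as VM_FAULT_HWPOISON instead of VM_FAULT_SIGBUS.

/* Rough sketch of the existing conversion point in fs/dax.c. */
static vm_fault_t dax_fault_return(int error)
{
	if (error == 0)
		return VM_FAULT_NOPAGE;
	return vmf_error(error);	/* -EHWPOISON -> VM_FAULT_HWPOISON */
}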
When multiple processes mmap() a dax file and, at some point, one process
issues a 'load' and consumes a hwpoison, that process receives a SIGBUS with
si_code = BUS_MCEERR_AR and si_lsb set to the poison scope. Soon after, when
any other process issues a 'load' to the poisoned page (which
memory_failure() has unmapped on the kernel side), it receives a SIGBUS with
si_code = BUS_ADRERR and without a valid si_lsb.

This is confusing to the user, differs from a page fault due to poison in
regular RAM, and loses some helpful information.

Channel the dax backend driver's poison detection to the filesystem so that,
instead of reporting VM_FAULT_SIGBUS, it can report VM_FAULT_HWPOISON.

If user-level block I/O syscalls fail due to poison, the errno is converted
to EIO to maintain block API consistency.

Signed-off-by: Jane Chu <jane.chu@oracle.com>
---
 drivers/dax/super.c          |  5 ++++-
 drivers/nvdimm/pmem.c        |  2 +-
 drivers/s390/block/dcssblk.c |  3 ++-
 fs/dax.c                     | 11 ++++++-----
 fs/fuse/virtio_fs.c          |  3 ++-
 include/linux/dax.h          |  5 +++++
 include/linux/mm.h           |  2 ++
 7 files changed, 22 insertions(+), 9 deletions(-)
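With this change, a process that mmap()s a dax file can tell poison apart
from other mapping failures in its SIGBUS handler. A minimal userspace
sketch (illustrative only; fprintf() is not async-signal-safe and is used
here just for the demo):

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void sigbus_handler(int sig, siginfo_t *si, void *ctx)
{
	if (si->si_code == BUS_MCEERR_AR)
		/* Poison consumed: si_addr_lsb gives the poison scope. */
		fprintf(stderr, "hwpoison at %p, scope 2^%d bytes\n",
			si->si_addr, si->si_addr_lsb);
	else
		fprintf(stderr, "SIGBUS (si_code=%d) at %p\n",
			si->si_code, si->si_addr);
	_exit(1);
}

int main(void)
{
	struct sigaction sa = { 0 };

	sa.sa_sigaction = sigbus_handler;
	sa.sa_flags = SA_SIGINFO;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGBUS, &sa, NULL);

	/* ... mmap() a dax file and load from the mapping ... */
	return 0;
}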