Message ID | 20220128213150.1333552-6-jane.chu@oracle.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | DAX poison recovery | expand |
> @@ -257,10 +263,15 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, > __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, > long nr_pages, void **kaddr, pfn_t *pfn) > { > + bool bad_pmem; > + bool do_recovery = false; > resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset; > > - if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, > - PFN_PHYS(nr_pages)))) > + bad_pmem = is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, > + PFN_PHYS(nr_pages)); > + if (bad_pmem && kaddr) > + do_recovery = dax_recovery_started(pmem->dax_dev, kaddr); > + if (bad_pmem && !do_recovery) > return -EIO; I find the passing of the recovery flag through the address very cumbersome. I remember there was some kind of discussion, but this looks pretty ugly. Also no need for the bad_pmem variable: if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, PFN_PHYS(nr_pages)) && (!kaddr | !dax_recovery_started(pmem->dax_dev, kaddr))) return -EIO; Also: the !kaddr check could go into dax_recovery_started. That way even if we stick with the overloading kaddr could also be used just for the flag if needed.
On 2/2/2022 5:43 AM, Christoph Hellwig wrote: >> @@ -257,10 +263,15 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, >> __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, >> long nr_pages, void **kaddr, pfn_t *pfn) >> { >> + bool bad_pmem; >> + bool do_recovery = false; >> resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset; >> >> - if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, >> - PFN_PHYS(nr_pages)))) >> + bad_pmem = is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, >> + PFN_PHYS(nr_pages)); >> + if (bad_pmem && kaddr) >> + do_recovery = dax_recovery_started(pmem->dax_dev, kaddr); >> + if (bad_pmem && !do_recovery) >> return -EIO; > > I find the passing of the recovery flag through the address very > cumbersome. I remember there was some kind of discussion, but this looks > pretty ugly. > > Also no need for the bad_pmem variable: > > if (is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, PFN_PHYS(nr_pages)) && > (!kaddr | !dax_recovery_started(pmem->dax_dev, kaddr))) > return -EIO; Okay. > > Also: the !kaddr check could go into dax_recovery_started. That way > even if we stick with the overloading kaddr could also be used just for > the flag if needed. The !kaddr check is in dax_recovery_started(), and that reminded me the check should be in dax_prep_recovery() too. Thanks! -jane
On Fri, Jan 28, 2022 at 1:32 PM Jane Chu <jane.chu@oracle.com> wrote: > > pmem_recovery_write() consists of clearing poison via DSM, > clearing page HWPoison bit, re-state _PAGE_PRESENT bit, > cacheflush, write, and finally clearing bad-block record. > > A competing pread thread is held off during recovery write > by the presence of bad-block record. A competing recovery_write > thread is serialized by a lock. > > Signed-off-by: Jane Chu <jane.chu@oracle.com> > --- > drivers/nvdimm/pmem.c | 82 +++++++++++++++++++++++++++++++++++++++---- > drivers/nvdimm/pmem.h | 1 + > 2 files changed, 77 insertions(+), 6 deletions(-) > > diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c > index 638e64681db9..f2d6b34d48de 100644 > --- a/drivers/nvdimm/pmem.c > +++ b/drivers/nvdimm/pmem.c > @@ -69,6 +69,14 @@ static void hwpoison_clear(struct pmem_device *pmem, > } > } > > +static void pmem_clear_badblocks(struct pmem_device *pmem, sector_t sector, > + long cleared_blks) > +{ > + badblocks_clear(&pmem->bb, sector, cleared_blks); > + if (pmem->bb_state) > + sysfs_notify_dirent(pmem->bb_state); > +} > + > static blk_status_t pmem_clear_poison(struct pmem_device *pmem, > phys_addr_t offset, unsigned int len) > { > @@ -88,9 +96,7 @@ static blk_status_t pmem_clear_poison(struct pmem_device *pmem, > dev_dbg(dev, "%#llx clear %ld sector%s\n", > (unsigned long long) sector, cleared, > cleared > 1 ? "s" : ""); > - badblocks_clear(&pmem->bb, sector, cleared); > - if (pmem->bb_state) > - sysfs_notify_dirent(pmem->bb_state); > + pmem_clear_badblocks(pmem, sector, cleared); > } > > arch_invalidate_pmem(pmem->virt_addr + offset, len); > @@ -257,10 +263,15 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, > __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, > long nr_pages, void **kaddr, pfn_t *pfn) > { > + bool bad_pmem; > + bool do_recovery = false; > resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset; > > - if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, > - PFN_PHYS(nr_pages)))) > + bad_pmem = is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, > + PFN_PHYS(nr_pages)); > + if (bad_pmem && kaddr) > + do_recovery = dax_recovery_started(pmem->dax_dev, kaddr); > + if (bad_pmem && !do_recovery) > return -EIO; > > if (kaddr) > @@ -301,10 +312,68 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev, > return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn); > } > > +/* > + * The recovery write thread started out as a normal pwrite thread and > + * when the filesystem was told about potential media error in the > + * range, filesystem turns the normal pwrite to a dax_recovery_write. > + * > + * The recovery write consists of clearing poison via DSM, clearing page > + * HWPoison bit, reenable page-wide read-write permission, flush the > + * caches and finally write. A competing pread thread needs to be held > + * off during the recovery process since data read back might not be valid. > + * And that's achieved by placing the badblock records clearing after > + * the completion of the recovery write. > + * > + * Any competing recovery write thread needs to be serialized, and this is > + * done via pmem device level lock .recovery_lock. > + */ > static size_t pmem_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, > void *addr, size_t bytes, struct iov_iter *i) > { > - return 0; > + size_t rc, len, off; > + phys_addr_t pmem_off; > + struct pmem_device *pmem = dax_get_private(dax_dev); > + struct device *dev = pmem->bb.dev; > + sector_t sector; > + long cleared, cleared_blk; > + > + mutex_lock(&pmem->recovery_lock); > + > + /* If no poison found in the range, go ahead with write */ > + off = (unsigned long)addr & ~PAGE_MASK; > + len = PFN_PHYS(PFN_UP(off + bytes)); > + if (!is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) { > + rc = _copy_from_iter_flushcache(addr, bytes, i); > + goto write_done; > + } is_bad_pmem() takes a seqlock so it should be safe to move the recovery_lock below this point. > + > + /* Not page-aligned range cannot be recovered */ > + if (off || !(PAGE_ALIGNED(bytes))) { > + dev_warn(dev, "Found poison, but addr(%p) or bytes(%#lx) not page aligned\n", > + addr, bytes); fs/dax.c will prevent this from happening, right? It seems like an upper layer bug if we get this far into the recovery process without checking if a full page is being overwritten. > + rc = (size_t) -EIO; > + goto write_done; > + } > + > + pmem_off = PFN_PHYS(pgoff) + pmem->data_offset; > + sector = (pmem_off - pmem->data_offset) / 512; > + cleared = nvdimm_clear_poison(dev, pmem->phys_addr + pmem_off, len); > + cleared_blk = cleared / 512; > + if (cleared_blk > 0) { > + hwpoison_clear(pmem, pmem->phys_addr + pmem_off, cleared); > + } else { > + dev_warn(dev, "pmem_recovery_write: cleared_blk: %ld\n", > + cleared_blk); > + rc = (size_t) -EIO; > + goto write_done; > + } > + arch_invalidate_pmem(pmem->virt_addr + pmem_off, bytes); > + rc = _copy_from_iter_flushcache(addr, bytes, i); > + pmem_clear_badblocks(pmem, sector, cleared_blk); This duplicates pmem_clear_poison() can more code be shared between the 2 functions? > + > +write_done: > + mutex_unlock(&pmem->recovery_lock); > + return rc; > } > > static const struct dax_operations pmem_dax_ops = { > @@ -495,6 +564,7 @@ static int pmem_attach_disk(struct device *dev, > goto out_cleanup_dax; > dax_write_cache(dax_dev, nvdimm_has_cache(nd_region)); > set_dax_recovery(dax_dev); > + mutex_init(&pmem->recovery_lock); > pmem->dax_dev = dax_dev; > > rc = device_add_disk(dev, disk, pmem_attribute_groups); > diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h > index 59cfe13ea8a8..971bff7552d6 100644 > --- a/drivers/nvdimm/pmem.h > +++ b/drivers/nvdimm/pmem.h > @@ -24,6 +24,7 @@ struct pmem_device { > struct dax_device *dax_dev; > struct gendisk *disk; > struct dev_pagemap pgmap; > + struct mutex recovery_lock; > }; > > long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, > -- > 2.18.4 >
On 2/3/2022 10:21 PM, Dan Williams wrote: > On Fri, Jan 28, 2022 at 1:32 PM Jane Chu <jane.chu@oracle.com> wrote: >> >> pmem_recovery_write() consists of clearing poison via DSM, >> clearing page HWPoison bit, re-state _PAGE_PRESENT bit, >> cacheflush, write, and finally clearing bad-block record. >> >> A competing pread thread is held off during recovery write >> by the presence of bad-block record. A competing recovery_write >> thread is serialized by a lock. >> >> Signed-off-by: Jane Chu <jane.chu@oracle.com> >> --- >> drivers/nvdimm/pmem.c | 82 +++++++++++++++++++++++++++++++++++++++---- >> drivers/nvdimm/pmem.h | 1 + >> 2 files changed, 77 insertions(+), 6 deletions(-) >> >> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c >> index 638e64681db9..f2d6b34d48de 100644 >> --- a/drivers/nvdimm/pmem.c >> +++ b/drivers/nvdimm/pmem.c >> @@ -69,6 +69,14 @@ static void hwpoison_clear(struct pmem_device *pmem, >> } >> } >> >> +static void pmem_clear_badblocks(struct pmem_device *pmem, sector_t sector, >> + long cleared_blks) >> +{ >> + badblocks_clear(&pmem->bb, sector, cleared_blks); >> + if (pmem->bb_state) >> + sysfs_notify_dirent(pmem->bb_state); >> +} >> + >> static blk_status_t pmem_clear_poison(struct pmem_device *pmem, >> phys_addr_t offset, unsigned int len) >> { >> @@ -88,9 +96,7 @@ static blk_status_t pmem_clear_poison(struct pmem_device *pmem, >> dev_dbg(dev, "%#llx clear %ld sector%s\n", >> (unsigned long long) sector, cleared, >> cleared > 1 ? "s" : ""); >> - badblocks_clear(&pmem->bb, sector, cleared); >> - if (pmem->bb_state) >> - sysfs_notify_dirent(pmem->bb_state); >> + pmem_clear_badblocks(pmem, sector, cleared); >> } >> >> arch_invalidate_pmem(pmem->virt_addr + offset, len); >> @@ -257,10 +263,15 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, >> __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, >> long nr_pages, void **kaddr, pfn_t *pfn) >> { >> + bool bad_pmem; >> + bool do_recovery = false; >> resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset; >> >> - if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, >> - PFN_PHYS(nr_pages)))) >> + bad_pmem = is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, >> + PFN_PHYS(nr_pages)); >> + if (bad_pmem && kaddr) >> + do_recovery = dax_recovery_started(pmem->dax_dev, kaddr); >> + if (bad_pmem && !do_recovery) >> return -EIO; >> >> if (kaddr) >> @@ -301,10 +312,68 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev, >> return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn); >> } >> >> +/* >> + * The recovery write thread started out as a normal pwrite thread and >> + * when the filesystem was told about potential media error in the >> + * range, filesystem turns the normal pwrite to a dax_recovery_write. >> + * >> + * The recovery write consists of clearing poison via DSM, clearing page >> + * HWPoison bit, reenable page-wide read-write permission, flush the >> + * caches and finally write. A competing pread thread needs to be held >> + * off during the recovery process since data read back might not be valid. >> + * And that's achieved by placing the badblock records clearing after >> + * the completion of the recovery write. >> + * >> + * Any competing recovery write thread needs to be serialized, and this is >> + * done via pmem device level lock .recovery_lock. >> + */ >> static size_t pmem_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, >> void *addr, size_t bytes, struct iov_iter *i) >> { >> - return 0; >> + size_t rc, len, off; >> + phys_addr_t pmem_off; >> + struct pmem_device *pmem = dax_get_private(dax_dev); >> + struct device *dev = pmem->bb.dev; >> + sector_t sector; >> + long cleared, cleared_blk; >> + >> + mutex_lock(&pmem->recovery_lock); >> + >> + /* If no poison found in the range, go ahead with write */ >> + off = (unsigned long)addr & ~PAGE_MASK; >> + len = PFN_PHYS(PFN_UP(off + bytes)); >> + if (!is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) { >> + rc = _copy_from_iter_flushcache(addr, bytes, i); >> + goto write_done; >> + } > > is_bad_pmem() takes a seqlock so it should be safe to move the > recovery_lock below this point. Okay, thanks! > >> + >> + /* Not page-aligned range cannot be recovered */ >> + if (off || !(PAGE_ALIGNED(bytes))) { >> + dev_warn(dev, "Found poison, but addr(%p) or bytes(%#lx) not page aligned\n", >> + addr, bytes); > > fs/dax.c will prevent this from happening, right? It seems like an > upper layer bug if we get this far into the recovery process without > checking if a full page is being overwritten. Yes, at the start of each dax_iomap_iter, the buffer is page aligned. However, the underlying dax_copy_from_iter is allowed to return partial results, causing the subsequent 'while' loop within dax_iomap_iter to continue at not page aligned address. I ran into the situation when I played around dax_copy_from_iter, not sure in reality, partial result is legitimate, just thought to issue a warning should the situation happen. > >> + rc = (size_t) -EIO; >> + goto write_done; >> + } >> + >> + pmem_off = PFN_PHYS(pgoff) + pmem->data_offset; >> + sector = (pmem_off - pmem->data_offset) / 512; >> + cleared = nvdimm_clear_poison(dev, pmem->phys_addr + pmem_off, len); >> + cleared_blk = cleared / 512; >> + if (cleared_blk > 0) { >> + hwpoison_clear(pmem, pmem->phys_addr + pmem_off, cleared); >> + } else { >> + dev_warn(dev, "pmem_recovery_write: cleared_blk: %ld\n", >> + cleared_blk); >> + rc = (size_t) -EIO; >> + goto write_done; >> + } >> + arch_invalidate_pmem(pmem->virt_addr + pmem_off, bytes); >> + rc = _copy_from_iter_flushcache(addr, bytes, i); >> + pmem_clear_badblocks(pmem, sector, cleared_blk); > > This duplicates pmem_clear_poison() can more code be shared between > the 2 functions? I'll look into refactoring pmem_clear_poison(). > > >> + >> +write_done: >> + mutex_unlock(&pmem->recovery_lock); >> + return rc; >> } >> >> static const struct dax_operations pmem_dax_ops = { >> @@ -495,6 +564,7 @@ static int pmem_attach_disk(struct device *dev, >> goto out_cleanup_dax; >> dax_write_cache(dax_dev, nvdimm_has_cache(nd_region)); >> set_dax_recovery(dax_dev); >> + mutex_init(&pmem->recovery_lock); >> pmem->dax_dev = dax_dev; >> >> rc = device_add_disk(dev, disk, pmem_attribute_groups); >> diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h >> index 59cfe13ea8a8..971bff7552d6 100644 >> --- a/drivers/nvdimm/pmem.h >> +++ b/drivers/nvdimm/pmem.h >> @@ -24,6 +24,7 @@ struct pmem_device { >> struct dax_device *dax_dev; >> struct gendisk *disk; >> struct dev_pagemap pgmap; >> + struct mutex recovery_lock; >> }; >> >> long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, >> -- >> 2.18.4 >> thanks! -jane
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 638e64681db9..f2d6b34d48de 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -69,6 +69,14 @@ static void hwpoison_clear(struct pmem_device *pmem, } } +static void pmem_clear_badblocks(struct pmem_device *pmem, sector_t sector, + long cleared_blks) +{ + badblocks_clear(&pmem->bb, sector, cleared_blks); + if (pmem->bb_state) + sysfs_notify_dirent(pmem->bb_state); +} + static blk_status_t pmem_clear_poison(struct pmem_device *pmem, phys_addr_t offset, unsigned int len) { @@ -88,9 +96,7 @@ static blk_status_t pmem_clear_poison(struct pmem_device *pmem, dev_dbg(dev, "%#llx clear %ld sector%s\n", (unsigned long long) sector, cleared, cleared > 1 ? "s" : ""); - badblocks_clear(&pmem->bb, sector, cleared); - if (pmem->bb_state) - sysfs_notify_dirent(pmem->bb_state); + pmem_clear_badblocks(pmem, sector, cleared); } arch_invalidate_pmem(pmem->virt_addr + offset, len); @@ -257,10 +263,15 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn) { + bool bad_pmem; + bool do_recovery = false; resource_size_t offset = PFN_PHYS(pgoff) + pmem->data_offset; - if (unlikely(is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, - PFN_PHYS(nr_pages)))) + bad_pmem = is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, + PFN_PHYS(nr_pages)); + if (bad_pmem && kaddr) + do_recovery = dax_recovery_started(pmem->dax_dev, kaddr); + if (bad_pmem && !do_recovery) return -EIO; if (kaddr) @@ -301,10 +312,68 @@ static long pmem_dax_direct_access(struct dax_device *dax_dev, return __pmem_direct_access(pmem, pgoff, nr_pages, kaddr, pfn); } +/* + * The recovery write thread started out as a normal pwrite thread and + * when the filesystem was told about potential media error in the + * range, filesystem turns the normal pwrite to a dax_recovery_write. + * + * The recovery write consists of clearing poison via DSM, clearing page + * HWPoison bit, reenable page-wide read-write permission, flush the + * caches and finally write. A competing pread thread needs to be held + * off during the recovery process since data read back might not be valid. + * And that's achieved by placing the badblock records clearing after + * the completion of the recovery write. + * + * Any competing recovery write thread needs to be serialized, and this is + * done via pmem device level lock .recovery_lock. + */ static size_t pmem_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i) { - return 0; + size_t rc, len, off; + phys_addr_t pmem_off; + struct pmem_device *pmem = dax_get_private(dax_dev); + struct device *dev = pmem->bb.dev; + sector_t sector; + long cleared, cleared_blk; + + mutex_lock(&pmem->recovery_lock); + + /* If no poison found in the range, go ahead with write */ + off = (unsigned long)addr & ~PAGE_MASK; + len = PFN_PHYS(PFN_UP(off + bytes)); + if (!is_bad_pmem(&pmem->bb, PFN_PHYS(pgoff) / 512, len)) { + rc = _copy_from_iter_flushcache(addr, bytes, i); + goto write_done; + } + + /* Not page-aligned range cannot be recovered */ + if (off || !(PAGE_ALIGNED(bytes))) { + dev_warn(dev, "Found poison, but addr(%p) or bytes(%#lx) not page aligned\n", + addr, bytes); + rc = (size_t) -EIO; + goto write_done; + } + + pmem_off = PFN_PHYS(pgoff) + pmem->data_offset; + sector = (pmem_off - pmem->data_offset) / 512; + cleared = nvdimm_clear_poison(dev, pmem->phys_addr + pmem_off, len); + cleared_blk = cleared / 512; + if (cleared_blk > 0) { + hwpoison_clear(pmem, pmem->phys_addr + pmem_off, cleared); + } else { + dev_warn(dev, "pmem_recovery_write: cleared_blk: %ld\n", + cleared_blk); + rc = (size_t) -EIO; + goto write_done; + } + arch_invalidate_pmem(pmem->virt_addr + pmem_off, bytes); + rc = _copy_from_iter_flushcache(addr, bytes, i); + pmem_clear_badblocks(pmem, sector, cleared_blk); + +write_done: + mutex_unlock(&pmem->recovery_lock); + return rc; } static const struct dax_operations pmem_dax_ops = { @@ -495,6 +564,7 @@ static int pmem_attach_disk(struct device *dev, goto out_cleanup_dax; dax_write_cache(dax_dev, nvdimm_has_cache(nd_region)); set_dax_recovery(dax_dev); + mutex_init(&pmem->recovery_lock); pmem->dax_dev = dax_dev; rc = device_add_disk(dev, disk, pmem_attribute_groups); diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h index 59cfe13ea8a8..971bff7552d6 100644 --- a/drivers/nvdimm/pmem.h +++ b/drivers/nvdimm/pmem.h @@ -24,6 +24,7 @@ struct pmem_device { struct dax_device *dax_dev; struct gendisk *disk; struct dev_pagemap pgmap; + struct mutex recovery_lock; }; long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff,
pmem_recovery_write() consists of clearing poison via DSM, clearing page HWPoison bit, re-state _PAGE_PRESENT bit, cacheflush, write, and finally clearing bad-block record. A competing pread thread is held off during recovery write by the presence of bad-block record. A competing recovery_write thread is serialized by a lock. Signed-off-by: Jane Chu <jane.chu@oracle.com> --- drivers/nvdimm/pmem.c | 82 +++++++++++++++++++++++++++++++++++++++---- drivers/nvdimm/pmem.h | 1 + 2 files changed, 77 insertions(+), 6 deletions(-)