Message ID | 1448066960-20119-3-git-send-email-vishal.l.verma@intel.com (mailing list archive) |
---|---|
State | Not Applicable, archived |
Headers | show |
Vishal Verma <vishal.l.verma@intel.com> writes: > NVDIMM devices, which can behave more like DRAM rather than block > devices, may develop bad cache lines, or 'poison'. A block device > exposed by the pmem driver can then consume poison via a read (or > write), and cause a machine check. On platforms without machine > check recovery features, this would mean a crash. > > The block device maintaining a runtime list of all known sectors that > have poison can directly avoid this, and also provide a path forward > to enable proper handling/recovery for DAX faults on such a device. > > Use the new badblock management interfaces to add a badblocks list to > gendisks. Because disk_alloc_badblocks can fail, you need to check for a NULL disk->bb in all of the utility functions you've defined. Cheers, Jeff > > Signed-off-by: Vishal Verma <vishal.l.verma@intel.com> > --- > block/genhd.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++ > include/linux/genhd.h | 6 +++++ > 2 files changed, 70 insertions(+) > > diff --git a/block/genhd.c b/block/genhd.c > index 0c706f3..4209c32 100644 > --- a/block/genhd.c > +++ b/block/genhd.c > @@ -20,6 +20,7 @@ > #include <linux/idr.h> > #include <linux/log2.h> > #include <linux/pm_runtime.h> > +#include <linux/badblocks.h> > > #include "blk.h" > > @@ -505,6 +506,20 @@ static int exact_lock(dev_t devt, void *data) > return 0; > } > > +static void disk_alloc_badblocks(struct gendisk *disk) > +{ > + disk->bb = kzalloc(sizeof(disk->bb), GFP_KERNEL); > + if (!disk->bb) { > + pr_warn("%s: failed to allocate space for badblocks\n", > + disk->disk_name); > + return; > + } > + > + if (badblocks_init(disk->bb, 1)) > + pr_warn("%s: failed to initialize badblocks\n", > + disk->disk_name); > +} > + > static void register_disk(struct gendisk *disk) > { > struct device *ddev = disk_to_dev(disk); > @@ -609,6 +624,7 @@ void add_disk(struct gendisk *disk) > disk->first_minor = MINOR(devt); > > disk_alloc_events(disk); > + disk_alloc_badblocks(disk); > 
> /* Register BDI before referencing it from bdev */ > bdi = &disk->queue->backing_dev_info; > @@ -657,6 +673,9 @@ void del_gendisk(struct gendisk *disk) > blk_unregister_queue(disk); > blk_unregister_region(disk_devt(disk), disk->minors); > > + badblocks_free(disk->bb); > + kfree(disk->bb); > + > part_stat_set_all(&disk->part0, 0); > disk->part0.stamp = 0; > > @@ -670,6 +689,48 @@ void del_gendisk(struct gendisk *disk) > } > EXPORT_SYMBOL(del_gendisk); > > +/* > + * The gendisk usage of badblocks does not track acknowledgements for > + * badblocks. We always assume they are acknowledged. > + */ > +int disk_check_badblocks(struct gendisk *disk, sector_t s, int sectors, > + sector_t *first_bad, int *bad_sectors) > +{ > + return badblocks_check(disk->bb, s, sectors, first_bad, bad_sectors); > +} > +EXPORT_SYMBOL(disk_check_badblocks); > + > +int disk_set_badblocks(struct gendisk *disk, sector_t s, int sectors) > +{ > + return badblocks_set(disk->bb, s, sectors, 1); > +} > +EXPORT_SYMBOL(disk_set_badblocks); > + > +int disk_clear_badblocks(struct gendisk *disk, sector_t s, int sectors) > +{ > + return badblocks_clear(disk->bb, s, sectors); > +} > +EXPORT_SYMBOL(disk_clear_badblocks); > + > +/* sysfs access to bad-blocks list. 
*/ > +static ssize_t disk_badblocks_show(struct device *dev, > + struct device_attribute *attr, > + char *page) > +{ > + struct gendisk *disk = dev_to_disk(dev); > + > + return badblocks_show(disk->bb, page, 0); > +} > + > +static ssize_t disk_badblocks_store(struct device *dev, > + struct device_attribute *attr, > + const char *page, size_t len) > +{ > + struct gendisk *disk = dev_to_disk(dev); > + > + return badblocks_store(disk->bb, page, len, 0); > +} > + > /** > * get_gendisk - get partitioning information for a given device > * @devt: device to get partitioning information for > @@ -988,6 +1049,8 @@ static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show, > static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); > static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); > static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); > +static DEVICE_ATTR(badblocks, S_IRUGO | S_IWUSR, disk_badblocks_show, > + disk_badblocks_store); > #ifdef CONFIG_FAIL_MAKE_REQUEST > static struct device_attribute dev_attr_fail = > __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); > @@ -1009,6 +1072,7 @@ static struct attribute *disk_attrs[] = { > &dev_attr_capability.attr, > &dev_attr_stat.attr, > &dev_attr_inflight.attr, > + &dev_attr_badblocks.attr, > #ifdef CONFIG_FAIL_MAKE_REQUEST > &dev_attr_fail.attr, > #endif > diff --git a/include/linux/genhd.h b/include/linux/genhd.h > index 2adbfa6..5563bde 100644 > --- a/include/linux/genhd.h > +++ b/include/linux/genhd.h > @@ -162,6 +162,7 @@ struct disk_part_tbl { > }; > > struct disk_events; > +struct badblocks; > > struct gendisk { > /* major, first_minor and minors are input parameters only, > @@ -201,6 +202,7 @@ struct gendisk { > struct blk_integrity *integrity; > #endif > int node_id; > + struct badblocks *bb; > }; > > static inline struct gendisk *part_to_disk(struct hd_struct *part) > @@ -421,6 +423,10 @@ extern void add_disk(struct gendisk *disk); > extern void 
del_gendisk(struct gendisk *gp); > extern struct gendisk *get_gendisk(dev_t dev, int *partno); > extern struct block_device *bdget_disk(struct gendisk *disk, int partno); > +extern int disk_check_badblocks(struct gendisk *disk, sector_t s, int sectors, > + sector_t *first_bad, int *bad_sectors); > +extern int disk_set_badblocks(struct gendisk *disk, sector_t s, int sectors); > +extern int disk_clear_badblocks(struct gendisk *disk, sector_t s, int sectors); > > extern void set_device_ro(struct block_device *bdev, int flag); > extern void set_disk_ro(struct gendisk *disk, int flag); -- To unsubscribe from this list: send the line "unsubscribe linux-scsi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, 2015-11-24 at 10:34 -0500, Jeff Moyer wrote: > Vishal Verma <vishal.l.verma@intel.com> writes: > > > NVDIMM devices, which can behave more like DRAM rather than block > > devices, may develop bad cache lines, or 'poison'. A block device > > exposed by the pmem driver can then consume poison via a read (or > > write), and cause a machine check. On platforms without machine > > check recovery features, this would mean a crash. > > > > The block device maintaining a runtime list of all known sectors > > that > > have poison can directly avoid this, and also provide a path forward > > to enable proper handling/recovery for DAX faults on such a device. > > > > Use the new badblock management interfaces to add a badblocks list > > to > > gendisks. > > Because disk_alloc_badblocks can fail, you need to check for a NULL > disk->bb in all of the utility functions you've defined. > Thanks, Jeff - I'll fix this. I have a handful of other fixes queued up too, will send out a v2 soon. -Vishal -- To unsubscribe from this list: send the line "unsubscribe linux-scsi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
"Verma, Vishal L" <vishal.l.verma@intel.com> writes: > On Tue, 2015-11-24 at 10:34 -0500, Jeff Moyer wrote: >> Vishal Verma <vishal.l.verma@intel.com> writes: >> >> > NVDIMM devices, which can behave more like DRAM rather than block >> > devices, may develop bad cache lines, or 'poison'. A block device >> > exposed by the pmem driver can then consume poison via a read (or >> > write), and cause a machine check. On platforms without machine >> > check recovery features, this would mean a crash. >> > >> > The block device maintaining a runtime list of all known sectors >> > that >> > have poison can directly avoid this, and also provide a path forward >> > to enable proper handling/recovery for DAX faults on such a device. >> > >> > Use the new badblock management interfaces to add a badblocks list >> > to >> > gendisks. >> >> Because disk_alloc_badblocks can fail, you need to check for a NULL >> disk->bb in all of the utility functions you've defined. >> > > Thanks, Jeff - I'll fix this. I have a handful of other fixes queued up > too, will send out a v2 soon. I'm not sure whether it makes sense to continue without badblock management for the RAID code. I was hoping Neil would comment on that. -Jeff -- To unsubscribe from this list: send the line "unsubscribe linux-scsi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, 2015-11-24 at 14:14 -0500, Jeff Moyer wrote: > > I'm not sure whether it makes sense to continue without badblock > management for the RAID code. I was hoping Neil would comment on > that. > > -Jeff Not sure I follow? I believe I've kept all the badblocks functionality RAID already had.. On a related note, something I observed when testing with md: md's badblock list is per-device, and can be seen at: /sys/block/md0/md/dev-pmem0/bad_blocks Now if we have badblocks in the gendisks too, there is also: /sys/block/pmem0/bad_blocks The two are separate 'accounts' maintained by separate drivers (md for the former, and pmem for the latter). This can potentially be confusing.. Should we consolidate the two, i.e. make md (re)use the gendisk badblocks for its purposes too? -Vishal
On Tue, Nov 24, 2015 at 12:10 PM, Verma, Vishal L <vishal.l.verma@intel.com> wrote: > On Tue, 2015-11-24 at 14:14 -0500, Jeff Moyer wrote: >> >> I'm not sure whether it makes sense to continue without badblock >> management for the RAID code. I was hoping Neil would comment on >> that. >> >> -Jeff > > Not sure I follow? I believe I've kept all the badblocks functionality > RAID already had.. > > > On a related note, something I observed when testing with md: > > md's badblock list is per-device, and can be seen at: > /sys/block/md0/md/dev-pmem0/bad_blocks > > Now if we have badblocks in the gendisks too, there is also: > /sys/block/pmem0/bad_blocks > > The two are separate 'accounts' maintained by separate drivers (md for > the former, and pmem for the latter). This can potentially be > confusing.. > > Should we consolidate the two, i.e. make md (re)use the gendisk > badblocks for its purposes too? If we get agreement that tracking bad blocks at the gendisk layer is useful for more than just nvdimms then yes, I think it makes sense to move md bad_blocks to the gendisk layer. That is, provided we can add a symlink to make the move transparent to existing md userspace tooling. The use cases I can envision being useful for other disks are: 1/ Bypassing / avoiding known bad blocks on disks / drivers that inject long completion latency for error handling. 2/ Simulating bad blocks for disks that can't store them locally. Similar to the md use case, if one encounters errors restoring a disk image from a backup it's useful to have the option to record the error in a bad block list and keep going. -- To unsubscribe from this list: send the line "unsubscribe linux-scsi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
"Verma, Vishal L" <vishal.l.verma@intel.com> writes: > On Tue, 2015-11-24 at 14:14 -0500, Jeff Moyer wrote: >> >> I'm not sure whether it makes sense to continue without badblock >> management for the RAID code. I was hoping Neil would comment on >> that. >> >> -Jeff > > Not sure I follow? I believe I've kept all the badblocks functionality > RAID already had.. What I mean to say is that the RAID code had previously embedded the badblocks structure in one of its other data structures. As a result, you would never get an allocation failure for it. > On a related note, something I observed when testing with md: > > md's badblock list is per-device, and can be seen at: > /sys/block/md0/md/dev-pmem0/bad_blocks > > Now if we have badblocks in the gendisks too, there is also: > /sys/block/pmem0/bad_blocks > > The two are separate 'accounts' maintained by separate drivers (md for > the former, and pmem for the latter). This can potentially be > confusing.. > > Should we consolidate the two, i.e. make md (re)use the gendisk > badblocks for its purposes too? I agree with what Dan said. Cheers, Jeff -- To unsubscribe from this list: send the line "unsubscribe linux-scsi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, 2015-11-25 at 10:37 -0500, Jeff Moyer wrote: > "Verma, Vishal L" <vishal.l.verma@intel.com> writes: > > > On Tue, 2015-11-24 at 14:14 -0500, Jeff Moyer wrote: > > > > > > I'm not sure whether it makes sense to continue without badblock > > > management for the RAID code. I was hoping Neil would comment on > > > that. > > > > > > -Jeff > > > > Not sure I follow? I believe I've kept all the badblocks > > functionality > > RAID already had.. > > What I mean to say is that the RAID code had previously embedded the > badblocks structure in one of its other data structures. As a result, > you would never get an allocation failure for it. > Ah I see - I don't think that has effectively changed. 'rdev' still contains a statically allocated badblocks structure (as opposed to gendisk, which just stores a pointer). md used to dynamically allocate the storage space inside badblocks (bb->page), and that is still the case using badblocks_init. -Vishal
"Verma, Vishal L" <vishal.l.verma@intel.com> writes: > On Wed, 2015-11-25 at 10:37 -0500, Jeff Moyer wrote: >> "Verma, Vishal L" <vishal.l.verma@intel.com> writes: >> >> > On Tue, 2015-11-24 at 14:14 -0500, Jeff Moyer wrote: >> > > >> > > I'm not sure whether it makes sense to continue without badblock >> > > management for the RAID code. I was hoping Neil would comment on >> > > that. >> > > >> > > -Jeff >> > >> > Not sure I follow? I believe I've kept all the badblocks >> > functionality >> > RAID already had.. >> >> What I mean to say is that the RAID code had previously embedded the >> badblocks structure in one of its other data structures. As a result, >> you would never get an allocation failure for it. >> > Ah I see - I don't think that has effectively changed. 'rdev' still > contains a statically allocated badblocks structure (as opposed to > gendisk, which just stores a pointer). md used to dynamically allocate > the storage space inside badblocks (bb->page), and that is still the > case using badblocks_init. Ah, ok. Sorry I didn't dig into that further. Thanks, Jeff -- To unsubscribe from this list: send the line "unsubscribe linux-scsi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/block/genhd.c b/block/genhd.c index 0c706f3..4209c32 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -20,6 +20,7 @@ #include <linux/idr.h> #include <linux/log2.h> #include <linux/pm_runtime.h> +#include <linux/badblocks.h> #include "blk.h" @@ -505,6 +506,20 @@ static int exact_lock(dev_t devt, void *data) return 0; } +static void disk_alloc_badblocks(struct gendisk *disk) +{ + disk->bb = kzalloc(sizeof(disk->bb), GFP_KERNEL); + if (!disk->bb) { + pr_warn("%s: failed to allocate space for badblocks\n", + disk->disk_name); + return; + } + + if (badblocks_init(disk->bb, 1)) + pr_warn("%s: failed to initialize badblocks\n", + disk->disk_name); +} + static void register_disk(struct gendisk *disk) { struct device *ddev = disk_to_dev(disk); @@ -609,6 +624,7 @@ void add_disk(struct gendisk *disk) disk->first_minor = MINOR(devt); disk_alloc_events(disk); + disk_alloc_badblocks(disk); /* Register BDI before referencing it from bdev */ bdi = &disk->queue->backing_dev_info; @@ -657,6 +673,9 @@ void del_gendisk(struct gendisk *disk) blk_unregister_queue(disk); blk_unregister_region(disk_devt(disk), disk->minors); + badblocks_free(disk->bb); + kfree(disk->bb); + part_stat_set_all(&disk->part0, 0); disk->part0.stamp = 0; @@ -670,6 +689,48 @@ void del_gendisk(struct gendisk *disk) } EXPORT_SYMBOL(del_gendisk); +/* + * The gendisk usage of badblocks does not track acknowledgements for + * badblocks. We always assume they are acknowledged. 
+ */ +int disk_check_badblocks(struct gendisk *disk, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) +{ + return badblocks_check(disk->bb, s, sectors, first_bad, bad_sectors); +} +EXPORT_SYMBOL(disk_check_badblocks); + +int disk_set_badblocks(struct gendisk *disk, sector_t s, int sectors) +{ + return badblocks_set(disk->bb, s, sectors, 1); +} +EXPORT_SYMBOL(disk_set_badblocks); + +int disk_clear_badblocks(struct gendisk *disk, sector_t s, int sectors) +{ + return badblocks_clear(disk->bb, s, sectors); +} +EXPORT_SYMBOL(disk_clear_badblocks); + +/* sysfs access to bad-blocks list. */ +static ssize_t disk_badblocks_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct gendisk *disk = dev_to_disk(dev); + + return badblocks_show(disk->bb, page, 0); +} + +static ssize_t disk_badblocks_store(struct device *dev, + struct device_attribute *attr, + const char *page, size_t len) +{ + struct gendisk *disk = dev_to_disk(dev); + + return badblocks_store(disk->bb, page, len, 0); +} + /** * get_gendisk - get partitioning information for a given device * @devt: device to get partitioning information for @@ -988,6 +1049,8 @@ static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show, static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); +static DEVICE_ATTR(badblocks, S_IRUGO | S_IWUSR, disk_badblocks_show, + disk_badblocks_store); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -1009,6 +1072,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_capability.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, + &dev_attr_badblocks.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 2adbfa6..5563bde 
100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -162,6 +162,7 @@ struct disk_part_tbl { }; struct disk_events; +struct badblocks; struct gendisk { /* major, first_minor and minors are input parameters only, @@ -201,6 +202,7 @@ struct gendisk { struct blk_integrity *integrity; #endif int node_id; + struct badblocks *bb; }; static inline struct gendisk *part_to_disk(struct hd_struct *part) @@ -421,6 +423,10 @@ extern void add_disk(struct gendisk *disk); extern void del_gendisk(struct gendisk *gp); extern struct gendisk *get_gendisk(dev_t dev, int *partno); extern struct block_device *bdget_disk(struct gendisk *disk, int partno); +extern int disk_check_badblocks(struct gendisk *disk, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors); +extern int disk_set_badblocks(struct gendisk *disk, sector_t s, int sectors); +extern int disk_clear_badblocks(struct gendisk *disk, sector_t s, int sectors); extern void set_device_ro(struct block_device *bdev, int flag); extern void set_disk_ro(struct gendisk *disk, int flag);
NVDIMM devices, which can behave more like DRAM rather than block devices, may develop bad cache lines, or 'poison'. A block device exposed by the pmem driver can then consume poison via a read (or write), and cause a machine check. On platforms without machine check recovery features, this would mean a crash. The block device maintaining a runtime list of all known sectors that have poison can directly avoid this, and also provide a path forward to enable proper handling/recovery for DAX faults on such a device. Use the new badblock management interfaces to add a badblocks list to gendisks. Signed-off-by: Vishal Verma <vishal.l.verma@intel.com> --- block/genhd.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/genhd.h | 6 +++++ 2 files changed, 70 insertions(+)