Message ID | 20230120034622.2698268-2-jiaqiyan@google.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | Introduce per NUMA node memory error statistics | expand |
On Fri, Jan 20, 2023 at 03:46:20AM +0000, Jiaqi Yan wrote: > Today the kernel provides the following memory error info to userspace, but each > has its own disadvantage: > * HardwareCorrupted in /proc/meminfo: number of bytes poisoned in total, > not per NUMA node stats though > * ras:memory_failure_event: only available after being explicitly enabled > * /dev/mcelog provides much useful info about the MCEs, but > doesn't capture how memory_failure recovered memory MCEs > * kernel logs: userspace needs to process log text > > Expose per NUMA node memory error stats as sysfs entries: > > /sys/devices/system/node/node${X}/memory_failure/total > /sys/devices/system/node/node${X}/memory_failure/recovered > /sys/devices/system/node/node${X}/memory_failure/ignored > /sys/devices/system/node/node${X}/memory_failure/failed > /sys/devices/system/node/node${X}/memory_failure/delayed > > These counters describe how many raw pages are poisoned and, after the > attempted recoveries by the kernel, their resolutions: how many are > recovered, ignored, failed, or delayed respectively. The following > math holds for the statistics: > * total = recovered + ignored + failed + delayed > > Acked-by: David Rientjes <rientjes@google.com> > Signed-off-by: Jiaqi Yan <jiaqiyan@google.com> Looks good to me, thank you. Acked-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
On 2023/1/20 11:46, Jiaqi Yan wrote: > Today the kernel provides the following memory error info to userspace, but each > has its own disadvantage: > * HardwareCorrupted in /proc/meminfo: number of bytes poisoned in total, > not per NUMA node stats though > * ras:memory_failure_event: only available after being explicitly enabled > * /dev/mcelog provides much useful info about the MCEs, but > doesn't capture how memory_failure recovered memory MCEs > * kernel logs: userspace needs to process log text > > Expose per NUMA node memory error stats as sysfs entries: > > /sys/devices/system/node/node${X}/memory_failure/total > /sys/devices/system/node/node${X}/memory_failure/recovered > /sys/devices/system/node/node${X}/memory_failure/ignored > /sys/devices/system/node/node${X}/memory_failure/failed > /sys/devices/system/node/node${X}/memory_failure/delayed > > These counters describe how many raw pages are poisoned and, after the > attempted recoveries by the kernel, their resolutions: how many are > recovered, ignored, failed, or delayed respectively. 
The following > math holds for the statistics: > * total = recovered + ignored + failed + delayed > > Acked-by: David Rientjes <rientjes@google.com> > Signed-off-by: Jiaqi Yan <jiaqiyan@google.com> > --- > drivers/base/node.c | 3 +++ > include/linux/mm.h | 5 +++++ > include/linux/mmzone.h | 28 ++++++++++++++++++++++++++++ > mm/memory-failure.c | 35 +++++++++++++++++++++++++++++++++++ > 4 files changed, 71 insertions(+) > > diff --git a/drivers/base/node.c b/drivers/base/node.c > index faf3597a96da..b46db17124f3 100644 > --- a/drivers/base/node.c > +++ b/drivers/base/node.c > @@ -586,6 +586,9 @@ static const struct attribute_group *node_dev_groups[] = { > &node_dev_group, > #ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP > &arch_node_dev_group, > +#endif > +#ifdef CONFIG_MEMORY_FAILURE > + &memory_failure_attr_group, > #endif > NULL > }; > diff --git a/include/linux/mm.h b/include/linux/mm.h > index f3f196e4d66d..888576884eb9 100644 > --- a/include/linux/mm.h > +++ b/include/linux/mm.h > @@ -3521,6 +3521,11 @@ enum mf_action_page_type { > MF_MSG_UNKNOWN, > }; > > +/* > + * Sysfs entries for memory failure handling statistics. > + */ > +extern const struct attribute_group memory_failure_attr_group; > + This should move under CONFIG_MEMORY_FAILURE > #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) > extern void clear_huge_page(struct page *page, > unsigned long addr_hint, > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > index cd28a100d9e4..2c537b31fa7b 100644 > --- a/include/linux/mmzone.h > +++ b/include/linux/mmzone.h > @@ -1110,6 +1110,31 @@ struct deferred_split { > }; > #endif > > +#ifdef CONFIG_MEMORY_FAILURE > +/* > + * Per NUMA node memory failure handling statistics. > + */ > +struct memory_failure_stats { > + /* > + * Number of raw pages poisoned. 
> + * Cases not accounted: memory outside kernel control, offline page, > + * arch-specific memory_failure (SGX), hwpoison_filter() filtered > + * error events, and unpoison actions from hwpoison_unpoison. > + */ > + unsigned long total; > + /* > + * Recovery results of poisoned raw pages handled by memory_failure, > + * in sync with mf_result. > + * total = ignored + failed + delayed + recovered. > + * total * PAGE_SIZE * #nodes = /proc/meminfo/HardwareCorrupted. > + */ > + unsigned long ignored; > + unsigned long failed; > + unsigned long delayed; > + unsigned long recovered; > +}; > +#endif > + > /* > * On NUMA machines, each NUMA node would have a pg_data_t to describe > * it's memory layout. On UMA machines there is a single pglist_data which > @@ -1253,6 +1278,9 @@ typedef struct pglist_data { > #ifdef CONFIG_NUMA > struct memory_tier __rcu *memtier; > #endif > +#ifdef CONFIG_MEMORY_FAILURE > + struct memory_failure_stats mf_stats; > +#endif > } pg_data_t; > > #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) > diff --git a/mm/memory-failure.c b/mm/memory-failure.c > index c77a9e37e27e..c628f1db3a4d 100644 > --- a/mm/memory-failure.c > +++ b/mm/memory-failure.c > @@ -87,6 +87,41 @@ inline void num_poisoned_pages_sub(unsigned long pfn, long i) > memblk_nr_poison_sub(pfn, i); > } > > +/** > + * MF_ATTR_RO - Create sysfs entry for each memory failure statistics. > + * @_name: name of the file in the per NUMA sysfs directory. 
> + */ > +#define MF_ATTR_RO(_name) \ > +static ssize_t _name##_show(struct device *dev, \ > + struct device_attribute *attr, \ > + char *buf) \ > +{ \ > + struct memory_failure_stats *mf_stats = \ > + &NODE_DATA(dev->id)->mf_stats; \ > + return sprintf(buf, "%lu\n", mf_stats->_name); \ > +} \ > +static DEVICE_ATTR_RO(_name) > + > +MF_ATTR_RO(total); > +MF_ATTR_RO(ignored); > +MF_ATTR_RO(failed); > +MF_ATTR_RO(delayed); > +MF_ATTR_RO(recovered); > + > +static struct attribute *memory_failure_attr[] = { > + &dev_attr_total.attr, > + &dev_attr_ignored.attr, > + &dev_attr_failed.attr, > + &dev_attr_delayed.attr, > + &dev_attr_recovered.attr, > + NULL, > +}; > + > +const struct attribute_group memory_failure_attr_group = { > + .name = "memory_failure", > + .attrs = memory_failure_attr, > +}; > + > /* > * Return values: > * 1: the page is dissolved (if needed) and taken off from buddy,
On Wed, Feb 1, 2023 at 10:54 PM Kefeng Wang <wangkefeng.wang@huawei.com> wrote: > > > > On 2023/1/20 11:46, Jiaqi Yan wrote: > > Today the kernel provides the following memory error info to userspace, but each > > has its own disadvantage: > > * HardwareCorrupted in /proc/meminfo: number of bytes poisoned in total, > > not per NUMA node stats though > > * ras:memory_failure_event: only available after being explicitly enabled > > * /dev/mcelog provides much useful info about the MCEs, but > > doesn't capture how memory_failure recovered memory MCEs > > * kernel logs: userspace needs to process log text > > > > Expose per NUMA node memory error stats as sysfs entries: > > > > /sys/devices/system/node/node${X}/memory_failure/total > > /sys/devices/system/node/node${X}/memory_failure/recovered > > /sys/devices/system/node/node${X}/memory_failure/ignored > > /sys/devices/system/node/node${X}/memory_failure/failed > > /sys/devices/system/node/node${X}/memory_failure/delayed > > > > These counters describe how many raw pages are poisoned and, after the > > attempted recoveries by the kernel, their resolutions: how many are > > recovered, ignored, failed, or delayed respectively. 
The following > > math holds for the statistics: > > * total = recovered + ignored + failed + delayed > > > > Acked-by: David Rientjes <rientjes@google.com> > > Signed-off-by: Jiaqi Yan <jiaqiyan@google.com> > > --- > > drivers/base/node.c | 3 +++ > > include/linux/mm.h | 5 +++++ > > include/linux/mmzone.h | 28 ++++++++++++++++++++++++++++ > > mm/memory-failure.c | 35 +++++++++++++++++++++++++++++++++++ > > 4 files changed, 71 insertions(+) > > > > diff --git a/drivers/base/node.c b/drivers/base/node.c > > index faf3597a96da..b46db17124f3 100644 > > --- a/drivers/base/node.c > > +++ b/drivers/base/node.c > > @@ -586,6 +586,9 @@ static const struct attribute_group *node_dev_groups[] = { > > &node_dev_group, > > #ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP > > &arch_node_dev_group, > > +#endif > > +#ifdef CONFIG_MEMORY_FAILURE > > + &memory_failure_attr_group, > > #endif > > NULL > > }; > > diff --git a/include/linux/mm.h b/include/linux/mm.h > > index f3f196e4d66d..888576884eb9 100644 > > --- a/include/linux/mm.h > > +++ b/include/linux/mm.h > > @@ -3521,6 +3521,11 @@ enum mf_action_page_type { > > MF_MSG_UNKNOWN, > > }; > > > > +/* > > + * Sysfs entries for memory failure handling statistics. > > + */ > > +extern const struct attribute_group memory_failure_attr_group; > > + > > This should move under CONFIG_MEMORY_FAILURE Thanks! I will move it around in the new version. > > > #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) > > extern void clear_huge_page(struct page *page, > > unsigned long addr_hint, > > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > > index cd28a100d9e4..2c537b31fa7b 100644 > > --- a/include/linux/mmzone.h > > +++ b/include/linux/mmzone.h > > @@ -1110,6 +1110,31 @@ struct deferred_split { > > }; > > #endif > > > > +#ifdef CONFIG_MEMORY_FAILURE > > +/* > > + * Per NUMA node memory failure handling statistics. > > + */ > > +struct memory_failure_stats { > > + /* > > + * Number of raw pages poisoned. 
> > + * Cases not accounted: memory outside kernel control, offline page, > > + * arch-specific memory_failure (SGX), hwpoison_filter() filtered > > + * error events, and unpoison actions from hwpoison_unpoison. > > + */ > > + unsigned long total; > > + /* > > + * Recovery results of poisoned raw pages handled by memory_failure, > > + * in sync with mf_result. > > + * total = ignored + failed + delayed + recovered. > > + * total * PAGE_SIZE * #nodes = /proc/meminfo/HardwareCorrupted. > > + */ > > + unsigned long ignored; > > + unsigned long failed; > > + unsigned long delayed; > > + unsigned long recovered; > > +}; > > +#endif > > + > > /* > > * On NUMA machines, each NUMA node would have a pg_data_t to describe > > * it's memory layout. On UMA machines there is a single pglist_data which > > @@ -1253,6 +1278,9 @@ typedef struct pglist_data { > > #ifdef CONFIG_NUMA > > struct memory_tier __rcu *memtier; > > #endif > > +#ifdef CONFIG_MEMORY_FAILURE > > + struct memory_failure_stats mf_stats; > > +#endif > > } pg_data_t; > > > > #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) > > diff --git a/mm/memory-failure.c b/mm/memory-failure.c > > index c77a9e37e27e..c628f1db3a4d 100644 > > --- a/mm/memory-failure.c > > +++ b/mm/memory-failure.c > > @@ -87,6 +87,41 @@ inline void num_poisoned_pages_sub(unsigned long pfn, long i) > > memblk_nr_poison_sub(pfn, i); > > } > > > > +/** > > + * MF_ATTR_RO - Create sysfs entry for each memory failure statistics. > > + * @_name: name of the file in the per NUMA sysfs directory. 
> > + */ > > +#define MF_ATTR_RO(_name) \ > > +static ssize_t _name##_show(struct device *dev, \ > > + struct device_attribute *attr, \ > > + char *buf) \ > > +{ \ > > + struct memory_failure_stats *mf_stats = \ > > + &NODE_DATA(dev->id)->mf_stats; \ > > + return sprintf(buf, "%lu\n", mf_stats->_name); \ > > +} \ > > +static DEVICE_ATTR_RO(_name) > > + > > +MF_ATTR_RO(total); > > +MF_ATTR_RO(ignored); > > +MF_ATTR_RO(failed); > > +MF_ATTR_RO(delayed); > > +MF_ATTR_RO(recovered); > > + > > +static struct attribute *memory_failure_attr[] = { > > + &dev_attr_total.attr, > > + &dev_attr_ignored.attr, > > + &dev_attr_failed.attr, > > + &dev_attr_delayed.attr, > > + &dev_attr_recovered.attr, > > + NULL, > > +}; > > + > > +const struct attribute_group memory_failure_attr_group = { > > + .name = "memory_failure", > > + .attrs = memory_failure_attr, > > +}; > > + > > /* > > * Return values: > > * 1: the page is dissolved (if needed) and taken off from buddy,
diff --git a/drivers/base/node.c b/drivers/base/node.c index faf3597a96da..b46db17124f3 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -586,6 +586,9 @@ static const struct attribute_group *node_dev_groups[] = { &node_dev_group, #ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP &arch_node_dev_group, +#endif +#ifdef CONFIG_MEMORY_FAILURE + &memory_failure_attr_group, #endif NULL }; diff --git a/include/linux/mm.h b/include/linux/mm.h index f3f196e4d66d..888576884eb9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3521,6 +3521,11 @@ enum mf_action_page_type { MF_MSG_UNKNOWN, }; +/* + * Sysfs entries for memory failure handling statistics. + */ +extern const struct attribute_group memory_failure_attr_group; + #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) extern void clear_huge_page(struct page *page, unsigned long addr_hint, diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index cd28a100d9e4..2c537b31fa7b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1110,6 +1110,31 @@ struct deferred_split { }; #endif +#ifdef CONFIG_MEMORY_FAILURE +/* + * Per NUMA node memory failure handling statistics. + */ +struct memory_failure_stats { + /* + * Number of raw pages poisoned. + * Cases not accounted: memory outside kernel control, offline page, + * arch-specific memory_failure (SGX), hwpoison_filter() filtered + * error events, and unpoison actions from hwpoison_unpoison. + */ + unsigned long total; + /* + * Recovery results of poisoned raw pages handled by memory_failure, + * in sync with mf_result. + * total = ignored + failed + delayed + recovered. + * total * PAGE_SIZE * #nodes = /proc/meminfo/HardwareCorrupted. + */ + unsigned long ignored; + unsigned long failed; + unsigned long delayed; + unsigned long recovered; +}; +#endif + /* * On NUMA machines, each NUMA node would have a pg_data_t to describe * it's memory layout. 
On UMA machines there is a single pglist_data which @@ -1253,6 +1278,9 @@ typedef struct pglist_data { #ifdef CONFIG_NUMA struct memory_tier __rcu *memtier; #endif +#ifdef CONFIG_MEMORY_FAILURE + struct memory_failure_stats mf_stats; +#endif } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c77a9e37e27e..c628f1db3a4d 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -87,6 +87,41 @@ inline void num_poisoned_pages_sub(unsigned long pfn, long i) memblk_nr_poison_sub(pfn, i); } +/** + * MF_ATTR_RO - Create sysfs entry for each memory failure statistics. + * @_name: name of the file in the per NUMA sysfs directory. + */ +#define MF_ATTR_RO(_name) \ +static ssize_t _name##_show(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ +{ \ + struct memory_failure_stats *mf_stats = \ + &NODE_DATA(dev->id)->mf_stats; \ + return sprintf(buf, "%lu\n", mf_stats->_name); \ +} \ +static DEVICE_ATTR_RO(_name) + +MF_ATTR_RO(total); +MF_ATTR_RO(ignored); +MF_ATTR_RO(failed); +MF_ATTR_RO(delayed); +MF_ATTR_RO(recovered); + +static struct attribute *memory_failure_attr[] = { + &dev_attr_total.attr, + &dev_attr_ignored.attr, + &dev_attr_failed.attr, + &dev_attr_delayed.attr, + &dev_attr_recovered.attr, + NULL, +}; + +const struct attribute_group memory_failure_attr_group = { + .name = "memory_failure", + .attrs = memory_failure_attr, +}; + /* * Return values: * 1: the page is dissolved (if needed) and taken off from buddy,