Message ID | 20200422115814.22205-5-rrichter@marvell.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | EDAC/mc/ghes: Fixes, cleanup and reworks | expand |
Hi Robert, I love your patch! Yet something to improve: [auto build test ERROR on ras/edac-for-next] [also build test ERROR on linus/master v5.7-rc2 next-20200423] [cannot apply to bp/for-next linux/master] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system. BTW, we also suggest to use '--base' option to specify the base tree in git format-patch, please see https://stackoverflow.com/a/37406982] url: https://github.com/0day-ci/linux/commits/Robert-Richter/EDAC-mc-ghes-Fixes-cleanup-and-reworks/20200424-042828 base: https://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git edac-for-next config: i386-allyesconfig (attached as .config) compiler: gcc-7 (Ubuntu 7.5.0-6ubuntu2) 7.5.0 reproduce: # save the attached .config to linux build tree make ARCH=i386 If you fix the issue, kindly add following tag as appropriate Reported-by: kbuild test robot <lkp@intel.com> All error/warnings (new ones prefixed by >>): In file included from arch/x86/include/asm/bug.h:83:0, from include/linux/bug.h:5, from include/linux/debug_locks.h:7, from include/linux/lockdep.h:44, from include/linux/spinlock_types.h:18, from include/linux/mutex.h:16, from include/linux/kernfs.h:12, from include/linux/sysfs.h:16, from include/linux/kobject.h:20, from include/linux/of.h:17, from include/linux/irqdomain.h:35, from include/linux/acpi.h:13, from include/acpi/apei.h:9, from include/acpi/ghes.h:5, from drivers/edac/ghes_edac.c:12: drivers/edac/ghes_edac.c: In function 'ghes_dimm_pool_create': >> include/linux/lockdep.h:409:52: error: invalid type argument of '->' (have 'struct mutex') #define lockdep_is_held(lock) lock_is_held(&(lock)->dep_map) ^ include/asm-generic/bug.h:113:25: note: in definition of macro 'WARN_ON' int __ret_warn_on = !!(condition); \ ^~~~~~~~~ include/linux/lockdep.h:435:27: note: in expansion of macro 'lockdep_is_held' WARN_ON(debug_locks && !lockdep_is_held(l)); \ ^~~~~~~~~~~~~~~ >> drivers/edac/ghes_edac.c:104:2: note: in expansion of macro 'lockdep_assert_held' lockdep_assert_held(ghes_reg_mutex); ^~~~~~~~~~~~~~~~~~~ drivers/edac/ghes_edac.c: In function 'ghes_dimm_pool_destroy': >> include/linux/lockdep.h:409:52: error: invalid type argument of '->' (have 'struct mutex') #define lockdep_is_held(lock) lock_is_held(&(lock)->dep_map) ^ include/asm-generic/bug.h:113:25: note: in definition of macro 'WARN_ON' int __ret_warn_on = !!(condition); \ ^~~~~~~~~ include/linux/lockdep.h:435:27: note: in expansion of macro 'lockdep_is_held' WARN_ON(debug_locks && !lockdep_is_held(l)); \ ^~~~~~~~~~~~~~~ drivers/edac/ghes_edac.c:118:2: note: in expansion of macro 'lockdep_assert_held' lockdep_assert_held(ghes_reg_mutex); ^~~~~~~~~~~~~~~~~~~ drivers/edac/ghes_edac.c: In function 'ghes_dimm_alloc': >> include/linux/lockdep.h:409:52: error: invalid type argument of '->' (have 'struct mutex') #define lockdep_is_held(lock) lock_is_held(&(lock)->dep_map) ^ include/asm-generic/bug.h:113:25: note: in definition of macro 'WARN_ON' int __ret_warn_on = !!(condition); \ ^~~~~~~~~ include/linux/lockdep.h:435:27: note: in expansion of macro 'lockdep_is_held' WARN_ON(debug_locks && !lockdep_is_held(l)); \ ^~~~~~~~~~~~~~~ drivers/edac/ghes_edac.c:127:2: note: in expansion of macro 'lockdep_assert_held' lockdep_assert_held(ghes_reg_mutex); ^~~~~~~~~~~~~~~~~~~ drivers/edac/ghes_edac.c: In function 'ghes_dimm_release': >> include/linux/lockdep.h:409:52: error: invalid type argument of '->' (have 'struct mutex') #define lockdep_is_held(lock) lock_is_held(&(lock)->dep_map) ^ include/asm-generic/bug.h:113:25: note: in definition of macro 'WARN_ON' int __ret_warn_on = !!(condition); \ ^~~~~~~~~ include/linux/lockdep.h:435:27: note: in expansion of macro 'lockdep_is_held' WARN_ON(debug_locks && !lockdep_is_held(l)); \ ^~~~~~~~~~~~~~~ drivers/edac/ghes_edac.c:144:2: note: in expansion of macro 'lockdep_assert_held' lockdep_assert_held(ghes_reg_mutex); ^~~~~~~~~~~~~~~~~~~ vim +/lockdep_assert_held +104 drivers/edac/ghes_edac.c 96 97 static int ghes_dimm_pool_create(int num_dimm) 98 { 99 struct ghes_dimm *ghes_dimm; 100 101 if (!num_dimm) 102 return 0; 103 > 104 lockdep_assert_held(ghes_reg_mutex); 105 106 dimms = kcalloc(num_dimm, sizeof(*dimms), GFP_KERNEL); 107 if (!dimms) 108 return -ENOMEM; 109 110 for (ghes_dimm = dimms; ghes_dimm < dimms + num_dimm; ghes_dimm++) 111 list_add(&ghes_dimm->entry, &ghes_dimm_pool); 112 113 return 0; 114 } 115 --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
On Wed, Apr 22, 2020 at 01:58:08PM +0200, Robert Richter wrote: > diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c > index 39efce0df881..23adb7674f9b 100644 > --- a/drivers/edac/ghes_edac.c > +++ b/drivers/edac/ghes_edac.c > @@ -15,6 +15,12 @@ > #include "edac_module.h" > #include <ras/ras_event.h> > > +struct ghes_dimm { Simply struct dimm > + struct list_head entry; > + struct dimm_info *dimm; > + u16 handle; > +}; > + > struct ghes_mci { > struct mem_ctl_info *mci; > > @@ -42,6 +48,16 @@ static DEFINE_MUTEX(ghes_reg_mutex); > */ > static DEFINE_SPINLOCK(ghes_lock); > > +/* > + * Locking: > + * > + * dimms, ghes_dimm_pool: ghes_reg_mutex > + * ghes_dimm_list: ghes_lock > + */ > +static struct ghes_dimm *dimms; > +static LIST_HEAD(ghes_dimm_list); > +static LIST_HEAD(ghes_dimm_pool); Those are static lists, no need to prefix them with "ghes_". There's too much "ghes" in that code. :) > + > /* "ghes_edac.force_load=1" skips the platform check */ > static bool __read_mostly force_load; > module_param(force_load, bool, 0); > @@ -72,11 +88,63 @@ struct memdev_dmi_entry { > u16 conf_mem_clk_speed; > } __attribute__((__packed__)); > > -struct ghes_edac_dimm_fill { > +struct dimm_fill { > + struct list_head dimms; > struct mem_ctl_info *mci; > unsigned int count; > }; > > +static int ghes_dimm_pool_create(int num_dimm) Yeah, drop "ghes_" here too. I'm not going to comment on this in the rest of the patchset but for the next version, please drop the "ghes_" prefix from static functions and members - it unnecessarily gets in the way when reading the code. > +{ > + struct ghes_dimm *ghes_dimm; > + > + if (!num_dimm) > + return 0; > + > + lockdep_assert_held(ghes_reg_mutex); > + > + dimms = kcalloc(num_dimm, sizeof(*dimms), GFP_KERNEL); > + if (!dimms) > + return -ENOMEM; > + > + for (ghes_dimm = dimms; ghes_dimm < dimms + num_dimm; ghes_dimm++) And with the above shortening of names, this loop becomes: for (d = dimms; d < dimms + num_dimms; d++) list_add(&d->entry, &dimm_pool); Simple. > + > + return 0; > +} > + > +static void ghes_dimm_pool_destroy(void) > +{ > + lockdep_assert_held(ghes_reg_mutex); > + INIT_LIST_HEAD(&ghes_dimm_pool); > + kfree(dimms); > +} > + > +static struct ghes_dimm *ghes_dimm_alloc(struct dimm_info *dimm, u16 handle) > +{ > + struct ghes_dimm *ghes_dimm; > + > + lockdep_assert_held(ghes_reg_mutex); The 0day bot caught it already - this needs to be a ptr. Please test with PROVE_LOCKING enabled before sending next time. > + > + ghes_dimm = list_first_entry_or_null(&ghes_dimm_pool, > + struct ghes_dimm, entry); Let that line stick out. > + > + /* should be always non-zero */ > + if (!WARN_ON_ONCE(!ghes_dimm)) { > + ghes_dimm->dimm = dimm; > + ghes_dimm->handle = handle; > + list_del(&ghes_dimm->entry); > + } > + > + return ghes_dimm; > +} > + > +static void ghes_dimm_release(struct list_head *dimms) > +{ > + lockdep_assert_held(ghes_reg_mutex); > + list_splice(dimms, &ghes_dimm_pool); > +} > + > static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg) > { > int *num_dimm = arg; ... > @@ -547,12 +626,18 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) > > spin_lock_irqsave(&ghes_lock, flags); > ghes_pvt = pvt; > + list_splice_tail(&dimm_fill.dimms, &ghes_dimm_list); > spin_unlock_irqrestore(&ghes_lock, flags); > > /* only set on success */ > refcount_set(&ghes_refcount, 1); > > unlock: > + if (rc < 0) { > + ghes_dimm_pool_destroy(); > + pr_err("Can't register at EDAC core: %d\n", rc); with the EDAC core: > + } > + > mutex_unlock(&ghes_reg_mutex); > > return rc; > @@ -562,6 +647,7 @@ void ghes_edac_unregister(struct ghes *ghes) > { > struct mem_ctl_info *mci; > unsigned long flags; > + LIST_HEAD(dimm_list); > > mutex_lock(&ghes_reg_mutex); > > @@ -574,14 +660,19 @@ void ghes_edac_unregister(struct ghes *ghes) > spin_lock_irqsave(&ghes_lock, flags); > mci = ghes_pvt ? ghes_pvt->mci : NULL; > ghes_pvt = NULL; > + list_splice_init(&ghes_dimm_list, &dimm_list); Why do you need to do this? Can't you simply do: ghes_dimm_release(&ghes_dimm_list); here? Btw, please add an explanation above ghes_dimm_list and ghes_dimm_pool what those are and what the rules are: stuff gets added on register to what list and freed on unreg from what list, etc. So that it is clear upon a quick glance. Thx.
On 24.04.20 18:21:57, Borislav Petkov wrote: > On Wed, Apr 22, 2020 at 01:58:08PM +0200, Robert Richter wrote: > > @@ -562,6 +647,7 @@ void ghes_edac_unregister(struct ghes *ghes) > > { > > struct mem_ctl_info *mci; > > unsigned long flags; > > + LIST_HEAD(dimm_list); > > > > mutex_lock(&ghes_reg_mutex); > > > > @@ -574,14 +660,19 @@ void ghes_edac_unregister(struct ghes *ghes) > > spin_lock_irqsave(&ghes_lock, flags); > > mci = ghes_pvt ? ghes_pvt->mci : NULL; > > ghes_pvt = NULL; > > + list_splice_init(&ghes_dimm_list, &dimm_list); > > Why do you need to do this? > > Can't you simply do: > > ghes_dimm_release(&ghes_dimm_list); > > here? This decouples the locking. Otherwise ghes_dimm_release() would be called with ghes_lock held which I want to avoid to keep the locking simple. -Robert
diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c index 39efce0df881..23adb7674f9b 100644 --- a/drivers/edac/ghes_edac.c +++ b/drivers/edac/ghes_edac.c @@ -15,6 +15,12 @@ #include "edac_module.h" #include <ras/ras_event.h> +struct ghes_dimm { + struct list_head entry; + struct dimm_info *dimm; + u16 handle; +}; + struct ghes_mci { struct mem_ctl_info *mci; @@ -42,6 +48,16 @@ static DEFINE_MUTEX(ghes_reg_mutex); */ static DEFINE_SPINLOCK(ghes_lock); +/* + * Locking: + * + * dimms, ghes_dimm_pool: ghes_reg_mutex + * ghes_dimm_list: ghes_lock + */ +static struct ghes_dimm *dimms; +static LIST_HEAD(ghes_dimm_list); +static LIST_HEAD(ghes_dimm_pool); + /* "ghes_edac.force_load=1" skips the platform check */ static bool __read_mostly force_load; module_param(force_load, bool, 0); @@ -72,11 +88,63 @@ struct memdev_dmi_entry { u16 conf_mem_clk_speed; } __attribute__((__packed__)); -struct ghes_edac_dimm_fill { +struct dimm_fill { + struct list_head dimms; struct mem_ctl_info *mci; unsigned int count; }; +static int ghes_dimm_pool_create(int num_dimm) +{ + struct ghes_dimm *ghes_dimm; + + if (!num_dimm) + return 0; + + lockdep_assert_held(ghes_reg_mutex); + + dimms = kcalloc(num_dimm, sizeof(*dimms), GFP_KERNEL); + if (!dimms) + return -ENOMEM; + + for (ghes_dimm = dimms; ghes_dimm < dimms + num_dimm; ghes_dimm++) + list_add(&ghes_dimm->entry, &ghes_dimm_pool); + + return 0; +} + +static void ghes_dimm_pool_destroy(void) +{ + lockdep_assert_held(ghes_reg_mutex); + INIT_LIST_HEAD(&ghes_dimm_pool); + kfree(dimms); +} + +static struct ghes_dimm *ghes_dimm_alloc(struct dimm_info *dimm, u16 handle) +{ + struct ghes_dimm *ghes_dimm; + + lockdep_assert_held(ghes_reg_mutex); + + ghes_dimm = list_first_entry_or_null(&ghes_dimm_pool, + struct ghes_dimm, entry); + + /* should be always non-zero */ + if (!WARN_ON_ONCE(!ghes_dimm)) { + ghes_dimm->dimm = dimm; + ghes_dimm->handle = handle; + list_del(&ghes_dimm->entry); + } + + return ghes_dimm; +} + +static void ghes_dimm_release(struct list_head *dimms) +{ + lockdep_assert_held(ghes_reg_mutex); + list_splice(dimms, &ghes_dimm_pool); +} + static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg) { int *num_dimm = arg; @@ -85,13 +153,15 @@ static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg) (*num_dimm)++; } -static int get_dimm_smbios_index(struct mem_ctl_info *mci, u16 handle) +static int get_dimm_smbios_index(u16 handle) { - struct dimm_info *dimm; + struct ghes_dimm *ghes_dimm; - mci_for_each_dimm(mci, dimm) { - if (dimm->smbios_handle == handle) - return dimm->idx; + lockdep_assert_held(&ghes_lock); + + list_for_each_entry(ghes_dimm, &ghes_dimm_list, entry) { + if (ghes_dimm->handle == handle) + return ghes_dimm->dimm->idx; } return -1; @@ -99,8 +169,9 @@ static int get_dimm_smbios_index(struct mem_ctl_info *mci, u16 handle) static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg) { - struct ghes_edac_dimm_fill *dimm_fill = arg; + struct dimm_fill *dimm_fill = arg; struct mem_ctl_info *mci = dimm_fill->mci; + struct ghes_dimm *ghes_dimm; if (dh->type == DMI_ENTRY_MEM_DEVICE) { struct memdev_dmi_entry *entry = (struct memdev_dmi_entry *)dh; @@ -191,7 +262,10 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg) entry->total_width, entry->data_width); } - dimm->smbios_handle = entry->handle; + + ghes_dimm = ghes_dimm_alloc(dimm, entry->handle); + if (ghes_dimm) + list_add_tail(&ghes_dimm->entry, &dimm_fill->dimms); dimm_fill->count++; } @@ -352,7 +426,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) p += sprintf(p, "DIMM DMI handle: 0x%.4x ", mem_err->mem_dev_handle); - index = get_dimm_smbios_index(mci, mem_err->mem_dev_handle); + index = get_dimm_smbios_index(mem_err->mem_dev_handle); if (index >= 0) e->top_layer = index; } @@ -457,7 +531,7 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) struct mem_ctl_info *mci; struct ghes_mci *pvt; struct edac_mc_layer layers[1]; - struct ghes_edac_dimm_fill dimm_fill; + struct dimm_fill dimm_fill; unsigned long flags; int idx = -1; @@ -482,6 +556,10 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) /* Get the number of DIMMs */ dmi_walk(ghes_edac_count_dimms, &num_dimm); + rc = ghes_dimm_pool_create(num_dimm); + if (rc < 0) + goto unlock; + /* Check if we've got a bogus BIOS */ if (num_dimm == 0) { fake = true; @@ -494,7 +572,6 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(*pvt)); if (!mci) { - pr_info("Can't allocate memory for EDAC data\n"); rc = -ENOMEM; goto unlock; } @@ -523,6 +600,8 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) pr_info("This system has %d DIMM sockets.\n", num_dimm); } + INIT_LIST_HEAD(&dimm_fill.dimms); + if (!fake) { dimm_fill.count = 0; dimm_fill.mci = mci; @@ -539,7 +618,7 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) rc = edac_mc_add_mc(mci); if (rc < 0) { - pr_info("Can't register at EDAC core\n"); + ghes_dimm_release(&dimm_fill.dimms); edac_mc_free(mci); rc = -ENODEV; goto unlock; @@ -547,12 +626,18 @@ int ghes_edac_register(struct ghes *ghes, struct device *dev) spin_lock_irqsave(&ghes_lock, flags); ghes_pvt = pvt; + list_splice_tail(&dimm_fill.dimms, &ghes_dimm_list); spin_unlock_irqrestore(&ghes_lock, flags); /* only set on success */ refcount_set(&ghes_refcount, 1); unlock: + if (rc < 0) { + ghes_dimm_pool_destroy(); + pr_err("Can't register at EDAC core: %d\n", rc); + } + mutex_unlock(&ghes_reg_mutex); return rc; @@ -562,6 +647,7 @@ void ghes_edac_unregister(struct ghes *ghes) { struct mem_ctl_info *mci; unsigned long flags; + LIST_HEAD(dimm_list); mutex_lock(&ghes_reg_mutex); @@ -574,14 +660,19 @@ void ghes_edac_unregister(struct ghes *ghes) spin_lock_irqsave(&ghes_lock, flags); mci = ghes_pvt ? ghes_pvt->mci : NULL; ghes_pvt = NULL; + list_splice_init(&ghes_dimm_list, &dimm_list); spin_unlock_irqrestore(&ghes_lock, flags); + ghes_dimm_release(&dimm_list); + if (!mci) goto unlock; mci = edac_mc_del_mc(mci->pdev); - if (mci) + if (mci) { edac_mc_free(mci); + ghes_dimm_pool_destroy(); + } unlock: mutex_unlock(&ghes_reg_mutex); diff --git a/include/linux/edac.h b/include/linux/edac.h index 0f20b986b0ab..6b7f594782c0 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -382,8 +382,6 @@ struct dimm_info { unsigned int csrow, cschannel; /* Points to the old API data */ - u16 smbios_handle; /* Handle for SMBIOS type 17 */ - u32 ce_count; u32 ue_count; };
To identify a hw error's location, the ghes driver needs to know the mapping between a DIMM and the used SMBIOS handle. The handle is stored in struct dimm_info in the smbios_handle field where other drivers carry it too. Make the SMBIOS handle private and implement an own SMBIOS handle mapping table that is only used by the ghes driver. As a consequence, smbios_handle is removed from struct dimm_info. The mapping table is implemented as a list. This makes the locking and the allocation of table entries when adding or removing DIMMs much easier. Since the list is accessed by the interrupt handler it must be protected by a spinlock. Thanks to the use of a list, multiple entries can be prepared in advance without a lock being held. Once ready, the entries are added to an active list of DIMMs (ghes_dimm_list) by just a single list operation that needs the locking. The same list entry is also used for memory allocation, all entries are created at once using a single array allocation and are put in a pool (ghes_dimm_pool) that holds unused entries for later use. While at it, rename struct ghes_edac_dimm_fill to struct dimm_fill. Signed-off-by: Robert Richter <rrichter@marvell.com> --- drivers/edac/ghes_edac.c | 117 ++++++++++++++++++++++++++++++++++----- include/linux/edac.h | 2 - 2 files changed, 104 insertions(+), 15 deletions(-)