Message ID | 1357861230-29549-2-git-send-email-toshi.kani@hp.com (mailing list archive)
---|---
State | RFC, archived
On Thursday, January 10, 2013 04:40:19 PM Toshi Kani wrote: > Added include/linux/sys_hotplug.h, which defines the system device > hotplug framework interfaces used by the framework itself and > handlers. > > The order values define the calling sequence of handlers. For add > execute, the ordering is ACPI->MEM->CPU. Memory is onlined before > CPU so that threads on new CPUs can start using their local memory. > The ordering of the delete execute is symmetric to the add execute. > > struct shp_request defines a hot-plug request information. The > device resource information is managed with a list so that a single > request may target to multiple devices. > > Signed-off-by: Toshi Kani <toshi.kani@hp.com> > --- > include/linux/sys_hotplug.h | 181 +++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 181 insertions(+) > create mode 100644 include/linux/sys_hotplug.h > > diff --git a/include/linux/sys_hotplug.h b/include/linux/sys_hotplug.h > new file mode 100644 > index 0000000..86674dd > --- /dev/null > +++ b/include/linux/sys_hotplug.h > @@ -0,0 +1,181 @@ > +/* > + * sys_hotplug.h - System device hot-plug framework > + * > + * Copyright (C) 2012 Hewlett-Packard Development Company, L.P. > + * Toshi Kani <toshi.kani@hp.com> > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + */ > + > +#ifndef _LINUX_SYS_HOTPLUG_H > +#define _LINUX_SYS_HOTPLUG_H > + > +#include <linux/list.h> > +#include <linux/device.h> > + > +/* > + * System device hot-plug operation proceeds in the following order. > + * Validate phase -> Execute phase -> Commit phase > + * > + * The order values below define the calling sequence of platform > + * neutral handlers for each phase in ascending order. The order > + * values of firmware-specific handlers are defined in sys_hotplug.h > + * under firmware specific directories. 
> + */ > + > +/* All order values must be smaller than this value */ > +#define SHP_ORDER_MAX 0xffffff > + > +/* Add Validate order values */ > + > +/* Add Execute order values */ > +#define SHP_MEM_ADD_EXECUTE_ORDER 100 > +#define SHP_CPU_ADD_EXECUTE_ORDER 110 > + > +/* Add Commit order values */ > + > +/* Delete Validate order values */ > +#define SHP_CPU_DEL_VALIDATE_ORDER 100 > +#define SHP_MEM_DEL_VALIDATE_ORDER 110 > + > +/* Delete Execute order values */ > +#define SHP_CPU_DEL_EXECUTE_ORDER 10 > +#define SHP_MEM_DEL_EXECUTE_ORDER 20 > + > +/* Delete Commit order values */ > + > +/* > + * Hot-plug request types > + */ > +#define SHP_REQ_ADD 0x000000 > +#define SHP_REQ_DELETE 0x000001 > +#define SHP_REQ_MASK 0x0000ff > + > +/* > + * Hot-plug phase types > + */ > +#define SHP_PH_VALIDATE 0x000000 > +#define SHP_PH_EXECUTE 0x000100 > +#define SHP_PH_COMMIT 0x000200 > +#define SHP_PH_MASK 0x00ff00 > + > +/* > + * Hot-plug operation types > + */ > +#define SHP_OP_HOTPLUG 0x000000 > +#define SHP_OP_ONLINE 0x010000 > +#define SHP_OP_MASK 0xff0000 > + > +/* > + * Hot-plug phases > + */ > +enum shp_phase { > + SHP_ADD_VALIDATE = (SHP_REQ_ADD|SHP_PH_VALIDATE), > + SHP_ADD_EXECUTE = (SHP_REQ_ADD|SHP_PH_EXECUTE), > + SHP_ADD_COMMIT = (SHP_REQ_ADD|SHP_PH_COMMIT), > + SHP_DEL_VALIDATE = (SHP_REQ_DELETE|SHP_PH_VALIDATE), > + SHP_DEL_EXECUTE = (SHP_REQ_DELETE|SHP_PH_EXECUTE), > + SHP_DEL_COMMIT = (SHP_REQ_DELETE|SHP_PH_COMMIT) > +}; > + > +/* > + * Hot-plug operations > + */ > +enum shp_operation { > + SHP_HOTPLUG_ADD = (SHP_OP_HOTPLUG|SHP_REQ_ADD), > + SHP_HOTPLUG_DEL = (SHP_OP_HOTPLUG|SHP_REQ_DELETE), > + SHP_ONLINE_ADD = (SHP_OP_ONLINE|SHP_REQ_ADD), > + SHP_ONLINE_DEL = (SHP_OP_ONLINE|SHP_REQ_DELETE) > +}; > + > +/* > + * Hot-plug device classes > + */ > +enum shp_class { > + SHP_CLS_INVALID = 0, > + SHP_CLS_CPU = 1, > + SHP_CLS_MEMORY = 2, > + SHP_CLS_HOSTBRIDGE = 3, > + SHP_CLS_CONTAINER = 4, > +}; > + > +/* > + * Hot-plug device information > + */ > +union shp_dev_info { > + struct shp_cpu { > + u32 cpu_id; > + } cpu; > + > + struct shp_memory { > + int node; > + u64 start_addr; > + u64 length; > + } mem; > + > + struct shp_hostbridge { > + } hb; > + > + struct shp_node { > + } node; > +}; > + > +struct shp_device { > + struct list_head list; > + struct device *device; > + enum shp_class class; > + union shp_dev_info info; > +}; > + > +/* > + * Hot-plug request > + */ > +struct shp_request { > + /* common info */ > + enum shp_operation operation; /* operation */ > + > + /* hot-plug event info: only valid for hot-plug operations */ > + void *handle; /* FW handle */ What's the role of handle here? 
> + u32 event; /* FW event */ > + > + /* device resource info */ > + struct list_head dev_list; /* shp_device list */ > +}; > + > +/* > + * Inline Utility Functions > + */ > +static inline bool shp_is_hotplug_op(enum shp_operation operation) > +{ > + return (operation & SHP_OP_MASK) == SHP_OP_HOTPLUG; > +} > + > +static inline bool shp_is_online_op(enum shp_operation operation) > +{ > + return (operation & SHP_OP_MASK) == SHP_OP_ONLINE; > +} > + > +static inline bool shp_is_add_op(enum shp_operation operation) > +{ > + return (operation & SHP_REQ_MASK) == SHP_REQ_ADD; > +} > + > +static inline bool shp_is_add_phase(enum shp_phase phase) > +{ > + return (phase & SHP_REQ_MASK) == SHP_REQ_ADD; > +} > + > +/* > + * Externs > + */ > +typedef int (*shp_func)(struct shp_request *req, int rollback); > +extern int shp_register_handler(enum shp_phase phase, shp_func func, u32 order); > +extern int shp_unregister_handler(enum shp_phase phase, shp_func func); > +extern int shp_submit_req(struct shp_request *req); > +extern struct shp_request *shp_alloc_request(enum shp_operation operation); > +extern void shp_add_dev_info(struct shp_request *shp_req, > + struct shp_device *shp_dev); > + > +#endif /* _LINUX_SYS_HOTPLUG_H */ > -- > To unsubscribe from this list: send the line "unsubscribe linux-acpi" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
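For orientation, the sketch below shows how a platform-neutral subsystem could hook into the framework using only the interfaces declared in the header quoted above. The handler cpu_add_execute() and its body are hypothetical illustrations; shp_register_handler(), the shp_func signature, SHP_ADD_EXECUTE and SHP_CPU_ADD_EXECUTE_ORDER are taken from the patch itself:

    #include <linux/init.h>
    #include <linux/sys_hotplug.h>

    /* hypothetical CPU handler for the add-execute phase */
    static int cpu_add_execute(struct shp_request *req, int rollback)
    {
            struct shp_device *shp_dev;

            /* a single request may target multiple devices */
            list_for_each_entry(shp_dev, &req->dev_list, list) {
                    if (shp_dev->class != SHP_CLS_CPU)
                            continue;
                    /* bring up the CPU described by shp_dev->info.cpu */
            }
            return 0;
    }

    static int __init cpu_hotplug_handler_init(void)
    {
            /*
             * Runs after the memory handler (order 100), so threads on
             * new CPUs can already use their local memory.
             */
            return shp_register_handler(SHP_ADD_EXECUTE, cpu_add_execute,
                                        SHP_CPU_ADD_EXECUTE_ORDER);
    }

Registration would be symmetric for the delete phases, using the corresponding SHP_*_DEL_* order values.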
On Fri, 2013-01-11 at 22:23 +0100, Rafael J. Wysocki wrote: > On Thursday, January 10, 2013 04:40:19 PM Toshi Kani wrote: > > Added include/linux/sys_hotplug.h, which defines the system device > > hotplug framework interfaces used by the framework itself and > > handlers. > > > > The order values define the calling sequence of handlers. For add > > execute, the ordering is ACPI->MEM->CPU. Memory is onlined before > > CPU so that threads on new CPUs can start using their local memory. > > The ordering of the delete execute is symmetric to the add execute. > > > > struct shp_request defines a hot-plug request information. The > > device resource information is managed with a list so that a single > > request may target to multiple devices. > > : > > + > > +struct shp_device { > > + struct list_head list; > > + struct device *device; > > + enum shp_class class; > > + union shp_dev_info info; > > +}; > > + > > +/* > > + * Hot-plug request > > + */ > > +struct shp_request { > > + /* common info */ > > + enum shp_operation operation; /* operation */ > > + > > + /* hot-plug event info: only valid for hot-plug operations */ > > + void *handle; /* FW handle */ > > What's the role of handle here? On ACPI-based platforms, the handle keeps a notified ACPI handle when a hot-plug request is made. ACPI bus handlers, acpi_add_execute() / acpi_del_execute(), then scans / trims ACPI devices from the handle. Thanks, -Toshi -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
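To illustrate the role of the handle described here, a rough sketch of the ACPI-side execute handler treating req->handle as the notified ACPI handle follows. acpi_add_execute() is the handler name mentioned in the reply; the body is only a placeholder, since the exact ACPI scan/trim interfaces differ between kernel versions:

    #include <linux/acpi.h>
    #include <linux/sys_hotplug.h>

    /* sketch: ACPI bus handler for the add-execute phase */
    static int acpi_add_execute(struct shp_request *req, int rollback)
    {
            acpi_handle handle = req->handle;   /* notified ACPI handle */

            /*
             * Scan the ACPI namespace below 'handle' so that acpi_device
             * nodes exist for the hot-added objects, then append the
             * discovered CPU/memory resources to req->dev_list for the
             * platform-neutral handlers that run later in the sequence.
             */
            return 0;
    }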
On Monday, January 14, 2013 08:33:48 AM Toshi Kani wrote: > On Fri, 2013-01-11 at 22:23 +0100, Rafael J. Wysocki wrote: > > On Thursday, January 10, 2013 04:40:19 PM Toshi Kani wrote: > > > Added include/linux/sys_hotplug.h, which defines the system device > > > hotplug framework interfaces used by the framework itself and > > > handlers. > > > > > > The order values define the calling sequence of handlers. For add > > > execute, the ordering is ACPI->MEM->CPU. Memory is onlined before > > > CPU so that threads on new CPUs can start using their local memory. > > > The ordering of the delete execute is symmetric to the add execute. > > > > > > struct shp_request defines a hot-plug request information. The > > > device resource information is managed with a list so that a single > > > request may target to multiple devices. > > > > : > > > + > > > +struct shp_device { > > > + struct list_head list; > > > + struct device *device; > > > + enum shp_class class; > > > + union shp_dev_info info; > > > +}; > > > + > > > +/* > > > + * Hot-plug request > > > + */ > > > +struct shp_request { > > > + /* common info */ > > > + enum shp_operation operation; /* operation */ > > > + > > > + /* hot-plug event info: only valid for hot-plug operations */ > > > + void *handle; /* FW handle */ > > > > What's the role of handle here? > > On ACPI-based platforms, the handle keeps a notified ACPI handle when a > hot-plug request is made. ACPI bus handlers, acpi_add_execute() / > acpi_del_execute(), then scans / trims ACPI devices from the handle. OK, so this is ACPI-specific and should be described as such. Thanks, Rafael
On Mon, 2013-01-14 at 19:48 +0100, Rafael J. Wysocki wrote: > On Monday, January 14, 2013 08:33:48 AM Toshi Kani wrote: > > On Fri, 2013-01-11 at 22:23 +0100, Rafael J. Wysocki wrote: > > > On Thursday, January 10, 2013 04:40:19 PM Toshi Kani wrote: > > > > Added include/linux/sys_hotplug.h, which defines the system device > > > > hotplug framework interfaces used by the framework itself and > > > > handlers. > > > > > > > > The order values define the calling sequence of handlers. For add > > > > execute, the ordering is ACPI->MEM->CPU. Memory is onlined before > > > > CPU so that threads on new CPUs can start using their local memory. > > > > The ordering of the delete execute is symmetric to the add execute. > > > > > > > > struct shp_request defines a hot-plug request information. The > > > > device resource information is managed with a list so that a single > > > > request may target to multiple devices. > > > > > > : > > > > + > > > > +struct shp_device { > > > > + struct list_head list; > > > > + struct device *device; > > > > + enum shp_class class; > > > > + union shp_dev_info info; > > > > +}; > > > > + > > > > +/* > > > > + * Hot-plug request > > > > + */ > > > > +struct shp_request { > > > > + /* common info */ > > > > + enum shp_operation operation; /* operation */ > > > > + > > > > + /* hot-plug event info: only valid for hot-plug operations */ > > > > + void *handle; /* FW handle */ > > > > > > What's the role of handle here? > > > > On ACPI-based platforms, the handle keeps a notified ACPI handle when a > > hot-plug request is made. ACPI bus handlers, acpi_add_execute() / > > acpi_del_execute(), then scans / trims ACPI devices from the handle. > > OK, so this is ACPI-specific and should be described as such. Other FW interface I know is parisc, which has mod_index (module index) to identify a unique object, just like what ACPI handle does. The handle can keep the mod_index as an opaque value as well. But as you said, I do not know if the handle works for all other FWs. So, I will add descriptions, such that the hot-plug event info is modeled after ACPI and may need to be revisited when supporting other FW. Thanks, -Toshi -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Jan 14, 2013 at 12:02:04PM -0700, Toshi Kani wrote: > On Mon, 2013-01-14 at 19:48 +0100, Rafael J. Wysocki wrote: > > On Monday, January 14, 2013 08:33:48 AM Toshi Kani wrote: > > > On Fri, 2013-01-11 at 22:23 +0100, Rafael J. Wysocki wrote: > > > > On Thursday, January 10, 2013 04:40:19 PM Toshi Kani wrote: > > > > > Added include/linux/sys_hotplug.h, which defines the system device > > > > > hotplug framework interfaces used by the framework itself and > > > > > handlers. > > > > > > > > > > The order values define the calling sequence of handlers. For add > > > > > execute, the ordering is ACPI->MEM->CPU. Memory is onlined before > > > > > CPU so that threads on new CPUs can start using their local memory. > > > > > The ordering of the delete execute is symmetric to the add execute. > > > > > > > > > > struct shp_request defines a hot-plug request information. The > > > > > device resource information is managed with a list so that a single > > > > > request may target to multiple devices. > > > > > > > > : > > > > > + > > > > > +struct shp_device { > > > > > + struct list_head list; > > > > > + struct device *device; > > > > > + enum shp_class class; > > > > > + union shp_dev_info info; > > > > > +}; > > > > > + > > > > > +/* > > > > > + * Hot-plug request > > > > > + */ > > > > > +struct shp_request { > > > > > + /* common info */ > > > > > + enum shp_operation operation; /* operation */ > > > > > + > > > > > + /* hot-plug event info: only valid for hot-plug operations */ > > > > > + void *handle; /* FW handle */ > > > > > > > > What's the role of handle here? > > > > > > On ACPI-based platforms, the handle keeps a notified ACPI handle when a > > > hot-plug request is made. ACPI bus handlers, acpi_add_execute() / > > > acpi_del_execute(), then scans / trims ACPI devices from the handle. > > > > OK, so this is ACPI-specific and should be described as such. > > Other FW interface I know is parisc, which has mod_index (module index) > to identify a unique object, just like what ACPI handle does. The > handle can keep the mod_index as an opaque value as well. But as you > said, I do not know if the handle works for all other FWs. So, I will > add descriptions, such that the hot-plug event info is modeled after > ACPI and may need to be revisited when supporting other FW. Please make it a "real" pointer, and not a void *, those shouldn't be used at all if possible. thanks, greg k-h -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Jan 10, 2013 at 04:40:19PM -0700, Toshi Kani wrote: > Added include/linux/sys_hotplug.h, which defines the system device > hotplug framework interfaces used by the framework itself and > handlers. > > The order values define the calling sequence of handlers. For add > execute, the ordering is ACPI->MEM->CPU. Memory is onlined before > CPU so that threads on new CPUs can start using their local memory. > The ordering of the delete execute is symmetric to the add execute. > > struct shp_request defines a hot-plug request information. The > device resource information is managed with a list so that a single > request may target to multiple devices. > > Signed-off-by: Toshi Kani <toshi.kani@hp.com> > --- > include/linux/sys_hotplug.h | 181 +++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 181 insertions(+) > create mode 100644 include/linux/sys_hotplug.h > > diff --git a/include/linux/sys_hotplug.h b/include/linux/sys_hotplug.h > new file mode 100644 > index 0000000..86674dd > --- /dev/null > +++ b/include/linux/sys_hotplug.h > @@ -0,0 +1,181 @@ > +/* > + * sys_hotplug.h - System device hot-plug framework > + * > + * Copyright (C) 2012 Hewlett-Packard Development Company, L.P. > + * Toshi Kani <toshi.kani@hp.com> > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + */ > + > +#ifndef _LINUX_SYS_HOTPLUG_H > +#define _LINUX_SYS_HOTPLUG_H > + > +#include <linux/list.h> > +#include <linux/device.h> > + > +/* > + * System device hot-plug operation proceeds in the following order. > + * Validate phase -> Execute phase -> Commit phase > + * > + * The order values below define the calling sequence of platform > + * neutral handlers for each phase in ascending order. The order > + * values of firmware-specific handlers are defined in sys_hotplug.h > + * under firmware specific directories. > + */ > + > +/* All order values must be smaller than this value */ > +#define SHP_ORDER_MAX 0xffffff > + > +/* Add Validate order values */ > + > +/* Add Execute order values */ > +#define SHP_MEM_ADD_EXECUTE_ORDER 100 > +#define SHP_CPU_ADD_EXECUTE_ORDER 110 > + > +/* Add Commit order values */ > + > +/* Delete Validate order values */ > +#define SHP_CPU_DEL_VALIDATE_ORDER 100 > +#define SHP_MEM_DEL_VALIDATE_ORDER 110 > + > +/* Delete Execute order values */ > +#define SHP_CPU_DEL_EXECUTE_ORDER 10 > +#define SHP_MEM_DEL_EXECUTE_ORDER 20 > + > +/* Delete Commit order values */ > + Empty value? Anyway, as I said before, don't use "values", just call things directly in the order you need to. This isn't like other operating systems, we don't need to be so "flexible", we can modify the core code as much as we want and need to if future things come along :) thanks, greg k-h -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Jan 10, 2013 at 04:40:19PM -0700, Toshi Kani wrote: > +/* > + * Hot-plug device information > + */ Again, stop it with the "generic" hotplug term here, and everywhere else. You are doing a very _specific_ type of hotplug devices, so spell it out. We've worked hard to hotplug _everything_ in Linux, you are going to confuse a lot of people with this type of terms. > +union shp_dev_info { > + struct shp_cpu { > + u32 cpu_id; > + } cpu; What is this? Why not point to the system device for the cpu? > + struct shp_memory { > + int node; > + u64 start_addr; > + u64 length; > + } mem; Same here, why not point to the system device? > + struct shp_hostbridge { > + } hb; > + > + struct shp_node { > + } node; What happened here with these? Empty structures? Huh? > +}; > + > +struct shp_device { > + struct list_head list; > + struct device *device; No, make it a "real" device, embed the device into it. But, again, I'm going to ask why you aren't using the existing cpu / memory / bridge / node devices that we have in the kernel. Please use them, or give me a _really_ good reason why they will not work. > + enum shp_class class; > + union shp_dev_info info; > +}; > + > +/* > + * Hot-plug request > + */ > +struct shp_request { > + /* common info */ > + enum shp_operation operation; /* operation */ > + > + /* hot-plug event info: only valid for hot-plug operations */ > + void *handle; /* FW handle */ > + u32 event; /* FW event */ What is this? greg k-h -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, 2013-01-29 at 23:48 -0500, Greg KH wrote: > On Mon, Jan 14, 2013 at 12:02:04PM -0700, Toshi Kani wrote: > > On Mon, 2013-01-14 at 19:48 +0100, Rafael J. Wysocki wrote: > > > On Monday, January 14, 2013 08:33:48 AM Toshi Kani wrote: > > > > On Fri, 2013-01-11 at 22:23 +0100, Rafael J. Wysocki wrote: > > > > > On Thursday, January 10, 2013 04:40:19 PM Toshi Kani wrote: > > > > > > Added include/linux/sys_hotplug.h, which defines the system device > > > > > > hotplug framework interfaces used by the framework itself and > > > > > > handlers. > > > > > > > > > > > > The order values define the calling sequence of handlers. For add > > > > > > execute, the ordering is ACPI->MEM->CPU. Memory is onlined before > > > > > > CPU so that threads on new CPUs can start using their local memory. > > > > > > The ordering of the delete execute is symmetric to the add execute. > > > > > > > > > > > > struct shp_request defines a hot-plug request information. The > > > > > > device resource information is managed with a list so that a single > > > > > > request may target to multiple devices. > > > > > > > > > > : > > > > > > + > > > > > > +struct shp_device { > > > > > > + struct list_head list; > > > > > > + struct device *device; > > > > > > + enum shp_class class; > > > > > > + union shp_dev_info info; > > > > > > +}; > > > > > > + > > > > > > +/* > > > > > > + * Hot-plug request > > > > > > + */ > > > > > > +struct shp_request { > > > > > > + /* common info */ > > > > > > + enum shp_operation operation; /* operation */ > > > > > > + > > > > > > + /* hot-plug event info: only valid for hot-plug operations */ > > > > > > + void *handle; /* FW handle */ > > > > > > > > > > What's the role of handle here? > > > > > > > > On ACPI-based platforms, the handle keeps a notified ACPI handle when a > > > > hot-plug request is made. ACPI bus handlers, acpi_add_execute() / > > > > acpi_del_execute(), then scans / trims ACPI devices from the handle. > > > > > > OK, so this is ACPI-specific and should be described as such. > > > > Other FW interface I know is parisc, which has mod_index (module index) > > to identify a unique object, just like what ACPI handle does. The > > handle can keep the mod_index as an opaque value as well. But as you > > said, I do not know if the handle works for all other FWs. So, I will > > add descriptions, such that the hot-plug event info is modeled after > > ACPI and may need to be revisited when supporting other FW. > > Please make it a "real" pointer, and not a void *, those shouldn't be > used at all if possible. How about changing the "void *handle" to acpi_dev_node below? struct acpi_dev_node acpi_node; Basically, it has the same challenge as struct device, which uses acpi_dev_node as well. We can add other FW node when needed (just like device also has *of_node). Thanks, -Toshi -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
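If the change proposed here were made, the request structure would carry the firmware node the same way struct device already does. A sketch of the adjusted structure, assuming struct acpi_dev_node as defined in include/linux/device.h of that era (a single handle member compiled in under CONFIG_ACPI):

    struct shp_request {
            /* common info */
            enum shp_operation operation;       /* operation */

            /* hot-plug event info: only valid for hot-plug operations */
            struct acpi_dev_node acpi_node;     /* was: void *handle */
            u32 event;                          /* FW event */

            /* device resource info */
            struct list_head dev_list;          /* shp_device list */
    };

    /* an ACPI handler would then obtain the handle roughly as: */
    /* acpi_handle handle = req->acpi_node.handle; */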
On Tue, 2013-01-29 at 23:53 -0500, Greg KH wrote: > On Thu, Jan 10, 2013 at 04:40:19PM -0700, Toshi Kani wrote: > > Added include/linux/sys_hotplug.h, which defines the system device > > hotplug framework interfaces used by the framework itself and > > handlers. > > > > The order values define the calling sequence of handlers. For add > > execute, the ordering is ACPI->MEM->CPU. Memory is onlined before > > CPU so that threads on new CPUs can start using their local memory. > > The ordering of the delete execute is symmetric to the add execute. > > > > struct shp_request defines a hot-plug request information. The > > device resource information is managed with a list so that a single > > request may target to multiple devices. > > > > Signed-off-by: Toshi Kani <toshi.kani@hp.com> > > --- > > include/linux/sys_hotplug.h | 181 +++++++++++++++++++++++++++++++++++++++++++ > > 1 file changed, 181 insertions(+) > > create mode 100644 include/linux/sys_hotplug.h > > > > diff --git a/include/linux/sys_hotplug.h b/include/linux/sys_hotplug.h > > new file mode 100644 > > index 0000000..86674dd > > --- /dev/null > > +++ b/include/linux/sys_hotplug.h > > @@ -0,0 +1,181 @@ > > +/* > > + * sys_hotplug.h - System device hot-plug framework > > + * > > + * Copyright (C) 2012 Hewlett-Packard Development Company, L.P. > > + * Toshi Kani <toshi.kani@hp.com> > > + * > > + * This program is free software; you can redistribute it and/or modify > > + * it under the terms of the GNU General Public License version 2 as > > + * published by the Free Software Foundation. > > + */ > > + > > +#ifndef _LINUX_SYS_HOTPLUG_H > > +#define _LINUX_SYS_HOTPLUG_H > > + > > +#include <linux/list.h> > > +#include <linux/device.h> > > + > > +/* > > + * System device hot-plug operation proceeds in the following order. > > + * Validate phase -> Execute phase -> Commit phase > > + * > > + * The order values below define the calling sequence of platform > > + * neutral handlers for each phase in ascending order. The order > > + * values of firmware-specific handlers are defined in sys_hotplug.h > > + * under firmware specific directories. > > + */ > > + > > +/* All order values must be smaller than this value */ > > +#define SHP_ORDER_MAX 0xffffff > > + > > +/* Add Validate order values */ > > + > > +/* Add Execute order values */ > > +#define SHP_MEM_ADD_EXECUTE_ORDER 100 > > +#define SHP_CPU_ADD_EXECUTE_ORDER 110 > > + > > +/* Add Commit order values */ > > + > > +/* Delete Validate order values */ > > +#define SHP_CPU_DEL_VALIDATE_ORDER 100 > > +#define SHP_MEM_DEL_VALIDATE_ORDER 110 > > + > > +/* Delete Execute order values */ > > +#define SHP_CPU_DEL_EXECUTE_ORDER 10 > > +#define SHP_MEM_DEL_EXECUTE_ORDER 20 > > + > > +/* Delete Commit order values */ > > + > > Empty value? Yes, in this version, all the delete commit order values are defined in <acpi/sys_hotplug.h>. > Anyway, as I said before, don't use "values", just call things directly > in the order you need to. > > This isn't like other operating systems, we don't need to be so > "flexible", we can modify the core code as much as we want and need to > if future things come along :) Understood. As described in the previous email, I will define them with enum and avoid using values. Thanks, -Toshi -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
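One possible shape for the enum-based follow-up mentioned above, purely illustrative since a later revision may pick different names, is to let enumerator position express the calling sequence instead of spelled-out magic values:

    /* illustrative only: ordering expressed by enumerator position */
    enum shp_add_execute_order {
            SHP_ORDER_ADD_EXECUTE_ACPI,     /* firmware scan first */
            SHP_ORDER_ADD_EXECUTE_MEM,      /* memory before CPU */
            SHP_ORDER_ADD_EXECUTE_CPU,
    };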
On Tue, 2013-01-29 at 23:58 -0500, Greg KH wrote: > On Thu, Jan 10, 2013 at 04:40:19PM -0700, Toshi Kani wrote: > > +/* > > + * Hot-plug device information > > + */ > > Again, stop it with the "generic" hotplug term here, and everywhere > else. You are doing a very _specific_ type of hotplug devices, so spell > it out. We've worked hard to hotplug _everything_ in Linux, you are > going to confuse a lot of people with this type of terms. Agreed. I will clarify in all places. > > +union shp_dev_info { > > + struct shp_cpu { > > + u32 cpu_id; > > + } cpu; > > What is this? Why not point to the system device for the cpu? This info is used to on-line a new CPU and create its system/cpu device. In other word, a system/cpu device is created as a result of CPU hotplug. > > + struct shp_memory { > > + int node; > > + u64 start_addr; > > + u64 length; > > + } mem; > > Same here, why not point to the system device? Same as above. > > + struct shp_hostbridge { > > + } hb; > > + > > + struct shp_node { > > + } node; > > What happened here with these? Empty structures? Huh? They are place holders for now. PCI bridge hot-plug and node hot-plug are still very much work in progress, so I have not integrated them into this framework yet. > > +}; > > + > > +struct shp_device { > > + struct list_head list; > > + struct device *device; > > No, make it a "real" device, embed the device into it. This device pointer is used to send KOBJ_ONLINE/OFFLINE event during CPU online/offline operation in order to maintain the current behavior. CPU online/offline operation only changes the state of CPU, so its system/cpu device continues to be present before and after an operation. (Whereas, CPU hot-add/delete operation creates or removes a system/cpu device.) So, this "*device" needs to be a pointer to reference an existing device that is to be on-lined/off-lined. > But, again, I'm going to ask why you aren't using the existing cpu / > memory / bridge / node devices that we have in the kernel. Please use > them, or give me a _really_ good reason why they will not work. We cannot use the existing system devices or ACPI devices here. During hot-plug, ACPI handler sets this shp_device info, so that cpu and memory handlers (drivers/cpu.c and mm/memory_hotplug.c) can obtain their target device information in a platform-neutral way. During hot-add, we first creates an ACPI device node (i.e. device under /sys/bus/acpi/devices), but platform-neutral modules cannot use them as they are ACPI-specific. Also, its system device (i.e. device under /sys/devices/system) has not been created until the hot-add operation completes. > > + enum shp_class class; > > + union shp_dev_info info; > > +}; > > + > > +/* > > + * Hot-plug request > > + */ > > +struct shp_request { > > + /* common info */ > > + enum shp_operation operation; /* operation */ > > + > > + /* hot-plug event info: only valid for hot-plug operations */ > > + void *handle; /* FW handle */ > > + u32 event; /* FW event */ > > What is this? The shp_request describes a hotplug or online/offline operation that is requested. In case of hot-plug request, the "*handle" describes a target device (which is an ACPI device object) and the "event" describes a type of request, such as hot-add or hot-delete. Thanks, -Toshi -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
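As a concrete, and purely hypothetical, illustration of the online path described above, a request for onlining an existing CPU could be assembled from the header's interfaces roughly as follows; how the target struct device and CPU id are obtained is outside the sketch, and error cleanup is omitted:

    #include <linux/slab.h>
    #include <linux/sys_hotplug.h>

    static int online_cpu_sketch(struct device *cpu_dev, u32 cpu_id)
    {
            struct shp_request *req;
            struct shp_device *shp_dev;

            req = shp_alloc_request(SHP_ONLINE_ADD);
            if (!req)
                    return -ENOMEM;

            shp_dev = kzalloc(sizeof(*shp_dev), GFP_KERNEL);
            if (!shp_dev)
                    return -ENOMEM;     /* request cleanup omitted in this sketch */

            shp_dev->device = cpu_dev;          /* existing system/cpu device */
            shp_dev->class = SHP_CLS_CPU;
            shp_dev->info.cpu.cpu_id = cpu_id;
            shp_add_dev_info(req, shp_dev);

            /*
             * The framework walks the registered handlers in order;
             * KOBJ_ONLINE is emitted against shp_dev->device rather than
             * creating a new device, matching the behavior described above.
             */
            return shp_submit_req(req);
    }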
On Wed, Jan 30, 2013 at 06:15:12PM -0700, Toshi Kani wrote: > > Please make it a "real" pointer, and not a void *, those shouldn't be > > used at all if possible. > > How about changing the "void *handle" to acpi_dev_node below? > > struct acpi_dev_node acpi_node; > > Basically, it has the same challenge as struct device, which uses > acpi_dev_node as well. We can add other FW node when needed (just like > device also has *of_node). That sounds good to me. -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, 2013-01-31 at 05:24 +0000, Greg KH wrote: > On Wed, Jan 30, 2013 at 06:15:12PM -0700, Toshi Kani wrote: > > > Please make it a "real" pointer, and not a void *, those shouldn't be > > > used at all if possible. > > > > How about changing the "void *handle" to acpi_dev_node below? > > > > struct acpi_dev_node acpi_node; > > > > Basically, it has the same challenge as struct device, which uses > > acpi_dev_node as well. We can add other FW node when needed (just like > > device also has *of_node). > > That sounds good to me. Great! Thanks Greg, -Toshi -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wednesday, January 30, 2013 07:57:45 PM Toshi Kani wrote: > On Tue, 2013-01-29 at 23:58 -0500, Greg KH wrote: > > On Thu, Jan 10, 2013 at 04:40:19PM -0700, Toshi Kani wrote: > > > +/* > > > + * Hot-plug device information > > > + */ > > > > Again, stop it with the "generic" hotplug term here, and everywhere > > else. You are doing a very _specific_ type of hotplug devices, so spell > > it out. We've worked hard to hotplug _everything_ in Linux, you are > > going to confuse a lot of people with this type of terms. > > Agreed. I will clarify in all places. > > > > +union shp_dev_info { > > > + struct shp_cpu { > > > + u32 cpu_id; > > > + } cpu; > > > > What is this? Why not point to the system device for the cpu? > > This info is used to on-line a new CPU and create its system/cpu device. > In other word, a system/cpu device is created as a result of CPU > hotplug. > > > > + struct shp_memory { > > > + int node; > > > + u64 start_addr; > > > + u64 length; > > > + } mem; > > > > Same here, why not point to the system device? > > Same as above. > > > > + struct shp_hostbridge { > > > + } hb; > > > + > > > + struct shp_node { > > > + } node; > > > > What happened here with these? Empty structures? Huh? > > They are place holders for now. PCI bridge hot-plug and node hot-plug > are still very much work in progress, so I have not integrated them into > this framework yet. > > > > +}; > > > + > > > +struct shp_device { > > > + struct list_head list; > > > + struct device *device; > > > > No, make it a "real" device, embed the device into it. > > This device pointer is used to send KOBJ_ONLINE/OFFLINE event during CPU > online/offline operation in order to maintain the current behavior. CPU > online/offline operation only changes the state of CPU, so its > system/cpu device continues to be present before and after an operation. > (Whereas, CPU hot-add/delete operation creates or removes a system/cpu > device.) So, this "*device" needs to be a pointer to reference an > existing device that is to be on-lined/off-lined. > > > But, again, I'm going to ask why you aren't using the existing cpu / > > memory / bridge / node devices that we have in the kernel. Please use > > them, or give me a _really_ good reason why they will not work. > > We cannot use the existing system devices or ACPI devices here. During > hot-plug, ACPI handler sets this shp_device info, so that cpu and memory > handlers (drivers/cpu.c and mm/memory_hotplug.c) can obtain their target > device information in a platform-neutral way. During hot-add, we first > creates an ACPI device node (i.e. device under /sys/bus/acpi/devices), > but platform-neutral modules cannot use them as they are ACPI-specific. But suppose we're smart and have ACPI scan handlers that will create "physical" device nodes for those devices during the ACPI namespace scan. Then, the platform-neutral nodes will be able to bind to those "physical" nodes. Moreover, it should be possible to get a hierarchy of device objects this way that will reflect all of the dependencies we need to take into account during hot-add and hot-remove operations. That may not be what we have today, but I don't see any *fundamental* obstacles preventing us from using this approach. This is already done for PCI host bridges and platform devices and I don't see why we can't do that for the other types of devices too. The only missing piece I see is a way to handle the "eject" problem, i.e. 
when we try do eject a device at the top of a subtree and need to tear down the entire subtree below it, but if that's going to lead to a system crash, for example, we want to cancel the eject. It seems to me that we'll need some help from the driver core here. Thanks, Rafael
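For context, the approach sketched in this reply maps onto the ACPI scan-handler interface that was being introduced around the same time. A hedged example for a container object follows; the struct acpi_scan_handler layout and acpi_scan_add_handler() call are assumptions based on that then-new interface, the handler names are made up, and the attach/detach bodies are placeholders:

    static const struct acpi_device_id container_ids[] = {
            {"ACPI0004", 0},    /* module device */
            {"PNP0A05", 0},     /* generic container */
            {"PNP0A06", 0},
            {"", 0},
    };

    static int container_attach(struct acpi_device *adev,
                                const struct acpi_device_id *id)
    {
            /* create and bind the "physical" platform-neutral device node */
            return 1;           /* claimed */
    }

    static void container_detach(struct acpi_device *adev)
    {
            /* tear down the platform-neutral node; eject policy goes here */
    }

    static struct acpi_scan_handler container_scan_handler = {
            .ids    = container_ids,
            .attach = container_attach,
            .detach = container_detach,
    };

    /* registered once at boot, e.g. acpi_scan_add_handler(&container_scan_handler); */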
On Thu, 2013-01-31 at 21:54 +0100, Rafael J. Wysocki wrote: > On Wednesday, January 30, 2013 07:57:45 PM Toshi Kani wrote: > > On Tue, 2013-01-29 at 23:58 -0500, Greg KH wrote: > > > On Thu, Jan 10, 2013 at 04:40:19PM -0700, Toshi Kani wrote: : > > > > +}; > > > > + > > > > +struct shp_device { > > > > + struct list_head list; > > > > + struct device *device; > > > > > > No, make it a "real" device, embed the device into it. > > > > This device pointer is used to send KOBJ_ONLINE/OFFLINE event during CPU > > online/offline operation in order to maintain the current behavior. CPU > > online/offline operation only changes the state of CPU, so its > > system/cpu device continues to be present before and after an operation. > > (Whereas, CPU hot-add/delete operation creates or removes a system/cpu > > device.) So, this "*device" needs to be a pointer to reference an > > existing device that is to be on-lined/off-lined. > > > > > But, again, I'm going to ask why you aren't using the existing cpu / > > > memory / bridge / node devices that we have in the kernel. Please use > > > them, or give me a _really_ good reason why they will not work. > > > > We cannot use the existing system devices or ACPI devices here. During > > hot-plug, ACPI handler sets this shp_device info, so that cpu and memory > > handlers (drivers/cpu.c and mm/memory_hotplug.c) can obtain their target > > device information in a platform-neutral way. During hot-add, we first > > creates an ACPI device node (i.e. device under /sys/bus/acpi/devices), > > but platform-neutral modules cannot use them as they are ACPI-specific. > > But suppose we're smart and have ACPI scan handlers that will create > "physical" device nodes for those devices during the ACPI namespace scan. > Then, the platform-neutral nodes will be able to bind to those "physical" > nodes. Moreover, it should be possible to get a hierarchy of device objects > this way that will reflect all of the dependencies we need to take into > account during hot-add and hot-remove operations. That may not be what we > have today, but I don't see any *fundamental* obstacles preventing us from > using this approach. I misstated in my previous email. system/cpu device is actually created by ACPI driver during ACPI scan in case of hot-add. This is done by acpi_processor_hotadd_init(), which I consider as a hack but can be done. system/memory device is created in add_memory() by the mm module. > This is already done for PCI host bridges and platform devices and I don't > see why we can't do that for the other types of devices too. > > The only missing piece I see is a way to handle the "eject" problem, i.e. > when we try do eject a device at the top of a subtree and need to tear down > the entire subtree below it, but if that's going to lead to a system crash, > for example, we want to cancel the eject. It seems to me that we'll need some > help from the driver core here. There are three different approaches suggested for system device hot-plug: A. Proceed within system device bus scan. B. Proceed within ACPI bus scan. C. Proceed with a sequence (as a mini-boot). Option A uses system devices as tokens, option B uses acpi devices as tokens, and option C uses resource tables as tokens, for their handlers. Here is summary of key questions & answers so far. I hope this clarifies why I am suggesting option 3. 1. What are the system devices? System devices provide system-wide core computing resources, which are essential to compose a computer system. 
System devices are not connected to any particular standard buses. 2. Why are the system devices special? The system devices are initialized during early boot-time, by multiple subsystems, from the boot-up sequence, in pre-defined order. They provide low-level services to enable other subsystems to come up. 3. Why can't initialize the system devices from the driver structure at boot? The driver structure is initialized at the end of the boot sequence and requires the low-level services from the system devices initialized beforehand. 4. Why do we need a new common framework? Sysfs CPU and memory on-lining/off-lining are performed within the CPU and memory modules. They are common code and do not depend on ACPI. Therefore, a new common framework is necessary to integrate both on-lining/off-lining operation and hot-plugging operation of system devices into a single framework. 5. Why can't do everything with ACPI bus scan? Software dependency among system devices may not be dictated by the ACPI hierarchy. For instance, memory should be initialized before CPUs (i.e. a new cpu may need its local memory), but such ordering cannot be guaranteed by the ACPI hierarchy. Also, as described in 4, online/offline operations are independent from ACPI. Thanks, -Toshi -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Jan 31, 2013 at 09:54:51PM +0100, Rafael J. Wysocki wrote: > > > But, again, I'm going to ask why you aren't using the existing cpu / > > > memory / bridge / node devices that we have in the kernel. Please use > > > them, or give me a _really_ good reason why they will not work. > > > > We cannot use the existing system devices or ACPI devices here. During > > hot-plug, ACPI handler sets this shp_device info, so that cpu and memory > > handlers (drivers/cpu.c and mm/memory_hotplug.c) can obtain their target > > device information in a platform-neutral way. During hot-add, we first > > creates an ACPI device node (i.e. device under /sys/bus/acpi/devices), > > but platform-neutral modules cannot use them as they are ACPI-specific. > > But suppose we're smart and have ACPI scan handlers that will create > "physical" device nodes for those devices during the ACPI namespace scan. > Then, the platform-neutral nodes will be able to bind to those "physical" > nodes. Moreover, it should be possible to get a hierarchy of device objects > this way that will reflect all of the dependencies we need to take into > account during hot-add and hot-remove operations. That may not be what we > have today, but I don't see any *fundamental* obstacles preventing us from > using this approach. I would _much_ rather see that be the solution here as I think it is the proper one. > This is already done for PCI host bridges and platform devices and I don't > see why we can't do that for the other types of devices too. I agree. > The only missing piece I see is a way to handle the "eject" problem, i.e. > when we try do eject a device at the top of a subtree and need to tear down > the entire subtree below it, but if that's going to lead to a system crash, > for example, we want to cancel the eject. It seems to me that we'll need some > help from the driver core here. I say do what we always have done here, if the user asked us to tear something down, let it happen as they are the ones that know best :) Seriously, I guess this gets back to the "fail disconnect" idea that the ACPI developers keep harping on. I thought we already resolved this properly by having them implement it in their bus code, no reason the same thing couldn't happen here, right? I don't think the core needs to do anything special, but if so, I'll be glad to review it. thanks, gre k-h -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Jan 31, 2013 at 06:32:18PM -0700, Toshi Kani wrote: > This is already done for PCI host bridges and platform devices and I don't > > see why we can't do that for the other types of devices too. > > > > The only missing piece I see is a way to handle the "eject" problem, i.e. > > when we try do eject a device at the top of a subtree and need to tear down > > the entire subtree below it, but if that's going to lead to a system crash, > > for example, we want to cancel the eject. It seems to me that we'll need some > > help from the driver core here. > > There are three different approaches suggested for system device > hot-plug: > A. Proceed within system device bus scan. > B. Proceed within ACPI bus scan. > C. Proceed with a sequence (as a mini-boot). > > Option A uses system devices as tokens, option B uses acpi devices as > tokens, and option C uses resource tables as tokens, for their handlers. > > Here is summary of key questions & answers so far. I hope this > clarifies why I am suggesting option 3. > > 1. What are the system devices? > System devices provide system-wide core computing resources, which are > essential to compose a computer system. System devices are not > connected to any particular standard buses. Not a problem, lots of devices are not connected to any "particular standard busses". All this means is that system devices are connected to the "system" bus, nothing more. > 2. Why are the system devices special? > The system devices are initialized during early boot-time, by multiple > subsystems, from the boot-up sequence, in pre-defined order. They > provide low-level services to enable other subsystems to come up. Sorry, no, that doesn't mean they are special, nothing here is unique for the point of view of the driver model from any other device or bus. > 3. Why can't initialize the system devices from the driver structure at > boot? > The driver structure is initialized at the end of the boot sequence and > requires the low-level services from the system devices initialized > beforehand. Wait, what "driver structure"? If you need to initialize the driver core earlier, then do so. Or, even better, just wait until enough of the system has come up and then go initialize all of the devices you have found so far as part of your boot process. None of the above things you have stated seem to have anything to do with your proposed patch, so I don't understand why you have mentioned them... > 4. Why do we need a new common framework? > Sysfs CPU and memory on-lining/off-lining are performed within the CPU > and memory modules. They are common code and do not depend on ACPI. > Therefore, a new common framework is necessary to integrate both > on-lining/off-lining operation and hot-plugging operation of system > devices into a single framework. {sigh} Removing and adding devices and handling hotplug operations is what the driver core was written for, almost 10 years ago. To somehow think that your devices are "special" just because they don't use ACPI is odd, because the driver core itself has nothing to do with ACPI. Don't get the current mix of x86 system code tied into ACPI confused with an driver core issues here please. > 5. Why can't do everything with ACPI bus scan? > Software dependency among system devices may not be dictated by the ACPI > hierarchy. For instance, memory should be initialized before CPUs (i.e. > a new cpu may need its local memory), but such ordering cannot be > guaranteed by the ACPI hierarchy. 
Also, as described in 4, > online/offline operations are independent from ACPI. That's fine, the driver core is independant from ACPI. I don't care how you do the scaning of your devices, but I do care about you creating new driver core pieces that duplicate the existing functionality of what we have today. In short, I like Rafael's proposal better, and I fail to see how anything you have stated here would matter in how this is implemented. :) thanks, greg k-h -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, 2013-02-01 at 07:30 +0000, Greg KH wrote: > On Thu, Jan 31, 2013 at 06:32:18PM -0700, Toshi Kani wrote: > > This is already done for PCI host bridges and platform devices and I don't > > > see why we can't do that for the other types of devices too. > > > > > > The only missing piece I see is a way to handle the "eject" problem, i.e. > > > when we try do eject a device at the top of a subtree and need to tear down > > > the entire subtree below it, but if that's going to lead to a system crash, > > > for example, we want to cancel the eject. It seems to me that we'll need some > > > help from the driver core here. > > > > There are three different approaches suggested for system device > > hot-plug: > > A. Proceed within system device bus scan. > > B. Proceed within ACPI bus scan. > > C. Proceed with a sequence (as a mini-boot). > > > > Option A uses system devices as tokens, option B uses acpi devices as > > tokens, and option C uses resource tables as tokens, for their handlers. > > > > Here is summary of key questions & answers so far. I hope this > > clarifies why I am suggesting option 3. > > > > 1. What are the system devices? > > System devices provide system-wide core computing resources, which are > > essential to compose a computer system. System devices are not > > connected to any particular standard buses. > > Not a problem, lots of devices are not connected to any "particular > standard busses". All this means is that system devices are connected > to the "system" bus, nothing more. Can you give me a few examples of other devices that support hotplug and are not connected to any particular buses? I will investigate them to see how they are managed to support hotplug. > > 2. Why are the system devices special? > > The system devices are initialized during early boot-time, by multiple > > subsystems, from the boot-up sequence, in pre-defined order. They > > provide low-level services to enable other subsystems to come up. > > Sorry, no, that doesn't mean they are special, nothing here is unique > for the point of view of the driver model from any other device or bus. I think system devices are unique in a sense that they are initialized before drivers run. > > 3. Why can't initialize the system devices from the driver structure at > > boot? > > The driver structure is initialized at the end of the boot sequence and > > requires the low-level services from the system devices initialized > > beforehand. > > Wait, what "driver structure"? Sorry it was not clear. cpu_dev_init() and memory_dev_init() are called from driver_init() at the end of the boot sequence, and initialize system/cpu and system/memory devices. I assume they are the system bus you are referring with option A. > If you need to initialize the driver > core earlier, then do so. Or, even better, just wait until enough of > the system has come up and then go initialize all of the devices you > have found so far as part of your boot process. They are pseudo drivers that provide sysfs entry points of cpu and memory. They do not actually initialize cpu and memory. I do not think initializing cpu and memory fits into the driver model either, since drivers should run after cpu and memory are initialized. > None of the above things you have stated seem to have anything to do > with your proposed patch, so I don't understand why you have mentioned > them... You suggested option A before, which uses system bus scan to initialize all system devices at boot time as well as hot-plug. 
I tried to say that this option would not be doable. > > 4. Why do we need a new common framework? > > Sysfs CPU and memory on-lining/off-lining are performed within the CPU > > and memory modules. They are common code and do not depend on ACPI. > > Therefore, a new common framework is necessary to integrate both > > on-lining/off-lining operation and hot-plugging operation of system > > devices into a single framework. > > {sigh} > > Removing and adding devices and handling hotplug operations is what the > driver core was written for, almost 10 years ago. To somehow think that > your devices are "special" just because they don't use ACPI is odd, > because the driver core itself has nothing to do with ACPI. Don't get > the current mix of x86 system code tied into ACPI confused with an > driver core issues here please. CPU online/offline operation is performed within the CPU module. Memory online/offline operation is performed within the memory module. CPU and memory hotplug operations are performed within ACPI. While they deal with the same set of devices, they operate independently and are not managed under a same framework. I agree with you that not using ACPI is perfectly fine. My point is that ACPI framework won't be able to manage operations that do not use ACPI. > > 5. Why can't do everything with ACPI bus scan? > > Software dependency among system devices may not be dictated by the ACPI > > hierarchy. For instance, memory should be initialized before CPUs (i.e. > > a new cpu may need its local memory), but such ordering cannot be > > guaranteed by the ACPI hierarchy. Also, as described in 4, > > online/offline operations are independent from ACPI. > > That's fine, the driver core is independant from ACPI. I don't care how > you do the scaning of your devices, but I do care about you creating new > driver core pieces that duplicate the existing functionality of what we > have today. > > In short, I like Rafael's proposal better, and I fail to see how > anything you have stated here would matter in how this is implemented. :) Doing everything within ACPI means we can only manage ACPI hotplug operations, not online/offline operations. But I understand that you concern about adding a new framework with option C. It is good to know that you are fine with option B. :) So, I will step back, and think about what we can do within ACPI. Thanks, -Toshi -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Friday, February 01, 2013 08:23:12 AM Greg KH wrote: > On Thu, Jan 31, 2013 at 09:54:51PM +0100, Rafael J. Wysocki wrote: > > > > But, again, I'm going to ask why you aren't using the existing cpu / > > > > memory / bridge / node devices that we have in the kernel. Please use > > > > them, or give me a _really_ good reason why they will not work. > > > > > > We cannot use the existing system devices or ACPI devices here. During > > > hot-plug, ACPI handler sets this shp_device info, so that cpu and memory > > > handlers (drivers/cpu.c and mm/memory_hotplug.c) can obtain their target > > > device information in a platform-neutral way. During hot-add, we first > > > creates an ACPI device node (i.e. device under /sys/bus/acpi/devices), > > > but platform-neutral modules cannot use them as they are ACPI-specific. > > > > But suppose we're smart and have ACPI scan handlers that will create > > "physical" device nodes for those devices during the ACPI namespace scan. > > Then, the platform-neutral nodes will be able to bind to those "physical" > > nodes. Moreover, it should be possible to get a hierarchy of device objects > > this way that will reflect all of the dependencies we need to take into > > account during hot-add and hot-remove operations. That may not be what we > > have today, but I don't see any *fundamental* obstacles preventing us from > > using this approach. > > I would _much_ rather see that be the solution here as I think it is the > proper one. > > > This is already done for PCI host bridges and platform devices and I don't > > see why we can't do that for the other types of devices too. > > I agree. > > > The only missing piece I see is a way to handle the "eject" problem, i.e. > > when we try do eject a device at the top of a subtree and need to tear down > > the entire subtree below it, but if that's going to lead to a system crash, > > for example, we want to cancel the eject. It seems to me that we'll need some > > help from the driver core here. > > I say do what we always have done here, if the user asked us to tear > something down, let it happen as they are the ones that know best :) > > Seriously, I guess this gets back to the "fail disconnect" idea that the > ACPI developers keep harping on. I thought we already resolved this > properly by having them implement it in their bus code, no reason the > same thing couldn't happen here, right? Not really. :-) We haven't ever resolved that particular issue I'm afraid. > I don't think the core needs to do anything special, but if so, I'll be glad > to review it. OK, so this is the use case. We have "eject" defined for something like a container with a number of CPU cores, PCI host bridge, and a memory controller under it. And a few pretty much arbitrary I/O devices as a bonus. Now, there's a button on the system case labeled as "Eject" and if that button is pressed, we're supposed to _try_ to eject all of those things at once. We are allowed to fail that request, though, if that's problematic for some reason, but we're supposed to let the BIOS know about that. Do you seriously think that if that button is pressed, we should just proceed with removing all that stuff no matter what? That'd be kind of like Russian roulette for whoever pressed that button, because s/he could only press it and wait for the system to either crash or not. Or maybe to crash a bit later because of some delayed stuff that would hit one of those devices that had just gone. 
Surely not a situation any admin of a high-availability system would like to be in. :-) Quite frankly, I have no idea how that can be addressed in a single bus type, let alone ACPI (which is not even a proper bus type, just something pretending to be one). Thanks, Rafael
On Friday, February 01, 2013 01:40:10 PM Toshi Kani wrote: > On Fri, 2013-02-01 at 07:30 +0000, Greg KH wrote: > > On Thu, Jan 31, 2013 at 06:32:18PM -0700, Toshi Kani wrote: > > > This is already done for PCI host bridges and platform devices and I don't > > > > see why we can't do that for the other types of devices too. > > > > > > > > The only missing piece I see is a way to handle the "eject" problem, i.e. > > > > when we try do eject a device at the top of a subtree and need to tear down > > > > the entire subtree below it, but if that's going to lead to a system crash, > > > > for example, we want to cancel the eject. It seems to me that we'll need some > > > > help from the driver core here. > > > > > > There are three different approaches suggested for system device > > > hot-plug: > > > A. Proceed within system device bus scan. > > > B. Proceed within ACPI bus scan. > > > C. Proceed with a sequence (as a mini-boot). > > > > > > Option A uses system devices as tokens, option B uses acpi devices as > > > tokens, and option C uses resource tables as tokens, for their handlers. > > > > > > Here is summary of key questions & answers so far. I hope this > > > clarifies why I am suggesting option 3. > > > > > > 1. What are the system devices? > > > System devices provide system-wide core computing resources, which are > > > essential to compose a computer system. System devices are not > > > connected to any particular standard buses. > > > > Not a problem, lots of devices are not connected to any "particular > > standard busses". All this means is that system devices are connected > > to the "system" bus, nothing more. > > Can you give me a few examples of other devices that support hotplug and > are not connected to any particular buses? I will investigate them to > see how they are managed to support hotplug. > > > > 2. Why are the system devices special? > > > The system devices are initialized during early boot-time, by multiple > > > subsystems, from the boot-up sequence, in pre-defined order. They > > > provide low-level services to enable other subsystems to come up. > > > > Sorry, no, that doesn't mean they are special, nothing here is unique > > for the point of view of the driver model from any other device or bus. > > I think system devices are unique in a sense that they are initialized > before drivers run. > > > > 3. Why can't initialize the system devices from the driver structure at > > > boot? > > > The driver structure is initialized at the end of the boot sequence and > > > requires the low-level services from the system devices initialized > > > beforehand. > > > > Wait, what "driver structure"? > > Sorry it was not clear. cpu_dev_init() and memory_dev_init() are called > from driver_init() at the end of the boot sequence, and initialize > system/cpu and system/memory devices. I assume they are the system bus > you are referring with option A. > > > If you need to initialize the driver > > core earlier, then do so. Or, even better, just wait until enough of > > the system has come up and then go initialize all of the devices you > > have found so far as part of your boot process. > > They are pseudo drivers that provide sysfs entry points of cpu and > memory. They do not actually initialize cpu and memory. I do not think > initializing cpu and memory fits into the driver model either, since > drivers should run after cpu and memory are initialized. 
> > > None of the above things you have stated seem to have anything to do > > with your proposed patch, so I don't understand why you have mentioned > > them... > > You suggested option A before, which uses system bus scan to initialize > all system devices at boot time as well as hot-plug. I tried to say > that this option would not be doable. > > > > 4. Why do we need a new common framework? > > > Sysfs CPU and memory on-lining/off-lining are performed within the CPU > > > and memory modules. They are common code and do not depend on ACPI. > > > Therefore, a new common framework is necessary to integrate both > > > on-lining/off-lining operation and hot-plugging operation of system > > > devices into a single framework. > > > > {sigh} > > > > Removing and adding devices and handling hotplug operations is what the > > driver core was written for, almost 10 years ago. To somehow think that > > your devices are "special" just because they don't use ACPI is odd, > > because the driver core itself has nothing to do with ACPI. Don't get > > the current mix of x86 system code tied into ACPI confused with an > > driver core issues here please. > > CPU online/offline operation is performed within the CPU module. Memory > online/offline operation is performed within the memory module. CPU and > memory hotplug operations are performed within ACPI. While they deal > with the same set of devices, they operate independently and are not > managed under a same framework. > > I agree with you that not using ACPI is perfectly fine. My point is > that ACPI framework won't be able to manage operations that do not use > ACPI. > > > > 5. Why can't do everything with ACPI bus scan? > > > Software dependency among system devices may not be dictated by the ACPI > > > hierarchy. For instance, memory should be initialized before CPUs (i.e. > > > a new cpu may need its local memory), but such ordering cannot be > > > guaranteed by the ACPI hierarchy. Also, as described in 4, > > > online/offline operations are independent from ACPI. > > > > That's fine, the driver core is independant from ACPI. I don't care how > > you do the scaning of your devices, but I do care about you creating new > > driver core pieces that duplicate the existing functionality of what we > > have today. > > > > In short, I like Rafael's proposal better, and I fail to see how > > anything you have stated here would matter in how this is implemented. :) > > Doing everything within ACPI means we can only manage ACPI hotplug > operations, not online/offline operations. But I understand that you > concern about adding a new framework with option C. It is good to know > that you are fine with option B. :) So, I will step back, and think > about what we can do within ACPI. Not much, because ACPI only knows about a subset of devices that may be involved in that, and a limited one for that matter. For one example, anything connected through PCI and not having a corresponding ACPI object (i.e. pretty much every add-in card in existence) will be unknown to ACPI. And say one of these things is a SATA controller with a number of disks under it and so on. ACPI won't even know that it exists. Moreover, PCI won't know that those disks exist. Etc. Thanks, Rafael
On Fri, 2013-02-01 at 23:21 +0100, Rafael J. Wysocki wrote: > On Friday, February 01, 2013 01:40:10 PM Toshi Kani wrote: > > On Fri, 2013-02-01 at 07:30 +0000, Greg KH wrote: > > > On Thu, Jan 31, 2013 at 06:32:18PM -0700, Toshi Kani wrote: > > > > This is already done for PCI host bridges and platform devices and I don't > > > > > see why we can't do that for the other types of devices too. > > > > > > > > > > The only missing piece I see is a way to handle the "eject" problem, i.e. > > > > > when we try do eject a device at the top of a subtree and need to tear down > > > > > the entire subtree below it, but if that's going to lead to a system crash, > > > > > for example, we want to cancel the eject. It seems to me that we'll need some > > > > > help from the driver core here. > > > > > > > > There are three different approaches suggested for system device > > > > hot-plug: > > > > A. Proceed within system device bus scan. > > > > B. Proceed within ACPI bus scan. > > > > C. Proceed with a sequence (as a mini-boot). > > > > > > > > Option A uses system devices as tokens, option B uses acpi devices as > > > > tokens, and option C uses resource tables as tokens, for their handlers. > > > > > > > > Here is summary of key questions & answers so far. I hope this > > > > clarifies why I am suggesting option 3. > > > > > > > > 1. What are the system devices? > > > > System devices provide system-wide core computing resources, which are > > > > essential to compose a computer system. System devices are not > > > > connected to any particular standard buses. > > > > > > Not a problem, lots of devices are not connected to any "particular > > > standard busses". All this means is that system devices are connected > > > to the "system" bus, nothing more. > > > > Can you give me a few examples of other devices that support hotplug and > > are not connected to any particular buses? I will investigate them to > > see how they are managed to support hotplug. > > > > > > 2. Why are the system devices special? > > > > The system devices are initialized during early boot-time, by multiple > > > > subsystems, from the boot-up sequence, in pre-defined order. They > > > > provide low-level services to enable other subsystems to come up. > > > > > > Sorry, no, that doesn't mean they are special, nothing here is unique > > > for the point of view of the driver model from any other device or bus. > > > > I think system devices are unique in a sense that they are initialized > > before drivers run. > > > > > > 3. Why can't initialize the system devices from the driver structure at > > > > boot? > > > > The driver structure is initialized at the end of the boot sequence and > > > > requires the low-level services from the system devices initialized > > > > beforehand. > > > > > > Wait, what "driver structure"? > > > > Sorry it was not clear. cpu_dev_init() and memory_dev_init() are called > > from driver_init() at the end of the boot sequence, and initialize > > system/cpu and system/memory devices. I assume they are the system bus > > you are referring with option A. > > > > > If you need to initialize the driver > > > core earlier, then do so. Or, even better, just wait until enough of > > > the system has come up and then go initialize all of the devices you > > > have found so far as part of your boot process. > > > > They are pseudo drivers that provide sysfs entry points of cpu and > > memory. They do not actually initialize cpu and memory. 
I do not think > > initializing cpu and memory fits into the driver model either, since > > drivers should run after cpu and memory are initialized. > > > > > None of the above things you have stated seem to have anything to do > > > with your proposed patch, so I don't understand why you have mentioned > > > them... > > > > You suggested option A before, which uses system bus scan to initialize > > all system devices at boot time as well as hot-plug. I tried to say > > that this option would not be doable. > > > > > > 4. Why do we need a new common framework? > > > > Sysfs CPU and memory on-lining/off-lining are performed within the CPU > > > > and memory modules. They are common code and do not depend on ACPI. > > > > Therefore, a new common framework is necessary to integrate both > > > > on-lining/off-lining operation and hot-plugging operation of system > > > > devices into a single framework. > > > > > > {sigh} > > > > > > Removing and adding devices and handling hotplug operations is what the > > > driver core was written for, almost 10 years ago. To somehow think that > > > your devices are "special" just because they don't use ACPI is odd, > > > because the driver core itself has nothing to do with ACPI. Don't get > > > the current mix of x86 system code tied into ACPI confused with an > > > driver core issues here please. > > > > CPU online/offline operation is performed within the CPU module. Memory > > online/offline operation is performed within the memory module. CPU and > > memory hotplug operations are performed within ACPI. While they deal > > with the same set of devices, they operate independently and are not > > managed under a same framework. > > > > I agree with you that not using ACPI is perfectly fine. My point is > > that ACPI framework won't be able to manage operations that do not use > > ACPI. > > > > > > 5. Why can't do everything with ACPI bus scan? > > > > Software dependency among system devices may not be dictated by the ACPI > > > > hierarchy. For instance, memory should be initialized before CPUs (i.e. > > > > a new cpu may need its local memory), but such ordering cannot be > > > > guaranteed by the ACPI hierarchy. Also, as described in 4, > > > > online/offline operations are independent from ACPI. > > > > > > That's fine, the driver core is independant from ACPI. I don't care how > > > you do the scaning of your devices, but I do care about you creating new > > > driver core pieces that duplicate the existing functionality of what we > > > have today. > > > > > > In short, I like Rafael's proposal better, and I fail to see how > > > anything you have stated here would matter in how this is implemented. :) > > > > Doing everything within ACPI means we can only manage ACPI hotplug > > operations, not online/offline operations. But I understand that you > > concern about adding a new framework with option C. It is good to know > > that you are fine with option B. :) So, I will step back, and think > > about what we can do within ACPI. > > Not much, because ACPI only knows about a subset of devices that may be > involved in that, and a limited one for that matter. For one example, > anything connected through PCI and not having a corresponding ACPI object (i.e. > pretty much every add-in card in existence) will be unknown to ACPI. And > say one of these things is a SATA controller with a number of disks under it > and so on. ACPI won't even know that it exists. Moreover, PCI won't know > that those disks exist. Etc. Agreed. 
Thanks for bringing I/Os into the picture. I did not mention them since they are not supported in this patchset, but we certainly need to take them into account in the design. Thanks, -Toshi -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Feb 01, 2013 at 11:12:59PM +0100, Rafael J. Wysocki wrote: > On Friday, February 01, 2013 08:23:12 AM Greg KH wrote: > > On Thu, Jan 31, 2013 at 09:54:51PM +0100, Rafael J. Wysocki wrote: > > > > > But, again, I'm going to ask why you aren't using the existing cpu / > > > > > memory / bridge / node devices that we have in the kernel. Please use > > > > > them, or give me a _really_ good reason why they will not work. > > > > > > > > We cannot use the existing system devices or ACPI devices here. During > > > > hot-plug, ACPI handler sets this shp_device info, so that cpu and memory > > > > handlers (drivers/cpu.c and mm/memory_hotplug.c) can obtain their target > > > > device information in a platform-neutral way. During hot-add, we first > > > > creates an ACPI device node (i.e. device under /sys/bus/acpi/devices), > > > > but platform-neutral modules cannot use them as they are ACPI-specific. > > > > > > But suppose we're smart and have ACPI scan handlers that will create > > > "physical" device nodes for those devices during the ACPI namespace scan. > > > Then, the platform-neutral nodes will be able to bind to those "physical" > > > nodes. Moreover, it should be possible to get a hierarchy of device objects > > > this way that will reflect all of the dependencies we need to take into > > > account during hot-add and hot-remove operations. That may not be what we > > > have today, but I don't see any *fundamental* obstacles preventing us from > > > using this approach. > > > > I would _much_ rather see that be the solution here as I think it is the > > proper one. > > > > > This is already done for PCI host bridges and platform devices and I don't > > > see why we can't do that for the other types of devices too. > > > > I agree. > > > > > The only missing piece I see is a way to handle the "eject" problem, i.e. > > > when we try do eject a device at the top of a subtree and need to tear down > > > the entire subtree below it, but if that's going to lead to a system crash, > > > for example, we want to cancel the eject. It seems to me that we'll need some > > > help from the driver core here. > > > > I say do what we always have done here, if the user asked us to tear > > something down, let it happen as they are the ones that know best :) > > > > Seriously, I guess this gets back to the "fail disconnect" idea that the > > ACPI developers keep harping on. I thought we already resolved this > > properly by having them implement it in their bus code, no reason the > > same thing couldn't happen here, right? > > Not really. :-) We haven't ever resolved that particular issue I'm afraid. Ah, I didn't realize that. > > I don't think the core needs to do anything special, but if so, I'll be glad > > to review it. > > OK, so this is the use case. We have "eject" defined for something like > a container with a number of CPU cores, PCI host bridge, and a memory > controller under it. And a few pretty much arbitrary I/O devices as a bonus. > > Now, there's a button on the system case labeled as "Eject" and if that button > is pressed, we're supposed to _try_ to eject all of those things at once. We > are allowed to fail that request, though, if that's problematic for some > reason, but we're supposed to let the BIOS know about that. > > Do you seriously think that if that button is pressed, we should just proceed > with removing all that stuff no matter what? 
That'd be kind of like Russian > roulette for whoever pressed that button, because s/he could only press it and > wait for the system to either crash or not. Or maybe to crash a bit later > because of some delayed stuff that would hit one of those devices that had just > gone. Surely not a situation any admin of a high-availability system would > like to be in. :-) > > Quite frankly, I have no idea how that can be addressed in a single bus type, > let alone ACPI (which is not even a proper bus type, just something pretending > to be one). You don't have it as a single bus type, you have a controller somewhere, off of the bus being destroyed, that handles sending remove events to the device and tearing everything down. PCI does this from the very beginning. I know it's more complicated with these types of devices, and I think we are getting closer to the correct solution, I just don't want to ever see duplicate devices in the driver model for the same physical device. thanks, greg k-h -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Feb 01, 2013 at 01:40:10PM -0700, Toshi Kani wrote: > On Fri, 2013-02-01 at 07:30 +0000, Greg KH wrote: > > On Thu, Jan 31, 2013 at 06:32:18PM -0700, Toshi Kani wrote: > > > This is already done for PCI host bridges and platform devices and I don't > > > > see why we can't do that for the other types of devices too. > > > > > > > > The only missing piece I see is a way to handle the "eject" problem, i.e. > > > > when we try do eject a device at the top of a subtree and need to tear down > > > > the entire subtree below it, but if that's going to lead to a system crash, > > > > for example, we want to cancel the eject. It seems to me that we'll need some > > > > help from the driver core here. > > > > > > There are three different approaches suggested for system device > > > hot-plug: > > > A. Proceed within system device bus scan. > > > B. Proceed within ACPI bus scan. > > > C. Proceed with a sequence (as a mini-boot). > > > > > > Option A uses system devices as tokens, option B uses acpi devices as > > > tokens, and option C uses resource tables as tokens, for their handlers. > > > > > > Here is summary of key questions & answers so far. I hope this > > > clarifies why I am suggesting option 3. > > > > > > 1. What are the system devices? > > > System devices provide system-wide core computing resources, which are > > > essential to compose a computer system. System devices are not > > > connected to any particular standard buses. > > > > Not a problem, lots of devices are not connected to any "particular > > standard busses". All this means is that system devices are connected > > to the "system" bus, nothing more. > > Can you give me a few examples of other devices that support hotplug and > are not connected to any particular buses? I will investigate them to > see how they are managed to support hotplug. Any device that is attached to any bus in the driver model can be hotunplugged from userspace by telling it to be "unbound" from the driver controlling it. Try it for any platform device in your system to see how it happens. > > > 2. Why are the system devices special? > > > The system devices are initialized during early boot-time, by multiple > > > subsystems, from the boot-up sequence, in pre-defined order. They > > > provide low-level services to enable other subsystems to come up. > > > > Sorry, no, that doesn't mean they are special, nothing here is unique > > for the point of view of the driver model from any other device or bus. > > I think system devices are unique in a sense that they are initialized > before drivers run. No, most all devices are "initialized" before a driver runs on it, USB is one such example, PCI another, and I'm pretty sure that there are others. > > If you need to initialize the driver > > core earlier, then do so. Or, even better, just wait until enough of > > the system has come up and then go initialize all of the devices you > > have found so far as part of your boot process. > > They are pseudo drivers that provide sysfs entry points of cpu and > memory. They do not actually initialize cpu and memory. I do not think > initializing cpu and memory fits into the driver model either, since > drivers should run after cpu and memory are initialized. We already represent CPUs in the sysfs tree, don't represent them in two different places with two different structures. Use the existing ones please. 
> > None of the above things you have stated seem to have anything to do > > with your proposed patch, so I don't understand why you have mentioned > > them... > > You suggested option A before, which uses system bus scan to initialize > all system devices at boot time as well as hot-plug. I tried to say > that this option would not be doable. I haven't yet been convinced otherwise, sorry. Please prove me wrong :) thanks, greg k-h -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
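[For reference, the "unbind" interface Greg points at above is the per-driver sysfs attribute; writing the name of a bound device to it makes the driver let go of that device while the device itself stays registered. A minimal userspace sketch, assuming placeholder names -- DRIVER and DEVICE must be replaced with a real driver directory under /sys/bus/platform/drivers/ and a device currently bound to it:

#include <stdio.h>

int main(void)
{
	/* Placeholders: pick any driver listed under
	 * /sys/bus/platform/drivers/ and a device name shown inside it. */
	FILE *f = fopen("/sys/bus/platform/drivers/DRIVER/unbind", "w");

	if (!f)
		return 1;
	fprintf(f, "DEVICE\n");
	return fclose(f) ? 1 : 0;
}

The same write can of course be done from a shell with echo; the point is that this only detaches the driver -- the device node remains in sysfs, which is the distinction Toshi draws in his reply below.]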
On Saturday, February 02, 2013 03:58:01 PM Greg KH wrote: > On Fri, Feb 01, 2013 at 11:12:59PM +0100, Rafael J. Wysocki wrote: > > On Friday, February 01, 2013 08:23:12 AM Greg KH wrote: > > > On Thu, Jan 31, 2013 at 09:54:51PM +0100, Rafael J. Wysocki wrote: > > > > > > But, again, I'm going to ask why you aren't using the existing cpu / > > > > > > memory / bridge / node devices that we have in the kernel. Please use > > > > > > them, or give me a _really_ good reason why they will not work. > > > > > > > > > > We cannot use the existing system devices or ACPI devices here. During > > > > > hot-plug, ACPI handler sets this shp_device info, so that cpu and memory > > > > > handlers (drivers/cpu.c and mm/memory_hotplug.c) can obtain their target > > > > > device information in a platform-neutral way. During hot-add, we first > > > > > creates an ACPI device node (i.e. device under /sys/bus/acpi/devices), > > > > > but platform-neutral modules cannot use them as they are ACPI-specific. > > > > > > > > But suppose we're smart and have ACPI scan handlers that will create > > > > "physical" device nodes for those devices during the ACPI namespace scan. > > > > Then, the platform-neutral nodes will be able to bind to those "physical" > > > > nodes. Moreover, it should be possible to get a hierarchy of device objects > > > > this way that will reflect all of the dependencies we need to take into > > > > account during hot-add and hot-remove operations. That may not be what we > > > > have today, but I don't see any *fundamental* obstacles preventing us from > > > > using this approach. > > > > > > I would _much_ rather see that be the solution here as I think it is the > > > proper one. > > > > > > > This is already done for PCI host bridges and platform devices and I don't > > > > see why we can't do that for the other types of devices too. > > > > > > I agree. > > > > > > > The only missing piece I see is a way to handle the "eject" problem, i.e. > > > > when we try do eject a device at the top of a subtree and need to tear down > > > > the entire subtree below it, but if that's going to lead to a system crash, > > > > for example, we want to cancel the eject. It seems to me that we'll need some > > > > help from the driver core here. > > > > > > I say do what we always have done here, if the user asked us to tear > > > something down, let it happen as they are the ones that know best :) > > > > > > Seriously, I guess this gets back to the "fail disconnect" idea that the > > > ACPI developers keep harping on. I thought we already resolved this > > > properly by having them implement it in their bus code, no reason the > > > same thing couldn't happen here, right? > > > > Not really. :-) We haven't ever resolved that particular issue I'm afraid. > > Ah, I didn't realize that. > > > > I don't think the core needs to do anything special, but if so, I'll be glad > > > to review it. > > > > OK, so this is the use case. We have "eject" defined for something like > > a container with a number of CPU cores, PCI host bridge, and a memory > > controller under it. And a few pretty much arbitrary I/O devices as a bonus. > > > > Now, there's a button on the system case labeled as "Eject" and if that button > > is pressed, we're supposed to _try_ to eject all of those things at once. We > > are allowed to fail that request, though, if that's problematic for some > > reason, but we're supposed to let the BIOS know about that. 
> > > > Do you seriously think that if that button is pressed, we should just proceed > > with removing all that stuff no matter what? That'd be kind of like Russian > > roulette for whoever pressed that button, because s/he could only press it and > > wait for the system to either crash or not. Or maybe to crash a bit later > > because of some delayed stuff that would hit one of those devices that had just > > gone. Surely not a situation any admin of a high-availability system would > > like to be in. :-) > > > > Quite frankly, I have no idea how that can be addressed in a single bus type, > > let alone ACPI (which is not even a proper bus type, just something pretending > > to be one). > > You don't have it as a single bus type, you have a controller somewhere, > off of the bus being destroyed, that handles sending remove events to > the device and tearing everything down. PCI does this from the very > beginning. Yes, but those are just remove events and we can only see how destructive they were after the removal. The point is to be able to figure out whether or not we *want* to do the removal in the first place. Say you have a computing node which signals a hardware problem in a processor package (the container with CPU cores, memory, PCI host bridge etc.). You may want to eject that package, but you don't want to kill the system this way. So if the eject is doable, it is very much desirable to do it, but if it is not doable, you'd rather shut the box down and do the replacement afterward. That may be costly, however (maybe weeks of computations), so it should be avoided if possible, but not at the expense of crashing the box if the eject doesn't work out. > I know it's more complicated with these types of devices, and I think we > are getting closer to the correct solution, I just don't want to ever > see duplicate devices in the driver model for the same physical device. Do you mean two things based on struct device for the same hardware component? That's been happening already pretty much forever for every PCI device known to the ACPI layer, for PNP and many others. However, those ACPI things are (or rather should be, but we're going to clean that up) only for convenience (to be able to see the namespace structure and related things in sysfs). So the stuff under /sys/devices/LNXSYSTM\:00/ is not "real". In my view it shouldn't even be under /sys/devices/ (/sys/firmware/acpi/ seems to be a better place for it), but that may be difficult to change without breaking user space (maybe we can just symlink it from /sys/devices/ or something). And the ACPI bus type shouldn't even exist in my opinion. There's much confusion in there and much work to clean that up, I agree, but that's kind of separate from the hotplug thing. Thanks, Rafael
On Saturday, February 02, 2013 09:15:37 PM Rafael J. Wysocki wrote: > On Saturday, February 02, 2013 03:58:01 PM Greg KH wrote: > > On Fri, Feb 01, 2013 at 11:12:59PM +0100, Rafael J. Wysocki wrote: > > > On Friday, February 01, 2013 08:23:12 AM Greg KH wrote: > > > > On Thu, Jan 31, 2013 at 09:54:51PM +0100, Rafael J. Wysocki wrote: > > > > > > > But, again, I'm going to ask why you aren't using the existing cpu / > > > > > > > memory / bridge / node devices that we have in the kernel. Please use > > > > > > > them, or give me a _really_ good reason why they will not work. > > > > > > > > > > > > We cannot use the existing system devices or ACPI devices here. During > > > > > > hot-plug, ACPI handler sets this shp_device info, so that cpu and memory > > > > > > handlers (drivers/cpu.c and mm/memory_hotplug.c) can obtain their target > > > > > > device information in a platform-neutral way. During hot-add, we first > > > > > > creates an ACPI device node (i.e. device under /sys/bus/acpi/devices), > > > > > > but platform-neutral modules cannot use them as they are ACPI-specific. > > > > > > > > > > But suppose we're smart and have ACPI scan handlers that will create > > > > > "physical" device nodes for those devices during the ACPI namespace scan. > > > > > Then, the platform-neutral nodes will be able to bind to those "physical" > > > > > nodes. Moreover, it should be possible to get a hierarchy of device objects > > > > > this way that will reflect all of the dependencies we need to take into > > > > > account during hot-add and hot-remove operations. That may not be what we > > > > > have today, but I don't see any *fundamental* obstacles preventing us from > > > > > using this approach. > > > > > > > > I would _much_ rather see that be the solution here as I think it is the > > > > proper one. > > > > > > > > > This is already done for PCI host bridges and platform devices and I don't > > > > > see why we can't do that for the other types of devices too. > > > > > > > > I agree. > > > > > > > > > The only missing piece I see is a way to handle the "eject" problem, i.e. > > > > > when we try do eject a device at the top of a subtree and need to tear down > > > > > the entire subtree below it, but if that's going to lead to a system crash, > > > > > for example, we want to cancel the eject. It seems to me that we'll need some > > > > > help from the driver core here. > > > > > > > > I say do what we always have done here, if the user asked us to tear > > > > something down, let it happen as they are the ones that know best :) > > > > > > > > Seriously, I guess this gets back to the "fail disconnect" idea that the > > > > ACPI developers keep harping on. I thought we already resolved this > > > > properly by having them implement it in their bus code, no reason the > > > > same thing couldn't happen here, right? > > > > > > Not really. :-) We haven't ever resolved that particular issue I'm afraid. > > > > Ah, I didn't realize that. > > > > > > I don't think the core needs to do anything special, but if so, I'll be glad > > > > to review it. > > > > > > OK, so this is the use case. We have "eject" defined for something like > > > a container with a number of CPU cores, PCI host bridge, and a memory > > > controller under it. And a few pretty much arbitrary I/O devices as a bonus. > > > > > > Now, there's a button on the system case labeled as "Eject" and if that button > > > is pressed, we're supposed to _try_ to eject all of those things at once. 
We > > > are allowed to fail that request, though, if that's problematic for some > > > reason, but we're supposed to let the BIOS know about that. > > > > > > Do you seriously think that if that button is pressed, we should just proceed > > > with removing all that stuff no matter what? That'd be kind of like Russian > > > roulette for whoever pressed that button, because s/he could only press it and > > > wait for the system to either crash or not. Or maybe to crash a bit later > > > because of some delayed stuff that would hit one of those devices that had just > > > gone. Surely not a situation any admin of a high-availability system would > > > like to be in. :-) > > > > > > Quite frankly, I have no idea how that can be addressed in a single bus type, > > > let alone ACPI (which is not even a proper bus type, just something pretending > > > to be one). > > > > You don't have it as a single bus type, you have a controller somewhere, > > off of the bus being destroyed, that handles sending remove events to > > the device and tearing everything down. PCI does this from the very > > beginning. > > Yes, but those are just remove events and we can only see how destructive they > were after the removal. The point is to be able to figure out whether or not > we *want* to do the removal in the first place. > > Say you have a computing node which signals a hardware problem in a processor > package (the container with CPU cores, memory, PCI host bridge etc.). You > may want to eject that package, but you don't want to kill the system this > way. So if the eject is doable, it is very much desirable to do it, but if it > is not doable, you'd rather shut the box down and do the replacement afterward. > That may be costly, however (maybe weeks of computations), so it should be > avoided if possible, but not at the expense of crashing the box if the eject > doesn't work out. It seems to me that we could handle that with the help of a new flag, say "no_eject", in struct device, a global mutex, and a function that will walk the given subtree of the device hierarchy and check if "no_eject" is set for any devices in there. Plus a global "no_eject" switch, perhaps. To be more precise, suppose we have a "no_eject" flag that is set for a device when the kernel is about to start using it in such a way that ejecting it would lead to serious trouble (i.e. it is a memory module holding the kernel's page tables or something like that). Next, suppose that we have a function called, say, "device_may_eject()" that will walk the subtree of the device hierarchy starting at the given node and return false if any of the devices in there have "no_eject" set. Further, require that device_may_eject() has to be called under a global mutex called something like "device_eject_lock". Then, we can arrange things as follows: 1. When a device is about to be used for such purposes that it shouldn't be ejected, the relevant code should: (a) Acquire device_eject_lock. (b) Check if the device is still there and go to (e) if not. (c) Do whatever needs to be done to the device. (d) Set the device's no_eject flag. (e) Release device_eject_lock. 2. When an eject operation is about to be carried out on a subtree of the device hierarchy, the eject code (be it ACPI or something else) should: (a) Acquire device_eject_lock. (b) Call device_may_eject() on the starting device and go to (d) if it returns false. (c) Carry out the eject (that includes calling .remove() from all of the involved drivers in partiular). (d) Release device_eject_lock. 
3. When it is OK to eject the device again, the relevant code should just clear its "no_eject" flag under device_eject_lock. If we want to synchronize that with things such as boot or system suspend, a global "no_eject" switch can be used for that (it needs to be manipulated under device_eject_lock), and one more step (checking the global "no_eject") is needed between 2(a) and 2(b). Does it look like a reasonable approach? Rafael
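[To make the proposal above concrete, here is a minimal sketch of the interface Rafael describes: the "no_eject" flag, the global device_eject_lock, and a device_may_eject() subtree walk. None of this exists in the kernel at the time of this thread; the placement of the flag in struct device, the do_eject() helper, and the locking granularity are assumptions for illustration only.

/*
 * Sketch only: assumes a new "no_eject" bit is added to struct device
 * and that do_eject() is whatever bus-specific teardown gets used.
 */
#include <linux/device.h>
#include <linux/mutex.h>
#include <linux/errno.h>

static DEFINE_MUTEX(device_eject_lock);

static int __device_check_no_eject(struct device *dev, void *data)
{
	if (dev->no_eject)	/* assumed new field in struct device */
		return -EBUSY;
	return device_for_each_child(dev, NULL, __device_check_no_eject);
}

/* Step 2(b): must be called under device_eject_lock. */
static bool device_may_eject(struct device *dev)
{
	return __device_check_no_eject(dev, NULL) == 0;
}

/* Step 1: mark a device the kernel cannot currently tolerate losing. */
void device_set_no_eject(struct device *dev)
{
	mutex_lock(&device_eject_lock);
	dev->no_eject = true;		/* assumed new field */
	mutex_unlock(&device_eject_lock);
}

/* Step 2: the eject path (ACPI or otherwise). */
int device_try_eject(struct device *dev)
{
	int ret;

	mutex_lock(&device_eject_lock);
	if (device_may_eject(dev))
		ret = do_eject(dev);	/* hypothetical subtree teardown, .remove() etc. */
	else
		ret = -EBUSY;		/* let the caller report the refusal */
	mutex_unlock(&device_eject_lock);
	return ret;
}

Step 3 then amounts to clearing dev->no_eject under the same mutex, and the global "no_eject" switch for boot/suspend would be one extra check at the top of device_try_eject().]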
On Sat, 2013-02-02 at 16:01 +0100, Greg KH wrote: > On Fri, Feb 01, 2013 at 01:40:10PM -0700, Toshi Kani wrote: > > On Fri, 2013-02-01 at 07:30 +0000, Greg KH wrote: > > > On Thu, Jan 31, 2013 at 06:32:18PM -0700, Toshi Kani wrote: > > > > This is already done for PCI host bridges and platform devices and I don't > > > > > see why we can't do that for the other types of devices too. > > > > > > > > > > The only missing piece I see is a way to handle the "eject" problem, i.e. > > > > > when we try do eject a device at the top of a subtree and need to tear down > > > > > the entire subtree below it, but if that's going to lead to a system crash, > > > > > for example, we want to cancel the eject. It seems to me that we'll need some > > > > > help from the driver core here. > > > > > > > > There are three different approaches suggested for system device > > > > hot-plug: > > > > A. Proceed within system device bus scan. > > > > B. Proceed within ACPI bus scan. > > > > C. Proceed with a sequence (as a mini-boot). > > > > > > > > Option A uses system devices as tokens, option B uses acpi devices as > > > > tokens, and option C uses resource tables as tokens, for their handlers. > > > > > > > > Here is summary of key questions & answers so far. I hope this > > > > clarifies why I am suggesting option 3. > > > > > > > > 1. What are the system devices? > > > > System devices provide system-wide core computing resources, which are > > > > essential to compose a computer system. System devices are not > > > > connected to any particular standard buses. > > > > > > Not a problem, lots of devices are not connected to any "particular > > > standard busses". All this means is that system devices are connected > > > to the "system" bus, nothing more. > > > > Can you give me a few examples of other devices that support hotplug and > > are not connected to any particular buses? I will investigate them to > > see how they are managed to support hotplug. > > Any device that is attached to any bus in the driver model can be > hotunplugged from userspace by telling it to be "unbound" from the > driver controlling it. Try it for any platform device in your system to > see how it happens. The unbind operation, as I understand from you, is to detach a driver from a device. Yes, unbinding can be done for any devices. It is however different from hot-plug operation, which unplugs a device. Today, the unbind operation to an ACPI cpu/memory devices causes hot-unplug (offline) operation to them, which is one of the major issues for us since unbind cannot fail. This patchset addresses this issue by making the unbind operation of ACPI cpu/memory devices to do the unbinding only. ACPI drivers no longer control cpu and memory as they are supposed to be controlled by their drivers, cpu and memory modules. The current hotplug code requires putting all device control stuff into ACPI, which this patchset is trying to fix it. > > > > 2. Why are the system devices special? > > > > The system devices are initialized during early boot-time, by multiple > > > > subsystems, from the boot-up sequence, in pre-defined order. They > > > > provide low-level services to enable other subsystems to come up. > > > > > > Sorry, no, that doesn't mean they are special, nothing here is unique > > > for the point of view of the driver model from any other device or bus. > > > > I think system devices are unique in a sense that they are initialized > > before drivers run. 
> > No, most all devices are "initialized" before a driver runs on it, USB > is one such example, PCI another, and I'm pretty sure that there are > others. USB devices can be initialized after the USB bus driver is initialized. Similarly, PCI devices can be initialized after the PCI bus driver is initialized. However, CPU and memory are initialized without any dependency to their bus driver since there is no such thing. In addition, CPU and memory have two drivers -- their actual drivers/subsystems and their ACPI drivers. Their actual drivers/subsystems initialize cpu and memory. The ACPI drivers initialize driver-specific data of their ACPI nodes. During hot-plug operation, however, the current code requires these ACPI drivers to do their hot-plug operations. This patchset keeps them separated during hot-plug and let their actual drivers/subsystems to do the job. > > > If you need to initialize the driver > > > core earlier, then do so. Or, even better, just wait until enough of > > > the system has come up and then go initialize all of the devices you > > > have found so far as part of your boot process. > > > > They are pseudo drivers that provide sysfs entry points of cpu and > > memory. They do not actually initialize cpu and memory. I do not think > > initializing cpu and memory fits into the driver model either, since > > drivers should run after cpu and memory are initialized. > > We already represent CPUs in the sysfs tree, don't represent them in two > different places with two different structures. Use the existing ones > please. This patchset does not make any changes to sysfs. It does however make the system drivers to control the system devices, and ACPI to do the ACPI stuff only. The boot sequence calls the following steps to initialize memory and cpu, as well as their acpi nodes. These steps are independent, i.e. #1 and #2 run without ACPI. 1. mm -> initialize memory -> create sysfs system/memory 2. smp/cpu -> initialize cpu -> create sysfs system/cpu 3. acpi core -> acpi scan -> create sysfs acpi nodes While there are 3 separate steps at boot, the current hotplug code tries to do everything in #3. Therefore, this patchset provides a sequencer (which is similar to the boot sequence) to run these steps during a hot-plug operation as well. Hence, we have consistent steps and role mode between boot and hot-plug operations. > > > None of the above things you have stated seem to have anything to do > > > with your proposed patch, so I don't understand why you have mentioned > > > them... > > > > You suggested option A before, which uses system bus scan to initialize > > all system devices at boot time as well as hot-plug. I tried to say > > that this option would not be doable. > > I haven't yet been convinced otherwise, sorry. Please prove me wrong :) This patchset enables the system drivers (i.e. cpu and memory drivers) to do the hot-plug operations, and ACPI core to do the ACPI stuff. We should not go with the system driver only approach since we do need to use ACPI stuff. Also, we should not go with the ACPI only approach since we do need to use the system (and other) drivers. This patchset provides a sequencer to manage these steps across multiple subsystems. Thanks, -Toshi -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
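[As a rough illustration of the sequencing idea Toshi describes -- handlers from several subsystems running in a fixed order during hot-plug, mirroring the boot ordering in steps 1-3 above -- a sequencer could be as simple as a list of handlers sorted by an order value. The names and the single flat list here are invented for illustration and are not the posted patch:

#include <linux/list.h>
#include <linux/mutex.h>

struct hp_handler {
	struct list_head list;
	unsigned int order;			/* lower values run first */
	int (*execute)(void *resource);		/* e.g. memory before cpu on add */
};

static LIST_HEAD(hp_handlers);			/* kept sorted by ->order */
static DEFINE_MUTEX(hp_lock);

void hp_register_handler(struct hp_handler *new)
{
	struct hp_handler *h;

	mutex_lock(&hp_lock);
	list_for_each_entry(h, &hp_handlers, list) {
		if (new->order < h->order)
			break;
	}
	list_add_tail(&new->list, &h->list);	/* insert before 'h' (or at the end) */
	mutex_unlock(&hp_lock);
}

int hp_run_sequence(void *resource)
{
	struct hp_handler *h;
	int ret = 0;

	mutex_lock(&hp_lock);
	list_for_each_entry(h, &hp_handlers, list) {
		ret = h->execute(resource);
		if (ret)
			break;			/* real code would unwind already-run steps */
	}
	mutex_unlock(&hp_lock);
	return ret;
}

The point of the sketch is only the ordering: the memory handler registers with a smaller order value than the CPU handler for the add path, so a hot-added node's memory is available before its CPUs come online, independent of how the firmware describes the devices.]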
On Sat, Feb 02, 2013 at 09:15:37PM +0100, Rafael J. Wysocki wrote: > On Saturday, February 02, 2013 03:58:01 PM Greg KH wrote: > > On Fri, Feb 01, 2013 at 11:12:59PM +0100, Rafael J. Wysocki wrote: > > > On Friday, February 01, 2013 08:23:12 AM Greg KH wrote: > > > > On Thu, Jan 31, 2013 at 09:54:51PM +0100, Rafael J. Wysocki wrote: > > > > > > > But, again, I'm going to ask why you aren't using the existing cpu / > > > > > > > memory / bridge / node devices that we have in the kernel. Please use > > > > > > > them, or give me a _really_ good reason why they will not work. > > > > > > > > > > > > We cannot use the existing system devices or ACPI devices here. During > > > > > > hot-plug, ACPI handler sets this shp_device info, so that cpu and memory > > > > > > handlers (drivers/cpu.c and mm/memory_hotplug.c) can obtain their target > > > > > > device information in a platform-neutral way. During hot-add, we first > > > > > > creates an ACPI device node (i.e. device under /sys/bus/acpi/devices), > > > > > > but platform-neutral modules cannot use them as they are ACPI-specific. > > > > > > > > > > But suppose we're smart and have ACPI scan handlers that will create > > > > > "physical" device nodes for those devices during the ACPI namespace scan. > > > > > Then, the platform-neutral nodes will be able to bind to those "physical" > > > > > nodes. Moreover, it should be possible to get a hierarchy of device objects > > > > > this way that will reflect all of the dependencies we need to take into > > > > > account during hot-add and hot-remove operations. That may not be what we > > > > > have today, but I don't see any *fundamental* obstacles preventing us from > > > > > using this approach. > > > > > > > > I would _much_ rather see that be the solution here as I think it is the > > > > proper one. > > > > > > > > > This is already done for PCI host bridges and platform devices and I don't > > > > > see why we can't do that for the other types of devices too. > > > > > > > > I agree. > > > > > > > > > The only missing piece I see is a way to handle the "eject" problem, i.e. > > > > > when we try do eject a device at the top of a subtree and need to tear down > > > > > the entire subtree below it, but if that's going to lead to a system crash, > > > > > for example, we want to cancel the eject. It seems to me that we'll need some > > > > > help from the driver core here. > > > > > > > > I say do what we always have done here, if the user asked us to tear > > > > something down, let it happen as they are the ones that know best :) > > > > > > > > Seriously, I guess this gets back to the "fail disconnect" idea that the > > > > ACPI developers keep harping on. I thought we already resolved this > > > > properly by having them implement it in their bus code, no reason the > > > > same thing couldn't happen here, right? > > > > > > Not really. :-) We haven't ever resolved that particular issue I'm afraid. > > > > Ah, I didn't realize that. > > > > > > I don't think the core needs to do anything special, but if so, I'll be glad > > > > to review it. > > > > > > OK, so this is the use case. We have "eject" defined for something like > > > a container with a number of CPU cores, PCI host bridge, and a memory > > > controller under it. And a few pretty much arbitrary I/O devices as a bonus. > > > > > > Now, there's a button on the system case labeled as "Eject" and if that button > > > is pressed, we're supposed to _try_ to eject all of those things at once. 
We > > > are allowed to fail that request, though, if that's problematic for some > > > reason, but we're supposed to let the BIOS know about that. > > > > > > Do you seriously think that if that button is pressed, we should just proceed > > > with removing all that stuff no matter what? That'd be kind of like Russian > > > roulette for whoever pressed that button, because s/he could only press it and > > > wait for the system to either crash or not. Or maybe to crash a bit later > > > because of some delayed stuff that would hit one of those devices that had just > > > gone. Surely not a situation any admin of a high-availability system would > > > like to be in. :-) > > > > > > Quite frankly, I have no idea how that can be addressed in a single bus type, > > > let alone ACPI (which is not even a proper bus type, just something pretending > > > to be one). > > > > You don't have it as a single bus type, you have a controller somewhere, > > off of the bus being destroyed, that handles sending remove events to > > the device and tearing everything down. PCI does this from the very > > beginning. > > Yes, but those are just remove events and we can only see how destructive they > were after the removal. The point is to be able to figure out whether or not > we *want* to do the removal in the first place. Yes, but, you will always race if you try to test to see if you can shut down a device and then trying to do it. So walking the bus ahead of time isn't a good idea. And, we really don't have a viable way to recover if disconnect() fails, do we. What do we do in that situation, restore the other devices we disconnected successfully? How do we remember/know what they were? PCI hotplug almost had this same problem until the designers finally realized that they just had to accept the fact that removing a PCI device could either happen by: - a user yanking out the device, at which time the OS better clean up properly no matter what happens - the user asked nicely to remove a device, and the OS can take as long as it wants to complete that action, including stalling for noticable amounts of time before eventually, always letting the action succeed. I think the second thing is what you have to do here. If a user tells the OS it wants to remove these devices, you better do it. If you can't, because memory is being used by someone else, either move them off, or just hope that nothing bad happens, before the user gets frustrated and yanks out the CPU/memory module themselves physically :) > Say you have a computing node which signals a hardware problem in a processor > package (the container with CPU cores, memory, PCI host bridge etc.). You > may want to eject that package, but you don't want to kill the system this > way. So if the eject is doable, it is very much desirable to do it, but if it > is not doable, you'd rather shut the box down and do the replacement afterward. > That may be costly, however (maybe weeks of computations), so it should be > avoided if possible, but not at the expense of crashing the box if the eject > doesn't work out. These same "situations" came up for PCI hotplug, and I still say the same resolution there holds true, as described above. The user wants to remove something, so let them do it. They always know best, and get mad at us if we think otherwise :) What does the ACPI spec say about this type of thing? Surely the same people that did the PCI Hotplug spec were consulted when doing this part of the spec, right? Yeah, I know, I can dream... 
> > I know it's more complicated with these types of devices, and I think we > > are getting closer to the correct solution, I just don't want to ever > > see duplicate devices in the driver model for the same physical device. > > Do you mean two things based on struct device for the same hardware component? > That's been happening already pretty much forever for every PCI device known > to the ACPI layer, for PNP and many others. However, those ACPI things are (or > rather should be, but we're going to clean that up) only for convenience (to be > able to see the namespace structure and related things in sysfs). So the stuff > under /sys/devices/LNXSYSTM\:00/ is not "real". Yes, I've never treated that as a "real" device because they (usually) didn't ever bind to the "real" driver that controlled the device and how it talked to the rest of the os (like a USB device for example.) I always thought just of it as a "shadow" of the firmware image, nothing that should be directly operated on if at all possible. But, as you are pointing out, maybe this needs to be changed. Having users have to look in one part of the tree for one interface to a device, and another totally different part for a different interface to the same physical device is crazy, don't you agree? As to how to solve it, I really have no idea, I don't know ACPI that well at all, and honestly, don't want to, I want to keep what little hair I have left... > In my view it shouldn't even > be under /sys/devices/ (/sys/firmware/acpi/ seems to be a better place for it), I agree. > but that may be difficult to change without breaking user space (maybe we can > just symlink it from /sys/devices/ or something). And the ACPI bus type > shouldn't even exist in my opinion. > > There's much confusion in there and much work to clean that up, I agree, but > that's kind of separate from the hotplug thing. I agree as well. Best of luck. greg k-h -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, Feb 03, 2013 at 05:28:09PM -0700, Toshi Kani wrote: > On Sat, 2013-02-02 at 16:01 +0100, Greg KH wrote: > > On Fri, Feb 01, 2013 at 01:40:10PM -0700, Toshi Kani wrote: > > > On Fri, 2013-02-01 at 07:30 +0000, Greg KH wrote: > > > > On Thu, Jan 31, 2013 at 06:32:18PM -0700, Toshi Kani wrote: > > > > > This is already done for PCI host bridges and platform devices and I don't > > > > > > see why we can't do that for the other types of devices too. > > > > > > > > > > > > The only missing piece I see is a way to handle the "eject" problem, i.e. > > > > > > when we try do eject a device at the top of a subtree and need to tear down > > > > > > the entire subtree below it, but if that's going to lead to a system crash, > > > > > > for example, we want to cancel the eject. It seems to me that we'll need some > > > > > > help from the driver core here. > > > > > > > > > > There are three different approaches suggested for system device > > > > > hot-plug: > > > > > A. Proceed within system device bus scan. > > > > > B. Proceed within ACPI bus scan. > > > > > C. Proceed with a sequence (as a mini-boot). > > > > > > > > > > Option A uses system devices as tokens, option B uses acpi devices as > > > > > tokens, and option C uses resource tables as tokens, for their handlers. > > > > > > > > > > Here is summary of key questions & answers so far. I hope this > > > > > clarifies why I am suggesting option 3. > > > > > > > > > > 1. What are the system devices? > > > > > System devices provide system-wide core computing resources, which are > > > > > essential to compose a computer system. System devices are not > > > > > connected to any particular standard buses. > > > > > > > > Not a problem, lots of devices are not connected to any "particular > > > > standard busses". All this means is that system devices are connected > > > > to the "system" bus, nothing more. > > > > > > Can you give me a few examples of other devices that support hotplug and > > > are not connected to any particular buses? I will investigate them to > > > see how they are managed to support hotplug. > > > > Any device that is attached to any bus in the driver model can be > > hotunplugged from userspace by telling it to be "unbound" from the > > driver controlling it. Try it for any platform device in your system to > > see how it happens. > > The unbind operation, as I understand from you, is to detach a driver > from a device. Yes, unbinding can be done for any devices. It is > however different from hot-plug operation, which unplugs a device. Physically, yes, but to the driver involved, and the driver core, there is no difference. That was one of the primary goals of the driver core creation so many years ago. > Today, the unbind operation to an ACPI cpu/memory devices causes > hot-unplug (offline) operation to them, which is one of the major issues > for us since unbind cannot fail. This patchset addresses this issue by > making the unbind operation of ACPI cpu/memory devices to do the > unbinding only. ACPI drivers no longer control cpu and memory as they > are supposed to be controlled by their drivers, cpu and memory modules. I think that's the problem right there, solve that, please. > > > > > 2. Why are the system devices special? > > > > > The system devices are initialized during early boot-time, by multiple > > > > > subsystems, from the boot-up sequence, in pre-defined order. They > > > > > provide low-level services to enable other subsystems to come up. 
> > > > > > > > Sorry, no, that doesn't mean they are special, nothing here is unique > > > > for the point of view of the driver model from any other device or bus. > > > > > > I think system devices are unique in a sense that they are initialized > > > before drivers run. > > > > No, most all devices are "initialized" before a driver runs on it, USB > > is one such example, PCI another, and I'm pretty sure that there are > > others. > > USB devices can be initialized after the USB bus driver is initialized. > Similarly, PCI devices can be initialized after the PCI bus driver is > initialized. However, CPU and memory are initialized without any > dependency to their bus driver since there is no such thing. You can create such a thing if you want :) > In addition, CPU and memory have two drivers -- their actual > drivers/subsystems and their ACPI drivers. Again, I feel that is the root of the problem. Rafael seems to be working on solving this, which I think is essencial to your work as well. thanks, greg k-h -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, Feb 03, 2013 at 09:44:39PM +0100, Rafael J. Wysocki wrote: > > Yes, but those are just remove events and we can only see how destructive they > > were after the removal. The point is to be able to figure out whether or not > > we *want* to do the removal in the first place. > > > > Say you have a computing node which signals a hardware problem in a processor > > package (the container with CPU cores, memory, PCI host bridge etc.). You > > may want to eject that package, but you don't want to kill the system this > > way. So if the eject is doable, it is very much desirable to do it, but if it > > is not doable, you'd rather shut the box down and do the replacement afterward. > > That may be costly, however (maybe weeks of computations), so it should be > > avoided if possible, but not at the expense of crashing the box if the eject > > doesn't work out. > > It seems to me that we could handle that with the help of a new flag, say > "no_eject", in struct device, a global mutex, and a function that will walk > the given subtree of the device hierarchy and check if "no_eject" is set for > any devices in there. Plus a global "no_eject" switch, perhaps. I think this will always be racy, or at worst, slow things down on normal device operations as you will always be having to grab this flag whenever you want to do something new. See my comments earlier about pci hotplug and the design decisions there about "no eject" capabilities for why. thanks, greg k-h -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sunday, February 03, 2013 07:23:49 PM Greg KH wrote: > On Sat, Feb 02, 2013 at 09:15:37PM +0100, Rafael J. Wysocki wrote: > > On Saturday, February 02, 2013 03:58:01 PM Greg KH wrote: > > > On Fri, Feb 01, 2013 at 11:12:59PM +0100, Rafael J. Wysocki wrote: > > > > On Friday, February 01, 2013 08:23:12 AM Greg KH wrote: > > > > > On Thu, Jan 31, 2013 at 09:54:51PM +0100, Rafael J. Wysocki wrote: > > > > > > > > But, again, I'm going to ask why you aren't using the existing cpu / > > > > > > > > memory / bridge / node devices that we have in the kernel. Please use > > > > > > > > them, or give me a _really_ good reason why they will not work. > > > > > > > > > > > > > > We cannot use the existing system devices or ACPI devices here. During > > > > > > > hot-plug, ACPI handler sets this shp_device info, so that cpu and memory > > > > > > > handlers (drivers/cpu.c and mm/memory_hotplug.c) can obtain their target > > > > > > > device information in a platform-neutral way. During hot-add, we first > > > > > > > creates an ACPI device node (i.e. device under /sys/bus/acpi/devices), > > > > > > > but platform-neutral modules cannot use them as they are ACPI-specific. > > > > > > > > > > > > But suppose we're smart and have ACPI scan handlers that will create > > > > > > "physical" device nodes for those devices during the ACPI namespace scan. > > > > > > Then, the platform-neutral nodes will be able to bind to those "physical" > > > > > > nodes. Moreover, it should be possible to get a hierarchy of device objects > > > > > > this way that will reflect all of the dependencies we need to take into > > > > > > account during hot-add and hot-remove operations. That may not be what we > > > > > > have today, but I don't see any *fundamental* obstacles preventing us from > > > > > > using this approach. > > > > > > > > > > I would _much_ rather see that be the solution here as I think it is the > > > > > proper one. > > > > > > > > > > > This is already done for PCI host bridges and platform devices and I don't > > > > > > see why we can't do that for the other types of devices too. > > > > > > > > > > I agree. > > > > > > > > > > > The only missing piece I see is a way to handle the "eject" problem, i.e. > > > > > > when we try do eject a device at the top of a subtree and need to tear down > > > > > > the entire subtree below it, but if that's going to lead to a system crash, > > > > > > for example, we want to cancel the eject. It seems to me that we'll need some > > > > > > help from the driver core here. > > > > > > > > > > I say do what we always have done here, if the user asked us to tear > > > > > something down, let it happen as they are the ones that know best :) > > > > > > > > > > Seriously, I guess this gets back to the "fail disconnect" idea that the > > > > > ACPI developers keep harping on. I thought we already resolved this > > > > > properly by having them implement it in their bus code, no reason the > > > > > same thing couldn't happen here, right? > > > > > > > > Not really. :-) We haven't ever resolved that particular issue I'm afraid. > > > > > > Ah, I didn't realize that. > > > > > > > > I don't think the core needs to do anything special, but if so, I'll be glad > > > > > to review it. > > > > > > > > OK, so this is the use case. We have "eject" defined for something like > > > > a container with a number of CPU cores, PCI host bridge, and a memory > > > > controller under it. And a few pretty much arbitrary I/O devices as a bonus. 
> > > > > > > > Now, there's a button on the system case labeled as "Eject" and if that button > > > > is pressed, we're supposed to _try_ to eject all of those things at once. We > > > > are allowed to fail that request, though, if that's problematic for some > > > > reason, but we're supposed to let the BIOS know about that. > > > > > > > > Do you seriously think that if that button is pressed, we should just proceed > > > > with removing all that stuff no matter what? That'd be kind of like Russian > > > > roulette for whoever pressed that button, because s/he could only press it and > > > > wait for the system to either crash or not. Or maybe to crash a bit later > > > > because of some delayed stuff that would hit one of those devices that had just > > > > gone. Surely not a situation any admin of a high-availability system would > > > > like to be in. :-) > > > > > > > > Quite frankly, I have no idea how that can be addressed in a single bus type, > > > > let alone ACPI (which is not even a proper bus type, just something pretending > > > > to be one). > > > > > > You don't have it as a single bus type, you have a controller somewhere, > > > off of the bus being destroyed, that handles sending remove events to > > > the device and tearing everything down. PCI does this from the very > > > beginning. > > > > Yes, but those are just remove events and we can only see how destructive they > > were after the removal. The point is to be able to figure out whether or not > > we *want* to do the removal in the first place. > > Yes, but, you will always race if you try to test to see if you can shut > down a device and then trying to do it. So walking the bus ahead of > time isn't a good idea. > > And, we really don't have a viable way to recover if disconnect() fails, > do we. What do we do in that situation, restore the other devices we > disconnected successfully? How do we remember/know what they were? > > PCI hotplug almost had this same problem until the designers finally > realized that they just had to accept the fact that removing a PCI > device could either happen by: > - a user yanking out the device, at which time the OS better > clean up properly no matter what happens > - the user asked nicely to remove a device, and the OS can take > as long as it wants to complete that action, including > stalling for noticable amounts of time before eventually, > always letting the action succeed. > > I think the second thing is what you have to do here. If a user tells > the OS it wants to remove these devices, you better do it. If you > can't, because memory is being used by someone else, either move them > off, or just hope that nothing bad happens, before the user gets > frustrated and yanks out the CPU/memory module themselves physically :) Well, that we can't help, but sometimes users really *want* the OS to tell them if it is safe to unplug something at this particualr time (think about the Windows' "safe remove" feature for USB sticks, for example; that came out of users' demand AFAIR). So in my opinion it would be good to give them an option to do "safe eject" or "forcible eject", whichever they prefer. > > Say you have a computing node which signals a hardware problem in a processor > > package (the container with CPU cores, memory, PCI host bridge etc.). You > > may want to eject that package, but you don't want to kill the system this > > way. 
So if the eject is doable, it is very much desirable to do it, but if it > > is not doable, you'd rather shut the box down and do the replacement afterward. > > That may be costly, however (maybe weeks of computations), so it should be > > avoided if possible, but not at the expense of crashing the box if the eject > > doesn't work out. > > These same "situations" came up for PCI hotplug, and I still say the > same resolution there holds true, as described above. The user wants to > remove something, so let them do it. They always know best, and get mad > at us if we think otherwise :) Well, not necessarily. Users sometimes really don't know what they are doing and want us to give them a hint. My opinion is that if we can give them a hint, there's no reason not to. > What does the ACPI spec say about this type of thing? Surely the same > people that did the PCI Hotplug spec were consulted when doing this part > of the spec, right? Yeah, I know, I can dream... It's not very specific (as usual), but it gives hints. :-) For example, there is the _OST method (Section 6.3.5 of ACPI 5) that we are supposed to use to notify the platform of ejection failures and there are status codes like "0x81: Device in use by application" or "0x82: Device busy" that can be used in there. So definitely the authors took ejection failures for software-related reasons into consideration. > > > I know it's more complicated with these types of devices, and I think we > > > are getting closer to the correct solution, I just don't want to ever > > > see duplicate devices in the driver model for the same physical device. > > > > Do you mean two things based on struct device for the same hardware component? > > That's been happening already pretty much forever for every PCI device known > > to the ACPI layer, for PNP and many others. However, those ACPI things are (or > > rather should be, but we're going to clean that up) only for convenience (to be > > able to see the namespace structure and related things in sysfs). So the stuff > > under /sys/devices/LNXSYSTM\:00/ is not "real". > > Yes, I've never treated that as a "real" device because they (usually) > didn't ever bind to the "real" driver that controlled the device and how > it talked to the rest of the os (like a USB device for example.) I > always thought just of it as a "shadow" of the firmware image, nothing > that should be directly operated on if at all possible. Precisely. That's why I'd like to move that stuff away from /sys/devices/ and I don't see a reason why these objects should be based on struct device. They need kobjects to show up in sysfs, but apart from this they don't really need anything from struct device as far as I can say. > But, as you are pointing out, maybe this needs to be changed. Having > users have to look in one part of the tree for one interface to a > device, and another totally different part for a different interface to > the same physical device is crazy, don't you agree? Well, it is confusing. I don't have a problem with exposing the ACPI namespace in the form of a directory structure in sysfs and I see some benefits from doing that, but I'd like it to be clear what's represented by that directory structure and I don't want people to confuse ACPI device objects with devices (they are abstract interfaces to devices rather than anything else). > As to how to solve it, I really have no idea, I don't know ACPI that > well at all, and honestly, don't want to, I want to keep what little > hair I have left... 
I totally understand you. :-) > > In my view it shouldn't even > > be under /sys/devices/ (/sys/firmware/acpi/ seems to be a better place for it), > > I agree. > > > but that may be difficult to change without breaking user space (maybe we can > > just symlink it from /sys/devices/ or something). And the ACPI bus type > > shouldn't even exist in my opinion. > > > > There's much confusion in there and much work to clean that up, I agree, but > > that's kind of separate from the hotplug thing. > > I agree as well. > > Best of luck. Thanks. :-) Rafael
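As background for the _OST discussion earlier in this message: the method takes a source event, a status code, and a status-information buffer, so letting the firmware know that an eject was refused could look roughly like the sketch below. Only acpi_evaluate_object() is a real interface here; the function name and the constants (taken from the status codes quoted above) are illustrative.

#include <linux/acpi.h>

/* Ejection-failure status codes quoted above (ACPI 5, Section 6.3.5). */
#define OST_SC_DEVICE_IN_USE	0x81
#define OST_SC_DEVICE_BUSY	0x82

static acpi_status report_eject_status(acpi_handle handle, u32 source_event,
				       u32 status_code)
{
	union acpi_object params[3];
	struct acpi_object_list arg_list = { .count = 3, .pointer = params };

	params[0].type = ACPI_TYPE_INTEGER;	/* Arg0: source event */
	params[0].integer.value = source_event;
	params[1].type = ACPI_TYPE_INTEGER;	/* Arg1: status code */
	params[1].integer.value = status_code;
	params[2].type = ACPI_TYPE_BUFFER;	/* Arg2: status information */
	params[2].buffer.length = 0;
	params[2].buffer.pointer = NULL;

	/* _OST is optional; callers typically ignore AE_NOT_FOUND. */
	return acpi_evaluate_object(handle, "_OST", &arg_list, NULL);
}

A caller that had to refuse an eject request would then do something like report_eject_status(handle, ACPI_NOTIFY_EJECT_REQUEST, OST_SC_DEVICE_IN_USE) before bailing out.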
On Monday, February 04, 2013 04:48:10 AM Greg KH wrote: > On Sun, Feb 03, 2013 at 09:44:39PM +0100, Rafael J. Wysocki wrote: > > > Yes, but those are just remove events and we can only see how destructive they > > > were after the removal. The point is to be able to figure out whether or not > > > we *want* to do the removal in the first place. > > > > > > Say you have a computing node which signals a hardware problem in a processor > > > package (the container with CPU cores, memory, PCI host bridge etc.). You > > > may want to eject that package, but you don't want to kill the system this > > > way. So if the eject is doable, it is very much desirable to do it, but if it > > > is not doable, you'd rather shut the box down and do the replacement afterward. > > > That may be costly, however (maybe weeks of computations), so it should be > > > avoided if possible, but not at the expense of crashing the box if the eject > > > doesn't work out. > > > > It seems to me that we could handle that with the help of a new flag, say > > "no_eject", in struct device, a global mutex, and a function that will walk > > the given subtree of the device hierarchy and check if "no_eject" is set for > > any devices in there. Plus a global "no_eject" switch, perhaps. > > I think this will always be racy, or at worst, slow things down on > normal device operations as you will always be having to grab this flag > whenever you want to do something new. I don't see why this particular scheme should be racy, at least I don't see any obvious races in it (although I'm not that good at races detection in general, admittedly). Also, I don't expect that flag to be used for everything, just for things known to seriously break if forcible eject is done. That may be not precise enough, so that's a matter of defining its purpose more precisely. We can do something like that on the ACPI level (ie. introduce a no_eject flag in struct acpi_device and provide an iterface for the layers above ACPI to manipulate it) but then devices without ACPI namespace objects won't be covered. That may not be a big deal, though. So say dev is about to be used for something incompatible with ejecting, so to speak. Then, one would do platform_lock_eject(dev), which would check if dev has an ACPI handle and then take acpi_eject_lock (if so). The return value of platform_lock_eject(dev) would need to be checked to see if the device is not gone. If it returns success (0), one would do something to the device and call platform_no_eject(dev) and then platform_unlock_eject(dev). To clear no_eject one would just call platform_allow_to_eject(dev) that would do all of the locking and clearing in one operation. The ACPI eject side would be similar to the thing I described previously, so it would (1) take acpi_eject_lock, (2) see if any struct acpi_device involved has no_eject set and if not, then (3) do acpi_bus_trim(), (4) carry out the eject and (5) release acpi_eject_lock. Step (2) above might be optional, ie. if eject is forcible, we would just do (3) etc. without (2). The locking should prevent races from happening (and it should prevent two ejects from happening at the same time too, which is not a bad thing by itself). > See my comments earlier about pci hotplug and the design decisions there > about "no eject" capabilities for why. Well, I replied to that one too. :-) Thanks, Rafael
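To make that a little more concrete, the four helpers described above might be sketched as follows. None of them exist in the kernel; the no_eject bit is shown as a hypothetical flag on struct acpi_device, and ACPI_HANDLE()/acpi_bus_get_device() are just one plausible way to find the ACPI companion of a device.

#include <linux/acpi.h>
#include <linux/device.h>
#include <linux/errno.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(acpi_eject_lock);

/* One plausible way to look up the ACPI companion of @dev. */
static struct acpi_device *eject_acpi_companion(struct device *dev)
{
	acpi_handle handle = ACPI_HANDLE(dev);
	struct acpi_device *adev;

	if (!handle || acpi_bus_get_device(handle, &adev))
		return NULL;
	return adev;
}

/* Returns 0 with the eject lock held, or -ENODEV if the device is gone. */
int platform_lock_eject(struct device *dev)
{
	mutex_lock(&acpi_eject_lock);
	if (!eject_acpi_companion(dev)) {
		mutex_unlock(&acpi_eject_lock);
		return -ENODEV;
	}
	return 0;
}

void platform_unlock_eject(struct device *dev)
{
	mutex_unlock(&acpi_eject_lock);
}

/* Caller must hold the eject lock via platform_lock_eject(). */
void platform_no_eject(struct device *dev)
{
	struct acpi_device *adev = eject_acpi_companion(dev);

	if (adev)
		adev->flags.no_eject = 1;	/* hypothetical new flag */
}

/* Lock, clear the flag and unlock in one operation. */
int platform_allow_to_eject(struct device *dev)
{
	int ret = platform_lock_eject(dev);

	if (ret)
		return ret;

	eject_acpi_companion(dev)->flags.no_eject = 0;	/* hypothetical */
	platform_unlock_eject(dev);
	return 0;
}

The key property is that platform_lock_eject() can fail, which is how a caller finds out that the device went away while it was waiting for the lock.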
On Mon, Feb 04, 2013 at 03:21:22PM +0100, Rafael J. Wysocki wrote: > On Monday, February 04, 2013 04:48:10 AM Greg KH wrote: > > On Sun, Feb 03, 2013 at 09:44:39PM +0100, Rafael J. Wysocki wrote: > > > > Yes, but those are just remove events and we can only see how destructive they > > > > were after the removal. The point is to be able to figure out whether or not > > > > we *want* to do the removal in the first place. > > > > > > > > Say you have a computing node which signals a hardware problem in a processor > > > > package (the container with CPU cores, memory, PCI host bridge etc.). You > > > > may want to eject that package, but you don't want to kill the system this > > > > way. So if the eject is doable, it is very much desirable to do it, but if it > > > > is not doable, you'd rather shut the box down and do the replacement afterward. > > > > That may be costly, however (maybe weeks of computations), so it should be > > > > avoided if possible, but not at the expense of crashing the box if the eject > > > > doesn't work out. > > > > > > It seems to me that we could handle that with the help of a new flag, say > > > "no_eject", in struct device, a global mutex, and a function that will walk > > > the given subtree of the device hierarchy and check if "no_eject" is set for > > > any devices in there. Plus a global "no_eject" switch, perhaps. > > > > I think this will always be racy, or at worst, slow things down on > > normal device operations as you will always be having to grab this flag > > whenever you want to do something new. > > I don't see why this particular scheme should be racy, at least I don't see any > obvious races in it (although I'm not that good at races detection in general, > admittedly). > > Also, I don't expect that flag to be used for everything, just for things known > to seriously break if forcible eject is done. That may be not precise enough, > so that's a matter of defining its purpose more precisely. > > We can do something like that on the ACPI level (ie. introduce a no_eject flag > in struct acpi_device and provide an iterface for the layers above ACPI to > manipulate it) but then devices without ACPI namespace objects won't be > covered. That may not be a big deal, though. > > So say dev is about to be used for something incompatible with ejecting, so to > speak. Then, one would do platform_lock_eject(dev), which would check if dev > has an ACPI handle and then take acpi_eject_lock (if so). The return value of > platform_lock_eject(dev) would need to be checked to see if the device is not > gone. If it returns success (0), one would do something to the device and > call platform_no_eject(dev) and then platform_unlock_eject(dev). How does a device "know" it is doing something that is incompatible with ejecting? That's a non-trivial task from what I can tell. What happens if a device wants to set that flag, right after it was told to eject and the device was in the middle of being removed? How can you "fail" the "I can't be removed me now, so don't" requirement that it now has? thanks, greg k-h -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2013-02-04 at 14:41 +0100, Rafael J. Wysocki wrote: > On Sunday, February 03, 2013 07:23:49 PM Greg KH wrote: > > On Sat, Feb 02, 2013 at 09:15:37PM +0100, Rafael J. Wysocki wrote: > > > On Saturday, February 02, 2013 03:58:01 PM Greg KH wrote: : > > > Yes, but those are just remove events and we can only see how destructive they > > > were after the removal. The point is to be able to figure out whether or not > > > we *want* to do the removal in the first place. > > > > Yes, but, you will always race if you try to test to see if you can shut > > down a device and then trying to do it. So walking the bus ahead of > > time isn't a good idea. > > > > And, we really don't have a viable way to recover if disconnect() fails, > > do we. What do we do in that situation, restore the other devices we > > disconnected successfully? How do we remember/know what they were? > > > > PCI hotplug almost had this same problem until the designers finally > > realized that they just had to accept the fact that removing a PCI > > device could either happen by: > > - a user yanking out the device, at which time the OS better > > clean up properly no matter what happens > > - the user asked nicely to remove a device, and the OS can take > > as long as it wants to complete that action, including > > stalling for noticable amounts of time before eventually, > > always letting the action succeed. > > > > I think the second thing is what you have to do here. If a user tells > > the OS it wants to remove these devices, you better do it. If you > > can't, because memory is being used by someone else, either move them > > off, or just hope that nothing bad happens, before the user gets > > frustrated and yanks out the CPU/memory module themselves physically :) > > Well, that we can't help, but sometimes users really *want* the OS to tell them > if it is safe to unplug something at this particualr time (think about the > Windows' "safe remove" feature for USB sticks, for example; that came out of > users' demand AFAIR). > > So in my opinion it would be good to give them an option to do "safe eject" or > "forcible eject", whichever they prefer. For system device hot-plug, it always needs to be "safe eject". This feature will be implemented on mission critical servers, which are managed by professional IT folks. Crashing a server causes serious money to the business. A user yanking out a system device won't happen, and it immediately crashes the system if it is done. So, we have nothing to do with this case. The 2nd case can hang the operation, waiting forever to proceed, which is still a serious issue for enterprise customers. > > > Say you have a computing node which signals a hardware problem in a processor > > > package (the container with CPU cores, memory, PCI host bridge etc.). You > > > may want to eject that package, but you don't want to kill the system this > > > way. So if the eject is doable, it is very much desirable to do it, but if it > > > is not doable, you'd rather shut the box down and do the replacement afterward. > > > That may be costly, however (maybe weeks of computations), so it should be > > > avoided if possible, but not at the expense of crashing the box if the eject > > > doesn't work out. > > > > These same "situations" came up for PCI hotplug, and I still say the > > same resolution there holds true, as described above. The user wants to > > remove something, so let them do it. They always know best, and get mad > > at us if we think otherwise :) > > Well, not necessarily. 
Users sometimes really don't know what they are doing > and want us to give them a hint. My opinion is that if we can give them a > hint, there's no reason not to. > > > What does the ACPI spec say about this type of thing? Surely the same > > people that did the PCI Hotplug spec were consulted when doing this part > > of the spec, right? Yeah, I know, I can dream... > > It's not very specific (as usual), but it gives hints. :-) > > For example, there is the _OST method (Section 6.3.5 of ACPI 5) that we are > supposed to use to notify the platform of ejection failures and there are > status codes like "0x81: Device in use by application" or "0x82: Device busy" > that can be used in there. So definitely the authors took ejection failures > for software-related reasons into consideration. That is correct. Also, ACPI spec deliberately does not define implementation details, so we defined DIG64 hotplug spec below (which I contributed to the spec.) http://www.dig64.org/home/DIG64_HPPF_R1_0.pdf For example, Figure 2 in page 14 states memory hot-remove flow. The operation needs to either succeed or fail. Crash or hang is not an option. Thanks, -Toshi -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2013-02-04 at 15:21 +0100, Rafael J. Wysocki wrote: > On Monday, February 04, 2013 04:48:10 AM Greg KH wrote: > > On Sun, Feb 03, 2013 at 09:44:39PM +0100, Rafael J. Wysocki wrote: > > > > Yes, but those are just remove events and we can only see how destructive they > > > > were after the removal. The point is to be able to figure out whether or not > > > > we *want* to do the removal in the first place. > > > > > > > > Say you have a computing node which signals a hardware problem in a processor > > > > package (the container with CPU cores, memory, PCI host bridge etc.). You > > > > may want to eject that package, but you don't want to kill the system this > > > > way. So if the eject is doable, it is very much desirable to do it, but if it > > > > is not doable, you'd rather shut the box down and do the replacement afterward. > > > > That may be costly, however (maybe weeks of computations), so it should be > > > > avoided if possible, but not at the expense of crashing the box if the eject > > > > doesn't work out. > > > > > > It seems to me that we could handle that with the help of a new flag, say > > > "no_eject", in struct device, a global mutex, and a function that will walk > > > the given subtree of the device hierarchy and check if "no_eject" is set for > > > any devices in there. Plus a global "no_eject" switch, perhaps. > > > > I think this will always be racy, or at worst, slow things down on > > normal device operations as you will always be having to grab this flag > > whenever you want to do something new. > > I don't see why this particular scheme should be racy, at least I don't see any > obvious races in it (although I'm not that good at races detection in general, > admittedly). > > Also, I don't expect that flag to be used for everything, just for things known > to seriously break if forcible eject is done. That may be not precise enough, > so that's a matter of defining its purpose more precisely. > > We can do something like that on the ACPI level (ie. introduce a no_eject flag > in struct acpi_device and provide an iterface for the layers above ACPI to > manipulate it) but then devices without ACPI namespace objects won't be > covered. That may not be a big deal, though. I am afraid that bringing the device status management into the ACPI level would not a good idea. acpi_device should only reflect ACPI device object information, not how its actual device is being used. I like your initiative of acpi_scan_driver and I think scanning / trimming of ACPI object info is what the ACPI drivers should do. Thanks, -Toshi -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, 2013-02-04 at 04:46 -0800, Greg KH wrote: > On Sun, Feb 03, 2013 at 05:28:09PM -0700, Toshi Kani wrote: > > On Sat, 2013-02-02 at 16:01 +0100, Greg KH wrote: > > > On Fri, Feb 01, 2013 at 01:40:10PM -0700, Toshi Kani wrote: > > > > On Fri, 2013-02-01 at 07:30 +0000, Greg KH wrote: > > > > > On Thu, Jan 31, 2013 at 06:32:18PM -0700, Toshi Kani wrote: > > > > > > This is already done for PCI host bridges and platform devices and I don't > > > > > > > see why we can't do that for the other types of devices too. > > > > > > > > > > > > > > The only missing piece I see is a way to handle the "eject" problem, i.e. > > > > > > > when we try do eject a device at the top of a subtree and need to tear down > > > > > > > the entire subtree below it, but if that's going to lead to a system crash, > > > > > > > for example, we want to cancel the eject. It seems to me that we'll need some > > > > > > > help from the driver core here. > > > > > > > > > > > > There are three different approaches suggested for system device > > > > > > hot-plug: > > > > > > A. Proceed within system device bus scan. > > > > > > B. Proceed within ACPI bus scan. > > > > > > C. Proceed with a sequence (as a mini-boot). > > > > > > > > > > > > Option A uses system devices as tokens, option B uses acpi devices as > > > > > > tokens, and option C uses resource tables as tokens, for their handlers. > > > > > > > > > > > > Here is summary of key questions & answers so far. I hope this > > > > > > clarifies why I am suggesting option 3. > > > > > > > > > > > > 1. What are the system devices? > > > > > > System devices provide system-wide core computing resources, which are > > > > > > essential to compose a computer system. System devices are not > > > > > > connected to any particular standard buses. > > > > > > > > > > Not a problem, lots of devices are not connected to any "particular > > > > > standard busses". All this means is that system devices are connected > > > > > to the "system" bus, nothing more. > > > > > > > > Can you give me a few examples of other devices that support hotplug and > > > > are not connected to any particular buses? I will investigate them to > > > > see how they are managed to support hotplug. > > > > > > Any device that is attached to any bus in the driver model can be > > > hotunplugged from userspace by telling it to be "unbound" from the > > > driver controlling it. Try it for any platform device in your system to > > > see how it happens. > > > > The unbind operation, as I understand from you, is to detach a driver > > from a device. Yes, unbinding can be done for any devices. It is > > however different from hot-plug operation, which unplugs a device. > > Physically, yes, but to the driver involved, and the driver core, there > is no difference. That was one of the primary goals of the driver core > creation so many years ago. > > > Today, the unbind operation to an ACPI cpu/memory devices causes > > hot-unplug (offline) operation to them, which is one of the major issues > > for us since unbind cannot fail. This patchset addresses this issue by > > making the unbind operation of ACPI cpu/memory devices to do the > > unbinding only. ACPI drivers no longer control cpu and memory as they > > are supposed to be controlled by their drivers, cpu and memory modules. > > I think that's the problem right there, solve that, please. We cannot eliminate the ACPI drivers since we have to scan ACPI. But we can limit the ACPI drivers to do the scanning stuff only. 
This is precisely the intend of this patchset. The real stuff, removing actual devices, is done by the system device drivers/modules. > > > > > > 2. Why are the system devices special? > > > > > > The system devices are initialized during early boot-time, by multiple > > > > > > subsystems, from the boot-up sequence, in pre-defined order. They > > > > > > provide low-level services to enable other subsystems to come up. > > > > > > > > > > Sorry, no, that doesn't mean they are special, nothing here is unique > > > > > for the point of view of the driver model from any other device or bus. > > > > > > > > I think system devices are unique in a sense that they are initialized > > > > before drivers run. > > > > > > No, most all devices are "initialized" before a driver runs on it, USB > > > is one such example, PCI another, and I'm pretty sure that there are > > > others. > > > > USB devices can be initialized after the USB bus driver is initialized. > > Similarly, PCI devices can be initialized after the PCI bus driver is > > initialized. However, CPU and memory are initialized without any > > dependency to their bus driver since there is no such thing. > > You can create such a thing if you want :) Well, a pseudo driver could be created for it, but it does not make any difference. Access to CPU and memory does not go thru any bus controller visible to the OS. CPU and memory are connected with links (which are up at begging) and do not have bus structure any more. > > In addition, CPU and memory have two drivers -- their actual > > drivers/subsystems and their ACPI drivers. > > Again, I feel that is the root of the problem. Rafael seems to be > working on solving this, which I think is essencial to your work as > well. Yes, Rafael is doing excellent work to turn ACPI drivers into ACPI "scan" drivers, removing device driver portion, and keeping them as attach / detach operation to ACPI device object. My patchset is very much aligned with this direction. :) Thanks, -Toshi -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
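One mechanical detail behind the "unbind cannot fail" point above, for readers who have not chased it through the driver core: the sysfs unbind attribute goes through device_release_driver(), which returns void, so there is no way for a driver to veto the detach. A simplified illustration (not the actual drivers/base code):

#include <linux/device.h>

/*
 * device_release_driver() returns void, so whatever the ->remove()
 * callback thinks, the unbind itself always "succeeds".  Any policy
 * such as refusing to offline a busy CPU or memory block therefore has
 * to be enforced by the cpu/memory subsystems, which is the split
 * argued for above.
 */
static void example_unbind(struct device *dev)
{
	device_release_driver(dev);
}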
On Monday, February 04, 2013 09:19:09 AM Toshi Kani wrote: > On Mon, 2013-02-04 at 15:21 +0100, Rafael J. Wysocki wrote: > > On Monday, February 04, 2013 04:48:10 AM Greg KH wrote: > > > On Sun, Feb 03, 2013 at 09:44:39PM +0100, Rafael J. Wysocki wrote: > > > > > Yes, but those are just remove events and we can only see how destructive they > > > > > were after the removal. The point is to be able to figure out whether or not > > > > > we *want* to do the removal in the first place. > > > > > > > > > > Say you have a computing node which signals a hardware problem in a processor > > > > > package (the container with CPU cores, memory, PCI host bridge etc.). You > > > > > may want to eject that package, but you don't want to kill the system this > > > > > way. So if the eject is doable, it is very much desirable to do it, but if it > > > > > is not doable, you'd rather shut the box down and do the replacement afterward. > > > > > That may be costly, however (maybe weeks of computations), so it should be > > > > > avoided if possible, but not at the expense of crashing the box if the eject > > > > > doesn't work out. > > > > > > > > It seems to me that we could handle that with the help of a new flag, say > > > > "no_eject", in struct device, a global mutex, and a function that will walk > > > > the given subtree of the device hierarchy and check if "no_eject" is set for > > > > any devices in there. Plus a global "no_eject" switch, perhaps. > > > > > > I think this will always be racy, or at worst, slow things down on > > > normal device operations as you will always be having to grab this flag > > > whenever you want to do something new. > > > > I don't see why this particular scheme should be racy, at least I don't see any > > obvious races in it (although I'm not that good at races detection in general, > > admittedly). > > > > Also, I don't expect that flag to be used for everything, just for things known > > to seriously break if forcible eject is done. That may be not precise enough, > > so that's a matter of defining its purpose more precisely. > > > > We can do something like that on the ACPI level (ie. introduce a no_eject flag > > in struct acpi_device and provide an iterface for the layers above ACPI to > > manipulate it) but then devices without ACPI namespace objects won't be > > covered. That may not be a big deal, though. > > I am afraid that bringing the device status management into the ACPI > level would not a good idea. acpi_device should only reflect ACPI > device object information, not how its actual device is being used. > > I like your initiative of acpi_scan_driver and I think scanning / > trimming of ACPI object info is what the ACPI drivers should do. ACPI drivers, yes, but the users of ACPI already rely on information in struct acpi_device. Like ACPI device power states, for example. So platform_no_eject(dev) is not much different in that respect from platform_pci_set_power_state(pci_dev). The whole "eject" concept is somewhat ACPI-specific, though, and the eject notifications come from ACPI, so I don't have a problem with limiting it to ACPI-backed devices for the time being. If it turns out the be useful outside of ACPI, then we can move it up to the driver core. For now I don't see a compelling reason to do that. Thanks, Rafael
On Monday, February 04, 2013 09:46:18 AM Toshi Kani wrote: > On Mon, 2013-02-04 at 04:46 -0800, Greg KH wrote: > > On Sun, Feb 03, 2013 at 05:28:09PM -0700, Toshi Kani wrote: > > > On Sat, 2013-02-02 at 16:01 +0100, Greg KH wrote: > > > > On Fri, Feb 01, 2013 at 01:40:10PM -0700, Toshi Kani wrote: > > > > > On Fri, 2013-02-01 at 07:30 +0000, Greg KH wrote: > > > > > > On Thu, Jan 31, 2013 at 06:32:18PM -0700, Toshi Kani wrote: > > > > > > > This is already done for PCI host bridges and platform devices and I don't > > > > > > > > see why we can't do that for the other types of devices too. > > > > > > > > > > > > > > > > The only missing piece I see is a way to handle the "eject" problem, i.e. > > > > > > > > when we try do eject a device at the top of a subtree and need to tear down > > > > > > > > the entire subtree below it, but if that's going to lead to a system crash, > > > > > > > > for example, we want to cancel the eject. It seems to me that we'll need some > > > > > > > > help from the driver core here. > > > > > > > > > > > > > > There are three different approaches suggested for system device > > > > > > > hot-plug: > > > > > > > A. Proceed within system device bus scan. > > > > > > > B. Proceed within ACPI bus scan. > > > > > > > C. Proceed with a sequence (as a mini-boot). > > > > > > > > > > > > > > Option A uses system devices as tokens, option B uses acpi devices as > > > > > > > tokens, and option C uses resource tables as tokens, for their handlers. > > > > > > > > > > > > > > Here is summary of key questions & answers so far. I hope this > > > > > > > clarifies why I am suggesting option 3. > > > > > > > > > > > > > > 1. What are the system devices? > > > > > > > System devices provide system-wide core computing resources, which are > > > > > > > essential to compose a computer system. System devices are not > > > > > > > connected to any particular standard buses. > > > > > > > > > > > > Not a problem, lots of devices are not connected to any "particular > > > > > > standard busses". All this means is that system devices are connected > > > > > > to the "system" bus, nothing more. > > > > > > > > > > Can you give me a few examples of other devices that support hotplug and > > > > > are not connected to any particular buses? I will investigate them to > > > > > see how they are managed to support hotplug. > > > > > > > > Any device that is attached to any bus in the driver model can be > > > > hotunplugged from userspace by telling it to be "unbound" from the > > > > driver controlling it. Try it for any platform device in your system to > > > > see how it happens. > > > > > > The unbind operation, as I understand from you, is to detach a driver > > > from a device. Yes, unbinding can be done for any devices. It is > > > however different from hot-plug operation, which unplugs a device. > > > > Physically, yes, but to the driver involved, and the driver core, there > > is no difference. That was one of the primary goals of the driver core > > creation so many years ago. > > > > > Today, the unbind operation to an ACPI cpu/memory devices causes > > > hot-unplug (offline) operation to them, which is one of the major issues > > > for us since unbind cannot fail. This patchset addresses this issue by > > > making the unbind operation of ACPI cpu/memory devices to do the > > > unbinding only. ACPI drivers no longer control cpu and memory as they > > > are supposed to be controlled by their drivers, cpu and memory modules. 
> > > > I think that's the problem right there, solve that, please. > > We cannot eliminate the ACPI drivers since we have to scan ACPI. But we > can limit the ACPI drivers to do the scanning stuff only. This is > precisely the intend of this patchset. The real stuff, removing actual > devices, is done by the system device drivers/modules. In case you haven't realized that yet, the $subject patchset has no future. Let's just talk about how we can get what we need in more general terms. Thanks, Rafael
On Mon, 2013-02-04 at 20:48 +0100, Rafael J. Wysocki wrote: > On Monday, February 04, 2013 09:02:46 AM Toshi Kani wrote: > > On Mon, 2013-02-04 at 14:41 +0100, Rafael J. Wysocki wrote: > > > On Sunday, February 03, 2013 07:23:49 PM Greg KH wrote: > > > > On Sat, Feb 02, 2013 at 09:15:37PM +0100, Rafael J. Wysocki wrote: > > > > > On Saturday, February 02, 2013 03:58:01 PM Greg KH wrote: > > : > > > > > Yes, but those are just remove events and we can only see how destructive they > > > > > were after the removal. The point is to be able to figure out whether or not > > > > > we *want* to do the removal in the first place. > > > > > > > > Yes, but, you will always race if you try to test to see if you can shut > > > > down a device and then trying to do it. So walking the bus ahead of > > > > time isn't a good idea. > > > > > > > > And, we really don't have a viable way to recover if disconnect() fails, > > > > do we. What do we do in that situation, restore the other devices we > > > > disconnected successfully? How do we remember/know what they were? > > > > > > > > PCI hotplug almost had this same problem until the designers finally > > > > realized that they just had to accept the fact that removing a PCI > > > > device could either happen by: > > > > - a user yanking out the device, at which time the OS better > > > > clean up properly no matter what happens > > > > - the user asked nicely to remove a device, and the OS can take > > > > as long as it wants to complete that action, including > > > > stalling for noticable amounts of time before eventually, > > > > always letting the action succeed. > > > > > > > > I think the second thing is what you have to do here. If a user tells > > > > the OS it wants to remove these devices, you better do it. If you > > > > can't, because memory is being used by someone else, either move them > > > > off, or just hope that nothing bad happens, before the user gets > > > > frustrated and yanks out the CPU/memory module themselves physically :) > > > > > > Well, that we can't help, but sometimes users really *want* the OS to tell them > > > if it is safe to unplug something at this particualr time (think about the > > > Windows' "safe remove" feature for USB sticks, for example; that came out of > > > users' demand AFAIR). > > > > > > So in my opinion it would be good to give them an option to do "safe eject" or > > > "forcible eject", whichever they prefer. > > > > For system device hot-plug, it always needs to be "safe eject". This > > feature will be implemented on mission critical servers, which are > > managed by professional IT folks. Crashing a server causes serious > > money to the business. > > Well, "always" is a bit too strong a word as far as human behavior is concerned > in my opinion. > > That said I would be perfectly fine with not supporting the "forcible eject" to > start with and waiting for the first request to add support for it. I also > would be fine with taking bets on how much time it's going to take for such a > request to appear. :-) Sounds good. In my experience, though, it actually takes a LONG time to convince customers that "safe eject" is actually safe. Enterprise customers are so afraid of doing anything risky that might cause the system to crash or hang due to some defect. I would be very surprised to see a customer asking for a force operation when we do not guarantee its outcome. I have not seen such enterprise customers yet. 
Thanks, -Toshi
On Monday, February 04, 2013 09:02:46 AM Toshi Kani wrote: > On Mon, 2013-02-04 at 14:41 +0100, Rafael J. Wysocki wrote: > > On Sunday, February 03, 2013 07:23:49 PM Greg KH wrote: > > > On Sat, Feb 02, 2013 at 09:15:37PM +0100, Rafael J. Wysocki wrote: > > > > On Saturday, February 02, 2013 03:58:01 PM Greg KH wrote: > : > > > > Yes, but those are just remove events and we can only see how destructive they > > > > were after the removal. The point is to be able to figure out whether or not > > > > we *want* to do the removal in the first place. > > > > > > Yes, but, you will always race if you try to test to see if you can shut > > > down a device and then trying to do it. So walking the bus ahead of > > > time isn't a good idea. > > > > > > And, we really don't have a viable way to recover if disconnect() fails, > > > do we. What do we do in that situation, restore the other devices we > > > disconnected successfully? How do we remember/know what they were? > > > > > > PCI hotplug almost had this same problem until the designers finally > > > realized that they just had to accept the fact that removing a PCI > > > device could either happen by: > > > - a user yanking out the device, at which time the OS better > > > clean up properly no matter what happens > > > - the user asked nicely to remove a device, and the OS can take > > > as long as it wants to complete that action, including > > > stalling for noticable amounts of time before eventually, > > > always letting the action succeed. > > > > > > I think the second thing is what you have to do here. If a user tells > > > the OS it wants to remove these devices, you better do it. If you > > > can't, because memory is being used by someone else, either move them > > > off, or just hope that nothing bad happens, before the user gets > > > frustrated and yanks out the CPU/memory module themselves physically :) > > > > Well, that we can't help, but sometimes users really *want* the OS to tell them > > if it is safe to unplug something at this particualr time (think about the > > Windows' "safe remove" feature for USB sticks, for example; that came out of > > users' demand AFAIR). > > > > So in my opinion it would be good to give them an option to do "safe eject" or > > "forcible eject", whichever they prefer. > > For system device hot-plug, it always needs to be "safe eject". This > feature will be implemented on mission critical servers, which are > managed by professional IT folks. Crashing a server causes serious > money to the business. Well, "always" is a bit too strong a word as far as human behavior is concerned in my opinion. That said I would be perfectly fine with not supporting the "forcible eject" to start with and waiting for the first request to add support for it. I also would be fine with taking bets on how much time it's going to take for such a request to appear. :-) Thanks, Rafael
On Monday, February 04, 2013 06:33:52 AM Greg KH wrote: > On Mon, Feb 04, 2013 at 03:21:22PM +0100, Rafael J. Wysocki wrote: > > On Monday, February 04, 2013 04:48:10 AM Greg KH wrote: > > > On Sun, Feb 03, 2013 at 09:44:39PM +0100, Rafael J. Wysocki wrote: > > > > > Yes, but those are just remove events and we can only see how destructive they > > > > > were after the removal. The point is to be able to figure out whether or not > > > > > we *want* to do the removal in the first place. > > > > > > > > > > Say you have a computing node which signals a hardware problem in a processor > > > > > package (the container with CPU cores, memory, PCI host bridge etc.). You > > > > > may want to eject that package, but you don't want to kill the system this > > > > > way. So if the eject is doable, it is very much desirable to do it, but if it > > > > > is not doable, you'd rather shut the box down and do the replacement afterward. > > > > > That may be costly, however (maybe weeks of computations), so it should be > > > > > avoided if possible, but not at the expense of crashing the box if the eject > > > > > doesn't work out. > > > > > > > > It seems to me that we could handle that with the help of a new flag, say > > > > "no_eject", in struct device, a global mutex, and a function that will walk > > > > the given subtree of the device hierarchy and check if "no_eject" is set for > > > > any devices in there. Plus a global "no_eject" switch, perhaps. > > > > > > I think this will always be racy, or at worst, slow things down on > > > normal device operations as you will always be having to grab this flag > > > whenever you want to do something new. > > > > I don't see why this particular scheme should be racy, at least I don't see any > > obvious races in it (although I'm not that good at races detection in general, > > admittedly). > > > > Also, I don't expect that flag to be used for everything, just for things known > > to seriously break if forcible eject is done. That may be not precise enough, > > so that's a matter of defining its purpose more precisely. > > > > We can do something like that on the ACPI level (ie. introduce a no_eject flag > > in struct acpi_device and provide an iterface for the layers above ACPI to > > manipulate it) but then devices without ACPI namespace objects won't be > > covered. That may not be a big deal, though. > > > > So say dev is about to be used for something incompatible with ejecting, so to > > speak. Then, one would do platform_lock_eject(dev), which would check if dev > > has an ACPI handle and then take acpi_eject_lock (if so). The return value of > > platform_lock_eject(dev) would need to be checked to see if the device is not > > gone. If it returns success (0), one would do something to the device and > > call platform_no_eject(dev) and then platform_unlock_eject(dev). > > How does a device "know" it is doing something that is incompatible with > ejecting? That's a non-trivial task from what I can tell. I agree that this is complicated in general. But. There are devices known to have software "offline" and "online" operations such that after the "offline" the given device is guaranteed to be not used until "online". We have that for CPU cores, for example, and user space can do it via /sys/devices/system/cpu/cpuX/online . So, why don't we make the "online" set the no_eject flag (under the lock as appropriate) and the "offline" clear it? 
And why don't we define such "online" and "offline" for all of the other "system" stuff, like memory, PCI host bridges etc. and make it behave analogously? Then, it is quite simple to say which devices should use the no_eject flag: devices that have "online" and "offline" exported to user space. And guess who's responsible for "offlining" all of those things before trying to eject them: user space is. From the kernel's point of view it is all clear. Hands clean. :-) Now, there's a different problem how to expose all of the relevant information to user space so that it knows what to "offline" for the specific eject operation to succeed, but that's kind of separate and worth addressing anyway. > What happens if a device wants to set that flag, right after it was told > to eject and the device was in the middle of being removed? How can you > "fail" the "I can't be removed me now, so don't" requirement that it now > has? This one is easy. :-) If platform_lock_eject() is called when an eject is under way, it will block on acpi_eject_lock until the eject is complete and if the device is gone as a result of the eject, it will return an error code. In turn, if an eject happens after platform_lock_eject(), it will block until platform_unlock_eject() and if platform_no_eject() is called in between the lock and unlock, it will notice the device with no_eject set and bail out. Quite obviously, it would be a bug to call platform_lock_eject() from within an eject code path. Thanks, Rafael
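Putting the steps of this message together, the eject side could be sketched as below, reusing the same hypothetical lock and no_eject marker as earlier in the thread. acpi_bus_trim() and _EJ0 are left as comments because the exact calls depend on the scan-handler rework being discussed.

#include <linux/acpi.h>
#include <linux/errno.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(acpi_eject_lock);

/* Would walk @adev and everything below it; reduced to a stub here. */
static bool subtree_has_no_eject(struct acpi_device *adev)
{
	return false;	/* placeholder for the proposed no_eject bit */
}

static int handle_eject_request(struct acpi_device *adev, bool forcible)
{
	int ret = 0;

	mutex_lock(&acpi_eject_lock);				/* (1) */

	if (!forcible && subtree_has_no_eject(adev)) {		/* (2) */
		ret = -EBUSY;	/* report 0x81/0x82 back via _OST */
		goto out;
	}

	/* (3) acpi_bus_trim() would tear down the ACPI device objects */
	/* (4) evaluating _EJ0 would then perform the physical eject   */
out:
	mutex_unlock(&acpi_eject_lock);				/* (5) */
	return ret;
}

With the online/offline coupling described above, writing 0 to files like /sys/devices/system/cpu/cpuX/online is what would clear the markers, so a user space that offlines everything first is exactly what makes step (2) pass.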
On Monday, February 04, 2013 12:46:24 PM Toshi Kani wrote: > On Mon, 2013-02-04 at 20:48 +0100, Rafael J. Wysocki wrote: > > On Monday, February 04, 2013 09:02:46 AM Toshi Kani wrote: > > > On Mon, 2013-02-04 at 14:41 +0100, Rafael J. Wysocki wrote: > > > > On Sunday, February 03, 2013 07:23:49 PM Greg KH wrote: > > > > > On Sat, Feb 02, 2013 at 09:15:37PM +0100, Rafael J. Wysocki wrote: > > > > > > On Saturday, February 02, 2013 03:58:01 PM Greg KH wrote: > > > : > > > > > > Yes, but those are just remove events and we can only see how destructive they > > > > > > were after the removal. The point is to be able to figure out whether or not > > > > > > we *want* to do the removal in the first place. > > > > > > > > > > Yes, but, you will always race if you try to test to see if you can shut > > > > > down a device and then trying to do it. So walking the bus ahead of > > > > > time isn't a good idea. > > > > > > > > > > And, we really don't have a viable way to recover if disconnect() fails, > > > > > do we. What do we do in that situation, restore the other devices we > > > > > disconnected successfully? How do we remember/know what they were? > > > > > > > > > > PCI hotplug almost had this same problem until the designers finally > > > > > realized that they just had to accept the fact that removing a PCI > > > > > device could either happen by: > > > > > - a user yanking out the device, at which time the OS better > > > > > clean up properly no matter what happens > > > > > - the user asked nicely to remove a device, and the OS can take > > > > > as long as it wants to complete that action, including > > > > > stalling for noticable amounts of time before eventually, > > > > > always letting the action succeed. > > > > > > > > > > I think the second thing is what you have to do here. If a user tells > > > > > the OS it wants to remove these devices, you better do it. If you > > > > > can't, because memory is being used by someone else, either move them > > > > > off, or just hope that nothing bad happens, before the user gets > > > > > frustrated and yanks out the CPU/memory module themselves physically :) > > > > > > > > Well, that we can't help, but sometimes users really *want* the OS to tell them > > > > if it is safe to unplug something at this particualr time (think about the > > > > Windows' "safe remove" feature for USB sticks, for example; that came out of > > > > users' demand AFAIR). > > > > > > > > So in my opinion it would be good to give them an option to do "safe eject" or > > > > "forcible eject", whichever they prefer. > > > > > > For system device hot-plug, it always needs to be "safe eject". This > > > feature will be implemented on mission critical servers, which are > > > managed by professional IT folks. Crashing a server causes serious > > > money to the business. > > > > Well, "always" is a bit too strong a word as far as human behavior is concerned > > in my opinion. > > > > That said I would be perfectly fine with not supporting the "forcible eject" to > > start with and waiting for the first request to add support for it. I also > > would be fine with taking bets on how much time it's going to take for such a > > request to appear. :-) > > Sounds good. In my experience, though, it actually takes a LONG time to > convince customers that "safe eject" is actually safe. Enterprise > customers are so afraid of doing anything risky that might cause the > system to crash or hang due to some defect. 
I would be very surprised > to see a customer asking for a force operation when we do not guarantee > its outcome. I have not seen such enterprise customers yet. But we're talking about a kernel that is supposed to run on mobile phones too, among other things. Thanks, Rafael
On Mon, 2013-02-04 at 21:12 +0100, Rafael J. Wysocki wrote: > On Monday, February 04, 2013 12:46:24 PM Toshi Kani wrote: > > On Mon, 2013-02-04 at 20:48 +0100, Rafael J. Wysocki wrote: > > > On Monday, February 04, 2013 09:02:46 AM Toshi Kani wrote: > > > > On Mon, 2013-02-04 at 14:41 +0100, Rafael J. Wysocki wrote: > > > > > On Sunday, February 03, 2013 07:23:49 PM Greg KH wrote: > > > > > > On Sat, Feb 02, 2013 at 09:15:37PM +0100, Rafael J. Wysocki wrote: > > > > > > > On Saturday, February 02, 2013 03:58:01 PM Greg KH wrote: > > > > : > > > > > > > Yes, but those are just remove events and we can only see how destructive they > > > > > > > were after the removal. The point is to be able to figure out whether or not > > > > > > > we *want* to do the removal in the first place. > > > > > > > > > > > > Yes, but, you will always race if you try to test to see if you can shut > > > > > > down a device and then trying to do it. So walking the bus ahead of > > > > > > time isn't a good idea. > > > > > > > > > > > > And, we really don't have a viable way to recover if disconnect() fails, > > > > > > do we. What do we do in that situation, restore the other devices we > > > > > > disconnected successfully? How do we remember/know what they were? > > > > > > > > > > > > PCI hotplug almost had this same problem until the designers finally > > > > > > realized that they just had to accept the fact that removing a PCI > > > > > > device could either happen by: > > > > > > - a user yanking out the device, at which time the OS better > > > > > > clean up properly no matter what happens > > > > > > - the user asked nicely to remove a device, and the OS can take > > > > > > as long as it wants to complete that action, including > > > > > > stalling for noticable amounts of time before eventually, > > > > > > always letting the action succeed. > > > > > > > > > > > > I think the second thing is what you have to do here. If a user tells > > > > > > the OS it wants to remove these devices, you better do it. If you > > > > > > can't, because memory is being used by someone else, either move them > > > > > > off, or just hope that nothing bad happens, before the user gets > > > > > > frustrated and yanks out the CPU/memory module themselves physically :) > > > > > > > > > > Well, that we can't help, but sometimes users really *want* the OS to tell them > > > > > if it is safe to unplug something at this particualr time (think about the > > > > > Windows' "safe remove" feature for USB sticks, for example; that came out of > > > > > users' demand AFAIR). > > > > > > > > > > So in my opinion it would be good to give them an option to do "safe eject" or > > > > > "forcible eject", whichever they prefer. > > > > > > > > For system device hot-plug, it always needs to be "safe eject". This > > > > feature will be implemented on mission critical servers, which are > > > > managed by professional IT folks. Crashing a server causes serious > > > > money to the business. > > > > > > Well, "always" is a bit too strong a word as far as human behavior is concerned > > > in my opinion. > > > > > > That said I would be perfectly fine with not supporting the "forcible eject" to > > > start with and waiting for the first request to add support for it. I also > > > would be fine with taking bets on how much time it's going to take for such a > > > request to appear. :-) > > > > Sounds good. In my experience, though, it actually takes a LONG time to > > convince customers that "safe eject" is actually safe. 
Enterprise > > customers are so afraid of doing anything risky that might cause the > > system to crash or hang due to some defect. I would be very surprised > > to see a customer asking for a force operation when we do not guarantee > > its outcome. I have not seen such enterprise customers yet. > > But we're talking about a kernel that is supposed to run on mobile phones too, > among other things. I think using this feature for RAS, i.e. replacing a faulty device on-line, will continue to be limited to high-end systems. For low-end systems, it does not make sense for customers to pay much $$ for this feature. They can just shut the system down for replacement, or they can simply buy a new system instead of repairing. That said, using this feature on a VM for workload balancing does not require any special hardware. So, I can see someone willing to try out a force option on a VM for personal use, just to see how it goes. Thanks, -Toshi
On Mon, 2013-02-04 at 20:45 +0100, Rafael J. Wysocki wrote: > On Monday, February 04, 2013 09:46:18 AM Toshi Kani wrote: > > On Mon, 2013-02-04 at 04:46 -0800, Greg KH wrote: > > > On Sun, Feb 03, 2013 at 05:28:09PM -0700, Toshi Kani wrote: > > > > On Sat, 2013-02-02 at 16:01 +0100, Greg KH wrote: > > > > > On Fri, Feb 01, 2013 at 01:40:10PM -0700, Toshi Kani wrote: > > > > > > On Fri, 2013-02-01 at 07:30 +0000, Greg KH wrote: > > > > > > > On Thu, Jan 31, 2013 at 06:32:18PM -0700, Toshi Kani wrote: > > > > > > > > This is already done for PCI host bridges and platform devices and I don't > > > > > > > > > see why we can't do that for the other types of devices too. > > > > > > > > > > > > > > > > > > The only missing piece I see is a way to handle the "eject" problem, i.e. > > > > > > > > > when we try do eject a device at the top of a subtree and need to tear down > > > > > > > > > the entire subtree below it, but if that's going to lead to a system crash, > > > > > > > > > for example, we want to cancel the eject. It seems to me that we'll need some > > > > > > > > > help from the driver core here. > > > > > > > > > > > > > > > > There are three different approaches suggested for system device > > > > > > > > hot-plug: > > > > > > > > A. Proceed within system device bus scan. > > > > > > > > B. Proceed within ACPI bus scan. > > > > > > > > C. Proceed with a sequence (as a mini-boot). > > > > > > > > > > > > > > > > Option A uses system devices as tokens, option B uses acpi devices as > > > > > > > > tokens, and option C uses resource tables as tokens, for their handlers. > > > > > > > > > > > > > > > > Here is summary of key questions & answers so far. I hope this > > > > > > > > clarifies why I am suggesting option 3. > > > > > > > > > > > > > > > > 1. What are the system devices? > > > > > > > > System devices provide system-wide core computing resources, which are > > > > > > > > essential to compose a computer system. System devices are not > > > > > > > > connected to any particular standard buses. > > > > > > > > > > > > > > Not a problem, lots of devices are not connected to any "particular > > > > > > > standard busses". All this means is that system devices are connected > > > > > > > to the "system" bus, nothing more. > > > > > > > > > > > > Can you give me a few examples of other devices that support hotplug and > > > > > > are not connected to any particular buses? I will investigate them to > > > > > > see how they are managed to support hotplug. > > > > > > > > > > Any device that is attached to any bus in the driver model can be > > > > > hotunplugged from userspace by telling it to be "unbound" from the > > > > > driver controlling it. Try it for any platform device in your system to > > > > > see how it happens. > > > > > > > > The unbind operation, as I understand from you, is to detach a driver > > > > from a device. Yes, unbinding can be done for any devices. It is > > > > however different from hot-plug operation, which unplugs a device. > > > > > > Physically, yes, but to the driver involved, and the driver core, there > > > is no difference. That was one of the primary goals of the driver core > > > creation so many years ago. > > > > > > > Today, the unbind operation to an ACPI cpu/memory devices causes > > > > hot-unplug (offline) operation to them, which is one of the major issues > > > > for us since unbind cannot fail. This patchset addresses this issue by > > > > making the unbind operation of ACPI cpu/memory devices to do the > > > > unbinding only. 
ACPI drivers no longer control cpu and memory as they > > > > are supposed to be controlled by their drivers, cpu and memory modules. > > > > > > I think that's the problem right there, solve that, please. > > > > We cannot eliminate the ACPI drivers since we have to scan ACPI. But we > > can limit the ACPI drivers to do the scanning stuff only. This is > > precisely the intend of this patchset. The real stuff, removing actual > > devices, is done by the system device drivers/modules. > > In case you haven't realized that yet, the $subject patchset has no future. That's really disappointing, especially since this basic approach has been proven to work on other OSes for years... > Let's just talk about how we can get what we need in more general terms. So, are we heading to an approach of doing everything in ACPI? I am not clear on which direction we have agreed or disagreed. As for the eject flag approach, I agree with Greg. Thanks, -Toshi
On Mon, 2013-02-04 at 21:07 +0100, Rafael J. Wysocki wrote: > On Monday, February 04, 2013 06:33:52 AM Greg KH wrote: > > On Mon, Feb 04, 2013 at 03:21:22PM +0100, Rafael J. Wysocki wrote: > > > On Monday, February 04, 2013 04:48:10 AM Greg KH wrote: > > > > On Sun, Feb 03, 2013 at 09:44:39PM +0100, Rafael J. Wysocki wrote: > > > > > > Yes, but those are just remove events and we can only see how destructive they > > > > > > were after the removal. The point is to be able to figure out whether or not > > > > > > we *want* to do the removal in the first place. > > > > > > > > > > > > Say you have a computing node which signals a hardware problem in a processor > > > > > > package (the container with CPU cores, memory, PCI host bridge etc.). You > > > > > > may want to eject that package, but you don't want to kill the system this > > > > > > way. So if the eject is doable, it is very much desirable to do it, but if it > > > > > > is not doable, you'd rather shut the box down and do the replacement afterward. > > > > > > That may be costly, however (maybe weeks of computations), so it should be > > > > > > avoided if possible, but not at the expense of crashing the box if the eject > > > > > > doesn't work out. > > > > > > > > > > It seems to me that we could handle that with the help of a new flag, say > > > > > "no_eject", in struct device, a global mutex, and a function that will walk > > > > > the given subtree of the device hierarchy and check if "no_eject" is set for > > > > > any devices in there. Plus a global "no_eject" switch, perhaps. > > > > > > > > I think this will always be racy, or at worst, slow things down on > > > > normal device operations as you will always be having to grab this flag > > > > whenever you want to do something new. > > > > > > I don't see why this particular scheme should be racy, at least I don't see any > > > obvious races in it (although I'm not that good at races detection in general, > > > admittedly). > > > > > > Also, I don't expect that flag to be used for everything, just for things known > > > to seriously break if forcible eject is done. That may be not precise enough, > > > so that's a matter of defining its purpose more precisely. > > > > > > We can do something like that on the ACPI level (ie. introduce a no_eject flag > > > in struct acpi_device and provide an iterface for the layers above ACPI to > > > manipulate it) but then devices without ACPI namespace objects won't be > > > covered. That may not be a big deal, though. > > > > > > So say dev is about to be used for something incompatible with ejecting, so to > > > speak. Then, one would do platform_lock_eject(dev), which would check if dev > > > has an ACPI handle and then take acpi_eject_lock (if so). The return value of > > > platform_lock_eject(dev) would need to be checked to see if the device is not > > > gone. If it returns success (0), one would do something to the device and > > > call platform_no_eject(dev) and then platform_unlock_eject(dev). > > > > How does a device "know" it is doing something that is incompatible with > > ejecting? That's a non-trivial task from what I can tell. > > I agree that this is complicated in general. But. > > There are devices known to have software "offline" and "online" operations > such that after the "offline" the given device is guaranteed to be not used > until "online". We have that for CPU cores, for example, and user space can > do it via /sys/devices/system/cpu/cpuX/online . 
So, why don't we make the > "online" set the no_eject flag (under the lock as appropriate) and the > "offline" clear it? And why don't we define such "online" and "offline" for > all of the other "system" stuff, like memory, PCI host bridges etc. and make it > behave analogously? > > Then, it is quite simple to say which devices should use the no_eject flag: > devices that have "online" and "offline" exported to user space. And guess > who's responsible for "offlining" all of those things before trying to eject > them: user space is. From the kernel's point of view it is all clear. Hands > clean. :-) > > Now, there's a different problem how to expose all of the relevant information > to user space so that it knows what to "offline" for the specific eject > operation to succeed, but that's kind of separate and worth addressing > anyway. So, the idea is to run a user space program that off-lines all relevant devices before trimming ACPI devices. Is that right? That sounds like a worth idea to consider with. This basically moves the "sequencer" part into user space instead of the kernel space in my proposal. I agree that how to expose all of the relevant info to user space is an issue. Also, we will need to make sure that the user program always runs per a kernel request and then informs a result back to the kernel, so that the kernel can do the rest of an operation and inform a result to FW with _OST or _EJ0. This loop has to close. I think it is going to be more complicated than the kernel-only approach. In addition, I am not sure if the "no_eject" flag in acpi_device is really necessary here since the user program will inform the kernel if all devices are off-line. Also, the kernel will likely need to expose the device info to the user program to tell which devices need to be off-lined. At that time, the kernel already knows if there is any on-line device in the scope. > > What happens if a device wants to set that flag, right after it was told > > to eject and the device was in the middle of being removed? How can you > > "fail" the "I can't be removed me now, so don't" requirement that it now > > has? > > This one is easy. :-) > > If platform_lock_eject() is called when an eject is under way, it will block > on acpi_eject_lock until the eject is complete and if the device is gone as > a result of the eject, it will return an error code. In this case, we do really need to make sure that the user program does not get killed in the middle of its operation since the kernel is holding a lock while it is under way. Thanks, -Toshi > In turn, if an eject happens after platform_lock_eject(), it will block until > platform_unlock_eject() and if platform_no_eject() is called in between the > lock and unlock, it will notice the device with no_eject set and bail out. > > Quite obviously, it would be a bug to call platform_lock_eject() from within an > eject code path. > > Thanks, > Rafael > > -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
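For reference, a minimal C sketch of the locking scheme quoted above, assuming the global acpi_eject_lock and the no_eject flag in struct device that Rafael proposes. None of these symbols exist in the mainline kernel; the names and the field are taken from the proposal in this thread only, so treat this purely as an illustration of the idea.

/*
 * Hypothetical sketch of the proposed eject-locking interface.
 * "no_eject" is the flag Rafael suggests adding to struct device;
 * it is not a mainline field.
 */
#include <linux/device.h>
#include <linux/mutex.h>
#include <linux/errno.h>

static DEFINE_MUTEX(acpi_eject_lock);	/* global lock from the proposal */

/* Serialize against an eject in progress; fails if the device is gone. */
int platform_lock_eject(struct device *dev)
{
	mutex_lock(&acpi_eject_lock);
	if (!device_is_registered(dev)) {
		/* An eject completed while we were waiting for the lock. */
		mutex_unlock(&acpi_eject_lock);
		return -ENODEV;
	}
	return 0;
}

/* Mark dev as "in use for something incompatible with ejecting". */
void platform_no_eject(struct device *dev)
{
	/* Caller holds acpi_eject_lock via platform_lock_eject(). */
	dev->no_eject = true;		/* proposed field, not in mainline */
}

void platform_unlock_eject(struct device *dev)
{
	mutex_unlock(&acpi_eject_lock);
}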
On Monday, February 04, 2013 01:34:18 PM Toshi Kani wrote: > On Mon, 2013-02-04 at 21:12 +0100, Rafael J. Wysocki wrote: > > On Monday, February 04, 2013 12:46:24 PM Toshi Kani wrote: > > > On Mon, 2013-02-04 at 20:48 +0100, Rafael J. Wysocki wrote: > > > > On Monday, February 04, 2013 09:02:46 AM Toshi Kani wrote: > > > > > On Mon, 2013-02-04 at 14:41 +0100, Rafael J. Wysocki wrote: > > > > > > On Sunday, February 03, 2013 07:23:49 PM Greg KH wrote: > > > > > > > On Sat, Feb 02, 2013 at 09:15:37PM +0100, Rafael J. Wysocki wrote: > > > > > > > > On Saturday, February 02, 2013 03:58:01 PM Greg KH wrote: > > > > > : > > > > > > > > Yes, but those are just remove events and we can only see how destructive they > > > > > > > > were after the removal. The point is to be able to figure out whether or not > > > > > > > > we *want* to do the removal in the first place. > > > > > > > > > > > > > > Yes, but, you will always race if you try to test to see if you can shut > > > > > > > down a device and then trying to do it. So walking the bus ahead of > > > > > > > time isn't a good idea. > > > > > > > > > > > > > > And, we really don't have a viable way to recover if disconnect() fails, > > > > > > > do we. What do we do in that situation, restore the other devices we > > > > > > > disconnected successfully? How do we remember/know what they were? > > > > > > > > > > > > > > PCI hotplug almost had this same problem until the designers finally > > > > > > > realized that they just had to accept the fact that removing a PCI > > > > > > > device could either happen by: > > > > > > > - a user yanking out the device, at which time the OS better > > > > > > > clean up properly no matter what happens > > > > > > > - the user asked nicely to remove a device, and the OS can take > > > > > > > as long as it wants to complete that action, including > > > > > > > stalling for noticable amounts of time before eventually, > > > > > > > always letting the action succeed. > > > > > > > > > > > > > > I think the second thing is what you have to do here. If a user tells > > > > > > > the OS it wants to remove these devices, you better do it. If you > > > > > > > can't, because memory is being used by someone else, either move them > > > > > > > off, or just hope that nothing bad happens, before the user gets > > > > > > > frustrated and yanks out the CPU/memory module themselves physically :) > > > > > > > > > > > > Well, that we can't help, but sometimes users really *want* the OS to tell them > > > > > > if it is safe to unplug something at this particualr time (think about the > > > > > > Windows' "safe remove" feature for USB sticks, for example; that came out of > > > > > > users' demand AFAIR). > > > > > > > > > > > > So in my opinion it would be good to give them an option to do "safe eject" or > > > > > > "forcible eject", whichever they prefer. > > > > > > > > > > For system device hot-plug, it always needs to be "safe eject". This > > > > > feature will be implemented on mission critical servers, which are > > > > > managed by professional IT folks. Crashing a server causes serious > > > > > money to the business. > > > > > > > > Well, "always" is a bit too strong a word as far as human behavior is concerned > > > > in my opinion. > > > > > > > > That said I would be perfectly fine with not supporting the "forcible eject" to > > > > start with and waiting for the first request to add support for it. 
I also > > > > would be fine with taking bets on how much time it's going to take for such a > > > > request to appear. :-) > > > > > > Sounds good. In my experience, though, it actually takes a LONG time to > > > convince customers that "safe eject" is actually safe. Enterprise > > > customers are so afraid of doing anything risky that might cause the > > > system to crash or hang due to some defect. I would be very surprised > > > to see a customer asking for a force operation when we do not guarantee > > > its outcome. I have not seen such enterprise customers yet. > > > > But we're talking about a kernel that is supposed to run on mobile phones too, > > among other things. > > I think using this feature for RAS i.e. replacing a faulty device > on-line, will continue to be limited for high-end systems. For low-end > systems, it does not make sense for customers to pay much $$ for this > feature. They can just shut the system down for replacement, or they > can simply buy a new system instead of repairing. > > That said, using this feature on VM for workload balancing does not > require any special hardware. So, I can see someone willing to try out > to see how it goes with a force option on VM for personal use. Besides, SMP was a $$ "enterprise" feature not so long ago, so things tend to change. :-) Thanks, Rafael
On Monday, February 04, 2013 01:59:27 PM Toshi Kani wrote: > On Mon, 2013-02-04 at 20:45 +0100, Rafael J. Wysocki wrote: > > On Monday, February 04, 2013 09:46:18 AM Toshi Kani wrote: > > > On Mon, 2013-02-04 at 04:46 -0800, Greg KH wrote: > > > > On Sun, Feb 03, 2013 at 05:28:09PM -0700, Toshi Kani wrote: > > > > > On Sat, 2013-02-02 at 16:01 +0100, Greg KH wrote: > > > > > > On Fri, Feb 01, 2013 at 01:40:10PM -0700, Toshi Kani wrote: > > > > > > > On Fri, 2013-02-01 at 07:30 +0000, Greg KH wrote: > > > > > > > > On Thu, Jan 31, 2013 at 06:32:18PM -0700, Toshi Kani wrote: > > > > > > > > > This is already done for PCI host bridges and platform devices and I don't > > > > > > > > > > see why we can't do that for the other types of devices too. > > > > > > > > > > > > > > > > > > > > The only missing piece I see is a way to handle the "eject" problem, i.e. > > > > > > > > > > when we try do eject a device at the top of a subtree and need to tear down > > > > > > > > > > the entire subtree below it, but if that's going to lead to a system crash, > > > > > > > > > > for example, we want to cancel the eject. It seems to me that we'll need some > > > > > > > > > > help from the driver core here. > > > > > > > > > > > > > > > > > > There are three different approaches suggested for system device > > > > > > > > > hot-plug: > > > > > > > > > A. Proceed within system device bus scan. > > > > > > > > > B. Proceed within ACPI bus scan. > > > > > > > > > C. Proceed with a sequence (as a mini-boot). > > > > > > > > > > > > > > > > > > Option A uses system devices as tokens, option B uses acpi devices as > > > > > > > > > tokens, and option C uses resource tables as tokens, for their handlers. > > > > > > > > > > > > > > > > > > Here is summary of key questions & answers so far. I hope this > > > > > > > > > clarifies why I am suggesting option 3. > > > > > > > > > > > > > > > > > > 1. What are the system devices? > > > > > > > > > System devices provide system-wide core computing resources, which are > > > > > > > > > essential to compose a computer system. System devices are not > > > > > > > > > connected to any particular standard buses. > > > > > > > > > > > > > > > > Not a problem, lots of devices are not connected to any "particular > > > > > > > > standard busses". All this means is that system devices are connected > > > > > > > > to the "system" bus, nothing more. > > > > > > > > > > > > > > Can you give me a few examples of other devices that support hotplug and > > > > > > > are not connected to any particular buses? I will investigate them to > > > > > > > see how they are managed to support hotplug. > > > > > > > > > > > > Any device that is attached to any bus in the driver model can be > > > > > > hotunplugged from userspace by telling it to be "unbound" from the > > > > > > driver controlling it. Try it for any platform device in your system to > > > > > > see how it happens. > > > > > > > > > > The unbind operation, as I understand from you, is to detach a driver > > > > > from a device. Yes, unbinding can be done for any devices. It is > > > > > however different from hot-plug operation, which unplugs a device. > > > > > > > > Physically, yes, but to the driver involved, and the driver core, there > > > > is no difference. That was one of the primary goals of the driver core > > > > creation so many years ago. 
> > > > > > > > > Today, the unbind operation to an ACPI cpu/memory devices causes > > > > > hot-unplug (offline) operation to them, which is one of the major issues > > > > > for us since unbind cannot fail. This patchset addresses this issue by > > > > > making the unbind operation of ACPI cpu/memory devices to do the > > > > > unbinding only. ACPI drivers no longer control cpu and memory as they > > > > > are supposed to be controlled by their drivers, cpu and memory modules. > > > > > > > > I think that's the problem right there, solve that, please. > > > > > > We cannot eliminate the ACPI drivers since we have to scan ACPI. But we > > > can limit the ACPI drivers to do the scanning stuff only. This is > > > precisely the intend of this patchset. The real stuff, removing actual > > > devices, is done by the system device drivers/modules. > > > > In case you haven't realized that yet, the $subject patchset has no future. > > That's really disappointing, esp. the fact that this basic approach has > been proven to work on other OS for years... > > > > Let's just talk about how we can get what we need in more general terms. > > So, are we heading to an approach of doing everything in ACPI? I am not > clear about which direction we have agreed with or disagreed with. > > As for the eject flag approach, I agree with Greg. Well, I'm not sure which of the Greg's thoughts you agree with. :-) Thanks, Rafael
On Tue, 2013-02-05 at 00:23 +0100, Rafael J. Wysocki wrote: > On Monday, February 04, 2013 01:59:27 PM Toshi Kani wrote: > > On Mon, 2013-02-04 at 20:45 +0100, Rafael J. Wysocki wrote: > > > On Monday, February 04, 2013 09:46:18 AM Toshi Kani wrote: > > > > On Mon, 2013-02-04 at 04:46 -0800, Greg KH wrote: > > > > > On Sun, Feb 03, 2013 at 05:28:09PM -0700, Toshi Kani wrote: > > > > > > On Sat, 2013-02-02 at 16:01 +0100, Greg KH wrote: > > > > > > > On Fri, Feb 01, 2013 at 01:40:10PM -0700, Toshi Kani wrote: > > > > > > > > On Fri, 2013-02-01 at 07:30 +0000, Greg KH wrote: > > > > > > > > > On Thu, Jan 31, 2013 at 06:32:18PM -0700, Toshi Kani wrote: > > > > > > > > > > This is already done for PCI host bridges and platform devices and I don't > > > > > > > > > > > see why we can't do that for the other types of devices too. > > > > > > > > > > > > > > > > > > > > > > The only missing piece I see is a way to handle the "eject" problem, i.e. > > > > > > > > > > > when we try do eject a device at the top of a subtree and need to tear down > > > > > > > > > > > the entire subtree below it, but if that's going to lead to a system crash, > > > > > > > > > > > for example, we want to cancel the eject. It seems to me that we'll need some > > > > > > > > > > > help from the driver core here. > > > > > > > > > > > > > > > > > > > > There are three different approaches suggested for system device > > > > > > > > > > hot-plug: > > > > > > > > > > A. Proceed within system device bus scan. > > > > > > > > > > B. Proceed within ACPI bus scan. > > > > > > > > > > C. Proceed with a sequence (as a mini-boot). > > > > > > > > > > > > > > > > > > > > Option A uses system devices as tokens, option B uses acpi devices as > > > > > > > > > > tokens, and option C uses resource tables as tokens, for their handlers. > > > > > > > > > > > > > > > > > > > > Here is summary of key questions & answers so far. I hope this > > > > > > > > > > clarifies why I am suggesting option 3. > > > > > > > > > > > > > > > > > > > > 1. What are the system devices? > > > > > > > > > > System devices provide system-wide core computing resources, which are > > > > > > > > > > essential to compose a computer system. System devices are not > > > > > > > > > > connected to any particular standard buses. > > > > > > > > > > > > > > > > > > Not a problem, lots of devices are not connected to any "particular > > > > > > > > > standard busses". All this means is that system devices are connected > > > > > > > > > to the "system" bus, nothing more. > > > > > > > > > > > > > > > > Can you give me a few examples of other devices that support hotplug and > > > > > > > > are not connected to any particular buses? I will investigate them to > > > > > > > > see how they are managed to support hotplug. > > > > > > > > > > > > > > Any device that is attached to any bus in the driver model can be > > > > > > > hotunplugged from userspace by telling it to be "unbound" from the > > > > > > > driver controlling it. Try it for any platform device in your system to > > > > > > > see how it happens. > > > > > > > > > > > > The unbind operation, as I understand from you, is to detach a driver > > > > > > from a device. Yes, unbinding can be done for any devices. It is > > > > > > however different from hot-plug operation, which unplugs a device. > > > > > > > > > > Physically, yes, but to the driver involved, and the driver core, there > > > > > is no difference. That was one of the primary goals of the driver core > > > > > creation so many years ago. 
> > > > > > > > > > > Today, the unbind operation to an ACPI cpu/memory devices causes > > > > > > hot-unplug (offline) operation to them, which is one of the major issues > > > > > > for us since unbind cannot fail. This patchset addresses this issue by > > > > > > making the unbind operation of ACPI cpu/memory devices to do the > > > > > > unbinding only. ACPI drivers no longer control cpu and memory as they > > > > > > are supposed to be controlled by their drivers, cpu and memory modules. > > > > > > > > > > I think that's the problem right there, solve that, please. > > > > > > > > We cannot eliminate the ACPI drivers since we have to scan ACPI. But we > > > > can limit the ACPI drivers to do the scanning stuff only. This is > > > > precisely the intend of this patchset. The real stuff, removing actual > > > > devices, is done by the system device drivers/modules. > > > > > > In case you haven't realized that yet, the $subject patchset has no future. > > > > That's really disappointing, esp. the fact that this basic approach has > > been proven to work on other OS for years... > > > > > > > Let's just talk about how we can get what we need in more general terms. > > > > So, are we heading to an approach of doing everything in ACPI? I am not > > clear about which direction we have agreed with or disagreed with. > > > > As for the eject flag approach, I agree with Greg. > > Well, I'm not sure which of the Greg's thoughts you agree with. :-) Sorry, that was the Greg's comment below. But then, I saw your other email clarifying that the no_eject flag only reflects online/offline status, not how the device is being used. So, I replied with my thoughts in a separate email. :) === How does a device "know" it is doing something that is incompatible with ejecting? That's a non-trivial task from what I can tell. === Thanks, -Toshi -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Monday, February 04, 2013 03:13:29 PM Toshi Kani wrote: > On Mon, 2013-02-04 at 21:07 +0100, Rafael J. Wysocki wrote: > > On Monday, February 04, 2013 06:33:52 AM Greg KH wrote: > > > On Mon, Feb 04, 2013 at 03:21:22PM +0100, Rafael J. Wysocki wrote: > > > > On Monday, February 04, 2013 04:48:10 AM Greg KH wrote: > > > > > On Sun, Feb 03, 2013 at 09:44:39PM +0100, Rafael J. Wysocki wrote: > > > > > > > Yes, but those are just remove events and we can only see how destructive they > > > > > > > were after the removal. The point is to be able to figure out whether or not > > > > > > > we *want* to do the removal in the first place. > > > > > > > > > > > > > > Say you have a computing node which signals a hardware problem in a processor > > > > > > > package (the container with CPU cores, memory, PCI host bridge etc.). You > > > > > > > may want to eject that package, but you don't want to kill the system this > > > > > > > way. So if the eject is doable, it is very much desirable to do it, but if it > > > > > > > is not doable, you'd rather shut the box down and do the replacement afterward. > > > > > > > That may be costly, however (maybe weeks of computations), so it should be > > > > > > > avoided if possible, but not at the expense of crashing the box if the eject > > > > > > > doesn't work out. > > > > > > > > > > > > It seems to me that we could handle that with the help of a new flag, say > > > > > > "no_eject", in struct device, a global mutex, and a function that will walk > > > > > > the given subtree of the device hierarchy and check if "no_eject" is set for > > > > > > any devices in there. Plus a global "no_eject" switch, perhaps. > > > > > > > > > > I think this will always be racy, or at worst, slow things down on > > > > > normal device operations as you will always be having to grab this flag > > > > > whenever you want to do something new. > > > > > > > > I don't see why this particular scheme should be racy, at least I don't see any > > > > obvious races in it (although I'm not that good at races detection in general, > > > > admittedly). > > > > > > > > Also, I don't expect that flag to be used for everything, just for things known > > > > to seriously break if forcible eject is done. That may be not precise enough, > > > > so that's a matter of defining its purpose more precisely. > > > > > > > > We can do something like that on the ACPI level (ie. introduce a no_eject flag > > > > in struct acpi_device and provide an iterface for the layers above ACPI to > > > > manipulate it) but then devices without ACPI namespace objects won't be > > > > covered. That may not be a big deal, though. > > > > > > > > So say dev is about to be used for something incompatible with ejecting, so to > > > > speak. Then, one would do platform_lock_eject(dev), which would check if dev > > > > has an ACPI handle and then take acpi_eject_lock (if so). The return value of > > > > platform_lock_eject(dev) would need to be checked to see if the device is not > > > > gone. If it returns success (0), one would do something to the device and > > > > call platform_no_eject(dev) and then platform_unlock_eject(dev). > > > > > > How does a device "know" it is doing something that is incompatible with > > > ejecting? That's a non-trivial task from what I can tell. > > > > I agree that this is complicated in general. But. > > > > There are devices known to have software "offline" and "online" operations > > such that after the "offline" the given device is guaranteed to be not used > > until "online". 
We have that for CPU cores, for example, and user space can > > do it via /sys/devices/system/cpu/cpuX/online . So, why don't we make the > > "online" set the no_eject flag (under the lock as appropriate) and the > > "offline" clear it? And why don't we define such "online" and "offline" for > > all of the other "system" stuff, like memory, PCI host bridges etc. and make it > > behave analogously? > > > > Then, it is quite simple to say which devices should use the no_eject flag: > > devices that have "online" and "offline" exported to user space. And guess > > who's responsible for "offlining" all of those things before trying to eject > > them: user space is. From the kernel's point of view it is all clear. Hands > > clean. :-) > > > > Now, there's a different problem how to expose all of the relevant information > > to user space so that it knows what to "offline" for the specific eject > > operation to succeed, but that's kind of separate and worth addressing > > anyway. > > So, the idea is to run a user space program that off-lines all relevant > devices before trimming ACPI devices. Is that right? That sounds like > a worth idea to consider with. This basically moves the "sequencer" > part into user space instead of the kernel space in my proposal. I > agree that how to expose all of the relevant info to user space is an > issue. Also, we will need to make sure that the user program always > runs per a kernel request and then informs a result back to the kernel, > so that the kernel can do the rest of an operation and inform a result > to FW with _OST or _EJ0. This loop has to close. I think it is going > to be more complicated than the kernel-only approach. I actually didn't think about that. The point is that trying to offline everything *synchronously* may just be pointless, because it may be offlined upfront, before the eject is even requested. So the sequence would be to first offline things that we'll want to eject from user space and then to send the eject request (e.g. via sysfs too). Eject requests from eject buttons and things like that may just fail if some components involved that should be offline are online. The fact that we might be able to offline them synchronously if we tried doesn't matter, pretty much as it doesn't matter for hot-swappable disks. You'd probably never try to hot-remove a disk before unmounting filesystems mounted from it or failing it as a RAID component and nobody sane wants the kernel to do things like that automatically when the user presses the eject button. In my opinion we should treat memory eject, or CPU package eject, or PCI host bridge eject in exactly the same way: Don't eject if it is not prepared for ejecting in the first place. And if you think about it, that makes things *massively* simpler, because now the kernel doesn't need to worry about all of those "synchronous removal" scenarios that very well may involve every single device in the system and the whole problem is nicely split into several separate "implement offline/online" problems that are subsystem-specific and a single "eject if everything relevant is offline" problem which is kind of trivial. Plus the problem of exposing information to user space, which is separate too. Now, each of them can be worked on separately, *tested* separately and debugged separately if need be and it is much easier to isolate failures and so on.
> In addition, I am not sure if the "no_eject" flag in acpi_device is > really necessary here since the user program will inform the kernel if > all devices are off-line. Also, the kernel will likely need to expose > the device info to the user program to tell which devices need to be > off-lined. At that time, the kernel already knows if there is any > on-line device in the scope. Well, that depends on what "the kernel" means and how it knows that. Surely the "online" components have to be marked somehow so that it is easy to check if they are in the scope in a subsystem-independent way, so why don't we use something like the no_eject flag for that? Rafael
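A rough sketch of the eject-side check implied above: walk the subtree and refuse the eject if anything in it is still online, i.e. still has the proposed no_eject flag set. The no_eject field in struct device, the platform_lock_eject()/platform_unlock_eject() helpers and shp_do_eject() are assumptions taken from this discussion, not existing kernel interfaces; device_for_each_child() is the stock driver-core walker.

/*
 * Hypothetical: refuse to eject a subtree while any device in it is
 * still marked no_eject ("online").  Illustrative only.
 */
#include <linux/device.h>
#include <linux/errno.h>

int platform_lock_eject(struct device *dev);	/* proposed API from this thread */
void platform_unlock_eject(struct device *dev);
int shp_do_eject(struct device *dev);		/* placeholder for the real teardown */

static int shp_check_no_eject(struct device *dev, void *data)
{
	if (dev->no_eject)			/* proposed field, not in mainline */
		return -EBUSY;
	/* Descend into children; a non-zero return stops the walk early. */
	return device_for_each_child(dev, NULL, shp_check_no_eject);
}

static int shp_try_eject(struct device *dev)
{
	int ret = platform_lock_eject(dev);

	if (ret)
		return ret;			/* device is already gone */

	ret = shp_check_no_eject(dev, NULL);
	if (!ret)
		ret = shp_do_eject(dev);	/* everything offline: tear it down */

	platform_unlock_eject(dev);
	return ret;
}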
On Tue, Feb 05, 2013 at 12:52:30AM +0100, Rafael J. Wysocki wrote: > You'd probably never try to hot-remove a disk before unmounting filesystems > mounted from it or failing it as a RAID component and nobody sane wants the > kernel to do things like that automatically when the user presses the eject > button. In my opinion we should treat memory eject, or CPU package eject, or > PCI host bridge eject in exactly the same way: Don't eject if it is not > prepared for ejecting in the first place. Bad example, we have disks hot-removed all the time without any filesystems being unmounted, and have supported this since the 2.2 days (although we didn't get it "right" until 2.6.) PCI Host bridge eject is the same as PCI eject today, the user asks us to do it, and we can not fail it from happening. We also can have them removed without us being told about it in the first place, and can properly clean up from it all. > And if you think about it, that makes things *massively* simpler, because now > the kernel doesn't heed to worry about all of those "synchronous removal" > scenarions that very well may involve every single device in the system and > the whole problem is nicely split into several separate "implement > offline/online" problems that are subsystem-specific and a single > "eject if everything relevant is offline" problem which is kind of trivial. > Plus the one of exposing information to user space, which is separate too. > > Now, each of them can be worked on separately, *tested* separately and > debugged separately if need be and it is much easier to isolate failures > and so on. So you are agreeing with me in that we can not fail hot removing any device, nice :) greg k-h -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, 2013-02-05 at 00:52 +0100, Rafael J. Wysocki wrote: > On Monday, February 04, 2013 03:13:29 PM Toshi Kani wrote: > > On Mon, 2013-02-04 at 21:07 +0100, Rafael J. Wysocki wrote: > > > On Monday, February 04, 2013 06:33:52 AM Greg KH wrote: > > > > On Mon, Feb 04, 2013 at 03:21:22PM +0100, Rafael J. Wysocki wrote: > > > > > On Monday, February 04, 2013 04:48:10 AM Greg KH wrote: > > > > > > On Sun, Feb 03, 2013 at 09:44:39PM +0100, Rafael J. Wysocki wrote: > > > > > > > > Yes, but those are just remove events and we can only see how destructive they > > > > > > > > were after the removal. The point is to be able to figure out whether or not > > > > > > > > we *want* to do the removal in the first place. > > > > > > > > > > > > > > > > Say you have a computing node which signals a hardware problem in a processor > > > > > > > > package (the container with CPU cores, memory, PCI host bridge etc.). You > > > > > > > > may want to eject that package, but you don't want to kill the system this > > > > > > > > way. So if the eject is doable, it is very much desirable to do it, but if it > > > > > > > > is not doable, you'd rather shut the box down and do the replacement afterward. > > > > > > > > That may be costly, however (maybe weeks of computations), so it should be > > > > > > > > avoided if possible, but not at the expense of crashing the box if the eject > > > > > > > > doesn't work out. > > > > > > > > > > > > > > It seems to me that we could handle that with the help of a new flag, say > > > > > > > "no_eject", in struct device, a global mutex, and a function that will walk > > > > > > > the given subtree of the device hierarchy and check if "no_eject" is set for > > > > > > > any devices in there. Plus a global "no_eject" switch, perhaps. > > > > > > > > > > > > I think this will always be racy, or at worst, slow things down on > > > > > > normal device operations as you will always be having to grab this flag > > > > > > whenever you want to do something new. > > > > > > > > > > I don't see why this particular scheme should be racy, at least I don't see any > > > > > obvious races in it (although I'm not that good at races detection in general, > > > > > admittedly). > > > > > > > > > > Also, I don't expect that flag to be used for everything, just for things known > > > > > to seriously break if forcible eject is done. That may be not precise enough, > > > > > so that's a matter of defining its purpose more precisely. > > > > > > > > > > We can do something like that on the ACPI level (ie. introduce a no_eject flag > > > > > in struct acpi_device and provide an iterface for the layers above ACPI to > > > > > manipulate it) but then devices without ACPI namespace objects won't be > > > > > covered. That may not be a big deal, though. > > > > > > > > > > So say dev is about to be used for something incompatible with ejecting, so to > > > > > speak. Then, one would do platform_lock_eject(dev), which would check if dev > > > > > has an ACPI handle and then take acpi_eject_lock (if so). The return value of > > > > > platform_lock_eject(dev) would need to be checked to see if the device is not > > > > > gone. If it returns success (0), one would do something to the device and > > > > > call platform_no_eject(dev) and then platform_unlock_eject(dev). > > > > > > > > How does a device "know" it is doing something that is incompatible with > > > > ejecting? That's a non-trivial task from what I can tell. > > > > > > I agree that this is complicated in general. But. 
> > > > > > There are devices known to have software "offline" and "online" operations > > > such that after the "offline" the given device is guaranteed to be not used > > > until "online". We have that for CPU cores, for example, and user space can > > > do it via /sys/devices/system/cpu/cpuX/online . So, why don't we make the > > > "online" set the no_eject flag (under the lock as appropriate) and the > > > "offline" clear it? And why don't we define such "online" and "offline" for > > > all of the other "system" stuff, like memory, PCI host bridges etc. and make it > > > behave analogously? > > > > > > Then, it is quite simple to say which devices should use the no_eject flag: > > > devices that have "online" and "offline" exported to user space. And guess > > > who's responsible for "offlining" all of those things before trying to eject > > > them: user space is. From the kernel's point of view it is all clear. Hands > > > clean. :-) > > > > > > Now, there's a different problem how to expose all of the relevant information > > > to user space so that it knows what to "offline" for the specific eject > > > operation to succeed, but that's kind of separate and worth addressing > > > anyway. > > > > So, the idea is to run a user space program that off-lines all relevant > > devices before trimming ACPI devices. Is that right? That sounds like > > a worth idea to consider with. This basically moves the "sequencer" > > part into user space instead of the kernel space in my proposal. I > > agree that how to expose all of the relevant info to user space is an > > issue. Also, we will need to make sure that the user program always > > runs per a kernel request and then informs a result back to the kernel, > > so that the kernel can do the rest of an operation and inform a result > > to FW with _OST or _EJ0. This loop has to close. I think it is going > > to be more complicated than the kernel-only approach. > > I actually didn't think about that. The point is that trying to offline > everything *synchronously* may just be pointless, because it may be > offlined upfront, before the eject is even requested. So the sequence > would be to first offline things that we'll want to eject from user space > and then to send the eject request (e.g. via sysfs too). > > Eject requests from eject buttons and things like that may just fail if > some components involved that should be offline are online. The fact that > we might be able to offline them synchronously if we tried doesn't matter, > pretty much as it doesn't matter for hot-swappable disks. > > You'd probably never try to hot-remove a disk before unmounting filesystems > mounted from it or failing it as a RAID component and nobody sane wants the > kernel to do things like that automatically when the user presses the eject > button. In my opinion we should treat memory eject, or CPU package eject, or > PCI host bridge eject in exactly the same way: Don't eject if it is not > prepared for ejecting in the first place. > > And if you think about it, that makes things *massively* simpler, because now > the kernel doesn't heed to worry about all of those "synchronous removal" > scenarions that very well may involve every single device in the system and > the whole problem is nicely split into several separate "implement > offline/online" problems that are subsystem-specific and a single > "eject if everything relevant is offline" problem which is kind of trivial. > Plus the one of exposing information to user space, which is separate too. Oh, I see. 
Yes, it certainly makes things much simpler. It puts a burden on the user, but that could be solved with proper tools. I totally agree that I/Os should be removed beforehand. For CPUs and memory, it would be a bad TCE to ask a user to find the right set of devices to off-line, but this could be addressed with proper tools. I think we need to check whether a memory block (the unit of sysfs memory online/offline) and an ACPI memory object actually correspond nicely. But at a high level, this sounds like a workable plan. > Now, each of them can be worked on separately, *tested* separately and > debugged separately if need be and it is much easier to isolate failures > and so on. Right, but that is also the case with "synchronous removal" as long as we have the sysfs online interface. The difference is that this approach only supports the sysfs interface for off-lining. > > In addition, I am not sure if the "no_eject" flag in acpi_device is > > really necessary here since the user program will inform the kernel if > > all devices are off-line. Also, the kernel will likely need to expose > > the device info to the user program to tell which devices need to be > > off-lined. At that time, the kernel already knows if there is any > > on-line device in the scope. > > Well, that depends on what "the kernel" means and how it knows that. Surely > the "online" components have to be marked somehow so that it is easy to check > if they are in the scope in a subsystem-independent way, so why don't we use > something like the no_eject flag for that? Yes, I see your point. My previous comment assumed that the kernel would have to obtain device info and tell a user program to off-line them. In that case, I thought we would have to walk through the actual device tree and see online/offline info anyway. But since we are not doing anything like that, having the flag in acpi_device seems to be a reasonable way to avoid dealing with the actual device tree. Thanks, -Toshi
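As a concrete illustration of the user-space sequence discussed here, a small sketch that off-lines one CPU and one memory block through sysfs before the eject would be requested. The CPU "online" attribute is the one quoted in this thread; the memory block "state" attribute is an assumption based on the memory-hotplug sysfs interface, and cpu1/memory8 are made-up example names.

/*
 * Hypothetical user-space sketch: offline one CPU and one memory block
 * before asking for an eject.  Device names are examples only.
 */
#include <stdio.h>
#include <string.h>
#include <errno.h>

static int write_sysfs(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		fprintf(stderr, "open %s: %s\n", path, strerror(errno));
		return -1;
	}
	fputs(val, f);
	return fclose(f) ? -1 : 0;
}

int main(void)
{
	/* Offline a CPU core: writing 0 takes it offline. */
	if (write_sysfs("/sys/devices/system/cpu/cpu1/online", "0"))
		return 1;

	/* Offline a memory block via its "state" attribute (assumed path). */
	if (write_sysfs("/sys/devices/system/memory/memory8/state", "offline"))
		return 1;

	/*
	 * With everything in scope offline, the eject request (however it is
	 * exposed, e.g. a sysfs "eject" file) should now be allowed to proceed.
	 */
	return 0;
}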
On Monday, February 04, 2013 04:04:47 PM Greg KH wrote: > On Tue, Feb 05, 2013 at 12:52:30AM +0100, Rafael J. Wysocki wrote: > > You'd probably never try to hot-remove a disk before unmounting filesystems > > mounted from it or failing it as a RAID component and nobody sane wants the > > kernel to do things like that automatically when the user presses the eject > > button. In my opinion we should treat memory eject, or CPU package eject, or > > PCI host bridge eject in exactly the same way: Don't eject if it is not > > prepared for ejecting in the first place. > > Bad example, we have disks hot-removed all the time without any > filesystems being unmounted, and have supported this since the 2.2 days > (although we didn't get it "right" until 2.6.) Well, that wasn't my point. My point was that we have tools for unmounting filesystems from disks that the user wants to hot-remove and the user is supposed to use those tools before hot-removing the disks. At least I wouldn't recommend anyone to do otherwise. :-) Now, for memory hot-removal we don't have anything like that, as far as I can say, so my point was why don't we add memory "offline" that can be done and tested separately from hot-removal and use that before we go and hot-remove stuff? And analogously for PCI host bridges etc.? [Now, there's a question if an "eject" button on the system case, if there is one, should *always* cause the eject to happen even though things are not "offline". My opinion is that not necessarily, because users may not be aware that they are doing something wrong. Quite analogously, does the power button always cause the system to shut down? No. So why the heck should an eject button always cause an eject to happen? I see no reason. That said, the most straightforward approach may be simply to let user space disable eject events for specific devices when it wants and only enable them when it knows that the given devices are ready for removal. But I'm digressing.] > PCI Host bridge eject is the same as PCI eject today, the user asks us > to do it, and we can not fail it from happening. We also can have them > removed without us being told about it in the first place, and can > properly clean up from it all. Well, are you sure we'll always clean up? I kind of have my doubts. :-) > > And if you think about it, that makes things *massively* simpler, because now > > the kernel doesn't heed to worry about all of those "synchronous removal" > > scenarions that very well may involve every single device in the system and > > the whole problem is nicely split into several separate "implement > > offline/online" problems that are subsystem-specific and a single > > "eject if everything relevant is offline" problem which is kind of trivial. > > Plus the one of exposing information to user space, which is separate too. > > > > Now, each of them can be worked on separately, *tested* separately and > > debugged separately if need be and it is much easier to isolate failures > > and so on. > > So you are agreeing with me in that we can not fail hot removing any > device, nice :) That depends on how you define hot-removing. If you regard the "offline" as a separate operation that can be carried out independently and hot-remove as the last step causing the device to actually go away, then I agree that it can't fail. The "offline" itself, however, is a different matter (pretty much like unmounting a file system). Thanks, Rafael
On Monday, February 04, 2013 04:04:47 PM Greg KH wrote: > On Tue, Feb 05, 2013 at 12:52:30AM +0100, Rafael J. Wysocki wrote: > > You'd probably never try to hot-remove a disk before unmounting filesystems > > mounted from it or failing it as a RAID component and nobody sane wants the > > kernel to do things like that automatically when the user presses the eject > > button. In my opinion we should treat memory eject, or CPU package eject, or > > PCI host bridge eject in exactly the same way: Don't eject if it is not > > prepared for ejecting in the first place. > > Bad example, we have disks hot-removed all the time without any > filesystems being unmounted, and have supported this since the 2.2 days > (although we didn't get it "right" until 2.6.) I actually don't think it is really bad, because it exposes the problem nicely. Namely, there are two arguments that can be made here. The first one is the usability argument: Users should always be allowed to do what they want, because it is [explicit content] annoying if software pretends to know better what to do than the user (it is a convenience argument too, because usually it's *easier* to allow users to do what they want). The second one is the data integrity argument: Operations that may lead to data loss should never be carried out, because it is [explicit content] disappointing to lose valuable stuff by a stupid mistake if software allows that mistake to be made (that also may be costly in terms of real money). You seem to believe that we should always follow the usability argument, while Toshi seems to be thinking that (at least in the case of the "system" devices), the data integrity argument is more important. They are both valid arguments, however, and they are in conflict, so this is a matter of balance. You're saying that in the case of disks we always follow the usability argument entirely. I'm fine with that, although I suspect that some people may not be considering this as the right balance. Toshi seems to be thinking that for the hotplug of memory/CPUs/host bridges we should always follow the data integrity argument entirely, because the users of that feature value their data so much that they pretty much don't care about usability. That very well may be the case, so I'm fine with that too, although I'm sure there are people who'll argue that this is not the right balance either. Now, the point is that we *can* do what Toshi is arguing for and that doesn't seem to be overly complicated, so my question is: Why don't we do that, at least to start with? If it turns out eventually that the users care about usability too, after all, we can add a switch to adjust things more to their liking. Still, we can very well do that later. Thanks, Rafael
On Tue, Feb 05, 2013 at 12:11:17PM +0100, Rafael J. Wysocki wrote: > On Monday, February 04, 2013 04:04:47 PM Greg KH wrote: > > On Tue, Feb 05, 2013 at 12:52:30AM +0100, Rafael J. Wysocki wrote: > > > You'd probably never try to hot-remove a disk before unmounting filesystems > > > mounted from it or failing it as a RAID component and nobody sane wants the > > > kernel to do things like that automatically when the user presses the eject > > > button. In my opinion we should treat memory eject, or CPU package eject, or > > > PCI host bridge eject in exactly the same way: Don't eject if it is not > > > prepared for ejecting in the first place. > > > > Bad example, we have disks hot-removed all the time without any > > filesystems being unmounted, and have supported this since the 2.2 days > > (although we didn't get it "right" until 2.6.) > > I actually don't think it is really bad, because it exposes the problem nicely. > > Namely, there are two arguments that can be made here. The first one is the > usability argument: Users should always be allowed to do what they want, > because it is [explicit content] annoying if software pretends to know better > what to do than the user (it is a convenience argument too, because usually > it's *easier* to allow users to do what they want). The second one is the > data integrity argument: Operations that may lead to data loss should never > be carried out, because it is [explicit content] disappointing to lose valuable > stuff by a stupid mistake if software allows that mistake to be made (that also > may be costly in terms of real money). > > You seem to believe that we should always follow the usability argument, while > Toshi seems to be thinking that (at least in the case of the "system" devices), > the data integrity argument is more important. They are both valid arguments, > however, and they are in conflict, so this is a matter of balance. > > You're saying that in the case of disks we always follow the usability argument > entirely. I'm fine with that, although I suspect that some people may not be > considering this as the right balance. > > Toshi seems to be thinking that for the hotplug of memory/CPUs/host bridges we > should always follow the data integrity argument entirely, because the users of > that feature value their data so much that they pretty much don't care about > usability. That very well may be the case, so I'm fine with that too, although > I'm sure there are people who'll argue that this is not the right balance > either. > > Now, the point is that we *can* do what Toshi is arguing for and that doesn't > seem to be overly complicated, so my question is: Why don't we do that, at > least to start with? If it turns out eventually that the users care about > usability too, after all, we can add a switch to adjust things more to their > liking. Still, we can very well do that later. Ok, I'd much rather deal with reviewing actual implementations than talking about theory at this point in time, so let's see what you all can come up with next and I'll be glad to review it. thanks, greg k-h -- To unsubscribe from this list: send the line "unsubscribe linux-acpi" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tuesday, February 05, 2013 10:39:48 AM Greg KH wrote: > On Tue, Feb 05, 2013 at 12:11:17PM +0100, Rafael J. Wysocki wrote: > > On Monday, February 04, 2013 04:04:47 PM Greg KH wrote: > > > On Tue, Feb 05, 2013 at 12:52:30AM +0100, Rafael J. Wysocki wrote: > > > > You'd probably never try to hot-remove a disk before unmounting filesystems > > > > mounted from it or failing it as a RAID component and nobody sane wants the > > > > kernel to do things like that automatically when the user presses the eject > > > > button. In my opinion we should treat memory eject, or CPU package eject, or > > > > PCI host bridge eject in exactly the same way: Don't eject if it is not > > > > prepared for ejecting in the first place. > > > > > > Bad example, we have disks hot-removed all the time without any > > > filesystems being unmounted, and have supported this since the 2.2 days > > > (although we didn't get it "right" until 2.6.) > > > > I actually don't think it is really bad, because it exposes the problem nicely. > > > > Namely, there are two arguments that can be made here. The first one is the > > usability argument: Users should always be allowed to do what they want, > > because it is [explicit content] annoying if software pretends to know better > > what to do than the user (it is a convenience argument too, because usually > > it's *easier* to allow users to do what they want). The second one is the > > data integrity argument: Operations that may lead to data loss should never > > be carried out, because it is [explicit content] disappointing to lose valuable > > stuff by a stupid mistake if software allows that mistake to be made (that also > > may be costly in terms of real money). > > > > You seem to believe that we should always follow the usability argument, while > > Toshi seems to be thinking that (at least in the case of the "system" devices), > > the data integrity argument is more important. They are both valid arguments, > > however, and they are in conflict, so this is a matter of balance. > > > > You're saying that in the case of disks we always follow the usability argument > > entirely. I'm fine with that, although I suspect that some people may not be > > considering this as the right balance. > > > > Toshi seems to be thinking that for the hotplug of memory/CPUs/host bridges we > > should always follow the data integrity argument entirely, because the users of > > that feature value their data so much that they pretty much don't care about > > usability. That very well may be the case, so I'm fine with that too, although > > I'm sure there are people who'll argue that this is not the right balance > > either. > > > > Now, the point is that we *can* do what Toshi is arguing for and that doesn't > > seem to be overly complicated, so my question is: Why don't we do that, at > > least to start with? If it turns out eventually that the users care about > > usability too, after all, we can add a switch to adjust things more to their > > liking. Still, we can very well do that later. > > Ok, I'd much rather deal with reviewing actual implementations than > talking about theory at this point in time, so let's see what you all > can come up with next and I'll be glad to review it. Sure, thanks a lot for your comments so far! Rafael
diff --git a/include/linux/sys_hotplug.h b/include/linux/sys_hotplug.h new file mode 100644 index 0000000..86674dd --- /dev/null +++ b/include/linux/sys_hotplug.h @@ -0,0 +1,181 @@ +/* + * sys_hotplug.h - System device hot-plug framework + * + * Copyright (C) 2012 Hewlett-Packard Development Company, L.P. + * Toshi Kani <toshi.kani@hp.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _LINUX_SYS_HOTPLUG_H +#define _LINUX_SYS_HOTPLUG_H + +#include <linux/list.h> +#include <linux/device.h> + +/* + * System device hot-plug operation proceeds in the following order. + * Validate phase -> Execute phase -> Commit phase + * + * The order values below define the calling sequence of platform + * neutral handlers for each phase in ascending order. The order + * values of firmware-specific handlers are defined in sys_hotplug.h + * under firmware specific directories. + */ + +/* All order values must be smaller than this value */ +#define SHP_ORDER_MAX 0xffffff + +/* Add Validate order values */ + +/* Add Execute order values */ +#define SHP_MEM_ADD_EXECUTE_ORDER 100 +#define SHP_CPU_ADD_EXECUTE_ORDER 110 + +/* Add Commit order values */ + +/* Delete Validate order values */ +#define SHP_CPU_DEL_VALIDATE_ORDER 100 +#define SHP_MEM_DEL_VALIDATE_ORDER 110 + +/* Delete Execute order values */ +#define SHP_CPU_DEL_EXECUTE_ORDER 10 +#define SHP_MEM_DEL_EXECUTE_ORDER 20 + +/* Delete Commit order values */ + +/* + * Hot-plug request types + */ +#define SHP_REQ_ADD 0x000000 +#define SHP_REQ_DELETE 0x000001 +#define SHP_REQ_MASK 0x0000ff + +/* + * Hot-plug phase types + */ +#define SHP_PH_VALIDATE 0x000000 +#define SHP_PH_EXECUTE 0x000100 +#define SHP_PH_COMMIT 0x000200 +#define SHP_PH_MASK 0x00ff00 + +/* + * Hot-plug operation types + */ +#define SHP_OP_HOTPLUG 0x000000 +#define SHP_OP_ONLINE 0x010000 +#define SHP_OP_MASK 0xff0000 + +/* + * Hot-plug phases + */ +enum shp_phase { + SHP_ADD_VALIDATE = (SHP_REQ_ADD|SHP_PH_VALIDATE), + SHP_ADD_EXECUTE = (SHP_REQ_ADD|SHP_PH_EXECUTE), + SHP_ADD_COMMIT = (SHP_REQ_ADD|SHP_PH_COMMIT), + SHP_DEL_VALIDATE = (SHP_REQ_DELETE|SHP_PH_VALIDATE), + SHP_DEL_EXECUTE = (SHP_REQ_DELETE|SHP_PH_EXECUTE), + SHP_DEL_COMMIT = (SHP_REQ_DELETE|SHP_PH_COMMIT) +}; + +/* + * Hot-plug operations + */ +enum shp_operation { + SHP_HOTPLUG_ADD = (SHP_OP_HOTPLUG|SHP_REQ_ADD), + SHP_HOTPLUG_DEL = (SHP_OP_HOTPLUG|SHP_REQ_DELETE), + SHP_ONLINE_ADD = (SHP_OP_ONLINE|SHP_REQ_ADD), + SHP_ONLINE_DEL = (SHP_OP_ONLINE|SHP_REQ_DELETE) +}; + +/* + * Hot-plug device classes + */ +enum shp_class { + SHP_CLS_INVALID = 0, + SHP_CLS_CPU = 1, + SHP_CLS_MEMORY = 2, + SHP_CLS_HOSTBRIDGE = 3, + SHP_CLS_CONTAINER = 4, +}; + +/* + * Hot-plug device information + */ +union shp_dev_info { + struct shp_cpu { + u32 cpu_id; + } cpu; + + struct shp_memory { + int node; + u64 start_addr; + u64 length; + } mem; + + struct shp_hostbridge { + } hb; + + struct shp_node { + } node; +}; + +struct shp_device { + struct list_head list; + struct device *device; + enum shp_class class; + union shp_dev_info info; +}; + +/* + * Hot-plug request + */ +struct shp_request { + /* common info */ + enum shp_operation operation; /* operation */ + + /* hot-plug event info: only valid for hot-plug operations */ + void *handle; /* FW handle */ + u32 event; /* FW event */ + + /* device resource info */ + struct list_head dev_list; /* shp_device list */ +}; + +/* + * Inline Utility 
Functions + */ +static inline bool shp_is_hotplug_op(enum shp_operation operation) +{ + return (operation & SHP_OP_MASK) == SHP_OP_HOTPLUG; +} + +static inline bool shp_is_online_op(enum shp_operation operation) +{ + return (operation & SHP_OP_MASK) == SHP_OP_ONLINE; +} + +static inline bool shp_is_add_op(enum shp_operation operation) +{ + return (operation & SHP_REQ_MASK) == SHP_REQ_ADD; +} + +static inline bool shp_is_add_phase(enum shp_phase phase) +{ + return (phase & SHP_REQ_MASK) == SHP_REQ_ADD; +} + +/* + * Externs + */ +typedef int (*shp_func)(struct shp_request *req, int rollback); +extern int shp_register_handler(enum shp_phase phase, shp_func func, u32 order); +extern int shp_unregister_handler(enum shp_phase phase, shp_func func); +extern int shp_submit_req(struct shp_request *req); +extern struct shp_request *shp_alloc_request(enum shp_operation operation); +extern void shp_add_dev_info(struct shp_request *shp_req, + struct shp_device *shp_dev); + +#endif /* _LINUX_SYS_HOTPLUG_H */
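To show how the interfaces declared above are meant to fit together, here is a rough usage sketch: a platform-neutral memory handler registered for the add Execute phase, and a firmware-specific path that builds and submits a request. Only the framework calls and constants come from the header; the handler body, the helper names and the values filled into shp_device are illustrative assumptions.

/*
 * Illustrative only: shows the calling pattern implied by sys_hotplug.h.
 * The handler body and the device values are assumptions, not framework code.
 */
#include <linux/slab.h>
#include <linux/errno.h>
#include <linux/sys_hotplug.h>

/* Platform-neutral memory handler called during the add Execute phase. */
static int mem_add_execute(struct shp_request *req, int rollback)
{
	struct shp_device *shp_dev;

	if (rollback)
		return 0;	/* undo earlier work here if a later handler failed */

	list_for_each_entry(shp_dev, &req->dev_list, list) {
		if (shp_dev->class != SHP_CLS_MEMORY)
			continue;
		/* add_memory(shp_dev->info.mem.node, ...) would go here */
	}
	return 0;
}

/* Called from the memory module's init path. */
static int mem_hotplug_init(void)
{
	return shp_register_handler(SHP_ADD_EXECUTE, mem_add_execute,
				    SHP_MEM_ADD_EXECUTE_ORDER);
}

/*
 * Firmware-specific (e.g. ACPI) side: build a request for one memory
 * device and hand it to the framework; 'handle' would be the FW handle.
 */
static int submit_mem_add(void *handle, u64 start, u64 len, int node)
{
	struct shp_request *req;
	struct shp_device *shp_dev;

	req = shp_alloc_request(SHP_HOTPLUG_ADD);
	if (!req)
		return -ENOMEM;
	req->handle = handle;

	shp_dev = kzalloc(sizeof(*shp_dev), GFP_KERNEL);
	if (!shp_dev)
		return -ENOMEM;	/* a real caller would also release req here */
	shp_dev->class = SHP_CLS_MEMORY;
	shp_dev->info.mem.node = node;
	shp_dev->info.mem.start_addr = start;
	shp_dev->info.mem.length = len;
	shp_add_dev_info(req, shp_dev);

	return shp_submit_req(req);
}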