diff mbox

[v2] acpi : acpi_bus_trim() stops removing devices when failing to remove the device

Message ID 50769B8C.2060901@jp.fujitsu.com (mailing list archive)
State Rejected, archived
Headers show

Commit Message

Yasuaki Ishimatsu Oct. 11, 2012, 10:12 a.m. UTC
acpi_bus_trim() stops removing devices when acpi_bus_remove() returns an error
number. But acpi_bus_remove() cannot return an error number correctly.
acpi_bus_remove() only returns -EINVAL when the dev argument is NULL. Thus even if
a device cannot be removed correctly, acpi_bus_trim() ignores this and continues to
remove devices. acpi_bus_hot_remove_device() uses acpi_bus_trim() for removing
devices. Therefore acpi_bus_hot_remove_device() can send "_EJ0" to firmware
even if the device is running on the system. In this case, the system cannot
work well.

Vasilis hit the bug at memory hotplug and reported it as follows:
https://lkml.org/lkml/2012/9/26/318

So acpi_bus_trim() should check whether the device was removed correctly.
The patch adds error checks to some of the functions that remove the device.

With the patch applied, acpi_bus_trim() stops removing devices when it fails
to remove a device. But I think there is no impact with the
exception of the CPU and memory hotplug paths, because other devices can also
fail, but only in irregular cases such as the device being NULL.

v1->v2
- add a rollback for reinstalling a notify handler.

Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>

---
 drivers/acpi/scan.c    |   21 ++++++++++++++++++---
 drivers/base/dd.c      |   22 +++++++++++++++++-----
 include/linux/device.h |    2 +-
 3 files changed, 36 insertions(+), 9 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Toshi Kani Oct. 11, 2012, 1:58 p.m. UTC | #1
On Thu, 2012-10-11 at 19:12 +0900, Yasuaki Ishimatsu wrote:
> acpi_bus_trim() stops removing devices, when acpi_bus_remove() return error
> number. But acpi_bus_remove() cannot return error number correctly.
> acpi_bus_remove() only return -EINVAL, when dev argument is NULL. Thus even if
> device cannot be removed correctly, acpi_bus_trim() ignores and continues to
> remove devices. acpi_bus_hot_remove_device() uses acpi_bus_trim() for removing
> devices. Therefore acpi_bus_hot_remove_device() can send "_EJ0" to firmware,
> even if the device is running on the system. In this case, the system cannot
> work well.
> 
> Vasilis hit the bug at memory hotplug and reported it as follow:
> https://lkml.org/lkml/2012/9/26/318
> 
> So acpi_bus_trim() should check whether device was removed or not correctly.
> The patch adds error check into some functions to remove the device.
> 
> Applying the patch, acpi_bus_trim() stops removing devices when failing
> to remove the device. But I think there is no impact with the
> exceptionof CPU and Memory hotplug path. Because other device also fails
> but the fail is an irregular case like device is NULL.
> 
> v1->v2
> - add a rollback for reinstalling a notify handler.
> 
> Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>

Thanks for the update. Looks good.

Reviewed-by: Toshi Kani <toshi.kani@hp.com>

-Toshi

> 
> ---
>  drivers/acpi/scan.c    |   21 ++++++++++++++++++---
>  drivers/base/dd.c      |   22 +++++++++++++++++-----
>  include/linux/device.h |    2 +-
>  3 files changed, 36 insertions(+), 9 deletions(-)
> 
> Index: linux-3.6/drivers/acpi/scan.c
> ===================================================================
> --- linux-3.6.orig/drivers/acpi/scan.c	2012-10-11 18:31:40.189019503 +0900
> +++ linux-3.6/drivers/acpi/scan.c	2012-10-11 18:42:35.669041641 +0900
> @@ -445,18 +445,29 @@ static int acpi_device_remove(struct dev
>  {
>  	struct acpi_device *acpi_dev = to_acpi_device(dev);
>  	struct acpi_driver *acpi_drv = acpi_dev->driver;
> +	int ret;
>  
>  	if (acpi_drv) {
>  		if (acpi_drv->ops.notify)
>  			acpi_device_remove_notify_handler(acpi_dev);
> -		if (acpi_drv->ops.remove)
> -			acpi_drv->ops.remove(acpi_dev, acpi_dev->removal_type);
> +		if (acpi_drv->ops.remove) {
> +			ret = acpi_drv->ops.remove(acpi_dev,
> +						   acpi_dev->removal_type);
> +			if (ret)
> +				goto rollback;
> +		}
>  	}
>  	acpi_dev->driver = NULL;
>  	acpi_dev->driver_data = NULL;
>  
>  	put_device(dev);
>  	return 0;
> +
> +rollback:
> +	if (acpi_drv->ops.notify)
> +		acpi_device_install_notify_handler(acpi_dev);
> +
> +	return ret;
>  }
>  
>  struct bus_type acpi_bus_type = {
> @@ -1226,11 +1237,15 @@ static int acpi_device_set_context(struc
>  
>  static int acpi_bus_remove(struct acpi_device *dev, int rmdevice)
>  {
> +	int ret;
> +
>  	if (!dev)
>  		return -EINVAL;
>  
>  	dev->removal_type = ACPI_BUS_REMOVAL_EJECT;
> -	device_release_driver(&dev->dev);
> +	ret = device_release_driver(&dev->dev);
> +	if (ret)
> +		return ret;
>  
>  	if (!rmdevice)
>  		return 0;
> Index: linux-3.6/drivers/base/dd.c
> ===================================================================
> --- linux-3.6.orig/drivers/base/dd.c	2012-10-11 18:31:40.191019505 +0900
> +++ linux-3.6/drivers/base/dd.c	2012-10-11 18:31:46.873020548 +0900
> @@ -475,9 +475,10 @@ EXPORT_SYMBOL_GPL(driver_attach);
>   * __device_release_driver() must be called with @dev lock held.
>   * When called for a USB interface, @dev->parent lock must be held as well.
>   */
> -static void __device_release_driver(struct device *dev)
> +static int __device_release_driver(struct device *dev)
>  {
>  	struct device_driver *drv;
> +	int ret = 0;
>  
>  	drv = dev->driver;
>  	if (drv) {
> @@ -493,9 +494,11 @@ static void __device_release_driver(stru
>  		pm_runtime_put_sync(dev);
>  
>  		if (dev->bus && dev->bus->remove)
> -			dev->bus->remove(dev);
> +			ret = dev->bus->remove(dev);
>  		else if (drv->remove)
> -			drv->remove(dev);
> +			ret = drv->remove(dev);
> +		if (ret)
> +			goto rollback;
>  		devres_release_all(dev);
>  		dev->driver = NULL;
>  		dev_set_drvdata(dev, NULL);
> @@ -506,6 +509,12 @@ static void __device_release_driver(stru
>  						     dev);
>  
>  	}
> +
> +	return ret;
> +
> +rollback:
> +	driver_sysfs_add(dev);
> +	return ret;
>  }
>  
>  /**
> @@ -515,16 +524,19 @@ static void __device_release_driver(stru
>   * Manually detach device from driver.
>   * When called for a USB interface, @dev->parent lock must be held.
>   */
> -void device_release_driver(struct device *dev)
> +int device_release_driver(struct device *dev)
>  {
> +	int ret;
>  	/*
>  	 * If anyone calls device_release_driver() recursively from
>  	 * within their ->remove callback for the same device, they
>  	 * will deadlock right here.
>  	 */
>  	device_lock(dev);
> -	__device_release_driver(dev);
> +	ret = __device_release_driver(dev);
>  	device_unlock(dev);
> +
> +	return ret;
>  }
>  EXPORT_SYMBOL_GPL(device_release_driver);
>  
> Index: linux-3.6/include/linux/device.h
> ===================================================================
> --- linux-3.6.orig/include/linux/device.h	2012-10-11 18:31:40.194019508 +0900
> +++ linux-3.6/include/linux/device.h	2012-10-11 18:31:46.881020556 +0900
> @@ -834,7 +834,7 @@ static inline void *dev_get_platdata(con
>   * for information on use.
>   */
>  extern int __must_check device_bind_driver(struct device *dev);
> -extern void device_release_driver(struct device *dev);
> +extern int device_release_driver(struct device *dev);
>  extern int  __must_check device_attach(struct device *dev);
>  extern int __must_check driver_attach(struct device_driver *drv);
>  extern int __must_check device_reprobe(struct device *dev);
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yasuaki Ishimatsu Oct. 12, 2012, 4:31 a.m. UTC | #2
Hi Toshi,

2012/10/11 22:58, Toshi Kani wrote:
> On Thu, 2012-10-11 at 19:12 +0900, Yasuaki Ishimatsu wrote:
>> acpi_bus_trim() stops removing devices, when acpi_bus_remove() return error
>> number. But acpi_bus_remove() cannot return error number correctly.
>> acpi_bus_remove() only return -EINVAL, when dev argument is NULL. Thus even if
>> device cannot be removed correctly, acpi_bus_trim() ignores and continues to
>> remove devices. acpi_bus_hot_remove_device() uses acpi_bus_trim() for removing
>> devices. Therefore acpi_bus_hot_remove_device() can send "_EJ0" to firmware,
>> even if the device is running on the system. In this case, the system cannot
>> work well.
>>
>> Vasilis hit the bug at memory hotplug and reported it as follow:
>> https://lkml.org/lkml/2012/9/26/318
>>
>> So acpi_bus_trim() should check whether device was removed or not correctly.
>> The patch adds error check into some functions to remove the device.
>>
>> Applying the patch, acpi_bus_trim() stops removing devices when failing
>> to remove the device. But I think there is no impact with the
>> exceptionof CPU and Memory hotplug path. Because other device also fails
>> but the fail is an irregular case like device is NULL.
>>
>> v1->v2
>> - add a rollback for reinstalling a notify handler.
>>
>> Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
>
> Thanks for the update. Looks good.
>
> Reviewed-by: Toshi Kani <toshi.kani@hp.com>

Thank you for reviewing.

Thanks,
Yasuaki Ishimatsu

> -Toshi
>
>>
>> ---
>>   drivers/acpi/scan.c    |   21 ++++++++++++++++++---
>>   drivers/base/dd.c      |   22 +++++++++++++++++-----
>>   include/linux/device.h |    2 +-
>>   3 files changed, 36 insertions(+), 9 deletions(-)
>>
>> Index: linux-3.6/drivers/acpi/scan.c
>> ===================================================================
>> --- linux-3.6.orig/drivers/acpi/scan.c	2012-10-11 18:31:40.189019503 +0900
>> +++ linux-3.6/drivers/acpi/scan.c	2012-10-11 18:42:35.669041641 +0900
>> @@ -445,18 +445,29 @@ static int acpi_device_remove(struct dev
>>   {
>>   	struct acpi_device *acpi_dev = to_acpi_device(dev);
>>   	struct acpi_driver *acpi_drv = acpi_dev->driver;
>> +	int ret;
>>
>>   	if (acpi_drv) {
>>   		if (acpi_drv->ops.notify)
>>   			acpi_device_remove_notify_handler(acpi_dev);
>> -		if (acpi_drv->ops.remove)
>> -			acpi_drv->ops.remove(acpi_dev, acpi_dev->removal_type);
>> +		if (acpi_drv->ops.remove) {
>> +			ret = acpi_drv->ops.remove(acpi_dev,
>> +						   acpi_dev->removal_type);
>> +			if (ret)
>> +				goto rollback;
>> +		}
>>   	}
>>   	acpi_dev->driver = NULL;
>>   	acpi_dev->driver_data = NULL;
>>
>>   	put_device(dev);
>>   	return 0;
>> +
>> +rollback:
>> +	if (acpi_drv->ops.notify)
>> +		acpi_device_install_notify_handler(acpi_dev);
>> +
>> +	return ret;
>>   }
>>
>>   struct bus_type acpi_bus_type = {
>> @@ -1226,11 +1237,15 @@ static int acpi_device_set_context(struc
>>
>>   static int acpi_bus_remove(struct acpi_device *dev, int rmdevice)
>>   {
>> +	int ret;
>> +
>>   	if (!dev)
>>   		return -EINVAL;
>>
>>   	dev->removal_type = ACPI_BUS_REMOVAL_EJECT;
>> -	device_release_driver(&dev->dev);
>> +	ret = device_release_driver(&dev->dev);
>> +	if (ret)
>> +		return ret;
>>
>>   	if (!rmdevice)
>>   		return 0;
>> Index: linux-3.6/drivers/base/dd.c
>> ===================================================================
>> --- linux-3.6.orig/drivers/base/dd.c	2012-10-11 18:31:40.191019505 +0900
>> +++ linux-3.6/drivers/base/dd.c	2012-10-11 18:31:46.873020548 +0900
>> @@ -475,9 +475,10 @@ EXPORT_SYMBOL_GPL(driver_attach);
>>    * __device_release_driver() must be called with @dev lock held.
>>    * When called for a USB interface, @dev->parent lock must be held as well.
>>    */
>> -static void __device_release_driver(struct device *dev)
>> +static int __device_release_driver(struct device *dev)
>>   {
>>   	struct device_driver *drv;
>> +	int ret = 0;
>>
>>   	drv = dev->driver;
>>   	if (drv) {
>> @@ -493,9 +494,11 @@ static void __device_release_driver(stru
>>   		pm_runtime_put_sync(dev);
>>
>>   		if (dev->bus && dev->bus->remove)
>> -			dev->bus->remove(dev);
>> +			ret = dev->bus->remove(dev);
>>   		else if (drv->remove)
>> -			drv->remove(dev);
>> +			ret = drv->remove(dev);
>> +		if (ret)
>> +			goto rollback;
>>   		devres_release_all(dev);
>>   		dev->driver = NULL;
>>   		dev_set_drvdata(dev, NULL);
>> @@ -506,6 +509,12 @@ static void __device_release_driver(stru
>>   						     dev);
>>
>>   	}
>> +
>> +	return ret;
>> +
>> +rollback:
>> +	driver_sysfs_add(dev);
>> +	return ret;
>>   }
>>
>>   /**
>> @@ -515,16 +524,19 @@ static void __device_release_driver(stru
>>    * Manually detach device from driver.
>>    * When called for a USB interface, @dev->parent lock must be held.
>>    */
>> -void device_release_driver(struct device *dev)
>> +int device_release_driver(struct device *dev)
>>   {
>> +	int ret;
>>   	/*
>>   	 * If anyone calls device_release_driver() recursively from
>>   	 * within their ->remove callback for the same device, they
>>   	 * will deadlock right here.
>>   	 */
>>   	device_lock(dev);
>> -	__device_release_driver(dev);
>> +	ret = __device_release_driver(dev);
>>   	device_unlock(dev);
>> +
>> +	return ret;
>>   }
>>   EXPORT_SYMBOL_GPL(device_release_driver);
>>
>> Index: linux-3.6/include/linux/device.h
>> ===================================================================
>> --- linux-3.6.orig/include/linux/device.h	2012-10-11 18:31:40.194019508 +0900
>> +++ linux-3.6/include/linux/device.h	2012-10-11 18:31:46.881020556 +0900
>> @@ -834,7 +834,7 @@ static inline void *dev_get_platdata(con
>>    * for information on use.
>>    */
>>   extern int __must_check device_bind_driver(struct device *dev);
>> -extern void device_release_driver(struct device *dev);
>> +extern int device_release_driver(struct device *dev);
>>   extern int  __must_check device_attach(struct device *dev);
>>   extern int __must_check driver_attach(struct device_driver *drv);
>>   extern int __must_check device_reprobe(struct device *dev);
>>
>
>


--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael Wysocki Oct. 19, 2012, 4:29 a.m. UTC | #3
On Thursday 11 of October 2012 19:12:28 Yasuaki Ishimatsu wrote:
> acpi_bus_trim() stops removing devices, when acpi_bus_remove() return error
> number. But acpi_bus_remove() cannot return error number correctly.
> acpi_bus_remove() only return -EINVAL, when dev argument is NULL. Thus even if
> device cannot be removed correctly, acpi_bus_trim() ignores and continues to
> remove devices. acpi_bus_hot_remove_device() uses acpi_bus_trim() for removing
> devices. Therefore acpi_bus_hot_remove_device() can send "_EJ0" to firmware,
> even if the device is running on the system. In this case, the system cannot
> work well.
> 
> Vasilis hit the bug at memory hotplug and reported it as follow:
> https://lkml.org/lkml/2012/9/26/318
> 
> So acpi_bus_trim() should check whether device was removed or not correctly.
> The patch adds error check into some functions to remove the device.
> 
> Applying the patch, acpi_bus_trim() stops removing devices when failing
> to remove the device. But I think there is no impact with the
> exceptionof CPU and Memory hotplug path. Because other device also fails
> but the fail is an irregular case like device is NULL.
> 
> v1->v2
> - add a rollback for reinstalling a notify handler.
> 
> Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>

Greg, do you think there may be any problems with the changes in dd.c?

Rafael


> ---
>  drivers/acpi/scan.c    |   21 ++++++++++++++++++---
>  drivers/base/dd.c      |   22 +++++++++++++++++-----
>  include/linux/device.h |    2 +-
>  3 files changed, 36 insertions(+), 9 deletions(-)
> 
> Index: linux-3.6/drivers/acpi/scan.c
> ===================================================================
> --- linux-3.6.orig/drivers/acpi/scan.c	2012-10-11 18:31:40.189019503 +0900
> +++ linux-3.6/drivers/acpi/scan.c	2012-10-11 18:42:35.669041641 +0900
> @@ -445,18 +445,29 @@ static int acpi_device_remove(struct dev
>  {
>  	struct acpi_device *acpi_dev = to_acpi_device(dev);
>  	struct acpi_driver *acpi_drv = acpi_dev->driver;
> +	int ret;
>  
>  	if (acpi_drv) {
>  		if (acpi_drv->ops.notify)
>  			acpi_device_remove_notify_handler(acpi_dev);
> -		if (acpi_drv->ops.remove)
> -			acpi_drv->ops.remove(acpi_dev, acpi_dev->removal_type);
> +		if (acpi_drv->ops.remove) {
> +			ret = acpi_drv->ops.remove(acpi_dev,
> +						   acpi_dev->removal_type);
> +			if (ret)
> +				goto rollback;
> +		}
>  	}
>  	acpi_dev->driver = NULL;
>  	acpi_dev->driver_data = NULL;
>  
>  	put_device(dev);
>  	return 0;
> +
> +rollback:
> +	if (acpi_drv->ops.notify)
> +		acpi_device_install_notify_handler(acpi_dev);
> +
> +	return ret;
>  }
>  
>  struct bus_type acpi_bus_type = {
> @@ -1226,11 +1237,15 @@ static int acpi_device_set_context(struc
>  
>  static int acpi_bus_remove(struct acpi_device *dev, int rmdevice)
>  {
> +	int ret;
> +
>  	if (!dev)
>  		return -EINVAL;
>  
>  	dev->removal_type = ACPI_BUS_REMOVAL_EJECT;
> -	device_release_driver(&dev->dev);
> +	ret = device_release_driver(&dev->dev);
> +	if (ret)
> +		return ret;
>  
>  	if (!rmdevice)
>  		return 0;
> Index: linux-3.6/drivers/base/dd.c
> ===================================================================
> --- linux-3.6.orig/drivers/base/dd.c	2012-10-11 18:31:40.191019505 +0900
> +++ linux-3.6/drivers/base/dd.c	2012-10-11 18:31:46.873020548 +0900
> @@ -475,9 +475,10 @@ EXPORT_SYMBOL_GPL(driver_attach);
>   * __device_release_driver() must be called with @dev lock held.
>   * When called for a USB interface, @dev->parent lock must be held as well.
>   */
> -static void __device_release_driver(struct device *dev)
> +static int __device_release_driver(struct device *dev)
>  {
>  	struct device_driver *drv;
> +	int ret = 0;
>  
>  	drv = dev->driver;
>  	if (drv) {
> @@ -493,9 +494,11 @@ static void __device_release_driver(stru
>  		pm_runtime_put_sync(dev);
>  
>  		if (dev->bus && dev->bus->remove)
> -			dev->bus->remove(dev);
> +			ret = dev->bus->remove(dev);
>  		else if (drv->remove)
> -			drv->remove(dev);
> +			ret = drv->remove(dev);
> +		if (ret)
> +			goto rollback;
>  		devres_release_all(dev);
>  		dev->driver = NULL;
>  		dev_set_drvdata(dev, NULL);
> @@ -506,6 +509,12 @@ static void __device_release_driver(stru
>  						     dev);
>  
>  	}
> +
> +	return ret;
> +
> +rollback:
> +	driver_sysfs_add(dev);
> +	return ret;
>  }
>  
>  /**
> @@ -515,16 +524,19 @@ static void __device_release_driver(stru
>   * Manually detach device from driver.
>   * When called for a USB interface, @dev->parent lock must be held.
>   */
> -void device_release_driver(struct device *dev)
> +int device_release_driver(struct device *dev)
>  {
> +	int ret;
>  	/*
>  	 * If anyone calls device_release_driver() recursively from
>  	 * within their ->remove callback for the same device, they
>  	 * will deadlock right here.
>  	 */
>  	device_lock(dev);
> -	__device_release_driver(dev);
> +	ret = __device_release_driver(dev);
>  	device_unlock(dev);
> +
> +	return ret;
>  }
>  EXPORT_SYMBOL_GPL(device_release_driver);
>  
> Index: linux-3.6/include/linux/device.h
> ===================================================================
> --- linux-3.6.orig/include/linux/device.h	2012-10-11 18:31:40.194019508 +0900
> +++ linux-3.6/include/linux/device.h	2012-10-11 18:31:46.881020556 +0900
> @@ -834,7 +834,7 @@ static inline void *dev_get_platdata(con
>   * for information on use.
>   */
>  extern int __must_check device_bind_driver(struct device *dev);
> -extern void device_release_driver(struct device *dev);
> +extern int device_release_driver(struct device *dev);
>  extern int  __must_check device_attach(struct device *dev);
>  extern int __must_check driver_attach(struct device_driver *drv);
>  extern int __must_check device_reprobe(struct device *dev);
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
>
Greg KH Oct. 19, 2012, 5:59 p.m. UTC | #4
On Fri, Oct 19, 2012 at 06:29:52AM +0200, Rafael J. Wysocki wrote:
> On Thursday 11 of October 2012 19:12:28 Yasuaki Ishimatsu wrote:
> > acpi_bus_trim() stops removing devices, when acpi_bus_remove() return error
> > number. But acpi_bus_remove() cannot return error number correctly.
> > acpi_bus_remove() only return -EINVAL, when dev argument is NULL. Thus even if
> > device cannot be removed correctly, acpi_bus_trim() ignores and continues to
> > remove devices. acpi_bus_hot_remove_device() uses acpi_bus_trim() for removing
> > devices. Therefore acpi_bus_hot_remove_device() can send "_EJ0" to firmware,
> > even if the device is running on the system. In this case, the system cannot
> > work well.
> > 
> > Vasilis hit the bug at memory hotplug and reported it as follow:
> > https://lkml.org/lkml/2012/9/26/318
> > 
> > So acpi_bus_trim() should check whether device was removed or not correctly.
> > The patch adds error check into some functions to remove the device.
> > 
> > Applying the patch, acpi_bus_trim() stops removing devices when failing
> > to remove the device. But I think there is no impact with the
> > exceptionof CPU and Memory hotplug path. Because other device also fails
> > but the fail is an irregular case like device is NULL.
> > 
> > v1->v2
> > - add a rollback for reinstalling a notify handler.
> > 
> > Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
> 
> Greg, do you think there may be any problems with the changes in dd.c?

Yes, I don't like it.

remove should always work, just like the exit call in a module.  It
means that the core wants to remove the driver, so it is going to
happen, a driver can't refuse it.

Which brings me to the larger question, why would this solve anything?
If the kernel wants to unbind a device, why would we ever not want that
to happen?

So, NAK on this patch, sorry.  Fix up the ACPI core to handle this
properly, don't mess with the driver core here.

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yasuaki Ishimatsu Oct. 26, 2012, 7:33 a.m. UTC | #5
Hi Greg,

Sorry for late reply.

2012/10/20 2:59, Greg Kroah-Hartman wrote:
> On Fri, Oct 19, 2012 at 06:29:52AM +0200, Rafael J. Wysocki wrote:
>> On Thursday 11 of October 2012 19:12:28 Yasuaki Ishimatsu wrote:
>>> acpi_bus_trim() stops removing devices, when acpi_bus_remove() return error
>>> number. But acpi_bus_remove() cannot return error number correctly.
>>> acpi_bus_remove() only return -EINVAL, when dev argument is NULL. Thus even if
>>> device cannot be removed correctly, acpi_bus_trim() ignores and continues to
>>> remove devices. acpi_bus_hot_remove_device() uses acpi_bus_trim() for removing
>>> devices. Therefore acpi_bus_hot_remove_device() can send "_EJ0" to firmware,
>>> even if the device is running on the system. In this case, the system cannot
>>> work well.
>>>
>>> Vasilis hit the bug at memory hotplug and reported it as follow:
>>> https://lkml.org/lkml/2012/9/26/318
>>>
>>> So acpi_bus_trim() should check whether device was removed or not correctly.
>>> The patch adds error check into some functions to remove the device.
>>>
>>> Applying the patch, acpi_bus_trim() stops removing devices when failing
>>> to remove the device. But I think there is no impact with the
>>> exceptionof CPU and Memory hotplug path. Because other device also fails
>>> but the fail is an irregular case like device is NULL.
>>>
>>> v1->v2
>>> - add a rollback for reinstalling a notify handler.
>>>
>>> Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
>>
>> Greg, do you think there may be any problems with the changes in dd.c?
>
> Yes, I don't like it.
>
> remove should always work, just like the exit call in a module.  It
> means that the core wants to remove the driver, so it is going to
> happen, a driver can't refuse it.
>
> Which brings me to the larger question, why would this solve anything?

Now we are developing physical memory hot plug.

https://lkml.org/lkml/2012/10/23/213

So if we apply the patch-set, we can hot remove physical memory
in the following way.

"echo 1 > /sys/bus/acpi/devices/PNP/eject"

In this case, acpi_bus_hot_remove_device() tries to remove the memory
device by acpi_bus_trim(). But if the memory device has irremovable memory,
memory hot remove fails and the memory remains in the kernel.
However, acpi_bus_trim() cannot notice that memory hot remove failed and
returns 0. So acpi_bus_hot_remove_device() continues to remove memory
devices and sends the _EJ0 method to firmware. Thus the memory device can no
longer be used, but the memory still remains in the kernel. So if someone
accesses the memory, a kernel panic occurs.

Thanks,
Yasuaki Ishimatsu

> If the kernel wants to unbind a device, why would we ever not want that
> to happen?
>
> So, NAK on this patch, sorry.  Fix up the ACPI core to handle this
> properly, don't mess with the driver core here.
>
> greg k-h
>


--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Greg KH Oct. 26, 2012, 3:25 p.m. UTC | #6
On Fri, Oct 26, 2012 at 04:33:49PM +0900, Yasuaki Ishimatsu wrote:
> Hi Greg,
> 
> Sorry for late reply.
> 
> 2012/10/20 2:59, Greg Kroah-Hartman wrote:
> >On Fri, Oct 19, 2012 at 06:29:52AM +0200, Rafael J. Wysocki wrote:
> >>On Thursday 11 of October 2012 19:12:28 Yasuaki Ishimatsu wrote:
> >>>acpi_bus_trim() stops removing devices, when acpi_bus_remove() return error
> >>>number. But acpi_bus_remove() cannot return error number correctly.
> >>>acpi_bus_remove() only return -EINVAL, when dev argument is NULL. Thus even if
> >>>device cannot be removed correctly, acpi_bus_trim() ignores and continues to
> >>>remove devices. acpi_bus_hot_remove_device() uses acpi_bus_trim() for removing
> >>>devices. Therefore acpi_bus_hot_remove_device() can send "_EJ0" to firmware,
> >>>even if the device is running on the system. In this case, the system cannot
> >>>work well.
> >>>
> >>>Vasilis hit the bug at memory hotplug and reported it as follow:
> >>>https://lkml.org/lkml/2012/9/26/318
> >>>
> >>>So acpi_bus_trim() should check whether device was removed or not correctly.
> >>>The patch adds error check into some functions to remove the device.
> >>>
> >>>Applying the patch, acpi_bus_trim() stops removing devices when failing
> >>>to remove the device. But I think there is no impact with the
> >>>exceptionof CPU and Memory hotplug path. Because other device also fails
> >>>but the fail is an irregular case like device is NULL.
> >>>
> >>>v1->v2
> >>>- add a rollback for reinstalling a notify handler.
> >>>
> >>>Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
> >>
> >>Greg, do you think there may be any problems with the changes in dd.c?
> >
> >Yes, I don't like it.
> >
> >remove should always work, just like the exit call in a module.  It
> >means that the core wants to remove the driver, so it is going to
> >happen, a driver can't refuse it.
> >
> >Which brings me to the larger question, why would this solve anything?
> 
> Now we are developing physical memory hot plug.
> 
> https://lkml.org/lkml/2012/10/23/213
> 
> So if we aplly the patch-set, we can hot remove a physical memory
> by the following way.
> 
> "echo 1 > /sys/bus/acpi/devices/PNP/eject"
> 
> In this case, acpi_bus_hot_remove_device() tries to remove memory
> device by acpi_bus_trim(). But if the memory has irremovable memory,
> memory hot remove fails. And the memory remains in kernel.
> However acpi_bus_trim() cannot notice that memory hot remove fails and
> retruns 0. So acpi_bus_hot_remove_device() continues to remove memory
> devices and sends _EJ0 method to firmware. Thus the memory device cannot
> be used. But the memory remains in kernel yet. So if someone access the
> memory, kernel panic occurs.

Why can't you check to find out if you can do the remove operation
before you enter the driver core asking to actually remove the devices?
That would allow you to "know" if you can do this before having to go
through the whole operation.  What happens if you can complete half of
the removal, and do that, but not the whole thing?  Don't you end up
with half of the memory chunk gone from the system now?

In other words, please solve this at a higher level than the driver
core if at all possible.

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yasuaki Ishimatsu Oct. 31, 2012, 10:52 a.m. UTC | #7
Hi Greg,

2012/10/27 0:25, Greg Kroah-Hartman wrote:
> On Fri, Oct 26, 2012 at 04:33:49PM +0900, Yasuaki Ishimatsu wrote:
>> Hi Greg,
>>
>> Sorry for late reply.
>>
>> 2012/10/20 2:59, Greg Kroah-Hartman wrote:
>>> On Fri, Oct 19, 2012 at 06:29:52AM +0200, Rafael J. Wysocki wrote:
>>>> On Thursday 11 of October 2012 19:12:28 Yasuaki Ishimatsu wrote:
>>>>> acpi_bus_trim() stops removing devices, when acpi_bus_remove() return error
>>>>> number. But acpi_bus_remove() cannot return error number correctly.
>>>>> acpi_bus_remove() only return -EINVAL, when dev argument is NULL. Thus even if
>>>>> device cannot be removed correctly, acpi_bus_trim() ignores and continues to
>>>>> remove devices. acpi_bus_hot_remove_device() uses acpi_bus_trim() for removing
>>>>> devices. Therefore acpi_bus_hot_remove_device() can send "_EJ0" to firmware,
>>>>> even if the device is running on the system. In this case, the system cannot
>>>>> work well.
>>>>>
>>>>> Vasilis hit the bug at memory hotplug and reported it as follow:
>>>>> https://lkml.org/lkml/2012/9/26/318
>>>>>
>>>>> So acpi_bus_trim() should check whether device was removed or not correctly.
>>>>> The patch adds error check into some functions to remove the device.
>>>>>
>>>>> Applying the patch, acpi_bus_trim() stops removing devices when failing
>>>>> to remove the device. But I think there is no impact with the
>>>>> exceptionof CPU and Memory hotplug path. Because other device also fails
>>>>> but the fail is an irregular case like device is NULL.
>>>>>
>>>>> v1->v2
>>>>> - add a rollback for reinstalling a notify handler.
>>>>>
>>>>> Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
>>>>
>>>> Greg, do you think there may be any problems with the changes in dd.c?
>>>
>>> Yes, I don't like it.
>>>
>>> remove should always work, just like the exit call in a module.  It
>>> means that the core wants to remove the driver, so it is going to
>>> happen, a driver can't refuse it.
>>>
>>> Which brings me to the larger question, why would this solve anything?
>>
>> Now we are developing physical memory hot plug.
>>
>> https://lkml.org/lkml/2012/10/23/213
>>
>> So if we aplly the patch-set, we can hot remove a physical memory
>> by the following way.
>>
>> "echo 1 > /sys/bus/acpi/devices/PNP/eject"
>>
>> In this case, acpi_bus_hot_remove_device() tries to remove memory
>> device by acpi_bus_trim(). But if the memory has irremovable memory,
>> memory hot remove fails. And the memory remains in kernel.
>> However acpi_bus_trim() cannot notice that memory hot remove fails and
>> retruns 0. So acpi_bus_hot_remove_device() continues to remove memory
>> devices and sends _EJ0 method to firmware. Thus the memory device cannot
>> be used. But the memory remains in kernel yet. So if someone access the
>> memory, kernel panic occurs.
>
> Why can't you check to find out if you can do the remove operation
> before you enter the driver core asking to actually remove the devices?
> That would allow you to "know" if you can do this before having to go
> through the whole operation.  What happens if you can complete half of
> the removal, and do that, but not the whole thing?  Don't you end up
> with half of the memory chunk gone from the system now?
>

> In other words, please solve this at a higher level than the driver
> core if at all possible.

O.K.
I'll check whether the problem can be solved at a higher level or not.

Thanks,
Yasuaki Ishimatsu

>
> greg k-h
>


--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

Index: linux-3.6/drivers/acpi/scan.c
===================================================================
--- linux-3.6.orig/drivers/acpi/scan.c	2012-10-11 18:31:40.189019503 +0900
+++ linux-3.6/drivers/acpi/scan.c	2012-10-11 18:42:35.669041641 +0900
@@ -445,18 +445,29 @@  static int acpi_device_remove(struct dev
 {
 	struct acpi_device *acpi_dev = to_acpi_device(dev);
 	struct acpi_driver *acpi_drv = acpi_dev->driver;
+	int ret;
 
 	if (acpi_drv) {
 		if (acpi_drv->ops.notify)
 			acpi_device_remove_notify_handler(acpi_dev);
-		if (acpi_drv->ops.remove)
-			acpi_drv->ops.remove(acpi_dev, acpi_dev->removal_type);
+		if (acpi_drv->ops.remove) {
+			ret = acpi_drv->ops.remove(acpi_dev,
+						   acpi_dev->removal_type);
+			if (ret)
+				goto rollback;
+		}
 	}
 	acpi_dev->driver = NULL;
 	acpi_dev->driver_data = NULL;
 
 	put_device(dev);
 	return 0;
+
+rollback:
+	if (acpi_drv->ops.notify)
+		acpi_device_install_notify_handler(acpi_dev);
+
+	return ret;
 }
 
 struct bus_type acpi_bus_type = {
@@ -1226,11 +1237,15 @@  static int acpi_device_set_context(struc
 
 static int acpi_bus_remove(struct acpi_device *dev, int rmdevice)
 {
+	int ret;
+
 	if (!dev)
 		return -EINVAL;
 
 	dev->removal_type = ACPI_BUS_REMOVAL_EJECT;
-	device_release_driver(&dev->dev);
+	ret = device_release_driver(&dev->dev);
+	if (ret)
+		return ret;
 
 	if (!rmdevice)
 		return 0;
Index: linux-3.6/drivers/base/dd.c
===================================================================
--- linux-3.6.orig/drivers/base/dd.c	2012-10-11 18:31:40.191019505 +0900
+++ linux-3.6/drivers/base/dd.c	2012-10-11 18:31:46.873020548 +0900
@@ -475,9 +475,10 @@  EXPORT_SYMBOL_GPL(driver_attach);
  * __device_release_driver() must be called with @dev lock held.
  * When called for a USB interface, @dev->parent lock must be held as well.
  */
-static void __device_release_driver(struct device *dev)
+static int __device_release_driver(struct device *dev)
 {
 	struct device_driver *drv;
+	int ret = 0;
 
 	drv = dev->driver;
 	if (drv) {
@@ -493,9 +494,11 @@  static void __device_release_driver(stru
 		pm_runtime_put_sync(dev);
 
 		if (dev->bus && dev->bus->remove)
-			dev->bus->remove(dev);
+			ret = dev->bus->remove(dev);
 		else if (drv->remove)
-			drv->remove(dev);
+			ret = drv->remove(dev);
+		if (ret)
+			goto rollback;
 		devres_release_all(dev);
 		dev->driver = NULL;
 		dev_set_drvdata(dev, NULL);
@@ -506,6 +509,12 @@  static void __device_release_driver(stru
 						     dev);
 
 	}
+
+	return ret;
+
+rollback:
+	driver_sysfs_add(dev);
+	return ret;
 }
 
 /**
@@ -515,16 +524,19 @@  static void __device_release_driver(stru
  * Manually detach device from driver.
  * When called for a USB interface, @dev->parent lock must be held.
  */
-void device_release_driver(struct device *dev)
+int device_release_driver(struct device *dev)
 {
+	int ret;
 	/*
 	 * If anyone calls device_release_driver() recursively from
 	 * within their ->remove callback for the same device, they
 	 * will deadlock right here.
 	 */
 	device_lock(dev);
-	__device_release_driver(dev);
+	ret = __device_release_driver(dev);
 	device_unlock(dev);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(device_release_driver);
 
Index: linux-3.6/include/linux/device.h
===================================================================
--- linux-3.6.orig/include/linux/device.h	2012-10-11 18:31:40.194019508 +0900
+++ linux-3.6/include/linux/device.h	2012-10-11 18:31:46.881020556 +0900
@@ -834,7 +834,7 @@  static inline void *dev_get_platdata(con
  * for information on use.
  */
 extern int __must_check device_bind_driver(struct device *dev);
-extern void device_release_driver(struct device *dev);
+extern int device_release_driver(struct device *dev);
 extern int  __must_check device_attach(struct device *dev);
 extern int __must_check driver_attach(struct device_driver *drv);
 extern int __must_check device_reprobe(struct device *dev);