diff mbox

[V15,01/11] acpi: apei: read ack upon ghes record consumption

Message ID 1492556723-9189-2-git-send-email-tbaicar@codeaurora.org (mailing list archive)
State Not Applicable, archived
Headers show

Commit Message

Tyler Baicar April 18, 2017, 11:05 p.m. UTC
A RAS (Reliability, Availability, Serviceability) controller
may be a separate processor running in parallel with OS
execution, and may generate error records for consumption by
the OS. If the RAS controller produces multiple error records,
then they may be overwritten before the OS has consumed them.

The Generic Hardware Error Source (GHES) v2 structure
introduces the capability for the OS to acknowledge the
consumption of the error record generated by the RAS
controller. A RAS controller supporting GHESv2 shall wait for
the acknowledgment before writing a new error record, thus
eliminating the race condition.

Add support for parsing of GHESv2 sub-tables as well.

Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
CC: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org>
Reviewed-by: James Morse <james.morse@arm.com>
---
 drivers/acpi/apei/ghes.c | 55 +++++++++++++++++++++++++++++++++++++++++++++---
 drivers/acpi/apei/hest.c |  7 ++++--
 include/acpi/ghes.h      |  5 ++++-
 3 files changed, 61 insertions(+), 6 deletions(-)

Comments

Borislav Petkov April 19, 2017, 6:31 p.m. UTC | #1
On Tue, Apr 18, 2017 at 05:05:13PM -0600, Tyler Baicar wrote:
> A RAS (Reliability, Availability, Serviceability) controller
> may be a separate processor running in parallel with OS
> execution, and may generate error records for consumption by
> the OS. If the RAS controller produces multiple error records,
> then they may be overwritten before the OS has consumed them.
> 
> The Generic Hardware Error Source (GHES) v2 structure
> introduces the capability for the OS to acknowledge the
> consumption of the error record generated by the RAS
> controller. A RAS controller supporting GHESv2 shall wait for
> the acknowledgment before writing a new error record, thus
> eliminating the race condition.
> 
> Add support for parsing of GHESv2 sub-tables as well.
> 
> Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
> CC: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org>
> Reviewed-by: James Morse <james.morse@arm.com>
> ---
>  drivers/acpi/apei/ghes.c | 55 +++++++++++++++++++++++++++++++++++++++++++++---
>  drivers/acpi/apei/hest.c |  7 ++++--
>  include/acpi/ghes.h      |  5 ++++-
>  3 files changed, 61 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 79b3c9c..6d87ab7 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -46,6 +46,7 @@
>  #include <linux/nmi.h>
>  #include <linux/sched/clock.h>
>  
> +#include <acpi/actbl1.h>
>  #include <acpi/ghes.h>
>  #include <acpi/apei.h>
>  #include <asm/tlbflush.h>
> @@ -80,6 +81,10 @@
>  	((struct acpi_hest_generic_status *)				\
>  	 ((struct ghes_estatus_node *)(estatus_node) + 1))
>  
> +#define IS_HEST_TYPE_GENERIC_V2(ghes)				\
> +	((struct acpi_hest_header *)ghes->generic)->type ==	\

This is a nasty hack: casting the ghes->generic pointer to a pointer to its
first member, which is an acpi_hest_header.

Why isn't this a nice inline function with proper dereferencing:

static inline bool is_hest_type_generic_v2(struct ghes *ghes)
{
        return ghes->generic->header.type == ACPI_HEST_TYPE_GENERIC_ERROR_V2;
}

?

Also, please integrate scripts/checkpatch.pl in your patch creation
workflow. Some of the warnings/errors *actually* make sense.

>  /*
>   * This driver isn't really modular, however for the time being,
>   * continuing to use module_param is the easiest way to remain
> @@ -240,6 +245,17 @@ static int ghes_estatus_pool_expand(unsigned long len)
>  	return 0;
>  }
>  
> +static int map_gen_v2(struct ghes *ghes)
> +{
> +	return apei_map_generic_address(&ghes->generic_v2->read_ack_register);
> +}
> +
> +static void unmap_gen_v2(struct ghes *ghes)
> +{
> +	apei_unmap_generic_address(&ghes->generic_v2->read_ack_register);
> +	return;
> +}

Like this one, for example:

WARNING: void function return statements are not generally useful
#89: FILE: drivers/acpi/apei/ghes.c:257:
+       return;
+}

> +
>  static struct ghes *ghes_new(struct acpi_hest_generic *generic)
>  {
>  	struct ghes *ghes;
> @@ -249,10 +265,17 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic)
>  	ghes = kzalloc(sizeof(*ghes), GFP_KERNEL);
>  	if (!ghes)
>  		return ERR_PTR(-ENOMEM);
> +
>  	ghes->generic = generic;
> +	if (IS_HEST_TYPE_GENERIC_V2(ghes)) {
> +		rc = map_gen_v2(ghes);
> +		if (rc)
> +			goto err_free;
> +	}
> +
>  	rc = apei_map_generic_address(&generic->error_status_address);
>  	if (rc)
> -		goto err_free;
> +		goto err_unmap_read_ack_addr;
>  	error_block_length = generic->error_block_length;
>  	if (error_block_length > GHES_ESTATUS_MAX_SIZE) {
>  		pr_warning(FW_WARN GHES_PFX
> @@ -264,13 +287,16 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic)
>  	ghes->estatus = kmalloc(error_block_length, GFP_KERNEL);
>  	if (!ghes->estatus) {
>  		rc = -ENOMEM;
> -		goto err_unmap;
> +		goto err_unmap_status_addr;
>  	}
>  
>  	return ghes;
>  
> -err_unmap:
> +err_unmap_status_addr:
>  	apei_unmap_generic_address(&generic->error_status_address);
> +err_unmap_read_ack_addr:
> +	if (IS_HEST_TYPE_GENERIC_V2(ghes))
> +		unmap_gen_v2(ghes);
>  err_free:
>  	kfree(ghes);
>  	return ERR_PTR(rc);
> @@ -280,6 +306,8 @@ static void ghes_fini(struct ghes *ghes)
>  {
>  	kfree(ghes->estatus);
>  	apei_unmap_generic_address(&ghes->generic->error_status_address);
> +	if (IS_HEST_TYPE_GENERIC_V2(ghes))
> +		unmap_gen_v2(ghes);
>  }
>  
>  static inline int ghes_severity(int severity)
> @@ -649,6 +677,21 @@ static void ghes_estatus_cache_add(
>  	rcu_read_unlock();
>  }
>  
> +static int ghes_ack_error(struct acpi_hest_generic_v2 *generic_v2)

If you rename this function parameter to something shorter, say gv2, for
example...

> +{
> +	int rc;
> +	u64 val = 0;
> +
> +	rc = apei_read(&val, &generic_v2->read_ack_register);
> +	if (rc)
> +		return rc;
> +
> +	val &= generic_v2->read_ack_preserve << generic_v2->read_ack_register.bit_offset;
> +	val |= generic_v2->read_ack_write << generic_v2->read_ack_register.bit_offset;

... you can align those two nicely while remaining within the 80 cols width:

        val &= gv2->read_ack_preserve << gv2->read_ack_register.bit_offset;
        val |= gv2->read_ack_write    << gv2->read_ack_register.bit_offset;

and make them readable at a quick glance.

> +
> +	return apei_write(val, &generic_v2->read_ack_register);
> +}
> +
>  static int ghes_proc(struct ghes *ghes)
>  {
>  	int rc;
> @@ -661,6 +704,12 @@ static int ghes_proc(struct ghes *ghes)
>  			ghes_estatus_cache_add(ghes->generic, ghes->estatus);
>  	}
>  	ghes_do_proc(ghes, ghes->estatus);

This needs a comment explaining why v2 needs to ACK the error. The commit
message is not necessarily something we'll find quickly in the future.

> +
> +	if (IS_HEST_TYPE_GENERIC_V2(ghes)) {
> +		rc = ghes_ack_error(ghes->generic_v2);
> +		if (rc)
> +			return rc;
> +	}
>  out:
>  	ghes_clear_estatus(ghes);
>  	return rc;
Tyler Baicar April 19, 2017, 8:31 p.m. UTC | #2
On 4/19/2017 12:31 PM, Borislav Petkov wrote:
> On Tue, Apr 18, 2017 at 05:05:13PM -0600, Tyler Baicar wrote:
>> A RAS (Reliability, Availability, Serviceability) controller
>> may be a separate processor running in parallel with OS
>> execution, and may generate error records for consumption by
>> the OS. If the RAS controller produces multiple error records,
>> then they may be overwritten before the OS has consumed them.
>>
>> The Generic Hardware Error Source (GHES) v2 structure
>> introduces the capability for the OS to acknowledge the
>> consumption of the error record generated by the RAS
>> controller. A RAS controller supporting GHESv2 shall wait for
>> the acknowledgment before writing a new error record, thus
>> eliminating the race condition.
>>
>> Add support for parsing of GHESv2 sub-tables as well.
>>
>> Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
>> CC: Jonathan (Zhixiong) Zhang <zjzhang@codeaurora.org>
>> Reviewed-by: James Morse <james.morse@arm.com>
>> ---
>>   drivers/acpi/apei/ghes.c | 55 +++++++++++++++++++++++++++++++++++++++++++++---
>>   drivers/acpi/apei/hest.c |  7 ++++--
>>   include/acpi/ghes.h      |  5 ++++-
>>   3 files changed, 61 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
>> index 79b3c9c..6d87ab7 100644
>> --- a/drivers/acpi/apei/ghes.c
>> +++ b/drivers/acpi/apei/ghes.c
>> @@ -46,6 +46,7 @@
>>   #include <linux/nmi.h>
>>   #include <linux/sched/clock.h>
>>   
>> +#include <acpi/actbl1.h>
>>   #include <acpi/ghes.h>
>>   #include <acpi/apei.h>
>>   #include <asm/tlbflush.h>
>> @@ -80,6 +81,10 @@
>>   	((struct acpi_hest_generic_status *)				\
>>   	 ((struct ghes_estatus_node *)(estatus_node) + 1))
>>   
>> +#define IS_HEST_TYPE_GENERIC_V2(ghes)				\
>> +	((struct acpi_hest_header *)ghes->generic)->type ==	\
> This is a nasty hack: casting the ghes->generic pointer to a pointer to its
> first member, which is an acpi_hest_header.
>
> Why isn't this a nice inline function with proper dereferencing:
>
> static inline bool is_hest_type_generic_v2(struct ghes *ghes)
> {
>          return ghes->generic->header.type == ACPI_HEST_TYPE_GENERIC_ERROR_V2;
> }
>
> ?
I'll change it to this.
> Also, please integrate scripts/checkpatch.pl in your patch creation
> workflow. Some of the warnings/errors *actually* make sense.
>
>>   /*
>>    * This driver isn't really modular, however for the time being,
>>    * continuing to use module_param is the easiest way to remain
>> @@ -240,6 +245,17 @@ static int ghes_estatus_pool_expand(unsigned long len)
>>   	return 0;
>>   }
>>   
>> +static int map_gen_v2(struct ghes *ghes)
>> +{
>> +	return apei_map_generic_address(&ghes->generic_v2->read_ack_register);
>> +}
>> +
>> +static void unmap_gen_v2(struct ghes *ghes)
>> +{
>> +	apei_unmap_generic_address(&ghes->generic_v2->read_ack_register);
>> +	return;
>> +}
> Like this one, for example:
>
> WARNING: void function return statements are not generally useful
> #89: FILE: drivers/acpi/apei/ghes.c:257:
> +       return;
> +}
Will remove the return.
>
>> +
>>   static struct ghes *ghes_new(struct acpi_hest_generic *generic)
>>   {
>>   	struct ghes *ghes;
>> @@ -249,10 +265,17 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic)
>>   	ghes = kzalloc(sizeof(*ghes), GFP_KERNEL);
>>   	if (!ghes)
>>   		return ERR_PTR(-ENOMEM);
>> +
>>   	ghes->generic = generic;
>> +	if (IS_HEST_TYPE_GENERIC_V2(ghes)) {
>> +		rc = map_gen_v2(ghes);
>> +		if (rc)
>> +			goto err_free;
>> +	}
>> +
>>   	rc = apei_map_generic_address(&generic->error_status_address);
>>   	if (rc)
>> -		goto err_free;
>> +		goto err_unmap_read_ack_addr;
>>   	error_block_length = generic->error_block_length;
>>   	if (error_block_length > GHES_ESTATUS_MAX_SIZE) {
>>   		pr_warning(FW_WARN GHES_PFX
>> @@ -264,13 +287,16 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic)
>>   	ghes->estatus = kmalloc(error_block_length, GFP_KERNEL);
>>   	if (!ghes->estatus) {
>>   		rc = -ENOMEM;
>> -		goto err_unmap;
>> +		goto err_unmap_status_addr;
>>   	}
>>   
>>   	return ghes;
>>   
>> -err_unmap:
>> +err_unmap_status_addr:
>>   	apei_unmap_generic_address(&generic->error_status_address);
>> +err_unmap_read_ack_addr:
>> +	if (IS_HEST_TYPE_GENERIC_V2(ghes))
>> +		unmap_gen_v2(ghes);
>>   err_free:
>>   	kfree(ghes);
>>   	return ERR_PTR(rc);
>> @@ -280,6 +306,8 @@ static void ghes_fini(struct ghes *ghes)
>>   {
>>   	kfree(ghes->estatus);
>>   	apei_unmap_generic_address(&ghes->generic->error_status_address);
>> +	if (IS_HEST_TYPE_GENERIC_V2(ghes))
>> +		unmap_gen_v2(ghes);
>>   }
>>   
>>   static inline int ghes_severity(int severity)
>> @@ -649,6 +677,21 @@ static void ghes_estatus_cache_add(
>>   	rcu_read_unlock();
>>   }
>>   
>> +static int ghes_ack_error(struct acpi_hest_generic_v2 *generic_v2)
> If you name this function parameter to something shorter, say gv2, for
> example...
Will do.
>
>> +{
>> +	int rc;
>> +	u64 val = 0;
>> +
>> +	rc = apei_read(&val, &generic_v2->read_ack_register);
>> +	if (rc)
>> +		return rc;
>> +
>> +	val &= generic_v2->read_ack_preserve << generic_v2->read_ack_register.bit_offset;
>> +	val |= generic_v2->read_ack_write << generic_v2->read_ack_register.bit_offset;
> ... you can align those two nicely while remaining within the 80 cols width:
>
>          val &= gv2->read_ack_preserve << gv2->read_ack_register.bit_offset;
>          val |= gv2->read_ack_write    << gv2->read_ack_register.bit_offset;
>
> and make them readable at a quick glance.
Will do.
>> +
>> +	return apei_write(val, &generic_v2->read_ack_register);
>> +}
>> +
>>   static int ghes_proc(struct ghes *ghes)
>>   {
>>   	int rc;
>> @@ -661,6 +704,12 @@ static int ghes_proc(struct ghes *ghes)
>>   			ghes_estatus_cache_add(ghes->generic, ghes->estatus);
>>   	}
>>   	ghes_do_proc(ghes, ghes->estatus);
> This needs a comment why v2 needs to ACK the error. The commit message
> is not necessarily something we'll find quickly in the future.
Will do.

Thanks,
Tyler
Borislav Petkov April 19, 2017, 8:41 p.m. UTC | #3
On Wed, Apr 19, 2017 at 02:31:13PM -0600, Baicar, Tyler wrote:
> Will do.

You don't necessarily have to reply with "will do" if you agree with the
review.

Also, please wait until I've gone through the whole pile before sending
it again.

Thanks.
diff mbox

Patch

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 79b3c9c..6d87ab7 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -46,6 +46,7 @@ 
 #include <linux/nmi.h>
 #include <linux/sched/clock.h>
 
+#include <acpi/actbl1.h>
 #include <acpi/ghes.h>
 #include <acpi/apei.h>
 #include <asm/tlbflush.h>
@@ -80,6 +81,10 @@ 
 	((struct acpi_hest_generic_status *)				\
 	 ((struct ghes_estatus_node *)(estatus_node) + 1))
 
+#define IS_HEST_TYPE_GENERIC_V2(ghes)				\
+	((struct acpi_hest_header *)ghes->generic)->type ==	\
+	 ACPI_HEST_TYPE_GENERIC_ERROR_V2
+
 /*
  * This driver isn't really modular, however for the time being,
  * continuing to use module_param is the easiest way to remain
@@ -240,6 +245,17 @@  static int ghes_estatus_pool_expand(unsigned long len)
 	return 0;
 }
 
+static int map_gen_v2(struct ghes *ghes)
+{
+	return apei_map_generic_address(&ghes->generic_v2->read_ack_register);
+}
+
+static void unmap_gen_v2(struct ghes *ghes)
+{
+	apei_unmap_generic_address(&ghes->generic_v2->read_ack_register);
+	return;
+}
+
 static struct ghes *ghes_new(struct acpi_hest_generic *generic)
 {
 	struct ghes *ghes;
@@ -249,10 +265,17 @@  static struct ghes *ghes_new(struct acpi_hest_generic *generic)
 	ghes = kzalloc(sizeof(*ghes), GFP_KERNEL);
 	if (!ghes)
 		return ERR_PTR(-ENOMEM);
+
 	ghes->generic = generic;
+	if (IS_HEST_TYPE_GENERIC_V2(ghes)) {
+		rc = map_gen_v2(ghes);
+		if (rc)
+			goto err_free;
+	}
+
 	rc = apei_map_generic_address(&generic->error_status_address);
 	if (rc)
-		goto err_free;
+		goto err_unmap_read_ack_addr;
 	error_block_length = generic->error_block_length;
 	if (error_block_length > GHES_ESTATUS_MAX_SIZE) {
 		pr_warning(FW_WARN GHES_PFX
@@ -264,13 +287,16 @@  static struct ghes *ghes_new(struct acpi_hest_generic *generic)
 	ghes->estatus = kmalloc(error_block_length, GFP_KERNEL);
 	if (!ghes->estatus) {
 		rc = -ENOMEM;
-		goto err_unmap;
+		goto err_unmap_status_addr;
 	}
 
 	return ghes;
 
-err_unmap:
+err_unmap_status_addr:
 	apei_unmap_generic_address(&generic->error_status_address);
+err_unmap_read_ack_addr:
+	if (IS_HEST_TYPE_GENERIC_V2(ghes))
+		unmap_gen_v2(ghes);
 err_free:
 	kfree(ghes);
 	return ERR_PTR(rc);
@@ -280,6 +306,8 @@  static void ghes_fini(struct ghes *ghes)
 {
 	kfree(ghes->estatus);
 	apei_unmap_generic_address(&ghes->generic->error_status_address);
+	if (IS_HEST_TYPE_GENERIC_V2(ghes))
+		unmap_gen_v2(ghes);
 }
 
 static inline int ghes_severity(int severity)
@@ -649,6 +677,21 @@  static void ghes_estatus_cache_add(
 	rcu_read_unlock();
 }
 
+static int ghes_ack_error(struct acpi_hest_generic_v2 *generic_v2)
+{
+	int rc;
+	u64 val = 0;
+
+	rc = apei_read(&val, &generic_v2->read_ack_register);
+	if (rc)
+		return rc;
+
+	val &= generic_v2->read_ack_preserve << generic_v2->read_ack_register.bit_offset;
+	val |= generic_v2->read_ack_write << generic_v2->read_ack_register.bit_offset;
+
+	return apei_write(val, &generic_v2->read_ack_register);
+}
+
 static int ghes_proc(struct ghes *ghes)
 {
 	int rc;
@@ -661,6 +704,12 @@  static int ghes_proc(struct ghes *ghes)
 			ghes_estatus_cache_add(ghes->generic, ghes->estatus);
 	}
 	ghes_do_proc(ghes, ghes->estatus);
+
+	if (IS_HEST_TYPE_GENERIC_V2(ghes)) {
+		rc = ghes_ack_error(ghes->generic_v2);
+		if (rc)
+			return rc;
+	}
 out:
 	ghes_clear_estatus(ghes);
 	return rc;
diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c
index 8f2a98e..456b488 100644
--- a/drivers/acpi/apei/hest.c
+++ b/drivers/acpi/apei/hest.c
@@ -52,6 +52,7 @@ 
 	[ACPI_HEST_TYPE_AER_ENDPOINT] = sizeof(struct acpi_hest_aer),
 	[ACPI_HEST_TYPE_AER_BRIDGE] = sizeof(struct acpi_hest_aer_bridge),
 	[ACPI_HEST_TYPE_GENERIC_ERROR] = sizeof(struct acpi_hest_generic),
+	[ACPI_HEST_TYPE_GENERIC_ERROR_V2] = sizeof(struct acpi_hest_generic_v2),
 };
 
 static int hest_esrc_len(struct acpi_hest_header *hest_hdr)
@@ -141,7 +142,8 @@  static int __init hest_parse_ghes_count(struct acpi_hest_header *hest_hdr, void
 {
 	int *count = data;
 
-	if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR)
+	if (hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR ||
+	    hest_hdr->type == ACPI_HEST_TYPE_GENERIC_ERROR_V2)
 		(*count)++;
 	return 0;
 }
@@ -152,7 +154,8 @@  static int __init hest_parse_ghes(struct acpi_hest_header *hest_hdr, void *data)
 	struct ghes_arr *ghes_arr = data;
 	int rc, i;
 
-	if (hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR)
+	if (hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR &&
+	    hest_hdr->type != ACPI_HEST_TYPE_GENERIC_ERROR_V2)
 		return 0;
 
 	if (!((struct acpi_hest_generic *)hest_hdr)->enabled)
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 720446c..68f088a 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -13,7 +13,10 @@ 
 #define GHES_EXITING		0x0002
 
 struct ghes {
-	struct acpi_hest_generic *generic;
+	union {
+		struct acpi_hest_generic *generic;
+		struct acpi_hest_generic_v2 *generic_v2;
+	};
 	struct acpi_hest_generic_status *estatus;
 	u64 buffer_paddr;
 	unsigned long flags;