From patchwork Wed Jul 13 05:14:25 2011
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: "Huang, Ying" <ying.huang@intel.com>
X-Patchwork-Id: 970452
Received: from vger.kernel.org (vger.kernel.org [209.132.180.67])
	by demeter1.kernel.org (8.14.4/8.14.4) with ESMTP id p6D5GSFc002109
	for <patchwork-acpi@patchwork.kernel.org>;
	Wed, 13 Jul 2011 05:16:29 GMT
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
	id S964942Ab1GMFQB (ORCPT
	<rfc822;patchwork-acpi@patchwork.kernel.org>);
	Wed, 13 Jul 2011 01:16:01 -0400
Received: from mga09.intel.com ([134.134.136.24]:9791 "EHLO mga09.intel.com"
	rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP
	id S964917Ab1GMFOz (ORCPT <rfc822;linux-acpi@vger.kernel.org>);
	Wed, 13 Jul 2011 01:14:55 -0400
Received: from orsmga001.jf.intel.com ([10.7.209.18])
	by orsmga102.jf.intel.com with ESMTP; 12 Jul 2011 22:14:55 -0700
X-ExtLoop1: 1
X-IronPort-AV: E=Sophos;i="4.65,523,1304319600"; d="scan'208";a="27048248"
Received: from yhuang-dev.sh.intel.com ([10.239.13.105])
	by orsmga001.jf.intel.com with ESMTP; 12 Jul 2011 22:14:53 -0700
From: Huang Ying <ying.huang@intel.com>
To: Len Brown <lenb@kernel.org>
Cc: linux-kernel@vger.kernel.org, Andi Kleen <andi@firstfloor.org>,
	Tony Luck <tony.luck@intel.com>, ying.huang@intel.com,
	linux-acpi@vger.kernel.org
Subject: [PATCH 14/17] ACPI, APEI, GHES,
	printk support for recoverable error via NMI
Date: Wed, 13 Jul 2011 13:14:25 +0800
Message-Id: <1310534068-30547-15-git-send-email-ying.huang@intel.com>
X-Mailer: git-send-email 1.7.5.4
In-Reply-To: <1310534068-30547-1-git-send-email-ying.huang@intel.com>
References: <1310534068-30547-1-git-send-email-ying.huang@intel.com>
Sender: linux-acpi-owner@vger.kernel.org
Precedence: bulk
List-ID: <linux-acpi.vger.kernel.org>
X-Mailing-List: linux-acpi@vger.kernel.org
X-Greylist: IP, sender and recipient auto-whitelisted, not delayed by
	milter-greylist-4.2.6 (demeter1.kernel.org [140.211.167.41]);
	Wed, 13 Jul 2011 05:16:31 +0000 (UTC)

Some APEI GHES recoverable errors are reported via NMI, but printk is
not safe in NMI context.

To solve the issue, a lock-less memory allocator is used to allocate
memory in NMI handler, save the error record into the allocated
memory, put the error record into a lock-less list.  On the other
hand, an irq_work is used to delay the operation from NMI context to
IRQ context.  The irq_work IRQ handler will remove nodes from
lock-less list, printk the error record and do some further processing
include recovery operation, then free the memory.

Signed-off-by: Huang Ying <ying.huang@intel.com>
---
 drivers/acpi/apei/Kconfig |    2 
 drivers/acpi/apei/ghes.c  |  209 ++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 193 insertions(+), 18 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

--- a/drivers/acpi/apei/Kconfig
+++ b/drivers/acpi/apei/Kconfig
@@ -13,6 +13,8 @@ config ACPI_APEI_GHES
 	bool "APEI Generic Hardware Error Source"
 	depends on ACPI_APEI && X86
 	select ACPI_HED
+	select LLIST
+	select GENERIC_ALLOCATOR
 	help
 	  Generic Hardware Error Source provides a way to report
 	  platform hardware errors (such as that from chipset). It
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -12,7 +12,7 @@
  * For more information about Generic Hardware Error Source, please
  * refer to ACPI Specification version 4.0, section 17.3.2.6
  *
- * Copyright 2010 Intel Corp.
+ * Copyright 2010,2011 Intel Corp.
  *   Author: Huang Ying <ying.huang@intel.com>
  *
  * This program is free software; you can redistribute it and/or
@@ -42,6 +42,9 @@
 #include <linux/mutex.h>
 #include <linux/ratelimit.h>
 #include <linux/vmalloc.h>
+#include <linux/irq_work.h>
+#include <linux/llist.h>
+#include <linux/genalloc.h>
 #include <acpi/apei.h>
 #include <acpi/atomicio.h>
 #include <acpi/hed.h>
@@ -53,6 +56,15 @@
 #define GHES_PFX	"GHES: "
 
 #define GHES_ESTATUS_MAX_SIZE		65536
+#define GHES_ESOURCE_PREALLOC_MAX_SIZE	65536
+
+#define GHES_ESTATUS_POOL_MIN_ALLOC_ORDER 3
+
+#define GHES_ESTATUS_NODE_LEN(estatus_len)			\
+	(sizeof(struct ghes_estatus_node) + (estatus_len))
+#define GHES_ESTATUS_FROM_NODE(estatus_node)				\
+	((struct acpi_hest_generic_status *)				\
+	 ((struct ghes_estatus_node *)(estatus_node) + 1))
 
 /*
  * One struct ghes is created for each generic hardware error source.
@@ -77,6 +89,11 @@ struct ghes {
 	};
 };
 
+struct ghes_estatus_node {
+	struct llist_node llnode;
+	struct acpi_hest_generic *generic;
+};
+
 int ghes_disable;
 module_param_named(disable, ghes_disable, bool, 0);
 
@@ -124,6 +141,19 @@ static struct vm_struct *ghes_ioremap_ar
 static DEFINE_RAW_SPINLOCK(ghes_ioremap_lock_nmi);
 static DEFINE_SPINLOCK(ghes_ioremap_lock_irq);
 
+/*
+ * printk is not safe in NMI context.  So in NMI handler, we allocate
+ * required memory from lock-less memory allocator
+ * (ghes_estatus_pool), save estatus into it, put them into lock-less
+ * list (ghes_estatus_llist), then delay printk into IRQ context via
+ * irq_work (ghes_proc_irq_work).  ghes_estatus_size_request record
+ * required pool size by all NMI error source.
+ */
+static struct gen_pool *ghes_estatus_pool;
+static unsigned long ghes_estatus_pool_size_request;
+static struct llist_head ghes_estatus_llist;
+static struct irq_work ghes_proc_irq_work;
+
 static int ghes_ioremap_init(void)
 {
 	ghes_ioremap_area = __get_vm_area(PAGE_SIZE * GHES_IOREMAP_PAGES,
@@ -183,6 +213,55 @@ static void ghes_iounmap_irq(void __iome
 	__flush_tlb_one(vaddr);
 }
 
+static int ghes_estatus_pool_init(void)
+{
+	ghes_estatus_pool = gen_pool_create(GHES_ESTATUS_POOL_MIN_ALLOC_ORDER, -1);
+	if (!ghes_estatus_pool)
+		return -ENOMEM;
+	return 0;
+}
+
+static void ghes_estatus_pool_free_chunk_page(struct gen_pool *pool,
+					      struct gen_pool_chunk *chunk,
+					      void *data)
+{
+	free_page(chunk->start_addr);
+}
+
+static void ghes_estatus_pool_exit(void)
+{
+	gen_pool_for_each_chunk(ghes_estatus_pool,
+				ghes_estatus_pool_free_chunk_page, NULL);
+	gen_pool_destroy(ghes_estatus_pool);
+}
+
+static int ghes_estatus_pool_expand(unsigned long len)
+{
+	unsigned long i, pages, size, addr;
+	int ret;
+
+	ghes_estatus_pool_size_request += PAGE_ALIGN(len);
+	size = gen_pool_size(ghes_estatus_pool);
+	if (size >= ghes_estatus_pool_size_request)
+		return 0;
+	pages = (ghes_estatus_pool_size_request - size) / PAGE_SIZE;
+	for (i = 0; i < pages; i++) {
+		addr = __get_free_page(GFP_KERNEL);
+		if (!addr)
+			return -ENOMEM;
+		ret = gen_pool_add(ghes_estatus_pool, addr, PAGE_SIZE, -1);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static void ghes_estatus_pool_shrink(unsigned long len)
+{
+	ghes_estatus_pool_size_request -= PAGE_ALIGN(len);
+}
+
 static struct ghes *ghes_new(struct acpi_hest_generic *generic)
 {
 	struct ghes *ghes;
@@ -344,13 +423,13 @@ static void ghes_clear_estatus(struct gh
 	ghes->flags &= ~GHES_TO_CLEAR;
 }
 
-static void ghes_do_proc(struct ghes *ghes)
+static void ghes_do_proc(const struct acpi_hest_generic_status *estatus)
 {
 	int sev, processed = 0;
 	struct acpi_hest_generic_data *gdata;
 
-	sev = ghes_severity(ghes->estatus->error_severity);
-	apei_estatus_for_each_section(ghes->estatus, gdata) {
+	sev = ghes_severity(estatus->error_severity);
+	apei_estatus_for_each_section(estatus, gdata) {
 #ifdef CONFIG_X86_MCE
 		if (!uuid_le_cmp(*(uuid_le *)gdata->section_type,
 				 CPER_SEC_PLATFORM_MEM)) {
@@ -363,27 +442,37 @@ static void ghes_do_proc(struct ghes *gh
 	}
 }
 
-static void __ghes_print_estatus(const char *pfx, struct ghes *ghes)
+static void __ghes_print_estatus(const char *pfx,
+				 const struct acpi_hest_generic *generic,
+				 const struct acpi_hest_generic_status *estatus)
 {
 	if (pfx == NULL) {
-		if (ghes_severity(ghes->estatus->error_severity) <=
+		if (ghes_severity(estatus->error_severity) <=
 		    GHES_SEV_CORRECTED)
 			pfx = KERN_WARNING HW_ERR;
 		else
 			pfx = KERN_ERR HW_ERR;
 	}
 	printk("%s""Hardware error from APEI Generic Hardware Error Source: %d\n",
-	       pfx, ghes->generic->header.source_id);
-	apei_estatus_print(pfx, ghes->estatus);
+	       pfx, generic->header.source_id);
+	apei_estatus_print(pfx, estatus);
 }
 
-static void ghes_print_estatus(const char *pfx, struct ghes *ghes)
+static void ghes_print_estatus(const char *pfx,
+			       const struct acpi_hest_generic *generic,
+			       const struct acpi_hest_generic_status *estatus)
 {
 	/* Not more than 2 messages every 5 seconds */
-	static DEFINE_RATELIMIT_STATE(ratelimit, 5*HZ, 2);
+	static DEFINE_RATELIMIT_STATE(ratelimit_corrected, 5*HZ, 2);
+	static DEFINE_RATELIMIT_STATE(ratelimit_uncorrected, 5*HZ, 2);
+	struct ratelimit_state *ratelimit;
 
-	if (__ratelimit(&ratelimit))
-		__ghes_print_estatus(pfx, ghes);
+	if (ghes_severity(estatus->error_severity) <= GHES_SEV_CORRECTED)
+		ratelimit = &ratelimit_corrected;
+	else
+		ratelimit = &ratelimit_uncorrected;
+	if (__ratelimit(ratelimit))
+		__ghes_print_estatus(pfx, generic, estatus);
 }
 
 static int ghes_proc(struct ghes *ghes)
@@ -393,8 +482,8 @@ static int ghes_proc(struct ghes *ghes)
 	rc = ghes_read_estatus(ghes, 0);
 	if (rc)
 		goto out;
-	ghes_print_estatus(NULL, ghes);
-	ghes_do_proc(ghes);
+	ghes_print_estatus(NULL, ghes->generic, ghes->estatus);
+	ghes_do_proc(ghes->estatus);
 
 out:
 	ghes_clear_estatus(ghes);
@@ -453,6 +542,40 @@ static int ghes_notify_sci(struct notifi
 	return ret;
 }
 
+static void ghes_proc_in_irq(struct irq_work *irq_work)
+{
+	struct llist_node *llnode, *next, *tail = NULL;
+	struct ghes_estatus_node *estatus_node;
+	struct acpi_hest_generic_status *estatus;
+	u32 len, node_len;
+
+	/*
+	 * Because the time order of estatus in list is reversed,
+	 * revert it back to proper order.
+	 */
+	llnode = llist_del_all(&ghes_estatus_llist);
+	while (llnode) {
+		next = llnode->next;
+		llnode->next = tail;
+		tail = llnode;
+		llnode = next;
+	}
+	llnode = tail;
+	while (llnode) {
+		next = llnode->next;
+		estatus_node = llist_entry(llnode, struct ghes_estatus_node,
+					   llnode);
+		estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
+		len = apei_estatus_len(estatus);
+		node_len = GHES_ESTATUS_NODE_LEN(len);
+		ghes_do_proc(estatus);
+		ghes_print_estatus(NULL, estatus_node->generic, estatus);
+		gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node,
+			      node_len);
+		llnode = next;
+	}
+}
+
 static int ghes_notify_nmi(struct notifier_block *this,
 				  unsigned long cmd, void *data)
 {
@@ -482,7 +605,8 @@ static int ghes_notify_nmi(struct notifi
 
 	if (sev_global >= GHES_SEV_PANIC) {
 		oops_begin();
-		__ghes_print_estatus(KERN_EMERG HW_ERR, ghes_global);
+		__ghes_print_estatus(KERN_EMERG HW_ERR, ghes_global->generic,
+				     ghes_global->estatus);
 		/* reboot to log the error! */
 		if (panic_timeout == 0)
 			panic_timeout = ghes_panic_timeout;
@@ -490,12 +614,31 @@ static int ghes_notify_nmi(struct notifi
 	}
 
 	list_for_each_entry_rcu(ghes, &ghes_nmi, list) {
+#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+		u32 len, node_len;
+		struct ghes_estatus_node *estatus_node;
+		struct acpi_hest_generic_status *estatus;
+#endif
 		if (!(ghes->flags & GHES_TO_CLEAR))
 			continue;
-		/* Do not print estatus because printk is not NMI safe */
-		ghes_do_proc(ghes);
+#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+		/* Save estatus for further processing in IRQ context */
+		len = apei_estatus_len(ghes->estatus);
+		node_len = GHES_ESTATUS_NODE_LEN(len);
+		estatus_node = (void *)gen_pool_alloc(ghes_estatus_pool,
+						      node_len);
+		if (estatus_node) {
+			estatus_node->generic = ghes->generic;
+			estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
+			memcpy(estatus, ghes->estatus, len);
+			llist_add(&estatus_node->llnode, &ghes_estatus_llist);
+		}
+#endif
 		ghes_clear_estatus(ghes);
 	}
+#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
+	irq_work_queue(&ghes_proc_irq_work);
+#endif
 
 out:
 	raw_spin_unlock(&ghes_nmi_lock);
@@ -510,10 +653,26 @@ static struct notifier_block ghes_notifi
 	.notifier_call = ghes_notify_nmi,
 };
 
+static unsigned long ghes_esource_prealloc_size(
+	const struct acpi_hest_generic *generic)
+{
+	unsigned long block_length, prealloc_records, prealloc_size;
+
+	block_length = min_t(unsigned long, generic->error_block_length,
+			     GHES_ESTATUS_MAX_SIZE);
+	prealloc_records = max_t(unsigned long,
+				 generic->records_to_preallocate, 1);
+	prealloc_size = min_t(unsigned long, block_length * prealloc_records,
+			      GHES_ESOURCE_PREALLOC_MAX_SIZE);
+
+	return prealloc_size;
+}
+
 static int __devinit ghes_probe(struct platform_device *ghes_dev)
 {
 	struct acpi_hest_generic *generic;
 	struct ghes *ghes = NULL;
+	unsigned long len;
 	int rc = -EINVAL;
 
 	generic = *(struct acpi_hest_generic **)ghes_dev->dev.platform_data;
@@ -579,6 +738,8 @@ static int __devinit ghes_probe(struct p
 		mutex_unlock(&ghes_list_mutex);
 		break;
 	case ACPI_HEST_NOTIFY_NMI:
+		len = ghes_esource_prealloc_size(generic);
+		ghes_estatus_pool_expand(len);
 		mutex_lock(&ghes_list_mutex);
 		if (list_empty(&ghes_nmi))
 			register_die_notifier(&ghes_notifier_nmi);
@@ -603,6 +764,7 @@ static int __devexit ghes_remove(struct
 {
 	struct ghes *ghes;
 	struct acpi_hest_generic *generic;
+	unsigned long len;
 
 	ghes = platform_get_drvdata(ghes_dev);
 	generic = ghes->generic;
@@ -633,6 +795,8 @@ static int __devexit ghes_remove(struct
 		 * freed after NMI handler finishes.
 		 */
 		synchronize_rcu();
+		len = ghes_esource_prealloc_size(generic);
+		ghes_estatus_pool_shrink(len);
 		break;
 	default:
 		BUG();
@@ -673,14 +837,20 @@ static int __init ghes_init(void)
 		return -EINVAL;
 	}
 
+	init_irq_work(&ghes_proc_irq_work, ghes_proc_in_irq);
+
 	rc = ghes_ioremap_init();
 	if (rc)
 		goto err;
 
-	rc = platform_driver_register(&ghes_platform_driver);
+	rc = ghes_estatus_pool_init();
 	if (rc)
 		goto err_ioremap_exit;
 
+	rc = platform_driver_register(&ghes_platform_driver);
+	if (rc)
+		goto err_pool_exit;
+
 	rc = apei_osc_setup();
 	if (rc == 0 && osc_sb_apei_support_acked)
 		pr_info(GHES_PFX "APEI firmware first mode is enabled by APEI bit and WHEA _OSC.\n");
@@ -692,6 +862,8 @@ static int __init ghes_init(void)
 		pr_info(GHES_PFX "Failed to enable APEI firmware first mode.\n");
 
 	return 0;
+err_pool_exit:
+	ghes_estatus_pool_exit();
 err_ioremap_exit:
 	ghes_ioremap_exit();
 err:
@@ -701,6 +873,7 @@ err:
 static void __exit ghes_exit(void)
 {
 	platform_driver_unregister(&ghes_platform_driver);
+	ghes_estatus_pool_exit();
 	ghes_ioremap_exit();
 }