[15/17] ACPI, APEI, GHES, Error records content based throttle

Message ID	1310534068-30547-16-git-send-email-ying.huang@intel.com (mailing list archive)
State	New, archived
Headers	show Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by demeter2.kernel.org (8.14.4/8.14.4) with ESMTP id p6D5GCCj014710 for <patchwork-acpi@patchwork.kernel.org>; Wed, 13 Jul 2011 05:16:12 GMT From: Huang Ying <ying.huang@intel.com> To: Len Brown <lenb@kernel.org> Cc: linux-kernel@vger.kernel.org, Andi Kleen <andi@firstfloor.org>, Tony Luck <tony.luck@intel.com>, ying.huang@intel.com, linux-acpi@vger.kernel.org Subject: [PATCH 15/17] ACPI, APEI, GHES, Error records content based throttle Date: Wed, 13 Jul 2011 13:14:26 +0800 Message-Id: <1310534068-30547-16-git-send-email-ying.huang@intel.com> In-Reply-To: <1310534068-30547-1-git-send-email-ying.huang@intel.com> References: <1310534068-30547-1-git-send-email-ying.huang@intel.com> Sender: linux-acpi-owner@vger.kernel.org Precedence: bulk

--- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -60,6 +60,21 @@ #define GHES_ESTATUS_POOL_MIN_ALLOC_ORDER 3 +/* This is just an estimation for memory pool allocation */ +#define GHES_ESTATUS_CACHE_AVG_SIZE 512 + +#define GHES_ESTATUS_CACHES_SIZE 4 + +#define GHES_ESTATUS_IN_CACHE_MAX_NSEC (10 * NSEC_PER_SEC) +/* Prevent too many caches are allocated because of RCU */ +#define GHES_ESTATUS_CACHE_ALLOCED_MAX (GHES_ESTATUS_CACHES_SIZE * 3 / 2) + +#define GHES_ESTATUS_CACHE_LEN(estatus_len) \ + (sizeof(struct ghes_estatus_cache) + (estatus_len)) +#define GHES_ESTATUS_FROM_CACHE(estatus_cache) \ + ((struct acpi_hest_generic_status *) \ + ((struct ghes_estatus_cache *)(estatus_cache) + 1)) + #define GHES_ESTATUS_NODE_LEN(estatus_len) \ (sizeof(struct ghes_estatus_node) + (estatus_len)) #define GHES_ESTATUS_FROM_NODE(estatus_node) \ @@ -94,6 +109,14 @@ struct ghes_estatus_node { struct acpi_hest_generic *generic; }; +struct ghes_estatus_cache { + u32 estatus_len; + atomic_t count; + struct acpi_hest_generic *generic; + unsigned long long time_in; + struct rcu_head rcu; +}; + int ghes_disable; module_param_named(disable, ghes_disable, bool, 0); @@ -154,6 +177,9 @@ static unsigned long ghes_estatus_pool_s static struct llist_head ghes_estatus_llist; static struct irq_work ghes_proc_irq_work; +struct ghes_estatus_cache *ghes_estatus_caches[GHES_ESTATUS_CACHES_SIZE]; +static atomic_t ghes_estatus_cache_alloced; + static int ghes_ioremap_init(void) { ghes_ioremap_area = __get_vm_area(PAGE_SIZE * GHES_IOREMAP_PAGES, @@ -458,9 +484,9 @@ static void __ghes_print_estatus(const c apei_estatus_print(pfx, estatus); } -static void ghes_print_estatus(const char *pfx, - const struct acpi_hest_generic *generic, - const struct acpi_hest_generic_status *estatus) +static int ghes_print_estatus(const char *pfx, + const struct acpi_hest_generic *generic, + const struct acpi_hest_generic_status *estatus) { /* Not more than 2 messages every 5 seconds */ static DEFINE_RATELIMIT_STATE(ratelimit_corrected, 5*HZ, 2); @@ -471,8 +497,137 @@ static void ghes_print_estatus(const cha ratelimit = &ratelimit_corrected; else ratelimit = &ratelimit_uncorrected; - if (__ratelimit(ratelimit)) + if (__ratelimit(ratelimit)) { __ghes_print_estatus(pfx, generic, estatus); + return 1; + } + return 0; +} + +/* + * GHES error status reporting throttle, to report more kinds of + * errors, instead of just most frequently occurred errors. + */ +static int ghes_estatus_cached(struct acpi_hest_generic_status *estatus) +{ + u32 len; + int i, cached = 0; + unsigned long long now; + struct ghes_estatus_cache *cache; + struct acpi_hest_generic_status *cache_estatus; + + len = apei_estatus_len(estatus); + rcu_read_lock(); + for (i = 0; i < GHES_ESTATUS_CACHES_SIZE; i++) { + cache = rcu_dereference(ghes_estatus_caches[i]); + if (cache == NULL) + continue; + if (len != cache->estatus_len) + continue; + cache_estatus = GHES_ESTATUS_FROM_CACHE(cache); + if (memcmp(estatus, cache_estatus, len)) + continue; + atomic_inc(&cache->count); + now = sched_clock(); + if (now - cache->time_in < GHES_ESTATUS_IN_CACHE_MAX_NSEC) + cached = 1; + break; + } + rcu_read_unlock(); + return cached; +} + +static struct ghes_estatus_cache *ghes_estatus_cache_alloc( + struct acpi_hest_generic *generic, + struct acpi_hest_generic_status *estatus) +{ + int alloced; + u32 len, cache_len; + struct ghes_estatus_cache *cache; + struct acpi_hest_generic_status *cache_estatus; + + alloced = atomic_add_return(1, &ghes_estatus_cache_alloced); + if (alloced > GHES_ESTATUS_CACHE_ALLOCED_MAX) { + atomic_dec(&ghes_estatus_cache_alloced); + return NULL; + } + len = apei_estatus_len(estatus); + cache_len = GHES_ESTATUS_CACHE_LEN(len); + cache = (void *)gen_pool_alloc(ghes_estatus_pool, cache_len); + if (!cache) { + atomic_dec(&ghes_estatus_cache_alloced); + return NULL; + } + cache_estatus = GHES_ESTATUS_FROM_CACHE(cache); + memcpy(cache_estatus, estatus, len); + cache->estatus_len = len; + atomic_set(&cache->count, 0); + cache->generic = generic; + cache->time_in = sched_clock(); + return cache; +} + +static void ghes_estatus_cache_free(struct ghes_estatus_cache *cache) +{ + u32 len; + + len = apei_estatus_len(GHES_ESTATUS_FROM_CACHE(cache)); + len = GHES_ESTATUS_CACHE_LEN(len); + gen_pool_free(ghes_estatus_pool, (unsigned long)cache, len); + atomic_dec(&ghes_estatus_cache_alloced); +} + +static void ghes_estatus_cache_rcu_free(struct rcu_head *head) +{ + struct ghes_estatus_cache *cache; + + cache = container_of(head, struct ghes_estatus_cache, rcu); + ghes_estatus_cache_free(cache); +} + +static void ghes_estatus_cache_add( + struct acpi_hest_generic *generic, + struct acpi_hest_generic_status *estatus) +{ + int i, slot = -1, count; + unsigned long long now, duration, period, max_period = 0; + struct ghes_estatus_cache *cache, *slot_cache = NULL, *new_cache; + + new_cache = ghes_estatus_cache_alloc(generic, estatus); + if (new_cache == NULL) + return; + rcu_read_lock(); + now = sched_clock(); + for (i = 0; i < GHES_ESTATUS_CACHES_SIZE; i++) { + cache = rcu_dereference(ghes_estatus_caches[i]); + if (cache == NULL) { + slot = i; + slot_cache = NULL; + break; + } + duration = now - cache->time_in; + if (duration >= GHES_ESTATUS_IN_CACHE_MAX_NSEC) { + slot = i; + slot_cache = cache; + break; + } + count = atomic_read(&cache->count); + period = duration / (count + 1); + if (period > max_period) { + max_period = period; + slot = i; + slot_cache = cache; + } + } + /* new_cache must be put into array after its contents are written */ + smp_wmb(); + if (slot != -1 && cmpxchg(ghes_estatus_caches + slot, + slot_cache, new_cache) == slot_cache) { + if (slot_cache) + call_rcu(&slot_cache->rcu, ghes_estatus_cache_rcu_free); + } else + ghes_estatus_cache_free(new_cache); + rcu_read_unlock(); } static int ghes_proc(struct ghes *ghes) @@ -482,9 +637,11 @@ static int ghes_proc(struct ghes *ghes) rc = ghes_read_estatus(ghes, 0); if (rc) goto out; - ghes_print_estatus(NULL, ghes->generic, ghes->estatus); + if (!ghes_estatus_cached(ghes->estatus)) { + if (ghes_print_estatus(NULL, ghes->generic, ghes->estatus)) + ghes_estatus_cache_add(ghes->generic, ghes->estatus); + } ghes_do_proc(ghes->estatus); - out: ghes_clear_estatus(ghes); return 0; @@ -546,6 +703,7 @@ static void ghes_proc_in_irq(struct irq_ { struct llist_node *llnode, *next, *tail = NULL; struct ghes_estatus_node *estatus_node; + struct acpi_hest_generic *generic; struct acpi_hest_generic_status *estatus; u32 len, node_len; @@ -569,7 +727,11 @@ static void ghes_proc_in_irq(struct irq_ len = apei_estatus_len(estatus); node_len = GHES_ESTATUS_NODE_LEN(len); ghes_do_proc(estatus); - ghes_print_estatus(NULL, estatus_node->generic, estatus); + if (!ghes_estatus_cached(estatus)) { + generic = estatus_node->generic; + if (ghes_print_estatus(NULL, generic, estatus)) + ghes_estatus_cache_add(generic, estatus); + } gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len); llnode = next; @@ -622,6 +784,8 @@ static int ghes_notify_nmi(struct notifi if (!(ghes->flags & GHES_TO_CLEAR)) continue; #ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG + if (ghes_estatus_cached(ghes->estatus)) + goto next; /* Save estatus for further processing in IRQ context */ len = apei_estatus_len(ghes->estatus); node_len = GHES_ESTATUS_NODE_LEN(len); @@ -633,6 +797,7 @@ static int ghes_notify_nmi(struct notifi memcpy(estatus, ghes->estatus, len); llist_add(&estatus_node->llnode, &ghes_estatus_llist); } +next: #endif ghes_clear_estatus(ghes); } @@ -847,6 +1012,11 @@ static int __init ghes_init(void) if (rc) goto err_ioremap_exit; + rc = ghes_estatus_pool_expand(GHES_ESTATUS_CACHE_AVG_SIZE * + GHES_ESTATUS_CACHE_ALLOCED_MAX); + if (rc) + goto err_pool_exit; + rc = platform_driver_register(&ghes_platform_driver); if (rc) goto err_pool_exit;

[15/17] ACPI, APEI, GHES, Error records content based throttle

Commit Message

Patch