diff mbox

[v2] irqchip/gicv3-its: Enable cacheable attribute Read-allocate hints

Message ID 8c3ea714-5bd2-6784-8eae-8b953860c8f6@codeaurora.org (mailing list archive)
State New, archived
Headers show

Commit Message

Shanker Donthineni Nov. 8, 2016, 7:25 a.m. UTC
Hi Marc,

I attached the test code that I used to benchmark ReadAllocate hints 
performance on Qualcomm hardware.

Shanker


On 11/08/2016 01:18 AM, Shanker Donthineni wrote:
> Read-allocation hints are not enabled for either the GIC-ITS or GICR
> tables. This forces the hardware to always read the table contents
> from an external memory (DDR) which is slow compared to cache memory.
> Most of the tables are often read by hardware. So, it's better to
> enable Read-allocate hints in addition to Write-allocate hints in
> order to improve the GICR_PEND, GICR_PROP, Collection, Device, and
> vCPU tables lookup time.
>
> Signed-off-by: Shanker Donthineni <shankerd@codeaurora.org>
> ---
> Implemented a test case to prove that enabling Read Allocation hints
> improves ITS lookup time ~15% while delivering a LPI event. Used the
> ITS command INT to analyze time spent in device, collection, prop and
> pending table lookups.
>    
> Pseudo code:
>      Create a fake ITS device.
>      Record PMU cycle counter before sending INT command.
>      Build and send ITS INT command.
>           ITS hardware triggers device table lookup.
>                ITTE table & collection table lookup.
>            ITS property table lookup.
>            ITS pending table lookup.
>            Deliver interrupt to CPU interface.
>      do_IRQ() called.
>      Measure the total CPU cycle spent to reach this point.
>
> Without ReadAllocation hints:
> /sys/kernel/debug # echo 100 > lpitest
> [   94.693968] CPU[1] niter=100 cycles=0x8dfc0 avg=0x16b7 min=0x1652
>
> With ReadAllocation hints:
> /sys/kernel/debug # echo 100 > lpitest
> [   98.617873] CPU[1] niter=100 cycles=0x7df49 avg=0x1427 min=0x1388
>
>   drivers/irqchip/irq-gic-v3-its.c | 10 +++++-----
>   1 file changed, 5 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> index c5dee30..227a1eb 100644
> --- a/drivers/irqchip/irq-gic-v3-its.c
> +++ b/drivers/irqchip/irq-gic-v3-its.c
> @@ -961,7 +961,7 @@ static bool its_parse_baser_device(struct its_node *its, struct its_baser *baser
>   				   u32 psz, u32 *order)
>   {
>   	u64 esz = GITS_BASER_ENTRY_SIZE(its_read_baser(its, baser));
> -	u64 val = GITS_BASER_InnerShareable | GITS_BASER_WaWb;
> +	u64 val = GITS_BASER_InnerShareable | GITS_BASER_RaWaWb;
>   	u32 ids = its->device_ids;
>   	u32 new_order = *order;
>   	bool indirect = false;
> @@ -1026,7 +1026,7 @@ static int its_alloc_tables(struct its_node *its)
>   	u64 typer = gic_read_typer(its->base + GITS_TYPER);
>   	u32 ids = GITS_TYPER_DEVBITS(typer);
>   	u64 shr = GITS_BASER_InnerShareable;
> -	u64 cache = GITS_BASER_WaWb;
> +	u64 cache = GITS_BASER_RaWaWb;
>   	u32 psz = SZ_64K;
>   	int err, i;
>   
> @@ -1123,7 +1123,7 @@ static void its_cpu_init_lpis(void)
>   	/* set PROPBASE */
>   	val = (page_to_phys(gic_rdists->prop_page) |
>   	       GICR_PROPBASER_InnerShareable |
> -	       GICR_PROPBASER_WaWb |
> +	       GICR_PROPBASER_RaWaWb |
>   	       ((LPI_NRBITS - 1) & GICR_PROPBASER_IDBITS_MASK));
>   
>   	writeq_relaxed(val, rbase + GICR_PROPBASER);
> @@ -1148,7 +1148,7 @@ static void its_cpu_init_lpis(void)
>   	/* set PENDBASE */
>   	val = (page_to_phys(pend_page) |
>   	       GICR_PENDBASER_InnerShareable |
> -	       GICR_PENDBASER_WaWb);
> +	       GICR_PENDBASER_RaWaWb);
>   
>   	writeq_relaxed(val, rbase + GICR_PENDBASER);
>   	tmp = readq_relaxed(rbase + GICR_PENDBASER);
> @@ -1712,7 +1712,7 @@ static int __init its_probe_one(struct resource *res,
>   		goto out_free_tables;
>   
>   	baser = (virt_to_phys(its->cmd_base)	|
> -		 GITS_CBASER_WaWb		|
> +		 GITS_CBASER_RaWaWb		|
>   		 GITS_CBASER_InnerShareable	|
>   		 (ITS_CMD_QUEUE_SZ / SZ_4K - 1)	|
>   		 GITS_CBASER_VALID);

Comments

kernel test robot Nov. 8, 2016, 9:56 a.m. UTC | #1
Hi Shanker,

[auto build test ERROR on arm64/for-next/core]
[also build test ERROR on v4.9-rc4 next-20161028]
[cannot apply to tip/irq/core arm-jcooper/irqchip/core]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Shanker-Donthineni/irqchip-gicv3-its-Test-code-for-measuring-Read-allocate/20161108-154723
base:   https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git for-next/core
config: arm-multi_v7_defconfig (attached as .config)
compiler: arm-linux-gnueabi-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=arm 

All errors (new ones prefixed by >>):

   /tmp/ccEUFgcT.s: Assembler messages:
>> /tmp/ccEUFgcT.s:700: Error: selected processor does not support requested special purpose register -- `mrs r10,pmccntr_el0'

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
diff mbox

Patch

From 8a60136b1b805746451b5ff4d82eaa251bfd8c6c Mon Sep 17 00:00:00 2001
From: Shanker Donthineni <shankerd@codeaurora.org>
Date: Mon, 7 Nov 2016 23:08:28 -0600
Subject: [PATCH 2/2] irqchip/gicv3-its: Test code for measuring Read-allocate
 hints performance

Apply this patch on top of the v4.9-rc4 kernel, then run the two commands
below to measure the performance improvement from Read-allocate hints.

mount -t debugfs none /sys/kernel/debug
echo 10 > /sys/kernel/debug/lpitest

Test displays the CPU cycles that are spent to deliver LPI event.

Example:
[   93.139710] CPU[1] niter=10 cycles=0xdd8c avg=0x1627 min=0x13a3

Signed-off-by: Shanker Donthineni <shankerd@codeaurora.org>
---
 arch/arm64/mm/cache.S              |  55 +++++++++++++
 drivers/irqchip/irq-gic-v3-its.c   | 160 +++++++++++++++++++++++++++++++++++++
 drivers/irqchip/irq-gic-v3.c       |   7 ++
 include/linux/irqchip/arm-gic-v3.h |  23 ++++++
 4 files changed, 245 insertions(+)

diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 58b5a90..0a03420 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -198,3 +198,58 @@  ENTRY(__dma_unmap_area)
 	b.ne	__dma_inv_area
 	ret
 ENDPIPROC(__dma_unmap_area)
+
+/*
+ *      flush_dcache_all(), Flush the whole D-cache.
+ *
+ *      Walks every cache level reported by clidr_el1 below the Level of
+ *      Coherency and, for each level that holds a data or unified cache,
+ *      cleans and invalidates every set/way with "dc cisw".
+ *
+ *      Returns 0 in x0.  Uses x0-x7 and x9-x12; the return address is
+ *      kept in x12 for the duration of the walk.
+ */
+ENTRY(flush_dcache_all)
+	mov     x12, lr
+	dmb     sy                              // ensure ordering with previous memory accesses
+	mrs     x0, clidr_el1                   // read clidr
+	and     x3, x0, #0x7000000              // extract loc from clidr
+	lsr     x3, x3, #23                     // x3 = loc << 1, same encoding as the level in x10
+	cbz     x3, finished                    // if loc is 0, then no need to clean
+	mov     x10, #0                         // start clean at cache level 0
+loop1:
+	add     x2, x10, x10, lsr #1            // work out 3x current cache level
+	lsr     x1, x0, x2                      // extract cache type bits from clidr
+	and     x1, x1, #7                      // mask of the bits for current cache only
+	cmp     x1, #2                          // see what cache we have at this level
+	b.lt    skip                            // skip if no cache, or just i-cache
+	mrs	x9, daif                        // save interrupt mask state
+	disable_irq                             // keep the csselr write / ccsidr read pair together
+	msr     csselr_el1, x10                 // select current cache level in csselr
+	isb                                     // isb to sync the new csselr & ccsidr
+	mrs     x1, ccsidr_el1                  // read the new ccsidr
+	msr	daif, x9                        // restore interrupt mask state
+	and     x2, x1, #7                      // extract the length of the cache lines
+	add     x2, x2, #4                      // add 4 (line length offset)
+	mov     x4, #0x3ff
+	and     x4, x4, x1, lsr #3              // find maximum number on the way size
+	clz     w5, w4                          // find bit position of way size increment
+	mov     x7, #0x7fff
+	and     x7, x7, x1, lsr #13             // extract max number of the index size
+loop2:
+	mov     x9, x4                          // create working copy of max way size
+loop3:
+	lsl     x6, x9, x5
+	orr     x11, x10, x6                    // factor way and cache number into x11
+	lsl     x6, x7, x2
+	orr     x11, x11, x6                    // factor index number into x11
+	dc      cisw, x11                       // clean & invalidate by set/way
+	subs    x9, x9, #1                      // decrement the way
+	b.ge    loop3
+	subs    x7, x7, #1                      // decrement the index
+	b.ge    loop2
+skip:
+	add     x10, x10, #2                    // increment cache number
+	cmp     x3, x10
+	b.gt    loop1
+finished:
+	mov     x10, #0                         // switch back to cache level 0
+	msr     csselr_el1, x10                 // select current cache level in csselr
+	dsb     sy
+	isb
+	mov     x0, #0
+	ret     x12
+ENDPROC(flush_dcache_all)
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 227a1eb..7aec44f 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -1882,3 +1882,163 @@  int __init its_init(struct fwnode_handle *handle, struct rdists *rdists,
 
 	return 0;
 }
+#include <linux/debugfs.h>
+
+/*
+ * Backing storage for the single LPI test context.  It is only reached
+ * through the exported 'lpitest' pointer below, so it stays file-local.
+ */
+static struct lpitest_cntx lpitest1 = {
+	.wq = __WAIT_QUEUE_HEAD_INITIALIZER(lpitest1.wq),
+};
+
+/* Shared with gic_handle_irq(), which timestamps and completes the test. */
+struct lpitest_cntx *lpitest = &lpitest1;
+
+/* Fake ITS device used to inject test LPIs; NULL until its_lpitest_init(). */
+static struct its_device *lpi_its_dev;
+
+/*
+ * Command builder for the test's INT command (opcode 0x03).
+ *
+ * Encodes the device ID and event ID carried in @desc->its_inv_cmd into
+ * @cmd and returns the collection that dev_event_to_col() reports for
+ * that device/event pair.
+ */
+static struct its_collection *its_build_int_cmd(struct its_cmd_block *cmd,
+						struct its_cmd_desc *desc)
+{
+	struct its_device *dev = desc->its_inv_cmd.dev;
+	u32 event = desc->its_inv_cmd.event_id;
+
+	its_encode_cmd(cmd, 0x03);
+	its_encode_devid(cmd, dev->device_id);
+	its_encode_event_id(cmd, event);
+
+	its_fixup_cmd(cmd);
+
+	return dev_event_to_col(dev, event);
+}
+
+/*
+ * Queue one INT command for (@dev, @event_id) on the ITS that owns @dev.
+ * The descriptor reuses the its_inv_cmd layout, which carries the same
+ * device/event pair that its_build_int_cmd() reads back.
+ */
+static void its_send_int(struct its_device *dev, u32 event_id)
+{
+	struct its_cmd_desc req;
+
+	req.its_inv_cmd.event_id = event_id;
+	req.its_inv_cmd.dev = dev;
+
+	its_send_single_command(dev->its, its_build_int_cmd, &req);
+}
+
+/*
+ * debugfs write handler: run the LPI delivery benchmark.
+ *
+ * The user writes a decimal iteration count.  For each iteration the
+ * cycle counter is sampled, an INT command is sent for the event that
+ * matches the current CPU number, and the handler waits until
+ * gic_handle_irq() has recorded lpitest->end_cycles and cleared
+ * lpitest->done.  Total, average and minimum cycle counts are logged.
+ *
+ * Returns @count on success, -EINVAL for a zero iteration count, -ENODEV
+ * if the test device was never created, the kstrtoul_from_user() error
+ * for unparsable input, or the wait_event_interruptible() error if the
+ * wait was interrupted.
+ */
+static ssize_t lpitest_write(struct file *file, const char __user *buffer,
+		size_t count, loff_t *pos)
+{
+	unsigned long val, iter;
+	u64 cycles, dcycles, mcycles = ~0;
+	int cpu, ret;
+
+	ret = kstrtoul_from_user(buffer, count, 10, &val);
+	if (ret)
+		return ret;
+	/* Returning 0 from write() would make the caller retry forever. */
+	if (!val)
+		return -EINVAL;
+	if (!lpi_its_dev)
+		return -ENODEV;
+
+	/*
+	 * Stay on one CPU so the cycle counter read here and the one read
+	 * in the interrupt handler come from the same PMU.
+	 *
+	 * NOTE(review): wait_event_interruptible() may sleep, which is not
+	 * allowed with preemption disabled.  It only works when the LPI has
+	 * already been handled by the time the condition is first checked.
+	 */
+	preempt_disable();
+
+	flush_dcache_all();
+
+	cpu = smp_processor_id();
+	/* NOTE(review): the CPU number doubles as the event ID; confirm it
+	 * is always below the number of events mapped at init time. */
+	lpitest->irqnr = lpi_its_dev->event_map.lpi_base + cpu;
+	lpitest->total_cycles = 0;
+
+	for (iter = 0; iter < val; iter++) {
+		cycles = pmu_read_cycles();
+		lpitest->done = 1;
+		its_send_int(lpi_its_dev, cpu);
+		ret = wait_event_interruptible(lpitest->wq, !lpitest->done);
+		if (ret)
+			break;	/* end_cycles is stale: drop this sample */
+		dcycles = lpitest->end_cycles - cycles;
+		lpitest->total_cycles += dcycles;
+		if (mcycles > dcycles)
+			mcycles = dcycles;
+	}
+	preempt_enable();
+
+	if (ret)
+		return ret;
+
+	*pos += count;
+
+	pr_info("CPU[%d] niter=%lu cycles=0x%lx avg=0x%lx min=0x%lx\n", cpu,
+		val,
+		(unsigned long)lpitest->total_cycles,
+		(unsigned long)lpitest->total_cycles / val,
+		(unsigned long)mcycles);
+
+	return count;
+}
+
+/*
+ * File operations for the write-only "lpitest" debugfs entry.
+ *
+ * The file has nothing to read, so no seq_file is set up: the previous
+ * single_open() with a NULL show callback combined with seq_read would
+ * call a NULL function pointer on the first read, and the missing
+ * release hook leaked the seq_file on every open.
+ */
+static const struct file_operations lpitest_fops = {
+	.owner = THIS_MODULE,
+	.write = lpitest_write,
+};
+
+/*
+ * Program the PMU system registers of the calling CPU so that
+ * pmccntr_el0 can be used by pmu_read_cycles().  Invoked on every CPU
+ * through on_each_cpu() from its_lpitest_init().
+ *
+ * @discard: unused callback argument.
+ *
+ * Register updates performed, in order:
+ *   pmcr_el0:       set bits 0, 2 and 6; clear bit 3
+ *   pmselr_el0:     write 0b11111
+ *   pmxevtyper_el0: set bit 27; clear bits 31:30 and 29:28
+ *   pmcntenset_el0: set bit 31
+ *
+ * NOTE(review): presumably these are PMCR_EL0.{E,C,LC} set / D cleared,
+ * selection of the cycle counter filter register, its EL filter bits,
+ * and the cycle counter enable bit -- confirm against the ARMv8-A PMU
+ * register descriptions.
+ */
+static void pmu_enable_cycle_counter(void *discard)
+{
+	u64 tmp;
+
+	asm volatile("mrs %0, pmcr_el0\n"
+		"orr %0, %0, #(1 << 0)\n"
+		"orr %0, %0, #(1 << 2)\n"
+		"bic %0, %0, #(1 << 3)\n"
+		"orr %0, %0, #(1 << 6)\n"
+		"msr pmcr_el0, %0\n"
+		"mov %0, #0b11111\n"
+		"msr pmselr_el0, %0\n"
+		"isb \n"
+		"mrs %0, pmxevtyper_el0\n"
+		"orr %0, %0, #(1 << 27)\n"
+		"bic %0, %0, #(3 << 30)\n"
+		"bic %0, %0, #(3 << 28)\n"
+		"msr pmxevtyper_el0, %0\n"
+		"mrs %0, pmcntenset_el0\n"
+		"orr %0, %0, #(1 << 31)\n"
+		"msr pmcntenset_el0, %0\n"
+		: "=r" (tmp));
+}
+
+/*
+ * Late-init setup for the LPI latency test.
+ *
+ * Creates the "lpitest" debugfs file, creates a fake ITS device
+ * (device ID 0xFFFF) with 64 events on the first ITS in its_nodes, maps
+ * event i to LPI (lpi_base + i) with collection index i, enables each
+ * LPI in the property table, and enables the PMU cycle counter on every
+ * CPU.
+ *
+ * Returns 0 on success or when no ITS is present, -ENOMEM if the
+ * debugfs file or the ITS device cannot be created.
+ */
+static int __init its_lpitest_init(void)
+{
+	struct its_node *its;
+	struct dentry *dentry;
+	irq_hw_number_t hwirq;
+	int i, nvec = 64;
+	u8 *cfg;
+
+	if (list_empty(&its_nodes))
+		return 0;
+	its = list_first_entry(&its_nodes, struct its_node, entry);
+
+	/* Owner-write only: a write flushes caches and injects interrupts. */
+	dentry = debugfs_create_file("lpitest", 0200, NULL, NULL, &lpitest_fops);
+	if (!dentry) {
+		pr_err("failed to create debugfs for its-lpitest\n");
+		return -ENOMEM;
+	}
+
+	lpi_its_dev = its_create_device(its, 0xFFFF, nvec);
+	if (!lpi_its_dev) {
+		pr_err("failed to create its device for lpitest\n");
+		/* Do not leave a file behind that has no device to drive. */
+		debugfs_remove(dentry);
+		return -ENOMEM;
+	}
+
+	hwirq = lpi_its_dev->event_map.lpi_base;
+	/* One config byte per LPI; the table starts at LPI number 8192. */
+	cfg = page_address(gic_rdists->prop_page) + hwirq - 8192;
+
+	/*
+	 * NOTE(review): event i is routed to collection index i, which
+	 * assumes at least nvec usable collections -- confirm on systems
+	 * with fewer than 64 CPUs.
+	 */
+	for (i = 0; i < nvec; i++) {
+		lpi_its_dev->event_map.col_map[i] = i;
+		its_send_mapvi(lpi_its_dev, hwirq + i, i);
+		*cfg |= LPI_PROP_ENABLED;
+		/* Make the property update visible before the INV command. */
+		dsb(ishst);
+		its_send_inv(lpi_its_dev, i);
+		cfg++;
+	}
+
+	on_each_cpu(pmu_enable_cycle_counter, NULL, 1);
+
+	pr_info("lpitest successfully initialized lpi_base=%u\n", (u32)hwirq);
+
+	return 0;
+}
+late_initcall(its_lpitest_init);
diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index 19d642e..ab44a0f 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -355,6 +355,13 @@  static asmlinkage void __exception_irq_entry gic_handle_irq(struct pt_regs *regs
 			if (static_key_true(&supports_deactivate))
 				gic_write_eoir(irqnr);
 
+			if (irqnr == lpitest->irqnr) {
+				lpitest->end_cycles = pmu_read_cycles();
+				lpitest->done = 0;
+				wake_up_interruptible(&lpitest->wq);
+				continue;
+			}
+
 			err = handle_domain_irq(gic_data.domain, irqnr, regs);
 			if (err) {
 				WARN_ONCE(true, "Unexpected interrupt received!\n");
diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index b7e3431..986b7f4 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -450,6 +450,29 @@  static inline bool gic_enable_sre(void)
 	return !!(val & ICC_SRE_EL1_SRE);
 }
 
+#include <linux/wait.h>
+
+extern void flush_dcache_all(void);
+
+static __always_inline volatile u64 pmu_read_cycles(void)
+{
+        u64 cycles;
+
+        asm volatile("mrs %0, pmccntr_el0\n"
+                     "isb \n\t": [reg] "=r" (cycles));
+        return cycles;
+}
+
+/*
+ * State shared between the lpitest debugfs writer and gic_handle_irq()
+ * for the LPI delivery latency test.
+ */
+struct lpitest_cntx {
+	u64 total_cycles;	/* sum of per-iteration cycle deltas */
+	u64 end_cycles;		/* cycle counter sampled in gic_handle_irq() */
+	u32 irqnr;		/* interrupt number gic_handle_irq() matches on */
+	u32 done;		/* 1 while a test LPI is outstanding, 0 once handled */
+	wait_queue_head_t wq;	/* writer waits here until 'done' is cleared */
+};
+
+extern struct lpitest_cntx *lpitest;
+
 #endif
 
 #endif
-- 
Qualcomm Datacenter Technologies, Inc. on behalf of the Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.