From 8a60136b1b805746451b5ff4d82eaa251bfd8c6c Mon Sep 17 00:00:00 2001
From: Shanker Donthineni <shankerd@codeaurora.org>
Date: Mon, 7 Nov 2016 23:08:28 -0600
Subject: [PATCH 2/2] irqchip/gicv3-its: Test code for measuring Read-allocate
hints performance
Apply this patch on top of the v4.9-rc4 kernel and perform the two steps
below to measure the performance improvement with Read-allocate hints.
mount -t debugfs none /sys/kernel/debug
echo 10 > /sys/kernel/debug/lpitest
The test reports the CPU cycles spent delivering the LPI events.
Example:
[ 93.139710] CPU[1] niter=10 cycles=0xdd8c avg=0x1627 min=0x13a3
Signed-off-by: Shanker Donthineni <shankerd@codeaurora.org>
---
arch/arm64/mm/cache.S | 55 +++++++++++++
drivers/irqchip/irq-gic-v3-its.c | 160 +++++++++++++++++++++++++++++++++++++
drivers/irqchip/irq-gic-v3.c | 7 ++
include/linux/irqchip/arm-gic-v3.h | 23 ++++++
4 files changed, 245 insertions(+)
@@ -198,3 +198,58 @@ ENTRY(__dma_unmap_area)
b.ne __dma_inv_area
ret
ENDPIPROC(__dma_unmap_area)
+
+/*
+ * flush_dcache_all() - clean and invalidate the whole D-cache by set/way.
+ *
+ * Reads CLIDR_EL1 and, for every cache level below the LoC value that is
+ * not absent or instruction-only, selects the level through CSSELR_EL1,
+ * reads its geometry from CCSIDR_EL1 and issues "dc cisw" for each
+ * way/index combination of that level.
+ *
+ * The return address is kept in x12 and x0 is 0 on return. x0-x7 and
+ * x9-x12 are used as scratch and the condition flags are modified.
+ */
+ENTRY(flush_dcache_all)
+ mov x12, lr // keep the return address for the final "ret x12"
+ dmb sy // ensure ordering with previous memory accesses
+ mrs x0, clidr_el1 // read clidr
+ and x3, x0, #0x7000000 // extract loc from clidr
+ lsr x3, x3, #23 // x3 = loc * 2, same scale as the level counter in x10
+ cbz x3, finished // if loc is 0, then no need to clean
+ mov x10, #0 // start clean at cache level 0
+loop1:
+ add x2, x10, x10, lsr #1 // work out 3x current cache level
+ lsr x1, x0, x2 // extract cache type bits from clidr
+ and x1, x1, #7 // mask of the bits for current cache only
+ cmp x1, #2 // see what cache we have at this level
+ b.lt skip // skip if no cache, or just i-cache
+ mrs x9, daif // save the interrupt mask state
+ disable_irq // keep the csselr write and ccsidr read together
+ msr csselr_el1, x10 // select current cache level in csselr
+ isb // isb to sync the new csselr & ccsidr
+ mrs x1, ccsidr_el1 // read the new ccsidr
+ msr daif, x9 // restore the saved interrupt mask state
+ and x2, x1, #7 // extract the length of the cache lines
+ add x2, x2, #4 // add 4 (line length offset)
+ mov x4, #0x3ff
+ and x4, x4, x1, lsr #3 // find maximum number on the way size
+ clz w5, w4 // find bit position of way size increment
+ mov x7, #0x7fff
+ and x7, x7, x1, lsr #13 // extract max number of the index size
+loop2:
+ mov x9, x4 // create working copy of max way size
+loop3:
+ lsl x6, x9, x5
+ orr x11, x10, x6 // factor way and cache number into x11
+ lsl x6, x7, x2
+ orr x11, x11, x6 // factor index number into x11
+ dc cisw, x11 // clean & invalidate by set/way
+ subs x9, x9, #1 // decrement the way
+ b.ge loop3
+ subs x7, x7, #1 // decrement the index
+ b.ge loop2
+skip:
+ add x10, x10, #2 // increment cache number
+ cmp x3, x10
+ b.gt loop1
+finished:
+ mov x10, #0 // switch back to cache level 0
+ msr csselr_el1, x10 // select current cache level in csselr
+ dsb sy
+ isb
+ mov x0, #0 // x0 = 0 on return (the C prototype is void)
+ ret x12
+ENDPROC(flush_dcache_all)
@@ -1882,3 +1882,163 @@ int __init its_init(struct fwnode_handle *handle, struct rdists *rdists,
return 0;
}
+#include <linux/debugfs.h>
+
+/*
+ * Backing storage for the LPI test context. It is only reached through the
+ * exported 'lpitest' pointer, so the object itself stays file-local.
+ */
+static struct lpitest_cntx lpitest1 = {
+	.wq = __WAIT_QUEUE_HEAD_INITIALIZER(lpitest1.wq),
+};
+
+/* Test context shared with the GIC interrupt entry code. */
+struct lpitest_cntx *lpitest = &lpitest1;
+
+/* ITS device created by its_lpitest_init(); NULL until that succeeds. */
+static struct its_device *lpi_its_dev;
+
+/*
+ * Fill @cmd with an ITS command of opcode 0x03 for the device/event pair
+ * carried in @desc. The pair is passed through the its_inv_cmd member of
+ * the descriptor. Returns the collection that dev_event_to_col() reports
+ * for the event.
+ */
+static struct its_collection *its_build_int_cmd(struct its_cmd_block *cmd,
+						 struct its_cmd_desc *desc)
+{
+	struct its_device *dev = desc->its_inv_cmd.dev;
+	u32 event = desc->its_inv_cmd.event_id;
+	struct its_collection *target = dev_event_to_col(dev, event);
+
+	/* NOTE(review): 0x03 is presumably the ITS INT opcode - confirm. */
+	its_encode_cmd(cmd, 0x03);
+	its_encode_devid(cmd, dev->device_id);
+	its_encode_event_id(cmd, event);
+
+	its_fixup_cmd(cmd);
+
+	return target;
+}
+
+/*
+ * Queue the command built by its_build_int_cmd() for event @event_id of
+ * @dev on the ITS that owns the device.
+ */
+static void its_send_int(struct its_device *dev, u32 event_id)
+{
+	struct its_cmd_desc cmd_desc;
+
+	/* The builder reads the device/event pair from its_inv_cmd. */
+	cmd_desc.its_inv_cmd.event_id = event_id;
+	cmd_desc.its_inv_cmd.dev = dev;
+
+	its_send_single_command(dev->its, its_build_int_cmd, &cmd_desc);
+}
+
+/*
+ * lpitest_write() - debugfs write handler that runs the LPI delivery test.
+ *
+ * Parses a decimal iteration count from @buffer, flushes the D-cache and
+ * then, for each iteration, reads the cycle counter, sends an LPI to the
+ * current CPU and waits until the interrupt entry code has recorded
+ * lpitest->end_cycles and cleared lpitest->done. The total, average and
+ * minimum cycle counts are printed with pr_info().
+ *
+ * Returns @count on success, -EINVAL for a zero iteration count, -ENODEV
+ * if the test ITS device was never created, or the error reported by
+ * kstrtoul_from_user() / wait_event_interruptible().
+ */
+static ssize_t lpitest_write(struct file *file, const char __user *buffer,
+			     size_t count, loff_t *pos)
+{
+	unsigned long val, lcnt;
+	u64 cycles, dcycles, mcycles = ~0ULL;
+	int cpu, ret;
+
+	ret = kstrtoul_from_user(buffer, count, 10, &val);
+	if (ret)
+		return ret;
+
+	/*
+	 * A zero count must be rejected with an error: returning 0 from
+	 * write() makes userspace retry forever, and the count is later
+	 * used as a divisor.
+	 */
+	if (!val)
+		return -EINVAL;
+
+	/* its_lpitest_init() may have failed after creating the file. */
+	if (!lpi_its_dev)
+		return -ENODEV;
+
+	/*
+	 * NOTE(review): wait_event_interruptible() below can sleep while
+	 * preemption is disabled; the CPU pinning scheme needs revisiting.
+	 * NOTE(review): the CPU number is used as the event id; confirm it
+	 * is always below the number of events mapped at init time.
+	 */
+	preempt_disable();
+
+	flush_dcache_all();
+
+	cpu = smp_processor_id();
+	lpitest->irqnr = lpi_its_dev->event_map.lpi_base + cpu;
+
+	lpitest->total_cycles = 0;
+	lcnt = val;
+	while (val) {
+		cycles = pmu_read_cycles();
+		lpitest->done = 1;
+		its_send_int(lpi_its_dev, cpu);
+		ret = wait_event_interruptible(lpitest->wq, !lpitest->done);
+		if (ret)
+			break;	/* signal: end_cycles is stale, drop the sample */
+		dcycles = lpitest->end_cycles - cycles;
+		lpitest->total_cycles += dcycles;
+		if (mcycles > dcycles)
+			mcycles = dcycles;
+		val--;
+	}
+	preempt_enable();
+
+	if (ret)
+		return ret;
+
+	*pos += count;
+
+	pr_info("CPU[%d] niter=%lu cycles=0x%lx avg=0x%lx min=0x%lx\n", cpu,
+		lcnt,
+		(unsigned long)lpitest->total_cycles,
+		(unsigned long)lpitest->total_cycles / lcnt,
+		(unsigned long)mcycles);
+
+	return count;
+}
+
+/*
+ * Open handler for the "lpitest" debugfs file. The file is write-only and
+ * keeps no per-open state, so there is nothing to set up. Setting up a
+ * seq_file with a NULL show callback (as single_open(file, NULL, NULL)
+ * does) would make any read call through a NULL pointer and, without a
+ * matching release handler, leak the seq_file on every open.
+ */
+static int lpitest_proc_open(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
+/* File operations for the "lpitest" debugfs file: writes start a test run. */
+static const struct file_operations lpitest_fops = {
+	.owner	= THIS_MODULE,
+	.open	= lpitest_proc_open,
+	.write	= lpitest_write,
+};
+
+/*
+ * Program the PMU registers of the calling CPU so that pmccntr_el0 can be
+ * read by pmu_read_cycles(). Invoked on every CPU through on_each_cpu(),
+ * hence the unused @discard argument.
+ *
+ * NOTE(review): the bit names given below follow the ARMv8 PMU register
+ * layout as commonly documented - confirm against the architecture manual.
+ */
+static void pmu_enable_cycle_counter(void *discard)
+{
+ u64 tmp;
+
+ /* pmcr_el0: set bits 0, 2 and 6, clear bit 3 (presumably E, C, LC and D) */
+ asm volatile("mrs %0, pmcr_el0\n"
+ "orr %0, %0, #(1 << 0)\n"
+ "orr %0, %0, #(1 << 2)\n"
+ "bic %0, %0, #(1 << 3)\n"
+ "orr %0, %0, #(1 << 6)\n"
+ "msr pmcr_el0, %0\n"
+ /* pmselr_el0 = 31, so pmxevtyper_el0 below refers to counter 31 */
+ "mov %0, #0b11111\n"
+ "msr pmselr_el0, %0\n"
+ "isb \n"
+ /* event type/filter: set bit 27, clear bits 31:30 and 29:28 */
+ "mrs %0, pmxevtyper_el0\n"
+ "orr %0, %0, #(1 << 27)\n"
+ "bic %0, %0, #(3 << 30)\n"
+ "bic %0, %0, #(3 << 28)\n"
+ "msr pmxevtyper_el0, %0\n"
+ /* pmcntenset_el0: set bit 31 (presumably the cycle counter enable) */
+ "mrs %0, pmcntenset_el0\n"
+ "orr %0, %0, #(1 << 31)\n"
+ "msr pmcntenset_el0, %0\n"
+ : "=r" (tmp));
+}
+
+/*
+ * its_lpitest_init() - set up the LPI delivery test.
+ *
+ * Creates the "lpitest" debugfs file, allocates an ITS device with device
+ * id 0xFFFF and 64 events on the first ITS node, maps event i to LPI
+ * lpi_base + i with collection index i, enables each LPI in the
+ * redistributor property table and finally enables the PMU cycle counter
+ * on every CPU.
+ *
+ * Returns 0 on success or when no ITS is present, -ENOMEM if the debugfs
+ * file or the ITS device cannot be created.
+ */
+static int __init its_lpitest_init(void)
+{
+	struct its_device *its_dev;
+	struct its_node *its;
+	struct dentry *dentry;
+	irq_hw_number_t hwirq;
+	int i, nvec = 64;
+	u8 *cfg;
+
+	if (list_empty(&its_nodes))
+		return 0;
+	its = list_first_entry(&its_nodes, struct its_node, entry);
+
+	/* Write-only for the owner: a write flushes caches and fires LPIs. */
+	dentry = debugfs_create_file("lpitest", 0200, NULL, NULL, &lpitest_fops);
+	if (!dentry) {
+		pr_err("failed to create debugfs for its-lpitest\n");
+		return -ENOMEM;
+	}
+
+	its_dev = its_create_device(its, 0xFFFF, nvec);
+	if (!its_dev) {
+		pr_err("failed to create its device for lpitest\n");
+		/* Do not leave a file behind that has no device to drive. */
+		debugfs_remove(dentry);
+		return -ENOMEM;
+	}
+
+	lpi_its_dev = its_dev;
+	hwirq = its_dev->event_map.lpi_base;
+	/* One configuration byte per LPI; the table starts at LPI 8192. */
+	cfg = page_address(gic_rdists->prop_page) + hwirq - 8192;
+
+	/*
+	 * NOTE(review): collection index i is assumed to be valid for all
+	 * 64 events - confirm on systems with fewer CPUs.
+	 */
+	for (i = 0; i < nvec; i++) {
+		lpi_its_dev->event_map.col_map[i] = i;
+		its_send_mapvi(its_dev, hwirq + i, i);
+		*cfg |= LPI_PROP_ENABLED;
+		/* Order the property update before the INV command. */
+		dsb(ishst);
+		its_send_inv(its_dev, i);
+		cfg++;
+	}
+
+	on_each_cpu(pmu_enable_cycle_counter, NULL, 1);
+
+	pr_info("lpitest successfully initialized lpi_base=%u\n", (u32)hwirq);
+
+	return 0;
+}
+late_initcall(its_lpitest_init);
@@ -355,6 +355,13 @@ static asmlinkage void __exception_irq_entry gic_handle_irq(struct pt_regs *regs
if (static_key_true(&supports_deactivate))
gic_write_eoir(irqnr);
+ if (irqnr == lpitest->irqnr) {
+ lpitest->end_cycles = pmu_read_cycles();
+ lpitest->done = 0;
+ wake_up_interruptible(&lpitest->wq);
+ continue;
+ }
+
err = handle_domain_irq(gic_data.domain, irqnr, regs);
if (err) {
WARN_ONCE(true, "Unexpected interrupt received!\n");
@@ -450,6 +450,29 @@ static inline bool gic_enable_sre(void)
return !!(val & ICC_SRE_EL1_SRE);
}
+#include <linux/wait.h>
+
+extern void flush_dcache_all(void);
+
+/*
+ * Read pmccntr_el0 and return its value. The read is followed by an isb.
+ * The return type carries no 'volatile': a qualifier on a function's
+ * return value is ignored by the compiler and only triggers a warning.
+ */
+static __always_inline u64 pmu_read_cycles(void)
+{
+	u64 cycles;
+
+	asm volatile("mrs %0, pmccntr_el0\n"
+		     "isb \n\t": [reg] "=r" (cycles));
+	return cycles;
+}
+
+/* State shared between the lpitest debugfs writer and the GIC IRQ entry. */
+struct lpitest_cntx {
+ u64 total_cycles; /* sum of the per-iteration cycle deltas of a run */
+ u64 end_cycles; /* cycle counter value sampled in the IRQ entry path */
+ u32 irqnr; /* interrupt number the IRQ entry path treats as the test LPI */
+ u32 done; /* set to 1 before the LPI is sent, cleared to 0 on delivery */
+ wait_queue_head_t wq; /* the writer waits here until done is cleared */
+};
+
+/* Test context instance; defined in irq-gic-v3-its.c. */
+extern struct lpitest_cntx *lpitest;
+
#endif
#endif
--
Qualcomm Datacenter Technologies, Inc. on behalf of the Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.