diff mbox

[2/5] ARM: Add Broadcom Brahma-B15 readahead cache support

Message ID 1425689693-31034-3-git-send-email-f.fainelli@gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Florian Fainelli March 7, 2015, 12:54 a.m. UTC
This patch adds support for the Broadcom Brahma-B15 CPU readahead cache
controller. This cache controller sits between the L2 and the memory bus
and its purpose is to provide a friendlier burst size towards the DDR
interface than the native cache line size.

The readahead cache is mostly transparent, except for
flush_kern_cache_all, flush_kern_cache_louis and flush_icache_all, which
are precisely what we are overriding here.

The readahead cache only intercepts reads, not writes; as such, some
data can remain stale in any of its buffers, so we need to flush it.
The flush needs to happen in a particular order:

- disable the readahead cache
- flush it
- call the appropriate cache-v7.S function
- re-enable

This patch tries to minimize the impact to the cache-v7.S file by only
providing a stub in case CONFIG_CACHE_B15_RAC is enabled (default for
ARCH_BRCMSTB since it is the current user).

Signed-off-by: Alamy Liu <alamyliu@broadcom.com>
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 arch/arm/include/asm/cacheflush.h             |   2 +-
 arch/arm/include/asm/glue-cache.h             |   4 +
 arch/arm/include/asm/hardware/cache-b15-rac.h |  12 ++
 arch/arm/mm/Kconfig                           |   8 ++
 arch/arm/mm/Makefile                          |   1 +
 arch/arm/mm/cache-b15-rac.c                   | 181 ++++++++++++++++++++++++++
 6 files changed, 207 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm/include/asm/hardware/cache-b15-rac.h
 create mode 100644 arch/arm/mm/cache-b15-rac.c

Comments

Russell King - ARM Linux March 16, 2015, 9:02 p.m. UTC | #1
On Fri, Mar 06, 2015 at 04:54:50PM -0800, Florian Fainelli wrote:
> This patch adds support for the Broadcom Brahma-B15 CPU readahead cache
> controller. This cache controller sits between the L2 and the memory bus
> and its purpose is to provide a friendler burst size towards the DDR
> interface than the native cache line size.
> 
> The readahead cache is mostly transparent, except for
> flush_kern_cache_all, flush_kern_cache_louis and flush_icache_all, which
> is precisely what we are overriding here.
> 
> The readahead cache only intercepts reads, not writes, as such, some
> data can remain stale in any of its buffers, such that we need to flush
> it, which is an operation that needs to happen in a particular order:
> 
> - disable the readahead cache
> - flush it
> - call the appropriate cache-v7.S function
> - re-enable
> 
> This patch tries to minimize the impact to the cache-v7.S file by only
> providing a stub in case CONFIG_CACHE_B15_RAC is enabled (default for
> ARCH_BRCMSTB since it is the current user).
> 
> Signed-off-by: Alamy Liu <alamyliu@broadcom.com>
> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
> ---
>  arch/arm/include/asm/cacheflush.h             |   2 +-
>  arch/arm/include/asm/glue-cache.h             |   4 +
>  arch/arm/include/asm/hardware/cache-b15-rac.h |  12 ++
>  arch/arm/mm/Kconfig                           |   8 ++
>  arch/arm/mm/Makefile                          |   1 +
>  arch/arm/mm/cache-b15-rac.c                   | 181 ++++++++++++++++++++++++++
>  6 files changed, 207 insertions(+), 1 deletion(-)
>  create mode 100644 arch/arm/include/asm/hardware/cache-b15-rac.h
>  create mode 100644 arch/arm/mm/cache-b15-rac.c
> 
> diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
> index 2d46862e7bef..4d847e185cf6 100644
> --- a/arch/arm/include/asm/cacheflush.h
> +++ b/arch/arm/include/asm/cacheflush.h
> @@ -199,7 +199,7 @@ extern void copy_to_user_page(struct vm_area_struct *, struct page *,
>   */
>  #if (defined(CONFIG_CPU_V7) && \
>       (defined(CONFIG_CPU_V6) || defined(CONFIG_CPU_V6K))) || \
> -	defined(CONFIG_SMP_ON_UP)
> +	defined(CONFIG_SMP_ON_UP) || defined(CONFIG_CACHE_B15_RAC)
>  #define __flush_icache_preferred	__cpuc_flush_icache_all
>  #elif __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP)
>  #define __flush_icache_preferred	__flush_icache_all_v7_smp
> diff --git a/arch/arm/include/asm/glue-cache.h b/arch/arm/include/asm/glue-cache.h
> index a3c24cd5b7c8..11f33b5f9284 100644
> --- a/arch/arm/include/asm/glue-cache.h
> +++ b/arch/arm/include/asm/glue-cache.h
> @@ -117,6 +117,10 @@
>  # endif
>  #endif
>  
> +#if defined(CONFIG_CACHE_B15_RAC)
> +# define MULTI_CACHE 1
> +#endif
> +
>  #if defined(CONFIG_CPU_V7M)
>  # ifdef _CACHE
>  #  define MULTI_CACHE 1
> diff --git a/arch/arm/include/asm/hardware/cache-b15-rac.h b/arch/arm/include/asm/hardware/cache-b15-rac.h
> new file mode 100644
> index 000000000000..76b888f53f90
> --- /dev/null
> +++ b/arch/arm/include/asm/hardware/cache-b15-rac.h
> @@ -0,0 +1,12 @@
> +#ifndef __ASM_ARM_HARDWARE_CACHE_B15_RAC_H
> +#define __ASM_ARM_HARDWARE_CACHE_B15_RAC_H
> +
> +#ifndef __ASSEMBLY__
> +
> +void b15_flush_kern_cache_all(void);
> +void b15_flush_kern_cache_louis(void);
> +void b15_flush_icache_all(void);
> +
> +#endif
> +
> +#endif
> diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
> index 9b4f29e595a4..4d5652a39304 100644
> --- a/arch/arm/mm/Kconfig
> +++ b/arch/arm/mm/Kconfig
> @@ -853,6 +853,14 @@ config OUTER_CACHE_SYNC
>  	  The outer cache has a outer_cache_fns.sync function pointer
>  	  that can be used to drain the write buffer of the outer cache.
>  
> +config CACHE_B15_RAC
> +	bool "Enable the Broadcom Brahma-B15 read-ahead cache controller"
> +	depends on ARCH_BRCMSTB
> +	default y
> +	help
> +	  This option enables the Broadcom Brahma-B15 read-ahead cache
> +	  controller. If disabled, the read-ahead cache remains off.
> +
>  config CACHE_FEROCEON_L2
>  	bool "Enable the Feroceon L2 cache controller"
>  	depends on ARCH_MV78XX0 || ARCH_MVEBU
> diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile
> index d3afdf9eb65a..a6797fdb6721 100644
> --- a/arch/arm/mm/Makefile
> +++ b/arch/arm/mm/Makefile
> @@ -96,6 +96,7 @@ AFLAGS_proc-v6.o	:=-Wa,-march=armv6
>  AFLAGS_proc-v7.o	:=-Wa,-march=armv7-a
>  
>  obj-$(CONFIG_OUTER_CACHE)	+= l2c-common.o
> +obj-$(CONFIG_CACHE_B15_RAC)	+= cache-b15-rac.o
>  obj-$(CONFIG_CACHE_FEROCEON_L2)	+= cache-feroceon-l2.o
>  obj-$(CONFIG_CACHE_L2X0)	+= cache-l2x0.o l2c-l2x0-resume.o
>  obj-$(CONFIG_CACHE_XSC3L2)	+= cache-xsc3l2.o
> diff --git a/arch/arm/mm/cache-b15-rac.c b/arch/arm/mm/cache-b15-rac.c
> new file mode 100644
> index 000000000000..1c5bca6e906b
> --- /dev/null
> +++ b/arch/arm/mm/cache-b15-rac.c
> @@ -0,0 +1,181 @@
> +/*
> + * Broadcom Brahma-B15 CPU read-ahead cache management functions
> + *
> + * Copyright (C) 2015, Broadcom Corporation
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/err.h>
> +#include <linux/spinlock.h>
> +#include <linux/io.h>
> +#include <linux/bitops.h>
> +#include <linux/of_address.h>
> +
> +#include <asm/cacheflush.h>
> +#include <asm/hardware/cache-b15-rac.h>
> +
> +extern void v7_flush_kern_cache_all(void);
> +extern void v7_flush_kern_cache_louis(void);
> +extern void v7_flush_icache_all(void);
> +
> +/* RAC register offsets, relative to the HIF_CPU_BIUCTRL register base */
> +#define RAC_CONFIG0_REG			(0x78)
> +#define  RACENPREF_MASK			(0x3)
> +#define  RACPREFINST_SHIFT		(0)
> +#define  RACENINST_SHIFT		(2)
> +#define  RACPREFDATA_SHIFT		(4)
> +#define  RACENDATA_SHIFT		(6)
> +#define  RAC_CPU_SHIFT			(8)
> +#define  RACCFG_MASK			(0xff)
> +#define RAC_CONFIG1_REG			(0x7c)
> +#define RAC_FLUSH_REG			(0x80)
> +#define  FLUSH_RAC			(1 << 0)

					BIT(0) ?

> +
> +/* Bitmask to enable instruction and data prefetching with a 256-bytes stride */
> +#define RAC_DATA_INST_EN_MASK		(1 << RACPREFINST_SHIFT | \
> +					 RACENPREF_MASK << RACENINST_SHIFT | \
> +					 1 << RACPREFDATA_SHIFT | \
> +					 RACENPREF_MASK << RACENDATA_SHIFT)
> +
> +#define RAC_ENABLED			(1 << 0)

					BIT(0) ?

However, you don't use RAC_ENABLED as a bitmask, but a bit index, so
shouldn't this be zero?

> +
> +static void __iomem *b15_rac_base;
> +static DEFINE_SPINLOCK(rac_lock);
> +
> +/* Initialization flag to avoid checking for b15_rac_base, and to prevent
> + * multi-platform kernels from crashing here as well.
> + */
> +static unsigned long b15_rac_flags;
> +
> +static inline u32 __b15_rac_disable(void)
> +{
> +	u32 val = __raw_readl(b15_rac_base + RAC_CONFIG0_REG);
> +	__raw_writel(0, b15_rac_base + RAC_CONFIG0_REG);
> +	dmb();
> +	return val;
> +}
> +
> +static inline void __b15_rac_flush(void)
> +{
> +	u32 reg;
> +
> +	__raw_writel(FLUSH_RAC, b15_rac_base + RAC_FLUSH_REG);
> +	do {
> +		/* This dmb() is required to force the Bus Interface Unit
> +		 * to clean oustanding writes, and forces an idle cycle
> +		 * to be inserted.
> +		 */
> +		dmb();
> +		reg = __raw_readl(b15_rac_base + RAC_FLUSH_REG);
> +	} while (reg & RAC_FLUSH_REG);
> +}
> +
> +static inline u32 b15_rac_disable_and_flush(void)
> +{
> +	u32 reg;
> +
> +	reg = __b15_rac_disable();
> +	__b15_rac_flush();
> +	return reg;
> +}
> +
> +static inline void __b15_rac_enable(u32 val)
> +{
> +	__raw_writel(val, b15_rac_base + RAC_CONFIG0_REG);
> +	/* dsb() is required here to be consistent with __flush_icache_all() */
> +	dsb();
> +}
> +
> +#define BUILD_RAC_CACHE_OP(name, bar)				\
> +void b15_flush_##name(void)					\
> +{								\
> +	unsigned int do_flush;					\
> +	u32 val = 0;						\
> +								\
> +	spin_lock(&rac_lock);					\
> +	do_flush = test_bit(RAC_ENABLED, &b15_rac_flags);	\

Do you need to use test_bit() here?  You set and test this location
under a spinlock, so it's safe to use non-atomic ops here.

> +static void b15_rac_enable(void)
> +{
> +	unsigned int cpu;
> +	u32 enable = 0;
> +
> +	for_each_possible_cpu(cpu)
> +		enable |= (RAC_DATA_INST_EN_MASK << (cpu * RAC_CPU_SHIFT));

		enable |= RAC_DATA_INST_EN_MASK << (cpu * RAC_CPU_SHIFT);

You don't need the additional parens - the right hand side of |= is
already expected to be an expression by the compiler.

> +	spin_lock(&rac_lock);
> +	reg = __raw_readl(b15_rac_base + RAC_CONFIG0_REG);
> +	for_each_possible_cpu(cpu)
> +		en_mask |= ((1 << RACPREFDATA_SHIFT) << (cpu * RAC_CPU_SHIFT));

		en_mask |= 1 << (RACPREFDATA_SHIFT + cpu * RAC_CPU_SHIFT);

looks nicer, rather than having two shifts.

What happens when the system goes down (eg, for kexec?)  Does the RAC
need to be disabled for that?
Florian Fainelli March 16, 2015, 9:20 p.m. UTC | #2
On 16/03/15 14:02, Russell King - ARM Linux wrote:
> On Fri, Mar 06, 2015 at 04:54:50PM -0800, Florian Fainelli wrote:
>> This patch adds support for the Broadcom Brahma-B15 CPU readahead cache
>> controller. This cache controller sits between the L2 and the memory bus
>> and its purpose is to provide a friendler burst size towards the DDR
>> interface than the native cache line size.
>>
>> The readahead cache is mostly transparent, except for
>> flush_kern_cache_all, flush_kern_cache_louis and flush_icache_all, which
>> is precisely what we are overriding here.
>>
>> The readahead cache only intercepts reads, not writes, as such, some
>> data can remain stale in any of its buffers, such that we need to flush
>> it, which is an operation that needs to happen in a particular order:
>>
>> - disable the readahead cache
>> - flush it
>> - call the appropriate cache-v7.S function
>> - re-enable
>>
>> This patch tries to minimize the impact to the cache-v7.S file by only
>> providing a stub in case CONFIG_CACHE_B15_RAC is enabled (default for
>> ARCH_BRCMSTB since it is the current user).
>>
>> Signed-off-by: Alamy Liu <alamyliu@broadcom.com>
>> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
>> ---

[snip]

>> +/* Bitmask to enable instruction and data prefetching with a 256-bytes stride */
>> +#define RAC_DATA_INST_EN_MASK		(1 << RACPREFINST_SHIFT | \
>> +					 RACENPREF_MASK << RACENINST_SHIFT | \
>> +					 1 << RACPREFDATA_SHIFT | \
>> +					 RACENPREF_MASK << RACENDATA_SHIFT)
>> +
>> +#define RAC_ENABLED			(1 << 0)
> 
> 					BIT(0) ?
> 
> However, you don't use RAC_ENABLED as a bitmask, but a bit index, so
> shouldn't this be zero?

In subsequent patches we have a need for distinguishing RAC_ENABLED from
RAC_SUSPENDED, so that's the primary reason for using it as a bitmask
(could make that clear somewhere).

[snip]

>> +#define BUILD_RAC_CACHE_OP(name, bar)				\
>> +void b15_flush_##name(void)					\
>> +{								\
>> +	unsigned int do_flush;					\
>> +	u32 val = 0;						\
>> +								\
>> +	spin_lock(&rac_lock);					\
>> +	do_flush = test_bit(RAC_ENABLED, &b15_rac_flags);	\
> 
> Do you need to use test_bit() here?  You set and test this location
> under a spinlock, so it's safe to use non-atomic ops here.

Right, we don't need the test_bit, it just felt a little nicer.

> 
>> +static void b15_rac_enable(void)
>> +{
>> +	unsigned int cpu;
>> +	u32 enable = 0;
>> +
>> +	for_each_possible_cpu(cpu)
>> +		enable |= (RAC_DATA_INST_EN_MASK << (cpu * RAC_CPU_SHIFT));
> 
> 		enable |= RAC_DATA_INST_EN_MASK << (cpu * RAC_CPU_SHIFT);
> 
> You don't need the additional parens - the right hand side of |= is
> already expected to be an expression by the compiler.
> 
>> +	spin_lock(&rac_lock);
>> +	reg = __raw_readl(b15_rac_base + RAC_CONFIG0_REG);
>> +	for_each_possible_cpu(cpu)
>> +		en_mask |= ((1 << RACPREFDATA_SHIFT) << (cpu * RAC_CPU_SHIFT));
> 
> 		en_mask |= 1 << (RACPREFDATA_SHIFT + cpu * RAC_CPU_SHIFT);
> 
> looks nicer, rather than having two shifts.

Indeed, thanks.

> 
> What happens when the system goes down (eg, for kexec?)  Does the RAC
> need to be disabled for that?

Per boot convention, I would say so, yes, since this is another level of
instruction and data cache, we should turn it off. Can we register some
sort of notifier specifically for kexec?

Thanks!
Russell King - ARM Linux March 17, 2015, 12:10 a.m. UTC | #3
On Mon, Mar 16, 2015 at 02:20:53PM -0700, Florian Fainelli wrote:
> On 16/03/15 14:02, Russell King - ARM Linux wrote:
> > On Fri, Mar 06, 2015 at 04:54:50PM -0800, Florian Fainelli wrote:
> >> This patch adds support for the Broadcom Brahma-B15 CPU readahead cache
> >> controller. This cache controller sits between the L2 and the memory bus
> >> and its purpose is to provide a friendler burst size towards the DDR
> >> interface than the native cache line size.
> >>
> >> The readahead cache is mostly transparent, except for
> >> flush_kern_cache_all, flush_kern_cache_louis and flush_icache_all, which
> >> is precisely what we are overriding here.
> >>
> >> The readahead cache only intercepts reads, not writes, as such, some
> >> data can remain stale in any of its buffers, such that we need to flush
> >> it, which is an operation that needs to happen in a particular order:
> >>
> >> - disable the readahead cache
> >> - flush it
> >> - call the appropriate cache-v7.S function
> >> - re-enable
> >>
> >> This patch tries to minimize the impact to the cache-v7.S file by only
> >> providing a stub in case CONFIG_CACHE_B15_RAC is enabled (default for
> >> ARCH_BRCMSTB since it is the current user).
> >>
> >> Signed-off-by: Alamy Liu <alamyliu@broadcom.com>
> >> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
> >> ---
> 
> [snip]
> 
> >> +/* Bitmask to enable instruction and data prefetching with a 256-bytes stride */
> >> +#define RAC_DATA_INST_EN_MASK		(1 << RACPREFINST_SHIFT | \
> >> +					 RACENPREF_MASK << RACENINST_SHIFT | \
> >> +					 1 << RACPREFDATA_SHIFT | \
> >> +					 RACENPREF_MASK << RACENDATA_SHIFT)
> >> +
> >> +#define RAC_ENABLED			(1 << 0)
> > 
> > 					BIT(0) ?
> > 
> > However, you don't use RAC_ENABLED as a bitmask, but a bit index, so
> > shouldn't this be zero?
> 
> In subsequent patches we have a need for distinguishing RAC_ENABLED from
> RAC_SUSPENDED, so that's the primary reason for using it as a bitmask
> (could make that clear somewhere).

However, test_bit() etc take a bit _number_ not a bit _mask_.  So:

Passing in 1 << 0 will test bit 1 rather than bit 0.
Passing in 1 << 1 will test bit 2 rather than bit 1.
Passing in 1 << 2 will test bit 4 rather than bit 2.
Passing in 1 << 3 will test bit 8 rather than bit 3.
etc.

This is not what you wanted.  Either use a mask directly, or use
test_bit() with a bit number etc.  Don't try and do both together. :)

> > What happens when the system goes down (eg, for kexec?)  Does the RAC
> > need to be disabled for that?
> 
> Per boot convention, I would say so, yes, since this is another level of
> instruction and data cache, we should turn it off. Can we register some
> sort of notifier specifically for kexec?

The code at present doesn't expect there to be platform specific caches,
so that probably isn't catered for yet.  I mentioned the point to raise
the issue that there's an oversight here.
Florian Fainelli March 17, 2015, 12:32 a.m. UTC | #4
On 16/03/15 17:10, Russell King - ARM Linux wrote:
> On Mon, Mar 16, 2015 at 02:20:53PM -0700, Florian Fainelli wrote:
>> On 16/03/15 14:02, Russell King - ARM Linux wrote:
>>> On Fri, Mar 06, 2015 at 04:54:50PM -0800, Florian Fainelli wrote:
>>>> This patch adds support for the Broadcom Brahma-B15 CPU readahead cache
>>>> controller. This cache controller sits between the L2 and the memory bus
>>>> and its purpose is to provide a friendler burst size towards the DDR
>>>> interface than the native cache line size.
>>>>
>>>> The readahead cache is mostly transparent, except for
>>>> flush_kern_cache_all, flush_kern_cache_louis and flush_icache_all, which
>>>> is precisely what we are overriding here.
>>>>
>>>> The readahead cache only intercepts reads, not writes, as such, some
>>>> data can remain stale in any of its buffers, such that we need to flush
>>>> it, which is an operation that needs to happen in a particular order:
>>>>
>>>> - disable the readahead cache
>>>> - flush it
>>>> - call the appropriate cache-v7.S function
>>>> - re-enable
>>>>
>>>> This patch tries to minimize the impact to the cache-v7.S file by only
>>>> providing a stub in case CONFIG_CACHE_B15_RAC is enabled (default for
>>>> ARCH_BRCMSTB since it is the current user).
>>>>
>>>> Signed-off-by: Alamy Liu <alamyliu@broadcom.com>
>>>> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
>>>> ---
>>
>> [snip]
>>
>>>> +/* Bitmask to enable instruction and data prefetching with a 256-bytes stride */
>>>> +#define RAC_DATA_INST_EN_MASK		(1 << RACPREFINST_SHIFT | \
>>>> +					 RACENPREF_MASK << RACENINST_SHIFT | \
>>>> +					 1 << RACPREFDATA_SHIFT | \
>>>> +					 RACENPREF_MASK << RACENDATA_SHIFT)
>>>> +
>>>> +#define RAC_ENABLED			(1 << 0)
>>>
>>> 					BIT(0) ?
>>>
>>> However, you don't use RAC_ENABLED as a bitmask, but a bit index, so
>>> shouldn't this be zero?
>>
>> In subsequent patches we have a need for distinguishing RAC_ENABLED from
>> RAC_SUSPENDED, so that's the primary reason for using it as a bitmask
>> (could make that clear somewhere).
> 
> However, test_bit() etc take a bit _number_ not a bit _mask_.  So:
> 
> Passing in 1 << 0 will test bit 1 rather than bit 0.
> Passing in 1 << 1 will test bit 2 rather than bit 1.
> Passing in 1 << 2 will test bit 4 rather than bit 2.
> Passing in 1 << 3 will test bit 8 rather than bit 3.
> etc.
> 
> This is not what you wanted.  Either use a mask directly, or use
> test_bit() with a bit number etc.  Don't try and do both together. :)

Fixed, thanks.

> 
>>> What happens when the system goes down (eg, for kexec?)  Does the RAC
>>> need to be disabled for that?
>>
>> Per boot convention, I would say so, yes, since this is another level of
>> instruction and data cache, we should turn it off. Can we register some
>> sort of notifier specifically for kexec?
> 
> The code at present doesn't expect there to be platform specific caches,
> so that probably isn't catered for yet.  I mentioned the point to raise
> the issue that there's an oversight here.

Since kexec goes through the usual suspend/resume path, it will suspend
the RAC by calling into syscore_suspend (patch 5), but only when
CONFIG_KERNEL_KEXEC_JUMP is set, which is not guaranteed.

kernel_restart_prepare() calls a reboot notifier with SYS_RESTART, so if
we disable the RAC at this point, we should be good, I will add that.
Will Deacon March 17, 2015, 5:29 p.m. UTC | #5
On Sat, Mar 07, 2015 at 12:54:50AM +0000, Florian Fainelli wrote:
> This patch adds support for the Broadcom Brahma-B15 CPU readahead cache
> controller. This cache controller sits between the L2 and the memory bus
> and its purpose is to provide a friendler burst size towards the DDR
> interface than the native cache line size.
> 
> The readahead cache is mostly transparent, except for
> flush_kern_cache_all, flush_kern_cache_louis and flush_icache_all, which
> is precisely what we are overriding here.

I'm struggling to understand why you care about flush_kern_cache_louis
and flush_icache_all for a cache that sits the other side of the L2.

Can you explain why we need to do anything in these cases, please?

Will
Florian Fainelli March 17, 2015, 6:02 p.m. UTC | #6
On 17/03/15 10:29, Will Deacon wrote:
> On Sat, Mar 07, 2015 at 12:54:50AM +0000, Florian Fainelli wrote:
>> This patch adds support for the Broadcom Brahma-B15 CPU readahead cache
>> controller. This cache controller sits between the L2 and the memory bus
>> and its purpose is to provide a friendler burst size towards the DDR
>> interface than the native cache line size.
>>
>> The readahead cache is mostly transparent, except for
>> flush_kern_cache_all, flush_kern_cache_louis and flush_icache_all, which
>> is precisely what we are overriding here.
> 
> I'm struggling to understand why you care about flush_kern_cache_louis
> and flush_icache_all for a cache that sits the other side of the L2.
> 
> Can you explain why we need to do anything in these cases, please?

Let's try. As you may have read in the comment, all MVA-based cache
maintenance operations are snooped by the RAC, so they are effectively
"transparent" to software; all others are not.

flush_kern_cache_louis() and flush_icache_all() both use ICIALLUIS in the
SMP case and ICIALLU in the UP case, which were flagged as not being
transparently handled.

The concern is that, if you perform a L1 cache (data or instruction)
flush (essentially an invalidate), this will also flush (invalidate)
corresponding L2 cache lines, but the RAC has no way to be signaled that
it should also invalidate its own RAC cache lines pertaining to that
data, and RAC holds per-CPU "super" cache lines.

In arch/arm/kernel/smp.c, all uses of flush_cache_louis() are for
writing-back data, so the RAC is not an issue. In
arch/arm/kernel/suspend.c, flush_cache_louis() is known not to guarantee
a "clean" all the way to main memory, so __cpu_flush_dcache_area is used
in conjunction. In arch/arm/mm/idmap.c and mmu.c, the use of
flush_cache_louis() seems to be meant to see fresh data, not write-back,
so not transparent to the RAC, is that right?

It may very well be that we are super cautious here and that the only
case to take care of is essentially flush_cache_all(), and nothing more.

Would you have suggestions on how to instrument/exercise whether we really
need to deal with flush_cache_louis() and flush_icache_all()?

Thanks!
Will Deacon March 23, 2015, 11:14 a.m. UTC | #7
On Tue, Mar 17, 2015 at 06:02:22PM +0000, Florian Fainelli wrote:
> On 17/03/15 10:29, Will Deacon wrote:
> > On Sat, Mar 07, 2015 at 12:54:50AM +0000, Florian Fainelli wrote:
> >> This patch adds support for the Broadcom Brahma-B15 CPU readahead cache
> >> controller. This cache controller sits between the L2 and the memory bus
> >> and its purpose is to provide a friendler burst size towards the DDR
> >> interface than the native cache line size.
> >>
> >> The readahead cache is mostly transparent, except for
> >> flush_kern_cache_all, flush_kern_cache_louis and flush_icache_all, which
> >> is precisely what we are overriding here.
> > 
> > I'm struggling to understand why you care about flush_kern_cache_louis
> > and flush_icache_all for a cache that sits the other side of the L2.
> > 
> > Can you explain why we need to do anything in these cases, please?
> 
> Let's try, as you may have read in the comment, all MVA-based cache
> maintenance operations are snooped by the RAC, so they are effectively
> "transparent" to software, all others are not.
> 
> flush_kern_cache_louis() and flush_icache_all() both use ICALLIUS in the
> SMP case and ICIALLU in the UP case which were flagged as not being
> transparently handled.
> 
> The concern is that, if you perform a L1 cache (data or instruction)
> flush (essentially an invalidate), this will also flush (invalidate)
> corresponding L2 cache lines, but the RAC has no way to be signaled that
> is should also invalidate its own RAC cache lines pertaining to that
> data, and RAC holds per-CPU "super" cache lines.
> 
> In arch/arm/kernel/smp.c, all uses of flush_cache_louis() are for
> writing-back data, so the RAC is not an issue. In
> arch/arm/kernel/suspend.c, flush_cache_louis() is known not to guarantee
> a "clean" all the way to main memory, so __cpu_flush_dcache_area is used
> in conjunction. In arch/arm/mm/idmap.c and mmu.c, the use of
> flush_cache_louis() seems to be meant to see fresh data, not write-back,
> so not transparent to the RAC, is that right?
> 
> It may very well be that we are super cautious here and that the only
> case to take care of is essentially flush_cache_all(), and nothing more.
> 
> Would you suggestions on how to instrument/exercise whether we really
> need to deal with flush_cache_louis() and flush_icache_all()?

I think that both flush_cache_louis and flush_icache_all only care about
the inner-shareable domain, so you don't need to do anything with the
RAC. It's a bit like the PL310 outer-cache, which is also not affected
by these operations.

I don't think there's a good way to determine statically if we have
missing cacheflush calls. Maybe a better bet would be to implement a
RAC driver using the outer_cache framework and only implement the
flush_all callback.

Will
Florian Fainelli July 27, 2015, 6:47 p.m. UTC | #8
On 23/03/15 04:14, Will Deacon wrote:
> On Tue, Mar 17, 2015 at 06:02:22PM +0000, Florian Fainelli wrote:
>> On 17/03/15 10:29, Will Deacon wrote:
>>> On Sat, Mar 07, 2015 at 12:54:50AM +0000, Florian Fainelli wrote:
>>>> This patch adds support for the Broadcom Brahma-B15 CPU readahead cache
>>>> controller. This cache controller sits between the L2 and the memory bus
>>>> and its purpose is to provide a friendler burst size towards the DDR
>>>> interface than the native cache line size.
>>>>
>>>> The readahead cache is mostly transparent, except for
>>>> flush_kern_cache_all, flush_kern_cache_louis and flush_icache_all, which
>>>> is precisely what we are overriding here.
>>>
>>> I'm struggling to understand why you care about flush_kern_cache_louis
>>> and flush_icache_all for a cache that sits the other side of the L2.
>>>
>>> Can you explain why we need to do anything in these cases, please?
>>
>> Let's try, as you may have read in the comment, all MVA-based cache
>> maintenance operations are snooped by the RAC, so they are effectively
>> "transparent" to software, all others are not.
>>
>> flush_kern_cache_louis() and flush_icache_all() both use ICALLIUS in the
>> SMP case and ICIALLU in the UP case which were flagged as not being
>> transparently handled.
>>
>> The concern is that, if you perform a L1 cache (data or instruction)
>> flush (essentially an invalidate), this will also flush (invalidate)
>> corresponding L2 cache lines, but the RAC has no way to be signaled that
>> is should also invalidate its own RAC cache lines pertaining to that
>> data, and RAC holds per-CPU "super" cache lines.
>>
>> In arch/arm/kernel/smp.c, all uses of flush_cache_louis() are for
>> writing-back data, so the RAC is not an issue. In
>> arch/arm/kernel/suspend.c, flush_cache_louis() is known not to guarantee
>> a "clean" all the way to main memory, so __cpu_flush_dcache_area is used
>> in conjunction. In arch/arm/mm/idmap.c and mmu.c, the use of
>> flush_cache_louis() seems to be meant to see fresh data, not write-back,
>> so not transparent to the RAC, is that right?
>>
>> It may very well be that we are super cautious here and that the only
>> case to take care of is essentially flush_cache_all(), and nothing more.
>>
>> Would you suggestions on how to instrument/exercise whether we really
>> need to deal with flush_cache_louis() and flush_icache_all()?
> 
> I think that both flush_cache_louis and flush_icache_all only care about
> the inner-shareable domain, so you don't need to do anything with the
> RAC. It's a bit like the PL310 outer-cache, which is also not affected
> by these operations.

I see, will keep experimenting with removing these two and see if
anything breaks.

> 
> I don't think there's a good way to determine statically if we have
> missing cacheflush calls. Maybe a better bet would be to implement a
> RAC driver using the outer_cache framework and only implement the
> flush_all callback.

Last time I tried this, performance became absolutely terrible for, e.g.,
networking, which involves frequent invalidation + write-back due to
DMA operations. Also, it did not seem possible to get information about
the DMA transfer direction (at least not at this level), which could
help speed up the write-back case, since there is nothing to do in that
case (unlike in the PL310 case).
diff mbox

Patch

diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
index 2d46862e7bef..4d847e185cf6 100644
--- a/arch/arm/include/asm/cacheflush.h
+++ b/arch/arm/include/asm/cacheflush.h
@@ -199,7 +199,7 @@  extern void copy_to_user_page(struct vm_area_struct *, struct page *,
  */
 #if (defined(CONFIG_CPU_V7) && \
      (defined(CONFIG_CPU_V6) || defined(CONFIG_CPU_V6K))) || \
-	defined(CONFIG_SMP_ON_UP)
+	defined(CONFIG_SMP_ON_UP) || defined(CONFIG_CACHE_B15_RAC)
 #define __flush_icache_preferred	__cpuc_flush_icache_all
 #elif __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP)
 #define __flush_icache_preferred	__flush_icache_all_v7_smp
diff --git a/arch/arm/include/asm/glue-cache.h b/arch/arm/include/asm/glue-cache.h
index a3c24cd5b7c8..11f33b5f9284 100644
--- a/arch/arm/include/asm/glue-cache.h
+++ b/arch/arm/include/asm/glue-cache.h
@@ -117,6 +117,10 @@ 
 # endif
 #endif
 
+#if defined(CONFIG_CACHE_B15_RAC)
+# define MULTI_CACHE 1
+#endif
+
 #if defined(CONFIG_CPU_V7M)
 # ifdef _CACHE
 #  define MULTI_CACHE 1
diff --git a/arch/arm/include/asm/hardware/cache-b15-rac.h b/arch/arm/include/asm/hardware/cache-b15-rac.h
new file mode 100644
index 000000000000..76b888f53f90
--- /dev/null
+++ b/arch/arm/include/asm/hardware/cache-b15-rac.h
@@ -0,0 +1,12 @@ 
+#ifndef __ASM_ARM_HARDWARE_CACHE_B15_RAC_H
+#define __ASM_ARM_HARDWARE_CACHE_B15_RAC_H
+
+#ifndef __ASSEMBLY__
+
+void b15_flush_kern_cache_all(void);
+void b15_flush_kern_cache_louis(void);
+void b15_flush_icache_all(void);
+
+#endif
+
+#endif
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index 9b4f29e595a4..4d5652a39304 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -853,6 +853,14 @@  config OUTER_CACHE_SYNC
 	  The outer cache has a outer_cache_fns.sync function pointer
 	  that can be used to drain the write buffer of the outer cache.
 
+config CACHE_B15_RAC
+	bool "Enable the Broadcom Brahma-B15 read-ahead cache controller"
+	depends on ARCH_BRCMSTB
+	default y
+	help
+	  This option enables the Broadcom Brahma-B15 read-ahead cache
+	  controller. If disabled, the read-ahead cache remains off.
+
 config CACHE_FEROCEON_L2
 	bool "Enable the Feroceon L2 cache controller"
 	depends on ARCH_MV78XX0 || ARCH_MVEBU
diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile
index d3afdf9eb65a..a6797fdb6721 100644
--- a/arch/arm/mm/Makefile
+++ b/arch/arm/mm/Makefile
@@ -96,6 +96,7 @@  AFLAGS_proc-v6.o	:=-Wa,-march=armv6
 AFLAGS_proc-v7.o	:=-Wa,-march=armv7-a
 
 obj-$(CONFIG_OUTER_CACHE)	+= l2c-common.o
+obj-$(CONFIG_CACHE_B15_RAC)	+= cache-b15-rac.o
 obj-$(CONFIG_CACHE_FEROCEON_L2)	+= cache-feroceon-l2.o
 obj-$(CONFIG_CACHE_L2X0)	+= cache-l2x0.o l2c-l2x0-resume.o
 obj-$(CONFIG_CACHE_XSC3L2)	+= cache-xsc3l2.o
diff --git a/arch/arm/mm/cache-b15-rac.c b/arch/arm/mm/cache-b15-rac.c
new file mode 100644
index 000000000000..1c5bca6e906b
--- /dev/null
+++ b/arch/arm/mm/cache-b15-rac.c
@@ -0,0 +1,181 @@ 
+/*
+ * Broadcom Brahma-B15 CPU read-ahead cache management functions
+ *
+ * Copyright (C) 2015, Broadcom Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/err.h>
+#include <linux/spinlock.h>
+#include <linux/io.h>
+#include <linux/bitops.h>
+#include <linux/of_address.h>
+
+#include <asm/cacheflush.h>
+#include <asm/hardware/cache-b15-rac.h>
+
+/* ARMv7 cache maintenance routines wrapped by BUILD_RAC_CACHE_OP() below;
+ * they are defined outside of this file.
+ */
+extern void v7_flush_kern_cache_all(void);
+extern void v7_flush_kern_cache_louis(void);
+extern void v7_flush_icache_all(void);
+
+/* RAC register offsets, relative to the HIF_CPU_BIUCTRL register base */
+#define RAC_CONFIG0_REG			(0x78)
+/* Fields of RAC_CONFIG0_REG. b15_rac_enable() replicates them once per
+ * possible CPU, shifting each copy by a multiple of RAC_CPU_SHIFT bits.
+ */
+#define  RACENPREF_MASK			(0x3)
+#define  RACPREFINST_SHIFT		(0)
+#define  RACENINST_SHIFT		(2)
+#define  RACPREFDATA_SHIFT		(4)
+#define  RACENDATA_SHIFT		(6)
+#define  RAC_CPU_SHIFT			(8)
+/* NOTE(review): RACCFG_MASK and RAC_CONFIG1_REG are not referenced anywhere
+ * else in this file.
+ */
+#define  RACCFG_MASK			(0xff)
+#define RAC_CONFIG1_REG			(0x7c)
+#define RAC_FLUSH_REG			(0x80)
+/* Bit written to RAC_FLUSH_REG by __b15_rac_flush() to start a flush */
+#define  FLUSH_RAC			(1 << 0)
+
+/* Bitmask to enable instruction and data prefetching with a 256-bytes stride */
+#define RAC_DATA_INST_EN_MASK		(1 << RACPREFINST_SHIFT | \
+					 RACENPREF_MASK << RACENINST_SHIFT | \
+					 1 << RACPREFDATA_SHIFT | \
+					 RACENPREF_MASK << RACENDATA_SHIFT)
+
+/* Bit number (not a mask) within b15_rac_flags, as expected by the
+ * set_bit()/test_bit() helpers that consume it.
+ */
+#define RAC_ENABLED			0
+
+/* Mapped base of the BIU control registers; set by b15_rac_init() */
+static void __iomem *b15_rac_base;
+/* Serializes the RAC disable/flush/re-enable sequences in this file */
+static DEFINE_SPINLOCK(rac_lock);
+
+/* Initialization flag to avoid checking for b15_rac_base, and to prevent
+ * multi-platform kernels from crashing here as well. The RAC_ENABLED bit is
+ * set by b15_rac_init() once the read-ahead cache has been turned on, and is
+ * tested by the b15_flush_*() functions before they touch the hardware.
+ */
+static unsigned long b15_rac_flags;
+
+/* Turn the read-ahead cache off by clearing RAC_CONFIG0_REG.
+ *
+ * Returns the register contents read before the clear, so that the caller
+ * can hand them back to __b15_rac_enable() later on.
+ */
+static inline u32 __b15_rac_disable(void)
+{
+	void __iomem *cfg0 = b15_rac_base + RAC_CONFIG0_REG;
+	u32 saved = __raw_readl(cfg0);
+
+	__raw_writel(0, cfg0);
+	dmb();
+	return saved;
+}
+
+/* Flush the read-ahead cache and wait for the flush to finish.
+ *
+ * Writes FLUSH_RAC to RAC_FLUSH_REG, then polls the register until the
+ * FLUSH_RAC bit reads back as zero (presumably cleared by the hardware once
+ * the flush has completed -- confirm against the register documentation).
+ */
+static inline void __b15_rac_flush(void)
+{
+	u32 reg;
+
+	__raw_writel(FLUSH_RAC, b15_rac_base + RAC_FLUSH_REG);
+	do {
+		/* This dmb() is required to force the Bus Interface Unit
+		 * to clean outstanding writes, and forces an idle cycle
+		 * to be inserted.
+		 */
+		dmb();
+		reg = __raw_readl(b15_rac_base + RAC_FLUSH_REG);
+		/* Poll the FLUSH_RAC status bit, not the register offset */
+	} while (reg & FLUSH_RAC);
+}
+
+/* Disable the read-ahead cache, then flush it.
+ *
+ * Returns the RAC_CONFIG0_REG value that was in effect before the disable.
+ */
+static inline u32 b15_rac_disable_and_flush(void)
+{
+	u32 saved_cfg = __b15_rac_disable();
+
+	__b15_rac_flush();
+	return saved_cfg;
+}
+
+/* Program RAC_CONFIG0_REG with @val, the complete register value to write
+ * (either a freshly built enable mask or a value previously returned by
+ * __b15_rac_disable()).
+ */
+static inline void __b15_rac_enable(u32 val)
+{
+	void __iomem *cfg0 = b15_rac_base + RAC_CONFIG0_REG;
+
+	__raw_writel(val, cfg0);
+	/* dsb() is required here to be consistent with __flush_icache_all() */
+	dsb();
+}
+
+/* Generate b15_flush_<name>(), a wrapper around v7_flush_<name>().
+ *
+ * Under rac_lock, the wrapper:
+ *  - disables and flushes the read-ahead cache if the RAC_ENABLED flag is
+ *    set, remembering the previous RAC_CONFIG0_REG value;
+ *  - calls v7_flush_<name>();
+ *  - restores the saved configuration through __b15_rac_enable() (which ends
+ *    with a dsb()) if the cache had been disabled, or otherwise executes
+ *    @bar, the barrier statement supplied by the instantiation.
+ *
+ * @name: suffix shared by the generated function and the wrapped v7 routine
+ * @bar:  statement to execute when the read-ahead cache is not enabled; pass
+ *        nobarrier for none
+ */
+#define BUILD_RAC_CACHE_OP(name, bar)				\
+void b15_flush_##name(void)					\
+{								\
+	unsigned int do_flush;					\
+	u32 val = 0;						\
+								\
+	spin_lock(&rac_lock);					\
+	do_flush = test_bit(RAC_ENABLED, &b15_rac_flags);	\
+	if (do_flush)						\
+		val = b15_rac_disable_and_flush();		\
+	v7_flush_##name();					\
+	if (!do_flush)						\
+		bar;						\
+	else							\
+		__b15_rac_enable(val);				\
+	spin_unlock(&rac_lock);					\
+}
+
+/* Expands to nothing: used as the @bar argument when no barrier is needed */
+#define nobarrier
+
+/* The readahead cache present in the Brahma-B15 CPU is a special piece of
+ * hardware after the integrated L2 cache of the B15 CPU complex whose purpose
+ * is to prefetch instruction and/or data with a line size of either 64 bytes
+ * or 256 bytes. The rationale is that the data-bus of the CPU interface is
+ * optimized for 256-byte transactions, so enabling the readahead cache
+ * provides a significant performance boost (typically twice the performance
+ * for a memcpy benchmark application), which is why we want it enabled.
+ *
+ * The readahead cache is transparent for Modified Virtual Addresses
+ * cache maintenance operations: ICIMVAU, DCIMVAC, DCCMVAC, DCCMVAU and
+ * DCCIMVAC.
+ *
+ * It is however not transparent for the following cache maintenance
+ * operations: DCISW, DCCSW, DCCISW, ICIALLUIS and ICIALLU, which is precisely
+ * what we are patching with BUILD_RAC_CACHE_OP here.
+ */
+
+/* Only the icache variant supplies a barrier (dsb()) for the case where the
+ * read-ahead cache is not enabled.
+ */
+BUILD_RAC_CACHE_OP(kern_cache_all, nobarrier);
+BUILD_RAC_CACHE_OP(kern_cache_louis, nobarrier);
+BUILD_RAC_CACHE_OP(icache_all, dsb());
+
+/* Enable instruction and data read-ahead for every possible CPU.
+ *
+ * Builds the RAC_CONFIG0_REG value by replicating RAC_DATA_INST_EN_MASK into
+ * each CPU's group of bits, then disables and flushes the read-ahead cache
+ * before programming the new configuration.
+ */
+static void b15_rac_enable(void)
+{
+	u32 cfg = 0;
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu) {
+		cfg |= RAC_DATA_INST_EN_MASK << (cpu * RAC_CPU_SHIFT);
+	}
+
+	b15_rac_disable_and_flush();
+	__b15_rac_enable(cfg);
+}
+
+/* Locate the BIU control block in the device tree, map it and turn the
+ * read-ahead cache on.
+ *
+ * Returns 0 on success, -ENODEV when no "brcm,brcmstb-cpu-biu-ctrl" node
+ * exists, or -ENOMEM when the registers cannot be mapped. Warns, without
+ * failing, when more than 4 CPUs are possible or when the data prefetch bit
+ * of any possible CPU is already set on entry.
+ */
+static int __init b15_rac_init(void)
+{
+	struct device_node *np;
+	u32 cfg0, prefdata_mask = 0;
+	int cpu, err = 0;
+
+	np = of_find_compatible_node(NULL, NULL, "brcm,brcmstb-cpu-biu-ctrl");
+	if (!np)
+		return -ENODEV;
+
+	WARN(num_possible_cpus() > 4, "RAC only supports 4 CPUs\n");
+
+	b15_rac_base = of_iomap(np, 0);
+	if (!b15_rac_base) {
+		pr_err("failed to remap BIU control base\n");
+		err = -ENOMEM;
+		goto put_node;
+	}
+
+	/* Data prefetch bit of each possible CPU, used for the sanity check */
+	for_each_possible_cpu(cpu) {
+		prefdata_mask |= (1 << RACPREFDATA_SHIFT) <<
+				 (cpu * RAC_CPU_SHIFT);
+	}
+
+	spin_lock(&rac_lock);
+	cfg0 = __raw_readl(b15_rac_base + RAC_CONFIG0_REG);
+	WARN(cfg0 & prefdata_mask, "Read-ahead cache not previously disabled\n");
+
+	b15_rac_enable();
+	/* From here on the b15_flush_*() wrappers manage the hardware */
+	set_bit(RAC_ENABLED, &b15_rac_flags);
+	spin_unlock(&rac_lock);
+
+	pr_info("Broadcom Brahma-B15 readahead cache at: 0x%p\n",
+		b15_rac_base + RAC_CONFIG0_REG);
+
+put_node:
+	of_node_put(np);
+	return err;
+}
+arch_initcall(b15_rac_init);