Message ID | 1447403678-7217-1-git-send-email-jszhang@marvell.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Fri, 13 Nov 2015 16:34:38 +0800 Jisheng Zhang <jszhang@marvell.com> wrote: > This driver use both readl/writel and their relaxed version, this patch > tries to unify the io accesses. I'm sorry. This is the version I'd like to send for review and merge. Can you please kindly have a review? Thanks > > Signed-off-by: Jisheng Zhang <jszhang@marvell.com> > --- > drivers/clocksource/arm_global_timer.c | 22 +++++++++++----------- > 1 file changed, 11 insertions(+), 11 deletions(-) > > diff --git a/drivers/clocksource/arm_global_timer.c b/drivers/clocksource/arm_global_timer.c > index a2cb6fa..84a5a5d 100644 > --- a/drivers/clocksource/arm_global_timer.c > +++ b/drivers/clocksource/arm_global_timer.c > @@ -99,27 +99,27 @@ static void gt_compare_set(unsigned long delta, int periodic) > > counter += delta; > ctrl = GT_CONTROL_TIMER_ENABLE; > - writel(ctrl, gt_base + GT_CONTROL); > - writel(lower_32_bits(counter), gt_base + GT_COMP0); > - writel(upper_32_bits(counter), gt_base + GT_COMP1); > + writel_relaxed(ctrl, gt_base + GT_CONTROL); > + writel_relaxed(lower_32_bits(counter), gt_base + GT_COMP0); > + writel_relaxed(upper_32_bits(counter), gt_base + GT_COMP1); > > if (periodic) { > - writel(delta, gt_base + GT_AUTO_INC); > + writel_relaxed(delta, gt_base + GT_AUTO_INC); > ctrl |= GT_CONTROL_AUTO_INC; > } > > ctrl |= GT_CONTROL_COMP_ENABLE | GT_CONTROL_IRQ_ENABLE; > - writel(ctrl, gt_base + GT_CONTROL); > + writel_relaxed(ctrl, gt_base + GT_CONTROL); > } > > static int gt_clockevent_shutdown(struct clock_event_device *evt) > { > unsigned long ctrl; > > - ctrl = readl(gt_base + GT_CONTROL); > + ctrl = readl_relaxed(gt_base + GT_CONTROL); > ctrl &= ~(GT_CONTROL_COMP_ENABLE | GT_CONTROL_IRQ_ENABLE | > GT_CONTROL_AUTO_INC); > - writel(ctrl, gt_base + GT_CONTROL); > + writel_relaxed(ctrl, gt_base + GT_CONTROL); > return 0; > } > > @@ -212,11 +212,11 @@ static u64 notrace gt_sched_clock_read(void) > > static void __init gt_clocksource_init(void) > { > - writel(0, gt_base + GT_CONTROL); > - writel(0, gt_base + GT_COUNTER0); > - writel(0, gt_base + GT_COUNTER1); > + writel_relaxed(0, gt_base + GT_CONTROL); > + writel_relaxed(0, gt_base + GT_COUNTER0); > + writel_relaxed(0, gt_base + GT_COUNTER1); > /* enables timer on all the cores */ > - writel(GT_CONTROL_TIMER_ENABLE, gt_base + GT_CONTROL); > + writel_relaxed(GT_CONTROL_TIMER_ENABLE, gt_base + GT_CONTROL); > > #ifdef CONFIG_CLKSRC_ARM_GLOBAL_TIMER_SCHED_CLOCK > sched_clock_register(gt_sched_clock_read, 64, gt_clk_rate);
On Friday 13 November 2015 16:40:25 Jisheng Zhang wrote: > On Fri, 13 Nov 2015 16:34:38 +0800 > Jisheng Zhang <jszhang@marvell.com> wrote: > > > This driver use both readl/writel and their relaxed version, this patch > > tries to unify the io accesses. > > I'm sorry. This is the version I'd like to send for review and merge. Can you > please kindly have a review? I would prefer to use write_relaxed() as sparingly as we can, it is too hard to verify each case to ensure that we don't have to watch out for ordering or locking issues. > > diff --git a/drivers/clocksource/arm_global_timer.c b/drivers/clocksource/arm_global_timer.c > > index a2cb6fa..84a5a5d 100644 > > --- a/drivers/clocksource/arm_global_timer.c > > +++ b/drivers/clocksource/arm_global_timer.c > > @@ -99,27 +99,27 @@ static void gt_compare_set(unsigned long delta, int periodic) > > > > counter += delta; > > ctrl = GT_CONTROL_TIMER_ENABLE; > > - writel(ctrl, gt_base + GT_CONTROL); > > - writel(lower_32_bits(counter), gt_base + GT_COMP0); > > - writel(upper_32_bits(counter), gt_base + GT_COMP1); > > + writel_relaxed(ctrl, gt_base + GT_CONTROL); > > + writel_relaxed(lower_32_bits(counter), gt_base + GT_COMP0); > > + writel_relaxed(upper_32_bits(counter), gt_base + GT_COMP1); > > > > if (periodic) { > > - writel(delta, gt_base + GT_AUTO_INC); > > + writel_relaxed(delta, gt_base + GT_AUTO_INC); > > ctrl |= GT_CONTROL_AUTO_INC; > > } > > > > ctrl |= GT_CONTROL_COMP_ENABLE | GT_CONTROL_IRQ_ENABLE; > > - writel(ctrl, gt_base + GT_CONTROL); > > + writel_relaxed(ctrl, gt_base + GT_CONTROL); > > } This seems fine. Do you have any performance numbers to show how much we save per call on a platform you care about, and how often it is called for a typical workload? I see that _gt_counter_read() already uses readl_relaxed(), and it seems to be a much bigger win there, as we read the clock more often than we write the comparator, so the person who did that probably thought that this one wasn't important enough. Can you add an explanation in the changelog why you think that was a mistake? Unifying the accessors across a driver is not enough of a reason I think. > > static int gt_clockevent_shutdown(struct clock_event_device *evt) > > { > > unsigned long ctrl; > > > > - ctrl = readl(gt_base + GT_CONTROL); > > + ctrl = readl_relaxed(gt_base + GT_CONTROL); > > ctrl &= ~(GT_CONTROL_COMP_ENABLE | GT_CONTROL_IRQ_ENABLE | > > GT_CONTROL_AUTO_INC); > > - writel(ctrl, gt_base + GT_CONTROL); > > + writel_relaxed(ctrl, gt_base + GT_CONTROL); > > return 0; > > } This is certainly not performance critical, better leave it using the standard accessors. > > @@ -212,11 +212,11 @@ static u64 notrace gt_sched_clock_read(void) > > > > static void __init gt_clocksource_init(void) > > { > > - writel(0, gt_base + GT_CONTROL); > > - writel(0, gt_base + GT_COUNTER0); > > - writel(0, gt_base + GT_COUNTER1); > > + writel_relaxed(0, gt_base + GT_CONTROL); > > + writel_relaxed(0, gt_base + GT_COUNTER0); > > + writel_relaxed(0, gt_base + GT_COUNTER1); > > /* enables timer on all the cores */ > > - writel(GT_CONTROL_TIMER_ENABLE, gt_base + GT_CONTROL); > > + writel_relaxed(GT_CONTROL_TIMER_ENABLE, gt_base + GT_CONTROL); > > > > #ifdef CONFIG_CLKSRC_ARM_GLOBAL_TIMER_SCHED_CLOCK > > sched_clock_register(gt_sched_clock_read, 64, gt_clk_rate); > > Same here. Arnd
Dear Arnd, On Fri, 13 Nov 2015 10:28:01 +0100 Arnd Bergmann <arnd@arndb.de> wrote: > On Friday 13 November 2015 16:40:25 Jisheng Zhang wrote: > > On Fri, 13 Nov 2015 16:34:38 +0800 > > Jisheng Zhang <jszhang@marvell.com> wrote: > > > > > This driver use both readl/writel and their relaxed version, this patch > > > tries to unify the io accesses. > > > > I'm sorry. This is the version I'd like to send for review and merge. Can you > > please kindly have a review? > > I would prefer to use write_relaxed() as sparingly as we can, it is too > hard to verify each case to ensure that we don't have to watch out > for ordering or locking issues. > > > > diff --git a/drivers/clocksource/arm_global_timer.c b/drivers/clocksource/arm_global_timer.c > > > index a2cb6fa..84a5a5d 100644 > > > --- a/drivers/clocksource/arm_global_timer.c > > > +++ b/drivers/clocksource/arm_global_timer.c > > > @@ -99,27 +99,27 @@ static void gt_compare_set(unsigned long delta, int periodic) > > > > > > counter += delta; > > > ctrl = GT_CONTROL_TIMER_ENABLE; > > > - writel(ctrl, gt_base + GT_CONTROL); > > > - writel(lower_32_bits(counter), gt_base + GT_COMP0); > > > - writel(upper_32_bits(counter), gt_base + GT_COMP1); > > > + writel_relaxed(ctrl, gt_base + GT_CONTROL); > > > + writel_relaxed(lower_32_bits(counter), gt_base + GT_COMP0); > > > + writel_relaxed(upper_32_bits(counter), gt_base + GT_COMP1); > > > > > > if (periodic) { > > > - writel(delta, gt_base + GT_AUTO_INC); > > > + writel_relaxed(delta, gt_base + GT_AUTO_INC); > > > ctrl |= GT_CONTROL_AUTO_INC; > > > } > > > > > > ctrl |= GT_CONTROL_COMP_ENABLE | GT_CONTROL_IRQ_ENABLE; > > > - writel(ctrl, gt_base + GT_CONTROL); > > > + writel_relaxed(ctrl, gt_base + GT_CONTROL); > > > } > > This seems fine. Do you have any performance numbers to show how much > we save per call on a platform you care about, and how often it is > called for a typical workload? To be honest, all my platforms don't make use of global timer for clockevent, we use dw_apb_timer and twd or arch_timer instead, but one performance impact I saw in our case can also apply for the case with global timer as clokevent: there are 500-1000 short sleeps, yes not good userspace behavior, so we program clockevent device 500-1000 times/s. If the system is powered by CA9 with outer L2 cache, the writel will contend for l2x0_lock for 500-1000 times/s. Then the L2 cache maintenance from other device driver have more chance to spinning at the l2x0_lock, so other device driver performance is impacted. Thanks, Jisheng > > I see that _gt_counter_read() already uses readl_relaxed(), and it > seems to be a much bigger win there, as we read the clock more > often than we write the comparator, so the person who did that > probably thought that this one wasn't important enough. Can you > add an explanation in the changelog why you think that was a > mistake? > > Unifying the accessors across a driver is not enough of a reason > I think. > > > > static int gt_clockevent_shutdown(struct clock_event_device *evt) > > > { > > > unsigned long ctrl; > > > > > > - ctrl = readl(gt_base + GT_CONTROL); > > > + ctrl = readl_relaxed(gt_base + GT_CONTROL); > > > ctrl &= ~(GT_CONTROL_COMP_ENABLE | GT_CONTROL_IRQ_ENABLE | > > > GT_CONTROL_AUTO_INC); > > > - writel(ctrl, gt_base + GT_CONTROL); > > > + writel_relaxed(ctrl, gt_base + GT_CONTROL); > > > return 0; > > > } > > This is certainly not performance critical, better leave it using > the standard accessors. > > > > @@ -212,11 +212,11 @@ static u64 notrace gt_sched_clock_read(void) > > > > > > static void __init gt_clocksource_init(void) > > > { > > > - writel(0, gt_base + GT_CONTROL); > > > - writel(0, gt_base + GT_COUNTER0); > > > - writel(0, gt_base + GT_COUNTER1); > > > + writel_relaxed(0, gt_base + GT_CONTROL); > > > + writel_relaxed(0, gt_base + GT_COUNTER0); > > > + writel_relaxed(0, gt_base + GT_COUNTER1); > > > /* enables timer on all the cores */ > > > - writel(GT_CONTROL_TIMER_ENABLE, gt_base + GT_CONTROL); > > > + writel_relaxed(GT_CONTROL_TIMER_ENABLE, gt_base + GT_CONTROL); > > > > > > #ifdef CONFIG_CLKSRC_ARM_GLOBAL_TIMER_SCHED_CLOCK > > > sched_clock_register(gt_sched_clock_read, 64, gt_clk_rate); > > > > > > Same here. > > Arnd
On Friday 13 November 2015 17:59:48 Jisheng Zhang wrote: > On Fri, 13 Nov 2015 10:28:01 +0100 > Arnd Bergmann <arnd@arndb.de> wrote: > > On Friday 13 November 2015 16:40:25 Jisheng Zhang wrote: > > > On Fri, 13 Nov 2015 16:34:38 +0800 > > > > diff --git a/drivers/clocksource/arm_global_timer.c b/drivers/clocksource/arm_global_timer.c > > > > index a2cb6fa..84a5a5d 100644 > > > > --- a/drivers/clocksource/arm_global_timer.c > > > > +++ b/drivers/clocksource/arm_global_timer.c > > > > @@ -99,27 +99,27 @@ static void gt_compare_set(unsigned long delta, int periodic) > > > > > > > > counter += delta; > > > > ctrl = GT_CONTROL_TIMER_ENABLE; > > > > - writel(ctrl, gt_base + GT_CONTROL); > > > > - writel(lower_32_bits(counter), gt_base + GT_COMP0); > > > > - writel(upper_32_bits(counter), gt_base + GT_COMP1); > > > > + writel_relaxed(ctrl, gt_base + GT_CONTROL); > > > > + writel_relaxed(lower_32_bits(counter), gt_base + GT_COMP0); > > > > + writel_relaxed(upper_32_bits(counter), gt_base + GT_COMP1); > > > > > > > > if (periodic) { > > > > - writel(delta, gt_base + GT_AUTO_INC); > > > > + writel_relaxed(delta, gt_base + GT_AUTO_INC); > > > > ctrl |= GT_CONTROL_AUTO_INC; > > > > } > > > > > > > > ctrl |= GT_CONTROL_COMP_ENABLE | GT_CONTROL_IRQ_ENABLE; > > > > - writel(ctrl, gt_base + GT_CONTROL); > > > > + writel_relaxed(ctrl, gt_base + GT_CONTROL); > > > > } > > > > This seems fine. Do you have any performance numbers to show how much > > we save per call on a platform you care about, and how often it is > > called for a typical workload? > > To be honest, all my platforms don't make use of global timer for clockevent, > we use dw_apb_timer and twd or arch_timer instead, but one performance impact > I saw in our case can also apply for the case with global timer as clokevent: > > there are 500-1000 short sleeps, yes not good userspace behavior, so we > program clockevent device 500-1000 times/s. If the system is powered by CA9 > with outer L2 cache, the writel will contend for l2x0_lock for 500-1000 times/s. > Then the L2 cache maintenance from other device driver have more chance to > spinning at the l2x0_lock, so other device driver performance is impacted. Just to make sure I get this right: which outer cache implementation do you use in this case? Most Cortex-A9 use pl310, which does not require l2x0_lock for outer_cache.sync(). The Aurora outer cache sync has a different method and also doesn't use l2x0_lock. Finally, tauros3 doesn't need a cache sync at all. Did you look at an older kernel version? We used to do a loop in the Aurora cache sync operation until I fixed that, so it should be a bit faster now. It will still require doing the actual sync, but at least there should not be any lock contention these days. Arnd
Dear Arnd, On Fri, 13 Nov 2015 11:33:12 +0100 Arnd Bergmann wrote: > On Friday 13 November 2015 17:59:48 Jisheng Zhang wrote: > > On Fri, 13 Nov 2015 10:28:01 +0100 > > Arnd Bergmann wrote: > > > On Friday 13 November 2015 16:40:25 Jisheng Zhang wrote: > > > > On Fri, 13 Nov 2015 16:34:38 +0800 > > > > > diff --git a/drivers/clocksource/arm_global_timer.c b/drivers/clocksource/arm_global_timer.c > > > > > index a2cb6fa..84a5a5d 100644 > > > > > --- a/drivers/clocksource/arm_global_timer.c > > > > > +++ b/drivers/clocksource/arm_global_timer.c > > > > > @@ -99,27 +99,27 @@ static void gt_compare_set(unsigned long delta, int periodic) > > > > > > > > > > counter += delta; > > > > > ctrl = GT_CONTROL_TIMER_ENABLE; > > > > > - writel(ctrl, gt_base + GT_CONTROL); > > > > > - writel(lower_32_bits(counter), gt_base + GT_COMP0); > > > > > - writel(upper_32_bits(counter), gt_base + GT_COMP1); > > > > > + writel_relaxed(ctrl, gt_base + GT_CONTROL); > > > > > + writel_relaxed(lower_32_bits(counter), gt_base + GT_COMP0); > > > > > + writel_relaxed(upper_32_bits(counter), gt_base + GT_COMP1); > > > > > > > > > > if (periodic) { > > > > > - writel(delta, gt_base + GT_AUTO_INC); > > > > > + writel_relaxed(delta, gt_base + GT_AUTO_INC); > > > > > ctrl |= GT_CONTROL_AUTO_INC; > > > > > } > > > > > > > > > > ctrl |= GT_CONTROL_COMP_ENABLE | GT_CONTROL_IRQ_ENABLE; > > > > > - writel(ctrl, gt_base + GT_CONTROL); > > > > > + writel_relaxed(ctrl, gt_base + GT_CONTROL); > > > > > } > > > > > > This seems fine. Do you have any performance numbers to show how much > > > we save per call on a platform you care about, and how often it is > > > called for a typical workload? > > > > To be honest, all my platforms don't make use of global timer for clockevent, > > we use dw_apb_timer and twd or arch_timer instead, but one performance impact > > I saw in our case can also apply for the case with global timer as clokevent: > > > > there are 500-1000 short sleeps, yes not good userspace behavior, so we > > program clockevent device 500-1000 times/s. If the system is powered by CA9 > > with outer L2 cache, the writel will contend for l2x0_lock for 500-1000 times/s. > > Then the L2 cache maintenance from other device driver have more chance to > > spinning at the l2x0_lock, so other device driver performance is impacted. > > Just to make sure I get this right: which outer cache implementation do you > use in this case? Most Cortex-A9 use pl310, which does not require l2x0_lock PL310 > for outer_cache.sync(). The Aurora outer cache sync has a different method > and also doesn't use l2x0_lock. Finally, tauros3 doesn't need a cache sync > at all. > > Did you look at an older kernel version? We used to do a loop in the oops, yes. The kernel version in product still needs the spinlock in sync. I didn't check the L2 cache code for about 1 year, sorry for that. If we upgrade to newer kernel version, yes, the bit performance bottleneck -- spinlock contention won't exist anymore. Thanks for pointing out this. But I think we may still see trivial system performance improvement in 500-1000 times/s of clockevent programming case due to the mb() in writel. Thanks, Jisheng > Aurora cache sync operation until I fixed that, so it should be a bit > faster now. It will still require doing the actual sync, but at least > there should not be any lock contention these days. > > Arnd
On Friday 13 November 2015 20:20:01 Jisheng Zhang wrote: > > > for outer_cache.sync(). The Aurora outer cache sync has a different method > > and also doesn't use l2x0_lock. Finally, tauros3 doesn't need a cache sync > > at all. > > > > Did you look at an older kernel version? We used to do a loop in the > > oops, yes. The kernel version in product still needs the spinlock in sync. > I didn't check the L2 cache code for about 1 year, sorry for that. > If we upgrade to newer kernel version, yes, the bit performance bottleneck -- > spinlock contention won't exist anymore. Thanks for pointing out this. If you still see lock contention on the l2x0 lock with your patch applied, you might want to backport the optimizations to your product kernel, even more so for the aurora controller in the Armada 370 that had some extra optimizations. > But I think we may still see trivial system performance improvement in 500-1000 > times/s of clockevent programming case due to the mb() in writel. Yes, I think it's fine. Just try to put your best estimate of the overhead in the patch description when you do the new version. Unfortunately, it is not easy to measure what the actual overhead is because low-level benchmarks of outer_cache.sync will show a much lower overhead than doing it occasionally with an active cache. Arnd
diff --git a/drivers/clocksource/arm_global_timer.c b/drivers/clocksource/arm_global_timer.c index a2cb6fa..84a5a5d 100644 --- a/drivers/clocksource/arm_global_timer.c +++ b/drivers/clocksource/arm_global_timer.c @@ -99,27 +99,27 @@ static void gt_compare_set(unsigned long delta, int periodic) counter += delta; ctrl = GT_CONTROL_TIMER_ENABLE; - writel(ctrl, gt_base + GT_CONTROL); - writel(lower_32_bits(counter), gt_base + GT_COMP0); - writel(upper_32_bits(counter), gt_base + GT_COMP1); + writel_relaxed(ctrl, gt_base + GT_CONTROL); + writel_relaxed(lower_32_bits(counter), gt_base + GT_COMP0); + writel_relaxed(upper_32_bits(counter), gt_base + GT_COMP1); if (periodic) { - writel(delta, gt_base + GT_AUTO_INC); + writel_relaxed(delta, gt_base + GT_AUTO_INC); ctrl |= GT_CONTROL_AUTO_INC; } ctrl |= GT_CONTROL_COMP_ENABLE | GT_CONTROL_IRQ_ENABLE; - writel(ctrl, gt_base + GT_CONTROL); + writel_relaxed(ctrl, gt_base + GT_CONTROL); } static int gt_clockevent_shutdown(struct clock_event_device *evt) { unsigned long ctrl; - ctrl = readl(gt_base + GT_CONTROL); + ctrl = readl_relaxed(gt_base + GT_CONTROL); ctrl &= ~(GT_CONTROL_COMP_ENABLE | GT_CONTROL_IRQ_ENABLE | GT_CONTROL_AUTO_INC); - writel(ctrl, gt_base + GT_CONTROL); + writel_relaxed(ctrl, gt_base + GT_CONTROL); return 0; } @@ -212,11 +212,11 @@ static u64 notrace gt_sched_clock_read(void) static void __init gt_clocksource_init(void) { - writel(0, gt_base + GT_CONTROL); - writel(0, gt_base + GT_COUNTER0); - writel(0, gt_base + GT_COUNTER1); + writel_relaxed(0, gt_base + GT_CONTROL); + writel_relaxed(0, gt_base + GT_COUNTER0); + writel_relaxed(0, gt_base + GT_COUNTER1); /* enables timer on all the cores */ - writel(GT_CONTROL_TIMER_ENABLE, gt_base + GT_CONTROL); + writel_relaxed(GT_CONTROL_TIMER_ENABLE, gt_base + GT_CONTROL); #ifdef CONFIG_CLKSRC_ARM_GLOBAL_TIMER_SCHED_CLOCK sched_clock_register(gt_sched_clock_read, 64, gt_clk_rate);
This driver use both readl/writel and their relaxed version, this patch tries to unify the io accesses. Signed-off-by: Jisheng Zhang <jszhang@marvell.com> --- drivers/clocksource/arm_global_timer.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-)