diff mbox

[RFC] arm: use built-in byte swap function

Message ID 20130208191208.2ef3d78bda71aa7b44d00d7b@freescale.com (mailing list archive)
State New, archived
Headers show

Commit Message

Kim Phillips Feb. 9, 2013, 1:12 a.m. UTC
On Fri, 8 Feb 2013 17:47:33 -0500
Nicolas Pitre <nico@fluxnic.net> wrote:

> On Fri, 8 Feb 2013, Woodhouse, David wrote:
> 
> > On Fri, 2013-02-08 at 15:04 -0500, Nicolas Pitre wrote:
> > > On Fri, 8 Feb 2013, Woodhouse, David wrote:
> > > 
> > > > On Thu, 2013-02-07 at 18:13 +0000, Russell King - ARM Linux wrote:
> > > > > 
> > > > > However, the biggest reason not to use libgcc is that we want to control
> > > > > what gets used in the kernel - for example, no floating point, and no
> > > > > use of 64 x 64bit division.
> > > > 
> > > > Which is all very sensible. But there's no particular reason we couldn't
> > > > add a __bswap[sd]i2 to the kernel's version of libgcc if we wanted to.
> > > 
> > > Absolutely.
> > 
> > And then ARM can just set ARCH_USE_BUILTIN_BSWAP like other
> > architectures do, right?
> 
> If that turns out to be beneficial over what we have now, then yes.
> I didn't read back the whole thread to form an opinion though.

The diff below implements __bswap[sd]i2 in arch/arm/lib, and
results in the following savings in vmlinux size:

column 1: name of defconfig
column 2: text+data+bss, linux-next-20130207 vanilla, gcc 4.6.3
column 3: text+data+bss, linux-next-20130207+below diff, gcc 4.6.3
column 4: col. 3 - col. 2 (ie., -ve numbers represent savings)

acs5k_tiny_defconfig: 	2886048 	2886016 	-32
ag5evm_defconfig: 	2767596 	2767564 	-32
am200epdkit_defconfig: 	3283557 	3283557 	0
ap4evb_defconfig: 	2059540 	2059524 	-16
armadillo800eva_defconfig: 	4521440 	4521200 	-240
assabet_defconfig: 	3562923 	3562603 	-320
at91_dt_defconfig: 	4195020 	4189164 	-5856
at91rm9200_defconfig: 	5804566 	5803742 	-824
at91sam9261_defconfig: 	4726625 	4725969 	-656
at91sam9263_defconfig: 	5115407 	5114459 	-948
at91sam9g20_defconfig: 	4175306 	4175138 	-168
at91x40_defconfig: 	1344264 	1344256 	-8
badge4_defconfig: 	3359661 	3359485 	-176
bcm_defconfig: 	3575901 	3575757 	-144
bonito_defconfig: 	2221729 	2221681 	-48
cerfcube_defconfig: 	3063610 	3063370 	-240
cns3420vb_defconfig: 	2595667 	2595651 	-16
collie_defconfig: 	3015148 	3014972 	-176
da8xx_omapl_defconfig: 	4168517 	4168165 	-352
davinci_all_defconfig: 	4060434 	4060110 	-324
dove_defconfig: 	4940257 	4939245 	-1012
ebsa110_defconfig: 	3236795 	3236587 	-208
ep93xx_defconfig: 	4169466 	4168986 	-480
eseries_pxa_defconfig: 	3090656 	3090448 	-208
exynos4_defconfig: 	3008671 	3008639 	-32
ezx_defconfig: 	10042035 	10041835 	-200
footbridge_defconfig: 	3662497 	3662369 	-128
h3600_defconfig: 	3471140 	3470820 	-320
h5000_defconfig: 	2976046 	2976030 	-16
h7201_defconfig: 	2158753 	2158705 	-48
h7202_defconfig: 	3395888 	3395520 	-368
hackkit_defconfig: 	3128690 	3128578 	-112
imote2_defconfig: 	9916412 	9916180 	-232
imx_v4_v5_defconfig: 	6866952 	6866272 	-680
imx_v6_v7_defconfig: 	7672373 	7667089 	-5284
integrator_defconfig: 	4224588 	4224284 	-304
iop13xx_defconfig: 	5229604 	5228468 	-1136
iop32x_defconfig: 	5616676 	5615492 	-1184
iop33x_defconfig: 	4611219 	4610835 	-384
ixp4xx_defconfig: 	4566094 	4564238 	-1856
jornada720_defconfig: 	3140569 	3140345 	-224
kota2_defconfig: 	3971280 	3971280 	0
kzm9d_defconfig: 	3042784 	3042784 	0
kzm9g_defconfig: 	4728939 	4728907 	-32
lart_defconfig: 	2941150 	2941054 	-96
lpc32xx_defconfig: 	5009215 	5004619 	-4596
mackerel_defconfig: 	4687162 	4686922 	-240
mini2440_defconfig: 	5095413 	5095037 	-376
msm_defconfig: 	2920312 	2920224 	-88
multi_v7_defconfig: 	3511744 	3511712 	-32
mvebu_defconfig: 	3871035 	3870931 	-104
mxs_defconfig: 	11091983 	11095679 	3696
netwinder_defconfig: 	3814476 	3814124 	-352
nhk8815_defconfig: 	4277138 	4276778 	-360
nuc910_defconfig: 	2527988 	2527956 	-32
nuc950_defconfig: 	2634352 	2634312 	-40
nuc960_defconfig: 	2546592 	2546552 	-40
omap2plus_defconfig: 	13801444 	13800144 	-1300
palmz72_defconfig: 	3345726 	3345758 	32
pcm027_defconfig: 	3634230 	3634158 	-72
pleb_defconfig: 	2847643 	2847515 	-128
prima2_defconfig: 	3080439 	3080347 	-92
pxa3xx_defconfig: 	4266677 	4266457 	-220
realview_defconfig: 	3891673 	3891465 	-208
realview-smp_defconfig: 	4157253 	4157045 	-208
rpc_defconfig: 	3974677 	3974661 	-16
s3c2410_defconfig: 	5217336 	5217000 	-336
s3c6400_defconfig: 	3209178 	3209178 	0
s5p64x0_defconfig: 	2843307 	2843275 	-32
s5pc100_defconfig: 	2696641 	2696657 	16
shannon_defconfig: 	3586908 	3586636 	-272
shark_defconfig: 	3565471 	3565423 	-48
simpad_defconfig: 	3681433 	3681161 	-272
socfpga_defconfig: 	4326480 	4326408 	-72
spear13xx_defconfig: 	4062410 	4062378 	-32
spear6xx_defconfig: 	3457392 	3457360 	-32
tct_hammer_defconfig: 	2406856 	2406824 	-32
trizeps4_defconfig: 	5869063 	5868483 	-580
u300_defconfig: 	2627488 	2627488 	0
versatile_defconfig: 	3789155 	3788787 	-368
vexpress_defconfig: 	5105737 	5105489 	-248
viper_defconfig: 	3309176 	3308920 	-256
xcep_defconfig: 	2820463 	2820367 	-96
zeus_defconfig: 	3775525 	3775367 	-158

gcc 4.7.3 runs haven't been as good across the board as gcc 4.6.3,
however:

acs5k_tiny_defconfig: 	2873380 	2873330 	-50
ag5evm_defconfig: 	2753016 	2753128 	112
am200epdkit_defconfig: 	3274245 	3274295 	50
ap4evb_defconfig: 	2051952 	2051944 	-8
armadillo800eva_defconfig: 	4531864 	4531718 	-146
assabet_defconfig: 	3540515 	3540337 	-178
at91_dt_defconfig: 	4211064 	4205394 	-5670
at91rm9200_defconfig: 	5785930 	5785368 	-562
at91sam9261_defconfig: 	4705357 	4705139 	-218
at91sam9263_defconfig: 	5098831 	5098501 	-330
at91sam9g20_defconfig: 	4163686 	4163576 	-110
at91sam9g45_defconfig: 	4675215 	4669553 	-5662
at91x40_defconfig: 	1342256 	1342192 	-64
badge4_defconfig: 	3338677 	3338615 	-62
bcm_defconfig: 	3568365 	3568259 	-106
bonito_defconfig: 	2225353 	2225355 	2
cerfcube_defconfig: 	3043650 	3043556 	-94
cns3420vb_defconfig: 	2589035 	2589049 	14
collie_defconfig: 	2994100 	2993972 	-128
da8xx_omapl_defconfig: 	4155189 	4155047 	-142
davinci_all_defconfig: 	4043450 	4043304 	-146
dove_defconfig: 	4902213 	4901627 	-586
ebsa110_defconfig: 	3216131 	3215953 	-178
ep93xx_defconfig: 	4154038 	4153796 	-242
eseries_pxa_defconfig: 	3076688 	3076610 	-78
exynos4_defconfig: 	3001451 	3001465 	14
ezx_defconfig: 	10022647 	10022553 	-94
footbridge_defconfig: 	3640857 	3640763 	-94
h3600_defconfig: 	3453932 	3453722 	-210
h5000_defconfig: 	2963034 	2963036 	2
h7201_defconfig: 	2148969 	2148987 	18
h7202_defconfig: 	3376072 	3375894 	-178
hackkit_defconfig: 	3107994 	3107916 	-78
imote2_defconfig: 	9899372 	9899278 	-94
imx_v4_v5_defconfig: 	6834892 	6834338 	-554
imx_v6_v7_defconfig: 	7637605 	7636935 	-670
integrator_defconfig: 	4204944 	4204674 	-270
iop13xx_defconfig: 	5193128 	5192790 	-338
iop32x_defconfig: 	5576144 	5575758 	-386
iop33x_defconfig: 	4584683 	4584585 	-98
ixp4xx_defconfig: 	4546578 	4545176 	-1402
jornada720_defconfig: 	3121377 	3121299 	-78
kota2_defconfig: 	3951220 	3951220 	0
kzm9d_defconfig: 	3055284 	3055252 	-32
kzm9g_defconfig: 	4742195 	4742161 	-34
lart_defconfig: 	2922550 	2926600 	4050
lpc32xx_defconfig: 	5036722 	5032072 	-4650
mackerel_defconfig: 	4660786 	4660612 	-174
mini2440_defconfig: 	5074529 	5074231 	-298
msm_defconfig: 	2911200 	2911240 	40
multi_v7_defconfig: 	3502888 	3502824 	-64
mvebu_defconfig: 	3853083 	3853165 	82
mxs_defconfig: 	11071139 	11070893 	-246
netwinder_defconfig: 	3791108 	3790884 	-224
netx_defconfig: 	3951133 	3950911 	-222
nhk8815_defconfig: 	4259910 	4259696 	-214
nuc910_defconfig: 	2523448 	2523408 	-40
nuc950_defconfig: 	2633644 	2633604 	-40
nuc960_defconfig: 	2541916 	2541876 	-40
omap2plus_defconfig: 	13770376 	13770026 	-350
palmz72_defconfig: 	3335850 	3335868 	18
pcm027_defconfig: 	3615962 	3615896 	-66
pleb_defconfig: 	2828707 	2828753 	46
prima2_defconfig: 	3021195 	3021213 	18
pxa3xx_defconfig: 	4250245 	4250135 	-110
realview_defconfig: 	3874745 	3874791 	46
realview-smp_defconfig: 	4144973 	4145051 	78
rpc_defconfig: 	3952749 	3952831 	82
s3c2410_defconfig: 	5191024 	5195006 	3982
s3c6400_defconfig: 	3208530 	3208548 	18
s5p64x0_defconfig: 	2837611 	2837625 	14
s5pc100_defconfig: 	2689877 	2689895 	18
shannon_defconfig: 	3567412 	3567234 	-178
shark_defconfig: 	3548103 	3548149 	46
simpad_defconfig: 	3657889 	3657763 	-126
socfpga_defconfig: 	4287524 	4287610 	86
spear13xx_defconfig: 	4040898 	4044932 	4034
spear6xx_defconfig: 	3446968 	3446998 	30
tct_hammer_defconfig: 	2394112 	2394126 	14
trizeps4_defconfig: 	5846883 	5846505 	-378
u300_defconfig: 	2622856 	2622934 	78
versatile_defconfig: 	3766635 	3766409 	-226
vexpress_defconfig: 	5076237 	5076279 	42
viper_defconfig: 	3293388 	3293294 	-94
xcep_defconfig: 	2804423 	2808549 	4126
zeus_defconfig: 	3760613 	3760539 	-74

Haven't looked at why.

In any case, some questions I have are:

(a) are the __bswap[sd]i2 implementations acceptable written in C,
as in the diff?  I don't speak ARM asm (yet at least).  The
generated code looks pretty optimal in both armv5 and 6+.

(b) would adding __bswap[sd]i2 to the kernel build need to be
disabled on armv6+? AFAICT, gcc doesn't emit calls - even for the
8-byte swap, even with -Os, on armv6+.

(c) testing allyesconfigs is proving to be a pain - lots of
breakeage - other than defconfig testing, is there any more I can do?

Thanks, and for your patience,

Kim

Comments

Nicolas Pitre Feb. 9, 2013, 3:16 a.m. UTC | #1
On Fri, 8 Feb 2013, Kim Phillips wrote:

> On Fri, 8 Feb 2013 17:47:33 -0500
> Nicolas Pitre <nico@fluxnic.net> wrote:
> 
> > On Fri, 8 Feb 2013, Woodhouse, David wrote:
> > 
> > > On Fri, 2013-02-08 at 15:04 -0500, Nicolas Pitre wrote:
> > > > On Fri, 8 Feb 2013, Woodhouse, David wrote:
> > > > 
> > > > > On Thu, 2013-02-07 at 18:13 +0000, Russell King - ARM Linux wrote:
> > > > > > 
> > > > > > However, the biggest reason not to use libgcc is that we want to control
> > > > > > what gets used in the kernel - for example, no floating point, and no
> > > > > > use of 64 x 64bit division.
> > > > > 
> > > > > Which is all very sensible. But there's no particular reason we couldn't
> > > > > add a __bswap[sd]i2 to the kernel's version of libgcc if we wanted to.
> > > > 
> > > > Absolutely.
> > > 
> > > And then ARM can just set ARCH_USE_BUILTIN_BSWAP like other
> > > architectures do, right?
> > 
> > If that turns out to be beneficial over what we have now, then yes.
> > I didn't read back the whole thread to form an opinion though.
> 
> The diff below implements __bswap[sd]i2 in arch/arm/lib, and
> results in the following savings in vmlinux size:
> 
> column 1: name of defconfig
> column 2: text+data+bss, linux-next-20130207 vanilla, gcc 4.6.3
> column 3: text+data+bss, linux-next-20130207+below diff, gcc 4.6.3
> column 4: col. 3 - col. 2 (ie., -ve numbers represent savings)
> 
[...]
> imx_v6_v7_defconfig: 	7672373 	7667089 	-5284
> lart_defconfig:	2941150		2941054         -96
> mxs_defconfig: 	11091983 	11095679 	3696

The savings are good, with some impressive cases.  However the 
mxs_defconfig is completely the opposite and by far.  Any idea?

> gcc 4.7.3 runs haven't been as good across the board as gcc 4.6.3,
> however:

Not only that, but in many cases the results are wildly different given 
the same config:

> imx_v6_v7_defconfig:	7637605		7636935		-670
> lart_defconfig: 	2922550 	2926600 	4050
> mxs_defconfig: 	11071139 	11070893 	-246

The mxs_defconfig became much better while lart_defconfig regressed a 
lot.

> Haven't looked at why.

Would be a good idea since this is rather weird and gcc could benefit 
from your findings.

> In any case, some questions I have are:
> 
> (a) are the __bswap[sd]i2 implementations acceptable written in C,
> as in the diff?  I don't speak ARM asm (yet at least).  The
> generated code looks pretty optimal in both armv5 and 6+.

It looks pretty nice indeed:

__bswapsi2:
        eor     r2, r0, r0, ror #16
        mov     r1, r2, lsr #8
        bic     r3, r1, #65280
        eor     r0, r3, r0, ror #8
        bx      lr

There is no way to do better than that.  But that's true only if -Os is 
_not_ used.  With -Os we get the following output:

__bswapsi2:
        mov     r3, r0, asl #24
        and     r2, r0, #65280
        orr     r3, r3, r0, lsr #24
        orr     r3, r3, r2, asl #8
        and     r0, r0, #16711680
        orr     r0, r3, r0, lsr #8
        bx      lr

I really don't get why gcc thinks the above is shorter.  I'm saving you 
from pasting the __bswapdi2 result which is also way way worse.
That was with Linaro gcc v4.6.2.

With Sourcery gcc v4.5.1 we get:

__bswapsi2:
        stmfd   sp!, {r3, lr}
        bl      __bswapsi2
        ldmfd   sp!, {r3, pc}

This is indeed shorter, but much less useful.  So you better enforce -O2 
for this file.  And the nice thing with C code is that it is fully 
optimized with the rev instruction when compiling for ARMv6+ if it is 
ever used in that case.

> (b) would adding __bswap[sd]i2 to the kernel build need to be
> disabled on armv6+? AFAICT, gcc doesn't emit calls - even for the
> 8-byte swap, even with -Os, on armv6+.

I wouldn't bother.  That would save only 6 instructions total.  And who 
knows if some gcc flavor start calling them for some reason eventually.

> (c) testing allyesconfigs is proving to be a pain - lots of
> breakeage - other than defconfig testing, is there any more I can do?

The defconfigs provide wildly different results and that is a good 
thing for further investigation.  You may concentrate on a small 
interesting sample such as those I kept above.

With allyesconfig the good configs would cancel out the bad ones making 
the bad ones invisible.


Nicolas
diff mbox

Patch

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 4265a26..5e8b735 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -58,6 +58,7 @@  config ARM
 	select CLONE_BACKWARDS
 	select OLD_SIGSUSPEND3
 	select OLD_SIGACTION
+	select ARCH_USE_BUILTIN_BSWAP
 	help
 	  The ARM series is a line of low-power-consumption RISC chip designs
 	  licensed by ARM Ltd and targeted at embedded applications and
diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile
index c9865f6..c225624 100644
--- a/arch/arm/boot/compressed/Makefile
+++ b/arch/arm/boot/compressed/Makefile
@@ -111,12 +111,12 @@  endif
 
 targets       := vmlinux vmlinux.lds \
 		 piggy.$(suffix_y) piggy.$(suffix_y).o \
-		 lib1funcs.o lib1funcs.S ashldi3.o ashldi3.S \
+		 lib1funcs.o lib1funcs.S ashldi3.o ashldi3.S bswapsdi2.o \
 		 font.o font.c head.o misc.o $(OBJS)
 
 # Make sure files are removed during clean
 extra-y       += piggy.gzip piggy.lzo piggy.lzma piggy.xzkern \
-		 lib1funcs.S ashldi3.S $(libfdt) $(libfdt_hdrs)
+		 lib1funcs.S ashldi3.S bswapsdi2.o $(libfdt) $(libfdt_hdrs)
 
 ifeq ($(CONFIG_FUNCTION_TRACER),y)
 ORIG_CFLAGS := $(KBUILD_CFLAGS)
@@ -158,6 +158,12 @@  ashldi3 = $(obj)/ashldi3.o
 $(obj)/ashldi3.S: $(srctree)/arch/$(SRCARCH)/lib/ashldi3.S
 	$(call cmd,shipped)
 
+# For __bswapsi2, __bswapdi2
+bswapsdi2 = $(obj)/bswapsdi2.o
+
+$(obj)/bswapsdi2.o: $(srctree)/arch/$(SRCARCH)/lib/bswapsdi2.o
+	$(call cmd,shipped)
+
 # We need to prevent any GOTOFF relocs being used with references
 # to symbols in the .bss section since we cannot relocate them
 # independently from the rest at run time.  This can be achieved by
@@ -179,7 +185,8 @@  if [ $(words $(ZRELADDR)) -gt 1 -a "$(CONFIG_AUTO_ZRELADDR)" = "" ]; then \
 fi
 
 $(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/$(HEAD) $(obj)/piggy.$(suffix_y).o \
-		$(addprefix $(obj)/, $(OBJS)) $(lib1funcs) $(ashldi3) FORCE
+		$(addprefix $(obj)/, $(OBJS)) $(lib1funcs) $(ashldi3) \
+		$(bswapsdi2) FORCE
 	@$(check_for_multiple_zreladdr)
 	$(call if_changed,ld)
 	@$(check_for_bad_syms)
diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c
index 60d3b73..ba578f7 100644
--- a/arch/arm/kernel/armksyms.c
+++ b/arch/arm/kernel/armksyms.c
@@ -35,6 +35,8 @@  extern void __ucmpdi2(void);
 extern void __udivsi3(void);
 extern void __umodsi3(void);
 extern void __do_div64(void);
+extern void __bswapsi2(void);
+extern void __bswapdi2(void);
 
 extern void __aeabi_idiv(void);
 extern void __aeabi_idivmod(void);
@@ -114,6 +116,8 @@  EXPORT_SYMBOL(__ucmpdi2);
 EXPORT_SYMBOL(__udivsi3);
 EXPORT_SYMBOL(__umodsi3);
 EXPORT_SYMBOL(__do_div64);
+EXPORT_SYMBOL(__bswapsi2);
+EXPORT_SYMBOL(__bswapdi2);
 
 #ifdef CONFIG_AEABI
 EXPORT_SYMBOL(__aeabi_idiv);
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index af72969..5383df7 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -13,7 +13,7 @@  lib-y		:= backtrace.o changebit.o csumipv6.o csumpartial.o   \
 		   ashldi3.o ashrdi3.o lshrdi3.o muldi3.o             \
 		   ucmpdi2.o lib1funcs.o div64.o                      \
 		   io-readsb.o io-writesb.o io-readsl.o io-writesl.o  \
-		   call_with_stack.o
+		   call_with_stack.o bswapsdi2.o
 
 mmu-y	:= clear_user.o copy_page.o getuser.o putuser.o
 
diff --git a/arch/arm/lib/bswapsdi2.c b/arch/arm/lib/bswapsdi2.c
new file mode 100644
index 0000000..1923b2f
--- /dev/null
+++ b/arch/arm/lib/bswapsdi2.c
@@ -0,0 +1,19 @@ 
+unsigned int __bswapsi2(unsigned int x)
+{
+	return ((x & 0x000000ffUL) << 24) |
+	       ((x & 0x0000ff00UL) <<  8) |
+	       ((x & 0x00ff0000UL) >>  8) |
+	       ((x & 0xff000000UL) >> 24);
+}
+
+unsigned long long __bswapdi2(unsigned long long x)
+{
+	return ((x & 0x00000000000000ffULL) << 56) |
+	       ((x & 0x000000000000ff00ULL) << 40) |
+	       ((x & 0x0000000000ff0000ULL) << 24) |
+	       ((x & 0x00000000ff000000ULL) <<  8) |
+	       ((x & 0x000000ff00000000ULL) >>  8) |
+	       ((x & 0x0000ff0000000000ULL) >> 24) |
+	       ((x & 0x00ff000000000000ULL) >> 40) |
+	       ((x & 0xff00000000000000ULL) >> 56);
+}