Message ID | alpine.LFD.2.10.1311120839310.9667@knanqh.ubzr (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Tue, Nov 12, 2013 at 09:01:16AM -0500, Nicolas Pitre wrote: > What about this patch which I think is currently your best option. Note > it would need to use the facilities from asm/opcodes.h to make it endian > agnostic. > > diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c > index 6a1b8a81b1..379cffe4ab 100644 > --- a/arch/arm/kernel/setup.c > +++ b/arch/arm/kernel/setup.c > @@ -383,6 +383,34 @@ static void __init cpuid_init_hwcaps(void) > elf_hwcap |= HWCAP_IDIVT; > } > > + /* > + * Patch our division routines with the corresponding opcode > + * if the hardware supports it. > + */ > + if (IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVT)) { > + extern char __aeabi_uidiv, __aeabi_idiv; > + u16 *uidiv = (u16 *)&__aeabi_uidiv; > + u16 *idiv = (u16 *)&__aeabi_idiv; > + > + uidiv[0] = 0xfbb0; /* udiv r0, r0, r1 */ > + uidiv[1] = 0xf0f1; > + uidiv[2] = 0x4770; /* bx lr */ > + > + idiv[0] = 0xfb90; /* sdiv r0, r0, r1 */ > + idiv[1] = 0xf0f1; > + idiv[2] = 0x4770; /* bx lr */ > + } else if (!IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVA)) { > + extern char __aeabi_uidiv, __aeabi_idiv; > + u32 *uidiv = (u32 *)&__aeabi_uidiv; > + u32 *idiv = (u32 *)&__aeabi_idiv; > + > + uidiv[0] = 0xe730f110; /* udiv r0, r0, r1 */ > + uidiv[1] = 0xe12fff1e; /* bx lr */ > + > + idiv[0] = 0xe710f110; /* sdiv r0, r0, r1 */ > + idiv[1] = 0xe12fff1e; /* bx lr */ > + } What about endianness, and what if XIP is enabled?
On Tue, 12 Nov 2013, Russell King - ARM Linux wrote: > On Tue, Nov 12, 2013 at 09:01:16AM -0500, Nicolas Pitre wrote: > > What about this patch which I think is currently your best option. Note > > it would need to use the facilities from asm/opcodes.h to make it endian > > agnostic. > > > > diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c > > index 6a1b8a81b1..379cffe4ab 100644 > > --- a/arch/arm/kernel/setup.c > > +++ b/arch/arm/kernel/setup.c > > @@ -383,6 +383,34 @@ static void __init cpuid_init_hwcaps(void) > > elf_hwcap |= HWCAP_IDIVT; > > } > > > > + /* > > + * Patch our division routines with the corresponding opcode > > + * if the hardware supports it. > > + */ > > + if (IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVT)) { > > + extern char __aeabi_uidiv, __aeabi_idiv; > > + u16 *uidiv = (u16 *)&__aeabi_uidiv; > > + u16 *idiv = (u16 *)&__aeabi_idiv; > > + > > + uidiv[0] = 0xfbb0; /* udiv r0, r0, r1 */ > > + uidiv[1] = 0xf0f1; > > + uidiv[2] = 0x4770; /* bx lr */ > > + > > + idiv[0] = 0xfb90; /* sdiv r0, r0, r1 */ > > + idiv[1] = 0xf0f1; > > + idiv[2] = 0x4770; /* bx lr */ > > + } else if (!IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVA)) { > > + extern char __aeabi_uidiv, __aeabi_idiv; > > + u32 *uidiv = (u32 *)&__aeabi_uidiv; > > + u32 *idiv = (u32 *)&__aeabi_idiv; > > + > > + uidiv[0] = 0xe730f110; /* udiv r0, r0, r1 */ > > + uidiv[1] = 0xe12fff1e; /* bx lr */ > > + > > + idiv[0] = 0xe710f110; /* sdiv r0, r0, r1 */ > > + idiv[1] = 0xe12fff1e; /* bx lr */ > > + } > > What about endianness, and what if XIP is enabled? Just as I said above the diff: this needs refined. Obviously XIP can't use this and doesn't need it either as a XIP kernel should be optimized for the very platform it will run onto i.e. gcc should already emit those div opcodes inline if appropriate. Nicolas
On 12/11/13 14:04, Russell King - ARM Linux wrote: > On Tue, Nov 12, 2013 at 09:01:16AM -0500, Nicolas Pitre wrote: >> What about this patch which I think is currently your best option. Note >> it would need to use the facilities from asm/opcodes.h to make it endian >> agnostic. >> >> diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c >> index 6a1b8a81b1..379cffe4ab 100644 >> --- a/arch/arm/kernel/setup.c >> +++ b/arch/arm/kernel/setup.c >> @@ -383,6 +383,34 @@ static void __init cpuid_init_hwcaps(void) >> elf_hwcap |= HWCAP_IDIVT; >> } >> >> + /* >> + * Patch our division routines with the corresponding opcode >> + * if the hardware supports it. >> + */ >> + if (IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVT)) { >> + extern char __aeabi_uidiv, __aeabi_idiv; >> + u16 *uidiv = (u16 *)&__aeabi_uidiv; >> + u16 *idiv = (u16 *)&__aeabi_idiv; >> + >> + uidiv[0] = 0xfbb0; /* udiv r0, r0, r1 */ >> + uidiv[1] = 0xf0f1; >> + uidiv[2] = 0x4770; /* bx lr */ >> + >> + idiv[0] = 0xfb90; /* sdiv r0, r0, r1 */ >> + idiv[1] = 0xf0f1; >> + idiv[2] = 0x4770; /* bx lr */ >> + } else if (!IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVA)) { >> + extern char __aeabi_uidiv, __aeabi_idiv; >> + u32 *uidiv = (u32 *)&__aeabi_uidiv; >> + u32 *idiv = (u32 *)&__aeabi_idiv; >> + >> + uidiv[0] = 0xe730f110; /* udiv r0, r0, r1 */ >> + uidiv[1] = 0xe12fff1e; /* bx lr */ >> + >> + idiv[0] = 0xe710f110; /* sdiv r0, r0, r1 */ >> + idiv[1] = 0xe12fff1e; /* bx lr */ >> + } > > What about endianness, and what if XIP is enabled? I was also going to add a note about endian-ness. Given these are single instructoins for ARM, is it possible we could make a table of all the callers and fix them up when we initialise as we do for the SMP/UP case and for page-offset?
Nicolas Pitre <nicolas.pitre@linaro.org> writes: > What about this patch which I think is currently your best option. Note > it would need to use the facilities from asm/opcodes.h to make it endian > agnostic. > > diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c > index 6a1b8a81b1..379cffe4ab 100644 > --- a/arch/arm/kernel/setup.c > +++ b/arch/arm/kernel/setup.c > @@ -383,6 +383,34 @@ static void __init cpuid_init_hwcaps(void) > elf_hwcap |= HWCAP_IDIVT; > } > > + /* > + * Patch our division routines with the corresponding opcode > + * if the hardware supports it. > + */ > + if (IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVT)) { > + extern char __aeabi_uidiv, __aeabi_idiv; It would be safer to declare these as arrays of unspecified size. Otherwise the compiler might do evil things with what to it looks like out of bounds indexing. There should also be some cache maintenance after this patching, or is that already happening for some other reason?
On Tue, 12 Nov 2013, Ben Dooks wrote: > Given these are single instructoins for ARM, is it possible we could > make a table of all the callers and fix them up when we initialise > as we do for the SMP/UP case and for page-offset? Not really. Calls to those functions are generated by the compiler implicitly when a divisor operand is used and therefore we cannot annotate those calls. We'd have to use special accessors everywhere to replace the standard division operand (like we do for 64 by 32 bit divisions) but I doubt that people would accept that. You cannot just scan the binary for the appropriate branch opcode either as you may turn up false positives in literal pools. Nicolas
On Tue, 12 Nov 2013, Måns Rullgård wrote: > Nicolas Pitre <nicolas.pitre@linaro.org> writes: > > > What about this patch which I think is currently your best option. Note > > it would need to use the facilities from asm/opcodes.h to make it endian > > agnostic. > > > > diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c > > index 6a1b8a81b1..379cffe4ab 100644 > > --- a/arch/arm/kernel/setup.c > > +++ b/arch/arm/kernel/setup.c > > @@ -383,6 +383,34 @@ static void __init cpuid_init_hwcaps(void) > > elf_hwcap |= HWCAP_IDIVT; > > } > > > > + /* > > + * Patch our division routines with the corresponding opcode > > + * if the hardware supports it. > > + */ > > + if (IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVT)) { > > + extern char __aeabi_uidiv, __aeabi_idiv; > > It would be safer to declare these as arrays of unspecified size. > Otherwise the compiler might do evil things with what to it looks like > out of bounds indexing. Right. > > There should also be some cache maintenance after this patching, or is > that already happening for some other reason? This is so early during boot that the MMU isn't even fully initialized yet. The cache will be flushed. Nicolas
Nicolas Pitre <nicolas.pitre@linaro.org> writes: > On Tue, 12 Nov 2013, Ben Dooks wrote: > >> Given these are single instructoins for ARM, is it possible we could >> make a table of all the callers and fix them up when we initialise >> as we do for the SMP/UP case and for page-offset? > > Not really. Calls to those functions are generated by the compiler > implicitly when a divisor operand is used and therefore we cannot > annotate those calls. We'd have to use special accessors everywhere to > replace the standard division operand (like we do for 64 by 32 bit > divisions) but I doubt that people would accept that. It might be possible to extract this information from relocation tables.
On Tue, 12 Nov 2013, Måns Rullgård wrote: > Nicolas Pitre <nicolas.pitre@linaro.org> writes: > > > On Tue, 12 Nov 2013, Ben Dooks wrote: > > > >> Given these are single instructoins for ARM, is it possible we could > >> make a table of all the callers and fix them up when we initialise > >> as we do for the SMP/UP case and for page-offset? > > > > Not really. Calls to those functions are generated by the compiler > > implicitly when a divisor operand is used and therefore we cannot > > annotate those calls. We'd have to use special accessors everywhere to > > replace the standard division operand (like we do for 64 by 32 bit > > divisions) but I doubt that people would accept that. > > It might be possible to extract this information from relocation tables. True, but only for individual .o files. Once the linker puts them together the information is lost, and trying to infer what the linker has done is insane. Filtering the compiler output to annotate idiv calls before it is assembled would probably be a better solution. Is it worth it? I'm not sure. Nicolas
On Tue, 12 Nov 2013, Nicolas Pitre wrote: > On Tue, 12 Nov 2013, Måns Rullgård wrote: > > > It might be possible to extract this information from relocation tables. > > True, but only for individual .o files. Once the linker puts them > together the information is lost, and trying to infer what the linker > has done is insane. > > Filtering the compiler output to annotate idiv calls before it is > assembled would probably be a better solution. Another solution is to patch the call site from within __aeabi_idiv at run time using lr. That wouldn't work in the presence of tail call optimization though. Again this might not be worth it and patching __aeabi_idiv instead might be a good enough compromize. Nicolas
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 6a1b8a81b1..379cffe4ab 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -383,6 +383,34 @@ static void __init cpuid_init_hwcaps(void) elf_hwcap |= HWCAP_IDIVT; } + /* + * Patch our division routines with the corresponding opcode + * if the hardware supports it. + */ + if (IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVT)) { + extern char __aeabi_uidiv, __aeabi_idiv; + u16 *uidiv = (u16 *)&__aeabi_uidiv; + u16 *idiv = (u16 *)&__aeabi_idiv; + + uidiv[0] = 0xfbb0; /* udiv r0, r0, r1 */ + uidiv[1] = 0xf0f1; + uidiv[2] = 0x4770; /* bx lr */ + + idiv[0] = 0xfb90; /* sdiv r0, r0, r1 */ + idiv[1] = 0xf0f1; + idiv[2] = 0x4770; /* bx lr */ + } else if (!IS_ENABLED(CONFIG_THUMB2_KERNEL) && (elf_hwcap & HWCAP_IDIVA)) { + extern char __aeabi_uidiv, __aeabi_idiv; + u32 *uidiv = (u32 *)&__aeabi_uidiv; + u32 *idiv = (u32 *)&__aeabi_idiv; + + uidiv[0] = 0xe730f110; /* udiv r0, r0, r1 */ + uidiv[1] = 0xe12fff1e; /* bx lr */ + + idiv[0] = 0xe710f110; /* sdiv r0, r0, r1 */ + idiv[1] = 0xe12fff1e; /* bx lr */ + } + /* LPAE implies atomic ldrd/strd instructions */ vmsa = (read_cpuid_ext(CPUID_EXT_MMFR0) & 0xf) >> 0; if (vmsa >= 5)