diff mbox

[v3] ARM: add get_user() support for 8 byte types

Message ID 20140612155843.GK23430@n2100.arm.linux.org.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Russell King - ARM Linux June 12, 2014, 3:58 p.m. UTC
On Thu, Jun 12, 2014 at 04:42:35PM +0100, Daniel Thompson wrote:
> A new atomic modeset/pageflip ioctl being developed in DRM requires
> get_user() to work for 64bit types (in addition to just put_user()).
> 
> v1: original
> v2: pass correct size to check_uaccess, and better handling of narrowing
>     double word read with __get_user_xb() (Russell King's suggestion)
> v3: fix a couple of checkpatch issues

This is still unsafe.

>  #define __get_user_check(x,p)							\
>  	({								\
>  		unsigned long __limit = current_thread_info()->addr_limit - 1; \
>  		register const typeof(*(p)) __user *__p asm("r0") = (p);\
> -		register unsigned long __r2 asm("r2");			\
> +		register typeof(x) __r2 asm("r2");			\

So, __r2 becomes the type of 'x'.  If 'x' is a 64-bit type, and *p is
an 8-bit, 16-bit, or 32-bit type, this fails horribly by leaving the
upper word of __r2 undefined.

__r2 must follow the size of the value we are reading.  I think the
last solution which was proposed was this:

 arch/arm/include/asm/uaccess.h |   17 +++++++++++++----
 arch/arm/lib/getuser.S         |   33 ++++++++++++++++++++++++++++++++-
 2 files changed, 45 insertions(+), 5 deletions(-)

Comments

Daniel Thompson June 17, 2014, 10:17 a.m. UTC | #1
On 12/06/14 16:58, Russell King - ARM Linux wrote:
> On Thu, Jun 12, 2014 at 04:42:35PM +0100, Daniel Thompson wrote:
>> A new atomic modeset/pageflip ioctl being developed in DRM requires
>> get_user() to work for 64bit types (in addition to just put_user()).
>>
>> v1: original
>> v2: pass correct size to check_uaccess, and better handling of narrowing
>>     double word read with __get_user_xb() (Russell King's suggestion)
>> v3: fix a couple of checkpatch issues
> 
> This is still unsafe.
> 
>>  #define __get_user_check(x,p)							\
>>  	({								\
>>  		unsigned long __limit = current_thread_info()->addr_limit - 1; \
>>  		register const typeof(*(p)) __user *__p asm("r0") = (p);\
>> -		register unsigned long __r2 asm("r2");			\
>> +		register typeof(x) __r2 asm("r2");			\
> 
> So, __r2 becomes the type of 'x'.  If 'x' is a 64-bit type, and *p is
> an 8-bit, 16-bit, or 32-bit type, this fails horribly by leaving the
> upper word of __r2 undefined.

It is true that after the switch statement the contents of r3 are
undefined. However...


>  		register unsigned long __l asm("r1") = __limit; \
>  		register int __e asm("r0");			\
>  		switch (sizeof(*(__p))) {			\
> @@ -142,6 +152,12 @@ extern int __get_user_4(void *);
>  		case 4:						\
>  			__get_user_x(__r2, __p, __e, __l, 4);	\
>  			break;					\
> +		case 8:						\
> +			if (sizeof((x)) < 8)			\
> +				__get_user_xb(__r2, __p, __e, __l, 4);\
> +			else					\
> +				__get_user_x(__r2, __p, __e, __l, 8);\
> +			break;					\
>  		default: __e = __get_user_bad(); break;		\
>  		}						\
>  		x = (typeof(*(p))) __r2;			\

... at this point there is a narrowing cast followed by an implicit
widening. This results in the compiler either ignoring r3 altogether or, if
spilling to the stack, generating code to set r3 to zero before doing
the store.

I think this approach also makes 8-bit and 16-bit get_user() faster in
some cases where the types of *p and x are similar 8- or 16-bit types.
This is because the compiler will never generate redundant narrowings.

Note that the speed improvement looks extremely marginal; the size of
the .text section (for a multi_v7_defconfig kernel) only gets 96 bytes
smaller (a.k.a. 0.0015%).


Daniel.
Russell King - ARM Linux June 17, 2014, 11:09 a.m. UTC | #2
On Tue, Jun 17, 2014 at 11:17:23AM +0100, Daniel Thompson wrote:
> ... at this point there is a narrowing cast followed by an implicit
> widening. This results in compiler either ignoring r3 altogether or, if
> spilling to the stack, generating code to set r3 to zero before doing
> the store.

In actual fact, there's very little difference between the two
implementations in terms of generated code.

The difference between them is what happens on the 64-bit big endian
narrowing case, where we use __get_user_4 with your version.  This
adds one additional instruction.

The little endian case results in identical code except for register
usage - for example, with my test for a 32-bit being widened to 64-bit:

        str     lr, [sp, #-4]!
-       mov     r3, r0
+       mov     ip, r0
        mov     r0, r1
 #APP
 @ 280 "t-getuser.c" 1
        bl      __get_user_4
 @ 0 "" 2
-       str     r2, [r3, #0]
-       mov     r2, #0
-       str     r2, [r3, #4]
+       mov     r3, #0
+       str     r2, [ip, #0]
+       str     r3, [ip, #4]
        ldr     pc, [sp], #4

and 64-bit narrowed to 32-bit:

        str     lr, [sp, #-4]!
-       mov     ip, r0
+       mov     r3, r0
        mov     r0, r1
 #APP
 @ 275 "t-getuser.c" 1
-       bl      __get_user_8
+       bl      __get_user_4
 @ 0 "" 2
-       str     r2, [ip, #0]
+       str     r2, [r3, #0]
        ldr     pc, [sp], #4

In terms of type checking, both seem to get it correct (which is something
I'm concerned about by any implementation since this is just as important
as the generated code).
Daniel Thompson June 17, 2014, 1:28 p.m. UTC | #3
On 17/06/14 12:09, Russell King - ARM Linux wrote:
> On Tue, Jun 17, 2014 at 11:17:23AM +0100, Daniel Thompson wrote:
>> ... at this point there is a narrowing cast followed by an implicit
>> widening. This results in compiler either ignoring r3 altogether or, if
>> spilling to the stack, generating code to set r3 to zero before doing
>> the store.
> 
> In actual fact, there's very little difference between the two
> implementations in terms of generated code.
> 
> The difference between them is what happens on the 64-bit big endian
> narrowing case, where we use __get_user_4 with your version.  This
> adds one additional instruction.

Good point.


> and 64-bit narrowed to 32-bit:
> 
>         str     lr, [sp, #-4]!
> -       mov     ip, r0
> +       mov     r3, r0
>         mov     r0, r1
>  #APP
>  @ 275 "t-getuser.c" 1
> -       bl      __get_user_8
> +       bl      __get_user_4
>  @ 0 "" 2
> -       str     r2, [ip, #0]
> +       str     r2, [r3, #0]
>         ldr     pc, [sp], #4

The latter case avoids allocating r3 for the __get_user_x and should
reduce register pressure and, potentially, saves a few instructions
elsewhere (one of my rather large test functions does demonstrate this
effect).

I don't know if we care about that. If we do I'm certainly happy to put
a patch together that exploits this (whilst avoiding the add in the big
endian case).


Daniel.
Russell King - ARM Linux June 17, 2014, 1:36 p.m. UTC | #4
On Tue, Jun 17, 2014 at 02:28:44PM +0100, Daniel Thompson wrote:
> On 17/06/14 12:09, Russell King - ARM Linux wrote:
> > On Tue, Jun 17, 2014 at 11:17:23AM +0100, Daniel Thompson wrote:
> >> ... at this point there is a narrowing cast followed by an implicit
> >> widening. This results in compiler either ignoring r3 altogether or, if
> >> spilling to the stack, generating code to set r3 to zero before doing
> >> the store.
> > 
> > In actual fact, there's very little difference between the two
> > implementations in terms of generated code.
> > 
> > The difference between them is what happens on the 64-bit big endian
> > narrowing case, where we use __get_user_4 with your version.  This
> > adds one additional instruction.
> 
> Good point.
> 
> 
> > and 64-bit narrowed to 32-bit:
> > 
> >         str     lr, [sp, #-4]!
> > -       mov     ip, r0
> > +       mov     r3, r0
> >         mov     r0, r1
> >  #APP
> >  @ 275 "t-getuser.c" 1
> > -       bl      __get_user_8
> > +       bl      __get_user_4
> >  @ 0 "" 2
> > -       str     r2, [ip, #0]
> > +       str     r2, [r3, #0]
> >         ldr     pc, [sp], #4
> 
> The later case avoids allocating r3 for the __get_user_x and should
> reduce register pressure and, potentially, saves a few instructions
> elsewhere (one of my rather large test functions does demonstrate this
> effect).
> 
> I don't know if we care about that. If we do I'm certainly happy to put
> a patch together than exploits this (whilst avoiding the add in the big
> endian case).

No need - the + case is your version, the - case is my version.  So your
version wins on this point. :)
Daniel Thompson June 17, 2014, 1:54 p.m. UTC | #5
On 17/06/14 14:36, Russell King - ARM Linux wrote:
> On Tue, Jun 17, 2014 at 02:28:44PM +0100, Daniel Thompson wrote:
>> On 17/06/14 12:09, Russell King - ARM Linux wrote:
>>> On Tue, Jun 17, 2014 at 11:17:23AM +0100, Daniel Thompson wrote:
>>>> ... at this point there is a narrowing cast followed by an implicit
>>>> widening. This results in compiler either ignoring r3 altogether or, if
>>>> spilling to the stack, generating code to set r3 to zero before doing
>>>> the store.
>>>
>>> In actual fact, there's very little difference between the two
>>> implementations in terms of generated code.
>>>
>>> The difference between them is what happens on the 64-bit big endian
>>> narrowing case, where we use __get_user_4 with your version.  This
>>> adds one additional instruction.
>>
>> Good point.
>>
>>
>>> and 64-bit narrowed to 32-bit:
>>>
>>>         str     lr, [sp, #-4]!
>>> -       mov     ip, r0
>>> +       mov     r3, r0
>>>         mov     r0, r1
>>>  #APP
>>>  @ 275 "t-getuser.c" 1
>>> -       bl      __get_user_8
>>> +       bl      __get_user_4
>>>  @ 0 "" 2
>>> -       str     r2, [ip, #0]
>>> +       str     r2, [r3, #0]
>>>         ldr     pc, [sp], #4
>>
>> The later case avoids allocating r3 for the __get_user_x and should
>> reduce register pressure and, potentially, saves a few instructions
>> elsewhere (one of my rather large test functions does demonstrate this
>> effect).
>>
>> I don't know if we care about that. If we do I'm certainly happy to put
>> a patch together than exploits this (whilst avoiding the add in the big
>> endian case).
> 
> No need - the + case is your version, the - case is my version.  So your
> version wins on this point. :)

:) Thanks, although credit really goes to Rob Clark...

I think currently:

1. Rob's patch is better for register pressure in the narrowing case
   (above).

2. Your patch is probably better for big endian due to the add in Rob's
   version. I say probably because, without proof, I suspect the cost
   of the add would in most cases outweigh the register pressure
   benefit.

3. Your patch has better implementation of __get_user_8 (it uses ldrd).

Hence I suspect we need to combine elements from both patches.
diff mbox

Patch

diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
index 72abdc5..747f2cb 100644
--- a/arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@ -93,6 +93,9 @@  static inline void set_fs(mm_segment_t fs)
 		: "cc"); \
 	flag; })
 
+#define __inttype(x) \
+	__typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL))
+
 /*
  * Single-value transfer routines.  They automatically use the right
  * size if we just have the right pointer type.  Note that the functions
@@ -107,14 +110,16 @@  static inline void set_fs(mm_segment_t fs)
 extern int __get_user_1(void *);
 extern int __get_user_2(void *);
 extern int __get_user_4(void *);
+extern int __get_user_8(void *);
 
-#define __GUP_CLOBBER_1	"lr", "cc"
+#define __GUP_CLOBBER_1 "lr", "cc"
 #ifdef CONFIG_CPU_USE_DOMAINS
-#define __GUP_CLOBBER_2	"ip", "lr", "cc"
+#define __GUP_CLOBBER_2 "ip", "lr", "cc"
 #else
 #define __GUP_CLOBBER_2 "lr", "cc"
 #endif
-#define __GUP_CLOBBER_4	"lr", "cc"
+#define __GUP_CLOBBER_4 "lr", "cc"
+#define __GUP_CLOBBER_8 "lr", "cc"
 
 #define __get_user_x(__r2,__p,__e,__l,__s)				\
 	   __asm__ __volatile__ (					\
@@ -129,7 +134,7 @@  extern int __get_user_4(void *);
 	({								\
 		unsigned long __limit = current_thread_info()->addr_limit - 1; \
 		register const typeof(*(p)) __user *__p asm("r0") = (p);\
-		register unsigned long __r2 asm("r2");			\
+		register __inttype(*p) __r2 asm("r2");			\
 		register unsigned long __l asm("r1") = __limit;		\
 		register int __e asm("r0");				\
 		switch (sizeof(*(__p))) {				\
@@ -142,6 +147,9 @@  extern int __get_user_4(void *);
 		case 4:							\
 			__get_user_x(__r2, __p, __e, __l, 4);		\
 			break;						\
+		case 8:							\
+			__get_user_x(__r2, __p, __e, __l, 8);		\
+			break;						\
 		default: __e = __get_user_bad(); break;			\
 		}							\
 		x = (typeof(*(p))) __r2;				\
@@ -150,6 +158,7 @@  extern int __get_user_4(void *);
 
 #define get_user(x,p)							\
 	({								\
+		__chk_user_ptr(ptr);					\
 		might_fault();						\
 		__get_user_check(x,p);					\
 	 })
diff --git a/arch/arm/lib/getuser.S b/arch/arm/lib/getuser.S
index 9b06bb4..3583c83 100644
--- a/arch/arm/lib/getuser.S
+++ b/arch/arm/lib/getuser.S
@@ -18,7 +18,7 @@ 
  * Inputs:	r0 contains the address
  *		r1 contains the address limit, which must be preserved
  * Outputs:	r0 is the error code
- *		r2 contains the zero-extended value
+ *		r2/r3 contains the zero-extended value
  *		lr corrupted
  *
  * No other registers must be altered.  (see <asm/uaccess.h>
@@ -32,6 +32,14 @@ 
 #include <asm/errno.h>
 #include <asm/domain.h>
 
+#ifdef __ARMEB__
+#define rlo8	r3
+#define rhi8	r2
+#else
+#define	rlo8	r2
+#define rhi8	r3
+#endif
+
 ENTRY(__get_user_1)
 	check_uaccess r0, 1, r1, r2, __get_user_bad
 1: TUSER(ldrb)	r2, [r0]
@@ -66,15 +74,38 @@  ENTRY(__get_user_4)
 	mov	pc, lr
 ENDPROC(__get_user_4)
 
+ENTRY(__get_user_8)
+	check_uaccess r0, 4, r1, r2, __get_user_bad8
+#ifdef CONFIG_CPU_USE_DOMAINS
+#define	GU8_FIXUPS	5b, 6b
+5:	ldrt	rlo8, [r0], #4
+6:	ldrt	rhi8, [r0], #0
+#elif __LINUX_ARM_ARCH__ >= 6
+#define	GU8_FIXUPS	5b
+5:	ldrd	r2, [r0]
+#else
+#define	GU8_FIXUPS	5b, 6b
+5:	ldr	rlo8, [r0, #0]
+6:	ldr	rhi8, [r0, #4]
+#endif
+	mov	r0, #0
+	mov	pc, lr
+
+__get_user_bad8:
+	mov	r3, #0
 __get_user_bad:
 	mov	r2, #0
 	mov	r0, #-EFAULT
 	mov	pc, lr
 ENDPROC(__get_user_bad)
+ENDPROC(__get_user_bad8)
 
 .pushsection __ex_table, "a"
 	.long	1b, __get_user_bad
 	.long	2b, __get_user_bad
 	.long	3b, __get_user_bad
 	.long	4b, __get_user_bad
+	.irp	param, GU8_FIXUPS
+	.long	\param, __get_user_bad8
+	.endr
 .popsection