[2/4] arm64: Avoid masking "old" for LSE cmpxchg() implementation

Message ID 1543347887-21101-3-git-send-email-will.deacon@arm.com (mailing list archive)
State New, archived
Series Rewrite of percpu atomics and introduction of LSE

Commit Message

Will Deacon Nov. 27, 2018, 7:44 p.m. UTC
The CAS instructions implicitly access only the relevant bits of the "old"
argument, so there is no need for explicit masking via type-casting as
there is in the LL/SC implementation.

Move the casting into the LL/SC code and remove it altogether for the LSE
implementation.

Signed-off-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm64/include/asm/atomic_ll_sc.h | 8 ++++++++
 arch/arm64/include/asm/atomic_lse.h   | 4 ++--
 arch/arm64/include/asm/cmpxchg.h      | 4 ++--
 3 files changed, 12 insertions(+), 4 deletions(-)
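
As a rough illustration of the masking issue (a standalone sketch, not code
taken from the patch; the helper names are made up), the LL/SC sequence
compares the zero-extended value loaded by ldxrb/ldxrh against the register
holding "old", whereas CASB/CASH behave as if only the low 8/16 bits of both
operands were compared:

#include <stdbool.h>
#include <stdint.h>

/* Model of the 8-bit LL/SC compare: "loaded" stands for the zero-extended
 * result of ldxrb, "old_reg" for the (possibly unmasked) register holding
 * "old". */
static bool llsc_compare_model_u8(unsigned long old_reg, uint8_t loaded)
{
	/*
	 * Without the (u8) cast this patch adds to atomic_ll_sc.h, stray
	 * upper bits in old_reg make the compare fail even when the low
	 * byte matches.
	 */
	return (unsigned long)loaded == old_reg;
}

/* Model of the LSE compare: CASB only considers the low 8 bits of "old". */
static bool cas_compare_model_u8(unsigned long old_reg, uint8_t loaded)
{
	return loaded == (uint8_t)old_reg;
}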

Comments

Ard Biesheuvel Dec. 4, 2018, 4:58 p.m. UTC | #1
On Tue, 27 Nov 2018 at 20:44, Will Deacon <will.deacon@arm.com> wrote:
>
> The CAS instructions implicitly access only the relevant bits of the "old"
> argument, so there is no need for explicit masking via type-casting as
> there is in the LL/SC implementation.
>
> Move the casting into the LL/SC code and remove it altogether for the LSE
> implementation.
>
> Signed-off-by: Will Deacon <will.deacon@arm.com>
> ---
>  arch/arm64/include/asm/atomic_ll_sc.h | 8 ++++++++
>  arch/arm64/include/asm/atomic_lse.h   | 4 ++--
>  arch/arm64/include/asm/cmpxchg.h      | 4 ++--
>  3 files changed, 12 insertions(+), 4 deletions(-)
>
> diff --git a/arch/arm64/include/asm/atomic_ll_sc.h b/arch/arm64/include/asm/atomic_ll_sc.h
> index f02d3bf7b9e6..b53f70dd6e10 100644
> --- a/arch/arm64/include/asm/atomic_ll_sc.h
> +++ b/arch/arm64/include/asm/atomic_ll_sc.h
> @@ -257,6 +257,14 @@ __LL_SC_PREFIX(__cmpxchg_case_##name##sz(volatile void *ptr,               \
>         unsigned long tmp;                                              \
>         u##sz oldval;                                                   \
>                                                                         \
> +       /*                                                              \
> +        * Sub-word sizes require explicit casting so that the compare  \
> +        * part of the cmpxchg doesn't end up interpreting non-zero     \
> +        * upper bits of the register containing "old".                 \
> +        */                                                             \
> +       if (sz < 32)                                                    \
> +               old = (u##sz)old;                                       \
> +                                                                       \
>         asm volatile(                                                   \
>         "       prfm    pstl1strm, %[v]\n"                              \
>         "1:     ld" #acq "xr" #sfx "\t%" #w "[oldval], %[v]\n"          \
> diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
> index 4d6f917b654e..a424355240c5 100644
> --- a/arch/arm64/include/asm/atomic_lse.h
> +++ b/arch/arm64/include/asm/atomic_lse.h
> @@ -448,11 +448,11 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
>
>  #define __CMPXCHG_CASE(w, sfx, name, sz, mb, cl...)                    \
>  static inline u##sz __cmpxchg_case_##name##sz(volatile void *ptr,      \
> -                                             unsigned long old,        \
> +                                             u##sz old,                \
>                                               u##sz new)                \
>  {                                                                      \
>         register unsigned long x0 asm ("x0") = (unsigned long)ptr;      \
> -       register unsigned long x1 asm ("x1") = old;                     \
> +       register u##sz x1 asm ("x1") = old;                             \

This looks backwards to me, but perhaps I am missing something:
changing from unsigned long to a narrower type makes it the compiler's
job to perform the cast, no? Given that 'cas' ignores the upper bits
anyway, what does this change buy us?



>         register u##sz x2 asm ("x2") = new;                             \
>                                                                         \
>         asm volatile(ARM64_LSE_ATOMIC_INSN(                             \
> diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h
> index 1f0340fc6dad..3f9376f1c409 100644
> --- a/arch/arm64/include/asm/cmpxchg.h
> +++ b/arch/arm64/include/asm/cmpxchg.h
> @@ -123,9 +123,9 @@ static inline unsigned long __cmpxchg##sfx(volatile void *ptr,              \
>  {                                                                      \
>         switch (size) {                                                 \
>         case 1:                                                         \
> -               return __cmpxchg_case##sfx##_8(ptr, (u8)old, new);      \
> +               return __cmpxchg_case##sfx##_8(ptr, old, new);          \
>         case 2:                                                         \
> -               return __cmpxchg_case##sfx##_16(ptr, (u16)old, new);    \
> +               return __cmpxchg_case##sfx##_16(ptr, old, new);         \
>         case 4:                                                         \
>                 return __cmpxchg_case##sfx##_32(ptr, old, new);         \
>         case 8:                                                         \
> --
> 2.1.4
>
Will Deacon Dec. 7, 2018, 3:49 p.m. UTC | #2
Hi Ard,

On Tue, Dec 04, 2018 at 05:58:50PM +0100, Ard Biesheuvel wrote:
> On Tue, 27 Nov 2018 at 20:44, Will Deacon <will.deacon@arm.com> wrote:
> >
> > The CAS instructions implicitly access only the relevant bits of the "old"
> > argument, so there is no need for explicit masking via type-casting as
> > there is in the LL/SC implementation.
> >
> > Move the casting into the LL/SC code and remove it altogether for the LSE
> > implementation.
> >
> > Signed-off-by: Will Deacon <will.deacon@arm.com>
> > ---
> >  arch/arm64/include/asm/atomic_ll_sc.h | 8 ++++++++
> >  arch/arm64/include/asm/atomic_lse.h   | 4 ++--
> >  arch/arm64/include/asm/cmpxchg.h      | 4 ++--
> >  3 files changed, 12 insertions(+), 4 deletions(-)
> >
> > diff --git a/arch/arm64/include/asm/atomic_ll_sc.h b/arch/arm64/include/asm/atomic_ll_sc.h
> > index f02d3bf7b9e6..b53f70dd6e10 100644
> > --- a/arch/arm64/include/asm/atomic_ll_sc.h
> > +++ b/arch/arm64/include/asm/atomic_ll_sc.h
> > @@ -257,6 +257,14 @@ __LL_SC_PREFIX(__cmpxchg_case_##name##sz(volatile void *ptr,               \
> >         unsigned long tmp;                                              \
> >         u##sz oldval;                                                   \
> >                                                                         \
> > +       /*                                                              \
> > +        * Sub-word sizes require explicit casting so that the compare  \
> > +        * part of the cmpxchg doesn't end up interpreting non-zero     \
> > +        * upper bits of the register containing "old".                 \
> > +        */                                                             \
> > +       if (sz < 32)                                                    \
> > +               old = (u##sz)old;                                       \
> > +                                                                       \
> >         asm volatile(                                                   \
> >         "       prfm    pstl1strm, %[v]\n"                              \
> >         "1:     ld" #acq "xr" #sfx "\t%" #w "[oldval], %[v]\n"          \
> > diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
> > index 4d6f917b654e..a424355240c5 100644
> > --- a/arch/arm64/include/asm/atomic_lse.h
> > +++ b/arch/arm64/include/asm/atomic_lse.h
> > @@ -448,11 +448,11 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
> >
> >  #define __CMPXCHG_CASE(w, sfx, name, sz, mb, cl...)                    \
> >  static inline u##sz __cmpxchg_case_##name##sz(volatile void *ptr,      \
> > -                                             unsigned long old,        \
> > +                                             u##sz old,                \
> >                                               u##sz new)                \
> >  {                                                                      \
> >         register unsigned long x0 asm ("x0") = (unsigned long)ptr;      \
> > -       register unsigned long x1 asm ("x1") = old;                     \
> > +       register u##sz x1 asm ("x1") = old;                             \
> 
> This looks backwards to me, but perhaps I am missing something:
> changing from unsigned long to a narrower type makes it the compiler's
> job to perform the cast, no? Given that 'cas' ignores the upper bits
> anyway, what does this change buy us?

A narrowing cast doesn't actually result in any additional instructions --
the masking occurs if you do a widening cast. In this case, the change I'm
proposing means we avoid the redundant widening casts for the LSE operations
because, as you point out, the underlying instruction only operates on the
relevant bits.

Will
Ard Biesheuvel Dec. 7, 2018, 4:05 p.m. UTC | #3
On Fri, 7 Dec 2018 at 16:49, Will Deacon <will.deacon@arm.com> wrote:
>
> Hi Ard,
>
> On Tue, Dec 04, 2018 at 05:58:50PM +0100, Ard Biesheuvel wrote:
> > On Tue, 27 Nov 2018 at 20:44, Will Deacon <will.deacon@arm.com> wrote:
> > >
> > > The CAS instructions implicitly access only the relevant bits of the "old"
> > > argument, so there is no need for explicit masking via type-casting as
> > > there is in the LL/SC implementation.
> > >
> > > Move the casting into the LL/SC code and remove it altogether for the LSE
> > > implementation.
> > >
> > > Signed-off-by: Will Deacon <will.deacon@arm.com>
> > > ---
> > >  arch/arm64/include/asm/atomic_ll_sc.h | 8 ++++++++
> > >  arch/arm64/include/asm/atomic_lse.h   | 4 ++--
> > >  arch/arm64/include/asm/cmpxchg.h      | 4 ++--
> > >  3 files changed, 12 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/arch/arm64/include/asm/atomic_ll_sc.h b/arch/arm64/include/asm/atomic_ll_sc.h
> > > index f02d3bf7b9e6..b53f70dd6e10 100644
> > > --- a/arch/arm64/include/asm/atomic_ll_sc.h
> > > +++ b/arch/arm64/include/asm/atomic_ll_sc.h
> > > @@ -257,6 +257,14 @@ __LL_SC_PREFIX(__cmpxchg_case_##name##sz(volatile void *ptr,               \
> > >         unsigned long tmp;                                              \
> > >         u##sz oldval;                                                   \
> > >                                                                         \
> > > +       /*                                                              \
> > > +        * Sub-word sizes require explicit casting so that the compare  \
> > > +        * part of the cmpxchg doesn't end up interpreting non-zero     \
> > > +        * upper bits of the register containing "old".                 \
> > > +        */                                                             \
> > > +       if (sz < 32)                                                    \
> > > +               old = (u##sz)old;                                       \
> > > +                                                                       \
> > >         asm volatile(                                                   \
> > >         "       prfm    pstl1strm, %[v]\n"                              \
> > >         "1:     ld" #acq "xr" #sfx "\t%" #w "[oldval], %[v]\n"          \
> > > diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
> > > index 4d6f917b654e..a424355240c5 100644
> > > --- a/arch/arm64/include/asm/atomic_lse.h
> > > +++ b/arch/arm64/include/asm/atomic_lse.h
> > > @@ -448,11 +448,11 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
> > >
> > >  #define __CMPXCHG_CASE(w, sfx, name, sz, mb, cl...)                    \
> > >  static inline u##sz __cmpxchg_case_##name##sz(volatile void *ptr,      \
> > > -                                             unsigned long old,        \
> > > +                                             u##sz old,                \
> > >                                               u##sz new)                \
> > >  {                                                                      \
> > >         register unsigned long x0 asm ("x0") = (unsigned long)ptr;      \
> > > -       register unsigned long x1 asm ("x1") = old;                     \
> > > +       register u##sz x1 asm ("x1") = old;                             \
> >
> > This looks backwards to me, but perhaps I am missing something:
> > changing from unsigned long to a narrower type makes it the compiler's
> > job to perform the cast, no? Given that 'cas' ignores the upper bits
> > anyway, what does this change buy us?
>
> A narrowing cast doesn't actually result in any additional instructions --
> the masking occurs if you do a widening cast. In this case, the change I'm
> proposing means we avoid the redundant widening casts for the LSE operations
> because, as you point out, the underlying instruction only operates on the
> relevant bits.
>

Playing around with some code, I found out that

static inline void foo(unsigned char x)
{
    asm("" :: "r"(x));
}

void bar(unsigned long l)
{
    foo(l);
}

produces different code than

static inline void foo(unsigned long x)
{
    asm("" :: "r"(x));
}

void bar(unsigned long l)
{
    foo((unsigned char)l);
}

so what you are saying appears to be accurate. Whether it is correct
is another matter, though, since the 'unsigned char' argument passed
into the asm() block may have higher bits set.
Will Deacon Dec. 7, 2018, 4:22 p.m. UTC | #4
On Fri, Dec 07, 2018 at 05:05:16PM +0100, Ard Biesheuvel wrote:
> On Fri, 7 Dec 2018 at 16:49, Will Deacon <will.deacon@arm.com> wrote:
> >
> > Hi Ard,
> >
> > On Tue, Dec 04, 2018 at 05:58:50PM +0100, Ard Biesheuvel wrote:
> > > On Tue, 27 Nov 2018 at 20:44, Will Deacon <will.deacon@arm.com> wrote:
> > > >
> > > > The CAS instructions implicitly access only the relevant bits of the "old"
> > > > argument, so there is no need for explicit masking via type-casting as
> > > > there is in the LL/SC implementation.
> > > >
> > > > Move the casting into the LL/SC code and remove it altogether for the LSE
> > > > implementation.
> > > >
> > > > Signed-off-by: Will Deacon <will.deacon@arm.com>
> > > > ---
> > > >  arch/arm64/include/asm/atomic_ll_sc.h | 8 ++++++++
> > > >  arch/arm64/include/asm/atomic_lse.h   | 4 ++--
> > > >  arch/arm64/include/asm/cmpxchg.h      | 4 ++--
> > > >  3 files changed, 12 insertions(+), 4 deletions(-)
> > > >
> > > > diff --git a/arch/arm64/include/asm/atomic_ll_sc.h b/arch/arm64/include/asm/atomic_ll_sc.h
> > > > index f02d3bf7b9e6..b53f70dd6e10 100644
> > > > --- a/arch/arm64/include/asm/atomic_ll_sc.h
> > > > +++ b/arch/arm64/include/asm/atomic_ll_sc.h
> > > > @@ -257,6 +257,14 @@ __LL_SC_PREFIX(__cmpxchg_case_##name##sz(volatile void *ptr,               \
> > > >         unsigned long tmp;                                              \
> > > >         u##sz oldval;                                                   \
> > > >                                                                         \
> > > > +       /*                                                              \
> > > > +        * Sub-word sizes require explicit casting so that the compare  \
> > > > +        * part of the cmpxchg doesn't end up interpreting non-zero     \
> > > > +        * upper bits of the register containing "old".                 \
> > > > +        */                                                             \
> > > > +       if (sz < 32)                                                    \
> > > > +               old = (u##sz)old;                                       \
> > > > +                                                                       \
> > > >         asm volatile(                                                   \
> > > >         "       prfm    pstl1strm, %[v]\n"                              \
> > > >         "1:     ld" #acq "xr" #sfx "\t%" #w "[oldval], %[v]\n"          \
> > > > diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
> > > > index 4d6f917b654e..a424355240c5 100644
> > > > --- a/arch/arm64/include/asm/atomic_lse.h
> > > > +++ b/arch/arm64/include/asm/atomic_lse.h
> > > > @@ -448,11 +448,11 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
> > > >
> > > >  #define __CMPXCHG_CASE(w, sfx, name, sz, mb, cl...)                    \
> > > >  static inline u##sz __cmpxchg_case_##name##sz(volatile void *ptr,      \
> > > > -                                             unsigned long old,        \
> > > > +                                             u##sz old,                \
> > > >                                               u##sz new)                \
> > > >  {                                                                      \
> > > >         register unsigned long x0 asm ("x0") = (unsigned long)ptr;      \
> > > > -       register unsigned long x1 asm ("x1") = old;                     \
> > > > +       register u##sz x1 asm ("x1") = old;                             \
> > >
> > > This looks backwards to me, but perhaps I am missing something:
> > > changing from unsigned long to a narrower type makes it the compiler's
> > > job to perform the cast, no? Given that 'cas' ignores the upper bits
> > > anyway, what does this change buy us?
> >
> > A narrowing cast doesn't actually result in any additional instructions --
> > the masking occurs if you do a widening cast. In this case, the change I'm
> > proposing means we avoid the redundant widening casts for the LSE operations
> > because, as you point out, the underlying instruction only operates on the
> > relevant bits.
> >
> 
> Playing around with some code, I found out that
> 
> static inline void foo(unsigned char x)
> {
>     asm("" :: "r"(x));
> }
> 
> void bar(unsigned long l)
> {
>     foo(l);
> }
> 
> produces different code than
> 
> static inline void foo(unsigned long x)
> {
>     asm("" :: "r"(x));
> }
> 
> void bar(unsigned long l)
> {
>     foo((unsigned char)l);
> }
> 
> so what you are saying appears to be accurate. Whether it is correct
> is another matter, though, since the 'unsigned char' argument passed
> into the asm() block may have higher bits set.

Right, which is why I've kept the casting for sizes < 32 bits in the LL/SC
code.

Will
Ard Biesheuvel Dec. 7, 2018, 5:03 p.m. UTC | #5
On Fri, 7 Dec 2018 at 17:22, Will Deacon <will.deacon@arm.com> wrote:
>
> On Fri, Dec 07, 2018 at 05:05:16PM +0100, Ard Biesheuvel wrote:
> > On Fri, 7 Dec 2018 at 16:49, Will Deacon <will.deacon@arm.com> wrote:
> > >
> > > Hi Ard,
> > >
> > > On Tue, Dec 04, 2018 at 05:58:50PM +0100, Ard Biesheuvel wrote:
> > > > On Tue, 27 Nov 2018 at 20:44, Will Deacon <will.deacon@arm.com> wrote:
> > > > >
> > > > > The CAS instructions implicitly access only the relevant bits of the "old"
> > > > > argument, so there is no need for explicit masking via type-casting as
> > > > > there is in the LL/SC implementation.
> > > > >
> > > > > Move the casting into the LL/SC code and remove it altogether for the LSE
> > > > > implementation.
> > > > >
> > > > > Signed-off-by: Will Deacon <will.deacon@arm.com>
> > > > > ---
> > > > >  arch/arm64/include/asm/atomic_ll_sc.h | 8 ++++++++
> > > > >  arch/arm64/include/asm/atomic_lse.h   | 4 ++--
> > > > >  arch/arm64/include/asm/cmpxchg.h      | 4 ++--
> > > > >  3 files changed, 12 insertions(+), 4 deletions(-)
> > > > >
> > > > > diff --git a/arch/arm64/include/asm/atomic_ll_sc.h b/arch/arm64/include/asm/atomic_ll_sc.h
> > > > > index f02d3bf7b9e6..b53f70dd6e10 100644
> > > > > --- a/arch/arm64/include/asm/atomic_ll_sc.h
> > > > > +++ b/arch/arm64/include/asm/atomic_ll_sc.h
> > > > > @@ -257,6 +257,14 @@ __LL_SC_PREFIX(__cmpxchg_case_##name##sz(volatile void *ptr,               \
> > > > >         unsigned long tmp;                                              \
> > > > >         u##sz oldval;                                                   \
> > > > >                                                                         \
> > > > > +       /*                                                              \
> > > > > +        * Sub-word sizes require explicit casting so that the compare  \
> > > > > +        * part of the cmpxchg doesn't end up interpreting non-zero     \
> > > > > +        * upper bits of the register containing "old".                 \
> > > > > +        */                                                             \
> > > > > +       if (sz < 32)                                                    \
> > > > > +               old = (u##sz)old;                                       \
> > > > > +                                                                       \
> > > > >         asm volatile(                                                   \
> > > > >         "       prfm    pstl1strm, %[v]\n"                              \
> > > > >         "1:     ld" #acq "xr" #sfx "\t%" #w "[oldval], %[v]\n"          \
> > > > > diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
> > > > > index 4d6f917b654e..a424355240c5 100644
> > > > > --- a/arch/arm64/include/asm/atomic_lse.h
> > > > > +++ b/arch/arm64/include/asm/atomic_lse.h
> > > > > @@ -448,11 +448,11 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
> > > > >
> > > > >  #define __CMPXCHG_CASE(w, sfx, name, sz, mb, cl...)                    \
> > > > >  static inline u##sz __cmpxchg_case_##name##sz(volatile void *ptr,      \
> > > > > -                                             unsigned long old,        \
> > > > > +                                             u##sz old,                \
> > > > >                                               u##sz new)                \
> > > > >  {                                                                      \
> > > > >         register unsigned long x0 asm ("x0") = (unsigned long)ptr;      \
> > > > > -       register unsigned long x1 asm ("x1") = old;                     \
> > > > > +       register u##sz x1 asm ("x1") = old;                             \
> > > >
> > > > This looks backwards to me, but perhaps I am missing something:
> > > > changing from unsigned long to a narrower type makes it the compiler's
> > > > job to perform the cast, no? Given that 'cas' ignores the upper bits
> > > > anyway, what does this change buy us?
> > >
> > > A narrowing cast doesn't actually result in any additional instructions --
> > > the masking occurs if you do a widening cast. In this case, the change I'm
> > > proposing means we avoid the redundant widening casts for the LSE operations
> > > because, as you point out, the underlying instruction only operates on the
> > > relevant bits.
> > >
> >
> > Playing around with some code, I found out that
> >
> > static inline void foo(unsigned char x)
> > {
> >     asm("" :: "r"(x));
> > }
> >
> > void bar(unsigned long l)
> > {
> >     foo(l);
> > }
> >
> > produces different code than
> >
> > static inline void foo(unsigned long x)
> > {
> >     asm("" :: "r"(x));
> > }
> >
> > void bar(unsigned long l)
> > {
> >     foo((unsigned char)l);
> > }
> >
> > so what you are saying appears to be accurate. Whether it is correct
> > is another matter, though, since the 'unsigned char' argument passed
> > into the asm() block may have higher bits set.
>
> Right, which is why I've kept the casting for sizes < 32 bits in the LL/SC
> code.
>

OK.

So if we are relying on the cast being performed implicitly by the cas
instruction, does it really make sense to change the prototype?
Shouldn't we keep it at unsigned long so that we explicitly pass the
whole value in (but use an explicit cast in the LL/SC implementation
as you did)?
Will Deacon Dec. 7, 2018, 5:15 p.m. UTC | #6
On Fri, Dec 07, 2018 at 06:03:35PM +0100, Ard Biesheuvel wrote:
> On Fri, 7 Dec 2018 at 17:22, Will Deacon <will.deacon@arm.com> wrote:
> > On Fri, Dec 07, 2018 at 05:05:16PM +0100, Ard Biesheuvel wrote:
> > > On Fri, 7 Dec 2018 at 16:49, Will Deacon <will.deacon@arm.com> wrote:
> > > > On Tue, Dec 04, 2018 at 05:58:50PM +0100, Ard Biesheuvel wrote:
> > > > > On Tue, 27 Nov 2018 at 20:44, Will Deacon <will.deacon@arm.com> wrote:
> > > > > > diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
> > > > > > index 4d6f917b654e..a424355240c5 100644
> > > > > > --- a/arch/arm64/include/asm/atomic_lse.h
> > > > > > +++ b/arch/arm64/include/asm/atomic_lse.h
> > > > > > @@ -448,11 +448,11 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
> > > > > >
> > > > > >  #define __CMPXCHG_CASE(w, sfx, name, sz, mb, cl...)                    \
> > > > > >  static inline u##sz __cmpxchg_case_##name##sz(volatile void *ptr,      \
> > > > > > -                                             unsigned long old,        \
> > > > > > +                                             u##sz old,                \
> > > > > >                                               u##sz new)                \
> > > > > >  {                                                                      \
> > > > > >         register unsigned long x0 asm ("x0") = (unsigned long)ptr;      \
> > > > > > -       register unsigned long x1 asm ("x1") = old;                     \
> > > > > > +       register u##sz x1 asm ("x1") = old;                             \
> > > > >
> > > > > This looks backwards to me, but perhaps I am missing something:
> > > > > changing from unsigned long to a narrower type makes it the compiler's
> > > > > job to perform the cast, no? Given that 'cas' ignores the upper bits
> > > > > anyway, what does this change buy us?
> > > >
> > > > A narrowing cast doesn't actually result in any additional instructions --
> > > > the masking occurs if you do a widening cast. In this case, the change I'm
> > > > proposing means we avoid the redundant widening casts for the LSE operations
> > > > because, as you point out, the underlying instruction only operates on the
> > > > relevant bits.
> > > >
> > >
> > > Playing around with some code, I found out that
> > >
> > > static inline void foo(unsigned char x)
> > > {
> > >     asm("" :: "r"(x));
> > > }
> > >
> > > void bar(unsigned long l)
> > > {
> > >     foo(l);
> > > }
> > >
> > > produces different code than
> > >
> > > static inline void foo(unsigned long x)
> > > {
> > >     asm("" :: "r"(x));
> > > }
> > >
> > > void bar(unsigned long l)
> > > {
> > >     foo((unsigned char)l);
> > > }
> > >
> > > so what you are saying appears to be accurate. Whether it is correct
> > > is another matter, though, since the 'unsigned char' argument passed
> > > into the asm() block may have higher bits set.
> >
> > Right, which is why I've kept the casting for sizes < 32 bits in the LL/SC
> > code.
> >
> 
> OK.
> 
> So if we are relying on the cast being performed implicitly by the cas
> instruction, does it really make sense to change the prototype?
> Shouldn't we keep it at unsigned long so that we explicitly pass the
> whole value in (but use an explicit cast in the LL/SC implementation
> as you did)?

If we change the prototype of the __cmpxchg_case_* functions so that the
old and new parameters are unsigned long, then we'll get widening casts
(and explicit masking) for any caller of cmpxchg() that passes narrower
types such as u16.

Or are you suggesting something else?

Will
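
To make that trade-off concrete, here is a minimal sketch of the two
prototype choices for the 16-bit case (hypothetical function names, an
empty asm block standing in for the real cas sequence, and AArch64-only
since it names the x1 register; this is not the actual kernel macro):

#include <stdint.h>

/* Prototype as in this patch: "old" is already u16, so an unsigned long
 * value at the call site is narrowed for free and no mask is emitted;
 * the cas instruction ignores the upper bits of the register anyway. */
static inline uint16_t case_16_narrow_proto(volatile void *ptr,
					    uint16_t old, uint16_t new)
{
	register uint16_t x1 asm("x1") = old;

	asm volatile("" : "+r" (x1) : : "memory");
	return x1;
}

/* Keeping "old" as unsigned long instead: the (u16) cast previously used
 * in cmpxchg.h gets widened straight back to unsigned long here, which is
 * what forces the compiler to mask the upper bits. */
static inline uint16_t case_16_wide_proto(volatile void *ptr,
					  unsigned long old, uint16_t new)
{
	register unsigned long x1 asm("x1") = old;

	asm volatile("" : "+r" (x1) : : "memory");
	return (uint16_t)x1;
}

With the narrow prototype a u16 caller passes its value straight through,
which, as noted above, costs no instructions.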
Ard Biesheuvel Dec. 7, 2018, 5:18 p.m. UTC | #7
On Fri, 7 Dec 2018 at 18:15, Will Deacon <will.deacon@arm.com> wrote:
>
> On Fri, Dec 07, 2018 at 06:03:35PM +0100, Ard Biesheuvel wrote:
> > On Fri, 7 Dec 2018 at 17:22, Will Deacon <will.deacon@arm.com> wrote:
> > > On Fri, Dec 07, 2018 at 05:05:16PM +0100, Ard Biesheuvel wrote:
> > > > On Fri, 7 Dec 2018 at 16:49, Will Deacon <will.deacon@arm.com> wrote:
> > > > > On Tue, Dec 04, 2018 at 05:58:50PM +0100, Ard Biesheuvel wrote:
> > > > > > On Tue, 27 Nov 2018 at 20:44, Will Deacon <will.deacon@arm.com> wrote:
> > > > > > > diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
> > > > > > > index 4d6f917b654e..a424355240c5 100644
> > > > > > > --- a/arch/arm64/include/asm/atomic_lse.h
> > > > > > > +++ b/arch/arm64/include/asm/atomic_lse.h
> > > > > > > @@ -448,11 +448,11 @@ static inline long atomic64_dec_if_positive(atomic64_t *v)
> > > > > > >
> > > > > > >  #define __CMPXCHG_CASE(w, sfx, name, sz, mb, cl...)                    \
> > > > > > >  static inline u##sz __cmpxchg_case_##name##sz(volatile void *ptr,      \
> > > > > > > -                                             unsigned long old,        \
> > > > > > > +                                             u##sz old,                \
> > > > > > >                                               u##sz new)                \
> > > > > > >  {                                                                      \
> > > > > > >         register unsigned long x0 asm ("x0") = (unsigned long)ptr;      \
> > > > > > > -       register unsigned long x1 asm ("x1") = old;                     \
> > > > > > > +       register u##sz x1 asm ("x1") = old;                             \
> > > > > >
> > > > > > This looks backwards to me, but perhaps I am missing something:
> > > > > > changing from unsigned long to a narrower type makes it the compiler's
> > > > > > job to perform the cast, no? Given that 'cas' ignores the upper bits
> > > > > > anyway, what does this change buy us?
> > > > >
> > > > > A narrowing cast doesn't actually result in any additional instructions --
> > > > > the masking occurs if you do a widening cast. In this case, the change I'm
> > > > > proposing means we avoid the redundant widening casts for the LSE operations
> > > > > because, as you point out, the underlying instruction only operates on the
> > > > > relevant bits.
> > > > >
> > > >
> > > > Playing around with some code, I found out that
> > > >
> > > > static inline void foo(unsigned char x)
> > > > {
> > > >     asm("" :: "r"(x));
> > > > }
> > > >
> > > > void bar(unsigned long l)
> > > > {
> > > >     foo(l);
> > > > }
> > > >
> > > > produces different code than
> > > >
> > > > static inline void foo(unsigned long x)
> > > > {
> > > >     asm("" :: "r"(x));
> > > > }
> > > >
> > > > void bar(unsigned long l)
> > > > {
> > > >     foo((unsigned char)l);
> > > > }
> > > >
> > > > so what you are saying appears to be accurate. Whether it is correct
> > > > is another matter, though, since the 'unsigned char' argument passed
> > > > into the asm() block may have higher bits set.
> > >
> > > Right, which is why I've kept the casting for sizes < 32 bits in the LL/SC
> > > code.
> > >
> >
> > OK.
> >
> > So if we are relying on the cast being performed implicitly by the cas
> > instruction, does it really make sense to change the prototype?
> > Shouldn't we keep it at unsigned long so that we explicitly pass the
> > whole value in (but use an explicit cast in the LL/SC implementation
> > as you did)?
>
> If we change the prototype of the __cmpxchg_case_* functions so that the
> old and new parameters are unsigned long, then we'll get widening casts
> (and explicit masking) for any caller of cmpxchg() that passes narrower
> types such as u16.
>
> Or are you suggesting something else?
>

No. It's all a bit counterintuitive, at least to me, but if this gets
rid of the casts, then let's go with it.
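
For reference, a minimal hypothetical caller (the variable and function
names below are made up) that ends up in the sub-word path this patch
touches; it is the kind of call site for which cmpxchg.h used to apply the
(u8) cast, which after this patch happens only in the LL/SC implementation:

#include <linux/atomic.h>
#include <linux/types.h>

static u8 claimed;

static bool try_claim(void)
{
	/* sizeof(claimed) == 1, so __cmpxchg() dispatches to the _8 case */
	return cmpxchg(&claimed, 0, 1) == 0;
}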

Patch

diff --git a/arch/arm64/include/asm/atomic_ll_sc.h b/arch/arm64/include/asm/atomic_ll_sc.h
index f02d3bf7b9e6..b53f70dd6e10 100644
--- a/arch/arm64/include/asm/atomic_ll_sc.h
+++ b/arch/arm64/include/asm/atomic_ll_sc.h
@@ -257,6 +257,14 @@  __LL_SC_PREFIX(__cmpxchg_case_##name##sz(volatile void *ptr,		\
 	unsigned long tmp;						\
 	u##sz oldval;							\
 									\
+	/*								\
+	 * Sub-word sizes require explicit casting so that the compare  \
+	 * part of the cmpxchg doesn't end up interpreting non-zero	\
+	 * upper bits of the register containing "old".			\
+	 */								\
+	if (sz < 32)							\
+		old = (u##sz)old;					\
+									\
 	asm volatile(							\
 	"	prfm	pstl1strm, %[v]\n"				\
 	"1:	ld" #acq "xr" #sfx "\t%" #w "[oldval], %[v]\n"		\
diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
index 4d6f917b654e..a424355240c5 100644
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -448,11 +448,11 @@  static inline long atomic64_dec_if_positive(atomic64_t *v)
 
 #define __CMPXCHG_CASE(w, sfx, name, sz, mb, cl...)			\
 static inline u##sz __cmpxchg_case_##name##sz(volatile void *ptr,	\
-					      unsigned long old,	\
+					      u##sz old,		\
 					      u##sz new)		\
 {									\
 	register unsigned long x0 asm ("x0") = (unsigned long)ptr;	\
-	register unsigned long x1 asm ("x1") = old;			\
+	register u##sz x1 asm ("x1") = old;				\
 	register u##sz x2 asm ("x2") = new;				\
 									\
 	asm volatile(ARM64_LSE_ATOMIC_INSN(				\
diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h
index 1f0340fc6dad..3f9376f1c409 100644
--- a/arch/arm64/include/asm/cmpxchg.h
+++ b/arch/arm64/include/asm/cmpxchg.h
@@ -123,9 +123,9 @@  static inline unsigned long __cmpxchg##sfx(volatile void *ptr,		\
 {									\
 	switch (size) {							\
 	case 1:								\
-		return __cmpxchg_case##sfx##_8(ptr, (u8)old, new);	\
+		return __cmpxchg_case##sfx##_8(ptr, old, new);		\
 	case 2:								\
-		return __cmpxchg_case##sfx##_16(ptr, (u16)old, new);	\
+		return __cmpxchg_case##sfx##_16(ptr, old, new);		\
 	case 4:								\
 		return __cmpxchg_case##sfx##_32(ptr, old, new);		\
 	case 8:								\