diff mbox series

[RFC,05/28] x86: Define the stack protector guard symbol explicitly

Message ID 20240925150059.3955569-35-ardb+git@google.com (mailing list archive)
State New
Headers show
Series x86: Rely on toolchain for relocatable code | expand

Commit Message

Ard Biesheuvel Sept. 25, 2024, 3:01 p.m. UTC
From: Ard Biesheuvel <ardb@kernel.org>

Specify the guard symbol for the stack cookie explicitly, rather than
positioning it exactly 40 bytes into the per-CPU area. Doing so removes
the need for the per-CPU region to be absolute rather than relative to
the placement of the per-CPU template region in the kernel image, and
this allows the special handling for absolute per-CPU symbols to be
removed entirely.

This is a worthwhile cleanup in itself, but it is also a prerequisite
for PIE codegen and PIE linking, which can replace our bespoke and
rather clunky runtime relocation handling.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
---
 arch/x86/Makefile                     |  4 ++++
 arch/x86/include/asm/init.h           |  2 +-
 arch/x86/include/asm/processor.h      | 11 +++--------
 arch/x86/include/asm/stackprotector.h |  4 ----
 tools/perf/util/annotate.c            |  4 ++--
 5 files changed, 10 insertions(+), 15 deletions(-)

Comments

Ian Rogers Sept. 25, 2024, 3:53 p.m. UTC | #1
On Wed, Sep 25, 2024 at 8:02 AM Ard Biesheuvel <ardb+git@google.com> wrote:
>
> From: Ard Biesheuvel <ardb@kernel.org>
>
> Specify the guard symbol for the stack cookie explicitly, rather than
> positioning it exactly 40 bytes into the per-CPU area. Doing so removes
> the need for the per-CPU region to be absolute rather than relative to
> the placement of the per-CPU template region in the kernel image, and
> this allows the special handling for absolute per-CPU symbols to be
> removed entirely.
>
> This is a worthwhile cleanup in itself, but it is also a prerequisite
> for PIE codegen and PIE linking, which can replace our bespoke and
> rather clunky runtime relocation handling.
>
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> ---
>  arch/x86/Makefile                     |  4 ++++
>  arch/x86/include/asm/init.h           |  2 +-
>  arch/x86/include/asm/processor.h      | 11 +++--------
>  arch/x86/include/asm/stackprotector.h |  4 ----
>  tools/perf/util/annotate.c            |  4 ++--
>  5 files changed, 10 insertions(+), 15 deletions(-)
>
> diff --git a/arch/x86/Makefile b/arch/x86/Makefile
> index 6b3fe6e2aadd..b78b7623a4a9 100644
> --- a/arch/x86/Makefile
> +++ b/arch/x86/Makefile
> @@ -193,6 +193,10 @@ else
>          KBUILD_RUSTFLAGS += -Cno-redzone=y
>          KBUILD_RUSTFLAGS += -Ccode-model=kernel
>
> +        ifeq ($(CONFIG_STACKPROTECTOR),y)
> +                KBUILD_CFLAGS += -mstack-protector-guard-symbol=fixed_percpu_data
> +        endif
> +
>          # Don't emit relaxable GOTPCREL relocations
>          KBUILD_AFLAGS_KERNEL += -Wa,-mrelax-relocations=no
>          KBUILD_CFLAGS_KERNEL += -Wa,-mrelax-relocations=no
> diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
> index 14d72727d7ee..3ed0e8ec973f 100644
> --- a/arch/x86/include/asm/init.h
> +++ b/arch/x86/include/asm/init.h
> @@ -2,7 +2,7 @@
>  #ifndef _ASM_X86_INIT_H
>  #define _ASM_X86_INIT_H
>
> -#define __head __section(".head.text")
> +#define __head __section(".head.text") __no_stack_protector
>
>  struct x86_mapping_info {
>         void *(*alloc_pgt_page)(void *); /* allocate buf for page table */
> diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
> index 4a686f0e5dbf..56bc36116814 100644
> --- a/arch/x86/include/asm/processor.h
> +++ b/arch/x86/include/asm/processor.h
> @@ -402,14 +402,9 @@ struct irq_stack {
>  #ifdef CONFIG_X86_64
>  struct fixed_percpu_data {
>         /*
> -        * GCC hardcodes the stack canary as %gs:40.  Since the
> -        * irq_stack is the object at %gs:0, we reserve the bottom
> -        * 48 bytes of the irq stack for the canary.
> -        *
> -        * Once we are willing to require -mstack-protector-guard-symbol=
> -        * support for x86_64 stackprotector, we can get rid of this.
> +        * Since the irq_stack is the object at %gs:0, the bottom 8 bytes of
> +        * the irq stack are reserved for the canary.
>          */
> -       char            gs_base[40];
>         unsigned long   stack_canary;
>  };
>
> @@ -418,7 +413,7 @@ DECLARE_INIT_PER_CPU(fixed_percpu_data);
>
>  static inline unsigned long cpu_kernelmode_gs_base(int cpu)
>  {
> -       return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu);
> +       return (unsigned long)&per_cpu(fixed_percpu_data, cpu);
>  }
>
>  extern asmlinkage void entry_SYSCALL32_ignore(void);
> diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
> index 00473a650f51..d1dcd22a0a4c 100644
> --- a/arch/x86/include/asm/stackprotector.h
> +++ b/arch/x86/include/asm/stackprotector.h
> @@ -51,10 +51,6 @@ static __always_inline void boot_init_stack_canary(void)
>  {
>         unsigned long canary = get_random_canary();
>
> -#ifdef CONFIG_X86_64
> -       BUILD_BUG_ON(offsetof(struct fixed_percpu_data, stack_canary) != 40);
> -#endif
> -
>         current->stack_canary = canary;
>  #ifdef CONFIG_X86_64
>         this_cpu_write(fixed_percpu_data.stack_canary, canary);
> diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
> index 37ce43c4eb8f..7ecfedf5edb9 100644
> --- a/tools/perf/util/annotate.c
> +++ b/tools/perf/util/annotate.c
> @@ -2485,10 +2485,10 @@ static bool is_stack_operation(struct arch *arch, struct disasm_line *dl)
>
>  static bool is_stack_canary(struct arch *arch, struct annotated_op_loc *loc)
>  {
> -       /* On x86_64, %gs:40 is used for stack canary */
> +       /* On x86_64, %gs:0 is used for stack canary */
>         if (arch__is(arch, "x86")) {
>                 if (loc->segment == INSN_SEG_X86_GS && loc->imm &&
> -                   loc->offset == 40)
> +                   loc->offset == 0)

As a new perf tool  can run on old kernels we may need to have this be
something like:
(loc->offset == 40 /* pre v6.xx kernels */ || loc->offset == 0 /*
v6.xx and later */ )

We could make this dependent on the kernel by processing the os_release string:
https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/env.h#n55
but that could well be more trouble than it is worth.

Thanks,
Ian

>                         return true;
>         }

>
> --
> 2.46.0.792.g87dc391469-goog
>
Ard Biesheuvel Sept. 25, 2024, 5:43 p.m. UTC | #2
On Wed, 25 Sept 2024 at 17:54, Ian Rogers <irogers@google.com> wrote:
>
> On Wed, Sep 25, 2024 at 8:02 AM Ard Biesheuvel <ardb+git@google.com> wrote:
> >
> > From: Ard Biesheuvel <ardb@kernel.org>
> >
> > Specify the guard symbol for the stack cookie explicitly, rather than
> > positioning it exactly 40 bytes into the per-CPU area. Doing so removes
> > the need for the per-CPU region to be absolute rather than relative to
> > the placement of the per-CPU template region in the kernel image, and
> > this allows the special handling for absolute per-CPU symbols to be
> > removed entirely.
> >
> > This is a worthwhile cleanup in itself, but it is also a prerequisite
> > for PIE codegen and PIE linking, which can replace our bespoke and
> > rather clunky runtime relocation handling.
> >
> > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> > ---
> >  arch/x86/Makefile                     |  4 ++++
> >  arch/x86/include/asm/init.h           |  2 +-
> >  arch/x86/include/asm/processor.h      | 11 +++--------
> >  arch/x86/include/asm/stackprotector.h |  4 ----
> >  tools/perf/util/annotate.c            |  4 ++--
> >  5 files changed, 10 insertions(+), 15 deletions(-)
> >
...
> > diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
> > index 37ce43c4eb8f..7ecfedf5edb9 100644
> > --- a/tools/perf/util/annotate.c
> > +++ b/tools/perf/util/annotate.c
> > @@ -2485,10 +2485,10 @@ static bool is_stack_operation(struct arch *arch, struct disasm_line *dl)
> >
> >  static bool is_stack_canary(struct arch *arch, struct annotated_op_loc *loc)
> >  {
> > -       /* On x86_64, %gs:40 is used for stack canary */
> > +       /* On x86_64, %gs:0 is used for stack canary */
> >         if (arch__is(arch, "x86")) {
> >                 if (loc->segment == INSN_SEG_X86_GS && loc->imm &&
> > -                   loc->offset == 40)
> > +                   loc->offset == 0)
>
> As a new perf tool  can run on old kernels we may need to have this be
> something like:
> (loc->offset == 40 /* pre v6.xx kernels */ || loc->offset == 0 /*
> v6.xx and later */ )
>
> We could make this dependent on the kernel by processing the os_release string:
> https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/env.h#n55
> but that could well be more trouble than it is worth.
>

Yeah. I also wonder what the purpose of this feature is. At the end of
this series, the stack cookie will no longer be at a fixed offset of
%GS anyway, and so perf will not be able to identify it in the same
manner. So it is probably better to just leave this in place, as the
%gs:0 case will not exist in the field (assuming that the series lands
all at once).

Any idea why this deviates from other architectures? Is x86_64 the
only arch that needs to identify stack canary accesses in perf? We
could rename the symbol to something identifiable, and do it across
all architectures, if this really serves a need (and assuming that
perf has insight into the symbol table).
Ian Rogers Sept. 25, 2024, 5:48 p.m. UTC | #3
On Wed, Sep 25, 2024 at 10:43 AM Ard Biesheuvel <ardb@kernel.org> wrote:
>
> On Wed, 25 Sept 2024 at 17:54, Ian Rogers <irogers@google.com> wrote:
> >
> > On Wed, Sep 25, 2024 at 8:02 AM Ard Biesheuvel <ardb+git@google.com> wrote:
> > >
> > > From: Ard Biesheuvel <ardb@kernel.org>
> > >
> > > Specify the guard symbol for the stack cookie explicitly, rather than
> > > positioning it exactly 40 bytes into the per-CPU area. Doing so removes
> > > the need for the per-CPU region to be absolute rather than relative to
> > > the placement of the per-CPU template region in the kernel image, and
> > > this allows the special handling for absolute per-CPU symbols to be
> > > removed entirely.
> > >
> > > This is a worthwhile cleanup in itself, but it is also a prerequisite
> > > for PIE codegen and PIE linking, which can replace our bespoke and
> > > rather clunky runtime relocation handling.
> > >
> > > Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> > > ---
> > >  arch/x86/Makefile                     |  4 ++++
> > >  arch/x86/include/asm/init.h           |  2 +-
> > >  arch/x86/include/asm/processor.h      | 11 +++--------
> > >  arch/x86/include/asm/stackprotector.h |  4 ----
> > >  tools/perf/util/annotate.c            |  4 ++--
> > >  5 files changed, 10 insertions(+), 15 deletions(-)
> > >
> ...
> > > diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
> > > index 37ce43c4eb8f..7ecfedf5edb9 100644
> > > --- a/tools/perf/util/annotate.c
> > > +++ b/tools/perf/util/annotate.c
> > > @@ -2485,10 +2485,10 @@ static bool is_stack_operation(struct arch *arch, struct disasm_line *dl)
> > >
> > >  static bool is_stack_canary(struct arch *arch, struct annotated_op_loc *loc)
> > >  {
> > > -       /* On x86_64, %gs:40 is used for stack canary */
> > > +       /* On x86_64, %gs:0 is used for stack canary */
> > >         if (arch__is(arch, "x86")) {
> > >                 if (loc->segment == INSN_SEG_X86_GS && loc->imm &&
> > > -                   loc->offset == 40)
> > > +                   loc->offset == 0)
> >
> > As a new perf tool  can run on old kernels we may need to have this be
> > something like:
> > (loc->offset == 40 /* pre v6.xx kernels */ || loc->offset == 0 /*
> > v6.xx and later */ )
> >
> > We could make this dependent on the kernel by processing the os_release string:
> > https://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools-next.git/tree/tools/perf/util/env.h#n55
> > but that could well be more trouble than it is worth.
> >
>
> Yeah. I also wonder what the purpose of this feature is. At the end of
> this series, the stack cookie will no longer be at a fixed offset of
> %GS anyway, and so perf will not be able to identify it in the same
> manner. So it is probably better to just leave this in place, as the
> %gs:0 case will not exist in the field (assuming that the series lands
> all at once).
>
> Any idea why this deviates from other architectures? Is x86_64 the
> only arch that needs to identify stack canary accesses in perf? We
> could rename the symbol to something identifiable, and do it across
> all architectures, if this really serves a need (and assuming that
> perf has insight into the symbol table).

This is relatively new work coming from Namhyung for data type
profiling and I believe is pretty much just x86 at the moment -
although the ever awesome IBM made contributions for PowerPC. The data
type profiling is trying to classify memory accesses which is why it
cares about the stack canary instruction, the particular encoding
shouldn't matter.

Thanks,
Ian
Uros Bizjak Sept. 25, 2024, 6:32 p.m. UTC | #4
On Wed, Sep 25, 2024 at 5:02 PM Ard Biesheuvel <ardb+git@google.com> wrote:
>
> From: Ard Biesheuvel <ardb@kernel.org>
>
> Specify the guard symbol for the stack cookie explicitly, rather than
> positioning it exactly 40 bytes into the per-CPU area. Doing so removes
> the need for the per-CPU region to be absolute rather than relative to
> the placement of the per-CPU template region in the kernel image, and
> this allows the special handling for absolute per-CPU symbols to be
> removed entirely.
>
> This is a worthwhile cleanup in itself, but it is also a prerequisite
> for PIE codegen and PIE linking, which can replace our bespoke and
> rather clunky runtime relocation handling.

I would like to point out a series that converted the stack protector
guard symbol to a normal percpu variable [1], so there was no need to
assume anything about the location of the guard symbol.

[1] "[PATCH v4 00/16] x86-64: Stack protector and percpu improvements"
https://lore.kernel.org/lkml/20240322165233.71698-1-brgerst@gmail.com/

Uros.

>
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> ---
>  arch/x86/Makefile                     |  4 ++++
>  arch/x86/include/asm/init.h           |  2 +-
>  arch/x86/include/asm/processor.h      | 11 +++--------
>  arch/x86/include/asm/stackprotector.h |  4 ----
>  tools/perf/util/annotate.c            |  4 ++--
>  5 files changed, 10 insertions(+), 15 deletions(-)
>
> diff --git a/arch/x86/Makefile b/arch/x86/Makefile
> index 6b3fe6e2aadd..b78b7623a4a9 100644
> --- a/arch/x86/Makefile
> +++ b/arch/x86/Makefile
> @@ -193,6 +193,10 @@ else
>          KBUILD_RUSTFLAGS += -Cno-redzone=y
>          KBUILD_RUSTFLAGS += -Ccode-model=kernel
>
> +        ifeq ($(CONFIG_STACKPROTECTOR),y)
> +                KBUILD_CFLAGS += -mstack-protector-guard-symbol=fixed_percpu_data
> +        endif
> +
>          # Don't emit relaxable GOTPCREL relocations
>          KBUILD_AFLAGS_KERNEL += -Wa,-mrelax-relocations=no
>          KBUILD_CFLAGS_KERNEL += -Wa,-mrelax-relocations=no
> diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
> index 14d72727d7ee..3ed0e8ec973f 100644
> --- a/arch/x86/include/asm/init.h
> +++ b/arch/x86/include/asm/init.h
> @@ -2,7 +2,7 @@
>  #ifndef _ASM_X86_INIT_H
>  #define _ASM_X86_INIT_H
>
> -#define __head __section(".head.text")
> +#define __head __section(".head.text") __no_stack_protector
>
>  struct x86_mapping_info {
>         void *(*alloc_pgt_page)(void *); /* allocate buf for page table */
> diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
> index 4a686f0e5dbf..56bc36116814 100644
> --- a/arch/x86/include/asm/processor.h
> +++ b/arch/x86/include/asm/processor.h
> @@ -402,14 +402,9 @@ struct irq_stack {
>  #ifdef CONFIG_X86_64
>  struct fixed_percpu_data {
>         /*
> -        * GCC hardcodes the stack canary as %gs:40.  Since the
> -        * irq_stack is the object at %gs:0, we reserve the bottom
> -        * 48 bytes of the irq stack for the canary.
> -        *
> -        * Once we are willing to require -mstack-protector-guard-symbol=
> -        * support for x86_64 stackprotector, we can get rid of this.
> +        * Since the irq_stack is the object at %gs:0, the bottom 8 bytes of
> +        * the irq stack are reserved for the canary.
>          */
> -       char            gs_base[40];
>         unsigned long   stack_canary;
>  };
>
> @@ -418,7 +413,7 @@ DECLARE_INIT_PER_CPU(fixed_percpu_data);
>
>  static inline unsigned long cpu_kernelmode_gs_base(int cpu)
>  {
> -       return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu);
> +       return (unsigned long)&per_cpu(fixed_percpu_data, cpu);
>  }
>
>  extern asmlinkage void entry_SYSCALL32_ignore(void);
> diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
> index 00473a650f51..d1dcd22a0a4c 100644
> --- a/arch/x86/include/asm/stackprotector.h
> +++ b/arch/x86/include/asm/stackprotector.h
> @@ -51,10 +51,6 @@ static __always_inline void boot_init_stack_canary(void)
>  {
>         unsigned long canary = get_random_canary();
>
> -#ifdef CONFIG_X86_64
> -       BUILD_BUG_ON(offsetof(struct fixed_percpu_data, stack_canary) != 40);
> -#endif
> -
>         current->stack_canary = canary;
>  #ifdef CONFIG_X86_64
>         this_cpu_write(fixed_percpu_data.stack_canary, canary);
> diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
> index 37ce43c4eb8f..7ecfedf5edb9 100644
> --- a/tools/perf/util/annotate.c
> +++ b/tools/perf/util/annotate.c
> @@ -2485,10 +2485,10 @@ static bool is_stack_operation(struct arch *arch, struct disasm_line *dl)
>
>  static bool is_stack_canary(struct arch *arch, struct annotated_op_loc *loc)
>  {
> -       /* On x86_64, %gs:40 is used for stack canary */
> +       /* On x86_64, %gs:0 is used for stack canary */
>         if (arch__is(arch, "x86")) {
>                 if (loc->segment == INSN_SEG_X86_GS && loc->imm &&
> -                   loc->offset == 40)
> +                   loc->offset == 0)
>                         return true;
>         }
>
> --
> 2.46.0.792.g87dc391469-goog
>
Brian Gerst Sept. 28, 2024, 1:41 p.m. UTC | #5
On Wed, Sep 25, 2024 at 2:33 PM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Wed, Sep 25, 2024 at 5:02 PM Ard Biesheuvel <ardb+git@google.com> wrote:
> >
> > From: Ard Biesheuvel <ardb@kernel.org>
> >
> > Specify the guard symbol for the stack cookie explicitly, rather than
> > positioning it exactly 40 bytes into the per-CPU area. Doing so removes
> > the need for the per-CPU region to be absolute rather than relative to
> > the placement of the per-CPU template region in the kernel image, and
> > this allows the special handling for absolute per-CPU symbols to be
> > removed entirely.
> >
> > This is a worthwhile cleanup in itself, but it is also a prerequisite
> > for PIE codegen and PIE linking, which can replace our bespoke and
> > rather clunky runtime relocation handling.
>
> I would like to point out a series that converted the stack protector
> guard symbol to a normal percpu variable [1], so there was no need to
> assume anything about the location of the guard symbol.
>
> [1] "[PATCH v4 00/16] x86-64: Stack protector and percpu improvements"
> https://lore.kernel.org/lkml/20240322165233.71698-1-brgerst@gmail.com/
>
> Uros.

I plan on resubmitting that series sometime after the 6.12 merge
window closes.  As I recall from the last version, it was decided to
wait until after the next LTS release to raise the minimum GCC version
to 8.1 and avoid the need to be compatible with the old stack
protector layout.

Brian Gerst
Uros Bizjak Oct. 4, 2024, 10:01 a.m. UTC | #6
On Wed, Sep 25, 2024 at 5:02 PM Ard Biesheuvel <ardb+git@google.com> wrote:
>
> From: Ard Biesheuvel <ardb@kernel.org>
>
> Specify the guard symbol for the stack cookie explicitly, rather than
> positioning it exactly 40 bytes into the per-CPU area. Doing so removes
> the need for the per-CPU region to be absolute rather than relative to
> the placement of the per-CPU template region in the kernel image, and
> this allows the special handling for absolute per-CPU symbols to be
> removed entirely.
>
> This is a worthwhile cleanup in itself, but it is also a prerequisite
> for PIE codegen and PIE linking, which can replace our bespoke and
> rather clunky runtime relocation handling.
>
> Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
> ---
>  arch/x86/Makefile                     |  4 ++++
>  arch/x86/include/asm/init.h           |  2 +-
>  arch/x86/include/asm/processor.h      | 11 +++--------
>  arch/x86/include/asm/stackprotector.h |  4 ----
>  tools/perf/util/annotate.c            |  4 ++--
>  5 files changed, 10 insertions(+), 15 deletions(-)
>
> diff --git a/arch/x86/Makefile b/arch/x86/Makefile
> index 6b3fe6e2aadd..b78b7623a4a9 100644
> --- a/arch/x86/Makefile
> +++ b/arch/x86/Makefile
> @@ -193,6 +193,10 @@ else
>          KBUILD_RUSTFLAGS += -Cno-redzone=y
>          KBUILD_RUSTFLAGS += -Ccode-model=kernel
>
> +        ifeq ($(CONFIG_STACKPROTECTOR),y)
> +                KBUILD_CFLAGS += -mstack-protector-guard-symbol=fixed_percpu_data

Looking at:

> +        * Since the irq_stack is the object at %gs:0, the bottom 8 bytes of
> +        * the irq stack are reserved for the canary.

Please note that %gs:0 can also be achieved with
-mstack-protector-guard-offset=0

Uros.
Ard Biesheuvel Oct. 4, 2024, 1:15 p.m. UTC | #7
On Sat, 28 Sept 2024 at 15:41, Brian Gerst <brgerst@gmail.com> wrote:
>
> On Wed, Sep 25, 2024 at 2:33 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> >
> > On Wed, Sep 25, 2024 at 5:02 PM Ard Biesheuvel <ardb+git@google.com> wrote:
> > >
> > > From: Ard Biesheuvel <ardb@kernel.org>
> > >
> > > Specify the guard symbol for the stack cookie explicitly, rather than
> > > positioning it exactly 40 bytes into the per-CPU area. Doing so removes
> > > the need for the per-CPU region to be absolute rather than relative to
> > > the placement of the per-CPU template region in the kernel image, and
> > > this allows the special handling for absolute per-CPU symbols to be
> > > removed entirely.
> > >
> > > This is a worthwhile cleanup in itself, but it is also a prerequisite
> > > for PIE codegen and PIE linking, which can replace our bespoke and
> > > rather clunky runtime relocation handling.
> >
> > I would like to point out a series that converted the stack protector
> > guard symbol to a normal percpu variable [1], so there was no need to
> > assume anything about the location of the guard symbol.
> >
> > [1] "[PATCH v4 00/16] x86-64: Stack protector and percpu improvements"
> > https://lore.kernel.org/lkml/20240322165233.71698-1-brgerst@gmail.com/
> >
> > Uros.
>
> I plan on resubmitting that series sometime after the 6.12 merge
> window closes.  As I recall from the last version, it was decided to
> wait until after the next LTS release to raise the minimum GCC version
> to 8.1 and avoid the need to be compatible with the old stack
> protector layout.
>

Hi Brian,

I'd be more than happy to compare notes on that - I wasn't aware of
your intentions here, or I would have reached out before sending this
RFC.

There are two things that you would need to address for Clang support
to work correctly:
- the workaround I cc'ed you on the other day [0],
- a workaround for the module loader so it tolerates the GOTPCRELX
relocations that Clang emits [1]



[0] https://lore.kernel.org/all/20241002092534.3163838-2-ardb+git@google.com/
[1] https://git.kernel.org/pub/scm/linux/kernel/git/ardb/linux.git/commit/?id=a18121aabbdd
Brian Gerst Oct. 8, 2024, 2:36 p.m. UTC | #8
On Fri, Oct 4, 2024 at 9:15 AM Ard Biesheuvel <ardb@kernel.org> wrote:
>
> On Sat, 28 Sept 2024 at 15:41, Brian Gerst <brgerst@gmail.com> wrote:
> >
> > On Wed, Sep 25, 2024 at 2:33 PM Uros Bizjak <ubizjak@gmail.com> wrote:
> > >
> > > On Wed, Sep 25, 2024 at 5:02 PM Ard Biesheuvel <ardb+git@google.com> wrote:
> > > >
> > > > From: Ard Biesheuvel <ardb@kernel.org>
> > > >
> > > > Specify the guard symbol for the stack cookie explicitly, rather than
> > > > positioning it exactly 40 bytes into the per-CPU area. Doing so removes
> > > > the need for the per-CPU region to be absolute rather than relative to
> > > > the placement of the per-CPU template region in the kernel image, and
> > > > this allows the special handling for absolute per-CPU symbols to be
> > > > removed entirely.
> > > >
> > > > This is a worthwhile cleanup in itself, but it is also a prerequisite
> > > > for PIE codegen and PIE linking, which can replace our bespoke and
> > > > rather clunky runtime relocation handling.
> > >
> > > I would like to point out a series that converted the stack protector
> > > guard symbol to a normal percpu variable [1], so there was no need to
> > > assume anything about the location of the guard symbol.
> > >
> > > [1] "[PATCH v4 00/16] x86-64: Stack protector and percpu improvements"
> > > https://lore.kernel.org/lkml/20240322165233.71698-1-brgerst@gmail.com/
> > >
> > > Uros.
> >
> > I plan on resubmitting that series sometime after the 6.12 merge
> > window closes.  As I recall from the last version, it was decided to
> > wait until after the next LTS release to raise the minimum GCC version
> > to 8.1 and avoid the need to be compatible with the old stack
> > protector layout.
> >
>
> Hi Brian,
>
> I'd be more than happy to compare notes on that - I wasn't aware of
> your intentions here, or I would have reached out before sending this
> RFC.
>
> There are two things that you would need to address for Clang support
> to work correctly:
> - the workaround I cc'ed you on the other day [0],
> - a workaround for the module loader so it tolerates the GOTPCRELX
> relocations that Clang emits [1]
>
>
>
> [0] https://lore.kernel.org/all/20241002092534.3163838-2-ardb+git@google.com/
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/ardb/linux.git/commit/?id=a18121aabbdd

The first patch should be applied independently as a bug fix, since it
already affects the 32-bit build with clang.

I don't have an environment with an older clang compiler to test the
second patch, but I'll assume it will be necessary.  I did run into an
issue with the GOTPCRELX relocations before [1], but I thought it was
just an objtool issue and didn't do more testing to know if modules
were broken or not.

Brian Gerst

[1] https://lore.kernel.org/all/20231026160100.195099-6-brgerst@gmail.com/
diff mbox series

Patch

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 6b3fe6e2aadd..b78b7623a4a9 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -193,6 +193,10 @@  else
         KBUILD_RUSTFLAGS += -Cno-redzone=y
         KBUILD_RUSTFLAGS += -Ccode-model=kernel
 
+        ifeq ($(CONFIG_STACKPROTECTOR),y)
+                KBUILD_CFLAGS += -mstack-protector-guard-symbol=fixed_percpu_data
+        endif
+
         # Don't emit relaxable GOTPCREL relocations
         KBUILD_AFLAGS_KERNEL += -Wa,-mrelax-relocations=no
         KBUILD_CFLAGS_KERNEL += -Wa,-mrelax-relocations=no
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 14d72727d7ee..3ed0e8ec973f 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -2,7 +2,7 @@ 
 #ifndef _ASM_X86_INIT_H
 #define _ASM_X86_INIT_H
 
-#define __head	__section(".head.text")
+#define __head	__section(".head.text") __no_stack_protector
 
 struct x86_mapping_info {
 	void *(*alloc_pgt_page)(void *); /* allocate buf for page table */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 4a686f0e5dbf..56bc36116814 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -402,14 +402,9 @@  struct irq_stack {
 #ifdef CONFIG_X86_64
 struct fixed_percpu_data {
 	/*
-	 * GCC hardcodes the stack canary as %gs:40.  Since the
-	 * irq_stack is the object at %gs:0, we reserve the bottom
-	 * 48 bytes of the irq stack for the canary.
-	 *
-	 * Once we are willing to require -mstack-protector-guard-symbol=
-	 * support for x86_64 stackprotector, we can get rid of this.
+	 * Since the irq_stack is the object at %gs:0, the bottom 8 bytes of
+	 * the irq stack are reserved for the canary.
 	 */
-	char		gs_base[40];
 	unsigned long	stack_canary;
 };
 
@@ -418,7 +413,7 @@  DECLARE_INIT_PER_CPU(fixed_percpu_data);
 
 static inline unsigned long cpu_kernelmode_gs_base(int cpu)
 {
-	return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu);
+	return (unsigned long)&per_cpu(fixed_percpu_data, cpu);
 }
 
 extern asmlinkage void entry_SYSCALL32_ignore(void);
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index 00473a650f51..d1dcd22a0a4c 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -51,10 +51,6 @@  static __always_inline void boot_init_stack_canary(void)
 {
 	unsigned long canary = get_random_canary();
 
-#ifdef CONFIG_X86_64
-	BUILD_BUG_ON(offsetof(struct fixed_percpu_data, stack_canary) != 40);
-#endif
-
 	current->stack_canary = canary;
 #ifdef CONFIG_X86_64
 	this_cpu_write(fixed_percpu_data.stack_canary, canary);
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 37ce43c4eb8f..7ecfedf5edb9 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -2485,10 +2485,10 @@  static bool is_stack_operation(struct arch *arch, struct disasm_line *dl)
 
 static bool is_stack_canary(struct arch *arch, struct annotated_op_loc *loc)
 {
-	/* On x86_64, %gs:40 is used for stack canary */
+	/* On x86_64, %gs:0 is used for stack canary */
 	if (arch__is(arch, "x86")) {
 		if (loc->segment == INSN_SEG_X86_GS && loc->imm &&
-		    loc->offset == 40)
+		    loc->offset == 0)
 			return true;
 	}