Message ID | 20200929133814.2834621-3-elver@google.com (mailing list archive) |
---|---|
State | New, archived |
Series | KFENCE: A low-overhead sampling-based memory safety error detector |
On Tue, Sep 29, 2020 at 3:38 PM Marco Elver <elver@google.com> wrote:
> Add architecture specific implementation details for KFENCE and enable
> KFENCE for the x86 architecture. In particular, this implements the
> required interface in <asm/kfence.h> for setting up the pool and
> providing helper functions for protecting and unprotecting pages.
>
> For x86, we need to ensure that the pool uses 4K pages, which is done
> using the set_memory_4k() helper function.
[...]
> diff --git a/arch/x86/include/asm/kfence.h b/arch/x86/include/asm/kfence.h
[...]
> +/* Protect the given page and flush TLBs. */
> +static inline bool kfence_protect_page(unsigned long addr, bool protect)
> +{
> +	unsigned int level;
> +	pte_t *pte = lookup_address(addr, &level);
> +
> +	if (!pte || level != PG_LEVEL_4K)

Do we actually expect this to happen, or is this just a "robustness"
check? If we don't expect this to happen, there should be a WARN_ON()
around the condition.

> +		return false;
> +
> +	if (protect)
> +		set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
> +	else
> +		set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));

Hmm... do we have this helper (instead of using the existing helpers
for modifying memory permissions) to work around the allocation out of
the data section?

> +	flush_tlb_one_kernel(addr);
> +	return true;
> +}
> +
> +#endif /* _ASM_X86_KFENCE_H */
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
[...]
> @@ -701,6 +702,9 @@ no_context(struct pt_regs *regs, unsigned long error_code,
> 	}
> #endif
>
> +	if (kfence_handle_page_fault(address))
> +		return;
> +
> 	/*
> 	 * 32-bit:
> 	 *

The standard 5 lines of diff context don't really make it obvious
what's going on here. Here's a diff with more context:


	/*
	 * Stack overflow? During boot, we can fault near the initial
	 * stack in the direct map, but that's not an overflow -- check
	 * that we're in vmalloc space to avoid this.
	 */
	if (is_vmalloc_addr((void *)address) &&
	    (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
	     address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
		unsigned long stack = __this_cpu_ist_top_va(DF) -
					sizeof(void *);
		/*
		 * We're likely to be running with very little stack space
		 * left. It's plausible that we'd hit this condition but
		 * double-fault even before we get this far, in which case
		 * we're fine: the double-fault handler will deal with it.
		 *
		 * We don't want to make it all the way into the oops code
		 * and then double-fault, though, because we're likely to
		 * break the console driver and lose most of the stack dump.
		 */
		asm volatile ("movq %[stack], %%rsp\n\t"
			      "call handle_stack_overflow\n\t"
			      "1: jmp 1b"
			      : ASM_CALL_CONSTRAINT
			      : "D" ("kernel stack overflow (page fault)"),
				"S" (regs), "d" (address),
				[stack] "rm" (stack));
		unreachable();
	}
#endif

+	if (kfence_handle_page_fault(address))
+		return;
+
	/*
	 * 32-bit:
	 *
	 * Valid to do another page fault here, because if this fault
	 * had been triggered by is_prefetch fixup_exception would have
	 * handled it.
	 *
	 * 64-bit:
	 *
	 * Hall of shame of CPU/BIOS bugs.
	 */
	if (is_prefetch(regs, error_code, address))
		return;

	if (is_errata93(regs, address))
		return;

	/*
	 * Buggy firmware could access regions which might page fault, try to
	 * recover from such faults.
	 */
	if (IS_ENABLED(CONFIG_EFI))
		efi_recover_from_page_fault(address);

oops:
	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice:
	 */
	flags = oops_begin();


Shouldn't kfence_handle_page_fault() happen after prefetch handling,
at least? Maybe directly above the "oops" label?
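To make the WARN_ON() suggestion concrete: wrapping the existing condition would look roughly like the sketch below. This is only an illustration based on the quoted patch, not code posted in this thread.

/* Sketch: flag the unexpected case instead of silently failing. */
static inline bool kfence_protect_page(unsigned long addr, bool protect)
{
	unsigned int level;
	pte_t *pte = lookup_address(addr, &level);

	/* The pool is split into 4K pages at init, so anything else is a bug. */
	if (WARN_ON(!pte || level != PG_LEVEL_4K))
		return false;

	if (protect)
		set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
	else
		set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));

	flush_tlb_one_kernel(addr);
	return true;
}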
On Tue, Sep 29, 2020 at 3:38 PM Marco Elver <elver@google.com> wrote:
> Add architecture specific implementation details for KFENCE and enable
> KFENCE for the x86 architecture. In particular, this implements the
> required interface in <asm/kfence.h> for setting up the pool and
> providing helper functions for protecting and unprotecting pages.
[...]
> diff --git a/arch/x86/include/asm/kfence.h b/arch/x86/include/asm/kfence.h
[...]
> +/* Protect the given page and flush TLBs. */
> +static inline bool kfence_protect_page(unsigned long addr, bool protect)
> +{
[...]
> +	flush_tlb_one_kernel(addr);

flush_tlb_one_kernel() -> flush_tlb_one_user() -> __flush_tlb_one_user()
-> native_flush_tlb_one_user() only flushes on the local CPU core, not
on others. If you want to leave it this way, I think this needs a
comment explaining why we're not doing a global flush (locking context /
performance overhead / ... ?).
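For comparison, the local flush used by the patch and the global alternative look like this; flush_tlb_kernel_range() reaches the other CPUs via IPIs, which is exactly what becomes problematic later in the thread. A sketch for illustration only, not a proposed change.

	/* As in the patch: invalidate the stale entry on this CPU only. */
	flush_tlb_one_kernel(addr);

	/* A global flush of the same page would be: */
	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);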
On Fri, 2 Oct 2020 at 07:45, Jann Horn <jannh@google.com> wrote:
>
> On Tue, Sep 29, 2020 at 3:38 PM Marco Elver <elver@google.com> wrote:
> > Add architecture specific implementation details for KFENCE and enable
> > KFENCE for the x86 architecture. In particular, this implements the
> > required interface in <asm/kfence.h> for setting up the pool and
> > providing helper functions for protecting and unprotecting pages.
> >
> > For x86, we need to ensure that the pool uses 4K pages, which is done
> > using the set_memory_4k() helper function.
> [...]
> > diff --git a/arch/x86/include/asm/kfence.h b/arch/x86/include/asm/kfence.h
> [...]
> > +/* Protect the given page and flush TLBs. */
> > +static inline bool kfence_protect_page(unsigned long addr, bool protect)
> > +{
> > +	unsigned int level;
> > +	pte_t *pte = lookup_address(addr, &level);
> > +
> > +	if (!pte || level != PG_LEVEL_4K)
>
> Do we actually expect this to happen, or is this just a "robustness"
> check? If we don't expect this to happen, there should be a WARN_ON()
> around the condition.

It's not obvious here, but we already have this covered with a WARN:
the core.c code has a KFENCE_WARN_ON, which disables KFENCE on a
warning.

> > +		return false;
> > +
> > +	if (protect)
> > +		set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
> > +	else
> > +		set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
>
> Hmm... do we have this helper (instead of using the existing helpers
> for modifying memory permissions) to work around the allocation out of
> the data section?

I just played around with using the set_memory.c functions, to remind
myself why this didn't work. I experimented with using
set_memory_{np,p}() functions; set_memory_p() isn't implemented, but
is easily added (which I did for below experiment). However, this
didn't quite work:

WARNING: CPU: 6 PID: 107 at kernel/smp.c:490 smp_call_function_many_cond+0x9c/0x2a0 kernel/smp.c:490
[...]
Call Trace:
 smp_call_function_many kernel/smp.c:577 [inline]
 smp_call_function kernel/smp.c:599 [inline]
 on_each_cpu+0x3e/0x90 kernel/smp.c:698
 __purge_vmap_area_lazy+0x58/0x670 mm/vmalloc.c:1352
 _vm_unmap_aliases.part.0+0x10b/0x140 mm/vmalloc.c:1770
 change_page_attr_set_clr+0xb4/0x1c0 arch/x86/mm/pat/set_memory.c:1732
 change_page_attr_set arch/x86/mm/pat/set_memory.c:1782 [inline]
 set_memory_p+0x21/0x30 arch/x86/mm/pat/set_memory.c:1950
 kfence_protect_page arch/x86/include/asm/kfence.h:55 [inline]
 kfence_protect_page arch/x86/include/asm/kfence.h:43 [inline]
 kfence_unprotect+0x42/0x70 mm/kfence/core.c:139
 no_context+0x115/0x300 arch/x86/mm/fault.c:705
 handle_page_fault arch/x86/mm/fault.c:1431 [inline]
 exc_page_fault+0xa7/0x170 arch/x86/mm/fault.c:1486
 asm_exc_page_fault+0x1e/0x30 arch/x86/include/asm/idtentry.h:538

For one, smp_call_function_many_cond() doesn't want to be called with
interrupts disabled, and we may very well get a KFENCE allocation or
page fault with interrupts disabled / within interrupts.

Therefore, to be safe, we should avoid IPIs. It follows that setting
the page attribute is best-effort, and we can tolerate some
inaccuracy. Lazy fault handling should take care of faults after we
set the page as PRESENT.

Which hopefully also answers your other comment:

> flush_tlb_one_kernel() -> flush_tlb_one_user() ->
> __flush_tlb_one_user() -> native_flush_tlb_one_user() only flushes on
> the local CPU core, not on others. If you want to leave it this way, I
> think this needs a comment explaining why we're not doing a global
> flush (locking context / performance overhead / ... ?).

We'll add a comment to clarify why it's done this way.

> > +	flush_tlb_one_kernel(addr);
> > +	return true;
> > +}
> > +
> > +#endif /* _ASM_X86_KFENCE_H */
> > diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> [...]
> > @@ -701,6 +702,9 @@ no_context(struct pt_regs *regs, unsigned long error_code,
> > 	}
> > #endif
> >
> > +	if (kfence_handle_page_fault(address))
> > +		return;
> > +
> > 	/*
> > 	 * 32-bit:
> > 	 *
>
> The standard 5 lines of diff context don't really make it obvious
> what's going on here. Here's a diff with more context:
>
>
>	/*
>	 * Stack overflow? During boot, we can fault near the initial
>	 * stack in the direct map, but that's not an overflow -- check
>	 * that we're in vmalloc space to avoid this.
>	 */
>	if (is_vmalloc_addr((void *)address) &&
>	    (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
>	     address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
>		unsigned long stack = __this_cpu_ist_top_va(DF) -
>					sizeof(void *);
>		/*
>		 * We're likely to be running with very little stack space
>		 * left. It's plausible that we'd hit this condition but
>		 * double-fault even before we get this far, in which case
>		 * we're fine: the double-fault handler will deal with it.
>		 *
>		 * We don't want to make it all the way into the oops code
>		 * and then double-fault, though, because we're likely to
>		 * break the console driver and lose most of the stack dump.
>		 */
>		asm volatile ("movq %[stack], %%rsp\n\t"
>			      "call handle_stack_overflow\n\t"
>			      "1: jmp 1b"
>			      : ASM_CALL_CONSTRAINT
>			      : "D" ("kernel stack overflow (page fault)"),
>				"S" (regs), "d" (address),
>				[stack] "rm" (stack));
>		unreachable();
>	}
> #endif
>
> +	if (kfence_handle_page_fault(address))
> +		return;
> +
>	/*
>	 * 32-bit:
>	 *
>	 * Valid to do another page fault here, because if this fault
>	 * had been triggered by is_prefetch fixup_exception would have
>	 * handled it.
>	 *
>	 * 64-bit:
>	 *
>	 * Hall of shame of CPU/BIOS bugs.
>	 */
>	if (is_prefetch(regs, error_code, address))
>		return;
>
>	if (is_errata93(regs, address))
>		return;
>
>	/*
>	 * Buggy firmware could access regions which might page fault, try to
>	 * recover from such faults.
>	 */
>	if (IS_ENABLED(CONFIG_EFI))
>		efi_recover_from_page_fault(address);
>
> oops:
>	/*
>	 * Oops. The kernel tried to access some bad page. We'll have to
>	 * terminate things with extreme prejudice:
>	 */
>	flags = oops_begin();
>
>
>
> Shouldn't kfence_handle_page_fault() happen after prefetch handling,
> at least? Maybe directly above the "oops" label?

Good question. AFAIK it doesn't matter, as is_kfence_address() should
never apply for any of those that follow, right? In any case, it
shouldn't hurt to move it down.

Thanks,
-- Marco
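For reference, the set_memory_p() used in the experiment above is not quoted in the thread; judging from the backtrace (change_page_attr_set inlined into set_memory_p in arch/x86/mm/pat/set_memory.c), it would presumably just mirror set_memory_np(), roughly like this sketch:

int set_memory_p(unsigned long addr, int numpages)
{
	/* Counterpart to set_memory_np(): set _PAGE_PRESENT again. */
	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
}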
On Wed, Oct 7, 2020 at 3:09 PM Marco Elver <elver@google.com> wrote:
> On Fri, 2 Oct 2020 at 07:45, Jann Horn <jannh@google.com> wrote:
> > On Tue, Sep 29, 2020 at 3:38 PM Marco Elver <elver@google.com> wrote:
> > > Add architecture specific implementation details for KFENCE and enable
> > > KFENCE for the x86 architecture. In particular, this implements the
> > > required interface in <asm/kfence.h> for setting up the pool and
> > > providing helper functions for protecting and unprotecting pages.
> > >
> > > For x86, we need to ensure that the pool uses 4K pages, which is done
> > > using the set_memory_4k() helper function.
> > [...]
> > > diff --git a/arch/x86/include/asm/kfence.h b/arch/x86/include/asm/kfence.h
> > [...]
> > > +/* Protect the given page and flush TLBs. */
> > > +static inline bool kfence_protect_page(unsigned long addr, bool protect)
> > > +{
> > > +	unsigned int level;
> > > +	pte_t *pte = lookup_address(addr, &level);
> > > +
> > > +	if (!pte || level != PG_LEVEL_4K)
> >
> > Do we actually expect this to happen, or is this just a "robustness"
> > check? If we don't expect this to happen, there should be a WARN_ON()
> > around the condition.
>
> It's not obvious here, but we already have this covered with a WARN:
> the core.c code has a KFENCE_WARN_ON, which disables KFENCE on a
> warning.

So for this specific branch: Can it ever happen? If not, please either
remove it or add WARN_ON(). That serves two functions: It ensures that
if something unexpected happens, we see a warning, and it hints to
people reading the code "this isn't actually expected to happen, you
don't have to wrack your brain trying to figure out for which scenario
this branch is intended".

> > > +		return false;
> > > +
> > > +	if (protect)
> > > +		set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
> > > +	else
> > > +		set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
> >
> > Hmm... do we have this helper (instead of using the existing helpers
> > for modifying memory permissions) to work around the allocation out of
> > the data section?
>
> I just played around with using the set_memory.c functions, to remind
> myself why this didn't work. I experimented with using
> set_memory_{np,p}() functions; set_memory_p() isn't implemented, but
> is easily added (which I did for below experiment). However, this
> didn't quite work:
[...]
> For one, smp_call_function_many_cond() doesn't want to be called with
> interrupts disabled, and we may very well get a KFENCE allocation or
> page fault with interrupts disabled / within interrupts.
>
> Therefore, to be safe, we should avoid IPIs.

set_direct_map_invalid_noflush() does that, too, I think? And that's
already implemented for both arm64 and x86.

> It follows that setting
> the page attribute is best-effort, and we can tolerate some
> inaccuracy. Lazy fault handling should take care of faults after we
> set the page as PRESENT.
[...]
> > Shouldn't kfence_handle_page_fault() happen after prefetch handling,
> > at least? Maybe directly above the "oops" label?
>
> Good question. AFAIK it doesn't matter, as is_kfence_address() should
> never apply for any of those that follow, right? In any case, it
> shouldn't hurt to move it down.

is_prefetch() ignores any #PF not caused by instruction fetch if it
comes from kernel mode and the faulting instruction is one of the
PREFETCH* instructions. (Which is not supposed to happen - the
processor should just be ignoring the fault for PREFETCH instead of
generating an exception AFAIK. But the comments say that this is about
CPU bugs and stuff.) While this is probably not a big deal anymore
partly because the kernel doesn't use software prefetching in many
places anymore, it seems to me like, in principle, this could also
cause page faults that should be ignored in KFENCE regions if someone
tries to do PREFETCH on an out-of-bounds array element or a dangling
pointer or something.
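Concretely, moving the hook directly above the "oops" label, as suggested, would make the tail of no_context() look roughly like the sketch below (based on the context pasted earlier in the thread, not a posted patch):

	/*
	 * Buggy firmware could access regions which might page fault, try to
	 * recover from such faults.
	 */
	if (IS_ENABLED(CONFIG_EFI))
		efi_recover_from_page_fault(address);

	/* Only hand the fault to KFENCE once the benign causes above are ruled out. */
	if (kfence_handle_page_fault(address))
		return;

oops:
	flags = oops_begin();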
On Wed, 7 Oct 2020 at 16:15, Jann Horn <jannh@google.com> wrote:
>
> On Wed, Oct 7, 2020 at 3:09 PM Marco Elver <elver@google.com> wrote:
> > On Fri, 2 Oct 2020 at 07:45, Jann Horn <jannh@google.com> wrote:
> > > On Tue, Sep 29, 2020 at 3:38 PM Marco Elver <elver@google.com> wrote:
> > > > Add architecture specific implementation details for KFENCE and enable
> > > > KFENCE for the x86 architecture. In particular, this implements the
> > > > required interface in <asm/kfence.h> for setting up the pool and
> > > > providing helper functions for protecting and unprotecting pages.
> > > >
> > > > For x86, we need to ensure that the pool uses 4K pages, which is done
> > > > using the set_memory_4k() helper function.
> > > [...]
> > > > diff --git a/arch/x86/include/asm/kfence.h b/arch/x86/include/asm/kfence.h
> > > [...]
> > > > +/* Protect the given page and flush TLBs. */
> > > > +static inline bool kfence_protect_page(unsigned long addr, bool protect)
> > > > +{
> > > > +	unsigned int level;
> > > > +	pte_t *pte = lookup_address(addr, &level);
> > > > +
> > > > +	if (!pte || level != PG_LEVEL_4K)
> > >
> > > Do we actually expect this to happen, or is this just a "robustness"
> > > check? If we don't expect this to happen, there should be a WARN_ON()
> > > around the condition.
> >
> > It's not obvious here, but we already have this covered with a WARN:
> > the core.c code has a KFENCE_WARN_ON, which disables KFENCE on a
> > warning.
>
> So for this specific branch: Can it ever happen? If not, please either
> remove it or add WARN_ON(). That serves two functions: It ensures that
> if something unexpected happens, we see a warning, and it hints to
> people reading the code "this isn't actually expected to happen, you
> don't have to wrack your brain trying to figure out for which scenario
> this branch is intended".

Perhaps I could have been clearer: we already have this returning false
covered by a WARN+disable KFENCE in core.c.

We'll add another WARN_ON right here, as it doesn't hurt, and hopefully
improves readability.

> > > > +		return false;
> > > > +
> > > > +	if (protect)
> > > > +		set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
> > > > +	else
> > > > +		set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
> > >
> > > Hmm... do we have this helper (instead of using the existing helpers
> > > for modifying memory permissions) to work around the allocation out of
> > > the data section?
> >
> > I just played around with using the set_memory.c functions, to remind
> > myself why this didn't work. I experimented with using
> > set_memory_{np,p}() functions; set_memory_p() isn't implemented, but
> > is easily added (which I did for below experiment). However, this
> > didn't quite work:
> [...]
> > For one, smp_call_function_many_cond() doesn't want to be called with
> > interrupts disabled, and we may very well get a KFENCE allocation or
> > page fault with interrupts disabled / within interrupts.
> >
> > Therefore, to be safe, we should avoid IPIs.
>
> set_direct_map_invalid_noflush() does that, too, I think? And that's
> already implemented for both arm64 and x86.

Sure, that works.

We still want the flush_tlb_one_kernel(), at least so the local CPU's
TLB is flushed.

> > It follows that setting
> > the page attribute is best-effort, and we can tolerate some
> > inaccuracy. Lazy fault handling should take care of faults after we
> > set the page as PRESENT.
> [...]
> > > Shouldn't kfence_handle_page_fault() happen after prefetch handling,
> > > at least? Maybe directly above the "oops" label?
> >
> > Good question. AFAIK it doesn't matter, as is_kfence_address() should
> > never apply for any of those that follow, right? In any case, it
> > shouldn't hurt to move it down.
>
> is_prefetch() ignores any #PF not caused by instruction fetch if it
> comes from kernel mode and the faulting instruction is one of the
> PREFETCH* instructions. (Which is not supposed to happen - the
> processor should just be ignoring the fault for PREFETCH instead of
> generating an exception AFAIK. But the comments say that this is about
> CPU bugs and stuff.) While this is probably not a big deal anymore
> partly because the kernel doesn't use software prefetching in many
> places anymore, it seems to me like, in principle, this could also
> cause page faults that should be ignored in KFENCE regions if someone
> tries to do PREFETCH on an out-of-bounds array element or a dangling
> pointer or something.

Thanks for the clarification.
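The comment promised above might read something like the following; the wording is a sketch based on the reasoning in this thread (avoiding IPIs, best-effort protection, lazy fault handling), not the final patch text.

	/*
	 * Flush this CPU's TLB only: avoiding IPIs keeps this usable with
	 * interrupts disabled or from interrupt context. Protection is
	 * therefore best-effort: a stale TLB entry on another CPU is
	 * tolerated, and faults taken after the page is set PRESENT again
	 * are handled lazily by the page fault handler.
	 */
	flush_tlb_one_kernel(addr);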
On Wed, Oct 07, 2020 at 04:41PM +0200, Marco Elver wrote:
> On Wed, 7 Oct 2020 at 16:15, Jann Horn <jannh@google.com> wrote:
[...]
> > > > > +		return false;
> > > > > +
> > > > > +	if (protect)
> > > > > +		set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
> > > > > +	else
> > > > > +		set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
> > > >
> > > > Hmm... do we have this helper (instead of using the existing helpers
> > > > for modifying memory permissions) to work around the allocation out of
> > > > the data section?
> > >
> > > I just played around with using the set_memory.c functions, to remind
> > > myself why this didn't work. I experimented with using
> > > set_memory_{np,p}() functions; set_memory_p() isn't implemented, but
> > > is easily added (which I did for below experiment). However, this
> > > didn't quite work:
> > [...]
> > > For one, smp_call_function_many_cond() doesn't want to be called with
> > > interrupts disabled, and we may very well get a KFENCE allocation or
> > > page fault with interrupts disabled / within interrupts.
> > >
> > > Therefore, to be safe, we should avoid IPIs.
> >
> > set_direct_map_invalid_noflush() does that, too, I think? And that's
> > already implemented for both arm64 and x86.
>
> Sure, that works.
>
> We still want the flush_tlb_one_kernel(), at least so the local CPU's
> TLB is flushed.

Nope, sorry, set_direct_map_invalid_noflush() does not work -- this
results in potential deadlock.

================================
WARNING: inconsistent lock state
5.9.0-rc4+ #2 Not tainted
--------------------------------
inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
ksoftirqd/1/16 [HC0[0]:SC1[1]:HE1:SE0] takes:
ffffffff89fcf9b8 (cpa_lock){+.?.}-{2:2}, at: spin_lock include/linux/spinlock.h:354 [inline]
ffffffff89fcf9b8 (cpa_lock){+.?.}-{2:2}, at: __change_page_attr_set_clr+0x1b0/0x2510 arch/x86/mm/pat/set_memory.c:1658
{SOFTIRQ-ON-W} state was registered at:
  lock_acquire+0x1f3/0xae0 kernel/locking/lockdep.c:5006
  __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline]
  _raw_spin_lock+0x2a/0x40 kernel/locking/spinlock.c:151
  spin_lock include/linux/spinlock.h:354 [inline]
  __change_page_attr_set_clr+0x1b0/0x2510 arch/x86/mm/pat/set_memory.c:1658
  change_page_attr_set_clr+0x333/0x500 arch/x86/mm/pat/set_memory.c:1752
  change_page_attr_set arch/x86/mm/pat/set_memory.c:1782 [inline]
  set_memory_nx+0xb2/0x110 arch/x86/mm/pat/set_memory.c:1930
  free_init_pages+0x73/0xc0 arch/x86/mm/init.c:876
  alternative_instructions+0x155/0x1a4 arch/x86/kernel/alternative.c:738
  check_bugs+0x1bd0/0x1c77 arch/x86/kernel/cpu/bugs.c:140
  start_kernel+0x486/0x4b6 init/main.c:1042
  secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:243
irq event stamp: 14564
hardirqs last  enabled at (14564): [<ffffffff8828cadf>] __raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:160 [inline]
hardirqs last  enabled at (14564): [<ffffffff8828cadf>] _raw_spin_unlock_irqrestore+0x6f/0x90 kernel/locking/spinlock.c:191
hardirqs last disabled at (14563): [<ffffffff8828d239>] __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:108 [inline]
hardirqs last disabled at (14563): [<ffffffff8828d239>] _raw_spin_lock_irqsave+0xa9/0xce kernel/locking/spinlock.c:159
softirqs last  enabled at (14486): [<ffffffff8147fcff>] run_ksoftirqd kernel/softirq.c:652 [inline]
softirqs last  enabled at (14486): [<ffffffff8147fcff>] run_ksoftirqd+0xcf/0x170 kernel/softirq.c:644
softirqs last disabled at (14491): [<ffffffff8147fcff>] run_ksoftirqd kernel/softirq.c:652 [inline]
softirqs last disabled at (14491): [<ffffffff8147fcff>] run_ksoftirqd+0xcf/0x170 kernel/softirq.c:644

other info that might help us debug this:
 Possible unsafe locking scenario:

       CPU0
       ----
  lock(cpa_lock);
  <Interrupt>
    lock(cpa_lock);

 *** DEADLOCK ***

1 lock held by ksoftirqd/1/16:
 #0: ffffffff8a067e20 (rcu_callback){....}-{0:0}, at: rcu_do_batch kernel/rcu/tree.c:2418 [inline]
 #0: ffffffff8a067e20 (rcu_callback){....}-{0:0}, at: rcu_core+0x55d/0x1130 kernel/rcu/tree.c:2656

stack backtrace:
CPU: 1 PID: 16 Comm: ksoftirqd/1 Not tainted 5.9.0-rc4+ #2
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x198/0x1fd lib/dump_stack.c:118
 print_usage_bug kernel/locking/lockdep.c:3350 [inline]
 valid_state kernel/locking/lockdep.c:3361 [inline]
 mark_lock_irq kernel/locking/lockdep.c:3575 [inline]
 mark_lock.cold+0x12/0x17 kernel/locking/lockdep.c:4006
 mark_usage kernel/locking/lockdep.c:3905 [inline]
 __lock_acquire+0x1159/0x5780 kernel/locking/lockdep.c:4380
 lock_acquire+0x1f3/0xae0 kernel/locking/lockdep.c:5006
 __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline]
 _raw_spin_lock+0x2a/0x40 kernel/locking/spinlock.c:151
 spin_lock include/linux/spinlock.h:354 [inline]
 __change_page_attr_set_clr+0x1b0/0x2510 arch/x86/mm/pat/set_memory.c:1658
 __set_pages_np arch/x86/mm/pat/set_memory.c:2184 [inline]
 set_direct_map_invalid_noflush+0xd2/0x110 arch/x86/mm/pat/set_memory.c:2189
 kfence_protect_page arch/x86/include/asm/kfence.h:62 [inline]
 kfence_protect+0x10e/0x120 mm/kfence/core.c:124
 kfence_guarded_free+0x380/0x880 mm/kfence/core.c:375
 rcu_do_batch kernel/rcu/tree.c:2428 [inline]
 rcu_core+0x5ca/0x1130 kernel/rcu/tree.c:2656
 __do_softirq+0x1f8/0xb23 kernel/softirq.c:298
 run_ksoftirqd kernel/softirq.c:652 [inline]
 run_ksoftirqd+0xcf/0x170 kernel/softirq.c:644
 smpboot_thread_fn+0x655/0x9e0 kernel/smpboot.c:165
 kthread+0x3b5/0x4a0 kernel/kthread.c:292
 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7101ac64bb20..e22dc722698c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -144,6 +144,8 @@ config X86
 	select HAVE_ARCH_JUMP_LABEL_RELATIVE
 	select HAVE_ARCH_KASAN			if X86_64
 	select HAVE_ARCH_KASAN_VMALLOC		if X86_64
+	select HAVE_ARCH_KFENCE
+	select HAVE_ARCH_KFENCE_STATIC_POOL
 	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_MMAP_RND_BITS		if MMU
 	select HAVE_ARCH_MMAP_RND_COMPAT_BITS	if MMU && COMPAT
diff --git a/arch/x86/include/asm/kfence.h b/arch/x86/include/asm/kfence.h
new file mode 100644
index 000000000000..98fb1cd80026
--- /dev/null
+++ b/arch/x86/include/asm/kfence.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _ASM_X86_KFENCE_H
+#define _ASM_X86_KFENCE_H
+
+#include <linux/bug.h>
+#include <linux/kfence.h>
+
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/set_memory.h>
+#include <asm/tlbflush.h>
+
+/* The alignment should be at least a 4K page. */
+#define __kfence_pool_attrs __aligned(PAGE_SIZE)
+
+/*
+ * The page fault handler entry function, up to which the stack trace is
+ * truncated in reports.
+ */
+#define KFENCE_SKIP_ARCH_FAULT_HANDLER "asm_exc_page_fault"
+
+/* Force 4K pages for __kfence_pool. */
+static inline bool arch_kfence_initialize_pool(void)
+{
+	unsigned long addr;
+
+	for (addr = (unsigned long)__kfence_pool; is_kfence_address((void *)addr);
+	     addr += PAGE_SIZE) {
+		unsigned int level;
+
+		if (!lookup_address(addr, &level))
+			return false;
+
+		if (level != PG_LEVEL_4K)
+			set_memory_4k(addr, 1);
+	}
+
+	return true;
+}
+
+/* Protect the given page and flush TLBs. */
+static inline bool kfence_protect_page(unsigned long addr, bool protect)
+{
+	unsigned int level;
+	pte_t *pte = lookup_address(addr, &level);
+
+	if (!pte || level != PG_LEVEL_4K)
+		return false;
+
+	if (protect)
+		set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
+	else
+		set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
+
+	flush_tlb_one_kernel(addr);
+	return true;
+}
+
+#endif /* _ASM_X86_KFENCE_H */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 6e3e8a124903..423e15ad5eb6 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -9,6 +9,7 @@
 #include <linux/kdebug.h>		/* oops_begin/end, ...		*/
 #include <linux/extable.h>		/* search_exception_tables	*/
 #include <linux/memblock.h>		/* max_low_pfn			*/
+#include <linux/kfence.h>		/* kfence_handle_page_fault	*/
 #include <linux/kprobes.h>		/* NOKPROBE_SYMBOL, ...		*/
 #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
 #include <linux/perf_event.h>		/* perf_sw_event		*/
@@ -701,6 +702,9 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 	}
 #endif
 
+	if (kfence_handle_page_fault(address))
+		return;
+
 	/*
 	 * 32-bit:
 	 *
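For context on how the arch hook above is consumed, and where the KFENCE_WARN_ON mentioned earlier in the thread lives: based on the descriptions and the backtraces quoted above (kfence_protect at mm/kfence/core.c:124, kfence_unprotect at core.c:139), the mm/kfence/core.c wrappers presumably look roughly like this sketch, which is not part of this patch.

/* mm/kfence/core.c (sketch): a failed page (un)protection trips the WARN
 * and, per the discussion above, disables KFENCE. */
static bool kfence_protect(unsigned long addr)
{
	return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
}

static bool kfence_unprotect(unsigned long addr)
{
	return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), false));
}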