diff mbox series

[01/17] Fix "x86/alternatives: Lockdep-enforce text_mutex in text_poke*()"

Message ID 20190117003259.23141-2-rick.p.edgecombe@intel.com (mailing list archive)
State New, archived
Headers show
Series Merge text_poke fixes and executable lockdowns | expand

Commit Message

Edgecombe, Rick P Jan. 17, 2019, 12:32 a.m. UTC
From: Nadav Amit <namit@vmware.com>

text_mutex is currently expected to be held before text_poke() is
called, but we kgdb does not take the mutex, and instead *supposedly*
ensures the lock is not taken and will not be acquired by any other core
while text_poke() is running.

The reason for the "supposedly" comment is that it is not entirely clear
that this would be the case if gdb_do_roundup is zero.

This patch creates two wrapper functions, text_poke() and
text_poke_kgdb() which do or do not run the lockdep assertion
respectively.

While we are at it, change the return code of text_poke() to something
meaningful. One day, callers might actually respect it and the existing
BUG_ON() when patching fails could be removed. For kgdb, the return
value can actually be used.

Cc: Andy Lutomirski <luto@kernel.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Fixes: 9222f606506c ("x86/alternatives: Lockdep-enforce text_mutex in text_poke*()")
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Signed-off-by: Nadav Amit <namit@vmware.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
---
 arch/x86/include/asm/text-patching.h |  1 +
 arch/x86/kernel/alternative.c        | 52 ++++++++++++++++++++--------
 arch/x86/kernel/kgdb.c               | 11 +++---
 3 files changed, 45 insertions(+), 19 deletions(-)

Comments

Masami Hiramatsu (Google) Jan. 17, 2019, 6:47 a.m. UTC | #1
On Wed, 16 Jan 2019 16:32:43 -0800
Rick Edgecombe <rick.p.edgecombe@intel.com> wrote:

> From: Nadav Amit <namit@vmware.com>
> 
> text_mutex is currently expected to be held before text_poke() is
> called, but we kgdb does not take the mutex, and instead *supposedly*
> ensures the lock is not taken and will not be acquired by any other core
> while text_poke() is running.
> 
> The reason for the "supposedly" comment is that it is not entirely clear
> that this would be the case if gdb_do_roundup is zero.
> 
> This patch creates two wrapper functions, text_poke() and
> text_poke_kgdb() which do or do not run the lockdep assertion
> respectively.
> 
> While we are at it, change the return code of text_poke() to something
> meaningful. One day, callers might actually respect it and the existing
> BUG_ON() when patching fails could be removed. For kgdb, the return
> value can actually be used.

Looks good to me.

Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>

Thank you,

> 
> Cc: Andy Lutomirski <luto@kernel.org>
> Cc: Kees Cook <keescook@chromium.org>
> Cc: Dave Hansen <dave.hansen@intel.com>
> Cc: Masami Hiramatsu <mhiramat@kernel.org>
> Fixes: 9222f606506c ("x86/alternatives: Lockdep-enforce text_mutex in text_poke*()")
> Suggested-by: Peter Zijlstra <peterz@infradead.org>
> Acked-by: Jiri Kosina <jkosina@suse.cz>
> Signed-off-by: Nadav Amit <namit@vmware.com>
> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
> ---
>  arch/x86/include/asm/text-patching.h |  1 +
>  arch/x86/kernel/alternative.c        | 52 ++++++++++++++++++++--------
>  arch/x86/kernel/kgdb.c               | 11 +++---
>  3 files changed, 45 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
> index e85ff65c43c3..f8fc8e86cf01 100644
> --- a/arch/x86/include/asm/text-patching.h
> +++ b/arch/x86/include/asm/text-patching.h
> @@ -35,6 +35,7 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len);
>   * inconsistent instruction while you patch.
>   */
>  extern void *text_poke(void *addr, const void *opcode, size_t len);
> +extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
>  extern int poke_int3_handler(struct pt_regs *regs);
>  extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
>  extern int after_bootmem;
> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> index ebeac487a20c..c6a3a10a2fd5 100644
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -678,18 +678,7 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode,
>  	return addr;
>  }
>  
> -/**
> - * text_poke - Update instructions on a live kernel
> - * @addr: address to modify
> - * @opcode: source of the copy
> - * @len: length to copy
> - *
> - * Only atomic text poke/set should be allowed when not doing early patching.
> - * It means the size must be writable atomically and the address must be aligned
> - * in a way that permits an atomic write. It also makes sure we fit on a single
> - * page.
> - */
> -void *text_poke(void *addr, const void *opcode, size_t len)
> +static void *__text_poke(void *addr, const void *opcode, size_t len)
>  {
>  	unsigned long flags;
>  	char *vaddr;
> @@ -702,8 +691,6 @@ void *text_poke(void *addr, const void *opcode, size_t len)
>  	 */
>  	BUG_ON(!after_bootmem);
>  
> -	lockdep_assert_held(&text_mutex);
> -
>  	if (!core_kernel_text((unsigned long)addr)) {
>  		pages[0] = vmalloc_to_page(addr);
>  		pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
> @@ -732,6 +719,43 @@ void *text_poke(void *addr, const void *opcode, size_t len)
>  	return addr;
>  }
>  
> +/**
> + * text_poke - Update instructions on a live kernel
> + * @addr: address to modify
> + * @opcode: source of the copy
> + * @len: length to copy
> + *
> + * Only atomic text poke/set should be allowed when not doing early patching.
> + * It means the size must be writable atomically and the address must be aligned
> + * in a way that permits an atomic write. It also makes sure we fit on a single
> + * page.
> + */
> +void *text_poke(void *addr, const void *opcode, size_t len)
> +{
> +	lockdep_assert_held(&text_mutex);
> +
> +	return __text_poke(addr, opcode, len);
> +}
> +
> +/**
> + * text_poke_kgdb - Update instructions on a live kernel by kgdb
> + * @addr: address to modify
> + * @opcode: source of the copy
> + * @len: length to copy
> + *
> + * Only atomic text poke/set should be allowed when not doing early patching.
> + * It means the size must be writable atomically and the address must be aligned
> + * in a way that permits an atomic write. It also makes sure we fit on a single
> + * page.
> + *
> + * Context: should only be used by kgdb, which ensures no other core is running,
> + *	    despite the fact it does not hold the text_mutex.
> + */
> +void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
> +{
> +	return __text_poke(addr, opcode, len);
> +}
> +
>  static void do_sync_core(void *info)
>  {
>  	sync_core();
> diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
> index 5db08425063e..1461544cba8b 100644
> --- a/arch/x86/kernel/kgdb.c
> +++ b/arch/x86/kernel/kgdb.c
> @@ -758,13 +758,13 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
>  	if (!err)
>  		return err;
>  	/*
> -	 * It is safe to call text_poke() because normal kernel execution
> +	 * It is safe to call text_poke_kgdb() because normal kernel execution
>  	 * is stopped on all cores, so long as the text_mutex is not locked.
>  	 */
>  	if (mutex_is_locked(&text_mutex))
>  		return -EBUSY;
> -	text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
> -		  BREAK_INSTR_SIZE);
> +	text_poke_kgdb((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
> +		       BREAK_INSTR_SIZE);
>  	err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
>  	if (err)
>  		return err;
> @@ -783,12 +783,13 @@ int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
>  	if (bpt->type != BP_POKE_BREAKPOINT)
>  		goto knl_write;
>  	/*
> -	 * It is safe to call text_poke() because normal kernel execution
> +	 * It is safe to call text_poke_kgdb() because normal kernel execution
>  	 * is stopped on all cores, so long as the text_mutex is not locked.
>  	 */
>  	if (mutex_is_locked(&text_mutex))
>  		goto knl_write;
> -	text_poke((void *)bpt->bpt_addr, bpt->saved_instr, BREAK_INSTR_SIZE);
> +	text_poke_kgdb((void *)bpt->bpt_addr, bpt->saved_instr,
> +		       BREAK_INSTR_SIZE);
>  	err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
>  	if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE))
>  		goto knl_write;
> -- 
> 2.17.1
>
H. Peter Anvin Jan. 17, 2019, 9:15 p.m. UTC | #2
On January 16, 2019 10:47:01 PM PST, Masami Hiramatsu <mhiramat@kernel.org> wrote:
>On Wed, 16 Jan 2019 16:32:43 -0800
>Rick Edgecombe <rick.p.edgecombe@intel.com> wrote:
>
>> From: Nadav Amit <namit@vmware.com>
>> 
>> text_mutex is currently expected to be held before text_poke() is
>> called, but we kgdb does not take the mutex, and instead *supposedly*
>> ensures the lock is not taken and will not be acquired by any other
>core
>> while text_poke() is running.
>> 
>> The reason for the "supposedly" comment is that it is not entirely
>clear
>> that this would be the case if gdb_do_roundup is zero.
>> 
>> This patch creates two wrapper functions, text_poke() and
>> text_poke_kgdb() which do or do not run the lockdep assertion
>> respectively.
>> 
>> While we are at it, change the return code of text_poke() to
>something
>> meaningful. One day, callers might actually respect it and the
>existing
>> BUG_ON() when patching fails could be removed. For kgdb, the return
>> value can actually be used.
>
>Looks good to me.
>
>Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
>
>Thank you,
>
>> 
>> Cc: Andy Lutomirski <luto@kernel.org>
>> Cc: Kees Cook <keescook@chromium.org>
>> Cc: Dave Hansen <dave.hansen@intel.com>
>> Cc: Masami Hiramatsu <mhiramat@kernel.org>
>> Fixes: 9222f606506c ("x86/alternatives: Lockdep-enforce text_mutex in
>text_poke*()")
>> Suggested-by: Peter Zijlstra <peterz@infradead.org>
>> Acked-by: Jiri Kosina <jkosina@suse.cz>
>> Signed-off-by: Nadav Amit <namit@vmware.com>
>> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
>> ---
>>  arch/x86/include/asm/text-patching.h |  1 +
>>  arch/x86/kernel/alternative.c        | 52
>++++++++++++++++++++--------
>>  arch/x86/kernel/kgdb.c               | 11 +++---
>>  3 files changed, 45 insertions(+), 19 deletions(-)
>> 
>> diff --git a/arch/x86/include/asm/text-patching.h
>b/arch/x86/include/asm/text-patching.h
>> index e85ff65c43c3..f8fc8e86cf01 100644
>> --- a/arch/x86/include/asm/text-patching.h
>> +++ b/arch/x86/include/asm/text-patching.h
>> @@ -35,6 +35,7 @@ extern void *text_poke_early(void *addr, const void
>*opcode, size_t len);
>>   * inconsistent instruction while you patch.
>>   */
>>  extern void *text_poke(void *addr, const void *opcode, size_t len);
>> +extern void *text_poke_kgdb(void *addr, const void *opcode, size_t
>len);
>>  extern int poke_int3_handler(struct pt_regs *regs);
>>  extern void *text_poke_bp(void *addr, const void *opcode, size_t
>len, void *handler);
>>  extern int after_bootmem;
>> diff --git a/arch/x86/kernel/alternative.c
>b/arch/x86/kernel/alternative.c
>> index ebeac487a20c..c6a3a10a2fd5 100644
>> --- a/arch/x86/kernel/alternative.c
>> +++ b/arch/x86/kernel/alternative.c
>> @@ -678,18 +678,7 @@ void *__init_or_module text_poke_early(void
>*addr, const void *opcode,
>>  	return addr;
>>  }
>>  
>> -/**
>> - * text_poke - Update instructions on a live kernel
>> - * @addr: address to modify
>> - * @opcode: source of the copy
>> - * @len: length to copy
>> - *
>> - * Only atomic text poke/set should be allowed when not doing early
>patching.
>> - * It means the size must be writable atomically and the address
>must be aligned
>> - * in a way that permits an atomic write. It also makes sure we fit
>on a single
>> - * page.
>> - */
>> -void *text_poke(void *addr, const void *opcode, size_t len)
>> +static void *__text_poke(void *addr, const void *opcode, size_t len)
>>  {
>>  	unsigned long flags;
>>  	char *vaddr;
>> @@ -702,8 +691,6 @@ void *text_poke(void *addr, const void *opcode,
>size_t len)
>>  	 */
>>  	BUG_ON(!after_bootmem);
>>  
>> -	lockdep_assert_held(&text_mutex);
>> -
>>  	if (!core_kernel_text((unsigned long)addr)) {
>>  		pages[0] = vmalloc_to_page(addr);
>>  		pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
>> @@ -732,6 +719,43 @@ void *text_poke(void *addr, const void *opcode,
>size_t len)
>>  	return addr;
>>  }
>>  
>> +/**
>> + * text_poke - Update instructions on a live kernel
>> + * @addr: address to modify
>> + * @opcode: source of the copy
>> + * @len: length to copy
>> + *
>> + * Only atomic text poke/set should be allowed when not doing early
>patching.
>> + * It means the size must be writable atomically and the address
>must be aligned
>> + * in a way that permits an atomic write. It also makes sure we fit
>on a single
>> + * page.
>> + */
>> +void *text_poke(void *addr, const void *opcode, size_t len)
>> +{
>> +	lockdep_assert_held(&text_mutex);
>> +
>> +	return __text_poke(addr, opcode, len);
>> +}
>> +
>> +/**
>> + * text_poke_kgdb - Update instructions on a live kernel by kgdb
>> + * @addr: address to modify
>> + * @opcode: source of the copy
>> + * @len: length to copy
>> + *
>> + * Only atomic text poke/set should be allowed when not doing early
>patching.
>> + * It means the size must be writable atomically and the address
>must be aligned
>> + * in a way that permits an atomic write. It also makes sure we fit
>on a single
>> + * page.
>> + *
>> + * Context: should only be used by kgdb, which ensures no other core
>is running,
>> + *	    despite the fact it does not hold the text_mutex.
>> + */
>> +void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
>> +{
>> +	return __text_poke(addr, opcode, len);
>> +}
>> +
>>  static void do_sync_core(void *info)
>>  {
>>  	sync_core();
>> diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
>> index 5db08425063e..1461544cba8b 100644
>> --- a/arch/x86/kernel/kgdb.c
>> +++ b/arch/x86/kernel/kgdb.c
>> @@ -758,13 +758,13 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt
>*bpt)
>>  	if (!err)
>>  		return err;
>>  	/*
>> -	 * It is safe to call text_poke() because normal kernel execution
>> +	 * It is safe to call text_poke_kgdb() because normal kernel
>execution
>>  	 * is stopped on all cores, so long as the text_mutex is not
>locked.
>>  	 */
>>  	if (mutex_is_locked(&text_mutex))
>>  		return -EBUSY;
>> -	text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
>> -		  BREAK_INSTR_SIZE);
>> +	text_poke_kgdb((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
>> +		       BREAK_INSTR_SIZE);
>>  	err = probe_kernel_read(opc, (char *)bpt->bpt_addr,
>BREAK_INSTR_SIZE);
>>  	if (err)
>>  		return err;
>> @@ -783,12 +783,13 @@ int kgdb_arch_remove_breakpoint(struct
>kgdb_bkpt *bpt)
>>  	if (bpt->type != BP_POKE_BREAKPOINT)
>>  		goto knl_write;
>>  	/*
>> -	 * It is safe to call text_poke() because normal kernel execution
>> +	 * It is safe to call text_poke_kgdb() because normal kernel
>execution
>>  	 * is stopped on all cores, so long as the text_mutex is not
>locked.
>>  	 */
>>  	if (mutex_is_locked(&text_mutex))
>>  		goto knl_write;
>> -	text_poke((void *)bpt->bpt_addr, bpt->saved_instr,
>BREAK_INSTR_SIZE);
>> +	text_poke_kgdb((void *)bpt->bpt_addr, bpt->saved_instr,
>> +		       BREAK_INSTR_SIZE);
>>  	err = probe_kernel_read(opc, (char *)bpt->bpt_addr,
>BREAK_INSTR_SIZE);
>>  	if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE))
>>  		goto knl_write;
>> -- 
>> 2.17.1
>> 

If you are reorganizing this code, please do so so that the caller doesn't have to worry about if it should call text_poke_bp() or text_poke_early(). Right now the caller had to know that, which makes no sense.
Nadav Amit Jan. 17, 2019, 10:39 p.m. UTC | #3
> On Jan 17, 2019, at 1:15 PM, hpa@zytor.com wrote:
> 
> On January 16, 2019 10:47:01 PM PST, Masami Hiramatsu <mhiramat@kernel.org> wrote:
>> On Wed, 16 Jan 2019 16:32:43 -0800
>> Rick Edgecombe <rick.p.edgecombe@intel.com> wrote:
>> 
>>> From: Nadav Amit <namit@vmware.com>
>>> 
>>> text_mutex is currently expected to be held before text_poke() is
>>> called, but we kgdb does not take the mutex, and instead *supposedly*
>>> ensures the lock is not taken and will not be acquired by any other
>> core
>>> while text_poke() is running.
>>> 
>>> The reason for the "supposedly" comment is that it is not entirely
>> clear
>>> that this would be the case if gdb_do_roundup is zero.
>>> 
>>> This patch creates two wrapper functions, text_poke() and
>>> text_poke_kgdb() which do or do not run the lockdep assertion
>>> respectively.
>>> 
>>> While we are at it, change the return code of text_poke() to
>> something
>>> meaningful. One day, callers might actually respect it and the
>> existing
>>> BUG_ON() when patching fails could be removed. For kgdb, the return
>>> value can actually be used.
>> 
>> Looks good to me.
>> 
>> Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
>> 
>> Thank you,
>> 
>>> Cc: Andy Lutomirski <luto@kernel.org>
>>> Cc: Kees Cook <keescook@chromium.org>
>>> Cc: Dave Hansen <dave.hansen@intel.com>
>>> Cc: Masami Hiramatsu <mhiramat@kernel.org>
>>> Fixes: 9222f606506c ("x86/alternatives: Lockdep-enforce text_mutex in
>> text_poke*()")
>>> Suggested-by: Peter Zijlstra <peterz@infradead.org>
>>> Acked-by: Jiri Kosina <jkosina@suse.cz>
>>> Signed-off-by: Nadav Amit <namit@vmware.com>
>>> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
>>> ---
>>> arch/x86/include/asm/text-patching.h |  1 +
>>> arch/x86/kernel/alternative.c        | 52
>> ++++++++++++++++++++--------
>>> arch/x86/kernel/kgdb.c               | 11 +++---
>>> 3 files changed, 45 insertions(+), 19 deletions(-)
>>> 
>>> diff --git a/arch/x86/include/asm/text-patching.h
>> b/arch/x86/include/asm/text-patching.h
>>> index e85ff65c43c3..f8fc8e86cf01 100644
>>> --- a/arch/x86/include/asm/text-patching.h
>>> +++ b/arch/x86/include/asm/text-patching.h
>>> @@ -35,6 +35,7 @@ extern void *text_poke_early(void *addr, const void
>> *opcode, size_t len);
>>>  * inconsistent instruction while you patch.
>>>  */
>>> extern void *text_poke(void *addr, const void *opcode, size_t len);
>>> +extern void *text_poke_kgdb(void *addr, const void *opcode, size_t
>> len);
>>> extern int poke_int3_handler(struct pt_regs *regs);
>>> extern void *text_poke_bp(void *addr, const void *opcode, size_t
>> len, void *handler);
>>> extern int after_bootmem;
>>> diff --git a/arch/x86/kernel/alternative.c
>> b/arch/x86/kernel/alternative.c
>>> index ebeac487a20c..c6a3a10a2fd5 100644
>>> --- a/arch/x86/kernel/alternative.c
>>> +++ b/arch/x86/kernel/alternative.c
>>> @@ -678,18 +678,7 @@ void *__init_or_module text_poke_early(void
>> *addr, const void *opcode,
>>> return addr;
>>> }
>>> 
>>> -/**
>>> - * text_poke - Update instructions on a live kernel
>>> - * @addr: address to modify
>>> - * @opcode: source of the copy
>>> - * @len: length to copy
>>> - *
>>> - * Only atomic text poke/set should be allowed when not doing early
>> patching.
>>> - * It means the size must be writable atomically and the address
>> must be aligned
>>> - * in a way that permits an atomic write. It also makes sure we fit
>> on a single
>>> - * page.
>>> - */
>>> -void *text_poke(void *addr, const void *opcode, size_t len)
>>> +static void *__text_poke(void *addr, const void *opcode, size_t len)
>>> {
>>> 	unsigned long flags;
>>> 	char *vaddr;
>>> @@ -702,8 +691,6 @@ void *text_poke(void *addr, const void *opcode,
>> size_t len)
>>>  */
>>> 	BUG_ON(!after_bootmem);
>>> 
>>> -	lockdep_assert_held(&text_mutex);
>>> -
>>> 	if (!core_kernel_text((unsigned long)addr)) {
>>> 		pages[0] = vmalloc_to_page(addr);
>>> 		pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
>>> @@ -732,6 +719,43 @@ void *text_poke(void *addr, const void *opcode,
>> size_t len)
>>> return addr;
>>> }
>>> 
>>> +/**
>>> + * text_poke - Update instructions on a live kernel
>>> + * @addr: address to modify
>>> + * @opcode: source of the copy
>>> + * @len: length to copy
>>> + *
>>> + * Only atomic text poke/set should be allowed when not doing early
>> patching.
>>> + * It means the size must be writable atomically and the address
>> must be aligned
>>> + * in a way that permits an atomic write. It also makes sure we fit
>> on a single
>>> + * page.
>>> + */
>>> +void *text_poke(void *addr, const void *opcode, size_t len)
>>> +{
>>> +	lockdep_assert_held(&text_mutex);
>>> +
>>> +	return __text_poke(addr, opcode, len);
>>> +}
>>> +
>>> +/**
>>> + * text_poke_kgdb - Update instructions on a live kernel by kgdb
>>> + * @addr: address to modify
>>> + * @opcode: source of the copy
>>> + * @len: length to copy
>>> + *
>>> + * Only atomic text poke/set should be allowed when not doing early
>> patching.
>>> + * It means the size must be writable atomically and the address
>> must be aligned
>>> + * in a way that permits an atomic write. It also makes sure we fit
>> on a single
>>> + * page.
>>> + *
>>> + * Context: should only be used by kgdb, which ensures no other core
>> is running,
>>> + *	    despite the fact it does not hold the text_mutex.
>>> + */
>>> +void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
>>> +{
>>> +	return __text_poke(addr, opcode, len);
>>> +}
>>> +
>>> static void do_sync_core(void *info)
>>> {
>>> 	sync_core();
>>> diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
>>> index 5db08425063e..1461544cba8b 100644
>>> --- a/arch/x86/kernel/kgdb.c
>>> +++ b/arch/x86/kernel/kgdb.c
>>> @@ -758,13 +758,13 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt
>> *bpt)
>>> if (!err)
>>> 		return err;
>>> 	/*
>>> -	 * It is safe to call text_poke() because normal kernel execution
>>> +	 * It is safe to call text_poke_kgdb() because normal kernel
>> execution
>>>  * is stopped on all cores, so long as the text_mutex is not
>> locked.
>>>  */
>>> 	if (mutex_is_locked(&text_mutex))
>>> 		return -EBUSY;
>>> -	text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
>>> -		  BREAK_INSTR_SIZE);
>>> +	text_poke_kgdb((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
>>> +		       BREAK_INSTR_SIZE);
>>> 	err = probe_kernel_read(opc, (char *)bpt->bpt_addr,
>> BREAK_INSTR_SIZE);
>>> if (err)
>>> 		return err;
>>> @@ -783,12 +783,13 @@ int kgdb_arch_remove_breakpoint(struct
>> kgdb_bkpt *bpt)
>>> if (bpt->type != BP_POKE_BREAKPOINT)
>>> 		goto knl_write;
>>> 	/*
>>> -	 * It is safe to call text_poke() because normal kernel execution
>>> +	 * It is safe to call text_poke_kgdb() because normal kernel
>> execution
>>>  * is stopped on all cores, so long as the text_mutex is not
>> locked.
>>>  */
>>> 	if (mutex_is_locked(&text_mutex))
>>> 		goto knl_write;
>>> -	text_poke((void *)bpt->bpt_addr, bpt->saved_instr,
>> BREAK_INSTR_SIZE);
>>> +	text_poke_kgdb((void *)bpt->bpt_addr, bpt->saved_instr,
>>> +		       BREAK_INSTR_SIZE);
>>> 	err = probe_kernel_read(opc, (char *)bpt->bpt_addr,
>> BREAK_INSTR_SIZE);
>>> if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE))
>>> 		goto knl_write;
>>> -- 
>>> 2.17.1
> 
> If you are reorganizing this code, please do so so that the caller doesn’t
> have to worry about if it should call text_poke_bp() or text_poke_early().
> Right now the caller had to know that, which makes no sense.

Did you look at "[11/17] x86/jump-label: remove support for custom poker”?

https://lore.kernel.org/patchwork/patch/1032857/

If this is not what you regard, please be more concrete. text_poke_early()
is still used directly on init and while modules are loaded, which might not
be great, but is outside of the scope of this patch-set.
H. Peter Anvin Jan. 17, 2019, 10:59 p.m. UTC | #4
On January 17, 2019 2:39:15 PM PST, Nadav Amit <namit@vmware.com> wrote:
>> On Jan 17, 2019, at 1:15 PM, hpa@zytor.com wrote:
>> 
>> On January 16, 2019 10:47:01 PM PST, Masami Hiramatsu
><mhiramat@kernel.org> wrote:
>>> On Wed, 16 Jan 2019 16:32:43 -0800
>>> Rick Edgecombe <rick.p.edgecombe@intel.com> wrote:
>>> 
>>>> From: Nadav Amit <namit@vmware.com>
>>>> 
>>>> text_mutex is currently expected to be held before text_poke() is
>>>> called, but we kgdb does not take the mutex, and instead
>*supposedly*
>>>> ensures the lock is not taken and will not be acquired by any other
>>> core
>>>> while text_poke() is running.
>>>> 
>>>> The reason for the "supposedly" comment is that it is not entirely
>>> clear
>>>> that this would be the case if gdb_do_roundup is zero.
>>>> 
>>>> This patch creates two wrapper functions, text_poke() and
>>>> text_poke_kgdb() which do or do not run the lockdep assertion
>>>> respectively.
>>>> 
>>>> While we are at it, change the return code of text_poke() to
>>> something
>>>> meaningful. One day, callers might actually respect it and the
>>> existing
>>>> BUG_ON() when patching fails could be removed. For kgdb, the return
>>>> value can actually be used.
>>> 
>>> Looks good to me.
>>> 
>>> Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
>>> 
>>> Thank you,
>>> 
>>>> Cc: Andy Lutomirski <luto@kernel.org>
>>>> Cc: Kees Cook <keescook@chromium.org>
>>>> Cc: Dave Hansen <dave.hansen@intel.com>
>>>> Cc: Masami Hiramatsu <mhiramat@kernel.org>
>>>> Fixes: 9222f606506c ("x86/alternatives: Lockdep-enforce text_mutex
>in
>>> text_poke*()")
>>>> Suggested-by: Peter Zijlstra <peterz@infradead.org>
>>>> Acked-by: Jiri Kosina <jkosina@suse.cz>
>>>> Signed-off-by: Nadav Amit <namit@vmware.com>
>>>> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
>>>> ---
>>>> arch/x86/include/asm/text-patching.h |  1 +
>>>> arch/x86/kernel/alternative.c        | 52
>>> ++++++++++++++++++++--------
>>>> arch/x86/kernel/kgdb.c               | 11 +++---
>>>> 3 files changed, 45 insertions(+), 19 deletions(-)
>>>> 
>>>> diff --git a/arch/x86/include/asm/text-patching.h
>>> b/arch/x86/include/asm/text-patching.h
>>>> index e85ff65c43c3..f8fc8e86cf01 100644
>>>> --- a/arch/x86/include/asm/text-patching.h
>>>> +++ b/arch/x86/include/asm/text-patching.h
>>>> @@ -35,6 +35,7 @@ extern void *text_poke_early(void *addr, const
>void
>>> *opcode, size_t len);
>>>>  * inconsistent instruction while you patch.
>>>>  */
>>>> extern void *text_poke(void *addr, const void *opcode, size_t len);
>>>> +extern void *text_poke_kgdb(void *addr, const void *opcode, size_t
>>> len);
>>>> extern int poke_int3_handler(struct pt_regs *regs);
>>>> extern void *text_poke_bp(void *addr, const void *opcode, size_t
>>> len, void *handler);
>>>> extern int after_bootmem;
>>>> diff --git a/arch/x86/kernel/alternative.c
>>> b/arch/x86/kernel/alternative.c
>>>> index ebeac487a20c..c6a3a10a2fd5 100644
>>>> --- a/arch/x86/kernel/alternative.c
>>>> +++ b/arch/x86/kernel/alternative.c
>>>> @@ -678,18 +678,7 @@ void *__init_or_module text_poke_early(void
>>> *addr, const void *opcode,
>>>> return addr;
>>>> }
>>>> 
>>>> -/**
>>>> - * text_poke - Update instructions on a live kernel
>>>> - * @addr: address to modify
>>>> - * @opcode: source of the copy
>>>> - * @len: length to copy
>>>> - *
>>>> - * Only atomic text poke/set should be allowed when not doing
>early
>>> patching.
>>>> - * It means the size must be writable atomically and the address
>>> must be aligned
>>>> - * in a way that permits an atomic write. It also makes sure we
>fit
>>> on a single
>>>> - * page.
>>>> - */
>>>> -void *text_poke(void *addr, const void *opcode, size_t len)
>>>> +static void *__text_poke(void *addr, const void *opcode, size_t
>len)
>>>> {
>>>> 	unsigned long flags;
>>>> 	char *vaddr;
>>>> @@ -702,8 +691,6 @@ void *text_poke(void *addr, const void *opcode,
>>> size_t len)
>>>>  */
>>>> 	BUG_ON(!after_bootmem);
>>>> 
>>>> -	lockdep_assert_held(&text_mutex);
>>>> -
>>>> 	if (!core_kernel_text((unsigned long)addr)) {
>>>> 		pages[0] = vmalloc_to_page(addr);
>>>> 		pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
>>>> @@ -732,6 +719,43 @@ void *text_poke(void *addr, const void
>*opcode,
>>> size_t len)
>>>> return addr;
>>>> }
>>>> 
>>>> +/**
>>>> + * text_poke - Update instructions on a live kernel
>>>> + * @addr: address to modify
>>>> + * @opcode: source of the copy
>>>> + * @len: length to copy
>>>> + *
>>>> + * Only atomic text poke/set should be allowed when not doing
>early
>>> patching.
>>>> + * It means the size must be writable atomically and the address
>>> must be aligned
>>>> + * in a way that permits an atomic write. It also makes sure we
>fit
>>> on a single
>>>> + * page.
>>>> + */
>>>> +void *text_poke(void *addr, const void *opcode, size_t len)
>>>> +{
>>>> +	lockdep_assert_held(&text_mutex);
>>>> +
>>>> +	return __text_poke(addr, opcode, len);
>>>> +}
>>>> +
>>>> +/**
>>>> + * text_poke_kgdb - Update instructions on a live kernel by kgdb
>>>> + * @addr: address to modify
>>>> + * @opcode: source of the copy
>>>> + * @len: length to copy
>>>> + *
>>>> + * Only atomic text poke/set should be allowed when not doing
>early
>>> patching.
>>>> + * It means the size must be writable atomically and the address
>>> must be aligned
>>>> + * in a way that permits an atomic write. It also makes sure we
>fit
>>> on a single
>>>> + * page.
>>>> + *
>>>> + * Context: should only be used by kgdb, which ensures no other
>core
>>> is running,
>>>> + *	    despite the fact it does not hold the text_mutex.
>>>> + */
>>>> +void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
>>>> +{
>>>> +	return __text_poke(addr, opcode, len);
>>>> +}
>>>> +
>>>> static void do_sync_core(void *info)
>>>> {
>>>> 	sync_core();
>>>> diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
>>>> index 5db08425063e..1461544cba8b 100644
>>>> --- a/arch/x86/kernel/kgdb.c
>>>> +++ b/arch/x86/kernel/kgdb.c
>>>> @@ -758,13 +758,13 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt
>>> *bpt)
>>>> if (!err)
>>>> 		return err;
>>>> 	/*
>>>> -	 * It is safe to call text_poke() because normal kernel execution
>>>> +	 * It is safe to call text_poke_kgdb() because normal kernel
>>> execution
>>>>  * is stopped on all cores, so long as the text_mutex is not
>>> locked.
>>>>  */
>>>> 	if (mutex_is_locked(&text_mutex))
>>>> 		return -EBUSY;
>>>> -	text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
>>>> -		  BREAK_INSTR_SIZE);
>>>> +	text_poke_kgdb((void *)bpt->bpt_addr,
>arch_kgdb_ops.gdb_bpt_instr,
>>>> +		       BREAK_INSTR_SIZE);
>>>> 	err = probe_kernel_read(opc, (char *)bpt->bpt_addr,
>>> BREAK_INSTR_SIZE);
>>>> if (err)
>>>> 		return err;
>>>> @@ -783,12 +783,13 @@ int kgdb_arch_remove_breakpoint(struct
>>> kgdb_bkpt *bpt)
>>>> if (bpt->type != BP_POKE_BREAKPOINT)
>>>> 		goto knl_write;
>>>> 	/*
>>>> -	 * It is safe to call text_poke() because normal kernel execution
>>>> +	 * It is safe to call text_poke_kgdb() because normal kernel
>>> execution
>>>>  * is stopped on all cores, so long as the text_mutex is not
>>> locked.
>>>>  */
>>>> 	if (mutex_is_locked(&text_mutex))
>>>> 		goto knl_write;
>>>> -	text_poke((void *)bpt->bpt_addr, bpt->saved_instr,
>>> BREAK_INSTR_SIZE);
>>>> +	text_poke_kgdb((void *)bpt->bpt_addr, bpt->saved_instr,
>>>> +		       BREAK_INSTR_SIZE);
>>>> 	err = probe_kernel_read(opc, (char *)bpt->bpt_addr,
>>> BREAK_INSTR_SIZE);
>>>> if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE))
>>>> 		goto knl_write;
>>>> -- 
>>>> 2.17.1
>> 
>> If you are reorganizing this code, please do so so that the caller
>doesn’t
>> have to worry about if it should call text_poke_bp() or
>text_poke_early().
>> Right now the caller had to know that, which makes no sense.
>
>Did you look at "[11/17] x86/jump-label: remove support for custom
>poker”?
>
>https://lore.kernel.org/patchwork/patch/1032857/
>
>If this is not what you regard, please be more concrete.
>text_poke_early()
>is still used directly on init and while modules are loaded, which
>might not
>be great, but is outside of the scope of this patch-set.

I don't think it is out of scope, although that patch is a huge step in the right direction.

text_poke_{early,bp,...}, however, should be fully internal, that is, static functions, and we should present a single interface, preferably called text_poke(), to the outside world.

I think we have three subcases:

1. Early, UP, or under stop_machine();
2. Atomic and aligned;
3. Breakpoint.

My proposed algorithm should remove the need for a fixup which should help this interface, too.

The specific alignment needed for #2 is started by the hardware people to be not crossing 16 bytes (NOT a cache line) on any CPU we support SMP on and, of course, being possible to do atomically do on the specific CPU (note that we *can* do a redundantly large store of existing bytes, which adds flexibility.)

To the best of my knowledge any CPU supporting SSE can do an atomic (for our purposes) aligned 16-byte store via MOVAPS; of course any CPU with cx16 can do it without SSE registers. For older CPUs we may be limited to 8-byte stores (cx8) or even 4-byte stores before we need to use the breakpoint algorithm.
Nadav Amit Jan. 17, 2019, 11:14 p.m. UTC | #5
> On Jan 17, 2019, at 2:59 PM, hpa@zytor.com wrote:
> 
> On January 17, 2019 2:39:15 PM PST, Nadav Amit <namit@vmware.com> wrote:
>>> On Jan 17, 2019, at 1:15 PM, hpa@zytor.com wrote:
>>> 
>>> On January 16, 2019 10:47:01 PM PST, Masami Hiramatsu
>> <mhiramat@kernel.org> wrote:
>>>> On Wed, 16 Jan 2019 16:32:43 -0800
>>>> Rick Edgecombe <rick.p.edgecombe@intel.com> wrote:
>>>> 
>>>>> From: Nadav Amit <namit@vmware.com>
>>>>> 
>>>>> text_mutex is currently expected to be held before text_poke() is
>>>>> called, but we kgdb does not take the mutex, and instead
>> *supposedly*
>>>>> ensures the lock is not taken and will not be acquired by any other
>>>> core
>>>>> while text_poke() is running.
>>>>> 
>>>>> The reason for the "supposedly" comment is that it is not entirely
>>>> clear
>>>>> that this would be the case if gdb_do_roundup is zero.
>>>>> 
>>>>> This patch creates two wrapper functions, text_poke() and
>>>>> text_poke_kgdb() which do or do not run the lockdep assertion
>>>>> respectively.
>>>>> 
>>>>> While we are at it, change the return code of text_poke() to
>>>> something
>>>>> meaningful. One day, callers might actually respect it and the
>>>> existing
>>>>> BUG_ON() when patching fails could be removed. For kgdb, the return
>>>>> value can actually be used.
>>>> 
>>>> Looks good to me.
>>>> 
>>>> Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
>>>> 
>>>> Thank you,
>>>> 
>>>>> Cc: Andy Lutomirski <luto@kernel.org>
>>>>> Cc: Kees Cook <keescook@chromium.org>
>>>>> Cc: Dave Hansen <dave.hansen@intel.com>
>>>>> Cc: Masami Hiramatsu <mhiramat@kernel.org>
>>>>> Fixes: 9222f606506c ("x86/alternatives: Lockdep-enforce text_mutex
>> in
>>>> text_poke*()")
>>>>> Suggested-by: Peter Zijlstra <peterz@infradead.org>
>>>>> Acked-by: Jiri Kosina <jkosina@suse.cz>
>>>>> Signed-off-by: Nadav Amit <namit@vmware.com>
>>>>> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
>>>>> ---
>>>>> arch/x86/include/asm/text-patching.h |  1 +
>>>>> arch/x86/kernel/alternative.c        | 52
>>>> ++++++++++++++++++++--------
>>>>> arch/x86/kernel/kgdb.c               | 11 +++---
>>>>> 3 files changed, 45 insertions(+), 19 deletions(-)
>>>>> 
>>>>> diff --git a/arch/x86/include/asm/text-patching.h
>>>> b/arch/x86/include/asm/text-patching.h
>>>>> index e85ff65c43c3..f8fc8e86cf01 100644
>>>>> --- a/arch/x86/include/asm/text-patching.h
>>>>> +++ b/arch/x86/include/asm/text-patching.h
>>>>> @@ -35,6 +35,7 @@ extern void *text_poke_early(void *addr, const
>> void
>>>> *opcode, size_t len);
>>>>> * inconsistent instruction while you patch.
>>>>> */
>>>>> extern void *text_poke(void *addr, const void *opcode, size_t len);
>>>>> +extern void *text_poke_kgdb(void *addr, const void *opcode, size_t
>>>> len);
>>>>> extern int poke_int3_handler(struct pt_regs *regs);
>>>>> extern void *text_poke_bp(void *addr, const void *opcode, size_t
>>>> len, void *handler);
>>>>> extern int after_bootmem;
>>>>> diff --git a/arch/x86/kernel/alternative.c
>>>> b/arch/x86/kernel/alternative.c
>>>>> index ebeac487a20c..c6a3a10a2fd5 100644
>>>>> --- a/arch/x86/kernel/alternative.c
>>>>> +++ b/arch/x86/kernel/alternative.c
>>>>> @@ -678,18 +678,7 @@ void *__init_or_module text_poke_early(void
>>>> *addr, const void *opcode,
>>>>> return addr;
>>>>> }
>>>>> 
>>>>> -/**
>>>>> - * text_poke - Update instructions on a live kernel
>>>>> - * @addr: address to modify
>>>>> - * @opcode: source of the copy
>>>>> - * @len: length to copy
>>>>> - *
>>>>> - * Only atomic text poke/set should be allowed when not doing
>> early
>>>> patching.
>>>>> - * It means the size must be writable atomically and the address
>>>> must be aligned
>>>>> - * in a way that permits an atomic write. It also makes sure we
>> fit
>>>> on a single
>>>>> - * page.
>>>>> - */
>>>>> -void *text_poke(void *addr, const void *opcode, size_t len)
>>>>> +static void *__text_poke(void *addr, const void *opcode, size_t
>> len)
>>>>> {
>>>>> 	unsigned long flags;
>>>>> 	char *vaddr;
>>>>> @@ -702,8 +691,6 @@ void *text_poke(void *addr, const void *opcode,
>>>> size_t len)
>>>>> */
>>>>> 	BUG_ON(!after_bootmem);
>>>>> 
>>>>> -	lockdep_assert_held(&text_mutex);
>>>>> -
>>>>> 	if (!core_kernel_text((unsigned long)addr)) {
>>>>> 		pages[0] = vmalloc_to_page(addr);
>>>>> 		pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
>>>>> @@ -732,6 +719,43 @@ void *text_poke(void *addr, const void
>> *opcode,
>>>> size_t len)
>>>>> return addr;
>>>>> }
>>>>> 
>>>>> +/**
>>>>> + * text_poke - Update instructions on a live kernel
>>>>> + * @addr: address to modify
>>>>> + * @opcode: source of the copy
>>>>> + * @len: length to copy
>>>>> + *
>>>>> + * Only atomic text poke/set should be allowed when not doing
>> early
>>>> patching.
>>>>> + * It means the size must be writable atomically and the address
>>>> must be aligned
>>>>> + * in a way that permits an atomic write. It also makes sure we
>> fit
>>>> on a single
>>>>> + * page.
>>>>> + */
>>>>> +void *text_poke(void *addr, const void *opcode, size_t len)
>>>>> +{
>>>>> +	lockdep_assert_held(&text_mutex);
>>>>> +
>>>>> +	return __text_poke(addr, opcode, len);
>>>>> +}
>>>>> +
>>>>> +/**
>>>>> + * text_poke_kgdb - Update instructions on a live kernel by kgdb
>>>>> + * @addr: address to modify
>>>>> + * @opcode: source of the copy
>>>>> + * @len: length to copy
>>>>> + *
>>>>> + * Only atomic text poke/set should be allowed when not doing
>> early
>>>> patching.
>>>>> + * It means the size must be writable atomically and the address
>>>> must be aligned
>>>>> + * in a way that permits an atomic write. It also makes sure we
>> fit
>>>> on a single
>>>>> + * page.
>>>>> + *
>>>>> + * Context: should only be used by kgdb, which ensures no other
>> core
>>>> is running,
>>>>> + *	    despite the fact it does not hold the text_mutex.
>>>>> + */
>>>>> +void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
>>>>> +{
>>>>> +	return __text_poke(addr, opcode, len);
>>>>> +}
>>>>> +
>>>>> static void do_sync_core(void *info)
>>>>> {
>>>>> 	sync_core();
>>>>> diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
>>>>> index 5db08425063e..1461544cba8b 100644
>>>>> --- a/arch/x86/kernel/kgdb.c
>>>>> +++ b/arch/x86/kernel/kgdb.c
>>>>> @@ -758,13 +758,13 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt
>>>> *bpt)
>>>>> if (!err)
>>>>> 		return err;
>>>>> 	/*
>>>>> -	 * It is safe to call text_poke() because normal kernel execution
>>>>> +	 * It is safe to call text_poke_kgdb() because normal kernel
>>>> execution
>>>>> * is stopped on all cores, so long as the text_mutex is not
>>>> locked.
>>>>> */
>>>>> 	if (mutex_is_locked(&text_mutex))
>>>>> 		return -EBUSY;
>>>>> -	text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
>>>>> -		  BREAK_INSTR_SIZE);
>>>>> +	text_poke_kgdb((void *)bpt->bpt_addr,
>> arch_kgdb_ops.gdb_bpt_instr,
>>>>> +		       BREAK_INSTR_SIZE);
>>>>> 	err = probe_kernel_read(opc, (char *)bpt->bpt_addr,
>>>> BREAK_INSTR_SIZE);
>>>>> if (err)
>>>>> 		return err;
>>>>> @@ -783,12 +783,13 @@ int kgdb_arch_remove_breakpoint(struct
>>>> kgdb_bkpt *bpt)
>>>>> if (bpt->type != BP_POKE_BREAKPOINT)
>>>>> 		goto knl_write;
>>>>> 	/*
>>>>> -	 * It is safe to call text_poke() because normal kernel execution
>>>>> +	 * It is safe to call text_poke_kgdb() because normal kernel
>>>> execution
>>>>> * is stopped on all cores, so long as the text_mutex is not
>>>> locked.
>>>>> */
>>>>> 	if (mutex_is_locked(&text_mutex))
>>>>> 		goto knl_write;
>>>>> -	text_poke((void *)bpt->bpt_addr, bpt->saved_instr,
>>>> BREAK_INSTR_SIZE);
>>>>> +	text_poke_kgdb((void *)bpt->bpt_addr, bpt->saved_instr,
>>>>> +		       BREAK_INSTR_SIZE);
>>>>> 	err = probe_kernel_read(opc, (char *)bpt->bpt_addr,
>>>> BREAK_INSTR_SIZE);
>>>>> if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE))
>>>>> 		goto knl_write;
>>>>> -- 
>>>>> 2.17.1
>>> 
>>> If you are reorganizing this code, please do so so that the caller
>> doesn’t
>>> have to worry about if it should call text_poke_bp() or
>> text_poke_early().
>>> Right now the caller had to know that, which makes no sense.
>> 
>> Did you look at "[11/17] x86/jump-label: remove support for custom
>> poker”?
>> 
>> https://lore.kernel.org/patchwork/patch/1032857/
>> 
>> If this is not what you regard, please be more concrete.
>> text_poke_early()
>> is still used directly on init and while modules are loaded, which
>> might not
>> be great, but is outside of the scope of this patch-set.
> 
> I don't think it is out of scope, although that patch is a huge step in the right direction.
> 
> text_poke_{early,bp,...}, however, should be fully internal, that is, static functions, and we should present a single interface, preferably called text_poke(), to the outside world.
> 
> I think we have three subcases:
> 
> 1. Early, UP, or under stop_machine();
> 2. Atomic and aligned;
> 3. Breakpoint.
> 
> My proposed algorithm should remove the need for a fixup which should help this interface, too.

That’s another reason why such change might be done later (after your
changes are merged). The main reason is that Rick was kind enough to
deal with the whole patch-set.

> The specific alignment needed for #2 is started by the hardware people to be not crossing 16 bytes (NOT a cache line) on any CPU we support SMP on and, of course, being possible to do atomically do on the specific CPU (note that we *can* do a redundantly large store of existing bytes, which adds flexibility.)
> 
> To the best of my knowledge any CPU supporting SSE can do an atomic (for our purposes) aligned 16-byte store via MOVAPS; of course any CPU with cx16 can do it without SSE registers. For older CPUs we may be limited to 8-byte stores (cx8) or even 4-byte stores before we need to use the breakpoint algorithm.

So the last time we had this discussion, I could not be convinced that
hypervisors (e.g, KVM), which do not follow this undocumented behavior,
would not break. I also don’t remember an official confirmation of this
behavior on Intel and AMD CPUs.
H. Peter Anvin Jan. 17, 2019, 11:19 p.m. UTC | #6
> 
> I think we have three subcases:
> 
> 1. Early, UP, or under stop_machine();
> 2. Atomic and aligned;
> 3. Breakpoint.
> 
> My proposed algorithm should remove the need for a fixup which should help this interface, too.
> 
> The specific alignment needed for #2 is started by the hardware people to be not crossing 16 bytes (NOT a cache line) on any CPU we support SMP on and, of course, being possible to do atomically do on the specific CPU (note that we *can* do a redundantly large store of existing bytes, which adds flexibility.)
> 
> To the best of my knowledge any CPU supporting SSE can do an atomic (for our purposes) aligned 16-byte store via MOVAPS; of course any CPU with cx16 can do it without SSE registers. For older CPUs we may be limited to 8-byte stores (cx8) or even 4-byte stores before we need to use the breakpoint algorithm.
> 

Sending to a restricted list, because I don't actually know how publicly
known this is, but it is known there are operating systems in the field
already which rely on the 16-byte atomicity guarantee.

	-hpa
Nadav Amit Jan. 18, 2019, 2:40 a.m. UTC | #7
> On Jan 17, 2019, at 3:19 PM, H. Peter Anvin <hpa@zytor.com> wrote:
> 
>> I think we have three subcases:
>> 
>> 1. Early, UP, or under stop_machine();
>> 2. Atomic and aligned;
>> 3. Breakpoint.
>> 
>> My proposed algorithm should remove the need for a fixup which should help this interface, too.
>> 
>> The specific alignment needed for #2 is started by the hardware people to be not crossing 16 bytes (NOT a cache line) on any CPU we support SMP on and, of course, being possible to do atomically do on the specific CPU (note that we *can* do a redundantly large store of existing bytes, which adds flexibility.)
>> 
>> To the best of my knowledge any CPU supporting SSE can do an atomic (for our purposes) aligned 16-byte store via MOVAPS; of course any CPU with cx16 can do it without SSE registers. For older CPUs we may be limited to 8-byte stores (cx8) or even 4-byte stores before we need to use the breakpoint algorithm.
> 
> Sending to a restricted list, because I don't actually know how publicly
> known this is, but it is known there are operating systems in the field
> already which rely on the 16-byte atomicity guarantee.

Hmm. I don’t know how restricted the list is, considering you left "LSM list”.
;-)

Anyhow, I don’t know whether it changes things much. If you patch an MMIO
accessing instruction, which is executed on top of KVM - things might break.
IMHO, the very least hypervisors vendors should be informed before
implementing such change.
Borislav Petkov Jan. 25, 2019, 9:30 a.m. UTC | #8
On Wed, Jan 16, 2019 at 04:32:43PM -0800, Rick Edgecombe wrote:
> From: Nadav Amit <namit@vmware.com>
> 
> text_mutex is currently expected to be held before text_poke() is
> called, but we kgdb does not take the mutex, and instead *supposedly*
> ensures the lock is not taken and will not be acquired by any other core
> while text_poke() is running.
> 
> The reason for the "supposedly" comment is that it is not entirely clear
> that this would be the case if gdb_do_roundup is zero.

I guess that variable name is "kgdb_do_roundup" ?

> This patch creates two wrapper functions, text_poke() and

Avoid having "This patch" or "This commit" in the commit message. It is
tautologically useless.

Also, do

$ git grep 'This patch' Documentation/process

for more details.

> text_poke_kgdb() which do or do not run the lockdep assertion
> respectively.
> 
> While we are at it, change the return code of text_poke() to something
> meaningful. One day, callers might actually respect it and the existing
> BUG_ON() when patching fails could be removed. For kgdb, the return
> value can actually be used.
> 
> Cc: Andy Lutomirski <luto@kernel.org>
> Cc: Kees Cook <keescook@chromium.org>
> Cc: Dave Hansen <dave.hansen@intel.com>
> Cc: Masami Hiramatsu <mhiramat@kernel.org>
> Fixes: 9222f606506c ("x86/alternatives: Lockdep-enforce text_mutex in text_poke*()")
> Suggested-by: Peter Zijlstra <peterz@infradead.org>
> Acked-by: Jiri Kosina <jkosina@suse.cz>
> Signed-off-by: Nadav Amit <namit@vmware.com>
> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
> ---
>  arch/x86/include/asm/text-patching.h |  1 +
>  arch/x86/kernel/alternative.c        | 52 ++++++++++++++++++++--------
>  arch/x86/kernel/kgdb.c               | 11 +++---
>  3 files changed, 45 insertions(+), 19 deletions(-)

...

> +/**
> + * text_poke_kgdb - Update instructions on a live kernel by kgdb
> + * @addr: address to modify
> + * @opcode: source of the copy
> + * @len: length to copy
> + *
> + * Only atomic text poke/set should be allowed when not doing early patching.
> + * It means the size must be writable atomically and the address must be aligned
> + * in a way that permits an atomic write. It also makes sure we fit on a single
> + * page.
> + *
> + * Context: should only be used by kgdb, which ensures no other core is running,
> + *	    despite the fact it does not hold the text_mutex.
> + */
> +void *text_poke_kgdb(void *addr, const void *opcode, size_t len)

text_poke_unlocked() I guess. I don't think kgdb is that special that it
needs its own function flavor.
Nadav Amit Jan. 25, 2019, 6:28 p.m. UTC | #9
> On Jan 25, 2019, at 1:30 AM, Borislav Petkov <bp@alien8.de> wrote:
> 
> On Wed, Jan 16, 2019 at 04:32:43PM -0800, Rick Edgecombe wrote:
>> From: Nadav Amit <namit@vmware.com>
>> 
>> text_mutex is currently expected to be held before text_poke() is
>> called, but we kgdb does not take the mutex, and instead *supposedly*
>> ensures the lock is not taken and will not be acquired by any other core
>> while text_poke() is running.
>> 
>> The reason for the "supposedly" comment is that it is not entirely clear
>> that this would be the case if gdb_do_roundup is zero.
> 
> I guess that variable name is "kgdb_do_roundup” ?

Yes. Will fix.

> 
>> This patch creates two wrapper functions, text_poke() and
> 
> Avoid having "This patch" or "This commit" in the commit message. It is
> tautologically useless.
> 
> Also, do
> 
> $ git grep 'This patch' Documentation/process
> 
> for more details.

Ok.

>> 
>> +void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
> 
> text_poke_unlocked() I guess. I don't think kgdb is that special that it
> needs its own function flavor.

Tglx suggested this naming to prevent anyone from misusing text_poke_kdgb().
This is a very specific use-case that nobody else should need.

Regards,
Nadav
diff mbox series

Patch

diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index e85ff65c43c3..f8fc8e86cf01 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -35,6 +35,7 @@  extern void *text_poke_early(void *addr, const void *opcode, size_t len);
  * inconsistent instruction while you patch.
  */
 extern void *text_poke(void *addr, const void *opcode, size_t len);
+extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len);
 extern int poke_int3_handler(struct pt_regs *regs);
 extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler);
 extern int after_bootmem;
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index ebeac487a20c..c6a3a10a2fd5 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -678,18 +678,7 @@  void *__init_or_module text_poke_early(void *addr, const void *opcode,
 	return addr;
 }
 
-/**
- * text_poke - Update instructions on a live kernel
- * @addr: address to modify
- * @opcode: source of the copy
- * @len: length to copy
- *
- * Only atomic text poke/set should be allowed when not doing early patching.
- * It means the size must be writable atomically and the address must be aligned
- * in a way that permits an atomic write. It also makes sure we fit on a single
- * page.
- */
-void *text_poke(void *addr, const void *opcode, size_t len)
+static void *__text_poke(void *addr, const void *opcode, size_t len)
 {
 	unsigned long flags;
 	char *vaddr;
@@ -702,8 +691,6 @@  void *text_poke(void *addr, const void *opcode, size_t len)
 	 */
 	BUG_ON(!after_bootmem);
 
-	lockdep_assert_held(&text_mutex);
-
 	if (!core_kernel_text((unsigned long)addr)) {
 		pages[0] = vmalloc_to_page(addr);
 		pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
@@ -732,6 +719,43 @@  void *text_poke(void *addr, const void *opcode, size_t len)
 	return addr;
 }
 
+/**
+ * text_poke - Update instructions on a live kernel
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
+ * Only atomic text poke/set should be allowed when not doing early patching.
+ * It means the size must be writable atomically and the address must be aligned
+ * in a way that permits an atomic write. It also makes sure we fit on a single
+ * page.
+ */
+void *text_poke(void *addr, const void *opcode, size_t len)
+{
+	lockdep_assert_held(&text_mutex);
+
+	return __text_poke(addr, opcode, len);
+}
+
+/**
+ * text_poke_kgdb - Update instructions on a live kernel by kgdb
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
+ * Only atomic text poke/set should be allowed when not doing early patching.
+ * It means the size must be writable atomically and the address must be aligned
+ * in a way that permits an atomic write. It also makes sure we fit on a single
+ * page.
+ *
+ * Context: should only be used by kgdb, which ensures no other core is running,
+ *	    despite the fact it does not hold the text_mutex.
+ */
+void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
+{
+	return __text_poke(addr, opcode, len);
+}
+
 static void do_sync_core(void *info)
 {
 	sync_core();
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 5db08425063e..1461544cba8b 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -758,13 +758,13 @@  int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
 	if (!err)
 		return err;
 	/*
-	 * It is safe to call text_poke() because normal kernel execution
+	 * It is safe to call text_poke_kgdb() because normal kernel execution
 	 * is stopped on all cores, so long as the text_mutex is not locked.
 	 */
 	if (mutex_is_locked(&text_mutex))
 		return -EBUSY;
-	text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
-		  BREAK_INSTR_SIZE);
+	text_poke_kgdb((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
+		       BREAK_INSTR_SIZE);
 	err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
 	if (err)
 		return err;
@@ -783,12 +783,13 @@  int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
 	if (bpt->type != BP_POKE_BREAKPOINT)
 		goto knl_write;
 	/*
-	 * It is safe to call text_poke() because normal kernel execution
+	 * It is safe to call text_poke_kgdb() because normal kernel execution
 	 * is stopped on all cores, so long as the text_mutex is not locked.
 	 */
 	if (mutex_is_locked(&text_mutex))
 		goto knl_write;
-	text_poke((void *)bpt->bpt_addr, bpt->saved_instr, BREAK_INSTR_SIZE);
+	text_poke_kgdb((void *)bpt->bpt_addr, bpt->saved_instr,
+		       BREAK_INSTR_SIZE);
 	err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
 	if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE))
 		goto knl_write;