diff mbox series

[v2,4/7] s390/physmem_info: query diag500(STORAGE LIMIT) to support QEMU/KVM memory devices

Message ID 20241014144622.876731-5-david@redhat.com (mailing list archive)
State New, archived
Headers show
Series virtio-mem: s390 support | expand

Commit Message

David Hildenbrand Oct. 14, 2024, 2:46 p.m. UTC
To support memory devices under QEMU/KVM, such as virtio-mem,
we have to prepare our kernel virtual address space accordingly and
have to know the highest possible physical memory address we might see
later: the storage limit. The good old SCLP interface is not suitable for
this use case.

In particular, memory owned by memory devices has no relationship to
storage increments, it is always detected using the device driver, and
unaware OSes (no driver) must never try making use of that memory.
Consequently this memory is located outside of the "maximum storage
increment"-indicated memory range.

Let's use our new diag500 STORAGE_LIMIT subcode to query this storage
limit that can exceed the "maximum storage increment", and use the
existing interfaces (i.e., SCLP) to obtain information about the initial
memory that is not owned+managed by memory devices.

If a hypervisor does not support such memory devices, the address exposed
through diag500 STORAGE_LIMIT will correspond to the maximum storage
increment exposed through SCLP.

To teach kdump on s390 to include memory owned by memory devices, there
will be ways to query the relevant memory ranges from the device via a
driver running in special kdump mode (like virtio-mem already implements
to filter /proc/vmcore access so we don't end up reading from unplugged
device blocks).

Tested-by: Mario Casquero <mcasquer@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
---
 arch/s390/boot/physmem_info.c        | 46 ++++++++++++++++++++++++++--
 arch/s390/include/asm/physmem_info.h |  3 ++
 2 files changed, 46 insertions(+), 3 deletions(-)

Comments

Heiko Carstens Oct. 14, 2024, 6:43 p.m. UTC | #1
On Mon, Oct 14, 2024 at 04:46:16PM +0200, David Hildenbrand wrote:
> To support memory devices under QEMU/KVM, such as virtio-mem,
> we have to prepare our kernel virtual address space accordingly and
> have to know the highest possible physical memory address we might see
> later: the storage limit. The good old SCLP interface is not suitable for
> this use case.
> 
> In particular, memory owned by memory devices has no relationship to
> storage increments, it is always detected using the device driver, and
> unaware OSes (no driver) must never try making use of that memory.
> Consequently this memory is located outside of the "maximum storage
> increment"-indicated memory range.
> 
> Let's use our new diag500 STORAGE_LIMIT subcode to query this storage
> limit that can exceed the "maximum storage increment", and use the
> existing interfaces (i.e., SCLP) to obtain information about the initial
> memory that is not owned+managed by memory devices.
> 
> If a hypervisor does not support such memory devices, the address exposed
> through diag500 STORAGE_LIMIT will correspond to the maximum storage
> increment exposed through SCLP.
> 
> To teach kdump on s390 to include memory owned by memory devices, there
> will be ways to query the relevant memory ranges from the device via a
> driver running in special kdump mode (like virtio-mem already implements
> to filter /proc/vmcore access so we don't end up reading from unplugged
> device blocks).
> 
> Tested-by: Mario Casquero <mcasquer@redhat.com>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  arch/s390/boot/physmem_info.c        | 46 ++++++++++++++++++++++++++--
>  arch/s390/include/asm/physmem_info.h |  3 ++
>  2 files changed, 46 insertions(+), 3 deletions(-)

...

> +static int diag500_storage_limit(unsigned long *max_physmem_end)
> +{
> +	register unsigned long __nr asm("1") = 0x4;
> +	register unsigned long __storage_limit asm("2") = 0;
> +	unsigned long reg1, reg2;
> +	psw_t old;

In general we do not allow register asm usage anymore in s390 code,
except for a very few defined places. This is due to all the problems
that we've seen with code instrumentation and register corruption.

The patch below changes your code accordingly, but it is
untested. Please verify that your code still works.

> @@ -157,7 +189,9 @@ unsigned long detect_max_physmem_end(void)
>  {
>  	unsigned long max_physmem_end = 0;
>  
> -	if (!sclp_early_get_memsize(&max_physmem_end)) {
> +	if (!diag500_storage_limit(&max_physmem_end)) {
> +		physmem_info.info_source = MEM_DETECT_DIAG500_STOR_LIMIT;
> +	} else if (!sclp_early_get_memsize(&max_physmem_end)) {
>  		physmem_info.info_source = MEM_DETECT_SCLP_READ_INFO;
>  	} else {
>  		max_physmem_end = search_mem_end();
> @@ -170,11 +204,17 @@ void detect_physmem_online_ranges(unsigned long max_physmem_end)
>  {
>  	if (!sclp_early_read_storage_info()) {
>  		physmem_info.info_source = MEM_DETECT_SCLP_STOR_INFO;
> +		return;
>  	} else if (!diag260()) {
>  		physmem_info.info_source = MEM_DETECT_DIAG260;
> -	} else if (max_physmem_end) {
> -		add_physmem_online_range(0, max_physmem_end);
> +		return;
> +	} else if (physmem_info.info_source == MEM_DETECT_DIAG500_STOR_LIMIT) {
> +		max_physmem_end = 0;
> +		if (!sclp_early_get_memsize(&max_physmem_end))
> +			physmem_info.info_source = MEM_DETECT_SCLP_READ_INFO;
>  	}
> +	if (max_physmem_end)
> +		add_physmem_online_range(0, max_physmem_end);
>  }

In general looks good to me, but I'd like to see that Vasily or
Alexander give an Ack to this patch.

diff --git a/arch/s390/boot/physmem_info.c b/arch/s390/boot/physmem_info.c
index fb4e66e80fd8..975fc478e0e3 100644
--- a/arch/s390/boot/physmem_info.c
+++ b/arch/s390/boot/physmem_info.c
@@ -109,10 +109,11 @@ static int diag260(void)
 	return 0;
 }
 
+#define DIAG500_SC_STOR_LIMIT 4
+
 static int diag500_storage_limit(unsigned long *max_physmem_end)
 {
-	register unsigned long __nr asm("1") = 0x4;
-	register unsigned long __storage_limit asm("2") = 0;
+	unsigned long storage_limit;
 	unsigned long reg1, reg2;
 	psw_t old;
 
@@ -123,21 +124,24 @@ static int diag500_storage_limit(unsigned long *max_physmem_end)
 		"	st	%[reg2],4(%[psw_pgm])\n"
 		"	larl	%[reg1],1f\n"
 		"	stg	%[reg1],8(%[psw_pgm])\n"
+		"	lghi	1,%[subcode]\n"
+		"	lghi	2,0\n"
 		"	diag	2,4,0x500\n"
 		"1:	mvc	0(16,%[psw_pgm]),0(%[psw_old])\n"
+		"	lgr	%[slimit],2\n"
 		: [reg1] "=&d" (reg1),
 		  [reg2] "=&a" (reg2),
-		  "+&d" (__storage_limit),
+		  [slimit] "=d" (storage_limit),
 		  "=Q" (get_lowcore()->program_new_psw),
 		  "=Q" (old)
 		: [psw_old] "a" (&old),
 		  [psw_pgm] "a" (&get_lowcore()->program_new_psw),
-		  "d" (__nr)
-		: "memory");
-	if (!__storage_limit)
-	        return -EINVAL;
-	/* convert inclusive end to exclusive end. */
-	*max_physmem_end = __storage_limit + 1;
+		  [subcode] "i" (DIAG500_SC_STOR_LIMIT)
+		: "memory", "1", "2");
+	if (!storage_limit)
+		return -EINVAL;
+	/* Convert inclusive end to exclusive end */
+	*max_physmem_end = storage_limit + 1;
 	return 0;
 }
David Hildenbrand Oct. 14, 2024, 7:42 p.m. UTC | #2
On 14.10.24 20:43, Heiko Carstens wrote:
> On Mon, Oct 14, 2024 at 04:46:16PM +0200, David Hildenbrand wrote:
>> To support memory devices under QEMU/KVM, such as virtio-mem,
>> we have to prepare our kernel virtual address space accordingly and
>> have to know the highest possible physical memory address we might see
>> later: the storage limit. The good old SCLP interface is not suitable for
>> this use case.
>>
>> In particular, memory owned by memory devices has no relationship to
>> storage increments, it is always detected using the device driver, and
>> unaware OSes (no driver) must never try making use of that memory.
>> Consequently this memory is located outside of the "maximum storage
>> increment"-indicated memory range.
>>
>> Let's use our new diag500 STORAGE_LIMIT subcode to query this storage
>> limit that can exceed the "maximum storage increment", and use the
>> existing interfaces (i.e., SCLP) to obtain information about the initial
>> memory that is not owned+managed by memory devices.
>>
>> If a hypervisor does not support such memory devices, the address exposed
>> through diag500 STORAGE_LIMIT will correspond to the maximum storage
>> increment exposed through SCLP.
>>
>> To teach kdump on s390 to include memory owned by memory devices, there
>> will be ways to query the relevant memory ranges from the device via a
>> driver running in special kdump mode (like virtio-mem already implements
>> to filter /proc/vmcore access so we don't end up reading from unplugged
>> device blocks).
>>
>> Tested-by: Mario Casquero <mcasquer@redhat.com>
>> Signed-off-by: David Hildenbrand <david@redhat.com>
>> ---
>>   arch/s390/boot/physmem_info.c        | 46 ++++++++++++++++++++++++++--
>>   arch/s390/include/asm/physmem_info.h |  3 ++
>>   2 files changed, 46 insertions(+), 3 deletions(-)
> 
> ...
> 
>> +static int diag500_storage_limit(unsigned long *max_physmem_end)
>> +{
>> +	register unsigned long __nr asm("1") = 0x4;
>> +	register unsigned long __storage_limit asm("2") = 0;
>> +	unsigned long reg1, reg2;
>> +	psw_t old;
> 
> In general we do not allow register asm usage anymore in s390 code,
> except for a very few defined places. This is due to all the problems
> that we've seen with code instrumentation and register corruption.

Makes sense. Note that I was inspired by GENERATE_KVM_HYPERCALL_FUNC 
that also still uses register asm usage.

> 
> The patch below changes your code accordingly, but it is
> untested. Please verify that your code still works.

Below LGTM, I'll give it a churn, thanks!
Eric Farman Oct. 15, 2024, 3:01 p.m. UTC | #3
On Mon, 2024-10-14 at 20:43 +0200, Heiko Carstens wrote:
> On Mon, Oct 14, 2024 at 04:46:16PM +0200, David Hildenbrand wrote:
> > To support memory devices under QEMU/KVM, such as virtio-mem,
> > we have to prepare our kernel virtual address space accordingly and
> > have to know the highest possible physical memory address we might see
> > later: the storage limit. The good old SCLP interface is not suitable for
> > this use case.
> > 
> > In particular, memory owned by memory devices has no relationship to
> > storage increments, it is always detected using the device driver, and
> > unaware OSes (no driver) must never try making use of that memory.
> > Consequently this memory is located outside of the "maximum storage
> > increment"-indicated memory range.
> > 
> > Let's use our new diag500 STORAGE_LIMIT subcode to query this storage
> > limit that can exceed the "maximum storage increment", and use the
> > existing interfaces (i.e., SCLP) to obtain information about the initial
> > memory that is not owned+managed by memory devices.

...snip...

> The patch below changes your code accordingly, but it is
> untested. Please verify that your code still works.

...snip...

> diff --git a/arch/s390/boot/physmem_info.c b/arch/s390/boot/physmem_info.c
> index fb4e66e80fd8..975fc478e0e3 100644
> --- a/arch/s390/boot/physmem_info.c
> +++ b/arch/s390/boot/physmem_info.c
> @@ -109,10 +109,11 @@ static int diag260(void)
>  	return 0;
>  }
>  
> +#define DIAG500_SC_STOR_LIMIT 4
> +
>  static int diag500_storage_limit(unsigned long *max_physmem_end)
>  {
> -	register unsigned long __nr asm("1") = 0x4;
> -	register unsigned long __storage_limit asm("2") = 0;
> +	unsigned long storage_limit;
>  	unsigned long reg1, reg2;
>  	psw_t old;
>  
> @@ -123,21 +124,24 @@ static int diag500_storage_limit(unsigned long *max_physmem_end)
>  		"	st	%[reg2],4(%[psw_pgm])\n"
>  		"	larl	%[reg1],1f\n"
>  		"	stg	%[reg1],8(%[psw_pgm])\n"
> +		"	lghi	1,%[subcode]\n"
> +		"	lghi	2,0\n"
>  		"	diag	2,4,0x500\n"
>  		"1:	mvc	0(16,%[psw_pgm]),0(%[psw_old])\n"
> +		"	lgr	%[slimit],2\n"
>  		: [reg1] "=&d" (reg1),
>  		  [reg2] "=&a" (reg2),
> -		  "+&d" (__storage_limit),
> +		  [slimit] "=d" (storage_limit),
>  		  "=Q" (get_lowcore()->program_new_psw),
>  		  "=Q" (old)
>  		: [psw_old] "a" (&old),
>  		  [psw_pgm] "a" (&get_lowcore()->program_new_psw),
> -		  "d" (__nr)
> -		: "memory");
> -	if (!__storage_limit)
> -	        return -EINVAL;
> -	/* convert inclusive end to exclusive end. */
> -	*max_physmem_end = __storage_limit + 1;
> +		  [subcode] "i" (DIAG500_SC_STOR_LIMIT)
> +		: "memory", "1", "2");
> +	if (!storage_limit)
> +		return -EINVAL;
> +	/* Convert inclusive end to exclusive end */
> +	*max_physmem_end = storage_limit + 1;
>  	return 0;
>  }
>  
> 

I like the idea of a defined constant here instead of hardcoded, but maybe it should be placed
somewhere in include/uapi so that QEMU can pick it up with update-linux-headers.sh and be in sync
with the kernel, instead of just an equivalent definition in [1] ?

[1] https://lore.kernel.org/qemu-devel/20241008105455.2302628-8-david@redhat.com/
Heiko Carstens Oct. 15, 2024, 3:20 p.m. UTC | #4
On Tue, Oct 15, 2024 at 11:01:44AM -0400, Eric Farman wrote:
> On Mon, 2024-10-14 at 20:43 +0200, Heiko Carstens wrote:
> > On Mon, Oct 14, 2024 at 04:46:16PM +0200, David Hildenbrand wrote:
...
> > +#define DIAG500_SC_STOR_LIMIT 4
...
> I like the idea of a defined constant here instead of hardcoded, but maybe it should be placed
> somewhere in include/uapi so that QEMU can pick it up with update-linux-headers.sh and be in sync
> with the kernel, instead of just an equivalent definition in [1] ?
> 
> [1] https://lore.kernel.org/qemu-devel/20241008105455.2302628-8-david@redhat.com/

It is already a mess; we have already subcode 3 defined:

#define KVM_S390_VIRTIO_CCW_NOTIFY 3

in

arch/s390/include/uapi/asm/virtio-ccw.h

which for some reason is uapi. But it doesn't make sense to put the
new subcode 4 there too. So what is the end result?

Another uapi file? I think resolving this would be a project on its own.
Halil Pasic Oct. 16, 2024, 10:37 a.m. UTC | #5
On Tue, 15 Oct 2024 11:01:44 -0400
Eric Farman <farman@linux.ibm.com> wrote:

> > +		  [subcode] "i" (DIAG500_SC_STOR_LIMIT)
> > +		: "memory", "1", "2");
> > +	if (!storage_limit)
> > +		return -EINVAL;
> > +	/* Convert inclusive end to exclusive end */
> > +	*max_physmem_end = storage_limit + 1;
> >  	return 0;
> >  }
> >  
> >   
> 
> I like the idea of a defined constant here instead of hardcoded, but maybe it should be placed
> somewhere in include/uapi so that QEMU can pick it up with update-linux-headers.sh and be in sync
> with the kernel, instead of just an equivalent definition in [1] ?
> 
> [1] https://lore.kernel.org/qemu-devel/20241008105455.2302628-8-david@redhat.com/

I think it is fine to have equivalent definitions. This is more or less
an ISA thing we are introducing here. And IMHO it would be fine to have
such a definition even if the emulator was supposed to run on an OS that
is not Linux and without without KVM.

Regards,
Halil
Alexander Gordeev Oct. 17, 2024, 7:36 a.m. UTC | #6
On Mon, Oct 14, 2024 at 04:46:16PM +0200, David Hildenbrand wrote:

Hi David!

> @@ -157,7 +189,9 @@ unsigned long detect_max_physmem_end(void)
>  {
>  	unsigned long max_physmem_end = 0;
>  
> -	if (!sclp_early_get_memsize(&max_physmem_end)) {
> +	if (!diag500_storage_limit(&max_physmem_end)) {
> +		physmem_info.info_source = MEM_DETECT_DIAG500_STOR_LIMIT;
> +	} else if (!sclp_early_get_memsize(&max_physmem_end)) {
>  		physmem_info.info_source = MEM_DETECT_SCLP_READ_INFO;
>  	} else {
>  		max_physmem_end = search_mem_end();
> @@ -170,11 +204,17 @@ void detect_physmem_online_ranges(unsigned long max_physmem_end)
>  {
>  	if (!sclp_early_read_storage_info()) {
>  		physmem_info.info_source = MEM_DETECT_SCLP_STOR_INFO;
> +		return;
>  	} else if (!diag260()) {
>  		physmem_info.info_source = MEM_DETECT_DIAG260;
> -	} else if (max_physmem_end) {
> -		add_physmem_online_range(0, max_physmem_end);
> +		return;
> +	} else if (physmem_info.info_source == MEM_DETECT_DIAG500_STOR_LIMIT) {
> +		max_physmem_end = 0;
> +		if (!sclp_early_get_memsize(&max_physmem_end))
> +			physmem_info.info_source = MEM_DETECT_SCLP_READ_INFO;

Why search_mem_end() is not tried in case sclp_early_get_memsize() failed?

>  	}
> +	if (max_physmem_end)
> +		add_physmem_online_range(0, max_physmem_end);
>  }
>  
>  void physmem_set_usable_limit(unsigned long limit)
> diff --git a/arch/s390/include/asm/physmem_info.h b/arch/s390/include/asm/physmem_info.h
> index f45cfc8bc233..51b68a43e195 100644
> --- a/arch/s390/include/asm/physmem_info.h
> +++ b/arch/s390/include/asm/physmem_info.h
> @@ -9,6 +9,7 @@ enum physmem_info_source {
>  	MEM_DETECT_NONE = 0,
>  	MEM_DETECT_SCLP_STOR_INFO,
>  	MEM_DETECT_DIAG260,
> +	MEM_DETECT_DIAG500_STOR_LIMIT,
>  	MEM_DETECT_SCLP_READ_INFO,
>  	MEM_DETECT_BIN_SEARCH
>  };
> @@ -107,6 +108,8 @@ static inline const char *get_physmem_info_source(void)
>  		return "sclp storage info";
>  	case MEM_DETECT_DIAG260:
>  		return "diag260";
> +	case MEM_DETECT_DIAG500_STOR_LIMIT:
> +		return "diag500 storage limit";

AFAIU you want to always override MEM_DETECT_DIAG500_STOR_LIMIT method
with an online memory detection method. In that case this code is dead.

>  	case MEM_DETECT_SCLP_READ_INFO:
>  		return "sclp read info";
>  	case MEM_DETECT_BIN_SEARCH:

Thanks!
David Hildenbrand Oct. 17, 2024, 8:19 a.m. UTC | #7
On 17.10.24 09:36, Alexander Gordeev wrote:
> On Mon, Oct 14, 2024 at 04:46:16PM +0200, David Hildenbrand wrote:
> 
> Hi David!

Hi Alexander!

> 
>> @@ -157,7 +189,9 @@ unsigned long detect_max_physmem_end(void)
>>   {
>>   	unsigned long max_physmem_end = 0;
>>   
>> -	if (!sclp_early_get_memsize(&max_physmem_end)) {
>> +	if (!diag500_storage_limit(&max_physmem_end)) {
>> +		physmem_info.info_source = MEM_DETECT_DIAG500_STOR_LIMIT;
>> +	} else if (!sclp_early_get_memsize(&max_physmem_end)) {
>>   		physmem_info.info_source = MEM_DETECT_SCLP_READ_INFO;
>>   	} else {
>>   		max_physmem_end = search_mem_end();
>> @@ -170,11 +204,17 @@ void detect_physmem_online_ranges(unsigned long max_physmem_end)
>>   {
>>   	if (!sclp_early_read_storage_info()) {
>>   		physmem_info.info_source = MEM_DETECT_SCLP_STOR_INFO;
>> +		return;
>>   	} else if (!diag260()) {
>>   		physmem_info.info_source = MEM_DETECT_DIAG260;
>> -	} else if (max_physmem_end) {
>> -		add_physmem_online_range(0, max_physmem_end);
>> +		return;
>> +	} else if (physmem_info.info_source == MEM_DETECT_DIAG500_STOR_LIMIT) {
>> +		max_physmem_end = 0;
>> +		if (!sclp_early_get_memsize(&max_physmem_end))
>> +			physmem_info.info_source = MEM_DETECT_SCLP_READ_INFO;
> 
> Why search_mem_end() is not tried in case sclp_early_get_memsize() failed?

Patch #3 documents that:

+    The storage limit does not indicate currently usable storage, it may
+    include holes, standby storage and areas reserved for other means, such
+    as memory hotplug or virtio-mem devices. Other interfaces for detecting
+    actually usable storage, such as SCLP, must be used in conjunction with
+    this subfunction.

If SCLP would fail, something would be seriously wrong and we should just crash
instead of trying to fallback to the legacy way of scanning.

> 
>>   	}
>> +	if (max_physmem_end)
>> +		add_physmem_online_range(0, max_physmem_end);
>>   }
>>   
>>   void physmem_set_usable_limit(unsigned long limit)
>> diff --git a/arch/s390/include/asm/physmem_info.h b/arch/s390/include/asm/physmem_info.h
>> index f45cfc8bc233..51b68a43e195 100644
>> --- a/arch/s390/include/asm/physmem_info.h
>> +++ b/arch/s390/include/asm/physmem_info.h
>> @@ -9,6 +9,7 @@ enum physmem_info_source {
>>   	MEM_DETECT_NONE = 0,
>>   	MEM_DETECT_SCLP_STOR_INFO,
>>   	MEM_DETECT_DIAG260,
>> +	MEM_DETECT_DIAG500_STOR_LIMIT,
>>   	MEM_DETECT_SCLP_READ_INFO,
>>   	MEM_DETECT_BIN_SEARCH
>>   };
>> @@ -107,6 +108,8 @@ static inline const char *get_physmem_info_source(void)
>>   		return "sclp storage info";
>>   	case MEM_DETECT_DIAG260:
>>   		return "diag260";
>> +	case MEM_DETECT_DIAG500_STOR_LIMIT:
>> +		return "diag500 storage limit";
> 
> AFAIU you want to always override MEM_DETECT_DIAG500_STOR_LIMIT method
> with an online memory detection method. In that case this code is dead.

Not in the above case, pathological case above where something went wrong
during sclp_early_get_memsize(). In that scenario, die_oom() would indicate
that there are no memory ranges but that "diag500 storage limit" worked.

Does that make sense?

Thanks for the review!
Alexander Gordeev Oct. 17, 2024, 9:53 a.m. UTC | #8
> > Why search_mem_end() is not tried in case sclp_early_get_memsize() failed?
> 
> Patch #3 documents that:
> 
> +    The storage limit does not indicate currently usable storage, it may
> +    include holes, standby storage and areas reserved for other means, such
> +    as memory hotplug or virtio-mem devices. Other interfaces for detecting
> +    actually usable storage, such as SCLP, must be used in conjunction with
> +    this subfunction.

Yes, I read this and that exactly what causes my confusion. In this wording it
sounds like SCLP *or* other methods are fine to use. But then you use SCLP or
DIAGNOSE 260, but not memory scanning. So I am still confused ;)

> If SCLP would fail, something would be seriously wrong and we should just crash
> instead of trying to fallback to the legacy way of scanning.

But what is wrong with the legacy way of scanning?

> > > +	case MEM_DETECT_DIAG500_STOR_LIMIT:
> > > +		return "diag500 storage limit";
> > 
> > AFAIU you want to always override MEM_DETECT_DIAG500_STOR_LIMIT method
> > with an online memory detection method. In that case this code is dead.
> 
> Not in the above case, pathological case above where something went wrong
> during sclp_early_get_memsize(). In that scenario, die_oom() would indicate
> that there are no memory ranges but that "diag500 storage limit" worked.
> 
> Does that make sense?

Yes, I get your approach.

> Thanks for the review!

Thanks!

> -- 
> Cheers,
> 
> David / dhildenb
David Hildenbrand Oct. 17, 2024, 10 a.m. UTC | #9
On 17.10.24 11:53, Alexander Gordeev wrote:
>>> Why search_mem_end() is not tried in case sclp_early_get_memsize() failed?
>>
>> Patch #3 documents that:
>>
>> +    The storage limit does not indicate currently usable storage, it may
>> +    include holes, standby storage and areas reserved for other means, such
>> +    as memory hotplug or virtio-mem devices. Other interfaces for detecting
>> +    actually usable storage, such as SCLP, must be used in conjunction with
>> +    this subfunction.
> 
> Yes, I read this and that exactly what causes my confusion. In this wording it
> sounds like SCLP *or* other methods are fine to use. But then you use SCLP or
> DIAGNOSE 260, but not memory scanning. So I am still confused ;)

Well, DIAGNOSE 260 is z/VM only and DIAG 500 is KVM only. So there are 
currently not really any other reasonable ways besides SCLP.

> 
>> If SCLP would fail, something would be seriously wrong and we should just crash
>> instead of trying to fallback to the legacy way of scanning.
> 
> But what is wrong with the legacy way of scanning?

Missing to detect holes and starting to use them, detecting and using 
device memory without negotiating with the device ... it all falls to 
pieces.

> 
>>>> +	case MEM_DETECT_DIAG500_STOR_LIMIT:
>>>> +		return "diag500 storage limit";
>>>
>>> AFAIU you want to always override MEM_DETECT_DIAG500_STOR_LIMIT method
>>> with an online memory detection method. In that case this code is dead.
>>
>> Not in the above case, pathological case above where something went wrong
>> during sclp_early_get_memsize(). In that scenario, die_oom() would indicate
>> that there are no memory ranges but that "diag500 storage limit" worked.
>>
>> Does that make sense?
> 
> Yes, I get your approach.

Thanks, please let me know if I should make it clearer in the 
description, of if you think we can improve the code.
David Hildenbrand Oct. 17, 2024, 12:07 p.m. UTC | #10
On 17.10.24 12:00, David Hildenbrand wrote:
> On 17.10.24 11:53, Alexander Gordeev wrote:
>>>> Why search_mem_end() is not tried in case sclp_early_get_memsize() failed?
>>>
>>> Patch #3 documents that:
>>>
>>> +    The storage limit does not indicate currently usable storage, it may
>>> +    include holes, standby storage and areas reserved for other means, such
>>> +    as memory hotplug or virtio-mem devices. Other interfaces for detecting
>>> +    actually usable storage, such as SCLP, must be used in conjunction with
>>> +    this subfunction.
>>
>> Yes, I read this and that exactly what causes my confusion. In this wording it
>> sounds like SCLP *or* other methods are fine to use. But then you use SCLP or
>> DIAGNOSE 260, but not memory scanning. So I am still confused ;)
> 
> Well, DIAGNOSE 260 is z/VM only and DIAG 500 is KVM only. So there are
> currently not really any other reasonable ways besides SCLP.

Correction: Staring at the code again, in detect_physmem_online_ranges()
we will indeed try:

a) sclp_early_read_storage_info()
b) diag260()

But if neither works, we cannot blindly add all that memory, something is
messed up. So we'll fallback to

c) sclp_early_get_memsize()

But if none of that works, something is seriously wrong.


I will squash the following:

diff --git a/arch/s390/boot/physmem_info.c b/arch/s390/boot/physmem_info.c
index 975fc478e0e3..6ad3ac2050eb 100644
--- a/arch/s390/boot/physmem_info.c
+++ b/arch/s390/boot/physmem_info.c
@@ -214,6 +214,12 @@ void detect_physmem_online_ranges(unsigned long max_physmem_end)
                 return;
         } else if (physmem_info.info_source == MEM_DETECT_DIAG500_STOR_LIMIT) {
                 max_physmem_end = 0;
+               /*
+                * If we know the storage limit but do not find any other
+                * indication of usable initial memory, something is messed
+                * up. In that case, we'll not add any physical memory so
+                * we'll run into die_oom() later.
+                */
                 if (!sclp_early_get_memsize(&max_physmem_end))
                         physmem_info.info_source = MEM_DETECT_SCLP_READ_INFO;
         }
Alexander Gordeev Oct. 17, 2024, 2:32 p.m. UTC | #11
On Thu, Oct 17, 2024 at 02:07:12PM +0200, David Hildenbrand wrote:
> On 17.10.24 12:00, David Hildenbrand wrote:
> > Well, DIAGNOSE 260 is z/VM only and DIAG 500 is KVM only. So there are
> > currently not really any other reasonable ways besides SCLP.
> 
> Correction: Staring at the code again, in detect_physmem_online_ranges()
> we will indeed try:
> 
> a) sclp_early_read_storage_info()
> b) diag260()

So why care to call diag260() in case of DIAGNOSE 500? What about the below?

void detect_physmem_online_ranges(unsigned long max_physmem_end)
{
	if (!sclp_early_read_storage_info()) {
		physmem_info.info_source = MEM_DETECT_SCLP_STOR_INFO;
	} else if (physmem_info.info_source == MEM_DETECT_DIAG500_STOR_LIMIT) {
		unsigned long online_end;

		if (!sclp_early_get_memsize(&online_end)) {
			physmem_info.info_source = MEM_DETECT_SCLP_READ_INFO;
			add_physmem_online_range(0, online_end);
		}
	} else if (!diag260()) {
		physmem_info.info_source = MEM_DETECT_DIAG260;
	} else if (max_physmem_end) {
		add_physmem_online_range(0, max_physmem_end);
	}
}

> But if neither works, we cannot blindly add all that memory, something is
> messed up. So we'll fallback to
> 
> c) sclp_early_get_memsize()
> 
> But if none of that works, something is seriously wrong.

Ok, thanks for the clarification.
David Hildenbrand Oct. 17, 2024, 2:36 p.m. UTC | #12
On 17.10.24 16:32, Alexander Gordeev wrote:
> On Thu, Oct 17, 2024 at 02:07:12PM +0200, David Hildenbrand wrote:
>> On 17.10.24 12:00, David Hildenbrand wrote:
>>> Well, DIAGNOSE 260 is z/VM only and DIAG 500 is KVM only. So there are
>>> currently not really any other reasonable ways besides SCLP.
>>
>> Correction: Staring at the code again, in detect_physmem_online_ranges()
>> we will indeed try:
>>
>> a) sclp_early_read_storage_info()
>> b) diag260()
> 
> So why care to call diag260() in case of DIAGNOSE 500? What about the below?
> 
> void detect_physmem_online_ranges(unsigned long max_physmem_end)
> {
> 	if (!sclp_early_read_storage_info()) {
> 		physmem_info.info_source = MEM_DETECT_SCLP_STOR_INFO;
> 	} else if (physmem_info.info_source == MEM_DETECT_DIAG500_STOR_LIMIT) {
> 		unsigned long online_end;
> 
> 		if (!sclp_early_get_memsize(&online_end)) {
> 			physmem_info.info_source = MEM_DETECT_SCLP_READ_INFO;
> 			add_physmem_online_range(0, online_end);
> 		}
> 	} else if (!diag260()) {
> 		physmem_info.info_source = MEM_DETECT_DIAG260;
> 	} else if (max_physmem_end) {
> 		add_physmem_online_range(0, max_physmem_end);
> 	}
> }

Works for me, thanks!
David Hildenbrand Oct. 25, 2024, 10:52 a.m. UTC | #13
On 15.10.24 17:20, Heiko Carstens wrote:
> On Tue, Oct 15, 2024 at 11:01:44AM -0400, Eric Farman wrote:
>> On Mon, 2024-10-14 at 20:43 +0200, Heiko Carstens wrote:
>>> On Mon, Oct 14, 2024 at 04:46:16PM +0200, David Hildenbrand wrote:
> ...
>>> +#define DIAG500_SC_STOR_LIMIT 4
> ...
>> I like the idea of a defined constant here instead of hardcoded, but maybe it should be placed
>> somewhere in include/uapi so that QEMU can pick it up with update-linux-headers.sh and be in sync
>> with the kernel, instead of just an equivalent definition in [1] ?
>>
>> [1] https://lore.kernel.org/qemu-devel/20241008105455.2302628-8-david@redhat.com/
> 
> It is already a mess; we have already subcode 3 defined:
> 
> #define KVM_S390_VIRTIO_CCW_NOTIFY 3
> 
> in
> 
> arch/s390/include/uapi/asm/virtio-ccw.h
> 
> which for some reason is uapi. But it doesn't make sense to put the
> new subcode 4 there too. So what is the end result?
> 
> Another uapi file? I think resolving this would be a project on its own.

Agreed, thanks all!
Alexander Gordeev Oct. 30, 2024, 2:30 p.m. UTC | #14
On Mon, Oct 14, 2024 at 04:46:16PM +0200, David Hildenbrand wrote:

Hi David,

> To support memory devices under QEMU/KVM, such as virtio-mem,
> we have to prepare our kernel virtual address space accordingly and
> have to know the highest possible physical memory address we might see
> later: the storage limit. The good old SCLP interface is not suitable for
> this use case.
> 
> In particular, memory owned by memory devices has no relationship to
> storage increments, it is always detected using the device driver, and
> unaware OSes (no driver) must never try making use of that memory.
> Consequently this memory is located outside of the "maximum storage
> increment"-indicated memory range.
> 
> Let's use our new diag500 STORAGE_LIMIT subcode to query this storage
> limit that can exceed the "maximum storage increment", and use the
> existing interfaces (i.e., SCLP) to obtain information about the initial
> memory that is not owned+managed by memory devices.
> 
> If a hypervisor does not support such memory devices, the address exposed
> through diag500 STORAGE_LIMIT will correspond to the maximum storage
> increment exposed through SCLP.
> 
> To teach kdump on s390 to include memory owned by memory devices, there
> will be ways to query the relevant memory ranges from the device via a
> driver running in special kdump mode (like virtio-mem already implements
> to filter /proc/vmcore access so we don't end up reading from unplugged
> device blocks).
> 
> Tested-by: Mario Casquero <mcasquer@redhat.com>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  arch/s390/boot/physmem_info.c        | 46 ++++++++++++++++++++++++++--
>  arch/s390/include/asm/physmem_info.h |  3 ++
>  2 files changed, 46 insertions(+), 3 deletions(-)

Reviewed-by: Alexander Gordeev <agordeev@linux.ibm.com>

Thanks!
Alexander Gordeev Oct. 30, 2024, 2:33 p.m. UTC | #15
> >  arch/s390/boot/physmem_info.c        | 46 ++++++++++++++++++++++++++--
> >  arch/s390/include/asm/physmem_info.h |  3 ++
> >  2 files changed, 46 insertions(+), 3 deletions(-)
> 
> Reviewed-by: Alexander Gordeev <agordeev@linux.ibm.com>

Sorry, it supposed to be for v3, so please dismiss this one.

> Thanks!
diff mbox series

Patch

diff --git a/arch/s390/boot/physmem_info.c b/arch/s390/boot/physmem_info.c
index 1d131a81cb8b..fb4e66e80fd8 100644
--- a/arch/s390/boot/physmem_info.c
+++ b/arch/s390/boot/physmem_info.c
@@ -109,6 +109,38 @@  static int diag260(void)
 	return 0;
 }
 
+static int diag500_storage_limit(unsigned long *max_physmem_end)
+{
+	register unsigned long __nr asm("1") = 0x4;
+	register unsigned long __storage_limit asm("2") = 0;
+	unsigned long reg1, reg2;
+	psw_t old;
+
+	asm volatile(
+		"	mvc	0(16,%[psw_old]),0(%[psw_pgm])\n"
+		"	epsw	%[reg1],%[reg2]\n"
+		"	st	%[reg1],0(%[psw_pgm])\n"
+		"	st	%[reg2],4(%[psw_pgm])\n"
+		"	larl	%[reg1],1f\n"
+		"	stg	%[reg1],8(%[psw_pgm])\n"
+		"	diag	2,4,0x500\n"
+		"1:	mvc	0(16,%[psw_pgm]),0(%[psw_old])\n"
+		: [reg1] "=&d" (reg1),
+		  [reg2] "=&a" (reg2),
+		  "+&d" (__storage_limit),
+		  "=Q" (get_lowcore()->program_new_psw),
+		  "=Q" (old)
+		: [psw_old] "a" (&old),
+		  [psw_pgm] "a" (&get_lowcore()->program_new_psw),
+		  "d" (__nr)
+		: "memory");
+	if (!__storage_limit)
+	        return -EINVAL;
+	/* convert inclusive end to exclusive end. */
+	*max_physmem_end = __storage_limit + 1;
+	return 0;
+}
+
 static int tprot(unsigned long addr)
 {
 	unsigned long reg1, reg2;
@@ -157,7 +189,9 @@  unsigned long detect_max_physmem_end(void)
 {
 	unsigned long max_physmem_end = 0;
 
-	if (!sclp_early_get_memsize(&max_physmem_end)) {
+	if (!diag500_storage_limit(&max_physmem_end)) {
+		physmem_info.info_source = MEM_DETECT_DIAG500_STOR_LIMIT;
+	} else if (!sclp_early_get_memsize(&max_physmem_end)) {
 		physmem_info.info_source = MEM_DETECT_SCLP_READ_INFO;
 	} else {
 		max_physmem_end = search_mem_end();
@@ -170,11 +204,17 @@  void detect_physmem_online_ranges(unsigned long max_physmem_end)
 {
 	if (!sclp_early_read_storage_info()) {
 		physmem_info.info_source = MEM_DETECT_SCLP_STOR_INFO;
+		return;
 	} else if (!diag260()) {
 		physmem_info.info_source = MEM_DETECT_DIAG260;
-	} else if (max_physmem_end) {
-		add_physmem_online_range(0, max_physmem_end);
+		return;
+	} else if (physmem_info.info_source == MEM_DETECT_DIAG500_STOR_LIMIT) {
+		max_physmem_end = 0;
+		if (!sclp_early_get_memsize(&max_physmem_end))
+			physmem_info.info_source = MEM_DETECT_SCLP_READ_INFO;
 	}
+	if (max_physmem_end)
+		add_physmem_online_range(0, max_physmem_end);
 }
 
 void physmem_set_usable_limit(unsigned long limit)
diff --git a/arch/s390/include/asm/physmem_info.h b/arch/s390/include/asm/physmem_info.h
index f45cfc8bc233..51b68a43e195 100644
--- a/arch/s390/include/asm/physmem_info.h
+++ b/arch/s390/include/asm/physmem_info.h
@@ -9,6 +9,7 @@  enum physmem_info_source {
 	MEM_DETECT_NONE = 0,
 	MEM_DETECT_SCLP_STOR_INFO,
 	MEM_DETECT_DIAG260,
+	MEM_DETECT_DIAG500_STOR_LIMIT,
 	MEM_DETECT_SCLP_READ_INFO,
 	MEM_DETECT_BIN_SEARCH
 };
@@ -107,6 +108,8 @@  static inline const char *get_physmem_info_source(void)
 		return "sclp storage info";
 	case MEM_DETECT_DIAG260:
 		return "diag260";
+	case MEM_DETECT_DIAG500_STOR_LIMIT:
+		return "diag500 storage limit";
 	case MEM_DETECT_SCLP_READ_INFO:
 		return "sclp read info";
 	case MEM_DETECT_BIN_SEARCH: