diff mbox series

[v4] x86: add etc/phys-bits fw_cfg file

Message ID 20220922101454.1069462-1-kraxel@redhat.com (mailing list archive)
State New, archived
Headers show
Series [v4] x86: add etc/phys-bits fw_cfg file | expand

Commit Message

Gerd Hoffmann Sept. 22, 2022, 10:14 a.m. UTC
In case phys bits are functional and can be used by the guest (aka
host-phys-bits=on) add a fw_cfg file carrying the value.  This can
be used by the guest firmware for address space configuration.

The value in the etc/phys-bits fw_cfg file should be identical to
the phys bits value published via cpuid leaf 0x80000008.

This is only enabled for 7.2+ machine types for live migration
compatibility reasons.

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/i386/fw_cfg.h     |  1 +
 include/hw/i386/pc.h |  1 +
 hw/i386/fw_cfg.c     | 12 ++++++++++++
 hw/i386/pc.c         |  5 +++++
 hw/i386/pc_piix.c    |  2 ++
 hw/i386/pc_q35.c     |  2 ++
 6 files changed, 23 insertions(+)

Comments

Daniel P. Berrangé Sept. 22, 2022, 11:24 a.m. UTC | #1
On Thu, Sep 22, 2022 at 12:14:54PM +0200, Gerd Hoffmann wrote:
> In case phys bits are functional and can be used by the guest (aka
> host-phys-bits=on) add a fw_cfg file carrying the value.  This can
> be used by the guest firmware for address space configuration.
> 
> The value in the etc/phys-bits fw_cfg file should be identical to
> the phys bits value published via cpuid leaf 0x80000008.
> 
> This is only enabled for 7.2+ machine types for live migration
> compatibility reasons.

Is this going to have any implications for what mgmt apps must
take into account when selecting valid migration target hosts ?

Historically, apps have tended to ignore any checks for phys
bits between src/dst migration hosts and hoped for the best.

Will this new behaviour introduce / change any failure scenarios
where the target host has fewer phys bits than the src host, that
mgmt apps need to be made aware of ?

> 
> Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
> Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>  hw/i386/fw_cfg.h     |  1 +
>  include/hw/i386/pc.h |  1 +
>  hw/i386/fw_cfg.c     | 12 ++++++++++++
>  hw/i386/pc.c         |  5 +++++
>  hw/i386/pc_piix.c    |  2 ++
>  hw/i386/pc_q35.c     |  2 ++
>  6 files changed, 23 insertions(+)
> 
> diff --git a/hw/i386/fw_cfg.h b/hw/i386/fw_cfg.h
> index 275f15c1c5e8..6ff198a6cb85 100644
> --- a/hw/i386/fw_cfg.h
> +++ b/hw/i386/fw_cfg.h
> @@ -26,5 +26,6 @@ FWCfgState *fw_cfg_arch_create(MachineState *ms,
>  void fw_cfg_build_smbios(MachineState *ms, FWCfgState *fw_cfg);
>  void fw_cfg_build_feature_control(MachineState *ms, FWCfgState *fw_cfg);
>  void fw_cfg_add_acpi_dsdt(Aml *scope, FWCfgState *fw_cfg);
> +void fw_cfg_phys_bits(FWCfgState *fw_cfg);
>  
>  #endif
> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> index c95333514ed3..bedef1ee13c1 100644
> --- a/include/hw/i386/pc.h
> +++ b/include/hw/i386/pc.h
> @@ -119,6 +119,7 @@ struct PCMachineClass {
>      bool enforce_aligned_dimm;
>      bool broken_reserved_end;
>      bool enforce_amd_1tb_hole;
> +    bool phys_bits_in_fw_cfg;
>  
>      /* generate legacy CPU hotplug AML */
>      bool legacy_cpu_hotplug;
> diff --git a/hw/i386/fw_cfg.c b/hw/i386/fw_cfg.c
> index a283785a8de4..6a1f18925725 100644
> --- a/hw/i386/fw_cfg.c
> +++ b/hw/i386/fw_cfg.c
> @@ -219,3 +219,15 @@ void fw_cfg_add_acpi_dsdt(Aml *scope, FWCfgState *fw_cfg)
>      aml_append(dev, aml_name_decl("_CRS", crs));
>      aml_append(scope, dev);
>  }
> +
> +void fw_cfg_phys_bits(FWCfgState *fw_cfg)
> +{
> +    X86CPU *cpu = X86_CPU(first_cpu);
> +    uint64_t phys_bits = cpu->phys_bits;
> +
> +    if (cpu->host_phys_bits) {
> +        fw_cfg_add_file(fw_cfg, "etc/phys-bits",
> +                        g_memdup2(&phys_bits, sizeof(phys_bits)),
> +                        sizeof(phys_bits));
> +    }
> +}
> diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> index 566accf7e60a..17ecc7fe4331 100644
> --- a/hw/i386/pc.c
> +++ b/hw/i386/pc.c
> @@ -744,6 +744,7 @@ void pc_machine_done(Notifier *notifier, void *data)
>  {
>      PCMachineState *pcms = container_of(notifier,
>                                          PCMachineState, machine_done);
> +    PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
>      X86MachineState *x86ms = X86_MACHINE(pcms);
>  
>      cxl_hook_up_pxb_registers(pcms->bus, &pcms->cxl_devices_state,
> @@ -764,6 +765,9 @@ void pc_machine_done(Notifier *notifier, void *data)
>          fw_cfg_build_feature_control(MACHINE(pcms), x86ms->fw_cfg);
>          /* update FW_CFG_NB_CPUS to account for -device added CPUs */
>          fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus);
> +        if (pcmc->phys_bits_in_fw_cfg) {
> +            fw_cfg_phys_bits(x86ms->fw_cfg);
> +        }
>      }
>  }
>  
> @@ -1907,6 +1911,7 @@ static void pc_machine_class_init(ObjectClass *oc, void *data)
>      pcmc->kvmclock_enabled = true;
>      pcmc->enforce_aligned_dimm = true;
>      pcmc->enforce_amd_1tb_hole = true;
> +    pcmc->phys_bits_in_fw_cfg = true;
>      /* BIOS ACPI tables: 128K. Other BIOS datastructures: less than 4K reported
>       * to be used at the moment, 32K should be enough for a while.  */
>      pcmc->acpi_data_size = 0x20000 + 0x8000;
> diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
> index 8043a250adf3..c6a4dbd5c0b0 100644
> --- a/hw/i386/pc_piix.c
> +++ b/hw/i386/pc_piix.c
> @@ -447,9 +447,11 @@ DEFINE_I440FX_MACHINE(v7_2, "pc-i440fx-7.2", NULL,
>  
>  static void pc_i440fx_7_1_machine_options(MachineClass *m)
>  {
> +    PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
>      pc_i440fx_7_2_machine_options(m);
>      m->alias = NULL;
>      m->is_default = false;
> +    pcmc->phys_bits_in_fw_cfg = false;
>      compat_props_add(m->compat_props, hw_compat_7_1, hw_compat_7_1_len);
>      compat_props_add(m->compat_props, pc_compat_7_1, pc_compat_7_1_len);
>  }
> diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
> index 53eda50e818c..c2b56daa1550 100644
> --- a/hw/i386/pc_q35.c
> +++ b/hw/i386/pc_q35.c
> @@ -384,8 +384,10 @@ DEFINE_Q35_MACHINE(v7_2, "pc-q35-7.2", NULL,
>  
>  static void pc_q35_7_1_machine_options(MachineClass *m)
>  {
> +    PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
>      pc_q35_7_2_machine_options(m);
>      m->alias = NULL;
> +    pcmc->phys_bits_in_fw_cfg = false;
>      compat_props_add(m->compat_props, hw_compat_7_1, hw_compat_7_1_len);
>      compat_props_add(m->compat_props, pc_compat_7_1, pc_compat_7_1_len);
>  }
> -- 
> 2.37.3
> 
> 

With regards,
Daniel
Michael S. Tsirkin Sept. 22, 2022, 11:56 a.m. UTC | #2
On Thu, Sep 22, 2022 at 12:24:09PM +0100, Daniel P. Berrangé wrote:
> On Thu, Sep 22, 2022 at 12:14:54PM +0200, Gerd Hoffmann wrote:
> > In case phys bits are functional and can be used by the guest (aka
> > host-phys-bits=on) add a fw_cfg file carrying the value.  This can
> > be used by the guest firmware for address space configuration.
> > 
> > The value in the etc/phys-bits fw_cfg file should be identical to
> > the phys bits value published via cpuid leaf 0x80000008.
> > 
> > This is only enabled for 7.2+ machine types for live migration
> > compatibility reasons.
> 
> Is this going to have any implications for what mgmt apps must
> take into account when selecting valid migration target hosts ?

I don't think this does anything by itself. It just tells the firmware
which value to use, since historically it ignored CPUID. I am still
debating with myself whether a boolean would be better.  Would
appreciate KVM maintainer's take on this. But in any case guests already
sometimes use CPUID (e.g. just grep for cpuid_maxphyaddr).
This value is just for firmware use.


> Historically, apps have tended to ignore any checks for phys
> bits between src/dst migration hosts and hoped for the best.
> 
> Will this new behaviour introduce / change any failure scenarios
> where the target host has fewer phys bits than the src host, that
> mgmt apps need to be made aware of ?
> 
> > 
> > Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
> > Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
> > ---
> >  hw/i386/fw_cfg.h     |  1 +
> >  include/hw/i386/pc.h |  1 +
> >  hw/i386/fw_cfg.c     | 12 ++++++++++++
> >  hw/i386/pc.c         |  5 +++++
> >  hw/i386/pc_piix.c    |  2 ++
> >  hw/i386/pc_q35.c     |  2 ++
> >  6 files changed, 23 insertions(+)
> > 
> > diff --git a/hw/i386/fw_cfg.h b/hw/i386/fw_cfg.h
> > index 275f15c1c5e8..6ff198a6cb85 100644
> > --- a/hw/i386/fw_cfg.h
> > +++ b/hw/i386/fw_cfg.h
> > @@ -26,5 +26,6 @@ FWCfgState *fw_cfg_arch_create(MachineState *ms,
> >  void fw_cfg_build_smbios(MachineState *ms, FWCfgState *fw_cfg);
> >  void fw_cfg_build_feature_control(MachineState *ms, FWCfgState *fw_cfg);
> >  void fw_cfg_add_acpi_dsdt(Aml *scope, FWCfgState *fw_cfg);
> > +void fw_cfg_phys_bits(FWCfgState *fw_cfg);
> >  
> >  #endif
> > diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
> > index c95333514ed3..bedef1ee13c1 100644
> > --- a/include/hw/i386/pc.h
> > +++ b/include/hw/i386/pc.h
> > @@ -119,6 +119,7 @@ struct PCMachineClass {
> >      bool enforce_aligned_dimm;
> >      bool broken_reserved_end;
> >      bool enforce_amd_1tb_hole;
> > +    bool phys_bits_in_fw_cfg;
> >  
> >      /* generate legacy CPU hotplug AML */
> >      bool legacy_cpu_hotplug;
> > diff --git a/hw/i386/fw_cfg.c b/hw/i386/fw_cfg.c
> > index a283785a8de4..6a1f18925725 100644
> > --- a/hw/i386/fw_cfg.c
> > +++ b/hw/i386/fw_cfg.c
> > @@ -219,3 +219,15 @@ void fw_cfg_add_acpi_dsdt(Aml *scope, FWCfgState *fw_cfg)
> >      aml_append(dev, aml_name_decl("_CRS", crs));
> >      aml_append(scope, dev);
> >  }
> > +
> > +void fw_cfg_phys_bits(FWCfgState *fw_cfg)
> > +{
> > +    X86CPU *cpu = X86_CPU(first_cpu);
> > +    uint64_t phys_bits = cpu->phys_bits;
> > +
> > +    if (cpu->host_phys_bits) {
> > +        fw_cfg_add_file(fw_cfg, "etc/phys-bits",
> > +                        g_memdup2(&phys_bits, sizeof(phys_bits)),
> > +                        sizeof(phys_bits));
> > +    }
> > +}
> > diff --git a/hw/i386/pc.c b/hw/i386/pc.c
> > index 566accf7e60a..17ecc7fe4331 100644
> > --- a/hw/i386/pc.c
> > +++ b/hw/i386/pc.c
> > @@ -744,6 +744,7 @@ void pc_machine_done(Notifier *notifier, void *data)
> >  {
> >      PCMachineState *pcms = container_of(notifier,
> >                                          PCMachineState, machine_done);
> > +    PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
> >      X86MachineState *x86ms = X86_MACHINE(pcms);
> >  
> >      cxl_hook_up_pxb_registers(pcms->bus, &pcms->cxl_devices_state,
> > @@ -764,6 +765,9 @@ void pc_machine_done(Notifier *notifier, void *data)
> >          fw_cfg_build_feature_control(MACHINE(pcms), x86ms->fw_cfg);
> >          /* update FW_CFG_NB_CPUS to account for -device added CPUs */
> >          fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus);
> > +        if (pcmc->phys_bits_in_fw_cfg) {
> > +            fw_cfg_phys_bits(x86ms->fw_cfg);
> > +        }
> >      }
> >  }
> >  
> > @@ -1907,6 +1911,7 @@ static void pc_machine_class_init(ObjectClass *oc, void *data)
> >      pcmc->kvmclock_enabled = true;
> >      pcmc->enforce_aligned_dimm = true;
> >      pcmc->enforce_amd_1tb_hole = true;
> > +    pcmc->phys_bits_in_fw_cfg = true;
> >      /* BIOS ACPI tables: 128K. Other BIOS datastructures: less than 4K reported
> >       * to be used at the moment, 32K should be enough for a while.  */
> >      pcmc->acpi_data_size = 0x20000 + 0x8000;
> > diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
> > index 8043a250adf3..c6a4dbd5c0b0 100644
> > --- a/hw/i386/pc_piix.c
> > +++ b/hw/i386/pc_piix.c
> > @@ -447,9 +447,11 @@ DEFINE_I440FX_MACHINE(v7_2, "pc-i440fx-7.2", NULL,
> >  
> >  static void pc_i440fx_7_1_machine_options(MachineClass *m)
> >  {
> > +    PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
> >      pc_i440fx_7_2_machine_options(m);
> >      m->alias = NULL;
> >      m->is_default = false;
> > +    pcmc->phys_bits_in_fw_cfg = false;
> >      compat_props_add(m->compat_props, hw_compat_7_1, hw_compat_7_1_len);
> >      compat_props_add(m->compat_props, pc_compat_7_1, pc_compat_7_1_len);
> >  }
> > diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
> > index 53eda50e818c..c2b56daa1550 100644
> > --- a/hw/i386/pc_q35.c
> > +++ b/hw/i386/pc_q35.c
> > @@ -384,8 +384,10 @@ DEFINE_Q35_MACHINE(v7_2, "pc-q35-7.2", NULL,
> >  
> >  static void pc_q35_7_1_machine_options(MachineClass *m)
> >  {
> > +    PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
> >      pc_q35_7_2_machine_options(m);
> >      m->alias = NULL;
> > +    pcmc->phys_bits_in_fw_cfg = false;
> >      compat_props_add(m->compat_props, hw_compat_7_1, hw_compat_7_1_len);
> >      compat_props_add(m->compat_props, pc_compat_7_1, pc_compat_7_1_len);
> >  }
> > -- 
> > 2.37.3
> > 
> > 
> 
> With regards,
> Daniel
> -- 
> |: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org         -o-            https://fstop138.berrange.com :|
> |: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|
Gerd Hoffmann Sept. 22, 2022, 12:20 p.m. UTC | #3
On Thu, Sep 22, 2022 at 12:24:09PM +0100, Daniel P. Berrangé wrote:
> On Thu, Sep 22, 2022 at 12:14:54PM +0200, Gerd Hoffmann wrote:
> > In case phys bits are functional and can be used by the guest (aka
> > host-phys-bits=on) add a fw_cfg file carrying the value.  This can
> > be used by the guest firmware for address space configuration.
> > 
> > The value in the etc/phys-bits fw_cfg file should be identical to
> > the phys bits value published via cpuid leaf 0x80000008.
> > 
> > This is only enabled for 7.2+ machine types for live migration
> > compatibility reasons.
> 
> Is this going to have any implications for what mgmt apps must
> take into account when selecting valid migration target hosts ?
> 
> Historically, apps have tended to ignore any checks for phys
> bits between src/dst migration hosts and hoped for the best.
> 
> Will this new behaviour introduce / change any failure scenarios
> where the target host has fewer phys bits than the src host, that
> mgmt apps need to be made aware of ?

No.  This will basically inform the guest that host-phys-bits has been
enabled (and pass the number of bits).  So the firmware can make use of
the available address space instead of trying to be as conservative as
possible to avoid going beyond the (unknown) limit.

The phys-bits config itself is not touched.

take care,
  Gerd
Paolo Bonzini Sept. 22, 2022, 12:38 p.m. UTC | #4
On Thu, Sep 22, 2022 at 2:21 PM Gerd Hoffmann <kraxel@redhat.com> wrote:
> No.  This will basically inform the guest that host-phys-bits has been
> enabled (and pass the number of bits).  So the firmware can make use of
> the available address space instead of trying to be as conservative as
> possible to avoid going beyond the (unknown) limit.

Intel processors that are not extremely old have host-phys-bits equal
to 39, 46 or 52. Older processors that had 36, in all likelihood,
didn't have IOMMUs (so no big 64-bit BARs).

AMD processors have had 48 for a while, though older consumer processors had 40.

QEMU has always used 40, though many downstream packages (IIRC RHEL
and Ubuntu) just use host-phys-bits = true when using KVM.

Would it work to:

1) set host-phys-bits to true on new machine types when not using TCG
(i.e. KVM / HVF / WHPX)

2) in the firmware treat 40 as if it were 39, to support old machine types?

Paolo
Gerd Hoffmann Sept. 22, 2022, 2:16 p.m. UTC | #5
On Thu, Sep 22, 2022 at 02:38:02PM +0200, Paolo Bonzini wrote:
> On Thu, Sep 22, 2022 at 2:21 PM Gerd Hoffmann <kraxel@redhat.com> wrote:
> > No.  This will basically inform the guest that host-phys-bits has been
> > enabled (and pass the number of bits).  So the firmware can make use of
> > the available address space instead of trying to be as conservative as
> > possible to avoid going beyond the (unknown) limit.
> 
> Intel processors that are not extremely old have host-phys-bits equal
> to 39, 46 or 52. Older processors that had 36, in all likelihood,
> didn't have IOMMUs (so no big 64-bit BARs).

Well, I happen to have an Intel box with 36 phys-bits + IOMMU.

> 1) set host-phys-bits to true on new machine types when not using TCG
> (i.e. KVM / HVF / WHPX)

That is probably a good idea, but an independent problem.

It has live migration problems (when hosts have different phys bits),
which is IIRC the reason this hasn't happened yet.  Maybe that has been
solved in the meantime one way or another; I've seen some phys-bits
changes in libvirt recently ...

> 2) in the firmware treat 40 as if it were 39, to support old machine
> types?

The background of all this is that devices need more and more memory,
and the very conservative edk2 defaults are becoming increasingly
problematic.  So what I want to do is scale things up with the address
space size.  Use 1/4 or 1/8 of the physical address space as 64bit
pci mmio window.  Likewise scale up the default pcie root port window
sizes, to have more room for hotplug.

For that to work the firmware obviously needs to know how much address
space it actually has, which is currently not the case.

Yes, the problematic cases are Intel machines with 36 or 39 phys-bits.

Treating 40 as if it were 39 will explode on CPUs with 36 phys-bits.

Treating 40 as if it were 36 will mostly work, but it will leave a big
chunk of address space unused and will cause regressions on guests
with > 32G of RAM.

Treating 40 as invalid and continuing to use the current conservative
heuristic in that case, while otherwise treating phys-bits as valid,
might work.  The obvious corner case is that it won't catch broken
manual configurations (host-phys-bits=off,phys-bits=<larger-than-host>),
only the broken default.  Not sure how much of a problem that is in
practice, maybe it isn't.

I think I still prefer to explicitly communicate a reliable phys-bits
value to the guest somehow.

take care,
  Gerd
Jim Mattson Sept. 22, 2022, 5:13 p.m. UTC | #6
On Thu, Sep 22, 2022 at 7:16 AM Gerd Hoffmann <kraxel@redhat.com> wrote:
>
> On Thu, Sep 22, 2022 at 02:38:02PM +0200, Paolo Bonzini wrote:
> > On Thu, Sep 22, 2022 at 2:21 PM Gerd Hoffmann <kraxel@redhat.com> wrote:
> > > No.  This will basically inform the guest that host-phys-bits has been
> > > enabled (and pass the number of bits).  So the firmware can make use of
> > > the available address space instead of trying to be as conservative as
> > > possible to avoid going beyond the (unknown) limit.
> >
> > Intel processors that are not extremely old have host-phys-bits equal
> > to 39, 46 or 52. Older processors that had 36, in all likelihood,
> > didn't have IOMMUs (so no big 64-bit BARs).
>
> Well, I happen to have a intel box with 36 physbits + iommu.
>
> > 1) set host-phys-bits to true on new machine types when not using TCG
> > (i.e. KVM / HVF / WHPX)
>
> That is probably a good idea, but an independent problem.
>
> Has live migration problems (when hosts have different phys bits),
> which is IIRC the reason this hasn't happen yet.  Maybe that is solved
> meanwhile the one way or another, I've seen some phys-bits changes in
> libvirt recently ...
>
> > 2) in the firmware treat 40 as if it were 39, to support old machine
> > types?
>
> The background of all this is that devices need more and more memory,
> and the very conservative edk2 defaults are becoming increasingly
> problematic.  So what I want do is scale things up with the address
> space size.  Use 1/4 or 1/8 of the physical address space as 64bit
> pci mmio window.  Likewise scale up the default pcie root port window
> sizes, to have more room for hotplug.
>
> For that to work the firmware obviously needs to know how much it
> actually has, which is not the case.
>
> Yes, the problematic cases are intel machines with 36 or 39.
>
> Treating 40 as if it were 39 will explode with 36 cpus.
>
> Treating 40 as if it were 36 will mostly work.  Will leave a big
> chunk of address space unused.  Will cause regressions on guests
> with > 32G of RAM.
>
> Treating 40 as invalid and continue to use the current conservative
> heuristic, otherwise treat phys-bits as valid might work.  Obvious
> corner case is that it'll not catch broken manual configurations
> (host-phys-bits=off,phys-bits=<larger-than-host>), only the broken
> default.  Not sure how much of a problem that is in practice, maybe
> it isn't.
>
> I think I still prefer to explicitly communicate a reliable phys-bits
> value to the guest somehow.

On x86 hardware, KVM is incapable of emulating a guest physical width
that differs from the host physical width. There isn't support in the
hardware for it.
Paolo Bonzini Sept. 22, 2022, 7:49 p.m. UTC | #7
On Thu, Sep 22, 2022 at 7:13 PM Jim Mattson <jmattson@google.com> wrote:
> > Treating 40 as invalid and continue to use the current conservative
> > heuristic, otherwise treat phys-bits as valid might work.  Obvious
> > corner case is that it'll not catch broken manual configurations
> > (host-phys-bits=off,phys-bits=<larger-than-host>), only the broken
> > default.  Not sure how much of a problem that is in practice, maybe
> > it isn't.
> >
> > I think I still prefer to explicitly communicate a reliable phys-bits
> > value to the guest somehow.
>
> On x86 hardware, KVM is incapable of emulating a guest physical width
> that differs from the host physical width. There isn't support in the
> hardware for it.

Indeed, everything else is a userspace bug. Especially since here
we're talking of host_maxphyaddr < guest_maxphyaddr, which is
completely impossible.

Paolo
Gerd Hoffmann Sept. 22, 2022, 8:33 p.m. UTC | #8
On Thu, Sep 22, 2022 at 02:38:02PM +0200, Paolo Bonzini wrote:
> On Thu, Sep 22, 2022 at 2:21 PM Gerd Hoffmann <kraxel@redhat.com> wrote:
> > No.  This will basically inform the guest that host-phys-bits has been
> > enabled (and pass the number of bits).  So the firmware can make use of
> > the available address space instead of trying to be as conservative as
> > possible to avoid going beyond the (unknown) limit.
> 
> Intel processors that are not extremely old have host-phys-bits equal
> to 39, 46 or 52. Older processors that had 36, in all likelihood,
> didn't have IOMMUs (so no big 64-bit BARs).
> 
> AMD processors have had 48 for a while, though older consumer processors had 40.

How reliable is the vendorid?

Given newer processors have more than 40 and for older ones we know
the possible values for the two relevant x86 vendors we could do
something along the lines of:

   phys-bits >= 41                   -> valid
   phys-bits == 40    + AuthenticAMD -> valid
   phys-bits == 36,39 + GenuineIntel -> valid
   everything else                   -> invalid

Does that look sensible to you?

take care,
  Gerd
Gerd Hoffmann Sept. 23, 2022, 6:23 a.m. UTC | #9
Hi,

> > Given newer processors have more than 40 and for older ones we know
> > the possible values for the two relevant x86 vendors we could do
> > something along the lines of:
> >
> >    phys-bits >= 41                   -> valid
> >    phys-bits == 40    + AuthenticAMD -> valid
> >    phys-bits == 36,39 + GenuineIntel -> valid
> >    everything else                   -> invalid
> >
> > Does that look sensible to you?
> >
> 
> Yes, it does! Is phys-bits == 36 the same as invalid?

'invalid' would continue to use the current guesswork codepath for
phys-bits, which will end up with phys-bits = 36 for smaller VMs but
can go beyond that in VMs with a lot (32G or more) of memory.  That
logic assumes that physical machines with enough RAM for 32G+ guests
have a physical address space > 64G.

'phys-bits = 36' would be a hard limit.

So, it's not exactly the same but small VMs wouldn't see a difference.

take care,
  Gerd
Michael S. Tsirkin Oct. 7, 2022, 1:44 p.m. UTC | #10
On Fri, Sep 23, 2022 at 08:23:12AM +0200, Gerd Hoffmann wrote:
>   Hi,
> 
> > > Given newer processors have more than 40 and for older ones we know
> > > the possible values for the two relevant x86 vendors we could do
> > > something along the lines of:
> > >
> > >    phys-bits >= 41                   -> valid
> > >    phys-bits == 40    + AuthenticAMD -> valid
> > >    phys-bits == 36,39 + GenuineIntel -> valid
> > >    everything else                   -> invalid
> > >
> > > Does that look sensible to you?
> > >
> > 
> > Yes, it does! Is phys-bits == 36 the same as invalid?
> 
> 'invalid' would continue to use the current guesswork codepath for
> phys-bits.  Which will end up with phys-bits = 36 for smaller VMs, but
> it can go beyond that in VMs with alot (32G or more) of memory.  That
> logic assumes that physical machines with enough RAM for 32G+ guests
> have a physical address space > 64G.
> 
> 'phys-bits = 36' would be a hard limit.
> 
> So, it's not exactly the same but small VMs wouldn't see a difference.
> 
> take care,
>   Gerd

I dropped the patch for now.
Gerd Hoffmann Oct. 10, 2022, 7:30 a.m. UTC | #11
Hi,

> > > > Given newer processors have more than 40 and for older ones we know
> > > > the possible values for the two relevant x86 vendors we could do
> > > > something along the lines of:
> > > >
> > > >    phys-bits >= 41                   -> valid
> > > >    phys-bits == 40    + AuthenticAMD -> valid
> > > >    phys-bits == 36,39 + GenuineIntel -> valid
> > > >    everything else                   -> invalid

> I dropped the patch for now.

You can drop it forever.

For the mail archives and anyone interested:  The approach outlined
above appears to work well; patches just landed in the edk2 master
branch.  The next edk2 stable tag (2022-11) will include them.

take care,
  Gerd
diff mbox series

Patch

diff --git a/hw/i386/fw_cfg.h b/hw/i386/fw_cfg.h
index 275f15c1c5e8..6ff198a6cb85 100644
--- a/hw/i386/fw_cfg.h
+++ b/hw/i386/fw_cfg.h
@@ -26,5 +26,6 @@  FWCfgState *fw_cfg_arch_create(MachineState *ms,
 void fw_cfg_build_smbios(MachineState *ms, FWCfgState *fw_cfg);
 void fw_cfg_build_feature_control(MachineState *ms, FWCfgState *fw_cfg);
 void fw_cfg_add_acpi_dsdt(Aml *scope, FWCfgState *fw_cfg);
+void fw_cfg_phys_bits(FWCfgState *fw_cfg);
 
 #endif
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index c95333514ed3..bedef1ee13c1 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -119,6 +119,7 @@  struct PCMachineClass {
     bool enforce_aligned_dimm;
     bool broken_reserved_end;
     bool enforce_amd_1tb_hole;
+    bool phys_bits_in_fw_cfg;
 
     /* generate legacy CPU hotplug AML */
     bool legacy_cpu_hotplug;
diff --git a/hw/i386/fw_cfg.c b/hw/i386/fw_cfg.c
index a283785a8de4..6a1f18925725 100644
--- a/hw/i386/fw_cfg.c
+++ b/hw/i386/fw_cfg.c
@@ -219,3 +219,15 @@  void fw_cfg_add_acpi_dsdt(Aml *scope, FWCfgState *fw_cfg)
     aml_append(dev, aml_name_decl("_CRS", crs));
     aml_append(scope, dev);
 }
+
+void fw_cfg_phys_bits(FWCfgState *fw_cfg)
+{
+    X86CPU *cpu = X86_CPU(first_cpu);
+    uint64_t phys_bits = cpu->phys_bits;
+
+    if (cpu->host_phys_bits) {
+        fw_cfg_add_file(fw_cfg, "etc/phys-bits",
+                        g_memdup2(&phys_bits, sizeof(phys_bits)),
+                        sizeof(phys_bits));
+    }
+}
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 566accf7e60a..17ecc7fe4331 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -744,6 +744,7 @@  void pc_machine_done(Notifier *notifier, void *data)
 {
     PCMachineState *pcms = container_of(notifier,
                                         PCMachineState, machine_done);
+    PCMachineClass *pcmc = PC_MACHINE_GET_CLASS(pcms);
     X86MachineState *x86ms = X86_MACHINE(pcms);
 
     cxl_hook_up_pxb_registers(pcms->bus, &pcms->cxl_devices_state,
@@ -764,6 +765,9 @@  void pc_machine_done(Notifier *notifier, void *data)
         fw_cfg_build_feature_control(MACHINE(pcms), x86ms->fw_cfg);
         /* update FW_CFG_NB_CPUS to account for -device added CPUs */
         fw_cfg_modify_i16(x86ms->fw_cfg, FW_CFG_NB_CPUS, x86ms->boot_cpus);
+        if (pcmc->phys_bits_in_fw_cfg) {
+            fw_cfg_phys_bits(x86ms->fw_cfg);
+        }
     }
 }
 
@@ -1907,6 +1911,7 @@  static void pc_machine_class_init(ObjectClass *oc, void *data)
     pcmc->kvmclock_enabled = true;
     pcmc->enforce_aligned_dimm = true;
     pcmc->enforce_amd_1tb_hole = true;
+    pcmc->phys_bits_in_fw_cfg = true;
     /* BIOS ACPI tables: 128K. Other BIOS datastructures: less than 4K reported
      * to be used at the moment, 32K should be enough for a while.  */
     pcmc->acpi_data_size = 0x20000 + 0x8000;
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 8043a250adf3..c6a4dbd5c0b0 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -447,9 +447,11 @@  DEFINE_I440FX_MACHINE(v7_2, "pc-i440fx-7.2", NULL,
 
 static void pc_i440fx_7_1_machine_options(MachineClass *m)
 {
+    PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
     pc_i440fx_7_2_machine_options(m);
     m->alias = NULL;
     m->is_default = false;
+    pcmc->phys_bits_in_fw_cfg = false;
     compat_props_add(m->compat_props, hw_compat_7_1, hw_compat_7_1_len);
     compat_props_add(m->compat_props, pc_compat_7_1, pc_compat_7_1_len);
 }
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 53eda50e818c..c2b56daa1550 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -384,8 +384,10 @@  DEFINE_Q35_MACHINE(v7_2, "pc-q35-7.2", NULL,
 
 static void pc_q35_7_1_machine_options(MachineClass *m)
 {
+    PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
     pc_q35_7_2_machine_options(m);
     m->alias = NULL;
+    pcmc->phys_bits_in_fw_cfg = false;
     compat_props_add(m->compat_props, hw_compat_7_1, hw_compat_7_1_len);
     compat_props_add(m->compat_props, pc_compat_7_1, pc_compat_7_1_len);
 }