Message ID | 1de2cc0a-e89c-6be9-9d6e-a10219f6f9aa@suse.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | IOMMU: superpage support when not sharing pagetables | expand |
On Fri, May 27, 2022 at 01:12:06PM +0200, Jan Beulich wrote: > While already the case for PVH, there's no reason to treat PV > differently here, though of course the addresses get taken from another > source in this case. Except that, to match CPU side mappings, by default > we permit r/o ones. This then also means we now deal consistently with > IO-APICs whose MMIO is or is not covered by E820 reserved regions. > > Signed-off-by: Jan Beulich <jbeulich@suse.com> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> Just one comment below. > --- > v5: Extend to also cover e.g. HPET, which in turn means explicitly > excluding PCI MMCFG ranges. > [integrated] v1: Integrate into series. > [standalone] v2: Keep IOMMU mappings in sync with CPU ones. > > --- a/xen/drivers/passthrough/x86/iommu.c > +++ b/xen/drivers/passthrough/x86/iommu.c > @@ -13,6 +13,7 @@ > */ > > #include <xen/sched.h> > +#include <xen/iocap.h> > #include <xen/iommu.h> > #include <xen/paging.h> > #include <xen/guest_access.h> > @@ -275,12 +276,12 @@ void iommu_identity_map_teardown(struct > } > } > > -static bool __hwdom_init hwdom_iommu_map(const struct domain *d, > - unsigned long pfn, > - unsigned long max_pfn) > +static unsigned int __hwdom_init hwdom_iommu_map(const struct domain *d, > + unsigned long pfn, > + unsigned long max_pfn) > { > mfn_t mfn = _mfn(pfn); > - unsigned int i, type; > + unsigned int i, type, perms = IOMMUF_readable | IOMMUF_writable; > > /* > * Set up 1:1 mapping for dom0. Default to include only conventional RAM > @@ -289,44 +290,75 @@ static bool __hwdom_init hwdom_iommu_map > * that fall in unusable ranges for PV Dom0. 
> */ > if ( (pfn > max_pfn && !mfn_valid(mfn)) || xen_in_range(pfn) ) > - return false; > + return 0; > > switch ( type = page_get_ram_type(mfn) ) > { > case RAM_TYPE_UNUSABLE: > - return false; > + return 0; > > case RAM_TYPE_CONVENTIONAL: > if ( iommu_hwdom_strict ) > - return false; > + return 0; > break; > > default: > if ( type & RAM_TYPE_RESERVED ) > { > if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved ) > - return false; > + perms = 0; > } > - else if ( is_hvm_domain(d) || !iommu_hwdom_inclusive || pfn > max_pfn ) > - return false; > + else if ( is_hvm_domain(d) ) > + return 0; > + else if ( !iommu_hwdom_inclusive || pfn > max_pfn ) > + perms = 0; > } > > /* Check that it doesn't overlap with the Interrupt Address Range. */ > if ( pfn >= 0xfee00 && pfn <= 0xfeeff ) > - return false; > + return 0; > /* ... or the IO-APIC */ > - for ( i = 0; has_vioapic(d) && i < d->arch.hvm.nr_vioapics; i++ ) > - if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) > - return false; > + if ( has_vioapic(d) ) > + { > + for ( i = 0; i < d->arch.hvm.nr_vioapics; i++ ) > + if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) > + return 0; > + } > + else if ( is_pv_domain(d) ) > + { > + /* > + * Be consistent with CPU mappings: Dom0 is permitted to establish r/o > + * ones there (also for e.g. HPET in certain cases), so it should also > + * have such established for IOMMUs. > + */ > + if ( iomem_access_permitted(d, pfn, pfn) && > + rangeset_contains_singleton(mmio_ro_ranges, pfn) ) > + perms = IOMMUF_readable; > + } > /* > * ... or the PCIe MCFG regions. > * TODO: runtime added MMCFG regions are not checked to make sure they > * don't overlap with already mapped regions, thus preventing trapping. > */ > if ( has_vpci(d) && vpci_is_mmcfg_address(d, pfn_to_paddr(pfn)) ) > - return false; > + return 0; > + else if ( is_pv_domain(d) ) > + { > + /* > + * Don't extend consistency with CPU mappings to PCI MMCFG regions. 
> + * These shouldn't be accessed via DMA by devices. Could you expand the comment a bit to explicitly mention the reason why MMCFG regions shouldn't be accessible from device DMA operations? Thanks, Roger.
On 31.05.2022 16:40, Roger Pau Monné wrote: > On Fri, May 27, 2022 at 01:12:06PM +0200, Jan Beulich wrote: >> While already the case for PVH, there's no reason to treat PV >> differently here, though of course the addresses get taken from another >> source in this case. Except that, to match CPU side mappings, by default >> we permit r/o ones. This then also means we now deal consistently with >> IO-APICs whose MMIO is or is not covered by E820 reserved regions. >> >> Signed-off-by: Jan Beulich <jbeulich@suse.com> > > Reviewed-by: Roger Pau Monné <roger.pau@citrix.com> Thanks. >> @@ -289,44 +290,75 @@ static bool __hwdom_init hwdom_iommu_map >> * that fall in unusable ranges for PV Dom0. >> */ >> if ( (pfn > max_pfn && !mfn_valid(mfn)) || xen_in_range(pfn) ) >> - return false; >> + return 0; >> >> switch ( type = page_get_ram_type(mfn) ) >> { >> case RAM_TYPE_UNUSABLE: >> - return false; >> + return 0; >> >> case RAM_TYPE_CONVENTIONAL: >> if ( iommu_hwdom_strict ) >> - return false; >> + return 0; >> break; >> >> default: >> if ( type & RAM_TYPE_RESERVED ) >> { >> if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved ) >> - return false; >> + perms = 0; >> } >> - else if ( is_hvm_domain(d) || !iommu_hwdom_inclusive || pfn > max_pfn ) >> - return false; >> + else if ( is_hvm_domain(d) ) >> + return 0; >> + else if ( !iommu_hwdom_inclusive || pfn > max_pfn ) >> + perms = 0; >> } >> >> /* Check that it doesn't overlap with the Interrupt Address Range. */ >> if ( pfn >= 0xfee00 && pfn <= 0xfeeff ) >> - return false; >> + return 0; >> /* ... 
or the IO-APIC */ >> - for ( i = 0; has_vioapic(d) && i < d->arch.hvm.nr_vioapics; i++ ) >> - if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) >> - return false; >> + if ( has_vioapic(d) ) >> + { >> + for ( i = 0; i < d->arch.hvm.nr_vioapics; i++ ) >> + if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) >> + return 0; >> + } >> + else if ( is_pv_domain(d) ) >> + { >> + /* >> + * Be consistent with CPU mappings: Dom0 is permitted to establish r/o >> + * ones there (also for e.g. HPET in certain cases), so it should also >> + * have such established for IOMMUs. >> + */ >> + if ( iomem_access_permitted(d, pfn, pfn) && >> + rangeset_contains_singleton(mmio_ro_ranges, pfn) ) >> + perms = IOMMUF_readable; >> + } >> /* >> * ... or the PCIe MCFG regions. With this comment (which I leave alone) ... >> * TODO: runtime added MMCFG regions are not checked to make sure they >> * don't overlap with already mapped regions, thus preventing trapping. >> */ >> if ( has_vpci(d) && vpci_is_mmcfg_address(d, pfn_to_paddr(pfn)) ) >> - return false; >> + return 0; >> + else if ( is_pv_domain(d) ) >> + { >> + /* >> + * Don't extend consistency with CPU mappings to PCI MMCFG regions. >> + * These shouldn't be accessed via DMA by devices. > > Could you expand the comment a bit to explicitly mention the reason > why MMCFG regions shouldn't be accessible from device DMA operations? ... it's hard to tell what I should write here. I'd expect extended reasoning to go there (if anywhere). I'd be okay adjusting the earlier comment, if only I knew what to write. "We don't want them to be accessed that way" seems a little blunt. I could say "Devices have other means to access PCI config space", but this not being said there I took as being implied. Or else what was the reason to exclude these for PVH Dom0? Jan
On Tue, May 31, 2022 at 05:40:03PM +0200, Jan Beulich wrote: > On 31.05.2022 16:40, Roger Pau Monné wrote: > > On Fri, May 27, 2022 at 01:12:06PM +0200, Jan Beulich wrote: > >> @@ -289,44 +290,75 @@ static bool __hwdom_init hwdom_iommu_map > >> * that fall in unusable ranges for PV Dom0. > >> */ > >> if ( (pfn > max_pfn && !mfn_valid(mfn)) || xen_in_range(pfn) ) > >> - return false; > >> + return 0; > >> > >> switch ( type = page_get_ram_type(mfn) ) > >> { > >> case RAM_TYPE_UNUSABLE: > >> - return false; > >> + return 0; > >> > >> case RAM_TYPE_CONVENTIONAL: > >> if ( iommu_hwdom_strict ) > >> - return false; > >> + return 0; > >> break; > >> > >> default: > >> if ( type & RAM_TYPE_RESERVED ) > >> { > >> if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved ) > >> - return false; > >> + perms = 0; > >> } > >> - else if ( is_hvm_domain(d) || !iommu_hwdom_inclusive || pfn > max_pfn ) > >> - return false; > >> + else if ( is_hvm_domain(d) ) > >> + return 0; > >> + else if ( !iommu_hwdom_inclusive || pfn > max_pfn ) > >> + perms = 0; > >> } > >> > >> /* Check that it doesn't overlap with the Interrupt Address Range. */ > >> if ( pfn >= 0xfee00 && pfn <= 0xfeeff ) > >> - return false; > >> + return 0; > >> /* ... or the IO-APIC */ > >> - for ( i = 0; has_vioapic(d) && i < d->arch.hvm.nr_vioapics; i++ ) > >> - if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) > >> - return false; > >> + if ( has_vioapic(d) ) > >> + { > >> + for ( i = 0; i < d->arch.hvm.nr_vioapics; i++ ) > >> + if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) > >> + return 0; > >> + } > >> + else if ( is_pv_domain(d) ) > >> + { > >> + /* > >> + * Be consistent with CPU mappings: Dom0 is permitted to establish r/o > >> + * ones there (also for e.g. HPET in certain cases), so it should also > >> + * have such established for IOMMUs. 
> >> + */ > >> + if ( iomem_access_permitted(d, pfn, pfn) && > >> + rangeset_contains_singleton(mmio_ro_ranges, pfn) ) > >> + perms = IOMMUF_readable; > >> + } > >> /* > >> * ... or the PCIe MCFG regions. > > With this comment (which I leave alone) ... > > >> * TODO: runtime added MMCFG regions are not checked to make sure they > >> * don't overlap with already mapped regions, thus preventing trapping. > >> */ > >> if ( has_vpci(d) && vpci_is_mmcfg_address(d, pfn_to_paddr(pfn)) ) > >> - return false; > >> + return 0; > >> + else if ( is_pv_domain(d) ) > >> + { > >> + /* > >> + * Don't extend consistency with CPU mappings to PCI MMCFG regions. > >> + * These shouldn't be accessed via DMA by devices. > > > > Could you expand the comment a bit to explicitly mention the reason > > why MMCFG regions shouldn't be accessible from device DMA operations? > > ... it's hard to tell what I should write here. I'd expect extended > reasoning to go there (if anywhere). I'd be okay adjusting the earlier > comment, if only I knew what to write. "We don't want them to be > accessed that way" seems a little blunt. I could say "Devices have > other means to access PCI config space", but this not being said there > I took as being implied. But we could likely say the same about IO-APIC or HPET MMIO regions. I don't think we expect them to be accessed by devices, yet we provide them for coherency with CPU side mappings in the PV case. > Or else what was the reason to exclude these > for PVH Dom0? The reason for PVH is because the config space is (partially) emulated for the hardware domain, so we don't allow untrapped access by the CPU either. Thanks, Roger.
On 31.05.2022 18:15, Roger Pau Monné wrote: > On Tue, May 31, 2022 at 05:40:03PM +0200, Jan Beulich wrote: >> On 31.05.2022 16:40, Roger Pau Monné wrote: >>> On Fri, May 27, 2022 at 01:12:06PM +0200, Jan Beulich wrote: >>>> @@ -289,44 +290,75 @@ static bool __hwdom_init hwdom_iommu_map >>>> * that fall in unusable ranges for PV Dom0. >>>> */ >>>> if ( (pfn > max_pfn && !mfn_valid(mfn)) || xen_in_range(pfn) ) >>>> - return false; >>>> + return 0; >>>> >>>> switch ( type = page_get_ram_type(mfn) ) >>>> { >>>> case RAM_TYPE_UNUSABLE: >>>> - return false; >>>> + return 0; >>>> >>>> case RAM_TYPE_CONVENTIONAL: >>>> if ( iommu_hwdom_strict ) >>>> - return false; >>>> + return 0; >>>> break; >>>> >>>> default: >>>> if ( type & RAM_TYPE_RESERVED ) >>>> { >>>> if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved ) >>>> - return false; >>>> + perms = 0; >>>> } >>>> - else if ( is_hvm_domain(d) || !iommu_hwdom_inclusive || pfn > max_pfn ) >>>> - return false; >>>> + else if ( is_hvm_domain(d) ) >>>> + return 0; >>>> + else if ( !iommu_hwdom_inclusive || pfn > max_pfn ) >>>> + perms = 0; >>>> } >>>> >>>> /* Check that it doesn't overlap with the Interrupt Address Range. */ >>>> if ( pfn >= 0xfee00 && pfn <= 0xfeeff ) >>>> - return false; >>>> + return 0; >>>> /* ... or the IO-APIC */ >>>> - for ( i = 0; has_vioapic(d) && i < d->arch.hvm.nr_vioapics; i++ ) >>>> - if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) >>>> - return false; >>>> + if ( has_vioapic(d) ) >>>> + { >>>> + for ( i = 0; i < d->arch.hvm.nr_vioapics; i++ ) >>>> + if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) >>>> + return 0; >>>> + } >>>> + else if ( is_pv_domain(d) ) >>>> + { >>>> + /* >>>> + * Be consistent with CPU mappings: Dom0 is permitted to establish r/o >>>> + * ones there (also for e.g. HPET in certain cases), so it should also >>>> + * have such established for IOMMUs. 
>>>> + */ >>>> + if ( iomem_access_permitted(d, pfn, pfn) && >>>> + rangeset_contains_singleton(mmio_ro_ranges, pfn) ) >>>> + perms = IOMMUF_readable; >>>> + } >>>> /* >>>> * ... or the PCIe MCFG regions. >> >> With this comment (which I leave alone) ... >> >>>> * TODO: runtime added MMCFG regions are not checked to make sure they >>>> * don't overlap with already mapped regions, thus preventing trapping. >>>> */ >>>> if ( has_vpci(d) && vpci_is_mmcfg_address(d, pfn_to_paddr(pfn)) ) >>>> - return false; >>>> + return 0; >>>> + else if ( is_pv_domain(d) ) >>>> + { >>>> + /* >>>> + * Don't extend consistency with CPU mappings to PCI MMCFG regions. >>>> + * These shouldn't be accessed via DMA by devices. >>> >>> Could you expand the comment a bit to explicitly mention the reason >>> why MMCFG regions shouldn't be accessible from device DMA operations? >> >> ... it's hard to tell what I should write here. I'd expect extended >> reasoning to go there (if anywhere). I'd be okay adjusting the earlier >> comment, if only I knew what to write. "We don't want them to be >> accessed that way" seems a little blunt. I could say "Devices have >> other means to access PCI config space", but this not being said there >> I took as being implied. > > But we could likely say the same about IO-APIC or HPET MMIO regions. > I don't think we expect them to be accessed by devices, yet we provide > them for coherency with CPU side mappings in the PV case. As to "say the same" - yes for the first part of my earlier reply, but no for the latter part. >> Or else what was the reason to exclude these >> for PVH Dom0? > > The reason for PVH is because the config space is (partially) emulated > for the hardware domain, so we don't allow untrapped access by the CPU > either. Hmm, right - there's read emulation there as well, while for PV we only intercept writes. So overall should we perhaps permit r/o access to MMCFG for PV? 
Of course that would only end up consistent once we adjust mappings dynamically when MMCFG ranges are put in use (IOW if we can't verify an MMCFG range is suitably reserved, we'd not find it in mmio_ro_ranges just yet, and hence we still wouldn't have an IOMMU side mapping even if CPU side mappings are permitted). But for the patch here it would simply mean dropping some of the code I did add for v5. Otherwise, i.e. if the code is to remain as is, I'm afraid I still wouldn't see what to put usefully in the comment. Jan
On Wed, Jun 01, 2022 at 09:10:09AM +0200, Jan Beulich wrote: > On 31.05.2022 18:15, Roger Pau Monné wrote: > > On Tue, May 31, 2022 at 05:40:03PM +0200, Jan Beulich wrote: > >> On 31.05.2022 16:40, Roger Pau Monné wrote: > >>> On Fri, May 27, 2022 at 01:12:06PM +0200, Jan Beulich wrote: > >>>> @@ -289,44 +290,75 @@ static bool __hwdom_init hwdom_iommu_map > >>>> * that fall in unusable ranges for PV Dom0. > >>>> */ > >>>> if ( (pfn > max_pfn && !mfn_valid(mfn)) || xen_in_range(pfn) ) > >>>> - return false; > >>>> + return 0; > >>>> > >>>> switch ( type = page_get_ram_type(mfn) ) > >>>> { > >>>> case RAM_TYPE_UNUSABLE: > >>>> - return false; > >>>> + return 0; > >>>> > >>>> case RAM_TYPE_CONVENTIONAL: > >>>> if ( iommu_hwdom_strict ) > >>>> - return false; > >>>> + return 0; > >>>> break; > >>>> > >>>> default: > >>>> if ( type & RAM_TYPE_RESERVED ) > >>>> { > >>>> if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved ) > >>>> - return false; > >>>> + perms = 0; > >>>> } > >>>> - else if ( is_hvm_domain(d) || !iommu_hwdom_inclusive || pfn > max_pfn ) > >>>> - return false; > >>>> + else if ( is_hvm_domain(d) ) > >>>> + return 0; > >>>> + else if ( !iommu_hwdom_inclusive || pfn > max_pfn ) > >>>> + perms = 0; > >>>> } > >>>> > >>>> /* Check that it doesn't overlap with the Interrupt Address Range. */ > >>>> if ( pfn >= 0xfee00 && pfn <= 0xfeeff ) > >>>> - return false; > >>>> + return 0; > >>>> /* ... or the IO-APIC */ > >>>> - for ( i = 0; has_vioapic(d) && i < d->arch.hvm.nr_vioapics; i++ ) > >>>> - if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) > >>>> - return false; > >>>> + if ( has_vioapic(d) ) > >>>> + { > >>>> + for ( i = 0; i < d->arch.hvm.nr_vioapics; i++ ) > >>>> + if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) > >>>> + return 0; > >>>> + } > >>>> + else if ( is_pv_domain(d) ) > >>>> + { > >>>> + /* > >>>> + * Be consistent with CPU mappings: Dom0 is permitted to establish r/o > >>>> + * ones there (also for e.g. 
HPET in certain cases), so it should also > >>>> + * have such established for IOMMUs. > >>>> + */ > >>>> + if ( iomem_access_permitted(d, pfn, pfn) && > >>>> + rangeset_contains_singleton(mmio_ro_ranges, pfn) ) > >>>> + perms = IOMMUF_readable; > >>>> + } > >>>> /* > >>>> * ... or the PCIe MCFG regions. > >> > >> With this comment (which I leave alone) ... > >> > >>>> * TODO: runtime added MMCFG regions are not checked to make sure they > >>>> * don't overlap with already mapped regions, thus preventing trapping. > >>>> */ > >>>> if ( has_vpci(d) && vpci_is_mmcfg_address(d, pfn_to_paddr(pfn)) ) > >>>> - return false; > >>>> + return 0; > >>>> + else if ( is_pv_domain(d) ) > >>>> + { > >>>> + /* > >>>> + * Don't extend consistency with CPU mappings to PCI MMCFG regions. > >>>> + * These shouldn't be accessed via DMA by devices. > >>> > >>> Could you expand the comment a bit to explicitly mention the reason > >>> why MMCFG regions shouldn't be accessible from device DMA operations? > >> > >> ... it's hard to tell what I should write here. I'd expect extended > >> reasoning to go there (if anywhere). I'd be okay adjusting the earlier > >> comment, if only I knew what to write. "We don't want them to be > >> accessed that way" seems a little blunt. I could say "Devices have > >> other means to access PCI config space", but this not being said there > >> I took as being implied. > > > > But we could likely say the same about IO-APIC or HPET MMIO regions. > > I don't think we expect them to be accessed by devices, yet we provide > > them for coherency with CPU side mappings in the PV case. > > As to "say the same" - yes for the first part of my earlier reply, but > no for the latter part. Yes, obviously devices cannot access the HPET or the IO-APIC MMIO from the PCI config space :). > >> Or else what was the reason to exclude these > >> for PVH Dom0? 
> > > > The reason for PVH is because the config space is (partially) emulated > > for the hardware domain, so we don't allow untrapped access by the CPU > > either. > > Hmm, right - there's read emulation there as well, while for PV we > only intercept writes. > > So overall should we perhaps permit r/o access to MMCFG for PV? Of > course that would only end up consistent once we adjust mappings > dynamically when MMCFG ranges are put in use (IOW if we can't verify > an MMCFG range is suitably reserved, we'd not find in > mmio_ro_ranges just yet, and hence we still wouldn't have an IOMMU > side mapping even if CPU side mappings are permitted). But for the > patch here it would simply mean dropping some of the code I did add > for v5. I would be OK with that, as I think we would then be consistent with how IO-APIC and HPET MMIO regions are handled. We would have to add some small helper/handling in PHYSDEVOP_pci_mmcfg_reserved for PV. > Otherwise, i.e. if the code is to remain as is, I'm afraid I still > wouldn't see what to put usefully in the comment. IMO the important part is to note whether there's a reason or not why the handling of IO-APIC, HPET vs MMCFG RO regions differ in PV mode. Ie: if we don't want to handle MMCFG in RO mode for device mappings because of the complication with handling dynamic changes as a result of PHYSDEVOP_pci_mmcfg_reserved we should just note it. Thanks, Roger.
On 01.06.2022 10:17, Roger Pau Monné wrote: > On Wed, Jun 01, 2022 at 09:10:09AM +0200, Jan Beulich wrote: >> On 31.05.2022 18:15, Roger Pau Monné wrote: >>> On Tue, May 31, 2022 at 05:40:03PM +0200, Jan Beulich wrote: >>>> On 31.05.2022 16:40, Roger Pau Monné wrote: >>>>> On Fri, May 27, 2022 at 01:12:06PM +0200, Jan Beulich wrote: >>>>>> @@ -289,44 +290,75 @@ static bool __hwdom_init hwdom_iommu_map >>>>>> * that fall in unusable ranges for PV Dom0. >>>>>> */ >>>>>> if ( (pfn > max_pfn && !mfn_valid(mfn)) || xen_in_range(pfn) ) >>>>>> - return false; >>>>>> + return 0; >>>>>> >>>>>> switch ( type = page_get_ram_type(mfn) ) >>>>>> { >>>>>> case RAM_TYPE_UNUSABLE: >>>>>> - return false; >>>>>> + return 0; >>>>>> >>>>>> case RAM_TYPE_CONVENTIONAL: >>>>>> if ( iommu_hwdom_strict ) >>>>>> - return false; >>>>>> + return 0; >>>>>> break; >>>>>> >>>>>> default: >>>>>> if ( type & RAM_TYPE_RESERVED ) >>>>>> { >>>>>> if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved ) >>>>>> - return false; >>>>>> + perms = 0; >>>>>> } >>>>>> - else if ( is_hvm_domain(d) || !iommu_hwdom_inclusive || pfn > max_pfn ) >>>>>> - return false; >>>>>> + else if ( is_hvm_domain(d) ) >>>>>> + return 0; >>>>>> + else if ( !iommu_hwdom_inclusive || pfn > max_pfn ) >>>>>> + perms = 0; >>>>>> } >>>>>> >>>>>> /* Check that it doesn't overlap with the Interrupt Address Range. */ >>>>>> if ( pfn >= 0xfee00 && pfn <= 0xfeeff ) >>>>>> - return false; >>>>>> + return 0; >>>>>> /* ... 
or the IO-APIC */ >>>>>> - for ( i = 0; has_vioapic(d) && i < d->arch.hvm.nr_vioapics; i++ ) >>>>>> - if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) >>>>>> - return false; >>>>>> + if ( has_vioapic(d) ) >>>>>> + { >>>>>> + for ( i = 0; i < d->arch.hvm.nr_vioapics; i++ ) >>>>>> + if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) >>>>>> + return 0; >>>>>> + } >>>>>> + else if ( is_pv_domain(d) ) >>>>>> + { >>>>>> + /* >>>>>> + * Be consistent with CPU mappings: Dom0 is permitted to establish r/o >>>>>> + * ones there (also for e.g. HPET in certain cases), so it should also >>>>>> + * have such established for IOMMUs. >>>>>> + */ >>>>>> + if ( iomem_access_permitted(d, pfn, pfn) && >>>>>> + rangeset_contains_singleton(mmio_ro_ranges, pfn) ) >>>>>> + perms = IOMMUF_readable; >>>>>> + } >>>>>> /* >>>>>> * ... or the PCIe MCFG regions. >>>> >>>> With this comment (which I leave alone) ... >>>> >>>>>> * TODO: runtime added MMCFG regions are not checked to make sure they >>>>>> * don't overlap with already mapped regions, thus preventing trapping. >>>>>> */ >>>>>> if ( has_vpci(d) && vpci_is_mmcfg_address(d, pfn_to_paddr(pfn)) ) >>>>>> - return false; >>>>>> + return 0; >>>>>> + else if ( is_pv_domain(d) ) >>>>>> + { >>>>>> + /* >>>>>> + * Don't extend consistency with CPU mappings to PCI MMCFG regions. >>>>>> + * These shouldn't be accessed via DMA by devices. >>>>> >>>>> Could you expand the comment a bit to explicitly mention the reason >>>>> why MMCFG regions shouldn't be accessible from device DMA operations? >>>> >>>> ... it's hard to tell what I should write here. I'd expect extended >>>> reasoning to go there (if anywhere). I'd be okay adjusting the earlier >>>> comment, if only I knew what to write. "We don't want them to be >>>> accessed that way" seems a little blunt. I could say "Devices have >>>> other means to access PCI config space", but this not being said there >>>> I took as being implied. 
>>> >>> But we could likely say the same about IO-APIC or HPET MMIO regions. >>> I don't think we expect them to be accessed by devices, yet we provide >>> them for coherency with CPU side mappings in the PV case. >> >> As to "say the same" - yes for the first part of my earlier reply, but >> no for the latter part. > > Yes, obviously devices cannot access the HPET or the IO-APIC MMIO from > the PCI config space :). > >>>> Or else what was the reason to exclude these >>>> for PVH Dom0? >>> >>> The reason for PVH is because the config space is (partially) emulated >>> for the hardware domain, so we don't allow untrapped access by the CPU >>> either. >> >> Hmm, right - there's read emulation there as well, while for PV we >> only intercept writes. >> >> So overall should we perhaps permit r/o access to MMCFG for PV? Of >> course that would only end up consistent once we adjust mappings >> dynamically when MMCFG ranges are put in use (IOW if we can't verify >> an MMCFG range is suitably reserved, we'd not find in >> mmio_ro_ranges just yet, and hence we still wouldn't have an IOMMU >> side mapping even if CPU side mappings are permitted). But for the >> patch here it would simply mean dropping some of the code I did add >> for v5. > > I would be OK with that, as I think we would then be consistent with > how IO-APIC and HPET MMIO regions are handled. We would have to add > some small helper/handling in PHYSDEVOP_pci_mmcfg_reserved for PV. Okay, I'll drop that code again then. But I'm not going to look into making the dynamic part work, at least not within this series. Jan
--- a/xen/drivers/passthrough/x86/iommu.c +++ b/xen/drivers/passthrough/x86/iommu.c @@ -13,6 +13,7 @@ */ #include <xen/sched.h> +#include <xen/iocap.h> #include <xen/iommu.h> #include <xen/paging.h> #include <xen/guest_access.h> @@ -275,12 +276,12 @@ void iommu_identity_map_teardown(struct } } -static bool __hwdom_init hwdom_iommu_map(const struct domain *d, - unsigned long pfn, - unsigned long max_pfn) +static unsigned int __hwdom_init hwdom_iommu_map(const struct domain *d, + unsigned long pfn, + unsigned long max_pfn) { mfn_t mfn = _mfn(pfn); - unsigned int i, type; + unsigned int i, type, perms = IOMMUF_readable | IOMMUF_writable; /* * Set up 1:1 mapping for dom0. Default to include only conventional RAM @@ -289,44 +290,75 @@ static bool __hwdom_init hwdom_iommu_map * that fall in unusable ranges for PV Dom0. */ if ( (pfn > max_pfn && !mfn_valid(mfn)) || xen_in_range(pfn) ) - return false; + return 0; switch ( type = page_get_ram_type(mfn) ) { case RAM_TYPE_UNUSABLE: - return false; + return 0; case RAM_TYPE_CONVENTIONAL: if ( iommu_hwdom_strict ) - return false; + return 0; break; default: if ( type & RAM_TYPE_RESERVED ) { if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved ) - return false; + perms = 0; } - else if ( is_hvm_domain(d) || !iommu_hwdom_inclusive || pfn > max_pfn ) - return false; + else if ( is_hvm_domain(d) ) + return 0; + else if ( !iommu_hwdom_inclusive || pfn > max_pfn ) + perms = 0; } /* Check that it doesn't overlap with the Interrupt Address Range. */ if ( pfn >= 0xfee00 && pfn <= 0xfeeff ) - return false; + return 0; /* ... 
or the IO-APIC */ - for ( i = 0; has_vioapic(d) && i < d->arch.hvm.nr_vioapics; i++ ) - if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) - return false; + if ( has_vioapic(d) ) + { + for ( i = 0; i < d->arch.hvm.nr_vioapics; i++ ) + if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) + return 0; + } + else if ( is_pv_domain(d) ) + { + /* + * Be consistent with CPU mappings: Dom0 is permitted to establish r/o + * ones there (also for e.g. HPET in certain cases), so it should also + * have such established for IOMMUs. + */ + if ( iomem_access_permitted(d, pfn, pfn) && + rangeset_contains_singleton(mmio_ro_ranges, pfn) ) + perms = IOMMUF_readable; + } /* * ... or the PCIe MCFG regions. * TODO: runtime added MMCFG regions are not checked to make sure they * don't overlap with already mapped regions, thus preventing trapping. */ if ( has_vpci(d) && vpci_is_mmcfg_address(d, pfn_to_paddr(pfn)) ) - return false; + return 0; + else if ( is_pv_domain(d) ) + { + /* + * Don't extend consistency with CPU mappings to PCI MMCFG regions. + * These shouldn't be accessed via DMA by devices. + */ + const struct acpi_mcfg_allocation *cfg = pci_mmcfg_config; + + for ( i = 0; i < pci_mmcfg_config_num; ++i, ++cfg ) + if ( pfn >= PFN_DOWN(cfg->address) + PCI_BDF(cfg->start_bus_number, + 0, 0) && + pfn <= PFN_DOWN(cfg->address) + PCI_BDF(cfg->end_bus_number, + ~0, ~0)) + return 0; + } - return true; + return perms; } void __hwdom_init arch_iommu_hwdom_init(struct domain *d) @@ -368,15 +400,19 @@ void __hwdom_init arch_iommu_hwdom_init( for ( ; i < top; i++ ) { unsigned long pfn = pdx_to_pfn(i); + unsigned int perms = hwdom_iommu_map(d, pfn, max_pfn); int rc; - if ( !hwdom_iommu_map(d, pfn, max_pfn) ) + if ( !perms ) rc = 0; else if ( paging_mode_translate(d) ) - rc = p2m_add_identity_entry(d, pfn, p2m_access_rw, 0); + rc = p2m_add_identity_entry(d, pfn, + perms & IOMMUF_writable ? 
p2m_access_rw + : p2m_access_r, + 0); else rc = iommu_map(d, _dfn(pfn), _mfn(pfn), 1ul << PAGE_ORDER_4K, - IOMMUF_readable | IOMMUF_writable, &flush_flags); + perms, &flush_flags); if ( rc ) printk(XENLOG_WARNING "%pd: identity %smapping of %lx failed: %d\n",
While already the case for PVH, there's no reason to treat PV differently here, though of course the addresses get taken from another source in this case. Except that, to match CPU side mappings, by default we permit r/o ones. This then also means we now deal consistently with IO-APICs whose MMIO is or is not covered by E820 reserved regions. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- v5: Extend to also cover e.g. HPET, which in turn means explicitly excluding PCI MMCFG ranges. [integrated] v1: Integrate into series. [standalone] v2: Keep IOMMU mappings in sync with CPU ones.