Message ID | 5cb4dc1b-f6b0-89cc-e21c-a27a5daf0290@suse.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
Series | IOMMU: superpage support when not sharing pagetables | expand |
On Mon, Apr 25, 2022 at 10:34:23AM +0200, Jan Beulich wrote: > While already the case for PVH, there's no reason to treat PV > differently here, though of course the addresses get taken from another > source in this case. Except that, to match CPU side mappings, by default > we permit r/o ones. This then also means we now deal consistently with > IO-APICs whose MMIO is or is not covered by E820 reserved regions. > > Signed-off-by: Jan Beulich <jbeulich@suse.com> > --- > [integrated] v1: Integrate into series. > [standalone] v2: Keep IOMMU mappings in sync with CPU ones. > > --- a/xen/drivers/passthrough/x86/iommu.c > +++ b/xen/drivers/passthrough/x86/iommu.c > @@ -275,12 +275,12 @@ void iommu_identity_map_teardown(struct > } > } > > -static bool __hwdom_init hwdom_iommu_map(const struct domain *d, > - unsigned long pfn, > - unsigned long max_pfn) > +static unsigned int __hwdom_init hwdom_iommu_map(const struct domain *d, > + unsigned long pfn, > + unsigned long max_pfn) > { > mfn_t mfn = _mfn(pfn); > - unsigned int i, type; > + unsigned int i, type, perms = IOMMUF_readable | IOMMUF_writable; > > /* > * Set up 1:1 mapping for dom0. Default to include only conventional RAM > @@ -289,44 +289,60 @@ static bool __hwdom_init hwdom_iommu_map > * that fall in unusable ranges for PV Dom0. 
> */ > if ( (pfn > max_pfn && !mfn_valid(mfn)) || xen_in_range(pfn) ) > - return false; > + return 0; > > switch ( type = page_get_ram_type(mfn) ) > { > case RAM_TYPE_UNUSABLE: > - return false; > + return 0; > > case RAM_TYPE_CONVENTIONAL: > if ( iommu_hwdom_strict ) > - return false; > + return 0; > break; > > default: > if ( type & RAM_TYPE_RESERVED ) > { > if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved ) > - return false; > + perms = 0; > } > - else if ( is_hvm_domain(d) || !iommu_hwdom_inclusive || pfn > max_pfn ) > - return false; > + else if ( is_hvm_domain(d) ) > + return 0; > + else if ( !iommu_hwdom_inclusive || pfn > max_pfn ) > + perms = 0; > } > > /* Check that it doesn't overlap with the Interrupt Address Range. */ > if ( pfn >= 0xfee00 && pfn <= 0xfeeff ) > - return false; > + return 0; > /* ... or the IO-APIC */ > - for ( i = 0; has_vioapic(d) && i < d->arch.hvm.nr_vioapics; i++ ) > - if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) > - return false; > + if ( has_vioapic(d) ) > + { > + for ( i = 0; i < d->arch.hvm.nr_vioapics; i++ ) > + if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) > + return 0; > + } > + else if ( is_pv_domain(d) ) > + { > + /* > + * Be consistent with CPU mappings: Dom0 is permitted to establish r/o > + * ones there, so it should also have such established for IOMMUs. > + */ > + for ( i = 0; i < nr_ioapics; i++ ) > + if ( pfn == PFN_DOWN(mp_ioapics[i].mpc_apicaddr) ) > + return rangeset_contains_singleton(mmio_ro_ranges, pfn) > + ? IOMMUF_readable : 0; If we really are after consistency with CPU side mappings, we should likely take the whole contents of mmio_ro_ranges and d->iomem_caps into account, not just the pages belonging to the IO-APIC? There could also be HPET pages mapped as RO for PV. Thanks, Roger.
On 03.05.2022 15:00, Roger Pau Monné wrote: > On Mon, Apr 25, 2022 at 10:34:23AM +0200, Jan Beulich wrote: >> While already the case for PVH, there's no reason to treat PV >> differently here, though of course the addresses get taken from another >> source in this case. Except that, to match CPU side mappings, by default >> we permit r/o ones. This then also means we now deal consistently with >> IO-APICs whose MMIO is or is not covered by E820 reserved regions. >> >> Signed-off-by: Jan Beulich <jbeulich@suse.com> >> --- >> [integrated] v1: Integrate into series. >> [standalone] v2: Keep IOMMU mappings in sync with CPU ones. >> >> --- a/xen/drivers/passthrough/x86/iommu.c >> +++ b/xen/drivers/passthrough/x86/iommu.c >> @@ -275,12 +275,12 @@ void iommu_identity_map_teardown(struct >> } >> } >> >> -static bool __hwdom_init hwdom_iommu_map(const struct domain *d, >> - unsigned long pfn, >> - unsigned long max_pfn) >> +static unsigned int __hwdom_init hwdom_iommu_map(const struct domain *d, >> + unsigned long pfn, >> + unsigned long max_pfn) >> { >> mfn_t mfn = _mfn(pfn); >> - unsigned int i, type; >> + unsigned int i, type, perms = IOMMUF_readable | IOMMUF_writable; >> >> /* >> * Set up 1:1 mapping for dom0. Default to include only conventional RAM >> @@ -289,44 +289,60 @@ static bool __hwdom_init hwdom_iommu_map >> * that fall in unusable ranges for PV Dom0. 
>> */ >> if ( (pfn > max_pfn && !mfn_valid(mfn)) || xen_in_range(pfn) ) >> - return false; >> + return 0; >> >> switch ( type = page_get_ram_type(mfn) ) >> { >> case RAM_TYPE_UNUSABLE: >> - return false; >> + return 0; >> >> case RAM_TYPE_CONVENTIONAL: >> if ( iommu_hwdom_strict ) >> - return false; >> + return 0; >> break; >> >> default: >> if ( type & RAM_TYPE_RESERVED ) >> { >> if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved ) >> - return false; >> + perms = 0; >> } >> - else if ( is_hvm_domain(d) || !iommu_hwdom_inclusive || pfn > max_pfn ) >> - return false; >> + else if ( is_hvm_domain(d) ) >> + return 0; >> + else if ( !iommu_hwdom_inclusive || pfn > max_pfn ) >> + perms = 0; >> } >> >> /* Check that it doesn't overlap with the Interrupt Address Range. */ >> if ( pfn >= 0xfee00 && pfn <= 0xfeeff ) >> - return false; >> + return 0; >> /* ... or the IO-APIC */ >> - for ( i = 0; has_vioapic(d) && i < d->arch.hvm.nr_vioapics; i++ ) >> - if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) >> - return false; >> + if ( has_vioapic(d) ) >> + { >> + for ( i = 0; i < d->arch.hvm.nr_vioapics; i++ ) >> + if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) >> + return 0; >> + } >> + else if ( is_pv_domain(d) ) >> + { >> + /* >> + * Be consistent with CPU mappings: Dom0 is permitted to establish r/o >> + * ones there, so it should also have such established for IOMMUs. >> + */ >> + for ( i = 0; i < nr_ioapics; i++ ) >> + if ( pfn == PFN_DOWN(mp_ioapics[i].mpc_apicaddr) ) >> + return rangeset_contains_singleton(mmio_ro_ranges, pfn) >> + ? IOMMUF_readable : 0; > > If we really are after consistency with CPU side mappings, we should > likely take the whole contents of mmio_ro_ranges and d->iomem_caps > into account, not just the pages belonging to the IO-APIC? > > There could also be HPET pages mapped as RO for PV. Hmm. This would be a yet bigger functional change, but indeed would further improve consistency. 
But shouldn't we then also establish r/w mappings for stuff in ->iomem_caps but not in mmio_ro_ranges? This would feel like going too far ... Jan
On 03.05.2022 16:50, Jan Beulich wrote: > On 03.05.2022 15:00, Roger Pau Monné wrote: >> On Mon, Apr 25, 2022 at 10:34:23AM +0200, Jan Beulich wrote: >>> While already the case for PVH, there's no reason to treat PV >>> differently here, though of course the addresses get taken from another >>> source in this case. Except that, to match CPU side mappings, by default >>> we permit r/o ones. This then also means we now deal consistently with >>> IO-APICs whose MMIO is or is not covered by E820 reserved regions. >>> >>> Signed-off-by: Jan Beulich <jbeulich@suse.com> >>> --- >>> [integrated] v1: Integrate into series. >>> [standalone] v2: Keep IOMMU mappings in sync with CPU ones. >>> >>> --- a/xen/drivers/passthrough/x86/iommu.c >>> +++ b/xen/drivers/passthrough/x86/iommu.c >>> @@ -275,12 +275,12 @@ void iommu_identity_map_teardown(struct >>> } >>> } >>> >>> -static bool __hwdom_init hwdom_iommu_map(const struct domain *d, >>> - unsigned long pfn, >>> - unsigned long max_pfn) >>> +static unsigned int __hwdom_init hwdom_iommu_map(const struct domain *d, >>> + unsigned long pfn, >>> + unsigned long max_pfn) >>> { >>> mfn_t mfn = _mfn(pfn); >>> - unsigned int i, type; >>> + unsigned int i, type, perms = IOMMUF_readable | IOMMUF_writable; >>> >>> /* >>> * Set up 1:1 mapping for dom0. Default to include only conventional RAM >>> @@ -289,44 +289,60 @@ static bool __hwdom_init hwdom_iommu_map >>> * that fall in unusable ranges for PV Dom0. 
>>> */ >>> if ( (pfn > max_pfn && !mfn_valid(mfn)) || xen_in_range(pfn) ) >>> - return false; >>> + return 0; >>> >>> switch ( type = page_get_ram_type(mfn) ) >>> { >>> case RAM_TYPE_UNUSABLE: >>> - return false; >>> + return 0; >>> >>> case RAM_TYPE_CONVENTIONAL: >>> if ( iommu_hwdom_strict ) >>> - return false; >>> + return 0; >>> break; >>> >>> default: >>> if ( type & RAM_TYPE_RESERVED ) >>> { >>> if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved ) >>> - return false; >>> + perms = 0; >>> } >>> - else if ( is_hvm_domain(d) || !iommu_hwdom_inclusive || pfn > max_pfn ) >>> - return false; >>> + else if ( is_hvm_domain(d) ) >>> + return 0; >>> + else if ( !iommu_hwdom_inclusive || pfn > max_pfn ) >>> + perms = 0; >>> } >>> >>> /* Check that it doesn't overlap with the Interrupt Address Range. */ >>> if ( pfn >= 0xfee00 && pfn <= 0xfeeff ) >>> - return false; >>> + return 0; >>> /* ... or the IO-APIC */ >>> - for ( i = 0; has_vioapic(d) && i < d->arch.hvm.nr_vioapics; i++ ) >>> - if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) >>> - return false; >>> + if ( has_vioapic(d) ) >>> + { >>> + for ( i = 0; i < d->arch.hvm.nr_vioapics; i++ ) >>> + if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) >>> + return 0; >>> + } >>> + else if ( is_pv_domain(d) ) >>> + { >>> + /* >>> + * Be consistent with CPU mappings: Dom0 is permitted to establish r/o >>> + * ones there, so it should also have such established for IOMMUs. >>> + */ >>> + for ( i = 0; i < nr_ioapics; i++ ) >>> + if ( pfn == PFN_DOWN(mp_ioapics[i].mpc_apicaddr) ) >>> + return rangeset_contains_singleton(mmio_ro_ranges, pfn) >>> + ? IOMMUF_readable : 0; >> >> If we really are after consistency with CPU side mappings, we should >> likely take the whole contents of mmio_ro_ranges and d->iomem_caps >> into account, not just the pages belonging to the IO-APIC? >> >> There could also be HPET pages mapped as RO for PV. > > Hmm. 
This would be a yet bigger functional change, but indeed would further > improve consistency. But shouldn't we then also establish r/w mappings for > stuff in ->iomem_caps but not in mmio_ro_ranges? This would feel like going > too far ... FTAOD I didn't mean to say that I think such mappings shouldn't be there; I have been of the opinion that e.g. I/O directly to/from the linear frame buffer of a graphics device should in principle be permitted. But which specific mappings to put in place can imo not be derived from ->iomem_caps, as we merely subtract certain ranges after initially having set all bits in it. Besides ranges not mapping any MMIO, even something like the PCI ECAM ranges (parts of which we may also force to r/o, and which we would hence cover here if I followed your suggestion) are questionable in this regard. Jan
On Wed, May 04, 2022 at 11:32:51AM +0200, Jan Beulich wrote: > On 03.05.2022 16:50, Jan Beulich wrote: > > On 03.05.2022 15:00, Roger Pau Monné wrote: > >> On Mon, Apr 25, 2022 at 10:34:23AM +0200, Jan Beulich wrote: > >>> While already the case for PVH, there's no reason to treat PV > >>> differently here, though of course the addresses get taken from another > >>> source in this case. Except that, to match CPU side mappings, by default > >>> we permit r/o ones. This then also means we now deal consistently with > >>> IO-APICs whose MMIO is or is not covered by E820 reserved regions. > >>> > >>> Signed-off-by: Jan Beulich <jbeulich@suse.com> > >>> --- > >>> [integrated] v1: Integrate into series. > >>> [standalone] v2: Keep IOMMU mappings in sync with CPU ones. > >>> > >>> --- a/xen/drivers/passthrough/x86/iommu.c > >>> +++ b/xen/drivers/passthrough/x86/iommu.c > >>> @@ -275,12 +275,12 @@ void iommu_identity_map_teardown(struct > >>> } > >>> } > >>> > >>> -static bool __hwdom_init hwdom_iommu_map(const struct domain *d, > >>> - unsigned long pfn, > >>> - unsigned long max_pfn) > >>> +static unsigned int __hwdom_init hwdom_iommu_map(const struct domain *d, > >>> + unsigned long pfn, > >>> + unsigned long max_pfn) > >>> { > >>> mfn_t mfn = _mfn(pfn); > >>> - unsigned int i, type; > >>> + unsigned int i, type, perms = IOMMUF_readable | IOMMUF_writable; > >>> > >>> /* > >>> * Set up 1:1 mapping for dom0. Default to include only conventional RAM > >>> @@ -289,44 +289,60 @@ static bool __hwdom_init hwdom_iommu_map > >>> * that fall in unusable ranges for PV Dom0. 
> >>> */ > >>> if ( (pfn > max_pfn && !mfn_valid(mfn)) || xen_in_range(pfn) ) > >>> - return false; > >>> + return 0; > >>> > >>> switch ( type = page_get_ram_type(mfn) ) > >>> { > >>> case RAM_TYPE_UNUSABLE: > >>> - return false; > >>> + return 0; > >>> > >>> case RAM_TYPE_CONVENTIONAL: > >>> if ( iommu_hwdom_strict ) > >>> - return false; > >>> + return 0; > >>> break; > >>> > >>> default: > >>> if ( type & RAM_TYPE_RESERVED ) > >>> { > >>> if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved ) > >>> - return false; > >>> + perms = 0; > >>> } > >>> - else if ( is_hvm_domain(d) || !iommu_hwdom_inclusive || pfn > max_pfn ) > >>> - return false; > >>> + else if ( is_hvm_domain(d) ) > >>> + return 0; > >>> + else if ( !iommu_hwdom_inclusive || pfn > max_pfn ) > >>> + perms = 0; > >>> } > >>> > >>> /* Check that it doesn't overlap with the Interrupt Address Range. */ > >>> if ( pfn >= 0xfee00 && pfn <= 0xfeeff ) > >>> - return false; > >>> + return 0; > >>> /* ... or the IO-APIC */ > >>> - for ( i = 0; has_vioapic(d) && i < d->arch.hvm.nr_vioapics; i++ ) > >>> - if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) > >>> - return false; > >>> + if ( has_vioapic(d) ) > >>> + { > >>> + for ( i = 0; i < d->arch.hvm.nr_vioapics; i++ ) > >>> + if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) > >>> + return 0; > >>> + } > >>> + else if ( is_pv_domain(d) ) > >>> + { > >>> + /* > >>> + * Be consistent with CPU mappings: Dom0 is permitted to establish r/o > >>> + * ones there, so it should also have such established for IOMMUs. > >>> + */ > >>> + for ( i = 0; i < nr_ioapics; i++ ) > >>> + if ( pfn == PFN_DOWN(mp_ioapics[i].mpc_apicaddr) ) > >>> + return rangeset_contains_singleton(mmio_ro_ranges, pfn) > >>> + ? IOMMUF_readable : 0; > >> > >> If we really are after consistency with CPU side mappings, we should > >> likely take the whole contents of mmio_ro_ranges and d->iomem_caps > >> into account, not just the pages belonging to the IO-APIC? 
> >> > >> There could also be HPET pages mapped as RO for PV. > > > > Hmm. This would be a yet bigger functional change, but indeed would further > > improve consistency. But shouldn't we then also establish r/w mappings for > > stuff in ->iomem_caps but not in mmio_ro_ranges? This would feel like going > > too far ... > > FTAOD I didn't mean to say that I think such mappings shouldn't be there; > I have been of the opinion that e.g. I/O directly to/from the linear > frame buffer of a graphics device should in principle be permitted. But > which specific mappings to put in place can imo not be derived from > ->iomem_caps, as we merely subtract certain ranges after initially having > set all bits in it. Besides ranges not mapping any MMIO, even something > like the PCI ECAM ranges (parts of which we may also force to r/o, and > which we would hence cover here if I followed your suggestion) are > questionable in this regard. Right, ->iomem_caps is indeed too wide for our purpose. What about using something like: else if ( is_pv_domain(d) ) { if ( !iomem_access_permitted(d, pfn, pfn) ) return 0; if ( rangeset_contains_singleton(mmio_ro_ranges, pfn) ) return IOMMUF_readable; } That would get us a bit closer to allowed CPU side mappings, and we don't need to special case IO-APIC or HPET addresses as those are already added to ->iomem_caps or mmio_ro_ranges respectively by dom0_setup_permissions(). Thanks, Roger.
On 04.05.2022 12:30, Roger Pau Monné wrote: > On Wed, May 04, 2022 at 11:32:51AM +0200, Jan Beulich wrote: >> On 03.05.2022 16:50, Jan Beulich wrote: >>> On 03.05.2022 15:00, Roger Pau Monné wrote: >>>> On Mon, Apr 25, 2022 at 10:34:23AM +0200, Jan Beulich wrote: >>>>> While already the case for PVH, there's no reason to treat PV >>>>> differently here, though of course the addresses get taken from another >>>>> source in this case. Except that, to match CPU side mappings, by default >>>>> we permit r/o ones. This then also means we now deal consistently with >>>>> IO-APICs whose MMIO is or is not covered by E820 reserved regions. >>>>> >>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com> >>>>> --- >>>>> [integrated] v1: Integrate into series. >>>>> [standalone] v2: Keep IOMMU mappings in sync with CPU ones. >>>>> >>>>> --- a/xen/drivers/passthrough/x86/iommu.c >>>>> +++ b/xen/drivers/passthrough/x86/iommu.c >>>>> @@ -275,12 +275,12 @@ void iommu_identity_map_teardown(struct >>>>> } >>>>> } >>>>> >>>>> -static bool __hwdom_init hwdom_iommu_map(const struct domain *d, >>>>> - unsigned long pfn, >>>>> - unsigned long max_pfn) >>>>> +static unsigned int __hwdom_init hwdom_iommu_map(const struct domain *d, >>>>> + unsigned long pfn, >>>>> + unsigned long max_pfn) >>>>> { >>>>> mfn_t mfn = _mfn(pfn); >>>>> - unsigned int i, type; >>>>> + unsigned int i, type, perms = IOMMUF_readable | IOMMUF_writable; >>>>> >>>>> /* >>>>> * Set up 1:1 mapping for dom0. Default to include only conventional RAM >>>>> @@ -289,44 +289,60 @@ static bool __hwdom_init hwdom_iommu_map >>>>> * that fall in unusable ranges for PV Dom0. 
>>>>> */ >>>>> if ( (pfn > max_pfn && !mfn_valid(mfn)) || xen_in_range(pfn) ) >>>>> - return false; >>>>> + return 0; >>>>> >>>>> switch ( type = page_get_ram_type(mfn) ) >>>>> { >>>>> case RAM_TYPE_UNUSABLE: >>>>> - return false; >>>>> + return 0; >>>>> >>>>> case RAM_TYPE_CONVENTIONAL: >>>>> if ( iommu_hwdom_strict ) >>>>> - return false; >>>>> + return 0; >>>>> break; >>>>> >>>>> default: >>>>> if ( type & RAM_TYPE_RESERVED ) >>>>> { >>>>> if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved ) >>>>> - return false; >>>>> + perms = 0; >>>>> } >>>>> - else if ( is_hvm_domain(d) || !iommu_hwdom_inclusive || pfn > max_pfn ) >>>>> - return false; >>>>> + else if ( is_hvm_domain(d) ) >>>>> + return 0; >>>>> + else if ( !iommu_hwdom_inclusive || pfn > max_pfn ) >>>>> + perms = 0; >>>>> } >>>>> >>>>> /* Check that it doesn't overlap with the Interrupt Address Range. */ >>>>> if ( pfn >= 0xfee00 && pfn <= 0xfeeff ) >>>>> - return false; >>>>> + return 0; >>>>> /* ... or the IO-APIC */ >>>>> - for ( i = 0; has_vioapic(d) && i < d->arch.hvm.nr_vioapics; i++ ) >>>>> - if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) >>>>> - return false; >>>>> + if ( has_vioapic(d) ) >>>>> + { >>>>> + for ( i = 0; i < d->arch.hvm.nr_vioapics; i++ ) >>>>> + if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) >>>>> + return 0; >>>>> + } >>>>> + else if ( is_pv_domain(d) ) >>>>> + { >>>>> + /* >>>>> + * Be consistent with CPU mappings: Dom0 is permitted to establish r/o >>>>> + * ones there, so it should also have such established for IOMMUs. >>>>> + */ >>>>> + for ( i = 0; i < nr_ioapics; i++ ) >>>>> + if ( pfn == PFN_DOWN(mp_ioapics[i].mpc_apicaddr) ) >>>>> + return rangeset_contains_singleton(mmio_ro_ranges, pfn) >>>>> + ? IOMMUF_readable : 0; >>>> >>>> If we really are after consistency with CPU side mappings, we should >>>> likely take the whole contents of mmio_ro_ranges and d->iomem_caps >>>> into account, not just the pages belonging to the IO-APIC? 
>>>> >>>> There could also be HPET pages mapped as RO for PV. >>> >>> Hmm. This would be a yet bigger functional change, but indeed would further >>> improve consistency. But shouldn't we then also establish r/w mappings for >>> stuff in ->iomem_caps but not in mmio_ro_ranges? This would feel like going >>> too far ... >> >> FTAOD I didn't mean to say that I think such mappings shouldn't be there; >> I have been of the opinion that e.g. I/O directly to/from the linear >> frame buffer of a graphics device should in principle be permitted. But >> which specific mappings to put in place can imo not be derived from >> ->iomem_caps, as we merely subtract certain ranges after initially having >> set all bits in it. Besides ranges not mapping any MMIO, even something >> like the PCI ECAM ranges (parts of which we may also force to r/o, and >> which we would hence cover here if I followed your suggestion) are >> questionable in this regard. > > Right, ->iomem_caps is indeed too wide for our purpose. What > about using something like: > > else if ( is_pv_domain(d) ) > { > if ( !iomem_access_permitted(d, pfn, pfn) ) > return 0; We can't return 0 here (as RAM pages also make it here when !iommu_hwdom_strict), so I can at best take this as a vague outline of what you really mean. And I don't want to rely on RAM pages being (imo wrongly) represented by set bits in Dom0's iomem_caps. > if ( rangeset_contains_singleton(mmio_ro_ranges, pfn) ) > return IOMMUF_readable; > } > > That would get us a bit closer to allowed CPU side mappings, and we > don't need to special case IO-APIC or HPET addresses as those are > already added to ->iomem_caps or mmio_ro_ranges respectively by > dom0_setup_permissions(). This won't fit in a region of code framed by a (split) comment saying "Check that it doesn't overlap with ...". Hence if anything I could put something like this further down. 
Yet even then the question remains what to do with ranges which pass iomem_access_permitted() but - aren't really MMIO, - are inside MMCFG, - are otherwise special. Or did you perhaps mean to suggest something like else if ( is_pv_domain(d) && iomem_access_permitted(d, pfn, pfn) && rangeset_contains_singleton(mmio_ro_ranges, pfn) ) return IOMMUF_readable; ? Then there would only remain the question of whether mapping r/o MMCFG pages is okay (I don't think it is), but that could then be special-cased similar to what's done further down for vPCI (by not returning in the "else if", but merely updating "perms"). Jan
On Wed, May 04, 2022 at 12:51:25PM +0200, Jan Beulich wrote: > On 04.05.2022 12:30, Roger Pau Monné wrote: > > On Wed, May 04, 2022 at 11:32:51AM +0200, Jan Beulich wrote: > >> On 03.05.2022 16:50, Jan Beulich wrote: > >>> On 03.05.2022 15:00, Roger Pau Monné wrote: > >>>> On Mon, Apr 25, 2022 at 10:34:23AM +0200, Jan Beulich wrote: > >>>>> While already the case for PVH, there's no reason to treat PV > >>>>> differently here, though of course the addresses get taken from another > >>>>> source in this case. Except that, to match CPU side mappings, by default > >>>>> we permit r/o ones. This then also means we now deal consistently with > >>>>> IO-APICs whose MMIO is or is not covered by E820 reserved regions. > >>>>> > >>>>> Signed-off-by: Jan Beulich <jbeulich@suse.com> > >>>>> --- > >>>>> [integrated] v1: Integrate into series. > >>>>> [standalone] v2: Keep IOMMU mappings in sync with CPU ones. > >>>>> > >>>>> --- a/xen/drivers/passthrough/x86/iommu.c > >>>>> +++ b/xen/drivers/passthrough/x86/iommu.c > >>>>> @@ -275,12 +275,12 @@ void iommu_identity_map_teardown(struct > >>>>> } > >>>>> } > >>>>> > >>>>> -static bool __hwdom_init hwdom_iommu_map(const struct domain *d, > >>>>> - unsigned long pfn, > >>>>> - unsigned long max_pfn) > >>>>> +static unsigned int __hwdom_init hwdom_iommu_map(const struct domain *d, > >>>>> + unsigned long pfn, > >>>>> + unsigned long max_pfn) > >>>>> { > >>>>> mfn_t mfn = _mfn(pfn); > >>>>> - unsigned int i, type; > >>>>> + unsigned int i, type, perms = IOMMUF_readable | IOMMUF_writable; > >>>>> > >>>>> /* > >>>>> * Set up 1:1 mapping for dom0. Default to include only conventional RAM > >>>>> @@ -289,44 +289,60 @@ static bool __hwdom_init hwdom_iommu_map > >>>>> * that fall in unusable ranges for PV Dom0. 
> >>>>> */ > >>>>> if ( (pfn > max_pfn && !mfn_valid(mfn)) || xen_in_range(pfn) ) > >>>>> - return false; > >>>>> + return 0; > >>>>> > >>>>> switch ( type = page_get_ram_type(mfn) ) > >>>>> { > >>>>> case RAM_TYPE_UNUSABLE: > >>>>> - return false; > >>>>> + return 0; > >>>>> > >>>>> case RAM_TYPE_CONVENTIONAL: > >>>>> if ( iommu_hwdom_strict ) > >>>>> - return false; > >>>>> + return 0; > >>>>> break; > >>>>> > >>>>> default: > >>>>> if ( type & RAM_TYPE_RESERVED ) > >>>>> { > >>>>> if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved ) > >>>>> - return false; > >>>>> + perms = 0; > >>>>> } > >>>>> - else if ( is_hvm_domain(d) || !iommu_hwdom_inclusive || pfn > max_pfn ) > >>>>> - return false; > >>>>> + else if ( is_hvm_domain(d) ) > >>>>> + return 0; > >>>>> + else if ( !iommu_hwdom_inclusive || pfn > max_pfn ) > >>>>> + perms = 0; > >>>>> } > >>>>> > >>>>> /* Check that it doesn't overlap with the Interrupt Address Range. */ > >>>>> if ( pfn >= 0xfee00 && pfn <= 0xfeeff ) > >>>>> - return false; > >>>>> + return 0; > >>>>> /* ... or the IO-APIC */ > >>>>> - for ( i = 0; has_vioapic(d) && i < d->arch.hvm.nr_vioapics; i++ ) > >>>>> - if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) > >>>>> - return false; > >>>>> + if ( has_vioapic(d) ) > >>>>> + { > >>>>> + for ( i = 0; i < d->arch.hvm.nr_vioapics; i++ ) > >>>>> + if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) > >>>>> + return 0; > >>>>> + } > >>>>> + else if ( is_pv_domain(d) ) > >>>>> + { > >>>>> + /* > >>>>> + * Be consistent with CPU mappings: Dom0 is permitted to establish r/o > >>>>> + * ones there, so it should also have such established for IOMMUs. > >>>>> + */ > >>>>> + for ( i = 0; i < nr_ioapics; i++ ) > >>>>> + if ( pfn == PFN_DOWN(mp_ioapics[i].mpc_apicaddr) ) > >>>>> + return rangeset_contains_singleton(mmio_ro_ranges, pfn) > >>>>> + ? 
IOMMUF_readable : 0; > >>>> > >>>> If we really are after consistency with CPU side mappings, we should > >>>> likely take the whole contents of mmio_ro_ranges and d->iomem_caps > >>>> into account, not just the pages belonging to the IO-APIC? > >>>> > >>>> There could also be HPET pages mapped as RO for PV. > >>> > >>> Hmm. This would be a yet bigger functional change, but indeed would further > >>> improve consistency. But shouldn't we then also establish r/w mappings for > >>> stuff in ->iomem_caps but not in mmio_ro_ranges? This would feel like going > >>> too far ... > >> > >> FTAOD I didn't mean to say that I think such mappings shouldn't be there; > >> I have been of the opinion that e.g. I/O directly to/from the linear > >> frame buffer of a graphics device should in principle be permitted. But > >> which specific mappings to put in place can imo not be derived from > >> ->iomem_caps, as we merely subtract certain ranges after initially having > >> set all bits in it. Besides ranges not mapping any MMIO, even something > >> like the PCI ECAM ranges (parts of which we may also force to r/o, and > >> which we would hence cover here if I followed your suggestion) are > >> questionable in this regard. > > > > Right, ->iomem_caps is indeed too wide for our purpose. What > > about using something like: > > > > else if ( is_pv_domain(d) ) > > { > > if ( !iomem_access_permitted(d, pfn, pfn) ) > > return 0; > > We can't return 0 here (as RAM pages also make it here when > !iommu_hwdom_strict), so I can at best take this as a vague outline > of what you really mean. And I don't want to rely on RAM pages being > (imo wrongly) represented by set bits in Dom0's iomem_caps. Well, yes, my suggestion was taking into account that ->iomem_caps for dom0 has mostly holes for things that shouldn't be mapped, but otherwise contains everything else as allowed (including RAM). We could instead do: else if ( is_pv_domain(d) && type != RAM_TYPE_CONVENTIONAL ) { ... 
So that we don't rely on RAM being 'allowed' in ->iomem_caps? > > if ( rangeset_contains_singleton(mmio_ro_ranges, pfn) ) > > return IOMMUF_readable; > > } > > > > That would get us a bit closer to allowed CPU side mappings, and we > > don't need to special case IO-APIC or HPET addresses as those are > > already added to ->iomem_caps or mmio_ro_ranges respectively by > > dom0_setup_permissions(). > > This won't fit in a region of code framed by a (split) comment > saying "Check that it doesn't overlap with ...". Hence if anything > I could put something like this further down. Yet even then the > question remains what to do with ranges which pass > iomem_access_permitted() but > - aren't really MMIO, > - are inside MMCFG, > - are otherwise special. > > Or did you perhaps mean to suggest something like > > else if ( is_pv_domain(d) && iomem_access_permitted(d, pfn, pfn) && > rangeset_contains_singleton(mmio_ro_ranges, pfn) ) > return IOMMUF_readable; I don't think this would be fully correct, as we would still allow mappings of IO-APIC pages explicitly banned in ->iomem_caps by not handling those? > ? Then there would only remain the question of whether mapping r/o > MMCFG pages is okay (I don't think it is), but that could then be > special-cased similar to what's done further down for vPCI (by not > returning in the "else if", but merely updating "perms"). Well part of the point of this is to make CPU and Device mappings more similar. I don't think devices have any business in poking at the MMCFG range, so it's fine to explicitly ban that range. But I would have also said the same for IO-APIC pages, so I'm unsure why are IO-APIC pages fine to be mapped RO, but not the MMCFG range. Thanks, Roger.
On 04.05.2022 14:01, Roger Pau Monné wrote: > On Wed, May 04, 2022 at 12:51:25PM +0200, Jan Beulich wrote: >> On 04.05.2022 12:30, Roger Pau Monné wrote: >>> Right, ->iomem_caps is indeed too wide for our purpose. What >>> about using something like: >>> >>> else if ( is_pv_domain(d) ) >>> { >>> if ( !iomem_access_permitted(d, pfn, pfn) ) >>> return 0; >> >> We can't return 0 here (as RAM pages also make it here when >> !iommu_hwdom_strict), so I can at best take this as a vague outline >> of what you really mean. And I don't want to rely on RAM pages being >> (imo wrongly) represented by set bits in Dom0's iomem_caps. > > Well, yes, my suggestion was taking into account that ->iomem_caps for > dom0 has mostly holes for things that shouldn't be mapped, but > otherwise contains everything else as allowed (including RAM). > > We could instead do: > > else if ( is_pv_domain(d) && type != RAM_TYPE_CONVENTIONAL ) > { > ... > > So that we don't rely on RAM being 'allowed' in ->iomem_caps? This would feel to me like excess special casing. >>> if ( rangeset_contains_singleton(mmio_ro_ranges, pfn) ) >>> return IOMMUF_readable; >>> } >>> >>> That would get us a bit closer to allowed CPU side mappings, and we >>> don't need to special case IO-APIC or HPET addresses as those are >>> already added to ->iomem_caps or mmio_ro_ranges respectively by >>> dom0_setup_permissions(). >> >> This won't fit in a region of code framed by a (split) comment >> saying "Check that it doesn't overlap with ...". Hence if anything >> I could put something like this further down. Yet even then the >> question remains what to do with ranges which pass >> iomem_access_permitted() but >> - aren't really MMIO, >> - are inside MMCFG, >> - are otherwise special. 
>> >> Or did you perhaps mean to suggest something like >> >> else if ( is_pv_domain(d) && iomem_access_permitted(d, pfn, pfn) && >> rangeset_contains_singleton(mmio_ro_ranges, pfn) ) >> return IOMMUF_readable; > > I don't think this would be fully correct, as we would still allow > mappings of IO-APIC pages explicitly banned in ->iomem_caps by not > handling those? CPU side mappings don't deal with the IO-APICs specifically. They only care about iomem_caps and mmio_ro_ranges. Hence explicitly banned IO-APIC pages cannot be mapped there either. (Of course we only do such banning if IO-APIC pages weren't possible to represent in mmio_ro_ranges, which should effectively be never.) >> ? Then there would only remain the question of whether mapping r/o >> MMCFG pages is okay (I don't think it is), but that could then be >> special-cased similar to what's done further down for vPCI (by not >> returning in the "else if", but merely updating "perms"). > > Well part of the point of this is to make CPU and Device mappings > more similar. I don't think devices have any business in poking at > the MMCFG range, so it's fine to explicitly ban that range. But I > would have also said the same for IO-APIC pages, so I'm unsure why are > IO-APIC pages fine to be mapped RO, but not the MMCFG range. I wouldn't have wanted to allow r/o mappings of the IO-APICs, but Linux plus the ACPI tables of certain vendors require us to permit this. If we didn't, Dom0 would crash there during boot. Jan
On Wed, May 04, 2022 at 02:12:58PM +0200, Jan Beulich wrote: > On 04.05.2022 14:01, Roger Pau Monné wrote: > > On Wed, May 04, 2022 at 12:51:25PM +0200, Jan Beulich wrote: > >> On 04.05.2022 12:30, Roger Pau Monné wrote: > >>> Right, ->iomem_caps is indeed too wide for our purpose. What > >>> about using something like: > >>> > >>> else if ( is_pv_domain(d) ) > >>> { > >>> if ( !iomem_access_permitted(d, pfn, pfn) ) > >>> return 0; > >> > >> We can't return 0 here (as RAM pages also make it here when > >> !iommu_hwdom_strict), so I can at best take this as a vague outline > >> of what you really mean. And I don't want to rely on RAM pages being > >> (imo wrongly) represented by set bits in Dom0's iomem_caps. > > > > Well, yes, my suggestion was taking into account that ->iomem_caps for > > dom0 has mostly holes for things that shouldn't be mapped, but > > otherwise contains everything else as allowed (including RAM). > > > > We could instead do: > > > > else if ( is_pv_domain(d) && type != RAM_TYPE_CONVENTIONAL ) > > { > > ... > > > > So that we don't rely on RAM being 'allowed' in ->iomem_caps? > > This would feel to me like excess special casing. What about placing this in the 'default:' label on the type switch a bit above? > >>> if ( rangeset_contains_singleton(mmio_ro_ranges, pfn) ) > >>> return IOMMUF_readable; > >>> } > >>> > >>> That would get us a bit closer to allowed CPU side mappings, and we > >>> don't need to special case IO-APIC or HPET addresses as those are > >>> already added to ->iomem_caps or mmio_ro_ranges respectively by > >>> dom0_setup_permissions(). > >> > >> This won't fit in a region of code framed by a (split) comment > >> saying "Check that it doesn't overlap with ...". Hence if anything > >> I could put something like this further down. Yet even then the > >> question remains what to do with ranges which pass > >> iomem_access_permitted() but > >> - aren't really MMIO, > >> - are inside MMCFG, > >> - are otherwise special. 
> >> > >> Or did you perhaps mean to suggest something like > >> > >> else if ( is_pv_domain(d) && iomem_access_permitted(d, pfn, pfn) && > >> rangeset_contains_singleton(mmio_ro_ranges, pfn) ) > >> return IOMMUF_readable; > > > > I don't think this would be fully correct, as we would still allow > > mappings of IO-APIC pages explicitly banned in ->iomem_caps by not > > handling those? > > CPU side mappings don't deal with the IO-APICs specifically. They only > care about iomem_caps and mmio_ro_ranges. Hence explicitly banned > IO-APIC pages cannot be mapped there either. (Of course we only do > such banning if IO-APIC pages weren't possible to represent in > mmio_ro_ranges, which should effectively be never.) I think I haven't expressed myself correctly. This construct won't return 0 for pfns not in iomem_caps, and hence could allow mapping of addresses not in iomem_caps? > >> ? Then there would only remain the question of whether mapping r/o > >> MMCFG pages is okay (I don't think it is), but that could then be > >> special-cased similar to what's done further down for vPCI (by not > >> returning in the "else if", but merely updating "perms"). > > > > Well part of the point of this is to make CPU and Device mappings > > more similar. I don't think devices have any business in poking at > > the MMCFG range, so it's fine to explicitly ban that range. But I > > would have also said the same for IO-APIC pages, so I'm unsure why are > > IO-APIC pages fine to be mapped RO, but not the MMCFG range. > > I wouldn't have wanted to allow r/o mappings of the IO-APICs, but > Linux plus the ACPI tables of certain vendors require us to permit > this. If we didn't, Dom0 would crash there during boot. Right, but those are required for the CPU only. I think it's a fine goal to try to have similar mappings for CPU and Devices, and then that would also cover MMCFG in the PV case. 
Or else it's fine to assume CPU vs Device mappings will be slightly different, and then don't add any mappings for IO-APIC, HPET or MMCFG to the Device page tables (likely there's more that could be added here). Thanks, Roger.
On 04.05.2022 15:00, Roger Pau Monné wrote: > On Wed, May 04, 2022 at 02:12:58PM +0200, Jan Beulich wrote: >> On 04.05.2022 14:01, Roger Pau Monné wrote: >>> On Wed, May 04, 2022 at 12:51:25PM +0200, Jan Beulich wrote: >>>> On 04.05.2022 12:30, Roger Pau Monné wrote: >>>>> Right, ->iomem_caps is indeed too wide for our purpose. What >>>>> about using something like: >>>>> >>>>> else if ( is_pv_domain(d) ) >>>>> { >>>>> if ( !iomem_access_permitted(d, pfn, pfn) ) >>>>> return 0; >>>> >>>> We can't return 0 here (as RAM pages also make it here when >>>> !iommu_hwdom_strict), so I can at best take this as a vague outline >>>> of what you really mean. And I don't want to rely on RAM pages being >>>> (imo wrongly) represented by set bits in Dom0's iomem_caps. >>> >>> Well, yes, my suggestion was taking into account that ->iomem_caps for >>> dom0 has mostly holes for things that shouldn't be mapped, but >>> otherwise contains everything else as allowed (including RAM). >>> >>> We could instead do: >>> >>> else if ( is_pv_domain(d) && type != RAM_TYPE_CONVENTIONAL ) >>> { >>> ... >>> >>> So that we don't rely on RAM being 'allowed' in ->iomem_caps? >> >> This would feel to me like excess special casing. > > What about placing this in the 'default:' label on the type switch a > bit above? I'd really like to stick to the present layout of where the special casing is done, with PV and PVH logic at least next to each other. I continue to think the construct I suggested (still visible below) would do. >>>>> if ( rangeset_contains_singleton(mmio_ro_ranges, pfn) ) >>>>> return IOMMUF_readable; >>>>> } >>>>> >>>>> That would get us a bit closer to allowed CPU side mappings, and we >>>>> don't need to special case IO-APIC or HPET addresses as those are >>>>> already added to ->iomem_caps or mmio_ro_ranges respectively by >>>>> dom0_setup_permissions(). >>>> >>>> This won't fit in a region of code framed by a (split) comment >>>> saying "Check that it doesn't overlap with ...". 
Hence if anything >>>> I could put something like this further down. Yet even then the >>>> question remains what to do with ranges which pass >>>> iomem_access_permitted() but >>>> - aren't really MMIO, >>>> - are inside MMCFG, >>>> - are otherwise special. >>>> >>>> Or did you perhaps mean to suggest something like >>>> >>>> else if ( is_pv_domain(d) && iomem_access_permitted(d, pfn, pfn) && >>>> rangeset_contains_singleton(mmio_ro_ranges, pfn) ) >>>> return IOMMUF_readable; >>> >>> I don't think this would be fully correct, as we would still allow >>> mappings of IO-APIC pages explicitly banned in ->iomem_caps by not >>> handling those? >> >> CPU side mappings don't deal with the IO-APICs specifically. They only >> care about iomem_caps and mmio_ro_ranges. Hence explicitly banned >> IO-APIC pages cannot be mapped there either. (Of course we only do >> such banning if IO-APIC pages weren't possible to represent in >> mmio_ro_ranges, which should effectively be never.) > > I think I haven't expressed myself correctly. > > This construct won't return 0 for pfns not in iomem_caps, and hence > could allow mapping of addresses not in iomem_caps? I'm afraid I don't understand: There's an iomem_access_permitted() in the conditional. How would this allow mapping pages outside of iomem_caps? The default case higher up has already forced perms to zero for any non-RAM page (unless iommu_hwdom_inclusive). >>>> ? Then there would only remain the question of whether mapping r/o >>>> MMCFG pages is okay (I don't think it is), but that could then be >>>> special-cased similar to what's done further down for vPCI (by not >>>> returning in the "else if", but merely updating "perms"). >>> >>> Well part of the point of this is to make CPU and Device mappings >>> more similar. I don't think devices have any business in poking at >>> the MMCFG range, so it's fine to explicitly ban that range. 
But I >>> would have also said the same for IO-APIC pages, so I'm unsure why are >>> IO-APIC pages fine to be mapped RO, but not the MMCFG range. >> >> I wouldn't have wanted to allow r/o mappings of the IO-APICs, but >> Linux plus the ACPI tables of certain vendors require us to permit >> this. If we didn't, Dom0 would crash there during boot. > > Right, but those are required for the CPU only. I think it's a fine > goal to try to have similar mappings for CPU and Devices, and then > that would also cover MMCFG in the PV case. Or else it fine to assume > CPU vs Device mappings will be slightly different, and then don't add > any mappings for IO-APIC, HPET or MMCFG to the Device page tables > (likely there's more that could be added here). It being different is what Andrew looks to strongly dislike. And I agree with this up to a certain point, i.e. I'm having a hard time seeing why we should put in MMCFG mappings just for this reason. But if consensus was that consistency across all types of MMIO is the goal, then I could live with also making MMCFG mappings ... Jan
On Wed, May 04, 2022 at 03:19:16PM +0200, Jan Beulich wrote: > On 04.05.2022 15:00, Roger Pau Monné wrote: > > On Wed, May 04, 2022 at 02:12:58PM +0200, Jan Beulich wrote: > >> On 04.05.2022 14:01, Roger Pau Monné wrote: > >>> On Wed, May 04, 2022 at 12:51:25PM +0200, Jan Beulich wrote: > >>>> On 04.05.2022 12:30, Roger Pau Monné wrote: > >>>>> Right, ->iomem_caps is indeed too wide for our purpose. What > >>>>> about using something like: > >>>>> > >>>>> else if ( is_pv_domain(d) ) > >>>>> { > >>>>> if ( !iomem_access_permitted(d, pfn, pfn) ) > >>>>> return 0; > >>>> > >>>> We can't return 0 here (as RAM pages also make it here when > >>>> !iommu_hwdom_strict), so I can at best take this as a vague outline > >>>> of what you really mean. And I don't want to rely on RAM pages being > >>>> (imo wrongly) represented by set bits in Dom0's iomem_caps. > >>> > >>> Well, yes, my suggestion was taking into account that ->iomem_caps for > >>> dom0 has mostly holes for things that shouldn't be mapped, but > >>> otherwise contains everything else as allowed (including RAM). > >>> > >>> We could instead do: > >>> > >>> else if ( is_pv_domain(d) && type != RAM_TYPE_CONVENTIONAL ) > >>> { > >>> ... > >>> > >>> So that we don't rely on RAM being 'allowed' in ->iomem_caps? > >> > >> This would feel to me like excess special casing. > > > > What about placing this in the 'default:' label on the type switch a > > bit above? > > I'd really like to stick to the present layout of where the special > casing is done, with PV and PVH logic at least next to each other. I > continue to think the construct I suggested (still visible below) > would do. 
> > >>>>> if ( rangeset_contains_singleton(mmio_ro_ranges, pfn) ) > >>>>> return IOMMUF_readable; > >>>>> } > >>>>> > >>>>> That would get us a bit closer to allowed CPU side mappings, and we > >>>>> don't need to special case IO-APIC or HPET addresses as those are > >>>>> already added to ->iomem_caps or mmio_ro_ranges respectively by > >>>>> dom0_setup_permissions(). > >>>> > >>>> This won't fit in a region of code framed by a (split) comment > >>>> saying "Check that it doesn't overlap with ...". Hence if anything > >>>> I could put something like this further down. Yet even then the > >>>> question remains what to do with ranges which pass > >>>> iomem_access_permitted() but > >>>> - aren't really MMIO, > >>>> - are inside MMCFG, > >>>> - are otherwise special. > >>>> > >>>> Or did you perhaps mean to suggest something like > >>>> > >>>> else if ( is_pv_domain(d) && iomem_access_permitted(d, pfn, pfn) && > >>>> rangeset_contains_singleton(mmio_ro_ranges, pfn) ) > >>>> return IOMMUF_readable; > >>> > >>> I don't think this would be fully correct, as we would still allow > >>> mappings of IO-APIC pages explicitly banned in ->iomem_caps by not > >>> handling those? > >> > >> CPU side mappings don't deal with the IO-APICs specifically. They only > >> care about iomem_caps and mmio_ro_ranges. Hence explicitly banned > >> IO-APIC pages cannot be mapped there either. (Of course we only do > >> such banning if IO-APIC pages weren't possible to represent in > >> mmio_ro_ranges, which should effectively be never.) > > > > I think I haven't expressed myself correctly. > > > > This construct won't return 0 for pfns not in iomem_caps, and hence > > could allow mapping of addresses not in iomem_caps? > > I'm afraid I don't understand: There's an iomem_access_permitted() > in the conditional. How would this allow mapping pages outside of > iomem_caps? The default case higher up has already forced perms to > zero for any non-RAM page (unless iommu_hwdom_inclusive). 
It was my understanding that when using iommu_hwdom_inclusive (or iommu_hwdom_reserved if the IO-APIC page is a reserved region) we still want to deny access to the IO-APIC page if it's not in iomem_caps, and the proposed conditional won't do that. So I guess the discussion is really whether iommu_hwdom_{inclusive,reserved} take precedence over ->iomem_caps? It seems a bit inconsistent IMO to enforce mmio_ro_ranges but not ->iomem_caps when using iommu_hwdom_{inclusive,reserved}. > >>>> ? Then there would only remain the question of whether mapping r/o > >>>> MMCFG pages is okay (I don't think it is), but that could then be > >>>> special-cased similar to what's done further down for vPCI (by not > >>>> returning in the "else if", but merely updating "perms"). > >>> > >>> Well part of the point of this is to make CPU and Device mappings > >>> more similar. I don't think devices have any business in poking at > >>> the MMCFG range, so it's fine to explicitly ban that range. But I > >>> would have also said the same for IO-APIC pages, so I'm unsure why are > >>> IO-APIC pages fine to be mapped RO, but not the MMCFG range. > >> > >> I wouldn't have wanted to allow r/o mappings of the IO-APICs, but > >> Linux plus the ACPI tables of certain vendors require us to permit > >> this. If we didn't, Dom0 would crash there during boot. > > > > Right, but those are required for the CPU only. I think it's a fine > > goal to try to have similar mappings for CPU and Devices, and then > > that would also cover MMCFG in the PV case. Or else it fine to assume > > CPU vs Device mappings will be slightly different, and then don't add > > any mappings for IO-APIC, HPET or MMCFG to the Device page tables > > (likely there's more that could be added here). > > It being different is what Andrew looks to strongly dislike. And I agree > with this up to a certain point, i.e. I'm having a hard time seeing why > we should put in MMCFG mappings just for this reason. 
But if consensus > was that consistency across all types of MMIO is the goal, then I could > live with also making MMCFG mappings ... For HVM/PVH I think we want to be consistent as long as it's doable (we can't provide devices access to the emulated MMCFG there for example). For PV I guess it's also a worthy goal if it makes the code easier. PV (and PV dom0 especially) is already a very custom platform with weird properties (like the mapping of the IO-APIC and HPET regions RO or no mappings at all). Thanks, Roger.
On 04.05.2022 15:46, Roger Pau Monné wrote: > On Wed, May 04, 2022 at 03:19:16PM +0200, Jan Beulich wrote: >> On 04.05.2022 15:00, Roger Pau Monné wrote: >>> On Wed, May 04, 2022 at 02:12:58PM +0200, Jan Beulich wrote: >>>> On 04.05.2022 14:01, Roger Pau Monné wrote: >>>>> On Wed, May 04, 2022 at 12:51:25PM +0200, Jan Beulich wrote: >>>>>> On 04.05.2022 12:30, Roger Pau Monné wrote: >>>>>>> Right, ->iomem_caps is indeed too wide for our purpose. What >>>>>>> about using something like: >>>>>>> >>>>>>> else if ( is_pv_domain(d) ) >>>>>>> { >>>>>>> if ( !iomem_access_permitted(d, pfn, pfn) ) >>>>>>> return 0; >>>>>> >>>>>> We can't return 0 here (as RAM pages also make it here when >>>>>> !iommu_hwdom_strict), so I can at best take this as a vague outline >>>>>> of what you really mean. And I don't want to rely on RAM pages being >>>>>> (imo wrongly) represented by set bits in Dom0's iomem_caps. >>>>> >>>>> Well, yes, my suggestion was taking into account that ->iomem_caps for >>>>> dom0 has mostly holes for things that shouldn't be mapped, but >>>>> otherwise contains everything else as allowed (including RAM). >>>>> >>>>> We could instead do: >>>>> >>>>> else if ( is_pv_domain(d) && type != RAM_TYPE_CONVENTIONAL ) >>>>> { >>>>> ... >>>>> >>>>> So that we don't rely on RAM being 'allowed' in ->iomem_caps? >>>> >>>> This would feel to me like excess special casing. >>> >>> What about placing this in the 'default:' label on the type switch a >>> bit above? >> >> I'd really like to stick to the present layout of where the special >> casing is done, with PV and PVH logic at least next to each other. I >> continue to think the construct I suggested (still visible below) >> would do. 
>> >>>>>>> if ( rangeset_contains_singleton(mmio_ro_ranges, pfn) ) >>>>>>> return IOMMUF_readable; >>>>>>> } >>>>>>> >>>>>>> That would get us a bit closer to allowed CPU side mappings, and we >>>>>>> don't need to special case IO-APIC or HPET addresses as those are >>>>>>> already added to ->iomem_caps or mmio_ro_ranges respectively by >>>>>>> dom0_setup_permissions(). >>>>>> >>>>>> This won't fit in a region of code framed by a (split) comment >>>>>> saying "Check that it doesn't overlap with ...". Hence if anything >>>>>> I could put something like this further down. Yet even then the >>>>>> question remains what to do with ranges which pass >>>>>> iomem_access_permitted() but >>>>>> - aren't really MMIO, >>>>>> - are inside MMCFG, >>>>>> - are otherwise special. >>>>>> >>>>>> Or did you perhaps mean to suggest something like >>>>>> >>>>>> else if ( is_pv_domain(d) && iomem_access_permitted(d, pfn, pfn) && >>>>>> rangeset_contains_singleton(mmio_ro_ranges, pfn) ) >>>>>> return IOMMUF_readable; >>>>> >>>>> I don't think this would be fully correct, as we would still allow >>>>> mappings of IO-APIC pages explicitly banned in ->iomem_caps by not >>>>> handling those? >>>> >>>> CPU side mappings don't deal with the IO-APICs specifically. They only >>>> care about iomem_caps and mmio_ro_ranges. Hence explicitly banned >>>> IO-APIC pages cannot be mapped there either. (Of course we only do >>>> such banning if IO-APIC pages weren't possible to represent in >>>> mmio_ro_ranges, which should effectively be never.) >>> >>> I think I haven't expressed myself correctly. >>> >>> This construct won't return 0 for pfns not in iomem_caps, and hence >>> could allow mapping of addresses not in iomem_caps? >> >> I'm afraid I don't understand: There's an iomem_access_permitted() >> in the conditional. How would this allow mapping pages outside of >> iomem_caps? The default case higher up has already forced perms to >> zero for any non-RAM page (unless iommu_hwdom_inclusive). 
> > It was my understanding that when using iommu_hwdom_inclusive (or > iommu_hwdom_reserved if the IO-APIC page is a reserved region) we > still want to deny access to the IO-APIC page if it's not in > iomem_caps, and the proposed conditional won't do that. > > So I guess the discussion is really whether > iommu_hwdom_{inclusive,reserved} take precedence over ->iomem_caps? I think the intended interaction is not spelled out anywhere. I also think that it is to be expected for such interaction to be quirky; after all the options themselves are quirks. > It seems a bit inconsistent IMO to enforce mmio_ro_ranges but not > ->iomem_caps when using iommu_hwdom_{inclusive,reserved}. In a way, yes. But as said before - it's highly theoretical for IO-APIC pages to not be in ->iomem_caps (and this case also won't go silently). Jan
On Wed, May 04, 2022 at 03:55:09PM +0200, Jan Beulich wrote: > On 04.05.2022 15:46, Roger Pau Monné wrote: > > On Wed, May 04, 2022 at 03:19:16PM +0200, Jan Beulich wrote: > >> On 04.05.2022 15:00, Roger Pau Monné wrote: > >>> On Wed, May 04, 2022 at 02:12:58PM +0200, Jan Beulich wrote: > >>>> On 04.05.2022 14:01, Roger Pau Monné wrote: > >>>>> On Wed, May 04, 2022 at 12:51:25PM +0200, Jan Beulich wrote: > >>>>>> On 04.05.2022 12:30, Roger Pau Monné wrote: > >>>>>>> Right, ->iomem_caps is indeed too wide for our purpose. What > >>>>>>> about using something like: > >>>>>>> > >>>>>>> else if ( is_pv_domain(d) ) > >>>>>>> { > >>>>>>> if ( !iomem_access_permitted(d, pfn, pfn) ) > >>>>>>> return 0; > >>>>>> > >>>>>> We can't return 0 here (as RAM pages also make it here when > >>>>>> !iommu_hwdom_strict), so I can at best take this as a vague outline > >>>>>> of what you really mean. And I don't want to rely on RAM pages being > >>>>>> (imo wrongly) represented by set bits in Dom0's iomem_caps. > >>>>> > >>>>> Well, yes, my suggestion was taking into account that ->iomem_caps for > >>>>> dom0 has mostly holes for things that shouldn't be mapped, but > >>>>> otherwise contains everything else as allowed (including RAM). > >>>>> > >>>>> We could instead do: > >>>>> > >>>>> else if ( is_pv_domain(d) && type != RAM_TYPE_CONVENTIONAL ) > >>>>> { > >>>>> ... > >>>>> > >>>>> So that we don't rely on RAM being 'allowed' in ->iomem_caps? > >>>> > >>>> This would feel to me like excess special casing. > >>> > >>> What about placing this in the 'default:' label on the type switch a > >>> bit above? > >> > >> I'd really like to stick to the present layout of where the special > >> casing is done, with PV and PVH logic at least next to each other. I > >> continue to think the construct I suggested (still visible below) > >> would do. 
> >> > >>>>>>> if ( rangeset_contains_singleton(mmio_ro_ranges, pfn) ) > >>>>>>> return IOMMUF_readable; > >>>>>>> } > >>>>>>> > >>>>>>> That would get us a bit closer to allowed CPU side mappings, and we > >>>>>>> don't need to special case IO-APIC or HPET addresses as those are > >>>>>>> already added to ->iomem_caps or mmio_ro_ranges respectively by > >>>>>>> dom0_setup_permissions(). > >>>>>> > >>>>>> This won't fit in a region of code framed by a (split) comment > >>>>>> saying "Check that it doesn't overlap with ...". Hence if anything > >>>>>> I could put something like this further down. Yet even then the > >>>>>> question remains what to do with ranges which pass > >>>>>> iomem_access_permitted() but > >>>>>> - aren't really MMIO, > >>>>>> - are inside MMCFG, > >>>>>> - are otherwise special. > >>>>>> > >>>>>> Or did you perhaps mean to suggest something like > >>>>>> > >>>>>> else if ( is_pv_domain(d) && iomem_access_permitted(d, pfn, pfn) && > >>>>>> rangeset_contains_singleton(mmio_ro_ranges, pfn) ) > >>>>>> return IOMMUF_readable; > >>>>> > >>>>> I don't think this would be fully correct, as we would still allow > >>>>> mappings of IO-APIC pages explicitly banned in ->iomem_caps by not > >>>>> handling those? > >>>> > >>>> CPU side mappings don't deal with the IO-APICs specifically. They only > >>>> care about iomem_caps and mmio_ro_ranges. Hence explicitly banned > >>>> IO-APIC pages cannot be mapped there either. (Of course we only do > >>>> such banning if IO-APIC pages weren't possible to represent in > >>>> mmio_ro_ranges, which should effectively be never.) > >>> > >>> I think I haven't expressed myself correctly. > >>> > >>> This construct won't return 0 for pfns not in iomem_caps, and hence > >>> could allow mapping of addresses not in iomem_caps? > >> > >> I'm afraid I don't understand: There's an iomem_access_permitted() > >> in the conditional. How would this allow mapping pages outside of > >> iomem_caps? 
The default case higher up has already forced perms to > >> zero for any non-RAM page (unless iommu_hwdom_inclusive). > > > > It was my understanding that when using iommu_hwdom_inclusive (or > > iommu_hwdom_reserved if the IO-APIC page is a reserved region) we > > still want to deny access to the IO-APIC page if it's not in > > iomem_caps, and the proposed conditional won't do that. > > > > So I guess the discussion is really whether > > iommu_hwdom_{inclusive,reserved} take precedence over ->iomem_caps? > > I think the intended interaction is not spelled out anywhere. I > also think that it is to be expected for such interaction to be > quirky; after all the options themselves are quirks. > > > It seems a bit inconsistent IMO to enforce mmio_ro_ranges but not > > ->iomem_caps when using iommu_hwdom_{inclusive,reserved}. > > In a way, yes. But as said before - it's highly theoretical for > IO-APIC pages to not be in ->iomem_caps (and this case also won't > go silently). My idea was for whatever check we add for PV to also cover HPET, which is in a similar situation (can be either blocked in ->iomem_caps or in mmio_ro_ranges). Thanks, Roger.
--- a/xen/drivers/passthrough/x86/iommu.c +++ b/xen/drivers/passthrough/x86/iommu.c @@ -275,12 +275,12 @@ void iommu_identity_map_teardown(struct } } -static bool __hwdom_init hwdom_iommu_map(const struct domain *d, - unsigned long pfn, - unsigned long max_pfn) +static unsigned int __hwdom_init hwdom_iommu_map(const struct domain *d, + unsigned long pfn, + unsigned long max_pfn) { mfn_t mfn = _mfn(pfn); - unsigned int i, type; + unsigned int i, type, perms = IOMMUF_readable | IOMMUF_writable; /* * Set up 1:1 mapping for dom0. Default to include only conventional RAM @@ -289,44 +289,60 @@ static bool __hwdom_init hwdom_iommu_map * that fall in unusable ranges for PV Dom0. */ if ( (pfn > max_pfn && !mfn_valid(mfn)) || xen_in_range(pfn) ) - return false; + return 0; switch ( type = page_get_ram_type(mfn) ) { case RAM_TYPE_UNUSABLE: - return false; + return 0; case RAM_TYPE_CONVENTIONAL: if ( iommu_hwdom_strict ) - return false; + return 0; break; default: if ( type & RAM_TYPE_RESERVED ) { if ( !iommu_hwdom_inclusive && !iommu_hwdom_reserved ) - return false; + perms = 0; } - else if ( is_hvm_domain(d) || !iommu_hwdom_inclusive || pfn > max_pfn ) - return false; + else if ( is_hvm_domain(d) ) + return 0; + else if ( !iommu_hwdom_inclusive || pfn > max_pfn ) + perms = 0; } /* Check that it doesn't overlap with the Interrupt Address Range. */ if ( pfn >= 0xfee00 && pfn <= 0xfeeff ) - return false; + return 0; /* ... or the IO-APIC */ - for ( i = 0; has_vioapic(d) && i < d->arch.hvm.nr_vioapics; i++ ) - if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) - return false; + if ( has_vioapic(d) ) + { + for ( i = 0; i < d->arch.hvm.nr_vioapics; i++ ) + if ( pfn == PFN_DOWN(domain_vioapic(d, i)->base_address) ) + return 0; + } + else if ( is_pv_domain(d) ) + { + /* + * Be consistent with CPU mappings: Dom0 is permitted to establish r/o + * ones there, so it should also have such established for IOMMUs. 
+ */ + for ( i = 0; i < nr_ioapics; i++ ) + if ( pfn == PFN_DOWN(mp_ioapics[i].mpc_apicaddr) ) + return rangeset_contains_singleton(mmio_ro_ranges, pfn) + ? IOMMUF_readable : 0; + } /* * ... or the PCIe MCFG regions. * TODO: runtime added MMCFG regions are not checked to make sure they * don't overlap with already mapped regions, thus preventing trapping. */ if ( has_vpci(d) && vpci_is_mmcfg_address(d, pfn_to_paddr(pfn)) ) - return false; + return 0; - return true; + return perms; } void __hwdom_init arch_iommu_hwdom_init(struct domain *d) @@ -368,15 +384,19 @@ void __hwdom_init arch_iommu_hwdom_init( for ( ; i < top; i++ ) { unsigned long pfn = pdx_to_pfn(i); + unsigned int perms = hwdom_iommu_map(d, pfn, max_pfn); int rc; - if ( !hwdom_iommu_map(d, pfn, max_pfn) ) + if ( !perms ) rc = 0; else if ( paging_mode_translate(d) ) - rc = p2m_add_identity_entry(d, pfn, p2m_access_rw, 0); + rc = p2m_add_identity_entry(d, pfn, + perms & IOMMUF_writable ? p2m_access_rw + : p2m_access_r, + 0); else rc = iommu_map(d, _dfn(pfn), _mfn(pfn), 1ul << PAGE_ORDER_4K, - IOMMUF_readable | IOMMUF_writable, &flush_flags); + perms, &flush_flags); if ( rc ) printk(XENLOG_WARNING "%pd: identity %smapping of %lx failed: %d\n",
While already the case for PVH, there's no reason to treat PV differently here, though of course the addresses get taken from another source in this case. Except that, to match CPU side mappings, by default we permit r/o ones. This then also means we now deal consistently with IO-APICs whose MMIO is or is not covered by E820 reserved regions. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- [integrated] v1: Integrate into series. [standalone] v2: Keep IOMMU mappings in sync with CPU ones.