Message ID | 20240227223501.28475-6-vikram.garhwal@amd.com (mailing list archive)
---|---
State | New, archived
Series | Xen: support grant mappings.
On Tue, 27 Feb 2024, Vikram Garhwal wrote: > From: Juergen Gross <jgross@suse.com> > > In order to support mapping and unmapping guest memory dynamically to > and from qemu during address_space_[un]map() operations add the map() > and unmap() callbacks to MemoryRegionOps. > > Those will be used e.g. for Xen grant mappings when performing guest > I/Os. > > Signed-off-by: Juergen Gross <jgross@suse.com> > Signed-off-by: Vikram Garhwal <vikram.garhwal@amd.com> Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> > --- > include/exec/memory.h | 21 ++++++++++++++++++ > system/physmem.c | 50 +++++++++++++++++++++++++++++++++---------- > 2 files changed, 60 insertions(+), 11 deletions(-) > > diff --git a/include/exec/memory.h b/include/exec/memory.h > index 8626a355b3..9f7dfe59c7 100644 > --- a/include/exec/memory.h > +++ b/include/exec/memory.h > @@ -282,6 +282,27 @@ struct MemoryRegionOps { > unsigned size, > MemTxAttrs attrs); > > + /* > + * Dynamically create mapping. @addr is the guest address to map; @plen > + * is the pointer to the usable length of the buffer. > + * @mr contents can be changed in case a new memory region is created for > + * the mapping. > + * Returns the buffer address for accessing the data. > + */ > + void *(*map)(MemoryRegion **mr, > + hwaddr addr, > + hwaddr *plen, > + bool is_write, > + MemTxAttrs attrs); > + > + /* Unmap an area obtained via map() before. */ > + void (*unmap)(MemoryRegion *mr, > + void *buffer, > + ram_addr_t addr, > + hwaddr len, > + bool is_write, > + hwaddr access_len); > + > enum device_endian endianness; > /* Guest-visible constraints: */ > struct { > diff --git a/system/physmem.c b/system/physmem.c > index 949dcb20ba..d989e9fc1f 100644 > --- a/system/physmem.c > +++ b/system/physmem.c > @@ -3141,6 +3141,7 @@ void *address_space_map(AddressSpace *as, > hwaddr len = *plen; > hwaddr l, xlat; > MemoryRegion *mr; > + void *ptr = NULL; > FlatView *fv; > > if (len == 0) { > @@ -3174,12 +3175,20 @@ void *address_space_map(AddressSpace *as, > return bounce.buffer; > } > > - > memory_region_ref(mr); > + > + if (mr->ops && mr->ops->map) { > + ptr = mr->ops->map(&mr, addr, plen, is_write, attrs); > + } > + > *plen = flatview_extend_translation(fv, addr, len, mr, xlat, > l, is_write, attrs); > fuzz_dma_read_cb(addr, *plen, mr); > - return qemu_ram_ptr_length(mr->ram_block, xlat, plen, true); > + if (ptr == NULL) { > + ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true); > + } > + > + return ptr; > } > > /* Unmaps a memory region previously mapped by address_space_map(). > @@ -3195,11 +3204,16 @@ void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len, > > mr = memory_region_from_host(buffer, &addr1); > assert(mr != NULL); > - if (is_write) { > - invalidate_and_set_dirty(mr, addr1, access_len); > - } > - if (xen_enabled()) { > - xen_invalidate_map_cache_entry(buffer); > + > + if (mr->ops && mr->ops->unmap) { > + mr->ops->unmap(mr, buffer, addr1, len, is_write, access_len); > + } else { > + if (is_write) { > + invalidate_and_set_dirty(mr, addr1, access_len); > + } > + if (xen_enabled()) { > + xen_invalidate_map_cache_entry(buffer); > + } > } > memory_region_unref(mr); > return; > @@ -3272,10 +3286,18 @@ int64_t address_space_cache_init(MemoryRegionCache *cache, > * doing this if we found actual RAM, which behaves the same > * regardless of attributes; so UNSPECIFIED is fine. 
> */ > + if (mr->ops && mr->ops->map) { > + cache->ptr = mr->ops->map(&mr, addr, &l, is_write, > + MEMTXATTRS_UNSPECIFIED); > + } > + > l = flatview_extend_translation(cache->fv, addr, len, mr, > cache->xlat, l, is_write, > MEMTXATTRS_UNSPECIFIED); > - cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l, true); > + if (!cache->ptr) { > + cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l, > + true); > + } > } else { > cache->ptr = NULL; > } > @@ -3297,14 +3319,20 @@ void address_space_cache_invalidate(MemoryRegionCache *cache, > > void address_space_cache_destroy(MemoryRegionCache *cache) > { > - if (!cache->mrs.mr) { > + MemoryRegion *mr = cache->mrs.mr; > + > + if (!mr) { > return; > } > > - if (xen_enabled()) { > + if (mr->ops && mr->ops->unmap) { > + mr->ops->unmap(mr, cache->ptr, cache->xlat, cache->len, > + cache->is_write, cache->len); > + } else if (xen_enabled()) { > xen_invalidate_map_cache_entry(cache->ptr); > } > - memory_region_unref(cache->mrs.mr); > + > + memory_region_unref(mr); > flatview_unref(cache->fv); > cache->mrs.mr = NULL; > cache->fv = NULL; > -- > 2.17.1 >
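For readers unfamiliar with the API being extended here, the sketch below shows the call pattern these hooks serve: a device model maps a guest buffer with address_space_map(), accesses it through the returned host pointer, and releases it with address_space_unmap(). With this patch, a MemoryRegion that provides ops->map()/ops->unmap() (such as a Xen grant region) has those callbacks invoked from inside these two functions instead of the plain RAM path. The helper example_dma_write() is made up for illustration; the address_space_*() calls are the real QEMU API.

#include "qemu/osdep.h"
#include "exec/memory.h"

/* Illustrative caller: copy @size bytes into guest memory at @guest_addr. */
static void example_dma_write(AddressSpace *as, hwaddr guest_addr,
                              const uint8_t *data, hwaddr size)
{
    hwaddr plen = size;
    void *host;

    /* May call mr->ops->map() when the target region defines it. */
    host = address_space_map(as, guest_addr, &plen, true,
                             MEMTXATTRS_UNSPECIFIED);
    if (!host || plen < size) {
        /* Partial or failed mapping; a real device would split or abort. */
        if (host) {
            address_space_unmap(as, host, plen, true, 0);
        }
        return;
    }

    memcpy(host, data, size);

    /* May call mr->ops->unmap() for dynamically mapped regions. */
    address_space_unmap(as, host, plen, true, size);
}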
On Fri, Mar 1, 2024 at 12:11 AM Stefano Stabellini <sstabellini@kernel.org> wrote: > On Tue, 27 Feb 2024, Vikram Garhwal wrote: > > From: Juergen Gross <jgross@suse.com> > > > > In order to support mapping and unmapping guest memory dynamically to > > and from qemu during address_space_[un]map() operations add the map() > > and unmap() callbacks to MemoryRegionOps. > > > > Those will be used e.g. for Xen grant mappings when performing guest > > I/Os. > > > > Signed-off-by: Juergen Gross <jgross@suse.com> > > Signed-off-by: Vikram Garhwal <vikram.garhwal@amd.com> > > Reviewed-by: Stefano Stabellini <sstabellini@kernel.org> > Reviewed-by: Edgar E. Iglesias <edgar.iglesias@amd.com> > --- > > include/exec/memory.h | 21 ++++++++++++++++++ > > system/physmem.c | 50 +++++++++++++++++++++++++++++++++---------- > > 2 files changed, 60 insertions(+), 11 deletions(-) > > > > diff --git a/include/exec/memory.h b/include/exec/memory.h > > index 8626a355b3..9f7dfe59c7 100644 > > --- a/include/exec/memory.h > > +++ b/include/exec/memory.h > > @@ -282,6 +282,27 @@ struct MemoryRegionOps { > > unsigned size, > > MemTxAttrs attrs); > > > > + /* > > + * Dynamically create mapping. @addr is the guest address to map; > @plen > > + * is the pointer to the usable length of the buffer. > > + * @mr contents can be changed in case a new memory region is > created for > > + * the mapping. > > + * Returns the buffer address for accessing the data. > > + */ > > + void *(*map)(MemoryRegion **mr, > > + hwaddr addr, > > + hwaddr *plen, > > + bool is_write, > > + MemTxAttrs attrs); > > + > > + /* Unmap an area obtained via map() before. */ > > + void (*unmap)(MemoryRegion *mr, > > + void *buffer, > > + ram_addr_t addr, > > + hwaddr len, > > + bool is_write, > > + hwaddr access_len); > > + > > enum device_endian endianness; > > /* Guest-visible constraints: */ > > struct { > > diff --git a/system/physmem.c b/system/physmem.c > > index 949dcb20ba..d989e9fc1f 100644 > > --- a/system/physmem.c > > +++ b/system/physmem.c > > @@ -3141,6 +3141,7 @@ void *address_space_map(AddressSpace *as, > > hwaddr len = *plen; > > hwaddr l, xlat; > > MemoryRegion *mr; > > + void *ptr = NULL; > > FlatView *fv; > > > > if (len == 0) { > > @@ -3174,12 +3175,20 @@ void *address_space_map(AddressSpace *as, > > return bounce.buffer; > > } > > > > - > > memory_region_ref(mr); > > + > > + if (mr->ops && mr->ops->map) { > > + ptr = mr->ops->map(&mr, addr, plen, is_write, attrs); > > + } > > + > > *plen = flatview_extend_translation(fv, addr, len, mr, xlat, > > l, is_write, attrs); > > fuzz_dma_read_cb(addr, *plen, mr); > > - return qemu_ram_ptr_length(mr->ram_block, xlat, plen, true); > > + if (ptr == NULL) { > > + ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true); > > + } > > + > > + return ptr; > > } > > > > /* Unmaps a memory region previously mapped by address_space_map(). 
> > @@ -3195,11 +3204,16 @@ void address_space_unmap(AddressSpace *as, void > *buffer, hwaddr len, > > > > mr = memory_region_from_host(buffer, &addr1); > > assert(mr != NULL); > > - if (is_write) { > > - invalidate_and_set_dirty(mr, addr1, access_len); > > - } > > - if (xen_enabled()) { > > - xen_invalidate_map_cache_entry(buffer); > > + > > + if (mr->ops && mr->ops->unmap) { > > + mr->ops->unmap(mr, buffer, addr1, len, is_write, > access_len); > > + } else { > > + if (is_write) { > > + invalidate_and_set_dirty(mr, addr1, access_len); > > + } > > + if (xen_enabled()) { > > + xen_invalidate_map_cache_entry(buffer); > > + } > > } > > memory_region_unref(mr); > > return; > > @@ -3272,10 +3286,18 @@ int64_t > address_space_cache_init(MemoryRegionCache *cache, > > * doing this if we found actual RAM, which behaves the same > > * regardless of attributes; so UNSPECIFIED is fine. > > */ > > + if (mr->ops && mr->ops->map) { > > + cache->ptr = mr->ops->map(&mr, addr, &l, is_write, > > + MEMTXATTRS_UNSPECIFIED); > > + } > > + > > l = flatview_extend_translation(cache->fv, addr, len, mr, > > cache->xlat, l, is_write, > > MEMTXATTRS_UNSPECIFIED); > > - cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, > &l, true); > > + if (!cache->ptr) { > > + cache->ptr = qemu_ram_ptr_length(mr->ram_block, > cache->xlat, &l, > > + true); > > + } > > } else { > > cache->ptr = NULL; > > } > > @@ -3297,14 +3319,20 @@ void > address_space_cache_invalidate(MemoryRegionCache *cache, > > > > void address_space_cache_destroy(MemoryRegionCache *cache) > > { > > - if (!cache->mrs.mr) { > > + MemoryRegion *mr = cache->mrs.mr; > > + > > + if (!mr) { > > return; > > } > > > > - if (xen_enabled()) { > > + if (mr->ops && mr->ops->unmap) { > > + mr->ops->unmap(mr, cache->ptr, cache->xlat, cache->len, > > + cache->is_write, cache->len); > > + } else if (xen_enabled()) { > > xen_invalidate_map_cache_entry(cache->ptr); > > } > > - memory_region_unref(cache->mrs.mr); > > + > > + memory_region_unref(mr); > > flatview_unref(cache->fv); > > cache->mrs.mr = NULL; > > cache->fv = NULL; > > -- > > 2.17.1 > > > >
On Tue, Feb 27, 2024 at 11:37 PM Vikram Garhwal <vikram.garhwal@amd.com> wrote: > From: Juergen Gross <jgross@suse.com> > > In order to support mapping and unmapping guest memory dynamically to > and from qemu during address_space_[un]map() operations add the map() > and unmap() callbacks to MemoryRegionOps. > > Those will be used e.g. for Xen grant mappings when performing guest > I/Os. > > Signed-off-by: Juergen Gross <jgross@suse.com> > Signed-off-by: Vikram Garhwal <vikram.garhwal@amd.com> > Paolo, Peter, David, Phiippe, do you guys have any concerns with this patch? Cheers, Edgar > --- > include/exec/memory.h | 21 ++++++++++++++++++ > system/physmem.c | 50 +++++++++++++++++++++++++++++++++---------- > 2 files changed, 60 insertions(+), 11 deletions(-) > > diff --git a/include/exec/memory.h b/include/exec/memory.h > index 8626a355b3..9f7dfe59c7 100644 > --- a/include/exec/memory.h > +++ b/include/exec/memory.h > @@ -282,6 +282,27 @@ struct MemoryRegionOps { > unsigned size, > MemTxAttrs attrs); > > + /* > + * Dynamically create mapping. @addr is the guest address to map; > @plen > + * is the pointer to the usable length of the buffer. > + * @mr contents can be changed in case a new memory region is created > for > + * the mapping. > + * Returns the buffer address for accessing the data. > + */ > + void *(*map)(MemoryRegion **mr, > + hwaddr addr, > + hwaddr *plen, > + bool is_write, > + MemTxAttrs attrs); > + > + /* Unmap an area obtained via map() before. */ > + void (*unmap)(MemoryRegion *mr, > + void *buffer, > + ram_addr_t addr, > + hwaddr len, > + bool is_write, > + hwaddr access_len); > + > enum device_endian endianness; > /* Guest-visible constraints: */ > struct { > diff --git a/system/physmem.c b/system/physmem.c > index 949dcb20ba..d989e9fc1f 100644 > --- a/system/physmem.c > +++ b/system/physmem.c > @@ -3141,6 +3141,7 @@ void *address_space_map(AddressSpace *as, > hwaddr len = *plen; > hwaddr l, xlat; > MemoryRegion *mr; > + void *ptr = NULL; > FlatView *fv; > > if (len == 0) { > @@ -3174,12 +3175,20 @@ void *address_space_map(AddressSpace *as, > return bounce.buffer; > } > > - > memory_region_ref(mr); > + > + if (mr->ops && mr->ops->map) { > + ptr = mr->ops->map(&mr, addr, plen, is_write, attrs); > + } > + > *plen = flatview_extend_translation(fv, addr, len, mr, xlat, > l, is_write, attrs); > fuzz_dma_read_cb(addr, *plen, mr); > - return qemu_ram_ptr_length(mr->ram_block, xlat, plen, true); > + if (ptr == NULL) { > + ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true); > + } > + > + return ptr; > } > > /* Unmaps a memory region previously mapped by address_space_map(). > @@ -3195,11 +3204,16 @@ void address_space_unmap(AddressSpace *as, void > *buffer, hwaddr len, > > mr = memory_region_from_host(buffer, &addr1); > assert(mr != NULL); > - if (is_write) { > - invalidate_and_set_dirty(mr, addr1, access_len); > - } > - if (xen_enabled()) { > - xen_invalidate_map_cache_entry(buffer); > + > + if (mr->ops && mr->ops->unmap) { > + mr->ops->unmap(mr, buffer, addr1, len, is_write, access_len); > + } else { > + if (is_write) { > + invalidate_and_set_dirty(mr, addr1, access_len); > + } > + if (xen_enabled()) { > + xen_invalidate_map_cache_entry(buffer); > + } > } > memory_region_unref(mr); > return; > @@ -3272,10 +3286,18 @@ int64_t address_space_cache_init(MemoryRegionCache > *cache, > * doing this if we found actual RAM, which behaves the same > * regardless of attributes; so UNSPECIFIED is fine. 
> */ > + if (mr->ops && mr->ops->map) { > + cache->ptr = mr->ops->map(&mr, addr, &l, is_write, > + MEMTXATTRS_UNSPECIFIED); > + } > + > l = flatview_extend_translation(cache->fv, addr, len, mr, > cache->xlat, l, is_write, > MEMTXATTRS_UNSPECIFIED); > - cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l, > true); > + if (!cache->ptr) { > + cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, > &l, > + true); > + } > } else { > cache->ptr = NULL; > } > @@ -3297,14 +3319,20 @@ void > address_space_cache_invalidate(MemoryRegionCache *cache, > > void address_space_cache_destroy(MemoryRegionCache *cache) > { > - if (!cache->mrs.mr) { > + MemoryRegion *mr = cache->mrs.mr; > + > + if (!mr) { > return; > } > > - if (xen_enabled()) { > + if (mr->ops && mr->ops->unmap) { > + mr->ops->unmap(mr, cache->ptr, cache->xlat, cache->len, > + cache->is_write, cache->len); > + } else if (xen_enabled()) { > xen_invalidate_map_cache_entry(cache->ptr); > } > - memory_region_unref(cache->mrs.mr); > + > + memory_region_unref(mr); > flatview_unref(cache->fv); > cache->mrs.mr = NULL; > cache->fv = NULL; > -- > 2.17.1 > > >
On Wed, Apr 10, 2024 at 06:44:38PM +0200, Edgar E. Iglesias wrote: > On Tue, Feb 27, 2024 at 11:37 PM Vikram Garhwal <vikram.garhwal@amd.com> > wrote: > > > From: Juergen Gross <jgross@suse.com> > > > > In order to support mapping and unmapping guest memory dynamically to > > and from qemu during address_space_[un]map() operations add the map() > > and unmap() callbacks to MemoryRegionOps. > > > > Those will be used e.g. for Xen grant mappings when performing guest > > I/Os. > > > > Signed-off-by: Juergen Gross <jgross@suse.com> > > Signed-off-by: Vikram Garhwal <vikram.garhwal@amd.com> > > > Paolo, Peter, David, Phiippe, do you guys have any concerns with this patch?

This introduces a 3rd memory type afaict, neither direct nor !direct.

What happens if someone does address_space_write() to it? I didn't see it covered here.

OTOH, the cover letter didn't mention too much either on the big picture: https://lore.kernel.org/all/20240227223501.28475-1-vikram.garhwal@amd.com/

I want to get a quick grasp on whether it's justified and worthwhile to introduce this complexity to the qemu memory core. Could I request a better cover letter when you repost? It would be great to mention things like:

- What grant mapping is, why it needs to be used, and when it can be used (is it only relevant to vIOMMU=on). Some more information on the high-level design using this type of MR would be great.

- Why a 3rd memory type is required, and whether we have other alternatives. So far this is all based on my very limited understanding from reading https://xenbits.xenproject.org/docs/4.3-testing/misc/grant-tables.txt. If it's about cross-VM memory sharing, does it mean that in reality the RAM is already allocated, and it's only about permission management? In that case, is there any option to implement it with direct access mode (with some extra dynamic permissions applied on top through some mechanism)?

- Perhaps some old links would be great too, so people can read about the context when it's not obvious, without a need to copy-paste.

Thanks,
On Wed, Apr 10, 2024 at 8:56 PM Peter Xu <peterx@redhat.com> wrote: > > On Wed, Apr 10, 2024 at 06:44:38PM +0200, Edgar E. Iglesias wrote: > > On Tue, Feb 27, 2024 at 11:37 PM Vikram Garhwal <vikram.garhwal@amd.com> > > wrote: > > > > > From: Juergen Gross <jgross@suse.com> > > > > > > In order to support mapping and unmapping guest memory dynamically to > > > and from qemu during address_space_[un]map() operations add the map() > > > and unmap() callbacks to MemoryRegionOps. > > > > > > Those will be used e.g. for Xen grant mappings when performing guest > > > I/Os. > > > > > > Signed-off-by: Juergen Gross <jgross@suse.com> > > > Signed-off-by: Vikram Garhwal <vikram.garhwal@amd.com> > > > > > > > > > Paolo, Peter, David, Phiippe, do you guys have any concerns with this patch?

Thanks for your comments Peter,

> This introduces a 3rd memory type afaict, neither direct nor !direct. > > What happens if someone does address_space_write() to it? I didn't see it > covered here..

You're right, that won't work; the memory needs to be mapped before it can be used. At minimum there should be some graceful failure, right now this will crash.

> > OTOH, the cover letter didn't mention too much either on the big picture: > > > > https://lore.kernel.org/all/20240227223501.28475-1-vikram.garhwal@amd.com/ > > > > I want to have a quick grasp on whether it's justified worthwhile we should > > introduce this complexity to qemu memory core. > > > > Could I request a better cover letter when repost? It'll be great to > > mention things like:

I'll do that, but I'll also answer inline in the meantime, since we should perhaps change the approach.

> > - what is grant mapping, why it needs to be used, when it can be used (is > it only relevant to vIOMMU=on)? Some more information on the high > level design using this type or MR would be great.

https://github.com/xen-project/xen/blob/master/docs/misc/grant-tables.txt

Xen VMs that use QEMU's VirtIO have a QEMU instance running in a separate VM.

There are basically two mechanisms for QEMU's VirtIO backends to access the guest's RAM:

1. Foreign mappings. This gives the VM running QEMU access to the entire RAM of the guest VM.

2. Grant mappings. This allows the guest to dynamically grant and remove access to pages as needed, so the VM running QEMU cannot map guest RAM unless it has been instructed to do so by the guest.

#2 is desirable because if QEMU gets compromised, it has a smaller attack surface onto the guest.

> > - why a 3rd memory type is required? Do we have other alternatives?

Yes, there are alternatives.

1. It was suggested by Stefano to try to handle this in the existing qemu/hw/xen/* code. This would be less intrusive but perhaps also less explicit. Concerns about touching the Memory API have been raised before, so perhaps we should try this. I'm a little unsure how we would deal with unmapping when the guest removes our grants and we're using models that don't map but use address_space_read/write().

2. Another approach could be to change the Xen grant-iommu in the Linux kernel to work with a grant vIOMMU in QEMU. Linux could explicitly ask QEMU's grant vIOMMU to map/unmap granted regions. This would have the benefit that we wouldn't need to allocate address-bit 63 for grants. A drawback is that it may be slower, since we're going to need to bounce between the guest and qemu a bit more.
> So it's all based on my very limited understanding of reading this: > https://xenbits.xenproject.org/docs/4.3-testing/misc/grant-tables.txt > > If it's about cross-vm sharing memory, does it mean that in reality > there are RAMs allocated, but it's only about permission management? > In that case, is there any option we implement it with direct access > mode (however with some extra dynamic permissions applied on top using > some mechanism)? Yes, I think the grant vIOMMU approach would fall into this category. > > - perhaps sold old links would be great too so people can read about the > context when it's not obvious, without a need to copy-paste. https://wiki.xenproject.org/wiki/Virtio_On_Xen https://lwn.net/Articles/896938/ Cheers, Edgar
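To connect this with Peter's question about whether grants are just permission management over RAM that already exists: the grant table is the guest-side structure that expresses those permissions, one entry per shareable frame. The struct below is the version-1 entry layout from Xen's public ABI (xen/include/public/grant_table.h); the grant_frame() helper is only an illustration of how a guest-side driver might fill an entry, not a real kernel API, and a real implementation also needs a write barrier before setting the flags.

#include <stdint.h>
#include <stdbool.h>

typedef uint16_t domid_t;

/* Version-1 grant table entry, as defined by Xen's public ABI. */
typedef struct grant_entry_v1 {
    uint16_t flags;   /* GTF_permit_access, optionally GTF_readonly */
    domid_t  domid;   /* the single domain allowed to map/copy the frame */
    uint32_t frame;   /* guest frame number being shared */
} grant_entry_v1_t;

#define GTF_permit_access  (1U << 0)
#define GTF_readonly       (1U << 2)

/*
 * Illustrative guest-side helper: grant frame @gfn to domain @backend by
 * filling entry @ref of the guest's grant table.  Clearing the entry later
 * revokes the backend's access again.
 */
static void grant_frame(grant_entry_v1_t *gnttab, uint32_t ref,
                        domid_t backend, uint32_t gfn, bool readonly)
{
    gnttab[ref].domid = backend;
    gnttab[ref].frame = gfn;
    /* A real guest issues a write barrier here so the entry only becomes
     * valid once domid and frame are visible. */
    gnttab[ref].flags = GTF_permit_access | (readonly ? GTF_readonly : 0);
}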
On 16.04.24 13:32, Edgar E. Iglesias wrote: > On Wed, Apr 10, 2024 at 8:56 PM Peter Xu <peterx@redhat.com> wrote: >> >> On Wed, Apr 10, 2024 at 06:44:38PM +0200, Edgar E. Iglesias wrote: >>> On Tue, Feb 27, 2024 at 11:37 PM Vikram Garhwal <vikram.garhwal@amd.com> >>> wrote: >>> >>>> From: Juergen Gross <jgross@suse.com> >>>> >>>> In order to support mapping and unmapping guest memory dynamically to >>>> and from qemu during address_space_[un]map() operations add the map() >>>> and unmap() callbacks to MemoryRegionOps. >>>> >>>> Those will be used e.g. for Xen grant mappings when performing guest >>>> I/Os. >>>> >>>> Signed-off-by: Juergen Gross <jgross@suse.com> >>>> Signed-off-by: Vikram Garhwal <vikram.garhwal@amd.com> >>>> >>> >>> >>> Paolo, Peter, David, Phiippe, do you guys have any concerns with this patch? >> > > Thanks for your comments Peter, > > >> This introduces a 3rd memory type afaict, neither direct nor !direct. >> >> What happens if someone does address_space_write() to it? I didn't see it >> covered here.. > > You're right, that won't work, the memory needs to be mapped before it > can be used. > At minimum there should be some graceful failure, right now this will crash. > >> >> OTOH, the cover letter didn't mention too much either on the big picture: >> >> https://lore.kernel.org/all/20240227223501.28475-1-vikram.garhwal@amd.com/ >> >> I want to have a quick grasp on whether it's justified worthwhile we should >> introduce this complexity to qemu memory core. >> >> Could I request a better cover letter when repost? It'll be great to >> mention things like: > > I'll do that, but also answer inline in the meantime since we should > perhaps change the approach. > >> >> - what is grant mapping, why it needs to be used, when it can be used (is >> it only relevant to vIOMMU=on)? Some more information on the high >> level design using this type or MR would be great. > > https://github.com/xen-project/xen/blob/master/docs/misc/grant-tables.txt > > Xen VM's that use QEMU's VirtIO have a QEMU instance running in a separate VM. > > There's basically two mechanisms for QEMU's Virtio backends to access > the guest's RAM. > 1. Foreign mappings. This gives the VM running QEMU access to the > entire RAM of the guest VM. Additionally it requires qemu to run in dom0, while in general Xen allows to run backends in less privileged "driver domains", which are usually not allowed to perform foreign mappings. > 2. Grant mappings. This allows the guest to dynamically grant and > remove access to pages as needed. > So the VM running QEMU, cannot map guest RAM unless it's been > instructed to do so by the guest. > > #2 is desirable because if QEMU gets compromised it has a smaller > attack surface onto the guest. And it allows to run the virtio backend in a less privileged VM. > >> >> - why a 3rd memory type is required? Do we have other alternatives? > > Yes, there are alternatives. > > 1. It was suggested by Stefano to try to handle this in existing qemu/hw/xen/*. > This would be less intrusive but perhaps also less explicit. > Concerns about touching the Memory API have been raised before, so > perhaps we should try this. > I'm a little unsure how we would deal with unmapping when the guest > removes our grants and we're using models that don't map but use > address_space_read/write(). Those would either need to use grant-copy hypercalls, or they'd need to map, read/write, unmap. > > 2. 
Another approach could be to change the Xen grant-iommu in the > Linux kernel to work with a grant vIOMMU in QEMU. > Linux could explicitly ask QEMU's grant vIOMMU to map/unmap granted regions. > This would have the benefit that we wouldn't need to allocate > address-bit 63 for grants. > A drawback is that it may be slower since we're going to need to > bounce between guest/qemu a bit more.

It would be a _lot_ slower, unless virtio-iommu and grants are both modified to match. I have looked into that, but the needed effort is rather large. At the last Xen summit I suggested introducing a new grant format which would work more like a normal page table structure. Using the same format for virtio-iommu would make it possible to avoid the additional bounces between qemu and the guest (and in fact that was one of the motivations for suggesting the new grant format).

Juergen
On Tue, Apr 16, 2024 at 03:28:41PM +0200, Jürgen Groß wrote: > On 16.04.24 13:32, Edgar E. Iglesias wrote: > > On Wed, Apr 10, 2024 at 8:56 PM Peter Xu <peterx@redhat.com> wrote: > > > > > > On Wed, Apr 10, 2024 at 06:44:38PM +0200, Edgar E. Iglesias wrote: > > > > On Tue, Feb 27, 2024 at 11:37 PM Vikram Garhwal <vikram.garhwal@amd.com> > > > > wrote: > > > > > > > > > From: Juergen Gross <jgross@suse.com> > > > > > > > > > > In order to support mapping and unmapping guest memory dynamically to > > > > > and from qemu during address_space_[un]map() operations add the map() > > > > > and unmap() callbacks to MemoryRegionOps. > > > > > > > > > > Those will be used e.g. for Xen grant mappings when performing guest > > > > > I/Os. > > > > > > > > > > Signed-off-by: Juergen Gross <jgross@suse.com> > > > > > Signed-off-by: Vikram Garhwal <vikram.garhwal@amd.com> > > > > > > > > > > > > > > > > > Paolo, Peter, David, Phiippe, do you guys have any concerns with this patch? > > > > > > > Thanks for your comments Peter, > > > > > > > This introduces a 3rd memory type afaict, neither direct nor !direct. > > > > > > What happens if someone does address_space_write() to it? I didn't see it > > > covered here.. > > > > You're right, that won't work, the memory needs to be mapped before it > > can be used. > > At minimum there should be some graceful failure, right now this will crash. > > > > > > > > OTOH, the cover letter didn't mention too much either on the big picture: > > > > > > https://lore.kernel.org/all/20240227223501.28475-1-vikram.garhwal@amd.com/ > > > > > > I want to have a quick grasp on whether it's justified worthwhile we should > > > introduce this complexity to qemu memory core. > > > > > > Could I request a better cover letter when repost? It'll be great to > > > mention things like: > > > > I'll do that, but also answer inline in the meantime since we should > > perhaps change the approach. > > > > > > > > - what is grant mapping, why it needs to be used, when it can be used (is > > > it only relevant to vIOMMU=on)? Some more information on the high > > > level design using this type or MR would be great. > > > > https://github.com/xen-project/xen/blob/master/docs/misc/grant-tables.txt > > > > Xen VM's that use QEMU's VirtIO have a QEMU instance running in a separate VM. > > > > There's basically two mechanisms for QEMU's Virtio backends to access > > the guest's RAM. > > 1. Foreign mappings. This gives the VM running QEMU access to the > > entire RAM of the guest VM. > > Additionally it requires qemu to run in dom0, while in general Xen allows > to run backends in less privileged "driver domains", which are usually not > allowed to perform foreign mappings. > > > 2. Grant mappings. This allows the guest to dynamically grant and > > remove access to pages as needed. > > So the VM running QEMU, cannot map guest RAM unless it's been > > instructed to do so by the guest. > > > > #2 is desirable because if QEMU gets compromised it has a smaller > > attack surface onto the guest. > > And it allows to run the virtio backend in a less privileged VM. > > > > > > > > > - why a 3rd memory type is required? Do we have other alternatives? > > > > Yes, there are alternatives. > > > > 1. It was suggested by Stefano to try to handle this in existing qemu/hw/xen/*. > > This would be less intrusive but perhaps also less explicit. > > Concerns about touching the Memory API have been raised before, so > > perhaps we should try this. 
> > I'm a little unsure how we would deal with unmapping when the guest > > removes our grants and we're using models that don't map but use > > address_space_read/write(). > > Those would either need to use grant-copy hypercalls, or they'd need to map, > read/write, unmap. > > > > 2. Another approach could be to change the Xen grant-iommu in the > > Linux kernel to work with a grant vIOMMU in QEMU. > > Linux could explicitly ask QEMU's grant vIOMMU to map/unmap granted regions. > > This would have the benefit that we wouldn't need to allocate > > address-bit 63 for grants. > > A drawback is that it may be slower since we're going to need to > > bounce between guest/qemu a bit more.

> It would be a _lot_ slower, unless virtio-iommu and grants are both modified > to match. I have looked into that, but the needed effort is rather large. At > the last Xen summit I have suggested to introduce a new grant format which > would work more like a normal page table structure. Using the same format > for virtio-iommu would allow to avoid the additional bounces between qemu and > the guest (and in fact that was one of the motivations to suggest the new > grant format).

I have a better picture now, thanks both.

It really looks like a vIOMMU already to me, perhaps with a special refID mapping playing a similar role to IOVAs in the rest of the IOMMU world.

I can't yet tell what's the best way for Xen. As of now QEMU's memory API does provide such translations via IOMMUMemoryRegionClass.translate(), but only through that interface. So far it works for all vIOMMU emulations in QEMU, and I'd hope we don't need to hack in another memory type for this if possible, especially for performance's sake; more on that below.

QEMU also suffers from similar issues with other kinds of DMA protection, at least as far as I'm aware, when using VT-d, SMMU, etc., where dynamic DMA mappings will slow the I/Os down to a degree that may not be usable in real production. We kept it like that, and so far AFAIK we don't yet have a solution, simply because of the nature of how DMA buffers are mapped and managed within a guest OS, whether Linux or not.

For Linux as a guest we basically suggest enabling iommu=pt so that kernel drivers are trusted, and kernel-driven devices can have full access to guest RAM by using the vIOMMU's passthrough mode. Perhaps similar to foreign mappings for Xen, but maybe still different, as Xen's topology is definitely special as a hypervisor here.

Userspace drivers within the guest OS, on the other hand, always go through vfio-pci now, which enforces effective DMA mappings rather than passthrough mode. There it's suggested to map as little as possible; e.g. DPDK only maps at the start of the user driver, so it's mostly not affected by the slowness of frequently changing DMA mappings.

I'm not sure whether the above ideas are even applicable to Xen, but I just wanted to share the status quo of how we manage protected DMA without Xen, in case there's anything useful to help route the path forward.

Thanks,
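For context on the vIOMMU route discussed above, this is the shape of the IOMMUMemoryRegionClass.translate() hook Peter refers to, wrapped in a hypothetical grant vIOMMU. The translate() signature, the IOMMUTLBEntry layout and the IOMMUAccessFlags values are QEMU's real API; GrantIommuState and grant_iommu_lookup() are made-up placeholders for the grant-ref tracking such a device would need.

#include "qemu/osdep.h"
#include "exec/memory.h"
#include "exec/address-spaces.h"

#define GRANT_PAGE_SIZE 4096   /* Xen grants cover one 4 KiB frame each */

typedef struct GrantIommuState {
    IOMMUMemoryRegion iommu_mr;    /* region the DMA-capable device sees */
    /* ... grant-ref indexed table of currently mapped pages ... */
} GrantIommuState;

/* Placeholder lookup: where is the granted page behind @iova mapped? */
static bool grant_iommu_lookup(GrantIommuState *s, hwaddr iova,
                               hwaddr *xlat, IOMMUAccessFlags *perm)
{
    /* A real device would consult the table filled by map/unmap requests. */
    return false;
}

static IOMMUTLBEntry grant_iommu_translate(IOMMUMemoryRegion *iommu,
                                           hwaddr addr,
                                           IOMMUAccessFlags flag,
                                           int iommu_idx)
{
    GrantIommuState *s = container_of(iommu, GrantIommuState, iommu_mr);
    IOMMUTLBEntry entry = {
        .target_as = &address_space_memory,
        .iova = addr & ~(hwaddr)(GRANT_PAGE_SIZE - 1),
        .translated_addr = 0,
        .addr_mask = GRANT_PAGE_SIZE - 1,
        .perm = IOMMU_NONE,            /* no grant present: fault */
    };

    grant_iommu_lookup(s, entry.iova, &entry.translated_addr, &entry.perm);
    return entry;
}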
On Tue, Apr 16, 2024 at 5:55 PM Peter Xu <peterx@redhat.com> wrote: > > On Tue, Apr 16, 2024 at 03:28:41PM +0200, Jürgen Groß wrote: > > On 16.04.24 13:32, Edgar E. Iglesias wrote: > > > On Wed, Apr 10, 2024 at 8:56 PM Peter Xu <peterx@redhat.com> wrote: > > > > > > > > On Wed, Apr 10, 2024 at 06:44:38PM +0200, Edgar E. Iglesias wrote: > > > > > On Tue, Feb 27, 2024 at 11:37 PM Vikram Garhwal <vikram.garhwal@amd.com> > > > > > wrote: > > > > > > > > > > > From: Juergen Gross <jgross@suse.com> > > > > > > > > > > > > In order to support mapping and unmapping guest memory dynamically to > > > > > > and from qemu during address_space_[un]map() operations add the map() > > > > > > and unmap() callbacks to MemoryRegionOps. > > > > > > > > > > > > Those will be used e.g. for Xen grant mappings when performing guest > > > > > > I/Os. > > > > > > > > > > > > Signed-off-by: Juergen Gross <jgross@suse.com> > > > > > > Signed-off-by: Vikram Garhwal <vikram.garhwal@amd.com> > > > > > > > > > > > > > > > > > > > > > Paolo, Peter, David, Phiippe, do you guys have any concerns with this patch? > > > > > > > > > > Thanks for your comments Peter, > > > > > > > > > > This introduces a 3rd memory type afaict, neither direct nor !direct. > > > > > > > > What happens if someone does address_space_write() to it? I didn't see it > > > > covered here.. > > > > > > You're right, that won't work, the memory needs to be mapped before it > > > can be used. > > > At minimum there should be some graceful failure, right now this will crash. > > > > > > > > > > > OTOH, the cover letter didn't mention too much either on the big picture: > > > > > > > > https://lore.kernel.org/all/20240227223501.28475-1-vikram.garhwal@amd.com/ > > > > > > > > I want to have a quick grasp on whether it's justified worthwhile we should > > > > introduce this complexity to qemu memory core. > > > > > > > > Could I request a better cover letter when repost? It'll be great to > > > > mention things like: > > > > > > I'll do that, but also answer inline in the meantime since we should > > > perhaps change the approach. > > > > > > > > > > > - what is grant mapping, why it needs to be used, when it can be used (is > > > > it only relevant to vIOMMU=on)? Some more information on the high > > > > level design using this type or MR would be great. > > > > > > https://github.com/xen-project/xen/blob/master/docs/misc/grant-tables.txt > > > > > > Xen VM's that use QEMU's VirtIO have a QEMU instance running in a separate VM. > > > > > > There's basically two mechanisms for QEMU's Virtio backends to access > > > the guest's RAM. > > > 1. Foreign mappings. This gives the VM running QEMU access to the > > > entire RAM of the guest VM. > > > > Additionally it requires qemu to run in dom0, while in general Xen allows > > to run backends in less privileged "driver domains", which are usually not > > allowed to perform foreign mappings. > > > > > 2. Grant mappings. This allows the guest to dynamically grant and > > > remove access to pages as needed. > > > So the VM running QEMU, cannot map guest RAM unless it's been > > > instructed to do so by the guest. > > > > > > #2 is desirable because if QEMU gets compromised it has a smaller > > > attack surface onto the guest. > > > > And it allows to run the virtio backend in a less privileged VM. > > > > > > > > > > > > > - why a 3rd memory type is required? Do we have other alternatives? > > > > > > Yes, there are alternatives. > > > > > > 1. 
It was suggested by Stefano to try to handle this in existing qemu/hw/xen/*. > > > This would be less intrusive but perhaps also less explicit. > > > Concerns about touching the Memory API have been raised before, so > > > perhaps we should try this. > > > I'm a little unsure how we would deal with unmapping when the guest > > > removes our grants and we're using models that don't map but use > > > address_space_read/write(). > > > > Those would either need to use grant-copy hypercalls, or they'd need to map, > > read/write, unmap. > > > > > > > > 2. Another approach could be to change the Xen grant-iommu in the > > > Linux kernel to work with a grant vIOMMU in QEMU. > > > Linux could explicitly ask QEMU's grant vIOMMU to map/unmap granted regions. > > > This would have the benefit that we wouldn't need to allocate > > > address-bit 63 for grants. > > > A drawback is that it may be slower since we're going to need to > > > bounce between guest/qemu a bit more. > > > > It would be a _lot_ slower, unless virtio-iommu and grants are both modified > > to match. I have looked into that, but the needed effort is rather large. At > > the last Xen summit I have suggested to introduce a new grant format which > > would work more like a normal page table structure. Using the same format > > for virtio-iommu would allow to avoid the additional bounces between qemu and > > the guest (and in fact that was one of the motivations to suggest the new > > grant format). > > I have a better picture now, thanks both. > > It really looks like an vIOMMU already to me, perhaps with a special refID > mapping playing similar roles as IOVAs in the rest IOMMU worlds. > > I can't yet tell what's the best way for Xen - as of now QEMU's memory API > does provide such translations via IOMMUMemoryRegionClass.translate(), but > only from that. So far it works for all vIOMMU emulations in QEMU, and I'd > hope we don't need to hack another memory type if possible for this, > especially if for performance's sake; more on below. > > QEMU also suffers from similar issues with other kind of DMA protections, > at least that's what I'm aware with using either VT-d, SMMU, etc.. where > dynamic DMA mappings will slow the IOs down to a degree that it may not be > usable in real production. We kept it like that and so far AFAIK we don't > yet have a solution for that simply because of the nature on how DMA > buffers are mapped and managed within a guest OS no matter Linux or not. > > For Linux as a guest we basically suggest enabling iommu=pt so that kernel > drivers are trusted, and kernel driven devices can have full access to > guest RAMs by using the vIOMMU's passthrough mode. Perhaps similar to > foreign mappings for Xen, but maybe still different, as Xen's topology is > definitely special as a hypervisor here. > > While for userspace drivers within the guest OS it'll always go through > vfio-pci now, which will enforce effective DMA mappings not the passthrough > mode. Then it's suggested to only map as less as possible, e.g. DPDK only > maps at the start of the user driver so it's mostly not affected by the > slowness of frequently changing DMA mappings. > > I'm not sure whether above ideas would even be applicable for Xen, but I > just to share the status quo regarding to how we manage protected DMAs when > without Xen, just in case there's anything useful to help route the path > forward. > > Thanks, > > -- > Peter Xu > Thanks for the suggestions Peter and for your comments Jurgen. 
We'll have to evaluate the different approaches a little more and see where we go from here. Best regards, Edgar
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 8626a355b3..9f7dfe59c7 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -282,6 +282,27 @@ struct MemoryRegionOps {
                                     unsigned size,
                                     MemTxAttrs attrs);

+    /*
+     * Dynamically create mapping. @addr is the guest address to map; @plen
+     * is the pointer to the usable length of the buffer.
+     * @mr contents can be changed in case a new memory region is created for
+     * the mapping.
+     * Returns the buffer address for accessing the data.
+     */
+    void *(*map)(MemoryRegion **mr,
+                 hwaddr addr,
+                 hwaddr *plen,
+                 bool is_write,
+                 MemTxAttrs attrs);
+
+    /* Unmap an area obtained via map() before. */
+    void (*unmap)(MemoryRegion *mr,
+                  void *buffer,
+                  ram_addr_t addr,
+                  hwaddr len,
+                  bool is_write,
+                  hwaddr access_len);
+
     enum device_endian endianness;
     /* Guest-visible constraints: */
     struct {
diff --git a/system/physmem.c b/system/physmem.c
index 949dcb20ba..d989e9fc1f 100644
--- a/system/physmem.c
+++ b/system/physmem.c
@@ -3141,6 +3141,7 @@ void *address_space_map(AddressSpace *as,
     hwaddr len = *plen;
     hwaddr l, xlat;
     MemoryRegion *mr;
+    void *ptr = NULL;
     FlatView *fv;

     if (len == 0) {
@@ -3174,12 +3175,20 @@ void *address_space_map(AddressSpace *as,
         return bounce.buffer;
     }

-
     memory_region_ref(mr);
+
+    if (mr->ops && mr->ops->map) {
+        ptr = mr->ops->map(&mr, addr, plen, is_write, attrs);
+    }
+
     *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
                                         l, is_write, attrs);
     fuzz_dma_read_cb(addr, *plen, mr);
-    return qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
+    if (ptr == NULL) {
+        ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true);
+    }
+
+    return ptr;
 }

 /* Unmaps a memory region previously mapped by address_space_map().
@@ -3195,11 +3204,16 @@ void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,

         mr = memory_region_from_host(buffer, &addr1);
         assert(mr != NULL);
-        if (is_write) {
-            invalidate_and_set_dirty(mr, addr1, access_len);
-        }
-        if (xen_enabled()) {
-            xen_invalidate_map_cache_entry(buffer);
+
+        if (mr->ops && mr->ops->unmap) {
+            mr->ops->unmap(mr, buffer, addr1, len, is_write, access_len);
+        } else {
+            if (is_write) {
+                invalidate_and_set_dirty(mr, addr1, access_len);
+            }
+            if (xen_enabled()) {
+                xen_invalidate_map_cache_entry(buffer);
+            }
         }
         memory_region_unref(mr);
         return;
@@ -3272,10 +3286,18 @@ int64_t address_space_cache_init(MemoryRegionCache *cache,
          * doing this if we found actual RAM, which behaves the same
          * regardless of attributes; so UNSPECIFIED is fine.
          */
+        if (mr->ops && mr->ops->map) {
+            cache->ptr = mr->ops->map(&mr, addr, &l, is_write,
+                                      MEMTXATTRS_UNSPECIFIED);
+        }
+
         l = flatview_extend_translation(cache->fv, addr, len, mr,
                                         cache->xlat, l, is_write,
                                         MEMTXATTRS_UNSPECIFIED);
-        cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l, true);
+        if (!cache->ptr) {
+            cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l,
+                                             true);
+        }
     } else {
         cache->ptr = NULL;
     }
@@ -3297,14 +3319,20 @@ void address_space_cache_invalidate(MemoryRegionCache *cache,

 void address_space_cache_destroy(MemoryRegionCache *cache)
 {
-    if (!cache->mrs.mr) {
+    MemoryRegion *mr = cache->mrs.mr;
+
+    if (!mr) {
         return;
     }

-    if (xen_enabled()) {
+    if (mr->ops && mr->ops->unmap) {
+        mr->ops->unmap(mr, cache->ptr, cache->xlat, cache->len,
+                       cache->is_write, cache->len);
+    } else if (xen_enabled()) {
         xen_invalidate_map_cache_entry(cache->ptr);
     }
-    memory_region_unref(cache->mrs.mr);
+
+    memory_region_unref(mr);
     flatview_unref(cache->fv);
     cache->mrs.mr = NULL;
     cache->fv = NULL;
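To make the intent of the two new callbacks concrete, here is a sketch of what a grant-mapping backend might plug into them. The libxengnttab calls (xengnttab_map_domain_grant_refs() and xengnttab_unmap()) are the real library API; everything else, including the bit-63 tagging of grant addresses mentioned in the thread, the page-aligned-address simplification and the global handle/domid variables, is an assumption for illustration rather than the actual QEMU implementation.

#include "qemu/osdep.h"
#include "exec/memory.h"
#include <sys/mman.h>
#include <xengnttab.h>

#define XEN_GRANT_ADDR_OFF   (1ULL << 63)  /* grant addresses tagged in bit 63 */
#define XEN_GRANT_PAGE_SIZE  4096

static xengnttab_handle *xen_gnttab;   /* opened once with xengnttab_open() */
static uint32_t xen_grant_domid;       /* domid of the granting guest */

/* Assumes @addr and @*plen are page aligned for brevity. */
static void *xen_grant_map(MemoryRegion **mr, hwaddr addr, hwaddr *plen,
                           bool is_write, MemTxAttrs attrs)
{
    uint32_t nrefs = DIV_ROUND_UP(*plen, XEN_GRANT_PAGE_SIZE);
    g_autofree uint32_t *refs = g_new(uint32_t, nrefs);
    uint32_t first_ref = (addr & ~XEN_GRANT_ADDR_OFF) / XEN_GRANT_PAGE_SIZE;
    void *map;

    for (uint32_t i = 0; i < nrefs; i++) {
        refs[i] = first_ref + i;       /* assumes consecutive grant refs */
    }

    map = xengnttab_map_domain_grant_refs(xen_gnttab, nrefs, xen_grant_domid,
                                          refs,
                                          is_write ? PROT_READ | PROT_WRITE
                                                   : PROT_READ);
    if (!map) {
        *plen = 0;                     /* signal failure to the caller */
    }
    return map;
}

static void xen_grant_unmap(MemoryRegion *mr, void *buffer, ram_addr_t addr,
                            hwaddr len, bool is_write, hwaddr access_len)
{
    xengnttab_unmap(xen_gnttab, buffer,
                    DIV_ROUND_UP(len, XEN_GRANT_PAGE_SIZE));
}

static const MemoryRegionOps xen_grant_mr_ops = {
    .map   = xen_grant_map,
    .unmap = xen_grant_unmap,
};

Note that the map() hook is also allowed to replace *mr with a different MemoryRegion, per the new doc comment; the sketch above does not make use of that.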