Message ID | 20230405180134.16932-2-ankita@nvidia.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | Expose GPU memory as coherently CPU accessible | expand |
On Wed, 05 Apr 2023 19:01:29 +0100, <ankita@nvidia.com> wrote: > > From: Ankit Agrawal <ankita@nvidia.com> > > Each VM stores the requires pgprots for its mappings in the The VM stores *nothing*. Did you mean VMA? And even then, I can't parse this. > vma->pgprot. Based on this we can determine the desired MT_DEVICE_* > for the VMA directly, and do not have to guess based on heuristics > based on pfn_is_map_memory(). > > There are the following kinds of pgprot available to userspace and their > corresponding type: > pgprot_noncached -> MT_DEVICE_nGnRnE > pgprot_writecombine -> MT_NORMAL_NC > pgprot_device -> MT_DEVICE_nGnRE > pgprot_tagged -> MT_NORMAL_TAGGED > > Decode the relevant MT_* types in use and translate them into the > corresponding KVM_PGTABLEPROT_*: > > - MT_DEVICE_nGnRE -> KVM_PGTABLE_PROT_DEVICE_nGnRE (device) > - MT_DEVICE_nGnRnE -> KVM_PGTABLE_PROT_DEVICE_nGnRnE (noncached) > - MT_NORMAL/_TAGGED/_NC -> 0 > > The selection of 0 for the S2 KVM_PGTABLE_PROT_DEVICE_nGnRnE is based > on [2]. > > Also worth noting is the result of the stage-1 and stage-2. Ref [3] > If FWB not set, then the combination is the one that is more restrictive. > The sequence from lowest restriction to the highest: > DEVICE_nGnRnE -> DEVICE_nGnRE -> NORMAL/_TAGGED/_NC Sorry, but I can't parse this either. If you mean that the strongest memory type wins, then I agree. But what does it bring to the discussion? > If FWB is set, then stage-2 mapping type overrides the stage-1 [1]. > > This solves a problem where KVM cannot preserve the MT_NORMAL memory > type for non-struct page backed memory into the S2 mapping. Instead > the VMA creator determines the MT type and the S2 will follow it. What makes it safe? How does VFIO ensures that the memory type used is correct in all circumstances? This has to hold for *ANY* device, not just your favourite toy of the day. Nothing in this patch is horribly wrong, but the above question must be answered before we can consider any of this. > > [1] https://developer.arm.com/documentation/102376/0100/Combining-Stage-1-and-Stage-2-attributes > [2] ARMv8 reference manual: https://developer.arm.com/documentation/ddi0487/gb/ Section D5.5.3, Table D5-38 > [3] ARMv8 reference manual: https://developer.arm.com/documentation/ddi0487/gb/ Table G5-20 on page G5-6330 Please quote references that are current at the time of posting. Revision DDI0487I.a is the most recent version, so use that. > > Signed-off-by: Ankit Agrawal <ankita@nvidia.com> > --- > arch/arm64/include/asm/kvm_pgtable.h | 8 +++++--- > arch/arm64/include/asm/memory.h | 6 ++++-- > arch/arm64/kvm/hyp/pgtable.c | 16 +++++++++++----- > arch/arm64/kvm/mmu.c | 27 ++++++++++++++++++++++----- > 4 files changed, 42 insertions(+), 15 deletions(-) > > diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h > index 4cd6762bda80..d3166b6e6329 100644 > --- a/arch/arm64/include/asm/kvm_pgtable.h > +++ b/arch/arm64/include/asm/kvm_pgtable.h > @@ -150,7 +150,8 @@ enum kvm_pgtable_stage2_flags { > * @KVM_PGTABLE_PROT_X: Execute permission. > * @KVM_PGTABLE_PROT_W: Write permission. > * @KVM_PGTABLE_PROT_R: Read permission. > - * @KVM_PGTABLE_PROT_DEVICE: Device attributes. > + * @KVM_PGTABLE_PROT_DEVICE_nGnRE: Device nGnRE attributes. > + * @KVM_PGTABLE_PROT_DEVICE_nGnRnE: Device nGnRnE attributes. > * @KVM_PGTABLE_PROT_SW0: Software bit 0. > * @KVM_PGTABLE_PROT_SW1: Software bit 1. > * @KVM_PGTABLE_PROT_SW2: Software bit 2. > @@ -161,7 +162,8 @@ enum kvm_pgtable_prot { > KVM_PGTABLE_PROT_W = BIT(1), > KVM_PGTABLE_PROT_R = BIT(2), > > - KVM_PGTABLE_PROT_DEVICE = BIT(3), > + KVM_PGTABLE_PROT_DEVICE_nGnRE = BIT(3), > + KVM_PGTABLE_PROT_DEVICE_nGnRnE = BIT(4), > > KVM_PGTABLE_PROT_SW0 = BIT(55), > KVM_PGTABLE_PROT_SW1 = BIT(56), > @@ -178,7 +180,7 @@ enum kvm_pgtable_prot { > #define PAGE_HYP KVM_PGTABLE_PROT_RW > #define PAGE_HYP_EXEC (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X) > #define PAGE_HYP_RO (KVM_PGTABLE_PROT_R) > -#define PAGE_HYP_DEVICE (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE) > +#define PAGE_HYP_DEVICE (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE_nGnRE) > > typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end, > enum kvm_pgtable_prot prot); > diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h > index 78e5163836a0..4ebbc4b1ba4d 100644 > --- a/arch/arm64/include/asm/memory.h > +++ b/arch/arm64/include/asm/memory.h > @@ -147,14 +147,16 @@ > * Memory types for Stage-2 translation > */ > #define MT_S2_NORMAL 0xf > +#define MT_S2_DEVICE_nGnRnE 0x0 > #define MT_S2_DEVICE_nGnRE 0x1 > > /* > * Memory types for Stage-2 translation when ID_AA64MMFR2_EL1.FWB is 0001 > * Stage-2 enforces Normal-WB and Device-nGnRE > */ > -#define MT_S2_FWB_NORMAL 6 > -#define MT_S2_FWB_DEVICE_nGnRE 1 > +#define MT_S2_FWB_NORMAL 0x6 > +#define MT_S2_FWB_DEVICE_nGnRnE 0x0 > +#define MT_S2_FWB_DEVICE_nGnRE 0x1 Why the repainting of perfectly valid constants? What does the 0x prefix bring? Does the comment above need updating? > > #ifdef CONFIG_ARM64_4K_PAGES > #define IOREMAP_MAX_ORDER (PUD_SHIFT) > diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c > index 3d61bd3e591d..7a8238b41590 100644 > --- a/arch/arm64/kvm/hyp/pgtable.c > +++ b/arch/arm64/kvm/hyp/pgtable.c > @@ -355,7 +355,7 @@ struct hyp_map_data { > > static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) > { > - bool device = prot & KVM_PGTABLE_PROT_DEVICE; > + bool device = prot & KVM_PGTABLE_PROT_DEVICE_nGnRE; > u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL; > kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype); > u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS; > @@ -636,14 +636,20 @@ static bool stage2_has_fwb(struct kvm_pgtable *pgt) > static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot, > kvm_pte_t *ptep) > { > - bool device = prot & KVM_PGTABLE_PROT_DEVICE; > - kvm_pte_t attr = device ? KVM_S2_MEMATTR(pgt, DEVICE_nGnRE) : > - KVM_S2_MEMATTR(pgt, NORMAL); > u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS; > + kvm_pte_t attr; > + > + if (prot & KVM_PGTABLE_PROT_DEVICE_nGnRE) > + attr = KVM_S2_MEMATTR(pgt, DEVICE_nGnRE); > + else if (prot & KVM_PGTABLE_PROT_DEVICE_nGnRnE) > + attr = KVM_S2_MEMATTR(pgt, DEVICE_nGnRnE); > + else > + attr = KVM_S2_MEMATTR(pgt, NORMAL); > > if (!(prot & KVM_PGTABLE_PROT_X)) > attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; > - else if (device) > + else if (prot & KVM_PGTABLE_PROT_DEVICE_nGnRE || > + prot & KVM_PGTABLE_PROT_DEVICE_nGnRnE) Why don't you keep 'device', which is concise and readable, and simply change the way it is assigned? > return -EINVAL; > > if (prot & KVM_PGTABLE_PROT_R) > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c > index 7113587222ff..8d63aa951c33 100644 > --- a/arch/arm64/kvm/mmu.c > +++ b/arch/arm64/kvm/mmu.c > @@ -897,7 +897,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, > int ret = 0; > struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO }; > struct kvm_pgtable *pgt = kvm->arch.mmu.pgt; > - enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE | > + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE_nGnRE | > KVM_PGTABLE_PROT_R | > (writable ? KVM_PGTABLE_PROT_W : 0); > > @@ -1186,6 +1186,15 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) > return vma->vm_flags & VM_MTE_ALLOWED; > } > > +/* > + * Determine the memory region cacheability from VMA's pgprot. This > + * is used to set the stage 2 PTEs. > + */ > +static unsigned long mapping_type(pgprot_t page_prot) > +{ > + return ((pgprot_val(page_prot) & PTE_ATTRINDX_MASK) >> 2); Please use FIELD_GET() for field extraction. > +} > + > static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > struct kvm_memory_slot *memslot, unsigned long hva, > unsigned long fault_status) > @@ -1368,10 +1377,18 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, > if (exec_fault) > prot |= KVM_PGTABLE_PROT_X; > > - if (device) > - prot |= KVM_PGTABLE_PROT_DEVICE; > - else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC)) > - prot |= KVM_PGTABLE_PROT_X; > + switch (mapping_type(vma->vm_page_prot)) { > + case MT_DEVICE_nGnRE: > + prot |= KVM_PGTABLE_PROT_DEVICE_nGnRE; > + break; > + case MT_DEVICE_nGnRnE: > + prot |= KVM_PGTABLE_PROT_DEVICE_nGnRnE; > + break; Please keep the 'device' guard. It makes it easy to find the code paths that are relevant to this case. > + /* MT_NORMAL/_TAGGED/_NC */ > + default: > + if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC)) > + prot |= KVM_PGTABLE_PROT_X; > + } > > /* > * Under the premise of getting a FSC_PERM fault, we just need to relax Thanks, M.
On Wed, Apr 12, 2023 at 01:43:26PM +0100, Marc Zyngier wrote: > What makes it safe? How does VFIO ensures that the memory type used is > correct in all circumstances? This has to hold for *ANY* device, not > just your favourite toy of the day. Nothing in this patch is horribly > wrong, but the above question must be answered before we can consider > any of this. In VFIO we now have the concept of "variant drivers" which work with specific PCI IDs. The variant drivers can inject device specific knowledge into VFIO. In this series the driver injects the cachable pgprot when it creates some of the VMAs because it knows the PCI IDs it supports, parses the ACPI description, and knows for sure that the memory it puts in the cachable VMA is linked with a cache coherent interconnect. The generic vfio-pci path is not changed, so 'any device' is not relevant here. Jason
On Wed, Apr 12, 2023 at 10:01:33AM -0300, Jason Gunthorpe wrote: > On Wed, Apr 12, 2023 at 01:43:26PM +0100, Marc Zyngier wrote: > > What makes it safe? How does VFIO ensures that the memory type used is > > correct in all circumstances? This has to hold for *ANY* device, not > > just your favourite toy of the day. Nothing in this patch is horribly > > wrong, but the above question must be answered before we can consider > > any of this. > > In VFIO we now have the concept of "variant drivers" which work with > specific PCI IDs. The variant drivers can inject device specific > knowledge into VFIO. In this series the driver injects the cachable > pgprot when it creates some of the VMAs because it knows the PCI IDs > it supports, parses the ACPI description, and knows for sure that the > memory it puts in the cachable VMA is linked with a cache coherent > interconnect. > > The generic vfio-pci path is not changed, so 'any device' is not > relevant here. There were several off-list discussions, I'm trying to summarise my understanding here. This series aims to relax the VFIO mapping to cacheable and have KVM map it into the guest with the same attributes. Somewhat related past threads also tried to relax the KVM device pass-through mapping from Device_nGnRnE (pgprot_noncached) to Normal_NC (pgprot_writecombine). Those were initially using the PCIe prefetchable BAR attribute but that's not a reliable means to infer whether Normal vs Device is safe. Anyway, I think we'd need to unify these threads and come up with some common handling that can cater for various attributes required by devices/drivers. Therefore replying in this thread. Current status on arm64: vfio-pci user mapping (Qemu) of PCIe BARs is Device_nGnRnE. KVM also maps it at Stage 2 with the same attributes. The user mapping is fine since the VMM may not have detailed knowledge about how to safely map the BAR. KVM doesn't have such detailed knowledge either in the device pass-through case but for safety reasons it follows the same attributes as the user mapping. From a safety perspective, relaxing the KVM stage 2 mapping has some implications: 1. It creates multiple memory aliases with different attributes. 2. Speculative loads, unaligned accesses. 3. Potentially new uncontained errors introduced by Normal NC vs Device mappings. From the private discussions I had regarding point (3), Device doesn't really make any difference and it can be even worse. For example, Normal mappings would allow a contained error while Device would trigger an uncontained SError. So the only safe bet here is for the platform, root complex to behave properly when device pass-through is involved (e.g. PCIe to ignore writes, return 0xff on reads if there are errors). That's something Arm is working on to require in the BSA specs (base system architecture). Presumably cloud vendors allowing device pass-through already know their system well enough, no need for further policing in KVM (which it can't do properly anyway). As long as the errors are contained, point (2) becomes the responsibility of the guest, given its detailed knowledge of the device, using the appropriate attributes (whether x86 WC maps exactly onto arm64 Normal NC is the subject of a separate discussion; so far that's the assumption we took in the arm64 kernel). Regarding vfio-pci, the user mapping must remain Device_nGnRnE on arm64 to avoid unexpected speculative loads. This leaves us with (1) and since the vfio-pci user mapping must remain Device, there's a potential mismatched alias if the guest maps the corresponding BAR as Normal NC. Luckily the Arm ARM constrains the hardware behaviour here to basically allowing the Device mapping in the VMM to behave somewhat the Normal NC mapping in the guest. IOW, the VMM loses the Device guarantees (non-speculation, ordering). That's not a problem for the device since the guest already deemed such mapping to be safe. While I think it's less likely for the VMM to access the same BAR that was mapped into the guest, if we want to be on the safe side from an ABI perspective (the returned mmap() now has different memory guarantees), we could make this an opt-in. That's pretty much the VMM stating that it is ok with losing some of the Device guarantees for the vfio-pci mapping. A question here is whether we want to this to be done at the vfio-pci level, the KVM memory slot or a big knob per VMM. Or whether we need to bother with this at all (if it's just theoretical, maybe we can have a global opt-out). Going back to this series, allowing Cacheable mapping in KVM has similar implications as above. (2) and (3) would be assumed safe by the platform vendors. Regarding (1), to avoid confusion, I would only allow it if FWB (force write-back) is present so that KVM can enforce a cacheable mapping at Stage 2 if the vfio variant driver also maps it as cacheable in user space. There were some other thoughts on only allowing different attributes in KVM if the user counterpart does not have any mapping (e.g. fd-based KVM memory slots). However, this won't work well with CXL-attached memory for example where the VMM may want access to it (e.g. virtio). So I wouldn't spend more thoughts on this. The TL;DR summary of what I think we should do: a) Relax the KVM Stage 2 mapping for vfio-pci to Normal NC (pgprot_writecombine). TBD whether we need a VMM opt-in is at the vfio-pci level, KVM slot or per-VMM level. Another option is to do this by default and have a global opt-out (cmdline). I don't think the latter is such a bad idea since I find it unlikely for the VMM to also touch the same PCIe BAR _and_ expect different memory ordering guarantees than the guest device driver. b) Map the KVM stage 2 mapping as cacheable+FWB iff the VMM has a corresponding cacheable mapping. Thoughts?
On Wed, May 31, 2023 at 12:35:57PM +0100, Catalin Marinas wrote: > Current status on arm64: vfio-pci user mapping (Qemu) of PCIe BARs is > Device_nGnRnE. KVM also maps it at Stage 2 with the same attributes. The > user mapping is fine since the VMM may not have detailed knowledge about > how to safely map the BAR. KVM doesn't have such detailed knowledge > either in the device pass-through case but for safety reasons it follows > the same attributes as the user mapping. To put a fine point on this - it means the KVM VM does not emulate the same behaviors as a bare metal ARM system and this results in significant performance degradation (loss of WC) or malfunction (loss of cachable) when working with some VFIO device scenarios. > architecture). Presumably cloud vendors allowing device pass-through > already know their system well enough, no need for further policing in > KVM (which it can't do properly anyway). This also matches x86 pretty well. There is alot of platform and BIOS magic at work to make sure that VFIO is safe in general. As this is not discoverable by the OS we have no choice in the kernel but to assume the operator knows what they are doing. > While I think it's less likely for the VMM to access the same BAR that > was mapped into the guest, if we want to be on the safe side from an ABI > perspective (the returned mmap() now has different memory guarantees), > we could make this an opt-in. That's pretty much the VMM stating that it > is ok with losing some of the Device guarantees for the vfio-pci > mapping. I don't know of any use case where this matters. The VMM is already basically not allowed to touch the MMIO spaces for security reasons. The only reason it is mmap'd into the VMM at all is because this is the only way to get it into KVM. So an opt-in seems like overkill to me. > Going back to this series, allowing Cacheable mapping in KVM has similar > implications as above. (2) and (3) would be assumed safe by the platform > vendors. Regarding (1), to avoid confusion, I would only allow it if FWB > (force write-back) is present so that KVM can enforce a cacheable > mapping at Stage 2 if the vfio variant driver also maps it as cacheable > in user space. Yes, currently if FWB is not available this patch makes KVM crash :\ FWB needs to be enforced as well in this patch. > There were some other thoughts on only allowing different attributes in > KVM if the user counterpart does not have any mapping (e.g. fd-based > KVM memory slots). However, this won't work well with CXL-attached > memory for example where the VMM may want access to it (e.g. virtio). So > I wouldn't spend more thoughts on this. CXL is a different beast - struct page backed CXL memory should be force-cachable everywhere just like normal system memory (it is normal system memory). This is the kind of memory that might be mixed with virtio. However, some future CXL VFIO device memory should ideally allow the VM full flexibility to use cachable or not to properly emulate bare-metal. This approach where we pre-select some of the CXL memory as cachable is kind of a half job.. I think providing a way for KVM to take in a FD instead of a VMA for mapping memory could be interesting in general. We waste a fair amount of PTEs creating a 40GB linear map in the CPU just for KVM. Avoiding the alias mapping is a nice side effect. We'd have to do something else for the SIGBUS stuff though.. > a) Relax the KVM Stage 2 mapping for vfio-pci to Normal NC > (pgprot_writecombine). TBD whether we need a VMM opt-in is at the We are working on a tested patch for this, it isn't working yet for some reason, still debugging. > b) Map the KVM stage 2 mapping as cacheable+FWB iff the VMM has a > corresponding cacheable mapping. That is this patch, with some improvements to check FWB/etc. Thanks, Jason
On Wed, 2023-05-31 at 12:35 +0100, Catalin Marinas wrote: > There were several off-list discussions, I'm trying to summarise my > understanding here. This series aims to relax the VFIO mapping to > cacheable and have KVM map it into the guest with the same attributes. > Somewhat related past threads also tried to relax the KVM device > pass-through mapping from Device_nGnRnE (pgprot_noncached) to Normal_NC > (pgprot_writecombine). Those were initially using the PCIe prefetchable > BAR attribute but that's not a reliable means to infer whether Normal vs > Device is safe. Anyway, I think we'd need to unify these threads and > come up with some common handling that can cater for various attributes > required by devices/drivers. Therefore replying in this thread. So picking up on this as I was just trying to start a separate discussion on the subject for write combine :-) In this case, not so much for KVM as much as for VFIO to userspace though. The rough idea is that the "userspace driver" (ie DPDK or equivalent) for the device is the one to "know" wether a BAR or portion of a BAR can/should be mapped write-combine, and is expected to also "know" what to do to enforce ordering when necessary. So the userspace component needs to be responsible for selecting the mapping, the same way using the PCI sysfs resource files today allows to do that by selecting the _wc variant. I posted a separate message that Lorenzo CCed back to some of you, but let's recap here to keep the discussion localized. I don't know how much of this makes sense for KVM, but I think what we really want is for userspace to be able to specify some "attributes" (which we can initially limit to writecombine, full cachability probably requires a device specific kernel driver providing adequate authority, separate discussion in any case), for all or a portion of a BAR mapping. The easy way is an ioctl to affect the attributes of the next mmap but it's a rather gross interface. A better approach (still requires some coordination but not nearly as bad) would be to have an ioctl to create "subregions", ie, dynamically add new "struct vfio_pci_region" (using the existing dynamic index API), which are children of existing regions (including real BARs) and provide different attributes, which mmap can then honor. This is particularly suited for the case (which used to exist, I don't know if it still does) where the buffer that wants write combining reside in the same BAR as registers that otherwise don't. A simpler compromise if that latter case is deemed irrelevant would be an ioctl to selectively set a region index (including BARs) to be WC prior to mmap. I don't know if that fits in the ideas you have for KVM, I think it could by having the userspace component require mappings using a "special" attribute which we could define as being the most relaxed allowed to pass to a VM, which can then be architecture defined. The guest can then enforce specifics. Does this make sense ? Cheers Ben.
Hi Ben, On Fri, Jul 14, 2023 at 06:10:39PM +1000, Benjamin Herrenschmidt wrote: > On Wed, 2023-05-31 at 12:35 +0100, Catalin Marinas wrote: > > There were several off-list discussions, I'm trying to summarise my > > understanding here. This series aims to relax the VFIO mapping to > > cacheable and have KVM map it into the guest with the same attributes. > > Somewhat related past threads also tried to relax the KVM device > > pass-through mapping from Device_nGnRnE (pgprot_noncached) to Normal_NC > > (pgprot_writecombine). Those were initially using the PCIe prefetchable > > BAR attribute but that's not a reliable means to infer whether Normal vs > > Device is safe. Anyway, I think we'd need to unify these threads and > > come up with some common handling that can cater for various attributes > > required by devices/drivers. Therefore replying in this thread. > > So picking up on this as I was just trying to start a separate > discussion on the subject for write combine :-) Basically this thread started as a fix/improvement for KVM by mimicking the VFIO user mapping attributes at the guest but the conclusion we came to is that the VFIO PCIe driver cannot reliably tell when WC is possible. > In this case, not so much for KVM as much as for VFIO to userspace > though. > > The rough idea is that the "userspace driver" (ie DPDK or equivalent) > for the device is the one to "know" wether a BAR or portion of a BAR > can/should be mapped write-combine, and is expected to also "know" > what to do to enforce ordering when necessary. I agree in principle. On the KVM side we concluded that it's the guest driver that knows the attributes, so the hypervisor should not restrict them. In the DPDK case, it would be the user driver that knows the device it is mapping and the required attributes. In terms of security for arm64 at least, Device vs Normal NC (or nc vs wc in Linux terminology) doesn't make much difference with the former occasionally being worse. The kernel would probably trust the DPDK code if it allows direct device access. > So the userspace component needs to be responsible for selecting the > mapping, the same way using the PCI sysfs resource files today allows > to do that by selecting the _wc variant. I guess the sysfs interface is just trying to work around the VFIO limitations. > I don't know how much of this makes sense for KVM, but I think what we > really want is for userspace to be able to specify some "attributes" > (which we can initially limit to writecombine, full cachability > probably requires a device specific kernel driver providing adequate > authority, separate discussion in any case), for all or a portion of a > BAR mapping. For KVM, at least the WC case, user-space doesn't need to be involved as it normally should not access the same BAR concurrently with the guest. But at some point, for CXL-attached memory for example, it may need to be able to map it as cacheable so that it has the same attributes as the guest. > The easy way is an ioctl to affect the attributes of the next mmap but > it's a rather gross interface. > > A better approach (still requires some coordination but not nearly as > bad) would be to have an ioctl to create "subregions", ie, dynamically > add new "struct vfio_pci_region" (using the existing dynamic index > API), which are children of existing regions (including real BARs) and > provide different attributes, which mmap can then honor. > > This is particularly suited for the case (which used to exist, I don't > know if it still does) where the buffer that wants write combining > reside in the same BAR as registers that otherwise don't. IIUC that's still the case for some devices (I think Jason mentioned some Mellanox cards). > A simpler compromise if that latter case is deemed irrelevant would be > an ioctl to selectively set a region index (including BARs) to be WC > prior to mmap. > > I don't know if that fits in the ideas you have for KVM, I think it > could by having the userspace component require mappings using a > "special" attribute which we could define as being the most relaxed > allowed to pass to a VM, which can then be architecture defined. The > guest can then enforce specifics. Does this make sense ? I think this interface would help KVM when we'll need a cacheable mapping. For WC, we are ok without any VFIO changes.
On Sun, Jul 16, 2023 at 08:09:02AM -0700, Catalin Marinas wrote: > In terms of security for arm64 at least, Device vs Normal NC (or nc vs > wc in Linux terminology) doesn't make much difference with the former > occasionally being worse. The kernel would probably trust the DPDK code > if it allows direct device access. RDMA and DRM already allow device drivers to map WC to userspace on demand, we expect the platform to support this. > > So the userspace component needs to be responsible for selecting the > > mapping, the same way using the PCI sysfs resource files today allows > > to do that by selecting the _wc variant. > > I guess the sysfs interface is just trying to work around the VFIO > limitations. I think just nobody has ever asked for VFIO WC support. The main non-VM user is DPDK and none of the NIC drivers have wanted this (DPDK applications areis more of throughput than latency focused typically) > > This is particularly suited for the case (which used to exist, I don't > > know if it still does) where the buffer that wants write combining > > reside in the same BAR as registers that otherwise don't. > > IIUC that's still the case for some devices (I think Jason mentioned > some Mellanox cards). Right, VFIO will have to allow it page-by-page > I think this interface would help KVM when we'll need a cacheable > mapping. For WC, we are ok without any VFIO changes. Yes, it may be interesting to map cachable CXL memory as NORMAL_NC into userspace for similar reasons. Jason
On Sun, 16 Jul 2023 19:30:23 -0300 Jason Gunthorpe <jgg@nvidia.com> wrote: > On Sun, Jul 16, 2023 at 08:09:02AM -0700, Catalin Marinas wrote: > > > In terms of security for arm64 at least, Device vs Normal NC (or nc vs > > wc in Linux terminology) doesn't make much difference with the former > > occasionally being worse. The kernel would probably trust the DPDK code > > if it allows direct device access. > > RDMA and DRM already allow device drivers to map WC to userspace on > demand, we expect the platform to support this. > > > > So the userspace component needs to be responsible for selecting the > > > mapping, the same way using the PCI sysfs resource files today allows > > > to do that by selecting the _wc variant. > > > > I guess the sysfs interface is just trying to work around the VFIO > > limitations. > > I think just nobody has ever asked for VFIO WC support. The main > non-VM user is DPDK and none of the NIC drivers have wanted this (DPDK > applications areis more of throughput than latency focused typically) Yes, QEMU can't know whether the device or driver want a WC BAR mapping, so we've left it for KVM manipulation relative to VM use cases. Nobody has followed through with a complete proposal to enable it otherwise for direct userspace driver access, but I don't think there's opposition to providing such a thing. Thanks, Alex > > > This is particularly suited for the case (which used to exist, I don't > > > know if it still does) where the buffer that wants write combining > > > reside in the same BAR as registers that otherwise don't. > > > > IIUC that's still the case for some devices (I think Jason mentioned > > some Mellanox cards). > > Right, VFIO will have to allow it page-by-page > > > I think this interface would help KVM when we'll need a cacheable > > mapping. For WC, we are ok without any VFIO changes. > > Yes, it may be interesting to map cachable CXL memory as NORMAL_NC > into userspace for similar reasons. > > Jason >
On Mon, 2023-07-17 at 12:35 -0600, Alex Williamson wrote: > On Sun, 16 Jul 2023 19:30:23 -0300 > Jason Gunthorpe <jgg@nvidia.com> wrote: > > > On Sun, Jul 16, 2023 at 08:09:02AM -0700, Catalin Marinas wrote: > > > > > In terms of security for arm64 at least, Device vs Normal NC (or nc vs > > > wc in Linux terminology) doesn't make much difference with the former > > > occasionally being worse. The kernel would probably trust the DPDK code > > > if it allows direct device access. > > > > RDMA and DRM already allow device drivers to map WC to userspace on > > demand, we expect the platform to support this. > > > > > > So the userspace component needs to be responsible for selecting the > > > > mapping, the same way using the PCI sysfs resource files today allows > > > > to do that by selecting the _wc variant. > > > > > > I guess the sysfs interface is just trying to work around the VFIO > > > limitations. > > > > I think just nobody has ever asked for VFIO WC support. The main > > non-VM user is DPDK and none of the NIC drivers have wanted this (DPDK > > applications areis more of throughput than latency focused typically) > > Yes, QEMU can't know whether the device or driver want a WC BAR > mapping, so we've left it for KVM manipulation relative to VM use > cases. Nobody has followed through with a complete proposal to enable > it otherwise for direct userspace driver access, but I don't think > there's opposition to providing such a thing. Thanks, Ok, this is really backburner work for me but I'll try to cook up a POC patch in the near (hopefully) future along the lines of the subregions I proposed and we can discuss from there. Cheers, Ben.
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 4cd6762bda80..d3166b6e6329 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -150,7 +150,8 @@ enum kvm_pgtable_stage2_flags { * @KVM_PGTABLE_PROT_X: Execute permission. * @KVM_PGTABLE_PROT_W: Write permission. * @KVM_PGTABLE_PROT_R: Read permission. - * @KVM_PGTABLE_PROT_DEVICE: Device attributes. + * @KVM_PGTABLE_PROT_DEVICE_nGnRE: Device nGnRE attributes. + * @KVM_PGTABLE_PROT_DEVICE_nGnRnE: Device nGnRnE attributes. * @KVM_PGTABLE_PROT_SW0: Software bit 0. * @KVM_PGTABLE_PROT_SW1: Software bit 1. * @KVM_PGTABLE_PROT_SW2: Software bit 2. @@ -161,7 +162,8 @@ enum kvm_pgtable_prot { KVM_PGTABLE_PROT_W = BIT(1), KVM_PGTABLE_PROT_R = BIT(2), - KVM_PGTABLE_PROT_DEVICE = BIT(3), + KVM_PGTABLE_PROT_DEVICE_nGnRE = BIT(3), + KVM_PGTABLE_PROT_DEVICE_nGnRnE = BIT(4), KVM_PGTABLE_PROT_SW0 = BIT(55), KVM_PGTABLE_PROT_SW1 = BIT(56), @@ -178,7 +180,7 @@ enum kvm_pgtable_prot { #define PAGE_HYP KVM_PGTABLE_PROT_RW #define PAGE_HYP_EXEC (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X) #define PAGE_HYP_RO (KVM_PGTABLE_PROT_R) -#define PAGE_HYP_DEVICE (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE) +#define PAGE_HYP_DEVICE (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE_nGnRE) typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end, enum kvm_pgtable_prot prot); diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 78e5163836a0..4ebbc4b1ba4d 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -147,14 +147,16 @@ * Memory types for Stage-2 translation */ #define MT_S2_NORMAL 0xf +#define MT_S2_DEVICE_nGnRnE 0x0 #define MT_S2_DEVICE_nGnRE 0x1 /* * Memory types for Stage-2 translation when ID_AA64MMFR2_EL1.FWB is 0001 * Stage-2 enforces Normal-WB and Device-nGnRE */ -#define MT_S2_FWB_NORMAL 6 -#define MT_S2_FWB_DEVICE_nGnRE 1 +#define MT_S2_FWB_NORMAL 0x6 +#define MT_S2_FWB_DEVICE_nGnRnE 0x0 +#define MT_S2_FWB_DEVICE_nGnRE 0x1 #ifdef CONFIG_ARM64_4K_PAGES #define IOREMAP_MAX_ORDER (PUD_SHIFT) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 3d61bd3e591d..7a8238b41590 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -355,7 +355,7 @@ struct hyp_map_data { static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) { - bool device = prot & KVM_PGTABLE_PROT_DEVICE; + bool device = prot & KVM_PGTABLE_PROT_DEVICE_nGnRE; u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL; kvm_pte_t attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype); u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS; @@ -636,14 +636,20 @@ static bool stage2_has_fwb(struct kvm_pgtable *pgt) static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot, kvm_pte_t *ptep) { - bool device = prot & KVM_PGTABLE_PROT_DEVICE; - kvm_pte_t attr = device ? KVM_S2_MEMATTR(pgt, DEVICE_nGnRE) : - KVM_S2_MEMATTR(pgt, NORMAL); u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS; + kvm_pte_t attr; + + if (prot & KVM_PGTABLE_PROT_DEVICE_nGnRE) + attr = KVM_S2_MEMATTR(pgt, DEVICE_nGnRE); + else if (prot & KVM_PGTABLE_PROT_DEVICE_nGnRnE) + attr = KVM_S2_MEMATTR(pgt, DEVICE_nGnRnE); + else + attr = KVM_S2_MEMATTR(pgt, NORMAL); if (!(prot & KVM_PGTABLE_PROT_X)) attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; - else if (device) + else if (prot & KVM_PGTABLE_PROT_DEVICE_nGnRE || + prot & KVM_PGTABLE_PROT_DEVICE_nGnRnE) return -EINVAL; if (prot & KVM_PGTABLE_PROT_R) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 7113587222ff..8d63aa951c33 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -897,7 +897,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, int ret = 0; struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO }; struct kvm_pgtable *pgt = kvm->arch.mmu.pgt; - enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE | + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE_nGnRE | KVM_PGTABLE_PROT_R | (writable ? KVM_PGTABLE_PROT_W : 0); @@ -1186,6 +1186,15 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) return vma->vm_flags & VM_MTE_ALLOWED; } +/* + * Determine the memory region cacheability from VMA's pgprot. This + * is used to set the stage 2 PTEs. + */ +static unsigned long mapping_type(pgprot_t page_prot) +{ + return ((pgprot_val(page_prot) & PTE_ATTRINDX_MASK) >> 2); +} + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_memory_slot *memslot, unsigned long hva, unsigned long fault_status) @@ -1368,10 +1377,18 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (exec_fault) prot |= KVM_PGTABLE_PROT_X; - if (device) - prot |= KVM_PGTABLE_PROT_DEVICE; - else if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC)) - prot |= KVM_PGTABLE_PROT_X; + switch (mapping_type(vma->vm_page_prot)) { + case MT_DEVICE_nGnRE: + prot |= KVM_PGTABLE_PROT_DEVICE_nGnRE; + break; + case MT_DEVICE_nGnRnE: + prot |= KVM_PGTABLE_PROT_DEVICE_nGnRnE; + break; + /* MT_NORMAL/_TAGGED/_NC */ + default: + if (cpus_have_const_cap(ARM64_HAS_CACHE_DIC)) + prot |= KVM_PGTABLE_PROT_X; + } /* * Under the premise of getting a FSC_PERM fault, we just need to relax