Message ID | 1234443038-15437-7-git-send-email-yu.zhao@intel.com (mailing list archive) |
---|---|
State | Accepted, archived |
Headers | show |
On Thu, Feb 12, 2009 at 08:50:38PM +0800, Yu Zhao wrote: > Support device IOTLB (i.e. ATS) for both native and KVM environments. > > Signed-off-by: Yu Zhao <yu.zhao@intel.com> > --- > drivers/pci/intel-iommu.c | 95 +++++++++++++++++++++++++++++++++++++++++- > include/linux/intel-iommu.h | 1 + > 2 files changed, 93 insertions(+), 3 deletions(-) > > diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c > index 5fdbed3..fe09e7a 100644 > --- a/drivers/pci/intel-iommu.c > +++ b/drivers/pci/intel-iommu.c > @@ -125,6 +125,7 @@ static inline void context_set_fault_enable(struct context_entry *context) > } > > #define CONTEXT_TT_MULTI_LEVEL 0 > +#define CONTEXT_TT_DEV_IOTLB 1 > > static inline void context_set_translation_type(struct context_entry *context, > unsigned long value) > @@ -240,6 +241,7 @@ struct device_domain_info { > struct list_head global; /* link to global list */ > u8 bus; /* PCI bus numer */ > u8 devfn; /* PCI devfn number */ > + struct intel_iommu *iommu; /* IOMMU used by this device */ > struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */ > struct dmar_domain *domain; /* pointer to domain */ > }; > @@ -922,6 +924,74 @@ static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, > return 0; > } > > +static struct device_domain_info * > +iommu_support_dev_iotlb(struct dmar_domain *domain, u8 bus, u8 devfn) > +{ > + int found = 0; > + unsigned long flags; > + struct device_domain_info *info; > + struct intel_iommu *iommu = device_to_iommu(bus, devfn); > + > + if (!ecap_dev_iotlb_support(iommu->ecap)) > + return NULL; > + > + if (!iommu->qi) > + return NULL; > + > + spin_lock_irqsave(&device_domain_lock, flags); > + list_for_each_entry(info, &domain->devices, link) > + if (info->dev && info->bus == bus && info->devfn == devfn) { > + found = 1; > + break; > + } > + spin_unlock_irqrestore(&device_domain_lock, flags); > + > + if (!found) > + return NULL; > + > + if (!dmar_find_matched_atsr_unit(info->dev)) > + return NULL; > + > + 
info->iommu = iommu; > + if (!pci_ats_queue_depth(info->dev)) > + return NULL; > + > + return info; > +} > + > +static void iommu_enable_dev_iotlb(struct device_domain_info *info) > +{ > + pci_enable_ats(info->dev, VTD_PAGE_SHIFT); > +} Why is a static function defined that calls a global function? > + > +static void iommu_disable_dev_iotlb(struct device_domain_info *info) > +{ > + if (info->dev && pci_ats_enabled(info->dev)) > + pci_disable_ats(info->dev); > +} ditto. pci_disable_ats() should be able to handle the case when "info->dev" is NULL and will know if ATS is enabled. I think both of these functions can be dropped and just directly call pci_*_ats(). > + > +static void iommu_flush_dev_iotlb(struct dmar_domain *domain, > + u64 addr, unsigned mask) > +{ > + int rc; > + u16 sid, qdep; > + unsigned long flags; > + struct device_domain_info *info; > + > + spin_lock_irqsave(&device_domain_lock, flags); > + list_for_each_entry(info, &domain->devices, link) { Would it be possible to define a single domain for each PCI device? Or does "domain" represent an IOMMU? Sorry, I forgot...I'm sure someone has mentioned this in the past. I want to point out list_for_each_entry() is effectively a "nested loop". iommu_flush_dev_iotlb() will get called a lot from flush_unmaps(). Perhaps do the lookup once there and pass that as a parameter? I don't know if that is feasible. But if this is a very frequently used code path, every CPU cycle counts. > + if (!info->dev || !pci_ats_enabled(info->dev)) > + continue; > + > + sid = info->bus << 8 | info->devfn; > + qdep = pci_ats_queue_depth(info->dev); re Matthew Wilcox's comment - looks like caching ats_queue_depth is appropriate. thanks, grant > + rc = qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask); > + if (rc) > + printk(KERN_ERR "IOMMU: flush device IOTLB failed\n"); Can this be a dev_printk please? 
Perhaps in general review the use of "printk" so when errors are reported, users will know which devices might be affected by the failure. If more than a few printk's should be "converted" to dev_printk(), I'd be happy if that were a separate patch (submitted later). > + } > + spin_unlock_irqrestore(&device_domain_lock, flags); > +} > + > static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, > u64 addr, unsigned int pages, int non_present_entry_flush) > { > @@ -945,6 +1015,9 @@ static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, > rc = iommu->flush.flush_iotlb(iommu, did, addr, mask, > DMA_TLB_PSI_FLUSH, > non_present_entry_flush); > + if (!rc && !non_present_entry_flush) > + iommu_flush_dev_iotlb(iommu->domains[did], addr, mask); > + > return rc; > } > > @@ -1469,6 +1542,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain, > unsigned long ndomains; > int id; > int agaw; > + struct device_domain_info *info; > > pr_debug("Set context mapping for %02x:%02x.%d\n", > bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); > @@ -1534,7 +1608,11 @@ static int domain_context_mapping_one(struct dmar_domain *domain, > context_set_domain_id(context, id); > context_set_address_width(context, iommu->agaw); > context_set_address_root(context, virt_to_phys(pgd)); > - context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL); > + info = iommu_support_dev_iotlb(domain, bus, devfn); > + if (info) > + context_set_translation_type(context, CONTEXT_TT_DEV_IOTLB); > + else > + context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL); Would it be ok to rewrite this as: + context_set_translation_type(context, + info ? 
CONTEXT_TT_DEV_IOTLB : CONTEXT_TT_MULTI_LEVEL); > context_set_fault_enable(context); > context_set_present(context); > domain_flush_cache(domain, context, sizeof(*context)); > @@ -1546,6 +1624,8 @@ static int domain_context_mapping_one(struct dmar_domain *domain, > iommu_flush_write_buffer(iommu); > else > iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0); Adding a blank line here would make this more readable. (AFAIK, not required by coding style, just my opinion.) > + if (info) > + iommu_enable_dev_iotlb(info); Could iommu_enable_dev_iotlb() (or pci_enable_ats()) check if "info" is NULL? Then this would just be a simple function call. And it would be consistent with usage of iommu_disable_dev_iotlb(). thanks, grant > > spin_unlock_irqrestore(&iommu->lock, flags); > > @@ -1687,6 +1767,7 @@ static void domain_remove_dev_info(struct dmar_domain *domain) > info->dev->dev.archdata.iommu = NULL; > spin_unlock_irqrestore(&device_domain_lock, flags); > > + iommu_disable_dev_iotlb(info); > iommu = device_to_iommu(info->bus, info->devfn); > iommu_detach_dev(iommu, info->bus, info->devfn); > free_devinfo_mem(info); > @@ -2304,8 +2385,14 @@ static void flush_unmaps(void) > iommu->flush.flush_iotlb(iommu, 0, 0, 0, > DMA_TLB_GLOBAL_FLUSH, 0); > for (j = 0; j < deferred_flush[i].next; j++) { > - __free_iova(&deferred_flush[i].domain[j]->iovad, > - deferred_flush[i].iova[j]); > + unsigned long mask; > + struct iova *iova = deferred_flush[i].iova[j]; > + > + mask = (iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT; > + mask = ilog2(mask >> VTD_PAGE_SHIFT); > + iommu_flush_dev_iotlb(deferred_flush[i].domain[j], > + iova->pfn_lo << PAGE_SHIFT, mask); > + __free_iova(&deferred_flush[i].domain[j]->iovad, iova); > } > deferred_flush[i].next = 0; > } > @@ -2792,6 +2879,7 @@ static void vm_domain_remove_one_dev_info(struct dmar_domain *domain, > info->dev->dev.archdata.iommu = NULL; > spin_unlock_irqrestore(&device_domain_lock, flags); > > + iommu_disable_dev_iotlb(info); > 
iommu_detach_dev(iommu, info->bus, info->devfn); > free_devinfo_mem(info); > > @@ -2840,6 +2928,7 @@ static void vm_domain_remove_all_dev_info(struct dmar_domain *domain) > > spin_unlock_irqrestore(&device_domain_lock, flags1); > > + iommu_disable_dev_iotlb(info); > iommu = device_to_iommu(info->bus, info->devfn); > iommu_detach_dev(iommu, info->bus, info->devfn); > > diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h > index d82bdac..609af82 100644 > --- a/include/linux/intel-iommu.h > +++ b/include/linux/intel-iommu.h > @@ -123,6 +123,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val) > #define ecap_qis(e) ((e) & 0x2) > #define ecap_eim_support(e) ((e >> 4) & 0x1) > #define ecap_ir_support(e) ((e >> 3) & 0x1) > +#define ecap_dev_iotlb_support(e) (((e) >> 2) & 0x1) > #define ecap_max_handle_mask(e) ((e >> 20) & 0xf) > > > -- > 1.6.1 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-pci" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Sun, Feb 15, 2009 at 07:20:52AM +0800, Grant Grundler wrote: > On Thu, Feb 12, 2009 at 08:50:38PM +0800, Yu Zhao wrote: > > Support device IOTLB (i.e. ATS) for both native and KVM environments. > > + > > +static void iommu_enable_dev_iotlb(struct device_domain_info *info) > > +{ > > + pci_enable_ats(info->dev, VTD_PAGE_SHIFT); > > +} > > Why is a static function defined that calls a global function? There would be some extra steps to do before VT-d enables ATS in the future, so this wrapper makes code expandable later. > > > + > > +static void iommu_disable_dev_iotlb(struct device_domain_info *info) > > +{ > > + if (info->dev && pci_ats_enabled(info->dev)) > > + pci_disable_ats(info->dev); > > +} > > ditto. pci_disable_ats() should be able to handle the case when > "info->dev" is NULL and will know if ATS is enabled. The info->dev could be NULL only because the VT-d code makes it so. AMD and IBM IOMMU may not have this requirement. If we make pci_disable_ats() accept NULL pci_dev, it would fail to catch some errors like using pci_disable_ats without calling pci_enable_ats before. > > I think both of these functions can be dropped and just directly call pci_*_ats(). > > > + > > +static void iommu_flush_dev_iotlb(struct dmar_domain *domain, > > + u64 addr, unsigned mask) > > +{ > > + int rc; > > + u16 sid, qdep; > > + unsigned long flags; > > + struct device_domain_info *info; > > + > > + spin_lock_irqsave(&device_domain_lock, flags); > > + list_for_each_entry(info, &domain->devices, link) { > > Would it be possible to define a single domain for each PCI device? > Or does "domain" represent an IOMMU? > Sorry, I forgot...I'm sure someone has mentioned this the past. A domain represents one translation mapping. For devices used by the host, there is one domain per device. Devices assigned to a guest share one domain. > > I want to point out list_for_each_entry() is effectively a "nested loop". > iommu_flush_dev_iotlb() will get called alot from flush_unmaps(). 
> Perhaps do the lookup once there and pass that as a parameter? > I don't know if that is feasible. But if this is a very frequently > used code path, every CPU cycle counts. iommu_flush_dev_iotlb() is only used to flush the devices used in the host, which means there is always one entry on the list. > > > > + if (!info->dev || !pci_ats_enabled(info->dev)) > > + continue; > > + > > + sid = info->bus << 8 | info->devfn; > > + qdep = pci_ats_queue_depth(info->dev); > > re Matthew Wilcox's comment - looks like caching ats_queue_depth > is appropriate. Yes, it's cached as of v3. > > + rc = qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask); > > + if (rc) > > + printk(KERN_ERR "IOMMU: flush device IOTLB failed\n"); > > Can this be a dev_printk please? Yes, will replace it with dev_err(). > Perhaps in general review the use of "printk" so when errors are reported, > users will know which devices might be affected by the failure. > If more than a few printk's should be "converted" to dev_printk(), I'd > be happy if that were a seperate patch (submitted later). > > > > pr_debug("Set context mapping for %02x:%02x.%d\n", > > bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); > > @@ -1534,7 +1608,11 @@ static int domain_context_mapping_one(struct dmar_domain *domain, > > context_set_domain_id(context, id); > > context_set_address_width(context, iommu->agaw); > > context_set_address_root(context, virt_to_phys(pgd)); > > - context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL); > > + info = iommu_support_dev_iotlb(domain, bus, devfn); > > + if (info) > > + context_set_translation_type(context, CONTEXT_TT_DEV_IOTLB); > > + else > > + context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL); > > Would it be ok to rewrite this as: > + context_set_translation_type(context, > + info ? CONTEXT_TT_DEV_IOTLB : CONTEXT_TT_MULTI_LEVEL); Yes, this one looks better. 
> > > context_set_fault_enable(context); > > context_set_present(context); > > domain_flush_cache(domain, context, sizeof(*context)); > > @@ -1546,6 +1624,8 @@ static int domain_context_mapping_one(struct dmar_domain *domain, > > iommu_flush_write_buffer(iommu); > > else > > iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0); > > Adding a blank line here would make this more readable. > (AFAIK, not required by coding style, just my opinion.) Yes, I prefer a blank line here too, somehow I missed it. > > > + if (info) > > + iommu_enable_dev_iotlb(info); > > Could iommu_enable_dev_iotlb() (or pci_enable_ats()) check if "info" is NULL? > Then this would just be a simple function call. > > And it would be consistent with usage of iommu_disable_dev_iotlb(). Yes, good idea. Thanks a lot for reviewing it! Yu -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 5fdbed3..fe09e7a 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -125,6 +125,7 @@ static inline void context_set_fault_enable(struct context_entry *context) } #define CONTEXT_TT_MULTI_LEVEL 0 +#define CONTEXT_TT_DEV_IOTLB 1 static inline void context_set_translation_type(struct context_entry *context, unsigned long value) @@ -240,6 +241,7 @@ struct device_domain_info { struct list_head global; /* link to global list */ u8 bus; /* PCI bus numer */ u8 devfn; /* PCI devfn number */ + struct intel_iommu *iommu; /* IOMMU used by this device */ struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */ struct dmar_domain *domain; /* pointer to domain */ }; @@ -922,6 +924,74 @@ static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, return 0; } +static struct device_domain_info * +iommu_support_dev_iotlb(struct dmar_domain *domain, u8 bus, u8 devfn) +{ + int found = 0; + unsigned long flags; + struct device_domain_info *info; + struct intel_iommu *iommu = device_to_iommu(bus, devfn); + + if (!ecap_dev_iotlb_support(iommu->ecap)) + return NULL; + + if (!iommu->qi) + return NULL; + + spin_lock_irqsave(&device_domain_lock, flags); + list_for_each_entry(info, &domain->devices, link) + if (info->dev && info->bus == bus && info->devfn == devfn) { + found = 1; + break; + } + spin_unlock_irqrestore(&device_domain_lock, flags); + + if (!found) + return NULL; + + if (!dmar_find_matched_atsr_unit(info->dev)) + return NULL; + + info->iommu = iommu; + if (!pci_ats_queue_depth(info->dev)) + return NULL; + + return info; +} + +static void iommu_enable_dev_iotlb(struct device_domain_info *info) +{ + pci_enable_ats(info->dev, VTD_PAGE_SHIFT); +} + +static void iommu_disable_dev_iotlb(struct device_domain_info *info) +{ + if (info->dev && pci_ats_enabled(info->dev)) + pci_disable_ats(info->dev); +} + +static void iommu_flush_dev_iotlb(struct dmar_domain *domain, + u64 addr, 
unsigned mask) +{ + int rc; + u16 sid, qdep; + unsigned long flags; + struct device_domain_info *info; + + spin_lock_irqsave(&device_domain_lock, flags); + list_for_each_entry(info, &domain->devices, link) { + if (!info->dev || !pci_ats_enabled(info->dev)) + continue; + + sid = info->bus << 8 | info->devfn; + qdep = pci_ats_queue_depth(info->dev); + rc = qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask); + if (rc) + printk(KERN_ERR "IOMMU: flush device IOTLB failed\n"); + } + spin_unlock_irqrestore(&device_domain_lock, flags); +} + static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, u64 addr, unsigned int pages, int non_present_entry_flush) { @@ -945,6 +1015,9 @@ static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, rc = iommu->flush.flush_iotlb(iommu, did, addr, mask, DMA_TLB_PSI_FLUSH, non_present_entry_flush); + if (!rc && !non_present_entry_flush) + iommu_flush_dev_iotlb(iommu->domains[did], addr, mask); + return rc; } @@ -1469,6 +1542,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain, unsigned long ndomains; int id; int agaw; + struct device_domain_info *info; pr_debug("Set context mapping for %02x:%02x.%d\n", bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); @@ -1534,7 +1608,11 @@ static int domain_context_mapping_one(struct dmar_domain *domain, context_set_domain_id(context, id); context_set_address_width(context, iommu->agaw); context_set_address_root(context, virt_to_phys(pgd)); - context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL); + info = iommu_support_dev_iotlb(domain, bus, devfn); + if (info) + context_set_translation_type(context, CONTEXT_TT_DEV_IOTLB); + else + context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL); context_set_fault_enable(context); context_set_present(context); domain_flush_cache(domain, context, sizeof(*context)); @@ -1546,6 +1624,8 @@ static int domain_context_mapping_one(struct dmar_domain *domain, iommu_flush_write_buffer(iommu); else 
iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0); + if (info) + iommu_enable_dev_iotlb(info); spin_unlock_irqrestore(&iommu->lock, flags); @@ -1687,6 +1767,7 @@ static void domain_remove_dev_info(struct dmar_domain *domain) info->dev->dev.archdata.iommu = NULL; spin_unlock_irqrestore(&device_domain_lock, flags); + iommu_disable_dev_iotlb(info); iommu = device_to_iommu(info->bus, info->devfn); iommu_detach_dev(iommu, info->bus, info->devfn); free_devinfo_mem(info); @@ -2304,8 +2385,14 @@ static void flush_unmaps(void) iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH, 0); for (j = 0; j < deferred_flush[i].next; j++) { - __free_iova(&deferred_flush[i].domain[j]->iovad, - deferred_flush[i].iova[j]); + unsigned long mask; + struct iova *iova = deferred_flush[i].iova[j]; + + mask = (iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT; + mask = ilog2(mask >> VTD_PAGE_SHIFT); + iommu_flush_dev_iotlb(deferred_flush[i].domain[j], + iova->pfn_lo << PAGE_SHIFT, mask); + __free_iova(&deferred_flush[i].domain[j]->iovad, iova); } deferred_flush[i].next = 0; } @@ -2792,6 +2879,7 @@ static void vm_domain_remove_one_dev_info(struct dmar_domain *domain, info->dev->dev.archdata.iommu = NULL; spin_unlock_irqrestore(&device_domain_lock, flags); + iommu_disable_dev_iotlb(info); iommu_detach_dev(iommu, info->bus, info->devfn); free_devinfo_mem(info); @@ -2840,6 +2928,7 @@ static void vm_domain_remove_all_dev_info(struct dmar_domain *domain) spin_unlock_irqrestore(&device_domain_lock, flags1); + iommu_disable_dev_iotlb(info); iommu = device_to_iommu(info->bus, info->devfn); iommu_detach_dev(iommu, info->bus, info->devfn); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index d82bdac..609af82 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -123,6 +123,7 @@ static inline void dmar_writeq(void __iomem *addr, u64 val) #define ecap_qis(e) ((e) & 0x2) #define ecap_eim_support(e) ((e >> 4) & 0x1) #define ecap_ir_support(e) 
((e >> 3) & 0x1) +#define ecap_dev_iotlb_support(e) (((e) >> 2) & 0x1) #define ecap_max_handle_mask(e) ((e >> 20) & 0xf)
Support device IOTLB (i.e. ATS) for both native and KVM environments. Signed-off-by: Yu Zhao <yu.zhao@intel.com> --- drivers/pci/intel-iommu.c | 95 +++++++++++++++++++++++++++++++++++++++++- include/linux/intel-iommu.h | 1 + 2 files changed, 93 insertions(+), 3 deletions(-)