[5/5] iommufd: Flush CPU caches on DMA pages in non-coherent domains

Message ID: 20240507062212.20535-1-yan.y.zhao@intel.com
Series: Enforce CPU cache flush for non-coherent device assignment

Commit Message

Yan Zhao May 7, 2024, 6:22 a.m. UTC
Flush the CPU cache for DMA pages before mapping them into the first
non-coherent domain (a domain that does not enforce cache coherency,
i.e. CPU caches are not force-snooped) and after unmapping them from
the last domain.

Devices attached to non-coherent domains can execute non-coherent DMAs
(DMAs that lack CPU cache snooping) to access physical memory with CPU
caches bypassed.

Such a scenario could be exploited by a malicious guest, allowing it to
access stale host data in memory rather than the data initialized by the
host (e.g., zeros) still sitting in the cache, thus posing a risk of
information leakage.

Furthermore, the host kernel (e.g. a ksm thread) might encounter
inconsistent data between the CPU cache and memory (left by a malicious
guest) after a page is unpinned for DMA but before it's recycled.

Therefore, the CPU cache must be flushed before a page becomes accessible
to non-coherent DMAs and after the page becomes inaccessible to them.

However, the CPU cache is not flushed immediately when the page is unmapped
from the last non-coherent domain. Instead, the flushing is performed
lazily, right before the page is unpinned.
Take the following example to illustrate the process. The CPU cache is
flushed right before step 2 and step 5.
1. A page is mapped into a coherent domain.
2. The page is mapped into a non-coherent domain.
3. The page is unmapped from the non-coherent domain, e.g. due to hot-unplug.
4. The page is unmapped from the coherent domain.
5. The page is unpinned.

Reasons for adopting this lazy flushing design include:
- There are several unmap paths but only one unpin path. Flushing lazily
  before unpin wipes out the inconsistency between cache and physical
  memory before a page is globally visible, and produces code that is
  simpler, more maintainable and easier to backport.
- Avoid dividing a large unmap range into several smaller ones or
  allocating additional memory to hold the IOVA-to-HPA relationship.

Unlike "has_noncoherent_domain" flag used in vfio_iommu, the
"noncoherent_domain_cnt" counter is implemented in io_pagetable to track
whether an iopt has non-coherent domains attached.
Such a difference is because in iommufd only hwpt of type paging contains
flag "enforce_cache_coherency" and iommu domains in io_pagetable has no
flag "enforce_cache_coherency" as that in vfio_domain.
A counter in io_pagetable can avoid traversing ioas->hwpt_list and holding
ioas->mutex.

Reported-by: Jason Gunthorpe <jgg@nvidia.com>
Closes: https://lore.kernel.org/lkml/20240109002220.GA439767@nvidia.com
Fixes: e8d57210035b ("iommufd: Add kAPI toward external drivers for physical devices")
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Yan Zhao <yan.y.zhao@intel.com>
---
 drivers/iommu/iommufd/hw_pagetable.c    | 19 +++++++++--
 drivers/iommu/iommufd/io_pagetable.h    |  5 +++
 drivers/iommu/iommufd/iommufd_private.h |  1 +
 drivers/iommu/iommufd/pages.c           | 44 +++++++++++++++++++++++--
 4 files changed, 65 insertions(+), 4 deletions(-)

Comments

Jason Gunthorpe May 9, 2024, 2:13 p.m. UTC | #1
On Tue, May 07, 2024 at 02:22:12PM +0800, Yan Zhao wrote:
> diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
> index 33d142f8057d..e3099d732c5c 100644
> --- a/drivers/iommu/iommufd/hw_pagetable.c
> +++ b/drivers/iommu/iommufd/hw_pagetable.c
> @@ -14,12 +14,18 @@ void iommufd_hwpt_paging_destroy(struct iommufd_object *obj)
>  		container_of(obj, struct iommufd_hwpt_paging, common.obj);
>  
>  	if (!list_empty(&hwpt_paging->hwpt_item)) {
> +		struct io_pagetable *iopt = &hwpt_paging->ioas->iopt;
>  		mutex_lock(&hwpt_paging->ioas->mutex);
>  		list_del(&hwpt_paging->hwpt_item);
>  		mutex_unlock(&hwpt_paging->ioas->mutex);
>  
> -		iopt_table_remove_domain(&hwpt_paging->ioas->iopt,
> -					 hwpt_paging->common.domain);
> +		iopt_table_remove_domain(iopt, hwpt_paging->common.domain);
> +
> +		if (!hwpt_paging->enforce_cache_coherency) {
> +			down_write(&iopt->domains_rwsem);
> +			iopt->noncoherent_domain_cnt--;
> +			up_write(&iopt->domains_rwsem);

I think it would be nicer to put this in iopt_table_remove_domain()
since we already have the lock there anyhow. It would be OK to pass
in the hwpt. Same remark for the incr side.

> @@ -176,6 +182,12 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
>  			goto out_abort;
>  	}
>  
> +	if (!hwpt_paging->enforce_cache_coherency) {
> +		down_write(&ioas->iopt.domains_rwsem);
> +		ioas->iopt.noncoherent_domain_cnt++;
> +		up_write(&ioas->iopt.domains_rwsem);
> +	}
> +
>  	rc = iopt_table_add_domain(&ioas->iopt, hwpt->domain);

iopt_table_add_domain also already gets the required locks too

>  	if (rc)
>  		goto out_detach;
> @@ -183,6 +195,9 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
>  	return hwpt_paging;
>  
>  out_detach:
> +	down_write(&ioas->iopt.domains_rwsem);
> +	ioas->iopt.noncoherent_domain_cnt--;
> +	up_write(&ioas->iopt.domains_rwsem);

And then you don't need this error unwind

> diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h
> index 0ec3509b7e33..557da8fb83d9 100644
> --- a/drivers/iommu/iommufd/io_pagetable.h
> +++ b/drivers/iommu/iommufd/io_pagetable.h
> @@ -198,6 +198,11 @@ struct iopt_pages {
>  	void __user *uptr;
>  	bool writable:1;
>  	u8 account_mode;
> +	/*
> +	 * CPU cache flush is required before mapping the pages to or after
> +	 * unmapping it from a noncoherent domain
> +	 */
> +	bool cache_flush_required:1;

Move this up a line so it packs with the other bool bitfield.

>  static void batch_clear(struct pfn_batch *batch)
>  {
>  	batch->total_pfns = 0;
> @@ -637,10 +648,18 @@ static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages,
>  	while (npages) {
>  		size_t to_unpin = min_t(size_t, npages,
>  					batch->npfns[cur] - first_page_off);
> +		unsigned long pfn = batch->pfns[cur] + first_page_off;
> +
> +		/*
> +		 * Lazily flushing CPU caches when a page is about to be
> +		 * unpinned if the page was mapped into a noncoherent domain
> +		 */
> +		if (pages->cache_flush_required)
> +			arch_clean_nonsnoop_dma(pfn << PAGE_SHIFT,
> +						to_unpin << PAGE_SHIFT);
>  
>  		unpin_user_page_range_dirty_lock(
> -			pfn_to_page(batch->pfns[cur] + first_page_off),
> -			to_unpin, pages->writable);
> +			pfn_to_page(pfn), to_unpin, pages->writable);
>  		iopt_pages_sub_npinned(pages, to_unpin);
>  		cur++;
>  		first_page_off = 0;

Make sense

> @@ -1358,10 +1377,17 @@ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain)
>  {
>  	unsigned long done_end_index;
>  	struct pfn_reader pfns;
> +	bool cache_flush_required;
>  	int rc;
>  
>  	lockdep_assert_held(&area->pages->mutex);
>  
> +	cache_flush_required = area->iopt->noncoherent_domain_cnt &&
> +			       !area->pages->cache_flush_required;
> +
> +	if (cache_flush_required)
> +		area->pages->cache_flush_required = true;
> +
>  	rc = pfn_reader_first(&pfns, area->pages, iopt_area_index(area),
>  			      iopt_area_last_index(area));
>  	if (rc)
> @@ -1369,6 +1395,9 @@ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain)
>  
>  	while (!pfn_reader_done(&pfns)) {
>  		done_end_index = pfns.batch_start_index;
> +		if (cache_flush_required)
> +			iopt_cache_flush_pfn_batch(&pfns.batch);
> +

This is a bit unfortunate, it means we are going to flush for every
domain, even though it is not required. I don't see any easy way out
of that :(

Jason
Yan Zhao May 10, 2024, 8:03 a.m. UTC | #2
On Thu, May 09, 2024 at 11:13:32AM -0300, Jason Gunthorpe wrote:
> On Tue, May 07, 2024 at 02:22:12PM +0800, Yan Zhao wrote:
> > diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
> > index 33d142f8057d..e3099d732c5c 100644
> > --- a/drivers/iommu/iommufd/hw_pagetable.c
> > +++ b/drivers/iommu/iommufd/hw_pagetable.c
> > @@ -14,12 +14,18 @@ void iommufd_hwpt_paging_destroy(struct iommufd_object *obj)
> >  		container_of(obj, struct iommufd_hwpt_paging, common.obj);
> >  
> >  	if (!list_empty(&hwpt_paging->hwpt_item)) {
> > +		struct io_pagetable *iopt = &hwpt_paging->ioas->iopt;
> >  		mutex_lock(&hwpt_paging->ioas->mutex);
> >  		list_del(&hwpt_paging->hwpt_item);
> >  		mutex_unlock(&hwpt_paging->ioas->mutex);
> >  
> > -		iopt_table_remove_domain(&hwpt_paging->ioas->iopt,
> > -					 hwpt_paging->common.domain);
> > +		iopt_table_remove_domain(iopt, hwpt_paging->common.domain);
> > +
> > +		if (!hwpt_paging->enforce_cache_coherency) {
> > +			down_write(&iopt->domains_rwsem);
> > +			iopt->noncoherent_domain_cnt--;
> > +			up_write(&iopt->domains_rwsem);
> 
> I think it would be nicer to put this in iopt_table_remove_domain()
> since we already have the lock there anyhow. It would be OK to pass
> in the hwpt. Same remark for the incr side.
Ok. Passed hwpt to the two functions.

int iopt_table_add_domain(struct io_pagetable *iopt,
                          struct iommufd_hw_pagetable *hwpt);

void iopt_table_remove_domain(struct io_pagetable *iopt,
                              struct iommufd_hw_pagetable *hwpt);
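
As a rough sketch (not the actual follow-up patch), the counter could
then be maintained inside iopt_table_add_domain() itself, which already
takes domains_rwsem, so callers need no extra locking or error unwind:

int iopt_table_add_domain(struct io_pagetable *iopt,
			  struct iommufd_hw_pagetable *hwpt)
{
	struct iommufd_hwpt_paging *hwpt_paging =
		container_of(hwpt, struct iommufd_hwpt_paging, common);
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	/* ... the existing reserved-iova checks, area fill and xa_store
	 * of hwpt->domain stay as they are today, setting rc ... */

	if (!rc && !hwpt_paging->enforce_cache_coherency)
		iopt->noncoherent_domain_cnt++;

	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}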

> 
> > @@ -176,6 +182,12 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
> >  			goto out_abort;
> >  	}
> >  
> > +	if (!hwpt_paging->enforce_cache_coherency) {
> > +		down_write(&ioas->iopt.domains_rwsem);
> > +		ioas->iopt.noncoherent_domain_cnt++;
> > +		up_write(&ioas->iopt.domains_rwsem);
> > +	}
> > +
> >  	rc = iopt_table_add_domain(&ioas->iopt, hwpt->domain);
> 
> iopt_table_add_domain also already gets the required locks too
Right.

> 
> >  	if (rc)
> >  		goto out_detach;
> > @@ -183,6 +195,9 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
> >  	return hwpt_paging;
> >  
> >  out_detach:
> > +	down_write(&ioas->iopt.domains_rwsem);
> > +	ioas->iopt.noncoherent_domain_cnt--;
> > +	up_write(&ioas->iopt.domains_rwsem);
> 
> And then you don't need this error unwind
Yes :)

> > diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h
> > index 0ec3509b7e33..557da8fb83d9 100644
> > --- a/drivers/iommu/iommufd/io_pagetable.h
> > +++ b/drivers/iommu/iommufd/io_pagetable.h
> > @@ -198,6 +198,11 @@ struct iopt_pages {
> >  	void __user *uptr;
> >  	bool writable:1;
> >  	u8 account_mode;
> > +	/*
> > +	 * CPU cache flush is required before mapping the pages to or after
> > +	 * unmapping it from a noncoherent domain
> > +	 */
> > +	bool cache_flush_required:1;
> 
> Move this up a line so it packs with the other bool bitfield.
Yes, thanks!

> >  static void batch_clear(struct pfn_batch *batch)
> >  {
> >  	batch->total_pfns = 0;
> > @@ -637,10 +648,18 @@ static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages,
> >  	while (npages) {
> >  		size_t to_unpin = min_t(size_t, npages,
> >  					batch->npfns[cur] - first_page_off);
> > +		unsigned long pfn = batch->pfns[cur] + first_page_off;
> > +
> > +		/*
> > +		 * Lazily flushing CPU caches when a page is about to be
> > +		 * unpinned if the page was mapped into a noncoherent domain
> > +		 */
> > +		if (pages->cache_flush_required)
> > +			arch_clean_nonsnoop_dma(pfn << PAGE_SHIFT,
> > +						to_unpin << PAGE_SHIFT);
> >  
> >  		unpin_user_page_range_dirty_lock(
> > -			pfn_to_page(batch->pfns[cur] + first_page_off),
> > -			to_unpin, pages->writable);
> > +			pfn_to_page(pfn), to_unpin, pages->writable);
> >  		iopt_pages_sub_npinned(pages, to_unpin);
> >  		cur++;
> >  		first_page_off = 0;
> 
> Make sense
> 
> > @@ -1358,10 +1377,17 @@ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain)
> >  {
> >  	unsigned long done_end_index;
> >  	struct pfn_reader pfns;
> > +	bool cache_flush_required;
> >  	int rc;
> >  
> >  	lockdep_assert_held(&area->pages->mutex);
> >  
> > +	cache_flush_required = area->iopt->noncoherent_domain_cnt &&
> > +			       !area->pages->cache_flush_required;
> > +
> > +	if (cache_flush_required)
> > +		area->pages->cache_flush_required = true;
> > +
> >  	rc = pfn_reader_first(&pfns, area->pages, iopt_area_index(area),
> >  			      iopt_area_last_index(area));
> >  	if (rc)
> > @@ -1369,6 +1395,9 @@ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain)
> >  
> >  	while (!pfn_reader_done(&pfns)) {
> >  		done_end_index = pfns.batch_start_index;
> > +		if (cache_flush_required)
> > +			iopt_cache_flush_pfn_batch(&pfns.batch);
> > +
> 
> This is a bit unfortunate, it means we are going to flush for every
> domain, even though it is not required. I don't see any easy way out
> of that :(
Yes. Do you think it's possible to add an op get_cache_coherency_enforced
to iommu_domain_ops?
Jason Gunthorpe May 10, 2024, 1:29 p.m. UTC | #3
On Fri, May 10, 2024 at 04:03:04PM +0800, Yan Zhao wrote:
> > > @@ -1358,10 +1377,17 @@ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain)
> > >  {
> > >  	unsigned long done_end_index;
> > >  	struct pfn_reader pfns;
> > > +	bool cache_flush_required;
> > >  	int rc;
> > >  
> > >  	lockdep_assert_held(&area->pages->mutex);
> > >  
> > > +	cache_flush_required = area->iopt->noncoherent_domain_cnt &&
> > > +			       !area->pages->cache_flush_required;
> > > +
> > > +	if (cache_flush_required)
> > > +		area->pages->cache_flush_required = true;
> > > +
> > >  	rc = pfn_reader_first(&pfns, area->pages, iopt_area_index(area),
> > >  			      iopt_area_last_index(area));
> > >  	if (rc)
> > > @@ -1369,6 +1395,9 @@ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain)
> > >  
> > >  	while (!pfn_reader_done(&pfns)) {
> > >  		done_end_index = pfns.batch_start_index;
> > > +		if (cache_flush_required)
> > > +			iopt_cache_flush_pfn_batch(&pfns.batch);
> > > +
> > 
> > This is a bit unfortunate, it means we are going to flush for every
> > domain, even though it is not required. I don't see any easy way out
> > of that :(
> Yes. Do you think it's possible to add an op get_cache_coherency_enforced
> to iommu_domain_ops?

Do we need that? The hwpt already keeps track of that? the enforced could be
copied into the area along side storage_domain

Then I guess you could avoid flushing in the case the page came from
the storage_domain...

You'd want the storage_domain to preferentially point to any
non-enforced domain.

Is it worth it? How slow is this stuff?

Jason
Yan Zhao May 13, 2024, 7:43 a.m. UTC | #4
On Fri, May 10, 2024 at 10:29:28AM -0300, Jason Gunthorpe wrote:
> On Fri, May 10, 2024 at 04:03:04PM +0800, Yan Zhao wrote:
> > > > @@ -1358,10 +1377,17 @@ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain)
> > > >  {
> > > >  	unsigned long done_end_index;
> > > >  	struct pfn_reader pfns;
> > > > +	bool cache_flush_required;
> > > >  	int rc;
> > > >  
> > > >  	lockdep_assert_held(&area->pages->mutex);
> > > >  
> > > > +	cache_flush_required = area->iopt->noncoherent_domain_cnt &&
> > > > +			       !area->pages->cache_flush_required;
> > > > +
> > > > +	if (cache_flush_required)
> > > > +		area->pages->cache_flush_required = true;
> > > > +
> > > >  	rc = pfn_reader_first(&pfns, area->pages, iopt_area_index(area),
> > > >  			      iopt_area_last_index(area));
> > > >  	if (rc)
> > > > @@ -1369,6 +1395,9 @@ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain)
> > > >  
> > > >  	while (!pfn_reader_done(&pfns)) {
> > > >  		done_end_index = pfns.batch_start_index;
> > > > +		if (cache_flush_required)
> > > > +			iopt_cache_flush_pfn_batch(&pfns.batch);
> > > > +
> > > 
> > > This is a bit unfortunate, it means we are going to flush for every
> > > domain, even though it is not required. I don't see any easy way out
> > > of that :(
> > Yes. Do you think it's possible to add an op get_cache_coherency_enforced
> > to iommu_domain_ops?
> 
> Do we need that? The hwpt already keeps track of that? the enforced could be
> copied into the area along side storage_domain
> 
> Then I guess you could avoid flushing in the case the page came from
> the storage_domain...
> 
> You'd want the storage_domain to preferentially point to any
> non-enforced domain.
> 
> Is it worth it? How slow is this stuff?
Sorry, I might have misunderstood your intentions in my previous mail.
In iopt_area_fill_domain(), flushing CPU caches is only performed when
(1) noncoherent_domain_cnt is non-zero and
(2) area->pages->cache_flush_required is false.
area->pages->cache_flush_required is also set to true after the two are met, so
that the next flush to the same "area->pages" in filling phase will be skipped.

In my last mail, I thought you wanted to flush for every domain even if
area->pages->cache_flush_required is true, because I thought that you were
worried that checking area->pages->cache_flush_required might result in
some pages, which ought to be flushed, not being flushed.
So, I was wondering if we could do the flush for every non-coherent domain by
checking whether the domain enforces cache coherency.

However, as you said, we can check hwpt instead if it's passed in
iopt_area_fill_domain().

On the other hand, on second thought, it still looks good to check
area->pages->cache_flush_required?
- "area" and "pages" are 1:1. In other words, there's no such a condition that
  several "area"s are pointing to the same "pages".
  Is this assumption right?
- Once area->pages->cache_flush_required is set to true, it means all pages
  indicated by "area->pages" has been mapped into a non-coherent domain
  (though the domain is not necessarily the storage domain).
  Is this assumption correct as well?
  If so, we can safely skip the flush in iopt_area_fill_domain() if
  area->pages->cache_flush_required is true.
Jason Gunthorpe May 14, 2024, 3:11 p.m. UTC | #5
On Mon, May 13, 2024 at 03:43:45PM +0800, Yan Zhao wrote:
> On Fri, May 10, 2024 at 10:29:28AM -0300, Jason Gunthorpe wrote:
> > On Fri, May 10, 2024 at 04:03:04PM +0800, Yan Zhao wrote:
> > > > > @@ -1358,10 +1377,17 @@ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain)
> > > > >  {
> > > > >  	unsigned long done_end_index;
> > > > >  	struct pfn_reader pfns;
> > > > > +	bool cache_flush_required;
> > > > >  	int rc;
> > > > >  
> > > > >  	lockdep_assert_held(&area->pages->mutex);
> > > > >  
> > > > > +	cache_flush_required = area->iopt->noncoherent_domain_cnt &&
> > > > > +			       !area->pages->cache_flush_required;
> > > > > +
> > > > > +	if (cache_flush_required)
> > > > > +		area->pages->cache_flush_required = true;
> > > > > +
> > > > >  	rc = pfn_reader_first(&pfns, area->pages, iopt_area_index(area),
> > > > >  			      iopt_area_last_index(area));
> > > > >  	if (rc)
> > > > > @@ -1369,6 +1395,9 @@ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain)
> > > > >  
> > > > >  	while (!pfn_reader_done(&pfns)) {
> > > > >  		done_end_index = pfns.batch_start_index;
> > > > > +		if (cache_flush_required)
> > > > > +			iopt_cache_flush_pfn_batch(&pfns.batch);
> > > > > +
> > > > 
> > > > This is a bit unfortunate, it means we are going to flush for every
> > > > domain, even though it is not required. I don't see any easy way out
> > > > of that :(
> > > Yes. Do you think it's possible to add an op get_cache_coherency_enforced
> > > to iommu_domain_ops?
> > 
> > Do we need that? The hwpt already keeps track of that? the enforced could be
> > copied into the area along side storage_domain
> > 
> > Then I guess you could avoid flushing in the case the page came from
> > the storage_domain...
> > 
> > You'd want the storage_domain to preferentially point to any
> > non-enforced domain.
> > 
> > Is it worth it? How slow is this stuff?
> Sorry, I might have misunderstood your intentions in my previous mail.
> In iopt_area_fill_domain(), flushing CPU caches is only performed when
> (1) noncoherent_domain_cnt is non-zero and
> (2) area->pages->cache_flush_required is false.
> area->pages->cache_flush_required is also set to true after the two are met, so
> that the next flush to the same "area->pages" in filling phase will be skipped.
> 
> In my last mail, I thought you wanted to flush for every domain even if
> area->pages->cache_flush_required is true, because I thought that you were
> worried that checking area->pages->cache_flush_required might result in
> some pages, which ought to be flushed, not being flushed.
> So, I was wondering if we could do the flush for every non-coherent domain by
> checking whether the domain enforces cache coherency.
> 
> However, as you said, we can check hwpt instead if it's passed in
> iopt_area_fill_domain().
> 
> On the other hand, on second thought, it still looks good to check
> area->pages->cache_flush_required?
> - "area" and "pages" are 1:1. In other words, there's no such a condition that
>   several "area"s are pointing to the same "pages".
>   Is this assumption right?

copy can create new areas that point to shared pages. That is why
there are two structs.

> - Once area->pages->cache_flush_required is set to true, it means all pages
>   indicated by "area->pages" has been mapped into a non-coherent
>   domain

Also not true, multiple areas can take sub-slices of the pages,
so two hwpts can be mapping disjoint sets of pages, and thus have
disjoint cacheability.

So it has to be calculated on closer to a page by page basis (really a
span by span basis) if flushing of that span is needed based on where
the pages came from. Only pages that came from a hwpt that is
non-coherent can skip the flushing.
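
As an illustrative fragment in the fill path (the
storage_domain_is_noncoherent flag is hypothetical here):

	/* Pages read back from a non-coherent storage domain were last
	 * written by non-snooping DMA, so nothing stale can sit in the
	 * CPU cache for them; only flush when the source is coherent.
	 */
	if (!area->storage_domain_is_noncoherent)
		iopt_cache_flush_pfn_batch(&pfns.batch);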

Jason
Yan Zhao May 15, 2024, 7:06 a.m. UTC | #6
On Tue, May 14, 2024 at 12:11:19PM -0300, Jason Gunthorpe wrote:
> On Mon, May 13, 2024 at 03:43:45PM +0800, Yan Zhao wrote:
> > On Fri, May 10, 2024 at 10:29:28AM -0300, Jason Gunthorpe wrote:
> > > On Fri, May 10, 2024 at 04:03:04PM +0800, Yan Zhao wrote:
> > > > > > @@ -1358,10 +1377,17 @@ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain)
> > > > > >  {
> > > > > >  	unsigned long done_end_index;
> > > > > >  	struct pfn_reader pfns;
> > > > > > +	bool cache_flush_required;
> > > > > >  	int rc;
> > > > > >  
> > > > > >  	lockdep_assert_held(&area->pages->mutex);
> > > > > >  
> > > > > > +	cache_flush_required = area->iopt->noncoherent_domain_cnt &&
> > > > > > +			       !area->pages->cache_flush_required;
> > > > > > +
> > > > > > +	if (cache_flush_required)
> > > > > > +		area->pages->cache_flush_required = true;
> > > > > > +
> > > > > >  	rc = pfn_reader_first(&pfns, area->pages, iopt_area_index(area),
> > > > > >  			      iopt_area_last_index(area));
> > > > > >  	if (rc)
> > > > > > @@ -1369,6 +1395,9 @@ int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain)
> > > > > >  
> > > > > >  	while (!pfn_reader_done(&pfns)) {
> > > > > >  		done_end_index = pfns.batch_start_index;
> > > > > > +		if (cache_flush_required)
> > > > > > +			iopt_cache_flush_pfn_batch(&pfns.batch);
> > > > > > +
> > > > > 
> > > > > This is a bit unfortunate, it means we are going to flush for every
> > > > > domain, even though it is not required. I don't see any easy way out
> > > > > of that :(
> > > > Yes. Do you think it's possible to add an op get_cache_coherency_enforced
> > > > to iommu_domain_ops?
> > > 
> > > Do we need that? The hwpt already keeps track of that? the enforced could be
> > > copied into the area along side storage_domain
> > > 
> > > Then I guess you could avoid flushing in the case the page came from
> > > the storage_domain...
> > > 
> > > You'd want the storage_domain to preferentially point to any
> > > non-enforced domain.
> > > 
> > > Is it worth it? How slow is this stuff?
> > Sorry, I might have misunderstood your intentions in my previous mail.
> > In iopt_area_fill_domain(), flushing CPU caches is only performed when
> > (1) noncoherent_domain_cnt is non-zero and
> > (2) area->pages->cache_flush_required is false.
> > area->pages->cache_flush_required is also set to true after the two are met, so
> > that the next flush to the same "area->pages" in filling phase will be skipped.
> > 
> > In my last mail, I thought you wanted to flush for every domain even if
> > area->pages->cache_flush_required is true, because I thought that you were
> > worried that checking area->pages->cache_flush_required might result in
> > some pages, which ought to be flushed, not being flushed.
> > So, I was wondering if we could do the flush for every non-coherent domain by
> > checking whether the domain enforces cache coherency.
> > 
> > However, as you said, we can check hwpt instead if it's passed in
> > iopt_area_fill_domain().
> > 
> > On the other hand, on second thought, it still looks good to check
> > area->pages->cache_flush_required?
> > - "area" and "pages" are 1:1. In other words, there's no such a condition that
> >   several "area"s are pointing to the same "pages".
> >   Is this assumption right?
> 
> copy can create new areas that point to shared pages. That is why
> there are two structs.
Oh, thanks for explanation and glad to learn that!!
Though in this case, the new area is identical to the old area.
> 
> > - Once area->pages->cache_flush_required is set to true, it means all pages
> >   indicated by "area->pages" has been mapped into a non-coherent
> >   domain
> 
> Also not true, multiple areas can take sub-slices of the pages,
Ah, right, e.g. after iopt_area_split().

> so two hwpts can be mapping disjoint sets of pages, and thus have
> disjoint cacheability.
Indeed.
> 
> So it has to be calculated on closer to a page by page basis (really a
> span by span basis) if flushing of that span is needed based on where
> the pages came from. Only pages that came from a hwpt that is
> non-coherent can skip the flushing.
Is area by area basis also good?
Isn't an area either not mapped to any domain or mapped into all domains?

But, yes, considering the limited number of non-coherent domains, it appears
more robust and clean to always flush for non-coherent domain in
iopt_area_fill_domain().
It eliminates the need to decide whether to retain the area flag during a split.
Jason Gunthorpe May 15, 2024, 8:43 p.m. UTC | #7
On Wed, May 15, 2024 at 03:06:36PM +0800, Yan Zhao wrote:

> > So it has to be calculated on closer to a page by page basis (really a
> > span by span basis) if flushing of that span is needed based on where
> > the pages came from. Only pages that came from a hwpt that is
> > non-coherent can skip the flushing.
> Is area by area basis also good?
> Isn't an area either not mapped to any domain or mapped into all domains?

Yes, this is what the span iterator turns into in the background, it
goes area by area to cover things.

> But, yes, considering the limited number of non-coherent domains, it appears
> more robust and clean to always flush for non-coherent domain in
> iopt_area_fill_domain().
> It eliminates the need to decide whether to retain the area flag during a split.

And flush for pin user pages, so you basically always flush because
you can't tell where the pages came from.

Jason
Yan Zhao May 16, 2024, 2:32 a.m. UTC | #8
On Wed, May 15, 2024 at 05:43:04PM -0300, Jason Gunthorpe wrote:
> On Wed, May 15, 2024 at 03:06:36PM +0800, Yan Zhao wrote:
> 
> > > So it has to be calculated on closer to a page by page basis (really a
> > > span by span basis) if flushing of that span is needed based on where
> > > the pages came from. Only pages that came from a hwpt that is
> > > non-coherent can skip the flushing.
> > Is area by area basis also good?
> > Isn't an area either not mapped to any domain or mapped into all domains?
> 
> Yes, this is what the span iterator turns into in the background, it
> goes area by area to cover things.
> 
> > But, yes, considering the limited number of non-coherent domains, it appears
> > more robust and clean to always flush for non-coherent domain in
> > iopt_area_fill_domain().
> > It eliminates the need to decide whether to retain the area flag during a split.
> 
> And flush for pin user pages, so you basically always flush because
> you can't tell where the pages came from.
As a summary, do you think it's good to flush in below way?

1. in iopt_area_fill_domains(), flush before mapping a page into domains when
   iopt->noncoherent_domain_cnt > 0, no matter where the page is from.
   Record cache_flush_required in pages for unpin.
2. in iopt_area_fill_domain(), pass in hwpt to check domain non-coherency.
   flush before mapping a page into a non-coherent domain, no matter where the
   page is from.
   Record cache_flush_required in pages for unpin.
3. in batch_unpin(), flush if pages->cache_flush_required before
   unpin_user_pages.
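
In pseudo-code, roughly (a condensed sketch of the three points above,
with hwpt_is_coherent() as a hypothetical helper):

	/* 1. iopt_area_fill_domains(): first map into all domains */
	if (area->iopt->noncoherent_domain_cnt) {
		pages->cache_flush_required = true;
		iopt_cache_flush_pfn_batch(&pfns.batch);
	}

	/* 2. iopt_area_fill_domain(): map into one domain, hwpt passed in */
	if (!hwpt_is_coherent(hwpt)) {	/* hypothetical helper */
		pages->cache_flush_required = true;
		iopt_cache_flush_pfn_batch(&pfns.batch);
	}

	/* 3. batch_unpin(): flush once, right before the pages are unpinned */
	if (pages->cache_flush_required)
		arch_clean_nonsnoop_dma(PFN_PHYS(pfn), to_unpin << PAGE_SHIFT);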
Tian, Kevin May 16, 2024, 8:38 a.m. UTC | #9
> From: Zhao, Yan Y <yan.y.zhao@intel.com>
> Sent: Thursday, May 16, 2024 10:33 AM
> 
> On Wed, May 15, 2024 at 05:43:04PM -0300, Jason Gunthorpe wrote:
> > On Wed, May 15, 2024 at 03:06:36PM +0800, Yan Zhao wrote:
> >
> > > > So it has to be calculated on closer to a page by page basis (really a
> > > > span by span basis) if flushing of that span is needed based on where
> > > > the pages came from. Only pages that came from a hwpt that is
> > > > non-coherent can skip the flushing.
> > > Is area by area basis also good?
> > > Isn't an area either not mapped to any domain or mapped into all
> domains?
> >
> > Yes, this is what the span iterator turns into in the background, it
> > goes area by area to cover things.
> >
> > > But, yes, considering the limited number of non-coherent domains, it
> appears
> > > more robust and clean to always flush for non-coherent domain in
> > > iopt_area_fill_domain().
> > > It eliminates the need to decide whether to retain the area flag during a
> split.
> >
> > And flush for pin user pages, so you basically always flush because
> > you can't tell where the pages came from.
> As a summary, do you think it's good to flush in below way?
> 
> 1. in iopt_area_fill_domains(), flush before mapping a page into domains
> when
>    iopt->noncoherent_domain_cnt > 0, no matter where the page is from.
>    Record cache_flush_required in pages for unpin.
> 2. in iopt_area_fill_domain(), pass in hwpt to check domain non-coherency.
>    flush before mapping a page into a non-coherent domain, no matter where
> the
>    page is from.
>    Record cache_flush_required in pages for unpin.
> 3. in batch_unpin(), flush if pages->cache_flush_required before
>    unpin_user_pages.

so the above suggests a sequence similar to what vfio_type1 does?
Yan Zhao May 16, 2024, 9:48 a.m. UTC | #10
On Thu, May 16, 2024 at 04:38:12PM +0800, Tian, Kevin wrote:
> > From: Zhao, Yan Y <yan.y.zhao@intel.com>
> > Sent: Thursday, May 16, 2024 10:33 AM
> > 
> > On Wed, May 15, 2024 at 05:43:04PM -0300, Jason Gunthorpe wrote:
> > > On Wed, May 15, 2024 at 03:06:36PM +0800, Yan Zhao wrote:
> > >
> > > > > So it has to be calculated on closer to a page by page basis (really a
> > > > > span by span basis) if flushing of that span is needed based on where
> > > > > the pages came from. Only pages that came from a hwpt that is
> > > > > non-coherent can skip the flushing.
> > > > Is area by area basis also good?
> > > > Isn't an area either not mapped to any domain or mapped into all
> > domains?
> > >
> > > Yes, this is what the span iterator turns into in the background, it
> > > goes area by area to cover things.
> > >
> > > > But, yes, considering the limited number of non-coherent domains, it
> > appears
> > > > more robust and clean to always flush for non-coherent domain in
> > > > iopt_area_fill_domain().
> > > > It eliminates the need to decide whether to retain the area flag during a
> > split.
> > >
> > > And flush for pin user pages, so you basically always flush because
> > > you can't tell where the pages came from.
> > As a summary, do you think it's good to flush in below way?
> > 
> > 1. in iopt_area_fill_domains(), flush before mapping a page into domains
> > when
> >    iopt->noncoherent_domain_cnt > 0, no matter where the page is from.
> >    Record cache_flush_required in pages for unpin.
> > 2. in iopt_area_fill_domain(), pass in hwpt to check domain non-coherency.
> >    flush before mapping a page into a non-coherent domain, no matter where
> > the
> >    page is from.
> >    Record cache_flush_required in pages for unpin.
> > 3. in batch_unpin(), flush if pages->cache_flush_required before
> >    unpin_user_pages.
> 
> so the above suggests a sequence similar to what vfio_type1 does?
Similar, except that in iopt_area_fill_domain(), the flush is always performed
for non-coherent domains without checking pages->cache_flush_required, while in
vfio_iommu_replay(), the flush can be skipped if dma->cache_flush_required is true.

This is because in vfio_type1, pages are mapped into domains on a dma-by-dma
basis, but in iommufd, pages are mapped into domains on an area-by-area basis.
Two areas can be non-overlapping parts of one iopt_pages.
It's not right to skip flushing pages in the second area just because
pages->cache_flush_required was set to true by mapping pages in the first area.
It's also cumbersome to introduce and check another flag in the area, or to
check where pages came from before mapping them into a non-coherent domain.
Jason Gunthorpe May 17, 2024, 5:04 p.m. UTC | #11
On Thu, May 16, 2024 at 10:32:43AM +0800, Yan Zhao wrote:
> On Wed, May 15, 2024 at 05:43:04PM -0300, Jason Gunthorpe wrote:
> > On Wed, May 15, 2024 at 03:06:36PM +0800, Yan Zhao wrote:
> > 
> > > > So it has to be calculated on closer to a page by page basis (really a
> > > > span by span basis) if flushing of that span is needed based on where
> > > > the pages came from. Only pages that came from a hwpt that is
> > > > non-coherent can skip the flushing.
> > > Is area by area basis also good?
> > > Isn't an area either not mapped to any domain or mapped into all domains?
> > 
> > Yes, this is what the span iterator turns into in the background, it
> > goes area by area to cover things.
> > 
> > > But, yes, considering the limited number of non-coherent domains, it appears
> > > more robust and clean to always flush for non-coherent domain in
> > > iopt_area_fill_domain().
> > > It eliminates the need to decide whether to retain the area flag during a split.
> > 
> > And flush for pin user pages, so you basically always flush because
> > you can't tell where the pages came from.
> As a summary, do you think it's good to flush in below way?
> 
> 1. in iopt_area_fill_domains(), flush before mapping a page into domains when
>    iopt->noncoherent_domain_cnt > 0, no matter where the page is from.
>    Record cache_flush_required in pages for unpin.
> 2. in iopt_area_fill_domain(), pass in hwpt to check domain non-coherency.
>    flush before mapping a page into a non-coherent domain, no matter where the
>    page is from.
>    Record cache_flush_required in pages for unpin.
> 3. in batch_unpin(), flush if pages->cache_flush_required before
>    unpin_user_pages.

It does not quite sound right, there should be no tracking in the
pages of this stuff.

If pfn_reader_fill_span() does batch_from_domain() and
the source domain's storage_domain is non-coherent then you can skip
the flush. This is not pedantically perfect in skipping all flushes, but
in practice it is probably good enough.

__iopt_area_unfill_domain() (and children) must flush after
iopt_area_unmap_domain_range() if the area's domain is
non-coherent. This is also not perfect, but probably good enough.
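
Illustratively (hypothetical, ignores large-page spans), that would mean
reading the physical addresses back before the unmap and cleaning them
after it, once non-coherent DMA can no longer touch the pages:

	for (iova = start, i = 0; iova <= last; iova += PAGE_SIZE)
		phys[i++] = iommu_iova_to_phys(domain, iova);
	iopt_area_unmap_domain_range(area, domain, start_index, last_index);
	while (i--)
		arch_clean_nonsnoop_dma(phys[i], PAGE_SIZE);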

Doing better in both cases would require inspecting the areas under
the used span to see what is there. This is not so easy.

Jason
Yan Zhao May 20, 2024, 2:45 a.m. UTC | #12
On Fri, May 17, 2024 at 02:04:18PM -0300, Jason Gunthorpe wrote:
> On Thu, May 16, 2024 at 10:32:43AM +0800, Yan Zhao wrote:
> > On Wed, May 15, 2024 at 05:43:04PM -0300, Jason Gunthorpe wrote:
> > > On Wed, May 15, 2024 at 03:06:36PM +0800, Yan Zhao wrote:
> > > 
> > > > > So it has to be calculated on closer to a page by page basis (really a
> > > > > span by span basis) if flushing of that span is needed based on where
> > > > > the pages came from. Only pages that came from a hwpt that is
> > > > > non-coherent can skip the flushing.
> > > > Is area by area basis also good?
> > > > Isn't an area either not mapped to any domain or mapped into all domains?
> > > 
> > > Yes, this is what the span iterator turns into in the background, it
> > > goes area by area to cover things.
> > > 
> > > > But, yes, considering the limited number of non-coherent domains, it appears
> > > > more robust and clean to always flush for non-coherent domain in
> > > > iopt_area_fill_domain().
> > > > It eliminates the need to decide whether to retain the area flag during a split.
> > > 
> > > And flush for pin user pages, so you basically always flush because
> > > you can't tell where the pages came from.
> > As a summary, do you think it's good to flush in below way?
> > 
> > 1. in iopt_area_fill_domains(), flush before mapping a page into domains when
> >    iopt->noncoherent_domain_cnt > 0, no matter where the page is from.
> >    Record cache_flush_required in pages for unpin.
> > 2. in iopt_area_fill_domain(), pass in hwpt to check domain non-coherency.
> >    flush before mapping a page into a non-coherent domain, no matter where the
> >    page is from.
> >    Record cache_flush_required in pages for unpin.
> > 3. in batch_unpin(), flush if pages->cache_flush_required before
> >    unpin_user_pages.
> 
> It does not quite sound right, there should be no tracking in the
> pages of this stuff.
What's the downside of having tracking in the pages?

Lazily flushing pages right before unpinning them is not only to save flush
count for performance, but also to address a real problem we encountered. See below.

> 
> If pfn_reader_fill_span() does batch_from_domain() and
> the source domain's storage_domain is non-coherent then you can skip
> the flush. This is not pedantically perfect in skipping all flushes, but
> in practice it is probably good enough.
We don't know whether the source storage_domain is non-coherent since
area->storage_domain is of "struct iommu_domain".

Do you want to add a flag in "area", e.g. area->storage_domain_is_noncoherent,
and set this flag along side setting storage_domain?
(But looks this is not easy in iopt_area_fill_domains() as we don't have hwpt
there.)

> __iopt_area_unfill_domain() (and children) must flush after
> iopt_area_unmap_domain_range() if the area's domain is
> non-coherent. This is also not perfect, but probably good enough.
Do you mean flush after each iopt_area_unmap_domain_range() if the domain is
non-coherent?
The problem is that iopt_area_unmap_domain_range() knows only IOVA, the
IOVA->PFN relationship is not available without iommu_iova_to_phys() and
iommu_domain contains no coherency info.
Besides, when the non-coherent domain is a storage domain, we still need to do
the flush in batch_unpin(), right?
Then, with a more complex case, if the non-coherent domain is a storage domain,
and if some pages are still held in pages->access_itree when unfilling the
domain, should we get PFNs from pages->pinned_pfns and do the flush in
__iopt_area_unfill_domain()?
> 
> Doing better in both cases would require inspecting the areas under
> the used span to see what is there. This is not so easy.
My feeling is that checking the non-coherency of the target domain and saving
it in pages might be the easiest way with the least code change.
Jason Gunthorpe May 21, 2024, 4:04 p.m. UTC | #13
On Mon, May 20, 2024 at 10:45:56AM +0800, Yan Zhao wrote:
> On Fri, May 17, 2024 at 02:04:18PM -0300, Jason Gunthorpe wrote:
> > On Thu, May 16, 2024 at 10:32:43AM +0800, Yan Zhao wrote:
> > > On Wed, May 15, 2024 at 05:43:04PM -0300, Jason Gunthorpe wrote:
> > > > On Wed, May 15, 2024 at 03:06:36PM +0800, Yan Zhao wrote:
> > > > 
> > > > > > So it has to be calculated on closer to a page by page basis (really a
> > > > > > span by span basis) if flushing of that span is needed based on where
> > > > > > the pages came from. Only pages that came from a hwpt that is
> > > > > > non-coherent can skip the flushing.
> > > > > Is area by area basis also good?
> > > > > Isn't an area either not mapped to any domain or mapped into all domains?
> > > > 
> > > > Yes, this is what the span iterator turns into in the background, it
> > > > goes area by area to cover things.
> > > > 
> > > > > But, yes, considering the limited number of non-coherent domains, it appears
> > > > > more robust and clean to always flush for non-coherent domain in
> > > > > iopt_area_fill_domain().
> > > > > It eliminates the need to decide whether to retain the area flag during a split.
> > > > 
> > > > And flush for pin user pages, so you basically always flush because
> > > > you can't tell where the pages came from.
> > > As a summary, do you think it's good to flush in below way?
> > > 
> > > 1. in iopt_area_fill_domains(), flush before mapping a page into domains when
> > >    iopt->noncoherent_domain_cnt > 0, no matter where the page is from.
> > >    Record cache_flush_required in pages for unpin.
> > > 2. in iopt_area_fill_domain(), pass in hwpt to check domain non-coherency.
> > >    flush before mapping a page into a non-coherent domain, no matter where the
> > >    page is from.
> > >    Record cache_flush_required in pages for unpin.
> > > 3. in batch_unpin(), flush if pages->cache_flush_required before
> > >    unpin_user_pages.
> > 
> > It does not quite sound right, there should be no tracking in the
> > pages of this stuff.
> What's the downside of having tracking in the pages?

Well, a counter doesn't make sense. You could have a single sticky bit
that indicates that all PFNs are coherency dirty and overflush them on
every map and unmap operation.

This is certainly the simplest option, but gives the maximal flushes.

If you want to minimize flushes then you can't store flush
minimization information in the pages because it isn't global to the
pages and will not be accurate enough.

> > If pfn_reader_fill_span() does batch_from_domain() and
> > the source domain's storage_domain is non-coherent then you can skip
> > the flush. This is not pedantically perfect in skipping all flushes, but
> > in practice it is probably good enough.

> We don't know whether the source storage_domain is non-coherent since
> area->storage_domain is of "struct iommu_domain".
 
> Do you want to add a flag in "area", e.g. area->storage_domain_is_noncoherent,
> and set this flag along side setting storage_domain?

Sure, that could work.

> > __iopt_area_unfill_domain() (and children) must flush after
> > iopt_area_unmap_domain_range() if the area's domain is
> > non-coherent. This is also not perfect, but probably good enough.
> Do you mean flush after each iopt_area_unmap_domain_range() if the domain is
> non-coherent?
> The problem is that iopt_area_unmap_domain_range() knows only IOVA, the
> IOVA->PFN relationship is not available without iommu_iova_to_phys() and
> iommu_domain contains no coherency info.

Yes, you'd have to read back the PFNs on this path which it doesn't do
right now.. Given this pain it would be simpler to have one bit in the
pages that marks it permanently non-coherent and all pfns will be
flushed before put_page is called.

The trouble with a counter is that the count going to zero doesn't
really mean we flushed the PFN if it is being held someplace else.

Jason
Yan Zhao May 22, 2024, 3:17 a.m. UTC | #14
On Tue, May 21, 2024 at 01:04:42PM -0300, Jason Gunthorpe wrote:
> On Mon, May 20, 2024 at 10:45:56AM +0800, Yan Zhao wrote:
> > On Fri, May 17, 2024 at 02:04:18PM -0300, Jason Gunthorpe wrote:
> > > On Thu, May 16, 2024 at 10:32:43AM +0800, Yan Zhao wrote:
> > > > On Wed, May 15, 2024 at 05:43:04PM -0300, Jason Gunthorpe wrote:
> > > > > On Wed, May 15, 2024 at 03:06:36PM +0800, Yan Zhao wrote:
> > > > > 
> > > > > > > So it has to be calculated on closer to a page by page basis (really a
> > > > > > > span by span basis) if flushing of that span is needed based on where
> > > > > > > the pages came from. Only pages that came from a hwpt that is
> > > > > > > non-coherent can skip the flushing.
> > > > > > Is area by area basis also good?
> > > > > > Isn't an area either not mapped to any domain or mapped into all domains?
> > > > > 
> > > > > Yes, this is what the span iterator turns into in the background, it
> > > > > goes area by area to cover things.
> > > > > 
> > > > > > But, yes, considering the limited number of non-coherent domains, it appears
> > > > > > more robust and clean to always flush for non-coherent domain in
> > > > > > iopt_area_fill_domain().
> > > > > > It eliminates the need to decide whether to retain the area flag during a split.
> > > > > 
> > > > > And flush for pin user pages, so you basically always flush because
> > > > > you can't tell where the pages came from.
> > > > As a summary, do you think it's good to flush in below way?
> > > > 
> > > > 1. in iopt_area_fill_domains(), flush before mapping a page into domains when
> > > >    iopt->noncoherent_domain_cnt > 0, no matter where the page is from.
> > > >    Record cache_flush_required in pages for unpin.
> > > > 2. in iopt_area_fill_domain(), pass in hwpt to check domain non-coherency.
> > > >    flush before mapping a page into a non-coherent domain, no matter where the
> > > >    page is from.
> > > >    Record cache_flush_required in pages for unpin.
> > > > 3. in batch_unpin(), flush if pages->cache_flush_required before
> > > >    unpin_user_pages.
> > > 
> > > It does not quite sound right, there should be no tracking in the
> > > pages of this stuff.
> > What's the downside of having tracking in the pages?
> 
> Well, a counter doesn't make sense. You could have a single sticky bit
> that indicates that all PFNs are coherency dirty and overflush them on
> every map and unmap operation.
cache_flush_required is a sticky bit actually. It's set if any PFN in the
iopt_pages is mapped into a noncoherent domain.
batch_unpin() checks this sticky bit for flush.

@@ -198,6 +198,11 @@ struct iopt_pages {
        void __user *uptr;
        bool writable:1;
        u8 account_mode;
+       /*
+        * CPU cache flush is required before mapping the pages to or after
+        * unmapping it from a noncoherent domain
+        */
+       bool cache_flush_required:1;

(Please ignore the confusing comment).

iopt->noncoherent_domain_cnt is a counter. It's increased/decreased on
non-coherent hwpt attach/detach.

@@ -53,6 +53,7 @@ struct io_pagetable {
        struct rb_root_cached reserved_itree;
        u8 disable_large_pages;
        unsigned long iova_alignment;
+       unsigned int noncoherent_domain_cnt;
 };

Since iopt->domains contains no coherency info, this counter helps
iopt_area_fill_domains() to decide whether to flush pages and set sticky bit
cache_flush_required in iopt_pages.
Though it's not that useful to iopt_area_fill_domain(), after your suggestion
to pass in hwpt.

> This is certainly the simplest option, but gives the maximal flushes.

Why does this give the maximal flushes?
Considering the flush after unmap,
- With a sticky bit in iopt_pages, once an iopt_pages has been mapped into a
  non-coherent domain, the PFNs in the iopt_pages will be flushed only once,
  right before the page is unpinned.

- But if we do the flush after each iopt_area_unmap_domain_range() for each
  non-coherent domain, then the flush count for each PFN is the count of
  non-coherent domains.

> 
> If you want to minimize flushes then you can't store flush
> minimization information in the pages because it isn't global to the
> pages and will not be accurate enough.
> 
> > > If pfn_reader_fill_span() does batch_from_domain() and
> > > the source domain's storage_domain is non-coherent then you can skip
> > > the flush. This is not pedantically perfect in skipping all flushes, but
> > > in practice it is probably good enough.
> 
> > We don't know whether the source storage_domain is non-coherent since
> > area->storage_domain is of "struct iommu_domain".
>  
> > Do you want to add a flag in "area", e.g. area->storage_domain_is_noncoherent,
> > and set this flag along side setting storage_domain?
> 
> Sure, that could work.
When the storage_domain is set in iopt_area_fill_domains(),
    "area->storage_domain = xa_load(&area->iopt->domains, 0);"
is there a convenient way to know the storage_domain is non-coherent?

> 
> > > __iopt_area_unfill_domain() (and children) must flush after
> > > iopt_area_unmap_domain_range() if the area's domain is
> > > non-coherent. This is also not perfect, but probably good enough.
> > Do you mean flush after each iopt_area_unmap_domain_range() if the domain is
> > non-coherent?
> > The problem is that iopt_area_unmap_domain_range() knows only IOVA, the
> > IOVA->PFN relationship is not available without iommu_iova_to_phys() and
> > iommu_domain contains no coherency info.
> 
> Yes, you'd have to read back the PFNs on this path which it doesn't do
> right now.. Given this pain it would be simpler to have one bit in the
> pages that marks it permanently non-coherent and all pfns will be
> flushed before put_page is called.
> 
> The trouble with a counter is that the count going to zero doesn't
> really mean we flushed the PFN if it is being held someplace else.
Not sure if you are confused between iopt->noncoherent_domain_cnt and
pages->cache_flush_required.

iopt->noncoherent_domain_cnt is increased/decreased on non-coherent hwpt
attach/detach.

Once iopt->noncoherent_domain_cnt is non-zero, the sticky bit
cache_flush_required in iopt_pages will be set while filling the domain, and
PFNs in the iopt_pages will be flushed right before unpinning, even though
iopt->noncoherent_domain_cnt might have gone to 0 by that time.
Yan Zhao May 22, 2024, 6:29 a.m. UTC | #15
> > If you want to minimize flushes then you can't store flush
> > minimization information in the pages because it isn't global to the
> > pages and will not be accurate enough.
> > 
> > > > If pfn_reader_fill_span() does batch_from_domain() and
> > > > the source domain's storage_domain is non-coherent then you can skip
> > > > the flush. This is not pedantically perfect in skipping all flushes, but
> > > > in practice it is probably good enough.
> > 
> > > We don't know whether the source storage_domain is non-coherent since
> > > area->storage_domain is of "struct iommu_domain".
> >  
> > > Do you want to add a flag in "area", e.g. area->storage_domain_is_noncoherent,
> > > and set this flag along side setting storage_domain?
> > 
> > Sure, that could work.
> When the storage_domain is set in iopt_area_fill_domains(),
>     "area->storage_domain = xa_load(&area->iopt->domains, 0);"
> is there a convenient way to know the storage_domain is non-coherent?
Also asking for when storage_domain is switching to an arbitrary remaining domain
in iopt_unfill_domain().

And in iopt_area_unfill_domains(), after iopt_area_unmap_domain_range()
of a non-coherent domain which is not the storage domain, how can we know that
the domain is non-coherent?
Jason Gunthorpe May 22, 2024, 5:01 p.m. UTC | #16
On Wed, May 22, 2024 at 02:29:19PM +0800, Yan Zhao wrote:
> > > If you want to minimize flushes then you can't store flush
> > > minimization information in the pages because it isn't global to the
> > > pages and will not be accurate enough.
> > > 
> > > > > If pfn_reader_fill_span() does batch_from_domain() and
> > > > > the source domain's storage_domain is non-coherent then you can skip
> > > > > the flush. This is not pedantically perfect in skipping all flushes, but
> > > > > in practice it is probably good enough.
> > > 
> > > > We don't know whether the source storage_domain is non-coherent since
> > > > area->storage_domain is of "struct iommu_domain".
> > >  
> > > > Do you want to add a flag in "area", e.g. area->storage_domain_is_noncoherent,
> > > > and set this flag along side setting storage_domain?
> > > 
> > > Sure, that could work.
> > When the storage_domain is set in iopt_area_fill_domains(),
> >     "area->storage_domain = xa_load(&area->iopt->domains, 0);"
> > is there a convenient way to know the storage_domain is non-coherent?
> Also asking for when storage_domain is switching to an arbitrary remaining domain
> in iopt_unfill_domain().
> 
> And in iopt_area_unfill_domains(), after iopt_area_unmap_domain_range()
> of a non-coherent domain which is not the storage domain, how can we know that
> the domain is non-coherent?

Yes, it would have to keep track of hwpts in more cases unfortunately
:(

Jason
Yan Zhao May 27, 2024, 7:15 a.m. UTC | #17
On Wed, May 22, 2024 at 02:01:50PM -0300, Jason Gunthorpe wrote:
> On Wed, May 22, 2024 at 02:29:19PM +0800, Yan Zhao wrote:
> > > > If you want to minimize flushes then you can't store flush
> > > > minimization information in the pages because it isn't global to the
> > > > pages and will not be accurate enough.
> > > > 
> > > > > > If pfn_reader_fill_span() does batch_from_domain() and
> > > > > > the source domain's storage_domain is non-coherent then you can skip
> > > > > > the flush. This is not pedantically perfect in skipping all flushes, but
> > > > > > in practice it is probably good enough.
> > > > 
> > > > > We don't know whether the source storage_domain is non-coherent since
> > > > > area->storage_domain is of "struct iommu_domain".
> > > >  
> > > > > Do you want to add a flag in "area", e.g. area->storage_domain_is_noncoherent,
> > > > > and set this flag along side setting storage_domain?
> > > > 
> > > > Sure, that could work.
> > > When the storage_domain is set in iopt_area_fill_domains(),
> > >     "area->storage_domain = xa_load(&area->iopt->domains, 0);"
> > > is there a convenient way to know the storage_domain is non-coherent?
> > Also asking for when storage_domain is switching to an arbitrary remaining domain
> > in iopt_unfill_domain().
> > 
> > And in iopt_area_unfill_domains(), after iopt_area_unmap_domain_range()
> > of a non-coherent domain which is not the storage domain, how can we know that
> > the domain is non-coherent?
> 
> Yes, it would have to keep track of hwpts in more cases unfortunately
> :(
Current iopt keeps an xarray of domains. Could I modify the xarray to store
hwpt instead?
Jason Gunthorpe June 1, 2024, 4:48 p.m. UTC | #18
On Mon, May 27, 2024 at 03:15:39PM +0800, Yan Zhao wrote:
> On Wed, May 22, 2024 at 02:01:50PM -0300, Jason Gunthorpe wrote:
> > On Wed, May 22, 2024 at 02:29:19PM +0800, Yan Zhao wrote:
> > > > > If you want to minimize flushes then you can't store flush
> > > > > minimization information in the pages because it isn't global to the
> > > > > pages and will not be accurate enough.
> > > > > 
> > > > > > > If pfn_reader_fill_span() does batch_from_domain() and
> > > > > > > the source domain's storage_domain is non-coherent then you can skip
> > > > > > > the flush. This is not pedantically perfect in skipping all flushes, but
> > > > > > > in practice it is probably good enough.
> > > > > 
> > > > > > We don't know whether the source storage_domain is non-coherent since
> > > > > > area->storage_domain is of "struct iommu_domain".
> > > > >  
> > > > > > Do you want to add a flag in "area", e.g. area->storage_domain_is_noncoherent,
> > > > > > and set this flag along side setting storage_domain?
> > > > > 
> > > > > Sure, that could work.
> > > > When the storage_domain is set in iopt_area_fill_domains(),
> > > >     "area->storage_domain = xa_load(&area->iopt->domains, 0);"
> > > > is there a convenient way to know the storage_domain is non-coherent?
> > > Also asking for when storage_domain is switching to an arbitrary remaining domain
> > > in iopt_unfill_domain().
> > > 
> > > And in iopt_area_unfill_domains(), after iopt_area_unmap_domain_range()
> > > of a non-coherent domain which is not the storage domain, how can we know that
> > > the domain is non-coherent?
> > 
> > Yes, it would have to keep track of hwpts in more cases unfortunately
> > :(
> Current iopt keeps an xarray of domains. Could I modify the xarray to store
> hwpt instead? 

Probably yes

Jason

Patch

diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index 33d142f8057d..e3099d732c5c 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -14,12 +14,18 @@  void iommufd_hwpt_paging_destroy(struct iommufd_object *obj)
 		container_of(obj, struct iommufd_hwpt_paging, common.obj);
 
 	if (!list_empty(&hwpt_paging->hwpt_item)) {
+		struct io_pagetable *iopt = &hwpt_paging->ioas->iopt;
 		mutex_lock(&hwpt_paging->ioas->mutex);
 		list_del(&hwpt_paging->hwpt_item);
 		mutex_unlock(&hwpt_paging->ioas->mutex);
 
-		iopt_table_remove_domain(&hwpt_paging->ioas->iopt,
-					 hwpt_paging->common.domain);
+		iopt_table_remove_domain(iopt, hwpt_paging->common.domain);
+
+		if (!hwpt_paging->enforce_cache_coherency) {
+			down_write(&iopt->domains_rwsem);
+			iopt->noncoherent_domain_cnt--;
+			up_write(&iopt->domains_rwsem);
+		}
 	}
 
 	if (hwpt_paging->common.domain)
@@ -176,6 +182,12 @@  iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
 			goto out_abort;
 	}
 
+	if (!hwpt_paging->enforce_cache_coherency) {
+		down_write(&ioas->iopt.domains_rwsem);
+		ioas->iopt.noncoherent_domain_cnt++;
+		up_write(&ioas->iopt.domains_rwsem);
+	}
+
 	rc = iopt_table_add_domain(&ioas->iopt, hwpt->domain);
 	if (rc)
 		goto out_detach;
@@ -183,6 +195,9 @@  iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
 	return hwpt_paging;
 
 out_detach:
+	down_write(&ioas->iopt.domains_rwsem);
+	ioas->iopt.noncoherent_domain_cnt--;
+	up_write(&ioas->iopt.domains_rwsem);
 	if (immediate_attach)
 		iommufd_hw_pagetable_detach(idev);
 out_abort:
diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h
index 0ec3509b7e33..557da8fb83d9 100644
--- a/drivers/iommu/iommufd/io_pagetable.h
+++ b/drivers/iommu/iommufd/io_pagetable.h
@@ -198,6 +198,11 @@  struct iopt_pages {
 	void __user *uptr;
 	bool writable:1;
 	u8 account_mode;
+	/*
+	 * CPU cache flush is required before mapping the pages to or after
+	 * unmapping it from a noncoherent domain
+	 */
+	bool cache_flush_required:1;
 
 	struct xarray pinned_pfns;
 	/* Of iopt_pages_access::node */
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 991f864d1f9b..fc77fd43b232 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -53,6 +53,7 @@  struct io_pagetable {
 	struct rb_root_cached reserved_itree;
 	u8 disable_large_pages;
 	unsigned long iova_alignment;
+	unsigned int noncoherent_domain_cnt;
 };
 
 void iopt_init_table(struct io_pagetable *iopt);
diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c
index 528f356238b3..8f4b939cba5b 100644
--- a/drivers/iommu/iommufd/pages.c
+++ b/drivers/iommu/iommufd/pages.c
@@ -272,6 +272,17 @@  struct pfn_batch {
 	unsigned int total_pfns;
 };
 
+static void iopt_cache_flush_pfn_batch(struct pfn_batch *batch)
+{
+	unsigned long cur, i;
+
+	for (cur = 0; cur < batch->end; cur++) {
+		for (i = 0; i < batch->npfns[cur]; i++)
+			arch_clean_nonsnoop_dma(PFN_PHYS(batch->pfns[cur] + i),
+						PAGE_SIZE);
+	}
+}
+
 static void batch_clear(struct pfn_batch *batch)
 {
 	batch->total_pfns = 0;
@@ -637,10 +648,18 @@  static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages,
 	while (npages) {
 		size_t to_unpin = min_t(size_t, npages,
 					batch->npfns[cur] - first_page_off);
+		unsigned long pfn = batch->pfns[cur] + first_page_off;
+
+		/*
+		 * Lazily flushing CPU caches when a page is about to be
+		 * unpinned if the page was mapped into a noncoherent domain
+		 */
+		if (pages->cache_flush_required)
+			arch_clean_nonsnoop_dma(pfn << PAGE_SHIFT,
+						to_unpin << PAGE_SHIFT);
 
 		unpin_user_page_range_dirty_lock(
-			pfn_to_page(batch->pfns[cur] + first_page_off),
-			to_unpin, pages->writable);
+			pfn_to_page(pfn), to_unpin, pages->writable);
 		iopt_pages_sub_npinned(pages, to_unpin);
 		cur++;
 		first_page_off = 0;
@@ -1358,10 +1377,17 @@  int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain)
 {
 	unsigned long done_end_index;
 	struct pfn_reader pfns;
+	bool cache_flush_required;
 	int rc;
 
 	lockdep_assert_held(&area->pages->mutex);
 
+	cache_flush_required = area->iopt->noncoherent_domain_cnt &&
+			       !area->pages->cache_flush_required;
+
+	if (cache_flush_required)
+		area->pages->cache_flush_required = true;
+
 	rc = pfn_reader_first(&pfns, area->pages, iopt_area_index(area),
 			      iopt_area_last_index(area));
 	if (rc)
@@ -1369,6 +1395,9 @@  int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain)
 
 	while (!pfn_reader_done(&pfns)) {
 		done_end_index = pfns.batch_start_index;
+		if (cache_flush_required)
+			iopt_cache_flush_pfn_batch(&pfns.batch);
+
 		rc = batch_to_domain(&pfns.batch, domain, area,
 				     pfns.batch_start_index);
 		if (rc)
@@ -1413,6 +1442,7 @@  int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages)
 	unsigned long unmap_index;
 	struct pfn_reader pfns;
 	unsigned long index;
+	bool cache_flush_required;
 	int rc;
 
 	lockdep_assert_held(&area->iopt->domains_rwsem);
@@ -1426,9 +1456,19 @@  int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages)
 	if (rc)
 		goto out_unlock;
 
+	cache_flush_required = area->iopt->noncoherent_domain_cnt &&
+			       !pages->cache_flush_required;
+
+	if (cache_flush_required)
+		pages->cache_flush_required = true;
+
 	while (!pfn_reader_done(&pfns)) {
 		done_first_end_index = pfns.batch_end_index;
 		done_all_end_index = pfns.batch_start_index;
+
+		if (cache_flush_required)
+			iopt_cache_flush_pfn_batch(&pfns.batch);
+
 		xa_for_each(&area->iopt->domains, index, domain) {
 			rc = batch_to_domain(&pfns.batch, domain, area,
 					     pfns.batch_start_index);