diff mbox series

irqchip/gic-v3-its: Mark some in-memory data structures as 'decrypted'

Message ID 20211208155916.681-1-will@kernel.org (mailing list archive)
State New, archived
Headers show
Series irqchip/gic-v3-its: Mark some in-memory data structures as 'decrypted' | expand

Commit Message

Will Deacon Dec. 8, 2021, 3:59 p.m. UTC
The GICv3 ITS driver allocates memory for its tables using alloc_pages()
and performs explicit cache maintenance if necessary. On systems such
as those running pKVM, where the memory encryption API is implemented,
memory shared with the ITS must first be transitioned to the "decrypted"
state, as it would be if allocated via the DMA API.

Allow pKVM guests to interact with an ITS emulation by ensuring that the
shared pages are decrypted at the point of allocation and encrypted
again upon free().

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Marc Zyngier <maz@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---

Although pKVM doesn't yet expose share/unshare hypercalls to the guest,
this change is agnostic of the hypervisor and could be queued
independently as it has no functional impact when the memory encryption
API is not implemented.

 drivers/irqchip/irq-gic-v3-its.c | 40 ++++++++++++++++++++++++++------
 1 file changed, 33 insertions(+), 7 deletions(-)

Comments

Ard Biesheuvel Dec. 8, 2021, 4:56 p.m. UTC | #1
On Wed, 8 Dec 2021 at 17:00, Will Deacon <will@kernel.org> wrote:
>
> The GICv3 ITS driver allocates memory for its tables using alloc_pages()
> and performs explicit cache maintenance if necessary. On systems such
> as those running pKVM, where the memory encryption API is implemented,
> memory shared with the ITS must first be transitioned to the "decrypted"
> state, as it would be if allocated via the DMA API.
>
> Allow pKVM guests to interact with an ITS emulation by ensuring that the
> shared pages are decrypted at the point of allocation and encrypted
> again upon free().
>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Marc Zyngier <maz@kernel.org>
> Signed-off-by: Will Deacon <will@kernel.org>
> ---
>
> Although pKVM doesn't yet expose share/unshare hypercalls to the guest,
> this change is agnostic of the hypervisor and could be queued
> independently as it has no functional impact when the memory encryption
> API is not implemented.
>
>  drivers/irqchip/irq-gic-v3-its.c | 40 ++++++++++++++++++++++++++------
>  1 file changed, 33 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> index eb0882d15366..4559b8dfb9bc 100644
> --- a/drivers/irqchip/irq-gic-v3-its.c
> +++ b/drivers/irqchip/irq-gic-v3-its.c
> @@ -27,6 +27,7 @@
>  #include <linux/of_pci.h>
>  #include <linux/of_platform.h>
>  #include <linux/percpu.h>
> +#include <linux/set_memory.h>
>  #include <linux/slab.h>
>  #include <linux/syscore_ops.h>
>
> @@ -2166,6 +2167,7 @@ static void gic_reset_prop_table(void *va)
>
>         /* Make sure the GIC will observe the written configuration */
>         gic_flush_dcache_to_poc(va, LPI_PROPBASE_SZ);
> +       set_memory_decrypted((unsigned long)va, LPI_PROPBASE_SZ >> PAGE_SHIFT);
>  }
>
>  static struct page *its_allocate_prop_table(gfp_t gfp_flags)
> @@ -2183,8 +2185,10 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags)
>
>  static void its_free_prop_table(struct page *prop_page)
>  {
> -       free_pages((unsigned long)page_address(prop_page),
> -                  get_order(LPI_PROPBASE_SZ));
> +       unsigned long va = (unsigned long)page_address(prop_page);
> +
> +       set_memory_encrypted(va, LPI_PROPBASE_SZ >> PAGE_SHIFT);

I take it this call is there to return the freed pages to the state
they are assumed to have been in when they were allocated?

This seems like a maintenance nightmare in the making, to be honest.
This driver should be able to stop caring about the
encrypted/decrypted state of the pages as soon as it frees them, and
on systems that actually implement the difference, I would expect the
memory management layer to implement a reasonable default that always
applies to newly allocated pages, regardless of how they may have been
used in the past.

> +       free_pages(va, get_order(LPI_PROPBASE_SZ));
>  }
>
>  static bool gic_check_reserved_range(phys_addr_t addr, unsigned long size)
> @@ -2377,6 +2381,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
>                 return -ENXIO;
>         }
>
> +       set_memory_decrypted((unsigned long)base,
> +                            PAGE_ORDER_TO_SIZE(order) >> PAGE_SHIFT);
>         baser->order = order;
>         baser->base = base;
>         baser->psz = psz;
> @@ -2512,8 +2518,12 @@ static void its_free_tables(struct its_node *its)
>
>         for (i = 0; i < GITS_BASER_NR_REGS; i++) {
>                 if (its->tables[i].base) {
> -                       free_pages((unsigned long)its->tables[i].base,
> -                                  its->tables[i].order);
> +                       unsigned long base = (unsigned long)its->tables[i].base;
> +                       u32 order = its->tables[i].order;
> +                       u32 npages = PAGE_ORDER_TO_SIZE(order) >> PAGE_SHIFT;
> +
> +                       set_memory_encrypted(base, npages);
> +                       free_pages(base, order);
>                         its->tables[i].base = NULL;
>                 }
>         }
> @@ -2934,6 +2944,7 @@ static int its_alloc_collections(struct its_node *its)
>  static struct page *its_allocate_pending_table(gfp_t gfp_flags)
>  {
>         struct page *pend_page;
> +       void *va;
>
>         pend_page = alloc_pages(gfp_flags | __GFP_ZERO,
>                                 get_order(LPI_PENDBASE_SZ));
> @@ -2941,14 +2952,19 @@ static struct page *its_allocate_pending_table(gfp_t gfp_flags)
>                 return NULL;
>
>         /* Make sure the GIC will observe the zero-ed page */
> -       gic_flush_dcache_to_poc(page_address(pend_page), LPI_PENDBASE_SZ);
> +       va = page_address(pend_page);
> +       gic_flush_dcache_to_poc(va, LPI_PENDBASE_SZ);
> +       set_memory_decrypted((unsigned long)va, LPI_PENDBASE_SZ >> PAGE_SHIFT);
>
>         return pend_page;
>  }
>
>  static void its_free_pending_table(struct page *pt)
>  {
> -       free_pages((unsigned long)page_address(pt), get_order(LPI_PENDBASE_SZ));
> +       unsigned long va = (unsigned long)page_address(pt);
> +
> +       set_memory_encrypted(va, LPI_PENDBASE_SZ >> PAGE_SHIFT);
> +       free_pages(va, get_order(LPI_PENDBASE_SZ));
>  }
>
>  /*
> @@ -3268,14 +3284,20 @@ static bool its_alloc_table_entry(struct its_node *its,
>
>         /* Allocate memory for 2nd level table */
>         if (!table[idx]) {
> +               void *l2addr;
> +
>                 page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO,
>                                         get_order(baser->psz));
>                 if (!page)
>                         return false;
>
> +               l2addr = page_address(page);
> +               set_memory_decrypted((unsigned long)l2addr,
> +                                    baser->psz >> PAGE_SHIFT);
> +
>                 /* Flush Lvl2 table to PoC if hw doesn't support coherency */
>                 if (!(baser->val & GITS_BASER_SHAREABILITY_MASK))
> -                       gic_flush_dcache_to_poc(page_address(page), baser->psz);
> +                       gic_flush_dcache_to_poc(l2addr, baser->psz);
>
>                 table[idx] = cpu_to_le64(page_to_phys(page) | GITS_BASER_VALID);
>
> @@ -5043,6 +5065,8 @@ static int __init its_probe_one(struct resource *res,
>         its->fwnode_handle = handle;
>         its->get_msi_base = its_irq_get_msi_base;
>         its->msi_domain_flags = IRQ_DOMAIN_FLAG_MSI_REMAP;
> +       set_memory_decrypted((unsigned long)its->cmd_base,
> +                            ITS_CMD_QUEUE_SZ >> PAGE_SHIFT);
>
>         its_enable_quirks(its);
>
> @@ -5099,6 +5123,8 @@ static int __init its_probe_one(struct resource *res,
>  out_free_tables:
>         its_free_tables(its);
>  out_free_cmd:
> +       set_memory_encrypted((unsigned long)its->cmd_base,
> +                            ITS_CMD_QUEUE_SZ >> PAGE_SHIFT);
>         free_pages((unsigned long)its->cmd_base, get_order(ITS_CMD_QUEUE_SZ));
>  out_unmap_sgir:
>         if (its->sgir_base)
> --
> 2.34.1.400.ga245620fadb-goog
>
>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Will Deacon Dec. 8, 2021, 5:20 p.m. UTC | #2
Hi Ard,

On Wed, Dec 08, 2021 at 05:56:59PM +0100, Ard Biesheuvel wrote:
> On Wed, 8 Dec 2021 at 17:00, Will Deacon <will@kernel.org> wrote:
> >
> > The GICv3 ITS driver allocates memory for its tables using alloc_pages()
> > and performs explicit cache maintenance if necessary. On systems such
> > as those running pKVM, where the memory encryption API is implemented,
> > memory shared with the ITS must first be transitioned to the "decrypted"
> > state, as it would be if allocated via the DMA API.
> >
> > Allow pKVM guests to interact with an ITS emulation by ensuring that the
> > shared pages are decrypted at the point of allocation and encrypted
> > again upon free().
> >
> > Cc: Thomas Gleixner <tglx@linutronix.de>
> > Cc: Marc Zyngier <maz@kernel.org>
> > Signed-off-by: Will Deacon <will@kernel.org>
> > ---
> >
> > Although pKVM doesn't yet expose share/unshare hypercalls to the guest,
> > this change is agnostic of the hypervisor and could be queued
> > independently as it has no functional impact when the memory encryption
> > API is not implemented.
> >
> >  drivers/irqchip/irq-gic-v3-its.c | 40 ++++++++++++++++++++++++++------
> >  1 file changed, 33 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> > index eb0882d15366..4559b8dfb9bc 100644
> > --- a/drivers/irqchip/irq-gic-v3-its.c
> > +++ b/drivers/irqchip/irq-gic-v3-its.c
> > @@ -27,6 +27,7 @@
> >  #include <linux/of_pci.h>
> >  #include <linux/of_platform.h>
> >  #include <linux/percpu.h>
> > +#include <linux/set_memory.h>
> >  #include <linux/slab.h>
> >  #include <linux/syscore_ops.h>
> >
> > @@ -2166,6 +2167,7 @@ static void gic_reset_prop_table(void *va)
> >
> >         /* Make sure the GIC will observe the written configuration */
> >         gic_flush_dcache_to_poc(va, LPI_PROPBASE_SZ);
> > +       set_memory_decrypted((unsigned long)va, LPI_PROPBASE_SZ >> PAGE_SHIFT);
> >  }
> >
> >  static struct page *its_allocate_prop_table(gfp_t gfp_flags)
> > @@ -2183,8 +2185,10 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags)
> >
> >  static void its_free_prop_table(struct page *prop_page)
> >  {
> > -       free_pages((unsigned long)page_address(prop_page),
> > -                  get_order(LPI_PROPBASE_SZ));
> > +       unsigned long va = (unsigned long)page_address(prop_page);
> > +
> > +       set_memory_encrypted(va, LPI_PROPBASE_SZ >> PAGE_SHIFT);
> 
> I take it this call is there to return the freed pages to the state
> they are assumed to have been in when they were allocated?
> 
> This seems like a maintenance nightmare in the making, to be honest.
> This driver should be able to stop caring about the
> encrypted/decrypted state of the pages as soon as it frees them, and
> on systems that actually implement the difference, I would expect the
> memory management layer to implement a reasonable default that always
> applies to newly allocated pages, regardless of how they may have been
> used in the past.

This currently happens above the mm layer and is instead handled by the DMA
layer which may use the page allocator, swiotlb, CMA etc under the hood and
ensures that whatever memory is allocated ends up being in the decrypted
state while allocated if needed by the device. If we could use the normal
DMA APIs here then we wouldn't have to worry about the *crypted nature of
the memory at all, but since we're forced to roll our own routines then
we're exposed to the horror of it all.

It's a similar story to the cache maintenance, which we also end up having
to open-code here.

Will
Robin Murphy Dec. 8, 2021, 6:20 p.m. UTC | #3
On 2021-12-08 15:59, Will Deacon wrote:
> The GICv3 ITS driver allocates memory for its tables using alloc_pages()
> and performs explicit cache maintenance if necessary. On systems such
> as those running pKVM, where the memory encryption API is implemented,
> memory shared with the ITS must first be transitioned to the "decrypted"
> state, as it would be if allocated via the DMA API.
> 
> Allow pKVM guests to interact with an ITS emulation by ensuring that the
> shared pages are decrypted at the point of allocation and encrypted
> again upon free().
> 
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Marc Zyngier <maz@kernel.org>
> Signed-off-by: Will Deacon <will@kernel.org>
> ---
> 
> Although pKVM doesn't yet expose share/unshare hypercalls to the guest,
> this change is agnostic of the hypervisor and could be queued
> independently as it has no functional impact when the memory encryption
> API is not implemented.
> 
>   drivers/irqchip/irq-gic-v3-its.c | 40 ++++++++++++++++++++++++++------
>   1 file changed, 33 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> index eb0882d15366..4559b8dfb9bc 100644
> --- a/drivers/irqchip/irq-gic-v3-its.c
> +++ b/drivers/irqchip/irq-gic-v3-its.c
> @@ -27,6 +27,7 @@
>   #include <linux/of_pci.h>
>   #include <linux/of_platform.h>
>   #include <linux/percpu.h>
> +#include <linux/set_memory.h>
>   #include <linux/slab.h>
>   #include <linux/syscore_ops.h>
>   
> @@ -2166,6 +2167,7 @@ static void gic_reset_prop_table(void *va)
>   
>   	/* Make sure the GIC will observe the written configuration */
>   	gic_flush_dcache_to_poc(va, LPI_PROPBASE_SZ);
> +	set_memory_decrypted((unsigned long)va, LPI_PROPBASE_SZ >> PAGE_SHIFT);

Hmm, that would seem to imply we've just done the memset() and cache 
clean on encrypted memory... that's not right, surely?

>   }
>   
>   static struct page *its_allocate_prop_table(gfp_t gfp_flags)
> @@ -2183,8 +2185,10 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags)
>   
>   static void its_free_prop_table(struct page *prop_page)
>   {
> -	free_pages((unsigned long)page_address(prop_page),
> -		   get_order(LPI_PROPBASE_SZ));
> +	unsigned long va = (unsigned long)page_address(prop_page);
> +
> +	set_memory_encrypted(va, LPI_PROPBASE_SZ >> PAGE_SHIFT);

Similarly to [1], it might be worth at least pretending to consider 
failure in these instances. (Side note, might set_memory_encrypted() 
logically warrant a __must_check annotation?)

> +	free_pages(va, get_order(LPI_PROPBASE_SZ));
>   }
>   
>   static bool gic_check_reserved_range(phys_addr_t addr, unsigned long size)
> @@ -2377,6 +2381,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
>   		return -ENXIO;
>   	}
>   
> +	set_memory_decrypted((unsigned long)base,
> +			     PAGE_ORDER_TO_SIZE(order) >> PAGE_SHIFT);
>   	baser->order = order;
>   	baser->base = base;
>   	baser->psz = psz;
> @@ -2512,8 +2518,12 @@ static void its_free_tables(struct its_node *its)
>   
>   	for (i = 0; i < GITS_BASER_NR_REGS; i++) {
>   		if (its->tables[i].base) {
> -			free_pages((unsigned long)its->tables[i].base,
> -				   its->tables[i].order);
> +			unsigned long base = (unsigned long)its->tables[i].base;
> +			u32 order = its->tables[i].order;
> +			u32 npages = PAGE_ORDER_TO_SIZE(order) >> PAGE_SHIFT;
> +
> +			set_memory_encrypted(base, npages);
> +			free_pages(base, order);
>   			its->tables[i].base = NULL;
>   		}
>   	}
> @@ -2934,6 +2944,7 @@ static int its_alloc_collections(struct its_node *its)
>   static struct page *its_allocate_pending_table(gfp_t gfp_flags)
>   {
>   	struct page *pend_page;
> +	void *va;
>   
>   	pend_page = alloc_pages(gfp_flags | __GFP_ZERO,
>   				get_order(LPI_PENDBASE_SZ));
> @@ -2941,14 +2952,19 @@ static struct page *its_allocate_pending_table(gfp_t gfp_flags)
>   		return NULL;
>   
>   	/* Make sure the GIC will observe the zero-ed page */
> -	gic_flush_dcache_to_poc(page_address(pend_page), LPI_PENDBASE_SZ);
> +	va = page_address(pend_page);
> +	gic_flush_dcache_to_poc(va, LPI_PENDBASE_SZ);
> +	set_memory_decrypted((unsigned long)va, LPI_PENDBASE_SZ >> PAGE_SHIFT);

Again, it looks fundamentally sketchy to decrypt memory *after* already 
doing something to it - under a "real" encryption scheme the cleaned out 
zeros might now appear as a page full of gibberish ciphertext, which 
would not be what you want.

That said, aren't the pending tables the completely IMP-DEF ones that 
only the ITS itself ever touches? As-is should we even need to "decrypt" 
them at all?

The overall feeling here is that we're not being particularly consistent 
about whether we're readily leaning on the assumption of "encryption" 
only ever meaning CPU stage 2 protection here, or whether we're trying 
to uphold the illusion of the more general notion with the ITS only 
being able to observe "plaintext" shared memory.

Robin.

[1] https://lore.kernel.org/linux-iommu/20211111065028.32761-4-hch@lst.de/

>   
>   	return pend_page;
>   }
>   
>   static void its_free_pending_table(struct page *pt)
>   {
> -	free_pages((unsigned long)page_address(pt), get_order(LPI_PENDBASE_SZ));
> +	unsigned long va = (unsigned long)page_address(pt);
> +
> +	set_memory_encrypted(va, LPI_PENDBASE_SZ >> PAGE_SHIFT);
> +	free_pages(va, get_order(LPI_PENDBASE_SZ));
>   }
>   
>   /*
> @@ -3268,14 +3284,20 @@ static bool its_alloc_table_entry(struct its_node *its,
>   
>   	/* Allocate memory for 2nd level table */
>   	if (!table[idx]) {
> +		void *l2addr;
> +
>   		page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO,
>   					get_order(baser->psz));
>   		if (!page)
>   			return false;
>   
> +		l2addr = page_address(page);
> +		set_memory_decrypted((unsigned long)l2addr,
> +				     baser->psz >> PAGE_SHIFT);
> +
>   		/* Flush Lvl2 table to PoC if hw doesn't support coherency */
>   		if (!(baser->val & GITS_BASER_SHAREABILITY_MASK))
> -			gic_flush_dcache_to_poc(page_address(page), baser->psz);
> +			gic_flush_dcache_to_poc(l2addr, baser->psz);
>   
>   		table[idx] = cpu_to_le64(page_to_phys(page) | GITS_BASER_VALID);
>   
> @@ -5043,6 +5065,8 @@ static int __init its_probe_one(struct resource *res,
>   	its->fwnode_handle = handle;
>   	its->get_msi_base = its_irq_get_msi_base;
>   	its->msi_domain_flags = IRQ_DOMAIN_FLAG_MSI_REMAP;
> +	set_memory_decrypted((unsigned long)its->cmd_base,
> +			     ITS_CMD_QUEUE_SZ >> PAGE_SHIFT);
>   
>   	its_enable_quirks(its);
>   
> @@ -5099,6 +5123,8 @@ static int __init its_probe_one(struct resource *res,
>   out_free_tables:
>   	its_free_tables(its);
>   out_free_cmd:
> +	set_memory_encrypted((unsigned long)its->cmd_base,
> +			     ITS_CMD_QUEUE_SZ >> PAGE_SHIFT);
>   	free_pages((unsigned long)its->cmd_base, get_order(ITS_CMD_QUEUE_SZ));
>   out_unmap_sgir:
>   	if (its->sgir_base)
>
Will Deacon Dec. 9, 2021, 9:10 a.m. UTC | #4
On Wed, Dec 08, 2021 at 06:20:36PM +0000, Robin Murphy wrote:
> On 2021-12-08 15:59, Will Deacon wrote:
> > The GICv3 ITS driver allocates memory for its tables using alloc_pages()
> > and performs explicit cache maintenance if necessary. On systems such
> > as those running pKVM, where the memory encryption API is implemented,
> > memory shared with the ITS must first be transitioned to the "decrypted"
> > state, as it would be if allocated via the DMA API.
> > 
> > Allow pKVM guests to interact with an ITS emulation by ensuring that the
> > shared pages are decrypted at the point of allocation and encrypted
> > again upon free().
> > 
> > Cc: Thomas Gleixner <tglx@linutronix.de>
> > Cc: Marc Zyngier <maz@kernel.org>
> > Signed-off-by: Will Deacon <will@kernel.org>
> > ---
> > 
> > Although pKVM doesn't yet expose share/unshare hypercalls to the guest,
> > this change is agnostic of the hypervisor and could be queued
> > independently as it has no functional impact when the memory encryption
> > API is not implemented.
> > 
> >   drivers/irqchip/irq-gic-v3-its.c | 40 ++++++++++++++++++++++++++------
> >   1 file changed, 33 insertions(+), 7 deletions(-)
> > 
> > diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> > index eb0882d15366..4559b8dfb9bc 100644
> > --- a/drivers/irqchip/irq-gic-v3-its.c
> > +++ b/drivers/irqchip/irq-gic-v3-its.c
> > @@ -27,6 +27,7 @@
> >   #include <linux/of_pci.h>
> >   #include <linux/of_platform.h>
> >   #include <linux/percpu.h>
> > +#include <linux/set_memory.h>
> >   #include <linux/slab.h>
> >   #include <linux/syscore_ops.h>
> > @@ -2166,6 +2167,7 @@ static void gic_reset_prop_table(void *va)
> >   	/* Make sure the GIC will observe the written configuration */
> >   	gic_flush_dcache_to_poc(va, LPI_PROPBASE_SZ);
> > +	set_memory_decrypted((unsigned long)va, LPI_PROPBASE_SZ >> PAGE_SHIFT);
> 
> Hmm, that would seem to imply we've just done the memset() and cache clean
> on encrypted memory... that's not right, surely?

Actually, that's exactly what we want! The *crypted state is only of concern
to the DMA accesses, so the idea is that you clear out / initialiase the
buffer and only then decrypt the memory for the device to access it.

The DMA code does the same sort of thing -- for example, see
atomic_pool_expand() in kernel/dma/pool.c.

> >   }
> >   static struct page *its_allocate_prop_table(gfp_t gfp_flags)
> > @@ -2183,8 +2185,10 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags)
> >   static void its_free_prop_table(struct page *prop_page)
> >   {
> > -	free_pages((unsigned long)page_address(prop_page),
> > -		   get_order(LPI_PROPBASE_SZ));
> > +	unsigned long va = (unsigned long)page_address(prop_page);
> > +
> > +	set_memory_encrypted(va, LPI_PROPBASE_SZ >> PAGE_SHIFT);
> 
> Similarly to [1], it might be worth at least pretending to consider failure
> in these instances. (Side note, might set_memory_encrypted() logically
> warrant a __must_check annotation?)

There's not much we can do, but it looks like the DMA allocator leaks the
memory rather than freeing it, so I'll add that.

> > @@ -2934,6 +2944,7 @@ static int its_alloc_collections(struct its_node *its)
> >   static struct page *its_allocate_pending_table(gfp_t gfp_flags)
> >   {
> >   	struct page *pend_page;
> > +	void *va;
> >   	pend_page = alloc_pages(gfp_flags | __GFP_ZERO,
> >   				get_order(LPI_PENDBASE_SZ));
> > @@ -2941,14 +2952,19 @@ static struct page *its_allocate_pending_table(gfp_t gfp_flags)
> >   		return NULL;
> >   	/* Make sure the GIC will observe the zero-ed page */
> > -	gic_flush_dcache_to_poc(page_address(pend_page), LPI_PENDBASE_SZ);
> > +	va = page_address(pend_page);
> > +	gic_flush_dcache_to_poc(va, LPI_PENDBASE_SZ);
> > +	set_memory_decrypted((unsigned long)va, LPI_PENDBASE_SZ >> PAGE_SHIFT);
> 
> Again, it looks fundamentally sketchy to decrypt memory *after* already
> doing something to it - under a "real" encryption scheme the cleaned out
> zeros might now appear as a page full of gibberish ciphertext, which would
> not be what you want.
> 
> That said, aren't the pending tables the completely IMP-DEF ones that only
> the ITS itself ever touches? As-is should we even need to "decrypt" them at
> all?

We do, as we need the ITS to be able to access the pages. For pKVM, this
is actually the virtual ITS emulation in the host.

> The overall feeling here is that we're not being particularly consistent
> about whether we're readily leaning on the assumption of "encryption" only
> ever meaning CPU stage 2 protection here, or whether we're trying to uphold
> the illusion of the more general notion with the ITS only being able to
> observe "plaintext" shared memory.

Really, I've just tried to follow the way this is used for DMA: pages which
need to be accessed by the device must first be set as decrypted, and then
set back to encrypted before freeing back to the page allocator. This maps
directly to what we're doing with the stage-2 (i.e. decrypt means share, and
encrypt means unshare) but it would equally apply to real memory encryption
schemes along the lines of what x86, power and s390 are doing.

Will
Marc Zyngier Dec. 9, 2021, 9:41 a.m. UTC | #5
On Wed, 08 Dec 2021 15:59:16 +0000,
Will Deacon <will@kernel.org> wrote:
> 
> The GICv3 ITS driver allocates memory for its tables using alloc_pages()
> and performs explicit cache maintenance if necessary. On systems such
> as those running pKVM, where the memory encryption API is implemented,
> memory shared with the ITS must first be transitioned to the "decrypted"
> state, as it would be if allocated via the DMA API.
> 
> Allow pKVM guests to interact with an ITS emulation by ensuring that the
> shared pages are decrypted at the point of allocation and encrypted
> again upon free().
> 
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Cc: Marc Zyngier <maz@kernel.org>
> Signed-off-by: Will Deacon <will@kernel.org>
> ---
> 
> Although pKVM doesn't yet expose share/unshare hypercalls to the guest,
> this change is agnostic of the hypervisor and could be queued
> independently as it has no functional impact when the memory encryption
> API is not implemented.
> 
>  drivers/irqchip/irq-gic-v3-its.c | 40 ++++++++++++++++++++++++++------
>  1 file changed, 33 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
> index eb0882d15366..4559b8dfb9bc 100644
> --- a/drivers/irqchip/irq-gic-v3-its.c
> +++ b/drivers/irqchip/irq-gic-v3-its.c
> @@ -27,6 +27,7 @@
>  #include <linux/of_pci.h>
>  #include <linux/of_platform.h>
>  #include <linux/percpu.h>
> +#include <linux/set_memory.h>
>  #include <linux/slab.h>
>  #include <linux/syscore_ops.h>
>  
> @@ -2166,6 +2167,7 @@ static void gic_reset_prop_table(void *va)
>  
>  	/* Make sure the GIC will observe the written configuration */
>  	gic_flush_dcache_to_poc(va, LPI_PROPBASE_SZ);
> +	set_memory_decrypted((unsigned long)va, LPI_PROPBASE_SZ >> PAGE_SHIFT);
>  }
>  
>  static struct page *its_allocate_prop_table(gfp_t gfp_flags)
> @@ -2183,8 +2185,10 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags)
>  
>  static void its_free_prop_table(struct page *prop_page)
>  {
> -	free_pages((unsigned long)page_address(prop_page),
> -		   get_order(LPI_PROPBASE_SZ));
> +	unsigned long va = (unsigned long)page_address(prop_page);
> +
> +	set_memory_encrypted(va, LPI_PROPBASE_SZ >> PAGE_SHIFT);
> +	free_pages(va, get_order(LPI_PROPBASE_SZ));
>  }
>  
>  static bool gic_check_reserved_range(phys_addr_t addr, unsigned long size)
> @@ -2377,6 +2381,8 @@ static int its_setup_baser(struct its_node *its, struct its_baser *baser,
>  		return -ENXIO;
>  	}
>  
> +	set_memory_decrypted((unsigned long)base,
> +			     PAGE_ORDER_TO_SIZE(order) >> PAGE_SHIFT);
>  	baser->order = order;
>  	baser->base = base;
>  	baser->psz = psz;
> @@ -2512,8 +2518,12 @@ static void its_free_tables(struct its_node *its)
>  
>  	for (i = 0; i < GITS_BASER_NR_REGS; i++) {
>  		if (its->tables[i].base) {
> -			free_pages((unsigned long)its->tables[i].base,
> -				   its->tables[i].order);
> +			unsigned long base = (unsigned long)its->tables[i].base;
> +			u32 order = its->tables[i].order;
> +			u32 npages = PAGE_ORDER_TO_SIZE(order) >> PAGE_SHIFT;
> +
> +			set_memory_encrypted(base, npages);
> +			free_pages(base, order);
>  			its->tables[i].base = NULL;
>  		}
>  	}
> @@ -2934,6 +2944,7 @@ static int its_alloc_collections(struct its_node *its)
>  static struct page *its_allocate_pending_table(gfp_t gfp_flags)
>  {
>  	struct page *pend_page;
> +	void *va;
>  
>  	pend_page = alloc_pages(gfp_flags | __GFP_ZERO,
>  				get_order(LPI_PENDBASE_SZ));
> @@ -2941,14 +2952,19 @@ static struct page *its_allocate_pending_table(gfp_t gfp_flags)
>  		return NULL;
>  
>  	/* Make sure the GIC will observe the zero-ed page */
> -	gic_flush_dcache_to_poc(page_address(pend_page), LPI_PENDBASE_SZ);
> +	va = page_address(pend_page);
> +	gic_flush_dcache_to_poc(va, LPI_PENDBASE_SZ);
> +	set_memory_decrypted((unsigned long)va, LPI_PENDBASE_SZ >> PAGE_SHIFT);
>  
>  	return pend_page;
>  }
>  
>  static void its_free_pending_table(struct page *pt)
>  {
> -	free_pages((unsigned long)page_address(pt), get_order(LPI_PENDBASE_SZ));
> +	unsigned long va = (unsigned long)page_address(pt);
> +
> +	set_memory_encrypted(va, LPI_PENDBASE_SZ >> PAGE_SHIFT);
> +	free_pages(va, get_order(LPI_PENDBASE_SZ));
>  }
>  
>  /*
> @@ -3268,14 +3284,20 @@ static bool its_alloc_table_entry(struct its_node *its,
>  
>  	/* Allocate memory for 2nd level table */
>  	if (!table[idx]) {
> +		void *l2addr;
> +
>  		page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO,
>  					get_order(baser->psz));
>  		if (!page)
>  			return false;
>  
> +		l2addr = page_address(page);
> +		set_memory_decrypted((unsigned long)l2addr,
> +				     baser->psz >> PAGE_SHIFT);
> +
>  		/* Flush Lvl2 table to PoC if hw doesn't support coherency */
>  		if (!(baser->val & GITS_BASER_SHAREABILITY_MASK))
> -			gic_flush_dcache_to_poc(page_address(page), baser->psz);
> +			gic_flush_dcache_to_poc(l2addr, baser->psz);
>  
>  		table[idx] = cpu_to_le64(page_to_phys(page) | GITS_BASER_VALID);
>  
> @@ -5043,6 +5065,8 @@ static int __init its_probe_one(struct resource *res,
>  	its->fwnode_handle = handle;
>  	its->get_msi_base = its_irq_get_msi_base;
>  	its->msi_domain_flags = IRQ_DOMAIN_FLAG_MSI_REMAP;
> +	set_memory_decrypted((unsigned long)its->cmd_base,
> +			     ITS_CMD_QUEUE_SZ >> PAGE_SHIFT);
>  
>  	its_enable_quirks(its);
>  
> @@ -5099,6 +5123,8 @@ static int __init its_probe_one(struct resource *res,
>  out_free_tables:
>  	its_free_tables(its);
>  out_free_cmd:
> +	set_memory_encrypted((unsigned long)its->cmd_base,
> +			     ITS_CMD_QUEUE_SZ >> PAGE_SHIFT);
>  	free_pages((unsigned long)its->cmd_base, get_order(ITS_CMD_QUEUE_SZ));
>  out_unmap_sgir:
>  	if (its->sgir_base)

This misses the subtly hidden per-endpoint ITT allocation (see
its_create_device()), which is the only one that is not a full page
allocation.

You could of course upgrade it to a full page, but this is going to
waste a lot of memory on systems with large page sizes, lots of
devices, and only few interrupts per device.

I guess that this would be acceptable on systems such as those
currently targeted by pKVM, but you'd need to make it a runtime
decision. Another possibility would be to build a custom allocator for
this, but this feels like a massive undertaking.

Maybe that's the push we needed to switch the ITS driver to be a
full-blown platform driver instead of a side hack.

	M.
Robin Murphy Dec. 9, 2021, 11:34 a.m. UTC | #6
On 2021-12-09 09:10, Will Deacon wrote:
> On Wed, Dec 08, 2021 at 06:20:36PM +0000, Robin Murphy wrote:
>> On 2021-12-08 15:59, Will Deacon wrote:
>>> The GICv3 ITS driver allocates memory for its tables using alloc_pages()
>>> and performs explicit cache maintenance if necessary. On systems such
>>> as those running pKVM, where the memory encryption API is implemented,
>>> memory shared with the ITS must first be transitioned to the "decrypted"
>>> state, as it would be if allocated via the DMA API.
>>>
>>> Allow pKVM guests to interact with an ITS emulation by ensuring that the
>>> shared pages are decrypted at the point of allocation and encrypted
>>> again upon free().
>>>
>>> Cc: Thomas Gleixner <tglx@linutronix.de>
>>> Cc: Marc Zyngier <maz@kernel.org>
>>> Signed-off-by: Will Deacon <will@kernel.org>
>>> ---
>>>
>>> Although pKVM doesn't yet expose share/unshare hypercalls to the guest,
>>> this change is agnostic of the hypervisor and could be queued
>>> independently as it has no functional impact when the memory encryption
>>> API is not implemented.
>>>
>>>    drivers/irqchip/irq-gic-v3-its.c | 40 ++++++++++++++++++++++++++------
>>>    1 file changed, 33 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
>>> index eb0882d15366..4559b8dfb9bc 100644
>>> --- a/drivers/irqchip/irq-gic-v3-its.c
>>> +++ b/drivers/irqchip/irq-gic-v3-its.c
>>> @@ -27,6 +27,7 @@
>>>    #include <linux/of_pci.h>
>>>    #include <linux/of_platform.h>
>>>    #include <linux/percpu.h>
>>> +#include <linux/set_memory.h>
>>>    #include <linux/slab.h>
>>>    #include <linux/syscore_ops.h>
>>> @@ -2166,6 +2167,7 @@ static void gic_reset_prop_table(void *va)
>>>    	/* Make sure the GIC will observe the written configuration */
>>>    	gic_flush_dcache_to_poc(va, LPI_PROPBASE_SZ);
>>> +	set_memory_decrypted((unsigned long)va, LPI_PROPBASE_SZ >> PAGE_SHIFT);
>>
>> Hmm, that would seem to imply we've just done the memset() and cache clean
>> on encrypted memory... that's not right, surely?
> 
> Actually, that's exactly what we want! The *crypted state is only of concern
> to the DMA accesses, so the idea is that you clear out / initialiase the
> buffer and only then decrypt the memory for the device to access it.
> 
> The DMA code does the same sort of thing -- for example, see
> atomic_pool_expand() in kernel/dma/pool.c.

Sure, we allocate the pages, clean any dirty lines, then decrypt them. 
But the important point is that any actual initialisation happens 
*after* the decryption - the memset() is in dma_alloc_from_pool(), for 
that example. In general, changing the encryption state should be 
treated as changing the contents of the memory.

>>>    }
>>>    static struct page *its_allocate_prop_table(gfp_t gfp_flags)
>>> @@ -2183,8 +2185,10 @@ static struct page *its_allocate_prop_table(gfp_t gfp_flags)
>>>    static void its_free_prop_table(struct page *prop_page)
>>>    {
>>> -	free_pages((unsigned long)page_address(prop_page),
>>> -		   get_order(LPI_PROPBASE_SZ));
>>> +	unsigned long va = (unsigned long)page_address(prop_page);
>>> +
>>> +	set_memory_encrypted(va, LPI_PROPBASE_SZ >> PAGE_SHIFT);
>>
>> Similarly to [1], it might be worth at least pretending to consider failure
>> in these instances. (Side note, might set_memory_encrypted() logically
>> warrant a __must_check annotation?)
> 
> There's not much we can do, but it looks like the DMA allocator leaks the
> memory rather than freeing it, so I'll add that.
> 
>>> @@ -2934,6 +2944,7 @@ static int its_alloc_collections(struct its_node *its)
>>>    static struct page *its_allocate_pending_table(gfp_t gfp_flags)
>>>    {
>>>    	struct page *pend_page;
>>> +	void *va;
>>>    	pend_page = alloc_pages(gfp_flags | __GFP_ZERO,
>>>    				get_order(LPI_PENDBASE_SZ));
>>> @@ -2941,14 +2952,19 @@ static struct page *its_allocate_pending_table(gfp_t gfp_flags)
>>>    		return NULL;
>>>    	/* Make sure the GIC will observe the zero-ed page */
>>> -	gic_flush_dcache_to_poc(page_address(pend_page), LPI_PENDBASE_SZ);
>>> +	va = page_address(pend_page);
>>> +	gic_flush_dcache_to_poc(va, LPI_PENDBASE_SZ);
>>> +	set_memory_decrypted((unsigned long)va, LPI_PENDBASE_SZ >> PAGE_SHIFT);
>>
>> Again, it looks fundamentally sketchy to decrypt memory *after* already
>> doing something to it - under a "real" encryption scheme the cleaned out
>> zeros might now appear as a page full of gibberish ciphertext, which would
>> not be what you want.
>>
>> That said, aren't the pending tables the completely IMP-DEF ones that only
>> the ITS itself ever touches? As-is should we even need to "decrypt" them at
>> all?
> 
> We do, as we need the ITS to be able to access the pages. For pKVM, this
> is actually the virtual ITS emulation in the host.

Derp, sorry, somehow I completely overlooked "pKVM guests" and "ITS 
emulation" in the commit message, and for assumed this was about passing 
the physical ITS through to host EL1. It does indeed make complete sense 
for emulation, so please forgive me the brain-fart there.

>> The overall feeling here is that we're not being particularly consistent
>> about whether we're readily leaning on the assumption of "encryption" only
>> ever meaning CPU stage 2 protection here, or whether we're trying to uphold
>> the illusion of the more general notion with the ITS only being able to
>> observe "plaintext" shared memory.
> 
> Really, I've just tried to follow the way this is used for DMA: pages which
> need to be accessed by the device must first be set as decrypted, and then
> set back to encrypted before freeing back to the page allocator. This maps
> directly to what we're doing with the stage-2 (i.e. decrypt means share, and
> encrypt means unshare) but it would equally apply to real memory encryption
> schemes along the lines of what x86, power and s390 are doing.

That's fine - TBH I'd much *prefer* for all usage of the memory 
encryption APIs to be consistent and not take shortcuts based on 
implementation-specific assumptions - but in that case, please do try to 
actually follow the usage model consistently. This patch as-is would 
*not* work under AMD SME, for example.

Thanks,
Robin.
diff mbox series

Patch

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index eb0882d15366..4559b8dfb9bc 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -27,6 +27,7 @@ 
 #include <linux/of_pci.h>
 #include <linux/of_platform.h>
 #include <linux/percpu.h>
+#include <linux/set_memory.h>
 #include <linux/slab.h>
 #include <linux/syscore_ops.h>
 
@@ -2166,6 +2167,7 @@  static void gic_reset_prop_table(void *va)
 
 	/* Make sure the GIC will observe the written configuration */
 	gic_flush_dcache_to_poc(va, LPI_PROPBASE_SZ);
+	set_memory_decrypted((unsigned long)va, LPI_PROPBASE_SZ >> PAGE_SHIFT);
 }
 
 static struct page *its_allocate_prop_table(gfp_t gfp_flags)
@@ -2183,8 +2185,10 @@  static struct page *its_allocate_prop_table(gfp_t gfp_flags)
 
 static void its_free_prop_table(struct page *prop_page)
 {
-	free_pages((unsigned long)page_address(prop_page),
-		   get_order(LPI_PROPBASE_SZ));
+	unsigned long va = (unsigned long)page_address(prop_page);
+
+	set_memory_encrypted(va, LPI_PROPBASE_SZ >> PAGE_SHIFT);
+	free_pages(va, get_order(LPI_PROPBASE_SZ));
 }
 
 static bool gic_check_reserved_range(phys_addr_t addr, unsigned long size)
@@ -2377,6 +2381,8 @@  static int its_setup_baser(struct its_node *its, struct its_baser *baser,
 		return -ENXIO;
 	}
 
+	set_memory_decrypted((unsigned long)base,
+			     PAGE_ORDER_TO_SIZE(order) >> PAGE_SHIFT);
 	baser->order = order;
 	baser->base = base;
 	baser->psz = psz;
@@ -2512,8 +2518,12 @@  static void its_free_tables(struct its_node *its)
 
 	for (i = 0; i < GITS_BASER_NR_REGS; i++) {
 		if (its->tables[i].base) {
-			free_pages((unsigned long)its->tables[i].base,
-				   its->tables[i].order);
+			unsigned long base = (unsigned long)its->tables[i].base;
+			u32 order = its->tables[i].order;
+			u32 npages = PAGE_ORDER_TO_SIZE(order) >> PAGE_SHIFT;
+
+			set_memory_encrypted(base, npages);
+			free_pages(base, order);
 			its->tables[i].base = NULL;
 		}
 	}
@@ -2934,6 +2944,7 @@  static int its_alloc_collections(struct its_node *its)
 static struct page *its_allocate_pending_table(gfp_t gfp_flags)
 {
 	struct page *pend_page;
+	void *va;
 
 	pend_page = alloc_pages(gfp_flags | __GFP_ZERO,
 				get_order(LPI_PENDBASE_SZ));
@@ -2941,14 +2952,19 @@  static struct page *its_allocate_pending_table(gfp_t gfp_flags)
 		return NULL;
 
 	/* Make sure the GIC will observe the zero-ed page */
-	gic_flush_dcache_to_poc(page_address(pend_page), LPI_PENDBASE_SZ);
+	va = page_address(pend_page);
+	gic_flush_dcache_to_poc(va, LPI_PENDBASE_SZ);
+	set_memory_decrypted((unsigned long)va, LPI_PENDBASE_SZ >> PAGE_SHIFT);
 
 	return pend_page;
 }
 
 static void its_free_pending_table(struct page *pt)
 {
-	free_pages((unsigned long)page_address(pt), get_order(LPI_PENDBASE_SZ));
+	unsigned long va = (unsigned long)page_address(pt);
+
+	set_memory_encrypted(va, LPI_PENDBASE_SZ >> PAGE_SHIFT);
+	free_pages(va, get_order(LPI_PENDBASE_SZ));
 }
 
 /*
@@ -3268,14 +3284,20 @@  static bool its_alloc_table_entry(struct its_node *its,
 
 	/* Allocate memory for 2nd level table */
 	if (!table[idx]) {
+		void *l2addr;
+
 		page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO,
 					get_order(baser->psz));
 		if (!page)
 			return false;
 
+		l2addr = page_address(page);
+		set_memory_decrypted((unsigned long)l2addr,
+				     baser->psz >> PAGE_SHIFT);
+
 		/* Flush Lvl2 table to PoC if hw doesn't support coherency */
 		if (!(baser->val & GITS_BASER_SHAREABILITY_MASK))
-			gic_flush_dcache_to_poc(page_address(page), baser->psz);
+			gic_flush_dcache_to_poc(l2addr, baser->psz);
 
 		table[idx] = cpu_to_le64(page_to_phys(page) | GITS_BASER_VALID);
 
@@ -5043,6 +5065,8 @@  static int __init its_probe_one(struct resource *res,
 	its->fwnode_handle = handle;
 	its->get_msi_base = its_irq_get_msi_base;
 	its->msi_domain_flags = IRQ_DOMAIN_FLAG_MSI_REMAP;
+	set_memory_decrypted((unsigned long)its->cmd_base,
+			     ITS_CMD_QUEUE_SZ >> PAGE_SHIFT);
 
 	its_enable_quirks(its);
 
@@ -5099,6 +5123,8 @@  static int __init its_probe_one(struct resource *res,
 out_free_tables:
 	its_free_tables(its);
 out_free_cmd:
+	set_memory_encrypted((unsigned long)its->cmd_base,
+			     ITS_CMD_QUEUE_SZ >> PAGE_SHIFT);
 	free_pages((unsigned long)its->cmd_base, get_order(ITS_CMD_QUEUE_SZ));
 out_unmap_sgir:
 	if (its->sgir_base)