diff mbox series

[3/5] x86/ioapic: Handle Extended Destination ID field in RTE

Message ID 20201007122046.1113577-3-dwmw2@infradead.org (mailing list archive)
State New, archived
Headers show
Series Fix x2apic enablement and allow up to 32768 CPUs without IR where supported | expand

Commit Message

David Woodhouse Oct. 7, 2020, 12:20 p.m. UTC
From: David Woodhouse <dwmw@amazon.co.uk>

The IOAPIC Redirection Table Entries contain an 8-bit Extended
Destination ID field which maps to bits 11-4 of the MSI address.

The lowest bit is used to indicate remappable format, when interrupt
remapping is in use. A hypervisor can use the other 7 bits to permit
guests to address up to 15 bits of APIC IDs, thus allowing 32768 vCPUs
before having to expose a vIOMMU and interrupt remapping to the guest.

No behavioural change in this patch, since nothing yet permits APIC IDs
above 255 to be used with the non-IR IOAPIC domain. Except for the case
where IR is enabled but there are IOAPICs which aren't in the scope of
any IOMMU, which is totally hosed anyway and needs fixing independently
of this change.

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
---
 arch/x86/include/asm/io_apic.h |  3 ++-
 arch/x86/kernel/apic/io_apic.c | 19 +++++++++++++------
 2 files changed, 15 insertions(+), 7 deletions(-)

Comments

Peter Zijlstra Oct. 8, 2020, 9:12 a.m. UTC | #1
On Wed, Oct 07, 2020 at 01:20:44PM +0100, David Woodhouse wrote:
> @@ -1861,7 +1863,8 @@ static void ioapic_configure_entry(struct irq_data *irqd)
>  	 * ioapic chip to verify that.
>  	 */
>  	if (irqd->chip == &ioapic_chip) {
> -		mpd->entry.dest = cfg->dest_apicid;
> +		mpd->entry.dest = cfg->dest_apicid & 0xff;
> +		mpd->entry.ext_dest = cfg->dest_apicid >> 8;
>  		mpd->entry.vector = cfg->vector;
>  	}
>  	for_each_irq_pin(entry, mpd->irq_2_pin)

All the other sites did memset(0) before the assignment, and this the
extra unconditional write of 0 to ext_dest is harmless.

This might be true for this site too, but it wasn't immediately obvious.
Thomas Gleixner Oct. 8, 2020, 11:41 a.m. UTC | #2
On Wed, Oct 07 2020 at 13:20, David Woodhouse wrote:
> From: David Woodhouse <dwmw@amazon.co.uk>
>
> The IOAPIC Redirection Table Entries contain an 8-bit Extended
> Destination ID field which maps to bits 11-4 of the MSI address.
>
> The lowest bit is used to indicate remappable format, when interrupt
> remapping is in use. A hypervisor can use the other 7 bits to permit
> guests to address up to 15 bits of APIC IDs, thus allowing 32768 vCPUs
> before having to expose a vIOMMU and interrupt remapping to the guest.
>
> No behavioural change in this patch, since nothing yet permits APIC IDs
> above 255 to be used with the non-IR IOAPIC domain. Except for the case
> where IR is enabled but there are IOAPICs which aren't in the scope of
> any IOMMU, which is totally hosed anyway and needs fixing independently
> of this change.

Again: IOAPICs which are not covered by IR are detected and prevent IR
enablement which makes the above a fairy tale. Changelogs are about
facts.

> diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
> index a1a26f6d3aa4..e65a0b7379d0 100644
> --- a/arch/x86/include/asm/io_apic.h
> +++ b/arch/x86/include/asm/io_apic.h
> @@ -78,7 +78,8 @@ struct IO_APIC_route_entry {
>  		mask		:  1,	/* 0: enabled, 1: disabled */
>  		__reserved_2	: 15;
>  
> -	__u32	__reserved_3	: 24,
> +	__u32	__reserved_3	: 17,
> +		ext_dest	:  7,

This wants to be explicitely named 'virt_ext_dest'.

Thanks,

        tglx
David Woodhouse Oct. 8, 2020, 5:05 p.m. UTC | #3
On Thu, 2020-10-08 at 11:12 +0200, Peter Zijlstra wrote:
> On Wed, Oct 07, 2020 at 01:20:44PM +0100, David Woodhouse wrote:
> > @@ -1861,7 +1863,8 @@ static void ioapic_configure_entry(struct irq_data *irqd)
> >  	 * ioapic chip to verify that.
> >  	 */
> >  	if (irqd->chip == &ioapic_chip) {
> > -		mpd->entry.dest = cfg->dest_apicid;
> > +		mpd->entry.dest = cfg->dest_apicid & 0xff;
> > +		mpd->entry.ext_dest = cfg->dest_apicid >> 8;
> >  		mpd->entry.vector = cfg->vector;
> >  	}
> >  	for_each_irq_pin(entry, mpd->irq_2_pin)
> 
> All the other sites did memset(0) before the assignment, and this the
> extra unconditional write of 0 to ext_dest is harmless.
> 
> This might be true for this site too, but it wasn't immediately obvious.

Yeah. Really, that whole 16-bit field ought to be called
'msi_addr_bits_19_to_4' since there's no interpretation by the IOAPIC
and it's *just* passed on in the MSI cycle. See my later patch which
stops making them up for ourselves in the IOAPIC code and *just*
swizzles them out of the MSI message which is composed for us by
upstream.

In this case, since this piece of code is based on the knowledge that
the upstream controller is accepting MSI in the x86 Compatibility
Format, those seven bits are never going to have been used for anything
else.
diff mbox series

Patch

diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index a1a26f6d3aa4..e65a0b7379d0 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -78,7 +78,8 @@  struct IO_APIC_route_entry {
 		mask		:  1,	/* 0: enabled, 1: disabled */
 		__reserved_2	: 15;
 
-	__u32	__reserved_3	: 24,
+	__u32	__reserved_3	: 17,
+		ext_dest	:  7,
 		dest		:  8;
 } __attribute__ ((packed));
 
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index a33380059db6..aa9a3b54a96c 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1239,10 +1239,10 @@  static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
 			       buf, (ir_entry->index2 << 15) | ir_entry->index,
 			       ir_entry->zero);
 		else
-			printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n",
+			printk(KERN_DEBUG "%s, %s, D(%02X%02X), M(%1d)\n",
 			       buf,
 			       entry.dest_mode == IOAPIC_DEST_MODE_LOGICAL ?
-			       "logical " : "physical",
+			       "logical " : "physical", entry.ext_dest,
 			       entry.dest, entry.delivery_mode);
 	}
 }
@@ -1410,6 +1410,7 @@  void native_restore_boot_irq_mode(void)
 	 */
 	if (ioapic_i8259.pin != -1) {
 		struct IO_APIC_route_entry entry;
+		u32 apic_id = read_apic_id();
 
 		memset(&entry, 0, sizeof(entry));
 		entry.mask		= IOAPIC_UNMASKED;
@@ -1417,7 +1418,8 @@  void native_restore_boot_irq_mode(void)
 		entry.polarity		= IOAPIC_POL_HIGH;
 		entry.dest_mode		= IOAPIC_DEST_MODE_PHYSICAL;
 		entry.delivery_mode	= dest_ExtINT;
-		entry.dest		= read_apic_id();
+		entry.dest		= apic_id & 0xff;
+		entry.ext_dest		= apic_id >> 8;
 
 		/*
 		 * Add it to the IO-APIC irq-routing table:
@@ -1861,7 +1863,8 @@  static void ioapic_configure_entry(struct irq_data *irqd)
 	 * ioapic chip to verify that.
 	 */
 	if (irqd->chip == &ioapic_chip) {
-		mpd->entry.dest = cfg->dest_apicid;
+		mpd->entry.dest = cfg->dest_apicid & 0xff;
+		mpd->entry.ext_dest = cfg->dest_apicid >> 8;
 		mpd->entry.vector = cfg->vector;
 	}
 	for_each_irq_pin(entry, mpd->irq_2_pin)
@@ -2027,6 +2030,7 @@  static inline void __init unlock_ExtINT_logic(void)
 	int apic, pin, i;
 	struct IO_APIC_route_entry entry0, entry1;
 	unsigned char save_control, save_freq_select;
+	u32 apic_id;
 
 	pin  = find_isa_irq_pin(8, mp_INT);
 	if (pin == -1) {
@@ -2042,11 +2046,13 @@  static inline void __init unlock_ExtINT_logic(void)
 	entry0 = ioapic_read_entry(apic, pin);
 	clear_IO_APIC_pin(apic, pin);
 
+	apic_id = hard_smp_processor_id();
 	memset(&entry1, 0, sizeof(entry1));
 
 	entry1.dest_mode = IOAPIC_DEST_MODE_PHYSICAL;
 	entry1.mask = IOAPIC_UNMASKED;
-	entry1.dest = hard_smp_processor_id();
+	entry1.dest = apic_id & 0xff;
+	entry1.ext_dest = apic_id >> 8;
 	entry1.delivery_mode = dest_ExtINT;
 	entry1.polarity = entry0.polarity;
 	entry1.trigger = IOAPIC_EDGE;
@@ -2949,7 +2955,8 @@  static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data,
 	memset(entry, 0, sizeof(*entry));
 	entry->delivery_mode = apic->irq_delivery_mode;
 	entry->dest_mode     = apic->irq_dest_mode;
-	entry->dest	     = cfg->dest_apicid;
+	entry->dest	     = cfg->dest_apicid & 0xff;
+	entry->ext_dest	     = cfg->dest_apicid >> 8;
 	entry->vector	     = cfg->vector;
 	entry->trigger	     = data->trigger;
 	entry->polarity	     = data->polarity;