diff mbox

[V2] acpi, pci, irq: account for early penalty assignment

Message ID 1455801582-21595-1-git-send-email-okaya@codeaurora.org (mailing list archive)
State New, archived
Delegated to: Bjorn Helgaas
Headers show

Commit Message

Sinan Kaya Feb. 18, 2016, 1:19 p.m. UTC
A crash has been observed when assigning penalty on x86 systems.

It looks like this problem happens on x86 platforms with IOAPIC and an SCI
interrupt override in the ACPI table with interrupt number greater than
16. (22 in this example)

The bug has been introduced by "ACPI, PCI, irq: remove interrupt count
restriction" commit. The code was using kmalloc to resize the interrupt
list. In this use case, the set penalty call is coming from early phase
and the heap is not initialized yet.

BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
IP: [<ffffffff811e8b9d>] kmem_cache_alloc_trace+0xad/0x1c0
PGD 0
Oops: 0000 [#1] SMP
Modules linked in:
CPU: 0 PID: 0 Comm: swapper Not tainted 4.5.0-rc2Feb-3_RK #1
Hardware name: HP Superdome2 16s, BIOS Bundle: 007.006.000 SFW: 033.162.000
10/30/2015
[<ffffffff813bc190>] acpi_irq_set_penalty+0x60/0x8e
[<ffffffff813bc1df>] acpi_irq_add_penalty+0x21/0x26
[<ffffffff813bc76d>] acpi_penalize_sci_irq+0x25/0x28
[<ffffffff81b8260d>] acpi_sci_ioapic_setup+0x68/0x78
[<ffffffff81b830fc>] acpi_boot_init+0x2cc/0x533
[<ffffffff810677c8>] ? set_pte_vaddr_pud+0x48/0x50
[<ffffffff81b828cf>] ? acpi_parse_x2apic+0x77/0x77
[<ffffffff81b82858>] ? dmi_ignore_irq0_timer_override+0x30/0x30
[<ffffffff81b77c1e>] setup_arch+0xc24/0xce9
[<ffffffff81b6e120>] ? early_idt_handler_array+0x120/0x120
[<ffffffff81b6ed94>] start_kernel+0xfc/0x506
[<ffffffff81b6e120>] ? early_idt_handler_array+0x120/0x120
[<ffffffff81b6e120>] ? early_idt_handler_array+0x120/0x120
[<ffffffff81b6e5ee>] x86_64_start_reservations+0x2a/0x2c
[<ffffffff81b6e73c>] x86_64_start_kernel+0x14c/0x16f

Besides the use case above, there is one more situation where
set_penalty is called from the init context: there is support for
setting the penalty through the kernel command line.

This patch adds support for set_penalty being called from early context,
for a limited number of interrupts.

Reported-by: Nalla, Ravikanth <ravikanth.nalla@hpe.com>
Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
---
 drivers/acpi/pci_link.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

Comments

Timur Tabi Feb. 18, 2016, 3:12 p.m. UTC | #1
Sinan Kaya wrote:
> @@ -968,3 +978,4 @@ void __init acpi_pci_link_init(void)
>   	register_syscore_ops(&irqrouter_syscore_ops);
>   	acpi_scan_add_handler(&pci_link_handler);
>   }
> +

Unrelated whitespace change.
Rafael J. Wysocki Feb. 18, 2016, 4:39 p.m. UTC | #2
On Thu, Feb 18, 2016 at 2:19 PM, Sinan Kaya <okaya@codeaurora.org> wrote:
> A crash has been observed when assigning penalty on x86 systems.
>
> It looks like this problem happens on x86 platforms with IOAPIC and an SCI
> interrupt override in the ACPI table with interrupt number greater than
> 16. (22 in this example)
>
> The bug has been introduced by "ACPI, PCI, irq: remove interrupt count
> restriction" commit. The code was using kmalloc to resize the interrupt
> list. In this use case, the set penalty call is coming from early phase
> and the heap is not initialized yet.
>
> BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
> IP: [<ffffffff811e8b9d>] kmem_cache_alloc_trace+0xad/0x1c0
> PGD 0
> Oops: 0000 [#1] SMP
> Modules linked in:
> CPU: 0 PID: 0 Comm: swapper Not tainted 4.5.0-rc2Feb-3_RK #1
> Hardware name: HP Superdome2 16s, BIOS Bundle: 007.006.000 SFW: 033.162.000
> 10/30/2015
> [<ffffffff813bc190>] acpi_irq_set_penalty+0x60/0x8e
> [<ffffffff813bc1df>] acpi_irq_add_penalty+0x21/0x26
> [<ffffffff813bc76d>] acpi_penalize_sci_irq+0x25/0x28
> [<ffffffff81b8260d>] acpi_sci_ioapic_setup+0x68/0x78
> [<ffffffff81b830fc>] acpi_boot_init+0x2cc/0x533
> [<ffffffff810677c8>] ? set_pte_vaddr_pud+0x48/0x50
> [<ffffffff81b828cf>] ? acpi_parse_x2apic+0x77/0x77
> [<ffffffff81b82858>] ? dmi_ignore_irq0_timer_override+0x30/0x30
> [<ffffffff81b77c1e>] setup_arch+0xc24/0xce9
> [<ffffffff81b6e120>] ? early_idt_handler_array+0x120/0x120
> [<ffffffff81b6ed94>] start_kernel+0xfc/0x506
> [<ffffffff81b6e120>] ? early_idt_handler_array+0x120/0x120
> [<ffffffff81b6e120>] ? early_idt_handler_array+0x120/0x120
> [<ffffffff81b6e5ee>] x86_64_start_reservations+0x2a/0x2c
> [<ffffffff81b6e73c>] x86_64_start_kernel+0x14c/0x16f
>
> Besides from the use case above, there is one more situation where
> set_penalty is being called from the init context like. There is support
> for setting the penalty through kernel command line.
>
> Adding support to be called from early context for limited number of
> interrupts.
>
> Reported-by: Nalla, Ravikanth <ravikanth.nalla@hpe.com>
> Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
> ---
>  drivers/acpi/pci_link.c | 19 +++++++++++++++----
>  1 file changed, 15 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c
> index fa28635..14fe3ca 100644
> --- a/drivers/acpi/pci_link.c
> +++ b/drivers/acpi/pci_link.c
> @@ -47,6 +47,7 @@ ACPI_MODULE_NAME("pci_link");
>  #define ACPI_PCI_LINK_FILE_INFO                "info"
>  #define ACPI_PCI_LINK_FILE_STATUS      "state"
>  #define ACPI_PCI_LINK_MAX_POSSIBLE     16
> +#define ACPI_PCI_LINK_MAX_EARLY_IRQINFO 1024

Why do we need so many of them?

>
>  static int acpi_pci_link_add(struct acpi_device *device,
>                              const struct acpi_device_id *not_used);
> @@ -473,6 +474,8 @@ struct irq_penalty_info {
>  };
>
>  static LIST_HEAD(acpi_irq_penalty_list);
> +static struct irq_penalty_info early_irq_infos[ACPI_PCI_LINK_MAX_EARLY_IRQINFO];
> +static int early_irq_info_counter;
>
>  static int acpi_irq_get_penalty(int irq)
>  {
> @@ -507,10 +510,17 @@ static int acpi_irq_set_penalty(int irq, int new_penalty)
>                 }
>         }
>
> -       /* nope, let's allocate a slot for this IRQ */
> -       irq_info = kzalloc(sizeof(*irq_info), GFP_KERNEL);
> -       if (!irq_info)
> -               return -ENOMEM;
> +       if (!acpi_gbl_permanent_mmap) {
> +               if (early_irq_info_counter < ARRAY_SIZE(early_irq_infos))
> +                       irq_info = &early_irq_infos[early_irq_info_counter++];
> +               else
> +                       return -ENOMEM;
> +       } else {
> +               /* nope, let's allocate a slot for this IRQ */
> +               irq_info = kzalloc(sizeof(*irq_info), GFP_KERNEL);
> +               if (!irq_info)
> +                       return -ENOMEM;
> +       }
>
>         irq_info->irq = irq;
>         irq_info->penalty = new_penalty;
> @@ -968,3 +978,4 @@ void __init acpi_pci_link_init(void)
>         register_syscore_ops(&irqrouter_syscore_ops);
>         acpi_scan_add_handler(&pci_link_handler);
>  }
> +
> --

Bjorn, what do you think about this one?

Thanks,
Rafael
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sinan Kaya Feb. 18, 2016, 4:43 p.m. UTC | #3
On 2/18/2016 11:39 AM, Rafael J. Wysocki wrote:
>> +#define ACPI_PCI_LINK_MAX_EARLY_IRQINFO 1024
> Why do we need so many of them?
> 

Before the "ACPI, PCI, irq: remove interrupt count restriction" change,
the code supported a maximum of 1024 interrupts.

I added the 1024 limit back, but it now applies only to early IRQs.
Bjorn Helgaas Feb. 29, 2016, 7:24 p.m. UTC | #4
On Thu, Feb 18, 2016 at 08:19:41AM -0500, Sinan Kaya wrote:
> A crash has been observed when assigning penalty on x86 systems.
> 
> It looks like this problem happens on x86 platforms with IOAPIC and an SCI
> interrupt override in the ACPI table with interrupt number greater than
> 16. (22 in this example)
> 
> The bug has been introduced by "ACPI, PCI, irq: remove interrupt count
> restriction" commit. The code was using kmalloc to resize the interrupt

When referring to a previous commit, please include the SHA1, e.g.,

  b5bd02695471 ("ACPI, PCI, irq: remove interrupt count restriction")

> list. In this use case, the set penalty call is coming from early phase
> and the heap is not initialized yet.
> 
> BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
> IP: [<ffffffff811e8b9d>] kmem_cache_alloc_trace+0xad/0x1c0
> PGD 0
> Oops: 0000 [#1] SMP
> Modules linked in:
> CPU: 0 PID: 0 Comm: swapper Not tainted 4.5.0-rc2Feb-3_RK #1
> Hardware name: HP Superdome2 16s, BIOS Bundle: 007.006.000 SFW: 033.162.000
> 10/30/2015
> [<ffffffff813bc190>] acpi_irq_set_penalty+0x60/0x8e
> [<ffffffff813bc1df>] acpi_irq_add_penalty+0x21/0x26
> [<ffffffff813bc76d>] acpi_penalize_sci_irq+0x25/0x28
> [<ffffffff81b8260d>] acpi_sci_ioapic_setup+0x68/0x78
> [<ffffffff81b830fc>] acpi_boot_init+0x2cc/0x533
> [<ffffffff810677c8>] ? set_pte_vaddr_pud+0x48/0x50
> [<ffffffff81b828cf>] ? acpi_parse_x2apic+0x77/0x77
> [<ffffffff81b82858>] ? dmi_ignore_irq0_timer_override+0x30/0x30
> [<ffffffff81b77c1e>] setup_arch+0xc24/0xce9
> [<ffffffff81b6e120>] ? early_idt_handler_array+0x120/0x120
> [<ffffffff81b6ed94>] start_kernel+0xfc/0x506
> [<ffffffff81b6e120>] ? early_idt_handler_array+0x120/0x120
> [<ffffffff81b6e120>] ? early_idt_handler_array+0x120/0x120
> [<ffffffff81b6e5ee>] x86_64_start_reservations+0x2a/0x2c
> [<ffffffff81b6e73c>] x86_64_start_kernel+0x14c/0x16f
> 
> Besides from the use case above, there is one more situation where
> set_penalty is being called from the init context like. There is support
> for setting the penalty through kernel command line.
> 
> Adding support to be called from early context for limited number of
> interrupts.

I can't believe this whole IRQ penalty thing needs to be so
complicated.

The only time we actually use the penalty information is when we're
attaching a driver to a PCI device, i.e., in this path:

  pci_device_probe
    pcibios_alloc_irq
      pcibios_enable_irq

That happens pretty late, so there's no "can't allocate memory during
early boot" problem.

I bet the only thing that might happen early enough to be an issue is
the acpi_penalize_sci_irq() thing, which is a special case that
doesn't need to be handled generically.

> Reported-by: Nalla, Ravikanth <ravikanth.nalla@hpe.com>
> Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
> ---
>  drivers/acpi/pci_link.c | 19 +++++++++++++++----
>  1 file changed, 15 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c
> index fa28635..14fe3ca 100644
> --- a/drivers/acpi/pci_link.c
> +++ b/drivers/acpi/pci_link.c
> @@ -47,6 +47,7 @@ ACPI_MODULE_NAME("pci_link");
>  #define ACPI_PCI_LINK_FILE_INFO		"info"
>  #define ACPI_PCI_LINK_FILE_STATUS	"state"
>  #define ACPI_PCI_LINK_MAX_POSSIBLE	16
> +#define ACPI_PCI_LINK_MAX_EARLY_IRQINFO 1024
>  
>  static int acpi_pci_link_add(struct acpi_device *device,
>  			     const struct acpi_device_id *not_used);
> @@ -473,6 +474,8 @@ struct irq_penalty_info {
>  };
>  
>  static LIST_HEAD(acpi_irq_penalty_list);
> +static struct irq_penalty_info early_irq_infos[ACPI_PCI_LINK_MAX_EARLY_IRQINFO];
> +static int early_irq_info_counter;
>  
>  static int acpi_irq_get_penalty(int irq)
>  {
> @@ -507,10 +510,17 @@ static int acpi_irq_set_penalty(int irq, int new_penalty)
>  		}
>  	}
>  
> -	/* nope, let's allocate a slot for this IRQ */
> -	irq_info = kzalloc(sizeof(*irq_info), GFP_KERNEL);
> -	if (!irq_info)
> -		return -ENOMEM;
> +	if (!acpi_gbl_permanent_mmap) {
> +		if (early_irq_info_counter < ARRAY_SIZE(early_irq_infos))
> +			irq_info = &early_irq_infos[early_irq_info_counter++];
> +		else
> +			return -ENOMEM;
> +	} else {
> +		/* nope, let's allocate a slot for this IRQ */
> +		irq_info = kzalloc(sizeof(*irq_info), GFP_KERNEL);
> +		if (!irq_info)
> +			return -ENOMEM;
> +	}
>  
>  	irq_info->irq = irq;
>  	irq_info->penalty = new_penalty;
> @@ -968,3 +978,4 @@ void __init acpi_pci_link_init(void)
>  	register_syscore_ops(&irqrouter_syscore_ops);
>  	acpi_scan_add_handler(&pci_link_handler);
>  }
> +
> -- 
> 1.8.2.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sinan Kaya Feb. 29, 2016, 8:08 p.m. UTC | #5
Hi Bjorn,

On 2/29/2016 2:24 PM, Bjorn Helgaas wrote:
> On Thu, Feb 18, 2016 at 08:19:41AM -0500, Sinan Kaya wrote:
>> A crash has been observed when assigning penalty on x86 systems.
>>
>> It looks like this problem happens on x86 platforms with IOAPIC and an SCI
>> interrupt override in the ACPI table with interrupt number greater than
>> 16. (22 in this example)
>>
>> The bug has been introduced by "ACPI, PCI, irq: remove interrupt count
>> restriction" commit. The code was using kmalloc to resize the interrupt
> 
> When referring to a previous commit, please include the SHA1, e.g.,
> 
>   b5bd02695471 ("ACPI, PCI, irq: remove interrupt count restriction")

OK

> 
>> list. In this use case, the set penalty call is coming from early phase
>> and the heap is not initialized yet.
>>
>> BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
>> IP: [<ffffffff811e8b9d>] kmem_cache_alloc_trace+0xad/0x1c0
>> PGD 0
>> Oops: 0000 [#1] SMP
>> Modules linked in:
>> CPU: 0 PID: 0 Comm: swapper Not tainted 4.5.0-rc2Feb-3_RK #1
>> Hardware name: HP Superdome2 16s, BIOS Bundle: 007.006.000 SFW: 033.162.000
>> 10/30/2015
>> [<ffffffff813bc190>] acpi_irq_set_penalty+0x60/0x8e
>> [<ffffffff813bc1df>] acpi_irq_add_penalty+0x21/0x26
>> [<ffffffff813bc76d>] acpi_penalize_sci_irq+0x25/0x28
>> [<ffffffff81b8260d>] acpi_sci_ioapic_setup+0x68/0x78
>> [<ffffffff81b830fc>] acpi_boot_init+0x2cc/0x533
>> [<ffffffff810677c8>] ? set_pte_vaddr_pud+0x48/0x50
>> [<ffffffff81b828cf>] ? acpi_parse_x2apic+0x77/0x77
>> [<ffffffff81b82858>] ? dmi_ignore_irq0_timer_override+0x30/0x30
>> [<ffffffff81b77c1e>] setup_arch+0xc24/0xce9
>> [<ffffffff81b6e120>] ? early_idt_handler_array+0x120/0x120
>> [<ffffffff81b6ed94>] start_kernel+0xfc/0x506
>> [<ffffffff81b6e120>] ? early_idt_handler_array+0x120/0x120
>> [<ffffffff81b6e120>] ? early_idt_handler_array+0x120/0x120
>> [<ffffffff81b6e5ee>] x86_64_start_reservations+0x2a/0x2c
>> [<ffffffff81b6e73c>] x86_64_start_kernel+0x14c/0x16f
>>
>> Besides from the use case above, there is one more situation where
>> set_penalty is being called from the init context like. There is support
>> for setting the penalty through kernel command line.
>>
>> Adding support to be called from early context for limited number of
>> interrupts.
> 
> I can't believe this whole IRQ penalty thing needs to be so
> complicated.
> 
> The only time we actually use the penalty information is when we're
> attaching a driver to a PCI device, i.e., in this path:
> 
>   pci_device_probe
>     pcibios_alloc_irq
>       pcibios_enable_irq
> 
> That happens pretty late, so there's no "can't allocate memory during
> early boot" problem.

Correct, this is the path that code is intended for.

> 
> I bet the only thing that might happen early enough to be an issue is
> the acpi_penalize_sci_irq() thing, which is a special case that
> doesn't need to be handled generically.

The second use case is the kernel command line. At the bottom of the file
there are routines that read the penalty information from the command line.

How would you like to proceed ?

- merge this to the original patch
- move the acpi_penalize_sci_irq code somewhere else.
- what about the kernel command line?


> 
>> Reported-by: Nalla, Ravikanth <ravikanth.nalla@hpe.com>
>> Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
>> ---
>>  drivers/acpi/pci_link.c | 19 +++++++++++++++----
>>  1 file changed, 15 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c
>> index fa28635..14fe3ca 100644
>> --- a/drivers/acpi/pci_link.c
>> +++ b/drivers/acpi/pci_link.c
>> @@ -47,6 +47,7 @@ ACPI_MODULE_NAME("pci_link");
>>  #define ACPI_PCI_LINK_FILE_INFO		"info"
>>  #define ACPI_PCI_LINK_FILE_STATUS	"state"
>>  #define ACPI_PCI_LINK_MAX_POSSIBLE	16
>> +#define ACPI_PCI_LINK_MAX_EARLY_IRQINFO 1024
>>  
>>  static int acpi_pci_link_add(struct acpi_device *device,
>>  			     const struct acpi_device_id *not_used);
>> @@ -473,6 +474,8 @@ struct irq_penalty_info {
>>  };
>>  
>>  static LIST_HEAD(acpi_irq_penalty_list);
>> +static struct irq_penalty_info early_irq_infos[ACPI_PCI_LINK_MAX_EARLY_IRQINFO];
>> +static int early_irq_info_counter;
>>  
>>  static int acpi_irq_get_penalty(int irq)
>>  {
>> @@ -507,10 +510,17 @@ static int acpi_irq_set_penalty(int irq, int new_penalty)
>>  		}
>>  	}
>>  
>> -	/* nope, let's allocate a slot for this IRQ */
>> -	irq_info = kzalloc(sizeof(*irq_info), GFP_KERNEL);
>> -	if (!irq_info)
>> -		return -ENOMEM;
>> +	if (!acpi_gbl_permanent_mmap) {
>> +		if (early_irq_info_counter < ARRAY_SIZE(early_irq_infos))
>> +			irq_info = &early_irq_infos[early_irq_info_counter++];
>> +		else
>> +			return -ENOMEM;
>> +	} else {
>> +		/* nope, let's allocate a slot for this IRQ */
>> +		irq_info = kzalloc(sizeof(*irq_info), GFP_KERNEL);
>> +		if (!irq_info)
>> +			return -ENOMEM;
>> +	}
>>  
>>  	irq_info->irq = irq;
>>  	irq_info->penalty = new_penalty;
>> @@ -968,3 +978,4 @@ void __init acpi_pci_link_init(void)
>>  	register_syscore_ops(&irqrouter_syscore_ops);
>>  	acpi_scan_add_handler(&pci_link_handler);
>>  }
>> +
>> -- 
>> 1.8.2.1
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
Bjorn Helgaas Feb. 29, 2016, 10:34 p.m. UTC | #6
On Mon, Feb 29, 2016 at 03:08:26PM -0500, Sinan Kaya wrote:
> Hi Bjorn,
> 
> On 2/29/2016 2:24 PM, Bjorn Helgaas wrote:
> > On Thu, Feb 18, 2016 at 08:19:41AM -0500, Sinan Kaya wrote:
> >> A crash has been observed when assigning penalty on x86 systems.
> >>
> >> It looks like this problem happens on x86 platforms with IOAPIC and an SCI
> >> interrupt override in the ACPI table with interrupt number greater than
> >> 16. (22 in this example)
> >>
> >> The bug has been introduced by "ACPI, PCI, irq: remove interrupt count
> >> restriction" commit. The code was using kmalloc to resize the interrupt
> >> list. In this use case, the set penalty call is coming from early phase
> >> and the heap is not initialized yet.
> >> ...

> >> Besides from the use case above, there is one more situation where
> >> set_penalty is being called from the init context like. There is support
> >> for setting the penalty through kernel command line.
> >>
> >> Adding support to be called from early context for limited number of
> >> interrupts.
> > 
> > I can't believe this whole IRQ penalty thing needs to be so
> > complicated.
> > 
> > The only time we actually use the penalty information is when we're
> > attaching a driver to a PCI device, i.e., in this path:
> > 
> >   pci_device_probe
> >     pcibios_alloc_irq
> >       pcibios_enable_irq
> > 
> > That happens pretty late, so there's no "can't allocate memory during
> > early boot" problem.
> 
> Correct, this is the path that code is intended for.
> 
> > I bet the only thing that might happen early enough to be an issue is
> > the acpi_penalize_sci_irq() thing, which is a special case that
> > doesn't need to be handled generically.
> 
> The second use case is the kernel command line. See the bottom of the code, 
> there are routines there to go get the penalty information from command line.

Right.  But if we don't *use* the information until later, there's
probably no need to parse the command line and set it up so early.

> How would you like to proceed ?
> 
> - merge this to the original patch
> - remove the acpi_penalize_sci_irq code to somewhere else.
> - what about the kernel command line?

There's so much code there, that I think all the code obscures the
fact that there's almost nothing really happening.  In broad outline,
I think we care about:

  - the legacy ISA IRQs, i.e., the contents of acpi_irq_isa_penalty[]
  - acpi_irq_isa= from command line
  - acpi_irq_pci= from command line
  - which IRQ is used for SCI
  - number of PCI Interrupt Link devices sharing an IRQ

I doubt we need any dynamic allocation at all to manage this.  We
already have the acpi_irq_isa_penalty[] table statically allocated.
The SCI IRQ is one word.  I bet the command-line stuff is only
useful for the 16 ISA IRQs and could be merged into
acpi_irq_isa_penalty[].  Same for acpi_penalize_isa_irq() and
acpi_isa_irq_available().  We could easily compute the
number of links sharing an IRQ by traversing acpi_link_list.

I think only x86 cares about the first three items (legacy ISA IRQs
and command-line args).  This should be reflected in the code.  Only
x86 calls acpi_irq_penalty_init(), but that's pretty non-obvious.

I think it would be better to completely rewrite this penalty stuff
than to keep making it more complicated by fixing things in the
existing design.

> >> Reported-by: Nalla, Ravikanth <ravikanth.nalla@hpe.com>
> >> Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
> >> ---
> >>  drivers/acpi/pci_link.c | 19 +++++++++++++++----
> >>  1 file changed, 15 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c
> >> index fa28635..14fe3ca 100644
> >> --- a/drivers/acpi/pci_link.c
> >> +++ b/drivers/acpi/pci_link.c
> >> @@ -47,6 +47,7 @@ ACPI_MODULE_NAME("pci_link");
> >>  #define ACPI_PCI_LINK_FILE_INFO		"info"
> >>  #define ACPI_PCI_LINK_FILE_STATUS	"state"
> >>  #define ACPI_PCI_LINK_MAX_POSSIBLE	16
> >> +#define ACPI_PCI_LINK_MAX_EARLY_IRQINFO 1024
> >>  
> >>  static int acpi_pci_link_add(struct acpi_device *device,
> >>  			     const struct acpi_device_id *not_used);
> >> @@ -473,6 +474,8 @@ struct irq_penalty_info {
> >>  };
> >>  
> >>  static LIST_HEAD(acpi_irq_penalty_list);
> >> +static struct irq_penalty_info early_irq_infos[ACPI_PCI_LINK_MAX_EARLY_IRQINFO];
> >> +static int early_irq_info_counter;
> >>  
> >>  static int acpi_irq_get_penalty(int irq)
> >>  {
> >> @@ -507,10 +510,17 @@ static int acpi_irq_set_penalty(int irq, int new_penalty)
> >>  		}
> >>  	}
> >>  
> >> -	/* nope, let's allocate a slot for this IRQ */
> >> -	irq_info = kzalloc(sizeof(*irq_info), GFP_KERNEL);
> >> -	if (!irq_info)
> >> -		return -ENOMEM;
> >> +	if (!acpi_gbl_permanent_mmap) {
> >> +		if (early_irq_info_counter < ARRAY_SIZE(early_irq_infos))
> >> +			irq_info = &early_irq_infos[early_irq_info_counter++];
> >> +		else
> >> +			return -ENOMEM;
> >> +	} else {
> >> +		/* nope, let's allocate a slot for this IRQ */
> >> +		irq_info = kzalloc(sizeof(*irq_info), GFP_KERNEL);
> >> +		if (!irq_info)
> >> +			return -ENOMEM;
> >> +	}
> >>  
> >>  	irq_info->irq = irq;
> >>  	irq_info->penalty = new_penalty;
> >> @@ -968,3 +978,4 @@ void __init acpi_pci_link_init(void)
> >>  	register_syscore_ops(&irqrouter_syscore_ops);
> >>  	acpi_scan_add_handler(&pci_link_handler);
> >>  }
> >> +
> >> -- 
> >> 1.8.2.1
> >>
> >> --
> >> To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
> >> the body of a message to majordomo@vger.kernel.org
> >> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > 
> 
> 
> -- 
> Sinan Kaya
> Qualcomm Technologies, Inc. on behalf of Qualcomm Innovation Center, Inc.
> Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project
> --
> To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sinan Kaya March 1, 2016, 6:49 p.m. UTC | #7
> There's so much code there, that I think all the code obscures the
> fact that there's almost nothing really happening.  In broad outline,
> I think we care about:
> 
>   - the legacy ISA IRQs, i.e., the contents of acpi_irq_isa_penalty[]
>   - acpi_irq_isa= from command line
>   - acpi_irq_pci= from command line
>   - which IRQ is used for SCI
>   - number of PCI Interrupt Link devices sharing an IRQ
> 
> I doubt we need any dynamic allocation at all to manage this.  We
> already have the acpi_irq_isa_penalty[] table statically allocated.
> The SCI IRQ is one word.  

Just to be clear, we have resized the acpi_irq_penalty table to 16 entries and
renamed it acpi_irq_isa_penalty. We are dynamically allocating memory for the
rest of the interrupts, i.e. those that do not fit in that table.

The SCI interrupt that caused the failure is interrupt 22 in this case. The code
was trying to allocate a new entry with kzalloc. 22 won't fit into the
acpi_irq_isa_penalty array. How do we handle such a case? Is there a cap on the SCI
interrupt number?

That's why, I was trying to reallocate some memory in this code.


> I bet the command-line stuff is only
> useful for the 16 ISA IRQs and could be merged into
> acpi_irq_isa_penalty[].  
> Same for acpi_penalize_isa_irq() and
> acpi_isa_irq_available().  

Agreed. No issues with ISA IRQs.

> We could easily compute the
> number of links sharing an IRQ by traversing acpi_link_list.

Sorry, I couldn't quite get this. Where would you do this?

> 
> I think only x86 cares about the first three items (legacy ISA IRQs
> and command-line args).  This should be reflected in the code.  Only
> x86 calls acpi_irq_penalty_init(), but that's pretty non-obvious.

Very true. It looks like somebody sneaked in some code there.

> 
> I think it would be better to completely rewrite this penalty stuff
> than to keep making it more complicated by fixing things in the
> existing design.
> 
OK. Let's find out what it needs to look like. I need guidance on what IRQs on
Intel systems look like. 

>>>> Reported-by: Nalla, Ravikanth <ravikanth.nalla@hpe.com>
>>>> Signed-off-by: Sinan Kaya <okaya@codeaurora.org>
>>>> ---
>>>>  drivers/acpi/pci_link.c | 19 +++++++++++++++----
>>>>  1 file changed, 15 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c
>>>> index fa28635..14fe3ca 100644
>>>> --- a/drivers/acpi/pci_link.c
>>>> +++ b/drivers/acpi/pci_link.c
>>>> @@ -47,6 +47,7 @@ ACPI_MODULE_NAME("pci_link");
>>>>  #define ACPI_PCI_LINK_FILE_INFO		"info"
>>>>  #define ACPI_PCI_LINK_FILE_STATUS	"state"
>>>>  #define ACPI_PCI_LINK_MAX_POSSIBLE	16
>>>> +#define ACPI_PCI_LINK_MAX_EARLY_IRQINFO 1024
>>>>  
>>>>  static int acpi_pci_link_add(struct acpi_device *device,
>>>>  			     const struct acpi_device_id *not_used);
>>>> @@ -473,6 +474,8 @@ struct irq_penalty_info {
>>>>  };
>>>>  
>>>>  static LIST_HEAD(acpi_irq_penalty_list);
>>>> +static struct irq_penalty_info early_irq_infos[ACPI_PCI_LINK_MAX_EARLY_IRQINFO];
>>>> +static int early_irq_info_counter;
>>>>  
>>>>  static int acpi_irq_get_penalty(int irq)
>>>>  {
>>>> @@ -507,10 +510,17 @@ static int acpi_irq_set_penalty(int irq, int new_penalty)
>>>>  		}
>>>>  	}
>>>>  
>>>> -	/* nope, let's allocate a slot for this IRQ */
>>>> -	irq_info = kzalloc(sizeof(*irq_info), GFP_KERNEL);
>>>> -	if (!irq_info)
>>>> -		return -ENOMEM;
>>>> +	if (!acpi_gbl_permanent_mmap) {
>>>> +		if (early_irq_info_counter < ARRAY_SIZE(early_irq_infos))
>>>> +			irq_info = &early_irq_infos[early_irq_info_counter++];
>>>> +		else
>>>> +			return -ENOMEM;
>>>> +	} else {
>>>> +		/* nope, let's allocate a slot for this IRQ */
>>>> +		irq_info = kzalloc(sizeof(*irq_info), GFP_KERNEL);
>>>> +		if (!irq_info)
>>>> +			return -ENOMEM;
>>>> +	}
>>>>  
>>>>  	irq_info->irq = irq;
>>>>  	irq_info->penalty = new_penalty;
>>>> @@ -968,3 +978,4 @@ void __init acpi_pci_link_init(void)
>>>>  	register_syscore_ops(&irqrouter_syscore_ops);
>>>>  	acpi_scan_add_handler(&pci_link_handler);
>>>>  }
>>>> +
>>>> -- 
>>>> 1.8.2.1
>>>>
>>>> --
>>>> To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>
>>
>>
>> -- 
>> Sinan Kaya
>> Qualcomm Technologies, Inc. on behalf of Qualcomm Innovation Center, Inc.
>> Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a Linux Foundation Collaborative Project
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
Bjorn Helgaas March 1, 2016, 7:43 p.m. UTC | #8
On Tue, Mar 01, 2016 at 01:49:34PM -0500, Sinan Kaya wrote:
> > There's so much code there, that I think all the code obscures the
> > fact that there's almost nothing really happening.  In broad outline,
> > I think we care about:
> > 
> >   - the legacy ISA IRQs, i.e., the contents of acpi_irq_isa_penalty[]
> >   - acpi_irq_isa= from command line
> >   - acpi_irq_pci= from command line
> >   - which IRQ is used for SCI
> >   - number of PCI Interrupt Link devices sharing an IRQ
> > 
> > I doubt we need any dynamic allocation at all to manage this.  We
> > already have the acpi_irq_isa_penalty[] table statically allocated.
> > The SCI IRQ is one word.  
> 
> Just to be clear, we have resized acpi_irq_penalty table to 16 and named it
> acpi_irq_isa_penalty. We are dynamically allocating memory for the rest of 
> the interrupts that is bigger than 16. 
> 
> The SCI interrupt that caused the failure is interrupt 22 in this case. The code
> was trying to allocate a new entry with kzalloc. 22 won't fit into the 
> acpi_irq_isa_penalty array. How do we handle such case? Is there a cap on the SCI
> interrupt number? 
> 
> That's why, I was trying to reallocate some memory in this code.

I don't think there's a restriction on what the SCI IRQ can be.  But
there is only one SCI IRQ, so all we have to do is keep track of what
it is, which only requires one word.

> > I bet the command-line stuff is only
> > useful for the 16 ISA IRQs and could be merged into
> > acpi_irq_isa_penalty[].  
> > Same for acpi_penalize_isa_irq() and
> > acpi_isa_irq_available().  
> 
> Agreed. No issues with ISA IRQs.
> 
> > We could easily compute the
> > number of links sharing an IRQ by traversing acpi_link_list.
> 
> Sorry, I couldn't quite get this. Where would you do this?

I've never been exactly clear on how these links work.  So pardon me
while I think out loud and bore you with what you already know
(correct me if I get this wrong):

  - A link device has a PCI wired interrupt (INTA, INTB, etc.) on its
    "downstream" end.

  - The link device has a set of possible interrupt controller inputs
    to which it can connect the PCI interrupt.  _PRS contains this
    set.

  - When we enable a PCI device's interrupt, Interrupt Pin from config
    space tells us which INTx it uses.  The _PRT tells us whether that
    INTx is connected to (a) a fixed GSI or (b) an Interrupt Link that
    can be configured to one of several interrupt controller inputs.

  - If the latter, we must select one of the interrupt controller
    inputs to use, i.e., one of the IRQs from _PRS, and enable the
    Link.

  - If the Link is already active, we probably shouldn't change its
    configuration because other devices might already be using it.

  - If the Link is inactive, we must choose an IRQ and activate it.
    We should be able to choose anything from _PRS (as long as the
    level & trigger attributes match), but we can try to reduce IRQ
    sharing by avoiding an IRQ that's already in use.

This IRQ selection process is where we use the penalty information.
In acpi_pci_link_allocate(), we iterate through the possible choices
(link->irq.possible[i]) and choose the one with the smallest penalty.

Here's a sketch of what I'm thinking the code could look like.  In x86
code:

  /*
   * x86 contribution to the penalty of @irq: the sum of the entries for
   * @irq in acpi_irq_isa_penalty[] and acpi_irq_cmd_line_penalty[].
   *
   * Only IRQs in the range 0 .. ACPI_MAX_ISA_IRQ - 1 are looked up; any
   * other value contributes no penalty.
   * NOTE(review): assumes both tables hold ACPI_MAX_ISA_IRQ entries, and
   * that the command-line table is filled from acpi_irq_isa=/acpi_irq_pci=
   * -- confirm against the final implementation.
   */
  int pcibios_irq_penalty(int irq)
  {
    /* Reject negative IRQs as well, so neither table is indexed out of bounds. */
    if (irq < 0 || irq >= ACPI_MAX_ISA_IRQ)
      return 0;

    return acpi_irq_isa_penalty[irq] + acpi_irq_cmd_line_penalty[irq];
  }

In pci_link.c:

  /* The SCI IRQ number and the penalty attached to it; one word each. */
  static int sci_irq;
  static int sci_irq_penalty;

  /*
   * Record which IRQ the SCI uses and how strongly PCI should avoid it.
   * A negative @irq is ignored and leaves the recorded state untouched.
   */
  void acpi_penalize_sci_irq(int irq, int trigger, int polarity)
  {
    int level_low;

    if (irq < 0)
      return;

    sci_irq = irq;

    /* Only a level-triggered, active-low SCI line is usable for PCI. */
    level_low = (trigger == ACPI_MADT_TRIGGER_LEVEL &&
                 polarity == ACPI_MADT_POLARITY_ACTIVE_LOW);

    if (level_low)
      sci_irq_penalty = PIRQ_PENALTY_PCI_USING;
    else
      sci_irq_penalty = infinite;  /* can't use for PCI at all */
  }

  /*
   * Penalty reflecting how the PCI Interrupt Link devices on acpi_link_list
   * already use, or could use, @irq.  Walks every link and returns the sum
   * of the per-link penalties.
   */
  static int pci_irq_sharing_penalty(int irq)
  {
    struct acpi_pci_link *link;
    int penalty = 0;
    int i;

    list_for_each_entry(link, &acpi_link_list, list) {

      /*
       * If a link is active, penalize its IRQ heavily so we try to choose
       * a different IRQ.
       */
      if (link->irq.active && link->irq.active == irq)
        penalty += PIRQ_PENALTY_PCI_USING;
      else {

        /*
         * Otherwise (the link is inactive, or active on another IRQ),
         * penalize the IRQs it might use, but not as severely.
         */
        for (i = 0; i < link->irq.possible_count; i++)
          if (link->irq.possible[i] == irq)
            penalty += PIRQ_PENALTY_PCI_POSSIBLE;
      }
    }

    return penalty;
  }

  /*
   * Weak default for pcibios_irq_penalty(): contributes no penalty for any
   * @irq.  Being __weak, it is meant to be replaced by a non-weak
   * definition where one is provided (the sketch proposes one in x86 code).
   */
  int __weak pcibios_irq_penalty(int irq)
  {
    return 0;
  }

  /*
   * Total penalty for @irq, computed on demand from three sources: the
   * pcibios_irq_penalty() hook, the SCI penalty when @irq is the SCI IRQ,
   * and the sharing penalty from the PCI Interrupt Link devices.
   */
  static int acpi_irq_get_penalty(int irq)
  {
    int total = pcibios_irq_penalty(irq);

    if (sci_irq == irq)
      total += sci_irq_penalty;

    return total + pci_irq_sharing_penalty(irq);
  }
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sinan Kaya March 2, 2016, 6:31 p.m. UTC | #9
On 3/1/2016 2:43 PM, Bjorn Helgaas wrote:
> On Tue, Mar 01, 2016 at 01:49:34PM -0500, Sinan Kaya wrote:
>>> There's so much code there, that I think all the code obscures the
>>> fact that there's almost nothing really happening.  In broad outline,
>>> I think we care about:
>>>
>>>   - the legacy ISA IRQs, i.e., the contents of acpi_irq_isa_penalty[]
>>>   - acpi_irq_isa= from command line
>>>   - acpi_irq_pci= from command line
>>>   - which IRQ is used for SCI
>>>   - number of PCI Interrupt Link devices sharing an IRQ
>>>
>>> I doubt we need any dynamic allocation at all to manage this.  We
>>> already have the acpi_irq_isa_penalty[] table statically allocated.
>>> The SCI IRQ is one word.  
>>
>> Just to be clear, we have resized acpi_irq_penalty table to 16 and named it
>> acpi_irq_isa_penalty. We are dynamically allocating memory for the rest of 
>> the interrupts that is bigger than 16. 
>>
>> The SCI interrupt that caused the failure is interrupt 22 in this case. The code
>> was trying to allocate a new entry with kzalloc. 22 won't fit into the 
>> acpi_irq_isa_penalty array. How do we handle such case? Is there a cap on the SCI
>> interrupt number? 
>>
>> That's why, I was trying to reallocate some memory in this code.
> 
> I don't think there's a restriction on what the SCI IRQ can be.  But
> there is only one SCI IRQ, so all we have to do is keep track of what
> it is, which only requires one word.
> 
>>> I bet the command-line stuff is only
>>> useful for the 16 ISA IRQs and could be merged into
>>> acpi_irq_isa_penalty[].  
>>> Same for acpi_penalize_isa_irq() and
>>> acpi_isa_irq_available().  
>>
>> Agreed. No issues with ISA IRQs.
>>
>>> We could easily compute the
>>> number of links sharing an IRQ by traversing acpi_link_list.
>>
>> Sorry, I couldn't quite get this. Where would you do this?
> 
> I've never been exactly clear on how these links work.  So pardon me
> while I think out loud and bore you with what you already know
> (correct me if I get this wrong):
> 
>   - A link device has a PCI wired interrupt (INTA, INTB, etc.) on its
>     "downstream" end.
> 
>   - The link device has a set of possible interrupt controller inputs
>     to which it can connect the PCI interrupt.  _PRS contains this
>     set.
> 
>   - When we enable a PCI device's interrupt, Interrupt Pin from config
>     space tells us which INTx it uses.  The _PRT tells us whether that
>     INTx is connected to (a) a fixed GSI or (b) an Interrupt Link that
>     can be configured to one of several interrupt controller inputs.
> 
>   - If the latter, we must select one of the interrupt controller
>     inputs to use, i.e., one of the IRQs from _PRS, and enable the
>     Link.
> 
>   - If the Link is already active, we probably shouldn't change its
>     configuration because other devices might already be using it.
> 
>   - If the Link is inactive, we must choose an IRQ and activate it.
>     We should be able to choose anything from _PRS (as long as the
>     level & trigger attributes match), but we can try to reduce IRQ
>     sharing by avoiding an IRQ that's already in use.
> 

Really nice write-up. We need to fold this into the code; it was never
obvious.

I'll send something soon.

> This IRQ selection process is where we use the penalty information.
> In acpi_pci_link_allocate(), we iterate through the possible choices
> (link->irq.possible[i]) and choose the one with the smallest penalty.
> 
> Here's a sketch of what I'm thinking the code could look like.  In x86
> code:
> 
>   int pcibios_irq_penalty(int irq)
>   {
>     if (irq >= ACPI_MAX_ISA_IRQ)
>       return 0;
> 
>     return acpi_irq_isa_penalty[irq] + acpi_irq_cmd_line_penalty[irq];
>   }
> 

> In pci_link.c:
> 
>   static int sci_irq, sci_irq_penalty;
> 
>   void acpi_penalize_sci_irq(int irq, int trigger, int polarity)
>   {
>     if (irq < 0)
>       return;
> 
>     sci_irq = irq;
>     if (trigger != ACPI_MADT_TRIGGER_LEVEL ||
>         polarity != ACPI_MADT_POLARITY_ACTIVE_LOW)
>           sci_irq_penalty = infinite;  /* can't use for PCI at all */
>     else
>       sci_irq_penalty = PIRQ_PENALTY_PCI_USING;
>   }
> 
>   static pci_irq_sharing_penalty(int irq)
>   {
>     struct acpi_pci_link *link;
>     int penalty = 0;
> 
>     list_for_each_entry(link, &acpi_link_list, list) {
> 
>       /*
>        * If a link is active, penalize its IRQ heavily so we try to choose
>        * a different IRQ.
>        */
>       if (link->irq.active && link->irq.active == irq)
>         penalty += PIRQ_PENALTY_PCI_USING;
>       else {
> 
>         /*
>          * If a link is inactive, penalize the IRQs it might use, but
>          * not as severely.
>          */ 
>         for (i = 0; i < link->irq.possible_count; i++)
>           if (link->irq.possible[i] == irq)
>             penalty += PIRQ_PENALTY_PCI_POSSIBLE;
>       }
>     }
> 
>     return penalty;
>   }
> 
>   int __weak pcibios_irq_penalty(int irq)
>   {
>     return 0;
>   }
> 
>   static int acpi_irq_get_penalty(int irq)
>   {
>     int penalty;
> 
>     penalty = pcibios_irq_penalty(irq);
> 
>     if (irq == sci_irq)
>       penalty += sci_irq_penalty;
> 
>     penalty += pci_irq_sharing_penalty(irq);
>     return penalty;
>   }
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
diff mbox

Patch

diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c
index fa28635..14fe3ca 100644
--- a/drivers/acpi/pci_link.c
+++ b/drivers/acpi/pci_link.c
@@ -47,6 +47,7 @@  ACPI_MODULE_NAME("pci_link");
 #define ACPI_PCI_LINK_FILE_INFO		"info"
 #define ACPI_PCI_LINK_FILE_STATUS	"state"
 #define ACPI_PCI_LINK_MAX_POSSIBLE	16
+#define ACPI_PCI_LINK_MAX_EARLY_IRQINFO 1024
 
 static int acpi_pci_link_add(struct acpi_device *device,
 			     const struct acpi_device_id *not_used);
@@ -473,6 +474,8 @@  struct irq_penalty_info {
 };
 
 static LIST_HEAD(acpi_irq_penalty_list);
+static struct irq_penalty_info early_irq_infos[ACPI_PCI_LINK_MAX_EARLY_IRQINFO];
+static int early_irq_info_counter;
 
 static int acpi_irq_get_penalty(int irq)
 {
@@ -507,10 +510,17 @@  static int acpi_irq_set_penalty(int irq, int new_penalty)
 		}
 	}
 
-	/* nope, let's allocate a slot for this IRQ */
-	irq_info = kzalloc(sizeof(*irq_info), GFP_KERNEL);
-	if (!irq_info)
-		return -ENOMEM;
+	if (!acpi_gbl_permanent_mmap) {
+		if (early_irq_info_counter < ARRAY_SIZE(early_irq_infos))
+			irq_info = &early_irq_infos[early_irq_info_counter++];
+		else
+			return -ENOMEM;
+	} else {
+		/* nope, let's allocate a slot for this IRQ */
+		irq_info = kzalloc(sizeof(*irq_info), GFP_KERNEL);
+		if (!irq_info)
+			return -ENOMEM;
+	}
 
 	irq_info->irq = irq;
 	irq_info->penalty = new_penalty;
@@ -968,3 +978,4 @@  void __init acpi_pci_link_init(void)
 	register_syscore_ops(&irqrouter_syscore_ops);
 	acpi_scan_add_handler(&pci_link_handler);
 }
+