diff mbox

[2/2] ACPI / scan: Parse _CCA and setup device coherency

Message ID 1430315049-4663-3-git-send-email-Suravee.Suthikulpanit@amd.com (mailing list archive)
State New, archived
Headers show

Commit Message

Suravee Suthikulpanit April 29, 2015, 1:44 p.m. UTC
This patch implements support for the ACPI _CCA object, which was introduced
in ACPIv5.1 and can be used for specifying the device DMA coherency attribute.

The parsing logic traverses device namespace to parse coherency
information, and stores it in acpi_device_flags. Then uses it to call
arch_setup_dma_ops() when creating each device enumerated in DSDT
during ACPI scan.

This patch also introduces acpi_dma_is_coherent(), which provides
an interface for device drivers to check the coherency information
similarly to the of_dma_is_coherent().

Signed-off-by: Mark Salter <msalter@redhat.com>
Signed-off-by: Suravee Suthikulpanit <Suravee.Suthikulpanit@amd.com>
---
 drivers/acpi/acpi_platform.c |  5 ++++-
 drivers/acpi/scan.c          | 45 ++++++++++++++++++++++++++++++++++++++++++++
 include/acpi/acpi_bus.h      |  9 ++++++++-
 3 files changed, 57 insertions(+), 2 deletions(-)

Comments

Arnd Bergmann April 29, 2015, 2:03 p.m. UTC | #1
On Wednesday 29 April 2015 08:44:09 Suravee Suthikulpanit wrote:
> +                       device->flags.cca_seen = 1;
> +               } else if (IS_ENABLED(CONFIG_ACPI_MUST_HAVE_CCA)) {
> +                       /*
> +                        * Architecture has specified that if the device
> +                        * can do DMA, it must have ACPI _CCA object.
> +                        * Here, there could be two cases:
> +                        *   1. Not DMA-able device.
> +                        *   2. DMA-able device, but missing _CCA object.
> +                        *
> +                        * In both cases, we will default to dma non-coherent.
> +                        */
> +                       cca = 0;
> +               } else {
> +                       /*
> +                        * If architecture does not specify that device must
> +                        * specify ACPI _CCA (e.g. x86), we default to use
> +                        * dma coherent.
> +                        */
> +                       cca = 1;
> +               }
> 

What does it mean here if a device does DMA but is not coherent? Do you
have an example of a server that needs this?

Can we please make the default for ARM64 cca=1 as well?

	Arnd
Suravee Suthikulpanit April 29, 2015, 2:45 p.m. UTC | #2
On 04/29/2015 09:03 AM, Arnd Bergmann wrote:
> On Wednesday 29 April 2015 08:44:09 Suravee Suthikulpanit wrote:
>> +                       device->flags.cca_seen = 1;
>> +               } else if (IS_ENABLED(CONFIG_ACPI_MUST_HAVE_CCA)) {
>> +                       /*
>> +                        * Architecture has specified that if the device
>> +                        * can do DMA, it must have ACPI _CCA object.
>> +                        * Here, there could be two cases:
>> +                        *   1. Not DMA-able device.
>> +                        *   2. DMA-able device, but missing _CCA object.
>> +                        *
>> +                        * In both cases, we will default to dma non-coherent.
>> +                        */
>> +                       cca = 0;
>> +               } else {
>> +                       /*
>> +                        * If architecture does not specify that device must
>> +                        * specify ACPI _CCA (e.g. x86), we default to use
>> +                        * dma coherent.
>> +                        */
>> +                       cca = 1;
>> +               }
>>
>
> What does it mean here if a device does DMA but is not coherent? Do you
> have an example of a server that needs this?
>
> Can we please make the default for ARM64 cca=1 as well?
>
> 	Arnd
>

Actually, I am trying to make the logic for a missing _CCA
consistent with the behavior when the devicetree entry does not specify
the "dma-coherent" property. IIUC, in such a case, Linux will default to using
non-coherent DMA.

Thanks,

Suravee
Arnd Bergmann April 29, 2015, 2:47 p.m. UTC | #3
On Wednesday 29 April 2015 09:45:43 Suravee Suthikulpanit wrote:
> On 04/29/2015 09:03 AM, Arnd Bergmann wrote:
> > On Wednesday 29 April 2015 08:44:09 Suravee Suthikulpanit wrote:
> >> +                       device->flags.cca_seen = 1;
> >> +               } else if (IS_ENABLED(CONFIG_ACPI_MUST_HAVE_CCA)) {
> >> +                       /*
> >> +                        * Architecture has specified that if the device
> >> +                        * can do DMA, it must have ACPI _CCA object.
> >> +                        * Here, there could be two cases:
> >> +                        *   1. Not DMA-able device.
> >> +                        *   2. DMA-able device, but missing _CCA object.
> >> +                        *
> >> +                        * In both cases, we will default to dma non-coherent.
> >> +                        */
> >> +                       cca = 0;
> >> +               } else {
> >> +                       /*
> >> +                        * If architecture does not specify that device must
> >> +                        * specify ACPI _CCA (e.g. x86), we default to use
> >> +                        * dma coherent.
> >> +                        */
> >> +                       cca = 1;
> >> +               }
> >>
> >
> > What does it mean here if a device does DMA but is not coherent? Do you
> > have an example of a server that needs this?
> >
> > Can we please make the default for ARM64 cca=1 as well?
> >
> >       Arnd
> >
> 
> Actually, I am trying to implement the logic for when missing _CCA to be 
> consistent with the behavior when the devicetree entry does not specify 
> "dma-coherent" property. IIUC, in such case, Linux will default to using 
> non-coherent DMA.

Why?

	Arnd
Suravee Suthikulpanit April 29, 2015, 2:57 p.m. UTC | #4
On 4/29/15, 09:47, "Arnd Bergmann" <arnd@arndb.de> wrote:

>On Wednesday 29 April 2015 09:45:43 Suravee Suthikulpanit wrote:
>> On 04/29/2015 09:03 AM, Arnd Bergmann wrote:
>> > On Wednesday 29 April 2015 08:44:09 Suravee Suthikulpanit wrote:
>> >> +                       device->flags.cca_seen = 1;
>> >> +               } else if (IS_ENABLED(CONFIG_ACPI_MUST_HAVE_CCA)) {
>> >> +                       /*
>> >> +                        * Architecture has specified that if the
>>device
>> >> +                        * can do DMA, it must have ACPI _CCA object.
>> >> +                        * Here, there could be two cases:
>> >> +                        *   1. Not DMA-able device.
>> >> +                        *   2. DMA-able device, but missing _CCA
>>object.
>> >> +                        *
>> >> +                        * In both cases, we will default to dma
>>non-coherent.
>> >> +                        */
>> >> +                       cca = 0;
>> >> +               } else {
>> >> +                       /*
>> >> +                        * If architecture does not specify that
>>device must
>> >> +                        * specify ACPI _CCA (e.g. x86), we default
>>to use
>> >> +                        * dma coherent.
>> >> +                        */
>> >> +                       cca = 1;
>> >> +               }
>> >>
>> >
>> > What does it mean here if a device does DMA but is not coherent? Do
>>you
>> > have an example of a server that needs this?
>> >
>> > Can we please make the default for ARM64 cca=1 as well?
>> >
>> >       Arnd
>> >
>> 
>> Actually, I am trying to implement the logic for when missing _CCA to
>>be 
>> consistent with the behavior when the devicetree entry does not specify
>> "dma-coherent" property. IIUC, in such case, Linux will default to
>>using 
>> non-coherent DMA.
>
>Why?
>
>	Arnd

Otherwise, it would seem inconsistent with what is stated in the ACPI spec:
 
  CCA objects are only relevant for devices that can access CPU-visible
memory,
  such as devices that are DMA capable. On ARM based systems, the _CCA
object 
  must be supplied all such devices. On Intel platforms, if the _CCA
object is 
  not supplied, the OSPM will assume the devices are hardware cache
coherent.

From the statement above, I interpreted it to mean that if _CCA is not
present, the device would be non-coherent.

Suravee
al.stone@linaro.org April 29, 2015, 3:39 p.m. UTC | #5
On 04/29/2015 08:57 AM, Suthikulpanit, Suravee wrote:
> 
> 
> On 4/29/15, 09:47, "Arnd Bergmann" <arnd@arndb.de> wrote:
> 
>> On Wednesday 29 April 2015 09:45:43 Suravee Suthikulpanit wrote:
>>> On 04/29/2015 09:03 AM, Arnd Bergmann wrote:
>>>> On Wednesday 29 April 2015 08:44:09 Suravee Suthikulpanit wrote:
>>>>> +                       device->flags.cca_seen = 1;
>>>>> +               } else if (IS_ENABLED(CONFIG_ACPI_MUST_HAVE_CCA)) {
>>>>> +                       /*
>>>>> +                        * Architecture has specified that if the
>>> device
>>>>> +                        * can do DMA, it must have ACPI _CCA object.
>>>>> +                        * Here, there could be two cases:
>>>>> +                        *   1. Not DMA-able device.
>>>>> +                        *   2. DMA-able device, but missing _CCA
>>> object.
>>>>> +                        *
>>>>> +                        * In both cases, we will default to dma
>>> non-coherent.
>>>>> +                        */
>>>>> +                       cca = 0;
>>>>> +               } else {
>>>>> +                       /*
>>>>> +                        * If architecture does not specify that
>>> device must
>>>>> +                        * specify ACPI _CCA (e.g. x86), we default
>>> to use
>>>>> +                        * dma coherent.
>>>>> +                        */
>>>>> +                       cca = 1;
>>>>> +               }
>>>>>
>>>>
>>>> What does it mean here if a device does DMA but is not coherent? Do
>>> you
>>>> have an example of a server that needs this?
>>>>
>>>> Can we please make the default for ARM64 cca=1 as well?
>>>>
>>>>       Arnd
>>>>
>>>
>>> Actually, I am trying to implement the logic for when missing _CCA to
>>> be 
>>> consistent with the behavior when the devicetree entry does not specify
>>> "dma-coherent" property. IIUC, in such case, Linux will default to
>>> using 
>>> non-coherent DMA.
>>
>> Why?
>>
>> 	Arnd
> 
> Otherwise, it would seem inconsistent with what states in the ACPI spec:
>  
>   CCA objects are only relevant for devices that can access CPU-visible
> memory,
>   such as devices that are DMA capable. On ARM based systems, the _CCA
> object 
>   must be supplied all such devices. On Intel platforms, if the _CCA
> object is 
>   not supplied, the OSPM will assume the devices are hardware cache
> coherent.
> 
> From the statement above, I interpreted as if it is not present, it would
> be non-coherent.
> 
> Suravee

A little background to Suravee's statement...

When the spec was being changed for _CCA, it was determined by the ASWG
that there was no reasonable default -- either choice would break something.
Multiple OSs, SoC vendors, and platform vendors were asked.  So, the spec
says for ARMv8, _CCA must be specified when needed and is not assumed to have
any value.  Obviously, any OS can choose to behave differently, but that's
what was specified and why it was specified that way.
Arnd Bergmann April 29, 2015, 3:54 p.m. UTC | #6
On Wednesday 29 April 2015 14:57:10 Suthikulpanit, Suravee wrote:
> Otherwise, it would seem inconsistent with what states in the ACPI spec:
>  
>   CCA objects are only relevant for devices that can access CPU-visible
> memory,
>   such as devices that are DMA capable. On ARM based systems, the _CCA
> object 
>   must be supplied all such devices. On Intel platforms, if the _CCA
> object is 
>   not supplied, the OSPM will assume the devices are hardware cache
> coherent.
> 
> From the statement above, I interpreted as if it is not present, it would
> be non-coherent.
> 

My guess is that this section was included for Windows Phone, which runs
on embedded SoCs that usually have noncoherent DMA in a particular way.

Linux however only uses ACPI for servers, so that case does not happen.

I guess it would be reasonable to add a run-time warning here if you
try to do DMA on a device that does not have CCA set, and you should
probably set the DMA mask to 0 in that case as well.

Note that there are lots of ways in which you could have noncoherent DMA:
the default on ARM32 is that it requires uncached access or explicit
cache flushes, but it's also possible to have an SMP system where a device
is only coherent with some of the CPUs and requires explicit synchronization
(not flushes) otherwise. In a multi-level cache hierarchy, there could be
all sorts of combinations of flushes and syncs you would need to do.

With DT, we handle this using SoC-specific overrides for platforms that
are noncoherent in funny ways, see
http://lxr.free-electrons.com/source/arch/arm/mach-mvebu/coherency.c?v=3.18#L263
for instance. If we just disallow DMA to devices that are marked with _CCA=0
in ACPI, we can avoid this case, or discuss it by the time someone has hardware
that wants it, and then make a more informed decision about it.

	Arnd
Arnd Bergmann April 29, 2015, 4:15 p.m. UTC | #7
On Wednesday 29 April 2015 09:39:25 Al Stone wrote:
> 
> When the spec was being changed for _CCA, it was determined by the ASWG
> that there was no reasonable default -- either choice would break something.
> Multiple OSs, SoC vendors, and platform vendors were asked.  So, the spec
> says for ARMv8, _CCA must be specified when needed and is not assumed to have
> any value.  Obviously, any OS can choose to behave differently, but that's
> what was specified and why it was specified that way.

Ok, so it was essentially a CYA strategy. As we know that for Linux we're
only interested in server parts here, but we also want to be compliant,
I'd still argue that we check the property value and just disallow DMA
for any device that is lacking CCA or contains zero here.

The current patch actually implements non-standard behavior: if _CCA
is missing, it registers the device as dma-capable with coherency turned
off, where my interpretation of the cited standard would be that we
treat a missing _CCA as not being able to perform DMA. What I'd like
to see instead is to only enable DMA support if _CCA is present and
enabled.

	Arnd
Arnd Bergmann April 29, 2015, 4:25 p.m. UTC | #8
On Wednesday 29 April 2015 08:44:09 Suravee Suthikulpanit wrote:
> diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c
> index 4bf7559..a4db208 100644
> --- a/drivers/acpi/acpi_platform.c
> +++ b/drivers/acpi/acpi_platform.c
> @@ -108,9 +108,12 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev)
>         if (IS_ERR(pdev))
>                 dev_err(&adev->dev, "platform device creation failed: %ld\n",
>                         PTR_ERR(pdev));
> -       else
> +       else {
> +               arch_setup_dma_ops(&pdev->dev, 0, 0, NULL,
> +                                  adev->flags.is_coherent);
>                 dev_dbg(&adev->dev, "created platform device %s\n",
>                         dev_name(&pdev->dev));
> +       }
>  
>         kfree(resources);
> 

Looking at this code in more detail, it seems that it unconditionally
sets pdevinfo.dma_mask = DMA_BIT_MASK(32), before calling
arch_setup_dma_ops(). This assignment should really be done inside of
arch_setup_dma_ops() instead, which means we should implement that
function on all architectures that support ACPI.

For the case where _CCA is missing (or coherency disabled, if you ask
me), we would not call that function.

On a related note, I'm not sure how to handle different DMA masks here.
arch_setup_dma_ops() gets passed a size (and offset) argument, which should
match the DMA mask, but I don't know if there is a way to find out the
size from ACPI. Should we assume it's always 64-bit DMA capable?

For legacy reasons, the default mask is probably best left at 32-bit,
but drivers are expected to call dma_set_mask() if they can do 64-bit DMA,
and that should fail based on the information provided by the platform
if the bus is not capable of doing that.

	Arnd
Suravee Suthikulpanit April 29, 2015, 9:53 p.m. UTC | #9
On 4/29/15 11:25, Arnd Bergmann wrote:
> On Wednesday 29 April 2015 08:44:09 Suravee Suthikulpanit wrote:
>> diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c
>> index 4bf7559..a4db208 100644
>> --- a/drivers/acpi/acpi_platform.c
>> +++ b/drivers/acpi/acpi_platform.c
>> @@ -108,9 +108,12 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev)
>>          if (IS_ERR(pdev))
>>                  dev_err(&adev->dev, "platform device creation failed: %ld\n",
>>                          PTR_ERR(pdev));
>> -       else
>> +       else {
>> +               arch_setup_dma_ops(&pdev->dev, 0, 0, NULL,
>> +                                  adev->flags.is_coherent);
>>                  dev_dbg(&adev->dev, "created platform device %s\n",
>>                          dev_name(&pdev->dev));
>> +       }
>>
>>          kfree(resources);
>>
>
> Looking at this code in more detail, it seems that it unconditionally
> sets pdevinfo.dma_mask = DMA_BIT_MASK(32), before calling
> arch_setup_dma_ops().

I think that's just the default legacy value assigned when the
platform_device is first created from the acpi_device.

> This assignment should really done inside of arch_setup_dma_ops()
 > instead, which means we should implement that
> function on all architectures that support ACPI.


> For the case where _CCA is missing (or coherency disabled, if you ask
> me), we would not call that function.

Actually, I agree that for the case of a missing _CCA when one is needed, the
ACPI driver probably should not make an assumption, and should leave the
decision to the underlying arch-specific default. Basically, it should not be
calling arch_setup_dma_ops().

As for the case where _CCA=0, I think the ACPI driver should essentially
communicate the information that the HW is non-coherent as described in the
spec, and should be calling arch_setup_dma_ops(dev, false). It is true
that this is probably less likely for the ARM64 server platforms.
However, I would think that the ACPI driver should not be making such
an assumption.

> On a related note, I'm not sure how to handle different DMA masks here.
> arch_setup_dma_ops() gets passed a size (and offset) argument, which should
> match the DMA mask, but I don't know if there is a way to find out the
> size from ACPI. Should we assume it's always 64-bit DMA capable?

Looking at the ACPI spec, it does have the _DMA object. IIUC, this can 
be used to describe DMA properties of a particular bus.

Method(_DMA, ResourceTemplate()
{
	QWORDMemory(
	ResourceConsumer,
	PosDecode, // _DEC
	MinFixed, // _MIF
	MaxFixed, // _MAF
	Prefetchable, // _MEM
	ReadWrite, // _RW
	0, // _GRA
	0, // _MIN
	0x1fffffff, // _MAX
	0x200000000, // _TRA
	0x20000000, // _LEN
	, , ,	
	)
}

I am not sure if this is an appropriate use for this object, but this 
seems to be similar to the dma-ranges property for OF, and probably can 
be used to specify baseaddr and size information when calling 
arch_setup_dma_ops().

> For legacy reasons, the default mask is probably best left at 32-bit,
> but drivers are expected to call dma_set_mask() if they can do 64-bit DMA,
> and that should fail based on the information provided by the platform
> if the bus is not capable of doing that.
>
> 	Arnd
>

However, on ARM64 the dma_base and size parameters for
arch_setup_dma_ops() are currently not used, and only the coherent flag is
used. We probably should look at this separately. For the moment, we can
probably say that if the _CCA object is missing when needed, the ACPI driver
won't set up dma_mask when creating the platform_device, which should be
equivalent to saying DMA is not supported.

Please let me know if this is acceptable, and I'll make change in V2 
accordingly.

Thanks,

Suravee
Arnd Bergmann April 30, 2015, 8:23 a.m. UTC | #10
On Wednesday 29 April 2015 16:53:10 Suravee Suthikulpanit wrote:
> On 4/29/15 11:25, Arnd Bergmann wrote:
> > On Wednesday 29 April 2015 08:44:09 Suravee Suthikulpanit wrote:
> >> diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c
> >> index 4bf7559..a4db208 100644
> >> --- a/drivers/acpi/acpi_platform.c
> >> +++ b/drivers/acpi/acpi_platform.c
> >> @@ -108,9 +108,12 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev)
> >>          if (IS_ERR(pdev))
> >>                  dev_err(&adev->dev, "platform device creation failed: %ld\n",
> >>                          PTR_ERR(pdev));
> >> -       else
> >> +       else {
> >> +               arch_setup_dma_ops(&pdev->dev, 0, 0, NULL,
> >> +                                  adev->flags.is_coherent);
> >>                  dev_dbg(&adev->dev, "created platform device %s\n",
> >>                          dev_name(&pdev->dev));
> >> +       }
> >>
> >>          kfree(resources);
> >>
> >
> > Looking at this code in more detail, it seems that it unconditionally
> > sets pdevinfo.dma_mask = DMA_BIT_MASK(32), before calling
> > arch_setup_dma_ops().
> 
> I think that's just the default legacy value assigned when it first 
> create the platform_device from acpi_device.

Understood. And on x86 there is no way to find out if a device supports
DMA or not, so it has to do this I guess.

> > This assignment should really done inside of arch_setup_dma_ops()
>  > instead, which means we should implement that
> > function on all architectures that support ACPI.
> 
> 
> > For the case where _CCA is missing (or coherency disabled, if you ask
> > me), we would not call that function.
> 
> Actually, I agree for the case of missing _CCA when needed, ACPI driver 
> probably should not make assumption and leave the decision for the 
> default underlying arch-specific default. Basically, it should not be 
> calling arch_setup_dma_ops().

Ok.

> As for the case where _CCA=0, I think the ACPI driver should essentially 
> communicate the information as HW is non-coherent as described in the 
> spec, and should be calling arch_setup_dma_ops(dev, false). It is true 
> that this in probably less-likely for the ARM64 server platforms. 
> However, I would think that the ACPI driver should not be making such 
> assumption.

Can you add a description to the ACPI spec then to describe in detail what
"non-coherent" is supposed to mean, and which action the OS is supposed to
take when accessing data from device or CPU?

As I explained, the way we handle it by default on ARM64 is what embedded
systems typically do, but that might be completely different on the imagined
server chips that are not coherent for some reason. Just saying a device
is not coherent is like saying the CPU has known bugs but not saying how
to prevent it from crashing.

Is there some AML method that the OS can call to synchronize the cache
controller for all DMA to/from a particular device?

> > On a related note, I'm not sure how to handle different DMA masks here.
> > arch_setup_dma_ops() gets passed a size (and offset) argument, which should
> > match the DMA mask, but I don't know if there is a way to find out the
> > size from ACPI. Should we assume it's always 64-bit DMA capable?
> 
> Looking at the ACPI spec, it does have the _DMA object. IIUC, this can 
> be used to describe DMA properties of a particular bus.
> 
> Method(_DMA, ResourceTemplate()
> {
> 	QWORDMemory(
> 	ResourceConsumer,
> 	PosDecode, // _DEC
> 	MinFixed, // _MIF
> 	MaxFixed, // _MAF
> 	Prefetchable, // _MEM
> 	ReadWrite, // _RW
> 	0, // _GRA
> 	0, // _MIN
> 	0x1fffffff, // _MAX
> 	0x200000000, // _TRA
> 	0x20000000, // _LEN
> 	, , ,	
> 	)
> }
> 
> I am not sure if this is an appropriate use for this object, but this 
> seems to be similar to the dma-ranges property for OF, and probably can 
> be used to specify baseaddr and size information when calling 
> arch_setup_dma_ops().

Yes, that seems like a good idea. What is the expected behavior when that
object is absent? Do we assume that the parent device is not DMA capable?

Is this sufficient to describe the case where a device can only do DMA
to a specific address range that is not at bus address zero but that maps
to the beginning of physical RAM?

> > For legacy reasons, the default mask is probably best left at 32-bit,
> > but drivers are expected to call dma_set_mask() if they can do 64-bit DMA,
> > and that should fail based on the information provided by the platform
> > if the bus is not capable of doing that.
> >
> 
> However, on ARM64 the dma_base and size parameter for 
> arch_setup_dma_ops() is currently not used, and only coherent flag is 
> used. 

We can hope that we won't need the dma_base setting here, but it's
good to have the option to pass it down if we need it.

Not passing the size is a bug that needs to be fixed ASAP. I believe
a number of folks have run into this, most recently with the APM X-Gene
MMC controller.

> We probably should look at this separately. For the moment, we can 
> probably say that if _CCA object is missing when needed, the ACPI driver 
> won't set up dma_mask when creating platform_device, which should be 
> equivalent to saying DMA is not supported.
> 
> Please let me know if this is acceptable, and I'll make change in V2 
> accordingly.

I would still ask that you treat non-coherent to mean "no DMA" until
we have come up with a way to sufficiently describe the kind of
non-coherency in ACPI.

	Arnd
Will Deacon April 30, 2015, 10:41 a.m. UTC | #11
Hi Arnd,

On Thu, Apr 30, 2015 at 09:23:59AM +0100, Arnd Bergmann wrote:
> On Wednesday 29 April 2015 16:53:10 Suravee Suthikulpanit wrote:
> > As for the case where _CCA=0, I think the ACPI driver should essentially 
> > communicate the information as HW is non-coherent as described in the 
> > spec, and should be calling arch_setup_dma_ops(dev, false). It is true 
> > that this in probably less-likely for the ARM64 server platforms. 
> > However, I would think that the ACPI driver should not be making such 
> > assumption.
> 
> Can you add a description to the ACPI spec then to describe in detail what
> "non-coherent" is supposed to mean, and which action the OS is supposed to
> take when accessing data from device or CPU?

You may be interested in the IORT ACPI companion spec here:

  http://infocenter.arm.com/help/topic/com.arm.doc.den0049a/DEN0049A_IO_Remapping_Table.pdf

On CCA, it says:

  `This value must match the value returned by the _CCA object defined in
   the DSDT for the device represented by this node. The attribute can take
   the following values:

   - 0x1: The device is fully coherent. No cache maintenance[1] is required for
     memory shared with the device which is mapped on CPUs as
     Inner Write-Back (IWB), Outer Write-back (OWB), and Inner
     shareable (ISH). In addition, during system initialization at cold
     boot, or after wakeup from low-power state, if the cache
     coherency requires an SMMU override or some specific device
     configuration, the platform firmware has to ensure that this has
     been done. Therefore the semantics represented by a value of
     0x1 are always correct at the time of hand-off from firmware to
     OS.

   - 0x0: The device is not coherent. Therefore:
     * Cache maintenance is required for memory shared with the
       device that is mapped on CPUs as IWB-OWB-ISH.
     * No cache maintenance is required for memory shared with the
       device that is mapped on the CPU as device or Non-cacheable.

   All other values are reserved.

[1] Note: Caching operations described in this document apply to the CPU
    caches and any other caches in the system where device memory accesses
    can hit.'

This aside, the document introduces some useful, related concepts such
as CPM (coherent path to memory) and DACS (device attributes are cacheable
and inner shareable) for describing different IO subsystems. It also has
mechanisms to describe ID repainting from PCI->SMMU->ITS.

Will
Arnd Bergmann April 30, 2015, 10:47 a.m. UTC | #12
On Thursday 30 April 2015 11:41:02 Will Deacon wrote:
> Hi Arnd,
> 
> On Thu, Apr 30, 2015 at 09:23:59AM +0100, Arnd Bergmann wrote:
> > On Wednesday 29 April 2015 16:53:10 Suravee Suthikulpanit wrote:
> > > As for the case where _CCA=0, I think the ACPI driver should essentially 
> > > communicate the information as HW is non-coherent as described in the 
> > > spec, and should be calling arch_setup_dma_ops(dev, false). It is true 
> > > that this in probably less-likely for the ARM64 server platforms. 
> > > However, I would think that the ACPI driver should not be making such 
> > > assumption.
> > 
> > Can you add a description to the ACPI spec then to describe in detail what
> > "non-coherent" is supposed to mean, and which action the OS is supposed to
> > take when accessing data from device or CPU?
> 
> You may be interested in the IORT ACPI companion spec here:
> 
>   http://infocenter.arm.com/help/topic/com.arm.doc.den0049a/DEN0049A_IO_Remapping_Table.pdf
> 
> On CCA, it says:
> 
>   `This value must match the value returned by the _CCA object defined in
>    the DSDT for the device represented by this node. The attribute can take
>    the following values:
> 
>    - 0x1: The device is fully coherent. No cache maintenance[1] is required for
>      memory shared with the device which is mapped on CPUs as
>      Inner Write-Back (IWB), Outer Write-back (OWB), and Inner
>      shareable (ISH). In addition, during system initialization at cold
>      boot, or after wakeup from low-power state, if the cache
>      coherency requires an SMMU override or some specific device
>      configuration, the platform firmware has to ensure that this has
>      been done. Therefore the semantics represented by a value of
>      0x1 are always correct at the time of hand-off from firmware to
>      OS.

Ok, this part absolutely makes sense.

>    - 0x0: The device is not coherent. Therefore:
>      * Cache maintenance is required for memory shared with the
>        device that is mapped on CPUs as IWB-OWB-ISH.

This still seems insufficient. I guess this excludes having to
synchronize external bridges or write buffers, but it does not specify
what cache maintenance is required. Should there be an "outer-flush"?
Should the CPU cache be invalidated or flushed (or both), and do
we need to care about caches inside of the device or just inside of
the CPU?

>      * No cache maintenance is required for memory shared with the
>        device that is mapped on the CPU as device or Non-cacheable.
> 
>    All other values are reserved.
> 
> [1] Note: Caching operations described in this document apply to the CPU
>     caches and any other caches in the system where device memory accesses
>     can hit.'
> 
> This aside, the documented introduces some useful, related concepts such
> as CPM (coherent path to memory) and DACS (device attributes are cacheable
> and inner shareable) for describing different IO subsystems. It also has
> mechanisms to descibe ID repainting from PCI->SMMU->ITS.

Ah, good.

	Arnd
Will Deacon April 30, 2015, 11:07 a.m. UTC | #13
On Thu, Apr 30, 2015 at 11:47:46AM +0100, Arnd Bergmann wrote:
> On Thursday 30 April 2015 11:41:02 Will Deacon wrote:
> >    - 0x0: The device is not coherent. Therefore:
> >      * Cache maintenance is required for memory shared with the
> >        device that is mapped on CPUs as IWB-OWB-ISH.
> 
> This still seems insufficient. I guess this excludes having to
> synchronize external bridges or write buffers, but it does not specify
> what cache maintenance is required. Should there be an "outer-flush"?
> Should the CPU cache be invalidated or flushed (or both), and do
> we need to care about caches inside of the device or just inside of
> the CPU?

See the note below:

> > [1] Note: Caching operations described in this document apply to the CPU
> >     caches and any other caches in the system where device memory accesses
> >     can hit.'

So for the CPU caches we'd do the usual clean to push dirty lines to the device
and (clean+)invalidate before reading data from the device. For the "other
caches in the system" we currently assume (for ARM64) that cache maintenance
will be broadcast and therefore I wouldn't anticipate doing anything extra.

If people want to build system caches that don't respect broadcast cache
maintenance and require explicit management (e.g outer_flush), then I
consider that a broken system and we should try to disable the cache before
entering the kernel. ARMv8 explicitly prohibits this type of cache in the
architecture (type 1 below):

  `Conceptually, three classes of system cache can be envisaged:

   1. System caches which lie before the point of coherency and cannot
      be managed by any cache maintenance instructions. Such systems
      fundamentally undermine the concept of cache maintenance
      instructions operating to the point of coherency, as they imply
      the use of non-architecture mechanisms to manage coherency. The
      use of such systems in the ARM architecture is explicitly
      prohibited.

   2. System caches which lie before the point of coherency and can be
      managed by cache maintenance by address instructions that apply to
      the point of coherency, but cannot be managed by cache maintenance
      by set/way instructions. Where maintenance of the entirety of such
      a cache must be performed, as in the case for power management, it
      must be performed using non-architectural mechanisms.

   3. System caches which lie beyond the point of coherency and so are
      invisible to the software. The management of such caches is
      outside the scope of the architecture.'

(sorry to keep throwing the book at you!)

Will
Arnd Bergmann April 30, 2015, 11:24 a.m. UTC | #14
On Thursday 30 April 2015 12:07:18 Will Deacon wrote:
> On Thu, Apr 30, 2015 at 11:47:46AM +0100, Arnd Bergmann wrote:
> > On Thursday 30 April 2015 11:41:02 Will Deacon wrote:
> > >    - 0x0: The device is not coherent. Therefore:
> > >      * Cache maintenance is required for memory shared with the
> > >        device that is mapped on CPUs as IWB-OWB-ISH.
> > 
> > This still seems insufficient. I guess this excludes having to
> > synchronize external bridges or write buffers, but it does not specify
> > what cache maintenance is required. Should there be an "outer-flush"?
> > Should the CPU cache be invalidated or flushed (or both), and do
> > we need to care about caches inside of the device or just inside of
> > the CPU?
> 
> See the note below:
> 
> > > [1] Note: Caching operations described in this document apply to the CPU
> > >     caches and any other caches in the system where device memory accesses
> > >     can hit.'
> 
> So for the CPU caches we'd do the usual clean to push dirty lines to the device
> and (clean+)invalidate before reading data from the device. For the "other
> caches in the system" we currently assume (for ARM64) that cache maintenance
> will be broadcast and therefore I wouldn't anticipate doing anything extra.
> 
> If people want to build system caches that don't respect broadcast cache
> maintenance and require explicit management (e.g outer_flush), then I
> consider that a broken system and we should try to disable the cache before
> entering the kernel. ARMv8 explicitly prohibits this type of cache in the
> architecture (type 1 below):
> 
>   `Conceptually, three classes of system cache can be envisaged:
> 
>    1. System caches which lie before the point of coherency and cannot
>       be managed by any cache maintenance instructions. Such systems
>       fundamentally undermine the concept of cache maintenance
>       instructions operating to the point of coherency, as they imply
>       the use of non-architecture mechanisms to manage coherency. The
>       use of such systems in the ARM architecture is explicitly
>       prohibited.

Hmm, I thought this was what GPUs typically have, with their own
internal caches that are managed by the GPU rather than the normal
cache maintenance instructions. Does this prohibit the use of most
GPU devices with ARMv8, or did I misunderstand what they do?

>    2. System caches which lie before the point of coherency and can be
>       managed by cache maintenance by address instructions that apply to
>       the point of coherency, but cannot be managed by cache maintenance
>       by set/way instructions. Where maintenance of the entirety of such
>       a cache must be performed, as in the case for power management, it
>       must be performed using non-architectural mechanisms.

That still doesn't define which cache maintenance instructions are
required for a device that is marked as not coherent using the _CCA
property.

Here, I know that I have a cache that I can flush or invalidate or sync
using architected instructions, but should I?

In particular, there are two common models that we support in Linux:

a) embedded ARM32 and others

dma_alloc_noncoherent() == dma_alloc_coherent() == alloc uncached
dma_cache_sync() == not supportable
dma_sync_{single,sg,page}_for_{device,cpu} == {flush, invalidate, ...}

b) NUMA servers (parisc, itanium) and others

dma_alloc_noncoherent() == alloc cached
dma_alloc_coherent() == alloc uncached
dma_sync_{single,sg,page}_for_{device,cpu} ==  dma_cache_sync() == cache sync

There are probably other models that could happen, but the patch
set seems to assume a) is the only possible model, while the
architecture description you cite seems to still allow both a) and
b), as well as some variations, and it's possible that we will 
see b) on arm64 servers but not a).

You could also have a system that requires cache invalidation for
sending data from the device to memory, but does not require anything
for memory-to-device data, or you could have the opposite.

>    3. System caches which lie beyond the point of coherency and so are
>       invisible to the software. The management of such caches is
>       outside the scope of the architecture.'
> 
> (sorry to keep throwing the book at you!)

That's fine, at least I don't have to read it cover-to-cover then ;-)

	Arnd
Will Deacon April 30, 2015, 11:46 a.m. UTC | #15
On Thu, Apr 30, 2015 at 12:24:12PM +0100, Arnd Bergmann wrote:
> On Thursday 30 April 2015 12:07:18 Will Deacon wrote:
> > So for the CPU caches we'd do the usual clean to push dirty lines to the device
> > and (clean+)invalidate before reading data from the device. For the "other
> > caches in the system" we currently assume (for ARM64) that cache maintenance
> > will be broadcast and therefore I wouldn't anticipate doing anything extra.
> > 
> > If people want to build system caches that don't respect broadcast cache
> > maintenance and require explicit management (e.g outer_flush), then I
> > consider that a broken system and we should try to disable the cache before
> > entering the kernel. ARMv8 explicitly prohibits this type of cache in the
> > architecture (type 1 below):
> > 
> >   `Conceptually, three classes of system cache can be envisaged:
> > 
> >    1. System caches which lie before the point of coherency and cannot
> >       be managed by any cache maintenance instructions. Such systems
> >       fundamentally undermine the concept of cache maintenance
> >       instructions operating to the point of coherency, as they imply
> >       the use of non-architecture mechanisms to manage coherency. The
> >       use of such systems in the ARM architecture is explicitly
> >       prohibited.
> 
> Hmm, I thought this was what GPUs typically have, with their own
> internal caches that are managed by the GPU rather than the normal
> cache maintenance instructions. Does this prohibit the use of most
> GPU devices with ARMv8, or did I misunderstand what they do?

No, because it's the responsibility of the GPU/GPU driver to ensure
that the internal caches are not visible to the CPU. I guess you can
think of data in the GPU private cache like data sitting in a CPU's write
buffer (i.e. non-snoopable).

> >    2. System caches which lie before the point of coherency and can be
> >       managed by cache maintenance by address instructions that apply to
> >       the point of coherency, but cannot be managed by cache maintenance
> >       by set/way instructions. Where maintenance of the entirety of such
> >       a cache must be performed, as in the case for power management, it
> >       must be performed using non-architectural mechanisms.
> 
> That still doesn't define which cache maintenance instructions are
> required for a device that is marked as not coherent using the _CCA
> property.
> 
> Here, I know that I have a cache that I can flush or invalidate or sync
> using architected instructions, but should I?

Table 15 in the IORT spec shows the 8 combinations of CCA/CPM/DACS,
the mapping requirements and whether or not maintenance is required.

The actual maintenance operations aren't described, but they would
correspond with what we currently do in the ARM and arm64 kernels (clean to
device, clean+inv from device).

> In particular, there are two common models that we support in Linux:
> 
> a) embedded ARM32 and others
> 
> dma_alloc_non_coherent() == dma_alloc_coherent() == alloc uncached
> dma_cache_sync() == not supportable
> dma_sync_{single,sg,page}_for_{device,cpu} == {flush, invalidate, ...}
> 
> b) NUMA servers (parisc, itanium) and others
> 
> dma_alloc_noncoherent() == alloc cached

This would lead to mismatched memory attributes on ARM/arm64.

> dma_alloc_coherent() == alloc uncached
> dma_sync_{single,sg,page}_for_{device,cpu} ==  dma_cache_sync() == cache sync

Cache sync doesn't exist in the ARM/arm64 architecture, what are the
semantics supposed to be? Maybe it's just DSB for us (complete all pending
maintenance).

> There are probably other models that could happen, but the patch
> set seems to assume a) is the only possible model, while the
> architecture description you cite seems to still allow both a) and
> b), as well as some variations, and it's possible that we will 
> see b) on arm64 servers but not a)

Well, we should be careful not to confuse the ACPI spec with the ARM
architecture. The latter is more permissive, but does disallow system
caches that do not respect broadcast maintenance.

It's also worth pointing out that the architecture doesn't distinguish
between embedded and server machines using A-class processors.

> You could also have a system that requires cache invalidation for
> sending data from the device to memory, but does not require anything
> for memory-to-device data, or you could have the opposite.

You could theoretically build all sorts of strange devices, but that doesn't
mean we have to support them. In the case you describe, they'd have to put
up with the cost of redundant cache cleaning but it should at least function
correctly.

Will
Arnd Bergmann April 30, 2015, 1:03 p.m. UTC | #16
On Thursday 30 April 2015 12:46:15 Will Deacon wrote:
> On Thu, Apr 30, 2015 at 12:24:12PM +0100, Arnd Bergmann wrote:
> > On Thursday 30 April 2015 12:07:18 Will Deacon wrote:
> > > So for the CPU caches we'd do the usual clean to push dirty lines to the device
> > > and (clean+)invalidate before reading data from the device. For the "other
> > > caches in the system" we currently assume (for ARM64) that cache maintenance
> > > will be broadcast and therefore I wouldn't anticipate doing anything extra.
> > > 
> > > If people want to build system caches that don't respect broadcast cache
> > > maintenance and require explicit management (e.g outer_flush), then I
> > > consider that a broken system and we should try to disable the cache before
> > > entering the kernel. ARMv8 explicitly prohibits this type of cache in the
> > > architecture (type 1 below):
> > > 
> > >   `Conceptually, three classes of system cache can be envisaged:
> > > 
> > >    1. System caches which lie before the point of coherency and cannot
> > >       be managed by any cache maintenance instructions. Such systems
> > >       fundamentally undermine the concept of cache maintenance
> > >       instructions operating to the point of coherency, as they imply
> > >       the use of non-architecture mechanisms to manage coherency. The
> > >       use of such systems in the ARM architecture is explicitly
> > >       prohibited.
> > 
> > Hmm, I thought this was what GPUs typically have, with their own
> > internal caches that are managed by the GPU rather than the normal
> > cache maintenance instructions. Does this prohibit the use of most
> > GPU devices with ARMv8, or did I misunderstand what they do?
> 
> No, because it's the responsibility of the GPU/GPU driver to ensure
> that the internal caches are not visible to the CPU. I guess you can
> think of data in the GPU private cache like data sitting in a CPU's write
> buffer (i.e. non-snoopable).

Ok.

> > In particular, there are two common models that we support in Linux:
> > 
> > a) embedded ARM32 and others
> > 
> > dma_alloc_non_coherent() == dma_alloc_coherent() == alloc uncached
> > dma_cache_sync() == not supportable
> > dma_sync_{single,sg,page}_for_{device,cpu} == {flush, invalidate, ...}
> > 
> > b) NUMA servers (parisc, itanium) and others
> > 
> > dma_alloc_noncoherent() == alloc cached
> 
> This would lead to mismatched memory attributes on ARM/arm64.

How so? This is just what __dma_alloc() on arm64 does for
coherent devices:

        /* no need for non-cacheable mapping if coherent */
        if (coherent)
                return ptr;

> > dma_alloc_coherent() == alloc uncached
> > dma_sync_{single,sg,page}_for_{device,cpu} ==  dma_cache_sync() == cache sync
> 
> Cache sync doesn't exist in the ARM/arm64architecture, what are the
> semantics supposed to be? Maybe it's just DSB for us (complete all pending
> maintenance).

It ensures that a state of a buffer as observed by CPU and device is
identical. It's possible that we removed all platforms that did something
interesting here, so it's one of these:

a) On architectures that are mostly coherent, it's a barrier
   that is broadcast to all devices, like I assume DSB is. IA64
   currently does this for all machines, but IIRC it used to 
   access some cluster interconnect at some point to enforce a
   flush.
   The ARM32 based ArmadaXP also falls into this model if the cache
   coherency fabric is enabled, as that needs to be synchronized.
b) On architectures where the device may not see the state of the cache,
   but the CPU is always aware of anything the device sends it,
   it flushes the cache. This seems to be the case on parisc,
   and in particular, there are some variants that do not support
   dma_alloc_coherent but only dma_alloc_noncoherent.
c) On architectures that need the synchronization both ways,
   it does (almost) the same invalidate/clean/flush thing as
   ARM, except it doesn't have to worry about cache lines from
   speculative prefetch which make it impossible to implement on
   ARM.

> > There are probably other models that could happen, but the patch
> > set seems to assume a) is the only possible model, while the
> > architecture description you cite seems to still allow both a) and
> > b), as well as some variations, and it's possible that we will 
> > see b) on arm64 servers but not a)
> 
> Well, we should be careful not to confuse the ACPI spec with the ARM
> architecture. The latter is more permissive, but does disallow system
> caches that do not respect broadcast maintenance.
> 
> It's also worth pointing out that the architecture doesn't distinguish
> between embedded and server machines using A-class processors.
> 
> > You could also have a system that requires cache invalidation for
> > sending data from the device to memory, but does not require anything
> > for memory-to-device data, or you could have the opposite.
> 
> You could theoretically build all sorts of strange devices, but that doesn't
> mean we have to support them. In the case you describe, they'd have to put
> up with the cost of redundant cache cleaning but it should at least function
> correctly.

Which case would a variant of ArmadaXP with a 64-bit core fall into then?
Do I understand it right that requiring to sync the coherency fabric
would make it noncompliant with ACPI but still architecturally compliant?

I guess we could handle that case as well, by requiring any ACPI based
firmware to turn off the coherency fabric on that system and just making
it dog slow.

	Arnd
Will Deacon April 30, 2015, 1:13 p.m. UTC | #17
On Thu, Apr 30, 2015 at 02:03:00PM +0100, Arnd Bergmann wrote:
> On Thursday 30 April 2015 12:46:15 Will Deacon wrote:
> > On Thu, Apr 30, 2015 at 12:24:12PM +0100, Arnd Bergmann wrote:
> > > In particular, there are two common models that we support in Linux:
> > > 
> > > a) embedded ARM32 and others
> > > 
> > > dma_alloc_non_coherent() == dma_alloc_coherent() == alloc uncached
> > > dma_cache_sync() == not supportable
> > > dma_sync_{single,sg,page}_for_{device,cpu} == {flush, invalidate, ...}
> > > 
> > > b) NUMA servers (parisc, itanium) and others
> > > 
> > > dma_alloc_noncoherent() == alloc cached
> > 
> > This would lead to mismatched memory attributes on ARM/arm64.
> 
> How so? This is just what __dma_alloc() on arm64 does for
> coherent devices:
> 
>         /* no need for non-cacheable mapping if coherent */
>         if (coherent)
>                 return ptr;

Ok, I thought that you were only describing the cases when the device is
non-coherent (_CCA=0). Otherwise, your assertion above that
dma_alloc_coherent == alloc uncached isn't true for coherent devices.

So now I'm confused...

> > > dma_alloc_coherent() == alloc uncached
> > > dma_sync_{single,sg,page}_for_{device,cpu} ==  dma_cache_sync() == cache sync
> > 
> > Cache sync doesn't exist in the ARM/arm64architecture, what are the
> > semantics supposed to be? Maybe it's just DSB for us (complete all pending
> > maintenance).
> 
> It ensures that a state of a buffer as observed by CPU and device is
> identical. It's possible that we removed all platforms that did something
> interesting here, so it's one of these:
> 
> a) On architectures that are mostly coherent, it's a barrier
>    that is broadcast to all devices, like I assume DSB is. IA64
>    currently does this for all machines, but IIRC it used to 
>    access some cluster interconnect at some point to enforce a
>    flush.
>    The ARM32 based ArmadaXP also falls into this model if the cache
>    coherency fabric is enabled, as that needs to be synchronized
> b) On architectures where the device may not see the state of the cache,
>    but the CPU is always aware of anything the device sends it,
>    it flushes the cache. This seems to be the case on parisc,
>    and in particular, there are some variants that do not support
>    dma_alloc_coherent but only dma_alloc_noncoherent.
> c) On architectures that need the synchronization both ways,
>    it does (almost) the same invalidate/clean/flush thing as
>    ARM, except it doesn't have to worry about cache lines from
>    speculative prefetch which make it impossible to implement on
>    ARM.

Okey doke, thanks for the explanation. It sounds like we can just build
the primitive out of the existing cache maintenance routines if we need
to implement it.

> > > There are probably other models that could happen, but the patch
> > > set seems to assume a) is the only possible model, while the
> > > architecture description you cite seems to still allow both a) and
> > > b), as well as some variations, and it's possible that we will 
> > > see b) on arm64 servers but not a)
> > 
> > Well, we should be careful not to confuse the ACPI spec with the ARM
> > architecture. The latter is more permissive, but does disallow system
> > caches that do not respect broadcast maintenance.
> > 
> > It's also worth pointing out that the architecture doesn't distinguish
> > between embedded and server machines using A-class processors.
> > 
> > > You could also have a system that requires cache invalidation for
> > > sending data from the device to memory, but does not require anything
> > > for memory-to-device data, or you could have the opposite.
> > 
> > You could theoretically build all sorts of strange devices, but that doesn't
> > mean we have to support them. In the case you describe, they'd have to put
> > up with the cost of redundant cache cleaning but it should at least function
> > correctly.
> 
> Which case would a variant of ArmadaXP with a 64-bit core fall into then?
> Do I understand it right that requiring to sync the coherency fabric
> would make it noncompliant with ACPI but still architecturally compliant?

I would say that the ArmadaXP coherency fabric is not compliant with ARMv8
as it requires additional steps over those cache maintenance instructions
described by the architecture (i.e. it falls into class (1) of the three
classes of system cache in the architecture).

> I guess we could handle that case as well, by requiring any ACPI based
> firmware to turn off the coherency fabric on that system and just making
> it dog slow.

We already require something similar in Documentation/arm64/booting.txt:

  `System caches which do not respect architected cache maintenance by VA
   operations (not recommended) must be configured and disabled.'

Will
Arnd Bergmann April 30, 2015, 1:52 p.m. UTC | #18
On Thursday 30 April 2015 14:13:45 Will Deacon wrote:
> On Thu, Apr 30, 2015 at 02:03:00PM +0100, Arnd Bergmann wrote:
> > On Thursday 30 April 2015 12:46:15 Will Deacon wrote:
> > > On Thu, Apr 30, 2015 at 12:24:12PM +0100, Arnd Bergmann wrote:
> > > > In particular, there are two common models that we support in Linux:
> > > > 
> > > > a) embedded ARM32 and others
> > > > 
> > > > dma_alloc_non_coherent() == dma_alloc_coherent() == alloc uncached
> > > > dma_cache_sync() == not supportable
> > > > dma_sync_{single,sg,page}_for_{device,cpu} == {flush, invalidate, ...}
> > > > 
> > > > b) NUMA servers (parisc, itanium) and others
> > > > 
> > > > dma_alloc_noncoherent() == alloc cached
> > > 
> > > This would lead to mismatched memory attributes on ARM/arm64.
> > 
> > How so? This is just what __dma_alloc() on arm64 does for
> > coherent devices:
> > 
> >         /* no need for non-cacheable mapping if coherent */
> >         if (coherent)
> >                 return ptr;
> 
> Ok, I thought that you were only describing the cases when the device is
> non-coherent (_CCA=0). Otherwise, your assertion above that
> dma_alloc_coherent == alloc uncached isn't true for coherent devices.
> 
> So now I'm confused...

What I was describing here is a device that is not fully coherent,
but instead requires some operation other than a cache flush/invalidate
to complete before the memory can be accessed.

> > > > dma_alloc_coherent() == alloc uncached
> > > > dma_sync_{single,sg,page}_for_{device,cpu} ==  dma_cache_sync() == cache sync
> > > 
> > > Cache sync doesn't exist in the ARM/arm64architecture, what are the
> > > semantics supposed to be? Maybe it's just DSB for us (complete all pending
> > > maintenance).
> > 
> > It ensures that a state of a buffer as observed by CPU and device is
> > identical. It's possible that we removed all platforms that did something
> > interesting here, so it's one of these:
> > 
> > a) On architectures that are mostly coherent, it's a barrier
> >    that is broadcast to all devices, like I assume DSB is. IA64
> >    currently does this for all machines, but IIRC it used to 
> >    access some cluster interconnect at some point to enforce a
> >    flush.
> >    The ARM32 based ArmadaXP also falls into this model if the cache
> >    coherency fabric is enabled, as that needs to be synchronized
> > b) On architectures where the device may not see the state of the cache,
> >    but the CPU is always aware of anything the device sends it,
> >    it flushes the cache. This seems to be the case on parisc,
> >    and in particular, there are some variants that do not support
> >    dma_alloc_coherent but only dma_alloc_noncoherent.
> > c) On architectures that need the synchronization both ways,
> >    it does (almost) the same invalidate/clean/flush thing as
> >    ARM, except it doesn't have to worry about cache lines from
> >    speculative prefetch which make it impossible to implement on
> >    ARM.
> 
> Okey doke, thanks for the explanation. It sounds like we can just build
> the primitive out of the existing cache maintenance routines if we need
> to implement it.

Cases a) and b) yes, but not c), otherwise we could simplify
the ARM dma-mapping implementation and just merge __dma_page_cpu_to_dev
and __dma_page_dev_to_cpu into one function.

And a) and b) are both for systems that are more coherent than what
our noncoherent dma_map_ops implement, but less coherent than what
the coherent dma_map_ops do, and that is specifically what the ACPI
binding cannot describe, unless you argue that either ACPI or ARMv8
forbids both of these models.

> > Which case would a variant of ArmadaXP with a 64-bit core fall into then?
> > Do I understand it right that requiring to sync the coherency fabric
> > would make it noncompliant with ACPI but still architecturally compliant?
> 
> I would say that the ArmadaXP coherency fabric is not compliant with ARMv8
> as it requires additional steps over those cache maintenance instructions
> described by the architecture (i.e. it falls into class (1) of the three
> classes of system cache in the architecture).
> 
> > I guess we could handle that case as well, by requiring any ACPI based
> > firmware to turn off the coherency fabric on that system and just making
> > it dog slow.
> 
> We already require something similar in Documentation/arm64/booting.txt:
> 
>   `System caches which do not respect architected cache maintenance by VA
>    operations (not recommended) must be configured and disabled.'

Hmm, does that rule really get violated here? I think it fully respects
the cache maintenance (flush/invalidate/clean) operations, but it does
not fully respect the dsb/dmb instructions, which is something else.

	Arnd
Catalin Marinas April 30, 2015, 3:55 p.m. UTC | #19
On Thu, Apr 30, 2015 at 03:52:17PM +0200, Arnd Bergmann wrote:
> On Thursday 30 April 2015 14:13:45 Will Deacon wrote:
> > On Thu, Apr 30, 2015 at 02:03:00PM +0100, Arnd Bergmann wrote:
> > > On Thursday 30 April 2015 12:46:15 Will Deacon wrote:
> > > > Cache sync doesn't exist in the ARM/arm64architecture, what are the
> > > > semantics supposed to be? Maybe it's just DSB for us (complete all pending
> > > > maintenance).
> > > 
> > > It ensures that a state of a buffer as observed by CPU and device is
> > > identical. It's possible that we removed all platforms that did something
> > > interesting here, so it's one of these:
> > > 
> > > a) On architectures that are mostly coherent, it's a barrier
> > >    that is broadcast to all devices, like I assume DSB is. IA64
> > >    currently does this for all machines, but IIRC it used to 
> > >    access some cluster interconnect at some point to enforce a
> > >    flush.
> > >    The ARM32 based ArmadaXP also falls into this model if the cache
> > >    coherency fabric is enabled, as that needs to be synchronized

I'm getting confused by the ArmadaXP case. IIRC, the point of the
arm,io-coherent property to the PL310 was precisely to make the
outer_sync a no-op when the coherency is enabled. So basically an mb()
would only issue a DSB on such platform without the PL310 cache sync.

On coherent systems, devices usually snoop the inner/CPU cache and not
the system cache, that's further down the line. So a DSB would ensure
the visibility at the coherent interconnect level before the system
cache. I don't think it needs to be broadcast all the way to devices.

> > > b) On architectures where the device may not see the state of the cache,
> > >    but the CPU is always aware of anything the device sends it,
> > >    it flushes the cache. This seems to be the case on parisc,
> > >    and in particular, there are some variants that do not support
> > >    dma_alloc_coherent but only dma_alloc_noncoherent.
> > > c) On architectures that need the synchronization both ways,
> > >    it does (almost) the same invalidate/clean/flush thing as
> > >    ARM, except it doesn't have to worry about cache lines from
> > >    speculative prefetch which make it impossible to implement on
> > >    ARM.
> > 
> > Okey doke, thanks for the explanation. It sounds like we can just build
> > the primitive out of the existing cache maintenance routines if we need
> > to implement it.
> 
> Cases a) and b) yes, but not c), otherwise we could simplify
> the ARM dma-mapping implementation and just merge __dma_page_cpu_to_dev
> and __dma_page_dev_to_cpu into one function.

I don't fully understand c) or b). Wouldn't the non-coherent ops cover
them both, though potentially not as efficiently?

> And a) and b) are both for systems that are more coherent than what
> our noncoherent dma_map_ops implement, but less coherent than what
> the coherent dma_map_ops do, and that is specifically what the ACPI
> binding cannot describe, unless you argue that either ACPI or ARMv8
> forbids both of these models.

In general, a DSB should work as described in the ARM ARM without the
need to poke additional devices (PL310 is an example not to follow).

> > > I guess we could handle that case as well, by requiring any ACPI based
> > > firmware to turn off the coherency fabric on that system and just making
> > > it dog slow.
> > 
> > We already require something similar in Documentation/arm64/booting.txt:
> > 
> >   `System caches which do not respect architected cache maintenance by VA
> >    operations (not recommended) must be configured and disabled.'
> 
> Hmm, does that rule really get violated here? I think it fully respects
> the cache maintenance (flush/invalidate/clean) operations, but it does
> not fully respect the dsb/dmb instructions, which is something else.

If it fully respects the cache maintenance, it should also respect the
completion and ordering requirements of the cache maintenance
operations. That means that a DSB guarantees completion of such
operations.
Suravee Suthikulpanit April 30, 2015, 11:39 p.m. UTC | #20
On 4/30/2015 3:23 AM, Arnd Bergmann wrote:
> On Wednesday 29 April 2015 16:53:10 Suravee Suthikulpanit wrote:
>> On 4/29/15 11:25, Arnd Bergmann wrote:
>>> On Wednesday 29 April 2015 08:44:09 Suravee Suthikulpanit wrote:
>> [...]
>> As for the case where _CCA=0, I think the ACPI driver should essentially
>> communicate the information as HW is non-coherent as described in the
>> spec, and should be calling arch_setup_dma_ops(dev, false). It is true
>> that this is probably less likely for the ARM64 server platforms.
>> However, I would think that the ACPI driver should not be making such
>> an assumption.
>
> Can you add a description to the ACPI spec then to describe in detail what
> "non-coherent" is supposed to mean, and which action the OS is supposed to
> take when accessing data from device or CPU?

I believe Will has already provided this, and we have already discussed 
this on separate emails in this thread.

>>>[...]
>>> On a related note, I'm not sure how to handle different DMA masks here.
>>> arch_setup_dma_ops() gets passed a size (and offset) argument, which should
>>> match the DMA mask, but I don't know if there is a way to find out the
>>> size from ACPI. Should we assume it's always 64-bit DMA capable?
>>
>> Looking at the ACPI spec, it does have the _DMA object. IIUC, this can
>> be used to describe DMA properties of a particular bus.
>>
>> Method(_DMA, ResourceTemplate()
>> {
>> 	QWORDMemory(
>> 	ResourceConsumer,
>> 	PosDecode, // _DEC
>> 	MinFixed, // _MIF
>> 	MaxFixed, // _MAF
>> 	Prefetchable, // _MEM
>> 	ReadWrite, // _RW
>> 	0, // _GRA
>> 	0, // _MIN
>> 	0x1fffffff, // _MAX
>> 	0x200000000, // _TRA
>> 	0x20000000, // _LEN
>> 	, , ,	
>> 	)
>> }
>>
>> I am not sure if this is an appropriate use for this object, but this
>> seems to be similar to the dma-ranges property for OF, and probably can
>> be used to specify baseaddr and size information when calling
>> arch_setup_dma_ops().
>
> Yes, that seems like a good idea. What is the expected behavior when that
> object is absent? Do we assume that the parent device is not DMA capable?

 From the spec:
If the _DMA object is not present for a bus device, the OS assumes that 
any address placed on a bus by a child device will be decoded either by 
a device on the bus or by the bus itself, (in other words, all address 
ranges can be used for DMA).

The issue is that, since this is optional, I don't know how often 
firmware actually provides this info.

> Is this sufficient to describe the case where a device can only do DMA
> to a specific address range that is not at bus address zero but that maps
> to the beginning of physical RAM?

I believe that's what the _MIN (Minimum Base Address) is for.

>>> For legacy reasons, the default mask is probably best left at 32-bit,
>>> but drivers are expected to call dma_set_mask() if they can do 64-bit DMA,
>>> and that should fail based on the information provided by the platform
>>> if the bus is not capable of doing that.
>>>
>> However, on ARM64 the dma_base and size parameter for
>> arch_setup_dma_ops() is currently not used, and only coherent flag is
>> used.
>
> We can hope that we won't need the dma_base setting here, but it's
> good to have the option to pass it down if we need it.
>
> Not passing the size is a bug that needs to be fixed ASAP, I believe
> a number of folks have run into this, most recently the APM X-Gene
> MMC controller
>

Ok. I'll look at this separately.

>> We probably should look at this separately. For the moment, we can
>> probably say that if _CCA object is missing when needed, the ACPI driver
>> won't set up dma_mask when creating platform_device, which should be
>> equivalent to saying DMA is not supported.
>>
>> Please let me know if this is acceptable, and I'll make change in V2
>> accordingly.
>
> I would still ask that you treat non-coherent to mean "no DMA" until
> we have come up with a way to sufficiently describe the kind of
> non-coherency in ACPI.
>
> 	Arnd

Ok. In V2, when _CCA=0, since we are not aware of any ARM64 systems that 
work under such an assumption with ACPI, I will also default to not 
calling arch_setup_dma_ops() and fall back to the arch-specific default. 
We can revisit this later once we need to support such a case.

Thanks,

Suravee
Arnd Bergmann May 8, 2015, 2:01 p.m. UTC | #21
On Thursday 30 April 2015 16:55:14 Catalin Marinas wrote:
> On Thu, Apr 30, 2015 at 03:52:17PM +0200, Arnd Bergmann wrote:
> > On Thursday 30 April 2015 14:13:45 Will Deacon wrote:
> > > On Thu, Apr 30, 2015 at 02:03:00PM +0100, Arnd Bergmann wrote:
> > > > On Thursday 30 April 2015 12:46:15 Will Deacon wrote:
> > > > > Cache sync doesn't exist in the ARM/arm64 architecture, what are the
> > > > > semantics supposed to be? Maybe it's just DSB for us (complete all pending
> > > > > maintenance).
> > > > 
> > > > It ensures that a state of a buffer as observed by CPU and device is
> > > > identical. It's possible that we removed all platforms that did something
> > > > interesting here, so it's one of these:
> > > > 
> > > > a) On architectures that are mostly coherent, it's a barrier
> > > >    that is broadcast to all devices, like I assume DSB is. IA64
> > > >    currently does this for all machines, but IIRC it used to 
> > > >    access some cluster interconnect at some point to enforce a
> > > >    flush.
> > > >    The ARM32 based ArmadaXP also falls into this model if the cache
> > > >    coherency fabric is enabled, as that needs to be synchronized
> 
> I'm getting confused by the ArmadaXP case. IIRC, the point of the
> arm,io-coherent property to the PL310 was precisely to make the
> outer_sync a no-op when the coherency is enabled. So basically an mb()
> would only issue a DSB on such platform without the PL310 cache sync.
> 
> On coherent systems, devices usually snoop the inner/CPU cache and not
> the system cache, that's further down the line. So a DSB would ensure
> the visibility at the coherent interconnect level before the system
> cache. I don't think it needs to be broadcast all the way to devices.

Sorry for the late reply. IIRC, the sync on Armada XP was not required
for the cache controller, but rather for the bus fabric, to ensure
that a DMA has made it into the memory controller.

> > > > b) On architectures where the device may not see the state of the cache,
> > > >    but the CPU is always aware of anything the device sends it,
> > > >    it flushes the cache. This seems to be the case on parisc,
> > > >    and in particular, there are some variants that do not support
> > > >    dma_alloc_coherent but only dma_alloc_noncoherent.
> > > > c) On architectures that need the synchronization both ways,
> > > >    it does (almost) the same invalidate/clean/flush thing as
> > > >    ARM, except it doesn't have to worry about cache lines from
> > > >    speculative prefetch which make it impossible to implement on
> > > >    ARM.
> > > 
> > > Okey doke, thanks for the explanation. It sounds like we can just build
> > > the primitive out of the existing cache maintenance routines if we need
> > > to implement it.
> > 
> > Cases a) and b) yes, but not c), otherwise we could simplify
> > the ARM dma-mapping implementation and just merge __dma_page_cpu_to_dev
> > and __dma_page_dev_to_cpu into one function.
> 
> I don't fully understand c) or b). Wouldn't the non-coherent ops cover
> them both, though potentially not as efficient?

Turning off caches usually makes everything coherent, but the performance
cost can be gigantic. Also, it might not help if the problem with coherency
is the completion of the DMA as opposed to the caching.

> > > > I guess we could handle that case as well, by requiring any ACPI based
> > > > firmware to turn off the coherency fabric on that system and just making
> > > > it dog slow.
> > > 
> > > We already require something similar in Documentation/arm64/booting.txt:
> > > 
> > >   `System caches which do not respect architected cache maintenance by VA
> > >    operations (not recommended) must be configured and disabled.'
> > 
> > Hmm, does that rule really get violated here? I think it fully respects
> > the cache maintenance (flush/invalidate/clean) operations, but it does
> > not fully respect the dsb/dmb instructions, which is something else.
> 
> If it fully respects the cache maintenance, it should also respect the
> completion and ordering requirements of the cache maintenance
> operations. That means that a DSB guarantees completion of such
> operations.

Ok.

	Arnd
diff mbox

Patch

diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c
index 4bf7559..a4db208 100644
--- a/drivers/acpi/acpi_platform.c
+++ b/drivers/acpi/acpi_platform.c
@@ -108,9 +108,12 @@  struct platform_device *acpi_create_platform_device(struct acpi_device *adev)
 	if (IS_ERR(pdev))
 		dev_err(&adev->dev, "platform device creation failed: %ld\n",
 			PTR_ERR(pdev));
-	else
+	else {
+		arch_setup_dma_ops(&pdev->dev, 0, 0, NULL,
+				   adev->flags.is_coherent);
 		dev_dbg(&adev->dev, "created platform device %s\n",
 			dev_name(&pdev->dev));
+	}
 
 	kfree(resources);
 	return pdev;
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 849b699..509d0157 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -11,6 +11,7 @@ 
 #include <linux/kthread.h>
 #include <linux/dmi.h>
 #include <linux/nls.h>
+#include <linux/dma-mapping.h>
 
 #include <asm/pgtable.h>
 
@@ -2137,6 +2138,49 @@  void acpi_free_pnp_ids(struct acpi_device_pnp *pnp)
 	kfree(pnp->unique_id);
 }
 
+static void acpi_init_coherency(struct acpi_device *device)
+{
+	unsigned long long cca;
+	acpi_status status;
+	struct acpi_device *parent = device->parent;
+
+	if (parent && parent->flags.cca_seen) {
+		/*
+		 * From ACPIv5.1, OSPM will ignore _CCA if an ancestor
+		 * already saw one.
+		 */
+		device->flags.cca_seen = 1;
+		cca = acpi_dma_is_coherent(parent);
+	} else {
+		status = acpi_evaluate_integer(device->handle, "_CCA",
+					       NULL, &cca);
+		if (ACPI_SUCCESS(status)) {
+			device->flags.cca_seen = 1;
+		} else if (IS_ENABLED(CONFIG_ACPI_MUST_HAVE_CCA)) {
+			/*
+			 * Architecture has specified that if the device
+			 * can do DMA, it must have ACPI _CCA object.
+			 * Here, there could be two cases:
+			 *   1. Not DMA-able device.
+			 *   2. DMA-able device, but missing _CCA object.
+			 *
+			 * In both cases, we will default to dma non-coherent.
+			 */
+			cca = 0;
+		} else {
+			/*
+			 * If architecture does not specify that device must
+			 * specify ACPI _CCA (e.g. x86), we default to use
+			 * dma coherent.
+			 */
+			cca = 1;
+		}
+	}
+
+	device->flags.is_coherent = cca;
+	arch_setup_dma_ops(&device->dev, 0, 0, NULL, cca);
+}
+
 void acpi_init_device_object(struct acpi_device *device, acpi_handle handle,
 			     int type, unsigned long long sta)
 {
@@ -2155,6 +2199,7 @@  void acpi_init_device_object(struct acpi_device *device, acpi_handle handle,
 	device->flags.visited = false;
 	device_initialize(&device->dev);
 	dev_set_uevent_suppress(&device->dev, true);
+	acpi_init_coherency(device);
 }
 
 void acpi_device_add_finalize(struct acpi_device *device)
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 8de4fa9..7e8cd4c 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -208,7 +208,9 @@  struct acpi_device_flags {
 	u32 visited:1;
 	u32 hotplug_notify:1;
 	u32 is_dock_station:1;
-	u32 reserved:23;
+	u32 is_coherent:1;
+	u32 cca_seen:1;
+	u32 reserved:21;
 };
 
 /* File System */
@@ -380,6 +382,11 @@  struct acpi_device {
 	void (*remove)(struct acpi_device *);
 };
 
+static inline bool acpi_dma_is_coherent(struct acpi_device *adev)
+{
+	return adev && adev->flags.is_coherent;
+}
+
 static inline bool is_acpi_node(struct fwnode_handle *fwnode)
 {
 	return fwnode && fwnode->type == FWNODE_ACPI;