
mm: cma: support sysfs

Message ID 20210203155001.4121868-1-minchan@kernel.org (mailing list archive)
State New, archived
Series mm: cma: support sysfs

Commit Message

Minchan Kim Feb. 3, 2021, 3:50 p.m. UTC
Since CMA is being used more widely, it has become more important to
monitor CMA statistics for system health, as they are directly related
to user experience.

This patch introduces a sysfs interface for CMA and exposes the stats
below for telemetry in the system:

 * the number of CMA allocation attempts
 * the number of CMA allocation failures
 * the number of CMA page allocation attempts
 * the number of CMA page allocation failures

Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 Documentation/ABI/testing/sysfs-kernel-mm-cma |  39 +++++
 include/linux/cma.h                           |   1 +
 mm/Makefile                                   |   1 +
 mm/cma.c                                      |   6 +-
 mm/cma.h                                      |  20 +++
 mm/cma_sysfs.c                                | 143 ++++++++++++++++++
 6 files changed, 209 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-cma
 create mode 100644 mm/cma_sysfs.c
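
For illustration, here is a minimal userspace sketch (not part of the patch)
of how a telemetry agent might read the proposed counters. The heap name
"cma-example" and the read_cma_stat() helper are hypothetical; only the
sysfs paths follow the ABI document added by this patch.

#include <stdio.h>

/* Hypothetical helper: read one per-heap CMA counter from sysfs. */
static long read_cma_stat(const char *heap, const char *name)
{
        char path[256];
        long val = -1;
        FILE *f;

        snprintf(path, sizeof(path), "/sys/kernel/mm/cma/%s/%s", heap, name);
        f = fopen(path, "r");
        if (!f)
                return -1;
        if (fscanf(f, "%ld", &val) != 1)
                val = -1;
        fclose(f);
        return val;
}

int main(void)
{
        /* "cma-example" is a placeholder heap name, not from the patch. */
        long attempt = read_cma_stat("cma-example", "cma_alloc_attempt");
        long fail = read_cma_stat("cma-example", "cma_alloc_fail");

        if (attempt > 0)
                printf("cma_alloc failures: %ld of %ld attempts\n", fail, attempt);
        return 0;
}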

Comments

John Hubbard Feb. 4, 2021, 8:50 a.m. UTC | #1
On 2/3/21 7:50 AM, Minchan Kim wrote:
> Since CMA is getting used more widely, it's more important to
> keep monitoring CMA statistics for system health since it's
> directly related to user experience.
> 
> This patch introduces sysfs for the CMA and exposes stats below
> to keep monitor for telemetric in the system.
> 
>   * the number of CMA allocation attempts
>   * the number of CMA allocation failures
>   * the number of CMA page allocation attempts
>   * the number of CMA page allocation failures

The desire to report CMA data is understandable, but there are a few
odd things here:

1) First of all, this has significant overlap with /sys/kernel/debug/cma
items. I suspect that all of these items could instead go into
/sys/kernel/debug/cma, right?

2) The overall CMA allocation attempts/failures (first two items above) seem
an odd pair of things to track. Maybe that is what was easy to track, but I'd
vote for just omitting them.
> 
> Signed-off-by: Minchan Kim <minchan@kernel.org>
> ---
>   Documentation/ABI/testing/sysfs-kernel-mm-cma |  39 +++++
>   include/linux/cma.h                           |   1 +
>   mm/Makefile                                   |   1 +
>   mm/cma.c                                      |   6 +-
>   mm/cma.h                                      |  20 +++
>   mm/cma_sysfs.c                                | 143 ++++++++++++++++++
>   6 files changed, 209 insertions(+), 1 deletion(-)
>   create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-cma
>   create mode 100644 mm/cma_sysfs.c
> 
> diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-cma b/Documentation/ABI/testing/sysfs-kernel-mm-cma
> new file mode 100644
> index 000000000000..2a43c0aacc39
> --- /dev/null
> +++ b/Documentation/ABI/testing/sysfs-kernel-mm-cma
> @@ -0,0 +1,39 @@
> +What:		/sys/kernel/mm/cma/
> +Date:		Feb 2021
> +Contact:	Minchan Kim <minchan@kernel.org>
> +Description:
> +		/sys/kernel/mm/cma/ contains a number of subdirectories by
> +		cma-heap name. The subdirectory contains a number of files
> +		to represent cma allocation statistics.

Somewhere, maybe here, there should be a mention of the closely related
/sys/kernel/debug/cma files.

> +
> +		There are number of files under
> +				/sys/kernel/mm/cma/<cma-heap-name> directory
> +
> +			- cma_alloc_attempt
> +			- cma_alloc_fail

Are these really useful? They're a summary of the alloc_pages items, really.

> +			- alloc_pages_attempt
> +			- alloc_pages_fail

This should also have "cma" in the name, really: cma_alloc_pages_*.

> +
> +What:		/sys/kernel/mm/cma/<cma-heap-name>/cma_alloc_attempt
> +Date:		Feb 2021
> +Contact:	Minchan Kim <minchan@kernel.org>
> +Description:
> +		the number of cma_alloc API attempted
> +
> +What:		/sys/kernel/mm/cma/<cma-heap-name>/cma_alloc_fail
> +Date:		Feb 2021
> +Contact:	Minchan Kim <minchan@kernel.org>
> +Description:
> +		the number of CMA_alloc API failed
> +
> +What:		/sys/kernel/mm/cma/<cma-heap-name>/alloc_pages_attempt
> +Date:		Feb 2021
> +Contact:	Minchan Kim <minchan@kernel.org>
> +Description:
> +		the number of pages CMA API tried to allocate
> +
> +What:		/sys/kernel/mm/cma/<cma-heap-name>/alloc_pages_fail
> +Date:		Feb 2021
> +Contact:	Minchan Kim <minchan@kernel.org>
> +Description:
> +		the number of pages CMA API failed to allocate
> diff --git a/include/linux/cma.h b/include/linux/cma.h
> index 217999c8a762..71a28a5bb54e 100644
> --- a/include/linux/cma.h
> +++ b/include/linux/cma.h
> @@ -49,4 +49,5 @@ extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
>   extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count);
>   
>   extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data);
> +

A single additional blank line seems to be the only change to this file. :)

thanks,
Minchan Kim Feb. 4, 2021, 8:07 p.m. UTC | #2
On Thu, Feb 04, 2021 at 12:50:58AM -0800, John Hubbard wrote:
> On 2/3/21 7:50 AM, Minchan Kim wrote:
> > Since CMA is getting used more widely, it's more important to
> > keep monitoring CMA statistics for system health since it's
> > directly related to user experience.
> > 
> > This patch introduces sysfs for the CMA and exposes stats below
> > to keep monitor for telemetric in the system.
> > 
> >   * the number of CMA allocation attempts
> >   * the number of CMA allocation failures
> >   * the number of CMA page allocation attempts
> >   * the number of CMA page allocation failures
> 
> The desire to report CMA data is understandable, but there are a few
> odd things here:
> 
> 1) First of all, this has significant overlap with /sys/kernel/debug/cma
> items. I suspect that all of these items could instead go into

At this moment, I don't see any overlap with items from cma_debugfs.
Could you specify which items you are referring to?

> /sys/kernel/debug/cma, right?

Anyway, the thing is I need a stable interface for that and need to use
it in the Android production build, too (unfortunately, Android deprecated
debugfs:
https://source.android.com/setup/start/android-11-release#debugfs
).

What should be in debugfs and what in sysfs? What are the criteria?

Some statistics could be considered either a debugging aid or telemetry,
depending on the viewpoint and use case. Here, I want to use these for
telemetry, get a stable interface, and use it in the production build
of Android. While we're at it, I'd like to get a concrete guideline on
what should go in sysfs versus debugfs, so this thread can be pointed to
whenever folks want to dump their stats into sysfs, to avoid wasting
others' time in the future. :)

> 
> 2) The overall CMA allocation attempts/failures (first two items above) seem
> an odd pair of things to track. Maybe that is what was easy to track, but I'd
> vote for just omitting them.

Then how can we know how often the CMA API failed?
There are allocation requests of various sizes for a CMA area, so the
page allocation stats alone are not enough to know that.

> > 
> > Signed-off-by: Minchan Kim <minchan@kernel.org>
> > ---
> >   Documentation/ABI/testing/sysfs-kernel-mm-cma |  39 +++++
> >   include/linux/cma.h                           |   1 +
> >   mm/Makefile                                   |   1 +
> >   mm/cma.c                                      |   6 +-
> >   mm/cma.h                                      |  20 +++
> >   mm/cma_sysfs.c                                | 143 ++++++++++++++++++
> >   6 files changed, 209 insertions(+), 1 deletion(-)
> >   create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-cma
> >   create mode 100644 mm/cma_sysfs.c
> > 
> > diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-cma b/Documentation/ABI/testing/sysfs-kernel-mm-cma
> > new file mode 100644
> > index 000000000000..2a43c0aacc39
> > --- /dev/null
> > +++ b/Documentation/ABI/testing/sysfs-kernel-mm-cma
> > @@ -0,0 +1,39 @@
> > +What:		/sys/kernel/mm/cma/
> > +Date:		Feb 2021
> > +Contact:	Minchan Kim <minchan@kernel.org>
> > +Description:
> > +		/sys/kernel/mm/cma/ contains a number of subdirectories by
> > +		cma-heap name. The subdirectory contains a number of files
> > +		to represent cma allocation statistics.
> 
> Somewhere, maybe here, there should be a mention of the closely related
> /sys/kernel/debug/cma files.
> 
> > +
> > +		There are number of files under
> > +				/sys/kernel/mm/cma/<cma-heap-name> directory
> > +
> > +			- cma_alloc_attempt
> > +			- cma_alloc_fail
> 
> Are these really useful? They a summary of the alloc_pages items, really.
> 
> > +			- alloc_pages_attempt
> > +			- alloc_pages_fail
> 
> This should also have "cma" in the name, really: cma_alloc_pages_*.

No problem.

> 
> > +
> > +What:		/sys/kernel/mm/cma/<cma-heap-name>/cma_alloc_attempt
> > +Date:		Feb 2021
> > +Contact:	Minchan Kim <minchan@kernel.org>
> > +Description:
> > +		the number of cma_alloc API attempted
> > +
> > +What:		/sys/kernel/mm/cma/<cma-heap-name>/cma_alloc_fail
> > +Date:		Feb 2021
> > +Contact:	Minchan Kim <minchan@kernel.org>
> > +Description:
> > +		the number of CMA_alloc API failed
> > +
> > +What:		/sys/kernel/mm/cma/<cma-heap-name>/alloc_pages_attempt
> > +Date:		Feb 2021
> > +Contact:	Minchan Kim <minchan@kernel.org>
> > +Description:
> > +		the number of pages CMA API tried to allocate
> > +
> > +What:		/sys/kernel/mm/cma/<cma-heap-name>/alloc_pages_fail
> > +Date:		Feb 2021
> > +Contact:	Minchan Kim <minchan@kernel.org>
> > +Description:
> > +		the number of pages CMA API failed to allocate
> > diff --git a/include/linux/cma.h b/include/linux/cma.h
> > index 217999c8a762..71a28a5bb54e 100644
> > --- a/include/linux/cma.h
> > +++ b/include/linux/cma.h
> > @@ -49,4 +49,5 @@ extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
> >   extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count);
> >   extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data);
> > +
> 
> A single additional blank line seems to be the only change to this file. :)

Oops.
John Hubbard Feb. 4, 2021, 11:14 p.m. UTC | #3
On 2/4/21 12:07 PM, Minchan Kim wrote:
> On Thu, Feb 04, 2021 at 12:50:58AM -0800, John Hubbard wrote:
>> On 2/3/21 7:50 AM, Minchan Kim wrote:
>>> Since CMA is getting used more widely, it's more important to
>>> keep monitoring CMA statistics for system health since it's
>>> directly related to user experience.
>>>
>>> This patch introduces sysfs for the CMA and exposes stats below
>>> to keep monitor for telemetric in the system.
>>>
>>>    * the number of CMA allocation attempts
>>>    * the number of CMA allocation failures
>>>    * the number of CMA page allocation attempts
>>>    * the number of CMA page allocation failures
>>
>> The desire to report CMA data is understandable, but there are a few
>> odd things here:
>>
>> 1) First of all, this has significant overlap with /sys/kernel/debug/cma
>> items. I suspect that all of these items could instead go into
> 
> At this moment, I don't see any overlap with item from cma_debugfs.
> Could you specify what item you are mentioning?

Just the fact that there would be two systems under /sys, both of which are
doing very very similar things: providing information that is intended to
help diagnose CMA.

> 
>> /sys/kernel/debug/cma, right?
> 
> Anyway, thing is I need an stable interface for that and need to use
> it in Android production build, too(Unfortunately, Android deprecated
> the debugfs
> https://source.android.com/setup/start/android-11-release#debugfs
> )

That's the closest hint to a "why this is needed" that we've seen yet.
But it's only a hint.

> 
> What should be in debugfs and in sysfs? What's the criteria?

Well, it's a gray area. "Debugging support" goes into debugfs, and
"production-level monitoring and control" goes into sysfs, roughly
speaking. And here you have items that could be classified as either.

> 
> Some statistic could be considered about debugging aid or telemetric
> depening on view point and usecase. And here, I want to use it for
> telemetric, get an stable interface and use it in production build
> of Android. In this chance, I'd like to get concrete guideline
> what should be in sysfs and debugfs so that pointing out this thread
> whenever folks dump their stat into sysfs to avoid waste of time
> for others in future. :)
> 
>>
>> 2) The overall CMA allocation attempts/failures (first two items above) seem
>> an odd pair of things to track. Maybe that is what was easy to track, but I'd
>> vote for just omitting them.
> 
> Then, how to know how often CMA API failed?

Why would you even need to know that, *in addition* to knowing specific
page allocation numbers that failed? Again, there is no real-world motivation
cited yet, just "this is good data". Need more stories and support here.


thanks,
Suren Baghdasaryan Feb. 4, 2021, 11:43 p.m. UTC | #4
On Thu, Feb 4, 2021 at 3:14 PM John Hubbard <jhubbard@nvidia.com> wrote:
>
> On 2/4/21 12:07 PM, Minchan Kim wrote:
> > On Thu, Feb 04, 2021 at 12:50:58AM -0800, John Hubbard wrote:
> >> On 2/3/21 7:50 AM, Minchan Kim wrote:
> >>> Since CMA is getting used more widely, it's more important to
> >>> keep monitoring CMA statistics for system health since it's
> >>> directly related to user experience.
> >>>
> >>> This patch introduces sysfs for the CMA and exposes stats below
> >>> to keep monitor for telemetric in the system.
> >>>
> >>>    * the number of CMA allocation attempts
> >>>    * the number of CMA allocation failures
> >>>    * the number of CMA page allocation attempts
> >>>    * the number of CMA page allocation failures
> >>
> >> The desire to report CMA data is understandable, but there are a few
> >> odd things here:
> >>
> >> 1) First of all, this has significant overlap with /sys/kernel/debug/cma
> >> items. I suspect that all of these items could instead go into
> >
> > At this moment, I don't see any overlap with item from cma_debugfs.
> > Could you specify what item you are mentioning?
>
> Just the fact that there would be two systems under /sys, both of which are
> doing very very similar things: providing information that is intended to
> help diagnose CMA.
>
> >
> >> /sys/kernel/debug/cma, right?
> >
> > Anyway, thing is I need an stable interface for that and need to use
> > it in Android production build, too(Unfortunately, Android deprecated
> > the debugfs
> > https://source.android.com/setup/start/android-11-release#debugfs
> > )
>
> That's the closest hint to a "why this is needed" that we've seen yet.
> But it's only a hint.
>
> >
> > What should be in debugfs and in sysfs? What's the criteria?
>
> Well, it's a gray area. "Debugging support" goes into debugfs, and
> "production-level monitoring and control" goes into sysfs, roughly
> speaking. And here you have items that could be classified as either.
>
> >
> > Some statistic could be considered about debugging aid or telemetric
> > depening on view point and usecase. And here, I want to use it for
> > telemetric, get an stable interface and use it in production build
> > of Android. In this chance, I'd like to get concrete guideline
> > what should be in sysfs and debugfs so that pointing out this thread
> > whenever folks dump their stat into sysfs to avoid waste of time
> > for others in future. :)
> >
> >>
> >> 2) The overall CMA allocation attempts/failures (first two items above) seem
> >> an odd pair of things to track. Maybe that is what was easy to track, but I'd
> >> vote for just omitting them.
> >
> > Then, how to know how often CMA API failed?
>
> Why would you even need to know that, *in addition* to knowing specific
> page allocation numbers that failed? Again, there is no real-world motivation
> cited yet, just "this is good data". Need more stories and support here.

IMHO it would be very useful to see whether there are multiple
small-order allocation failures or a few large-order ones, especially
for CMA where large allocations are not unusual. For that I believe
both alloc_pages_attempt and alloc_pages_fail would be required.

Suren Baghdasaryan Feb. 4, 2021, 11:45 p.m. UTC | #5
On Thu, Feb 4, 2021 at 3:43 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Thu, Feb 4, 2021 at 3:14 PM John Hubbard <jhubbard@nvidia.com> wrote:
> >
> > On 2/4/21 12:07 PM, Minchan Kim wrote:
> > > On Thu, Feb 04, 2021 at 12:50:58AM -0800, John Hubbard wrote:
> > >> On 2/3/21 7:50 AM, Minchan Kim wrote:
> > >>> Since CMA is getting used more widely, it's more important to
> > >>> keep monitoring CMA statistics for system health since it's
> > >>> directly related to user experience.
> > >>>
> > >>> This patch introduces sysfs for the CMA and exposes stats below
> > >>> to keep monitor for telemetric in the system.
> > >>>
> > >>>    * the number of CMA allocation attempts
> > >>>    * the number of CMA allocation failures
> > >>>    * the number of CMA page allocation attempts
> > >>>    * the number of CMA page allocation failures
> > >>
> > >> The desire to report CMA data is understandable, but there are a few
> > >> odd things here:
> > >>
> > >> 1) First of all, this has significant overlap with /sys/kernel/debug/cma
> > >> items. I suspect that all of these items could instead go into
> > >
> > > At this moment, I don't see any overlap with item from cma_debugfs.
> > > Could you specify what item you are mentioning?
> >
> > Just the fact that there would be two systems under /sys, both of which are
> > doing very very similar things: providing information that is intended to
> > help diagnose CMA.
> >
> > >
> > >> /sys/kernel/debug/cma, right?
> > >
> > > Anyway, thing is I need an stable interface for that and need to use
> > > it in Android production build, too(Unfortunately, Android deprecated
> > > the debugfs
> > > https://source.android.com/setup/start/android-11-release#debugfs
> > > )
> >
> > That's the closest hint to a "why this is needed" that we've seen yet.
> > But it's only a hint.
> >
> > >
> > > What should be in debugfs and in sysfs? What's the criteria?
> >
> > Well, it's a gray area. "Debugging support" goes into debugfs, and
> > "production-level monitoring and control" goes into sysfs, roughly
> > speaking. And here you have items that could be classified as either.
> >
> > >
> > > Some statistic could be considered about debugging aid or telemetric
> > > depening on view point and usecase. And here, I want to use it for
> > > telemetric, get an stable interface and use it in production build
> > > of Android. In this chance, I'd like to get concrete guideline
> > > what should be in sysfs and debugfs so that pointing out this thread
> > > whenever folks dump their stat into sysfs to avoid waste of time
> > > for others in future. :)
> > >
> > >>
> > >> 2) The overall CMA allocation attempts/failures (first two items above) seem
> > >> an odd pair of things to track. Maybe that is what was easy to track, but I'd
> > >> vote for just omitting them.
> > >
> > > Then, how to know how often CMA API failed?
> >
> > Why would you even need to know that, *in addition* to knowing specific
> > page allocation numbers that failed? Again, there is no real-world motivation
> > cited yet, just "this is good data". Need more stories and support here.
>
> IMHO it would be very useful to see whether there are multiple
> small-order allocation failures or a few large-order ones, especially
> for CMA where large allocations are not unusual. For that I believe
> both alloc_pages_attempt and alloc_pages_fail would be required.

Sorry, I meant to say "both cma_alloc_fail and alloc_pages_fail would
be required".

Minchan Kim Feb. 5, 2021, 12:12 a.m. UTC | #6
On Thu, Feb 04, 2021 at 03:14:56PM -0800, John Hubbard wrote:
> On 2/4/21 12:07 PM, Minchan Kim wrote:
> > On Thu, Feb 04, 2021 at 12:50:58AM -0800, John Hubbard wrote:
> > > On 2/3/21 7:50 AM, Minchan Kim wrote:
> > > > Since CMA is getting used more widely, it's more important to
> > > > keep monitoring CMA statistics for system health since it's
> > > > directly related to user experience.
> > > > 
> > > > This patch introduces sysfs for the CMA and exposes stats below
> > > > to keep monitor for telemetric in the system.
> > > > 
> > > >    * the number of CMA allocation attempts
> > > >    * the number of CMA allocation failures
> > > >    * the number of CMA page allocation attempts
> > > >    * the number of CMA page allocation failures
> > > 
> > > The desire to report CMA data is understandable, but there are a few
> > > odd things here:
> > > 
> > > 1) First of all, this has significant overlap with /sys/kernel/debug/cma
> > > items. I suspect that all of these items could instead go into
> > 
> > At this moment, I don't see any overlap with item from cma_debugfs.
> > Could you specify what item you are mentioning?
> 
> Just the fact that there would be two systems under /sys, both of which are
> doing very very similar things: providing information that is intended to
> help diagnose CMA.
> 
> > 
> > > /sys/kernel/debug/cma, right?
> > 
> > Anyway, thing is I need an stable interface for that and need to use
> > it in Android production build, too(Unfortunately, Android deprecated
> > the debugfs
> > https://source.android.com/setup/start/android-11-release#debugfs
> > )
> 
> That's the closest hint to a "why this is needed" that we've seen yet.
> But it's only a hint.
> 
> > 
> > What should be in debugfs and in sysfs? What's the criteria?
> 
> Well, it's a gray area. "Debugging support" goes into debugfs, and
> "production-level monitoring and control" goes into sysfs, roughly

True.

> speaking. And here you have items that could be classified as either.
> 
> > 
> > Some statistic could be considered about debugging aid or telemetric
> > depening on view point and usecase. And here, I want to use it for
> > telemetric, get an stable interface and use it in production build
> > of Android. In this chance, I'd like to get concrete guideline
> > what should be in sysfs and debugfs so that pointing out this thread
> > whenever folks dump their stat into sysfs to avoid waste of time
> > for others in future. :)
> > 
> > > 
> > > 2) The overall CMA allocation attempts/failures (first two items above) seem
> > > an odd pair of things to track. Maybe that is what was easy to track, but I'd
> > > vote for just omitting them.
> > 
> > Then, how to know how often CMA API failed?
> 
> Why would you even need to know that, *in addition* to knowing specific
> page allocation numbers that failed? Again, there is no real-world motivation
> cited yet, just "this is good data". Need more stories and support here.

Let me give an example.

Let's assume we use memory buffer allocation via CMA to enable Bluetooth
on a device. If the user taps the Bluetooth button on the phone but the
memory allocation from CMA fails, the user will still see the Bluetooth
button greyed out. The user might think their tap wasn't firm enough, so
they tap again; this time the CMA allocation happens to succeed, and they
will see the Bluetooth button enabled and can listen to music.

Here, the product team needs to monitor how often CMA allocation fails;
if the failure ratio steadily rises above the bar, it means engineers
need to investigate.

Make sense?
John Hubbard Feb. 5, 2021, 12:24 a.m. UTC | #7
On 2/4/21 4:12 PM, Minchan Kim wrote:
...
>>> Then, how to know how often CMA API failed?
>>
>> Why would you even need to know that, *in addition* to knowing specific
>> page allocation numbers that failed? Again, there is no real-world motivation
>> cited yet, just "this is good data". Need more stories and support here.
> 
> Let me give an example.
> 
> Let' assume we use memory buffer allocation via CMA for bluetooth
> enable of  device.
> If user clicks the bluetooth button in the phone but fail to allocate
> the memory from CMA, user will still see bluetooth button gray.
> User would think his touch was not enough powerful so he try clicking
> again and fortunately CMA allocation was successful this time and
> they will see bluetooh button enabled and could listen the music.
> 
> Here, product team needs to monitor how often CMA alloc failed so
> if the failure ratio is steadily increased than the bar,
> it means engineers need to go investigation.
> 
> Make sense?
> 

Yes, except that it raises more questions:

1) Isn't this just standard allocation failure? Don't you already have a way
to track that?

Presumably, having the source code, you can easily deduce that a bluetooth
allocation failure goes directly to a CMA allocation failure, right?

Anyway, even though the above is still a little murky, I expect you're right
that it's good to have *some* indication, somewhere about CMA behavior...

Thinking about this some more, I wonder if this is really /proc/vmstat sort
of data that we're talking about. It seems to fit right in there, yes?


thanks,
John Hubbard Feb. 5, 2021, 12:25 a.m. UTC | #8
On 2/4/21 3:45 PM, Suren Baghdasaryan wrote:
...
>>>>> 2) The overall CMA allocation attempts/failures (first two items above) seem
>>>>> an odd pair of things to track. Maybe that is what was easy to track, but I'd
>>>>> vote for just omitting them.
>>>>
>>>> Then, how to know how often CMA API failed?
>>>
>>> Why would you even need to know that, *in addition* to knowing specific
>>> page allocation numbers that failed? Again, there is no real-world motivation
>>> cited yet, just "this is good data". Need more stories and support here.
>>
>> IMHO it would be very useful to see whether there are multiple
>> small-order allocation failures or a few large-order ones, especially
>> for CMA where large allocations are not unusual. For that I believe
>> both alloc_pages_attempt and alloc_pages_fail would be required.
> 
> Sorry, I meant to say "both cma_alloc_fail and alloc_pages_fail would
> be required".

So if you want to know that, the existing items are still a little too indirect
to really get it right. You can only know the average allocation size, by
dividing. Instead, we should provide the allocation size, for each count.

The limited interface makes this a little awkward, but using zones/ranges could
work: "for this range of allocation sizes, there were the following stats". Or,
some other technique that I haven't thought of (maybe two items per file?) would
be better.
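
(To make that concrete, here is a rough sketch of what per-size-range
accounting could look like, assuming per-CMA counters bucketed by
allocation order; every name below is made up for illustration and is
not from the patch.)

/* Illustration only: per-order attempt/failure buckets for one CMA area.
 * The struct, helper and bucket count are assumptions, not patch code. */
#include <linux/types.h>

#define CMA_NR_ORDER_BUCKETS    11      /* orders 0..10; larger orders share the last bucket */

struct cma_order_stat {
        unsigned long attempt[CMA_NR_ORDER_BUCKETS];
        unsigned long fail[CMA_NR_ORDER_BUCKETS];
};

static void cma_order_account(struct cma_order_stat *stat,
                              unsigned int order, bool failed)
{
        unsigned int idx = order < CMA_NR_ORDER_BUCKETS ?
                                order : CMA_NR_ORDER_BUCKETS - 1;

        stat->attempt[idx]++;
        if (failed)
                stat->fail[idx]++;
}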

On the other hand, there's an argument for keeping this minimal and simple. That
would probably lead us to putting in a couple of items into /proc/vmstat, as I
just mentioned in my other response, and calling it good.


thanks,
John Hubbard Feb. 5, 2021, 12:34 a.m. UTC | #9
On 2/4/21 4:25 PM, John Hubbard wrote:
> On 2/4/21 3:45 PM, Suren Baghdasaryan wrote:
> ...
>>>>>> 2) The overall CMA allocation attempts/failures (first two items above) seem
>>>>>> an odd pair of things to track. Maybe that is what was easy to track, but I'd
>>>>>> vote for just omitting them.
>>>>>
>>>>> Then, how to know how often CMA API failed?
>>>>
>>>> Why would you even need to know that, *in addition* to knowing specific
>>>> page allocation numbers that failed? Again, there is no real-world motivation
>>>> cited yet, just "this is good data". Need more stories and support here.
>>>
>>> IMHO it would be very useful to see whether there are multiple
>>> small-order allocation failures or a few large-order ones, especially
>>> for CMA where large allocations are not unusual. For that I believe
>>> both alloc_pages_attempt and alloc_pages_fail would be required.
>>
>> Sorry, I meant to say "both cma_alloc_fail and alloc_pages_fail would
>> be required".
> 
> So if you want to know that, the existing items are still a little too indirect
> to really get it right. You can only know the average allocation size, by
> dividing. Instead, we should provide the allocation size, for each count.
> 
> The limited interface makes this a little awkward, but using zones/ranges could
> work: "for this range of allocation sizes, there were the following stats". Or,
> some other technique that I haven't thought of (maybe two items per file?) would
> be better.
> 
> On the other hand, there's an argument for keeping this minimal and simple. That
> would probably lead us to putting in a couple of items into /proc/vmstat, as I
> just mentioned in my other response, and calling it good.

...and remember: if we keep it nice and minimal and clean, we can put it into
/proc/vmstat and monitor it.

And then if a problem shows up, the more complex and advanced debugging data can
go into debugfs's CMA area. And you're all set.

If Android made up some policy not to use debugfs, then:

a) that probably won't prevent engineers from using it anyway, for advanced debugging,
and

b) If (a) somehow falls short, then we need to talk about what Android's plans are to
fill the need. And "fill up sysfs with debugfs items, possibly duplicating some of them,
and generally making an unnecessary mess, to compensate for not using debugfs" is not
my first choice. :)


thanks,
Minchan Kim Feb. 5, 2021, 1:44 a.m. UTC | #10
On Thu, Feb 04, 2021 at 04:24:20PM -0800, John Hubbard wrote:
> On 2/4/21 4:12 PM, Minchan Kim wrote:
> ...
> > > > Then, how to know how often CMA API failed?
> > > 
> > > Why would you even need to know that, *in addition* to knowing specific
> > > page allocation numbers that failed? Again, there is no real-world motivation
> > > cited yet, just "this is good data". Need more stories and support here.
> > 
> > Let me give an example.
> > 
> > Let' assume we use memory buffer allocation via CMA for bluetooth
> > enable of  device.
> > If user clicks the bluetooth button in the phone but fail to allocate
> > the memory from CMA, user will still see bluetooth button gray.
> > User would think his touch was not enough powerful so he try clicking
> > again and fortunately CMA allocation was successful this time and
> > they will see bluetooh button enabled and could listen the music.
> > 
> > Here, product team needs to monitor how often CMA alloc failed so
> > if the failure ratio is steadily increased than the bar,
> > it means engineers need to go investigation.
> > 
> > Make sense?
> > 
> 
> Yes, except that it raises more questions:
> 
> 1) Isn't this just standard allocation failure? Don't you already have a way
> to track that?
> 
> Presumably, having the source code, you can easily deduce that a bluetooth
> allocation failure goes directly to a CMA allocation failure, right?
> 
> Anyway, even though the above is still a little murky, I expect you're right
> that it's good to have *some* indication, somewhere about CMA behavior...
> 
> Thinking about this some more, I wonder if this is really /proc/vmstat sort
> of data that we're talking about. It seems to fit right in there, yes?

The thing is that there are multiple CMA instances (cma-A, cma-B, cma-C)
and each CMA heap has its own specific scenario. /proc/vmstat could get
bloated quite a lot as the number of CMA instances increases.
Suren Baghdasaryan Feb. 5, 2021, 1:44 a.m. UTC | #11
On Thu, Feb 4, 2021 at 4:34 PM John Hubbard <jhubbard@nvidia.com> wrote:
>
> On 2/4/21 4:25 PM, John Hubbard wrote:
> > On 2/4/21 3:45 PM, Suren Baghdasaryan wrote:
> > ...
> >>>>>> 2) The overall CMA allocation attempts/failures (first two items above) seem
> >>>>>> an odd pair of things to track. Maybe that is what was easy to track, but I'd
> >>>>>> vote for just omitting them.
> >>>>>
> >>>>> Then, how to know how often CMA API failed?
> >>>>
> >>>> Why would you even need to know that, *in addition* to knowing specific
> >>>> page allocation numbers that failed? Again, there is no real-world motivation
> >>>> cited yet, just "this is good data". Need more stories and support here.
> >>>
> >>> IMHO it would be very useful to see whether there are multiple
> >>> small-order allocation failures or a few large-order ones, especially
> >>> for CMA where large allocations are not unusual. For that I believe
> >>> both alloc_pages_attempt and alloc_pages_fail would be required.
> >>
> >> Sorry, I meant to say "both cma_alloc_fail and alloc_pages_fail would
> >> be required".
> >
> > So if you want to know that, the existing items are still a little too indirect
> > to really get it right. You can only know the average allocation size, by
> > dividing. Instead, we should provide the allocation size, for each count.
> >
> > The limited interface makes this a little awkward, but using zones/ranges could
> > work: "for this range of allocation sizes, there were the following stats". Or,
> > some other technique that I haven't thought of (maybe two items per file?) would
> > be better.
> >
> > On the other hand, there's an argument for keeping this minimal and simple. That
> > would probably lead us to putting in a couple of items into /proc/vmstat, as I
> > just mentioned in my other response, and calling it good.

True. I was thinking along these lines, but per-order counters felt like
maybe overkill? I'm all for keeping it simple.

>

> ...and remember: if we keep it nice and minimal and clean, we can put it into
> /proc/vmstat and monitor it.

No objections from me.

>
> And then if a problem shows up, the more complex and advanced debugging data can
> go into debugfs's CMA area. And you're all set.
>
> If Android made up some policy not to use debugfs, then:
>
> a) that probably won't prevent engineers from using it anyway, for advanced debugging,
> and
>
> b) If (a) somehow falls short, then we need to talk about what Android's plans are to
> fill the need. And "fill up sysfs with debugfs items, possibly duplicating some of them,
> and generally making an unecessary mess, to compensate for not using debugfs" is not
> my first choice. :)
>
>
> thanks,
> --
> John Hubbard
> NVIDIA
Suren Baghdasaryan Feb. 5, 2021, 2:39 a.m. UTC | #12
On Thu, Feb 4, 2021 at 5:44 PM Minchan Kim <minchan@kernel.org> wrote:
>
> On Thu, Feb 04, 2021 at 04:24:20PM -0800, John Hubbard wrote:
> > On 2/4/21 4:12 PM, Minchan Kim wrote:
> > ...
> > > > > Then, how to know how often CMA API failed?
> > > >
> > > > Why would you even need to know that, *in addition* to knowing specific
> > > > page allocation numbers that failed? Again, there is no real-world motivation
> > > > cited yet, just "this is good data". Need more stories and support here.
> > >
> > > Let me give an example.
> > >
> > > Let' assume we use memory buffer allocation via CMA for bluetooth
> > > enable of  device.
> > > If user clicks the bluetooth button in the phone but fail to allocate
> > > the memory from CMA, user will still see bluetooth button gray.
> > > User would think his touch was not enough powerful so he try clicking
> > > again and fortunately CMA allocation was successful this time and
> > > they will see bluetooh button enabled and could listen the music.
> > >
> > > Here, product team needs to monitor how often CMA alloc failed so
> > > if the failure ratio is steadily increased than the bar,
> > > it means engineers need to go investigation.
> > >
> > > Make sense?
> > >
> >
> > Yes, except that it raises more questions:
> >
> > 1) Isn't this just standard allocation failure? Don't you already have a way
> > to track that?
> >
> > Presumably, having the source code, you can easily deduce that a bluetooth
> > allocation failure goes directly to a CMA allocation failure, right?
> >
> > Anyway, even though the above is still a little murky, I expect you're right
> > that it's good to have *some* indication, somewhere about CMA behavior...
> >
> > Thinking about this some more, I wonder if this is really /proc/vmstat sort
> > of data that we're talking about. It seems to fit right in there, yes?
>
> Thing is CMA instance are multiple, cma-A, cma-B, cma-C and each of CMA
> heap has own specific scenario. /proc/vmstat could be bloated a lot
> while CMA instance will be increased.

Oh, I missed the fact that you need these stats per-CMA.
John Hubbard Feb. 5, 2021, 2:52 a.m. UTC | #13
On 2/4/21 5:44 PM, Minchan Kim wrote:
> On Thu, Feb 04, 2021 at 04:24:20PM -0800, John Hubbard wrote:
>> On 2/4/21 4:12 PM, Minchan Kim wrote:
>> ...
>>>>> Then, how to know how often CMA API failed?
>>>>
>>>> Why would you even need to know that, *in addition* to knowing specific
>>>> page allocation numbers that failed? Again, there is no real-world motivation
>>>> cited yet, just "this is good data". Need more stories and support here.
>>>
>>> Let me give an example.
>>>
>>> Let' assume we use memory buffer allocation via CMA for bluetooth
>>> enable of  device.
>>> If user clicks the bluetooth button in the phone but fail to allocate
>>> the memory from CMA, user will still see bluetooth button gray.
>>> User would think his touch was not enough powerful so he try clicking
>>> again and fortunately CMA allocation was successful this time and
>>> they will see bluetooh button enabled and could listen the music.
>>>
>>> Here, product team needs to monitor how often CMA alloc failed so
>>> if the failure ratio is steadily increased than the bar,
>>> it means engineers need to go investigation.
>>>
>>> Make sense?
>>>
>>
>> Yes, except that it raises more questions:
>>
>> 1) Isn't this just standard allocation failure? Don't you already have a way
>> to track that?
>>
>> Presumably, having the source code, you can easily deduce that a bluetooth
>> allocation failure goes directly to a CMA allocation failure, right?

Still wondering about this...

>>
>> Anyway, even though the above is still a little murky, I expect you're right
>> that it's good to have *some* indication, somewhere about CMA behavior...
>>
>> Thinking about this some more, I wonder if this is really /proc/vmstat sort
>> of data that we're talking about. It seems to fit right in there, yes?
> 
> Thing is CMA instance are multiple, cma-A, cma-B, cma-C and each of CMA
> heap has own specific scenario. /proc/vmstat could be bloated a lot
> while CMA instance will be increased.
> 

Yes, that would not fit in /proc/vmstat...assuming that you really require
knowing--at this point--which CMA heap is involved. And that's worth poking
at. If you get an overall indication in vmstat that CMA is having trouble,
then maybe that's all you need to start digging further.

It's actually easier to monitor one or two simpler items than it is to monitor
a larger number of complicated items. And I get the impression that this is
sort of a top-level, production software indicator.

thanks,
Matthew Wilcox (Oracle) Feb. 5, 2021, 2:55 a.m. UTC | #14
On Wed, Feb 03, 2021 at 07:50:01AM -0800, Minchan Kim wrote:
> +++ b/mm/Makefile
> @@ -106,6 +106,7 @@ obj-$(CONFIG_ZSMALLOC)	+= zsmalloc.o
>  obj-$(CONFIG_Z3FOLD)	+= z3fold.o
>  obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
>  obj-$(CONFIG_CMA)	+= cma.o
> +obj-$(CONFIG_SYSFS)     += cma_sysfs.o

ehh ... if we have a kernel build with CMA=n, SYSFS=y, we'll get
cma_sysfs built in with no cma to report on.

> +static ssize_t cma_alloc_attempt_show(struct kobject *kobj,
> +			struct kobj_attribute *attr, char *buf)
> +{
> +	unsigned long val;
> +	struct cma_stat *stat = container_of(kobj, struct cma_stat, kobj);
> +
> +	val = stat->alloc_attempt;
> +
> +	return sysfs_emit(buf, "%lu\n", val);

Why not more simply:

{
	struct cma_stat *stat = container_of(kobj, struct cma_stat, kobj);
	return sysfs_emit(buf, "%lu\n", stat->alloc_attempt);
}

> +	for (i = 0; i < cma_area_count; i++) {
> +		cma = &cma_areas[i];
> +		stat = kzalloc(sizeof(*stat), GFP_KERNEL);
> +		if (!stat)
> +			goto out;

How many cma areas are there going to be?  Do we really want to allocate
their stats individually?
Minchan Kim Feb. 5, 2021, 5:17 a.m. UTC | #15
On Thu, Feb 04, 2021 at 06:52:01PM -0800, John Hubbard wrote:
> On 2/4/21 5:44 PM, Minchan Kim wrote:
> > On Thu, Feb 04, 2021 at 04:24:20PM -0800, John Hubbard wrote:
> > > On 2/4/21 4:12 PM, Minchan Kim wrote:
> > > ...
> > > > > > Then, how to know how often CMA API failed?
> > > > > 
> > > > > Why would you even need to know that, *in addition* to knowing specific
> > > > > page allocation numbers that failed? Again, there is no real-world motivation
> > > > > cited yet, just "this is good data". Need more stories and support here.
> > > > 
> > > > Let me give an example.
> > > > 
> > > > Let' assume we use memory buffer allocation via CMA for bluetooth
> > > > enable of  device.
> > > > If user clicks the bluetooth button in the phone but fail to allocate
> > > > the memory from CMA, user will still see bluetooth button gray.
> > > > User would think his touch was not enough powerful so he try clicking
> > > > again and fortunately CMA allocation was successful this time and
> > > > they will see bluetooh button enabled and could listen the music.
> > > > 
> > > > Here, product team needs to monitor how often CMA alloc failed so
> > > > if the failure ratio is steadily increased than the bar,
> > > > it means engineers need to go investigation.
> > > > 
> > > > Make sense?
> > > > 
> > > 
> > > Yes, except that it raises more questions:
> > > 
> > > 1) Isn't this just standard allocation failure? Don't you already have a way
> > > to track that?
> > > 
> > > Presumably, having the source code, you can easily deduce that a bluetooth
> > > allocation failure goes directly to a CMA allocation failure, right?
> 
> Still wondering about this...

It would work if we had the full source code and the stacks were not
complicated for every use case. Having said that, having a good central
place where this automatically pops up is also beneficial, so that we
don't have to add similar statistics at each call site.

Why do we have so many items in slab sysfs instead of having each call
site invent its own?

> 
> > > 
> > > Anyway, even though the above is still a little murky, I expect you're right
> > > that it's good to have *some* indication, somewhere about CMA behavior...
> > > 
> > > Thinking about this some more, I wonder if this is really /proc/vmstat sort
> > > of data that we're talking about. It seems to fit right in there, yes?
> > 
> > Thing is CMA instance are multiple, cma-A, cma-B, cma-C and each of CMA
> > heap has own specific scenario. /proc/vmstat could be bloated a lot
> > while CMA instance will be increased.
> > 
> 
> Yes, that would not fit in /proc/vmstat...assuming that you really require
> knowing--at this point--which CMA heap is involved. And that's worth poking
> at. If you get an overall indication in vmstat that CMA is having trouble,
> then maybe that's all you need to start digging further.

I agree it could help in deciding whether I should go digging further,
but anyway, I need to go through each instance once it happens.
For that, what I need is per-CMA statistics, not global ones.
I am happy to implement it, but I'd like to say it's not my use case.

> 
> It's actually easier to monitor one or two simpler items than it is to monitor
> a larger number of complicated items. And I get the impression that this is
> sort of a top-level, production software indicator.

Let me clarify one more time.

What I'd like to get ultimately is per-CMA statistics instead of
a global vmstat entry for this use case at the moment. Global vmstat
could help with deciding whether I should go deeper, but it ends up
needing per-CMA statistics anyway. And I'd like to keep them in sysfs,
not debugfs, since the interface should be stable enough for telemetry.

Which points of this view do you disagree with?
Minchan Kim Feb. 5, 2021, 5:22 a.m. UTC | #16
On Fri, Feb 05, 2021 at 02:55:26AM +0000, Matthew Wilcox wrote:
> On Wed, Feb 03, 2021 at 07:50:01AM -0800, Minchan Kim wrote:
> > +++ b/mm/Makefile
> > @@ -106,6 +106,7 @@ obj-$(CONFIG_ZSMALLOC)	+= zsmalloc.o
> >  obj-$(CONFIG_Z3FOLD)	+= z3fold.o
> >  obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
> >  obj-$(CONFIG_CMA)	+= cma.o
> > +obj-$(CONFIG_SYSFS)     += cma_sysfs.o
> 
> ehh ... if we have a kernel build with CMA=n, SYSFS=y, we'll get
> cma_sysfs built in with no cma to report on.

OMG. Let me fix it.

> 
> > +static ssize_t cma_alloc_attempt_show(struct kobject *kobj,
> > +			struct kobj_attribute *attr, char *buf)
> > +{
> > +	unsigned long val;
> > +	struct cma_stat *stat = container_of(kobj, struct cma_stat, kobj);
> > +
> > +	val = stat->alloc_attempt;
> > +
> > +	return sysfs_emit(buf, "%lu\n", val);
> 
> Why not more simply:
> 
> {
> 	struct cma_stat *stat = container_of(kobj, struct cma_stat, kobj);
> 	return sysfs_emit(buf, "%lu\n", stat->alloc_attempt);

It's a leftover from when I used a lock there, which was removed in the
end. Will follow your suggestion.

> }
> 
> > +	for (i = 0; i < cma_area_count; i++) {
> > +		cma = &cma_areas[i];
> > +		stat = kzalloc(sizeof(*stat), GFP_KERNEL);
> > +		if (!stat)
> > +			goto out;
> 
> How many cma areas are there going to be?  do we really want to allocate
> their stat individually?

I am not sure what it will be in the end, but at the moment I have
5+ candidates (which could shrink or grow), and yes, I want to keep
track of them individually.
John Hubbard Feb. 5, 2021, 5:49 a.m. UTC | #17
On 2/4/21 9:17 PM, Minchan Kim wrote:
...
>>>> Presumably, having the source code, you can easily deduce that a bluetooth
>>>> allocation failure goes directly to a CMA allocation failure, right?
>>
>> Still wondering about this...
> 
> It would work if we have full source code and stack are not complicated for
> every usecases. Having said, having a good central place automatically
> popped up is also beneficial for not to add similar statistics for each
> call sites.
> 
> Why do we have too many item in slab sysfs instead of creating each call
> site inventing on each own?
> 

I'm not sure I understand that question fully, but I don't think we need to
invent anything unique here. So far we've discussed debugfs, sysfs, and /proc,
none of which are new mechanisms.

...

>> It's actually easier to monitor one or two simpler items than it is to monitor
>> a larger number of complicated items. And I get the impression that this is
>> sort of a top-level, production software indicator.
> 
> Let me clarify one more time.
> 
> What I'd like to get ultimately is per-CMA statistics instead of
> global vmstat for the usecase at this moment. Global vmstat
> could help the decision whether I should go deeper but it ends up
> needing per-CMA statistics. And I'd like to keep them in sysfs,
> not debugfs since it should be stable as a telemetric.
> 
> What points do you disagree in this view?


No huge disagreements, I just want to get us down to the true essential elements
of what is required--and find a good home for the data. Initial debugging always
has excesses, and those should not end up in the more carefully vetted production
code.

If I were doing this, I'd probably consider HugeTLB pages as an example to follow,
because they have a lot in common with CMA: it's another memory allocation pool, and
people also want to monitor it.

HugeTLB pages and THP pages are monitored in /proc:
	/proc/meminfo and /proc/vmstat:

# cat meminfo |grep -i huge
AnonHugePages:     88064 kB
ShmemHugePages:        0 kB
FileHugePages:         0 kB
HugePages_Total:     500
HugePages_Free:      500
HugePages_Rsvd:        0
HugePages_Surp:        0
Hugepagesize:       2048 kB
Hugetlb:         1024000 kB

# cat vmstat | grep -i huge
nr_shmem_hugepages 0
nr_file_hugepages 0
nr_anon_transparent_hugepages 43
numa_huge_pte_updates 0

...aha, so is CMA:

# cat vmstat | grep -i cma
nr_free_cma 261718

# cat meminfo | grep -i cma
CmaTotal:        1048576 kB
CmaFree:         1046872 kB

OK, given that CMA is already in those two locations, maybe we should put
this information in one or both of those, yes?


thanks,
Minchan Kim Feb. 5, 2021, 6:24 a.m. UTC | #18
On Thu, Feb 04, 2021 at 09:49:54PM -0800, John Hubbard wrote:
> On 2/4/21 9:17 PM, Minchan Kim wrote:
> ...
> > > > > Presumably, having the source code, you can easily deduce that a bluetooth
> > > > > allocation failure goes directly to a CMA allocation failure, right?
> > > 
> > > Still wondering about this...
> > 
> > It would work if we have full source code and stack are not complicated for
> > every usecases. Having said, having a good central place automatically
> > popped up is also beneficial for not to add similar statistics for each
> > call sites.
> > 
> > Why do we have too many item in slab sysfs instead of creating each call
> > site inventing on each own?
> > 
> 
> I'm not sure I understand that question fully, but I don't think we need to
> invent anything unique here. So far we've discussed debugfs, sysfs, and /proc,
> none of which are new mechanisms.

I thought you asked why we couldn't add those stats in each call-site
driver's sysfs instead of a central place. Please clarify if I
misunderstood your question.

> 
> ...
> 
> > > It's actually easier to monitor one or two simpler items than it is to monitor
> > > a larger number of complicated items. And I get the impression that this is
> > > sort of a top-level, production software indicator.
> > 
> > Let me clarify one more time.
> > 
> > What I'd like to get ultimately is per-CMA statistics instead of
> > global vmstat for the usecase at this moment. Global vmstat
> > could help the decision whether I should go deeper but it ends up
> > needing per-CMA statistics. And I'd like to keep them in sysfs,
> > not debugfs since it should be stable as a telemetric.
> > 
> > What points do you disagree in this view?
> 
> 
> No huge disagreements, I just want to get us down to the true essential elements
> of what is required--and find a good home for the data. Initial debugging always
> has excesses, and those should not end up in the more carefully vetted production
> code.
> 
> If I were doing this, I'd probably consider HugeTLB pages as an example to follow,
> because they have a lot in common with CMA: it's another memory allocation pool, and
> people also want to monitor it.
> 
> HugeTLB pages and THP pages are monitored in /proc:
> 	/proc/meminfo and /proc/vmstat:
> 
> # cat meminfo |grep -i huge
> AnonHugePages:     88064 kB
> ShmemHugePages:        0 kB
> FileHugePages:         0 kB
> HugePages_Total:     500
> HugePages_Free:      500
> HugePages_Rsvd:        0
> HugePages_Surp:        0
> Hugepagesize:       2048 kB
> Hugetlb:         1024000 kB
> 
> # cat vmstat | grep -i huge
> nr_shmem_hugepages 0
> nr_file_hugepages 0
> nr_anon_transparent_hugepages 43
> numa_huge_pte_updates 0
> 
> ...aha, so is CMA:
> 
> # cat vmstat | grep -i cma
> nr_free_cma 261718
> 
> # cat meminfo | grep -i cma
> CmaTotal:        1048576 kB
> CmaFree:         1046872 kB
> 
> OK, given that CMA is already in those two locations, maybe we should put
> this information in one or both of those, yes?

Do you suggest something like this, for example?


cat vmstat | grep -i cma
cma_a_success	125
cma_a_fail	25
cma_b_success	130
cma_b_fail	156
..
cma_f_fail	xxx
John Hubbard Feb. 5, 2021, 6:41 a.m. UTC | #19
On 2/4/21 10:24 PM, Minchan Kim wrote:
> On Thu, Feb 04, 2021 at 09:49:54PM -0800, John Hubbard wrote:
>> On 2/4/21 9:17 PM, Minchan Kim wrote:
...
>> # cat vmstat | grep -i cma
>> nr_free_cma 261718
>>
>> # cat meminfo | grep -i cma
>> CmaTotal:        1048576 kB
>> CmaFree:         1046872 kB
>>
>> OK, given that CMA is already in those two locations, maybe we should put
>> this information in one or both of those, yes?
> 
> Do you suggest something liks this, for example?
> 
> 
> cat vmstat | grep -i cma
> cma_a_success	125
> cma_a_fail	25
> cma_b_success	130
> cma_b_fail	156
> ..
> cma_f_fail	xxx
> 

Yes, approximately. I was wondering if this would suffice at least as a baseline:

cma_alloc_success   125
cma_alloc_failure   25

...and then, to see if more is needed, some questions:

a)  Do you know of an upper bound on how many cma areas there can be
(I think Matthew also asked that)?

b) Is tracking the cma area really as valuable as other possibilities? We can put
"a few" to "several" items here, so really want to get your very favorite bits of
information in. If, for example, there can be *lots* of cma areas, then maybe tracking
by a range of allocation sizes is better...


thanks,
Matthew Wilcox (Oracle) Feb. 5, 2021, 12:12 p.m. UTC | #20
On Thu, Feb 04, 2021 at 09:22:18PM -0800, Minchan Kim wrote:
> > > +	for (i = 0; i < cma_area_count; i++) {
> > > +		cma = &cma_areas[i];
> > > +		stat = kzalloc(sizeof(*stat), GFP_KERNEL);
> > > +		if (!stat)
> > > +			goto out;
> > 
> > How many cma areas are there going to be?  do we really want to allocate
> > their stat individually?
> 
> I am not sure what could be in the end but at least, I have
> 5+ candidates (but could be shrink or extend) and yes,
> want to keep track them individually.

I meant, wouldn't it be better to:

	cma_stats = kzalloc(array_size(sizeof(*stat), cma_area_count),
				GFP_KERNEL);
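
Something like this, say (just a rough sketch reusing the struct cma_stat
from the patch; note the kobject release callback would then have to stop
kfree()ing individual entries, since array elements can't be freed one by
one):

	static struct cma_stat *cma_stats;

	static int __init cma_sysfs_init(void)
	{
		int i;

		/* one allocation for all CMA areas instead of one kzalloc() per area */
		cma_stats = kzalloc(array_size(sizeof(*cma_stats), cma_area_count),
				    GFP_KERNEL);
		if (!cma_stats)
			return -ENOMEM;

		for (i = 0; i < cma_area_count; i++) {
			struct cma_stat *stat = &cma_stats[i];

			spin_lock_init(&stat->lock);
			cma_areas[i].stat = stat;
			/* kobject_init_and_add() etc. as in the patch */
		}
		return 0;
	}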
Minchan Kim Feb. 5, 2021, 4:15 p.m. UTC | #21
On Thu, Feb 04, 2021 at 10:41:14PM -0800, John Hubbard wrote:
> On 2/4/21 10:24 PM, Minchan Kim wrote:
> > On Thu, Feb 04, 2021 at 09:49:54PM -0800, John Hubbard wrote:
> > > On 2/4/21 9:17 PM, Minchan Kim wrote:
> ...
> > > # cat vmstat | grep -i cma
> > > nr_free_cma 261718
> > > 
> > > # cat meminfo | grep -i cma
> > > CmaTotal:        1048576 kB
> > > CmaFree:         1046872 kB
> > > 
> > > OK, given that CMA is already in those two locations, maybe we should put
> > > this information in one or both of those, yes?
> > 
> > Do you suggest something liks this, for example?
> > 
> > 
> > cat vmstat | grep -i cma
> > cma_a_success	125
> > cma_a_fail	25
> > cma_b_success	130
> > cma_b_fail	156
> > ..
> > cma_f_fail	xxx
> > 
> 
> Yes, approximately. I was wondering if this would suffice at least as a baseline:
> 
> cma_alloc_success   125
> cma_alloc_failure   25

IMO, regardless of my patch, it would be good to have such statistics, in
that CMA was born to replace carved-out memory with dynamic allocation for
better memory efficiency, so failures should be regarded as critical and an
admin should be able to notice how the system is hurt.

Anyway, it's not enough for me and is orthogonal to my goal.

> 
> ...and then, to see if more is needed, some questions:
> 
> a)  Do you know of an upper bound on how many cma areas there can be
> (I think Matthew also asked that)?

There is no upper bound since it's configurable.

> 
> b) Is tracking the cma area really as valuable as other possibilities? We can put
> "a few" to "several" items here, so really want to get your very favorite bits of
> information in. If, for example, there can be *lots* of cma areas, then maybe tracking

At this moment, I want allocation/failure counts for each CMA area, since
each area has its own particular usecase, which makes it easy for me to tell
which module will be affected. I think per-CMA statistics are very useful
for a minimal code change, so I want to enable them by default under
CONFIG_CMA && CONFIG_SYSFS.

> by a range of allocation sizes is better...

I take your suggestion to be something like this.

[alloc_range] could be order or range by interval

/sys/kernel/mm/cma/cma-A/[alloc_range]/success
/sys/kernel/mm/cma/cma-A/[alloc_range]/fail
..
..
/sys/kernel/mm/cma/cma-Z/[alloc_range]/success
/sys/kernel/mm/cma/cma-Z/[alloc_range]/fail

I agree it would also be useful, but I'd like to enable it under
CONFIG_CMA_SYSFS_ALLOC_RANGE as a separate patchset.
Minchan Kim Feb. 5, 2021, 4:16 p.m. UTC | #22
On Fri, Feb 05, 2021 at 12:12:17PM +0000, Matthew Wilcox wrote:
> On Thu, Feb 04, 2021 at 09:22:18PM -0800, Minchan Kim wrote:
> > > > +	for (i = 0; i < cma_area_count; i++) {
> > > > +		cma = &cma_areas[i];
> > > > +		stat = kzalloc(sizeof(*stat), GFP_KERNEL);
> > > > +		if (!stat)
> > > > +			goto out;
> > > 
> > > How many cma areas are there going to be?  do we really want to allocate
> > > their stat individually?
> > 
> > I am not sure what could be in the end but at least, I have
> > 5+ candidates (but could be shrink or extend) and yes,
> > want to keep track them individually.
> 
> I meant, wouldn't it be better to:
> 
> 	cma_stats = kzalloc(array_size(sizeof(*stat), cma_area_count),
> 				GFP_KERNEL);
> 

Definitely.
Thanks, Matthew.
John Hubbard Feb. 5, 2021, 8:25 p.m. UTC | #23
On 2/5/21 8:15 AM, Minchan Kim wrote:
...
>> Yes, approximately. I was wondering if this would suffice at least as a baseline:
>>
>> cma_alloc_success   125
>> cma_alloc_failure   25
> 
> IMO, regardless of the my patch, it would be good to have such statistics
> in that CMA was born to replace carved out memory with dynamic allocation
> ideally for memory efficiency ideally so failure should regard critical
> so admin could notice it how the system is hurt.

Right. So CMA failures are useful for the admin to see, understood.

> 
> Anyway, it's not enough for me and orthgonal with my goal.
> 

OK. But...what *is* your goal, and why is this useless (that's what
orthogonal really means here) for your goal?

Also, would you be willing to try out something simple first,
such as providing an indication that cma is active and its overall success
rate, like this:

/proc/vmstat:

cma_alloc_success   125
cma_alloc_failure   25

...or is the only way to provide the more detailed items, complete with
per-CMA details, in a non-debugfs location?
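
(For what it's worth, here is a rough sketch of that vmstat-only variant.
The two event names are made up for illustration, not existing counters,
and the strings that actually show up in /proc/vmstat would come from
vmstat_text[] in mm/vmstat.c:)

	/* hypothetical entries in the CONFIG_CMA section of vm_event_item.h */
	CMA_ALLOC_SUCCESS,
	CMA_ALLOC_FAIL,

	/* then, at the end of cma_alloc() in mm/cma.c */
	if (page)
		count_vm_event(CMA_ALLOC_SUCCESS);
	else
		count_vm_event(CMA_ALLOC_FAIL);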


>>
>> ...and then, to see if more is needed, some questions:
>>
>> a)  Do you know of an upper bound on how many cma areas there can be
>> (I think Matthew also asked that)?
> 
> There is no upper bound since it's configurable.
> 

OK, thanks, so that pretty much rules out putting per-cma details into
anything other than a directory or something like it.

>>
>> b) Is tracking the cma area really as valuable as other possibilities? We can put
>> "a few" to "several" items here, so really want to get your very favorite bits of
>> information in. If, for example, there can be *lots* of cma areas, then maybe tracking
> 
> At this moment, allocation/failure for each CMA area since they have
> particular own usecase, which makes me easy to keep which module will
> be affected. I think it is very useful per-CMA statistics as minimum
> code change so I want to enable it by default under CONFIG_CMA && CONFIG_SYSFS.
> 
>> by a range of allocation sizes is better...
> 
> I takes your suggestion something like this.
> 
> [alloc_range] could be order or range by interval
> 
> /sys/kernel/mm/cma/cma-A/[alloc_range]/success
> /sys/kernel/mm/cma/cma-A/[alloc_range]/fail
> ..
> ..
> /sys/kernel/mm/cma/cma-Z/[alloc_range]/success
> /sys/kernel/mm/cma/cma-Z/[alloc_range]/fail

Actually, I meant, "ranges instead of cma areas", like this:

/<path-to-cma-data/[alloc_range_1]/success
/<path-to-cma-data/[alloc_range_1]/fail
/<path-to-cma-data/[alloc_range_2]/success
/<path-to-cma-data/[alloc_range_2]/fail
...
/<path-to-cma-data/[alloc_range_max]/success
/<path-to-cma-data/[alloc_range_max]/fail

The idea is that knowing the allocation sizes that succeeded
and failed is maybe even more interesting and useful than
knowing the cma area that contains them.
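
(If it helps, here is a rough sketch of what per-size tracking could look
like, independent of which cma area the pages came from. All names are
hypothetical and the cap on tracked orders is arbitrary:)

	#define CMA_TRACK_MAX_ORDER	10	/* 4MB with 4K pages */

	static atomic64_t cma_range_success[CMA_TRACK_MAX_ORDER + 1];
	static atomic64_t cma_range_fail[CMA_TRACK_MAX_ORDER + 1];

	/* called from cma_alloc() with the request size and the outcome */
	static void cma_track_range(size_t count, bool succeeded)
	{
		unsigned int order = min_t(unsigned int,
					   get_order(count << PAGE_SHIFT),
					   CMA_TRACK_MAX_ORDER);

		if (succeeded)
			atomic64_inc(&cma_range_success[order]);
		else
			atomic64_inc(&cma_range_fail[order]);
	}
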

> 
> I agree it would be also useful but I'd like to enable it under
> CONFIG_CMA_SYSFS_ALLOC_RANGE as separate patchset.
> 

I will stop harassing you very soon, just want to bottom out on
understanding the real goals first. :)

thanks,
Minchan Kim Feb. 5, 2021, 9:28 p.m. UTC | #24
On Fri, Feb 05, 2021 at 12:25:52PM -0800, John Hubbard wrote:
> On 2/5/21 8:15 AM, Minchan Kim wrote:
> ...
> > > Yes, approximately. I was wondering if this would suffice at least as a baseline:
> > > 
> > > cma_alloc_success   125
> > > cma_alloc_failure   25
> > 
> > IMO, regardless of the my patch, it would be good to have such statistics
> > in that CMA was born to replace carved out memory with dynamic allocation
> > ideally for memory efficiency ideally so failure should regard critical
> > so admin could notice it how the system is hurt.
> 
> Right. So CMA failures are useful for the admin to see, understood.
> 
> > 
> > Anyway, it's not enough for me and orthgonal with my goal.
> > 
> 
> OK. But...what *is* your goal, and why is this useless (that's what
> orthogonal really means here) for your goal?

As I mentioned, the goal is to monitor failures from each CMA area, since
each of them has its own purpose.

Let's have an example.

The system has 5 CMA areas and each CMA is associated with a particular
user scenario. They have exclusive CMA areas to avoid fragmentation
problems.

CMA-1 depends on bluetooth
CMA-2 depends on WIFI
CMA-3 depends on sensor-A
CMA-4 depends on sensor-B
CMA-5 depends on sensor-C

With this, we can catch which module was affected, but with only a global
failure count I couldn't find who was affected.

> 
> Also, would you be willing to try out something simple first,
> such as providing indication that cma is active and it's overall success
> rate, like this:
> 
> /proc/vmstat:
> 
> cma_alloc_success   125
> cma_alloc_failure   25
> 
> ...or is the only way to provide the more detailed items, complete with
> per-CMA details, in a non-debugfs location?
> 
> 
> > > 
> > > ...and then, to see if more is needed, some questions:
> > > 
> > > a)  Do you know of an upper bound on how many cma areas there can be
> > > (I think Matthew also asked that)?
> > 
> > There is no upper bound since it's configurable.
> > 
> 
> OK, thanks,so that pretty much rules out putting per-cma details into
> anything other than a directory or something like it.
> 
> > > 
> > > b) Is tracking the cma area really as valuable as other possibilities? We can put
> > > "a few" to "several" items here, so really want to get your very favorite bits of
> > > information in. If, for example, there can be *lots* of cma areas, then maybe tracking
> > 
> > At this moment, allocation/failure for each CMA area since they have
> > particular own usecase, which makes me easy to keep which module will
> > be affected. I think it is very useful per-CMA statistics as minimum
> > code change so I want to enable it by default under CONFIG_CMA && CONFIG_SYSFS.
> > 
> > > by a range of allocation sizes is better...
> > 
> > I takes your suggestion something like this.
> > 
> > [alloc_range] could be order or range by interval
> > 
> > /sys/kernel/mm/cma/cma-A/[alloc_range]/success
> > /sys/kernel/mm/cma/cma-A/[alloc_range]/fail
> > ..
> > ..
> > /sys/kernel/mm/cma/cma-Z/[alloc_range]/success
> > /sys/kernel/mm/cma/cma-Z/[alloc_range]/fail
> 
> Actually, I meant, "ranges instead of cma areas", like this:
> 
> /<path-to-cma-data/[alloc_range_1]/success
> /<path-to-cma-data/[alloc_range_1]/fail
> /<path-to-cma-data/[alloc_range_2]/success
> /<path-to-cma-data/[alloc_range_2]/fail
> ...
> /<path-to-cma-data/[alloc_range_max]/success
> /<path-to-cma-data/[alloc_range_max]/fail
> 
> The idea is that knowing the allocation sizes that succeeded
> and failed is maybe even more interesting and useful than
> knowing the cma area that contains them.

I understand your point, but it would make it hard to find who was
affected by the failure. That's why I suggested putting your idea under an
additional config, since a per-CMA metric with a simple success/failure
count is enough for me.

> 
> > 
> > I agree it would be also useful but I'd like to enable it under
> > CONFIG_CMA_SYSFS_ALLOC_RANGE as separate patchset.
> > 
> 
> I will stop harassing you very soon, just want to bottom out on
> understanding the real goals first. :)
> 

I hope my example makes the goal more clear for you.
Suren Baghdasaryan Feb. 5, 2021, 9:52 p.m. UTC | #25
On Fri, Feb 5, 2021 at 1:28 PM Minchan Kim <minchan@kernel.org> wrote:
>
> On Fri, Feb 05, 2021 at 12:25:52PM -0800, John Hubbard wrote:
> > On 2/5/21 8:15 AM, Minchan Kim wrote:
> > ...
> > > > Yes, approximately. I was wondering if this would suffice at least as a baseline:
> > > >
> > > > cma_alloc_success   125
> > > > cma_alloc_failure   25
> > >
> > > IMO, regardless of the my patch, it would be good to have such statistics
> > > in that CMA was born to replace carved out memory with dynamic allocation
> > > ideally for memory efficiency ideally so failure should regard critical
> > > so admin could notice it how the system is hurt.
> >
> > Right. So CMA failures are useful for the admin to see, understood.
> >
> > >
> > > Anyway, it's not enough for me and orthgonal with my goal.
> > >
> >
> > OK. But...what *is* your goal, and why is this useless (that's what
> > orthogonal really means here) for your goal?
>
> As I mentioned, the goal is to monitor the failure from each of CMA
> since they have each own purpose.
>
> Let's have an example.
>
> System has 5 CMA area and each CMA is associated with each
> user scenario. They have exclusive CMA area to avoid
> fragmentation problem.
>
> CMA-1 depends on bluetooh
> CMA-2 depends on WIFI
> CMA-3 depends on sensor-A
> CMA-4 depends on sensor-B
> CMA-5 depends on sensor-C
>
> With this, we could catch which module was affected but with global failure,
> I couldn't find who was affected.
>
> >
> > Also, would you be willing to try out something simple first,
> > such as providing indication that cma is active and it's overall success
> > rate, like this:
> >
> > /proc/vmstat:
> >
> > cma_alloc_success   125
> > cma_alloc_failure   25
> >
> > ...or is the only way to provide the more detailed items, complete with
> > per-CMA details, in a non-debugfs location?
> >
> >
> > > >
> > > > ...and then, to see if more is needed, some questions:
> > > >
> > > > a)  Do you know of an upper bound on how many cma areas there can be
> > > > (I think Matthew also asked that)?
> > >
> > > There is no upper bound since it's configurable.
> > >
> >
> > OK, thanks,so that pretty much rules out putting per-cma details into
> > anything other than a directory or something like it.
> >
> > > >
> > > > b) Is tracking the cma area really as valuable as other possibilities? We can put
> > > > "a few" to "several" items here, so really want to get your very favorite bits of
> > > > information in. If, for example, there can be *lots* of cma areas, then maybe tracking
> > >
> > > At this moment, allocation/failure for each CMA area since they have
> > > particular own usecase, which makes me easy to keep which module will
> > > be affected. I think it is very useful per-CMA statistics as minimum
> > > code change so I want to enable it by default under CONFIG_CMA && CONFIG_SYSFS.
> > >
> > > > by a range of allocation sizes is better...
> > >
> > > I takes your suggestion something like this.
> > >
> > > [alloc_range] could be order or range by interval
> > >
> > > /sys/kernel/mm/cma/cma-A/[alloc_range]/success
> > > /sys/kernel/mm/cma/cma-A/[alloc_range]/fail
> > > ..
> > > ..
> > > /sys/kernel/mm/cma/cma-Z/[alloc_range]/success
> > > /sys/kernel/mm/cma/cma-Z/[alloc_range]/fail

The interface above seems to me the most useful actually, if by
[alloc_range] you mean the different allocation orders. This would
cover Minchan's per-CMA failure tracking and would also allow us to
understand what kind of allocations are failing and therefore if the
problem is caused by pinning/fragmentation or by over-utilization.

> >
> > Actually, I meant, "ranges instead of cma areas", like this:
> >
> > /<path-to-cma-data/[alloc_range_1]/success
> > /<path-to-cma-data/[alloc_range_1]/fail
> > /<path-to-cma-data/[alloc_range_2]/success
> > /<path-to-cma-data/[alloc_range_2]/fail
> > ...
> > /<path-to-cma-data/[alloc_range_max]/success
> > /<path-to-cma-data/[alloc_range_max]/fail
> >
> > The idea is that knowing the allocation sizes that succeeded
> > and failed is maybe even more interesting and useful than
> > knowing the cma area that contains them.
>
> Understand your point but it would make hard to find who was
> affected by the failure. That's why I suggested to have your
> suggestion under additional config since per-cma metric with
> simple sucess/failure are enough.
>
> >
> > >
> > > I agree it would be also useful but I'd like to enable it under
> > > CONFIG_CMA_SYSFS_ALLOC_RANGE as separate patchset.
> > >
> >
> > I will stop harassing you very soon, just want to bottom out on
> > understanding the real goals first. :)
> >
>
> I hope my example makes the goal more clear for you.
John Hubbard Feb. 5, 2021, 9:57 p.m. UTC | #26
On 2/5/21 1:28 PM, Minchan Kim wrote:
> On Fri, Feb 05, 2021 at 12:25:52PM -0800, John Hubbard wrote:
>> On 2/5/21 8:15 AM, Minchan Kim wrote:
>> ...
>> OK. But...what *is* your goal, and why is this useless (that's what
>> orthogonal really means here) for your goal?
> 
> As I mentioned, the goal is to monitor the failure from each of CMA
> since they have each own purpose.
> 
> Let's have an example.
> 
> System has 5 CMA area and each CMA is associated with each
> user scenario. They have exclusive CMA area to avoid
> fragmentation problem.
> 
> CMA-1 depends on bluetooh
> CMA-2 depends on WIFI
> CMA-3 depends on sensor-A
> CMA-4 depends on sensor-B
> CMA-5 depends on sensor-C
> 

aha, finally. I had no idea that sort of use case was happening.

This would be good to put in the patch commit description.

> With this, we could catch which module was affected but with global failure,
> I couldn't find who was affected.
> 
>>
>> Also, would you be willing to try out something simple first,
>> such as providing indication that cma is active and it's overall success
>> rate, like this:
>>
>> /proc/vmstat:
>>
>> cma_alloc_success   125
>> cma_alloc_failure   25
>>
>> ...or is the only way to provide the more detailed items, complete with
>> per-CMA details, in a non-debugfs location?
>>
>>
>>>>
>>>> ...and then, to see if more is needed, some questions:
>>>>
>>>> a)  Do you know of an upper bound on how many cma areas there can be
>>>> (I think Matthew also asked that)?
>>>
>>> There is no upper bound since it's configurable.
>>>
>>
>> OK, thanks,so that pretty much rules out putting per-cma details into
>> anything other than a directory or something like it.
>>
>>>>
>>>> b) Is tracking the cma area really as valuable as other possibilities? We can put
>>>> "a few" to "several" items here, so really want to get your very favorite bits of
>>>> information in. If, for example, there can be *lots* of cma areas, then maybe tracking
>>>
>>> At this moment, allocation/failure for each CMA area since they have
>>> particular own usecase, which makes me easy to keep which module will
>>> be affected. I think it is very useful per-CMA statistics as minimum
>>> code change so I want to enable it by default under CONFIG_CMA && CONFIG_SYSFS.
>>>
>>>> by a range of allocation sizes is better...
>>>
>>> I takes your suggestion something like this.
>>>
>>> [alloc_range] could be order or range by interval
>>>
>>> /sys/kernel/mm/cma/cma-A/[alloc_range]/success
>>> /sys/kernel/mm/cma/cma-A/[alloc_range]/fail
>>> ..
>>> ..
>>> /sys/kernel/mm/cma/cma-Z/[alloc_range]/success
>>> /sys/kernel/mm/cma/cma-Z/[alloc_range]/fail
>>
>> Actually, I meant, "ranges instead of cma areas", like this:
>>
>> /<path-to-cma-data/[alloc_range_1]/success
>> /<path-to-cma-data/[alloc_range_1]/fail
>> /<path-to-cma-data/[alloc_range_2]/success
>> /<path-to-cma-data/[alloc_range_2]/fail
>> ...
>> /<path-to-cma-data/[alloc_range_max]/success
>> /<path-to-cma-data/[alloc_range_max]/fail
>>
>> The idea is that knowing the allocation sizes that succeeded
>> and failed is maybe even more interesting and useful than
>> knowing the cma area that contains them.
> 
> Understand your point but it would make hard to find who was
> affected by the failure. That's why I suggested to have your
> suggestion under additional config since per-cma metric with
> simple sucess/failure are enough.
> 
>>
>>>
>>> I agree it would be also useful but I'd like to enable it under
>>> CONFIG_CMA_SYSFS_ALLOC_RANGE as separate patchset.
>>>
>>
>> I will stop harassing you very soon, just want to bottom out on
>> understanding the real goals first. :)
>>
> 
> I hope my example makes the goal more clear for you.
> 

Yes it does. Based on the (rather surprising) use of cma-area-per-device,
it seems clear that you will need this, so I'll drop my objections to
putting it in sysfs.

I still think the "number of allocation failures" needs refining, probably
via a range-based thing, as we've discussed. But the number of pages
failed per cma looks OK now.



thanks,
John Hubbard Feb. 5, 2021, 9:58 p.m. UTC | #27
On 2/5/21 1:52 PM, Suren Baghdasaryan wrote:
>>>> I takes your suggestion something like this.
>>>>
>>>> [alloc_range] could be order or range by interval
>>>>
>>>> /sys/kernel/mm/cma/cma-A/[alloc_range]/success
>>>> /sys/kernel/mm/cma/cma-A/[alloc_range]/fail
>>>> ..
>>>> ..
>>>> /sys/kernel/mm/cma/cma-Z/[alloc_range]/success
>>>> /sys/kernel/mm/cma/cma-Z/[alloc_range]/fail
> 
> The interface above seems to me the most useful actually, if by
> [alloc_range] you mean the different allocation orders. This would
> cover Minchan's per-CMA failure tracking and would also allow us to
> understand what kind of allocations are failing and therefore if the
> problem is caused by pinning/fragmentation or by over-utilization.
> 

I agree. That seems about right, now that we've established that
cma areas are a must-have.

thanks,
Minchan Kim Feb. 5, 2021, 10:47 p.m. UTC | #28
On Fri, Feb 05, 2021 at 01:58:06PM -0800, John Hubbard wrote:
> On 2/5/21 1:52 PM, Suren Baghdasaryan wrote:
> > > > > I takes your suggestion something like this.
> > > > > 
> > > > > [alloc_range] could be order or range by interval
> > > > > 
> > > > > /sys/kernel/mm/cma/cma-A/[alloc_range]/success
> > > > > /sys/kernel/mm/cma/cma-A/[alloc_range]/fail
> > > > > ..
> > > > > ..
> > > > > /sys/kernel/mm/cma/cma-Z/[alloc_range]/success
> > > > > /sys/kernel/mm/cma/cma-Z/[alloc_range]/fail
> > 
> > The interface above seems to me the most useful actually, if by
> > [alloc_range] you mean the different allocation orders. This would
> > cover Minchan's per-CMA failure tracking and would also allow us to
> > understand what kind of allocations are failing and therefore if the
> > problem is caused by pinning/fragmentation or by over-utilization.
> > 
> 
> I agree. That seems about right, now that we've established that
> cma areas are a must-have.

Okay, we've agreed on the direction now, so let me do that in the next
version. If you don't think it's reasonable, let me know.

* I will drop the number of CMA *page* allocation attempts/failures to
make a simple start
* I will keep CMA allocation attempts/failures
* They will be under /sys/kernel/mm/cma/cma-XX/success,fail (roughly as
  sketched below)
* It will be enabled under CONFIG_CMA && CONFIG_SYSFS
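
(roughly like this, with a purely hypothetical area name:)

	# cat /sys/kernel/mm/cma/cma-wifi/success
	130
	# cat /sys/kernel/mm/cma/cma-wifi/fail
	156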

Orthogonal work (different patchset)

* adding global CMA alloc/fail into vmstat
* adding alloc_range success/failure under CONFIG_CMA_ALLOC_TRACKING
  whatever configuration, or just by default if everyone agrees

Thanks.
Pintu Agarwal Feb. 6, 2021, 5:08 p.m. UTC | #29
On Sat, 6 Feb 2021 at 04:17, Minchan Kim <minchan@kernel.org> wrote:
>
> On Fri, Feb 05, 2021 at 01:58:06PM -0800, John Hubbard wrote:
> > On 2/5/21 1:52 PM, Suren Baghdasaryan wrote:
> > > > > > I takes your suggestion something like this.
> > > > > >
> > > > > > [alloc_range] could be order or range by interval
> > > > > >
> > > > > > /sys/kernel/mm/cma/cma-A/[alloc_range]/success
> > > > > > /sys/kernel/mm/cma/cma-A/[alloc_range]/fail
> > > > > > ..
> > > > > > ..
> > > > > > /sys/kernel/mm/cma/cma-Z/[alloc_range]/success
> > > > > > /sys/kernel/mm/cma/cma-Z/[alloc_range]/fail
> > >
> > > The interface above seems to me the most useful actually, if by
> > > [alloc_range] you mean the different allocation orders. This would
> > > cover Minchan's per-CMA failure tracking and would also allow us to
> > > understand what kind of allocations are failing and therefore if the
> > > problem is caused by pinning/fragmentation or by over-utilization.
> > >
> >
> > I agree. That seems about right, now that we've established that
> > cma areas are a must-have.
>
> Okay, now we agreed the dirction right now so let me do that in next
> version. If you don't see it's reasonable, let me know.
>
> * I will drop the number of CMA *page* allocation attemtps/failures to
> make simple start
> * I will keep CMA allocation attemtps/failures
> * They will be under /sys/kernel/mm/cma/cma-XX/success,fail
> * It will turn on CONFIG_CMA && CONFIG_SYSFS
>
> Orthognal work(diffrent patchset)
>
> * adding global CMA alloc/fail into vmstat
> * adding alloc_range success/failure under CONFIG_CMA_ALLOC_TRACKING
>   whatever configuration or just by default if everyone agree
>

> # cat meminfo | grep -i cma
> CmaTotal:        1048576 kB
> CmaFree:         1046872 kB

This CMA info was added by me way back in 2014.
At that time I even thought about adding this cma alloc/fail counter in vmstat.
At that time I also had an internal patch for it, but it was later dropped
(never released to mainline).
If required, I can re-work this part again.

And I have a few more points to add here.
1) In the past I have faced this cma allocation failure (which could
be common in small embedded devices).
Earlier it was even difficult to figure out whether a cma failure had
actually happened.
Thus I added this simple patch:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=v5.11-rc6&id=5984af1082f3b115082178ed88c47033d43b924d

2) IMO just reporting CMA alloc/fail may not be enough (at times). The
main point is: for which client/dev is this failure happening?
Sometimes just tuning the size or fixing the alignment can resolve the
failure if we know the client. For global CMA it will just be NULL
(dev).

3) IMO having 2 options, SYSFS and DEBUGFS, may confuse the
developer/user (personal experience). So are we going to remove the
DEBUGFS one or merge it?

4) Sometimes a CMA (or DMA) allocation failure can happen very early
during boot. At that point these stats are not useful anyway.
Some systems may not proceed if a CMA/DMA allocation is failing (again,
personal experience).
==> Anyway, this is altogether a different case...

5) The default max number of CMA areas is defined to be 7, but the user
can configure it to any number, maybe 20 or 50 (???)

6) Thus I would like to propose another thing here.
Just like we have /proc/vmallocinfo, /proc/slabinfo, etc., we can also
have: /proc/cmainfo
Here in /proc/cmainfo we can capture more detailed information:
Global CMA Data: Alloc/Free
Client specific data: name, size, success, fail, flags, align, etc
(just a random example).
==> This can dynamically grow as large as possible
==> It can also be enabled/disabled based on the CMA config itself (no
additional config required)

Any feedback on point (6) specifically?
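
To make (6) a bit more concrete, a rough sketch of such a read handler
could look like the following (the file and its layout are hypothetical;
it reuses the existing cma_for_each_area()/cma_get_name()/cma_get_size()
helpers and assumes linux/cma.h, linux/proc_fs.h and linux/seq_file.h;
the per-area success/fail counters would still need to exist somewhere):

	static int cmainfo_show_one(struct cma *cma, void *data)
	{
		struct seq_file *m = data;

		/* bytes -> kB; per-area success/fail/align details would follow */
		seq_printf(m, "%s: size %lu kB\n",
			   cma_get_name(cma), cma_get_size(cma) / 1024);
		return 0;
	}

	static int cmainfo_show(struct seq_file *m, void *v)
	{
		return cma_for_each_area(cmainfo_show_one, m);
	}

	static int __init cmainfo_init(void)
	{
		/* error handling omitted for brevity */
		proc_create_single("cmainfo", 0444, NULL, cmainfo_show);
		return 0;
	}
	subsys_initcall(cmainfo_init);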


Thanks,
Pintu
John Hubbard Feb. 8, 2021, 8:39 a.m. UTC | #30
On 2/6/21 9:08 AM, Pintu Agarwal wrote:
...
>> # cat meminfo | grep -i cma
>> CmaTotal:        1048576 kB
>> CmaFree:         1046872 kB
> 
> This CMA info was added by me way back in 2014.
> At that time I even thought about adding this cma alloc/fail counter in vmstat.
> That time I also had an internal patch about it but later dropped
> (never released to mainline).
> If required I can re-work again on this part.
> 
> And I have few more points to add here.
> 1) In the past I have faced this cma allocation failure (which could
> be common in small embedded devices).
> Earlier it was even difficult to figure if cma failure actually happened.
> Thus I added this simple patch:
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=v5.11-rc6&id=5984af1082f3b115082178ed88c47033d43b924d
> 
> 2) IMO just reporting CMA alloc/fail may not be enough (at times). The
> main point is for which client/dev is this failure happening ?
> Sometimes just tuning the size or fixing the alignment can resolve the
> failure if we know the client. For global CMA it will be just NULL
> (dev).
> 
> 3) IMO having 2 options SYSFS and DEBUGFS may confuse the
> developer/user (personal experience). So are we going to remove the
> DEBUGFS or merge it ?
> 

It is confusing to have a whole bunch of different places to find data
about a system. Here, I think the key is to add to the Documentation/
directory. So far, the documentation I see is:

     admin-guide/mm/cma_debugfs.rst: only covers debugfs
     admin-guide/kernel-parameters.txt:602: for CMA kernel parameters

If we add sysfs items then we will want a new .rst document that covers
that, and lists all the places to look.

So anyway, the underlying guidelines for which fs to use are approximately:

* sysfs: used for monitoring. One value per item, stable ABI, production.

* debugfs: *theoretically* not a stable ABI (we hope no one locks us in
by doing obnoxious things that break userspace if the debugfs APIs change).
The intention is that developers can put *anything* in there.

debugfs has a firm place in debugging, and is probably a keeper here.

I originally thought we should combine CMA's sysfs and debugfs items,
but Minchan listed an example that seems to show a pretty clear need
for monitoring of CMA areas, in production systems, and that's what
sysfs is for.

So it looks like we want both debugfs and sysfs for CMA, plus a new
overall CMA documentation that points to everything.

> 4) Sometimes CMA (or DMA) allocation failure can happen even very
> early in boot time itself. At that time these are anyways not useful.
> Some system may not proceed if CMA/DMA allocation is failing (again
> personal experience).
> ==> Anyways this is altogether is different case...
> 
> 5) The default max CMA areas are defined to be 7 but user can
> configure it to any number, may be 20 or 50 (???)
> 
> 6) Thus I would like to propose another thing here.
> Just like we have /proc/vmallocinfo, /proc/slabinfo, etc., we can also
> have: /proc/cmainfo
> Here in /proc/cmaminfo we can capute more detailed information:
> Global CMA Data: Alloc/Free
> Client specific data: name, size, success, fail, flags, align, etc
> (just a random example).
> ==> This can dynamically grow as large as possible
> ==> It can also be enabled/disabled based on CMA config itself (no
> additional config required)
> 
> Any feedback on point (6) specifically ?
> 

Well, /proc these days is for process-specific items. And CMA areas are
system-wide. So that's an argument against it. However...if there is any
process-specific CMA allocation info to report, then /proc is just the
right place for it.


thanks,
diff mbox series

Patch

diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-cma b/Documentation/ABI/testing/sysfs-kernel-mm-cma
new file mode 100644
index 000000000000..2a43c0aacc39
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-cma
@@ -0,0 +1,39 @@ 
+What:		/sys/kernel/mm/cma/
+Date:		Feb 2021
+Contact:	Minchan Kim <minchan@kernel.org>
+Description:
+		/sys/kernel/mm/cma/ contains a number of subdirectories by
+		cma-heap name. The subdirectory contains a number of files
+		to represent cma allocation statistics.
+
+		There are number of files under
+				/sys/kernel/mm/cma/<cma-heap-name> directory
+
+			- cma_alloc_attempt
+			- cma_alloc_fail
+			- alloc_pages_attempt
+			- alloc_pages_fail
+
+What:		/sys/kernel/mm/cma/<cma-heap-name>/cma_alloc_attempt
+Date:		Feb 2021
+Contact:	Minchan Kim <minchan@kernel.org>
+Description:
+		the number of cma_alloc API attempted
+
+What:		/sys/kernel/mm/cma/<cma-heap-name>/cma_alloc_fail
+Date:		Feb 2021
+Contact:	Minchan Kim <minchan@kernel.org>
+Description:
+		the number of CMA_alloc API failed
+
+What:		/sys/kernel/mm/cma/<cma-heap-name>/alloc_pages_attempt
+Date:		Feb 2021
+Contact:	Minchan Kim <minchan@kernel.org>
+Description:
+		the number of pages CMA API tried to allocate
+
+What:		/sys/kernel/mm/cma/<cma-heap-name>/alloc_pages_fail
+Date:		Feb 2021
+Contact:	Minchan Kim <minchan@kernel.org>
+Description:
+		the number of pages CMA API failed to allocate
diff --git a/include/linux/cma.h b/include/linux/cma.h
index 217999c8a762..71a28a5bb54e 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -49,4 +49,5 @@  extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
 extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count);
 
 extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data);
+
 #endif
diff --git a/mm/Makefile b/mm/Makefile
index b2a564eec27f..9c2c81ce3894 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -106,6 +106,7 @@  obj-$(CONFIG_ZSMALLOC)	+= zsmalloc.o
 obj-$(CONFIG_Z3FOLD)	+= z3fold.o
 obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
 obj-$(CONFIG_CMA)	+= cma.o
+obj-$(CONFIG_SYSFS)     += cma_sysfs.o
 obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
 obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
 obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
diff --git a/mm/cma.c b/mm/cma.c
index 0ba69cd16aeb..a25068b9d012 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -448,9 +448,10 @@  struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
 	offset = cma_bitmap_aligned_offset(cma, align);
 	bitmap_maxno = cma_bitmap_maxno(cma);
 	bitmap_count = cma_bitmap_pages_to_bits(cma, count);
+	cma_sysfs_alloc_count(cma, count);
 
 	if (bitmap_count > bitmap_maxno)
-		return NULL;
+		goto out;
 
 	for (;;) {
 		mutex_lock(&cma->lock);
@@ -505,6 +506,9 @@  struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
 			__func__, count, ret);
 		cma_debug_show_areas(cma);
 	}
+out:
+	if (!page)
+		cma_sysfs_fail(cma, count);
 
 	pr_debug("%s(): returned %p\n", __func__, page);
 	return page;
diff --git a/mm/cma.h b/mm/cma.h
index 42ae082cb067..e7e31012b44e 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -3,6 +3,16 @@ 
 #define __MM_CMA_H__
 
 #include <linux/debugfs.h>
+#include <linux/kobject.h>
+
+struct cma_stat {
+	spinlock_t lock;
+	unsigned long alloc_attempt;	/* the number of CMA allocation attempts */
+	unsigned long alloc_fail;	/* the number of CMA allocation failures */
+	unsigned long pages_attempt;	/* the number of CMA page allocation attempts */
+	unsigned long pages_fail;	/* the number of CMA page allocation failures */
+	struct kobject kobj;
+};
 
 struct cma {
 	unsigned long   base_pfn;
@@ -16,6 +26,9 @@  struct cma {
 	struct debugfs_u32_array dfs_bitmap;
 #endif
 	char name[CMA_MAX_NAME];
+#ifdef CONFIG_SYSFS
+	struct cma_stat	*stat;
+#endif
 };
 
 extern struct cma cma_areas[MAX_CMA_AREAS];
@@ -26,4 +39,11 @@  static inline unsigned long cma_bitmap_maxno(struct cma *cma)
 	return cma->count >> cma->order_per_bit;
 }
 
+#ifdef CONFIG_SYSFS
+void cma_sysfs_alloc_count(struct cma *cma, size_t count);
+void cma_sysfs_fail(struct cma *cma, size_t count);
+#else
+static void cma_sysfs_alloc_count(struct cma *cma, size_t count) {};
+static void cma_sysfs_fail(struct cma *cma, size_t count) {};
+#endif
 #endif
diff --git a/mm/cma_sysfs.c b/mm/cma_sysfs.c
new file mode 100644
index 000000000000..66df292cd6ca
--- /dev/null
+++ b/mm/cma_sysfs.c
@@ -0,0 +1,143 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * CMA SysFS Interface
+ *
+ * Copyright (c) 2021 Minchan Kim <minchan@kernel.org>
+ */
+
+#include <linux/cma.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include "cma.h"
+
+void cma_sysfs_alloc_count(struct cma *cma, size_t count)
+{
+	spin_lock(&cma->stat->lock);
+	cma->stat->alloc_attempt++;
+	cma->stat->pages_attempt += count;
+	spin_unlock(&cma->stat->lock);
+}
+
+void cma_sysfs_fail(struct cma *cma, size_t count)
+{
+	spin_lock(&cma->stat->lock);
+	cma->stat->alloc_fail++;
+	cma->stat->pages_fail += count;
+	spin_unlock(&cma->stat->lock);
+}
+
+#define CMA_ATTR_RO(_name) \
+	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+static struct kobject *cma_kobj;
+
+static ssize_t cma_alloc_attempt_show(struct kobject *kobj,
+			struct kobj_attribute *attr, char *buf)
+{
+	unsigned long val;
+	struct cma_stat *stat = container_of(kobj, struct cma_stat, kobj);
+
+	val = stat->alloc_attempt;
+
+	return sysfs_emit(buf, "%lu\n", val);
+}
+CMA_ATTR_RO(cma_alloc_attempt);
+
+static ssize_t cma_alloc_fail_show(struct kobject *kobj,
+			struct kobj_attribute *attr, char *buf)
+{
+	unsigned long val;
+	struct cma_stat *stat = container_of(kobj, struct cma_stat, kobj);
+
+	val = stat->alloc_fail;
+
+	return sysfs_emit(buf, "%lu\n", val);
+}
+CMA_ATTR_RO(cma_alloc_fail);
+
+static ssize_t alloc_pages_attempt_show(struct kobject *kobj,
+			struct kobj_attribute *attr, char *buf)
+{
+	unsigned long val;
+	struct cma_stat *stat = container_of(kobj, struct cma_stat, kobj);
+
+	val = stat->pages_attempt;
+
+	return sysfs_emit(buf, "%lu\n", val);
+}
+CMA_ATTR_RO(alloc_pages_attempt);
+
+static ssize_t alloc_pages_fail_show(struct kobject *kobj,
+			struct kobj_attribute *attr, char *buf)
+{
+	unsigned long val;
+	struct cma_stat *stat = container_of(kobj, struct cma_stat, kobj);
+
+	val = stat->pages_fail;
+
+	return sysfs_emit(buf, "%lu\n", val);
+}
+CMA_ATTR_RO(alloc_pages_fail);
+
+static void cma_kobj_release(struct kobject *kobj)
+{
+	struct cma_stat *stat = container_of(kobj, struct cma_stat, kobj);
+
+	kfree(stat);
+}
+
+static struct attribute *cma_attrs[] = {
+	&cma_alloc_attempt_attr.attr,
+	&cma_alloc_fail_attr.attr,
+	&alloc_pages_attempt_attr.attr,
+	&alloc_pages_fail_attr.attr,
+	NULL,
+};
+ATTRIBUTE_GROUPS(cma);
+
+static struct kobj_type cma_ktype = {
+	.release = cma_kobj_release,
+	.sysfs_ops = &kobj_sysfs_ops,
+	.default_groups = cma_groups
+};
+
+static int __init cma_sysfs_init(void)
+{
+	int i;
+	struct cma *cma;
+	struct cma_stat *stat;
+
+	cma_kobj = kobject_create_and_add("cma", mm_kobj);
+	if (!cma_kobj) {
+		pr_err("failed to create cma kobject\n");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < cma_area_count; i++) {
+		cma = &cma_areas[i];
+		stat = kzalloc(sizeof(*stat), GFP_KERNEL);
+		if (!stat)
+			goto out;
+
+		cma->stat = stat;
+		spin_lock_init(&stat->lock);
+		if (kobject_init_and_add(&stat->kobj, &cma_ktype,
+					cma_kobj, "%s", cma->name)) {
+			kobject_put(&stat->kobj);
+			goto out;
+		}
+	}
+
+	return 0;
+out:
+	while (--i >= 0) {
+		cma = &cma_areas[i];
+		kobject_put(&cma->stat->kobj);
+	}
+
+	kobject_put(cma_kobj);
+
+	return -ENOMEM;
+}
+subsys_initcall(cma_sysfs_init);