
[06/12] xen/common: add cache coloring allocator for domains

Message ID 20220826125111.152261-7-carlo.nonato@minervasys.tech (mailing list archive)
State Superseded
Series Arm cache coloring

Commit Message

Carlo Nonato Aug. 26, 2022, 12:51 p.m. UTC
This commit adds a new memory page allocator that implements the cache
coloring mechanism. The allocation algorithm follows the given color
configuration of the domain and maximizes contiguity in the page selection.

Pages are stored in a color-indexed data structure of lists, sorted by their
machine addresses, which are collectively called the colored heap. A simple
initialization function computes the color of any available page and inserts
it in the corresponding list. When a domain requests a page, the allocator
takes one from the subset of lists whose colors match the domain
configuration. It chooses the page with the highest machine address such that
contiguous pages are sequentially allocated, if this is made possible by a
color assignment which includes adjacent colors.

The allocator can only handle requests with order equal to 0, since the
single color granularity is represented in memory by one page.

The buddy allocator must coexist with the colored one because the Xen heap
isn't colored. For this reason a new Kconfig option and a command line
parameter are added to let the user set the amount of memory reserved for
the buddy allocator. Even when cache coloring is enabled, this memory isn't
managed by the colored allocator.
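
A sketch of what the command line hook amounts to (hypothetical, for
illustration only; the actual hunk is part of the page_alloc.c changes):

static int __init parse_buddy_alloc_size(const char *s)
{
    buddy_alloc_size = parse_size_and_unit(s, &s);

    return *s ? -EINVAL : 0;
}
custom_param("buddy-alloc-size", parse_buddy_alloc_size);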

Signed-off-by: Carlo Nonato <carlo.nonato@minervasys.tech>
Signed-off-by: Marco Solieri <marco.solieri@minervasys.tech>
---
 docs/misc/arm/cache-coloring.rst    |  43 +++++-
 docs/misc/xen-command-line.pandoc   |  14 ++
 xen/arch/arm/Kconfig                |  12 ++
 xen/arch/arm/coloring.c             |  10 ++
 xen/arch/arm/include/asm/coloring.h |   6 +
 xen/arch/arm/include/asm/mm.h       |   3 +
 xen/common/page_alloc.c             | 213 ++++++++++++++++++++++++++--
 7 files changed, 290 insertions(+), 11 deletions(-)

Comments

Jan Beulich Sept. 15, 2022, 1:13 p.m. UTC | #1
On 26.08.2022 14:51, Carlo Nonato wrote:
> This commit adds a new memory page allocator that implements the cache
> coloring mechanism. The allocation algorithm follows the given color
> configuration of the domain and maximizes contiguity in the page selection.
> 
> Pages are stored in a color-indexed data structure of lists, sorted by their
> machine addresses, that are collectively called the colored heap. A simple
> initialization function computes the color of any available page and inserts
> it in the corresponding list. When a domain requests a page, the allocator
> takes one from the subset of lists whose colors equals the domain
> configuration. It chooses the page with the highest machine address such that
> contiguous pages are sequentially allocated, if this is made possible by a
> color assignment which includes adjacent colors.
> 
> The allocator can handle only requests with order equals to 0 since the
> single color granularity is represented in memory by one page.

In the earlier paragraph you talk about contiguous pages using contiguous
colors. This paragraph then appears to say the opposite, i.e. that
contiguous multi-page allocations aren't possible. Which of the two is it?

> --- a/xen/arch/arm/Kconfig
> +++ b/xen/arch/arm/Kconfig
> @@ -147,6 +147,18 @@ config MAX_CACHE_COLORS
>  	  colors at boot. Note that if, at any time, a color configuration with more
>  	  colors than the maximum will be employed an error will be produced.
>  
> +config BUDDY_ALLOCATOR_SIZE
> +	string "Buddy allocator reserved memory size" if CACHE_COLORING
> +	default "64M" if CACHE_COLORING
> +	default "0M" if !CACHE_COLORING

I don't understand the purpose of this last line, nor the two earlier
"if". Why not simply

config BUDDY_ALLOCATOR_SIZE
	string "Buddy allocator reserved memory size"
	depends on CACHE_COLORING
	default "64M"

? Also does this really need to be a string, rather than a number (which
then doesn't need parsing) with e.g. MiB granularity?

Finally - how much of this is really Arm-specific? Shouldn't this be a
common config option, perhaps merely restricted to Arm by the top level
option (CACHE_COLORING?) depending on a further HAS_CACHE_COLORING,
which only Arm would select?
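
Roughly, the split could look like this (a sketch only; symbol names
and placement are hypothetical):

# xen/common/Kconfig
config HAS_CACHE_COLORING
	bool

config CACHE_COLORING
	bool "Last level cache coloring"
	depends on HAS_CACHE_COLORING

# xen/arch/arm/Kconfig
config ARM_64
	...
	select HAS_CACHE_COLORING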

> --- a/xen/arch/arm/coloring.c
> +++ b/xen/arch/arm/coloring.c
> @@ -300,6 +300,16 @@ void prepare_color_domain_config(struct xen_arch_domainconfig *config,
>      config->num_colors = (uint16_t)num;
>  }
>  
> +unsigned int page_to_color(struct page_info *pg)

The parameter will want to be pointer-to-const and I wonder whether ...

> +{
> +    return addr_to_color(page_to_maddr(pg));
> +}

... the function as a whole wouldn't be a good candidate for being an
inline one (requiring addr_to_color() to be available outside of
this file, of course).

> --- a/xen/arch/arm/include/asm/mm.h
> +++ b/xen/arch/arm/include/asm/mm.h
> @@ -143,6 +143,9 @@ struct page_info
>  #define PGC_count_width   PG_shift(10)
>  #define PGC_count_mask    ((1UL<<PGC_count_width)-1)
>  
> +#define _PGC_colored      PG_shift(11)
> +#define PGC_colored       PG_mask(1, 11)

I don't think this can work - you can't use bits already covered by
the count. You'll need to shift up PGC_count_{width,mask} by one and
insert your bit between PGC_extra and that. Or you could use one of
the lower unused ones, between PGC_static and PGC_broken.
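
Concretely, the first option could look like this (a sketch, assuming
PG_shift()/PG_mask() allocate bits downwards from the top of count_info
and that PGC_extra sits right above the present count):

#define _PGC_colored      PG_shift(11)
#define PGC_colored       PG_mask(1, 11)
/* Count of references to this frame, now one bit narrower. */
#define PGC_count_width   PG_shift(11)
#define PGC_count_mask    ((1UL<<PGC_count_width)-1)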

> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -150,6 +150,9 @@
>  #define p2m_pod_offline_or_broken_hit(pg) 0
>  #define p2m_pod_offline_or_broken_replace(pg) BUG_ON(pg != NULL)
>  #endif
> +#ifdef CONFIG_CACHE_COLORING
> +#include <asm/coloring.h>
> +#endif
>  
>  #ifndef PGC_static
>  #define PGC_static 0
> @@ -231,6 +234,9 @@ static bool __read_mostly scrub_debug;
>  #define scrub_debug    false
>  #endif
>  
> +/* Memory required for buddy allocator to work with colored one */
> +static unsigned long __initdata buddy_alloc_size;
> +
>  /*
>   * Bit width of the DMA heap -- used to override NUMA-node-first.
>   * allocation strategy, which can otherwise exhaust low memory.
> @@ -440,7 +446,172 @@ mfn_t __init alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align)
>      BUG();
>  }
>  
> +static DEFINE_SPINLOCK(heap_lock);
>  
> +/* Initialise fields which have other uses for free pages. */
> +static void init_free_page_fields(struct page_info *pg)
> +{
> +    pg->u.inuse.type_info = PGT_TYPE_INFO_INITIALIZER;
> +    page_set_owner(pg, NULL);
> +}
> +
> +static bool is_page_colored(struct page_info *pg)
> +{
> +    return pg->count_info & PGC_colored;
> +}
> +
> +#ifdef CONFIG_CACHE_COLORING
> +/*************************
> + * COLORED SIDE-ALLOCATOR
> + *
> + * Pages are stored by their color in separate lists. Each list defines a color
> + * and it is initialized during end_boot_allocator, where each page's color
> + * is calculated and the page itself is put in the correct list.
> + * After initialization there will be N lists where N is the number of maximum
> + * available colors on the platform.
> + */

Nit: Perhaps either "maximum number of colors" or "number of available
colors"?

> +typedef struct page_list_head colored_pages_t;
> +static colored_pages_t __ro_after_init *color_heap;

Please place the attribute at its canonical slot:

static colored_pages_t *__ro_after_init color_heap;

It applies to the variable, not to the pointed-to type.

> +#define colored_pages(color) &color_heap[(color)]

The parentheses want to move:

#define colored_pages(color) (&color_heap[color])

> +static void color_heap_insert_page(struct page_info *pg)
> +{
> +    struct page_info *pos;
> +    struct page_list_head *head = colored_pages(page_to_color(pg));
> +
> +    pg->count_info |= PGC_colored;

The function isn't marked __init, so runtime correctness as to the
(non-atomic) update here wants clarifying.

> +    /* Insert page in list in ascending machine address order */

Isn't it descending order that you actually use, also seeing that
you take the first page of a list when allocating (further down)?

> +    page_list_for_each( pos, head )
> +    {
> +        if ( page_to_maddr(pos) < page_to_maddr(pg) )
> +        {
> +            head = &pos->list;
> +            break;
> +        }
> +    }

Wait - a linear search for every single page insertion? How well
is that going to perform on a multi-terabyte system?

> +    page_list_add_tail(pg, head);

page_list_head and page_list_entry are generally different things,
so I'm afraid this isn't correct in the common case, and you're
getting away with it only because only Arm currently enables this
code.

> +}
> +
> +static void color_heap_remove_page(struct page_info *pg)
> +{
> +    page_list_del(pg, colored_pages(page_to_color(pg)));
> +}
> +
> +static void __init init_col_heap_pages(struct page_info *pg,
> +                                       unsigned long nr_pages)
> +{
> +    unsigned int i;
> +
> +    if ( !color_heap )
> +    {
> +        unsigned int max_colors = get_max_colors();
> +        color_heap = xmalloc_array(colored_pages_t, max_colors);

Nit: Please always have a blank line between declaration(s) and
statement(s).

> +        BUG_ON(!color_heap);
> +
> +        for ( i = 0; i < max_colors; i++ )
> +            INIT_PAGE_LIST_HEAD(colored_pages(i));
> +
> +        if ( !buddy_alloc_size )
> +            buddy_alloc_size = parse_size_and_unit(CONFIG_BUDDY_ALLOCATOR_SIZE,
> +                                                   NULL);
> +    }
> +
> +    printk(XENLOG_INFO "Init color heap with %lu pages\n", nr_pages);
> +    printk(XENLOG_INFO "Paging starting from: 0x%"PRIx64"\n",
> +           page_to_maddr(pg));

"Paging"? And please prefer %# over 0x% for hex formatting, as we do
elsewhere.
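
I.e. (with whatever wording was meant in place of "Paging"):

printk(XENLOG_INFO "Color heap starting from: %#"PRIx64"\n",
       page_to_maddr(pg));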

> +    for ( i = 0; i < nr_pages; i++ )
> +        color_heap_insert_page(pg++);
> +}
> +
> +/* Alloc one page based on domain color configuration */
> +static struct page_info *alloc_col_heap_page(unsigned int memflags,
> +                                             const unsigned int *colors,
> +                                             unsigned int num_colors)

Array instead of pointer notation would better reflect the purpose.

> +{
> +    struct page_info *pg = NULL;
> +    unsigned int i;
> +    bool need_tlbflush = false;
> +    uint32_t tlbflush_timestamp = 0;
> +
> +    spin_lock(&heap_lock);
> +
> +    for ( i = 0; i < num_colors; i++ )
> +    {
> +        struct page_info *tmp;
> +
> +        if ( page_list_empty(colored_pages(colors[i])) )
> +            continue;
> +
> +        tmp = page_list_first(colored_pages(colors[i]));
> +        if ( !pg || page_to_maddr(tmp) > page_to_maddr(pg) )
> +            pg = tmp;
> +    }
> +
> +    if ( !pg )
> +    {
> +        spin_unlock(&heap_lock);
> +        return NULL;
> +    }
> +
> +    pg->count_info = PGC_state_inuse;

Aren't you losing PGC_colored here?

> +static struct page_info *alloc_col_domheap_page(struct domain *d,
> +                                                unsigned int memflags)
> +{
> +    struct page_info *pg;
> +
> +    ASSERT(!in_irq());

ASSERT_ALLOC_CONTEXT()? Albeit that's redundant then with the sole caller.

> +#else
> +static void free_col_domheap_page(struct page_info *pg)
> +{
> +    return;
> +}

No need for "return" here.

> @@ -1939,11 +2107,24 @@ void __init end_boot_allocator(void)
>              break;
>          }
>      }
> -    for ( i = nr_bootmem_regions; i-- > 0; )
> +
> +    for ( i = 0; i < nr_bootmem_regions; i++ )

I'm afraid you can't simply go and invert the direction this loop works
without any justification. It's not even clear why you need to work
forwards in the colored case.

Jan
Carlo Nonato Sept. 16, 2022, 4:05 p.m. UTC | #2
Hi Jan,

On Thu, Sep 15, 2022 at 3:13 PM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 26.08.2022 14:51, Carlo Nonato wrote:
> > This commit adds a new memory page allocator that implements the cache
> > coloring mechanism. The allocation algorithm follows the given color
> > configuration of the domain and maximizes contiguity in the page selection.
> >
> > Pages are stored in a color-indexed data structure of lists, sorted by their
> > machine addresses, that are collectively called the colored heap. A simple
> > initialization function computes the color of any available page and inserts
> > it in the corresponding list. When a domain requests a page, the allocator
> > takes one from the subset of lists whose colors equals the domain
> > configuration. It chooses the page with the highest machine address such that
> > contiguous pages are sequentially allocated, if this is made possible by a
> > color assignment which includes adjacent colors.
> >
> > The allocator can handle only requests with order equals to 0 since the
> > single color granularity is represented in memory by one page.
>
> In the earlier paragraph you talk about contiguous pages using contiguous
> colors. This paragraph then appears to say the opposite, i.e. that
> contiguous multi-page allocations aren't possible. Which of the two is it?

Here there's a little confusion, you're right. It's probably better explained
in the documentation that comes with this very patch: with a contiguous
color assignment, we *could* allocate contiguous pages using an "order"
parameter greater than 0, but as a first implementation we simply decided not
to handle such cases and to allow order-0 pages only.
However, if two order-0 pages are requested in succession, the allocator tries
to allocate them using different colors so that the cache is used more
efficiently; this is what I meant by "maximizes contiguity".
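
As a concrete illustration (numbers are an example only, assuming a
page's color is its page index modulo the number of colors):

/* 4 KiB pages, 16 colors: colors repeat every 64 KiB of memory. */
unsigned int color = (maddr >> PAGE_SHIFT) & 15;

An aligned 8 KiB (order-1) block then spans two consecutive page
indexes, i.e. two adjacent colors {c, c+1}, which is why a domain
owning such an adjacent pair could in principle be served order-1
pages.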

> > --- a/xen/arch/arm/Kconfig
> > +++ b/xen/arch/arm/Kconfig
> > @@ -147,6 +147,18 @@ config MAX_CACHE_COLORS
> >         colors at boot. Note that if, at any time, a color configuration with more
> >         colors than the maximum will be employed an error will be produced.
> >
> > +config BUDDY_ALLOCATOR_SIZE
> > +     string "Buddy allocator reserved memory size" if CACHE_COLORING
> > +     default "64M" if CACHE_COLORING
> > +     default "0M" if !CACHE_COLORING
>
> I don't understand the purpose of this last line, nor the two earlier
> "if". Why not simply
>
> config BUDDY_ALLOCATOR_SIZE
>         string "Buddy allocator reserved memory size"
>         depend on CACHE_COLORING
>         default "64M"

This was just to have a value for the config option even with cache coloring
disabled. All those ifs emulate the "depends on" keyword, but let
CONFIG_BUDDY_ALLOCATOR_SIZE take "0M" when coloring is disabled. With just
"depends on" the macro isn't defined at all. I know that this can be
handled with some simple #ifdefs, but I found this way more elegant.
I'm not an expert here, so if you prefer the other way or a whole different
one (more readable / better fitted) please let me know.

> ? Also does this really need to be a string, rather than a number (which
> then doesn't need parsing) with e.g. MiB granularity?

Yeah, it's easier as you said. I don't think there's a need for a value
below 1 MiB.

> Finally - how much of this is really Arm-specific? Shouldn't this be a
> common config option, perhaps merely restricted to Arm by the top level
> option (CACHE_COLORING?) depending on a further HAS_CACHE_COLORING,
> which only Arm would select?

I'm sorry, but I don't understand your suggestion. BUDDY_ALLOCATOR_SIZE
is Arm-specific because CACHE_COLORING is. In fact it depends only on that
config value and not on the Arm config directly. Why would someone limit
the buddy allocator when coloring isn't enabled?
I've lost you on HAS_CACHE_COLORING. Why should the Arm config select that
one? Cache coloring must remain optional. I'm probably missing something.

> > --- a/xen/arch/arm/coloring.c
> > +++ b/xen/arch/arm/coloring.c
> > @@ -300,6 +300,16 @@ void prepare_color_domain_config(struct xen_arch_domainconfig *config,
> >      config->num_colors = (uint16_t)num;
> >  }
> >
> > +unsigned int page_to_color(struct page_info *pg)
>
> The parameter will want to be pointer-to-const and I wonder whether ...
>
> > +{
> > +    return addr_to_color(page_to_maddr(pg));
> > +}
>
> ... the function as a whole wouldn't be a good candidate for being an
> inline one (requiring addr_to_color() to be available in outside of
> this file, of course).

You mean defining it as static inline in the coloring.h header?

> > --- a/xen/arch/arm/include/asm/mm.h
> > +++ b/xen/arch/arm/include/asm/mm.h
> > @@ -143,6 +143,9 @@ struct page_info
> >  #define PGC_count_width   PG_shift(10)
> >  #define PGC_count_mask    ((1UL<<PGC_count_width)-1)
> >
> > +#define _PGC_colored      PG_shift(11)
> > +#define PGC_colored       PG_mask(1, 11)
>
> I don't think this can work - you can't use bits already covered by
> the count. You'll need to shift up PGC_count_{width,mask} by one and
> insert your bit between PGC_extra and that. Or you could use one of
> the lower unused ones, between PGC_static and PGC_broken.

Yes you're right. I misinterpreted those values. Shame on me.

> > --- a/xen/common/page_alloc.c
> > +++ b/xen/common/page_alloc.c
> > @@ -150,6 +150,9 @@
> >  #define p2m_pod_offline_or_broken_hit(pg) 0
> >  #define p2m_pod_offline_or_broken_replace(pg) BUG_ON(pg != NULL)
> >  #endif
> > +#ifdef CONFIG_CACHE_COLORING
> > +#include <asm/coloring.h>
> > +#endif
> >
> >  #ifndef PGC_static
> >  #define PGC_static 0
> > @@ -231,6 +234,9 @@ static bool __read_mostly scrub_debug;
> >  #define scrub_debug    false
> >  #endif
> >
> > +/* Memory required for buddy allocator to work with colored one */
> > +static unsigned long __initdata buddy_alloc_size;
> > +
> >  /*
> >   * Bit width of the DMA heap -- used to override NUMA-node-first.
> >   * allocation strategy, which can otherwise exhaust low memory.
> > @@ -440,7 +446,172 @@ mfn_t __init alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align)
> >      BUG();
> >  }
> >
> > +static DEFINE_SPINLOCK(heap_lock);
> >
> > +/* Initialise fields which have other uses for free pages. */
> > +static void init_free_page_fields(struct page_info *pg)
> > +{
> > +    pg->u.inuse.type_info = PGT_TYPE_INFO_INITIALIZER;
> > +    page_set_owner(pg, NULL);
> > +}
> > +
> > +static bool is_page_colored(struct page_info *pg)
> > +{
> > +    return pg->count_info & PGC_colored;
> > +}
> > +
> > +#ifdef CONFIG_CACHE_COLORING
> > +/*************************
> > + * COLORED SIDE-ALLOCATOR
> > + *
> > + * Pages are stored by their color in separate lists. Each list defines a color
> > + * and it is initialized during end_boot_allocator, where each page's color
> > + * is calculated and the page itself is put in the correct list.
> > + * After initialization there will be N lists where N is the number of maximum
> > + * available colors on the platform.
> > + */
>
> Nit: Perhaps either "maximum number of colors" or "number of available
> colors"?
>
> > +typedef struct page_list_head colored_pages_t;
> > +static colored_pages_t __ro_after_init *color_heap;
>
> Please place the attribute at its canonical slot:
>
> static colored_pages_t *__ro_after_init color_heap;
>
> It applies to the variable, not to the pointed-to type.
>
> > +#define colored_pages(color) &color_heap[(color)]
>
> The parentheses want to move:
>
> #define colored_pages(color) (&color_heap[color])
>
> > +static void color_heap_insert_page(struct page_info *pg)
> > +{
> > +    struct page_info *pos;
> > +    struct page_list_head *head = colored_pages(page_to_color(pg));
> > +
> > +    pg->count_info |= PGC_colored;
>
> The function isn't marked __init, so runtime correctness as to the
> (non-atomic) update here wants clarifying.

Yes. I need to check and probably add a spin lock for the color heap.

> > +    /* Insert page in list in ascending machine address order */
>
> Isn't is descending order that you actually use, also seeing that
> you take the first page of a list when allocating (further down)?

Yes you're right. Forgot to change the comment.

> > +    page_list_for_each( pos, head )
> > +    {
> > +        if ( page_to_maddr(pos) < page_to_maddr(pg) )
> > +        {
> > +            head = &pos->list;
> > +            break;
> > +        }
> > +    }
>
> Wait - a linear search for every single page insertion? How well
> is that going to perform on a multi-terabyte system?

For our test cases (embedded systems) the linear search is good enough.
I agree with you that in the general case this is bad (even though the main
targets are indeed embedded systems).
Are there any data structures already available that we could exploit to get
better performance?

> > +    page_list_add_tail(pg, head);
>
> page_list_head and page_list_entry are generally different things,
> so I'm afraid this isn't correct in the common case, and you're
> getting away with it only because only Arm currently enables this
> code.

So how should one insert in the middle of the list?

>
> > +}
> > +
> > +static void color_heap_remove_page(struct page_info *pg)
> > +{
> > +    page_list_del(pg, colored_pages(page_to_color(pg)));
> > +}
> > +
> > +static void __init init_col_heap_pages(struct page_info *pg,
> > +                                       unsigned long nr_pages)
> > +{
> > +    unsigned int i;
> > +
> > +    if ( !color_heap )
> > +    {
> > +        unsigned int max_colors = get_max_colors();
> > +        color_heap = xmalloc_array(colored_pages_t, max_colors);
>
> Nit: Please always have a blank line between declaration(s) and
> statement(s).
>
> > +        BUG_ON(!color_heap);
> > +
> > +        for ( i = 0; i < max_colors; i++ )
> > +            INIT_PAGE_LIST_HEAD(colored_pages(i));
> > +
> > +        if ( !buddy_alloc_size )
> > +            buddy_alloc_size = parse_size_and_unit(CONFIG_BUDDY_ALLOCATOR_SIZE,
> > +                                                   NULL);
> > +    }
> > +
> > +    printk(XENLOG_INFO "Init color heap with %lu pages\n", nr_pages);
> > +    printk(XENLOG_INFO "Paging starting from: 0x%"PRIx64"\n",
> > +           page_to_maddr(pg));
>
> "Paging"? And please prefer %# over 0x% for hex formatting, as we do
> elsewhere.
>
> > +    for ( i = 0; i < nr_pages; i++ )
> > +        color_heap_insert_page(pg++);
> > +}
> > +
> > +/* Alloc one page based on domain color configuration */
> > +static struct page_info *alloc_col_heap_page(unsigned int memflags,
> > +                                             const unsigned int *colors,
> > +                                             unsigned int num_colors)
>
> Array instead of pointer notation would better reflect the purpose.
>
> > +{
> > +    struct page_info *pg = NULL;
> > +    unsigned int i;
> > +    bool need_tlbflush = false;
> > +    uint32_t tlbflush_timestamp = 0;
> > +
> > +    spin_lock(&heap_lock);
> > +
> > +    for ( i = 0; i < num_colors; i++ )
> > +    {
> > +        struct page_info *tmp;
> > +
> > +        if ( page_list_empty(colored_pages(colors[i])) )
> > +            continue;
> > +
> > +        tmp = page_list_first(colored_pages(colors[i]));
> > +        if ( !pg || page_to_maddr(tmp) > page_to_maddr(pg) )
> > +            pg = tmp;
> > +    }
> > +
> > +    if ( !pg )
> > +    {
> > +        spin_unlock(&heap_lock);
> > +        return NULL;
> > +    }
> > +
> > +    pg->count_info = PGC_state_inuse;
>
> Aren't you losing PGC_colored here?

Right. Pretty nasty bug you found.

> > +static struct page_info *alloc_col_domheap_page(struct domain *d,
> > +                                                unsigned int memflags)
> > +{
> > +    struct page_info *pg;
> > +
> > +    ASSERT(!in_irq());
>
> ASSERT_ALLOC_CONTEXT()? Albeit that's redundant then with the sole caller.
>
> > +#else
> > +static void free_col_domheap_page(struct page_info *pg)
> > +{
> > +    return;
> > +}
>
> No need for "return" here.
>
> > @@ -1939,11 +2107,24 @@ void __init end_boot_allocator(void)
> >              break;
> >          }
> >      }
> > -    for ( i = nr_bootmem_regions; i-- > 0; )
> > +
> > +    for ( i = 0; i < nr_bootmem_regions; i++ )
>
> I'm afraid you can't simply go and invert the direction this loop works
> without any justification. It's not even clear why you need to work
> forwards in the colored case.

The order was inverted because I'm assuming bootmem regions are stored in
ascending order (this should be the case looking at bootmem_region_add();
am I wrong?) and (second assumption) pages taken from each memory region
are sorted in ascending order relative to their machine address.
This means that color_heap_insert_page() is called (at least during
end_boot_allocator()) with ever-increasing machine addresses, so the
linear search should be O(1).
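
If it helps, that behaviour could even be made explicit with an O(1)
fast path (just a sketch):

/* Boot-time feed is in ascending maddr order and each list is kept
 * sorted in descending order, so a new page normally belongs at the
 * head of its list. */
if ( page_list_empty(head) ||
     page_to_maddr(page_list_first(head)) < page_to_maddr(pg) )
{
    page_list_add(pg, head);
    return;
}
/* Otherwise fall back to the linear search. */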

> Jan

Thanks.

- Carlo Nonato
Jan Beulich Sept. 19, 2022, 6:26 a.m. UTC | #3
On 16.09.2022 18:05, Carlo Nonato wrote:
> On Thu, Sep 15, 2022 at 3:13 PM Jan Beulich <jbeulich@suse.com> wrote:
>> On 26.08.2022 14:51, Carlo Nonato wrote:
>>> --- a/xen/arch/arm/Kconfig
>>> +++ b/xen/arch/arm/Kconfig
>>> @@ -147,6 +147,18 @@ config MAX_CACHE_COLORS
>>>         colors at boot. Note that if, at any time, a color configuration with more
>>>         colors than the maximum will be employed an error will be produced.
>>>
>>> +config BUDDY_ALLOCATOR_SIZE
>>> +     string "Buddy allocator reserved memory size" if CACHE_COLORING
>>> +     default "64M" if CACHE_COLORING
>>> +     default "0M" if !CACHE_COLORING
>>
>> I don't understand the purpose of this last line, nor the two earlier
>> "if". Why not simply
>>
>> config BUDDY_ALLOCATOR_SIZE
>>         string "Buddy allocator reserved memory size"
>>         depend on CACHE_COLORING
>>         default "64M"
> 
> This was just to have a value for the config option even with cache coloring
> disabled. All those ifs emulate the "depends on" keyword, but let the
> CONFIG_BUDDY_ALLOCATOR_SIZE takes "0M" when coloring is disabled. With just
> the "depends on" the macro isn't defined at all. I know that this can be
> handled with some simple #ifdef, but I found this way to be more elegant.
> Not an expert here so if you prefer the other way or a whole different one
> (more readable/better fitted) please let me know.

As far as I saw, the sole use was already inside a suitable #ifdef. Hence
yes, I clearly would see "depends on" as the better fit. Please also don't
forget that if later cache coloring would be enabled for another
architecture, that default of zero (pre-recorded in a .config) would get
in the way; one would need to manually change it (and remember to do so).
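
I.e. with "depends on" in place the sole use site keeps compiling, as
it already sits under the matching guard (a sketch mirroring that hunk):

#ifdef CONFIG_CACHE_COLORING
    if ( !buddy_alloc_size )
        buddy_alloc_size = parse_size_and_unit(CONFIG_BUDDY_ALLOCATOR_SIZE,
                                               NULL);
#endif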

>> Finally - how much of this is really Arm-specific? Shouldn't this be a
>> common config option, perhaps merely restricted to Arm by the top level
>> option (CACHE_COLORING?) depending on a further HAS_CACHE_COLORING,
>> which only Arm would select?
> 
> I'm sorry, but I don't understand your suggestion. BUDDY_ALLOCATOR_SIZE
> is Arm specific because CACHE_COLORING is. In fact it depends only on this
> last config value and not on Arm config directly. Why should someone limit
> the buddy allocator when coloring isn't enabled?

My comment wasn't on this one setting alone, but on the coloring ones as a
set.

> I've lost you on the HAS_CACHE_COLORING. Why should Arm config select this
> one? Cache coloring must remain optional. I'm probably missing something.

HAS_* settings only express what an arch is capable of; they don't replace
dependent options which actually are user-selectable. (That said, we have
a number where there's no user selection possible, but that's not of
interest here.)

>>> --- a/xen/arch/arm/coloring.c
>>> +++ b/xen/arch/arm/coloring.c
>>> @@ -300,6 +300,16 @@ void prepare_color_domain_config(struct xen_arch_domainconfig *config,
>>>      config->num_colors = (uint16_t)num;
>>>  }
>>>
>>> +unsigned int page_to_color(struct page_info *pg)
>>
>> The parameter will want to be pointer-to-const and I wonder whether ...
>>
>>> +{
>>> +    return addr_to_color(page_to_maddr(pg));
>>> +}
>>
>> ... the function as a whole wouldn't be a good candidate for being an
>> inline one (requiring addr_to_color() to be available in outside of
>> this file, of course).
> 
> You mean defining it as static inline in the coloring.h header?

That would seem preferable for a simple function like this one.

>>> +static void color_heap_insert_page(struct page_info *pg)
>>> +{
>>> +    struct page_info *pos;
>>> +    struct page_list_head *head = colored_pages(page_to_color(pg));
>>> +
>>> +    pg->count_info |= PGC_colored;
>>
>> The function isn't marked __init, so runtime correctness as to the
>> (non-atomic) update here wants clarifying.
> 
> Yes. I need to check and probably add a spin lock for the color heap.

I'm afraid a spin lock won't help. May I suggest you look at some of
the staticmem discussions that had happened, including a similar
topic. (Sorry, I don't have a link at hand to the exact mail.)

>>> +    page_list_for_each( pos, head )
>>> +    {
>>> +        if ( page_to_maddr(pos) < page_to_maddr(pg) )
>>> +        {
>>> +            head = &pos->list;
>>> +            break;
>>> +        }
>>> +    }
>>
>> Wait - a linear search for every single page insertion? How well
>> is that going to perform on a multi-terabyte system?
> 
> For our test cases (embedded systems) the linear search is good enough.
> I agree with you that in the general case this is bad (even though the main
> targets are indeed embedded systems).
> Are there any already available data structures that we can exploit to get
> better performances?

I'm afraid there aren't any that I would see as a good fit here.

>>> +    page_list_add_tail(pg, head);
>>
>> page_list_head and page_list_entry are generally different things,
>> so I'm afraid this isn't correct in the common case, and you're
>> getting away with it only because only Arm currently enables this
>> code.
> 
> So how to insert in the middle of the list?

That likely would require a new page_list_*() helper function. So
far we simply had no need to insert at other than head or tail,
iirc.
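
For the plain list_head backed page lists (the Arm case) such a helper
could be as simple as the following sketch; the pdx-compressed x86
variant would need its own logic:

static inline void page_list_add_before(struct page_info *pg,
                                        struct page_info *pos)
{
    /* In a circular list, linking tail-wise at pos's entry puts pg
     * immediately ahead of pos. */
    list_add_tail(&pg->list, &pos->list);
}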

>>> @@ -1939,11 +2107,24 @@ void __init end_boot_allocator(void)
>>>              break;
>>>          }
>>>      }
>>> -    for ( i = nr_bootmem_regions; i-- > 0; )
>>> +
>>> +    for ( i = 0; i < nr_bootmem_regions; i++ )
>>
>> I'm afraid you can't simply go and invert the direction this loop works
>> without any justification. It's not even clear why you need to work
>> forwards in the colored case.
> 
> The order was inverted because I'm assuming bootmem regions are stored in
> ascending order (this should be the case looking at bootmem_region_add().
> Am I wrong?) and (second assumption) pages taken from each memory region
> are sorted in ascending order relatively to their machine address.
> This means that color_heap_insert_page() is called (at least during
> end_boot_allocator()) with always increasing machine addresses and so the
> linear search should be O(1).

Indeed that was my guess. Yet the present order of processing is there
for a reason, so you need to both retain it and supply justification
(perhaps by way of a brief comment) why you need alternative code for
your allocator. Even better would of course be if insertion was more
efficient and didn't require the logic here to specifically take care.

Jan
Stefano Stabellini Sept. 19, 2022, 10:42 p.m. UTC | #4
On Mon, 19 Sep 2022, Jan Beulich wrote:
> On 16.09.2022 18:05, Carlo Nonato wrote:
> > On Thu, Sep 15, 2022 at 3:13 PM Jan Beulich <jbeulich@suse.com> wrote:
> >> On 26.08.2022 14:51, Carlo Nonato wrote:
> >>> --- a/xen/arch/arm/Kconfig
> >>> +++ b/xen/arch/arm/Kconfig
> >>> @@ -147,6 +147,18 @@ config MAX_CACHE_COLORS
> >>>         colors at boot. Note that if, at any time, a color configuration with more
> >>>         colors than the maximum will be employed an error will be produced.
> >>>
> >>> +config BUDDY_ALLOCATOR_SIZE
> >>> +     string "Buddy allocator reserved memory size" if CACHE_COLORING
> >>> +     default "64M" if CACHE_COLORING
> >>> +     default "0M" if !CACHE_COLORING
> >>
> >> I don't understand the purpose of this last line, nor the two earlier
> >> "if". Why not simply
> >>
> >> config BUDDY_ALLOCATOR_SIZE
> >>         string "Buddy allocator reserved memory size"
> >>         depend on CACHE_COLORING
> >>         default "64M"
> > 
> > This was just to have a value for the config option even with cache coloring
> > disabled. All those ifs emulate the "depends on" keyword, but let the
> > CONFIG_BUDDY_ALLOCATOR_SIZE takes "0M" when coloring is disabled. With just
> > the "depends on" the macro isn't defined at all. I know that this can be
> > handled with some simple #ifdef, but I found this way to be more elegant.
> > Not an expert here so if you prefer the other way or a whole different one
> > (more readable/better fitted) please let me know.
> 
> As far as I saw, the sole use was already inside a suitable #ifdef. Hence
> yes, I clearly would see "depends on" as the better fit. Please also don't
> forget that if later cache coloring would be enabled for another
> architecture, that default of zero (pre-recorded in a .config) would get
> in the way; one would need to manually change it (and remember to do so).
> 
> >> Finally - how much of this is really Arm-specific? Shouldn't this be a
> >> common config option, perhaps merely restricted to Arm by the top level
> >> option (CACHE_COLORING?) depending on a further HAS_CACHE_COLORING,
> >> which only Arm would select?
> > 
> > I'm sorry, but I don't understand your suggestion. BUDDY_ALLOCATOR_SIZE
> > is Arm specific because CACHE_COLORING is. In fact it depends only on this
> > last config value and not on Arm config directly. Why should someone limit
> > the buddy allocator when coloring isn't enabled?
> 
> My comment wasn't on this on setting alone, but on the coloring ones as a
> set.
> 
> > I've lost you on the HAS_CACHE_COLORING. Why should Arm config select this
> > one? Cache coloring must remain optional. I'm probably missing something.
> 
> HAS_* settings only express what an arch is capable of; they don't replace
> dependent options which actually are user-selectable. (That said, we have
> a number where there's no user selection possible, but that's not of
> interest here.)
> 
> >>> --- a/xen/arch/arm/coloring.c
> >>> +++ b/xen/arch/arm/coloring.c
> >>> @@ -300,6 +300,16 @@ void prepare_color_domain_config(struct xen_arch_domainconfig *config,
> >>>      config->num_colors = (uint16_t)num;
> >>>  }
> >>>
> >>> +unsigned int page_to_color(struct page_info *pg)
> >>
> >> The parameter will want to be pointer-to-const and I wonder whether ...
> >>
> >>> +{
> >>> +    return addr_to_color(page_to_maddr(pg));
> >>> +}
> >>
> >> ... the function as a whole wouldn't be a good candidate for being an
> >> inline one (requiring addr_to_color() to be available in outside of
> >> this file, of course).
> > 
> > You mean defining it as static inline in the coloring.h header?
> 
> That would seem preferable for a simple function like this one.
> 
> >>> +static void color_heap_insert_page(struct page_info *pg)
> >>> +{
> >>> +    struct page_info *pos;
> >>> +    struct page_list_head *head = colored_pages(page_to_color(pg));
> >>> +
> >>> +    pg->count_info |= PGC_colored;
> >>
> >> The function isn't marked __init, so runtime correctness as to the
> >> (non-atomic) update here wants clarifying.
> > 
> > Yes. I need to check and probably add a spin lock for the color heap.
> 
> I'm afraid a spin lock won't help. May I suggest you look at some of
> the staticmem discussions that had happened, including a similar
> topic. (Sorry, I don't have a link at hand to the exact mail.)

I searched through the recent staticmem discussions to try to provide a
helpful link for Carlo, but I don't think I managed to find what you had
in mind. I found these two lock-related emails:

https://marc.info/?l=xen-devel&m=165476642832402
https://marc.info/?l=xen-devel&m=165632461420257

If they are not relevant, could you please provide a few more details?
Jan Beulich Sept. 20, 2022, 7:54 a.m. UTC | #5
On 20.09.2022 00:42, Stefano Stabellini wrote:
> On Mon, 19 Sep 2022, Jan Beulich wrote:
>> On 16.09.2022 18:05, Carlo Nonato wrote:
>>> On Thu, Sep 15, 2022 at 3:13 PM Jan Beulich <jbeulich@suse.com> wrote:
>>>> On 26.08.2022 14:51, Carlo Nonato wrote:
>>>>> --- a/xen/arch/arm/Kconfig
>>>>> +++ b/xen/arch/arm/Kconfig
>>>>> @@ -147,6 +147,18 @@ config MAX_CACHE_COLORS
>>>>>         colors at boot. Note that if, at any time, a color configuration with more
>>>>>         colors than the maximum will be employed an error will be produced.
>>>>>
>>>>> +config BUDDY_ALLOCATOR_SIZE
>>>>> +     string "Buddy allocator reserved memory size" if CACHE_COLORING
>>>>> +     default "64M" if CACHE_COLORING
>>>>> +     default "0M" if !CACHE_COLORING
>>>>
>>>> I don't understand the purpose of this last line, nor the two earlier
>>>> "if". Why not simply
>>>>
>>>> config BUDDY_ALLOCATOR_SIZE
>>>>         string "Buddy allocator reserved memory size"
>>>>         depend on CACHE_COLORING
>>>>         default "64M"
>>>
>>> This was just to have a value for the config option even with cache coloring
>>> disabled. All those ifs emulate the "depends on" keyword, but let the
>>> CONFIG_BUDDY_ALLOCATOR_SIZE takes "0M" when coloring is disabled. With just
>>> the "depends on" the macro isn't defined at all. I know that this can be
>>> handled with some simple #ifdef, but I found this way to be more elegant.
>>> Not an expert here so if you prefer the other way or a whole different one
>>> (more readable/better fitted) please let me know.
>>
>> As far as I saw, the sole use was already inside a suitable #ifdef. Hence
>> yes, I clearly would see "depends on" as the better fit. Please also don't
>> forget that if later cache coloring would be enabled for another
>> architecture, that default of zero (pre-recorded in a .config) would get
>> in the way; one would need to manually change it (and remember to do so).
>>
>>>> Finally - how much of this is really Arm-specific? Shouldn't this be a
>>>> common config option, perhaps merely restricted to Arm by the top level
>>>> option (CACHE_COLORING?) depending on a further HAS_CACHE_COLORING,
>>>> which only Arm would select?
>>>
>>> I'm sorry, but I don't understand your suggestion. BUDDY_ALLOCATOR_SIZE
>>> is Arm specific because CACHE_COLORING is. In fact it depends only on this
>>> last config value and not on Arm config directly. Why should someone limit
>>> the buddy allocator when coloring isn't enabled?
>>
>> My comment wasn't on this on setting alone, but on the coloring ones as a
>> set.
>>
>>> I've lost you on the HAS_CACHE_COLORING. Why should Arm config select this
>>> one? Cache coloring must remain optional. I'm probably missing something.
>>
>> HAS_* settings only express what an arch is capable of; they don't replace
>> dependent options which actually are user-selectable. (That said, we have
>> a number where there's no user selection possible, but that's not of
>> interest here.)
>>
>>>>> --- a/xen/arch/arm/coloring.c
>>>>> +++ b/xen/arch/arm/coloring.c
>>>>> @@ -300,6 +300,16 @@ void prepare_color_domain_config(struct xen_arch_domainconfig *config,
>>>>>      config->num_colors = (uint16_t)num;
>>>>>  }
>>>>>
>>>>> +unsigned int page_to_color(struct page_info *pg)
>>>>
>>>> The parameter will want to be pointer-to-const and I wonder whether ...
>>>>
>>>>> +{
>>>>> +    return addr_to_color(page_to_maddr(pg));
>>>>> +}
>>>>
>>>> ... the function as a whole wouldn't be a good candidate for being an
>>>> inline one (requiring addr_to_color() to be available in outside of
>>>> this file, of course).
>>>
>>> You mean defining it as static inline in the coloring.h header?
>>
>> That would seem preferable for a simple function like this one.
>>
>>>>> +static void color_heap_insert_page(struct page_info *pg)
>>>>> +{
>>>>> +    struct page_info *pos;
>>>>> +    struct page_list_head *head = colored_pages(page_to_color(pg));
>>>>> +
>>>>> +    pg->count_info |= PGC_colored;
>>>>
>>>> The function isn't marked __init, so runtime correctness as to the
>>>> (non-atomic) update here wants clarifying.
>>>
>>> Yes. I need to check and probably add a spin lock for the color heap.
>>
>> I'm afraid a spin lock won't help. May I suggest you look at some of
>> the staticmem discussions that had happened, including a similar
>> topic. (Sorry, I don't have a link at hand to the exact mail.)
> 
> I searched through the recent staticmem discussions to try to provide a
> helpful link for Carlo, but I don't think I managed to find what you had
> in mind. I found these two lock-related emails:
> 
> https://marc.info/?l=xen-devel&m=165476642832402
> https://marc.info/?l=xen-devel&m=165632461420257
> 
> If they are not relevant, could you please provide a few more details?

Those aren't the ones. The point is that count_info is a somewhat
odd field: It's not consistently updated under (all the same) lock,
and it's also not consistently updated atomically. Hence new
updates that appear in the code need properly justifying that the
way updates are done there doesn't conflict with any other of the
already existing updates.
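
To illustrate the kind of conflict (a sketch):

/*
 * Two CPUs doing a non-atomic read-modify-write on one count_info:
 *
 *   CPU A: pg->count_info |= PGC_colored;
 *   CPU B: pg->count_info = (pg->count_info & ~PGC_state) | ...;
 *
 * Without a common lock or atomic operations one of the updates can be
 * lost, hence every new updater needs an argument for why it cannot
 * race the existing ones.
 */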

Jan
Carlo Nonato Oct. 13, 2022, 9:47 a.m. UTC | #6
Hi Jan,

On Mon, Sep 19, 2022 at 8:26 AM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 16.09.2022 18:05, Carlo Nonato wrote:
> > On Thu, Sep 15, 2022 at 3:13 PM Jan Beulich <jbeulich@suse.com> wrote:
> >> On 26.08.2022 14:51, Carlo Nonato wrote:
> >>> --- a/xen/arch/arm/coloring.c
> >>> +++ b/xen/arch/arm/coloring.c
> >>> @@ -300,6 +300,16 @@ void prepare_color_domain_config(struct xen_arch_domainconfig *config,
> >>>      config->num_colors = (uint16_t)num;
> >>>  }
> >>>
> >>> +unsigned int page_to_color(struct page_info *pg)
> >>
> >> The parameter will want to be pointer-to-const and I wonder whether ...
> >>
> >>> +{
> >>> +    return addr_to_color(page_to_maddr(pg));
> >>> +}
> >>
> >> ... the function as a whole wouldn't be a good candidate for being an
> >> inline one (requiring addr_to_color() to be available in outside of
> >> this file, of course).
> >
> > You mean defining it as static inline in the coloring.h header?
>
> That would seem preferable for a simple function like this one.
>

I didn't want to expose that function since I would also have to expose
the addr_col_mask global variable.
The same goes for get_max_colors(): it exists only to restrict the
visibility of the max_colors variable.

> >>> +    page_list_for_each( pos, head )
> >>> +    {
> >>> +        if ( page_to_maddr(pos) < page_to_maddr(pg) )
> >>> +        {
> >>> +            head = &pos->list;
> >>> +            break;
> >>> +        }
> >>> +    }
> >>
> >> Wait - a linear search for every single page insertion? How well
> >> is that going to perform on a multi-terabyte system?
> >
> > For our test cases (embedded systems) the linear search is good enough.
> > I agree with you that in the general case this is bad (even though the main
> > targets are indeed embedded systems).
> > Are there any already available data structures that we can exploit to get
> > better performances?
>
> I'm afraid there aren't any that I would see as a good fit here.
>

Regarding this I can see three options:
1) We leave it as it is and warn the user in the docs that cache coloring
   is embedded-system specific for the moment, since it will probably
   perform badly on bigger systems.
2) We use some priority queue implementation to replace the current lists.
   Red-black trees are available in the Xen codebase, but I think I would
   have to change the page_info struct to use them.
   Maybe just a binary heap implemented as an array could be viable, but
   that would require me to implement the logic for insertion, extract-min
   and other operations somewhere.
3) I have a working prototype of a buddy allocator that also makes use of
   coloring information. It isn't an extension of the main one, but rather
   a simpler version. This means that nodes, zones and scrubbing aren't
   supported, but this is also true for the already submitted colored
   allocator. With this, order > 0 pages can be served (up to
   log2(max_colors)) and insertion is no longer linear, but constant.

>
> Jan

Thanks

- Carlo Nonato
Jan Beulich Oct. 13, 2022, 10:44 a.m. UTC | #7
On 13.10.2022 11:47, Carlo Nonato wrote:
> On Mon, Sep 19, 2022 at 8:26 AM Jan Beulich <jbeulich@suse.com> wrote:
>> On 16.09.2022 18:05, Carlo Nonato wrote:
>>> On Thu, Sep 15, 2022 at 3:13 PM Jan Beulich <jbeulich@suse.com> wrote:
>>>> On 26.08.2022 14:51, Carlo Nonato wrote:
>>>>> --- a/xen/arch/arm/coloring.c
>>>>> +++ b/xen/arch/arm/coloring.c
>>>>> @@ -300,6 +300,16 @@ void prepare_color_domain_config(struct xen_arch_domainconfig *config,
>>>>>      config->num_colors = (uint16_t)num;
>>>>>  }
>>>>>
>>>>> +unsigned int page_to_color(struct page_info *pg)
>>>>
>>>> The parameter will want to be pointer-to-const and I wonder whether ...
>>>>
>>>>> +{
>>>>> +    return addr_to_color(page_to_maddr(pg));
>>>>> +}
>>>>
>>>> ... the function as a whole wouldn't be a good candidate for being an
>>>> inline one (requiring addr_to_color() to be available in outside of
>>>> this file, of course).
>>>
>>> You mean defining it as static inline in the coloring.h header?
>>
>> That would seem preferable for a simple function like this one.
>>
> 
> I didn't want to expose that function since I would also have to expose
> the addr_col_mask global variable.
> Same goes for get_max_colors(): it exist only for the purpose to restrict
> the max_colors variable visibility.

Ah yes, that's a good reason to keep the function out-of-line.

>>>>> +    page_list_for_each( pos, head )
>>>>> +    {
>>>>> +        if ( page_to_maddr(pos) < page_to_maddr(pg) )
>>>>> +        {
>>>>> +            head = &pos->list;
>>>>> +            break;
>>>>> +        }
>>>>> +    }
>>>>
>>>> Wait - a linear search for every single page insertion? How well
>>>> is that going to perform on a multi-terabyte system?
>>>
>>> For our test cases (embedded systems) the linear search is good enough.
>>> I agree with you that in the general case this is bad (even though the main
>>> targets are indeed embedded systems).
>>> Are there any already available data structures that we can exploit to get
>>> better performances?
>>
>> I'm afraid there aren't any that I would see as a good fit here.
>>
> 
> Regarding this I can see three options:
> 1) We leave it as it is and we warn the user in the docs that cache coloring
>    is embedded system specific for the moment since it has, probably, bad
>    performances with bigger systems.

I could live with this as long as it's stated prominently enough, but ...

> 2) We use some priority queue implementation to replace the actual lists.
>    Red/black trees are available in Xen codebase, but I think I would have
>    to change the page_info struct to use them.
>    Maybe just a binary heap implemented as an array could be viable, but that
>    would require me to implement somewhere the logic for insertion,
>    extract-min and other operations.
> 3) I have a working prototype of a buddy allocator that also makes use of
>    coloring information. It isn't an extension of the main one, but rather a
>    simpler version. This means that nodes, zones, scrubbing, aren't
>    supported, but this is true also for the already submitted colored
>    allocator. With this, order > 0 pages can be served (up until
>    log2(max_colors)) and insertion is no more linear, but constant instead.

... this sounds even more promising, not least because it also eliminates
yet another shortcoming we've talked about already. In fact I would expect
that log2(max_colors) doesn't need to be the limit either, as you'd cycle
back to the first color anyway once you've reached the last one.
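
Along the lines of (a sketch, assuming color == page index modulo
max_colors, with max_colors a power of two):

/* Colors covered by an aligned order-n allocation starting at maddr: */
unsigned int i, first = (maddr >> PAGE_SHIFT) & (max_colors - 1);
bool needed[CONFIG_MAX_CACHE_COLORS] = {};

for ( i = 0; i < (1U << order); i++ )
    needed[(first + i) & (max_colors - 1)] = true;

Once 1 << order exceeds max_colors the colors simply wrap, i.e. the
request needs (and evenly uses) every color, but remains satisfiable
for a domain owning all of them.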

Jan
Michal Orzel Oct. 17, 2022, 7:06 a.m. UTC | #8
Hi Carlo,

On 26/08/2022 14:51, Carlo Nonato wrote:

> 
> This commit adds a new memory page allocator that implements the cache
> coloring mechanism. The allocation algorithm follows the given color
> configuration of the domain and maximizes contiguity in the page selection.
> 
> Pages are stored in a color-indexed data structure of lists, sorted by their
> machine addresses, that are collectively called the colored heap. A simple
> initialization function computes the color of any available page and inserts
> it in the corresponding list. When a domain requests a page, the allocator
> takes one from the subset of lists whose colors equals the domain
> configuration. It chooses the page with the highest machine address such that
> contiguous pages are sequentially allocated, if this is made possible by a
> color assignment which includes adjacent colors.
> 
> The allocator can handle only requests with order equals to 0 since the
> single color granularity is represented in memory by one page.
> 
> The buddy allocator must coexist with the colored one because the Xen heap
> isn't colored. For this reason a new Kconfig option and a command line
> parameter are added to let the user set the amount of memory reserved for
> the buddy allocator. Even when cache coloring is enabled, this memory isn't
> managed by the colored allocator.
> 
> Signed-off-by: Carlo Nonato <carlo.nonato@minervasys.tech>
> Signed-off-by: Marco Solieri <marco.solieri@minervasys.tech>
> ---
>  docs/misc/arm/cache-coloring.rst    |  43 +++++-
>  docs/misc/xen-command-line.pandoc   |  14 ++
>  xen/arch/arm/Kconfig                |  12 ++
>  xen/arch/arm/coloring.c             |  10 ++
>  xen/arch/arm/include/asm/coloring.h |   6 +
>  xen/arch/arm/include/asm/mm.h       |   3 +
>  xen/common/page_alloc.c             | 213 ++++++++++++++++++++++++++--
>  7 files changed, 290 insertions(+), 11 deletions(-)
> 
> diff --git a/docs/misc/arm/cache-coloring.rst b/docs/misc/arm/cache-coloring.rst
> index 763acd2d3f..5f9132e525 100644
> --- a/docs/misc/arm/cache-coloring.rst
> +++ b/docs/misc/arm/cache-coloring.rst
> @@ -13,6 +13,9 @@ In order to enable and use it, few steps are needed.
>    (refer to menuconfig help for value meaning and when it should be changed).
> 
>          CONFIG_MAX_CACHE_COLORS=<n>
> +- If needed, change the amount of memory reserved for the buddy allocator either
> +  from the Xen configuration file, via the CONFIG_BUDDY_ALLOCATOR_SIZE value,
> +  or with the command line option. See `Colored allocator and buddy allocator`.
>  - Assign colors to domains using the `Color selection format`_ (see
>    `Coloring parameters`_ for more documentation pointers).
> 
> @@ -157,4 +160,42 @@ Please refer to the relative documentation in
>  "docs/misc/arm/device-tree/booting.txt".
> 
>  Note that if no color configuration is provided for domains, they fallback to
> -the default one, which corresponds simply to all available colors.
> \ No newline at end of file
> +the default one, which corresponds simply to all available colors.
> +
> +Colored allocator and buddy allocator
> +*************************************
> +
> +The colored allocator distributes pages based on color configurations of
> +domains so that each domain only gets pages of its own colors.
> +The colored allocator is meant as an alternative to the buddy allocator because
> +its allocation policy is by definition incompatible with the generic one. Since
> +the Xen heap system is not colored yet, we need to support the coexistence of
> +the two allocators and some memory must be left for the buddy one.
> +The buddy allocator memory can be reserved from the Xen configuration file or
> +with the help of a command-line option.
> +
> +Known issues and limitations
> +****************************
> +
> +Colored allocator can only make use of order-0 pages
> +####################################################
> +
> +The cache coloring technique relies on memory mappings and on the smallest
> +amount of memory that can be mapped to achieve the maximum number of colors
> +(cache partitions) possible. This amount is what is normally called a page and,
> +in Xen terminology, the order-0 page is the smallest one. The fairly simple
> +colored allocator currently implemented makes use only of such pages.
> +It must be said that a more complex one could, in theory, adopt higher order
> +pages if the color selection contained adjacent colors. Two consecutive
> +colors, for example, can be represented by an order-1 page, four colors
> +correspond to an order-2 page, etc.
> +
> +Fail to boot colored DomUs with large memory size
> +#################################################
> +
> +If the Linux kernel used for Dom0 does not contain the upstream commit
> +3941552aec1e04d63999988a057ae09a1c56ebeb and uses the hypercall buffer device,
> +colored DomUs with memory size larger than 127 MB cannot be created. This is
> +caused by the default limit of this buffer of 64 pages. The solution is to
> +manually apply the above patch, or to check if there is an updated version of
> +the kernel in use for Dom0 that contains this change.
> diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
> index 910ebeb2eb..4e85c4dfe4 100644
> --- a/docs/misc/xen-command-line.pandoc
> +++ b/docs/misc/xen-command-line.pandoc
> @@ -299,6 +299,20 @@ can be maintained with the pv-shim mechanism.
>      cause Xen not to use Indirect Branch Tracking even when support is
>      available in hardware.
> 
> +### buddy-alloc-size (arm64)
> +> `= <size>`
> +
> +> Default: `64M`
> +
> +Amount of memory reserved for the buddy allocator when colored allocator is
> +active. This option is parsed only when cache coloring support is enabled.
> +The colored allocator is meant as an alternative to the buddy allocator,
> +because its allocation policy is by definition incompatible with the
> +generic one. Since the Xen heap system is not colored yet, we need to
> +support the coexistence of the two allocators for now. This parameter, which
> +is optional and for experts only, is used to set the amount of memory
> +reserved for the buddy allocator.
> +
>  ### clocksource (x86)
>  > `= pit | hpet | acpi | tsc`
> 
> diff --git a/xen/arch/arm/Kconfig b/xen/arch/arm/Kconfig
> index 8acff9682c..abce4bfc25 100644
> --- a/xen/arch/arm/Kconfig
> +++ b/xen/arch/arm/Kconfig
> @@ -147,6 +147,18 @@ config MAX_CACHE_COLORS
>           colors at boot. Note that if, at any time, a color configuration with more
>           colors than the maximum will be employed an error will be produced.
> 
> +config BUDDY_ALLOCATOR_SIZE
> +       string "Buddy allocator reserved memory size" if CACHE_COLORING
> +       default "64M" if CACHE_COLORING
> +       default "0M" if !CACHE_COLORING
> +       help
> +         Amount of memory reserved for the buddy allocator to work alongside
> +         the colored one. The colored allocator is meant as an alternative to the
> +         buddy allocator because its allocation policy is by definition
> +         incompatible with the generic one. Since the Xen heap system is not
> +         colored yet, we need to support the coexistence of the two allocators and
> +         some memory must be left for the buddy one.
> +
>  config TEE
>         bool "Enable TEE mediators support (UNSUPPORTED)" if UNSUPPORTED
>         default n
> diff --git a/xen/arch/arm/coloring.c b/xen/arch/arm/coloring.c
> index 87e20b952e..3fb86043d1 100644
> --- a/xen/arch/arm/coloring.c
> +++ b/xen/arch/arm/coloring.c
> @@ -300,6 +300,16 @@ void prepare_color_domain_config(struct xen_arch_domainconfig *config,
>      config->num_colors = (uint16_t)num;
>  }
> 
> +unsigned int page_to_color(struct page_info *pg)
> +{
> +    return addr_to_color(page_to_maddr(pg));
> +}
> +
> +unsigned int get_max_colors(void)
> +{
> +    return max_colors;
> +}
> +
>  /*
>   * Local variables:
>   * mode: C
> diff --git a/xen/arch/arm/include/asm/coloring.h b/xen/arch/arm/include/asm/coloring.h
> index b7fa323870..0982bc9853 100644
> --- a/xen/arch/arm/include/asm/coloring.h
> +++ b/xen/arch/arm/include/asm/coloring.h
> @@ -29,6 +29,8 @@
> 
>  #include <public/arch-arm.h>
> 
> +struct page_info;
> +
>  bool __init coloring_init(void);
> 
>  int domain_coloring_init(struct domain *d,
> @@ -39,4 +41,8 @@ void domain_dump_coloring_info(struct domain *d);
>  void prepare_color_domain_config(struct xen_arch_domainconfig *config,
>                                   const char *colors_str);
> 
> +unsigned int page_to_color(struct page_info *pg);
> +
> +unsigned int get_max_colors(void);
> +
>  #endif /* !__ASM_ARM_COLORING_H__ */
> diff --git a/xen/arch/arm/include/asm/mm.h b/xen/arch/arm/include/asm/mm.h
> index da25251cda..a59fc3791a 100644
> --- a/xen/arch/arm/include/asm/mm.h
> +++ b/xen/arch/arm/include/asm/mm.h
> @@ -143,6 +143,9 @@ struct page_info
>  #define PGC_count_width   PG_shift(10)
>  #define PGC_count_mask    ((1UL<<PGC_count_width)-1)
> 
> +#define _PGC_colored      PG_shift(11)
> +#define PGC_colored       PG_mask(1, 11)
> +
>  /*
>   * Page needs to be scrubbed. Since this bit can only be set on a page that is
>   * free (i.e. in PGC_state_free) we can reuse PGC_allocated bit.
> diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
> index bfd4150be7..4ae3cfe9a7 100644
> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -150,6 +150,9 @@
>  #define p2m_pod_offline_or_broken_hit(pg) 0
>  #define p2m_pod_offline_or_broken_replace(pg) BUG_ON(pg != NULL)
>  #endif
> +#ifdef CONFIG_CACHE_COLORING
> +#include <asm/coloring.h>
> +#endif
> 
>  #ifndef PGC_static
>  #define PGC_static 0
> @@ -231,6 +234,9 @@ static bool __read_mostly scrub_debug;
>  #define scrub_debug    false
>  #endif
> 
> +/* Memory required for buddy allocator to work with colored one */
> +static unsigned long __initdata buddy_alloc_size;
> +
>  /*
>   * Bit width of the DMA heap -- used to override NUMA-node-first.
>   * allocation strategy, which can otherwise exhaust low memory.
> @@ -440,7 +446,172 @@ mfn_t __init alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align)
>      BUG();
>  }
> 
> +static DEFINE_SPINLOCK(heap_lock);
> 
> +/* Initialise fields which have other uses for free pages. */
> +static void init_free_page_fields(struct page_info *pg)
> +{
> +    pg->u.inuse.type_info = PGT_TYPE_INFO_INITIALIZER;
> +    page_set_owner(pg, NULL);
> +}
> +
> +static bool is_page_colored(struct page_info *pg)
> +{
> +    return pg->count_info & PGC_colored;
> +}
> +
> +#ifdef CONFIG_CACHE_COLORING
> +/*************************
> + * COLORED SIDE-ALLOCATOR
> + *
> + * Pages are stored by their color in separate lists. Each list defines a color
> + * and it is initialized during end_boot_allocator, where each page's color
> + * is calculated and the page itself is put in the correct list.
> + * After initialization there will be N lists where N is the number of maximum
> + * available colors on the platform.
> + */
> +typedef struct page_list_head colored_pages_t;
> +static colored_pages_t __ro_after_init *color_heap;
> +
> +#define colored_pages(color) &color_heap[(color)]
> +
> +static void color_heap_insert_page(struct page_info *pg)
> +{
> +    struct page_info *pos;
> +    struct page_list_head *head = colored_pages(page_to_color(pg));
> +
> +    pg->count_info |= PGC_colored;
> +
> +    /* Insert page in list in ascending machine address order */
> +    page_list_for_each( pos, head )
> +    {
> +        if ( page_to_maddr(pos) < page_to_maddr(pg) )
> +        {
> +            head = &pos->list;
> +            break;
> +        }
> +    }
> +
> +    page_list_add_tail(pg, head);
> +}
> +
> +static void color_heap_remove_page(struct page_info *pg)
> +{
> +    page_list_del(pg, colored_pages(page_to_color(pg)));
> +}
> +
> +static void __init init_col_heap_pages(struct page_info *pg,
> +                                       unsigned long nr_pages)
> +{
> +    unsigned int i;
> +
> +    if ( !color_heap )
> +    {
> +        unsigned int max_colors = get_max_colors();
> +        color_heap = xmalloc_array(colored_pages_t, max_colors);
> +        BUG_ON(!color_heap);
> +
> +        for ( i = 0; i < max_colors; i++ )
> +            INIT_PAGE_LIST_HEAD(colored_pages(i));
> +
> +        if ( !buddy_alloc_size )
> +            buddy_alloc_size = parse_size_and_unit(CONFIG_BUDDY_ALLOCATOR_SIZE,
> +                                                   NULL);
> +    }
> +
> +    printk(XENLOG_INFO "Init color heap with %lu pages\n", nr_pages);
> +    printk(XENLOG_INFO "Paging starting from: 0x%"PRIx64"\n",
> +           page_to_maddr(pg));
> +
> +    for ( i = 0; i < nr_pages; i++ )
> +        color_heap_insert_page(pg++);
> +}
> +
> +/* Alloc one page based on domain color configuration */
> +static struct page_info *alloc_col_heap_page(unsigned int memflags,
> +                                             const unsigned int *colors,
> +                                             unsigned int num_colors)
> +{
> +    struct page_info *pg = NULL;
> +    unsigned int i;
> +    bool need_tlbflush = false;
> +    uint32_t tlbflush_timestamp = 0;
> +
> +    spin_lock(&heap_lock);
> +
> +    for ( i = 0; i < num_colors; i++ )
> +    {
> +        struct page_info *tmp;
> +
> +        if ( page_list_empty(colored_pages(colors[i])) )
> +            continue;
> +
> +        tmp = page_list_first(colored_pages(colors[i]));
> +        if ( !pg || page_to_maddr(tmp) > page_to_maddr(pg) )
> +            pg = tmp;
> +    }
> +
> +    if ( !pg )
> +    {
> +        spin_unlock(&heap_lock);
> +        return NULL;
> +    }
> +
> +    pg->count_info = PGC_state_inuse;
> +
> +    if ( !(memflags & MEMF_no_tlbflush) )
> +        accumulate_tlbflush(&need_tlbflush, pg, &tlbflush_timestamp);
> +
> +    init_free_page_fields(pg);
> +    flush_page_to_ram(mfn_x(page_to_mfn(pg)),
> +                      !(memflags & MEMF_no_icache_flush));
> +    color_heap_remove_page(pg);
> +
> +    spin_unlock(&heap_lock);
> +
> +    if ( need_tlbflush )
> +        filtered_flush_tlb_mask(tlbflush_timestamp);
> +
> +    return pg;
> +}
> +
> +static void free_col_domheap_page(struct page_info *pg)
> +{
> +    pg->count_info = PGC_state_free;
> +    page_set_owner(pg, NULL);
> +    color_heap_insert_page(pg);
> +}
> +
> +static struct page_info *alloc_col_domheap_page(struct domain *d,
> +                                                unsigned int memflags)
> +{
> +    struct page_info *pg;
> +
> +    ASSERT(!in_irq());
> +
> +    pg = alloc_col_heap_page(memflags, d->arch.colors, d->arch.num_colors);
> +    if ( !pg )
> +    {
> +        printk(XENLOG_ERR "Colored page is null for domain %pd\n", d);
> +        return NULL;
> +    }
> +
> +    if ( d && !(memflags & MEMF_no_owner) && assign_page(pg, 0, d, memflags) )
> +    {
> +        free_col_domheap_page(pg);
> +        return NULL;
> +    }
> +
> +    return pg;
> +}
> +
> +size_param("buddy-alloc-size", buddy_alloc_size);
> +#else
> +static void free_col_domheap_page(struct page_info *pg)
> +{
> +    return;
> +}
> +#endif /* CONFIG_CACHE_COLORING */
> 
>  /*************************
>   * BINARY BUDDY ALLOCATOR
> @@ -462,7 +633,6 @@ static unsigned long node_need_scrub[MAX_NUMNODES];
>  static unsigned long *avail[MAX_NUMNODES];
>  static long total_avail_pages;
> 
> -static DEFINE_SPINLOCK(heap_lock);
>  static long outstanding_claims; /* total outstanding claims by all domains */
> 
>  unsigned long domain_adjust_tot_pages(struct domain *d, long pages)
> @@ -1027,10 +1197,7 @@ static struct page_info *alloc_heap_pages(
>              accumulate_tlbflush(&need_tlbflush, &pg[i],
>                                  &tlbflush_timestamp);
> 
> -        /* Initialise fields which have other uses for free pages. */
> -        pg[i].u.inuse.type_info = PGT_TYPE_INFO_INITIALIZER;
> -        page_set_owner(&pg[i], NULL);
> -
> +        init_free_page_fields(&pg[i]);
>      }
> 
>      spin_unlock(&heap_lock);
> @@ -1926,6 +2093,7 @@ static unsigned long avail_heap_pages(
>  void __init end_boot_allocator(void)
>  {
>      unsigned int i;
> +    unsigned long buddy_pages = PFN_DOWN(buddy_alloc_size);
> 
>      /* Pages that are free now go to the domain sub-allocator. */
>      for ( i = 0; i < nr_bootmem_regions; i++ )
> @@ -1939,11 +2107,24 @@ void __init end_boot_allocator(void)
>              break;
>          }
>      }
> -    for ( i = nr_bootmem_regions; i-- > 0; )
> +
> +    for ( i = 0; i < nr_bootmem_regions; i++ )
>      {
>          struct bootmem_region *r = &bootmem_region_list[i];
> +
> +        if ( buddy_pages && (r->s < r->e) )
> +        {
> +            unsigned long pages = MIN(r->e - r->s, buddy_pages);
> +            init_heap_pages(mfn_to_page(_mfn(r->s)), pages);

When cache coloring is enabled we have two allocators: colored and buddy.
The amount of memory for the buddy allocator comes from the config (64MB by default).
So what about the first block of code at the beginning of end_boot_allocator
that calls init_heap_pages for the first free boot memory region?
There are two issues:
1. Buddy will end up with buddy_pages plus the pages from the first free bootmem region.
   This is already incorrect, as buddy should only get the amount of memory set by the user.
2. Allowing the first free memory bank to go to buddy without imposing any restriction
   on its size can significantly lower the amount of memory available to the colored allocator.
   If we load the images near the end of a memory bank, the first free region can actually
   occupy most of the system memory and leave not enough colored memory for domain allocations.

I cannot see a reason for you to keep that block of code. Before your series there was a
split to prefer allocations from higher memory, with the exception of first inserting one
region residing on the boot CPU node (4280d3ee4cb1). Because your series modifies this
behavior to prefer allocations from lower memory in ascending order (and also allows the
user to set a buddy size), I think this block of code should not be executed when cache
coloring is enabled.
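
Something along these lines is what I have in mind (an untested sketch, and
assuming the first loop is unchanged from current staging):

    /*
     * Sketch: with coloring enabled, skip the boot-CPU-node shortcut, so
     * that the loop below hands exactly buddy_alloc_size bytes to the
     * buddy allocator and everything else to the colored one.
     */
    if ( !IS_ENABLED(CONFIG_CACHE_COLORING) )
    {
        /* Pages that are free now go to the domain sub-allocator. */
        for ( i = 0; i < nr_bootmem_regions; i++ )
        {
            struct bootmem_region *r = &bootmem_region_list[i];

            if ( (r->s < r->e) &&
                 (phys_to_nid(pfn_to_paddr(r->s)) == cpu_to_node(0)) )
            {
                init_heap_pages(mfn_to_page(_mfn(r->s)), r->e - r->s);
                r->e = r->s;
                break;
            }
        }
    }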


Apart from that, the memory coming from the buddy is of any color. Shouldn't
the memory allocated for domains come from the colors assigned to them, and
everything else from the colors given to Xen? At the moment, the memory for
e.g. allocating P2M tables comes from the buddy, which means it can be of any
color and might cause some cache interference.
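
FWIW, with the hooks this patch introduces, one (untested) way to give such
allocations the domain's colors could be to pass the domain together with
MEMF_no_owner, so that the request is routed to the colored allocator without
assigning the page to the domain. A hypothetical helper to illustrate:

    /*
     * Hypothetical: take a P2M table page from the domain's colors.
     * alloc_domheap_page(d, ...) now goes through the colored allocator,
     * and MEMF_no_owner skips the assign_page() step.
     */
    static struct page_info *p2m_alloc_colored_page(struct domain *d)
    {
        struct page_info *pg = alloc_domheap_page(d, MEMF_no_owner);

        if ( pg )
            clear_and_clean_page(pg);

        return pg;
    }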


~Michal
Julien Grall Oct. 17, 2022, 8:44 a.m. UTC | #9
Hi,

On 17/10/2022 08:06, Michal Orzel wrote:
> On 26/08/2022 14:51, Carlo Nonato wrote:
> Apart from that, the memory coming from the buddy is of any color. Shouldn't
> the memory allocated for domains come from the colors assigned to them, and
> everything else from the colors given to Xen? At the moment, the memory for
> e.g. allocating P2M tables comes from the buddy, which means it can be of any
> color and might cause some cache interference.

Somewhat related. IIUC what you are saying, the P2M pool will still be
allocated from the buddy. I would expect we want to instead allocate the
P2M pool from the same colors as the domain, to prevent interference when
the TLBs are loaded. Or is the interference negligible?

Cheers,
Michal Orzel Oct. 17, 2022, 9:16 a.m. UTC | #10
Hi Julien,

On 17/10/2022 10:44, Julien Grall wrote:
> 
> 
> Hi,
> 
> On 17/10/2022 08:06, Michal Orzel wrote:
>> On 26/08/2022 14:51, Carlo Nonato wrote:
>> Apart from that, the memory coming from the buddy is of any color. Shouldn't
>> the memory allocated for domains come from the colors assigned to them, and
>> everything else from the colors given to Xen? At the moment, the memory for
>> e.g. allocating P2M tables comes from the buddy, which means it can be of any
>> color and might cause some cache interference.
> 
> Somewhat related. IIUC what you are saying, the P2M pool will still be
> allocated from the buddy. I would expect we want to instead allocate the
> P2M pool from the same colors as the domain, to prevent interference when
> the TLBs are loaded. Or is the interference negligible?
> 
Good point and I agree. I do not think it is negligible.
All in all, allocating this memory from the buddy, which can be of any color, is incorrect.
When cache coloring is enabled, it should not be possible to allocate memory of any color.
If we can arrange for this memory to come from either Xen or domain colors, the
interference will be reduced. When it comes to P2M tables, if the memory comes from
the colors assigned to a domain, there will be even less interference.
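
Roughly, I would expect the CONFIG_CACHE_COLORING path in alloc_domheap_pages()
to also cover the d == NULL case, something like the following (untested;
xen_colors and xen_num_colors are hypothetical globals describing the colors
given to Xen itself):

    /* Sketch: anonymous allocations take pages from Xen's own color set. */
    if ( !d )
    {
        if ( order )
            return NULL;
        return alloc_col_heap_page(memflags, xen_colors, xen_num_colors);
    }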


> Cheers,
> 
> --
> Julien Grall
> 

~Michal
diff mbox series

Patch

diff --git a/docs/misc/arm/cache-coloring.rst b/docs/misc/arm/cache-coloring.rst
index 763acd2d3f..5f9132e525 100644
--- a/docs/misc/arm/cache-coloring.rst
+++ b/docs/misc/arm/cache-coloring.rst
@@ -13,6 +13,9 @@  In order to enable and use it, few steps are needed.
   (refer to menuconfig help for value meaning and when it should be changed).
 
         CONFIG_MAX_CACHE_COLORS=<n>
+- If needed, change the amount of memory reserved for the buddy allocator either
+  from the Xen configuration file, via the CONFIG_BUDDY_ALLOCATOR_SIZE value,
+  or with the command line option. See `Colored allocator and buddy allocator`.
 - Assign colors to domains using the `Color selection format`_ (see
   `Coloring parameters`_ for more documentation pointers).
 
@@ -157,4 +160,42 @@  Please refer to the relative documentation in
 "docs/misc/arm/device-tree/booting.txt".
 
 Note that if no color configuration is provided for domains, they fallback to
-the default one, which corresponds simply to all available colors.
\ No newline at end of file
+the default one, which corresponds simply to all available colors.
+
+Colored allocator and buddy allocator
+*************************************
+
+The colored allocator distributes pages based on the color configurations of
+domains so that each domain only gets pages of its own colors.
+The colored allocator is meant as an alternative to the buddy allocator because
+its allocation policy is by definition incompatible with the generic one. Since
+the Xen heap is not colored yet, we need to support the coexistence of the two
+allocators and some memory must be left for the buddy one.
+The buddy allocator memory can be reserved from the Xen configuration file or
+with the help of a command-line option.
+
+Known issues and limitations
+****************************
+
+Colored allocator can only make use of order-0 pages
+####################################################
+
+The cache coloring technique relies on memory mappings and on the smallest
+amount of memory that can be mapped to achieve the maximum number of colors
+(cache partitions) possible. This amount is what is normally called a page
+and, in Xen terminology, the order-0 page is the smallest one. The fairly
+simple colored allocator currently implemented makes use only of such pages.
+It must be said that a more complex one could, in theory, adopt higher order
+pages if the color selection contained adjacent colors. Two consecutive
+colors, for example, can be represented by an order-1 page, and four colors
+correspond to an order-2 page, etc.
+
+Fail to boot colored DomUs with large memory size
+#################################################
+
+If the Linux kernel used for Dom0 does not contain the upstream commit
+3941552aec1e04d63999988a057ae09a1c56ebeb and uses the hypercall buffer device,
+colored DomUs with a memory size larger than 127 MB cannot be created. This
+is caused by the buffer's default limit of 64 pages. The solution is to
+manually apply the above patch, or to use an updated Dom0 kernel that
+already contains this change.
diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
index 910ebeb2eb..4e85c4dfe4 100644
--- a/docs/misc/xen-command-line.pandoc
+++ b/docs/misc/xen-command-line.pandoc
@@ -299,6 +299,20 @@  can be maintained with the pv-shim mechanism.
     cause Xen not to use Indirect Branch Tracking even when support is
     available in hardware.
 
+### buddy-alloc-size (arm64)
+> `= <size>`
+
+> Default: `64M`
+
+Amount of memory reserved for the buddy allocator when the colored allocator
+is active. This option is parsed only when cache coloring support is enabled.
+The colored allocator is meant as an alternative to the buddy allocator,
+because its allocation policy is by definition incompatible with the
+generic one. Since the Xen heap is not colored yet, we need to support the
+coexistence of the two allocators for now. This parameter, which is optional
+and for experts only, is used to set the amount of memory reserved for the
+buddy allocator.
+
 ### clocksource (x86)
 > `= pit | hpet | acpi | tsc`
 
diff --git a/xen/arch/arm/Kconfig b/xen/arch/arm/Kconfig
index 8acff9682c..abce4bfc25 100644
--- a/xen/arch/arm/Kconfig
+++ b/xen/arch/arm/Kconfig
@@ -147,6 +147,18 @@  config MAX_CACHE_COLORS
 	  colors at boot. Note that if, at any time, a color configuration with more
 	  colors than the maximum will be employed an error will be produced.
 
+config BUDDY_ALLOCATOR_SIZE
+	string "Buddy allocator reserved memory size" if CACHE_COLORING
+	default "64M" if CACHE_COLORING
+	default "0M" if !CACHE_COLORING
+	help
+	  Amount of memory reserved for the buddy allocator to work alongside
+	  the colored one. The colored allocator is meant as an alternative to the
+	  buddy allocator because its allocation policy is by definition
+	  incompatible with the generic one. Since the Xen heap is not colored
+	  yet, we need to support the coexistence of the two allocators and
+	  some memory must be left for the buddy one.
+
 config TEE
 	bool "Enable TEE mediators support (UNSUPPORTED)" if UNSUPPORTED
 	default n
diff --git a/xen/arch/arm/coloring.c b/xen/arch/arm/coloring.c
index 87e20b952e..3fb86043d1 100644
--- a/xen/arch/arm/coloring.c
+++ b/xen/arch/arm/coloring.c
@@ -300,6 +300,16 @@  void prepare_color_domain_config(struct xen_arch_domainconfig *config,
     config->num_colors = (uint16_t)num;
 }
 
+unsigned int page_to_color(struct page_info *pg)
+{
+    return addr_to_color(page_to_maddr(pg));
+}
+
+unsigned int get_max_colors(void)
+{
+    return max_colors;
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/arch/arm/include/asm/coloring.h b/xen/arch/arm/include/asm/coloring.h
index b7fa323870..0982bc9853 100644
--- a/xen/arch/arm/include/asm/coloring.h
+++ b/xen/arch/arm/include/asm/coloring.h
@@ -29,6 +29,8 @@ 
 
 #include <public/arch-arm.h>
 
+struct page_info;
+
 bool __init coloring_init(void);
 
 int domain_coloring_init(struct domain *d,
@@ -39,4 +41,8 @@  void domain_dump_coloring_info(struct domain *d);
 void prepare_color_domain_config(struct xen_arch_domainconfig *config,
                                  const char *colors_str);
 
+unsigned int page_to_color(struct page_info *pg);
+
+unsigned int get_max_colors(void);
+
 #endif /* !__ASM_ARM_COLORING_H__ */
diff --git a/xen/arch/arm/include/asm/mm.h b/xen/arch/arm/include/asm/mm.h
index da25251cda..a59fc3791a 100644
--- a/xen/arch/arm/include/asm/mm.h
+++ b/xen/arch/arm/include/asm/mm.h
@@ -143,6 +143,9 @@  struct page_info
 #define PGC_count_width   PG_shift(10)
 #define PGC_count_mask    ((1UL<<PGC_count_width)-1)
 
+#define _PGC_colored      PG_shift(11)
+#define PGC_colored       PG_mask(1, 11)
+
 /*
  * Page needs to be scrubbed. Since this bit can only be set on a page that is
  * free (i.e. in PGC_state_free) we can reuse PGC_allocated bit.
diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index bfd4150be7..4ae3cfe9a7 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -150,6 +150,9 @@ 
 #define p2m_pod_offline_or_broken_hit(pg) 0
 #define p2m_pod_offline_or_broken_replace(pg) BUG_ON(pg != NULL)
 #endif
+#ifdef CONFIG_CACHE_COLORING
+#include <asm/coloring.h>
+#endif
 
 #ifndef PGC_static
 #define PGC_static 0
@@ -231,6 +234,9 @@  static bool __read_mostly scrub_debug;
 #define scrub_debug    false
 #endif
 
+/* Memory required for buddy allocator to work with colored one */
+static unsigned long __initdata buddy_alloc_size;
+
 /*
  * Bit width of the DMA heap -- used to override NUMA-node-first.
  * allocation strategy, which can otherwise exhaust low memory.
@@ -440,7 +446,172 @@  mfn_t __init alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align)
     BUG();
 }
 
+static DEFINE_SPINLOCK(heap_lock);
 
+/* Initialise fields which have other uses for free pages. */
+static void init_free_page_fields(struct page_info *pg)
+{
+    pg->u.inuse.type_info = PGT_TYPE_INFO_INITIALIZER;
+    page_set_owner(pg, NULL);
+}
+
+static bool is_page_colored(struct page_info *pg)
+{
+    return pg->count_info & PGC_colored;
+}
+
+#ifdef CONFIG_CACHE_COLORING
+/*************************
+ * COLORED SIDE-ALLOCATOR
+ *
+ * Pages are stored by their color in separate lists. Each list defines a color
+ * and it is initialized during end_boot_allocator, where each page's color
+ * is calculated and the page itself is put in the correct list.
+ * After initialization there will be N lists where N is the number of maximum
+ * available colors on the platform.
+ */
+typedef struct page_list_head colored_pages_t;
+static colored_pages_t __ro_after_init *color_heap;
+
+#define colored_pages(color) &color_heap[(color)]
+
+static void color_heap_insert_page(struct page_info *pg)
+{
+    struct page_info *pos;
+    struct page_list_head *head = colored_pages(page_to_color(pg));
+
+    pg->count_info |= PGC_colored;
+
+    /* Insert page in list in ascending machine address order */
+    page_list_for_each( pos, head )
+    {
+        if ( page_to_maddr(pos) < page_to_maddr(pg) )
+        {
+            head = &pos->list;
+            break;
+        }
+    }
+
+    page_list_add_tail(pg, head);
+}
+
+static void color_heap_remove_page(struct page_info *pg)
+{
+    page_list_del(pg, colored_pages(page_to_color(pg)));
+}
+
+static void __init init_col_heap_pages(struct page_info *pg,
+                                       unsigned long nr_pages)
+{
+    unsigned int i;
+
+    if ( !color_heap )
+    {
+        unsigned int max_colors = get_max_colors();
+        color_heap = xmalloc_array(colored_pages_t, max_colors);
+        BUG_ON(!color_heap);
+
+        for ( i = 0; i < max_colors; i++ )
+            INIT_PAGE_LIST_HEAD(colored_pages(i));
+
+        if ( !buddy_alloc_size )
+            buddy_alloc_size = parse_size_and_unit(CONFIG_BUDDY_ALLOCATOR_SIZE,
+                                                   NULL);
+    }
+
+    printk(XENLOG_INFO "Init color heap with %lu pages\n", nr_pages);
+    printk(XENLOG_INFO "Paging starting from: 0x%"PRIx64"\n",
+           page_to_maddr(pg));
+
+    for ( i = 0; i < nr_pages; i++ )
+        color_heap_insert_page(pg++);
+}
+
+/* Alloc one page based on domain color configuration */
+static struct page_info *alloc_col_heap_page(unsigned int memflags,
+                                             const unsigned int *colors,
+                                             unsigned int num_colors)
+{
+    struct page_info *pg = NULL;
+    unsigned int i;
+    bool need_tlbflush = false;
+    uint32_t tlbflush_timestamp = 0;
+
+    spin_lock(&heap_lock);
+
+    for ( i = 0; i < num_colors; i++ )
+    {
+        struct page_info *tmp;
+
+        if ( page_list_empty(colored_pages(colors[i])) )
+            continue;
+
+        tmp = page_list_first(colored_pages(colors[i]));
+        if ( !pg || page_to_maddr(tmp) > page_to_maddr(pg) )
+            pg = tmp;
+    }
+
+    if ( !pg )
+    {
+        spin_unlock(&heap_lock);
+        return NULL;
+    }
+
+    pg->count_info = PGC_state_inuse;
+
+    if ( !(memflags & MEMF_no_tlbflush) )
+        accumulate_tlbflush(&need_tlbflush, pg, &tlbflush_timestamp);
+
+    init_free_page_fields(pg);
+    flush_page_to_ram(mfn_x(page_to_mfn(pg)),
+                      !(memflags & MEMF_no_icache_flush));
+    color_heap_remove_page(pg);
+
+    spin_unlock(&heap_lock);
+
+    if ( need_tlbflush )
+        filtered_flush_tlb_mask(tlbflush_timestamp);
+
+    return pg;
+}
+
+static void free_col_domheap_page(struct page_info *pg)
+{
+    pg->count_info = PGC_state_free;
+    page_set_owner(pg, NULL);
+    color_heap_insert_page(pg);
+}
+
+static struct page_info *alloc_col_domheap_page(struct domain *d,
+                                                unsigned int memflags)
+{
+    struct page_info *pg;
+
+    ASSERT(!in_irq());
+
+    pg = alloc_col_heap_page(memflags, d->arch.colors, d->arch.num_colors);
+    if ( !pg )
+    {
+        printk(XENLOG_ERR "Colored page is null for domain %pd\n", d);
+        return NULL;
+    }
+
+    if ( d && !(memflags & MEMF_no_owner) && assign_page(pg, 0, d, memflags) )
+    {
+        free_col_domheap_page(pg);
+        return NULL;
+    }
+
+    return pg;
+}
+
+size_param("buddy-alloc-size", buddy_alloc_size);
+#else
+static void free_col_domheap_page(struct page_info *pg)
+{
+    return;
+}
+#endif /* CONFIG_CACHE_COLORING */
 
 /*************************
  * BINARY BUDDY ALLOCATOR
@@ -462,7 +633,6 @@  static unsigned long node_need_scrub[MAX_NUMNODES];
 static unsigned long *avail[MAX_NUMNODES];
 static long total_avail_pages;
 
-static DEFINE_SPINLOCK(heap_lock);
 static long outstanding_claims; /* total outstanding claims by all domains */
 
 unsigned long domain_adjust_tot_pages(struct domain *d, long pages)
@@ -1027,10 +1197,7 @@  static struct page_info *alloc_heap_pages(
             accumulate_tlbflush(&need_tlbflush, &pg[i],
                                 &tlbflush_timestamp);
 
-        /* Initialise fields which have other uses for free pages. */
-        pg[i].u.inuse.type_info = PGT_TYPE_INFO_INITIALIZER;
-        page_set_owner(&pg[i], NULL);
-
+        init_free_page_fields(&pg[i]);
     }
 
     spin_unlock(&heap_lock);
@@ -1926,6 +2093,7 @@  static unsigned long avail_heap_pages(
 void __init end_boot_allocator(void)
 {
     unsigned int i;
+    unsigned long buddy_pages = PFN_DOWN(buddy_alloc_size);
 
     /* Pages that are free now go to the domain sub-allocator. */
     for ( i = 0; i < nr_bootmem_regions; i++ )
@@ -1939,11 +2107,24 @@  void __init end_boot_allocator(void)
             break;
         }
     }
-    for ( i = nr_bootmem_regions; i-- > 0; )
+
+    for ( i = 0; i < nr_bootmem_regions; i++ )
     {
         struct bootmem_region *r = &bootmem_region_list[i];
+
+        if ( buddy_pages && (r->s < r->e) )
+        {
+            unsigned long pages = MIN(r->e - r->s, buddy_pages);
+            init_heap_pages(mfn_to_page(_mfn(r->s)), pages);
+            r->s += pages;
+            buddy_pages -= pages;
+        }
         if ( r->s < r->e )
+#ifdef CONFIG_CACHE_COLORING
+            init_col_heap_pages(mfn_to_page(_mfn(r->s)), r->e - r->s);
+#else
             init_heap_pages(mfn_to_page(_mfn(r->s)), r->e - r->s);
+#endif
     }
     nr_bootmem_regions = 0;
 
@@ -2429,6 +2610,17 @@  struct page_info *alloc_domheap_pages(
 
     ASSERT_ALLOC_CONTEXT();
 
+#ifdef CONFIG_CACHE_COLORING
+    /* Only domains are supported for coloring */
+    if ( d )
+    {
+        /* Colored allocation must be done on 0 order */
+        if ( order )
+            return NULL;
+        return alloc_col_domheap_page(d, memflags);
+    }
+#endif
+
     bits = domain_clamp_alloc_bitsize(memflags & MEMF_no_owner ? NULL : d,
                                       bits ? : (BITS_PER_LONG+PAGE_SHIFT));
 
@@ -2546,7 +2738,10 @@  void free_domheap_pages(struct page_info *pg, unsigned int order)
             scrub = 1;
         }
 
-        free_heap_pages(pg, order, scrub);
+        if ( is_page_colored(pg) )
+            free_col_domheap_page(pg);
+        else
+            free_heap_pages(pg, order, scrub);
     }
 
     if ( drop_dom_ref )
@@ -2759,9 +2954,7 @@  static struct page_info * __init acquire_staticmem_pages(mfn_t smfn,
          * to PGC_state_inuse.
          */
         pg[i].count_info = PGC_static | PGC_state_inuse;
-        /* Initialise fields which have other uses for free pages. */
-        pg[i].u.inuse.type_info = PGT_TYPE_INFO_INITIALIZER;
-        page_set_owner(&pg[i], NULL);
+        init_free_page_fields(&pg[i]);
     }
 
     spin_unlock(&heap_lock);