diff mbox

[v4,04/24] x86: refactor psr: implement CPU init and free flow.

Message ID 1481688484-5093-5-git-send-email-yi.y.sun@linux.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Yi Sun Dec. 14, 2016, 4:07 a.m. UTC
This patch implements the CPU init and free flow including L3 CAT
initialization and feature list free.

Per this patch, you can see how callback functions work and how the
feature list is handled.

Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com>
---
 xen/arch/x86/psr.c | 179 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 176 insertions(+), 3 deletions(-)

Comments

Jan Beulich Jan. 10, 2017, 11:45 a.m. UTC | #1
>>> On 14.12.16 at 05:07, <yi.y.sun@linux.intel.com> wrote:
> @@ -141,11 +144,79 @@ struct psr_assoc {
>  
>  struct psr_cmt *__read_mostly psr_cmt;
>  
> +static struct psr_socket_info *__read_mostly socket_info;
> +
>  static unsigned int opt_psr;
>  static unsigned int __initdata opt_rmid_max = 255;
> +static unsigned int __read_mostly opt_cos_max = MAX_COS_REG_CNT;
>  static uint64_t rmid_mask;
>  static DEFINE_PER_CPU(struct psr_assoc, psr_assoc);
>  
> +/* Declare feature list entry. */
> +static struct feat_node *feat_l3_cat;

Hmm, if you indeed (again) need such a helper object, then please
make the comment actually say so. As it is, the comment is mostly
meaningless.

> +/* Common functions. */
> +static void free_feature(struct psr_socket_info *info)
> +{
> +    struct feat_node *feat_tmp;
> +
> +    if ( !info )
> +        return;
> +
> +    list_for_each_entry(feat_tmp, &info->feat_list, list)
> +    {
> +        clear_bit(feat_tmp->feature, &info->feat_mask);
> +        list_del(&feat_tmp->list);
> +        xfree(feat_tmp);
> +    }

This requires list_for_each_entry_safe() to be used, to avoid a
use-after-free issue (or alternatively a while(!list_empty()) loop).

> +    /* Free feature which are not added into feat_list. */
> +    if ( feat_l3_cat )
> +    {
> +        xfree(feat_l3_cat);
> +        feat_l3_cat = NULL;
> +    }

Why don't you leave this around, avoiding the need for an
allocation the next time a CPU comes online? Also note that xfree()
deals fine with a NULL input, so conditionals like this are pointless.

> +/* L3 CAT callback functions implementation. */
> +static void l3_cat_init_feature(unsigned int eax, unsigned int ebx,
> +                                unsigned int ecx, unsigned int edx,

This is rather unfortunate naming: How does the reader of this code
know what these values represent, without having to first go look in
the caller?

> +                                struct feat_node *feat,
> +                                struct psr_socket_info *info)
> +{
> +    struct psr_cat_hw_info l3_cat;
> +    unsigned int socket;
> +
> +    /* No valid value so do not enable feature. */
> +    if ( !eax || !edx )
> +        return;
> +
> +    l3_cat.cbm_len = (eax & CAT_CBM_LEN_MASK) + 1;
> +    l3_cat.cos_max = min(opt_cos_max, edx & CAT_COS_MAX_MASK);
> +
> +    /* cos=0 is reserved as default cbm(all ones). */
> +    feat->cos_reg_val[0] = (1ull << l3_cat.cbm_len) - 1;

Considering how cbm_len gets calculated a few lines up, I can't see
how this can end up being all ones (as the comment says). At most
this can be 0xffffffff (as a 64-bit value) afaics.

> +    feat->feature = PSR_SOCKET_L3_CAT;
> +    __set_bit(PSR_SOCKET_L3_CAT, &info->feat_mask);
> +
> +    feat->info.l3_cat_info = l3_cat;
> +
> +    info->nr_feat++;
> +
> +    /* Add this feature into list. */
> +    list_add_tail(&feat->list, &info->feat_list);
> +
> +    socket = cpu_to_socket(smp_processor_id());
> +    printk(XENLOG_INFO "L3 CAT: enabled on socket %u, cos_max:%u, cbm_len:%u\n",
> +           socket, feat->info.l3_cat_info.cos_max,
> +           feat->info.l3_cat_info.cbm_len);

I don't think we want such printed for every socket, at least not by
default. Please, if you want to keep it, make it dependent upon e.g.
opt_cpu_info.

> +}
> +
> +struct feat_ops l3_cat_ops = {

static const

> @@ -340,18 +414,113 @@ void psr_domain_free(struct domain *d)
>      psr_free_rmid(d);
>  }
>  
> -static int psr_cpu_prepare(unsigned int cpu)
> +static int cpu_prepare_work(unsigned int cpu)
>  {
> +    if ( !socket_info )
> +        return 0;
> +
> +    /* Malloc memory for the global feature head here. */
> +    if ( feat_l3_cat == NULL &&
> +         (feat_l3_cat = xzalloc(struct feat_node)) == NULL )
> +        return -ENOMEM;
> +
>      return 0;
>  }
>  
> +static void cpu_init_work(void)
> +{
> +    unsigned int eax, ebx, ecx, edx;
> +    struct psr_socket_info *info;
> +    unsigned int socket;
> +    unsigned int cpu = smp_processor_id();
> +    const struct cpuinfo_x86 *c = cpu_data + cpu;

Please use current_cpu_data instead of open coding it.

> +    struct feat_node *feat_tmp;

Looking at the uses, I don't think this is temporary in any way - why
not just "feat"?

> +    if ( !cpu_has(c, X86_FEATURE_PQE) || c->cpuid_level < PSR_CPUID_LEVEL_CAT )
> +        return;

Instead of such a double check, please consider clearing the PQE
feature bit when the maximum CPUID level is too low (which
shouldn't happen anyway).

> +    socket = cpu_to_socket(cpu);
> +    info = socket_info + socket;
> +    if ( info->feat_mask )
> +        return;
> +
> +    spin_lock_init(&info->ref_lock);
> +
> +    cpuid_count(PSR_CPUID_LEVEL_CAT, 0, &eax, &ebx, &ecx, &edx);
> +    if ( ebx & PSR_RESOURCE_TYPE_L3 )
> +    {
> +        cpuid_count(PSR_CPUID_LEVEL_CAT, 1, &eax, &ebx, &ecx, &edx);
> +
> +        feat_tmp = feat_l3_cat;
> +        feat_l3_cat = NULL;
> +        feat_tmp->ops = l3_cat_ops;
> +
> +        feat_tmp->ops.init_feature(eax, ebx, ecx, edx, feat_tmp, info);

What's the point of the indirect call here, when you know the
function is l3_cat_init_feature()?

> +static void cpu_fini_work(unsigned int cpu)
> +{
> +    unsigned int socket = cpu_to_socket(cpu);
> +
> +    if ( !socket_cpumask[socket] || cpumask_empty(socket_cpumask[socket]) )
> +    {
> +        struct psr_socket_info *info = socket_info + socket;
> +
> +        free_feature(info);

Pointless local variable "info", unless later patches add further uses.

> +static void __init init_psr(void)
> +{
> +    unsigned int i;
> +
> +    if ( opt_cos_max < 1 )
> +    {
> +        printk(XENLOG_INFO "CAT: disabled, cos_max is too small\n");
> +        return;
> +    }
> +
> +    socket_info = xzalloc_array(struct psr_socket_info, nr_sockets);
> +
> +    if ( !socket_info )
> +    {
> +        printk(XENLOG_INFO "Fail to alloc socket_info!\n");
> +        return;
> +    }
> +
> +    for ( i = 0; i < nr_sockets; i++ )
> +        INIT_LIST_HEAD(&socket_info[i].feat_list);

Please decide for one central place where to do such initialization:
This and spin_lock_init() really should live together (and I think
better there, not here).

> +static int psr_cpu_prepare(unsigned int cpu)
> +{
> +    return cpu_prepare_work(cpu);
> +}

What is this wrapper good for?

Jan
Yi Sun Jan. 11, 2017, 3:14 a.m. UTC | #2
On 17-01-10 04:45:05, Jan Beulich wrote:
> >>> On 14.12.16 at 05:07, <yi.y.sun@linux.intel.com> wrote:
> > @@ -141,11 +144,79 @@ struct psr_assoc {
> >  
> >  struct psr_cmt *__read_mostly psr_cmt;
> >  
> > +static struct psr_socket_info *__read_mostly socket_info;
> > +
> >  static unsigned int opt_psr;
> >  static unsigned int __initdata opt_rmid_max = 255;
> > +static unsigned int __read_mostly opt_cos_max = MAX_COS_REG_CNT;
> >  static uint64_t rmid_mask;
> >  static DEFINE_PER_CPU(struct psr_assoc, psr_assoc);
> >  
> > +/* Declare feature list entry. */
> > +static struct feat_node *feat_l3_cat;
> 
> Hmm, if you indeed (again) need such a helper object, then please
> make the comment actually say so. As it is, the comment is mostly
> meaningless.
> 
Thanks! Will add more comments to explain it.

> > +/* Common functions. */
> > +static void free_feature(struct psr_socket_info *info)
> > +{
> > +    struct feat_node *feat_tmp;
> > +
> > +    if ( !info )
> > +        return;
> > +
> > +    list_for_each_entry(feat_tmp, &info->feat_list, list)
> > +    {
> > +        clear_bit(feat_tmp->feature, &info->feat_mask);
> > +        list_del(&feat_tmp->list);
> > +        xfree(feat_tmp);
> > +    }
> 
> This requires list_for_each_entry_safe() to be used, to avoid a
> use-after-free issue (or alternatively a while(!list_empty()) loop).
> 
Thanks for the suggestion!

> > +    /* Free feature which are not added into feat_list. */
> > +    if ( feat_l3_cat )
> > +    {
> > +        xfree(feat_l3_cat);
> > +        feat_l3_cat = NULL;
> > +    }
> 
> Why don't you leave this around, avoiding the need for an
> allocation the next time a CPU comes online? Also note that xfree()
> deals fine with a NULL input, so conditionals like this are pointless.
> 
Thanks! Will keep them.

> > +/* L3 CAT callback functions implementation. */
> > +static void l3_cat_init_feature(unsigned int eax, unsigned int ebx,
> > +                                unsigned int ecx, unsigned int edx,
> 
> This is rather unfortunate naming: How does the reader of this code
> know what these values represent, without having to first go look in
> the caller?
> 
Do you mean the 'eax'-'edx'? How about 'eax_register'?

> > +                                struct feat_node *feat,
> > +                                struct psr_socket_info *info)
> > +{
> > +    struct psr_cat_hw_info l3_cat;
> > +    unsigned int socket;
> > +
> > +    /* No valid value so do not enable feature. */
> > +    if ( !eax || !edx )
> > +        return;
> > +
> > +    l3_cat.cbm_len = (eax & CAT_CBM_LEN_MASK) + 1;
> > +    l3_cat.cos_max = min(opt_cos_max, edx & CAT_COS_MAX_MASK);
> > +
> > +    /* cos=0 is reserved as default cbm(all ones). */
> > +    feat->cos_reg_val[0] = (1ull << l3_cat.cbm_len) - 1;
> 
> Considering how cbm_len gets calculated a few lines up, I can't see
> how this can end up being all ones (as the comment says). At most
> this can be 0xffffffff (as a 64-bit value) afaics.
> 
Sorry for the confusion. All one means all bits within cbm_len are 1. E.g.
the cbm_len is 11. Then, value of cos_reg_val[0] is '(1 << 11) - 1', equals
0x7ff.

Will correct the comment.

> > +    feat->feature = PSR_SOCKET_L3_CAT;
> > +    __set_bit(PSR_SOCKET_L3_CAT, &info->feat_mask);
> > +
> > +    feat->info.l3_cat_info = l3_cat;
> > +
> > +    info->nr_feat++;
> > +
> > +    /* Add this feature into list. */
> > +    list_add_tail(&feat->list, &info->feat_list);
> > +
> > +    socket = cpu_to_socket(smp_processor_id());
> > +    printk(XENLOG_INFO "L3 CAT: enabled on socket %u, cos_max:%u, cbm_len:%u\n",
> > +           socket, feat->info.l3_cat_info.cos_max,
> > +           feat->info.l3_cat_info.cbm_len);
> 
> I don't think we want such printed for every socket, at least not by
> default. Please, if you want to keep it, make it dependent upon e.g.
> opt_cpu_info.
> 
Thanks! Will limit the print by opt_cpu_info.

> > +}
> > +
> > +struct feat_ops l3_cat_ops = {
> 
> static const
> 
Ok, thanks!

> > @@ -340,18 +414,113 @@ void psr_domain_free(struct domain *d)
> >      psr_free_rmid(d);
> >  }
> >  
> > -static int psr_cpu_prepare(unsigned int cpu)
> > +static int cpu_prepare_work(unsigned int cpu)
> >  {
> > +    if ( !socket_info )
> > +        return 0;
> > +
> > +    /* Malloc memory for the global feature head here. */
> > +    if ( feat_l3_cat == NULL &&
> > +         (feat_l3_cat = xzalloc(struct feat_node)) == NULL )
> > +        return -ENOMEM;
> > +
> >      return 0;
> >  }
> >  
> > +static void cpu_init_work(void)
> > +{
> > +    unsigned int eax, ebx, ecx, edx;
> > +    struct psr_socket_info *info;
> > +    unsigned int socket;
> > +    unsigned int cpu = smp_processor_id();
> > +    const struct cpuinfo_x86 *c = cpu_data + cpu;
> 
> Please use current_cpu_data instead of open coding it.
> 
Thanks for the suggestion!

> > +    struct feat_node *feat_tmp;
> 
> Looking at the uses, I don't think this is temporary in any way - why
> not just "feat"?
> 
No problem, thanks!

> > +    if ( !cpu_has(c, X86_FEATURE_PQE) || c->cpuid_level < PSR_CPUID_LEVEL_CAT )
> > +        return;
> 
> Instead of such a double check, please consider clearing the PQE
> feature bit when the maximum CPUID level is too low (which
> shouldn't happen anyway).
> 
Is this the responsibility of psr.c? X86_FEATURE_PQE bit is set by HW. Even the
bit is set but CPUID level is low, I think SW would be better to keep it but
not clear it. Because it indicates the HW capability. How do you think? Thanks!

> > +    socket = cpu_to_socket(cpu);
> > +    info = socket_info + socket;
> > +    if ( info->feat_mask )
> > +        return;
> > +
> > +    spin_lock_init(&info->ref_lock);
> > +
> > +    cpuid_count(PSR_CPUID_LEVEL_CAT, 0, &eax, &ebx, &ecx, &edx);
> > +    if ( ebx & PSR_RESOURCE_TYPE_L3 )
> > +    {
> > +        cpuid_count(PSR_CPUID_LEVEL_CAT, 1, &eax, &ebx, &ecx, &edx);
> > +
> > +        feat_tmp = feat_l3_cat;
> > +        feat_l3_cat = NULL;
> > +        feat_tmp->ops = l3_cat_ops;
> > +
> > +        feat_tmp->ops.init_feature(eax, ebx, ecx, edx, feat_tmp, info);
> 
> What's the point of the indirect call here, when you know the
> function is l3_cat_init_feature()?
> 
Hmm, just want to keep the callback function calling style.

> > +static void cpu_fini_work(unsigned int cpu)
> > +{
> > +    unsigned int socket = cpu_to_socket(cpu);
> > +
> > +    if ( !socket_cpumask[socket] || cpumask_empty(socket_cpumask[socket]) )
> > +    {
> > +        struct psr_socket_info *info = socket_info + socket;
> > +
> > +        free_feature(info);
> 
> Pointless local variable "info", unless later patches add further uses.
> 
Ok, will remove this variable. Thanks!

> > +static void __init init_psr(void)
> > +{
> > +    unsigned int i;
> > +
> > +    if ( opt_cos_max < 1 )
> > +    {
> > +        printk(XENLOG_INFO "CAT: disabled, cos_max is too small\n");
> > +        return;
> > +    }
> > +
> > +    socket_info = xzalloc_array(struct psr_socket_info, nr_sockets);
> > +
> > +    if ( !socket_info )
> > +    {
> > +        printk(XENLOG_INFO "Fail to alloc socket_info!\n");
> > +        return;
> > +    }
> > +
> > +    for ( i = 0; i < nr_sockets; i++ )
> > +        INIT_LIST_HEAD(&socket_info[i].feat_list);
> 
> Please decide for one central place where to do such initialization:
> This and spin_lock_init() really should live together (and I think
> better there, not here).
> 
Looks good, thanks!

> > +static int psr_cpu_prepare(unsigned int cpu)
> > +{
> > +    return cpu_prepare_work(cpu);
> > +}
> 
> What is this wrapper good for?
> 
Just keep the old codes.

> Jan
Jan Beulich Jan. 11, 2017, 1:48 p.m. UTC | #3
>>> On 11.01.17 at 04:14, <yi.y.sun@linux.intel.com> wrote:
> On 17-01-10 04:45:05, Jan Beulich wrote:
>> >>> On 14.12.16 at 05:07, <yi.y.sun@linux.intel.com> wrote:
>> > +/* L3 CAT callback functions implementation. */
>> > +static void l3_cat_init_feature(unsigned int eax, unsigned int ebx,
>> > +                                unsigned int ecx, unsigned int edx,
>> 
>> This is rather unfortunate naming: How does the reader of this code
>> know what these values represent, without having to first go look in
>> the caller?
>> 
> Do you mean the 'eax'-'edx'?

Yes.

> How about 'eax_register'?

How would that be any better? Perhaps the best way of making the
naming obvious would be to use the new cpuid_leaf structure here.

>> > +    if ( !cpu_has(c, X86_FEATURE_PQE) || c->cpuid_level < PSR_CPUID_LEVEL_CAT )
>> > +        return;
>> 
>> Instead of such a double check, please consider clearing the PQE
>> feature bit when the maximum CPUID level is too low (which
>> shouldn't happen anyway).
>> 
> Is this the responsibility of psr.c? X86_FEATURE_PQE bit is set by HW. Even the
> bit is set but CPUID level is low, I think SW would be better to keep it but
> not clear it. Because it indicates the HW capability. How do you think? 

What use if keeping the flag if we can't use the feature? And to
answer your first question - whether that's being done in psr.c,
cpu/common.c, or cpu/intel.c I don't really care all that much; it
would certainly feel most natural to go here.

>> > +    socket = cpu_to_socket(cpu);
>> > +    info = socket_info + socket;
>> > +    if ( info->feat_mask )
>> > +        return;
>> > +
>> > +    spin_lock_init(&info->ref_lock);
>> > +
>> > +    cpuid_count(PSR_CPUID_LEVEL_CAT, 0, &eax, &ebx, &ecx, &edx);
>> > +    if ( ebx & PSR_RESOURCE_TYPE_L3 )
>> > +    {
>> > +        cpuid_count(PSR_CPUID_LEVEL_CAT, 1, &eax, &ebx, &ecx, &edx);
>> > +
>> > +        feat_tmp = feat_l3_cat;
>> > +        feat_l3_cat = NULL;
>> > +        feat_tmp->ops = l3_cat_ops;
>> > +
>> > +        feat_tmp->ops.init_feature(eax, ebx, ecx, edx, feat_tmp, info);
>> 
>> What's the point of the indirect call here, when you know the
>> function is l3_cat_init_feature()?
>> 
> Hmm, just want to keep the callback function calling style.

Please don't use indirect calls when you don't need them.

>> > +static int psr_cpu_prepare(unsigned int cpu)
>> > +{
>> > +    return cpu_prepare_work(cpu);
>> > +}
>> 
>> What is this wrapper good for?
>> 
> Just keep the old codes.

Well, you're overhauling the old code anyway (and you're actively
adding this function here), so - please don't introduce pointless
wrappers like this. They only complicate anyone following call flow,
even if just slightly.

Jan
Yi Sun Jan. 12, 2017, 1:07 a.m. UTC | #4
On 17-01-11 06:48:27, Jan Beulich wrote:
> >>> On 11.01.17 at 04:14, <yi.y.sun@linux.intel.com> wrote:
> > On 17-01-10 04:45:05, Jan Beulich wrote:
> >> >>> On 14.12.16 at 05:07, <yi.y.sun@linux.intel.com> wrote:
> >> > +/* L3 CAT callback functions implementation. */
> >> > +static void l3_cat_init_feature(unsigned int eax, unsigned int ebx,
> >> > +                                unsigned int ecx, unsigned int edx,
> >> 
> >> This is rather unfortunate naming: How does the reader of this code
> >> know what these values represent, without having to first go look in
> >> the caller?
> >> 
> > Do you mean the 'eax'-'edx'?
> 
> Yes.
> 
> > How about 'eax_register'?
> 
> How would that be any better? Perhaps the best way of making the
> naming obvious would be to use the new cpuid_leaf structure here.
> 
Ok, will consider to assemble them into a structure.

> >> > +    if ( !cpu_has(c, X86_FEATURE_PQE) || c->cpuid_level < PSR_CPUID_LEVEL_CAT )
> >> > +        return;
> >> 
> >> Instead of such a double check, please consider clearing the PQE
> >> feature bit when the maximum CPUID level is too low (which
> >> shouldn't happen anyway).
> >> 
> > Is this the responsibility of psr.c? X86_FEATURE_PQE bit is set by HW. Even the
> > bit is set but CPUID level is low, I think SW would be better to keep it but
> > not clear it. Because it indicates the HW capability. How do you think? 
> 
> What use if keeping the flag if we can't use the feature? And to
> answer your first question - whether that's being done in psr.c,
> cpu/common.c, or cpu/intel.c I don't really care all that much; it
> would certainly feel most natural to go here.
> 
Ok, will consider it.

> >> > +    socket = cpu_to_socket(cpu);
> >> > +    info = socket_info + socket;
> >> > +    if ( info->feat_mask )
> >> > +        return;
> >> > +
> >> > +    spin_lock_init(&info->ref_lock);
> >> > +
> >> > +    cpuid_count(PSR_CPUID_LEVEL_CAT, 0, &eax, &ebx, &ecx, &edx);
> >> > +    if ( ebx & PSR_RESOURCE_TYPE_L3 )
> >> > +    {
> >> > +        cpuid_count(PSR_CPUID_LEVEL_CAT, 1, &eax, &ebx, &ecx, &edx);
> >> > +
> >> > +        feat_tmp = feat_l3_cat;
> >> > +        feat_l3_cat = NULL;
> >> > +        feat_tmp->ops = l3_cat_ops;
> >> > +
> >> > +        feat_tmp->ops.init_feature(eax, ebx, ecx, edx, feat_tmp, info);
> >> 
> >> What's the point of the indirect call here, when you know the
> >> function is l3_cat_init_feature()?
> >> 
> > Hmm, just want to keep the callback function calling style.
> 
> Please don't use indirect calls when you don't need them.
> 
Ok, thanks!

> >> > +static int psr_cpu_prepare(unsigned int cpu)
> >> > +{
> >> > +    return cpu_prepare_work(cpu);
> >> > +}
> >> 
> >> What is this wrapper good for?
> >> 
> > Just keep the old codes.
> 
> Well, you're overhauling the old code anyway (and you're actively
> adding this function here), so - please don't introduce pointless
> wrappers like this. They only complicate anyone following call flow,
> even if just slightly.
> 
Sure, thanks!

> Jan
diff mbox

Patch

diff --git a/xen/arch/x86/psr.c b/xen/arch/x86/psr.c
index 49a4598..fa9bc32 100644
--- a/xen/arch/x86/psr.c
+++ b/xen/arch/x86/psr.c
@@ -35,6 +35,9 @@ 
 #define PSR_CAT        (1<<1)
 #define PSR_CDP        (1<<2)
 
+#define CAT_CBM_LEN_MASK 0x1f
+#define CAT_COS_MAX_MASK 0xffff
+
 /*
  * Per SDM 17.17.3.3 'Cache Allocation Technology: Cache Mask Configuration',
  * the MSRs range from 0C90H through 0D0FH (inclusive), enables support for
@@ -141,11 +144,79 @@  struct psr_assoc {
 
 struct psr_cmt *__read_mostly psr_cmt;
 
+static struct psr_socket_info *__read_mostly socket_info;
+
 static unsigned int opt_psr;
 static unsigned int __initdata opt_rmid_max = 255;
+static unsigned int __read_mostly opt_cos_max = MAX_COS_REG_CNT;
 static uint64_t rmid_mask;
 static DEFINE_PER_CPU(struct psr_assoc, psr_assoc);
 
+/* Declare feature list entry. */
+static struct feat_node *feat_l3_cat;
+
+/* Common functions. */
+static void free_feature(struct psr_socket_info *info)
+{
+    struct feat_node *feat_tmp;
+
+    if ( !info )
+        return;
+
+    list_for_each_entry(feat_tmp, &info->feat_list, list)
+    {
+        clear_bit(feat_tmp->feature, &info->feat_mask);
+        list_del(&feat_tmp->list);
+        xfree(feat_tmp);
+    }
+
+    /* Free feature which are not added into feat_list. */
+    if ( feat_l3_cat )
+    {
+        xfree(feat_l3_cat);
+        feat_l3_cat = NULL;
+    }
+}
+
+/* L3 CAT callback functions implementation. */
+static void l3_cat_init_feature(unsigned int eax, unsigned int ebx,
+                                unsigned int ecx, unsigned int edx,
+                                struct feat_node *feat,
+                                struct psr_socket_info *info)
+{
+    struct psr_cat_hw_info l3_cat;
+    unsigned int socket;
+
+    /* No valid value so do not enable feature. */
+    if ( !eax || !edx )
+        return;
+
+    l3_cat.cbm_len = (eax & CAT_CBM_LEN_MASK) + 1;
+    l3_cat.cos_max = min(opt_cos_max, edx & CAT_COS_MAX_MASK);
+
+    /* cos=0 is reserved as default cbm(all ones). */
+    feat->cos_reg_val[0] = (1ull << l3_cat.cbm_len) - 1;
+
+    feat->feature = PSR_SOCKET_L3_CAT;
+    __set_bit(PSR_SOCKET_L3_CAT, &info->feat_mask);
+
+    feat->info.l3_cat_info = l3_cat;
+
+    info->nr_feat++;
+
+    /* Add this feature into list. */
+    list_add_tail(&feat->list, &info->feat_list);
+
+    socket = cpu_to_socket(smp_processor_id());
+    printk(XENLOG_INFO "L3 CAT: enabled on socket %u, cos_max:%u, cbm_len:%u\n",
+           socket, feat->info.l3_cat_info.cos_max,
+           feat->info.l3_cat_info.cbm_len);
+}
+
+struct feat_ops l3_cat_ops = {
+    .init_feature = l3_cat_init_feature,
+};
+
 static void __init parse_psr_bool(char *s, char *value, char *feature,
                                   unsigned int mask)
 {
@@ -185,6 +256,9 @@  static void __init parse_psr_param(char *s)
         if ( val_str && !strcmp(s, "rmid_max") )
             opt_rmid_max = simple_strtoul(val_str, NULL, 0);
 
+        if ( val_str && !strcmp(s, "cos_max") )
+            opt_cos_max = simple_strtoul(val_str, NULL, 0);
+
         s = ss + 1;
     } while ( ss );
 }
@@ -340,18 +414,113 @@  void psr_domain_free(struct domain *d)
     psr_free_rmid(d);
 }
 
-static int psr_cpu_prepare(unsigned int cpu)
+static int cpu_prepare_work(unsigned int cpu)
 {
+    if ( !socket_info )
+        return 0;
+
+    /* Malloc memory for the global feature head here. */
+    if ( feat_l3_cat == NULL &&
+         (feat_l3_cat = xzalloc(struct feat_node)) == NULL )
+        return -ENOMEM;
+
     return 0;
 }
 
+static void cpu_init_work(void)
+{
+    unsigned int eax, ebx, ecx, edx;
+    struct psr_socket_info *info;
+    unsigned int socket;
+    unsigned int cpu = smp_processor_id();
+    const struct cpuinfo_x86 *c = cpu_data + cpu;
+    struct feat_node *feat_tmp;
+
+    if ( !cpu_has(c, X86_FEATURE_PQE) || c->cpuid_level < PSR_CPUID_LEVEL_CAT )
+        return;
+
+    socket = cpu_to_socket(cpu);
+    info = socket_info + socket;
+    if ( info->feat_mask )
+        return;
+
+    spin_lock_init(&info->ref_lock);
+
+    cpuid_count(PSR_CPUID_LEVEL_CAT, 0, &eax, &ebx, &ecx, &edx);
+    if ( ebx & PSR_RESOURCE_TYPE_L3 )
+    {
+        cpuid_count(PSR_CPUID_LEVEL_CAT, 1, &eax, &ebx, &ecx, &edx);
+
+        feat_tmp = feat_l3_cat;
+        feat_l3_cat = NULL;
+        feat_tmp->ops = l3_cat_ops;
+
+        feat_tmp->ops.init_feature(eax, ebx, ecx, edx, feat_tmp, info);
+    }
+}
+
+static void cpu_fini_work(unsigned int cpu)
+{
+    unsigned int socket = cpu_to_socket(cpu);
+
+    if ( !socket_cpumask[socket] || cpumask_empty(socket_cpumask[socket]) )
+    {
+        struct psr_socket_info *info = socket_info + socket;
+
+        free_feature(info);
+    }
+}
+
+static void __init init_psr(void)
+{
+    unsigned int i;
+
+    if ( opt_cos_max < 1 )
+    {
+        printk(XENLOG_INFO "CAT: disabled, cos_max is too small\n");
+        return;
+    }
+
+    socket_info = xzalloc_array(struct psr_socket_info, nr_sockets);
+
+    if ( !socket_info )
+    {
+        printk(XENLOG_INFO "Fail to alloc socket_info!\n");
+        return;
+    }
+
+    for ( i = 0; i < nr_sockets; i++ )
+        INIT_LIST_HEAD(&socket_info[i].feat_list);
+}
+
+static void __init psr_free(void)
+{
+    unsigned int i;
+
+    for ( i = 0; i < nr_sockets; i++ )
+        free_feature(&socket_info[i]);
+
+    xfree(socket_info);
+    socket_info = NULL;
+}
+
+static int psr_cpu_prepare(unsigned int cpu)
+{
+    return cpu_prepare_work(cpu);
+}
+
 static void psr_cpu_init(void)
 {
+    if ( socket_info )
+        cpu_init_work();
+
     psr_assoc_init();
 }
 
 static void psr_cpu_fini(unsigned int cpu)
 {
+    if ( socket_info )
+        cpu_fini_work(cpu);
     return;
 }
 
@@ -393,10 +562,14 @@  static int __init psr_presmp_init(void)
     if ( (opt_psr & PSR_CMT) && opt_rmid_max )
         init_psr_cmt(opt_rmid_max);
 
-    psr_cpu_prepare(0);
+    if ( opt_psr & PSR_CAT )
+        init_psr();
+
+    if ( psr_cpu_prepare(0) )
+        psr_free();
 
     psr_cpu_init();
-    if ( psr_cmt_enabled() )
+    if ( psr_cmt_enabled() || socket_info )
         register_cpu_notifier(&cpu_nfb);
 
     return 0;