Message ID | 1503537289-56036-8-git-send-email-yi.y.sun@linux.intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Thu, Aug 24, 2017 at 09:14:41AM +0800, Yi Sun wrote: > This patch implements set value flow for MBA including its callback > function and domctl interface. > > It also changes the memebers in 'cos_write_info' to transfer the > feature array, feature properties array and value array. Then, we > can write all features values on the cos id into MSRs. > > Because multiple features may co-exist, we need handle all features to write > values of them into a COS register with new COS ID. E.g: > 1. L3 CAT and MBA co-exist. > 2. Dom1 and Dom2 share a same COS ID (2). The L3 CAT CBM of Dom1 is 0x1ff, > the MBA Thrtle of Dom1 is 0xa. > 3. User wants to change MBA Thrtl of Dom1 to be 0x14. Because COS ID 2 is > used by Dom2 too, we have to pick a new COS ID 3. The original values of > Dom1 on COS ID 3 may be below: What original values? You said you pick COS ID 3, because I think it's assumed to be empty? In which case there are no original values in COS ID 3. > --------- > | COS 3 | > --------- > L3 CAT | 0x7ff | > --------- > MBA | 0x0 | > --------- > 4. After setting, the L3 CAT CBM value of Dom1 should be kept and the new MBA > Thrtl is set. So, the values on COS ID 3 should be below. > --------- > | COS 3 | > --------- > L3 CAT | 0x1ff | > --------- > MBA | 0x14 | > --------- > > So, we should write all features values into their MSRs. That requires the > feature array, feature properties array and value array are input. ^ as > > Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com> > --- > v2: > - remove linear mode 'thrtl_max' check in 'mba_check_thrtl' because it has > been checked in 'mba_init_feature'. > (suggested by Chao Peng) > - for non-linear mode, check if '*thrtl' is not 0 in 'mba_check_thrtl'. If > it is 0, we do not need to change it. > (suggested by Chao Peng) > - move comments to explain changes of 'cos_write_info' from psr.c to commit > message. > (suggested by Chao Peng) > --- > xen/arch/x86/domctl.c | 6 ++ > xen/arch/x86/psr.c | 150 ++++++++++++++++++++++++++++++-------------- > xen/include/public/domctl.h | 1 + > 3 files changed, 109 insertions(+), 48 deletions(-) > > diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c > index 4936bcb..0ae4799 100644 > --- a/xen/arch/x86/domctl.c > +++ b/xen/arch/x86/domctl.c > @@ -1468,6 +1468,12 @@ long arch_do_domctl( > PSR_VAL_TYPE_L2_CBM); > break; > > + case XEN_DOMCTL_PSR_MBA_OP_SET_THRTL: > + ret = psr_set_val(d, domctl->u.psr_alloc_op.target, > + domctl->u.psr_alloc_op.data, > + PSR_VAL_TYPE_MBA); > + break; > + > case XEN_DOMCTL_PSR_CAT_OP_GET_L3_CBM: > ret = psr_get_val(d, domctl->u.psr_alloc_op.target, > &val32, PSR_VAL_TYPE_L3_CBM); > diff --git a/xen/arch/x86/psr.c b/xen/arch/x86/psr.c > index 4a0c982..ce82975 100644 > --- a/xen/arch/x86/psr.c > +++ b/xen/arch/x86/psr.c > @@ -138,6 +138,12 @@ static const struct feat_props { > > /* write_msr is used to write out feature MSR register. */ > void (*write_msr)(unsigned int cos, uint32_t val, enum psr_val_type type); > + > + /* > + * check_val is used to check if input val fulfills SDM requirement. > + * Change it to valid value if SDM allows. I'm not really sure it's a good idea to change the value to a valid one, IMHO you should just check and print an error if the value is invalid (and return false of course). > + */ > + bool (*check_val)(const struct feat_node *feat, unsigned long *val); > } *feat_props[FEAT_TYPE_NUM]; > > /* > @@ -275,29 +281,6 @@ static enum psr_feat_type psr_val_type_to_feat_type(enum psr_val_type type) > return feat_type; > } > > -static bool psr_check_cbm(unsigned int cbm_len, unsigned long cbm) > -{ > - unsigned int first_bit, zero_bit; > - > - /* Set bits should only in the range of [0, cbm_len]. */ > - if ( cbm & (~0ul << cbm_len) ) > - return false; > - > - /* At least one bit need to be set. */ > - if ( cbm == 0 ) > - return false; > - > - first_bit = find_first_bit(&cbm, cbm_len); > - zero_bit = find_next_zero_bit(&cbm, cbm_len, first_bit); > - > - /* Set bits should be contiguous. */ > - if ( zero_bit < cbm_len && > - find_next_bit(&cbm, cbm_len, zero_bit) < cbm_len ) > - return false; > - > - return true; > -} > - > /* Implementation of allocation features' functions. */ > static int cat_init_feature(const struct cpuid_leaf *regs, > struct feat_node *feat, > @@ -433,6 +416,30 @@ static bool cat_get_feat_info(const struct feat_node *feat, > return true; > } > > +static bool cat_check_cbm(const struct feat_node *feat, unsigned long *cbm) > +{ > + unsigned int first_bit, zero_bit; > + unsigned int cbm_len = feat->cat_info.cbm_len; > + > + /* Set bits should only in the range of [0, cbm_len]. */ > + if ( *cbm & (~0ul << cbm_len) ) > + return false; > + > + /* At least one bit need to be set. */ > + if ( *cbm == 0 ) > + return false; > + > + first_bit = find_first_bit(cbm, cbm_len); > + zero_bit = find_next_zero_bit(cbm, cbm_len, first_bit); > + > + /* Set bits should be contiguous. */ > + if ( zero_bit < cbm_len && > + find_next_bit(cbm, cbm_len, zero_bit) < cbm_len ) > + return false; > + > + return true; > +} > + > /* L3 CAT props */ > static void l3_cat_write_msr(unsigned int cos, uint32_t val, > enum psr_val_type type) > @@ -446,6 +453,7 @@ static const struct feat_props l3_cat_props = { > .alt_type = PSR_VAL_TYPE_UNKNOWN, > .get_feat_info = cat_get_feat_info, > .write_msr = l3_cat_write_msr, > + .check_val = cat_check_cbm, > }; Maybe the introduction of check_val should be a separate patch? It's mostly code movement and some fixup. > /* L3 CDP props */ > @@ -476,6 +484,7 @@ static const struct feat_props l3_cdp_props = { > .alt_type = PSR_VAL_TYPE_L3_CBM, > .get_feat_info = l3_cdp_get_feat_info, > .write_msr = l3_cdp_write_msr, > + .check_val = cat_check_cbm, > }; > > /* L2 CAT props */ > @@ -491,6 +500,7 @@ static const struct feat_props l2_cat_props = { > .alt_type = PSR_VAL_TYPE_UNKNOWN, > .get_feat_info = cat_get_feat_info, > .write_msr = l2_cat_write_msr, > + .check_val = cat_check_cbm, > }; > > /* MBA props */ > @@ -514,6 +524,40 @@ static bool mba_get_feat_info(const struct feat_node *feat, > static void mba_write_msr(unsigned int cos, uint32_t val, > enum psr_val_type type) > { > + wrmsrl(MSR_IA32_PSR_MBA_MASK(cos), val); > +} > + > +static bool mba_check_thrtl(const struct feat_node *feat, unsigned long *thrtl) > +{ > + if ( *thrtl > feat->mba_info.thrtl_max ) > + return false; > + > + /* > + * Per SDM (chapter "Memory Bandwidth Allocation Configuration"): > + * 1. Linear mode: In the linear mode the input precision is defined > + * as 100-(MBA_MAX). For instance, if the MBA_MAX value is 90, the > + * input precision is 10%. Values not an even multiple of the > + * precision (e.g., 12%) will be rounded down (e.g., to 10% delay > + * applied). > + * 2. Non-linear mode: Input delay values are powers-of-two from zero > + * to the MBA_MAX value from CPUID. In this case any values not a > + * power of two will be rounded down the next nearest power of two. > + */ > + if ( feat->mba_info.linear ) > + { > + unsigned int mod; > + > + mod = *thrtl % (100 - feat->mba_info.thrtl_max); > + *thrtl -= mod; > + } > + else > + { > + /* Not power of 2. */ > + if ( *thrtl && (*thrtl & (*thrtl - 1)) ) This can be joined with the else to avoid another indentation level: else if ( *thrtl && (*thrtl & (*thrtl - 1)) ) ... > + *thrtl = *thrtl & (1 << (flsl(*thrtl) - 1)); > + } > + > + return true; > } > > static const struct feat_props mba_props = { > @@ -522,6 +566,7 @@ static const struct feat_props mba_props = { > .alt_type = PSR_VAL_TYPE_UNKNOWN, > .get_feat_info = mba_get_feat_info, > .write_msr = mba_write_msr, > + .check_val = mba_check_thrtl, > }; > > static void __init parse_psr_bool(char *s, char *value, char *feature, > @@ -942,6 +987,7 @@ static int insert_val_into_array(uint32_t val[], > const struct feat_node *feat; > const struct feat_props *props; > unsigned int i; > + unsigned long check_val = new_val; > int ret; > > ASSERT(feat_type < FEAT_TYPE_NUM); > @@ -966,9 +1012,11 @@ static int insert_val_into_array(uint32_t val[], > if ( array_len < props->cos_num ) > return -ENOSPC; > > - if ( !psr_check_cbm(feat->cat_info.cbm_len, new_val) ) > + if ( !props->check_val(feat, &check_val) ) > return -EINVAL; > > + new_val = check_val; > + > /* > * Value setting position is same as feature array. > * For CDP, user may set both DATA and CODE to same value. For such case, > @@ -1198,25 +1246,42 @@ static unsigned int get_socket_cpu(unsigned int socket) > struct cos_write_info > { > unsigned int cos; > - struct feat_node *feature; > + struct feat_node **features; > const uint32_t *val; > - const struct feat_props *props; > + unsigned int array_len; > + const struct feat_props **props; > }; > > static void do_write_psr_msrs(void *data) > { > const struct cos_write_info *info = data; > - struct feat_node *feat = info->feature; > - const struct feat_props *props = info->props; > - unsigned int i, cos = info->cos, cos_num = props->cos_num; > + unsigned int i, j, index = 0, array_len = info->array_len, cos = info->cos; > + const uint32_t *val_array = info->val; > > - for ( i = 0; i < cos_num; i++ ) > + for ( i = 0; i < ARRAY_SIZE(feat_props); i++ ) > { index and j can be defined here, they are only used inside of this for loop AFAICT. > - if ( feat->cos_reg_val[cos * cos_num + i] != info->val[i] ) > + struct feat_node *feat = info->features[i]; > + const struct feat_props *props = info->props[i]; > + unsigned int cos_num; > + > + if ( !feat || !props ) > + continue; > + > + cos_num = props->cos_num; > + if ( array_len < cos_num ) > + return; > + > + for ( j = 0; j < cos_num; j++ ) > { > - feat->cos_reg_val[cos * cos_num + i] = info->val[i]; > - props->write_msr(cos, info->val[i], props->type[i]); > + if ( feat->cos_reg_val[cos * cos_num + j] != val_array[index + j] ) > + { > + feat->cos_reg_val[cos * cos_num + j] = val_array[index + j]; > + props->write_msr(cos, val_array[index + j], props->type[j]); > + } > } > + > + array_len -= cos_num; > + index += cos_num; > } > } > > @@ -1224,30 +1289,19 @@ static int write_psr_msrs(unsigned int socket, unsigned int cos, > const uint32_t val[], unsigned int array_len, > enum psr_feat_type feat_type) > { > - int ret; > struct psr_socket_info *info = get_socket_info(socket); info should probably be const here. > struct cos_write_info data = > { > .cos = cos, > - .feature = info->features[feat_type], > - .props = feat_props[feat_type], > + .features = info->features, > + .val = val, > + .array_len = array_len, > + .props = feat_props, > }; AFAICT data should also be const, but I guess this is not going to work because on_selected_cpus expects a non-const payload? Thanks, Roger.
On 17-08-30 09:31:04, Roger Pau Monn� wrote: > On Thu, Aug 24, 2017 at 09:14:41AM +0800, Yi Sun wrote: > > It also changes the memebers in 'cos_write_info' to transfer the > > feature array, feature properties array and value array. Then, we > > can write all features values on the cos id into MSRs. > > > > Because multiple features may co-exist, we need handle all features to write > > values of them into a COS register with new COS ID. E.g: > > 1. L3 CAT and MBA co-exist. > > 2. Dom1 and Dom2 share a same COS ID (2). The L3 CAT CBM of Dom1 is 0x1ff, > > the MBA Thrtle of Dom1 is 0xa. > > 3. User wants to change MBA Thrtl of Dom1 to be 0x14. Because COS ID 2 is > > used by Dom2 too, we have to pick a new COS ID 3. The original values of > > Dom1 on COS ID 3 may be below: > > What original values? You said you pick COS ID 3, because I think it's > assumed to be empty? In which case there are no original values in COS > ID 3. > Sorry for confusion. The original value means the default value. For CAT, it is 0x7ff on my machine which is shown below. > > --------- > > | COS 3 | > > --------- > > L3 CAT | 0x7ff | > > --------- > > MBA | 0x0 | > > --------- > > diff --git a/xen/arch/x86/psr.c b/xen/arch/x86/psr.c > > index 4a0c982..ce82975 100644 > > --- a/xen/arch/x86/psr.c > > +++ b/xen/arch/x86/psr.c > > @@ -138,6 +138,12 @@ static const struct feat_props { > > > > /* write_msr is used to write out feature MSR register. */ > > void (*write_msr)(unsigned int cos, uint32_t val, enum psr_val_type type); > > + > > + /* > > + * check_val is used to check if input val fulfills SDM requirement. > > + * Change it to valid value if SDM allows. > > I'm not really sure it's a good idea to change the value to a valid > one, IMHO you should just check and print an error if the value is > invalid (and return false of course). > Per SDM, the HW has ability to automatically change the input value to what it wants. E.g: Linear mode: HW wants the input value be 10/20/30/.../90. But user inputs 15. Then, HW can automatically change it to 10. Even user inputs a value that does not fulfill HW requirement, HW can handle it. So, we do not need return error to user. Otherwise, user needs to know details of MBA. But the issue here is how we get the actual value and show it to user. There are two ways to do that: 1. When setting value, check and change it to valid one and save it to our cache. 2. When getting value, call rdmsr to read the actual value back from HW. I think option 1 has better performance and the code looks better. > > + */ > > + bool (*check_val)(const struct feat_node *feat, unsigned long *val); > > } *feat_props[FEAT_TYPE_NUM]; > > [...] > > /* L3 CAT props */ > > static void l3_cat_write_msr(unsigned int cos, uint32_t val, > > enum psr_val_type type) > > @@ -446,6 +453,7 @@ static const struct feat_props l3_cat_props = { > > .alt_type = PSR_VAL_TYPE_UNKNOWN, > > .get_feat_info = cat_get_feat_info, > > .write_msr = l3_cat_write_msr, > > + .check_val = cat_check_cbm, > > }; > > Maybe the introduction of check_val should be a separate patch? It's > mostly code movement and some fixup. > Ok, may consider it. [...] > > +static bool mba_check_thrtl(const struct feat_node *feat, unsigned long *thrtl) > > +{ > > + if ( *thrtl > feat->mba_info.thrtl_max ) > > + return false; > > + > > + /* > > + * Per SDM (chapter "Memory Bandwidth Allocation Configuration"): > > + * 1. Linear mode: In the linear mode the input precision is defined > > + * as 100-(MBA_MAX). For instance, if the MBA_MAX value is 90, the > > + * input precision is 10%. Values not an even multiple of the > > + * precision (e.g., 12%) will be rounded down (e.g., to 10% delay > > + * applied). > > + * 2. Non-linear mode: Input delay values are powers-of-two from zero > > + * to the MBA_MAX value from CPUID. In this case any values not a > > + * power of two will be rounded down the next nearest power of two. > > + */ > > + if ( feat->mba_info.linear ) > > + { > > + unsigned int mod; > > + > > + mod = *thrtl % (100 - feat->mba_info.thrtl_max); > > + *thrtl -= mod; > > + } > > + else > > + { > > + /* Not power of 2. */ > > + if ( *thrtl && (*thrtl & (*thrtl - 1)) ) > > This can be joined with the else to avoid another indentation level: > > else if ( *thrtl && (*thrtl & (*thrtl - 1)) ) > ... > Thanks! > > + *thrtl = *thrtl & (1 << (flsl(*thrtl) - 1)); > > + } > > + > > + return true; > > } > > [...] > > static void do_write_psr_msrs(void *data) > > { > > const struct cos_write_info *info = data; > > - struct feat_node *feat = info->feature; > > - const struct feat_props *props = info->props; > > - unsigned int i, cos = info->cos, cos_num = props->cos_num; > > + unsigned int i, j, index = 0, array_len = info->array_len, cos = info->cos; > > + const uint32_t *val_array = info->val; > > > > - for ( i = 0; i < cos_num; i++ ) > > + for ( i = 0; i < ARRAY_SIZE(feat_props); i++ ) > > { > > index and j can be defined here, they are only used inside of this for > loop AFAICT. > I think definition of j can be moved into the loop. But index cannot unless I declared it to be 'static'. The index is used as a accumulator. > > - if ( feat->cos_reg_val[cos * cos_num + i] != info->val[i] ) > > + struct feat_node *feat = info->features[i]; > > + const struct feat_props *props = info->props[i]; > > + unsigned int cos_num; > > + > > + if ( !feat || !props ) > > + continue; > > + > > + cos_num = props->cos_num; > > + if ( array_len < cos_num ) > > + return; > > + > > + for ( j = 0; j < cos_num; j++ ) > > { > > - feat->cos_reg_val[cos * cos_num + i] = info->val[i]; > > - props->write_msr(cos, info->val[i], props->type[i]); > > + if ( feat->cos_reg_val[cos * cos_num + j] != val_array[index + j] ) > > + { > > + feat->cos_reg_val[cos * cos_num + j] = val_array[index + j]; > > + props->write_msr(cos, val_array[index + j], props->type[j]); > > + } > > } > > + > > + array_len -= cos_num; > > + index += cos_num; > > } > > } > > > > @@ -1224,30 +1289,19 @@ static int write_psr_msrs(unsigned int socket, unsigned int cos, > > const uint32_t val[], unsigned int array_len, > > enum psr_feat_type feat_type) > > { > > - int ret; > > struct psr_socket_info *info = get_socket_info(socket); > > info should probably be const here. > info could not be const. Because 'features' in 'cos_write_info' could not be const. > > struct cos_write_info data = > > { > > .cos = cos, > > - .feature = info->features[feat_type], > > - .props = feat_props[feat_type], > > + .features = info->features, > > + .val = val, > > + .array_len = array_len, > > + .props = feat_props, > > }; > > AFAICT data should also be const, but I guess this is not going to > work because on_selected_cpus expects a non-const payload? > Right, data cannot be const per 'on_selected_cpus' requirement. > Thanks, Roger.
On Thu, Aug 31, 2017 at 10:20:10AM +0800, Yi Sun wrote: > On 17-08-30 09:31:04, Roger Pau Monn� wrote: > > On Thu, Aug 24, 2017 at 09:14:41AM +0800, Yi Sun wrote: > > > diff --git a/xen/arch/x86/psr.c b/xen/arch/x86/psr.c > > > index 4a0c982..ce82975 100644 > > > --- a/xen/arch/x86/psr.c > > > +++ b/xen/arch/x86/psr.c > > > @@ -138,6 +138,12 @@ static const struct feat_props { > > > > > > /* write_msr is used to write out feature MSR register. */ > > > void (*write_msr)(unsigned int cos, uint32_t val, enum psr_val_type type); > > > + > > > + /* > > > + * check_val is used to check if input val fulfills SDM requirement. > > > + * Change it to valid value if SDM allows. > > > > I'm not really sure it's a good idea to change the value to a valid > > one, IMHO you should just check and print an error if the value is > > invalid (and return false of course). > > > Per SDM, the HW has ability to automatically change the input value to what it > wants. E.g: > Linear mode: HW wants the input value be 10/20/30/.../90. But user inputs 15. > Then, HW can automatically change it to 10. This seems like a very, very bad idea IMHO. What if I input 15 (which is invalid), but I would rather prefer 20 instead of 10? I would very much prefer an error, so I can get to chose a valid value myself instead of the hardware deciding on my back. > Even user inputs a value that does not fulfill HW requirement, HW can handle it. > So, we do not need return error to user. Otherwise, user needs to know details > of MBA. > > But the issue here is how we get the actual value and show it to user. There are > two ways to do that: > 1. When setting value, check and change it to valid one and save it to our cache. Instead of performing those checks manually, why not simply write the value and read it back to see the one the hardware has actually chosen (and cache it)? Is there any risk in writing an invalid value? > 2. When getting value, call rdmsr to read the actual value back from HW. > > I think option 1 has better performance and the code looks better. > > > > + */ > > > + bool (*check_val)(const struct feat_node *feat, unsigned long *val); > > > } *feat_props[FEAT_TYPE_NUM]; > > > > [...] > > > > /* L3 CAT props */ > > > static void l3_cat_write_msr(unsigned int cos, uint32_t val, > > > enum psr_val_type type) > > > @@ -446,6 +453,7 @@ static const struct feat_props l3_cat_props = { > > > .alt_type = PSR_VAL_TYPE_UNKNOWN, > > > .get_feat_info = cat_get_feat_info, > > > .write_msr = l3_cat_write_msr, > > > + .check_val = cat_check_cbm, > > > }; > > > > Maybe the introduction of check_val should be a separate patch? It's > > mostly code movement and some fixup. > > > Ok, may consider it. > > [...] > > > > +static bool mba_check_thrtl(const struct feat_node *feat, unsigned long *thrtl) > > > +{ > > > + if ( *thrtl > feat->mba_info.thrtl_max ) > > > + return false; > > > + > > > + /* > > > + * Per SDM (chapter "Memory Bandwidth Allocation Configuration"): > > > + * 1. Linear mode: In the linear mode the input precision is defined > > > + * as 100-(MBA_MAX). For instance, if the MBA_MAX value is 90, the > > > + * input precision is 10%. Values not an even multiple of the > > > + * precision (e.g., 12%) will be rounded down (e.g., to 10% delay > > > + * applied). > > > + * 2. Non-linear mode: Input delay values are powers-of-two from zero > > > + * to the MBA_MAX value from CPUID. In this case any values not a > > > + * power of two will be rounded down the next nearest power of two. > > > + */ > > > + if ( feat->mba_info.linear ) > > > + { > > > + unsigned int mod; > > > + > > > + mod = *thrtl % (100 - feat->mba_info.thrtl_max); > > > + *thrtl -= mod; > > > + } > > > + else > > > + { > > > + /* Not power of 2. */ > > > + if ( *thrtl && (*thrtl & (*thrtl - 1)) ) > > > > This can be joined with the else to avoid another indentation level: > > > > else if ( *thrtl && (*thrtl & (*thrtl - 1)) ) > > ... > > > Thanks! > > > > + *thrtl = *thrtl & (1 << (flsl(*thrtl) - 1)); > > > + } > > > + > > > + return true; > > > } > > > > [...] > > > > static void do_write_psr_msrs(void *data) > > > { > > > const struct cos_write_info *info = data; > > > - struct feat_node *feat = info->feature; > > > - const struct feat_props *props = info->props; > > > - unsigned int i, cos = info->cos, cos_num = props->cos_num; > > > + unsigned int i, j, index = 0, array_len = info->array_len, cos = info->cos; > > > + const uint32_t *val_array = info->val; > > > > > > - for ( i = 0; i < cos_num; i++ ) > > > + for ( i = 0; i < ARRAY_SIZE(feat_props); i++ ) > > > { > > > > index and j can be defined here, they are only used inside of this for > > loop AFAICT. > > > I think definition of j can be moved into the loop. But index cannot unless I > declared it to be 'static'. The index is used as a accumulator. Right, I've got my indentation messed up. Please move j. Thanks, Roger.
On 17-08-31 09:30:47, Roger Pau Monn� wrote: > On Thu, Aug 31, 2017 at 10:20:10AM +0800, Yi Sun wrote: > > On 17-08-30 09:31:04, Roger Pau Monn� wrote: > > > On Thu, Aug 24, 2017 at 09:14:41AM +0800, Yi Sun wrote: > > > > diff --git a/xen/arch/x86/psr.c b/xen/arch/x86/psr.c > > > > index 4a0c982..ce82975 100644 > > > > --- a/xen/arch/x86/psr.c > > > > +++ b/xen/arch/x86/psr.c > > > > @@ -138,6 +138,12 @@ static const struct feat_props { > > > > > > > > /* write_msr is used to write out feature MSR register. */ > > > > void (*write_msr)(unsigned int cos, uint32_t val, enum psr_val_type type); > > > > + > > > > + /* > > > > + * check_val is used to check if input val fulfills SDM requirement. > > > > + * Change it to valid value if SDM allows. > > > > > > I'm not really sure it's a good idea to change the value to a valid > > > one, IMHO you should just check and print an error if the value is > > > invalid (and return false of course). > > > > > Per SDM, the HW has ability to automatically change the input value to what it > > wants. E.g: > > Linear mode: HW wants the input value be 10/20/30/.../90. But user inputs 15. > > Then, HW can automatically change it to 10. > > This seems like a very, very bad idea IMHO. What if I input 15 (which > is invalid), but I would rather prefer 20 instead of 10? > > I would very much prefer an error, so I can get to chose a valid value > myself instead of the hardware deciding on my back. > > > Even user inputs a value that does not fulfill HW requirement, HW can handle it. > > So, we do not need return error to user. Otherwise, user needs to know details > > of MBA. > > > > But the issue here is how we get the actual value and show it to user. There are > > two ways to do that: > > 1. When setting value, check and change it to valid one and save it to our cache. > > Instead of performing those checks manually, why not simply write the > value and read it back to see the one the hardware has actually > chosen (and cache it)? > So, I guess you prefer optioin 2, right? If so, do we need check if input value is valid? I see above comment that you prefer to return an error and leave the decision to user. > Is there any risk in writing an invalid value? If input value is no more than the thrtl_max got through CPUID, it is fine that HW can handle it. > > > 2. When getting value, call rdmsr to read the actual value back from HW. > >
On Thu, Aug 31, 2017 at 05:13:08PM +0800, Yi Sun wrote: > On 17-08-31 09:30:47, Roger Pau Monn� wrote: > > On Thu, Aug 31, 2017 at 10:20:10AM +0800, Yi Sun wrote: > > > On 17-08-30 09:31:04, Roger Pau Monn� wrote: > > > > On Thu, Aug 24, 2017 at 09:14:41AM +0800, Yi Sun wrote: > > > Even user inputs a value that does not fulfill HW requirement, HW can handle it. > > > So, we do not need return error to user. Otherwise, user needs to know details > > > of MBA. > > > > > > But the issue here is how we get the actual value and show it to user. There are > > > two ways to do that: > > > 1. When setting value, check and change it to valid one and save it to our cache. > > > > Instead of performing those checks manually, why not simply write the > > value and read it back to see the one the hardware has actually > > chosen (and cache it)? > > > So, I guess you prefer optioin 2, right? If so, do we need check if input value > is valid? I see above comment that you prefer to return an error and leave the > decision to user. Yes, I think so. Is reading the MSR in any performance critical path? Or it's just used by the user-space tools? > > Is there any risk in writing an invalid value? > > If input value is no more than the thrtl_max got through CPUID, it is fine that > HW can handle it. OK, so I would just check for that then. Roger.
On 17-08-31 10:30:37, Roger Pau Monn� wrote: > On Thu, Aug 31, 2017 at 05:13:08PM +0800, Yi Sun wrote: > > On 17-08-31 09:30:47, Roger Pau Monn� wrote: > > > On Thu, Aug 31, 2017 at 10:20:10AM +0800, Yi Sun wrote: > > > > On 17-08-30 09:31:04, Roger Pau Monn� wrote: > > > > > On Thu, Aug 24, 2017 at 09:14:41AM +0800, Yi Sun wrote: > > > > Even user inputs a value that does not fulfill HW requirement, HW can handle it. > > > > So, we do not need return error to user. Otherwise, user needs to know details > > > > of MBA. > > > > > > > > But the issue here is how we get the actual value and show it to user. There are > > > > two ways to do that: > > > > 1. When setting value, check and change it to valid one and save it to our cache. > > > > > > Instead of performing those checks manually, why not simply write the > > > value and read it back to see the one the hardware has actually > > > chosen (and cache it)? > > > > > So, I guess you prefer optioin 2, right? If so, do we need check if input value > > is valid? I see above comment that you prefer to return an error and leave the > > decision to user. > > Yes, I think so. Is reading the MSR in any performance critical path? > Or it's just used by the user-space tools? > Just used by user-space tools. > > > Is there any risk in writing an invalid value? > > > > If input value is no more than the thrtl_max got through CPUID, it is fine that > > HW can handle it. > > OK, so I would just check for that then. > Ok, will check this only in 'check_val' function. > Roger.
On Thu, Aug 31, 2017 at 06:10:07PM +0800, Yi Sun wrote: > On 17-08-31 10:30:37, Roger Pau Monn� wrote: > > On Thu, Aug 31, 2017 at 05:13:08PM +0800, Yi Sun wrote: > > > On 17-08-31 09:30:47, Roger Pau Monn� wrote: > > > > On Thu, Aug 31, 2017 at 10:20:10AM +0800, Yi Sun wrote: > > > > > On 17-08-30 09:31:04, Roger Pau Monn� wrote: > > > > > > On Thu, Aug 24, 2017 at 09:14:41AM +0800, Yi Sun wrote: > > > > > Even user inputs a value that does not fulfill HW requirement, HW can handle it. > > > > > So, we do not need return error to user. Otherwise, user needs to know details > > > > > of MBA. > > > > > > > > > > But the issue here is how we get the actual value and show it to user. There are > > > > > two ways to do that: > > > > > 1. When setting value, check and change it to valid one and save it to our cache. > > > > > > > > Instead of performing those checks manually, why not simply write the > > > > value and read it back to see the one the hardware has actually > > > > chosen (and cache it)? > > > > > > > So, I guess you prefer optioin 2, right? If so, do we need check if input value > > > is valid? I see above comment that you prefer to return an error and leave the > > > decision to user. > > > > Yes, I think so. Is reading the MSR in any performance critical path? > > Or it's just used by the user-space tools? > > > Just used by user-space tools. Then doing a MSR read each time doesn't seem that bad IMHO (it's not a critical path anyway). Roger.
diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c index 4936bcb..0ae4799 100644 --- a/xen/arch/x86/domctl.c +++ b/xen/arch/x86/domctl.c @@ -1468,6 +1468,12 @@ long arch_do_domctl( PSR_VAL_TYPE_L2_CBM); break; + case XEN_DOMCTL_PSR_MBA_OP_SET_THRTL: + ret = psr_set_val(d, domctl->u.psr_alloc_op.target, + domctl->u.psr_alloc_op.data, + PSR_VAL_TYPE_MBA); + break; + case XEN_DOMCTL_PSR_CAT_OP_GET_L3_CBM: ret = psr_get_val(d, domctl->u.psr_alloc_op.target, &val32, PSR_VAL_TYPE_L3_CBM); diff --git a/xen/arch/x86/psr.c b/xen/arch/x86/psr.c index 4a0c982..ce82975 100644 --- a/xen/arch/x86/psr.c +++ b/xen/arch/x86/psr.c @@ -138,6 +138,12 @@ static const struct feat_props { /* write_msr is used to write out feature MSR register. */ void (*write_msr)(unsigned int cos, uint32_t val, enum psr_val_type type); + + /* + * check_val is used to check if input val fulfills SDM requirement. + * Change it to valid value if SDM allows. + */ + bool (*check_val)(const struct feat_node *feat, unsigned long *val); } *feat_props[FEAT_TYPE_NUM]; /* @@ -275,29 +281,6 @@ static enum psr_feat_type psr_val_type_to_feat_type(enum psr_val_type type) return feat_type; } -static bool psr_check_cbm(unsigned int cbm_len, unsigned long cbm) -{ - unsigned int first_bit, zero_bit; - - /* Set bits should only in the range of [0, cbm_len]. */ - if ( cbm & (~0ul << cbm_len) ) - return false; - - /* At least one bit need to be set. */ - if ( cbm == 0 ) - return false; - - first_bit = find_first_bit(&cbm, cbm_len); - zero_bit = find_next_zero_bit(&cbm, cbm_len, first_bit); - - /* Set bits should be contiguous. */ - if ( zero_bit < cbm_len && - find_next_bit(&cbm, cbm_len, zero_bit) < cbm_len ) - return false; - - return true; -} - /* Implementation of allocation features' functions. */ static int cat_init_feature(const struct cpuid_leaf *regs, struct feat_node *feat, @@ -433,6 +416,30 @@ static bool cat_get_feat_info(const struct feat_node *feat, return true; } +static bool cat_check_cbm(const struct feat_node *feat, unsigned long *cbm) +{ + unsigned int first_bit, zero_bit; + unsigned int cbm_len = feat->cat_info.cbm_len; + + /* Set bits should only in the range of [0, cbm_len]. */ + if ( *cbm & (~0ul << cbm_len) ) + return false; + + /* At least one bit need to be set. */ + if ( *cbm == 0 ) + return false; + + first_bit = find_first_bit(cbm, cbm_len); + zero_bit = find_next_zero_bit(cbm, cbm_len, first_bit); + + /* Set bits should be contiguous. */ + if ( zero_bit < cbm_len && + find_next_bit(cbm, cbm_len, zero_bit) < cbm_len ) + return false; + + return true; +} + /* L3 CAT props */ static void l3_cat_write_msr(unsigned int cos, uint32_t val, enum psr_val_type type) @@ -446,6 +453,7 @@ static const struct feat_props l3_cat_props = { .alt_type = PSR_VAL_TYPE_UNKNOWN, .get_feat_info = cat_get_feat_info, .write_msr = l3_cat_write_msr, + .check_val = cat_check_cbm, }; /* L3 CDP props */ @@ -476,6 +484,7 @@ static const struct feat_props l3_cdp_props = { .alt_type = PSR_VAL_TYPE_L3_CBM, .get_feat_info = l3_cdp_get_feat_info, .write_msr = l3_cdp_write_msr, + .check_val = cat_check_cbm, }; /* L2 CAT props */ @@ -491,6 +500,7 @@ static const struct feat_props l2_cat_props = { .alt_type = PSR_VAL_TYPE_UNKNOWN, .get_feat_info = cat_get_feat_info, .write_msr = l2_cat_write_msr, + .check_val = cat_check_cbm, }; /* MBA props */ @@ -514,6 +524,40 @@ static bool mba_get_feat_info(const struct feat_node *feat, static void mba_write_msr(unsigned int cos, uint32_t val, enum psr_val_type type) { + wrmsrl(MSR_IA32_PSR_MBA_MASK(cos), val); +} + +static bool mba_check_thrtl(const struct feat_node *feat, unsigned long *thrtl) +{ + if ( *thrtl > feat->mba_info.thrtl_max ) + return false; + + /* + * Per SDM (chapter "Memory Bandwidth Allocation Configuration"): + * 1. Linear mode: In the linear mode the input precision is defined + * as 100-(MBA_MAX). For instance, if the MBA_MAX value is 90, the + * input precision is 10%. Values not an even multiple of the + * precision (e.g., 12%) will be rounded down (e.g., to 10% delay + * applied). + * 2. Non-linear mode: Input delay values are powers-of-two from zero + * to the MBA_MAX value from CPUID. In this case any values not a + * power of two will be rounded down the next nearest power of two. + */ + if ( feat->mba_info.linear ) + { + unsigned int mod; + + mod = *thrtl % (100 - feat->mba_info.thrtl_max); + *thrtl -= mod; + } + else + { + /* Not power of 2. */ + if ( *thrtl && (*thrtl & (*thrtl - 1)) ) + *thrtl = *thrtl & (1 << (flsl(*thrtl) - 1)); + } + + return true; } static const struct feat_props mba_props = { @@ -522,6 +566,7 @@ static const struct feat_props mba_props = { .alt_type = PSR_VAL_TYPE_UNKNOWN, .get_feat_info = mba_get_feat_info, .write_msr = mba_write_msr, + .check_val = mba_check_thrtl, }; static void __init parse_psr_bool(char *s, char *value, char *feature, @@ -942,6 +987,7 @@ static int insert_val_into_array(uint32_t val[], const struct feat_node *feat; const struct feat_props *props; unsigned int i; + unsigned long check_val = new_val; int ret; ASSERT(feat_type < FEAT_TYPE_NUM); @@ -966,9 +1012,11 @@ static int insert_val_into_array(uint32_t val[], if ( array_len < props->cos_num ) return -ENOSPC; - if ( !psr_check_cbm(feat->cat_info.cbm_len, new_val) ) + if ( !props->check_val(feat, &check_val) ) return -EINVAL; + new_val = check_val; + /* * Value setting position is same as feature array. * For CDP, user may set both DATA and CODE to same value. For such case, @@ -1198,25 +1246,42 @@ static unsigned int get_socket_cpu(unsigned int socket) struct cos_write_info { unsigned int cos; - struct feat_node *feature; + struct feat_node **features; const uint32_t *val; - const struct feat_props *props; + unsigned int array_len; + const struct feat_props **props; }; static void do_write_psr_msrs(void *data) { const struct cos_write_info *info = data; - struct feat_node *feat = info->feature; - const struct feat_props *props = info->props; - unsigned int i, cos = info->cos, cos_num = props->cos_num; + unsigned int i, j, index = 0, array_len = info->array_len, cos = info->cos; + const uint32_t *val_array = info->val; - for ( i = 0; i < cos_num; i++ ) + for ( i = 0; i < ARRAY_SIZE(feat_props); i++ ) { - if ( feat->cos_reg_val[cos * cos_num + i] != info->val[i] ) + struct feat_node *feat = info->features[i]; + const struct feat_props *props = info->props[i]; + unsigned int cos_num; + + if ( !feat || !props ) + continue; + + cos_num = props->cos_num; + if ( array_len < cos_num ) + return; + + for ( j = 0; j < cos_num; j++ ) { - feat->cos_reg_val[cos * cos_num + i] = info->val[i]; - props->write_msr(cos, info->val[i], props->type[i]); + if ( feat->cos_reg_val[cos * cos_num + j] != val_array[index + j] ) + { + feat->cos_reg_val[cos * cos_num + j] = val_array[index + j]; + props->write_msr(cos, val_array[index + j], props->type[j]); + } } + + array_len -= cos_num; + index += cos_num; } } @@ -1224,30 +1289,19 @@ static int write_psr_msrs(unsigned int socket, unsigned int cos, const uint32_t val[], unsigned int array_len, enum psr_feat_type feat_type) { - int ret; struct psr_socket_info *info = get_socket_info(socket); struct cos_write_info data = { .cos = cos, - .feature = info->features[feat_type], - .props = feat_props[feat_type], + .features = info->features, + .val = val, + .array_len = array_len, + .props = feat_props, }; if ( cos > info->features[feat_type]->cos_max ) return -EINVAL; - /* Skip to the feature's value head. */ - ret = skip_prior_features(&array_len, feat_type); - if ( ret < 0 ) - return ret; - - val += ret; - - if ( array_len < feat_props[feat_type]->cos_num ) - return -ENOSPC; - - data.val = val; - if ( socket == cpu_to_socket(smp_processor_id()) ) do_write_psr_msrs(&data); else diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h index a577a3e..8826cfb 100644 --- a/xen/include/public/domctl.h +++ b/xen/include/public/domctl.h @@ -1144,6 +1144,7 @@ struct xen_domctl_psr_alloc_op { #define XEN_DOMCTL_PSR_CAT_OP_GET_L3_DATA 5 #define XEN_DOMCTL_PSR_CAT_OP_SET_L2_CBM 6 #define XEN_DOMCTL_PSR_CAT_OP_GET_L2_CBM 7 +#define XEN_DOMCTL_PSR_MBA_OP_SET_THRTL 8 #define XEN_DOMCTL_PSR_MBA_OP_GET_THRTL 9 uint32_t cmd; /* IN: XEN_DOMCTL_PSR_CAT_OP_* */ uint32_t target; /* IN */
This patch implements set value flow for MBA including its callback function and domctl interface. It also changes the memebers in 'cos_write_info' to transfer the feature array, feature properties array and value array. Then, we can write all features values on the cos id into MSRs. Because multiple features may co-exist, we need handle all features to write values of them into a COS register with new COS ID. E.g: 1. L3 CAT and MBA co-exist. 2. Dom1 and Dom2 share a same COS ID (2). The L3 CAT CBM of Dom1 is 0x1ff, the MBA Thrtle of Dom1 is 0xa. 3. User wants to change MBA Thrtl of Dom1 to be 0x14. Because COS ID 2 is used by Dom2 too, we have to pick a new COS ID 3. The original values of Dom1 on COS ID 3 may be below: --------- | COS 3 | --------- L3 CAT | 0x7ff | --------- MBA | 0x0 | --------- 4. After setting, the L3 CAT CBM value of Dom1 should be kept and the new MBA Thrtl is set. So, the values on COS ID 3 should be below. --------- | COS 3 | --------- L3 CAT | 0x1ff | --------- MBA | 0x14 | --------- So, we should write all features values into their MSRs. That requires the feature array, feature properties array and value array are input. Signed-off-by: Yi Sun <yi.y.sun@linux.intel.com> --- v2: - remove linear mode 'thrtl_max' check in 'mba_check_thrtl' because it has been checked in 'mba_init_feature'. (suggested by Chao Peng) - for non-linear mode, check if '*thrtl' is not 0 in 'mba_check_thrtl'. If it is 0, we do not need to change it. (suggested by Chao Peng) - move comments to explain changes of 'cos_write_info' from psr.c to commit message. (suggested by Chao Peng) --- xen/arch/x86/domctl.c | 6 ++ xen/arch/x86/psr.c | 150 ++++++++++++++++++++++++++++++-------------- xen/include/public/domctl.h | 1 + 3 files changed, 109 insertions(+), 48 deletions(-)