diff mbox series

[3/4] selftests/resctrl: SNC support for MBM

Message ID def02de22a4747ed56ddc28f334e55872ba95f13.1709721159.git.maciej.wieczor-retman@intel.com (mailing list archive)
State New
Headers show
Series SNC support for resctrl selftests | expand

Commit Message

Maciej Wieczor-Retman March 6, 2024, 10:39 a.m. UTC
Memory Bandwidth Monitoring (MBM) measures how much data flows between
cache levels. Only the flow for a process specified with a Resource
Monitoring ID (RMID) is measured.

Sub-Numa Clustering (SNC) causes MBM selftest to fail because the
increased amount of NUMA nodes per socket is not taken into account.
That in turn makes the test use incorrect values for the start and end
of the measurement by tracking the wrong node.

For the MBM selftest to be NUMA-aware it needs to track the start and end
values of a measurement not for a single node but for all nodes on the
same socket. Then summing all measured values comes out as the real
bandwidth used by the process.

Reported-by: "Shaopeng Tan (Fujitsu)" <tan.shaopeng@fujitsu.com>
Closes: https://lore.kernel.org/lkml/TYAPR01MB6330A4EB3633B791939EA45E8B39A@TYAPR01MB6330.jpnprd01.prod.outlook.com/
Signed-off-by: Maciej Wieczor-Retman <maciej.wieczor-retman@intel.com>
---
 tools/testing/selftests/resctrl/mba_test.c    |  1 -
 tools/testing/selftests/resctrl/resctrl_val.c | 37 ++++++++++++-------
 2 files changed, 23 insertions(+), 15 deletions(-)

Comments

Ilpo Järvinen March 8, 2024, 2:07 p.m. UTC | #1
On Wed, 6 Mar 2024, Maciej Wieczor-Retman wrote:

> Memory Bandwidth Monitoring (MBM) measures how much data flows between
> cache levels. Only the flow for a process specified with a Resource
> Monitoring ID (RMID) is measured.
> 
> Sub-Numa Clustering (SNC) causes MBM selftest to fail because the
> increased amount of NUMA nodes per socket is not taken into account.
> That in turn makes the test use incorrect values for the start and end
> of the measurement by tracking the wrong node.
> 
> For the MBM selftest to be NUMA-aware it needs to track the start and end
> values of a measurement not for a single node but for all nodes on the
> same socket. Then summing all measured values comes out as the real
> bandwidth used by the process.
> 
> Reported-by: "Shaopeng Tan (Fujitsu)" <tan.shaopeng@fujitsu.com>
> Closes: https://lore.kernel.org/lkml/TYAPR01MB6330A4EB3633B791939EA45E8B39A@TYAPR01MB6330.jpnprd01.prod.outlook.com/
> Signed-off-by: Maciej Wieczor-Retman <maciej.wieczor-retman@intel.com>
> ---
>  tools/testing/selftests/resctrl/mba_test.c    |  1 -
>  tools/testing/selftests/resctrl/resctrl_val.c | 37 ++++++++++++-------
>  2 files changed, 23 insertions(+), 15 deletions(-)
> 
> diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c
> index 7946e32e85c8..fc31a61dab0c 100644
> --- a/tools/testing/selftests/resctrl/mba_test.c
> +++ b/tools/testing/selftests/resctrl/mba_test.c
> @@ -147,7 +147,6 @@ static int mba_run_test(const struct resctrl_test *test, const struct user_param
>  	struct resctrl_val_param param = {
>  		.resctrl_val	= MBA_STR,
>  		.ctrlgrp	= "c1",
> -		.mongrp		= "m1",
>  		.filename	= RESULT_FILE_NAME,
>  		.bw_report	= "reads",
>  		.setup		= mba_setup
> diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c
> index e75e3923ebe2..2fe9f8bb4a45 100644
> --- a/tools/testing/selftests/resctrl/resctrl_val.c
> +++ b/tools/testing/selftests/resctrl/resctrl_val.c
> @@ -595,9 +595,10 @@ static void initialize_llc_occu_resctrl(const char *ctrlgrp, const char *mongrp,
>  
>  static int measure_vals(const struct user_params *uparams,
>  			struct resctrl_val_param *param,
> -			unsigned long *bw_resc_start)
> +			unsigned long *bw_resc_start,
> +			int res_id)
>  {
> -	unsigned long bw_resc, bw_resc_end;
> +	unsigned long bw_resc = 0, bw_resc_sum = 0, bw_resc_end;
>  	float bw_imc;
>  	int ret;
>  
> @@ -612,17 +613,19 @@ static int measure_vals(const struct user_params *uparams,
>  	if (ret < 0)
>  		return ret;
>  
> -	ret = get_mem_bw_resctrl(&bw_resc_end);
> -	if (ret < 0)
> -		return ret;
> +	for (int i = 0 ; i < snc_ways() ; i++) {
> +		set_mbm_path(param->ctrlgrp, strlen(param->mongrp) > 0 ? param->mongrp : NULL,
> +			     res_id + i);
> +		ret = get_mem_bw_resctrl(&bw_resc_end);
> +		bw_resc = (bw_resc_end - bw_resc_start[i]) / MB;
> +		bw_resc_sum += bw_resc;
> +		bw_resc_start[i] = bw_resc_end;
> +	}
>  
> -	bw_resc = (bw_resc_end - *bw_resc_start) / MB;
> -	ret = print_results_bw(param->filename, bm_pid, bw_imc, bw_resc);
> +	ret = print_results_bw(param->filename, bm_pid, bw_imc, bw_resc_sum);
>  	if (ret)
>  		return ret;
>  
> -	*bw_resc_start = bw_resc_end;
> -
>  	return 0;
>  }
>  
> @@ -697,12 +700,16 @@ int resctrl_val(const struct resctrl_test *test,
>  		struct resctrl_val_param *param)
>  {
>  	char *resctrl_val = param->resctrl_val;
> -	unsigned long bw_resc_start = 0;
>  	int res_id, ret = 0, pipefd[2];
> +	unsigned long *bw_resc_start;
>  	struct sigaction sigact;
>  	char pipe_message = 0;
>  	union sigval value;
>  
> +	bw_resc_start = calloc(snc_ways(), sizeof(unsigned long));

While correct, this seems a bit overkill given is MAX_SNC = 4, not 
something large or unbounded.

This patch would be be much simpler on top of my resctrl_val() cleanup 
patches because bw_resc_start is only local to the measurement function.
Maciej Wieczor-Retman March 13, 2024, 10:26 a.m. UTC | #2
On 2024-03-08 at 16:07:05 +0200, Ilpo Järvinen wrote:
>On Wed, 6 Mar 2024, Maciej Wieczor-Retman wrote:
>> @@ -697,12 +700,16 @@ int resctrl_val(const struct resctrl_test *test,
>>  		struct resctrl_val_param *param)
>>  {
>>  	char *resctrl_val = param->resctrl_val;
>> -	unsigned long bw_resc_start = 0;
>>  	int res_id, ret = 0, pipefd[2];
>> +	unsigned long *bw_resc_start;
>>  	struct sigaction sigact;
>>  	char pipe_message = 0;
>>  	union sigval value;
>>  
>> +	bw_resc_start = calloc(snc_ways(), sizeof(unsigned long));
>
>While correct, this seems a bit overkill given is MAX_SNC = 4, not 
>something large or unbounded.
>
>This patch would be be much simpler on top of my resctrl_val() cleanup 
>patches because bw_resc_start is only local to the measurement function.

You're right, the series will get a lot simpler rebased onto yours. I'll try out
some different approaches and comment any relevant points under your thread [1].

[1] https://lore.kernel.org/all/20240311135230.7007-1-ilpo.jarvinen@linux.intel.com/

>
>-- 
> i.
diff mbox series

Patch

diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c
index 7946e32e85c8..fc31a61dab0c 100644
--- a/tools/testing/selftests/resctrl/mba_test.c
+++ b/tools/testing/selftests/resctrl/mba_test.c
@@ -147,7 +147,6 @@  static int mba_run_test(const struct resctrl_test *test, const struct user_param
 	struct resctrl_val_param param = {
 		.resctrl_val	= MBA_STR,
 		.ctrlgrp	= "c1",
-		.mongrp		= "m1",
 		.filename	= RESULT_FILE_NAME,
 		.bw_report	= "reads",
 		.setup		= mba_setup
diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c
index e75e3923ebe2..2fe9f8bb4a45 100644
--- a/tools/testing/selftests/resctrl/resctrl_val.c
+++ b/tools/testing/selftests/resctrl/resctrl_val.c
@@ -595,9 +595,10 @@  static void initialize_llc_occu_resctrl(const char *ctrlgrp, const char *mongrp,
 
 static int measure_vals(const struct user_params *uparams,
 			struct resctrl_val_param *param,
-			unsigned long *bw_resc_start)
+			unsigned long *bw_resc_start,
+			int res_id)
 {
-	unsigned long bw_resc, bw_resc_end;
+	unsigned long bw_resc = 0, bw_resc_sum = 0, bw_resc_end;
 	float bw_imc;
 	int ret;
 
@@ -612,17 +613,19 @@  static int measure_vals(const struct user_params *uparams,
 	if (ret < 0)
 		return ret;
 
-	ret = get_mem_bw_resctrl(&bw_resc_end);
-	if (ret < 0)
-		return ret;
+	for (int i = 0 ; i < snc_ways() ; i++) {
+		set_mbm_path(param->ctrlgrp, strlen(param->mongrp) > 0 ? param->mongrp : NULL,
+			     res_id + i);
+		ret = get_mem_bw_resctrl(&bw_resc_end);
+		bw_resc = (bw_resc_end - bw_resc_start[i]) / MB;
+		bw_resc_sum += bw_resc;
+		bw_resc_start[i] = bw_resc_end;
+	}
 
-	bw_resc = (bw_resc_end - *bw_resc_start) / MB;
-	ret = print_results_bw(param->filename, bm_pid, bw_imc, bw_resc);
+	ret = print_results_bw(param->filename, bm_pid, bw_imc, bw_resc_sum);
 	if (ret)
 		return ret;
 
-	*bw_resc_start = bw_resc_end;
-
 	return 0;
 }
 
@@ -697,12 +700,16 @@  int resctrl_val(const struct resctrl_test *test,
 		struct resctrl_val_param *param)
 {
 	char *resctrl_val = param->resctrl_val;
-	unsigned long bw_resc_start = 0;
 	int res_id, ret = 0, pipefd[2];
+	unsigned long *bw_resc_start;
 	struct sigaction sigact;
 	char pipe_message = 0;
 	union sigval value;
 
+	bw_resc_start = calloc(snc_ways(), sizeof(unsigned long));
+	if (!bw_resc_start)
+		return -1;
+
 	if (strcmp(param->filename, "") == 0)
 		sprintf(param->filename, "stdio");
 
@@ -710,7 +717,7 @@  int resctrl_val(const struct resctrl_test *test,
 	    !strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR))) {
 		ret = validate_bw_report_request(param->bw_report);
 		if (ret)
-			return ret;
+			goto out_free;
 	}
 
 	/*
@@ -721,7 +728,7 @@  int resctrl_val(const struct resctrl_test *test,
 
 	if (pipe(pipefd)) {
 		ksft_perror("Unable to create pipe");
-
+		free(bw_resc_start);
 		return -1;
 	}
 
@@ -733,7 +740,7 @@  int resctrl_val(const struct resctrl_test *test,
 	bm_pid = fork();
 	if (bm_pid == -1) {
 		ksft_perror("Unable to fork");
-
+		free(bw_resc_start);
 		return -1;
 	}
 
@@ -841,7 +848,7 @@  int resctrl_val(const struct resctrl_test *test,
 
 		if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR)) ||
 		    !strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) {
-			ret = measure_vals(uparams, param, &bw_resc_start);
+			ret = measure_vals(uparams, param, bw_resc_start, res_id);
 			if (ret)
 				break;
 		} else if (!strncmp(resctrl_val, CMT_STR, sizeof(CMT_STR))) {
@@ -855,6 +862,8 @@  int resctrl_val(const struct resctrl_test *test,
 
 out:
 	kill(bm_pid, SIGKILL);
+out_free:
+	free(bw_resc_start);
 
 	return ret;
 }