diff mbox series

[v3,6/8] platform/x86/intel/ifs: Implement Array BIST test

Message ID 20230301015942.462799-7-jithu.joseph@intel.com (mailing list archive)
State Changes Requested, archived
Headers show
Series Add Array BIST test support to IFS | expand

Commit Message

Joseph, Jithu March 1, 2023, 1:59 a.m. UTC
Array BIST test (for a particlular core) is triggered by writing
to MSR_ARRAY_BIST from one sibling of the core.

This will initiate a test for all supported arrays on that
CPU. Array BIST test may be aborted before completing all the
arrays in the event of an interrupt or other reasons.
In this case, kernel will restart the test from that point
onwards. Array test will also be aborted when the test fails,
in which case the test is stopped immediately without further
retry.

Signed-off-by: Jithu Joseph <jithu.joseph@intel.com>
Reviewed-by: Tony Luck <tony.luck@intel.com>
---
 drivers/platform/x86/intel/ifs/ifs.h     | 12 ++++
 drivers/platform/x86/intel/ifs/runtest.c | 81 ++++++++++++++++++++++++
 2 files changed, 93 insertions(+)

Comments

Hans de Goede March 13, 2023, 4:24 p.m. UTC | #1
Hi,

On 3/1/23 02:59, Jithu Joseph wrote:
> Array BIST test (for a particlular core) is triggered by writing
> to MSR_ARRAY_BIST from one sibling of the core.
> 
> This will initiate a test for all supported arrays on that
> CPU. Array BIST test may be aborted before completing all the
> arrays in the event of an interrupt or other reasons.
> In this case, kernel will restart the test from that point
> onwards. Array test will also be aborted when the test fails,
> in which case the test is stopped immediately without further
> retry.
> 
> Signed-off-by: Jithu Joseph <jithu.joseph@intel.com>
> Reviewed-by: Tony Luck <tony.luck@intel.com>
> ---
>  drivers/platform/x86/intel/ifs/ifs.h     | 12 ++++
>  drivers/platform/x86/intel/ifs/runtest.c | 81 ++++++++++++++++++++++++
>  2 files changed, 93 insertions(+)
> 
> diff --git a/drivers/platform/x86/intel/ifs/ifs.h b/drivers/platform/x86/intel/ifs/ifs.h
> index f31966e291df..1228101de201 100644
> --- a/drivers/platform/x86/intel/ifs/ifs.h
> +++ b/drivers/platform/x86/intel/ifs/ifs.h
> @@ -127,6 +127,7 @@
>  #include <linux/device.h>
>  #include <linux/miscdevice.h>
>  
> +#define MSR_ARRAY_BIST				0x00000105
>  #define MSR_COPY_SCAN_HASHES			0x000002c2
>  #define MSR_SCAN_HASHES_STATUS			0x000002c3
>  #define MSR_AUTHENTICATE_AND_COPY_CHUNK		0x000002c4
> @@ -192,6 +193,17 @@ union ifs_status {
>  	};
>  };
>  
> +/* MSR_ARRAY_BIST bit fields */
> +union ifs_array {
> +	u64	data;
> +	struct {
> +		u32	array_bitmask;
> +		u16	array_bank;
> +		u16	rsvd			:15;
> +		u16	ctrl_result		:1;
> +	};
> +};
> +
>  /*
>   * Driver populated error-codes
>   * 0xFD: Test timed out before completing all the chunks.
> diff --git a/drivers/platform/x86/intel/ifs/runtest.c b/drivers/platform/x86/intel/ifs/runtest.c
> index 969b3e0946d5..3a5442796c7d 100644
> --- a/drivers/platform/x86/intel/ifs/runtest.c
> +++ b/drivers/platform/x86/intel/ifs/runtest.c
> @@ -229,6 +229,85 @@ static void ifs_test_core(int cpu, struct device *dev)
>  	}
>  }
>  
> +#define SPINUNIT 100 /* 100 nsec */
> +static atomic_t array_cpus_out;

This variable is only inc-ed + read, it is never reset to 0
so the "while (atomic_read(t) < all_cpus)"
check only works for the first test run.

Also even static atomic_t variables must be initialized, you cannot
assume that using using zeroed mem is a valid value for an atomic_t.

And this is also shared between all smt pairs, so if 2 "real"
CPU cores with both 2 sibblings are asked to run IFS tests at
the same time, then array_cpus_out will get increased 4 times
in total, breaking the wait_for_sibbling loop as soon as
the counter reaches 2, so before the tests are done.

It looks like this bit needs to be reworked so that the busy spinning
the sibbling uses per "real" cpu core data and so that the counter
is reset before the tests are started.

Regards,

Hans

> +
> +/*
> + * Simplified cpu sibling rendezvous loop based on microcode loader __wait_for_cpus()
> + */
> +static void wait_for_sibling_cpu(atomic_t *t, long long timeout)
> +{
> +	int cpu = smp_processor_id();
> +	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
> +	int all_cpus = cpumask_weight(smt_mask);
> +
> +	atomic_inc(t);
> +	while (atomic_read(t) < all_cpus) {
> +		if (timeout < SPINUNIT)
> +			return;
> +		ndelay(SPINUNIT);
> +		timeout -= SPINUNIT;
> +		touch_nmi_watchdog();
> +	}
> +}
> +
> +static int do_array_test(void *data)
> +{
> +	union ifs_array *command = data;
> +	int cpu = smp_processor_id();
> +	int first;
> +
> +	/*
> +	 * Only one logical CPU on a core needs to trigger the Array test via MSR write.
> +	 */
> +	first = cpumask_first(cpu_smt_mask(cpu));
> +
> +	if (cpu == first) {
> +		wrmsrl(MSR_ARRAY_BIST, command->data);
> +		/* Pass back the result of the test */
> +		rdmsrl(MSR_ARRAY_BIST, command->data);
> +	}
> +
> +	/* Tests complete faster if the sibling is spinning here */
> +	wait_for_sibling_cpu(&array_cpus_out, NSEC_PER_SEC);
> +
> +	return 0;
> +}
> +
> +static void ifs_array_test_core(int cpu, struct device *dev)
> +{
> +	union ifs_array command = {};
> +	bool timed_out = false;
> +	struct ifs_data *ifsd;
> +	unsigned long timeout;
> +
> +	ifsd = ifs_get_data(dev);
> +
> +	command.array_bitmask = ~0U;
> +	timeout = jiffies + HZ / 2;
> +
> +	do {
> +		if (time_after(jiffies, timeout)) {
> +			timed_out = true;
> +			break;
> +		}
> +		atomic_set(&array_cpus_out, 0);
> +		stop_core_cpuslocked(cpu, do_array_test, &command);
> +
> +		if (command.ctrl_result)
> +			break;
> +	} while (command.array_bitmask);
> +
> +	ifsd->scan_details = command.data;
> +
> +	if (command.ctrl_result)
> +		ifsd->status = SCAN_TEST_FAIL;
> +	else if (timed_out || command.array_bitmask)
> +		ifsd->status = SCAN_NOT_TESTED;
> +	else
> +		ifsd->status = SCAN_TEST_PASS;
> +}
> +
>  /*
>   * Initiate per core test. It wakes up work queue threads on the target cpu and
>   * its sibling cpu. Once all sibling threads wake up, the scan test gets executed and
> @@ -253,6 +332,8 @@ int do_core_test(int cpu, struct device *dev)
>  		ifs_test_core(cpu, dev);
>  		break;
>  	case IFS_TYPE_ARRAY_BIST:
> +		ifs_array_test_core(cpu, dev);
> +		break;
>  	default:
>  		return -EINVAL;
>  	}
Joseph, Jithu March 13, 2023, 4:37 p.m. UTC | #2
Hans,

Thank-you very much for the review.

On 3/13/2023 9:24 AM, Hans de Goede wrote:


>>  
>> +#define SPINUNIT 100 /* 100 nsec */
>> +static atomic_t array_cpus_out;
> 
> This variable is only inc-ed + read, it is never reset to 0
> so the "while (atomic_read(t) < all_cpus)"
> check only works for the first test run.
> 

It is reset to zero as annotated below. Let me know if this doesn't address your concern.

> Also even static atomic_t variables must be initialized, you cannot
> assume that using using zeroed mem is a valid value for an atomic_t.
> 
> And this is also shared between all smt pairs, so if 2 "real"
> CPU cores with both 2 sibblings are asked to run IFS tests at
> the same time, then array_cpus_out will get increased 4 times
> in total, breaking the wait_for_sibbling loop as soon as
> the counter reaches 2, so before the tests are done.

Only one IFS test is allowed at a time. This is done using "ifs_sem" defined in sysfs.c

...

>> +static void ifs_array_test_core(int cpu, struct device *dev)
>> +{
>> +	union ifs_array command = {};
>> +	bool timed_out = false;
>> +	struct ifs_data *ifsd;
>> +	unsigned long timeout;
>> +
>> +	ifsd = ifs_get_data(dev);
>> +
>> +	command.array_bitmask = ~0U;
>> +	timeout = jiffies + HZ / 2;
>> +
>> +	do {
>> +		if (time_after(jiffies, timeout)) {
>> +			timed_out = true;
>> +			break;
>> +		}
>> +		atomic_set(&array_cpus_out, 0);

The above line is where the zero initialization happens before every test.

>> +		stop_core_cpuslocked(cpu, do_array_test, &command);
>> +
>> +		if (command.ctrl_result)
>> +			break;
>> +	} while (command.array_bitmask);
>> +
>> +	ifsd->scan_details = command.data;
>> +
>> +	if (command.ctrl_result)
>> +		ifsd->status = SCAN_TEST_FAIL;
>> +	else if (timed_out || command.array_bitmask)
>> +		ifsd->status = SCAN_NOT_TESTED;
>> +	else
>> +		ifsd->status = SCAN_TEST_PASS;
>> +}

Jithu
Hans de Goede March 16, 2023, 9:59 a.m. UTC | #3
Hi,

On 3/13/23 17:37, Joseph, Jithu wrote:
> Hans,
> 
> Thank-you very much for the review.
> 
> On 3/13/2023 9:24 AM, Hans de Goede wrote:
> 
> 
>>>  
>>> +#define SPINUNIT 100 /* 100 nsec */
>>> +static atomic_t array_cpus_out;
>>
>> This variable is only inc-ed + read, it is never reset to 0
>> so the "while (atomic_read(t) < all_cpus)"
>> check only works for the first test run.
>>
> 
> It is reset to zero as annotated below. Let me know if this doesn't address your concern.

Ah, I somehow missed that despite looking for it twice, yes that is fine.

>> Also even static atomic_t variables must be initialized, you cannot
>> assume that using using zeroed mem is a valid value for an atomic_t.
>>
>> And this is also shared between all smt pairs, so if 2 "real"
>> CPU cores with both 2 sibblings are asked to run IFS tests at
>> the same time, then array_cpus_out will get increased 4 times
>> in total, breaking the wait_for_sibbling loop as soon as
>> the counter reaches 2, so before the tests are done.
> 
> Only one IFS test is allowed at a time. This is done using "ifs_sem" defined in sysfs.c

Ah I see.

After taking a closer look I do see one unrelated issue with this patch
sysfs.c: run_test_store() does:

        if (!ifsd->loaded)
                rc = -EPERM;
        else
                rc = do_core_test(cpu, dev);

But AFAICT the loaded check really only applies to the first (intel_ifs_0 device) test type and the 
Array BIST test should work fine when loaded is false.

So I think that the if (!ifsd->loaded) error check should be moved to 
ifs_test_core() ?

Regards,

Hans


> 
> ...
> 
>>> +static void ifs_array_test_core(int cpu, struct device *dev)
>>> +{
>>> +	union ifs_array command = {};
>>> +	bool timed_out = false;
>>> +	struct ifs_data *ifsd;
>>> +	unsigned long timeout;
>>> +
>>> +	ifsd = ifs_get_data(dev);
>>> +
>>> +	command.array_bitmask = ~0U;
>>> +	timeout = jiffies + HZ / 2;
>>> +
>>> +	do {
>>> +		if (time_after(jiffies, timeout)) {
>>> +			timed_out = true;
>>> +			break;
>>> +		}
>>> +		atomic_set(&array_cpus_out, 0);
> 
> The above line is where the zero initialization happens before every test.
> 
>>> +		stop_core_cpuslocked(cpu, do_array_test, &command);
>>> +
>>> +		if (command.ctrl_result)
>>> +			break;
>>> +	} while (command.array_bitmask);
>>> +
>>> +	ifsd->scan_details = command.data;
>>> +
>>> +	if (command.ctrl_result)
>>> +		ifsd->status = SCAN_TEST_FAIL;
>>> +	else if (timed_out || command.array_bitmask)
>>> +		ifsd->status = SCAN_NOT_TESTED;
>>> +	else
>>> +		ifsd->status = SCAN_TEST_PASS;
>>> +}
> 
> Jithu
>
Joseph, Jithu March 16, 2023, 5:40 p.m. UTC | #4
On 3/16/2023 2:59 AM, Hans de Goede wrote:

>>
>> Only one IFS test is allowed at a time. This is done using "ifs_sem" defined in sysfs.c
> 
> Ah I see.
> 
> After taking a closer look I do see one unrelated issue with this patch
> sysfs.c: run_test_store() does:
> 
>         if (!ifsd->loaded)
>                 rc = -EPERM;
>         else
>                 rc = do_core_test(cpu, dev);
> 
> But AFAICT the loaded check really only applies to the first (intel_ifs_0 device) test type and the 
> Array BIST test should work fine when loaded is false.

This is true, the load check only applies to first test. And the patch (5/8) in this series adds the
check you are suggesting to allow the second test to proceed (as given below)

	if (ifsd->ro_info->test_num != IFS_TYPE_ARRAY_BIST && !ifsd->loaded)
		rc = -EPERM;
	else
		rc = do_core_test(cpu, dev);

It is possible that the snippet you pasted above was from an earlier state.


Jithu
Joseph, Jithu March 16, 2023, 6:11 p.m. UTC | #5
On 3/16/2023 2:59 AM, Hans de Goede wrote:

> 
> After taking a closer look I do see one unrelated issue with this patch
> sysfs.c: run_test_store() does:
> 
>         if (!ifsd->loaded)
>                 rc = -EPERM;
>         else
>                 rc = do_core_test(cpu, dev);
> 
> But AFAICT the loaded check really only applies to the first (intel_ifs_0 device) test type and the 
> Array BIST test should work fine when loaded is false.
> 
> So I think that the if (!ifsd->loaded) error check should be moved to 
> ifs_test_core() ?
> 

It is possible that I misinterpreted your comment in my earlier reply. (Thanks Tony for pointing it out)
Yes I think moving the load check into ifs_test_core() is better than doing it in run_test_store()

Jithu
Hans de Goede March 16, 2023, 7:38 p.m. UTC | #6
Hi,

On 3/16/23 19:11, Joseph, Jithu wrote:
> 
> 
> On 3/16/2023 2:59 AM, Hans de Goede wrote:
> 
>>
>> After taking a closer look I do see one unrelated issue with this patch
>> sysfs.c: run_test_store() does:
>>
>>         if (!ifsd->loaded)
>>                 rc = -EPERM;
>>         else
>>                 rc = do_core_test(cpu, dev);
>>
>> But AFAICT the loaded check really only applies to the first (intel_ifs_0 device) test type and the 
>> Array BIST test should work fine when loaded is false.
>>
>> So I think that the if (!ifsd->loaded) error check should be moved to 
>> ifs_test_core() ?
>>
> 
> It is possible that I misinterpreted your comment in my earlier reply. (Thanks Tony for pointing it out)

No you were right I was looking at the current code, not the code after this patch-set is applied.

> Yes I think moving the load check into ifs_test_core() is better than doing it in run_test_store()

Ack, doing that is still a cleaner way of dealing with this.

Regards,

Hans
diff mbox series

Patch

diff --git a/drivers/platform/x86/intel/ifs/ifs.h b/drivers/platform/x86/intel/ifs/ifs.h
index f31966e291df..1228101de201 100644
--- a/drivers/platform/x86/intel/ifs/ifs.h
+++ b/drivers/platform/x86/intel/ifs/ifs.h
@@ -127,6 +127,7 @@ 
 #include <linux/device.h>
 #include <linux/miscdevice.h>
 
+#define MSR_ARRAY_BIST				0x00000105
 #define MSR_COPY_SCAN_HASHES			0x000002c2
 #define MSR_SCAN_HASHES_STATUS			0x000002c3
 #define MSR_AUTHENTICATE_AND_COPY_CHUNK		0x000002c4
@@ -192,6 +193,17 @@  union ifs_status {
 	};
 };
 
+/* MSR_ARRAY_BIST bit fields */
+union ifs_array {
+	u64	data;
+	struct {
+		u32	array_bitmask;
+		u16	array_bank;
+		u16	rsvd			:15;
+		u16	ctrl_result		:1;
+	};
+};
+
 /*
  * Driver populated error-codes
  * 0xFD: Test timed out before completing all the chunks.
diff --git a/drivers/platform/x86/intel/ifs/runtest.c b/drivers/platform/x86/intel/ifs/runtest.c
index 969b3e0946d5..3a5442796c7d 100644
--- a/drivers/platform/x86/intel/ifs/runtest.c
+++ b/drivers/platform/x86/intel/ifs/runtest.c
@@ -229,6 +229,85 @@  static void ifs_test_core(int cpu, struct device *dev)
 	}
 }
 
+#define SPINUNIT 100 /* 100 nsec */
+static atomic_t array_cpus_out;
+
+/*
+ * Simplified cpu sibling rendezvous loop based on microcode loader __wait_for_cpus()
+ */
+static void wait_for_sibling_cpu(atomic_t *t, long long timeout)
+{
+	int cpu = smp_processor_id();
+	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+	int all_cpus = cpumask_weight(smt_mask);
+
+	atomic_inc(t);
+	while (atomic_read(t) < all_cpus) {
+		if (timeout < SPINUNIT)
+			return;
+		ndelay(SPINUNIT);
+		timeout -= SPINUNIT;
+		touch_nmi_watchdog();
+	}
+}
+
+static int do_array_test(void *data)
+{
+	union ifs_array *command = data;
+	int cpu = smp_processor_id();
+	int first;
+
+	/*
+	 * Only one logical CPU on a core needs to trigger the Array test via MSR write.
+	 */
+	first = cpumask_first(cpu_smt_mask(cpu));
+
+	if (cpu == first) {
+		wrmsrl(MSR_ARRAY_BIST, command->data);
+		/* Pass back the result of the test */
+		rdmsrl(MSR_ARRAY_BIST, command->data);
+	}
+
+	/* Tests complete faster if the sibling is spinning here */
+	wait_for_sibling_cpu(&array_cpus_out, NSEC_PER_SEC);
+
+	return 0;
+}
+
+static void ifs_array_test_core(int cpu, struct device *dev)
+{
+	union ifs_array command = {};
+	bool timed_out = false;
+	struct ifs_data *ifsd;
+	unsigned long timeout;
+
+	ifsd = ifs_get_data(dev);
+
+	command.array_bitmask = ~0U;
+	timeout = jiffies + HZ / 2;
+
+	do {
+		if (time_after(jiffies, timeout)) {
+			timed_out = true;
+			break;
+		}
+		atomic_set(&array_cpus_out, 0);
+		stop_core_cpuslocked(cpu, do_array_test, &command);
+
+		if (command.ctrl_result)
+			break;
+	} while (command.array_bitmask);
+
+	ifsd->scan_details = command.data;
+
+	if (command.ctrl_result)
+		ifsd->status = SCAN_TEST_FAIL;
+	else if (timed_out || command.array_bitmask)
+		ifsd->status = SCAN_NOT_TESTED;
+	else
+		ifsd->status = SCAN_TEST_PASS;
+}
+
 /*
  * Initiate per core test. It wakes up work queue threads on the target cpu and
  * its sibling cpu. Once all sibling threads wake up, the scan test gets executed and
@@ -253,6 +332,8 @@  int do_core_test(int cpu, struct device *dev)
 		ifs_test_core(cpu, dev);
 		break;
 	case IFS_TYPE_ARRAY_BIST:
+		ifs_array_test_core(cpu, dev);
+		break;
 	default:
 		return -EINVAL;
 	}