@@ -76,11 +76,48 @@ extern struct task_struct *__switch_to(struct task_struct *,
static inline bool switch_to_should_flush_icache(struct task_struct *task)
{
#ifdef CONFIG_SMP
- bool stale_mm = task->mm && task->mm->context.force_icache_flush;
- bool stale_thread = task->thread.force_icache_flush;
+ bool stale_mm = false;
bool thread_migrated = smp_processor_id() != task->thread.prev_cpu;
+ bool stale_thread;
+
+ /*
+ * This pairs with the smp_wmb() in each case of the switch statement in
+ * riscv_set_icache_flush_ctx() as well as the smp_wmb() in set_icache_stale_mask().
+ *
+ * The pairings with the smp_wmb() in the PR_RISCV_SCOPE_PER_PROCESS
+ * cases in riscv_set_icache_flush_ctx() synchronize this hart with the
+ * updated value of current->mm->context.force_icache_flush.
+ *
+ * The pairings with the smp_wmb() in the PR_RISCV_SCOPE_PER_THREAD cases
+ * in riscv_set_icache_flush_ctx() synchronize this hart with the
+ * updated value of task->thread.force_icache_flush.
+ *
+ * The pairing with the smp_wmb() in set_icache_stale_mask()
+ * synchronizes this hart with the updated value of task->mm->context.icache_stale_mask.
+ */
+ smp_rmb();
+ stale_thread = thread_migrated && task->thread.force_icache_flush;
+
+ if (task->mm) {
+ /*
+ * The mm is only stale if the respective CPU bit in
+ * icache_stale_mask is set.
+ */
+ stale_mm = cpumask_test_cpu(smp_processor_id(),
+ &task->mm->context.icache_stale_mask);
+
+ /*
+ * force_icache_flush indicates that icache_stale_mask should be
+ * set again for this hart before returning to userspace. This
+ * ensures that the next time this mm is switched to on this hart,
+ * the icache is flushed only if necessary.
+ */
+ cpumask_assign_cpu(smp_processor_id(),
+ &task->mm->context.icache_stale_mask,
+ task->mm->context.force_icache_flush);
+ }
- return thread_migrated && (stale_mm || stale_thread);
+ return stale_mm || stale_thread;
#else
return false;
#endif
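
A note on cpumask_assign_cpu(), which the hunk above leans on: it is
assumed here to follow the assign_bit() convention, i.e. set the hart's
bit when the boolean is true and clear it otherwise. A minimal sketch of
those assumed semantics (the *_sketch name is ours, not a kernel symbol):

#include <linux/cpumask.h>

/*
 * Assumed semantics of cpumask_assign_cpu(cpu, mask, val): set @cpu's
 * bit in @mask when @val is true, clear it when @val is false.
 */
static inline void cpumask_assign_cpu_sketch(unsigned int cpu,
					     struct cpumask *mask, bool val)
{
	if (val)
		cpumask_set_cpu(cpu, mask);
	else
		cpumask_clear_cpu(cpu, mask);
}
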
@@ -158,20 +158,25 @@ void __init riscv_init_cbo_blocksizes(void)
#ifdef CONFIG_SMP
static void set_icache_stale_mask(void)
{
- cpumask_t *mask;
- bool stale_cpu;
-
/*
* Mark every other hart's icache as needing a flush for
- * this MM. Maintain the previous value of the current
- * cpu to handle the case when this function is called
- * concurrently on different harts.
+ * this MM.
*/
+ cpumask_t *mask;
+ bool stale_cpu;
+
mask = &current->mm->context.icache_stale_mask;
stale_cpu = cpumask_test_cpu(smp_processor_id(), mask);
cpumask_setall(mask);
cpumask_assign_cpu(smp_processor_id(), mask, stale_cpu);
+
+ /*
+ * This pairs with the smp_rmb() in switch_to_should_flush_icache() and
+ * flush_icache_deferred() to ensure that the updates to
+ * icache_stale_mask are visible to other harts.
+ */
+ smp_wmb();
}
#endif
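
The smp_wmb()/smp_rmb() pairing relied on above is the standard
publish/consume pattern from Documentation/memory-barriers.txt; a
generic sketch with hypothetical variables follows. The patch applies
the same idea, with icache_stale_mask and the force_icache_flush flags
as the published data, and with the reader issuing smp_rmb()
unconditionally on the context-switch paths rather than behind a flag.

/* Writer side (here: the prctl path). */
WRITE_ONCE(data, value);	/* publish the new state ...         */
smp_wmb();			/* ... ordered before the flag store */
WRITE_ONCE(ready, true);

/* Reader side (here: the context-switch path). */
if (READ_ONCE(ready)) {
	smp_rmb();		/* order flag load before data load  */
	consume(READ_ONCE(data));	/* consume() is a placeholder */
}
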
@@ -228,9 +233,22 @@ int riscv_set_icache_flush_ctx(unsigned long ctx, unsigned long scope)
switch (scope) {
case PR_RISCV_SCOPE_PER_PROCESS:
current->mm->context.force_icache_flush = true;
+ /*
+ * This pairs with the smp_rmb() in
+ * switch_to_should_flush_icache() to ensure that other
+ * harts using the same mm flush the icache on migration.
+ */
+ smp_wmb();
+ set_icache_stale_mask();
break;
case PR_RISCV_SCOPE_PER_THREAD:
current->thread.force_icache_flush = true;
+ /*
+ * This pairs with the smp_rmb() in
+ * switch_to_should_flush_icache() to ensure that the
+ * icache is flushed when this thread is migrated.
+ */
+ smp_wmb();
break;
default:
return -EINVAL;
@@ -240,13 +258,22 @@ int riscv_set_icache_flush_ctx(unsigned long ctx, unsigned long scope)
switch (scope) {
case PR_RISCV_SCOPE_PER_PROCESS:
current->mm->context.force_icache_flush = false;
-
+ /*
+ * This pairs with the smp_rmb() in
+ * switch_to_should_flush_icache() to ensure that other
+ * harts using the same mm stop flushing the icache on migration.
+ */
+ smp_wmb();
set_icache_stale_mask();
break;
case PR_RISCV_SCOPE_PER_THREAD:
current->thread.force_icache_flush = false;
-
- set_icache_stale_mask();
+ /*
+ * This pairs with the smp_rmb() in
+ * switch_to_should_flush_icache() to ensure that the icache
+ * is no longer flushed when this thread is migrated.
+ */
+ smp_wmb();
break;
default:
return -EINVAL;
@@ -299,18 +299,18 @@ static inline void flush_icache_deferred(struct mm_struct *mm, unsigned int cpu,
struct task_struct *task)
{
#ifdef CONFIG_SMP
+ /*
+ * This pairs with the smp_wmb() in set_icache_stale_mask() to
+ * synchronize this hart with changes to mm->context.icache_stale_mask.
+ */
+ smp_rmb();
if (cpumask_test_and_clear_cpu(cpu, &mm->context.icache_stale_mask)) {
/*
* Ensure the remote hart's writes are visible to this hart.
* This pairs with a barrier in flush_icache_mm.
*/
smp_mb();
-
- /*
- * If cache will be flushed in switch_to, no need to flush here.
- */
- if (!(task && switch_to_should_flush_icache(task)))
- local_flush_icache_all();
+ local_flush_icache_all();
}
#endif
}
It is possible for mm->context.icache_stale_mask to be set between
switch_mm() and switch_to() when there are two threads on different
CPUs executing in the same mm. switch_mm() currently assumes that
switch_to() will handle the flushing. However, if
mm->context.icache_stale_mask is changed in the middle, neither
switch_mm() nor switch_to() will issue the necessary flush. This
situation can be seen in the following example trace:

<---- Thread 1 starts running here on CPU1
<---- Thread 2 starts running here with same mm

T2: prctl(PR_RISCV_SET_ICACHE_FLUSH_CTX, PR_RISCV_CTX_SW_FENCEI_ON,
          PR_RISCV_SCOPE_PER_PROCESS);
T2: <-- kernel sets current->mm->context.force_icache_flush to true
T2: <modification of instructions>
T2: fence.i
T2: fence w, w

T1: fence r, r
T1: fence.i (both fences to synchronize with other thread, has some
    logic to determine when to do this)
T1: <-- thread 1 is preempted
T1: <-- thread 1 is placed onto CPU3 and starts context switch sequence
T1 (kernel): flush_icache_deferred() -> skips flush because
             switch_to_should_flush_icache() returns true
             -> thread has migrated and
                task->mm->context.force_icache_flush is true

T2: prctl(PR_RISCV_SET_ICACHE_FLUSH_CTX, PR_RISCV_CTX_SW_FENCEI_OFF,
          PR_RISCV_SCOPE_PER_PROCESS);
T2 (kernel): kernel sets current->mm->context.force_icache_flush = false

T1 (kernel): switch_to() calls switch_to_should_flush_icache() which now
             returns false because task->mm->context.force_icache_flush
             is false due to the other thread emitting
             PR_RISCV_CTX_SW_FENCEI_OFF.

T1 (back in userspace): the instruction cache was never flushed on the
                        context switch to CPU3, and thus may execute
                        incorrect instructions.

This commit fixes this issue by modifying the semantics of
mm->context.force_icache_flush to mean that the mm will need to be
flushed at some point before returning to userspace. The flush will
only happen if this hart's bit in task->mm->context.icache_stale_mask
is set. Once the flush has occurred, that bit is cleared. Before
returning to userspace, task->mm->context.icache_stale_mask is set
again if mm->context.force_icache_flush is set. This ensures that even
if PR_RISCV_CTX_SW_FENCEI_OFF is called between switch_mm() and
switch_to(), an icache flush is still emitted.

This also remedies the issue that switch_mm() can be called without
being followed by switch_to(), as with the riscv efi driver in
efi_virtmap_load(), so we cannot do all of the flushing in switch_to().

To ensure that the writes to mm->context.icache_stale_mask,
mm->context.force_icache_flush, and thread.force_icache_flush are
visible to all harts, add read/write barriers around their uses. One
thread on some hart is responsible for issuing
PR_RISCV_CTX_SW_FENCEI_ON and PR_RISCV_CTX_SW_FENCEI_OFF; threads in
the same mm on other harts need to see the changes to these values
that occur when the prctl is started or stopped.
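
Condensed, the post-patch decision on the destination hart works as
below. This is a sketch of the hunks above, not literal kernel code:
locals such as cpu, mm, and migrated abbreviate the real expressions,
and the task->mm NULL check is omitted.

/* switch_mm() path, flush_icache_deferred(): */
smp_rmb();				/* observe prctl-side updates */
if (cpumask_test_and_clear_cpu(cpu, &mm->context.icache_stale_mask)) {
	smp_mb();			/* pairs with flush_icache_mm() */
	local_flush_icache_all();
}

/* switch_to() path, switch_to_should_flush_icache(): */
smp_rmb();
stale_thread = migrated && task->thread.force_icache_flush;
stale_mm = cpumask_test_cpu(cpu, &mm->context.icache_stale_mask);
/*
 * Re-arm (or clear) this hart's stale bit for the next switch,
 * depending on whether the prctl is still in effect.
 */
cpumask_assign_cpu(cpu, &mm->context.icache_stale_mask,
		   mm->context.force_icache_flush);
return stale_mm || stale_thread;	/* caller flushes if true */
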
Signed-off-by: Charlie Jenkins <charlie@rivosinc.com>
Fixes: 6b9391b581fd ("riscv: Include riscv_set_icache_flush_ctx prctl")
---
Changes since v1:
- Split this patch into a separate series
- Add memory barriers as suggested by Andrea
- Only set the icache_stale_mask on PR_RISCV_SCOPE_PER_PROCESS
- Clarify race condition and how this patch solves the issue
- Link to v1: https://lore.kernel.org/lkml/20240813-fix_fencei_optimization-v1-0-2aadc2cdde95@rivosinc.com/
---
 arch/riscv/include/asm/switch_to.h | 43 +++++++++++++++++++++++++++++++++---
 arch/riscv/mm/cacheflush.c         | 45 ++++++++++++++++++++++++++++++--------
 arch/riscv/mm/context.c            | 12 +++++-----
 3 files changed, 82 insertions(+), 18 deletions(-)
---
base-commit: 7c626ce4bae1ac14f60076d00eafe71af30450ba
change-id: 20240812-fix_fencei_optimization-3f81ac200505
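
For context, the userspace contract this series serves looks roughly
like the sketch below. It is illustrative only: error handling is
trimmed, the PR_RISCV_* fallback values are the ones in
include/uapi/linux/prctl.h at the time of this series, and the fence.i
is of course RISC-V specific.

/* icache-ctx-demo.c: hedged sketch of the prctl usage, not from this series. */
#include <stdio.h>
#include <sys/prctl.h>

/* Fallback definitions in case the libc headers predate the prctl. */
#ifndef PR_RISCV_SET_ICACHE_FLUSH_CTX
#define PR_RISCV_SET_ICACHE_FLUSH_CTX	71
#define PR_RISCV_CTX_SW_FENCEI_ON	0
#define PR_RISCV_CTX_SW_FENCEI_OFF	1
#define PR_RISCV_SCOPE_PER_PROCESS	0
#define PR_RISCV_SCOPE_PER_THREAD	1
#endif

int main(void)
{
	/*
	 * Tell the kernel that this process will issue fence.i itself,
	 * so migrated threads must have their icaches kept coherent.
	 */
	if (prctl(PR_RISCV_SET_ICACHE_FLUSH_CTX, PR_RISCV_CTX_SW_FENCEI_ON,
		  PR_RISCV_SCOPE_PER_PROCESS))
		perror("prctl(FENCEI_ON)");

	/* ... store new instructions to an executable mapping here ... */

	/* Publish the new code on the current hart. */
	__asm__ __volatile__("fence.i" ::: "memory");

	/* ... run the modified code ... */

	/* Done patching; the kernel may stop forcing flushes on migration. */
	if (prctl(PR_RISCV_SET_ICACHE_FLUSH_CTX, PR_RISCV_CTX_SW_FENCEI_OFF,
		  PR_RISCV_SCOPE_PER_PROCESS))
		perror("prctl(FENCEI_OFF)");

	return 0;
}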