Context |
Check |
Description |
bpf/vmtest-bpf-next-PR |
success
|
PR summary
|
bpf/vmtest-bpf-next-VM_Test-1 |
success
|
Logs for ShellCheck
|
bpf/vmtest-bpf-next-VM_Test-2 |
success
|
Logs for Unittests
|
bpf/vmtest-bpf-next-VM_Test-0 |
success
|
Logs for Lint
|
bpf/vmtest-bpf-next-VM_Test-3 |
success
|
Logs for Validate matrix.py
|
bpf/vmtest-bpf-next-VM_Test-4 |
success
|
Logs for aarch64-gcc / GCC BPF
|
bpf/vmtest-bpf-next-VM_Test-5 |
success
|
Logs for aarch64-gcc / build / build for aarch64 with gcc
|
bpf/vmtest-bpf-next-VM_Test-6 |
success
|
Logs for aarch64-gcc / build-release
|
bpf/vmtest-bpf-next-VM_Test-11 |
success
|
Logs for aarch64-gcc / veristat-kernel
|
bpf/vmtest-bpf-next-VM_Test-12 |
success
|
Logs for aarch64-gcc / veristat-meta
|
bpf/vmtest-bpf-next-VM_Test-13 |
success
|
Logs for s390x-gcc / GCC BPF
|
bpf/vmtest-bpf-next-VM_Test-14 |
success
|
Logs for s390x-gcc / build / build for s390x with gcc
|
bpf/vmtest-bpf-next-VM_Test-15 |
success
|
Logs for s390x-gcc / build-release
|
bpf/vmtest-bpf-next-VM_Test-7 |
success
|
Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
|
bpf/vmtest-bpf-next-VM_Test-10 |
success
|
Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
|
bpf/vmtest-bpf-next-VM_Test-18 |
success
|
Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
|
bpf/vmtest-bpf-next-VM_Test-19 |
success
|
Logs for s390x-gcc / veristat-kernel
|
bpf/vmtest-bpf-next-VM_Test-20 |
success
|
Logs for s390x-gcc / veristat-meta
|
bpf/vmtest-bpf-next-VM_Test-21 |
success
|
Logs for set-matrix
|
bpf/vmtest-bpf-next-VM_Test-23 |
success
|
Logs for x86_64-gcc / build / build for x86_64 with gcc
|
bpf/vmtest-bpf-next-VM_Test-24 |
success
|
Logs for x86_64-gcc / build-release
|
bpf/vmtest-bpf-next-VM_Test-34 |
success
|
Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
|
bpf/vmtest-bpf-next-VM_Test-35 |
success
|
Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
|
bpf/vmtest-bpf-next-VM_Test-40 |
success
|
Logs for x86_64-llvm-17 / veristat-kernel
|
bpf/vmtest-bpf-next-VM_Test-41 |
success
|
Logs for x86_64-llvm-17 / veristat-meta
|
bpf/vmtest-bpf-next-VM_Test-43 |
success
|
Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
|
bpf/vmtest-bpf-next-VM_Test-44 |
success
|
Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
|
bpf/vmtest-bpf-next-VM_Test-50 |
success
|
Logs for x86_64-llvm-18 / veristat-kernel
|
bpf/vmtest-bpf-next-VM_Test-51 |
success
|
Logs for x86_64-llvm-18 / veristat-meta
|
bpf/vmtest-bpf-next-VM_Test-8 |
success
|
Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
|
bpf/vmtest-bpf-next-VM_Test-9 |
success
|
Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
|
bpf/vmtest-bpf-next-VM_Test-16 |
success
|
Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
|
bpf/vmtest-bpf-next-VM_Test-17 |
success
|
Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
|
bpf/vmtest-bpf-next-VM_Test-30 |
success
|
Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
|
bpf/vmtest-bpf-next-VM_Test-22 |
success
|
Logs for x86_64-gcc / GCC BPF / GCC BPF
|
bpf/vmtest-bpf-next-VM_Test-25 |
success
|
Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
|
bpf/vmtest-bpf-next-VM_Test-26 |
success
|
Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
|
bpf/vmtest-bpf-next-VM_Test-27 |
success
|
Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
|
bpf/vmtest-bpf-next-VM_Test-28 |
success
|
Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
|
bpf/vmtest-bpf-next-VM_Test-29 |
success
|
Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
|
bpf/vmtest-bpf-next-VM_Test-31 |
success
|
Logs for x86_64-gcc / veristat-kernel / x86_64-gcc veristat_kernel
|
bpf/vmtest-bpf-next-VM_Test-32 |
success
|
Logs for x86_64-gcc / veristat-meta / x86_64-gcc veristat_meta
|
bpf/vmtest-bpf-next-VM_Test-33 |
success
|
Logs for x86_64-llvm-17 / GCC BPF / GCC BPF
|
bpf/vmtest-bpf-next-VM_Test-36 |
success
|
Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
|
bpf/vmtest-bpf-next-VM_Test-37 |
success
|
Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
|
bpf/vmtest-bpf-next-VM_Test-38 |
success
|
Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
|
bpf/vmtest-bpf-next-VM_Test-39 |
success
|
Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
|
bpf/vmtest-bpf-next-VM_Test-42 |
success
|
Logs for x86_64-llvm-18 / GCC BPF / GCC BPF
|
bpf/vmtest-bpf-next-VM_Test-45 |
success
|
Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
|
bpf/vmtest-bpf-next-VM_Test-46 |
success
|
Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
|
bpf/vmtest-bpf-next-VM_Test-47 |
success
|
Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
|
bpf/vmtest-bpf-next-VM_Test-48 |
success
|
Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
|
bpf/vmtest-bpf-next-VM_Test-49 |
success
|
Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
|
new file mode 100644
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_RQSPINLOCK_H
+#define _ASM_RQSPINLOCK_H
+
+#include <asm/barrier.h>
+
+/*
+ * Hardcode res_smp_cond_load_acquire implementations for arm64 to a custom
+ * version based on [0]. In rqspinlock code, our conditional expression involves
+ * checking the value _and_ additionally a timeout. However, on arm64, the
+ * WFE-based implementation may never spin again if no stores occur to the
+ * locked byte in the lock word. As such, we may be stuck forever if
+ * event-stream based unblocking is not available on the platform for WFE spin
+ * loops (arch_timer_evtstrm_available).
+ *
+ * Once support for smp_cond_load_acquire_timewait [0] lands, we can drop this
+ * copy-paste.
+ *
+ * While we rely on the implementation to amortize the cost of sampling
+ * cond_expr for us, it will not happen when event stream support is
+ * unavailable, time_expr check is amortized. This is not the common case, and
+ * it would be difficult to fit our logic in the time_expr_ns >= time_limit_ns
+ * comparison, hence just let it be. In case of event-stream, the loop is woken
+ * up at microsecond granularity.
+ *
+ * [0]: https://lore.kernel.org/lkml/20250203214911.898276-1-ankur.a.arora@oracle.com
+ */
+
+#ifndef smp_cond_load_acquire_timewait
+
+#define smp_cond_time_check_count 200
+
+#define __smp_cond_load_relaxed_spinwait(ptr, cond_expr, time_expr_ns, \
+ time_limit_ns) ({ \
+ typeof(ptr) __PTR = (ptr); \
+ __unqual_scalar_typeof(*ptr) VAL; \
+ unsigned int __count = 0; \
+ for (;;) { \
+ VAL = READ_ONCE(*__PTR); \
+ if (cond_expr) \
+ break; \
+ cpu_relax(); \
+ if (__count++ < smp_cond_time_check_count) \
+ continue; \
+ if ((time_expr_ns) >= (time_limit_ns)) \
+ break; \
+ __count = 0; \
+ } \
+ (typeof(*ptr))VAL; \
+})
+
+#define __smp_cond_load_acquire_timewait(ptr, cond_expr, \
+ time_expr_ns, time_limit_ns) \
+({ \
+ typeof(ptr) __PTR = (ptr); \
+ __unqual_scalar_typeof(*ptr) VAL; \
+ for (;;) { \
+ VAL = smp_load_acquire(__PTR); \
+ if (cond_expr) \
+ break; \
+ __cmpwait_relaxed(__PTR, VAL); \
+ if ((time_expr_ns) >= (time_limit_ns)) \
+ break; \
+ } \
+ (typeof(*ptr))VAL; \
+})
+
+#define smp_cond_load_acquire_timewait(ptr, cond_expr, \
+ time_expr_ns, time_limit_ns) \
+({ \
+ __unqual_scalar_typeof(*ptr) _val; \
+ int __wfe = arch_timer_evtstrm_available(); \
+ \
+ if (likely(__wfe)) { \
+ _val = __smp_cond_load_acquire_timewait(ptr, cond_expr, \
+ time_expr_ns, \
+ time_limit_ns); \
+ } else { \
+ _val = __smp_cond_load_relaxed_spinwait(ptr, cond_expr, \
+ time_expr_ns, \
+ time_limit_ns); \
+ smp_acquire__after_ctrl_dep(); \
+ } \
+ (typeof(*ptr))_val; \
+})
+
+#endif
+
+#define res_smp_cond_load_acquire_timewait(v, c) smp_cond_load_acquire_timewait(v, c, 0, 1)
+
+#include <asm-generic/rqspinlock.h>
+
+#endif /* _ASM_RQSPINLOCK_H */
@@ -92,12 +92,21 @@ static noinline int check_timeout(struct rqspinlock_timeout *ts)
return 0;
}
+/*
+ * Do not amortize with spins when res_smp_cond_load_acquire is defined,
+ * as the macro does internal amortization for us.
+ */
+#ifndef res_smp_cond_load_acquire
#define RES_CHECK_TIMEOUT(ts, ret) \
({ \
if (!(ts).spin++) \
(ret) = check_timeout(&(ts)); \
(ret); \
})
+#else
+#define RES_CHECK_TIMEOUT(ts, ret, mask) \
+ ({ (ret) = check_timeout(&(ts)); })
+#endif
/*
* Initialize the 'spin' member.
@@ -118,6 +127,12 @@ static noinline int check_timeout(struct rqspinlock_timeout *ts)
*/
static DEFINE_PER_CPU_ALIGNED(struct qnode, rqnodes[_Q_MAX_NODES]);
+#ifndef res_smp_cond_load_acquire
+#define res_smp_cond_load_acquire(v, c) smp_cond_load_acquire(v, c)
+#endif
+
+#define res_atomic_cond_read_acquire(v, c) res_smp_cond_load_acquire(&(v)->counter, (c))
+
/**
* resilient_queued_spin_lock_slowpath - acquire the queued spinlock
* @lock: Pointer to queued spinlock structure
Currently, for rqspinlock usage, the implementation of smp_cond_load_acquire (and thus, atomic_cond_read_acquire) are susceptible to stalls on arm64, because they do not guarantee that the conditional expression will be repeatedly invoked if the address being loaded from is not written to by other CPUs. When support for event-streams is absent (which unblocks stuck WFE-based loops every ~100us), we may end up being stuck forever. This causes a problem for us, as we need to repeatedly invoke the RES_CHECK_TIMEOUT in the spin loop to break out when the timeout expires. Let us import the smp_cond_load_acquire_timewait implementation Ankur is proposing in [0], and then fallback to it once it is merged. While we rely on the implementation to amortize the cost of sampling check_timeout for us, it will not happen when event stream support is unavailable. This is not the common case, and it would be difficult to fit our logic in the time_expr_ns >= time_limit_ns comparison, hence just let it be. [0]: https://lore.kernel.org/lkml/20250203214911.898276-1-ankur.a.arora@oracle.com Cc: Ankur Arora <ankur.a.arora@oracle.com> Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com> --- arch/arm64/include/asm/rqspinlock.h | 93 +++++++++++++++++++++++++++++ kernel/locking/rqspinlock.c | 15 +++++ 2 files changed, 108 insertions(+) create mode 100644 arch/arm64/include/asm/rqspinlock.h