
[bpf-next,v5,1/4] bpf: add bpf_get_cpu_cycles kfunc

Message ID 20241109004158.2259301-1-vadfed@meta.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Series [bpf-next,v5,1/4] bpf: add bpf_get_cpu_cycles kfunc

Checks

Context Check Description
netdev/series_format warning Series does not have a cover letter
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 204 this patch: 204
netdev/build_tools success Errors and warnings before: 0 (+0) this patch: 0 (+0)
netdev/cc_maintainers warning 14 maintainers not CCed: udknight@gmail.com mingo@redhat.com dave.hansen@linux.intel.com kpsingh@kernel.org jolsa@kernel.org hpa@zytor.com bp@alien8.de dsahern@kernel.org netdev@vger.kernel.org sdf@fomichev.me song@kernel.org john.fastabend@gmail.com haoluo@google.com yonghong.song@linux.dev
netdev/build_clang success Errors and warnings before: 255 this patch: 255
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 6971 this patch: 6971
netdev/checkpatch warning WARNING: line length of 104 exceeds 80 columns WARNING: line length of 82 exceeds 80 columns WARNING: line length of 83 exceeds 80 columns WARNING: line length of 91 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 17 this patch: 17
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-8 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-7 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-13 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release

Commit Message

Vadim Fedorenko Nov. 9, 2024, 12:41 a.m. UTC
New kfunc to return the ARCH-specific timecounter. For x86, the BPF JIT
converts it into an ordered rdtsc call. Other architectures will get JIT
implementations too if supported. The fallback is to
__arch_get_hw_counter().

Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
---
v4 -> v5:
* use if instead of ifdef with IS_ENABLED
v3 -> v4:
* change name of the helper to bpf_get_cpu_cycles (Andrii)
* Hide the helper behind CONFIG_GENERIC_GETTIMEOFDAY to avoid exposing
  it on architectures which do not have vDSO functions and data
* reduce the scope of check of inlined functions in verifier to only 2,
  which are actually inlined.
v2 -> v3:
* change name of the helper to bpf_get_cpu_cycles_counter to explicitly
  mention what counter it provides (Andrii)
* move kfunc definition to bpf.h to use it in JIT.
* introduce another kfunc to convert cycles into nanoseconds as more
  meaningful time units for generic tracing use case (Andrii)
v1 -> v2:
* Fix incorrect function return value type to u64
* Introduce bpf_jit_inlines_kfunc_call() and use it in
  mark_fastcall_pattern_for_call() to avoid clobbering in case of
  running programs with no JIT (Eduard)
* Avoid rewriting instruction and check function pointer directly
  in JIT (Alexei)
* Change includes to fix compile issues on non x86 architectures
---
 arch/x86/net/bpf_jit_comp.c   | 28 ++++++++++++++++++++++++++++
 arch/x86/net/bpf_jit_comp32.c | 14 ++++++++++++++
 include/linux/bpf.h           |  5 +++++
 include/linux/filter.h        |  1 +
 kernel/bpf/core.c             | 11 +++++++++++
 kernel/bpf/helpers.c          | 13 +++++++++++++
 kernel/bpf/verifier.c         | 30 +++++++++++++++++++++++++++++-
 7 files changed, 101 insertions(+), 1 deletion(-)
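
For reference, a minimal sketch of what the generic (non-inlined) fallback
can look like, pieced together from the changelog above and the disassembly
quoted later in the thread; the exact guards and annotations in the patch
may differ:

__bpf_kfunc u64 bpf_get_cpu_cycles(void)
{
	/* Generic fallback: read the vDSO hardware counter (clock mode 1 is
	 * the TSC mode on x86, no vdso data pointer is needed).  On x86 the
	 * BPF JIT never emits this call; it inlines an rdtsc sequence instead.
	 */
	if (IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY))
		return __arch_get_hw_counter(1, NULL);
	/* Behaviour without vDSO support is not shown in this excerpt. */
	return 0;
}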

Comments

Andrii Nakryiko Nov. 12, 2024, 5:50 a.m. UTC | #1
On Fri, Nov 8, 2024 at 4:42 PM Vadim Fedorenko <vadfed@meta.com> wrote:
>
> New kfunc to return ARCH-specific timecounter. For x86 BPF JIT converts
> it into rdtsc ordered call. Other architectures will get JIT
> implementation too if supported. The fallback is to
> __arch_get_hw_counter().
>
> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
> ---

nit: please add cover letter for the next revision, multi-patch sets
generally should come with a cover letter, unless it's some set of
trivial and mostly independent patches. Anyways...

I haven't looked through the code yet, but I was curious to
benchmark the perf benefit, so that's what I did for fun this evening.

(!!!) BTW, a complete aside, but I think it's interesting. It turned
out that using bpf_test_prog_run() scales *VERY POORLY* with large
number of CPUs, because we start spending tons of time in
fdget()/fdput(), so I initially got pretty unscalable results,
profiled a bit, and then switched to just doing
syscall(__NR_getpgid) + SEC("raw_tp/sys_enter"). Anyways,
the point is that microbenchmarking is tricky and we need to improve
our existing bench setup for some benchmarks. Anyways, getting back to
the main topic.

I wrote two quick benchmarks testing what I see as the intended use case
for these kfuncs (batching amortizes the cost of triggering BPF
program, batch_iters = 500 in my case):

SEC("?raw_tp/sys_enter")
int trigger_driver_ktime(void *ctx)
{
        volatile __u64 total = 0;
        int i;

        for (i = 0; i < batch_iters; i++) {
                __u64 start, end;

                start = bpf_ktime_get_ns();
                end = bpf_ktime_get_ns();
                total += end - start;
        }
        inc_counter_batch(batch_iters);

        return 0;
}

extern __u64 bpf_get_cpu_cycles(void) __weak __ksym;
extern __u64 bpf_cpu_cycles_to_ns(__u64 cycles) __weak __ksym;

SEC("?raw_tp/sys_enter")
int trigger_driver_cycles(void *ctx)
{
        volatile __u64 total = 0;
        int i;

        for (i = 0; i < batch_iters; i++) {
                __u64 start, end;

                start = bpf_get_cpu_cycles();
                end = bpf_get_cpu_cycles();
                total += bpf_cpu_cycles_to_ns(end - start);
        }
        inc_counter_batch(batch_iters);

        return 0;
}

And here's what I got across multiple numbers of parallel CPUs on our
production host.

# ./bench_timing.sh

ktime                 ( 1 cpus):   32.286 ± 0.309M/s  ( 32.286M/s/cpu)
ktime                 ( 2 cpus):   63.021 ± 0.538M/s  ( 31.511M/s/cpu)
ktime                 ( 3 cpus):   94.211 ± 0.686M/s  ( 31.404M/s/cpu)
ktime                 ( 4 cpus):  124.757 ± 0.691M/s  ( 31.189M/s/cpu)
ktime                 ( 5 cpus):  154.855 ± 0.693M/s  ( 30.971M/s/cpu)
ktime                 ( 6 cpus):  185.551 ± 2.304M/s  ( 30.925M/s/cpu)
ktime                 ( 7 cpus):  211.117 ± 4.755M/s  ( 30.160M/s/cpu)
ktime                 ( 8 cpus):  236.454 ± 0.226M/s  ( 29.557M/s/cpu)
ktime                 (10 cpus):  295.526 ± 0.126M/s  ( 29.553M/s/cpu)
ktime                 (12 cpus):  322.282 ± 0.153M/s  ( 26.857M/s/cpu)
ktime                 (14 cpus):  375.347 ± 0.087M/s  ( 26.811M/s/cpu)
ktime                 (16 cpus):  399.813 ± 0.181M/s  ( 24.988M/s/cpu)
ktime                 (24 cpus):  617.675 ± 7.053M/s  ( 25.736M/s/cpu)
ktime                 (32 cpus):  819.695 ± 0.231M/s  ( 25.615M/s/cpu)
ktime                 (40 cpus):  996.264 ± 0.290M/s  ( 24.907M/s/cpu)
ktime                 (48 cpus): 1180.201 ± 0.160M/s  ( 24.588M/s/cpu)
ktime                 (56 cpus): 1321.084 ± 0.099M/s  ( 23.591M/s/cpu)
ktime                 (64 cpus): 1482.061 ± 0.121M/s  ( 23.157M/s/cpu)
ktime                 (72 cpus): 1666.540 ± 0.460M/s  ( 23.146M/s/cpu)
ktime                 (80 cpus): 1851.419 ± 0.439M/s  ( 23.143M/s/cpu)

cycles                ( 1 cpus):   45.815 ± 0.018M/s  ( 45.815M/s/cpu)
cycles                ( 2 cpus):   86.706 ± 0.068M/s  ( 43.353M/s/cpu)
cycles                ( 3 cpus):  129.899 ± 0.101M/s  ( 43.300M/s/cpu)
cycles                ( 4 cpus):  168.435 ± 0.073M/s  ( 42.109M/s/cpu)
cycles                ( 5 cpus):  210.520 ± 0.164M/s  ( 42.104M/s/cpu)
cycles                ( 6 cpus):  252.596 ± 0.050M/s  ( 42.099M/s/cpu)
cycles                ( 7 cpus):  294.356 ± 0.159M/s  ( 42.051M/s/cpu)
cycles                ( 8 cpus):  317.167 ± 0.163M/s  ( 39.646M/s/cpu)
cycles                (10 cpus):  396.141 ± 0.208M/s  ( 39.614M/s/cpu)
cycles                (12 cpus):  431.938 ± 0.511M/s  ( 35.995M/s/cpu)
cycles                (14 cpus):  503.055 ± 0.070M/s  ( 35.932M/s/cpu)
cycles                (16 cpus):  534.261 ± 0.107M/s  ( 33.391M/s/cpu)
cycles                (24 cpus):  836.838 ± 0.141M/s  ( 34.868M/s/cpu)
cycles                (32 cpus): 1099.689 ± 0.314M/s  ( 34.365M/s/cpu)
cycles                (40 cpus): 1336.573 ± 0.015M/s  ( 33.414M/s/cpu)
cycles                (48 cpus): 1571.734 ± 11.151M/s  ( 32.744M/s/cpu)
cycles                (56 cpus): 1819.242 ± 4.627M/s  ( 32.486M/s/cpu)
cycles                (64 cpus): 2046.285 ± 5.169M/s  ( 31.973M/s/cpu)
cycles                (72 cpus): 2287.683 ± 0.787M/s  ( 31.773M/s/cpu)
cycles                (80 cpus): 2505.414 ± 0.626M/s  ( 31.318M/s/cpu)

So, from about +42% on a single CPU, to +36% at 80 CPUs. Not that bad.
Scalability-wise, we still see some drop off in performance, but
believe me, with bpf_prog_test_run() it was so much worse :) I also
verified that now we spend cycles almost exclusively inside the BPF
program (so presumably in those benchmarked kfuncs).
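For completeness, a minimal user-space trigger loop consistent with the setup
described above (the BPF side being the two programs quoted earlier, attached
to raw_tp/sys_enter); illustrative only, not the actual bench harness:

#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	/* Each getpgid() syscall fires the attached raw_tp/sys_enter program
	 * once; the program amortizes the trigger cost over batch_iters
	 * timing iterations.  Attaching the program (e.g. via a libbpf
	 * skeleton) is omitted here.
	 */
	for (;;)
		syscall(__NR_getpgid);
	return 0;
}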
Eduard Zingerman Nov. 12, 2024, 9:21 p.m. UTC | #2
On Fri, 2024-11-08 at 16:41 -0800, Vadim Fedorenko wrote:
> New kfunc to return ARCH-specific timecounter. For x86 BPF JIT converts
> it into rdtsc ordered call. Other architectures will get JIT
> implementation too if supported. The fallback is to
> __arch_get_hw_counter().
> 
> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
> ---

Aside from a note below, I think this patch is in good shape.

Acked-by: Eduard Zingerman <eddyz87@gmail.com>

[...]

> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index 06b080b61aa5..4f78ed93ee7f 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -2126,6 +2126,26 @@ st:			if (is_imm8(insn->off))
>  		case BPF_JMP | BPF_CALL: {
>  			u8 *ip = image + addrs[i - 1];
>  
> +			if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
> +			    imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
> +				/* Save RDX because RDTSC will use EDX:EAX to return u64 */
> +				emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
> +				if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
> +					EMIT_LFENCE();
> +				EMIT2(0x0F, 0x31);
> +
> +				/* shl RDX, 32 */
> +				maybe_emit_1mod(&prog, BPF_REG_3, true);
> +				EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
> +				/* or RAX, RDX */
> +				maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
> +				EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
> +				/* restore RDX from R11 */
> +				emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);

Note: The default implementation of this kfunc uses __arch_get_hw_counter(),
      which is implemented as `(u64)rdtsc_ordered() & S64_MAX`.
      Here we don't do `& S64_MAX`.
      The masking in __arch_get_hw_counter() was added by this commit:
      77750f78b0b3 ("x86/vdso: Fix gettimeofday masking").
      Also, the default implementation does not issue `lfence`.
      Not sure if this makes any real-world difference.
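
For context, the helper the note refers to looks roughly like this
(paraphrased and simplified from the x86 vDSO gettimeofday header, not
quoted verbatim):

static __always_inline u64 __arch_get_hw_counter(s32 clock_mode,
						 const struct vdso_data *vd)
{
	if (likely(clock_mode == VDSO_CLOCKMODE_TSC))
		/* Mask off the top bit so that later delta arithmetic in the
		 * gettimeofday path never sees a negative value, see
		 * 77750f78b0b3 referenced above.
		 */
		return (u64)rdtsc_ordered() & S64_MAX;
	/* pvclock/hvclock and invalid clock modes elided */
	return U64_MAX;
}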

> +
> +				break;
> +			}
> +
>  			func = (u8 *) __bpf_call_base + imm32;
>  			if (tail_call_reachable) {
>  				LOAD_TAIL_CALL_CNT_PTR(bpf_prog->aux->stack_depth);

[...]

> @@ -20488,6 +20510,12 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>  						node_offset_reg, insn, insn_buf, cnt);
>  	} else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
>  		   desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
> +		if (!verifier_inlines_kfunc_call(env, imm)) {
> +			verbose(env, "verifier internal error: kfunc id %d is not defined in checker\n",
> +				desc->func_id);
> +			return -EFAULT;
> +		}
> +

Nit: I still think that making this check the first conditional would
     have been better:

     if (verifier_inlines_kfunc_call(env, imm)) {
        if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
           desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
           // ...
        } else {
           // report error
        }
     } else if (...) {
       // ... rest of the cases
     }
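
Spelled out with the identifiers from the hunk being discussed, the suggested
shape would be roughly the following (illustrative only, not code from the
patch):

	if (verifier_inlines_kfunc_call(env, imm)) {
		if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
		    desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
			insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
			*cnt = 1;
		} else {
			verbose(env, "verifier internal error: kfunc id %d is not defined in checker\n",
				desc->func_id);
			return -EFAULT;
		}
	} else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id)) {
		/* ... rest of the special-kfunc cases, unchanged ... */
	}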

>  		insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
>  		*cnt = 1;
>  	} else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id)) {
Vadim Fedorenko Nov. 12, 2024, 9:39 p.m. UTC | #3
On 12/11/2024 21:21, Eduard Zingerman wrote:
> On Fri, 2024-11-08 at 16:41 -0800, Vadim Fedorenko wrote:
>> New kfunc to return ARCH-specific timecounter. For x86 BPF JIT converts
>> it into rdtsc ordered call. Other architectures will get JIT
>> implementation too if supported. The fallback is to
>> __arch_get_hw_counter().
>>
>> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
>> ---
> 
> Aside from a note below, I think this patch is in good shape.
> 
> Acked-by: Eduard Zingerman <eddyz87@gmail.com>
> 
> [...]
> 
>> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
>> index 06b080b61aa5..4f78ed93ee7f 100644
>> --- a/arch/x86/net/bpf_jit_comp.c
>> +++ b/arch/x86/net/bpf_jit_comp.c
>> @@ -2126,6 +2126,26 @@ st:			if (is_imm8(insn->off))
>>   		case BPF_JMP | BPF_CALL: {
>>   			u8 *ip = image + addrs[i - 1];
>>   
>> +			if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
>> +			    imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
>> +				/* Save RDX because RDTSC will use EDX:EAX to return u64 */
>> +				emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
>> +				if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
>> +					EMIT_LFENCE();
>> +				EMIT2(0x0F, 0x31);
>> +
>> +				/* shl RDX, 32 */
>> +				maybe_emit_1mod(&prog, BPF_REG_3, true);
>> +				EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
>> +				/* or RAX, RDX */
>> +				maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
>> +				EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
>> +				/* restore RDX from R11 */
>> +				emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
> 
> Note: The default implementation of this kfunc uses __arch_get_hw_counter(),
>        which is implemented as `(u64)rdtsc_ordered() & S64_MAX`.
>        Here we don't do `& S64_MAX`.
>        The masking in __arch_get_hw_counter() was added by this commit:
>        77750f78b0b3 ("x86/vdso: Fix gettimeofday masking").

I think we already discussed it with Alexei in v1; we don't really need
any masking here for the BPF case. We can use the values provided by the
CPU directly. It will never happen that within one BPF program we have
both the inlined and the non-inlined implementation of this helper, hence
the values being compared will come from the same source.

>        Also, the default implementation does not issue `lfence`.
>        Not sure if this makes any real-world difference.

Well, it actually does. rdtsc_ordered is translated into `lfence; rdtsc`
or `rdtscp` (which is rdtsc + lfence + u32 cookie) depending on the cpu
features.
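
For reference, rdtsc_ordered() is roughly the following (paraphrased from the
x86 msr header; the ALTERNATIVE_2 site is patched at boot according to the
CPU features, selecting one of the three variants):

static __always_inline u64 rdtsc_ordered(void)
{
	unsigned long low, high;

	asm volatile(ALTERNATIVE_2("rdtsc",
				   "lfence; rdtsc", X86_FEATURE_LFENCE_RDTSC,
				   "rdtscp", X86_FEATURE_RDTSCP)
		     : "=a" (low), "=d" (high)
		     /* RDTSCP also clobbers ECX with TSC_AUX */
		     :: "ecx");

	return low | (high << 32);
}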

>> +
>> +				break;
>> +			}
>> +
>>   			func = (u8 *) __bpf_call_base + imm32;
>>   			if (tail_call_reachable) {
>>   				LOAD_TAIL_CALL_CNT_PTR(bpf_prog->aux->stack_depth);
> 
> [...]
> 
>> @@ -20488,6 +20510,12 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
>>   						node_offset_reg, insn, insn_buf, cnt);
>>   	} else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
>>   		   desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
>> +		if (!verifier_inlines_kfunc_call(env, imm)) {
>> +			verbose(env, "verifier internal error: kfunc id %d is not defined in checker\n",
>> +				desc->func_id);
>> +			return -EFAULT;
>> +		}
>> +
> 
> Nit: still think that moving this check as the first conditional would
>       have been better:
> 
>       if (verifier_inlines_kfunc_call(env, imm)) {
>          if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
>             desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
>             // ...
>          } else {
>             // report error
>          }
>       } else if (...) {
>         // ... rest of the cases
>       }

I can change it in the next iteration (if it's needed) if you insist

>>   		insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
>>   		*cnt = 1;
>>   	} else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id)) {
> 
>
Vadim Fedorenko Nov. 12, 2024, 9:43 p.m. UTC | #4
On 12/11/2024 05:50, Andrii Nakryiko wrote:
> On Fri, Nov 8, 2024 at 4:42 PM Vadim Fedorenko <vadfed@meta.com> wrote:
>>
>> New kfunc to return ARCH-specific timecounter. For x86 BPF JIT converts
>> it into rdtsc ordered call. Other architectures will get JIT
>> implementation too if supported. The fallback is to
>> __arch_get_hw_counter().
>>
>> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
>> ---
> 
> nit: please add cover letter for the next revision, multi-patch sets
> generally should come with a cover letter, unless it's some set of
> trivial and mostly independent patches. Anyways...

Yeah, sure. This series has grown from the first small version...

> 
> I haven't yet looked through the code (yet), but I was curious to
> benchmark the perf benefit, so that's what I did for fun this evening.
> 
> (!!!) BTW, a complete aside, but I think it's interesting. It turned
> out that using bpf_test_prog_run() scales *VERY POORLY* with large
> number of CPUs, because we start spending tons of time in
> fdget()/fdput(), so I initially got pretty unscalable results,
> profiled a bit, and then switched to just doing
> syscall(syscall(__NR_getpgid); + SEC("raw_tp/sys_enter")). Anyways,
> the point is that microbenchmarking is tricky and we need to improve
> our existing bench setup for some benchmarks. Anyways, getting back to
> the main topic.
> 
> I wrote a quick two benchmarks testing what I see as intended use case
> for these kfuncs (batching amortizes the cost of triggering BPF
> program, batch_iters = 500 in my case):
> 
> SEC("?raw_tp/sys_enter")
> int trigger_driver_ktime(void *ctx)
> {
>          volatile __u64 total = 0;
>          int i;
> 
>          for (i = 0; i < batch_iters; i++) {
>                  __u64 start, end;
> 
>                  start = bpf_ktime_get_ns();
>                  end = bpf_ktime_get_ns();
>                  total += end - start;
>          }
>          inc_counter_batch(batch_iters);
> 
>          return 0;
> }
> 
> extern __u64 bpf_get_cpu_cycles(void) __weak __ksym;
> extern __u64 bpf_cpu_cycles_to_ns(__u64 cycles) __weak __ksym;
> 
> SEC("?raw_tp/sys_enter")
> int trigger_driver_cycles(void *ctx)
> {
>          volatile __u64 total = 0;
>          int i;
> 
>          for (i = 0; i < batch_iters; i++) {
>                  __u64 start, end;
> 
>                  start = bpf_get_cpu_cycles();
>                  end = bpf_get_cpu_cycles();
>                  total += bpf_cpu_cycles_to_ns(end - start);
>          }
>          inc_counter_batch(batch_iters);
> 
>          return 0;
> }
> 
> And here's what I got across multiple numbers of parallel CPUs on our
> production host.
> 
> # ./bench_timing.sh
> 
> ktime                 ( 1 cpus):   32.286 ± 0.309M/s  ( 32.286M/s/cpu)
> ktime                 ( 2 cpus):   63.021 ± 0.538M/s  ( 31.511M/s/cpu)
> ktime                 ( 3 cpus):   94.211 ± 0.686M/s  ( 31.404M/s/cpu)
> ktime                 ( 4 cpus):  124.757 ± 0.691M/s  ( 31.189M/s/cpu)
> ktime                 ( 5 cpus):  154.855 ± 0.693M/s  ( 30.971M/s/cpu)
> ktime                 ( 6 cpus):  185.551 ± 2.304M/s  ( 30.925M/s/cpu)
> ktime                 ( 7 cpus):  211.117 ± 4.755M/s  ( 30.160M/s/cpu)
> ktime                 ( 8 cpus):  236.454 ± 0.226M/s  ( 29.557M/s/cpu)
> ktime                 (10 cpus):  295.526 ± 0.126M/s  ( 29.553M/s/cpu)
> ktime                 (12 cpus):  322.282 ± 0.153M/s  ( 26.857M/s/cpu)
> ktime                 (14 cpus):  375.347 ± 0.087M/s  ( 26.811M/s/cpu)
> ktime                 (16 cpus):  399.813 ± 0.181M/s  ( 24.988M/s/cpu)
> ktime                 (24 cpus):  617.675 ± 7.053M/s  ( 25.736M/s/cpu)
> ktime                 (32 cpus):  819.695 ± 0.231M/s  ( 25.615M/s/cpu)
> ktime                 (40 cpus):  996.264 ± 0.290M/s  ( 24.907M/s/cpu)
> ktime                 (48 cpus): 1180.201 ± 0.160M/s  ( 24.588M/s/cpu)
> ktime                 (56 cpus): 1321.084 ± 0.099M/s  ( 23.591M/s/cpu)
> ktime                 (64 cpus): 1482.061 ± 0.121M/s  ( 23.157M/s/cpu)
> ktime                 (72 cpus): 1666.540 ± 0.460M/s  ( 23.146M/s/cpu)
> ktime                 (80 cpus): 1851.419 ± 0.439M/s  ( 23.143M/s/cpu)
> 
> cycles                ( 1 cpus):   45.815 ± 0.018M/s  ( 45.815M/s/cpu)
> cycles                ( 2 cpus):   86.706 ± 0.068M/s  ( 43.353M/s/cpu)
> cycles                ( 3 cpus):  129.899 ± 0.101M/s  ( 43.300M/s/cpu)
> cycles                ( 4 cpus):  168.435 ± 0.073M/s  ( 42.109M/s/cpu)
> cycles                ( 5 cpus):  210.520 ± 0.164M/s  ( 42.104M/s/cpu)
> cycles                ( 6 cpus):  252.596 ± 0.050M/s  ( 42.099M/s/cpu)
> cycles                ( 7 cpus):  294.356 ± 0.159M/s  ( 42.051M/s/cpu)
> cycles                ( 8 cpus):  317.167 ± 0.163M/s  ( 39.646M/s/cpu)
> cycles                (10 cpus):  396.141 ± 0.208M/s  ( 39.614M/s/cpu)
> cycles                (12 cpus):  431.938 ± 0.511M/s  ( 35.995M/s/cpu)
> cycles                (14 cpus):  503.055 ± 0.070M/s  ( 35.932M/s/cpu)
> cycles                (16 cpus):  534.261 ± 0.107M/s  ( 33.391M/s/cpu)
> cycles                (24 cpus):  836.838 ± 0.141M/s  ( 34.868M/s/cpu)
> cycles                (32 cpus): 1099.689 ± 0.314M/s  ( 34.365M/s/cpu)
> cycles                (40 cpus): 1336.573 ± 0.015M/s  ( 33.414M/s/cpu)
> cycles                (48 cpus): 1571.734 ± 11.151M/s  ( 32.744M/s/cpu)
> cycles                (56 cpus): 1819.242 ± 4.627M/s  ( 32.486M/s/cpu)
> cycles                (64 cpus): 2046.285 ± 5.169M/s  ( 31.973M/s/cpu)
> cycles                (72 cpus): 2287.683 ± 0.787M/s  ( 31.773M/s/cpu)
> cycles                (80 cpus): 2505.414 ± 0.626M/s  ( 31.318M/s/cpu)
> 
> So, from about +42% on a single CPU, to +36% at 80 CPUs. Not that bad.
> Scalability-wise, we still see some drop off in performance, but
> believe me, with bpf_prog_test_run() it was so much worse :) I also
> verified that now we spend cycles almost exclusively inside the BPF
> program (so presumably in those benchmarked kfuncs).

Am I right that the numbers show how many iterations were done during
the same amount of time? It would also be great to understand whether
we get more precise measurements - just in case you have your tests
ready...
Eduard Zingerman Nov. 12, 2024, 9:53 p.m. UTC | #5
On Tue, 2024-11-12 at 21:39 +0000, Vadim Fedorenko wrote:

[...]

> > > +			if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
> > > +			    imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
> > > +				/* Save RDX because RDTSC will use EDX:EAX to return u64 */
> > > +				emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
> > > +				if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
> > > +					EMIT_LFENCE();
> > > +				EMIT2(0x0F, 0x31);
> > > +
> > > +				/* shl RDX, 32 */
> > > +				maybe_emit_1mod(&prog, BPF_REG_3, true);
> > > +				EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
> > > +				/* or RAX, RDX */
> > > +				maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
> > > +				EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
> > > +				/* restore RDX from R11 */
> > > +				emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
> > 
> > Note: The default implementation of this kfunc uses __arch_get_hw_counter(),
> >        which is implemented as `(u64)rdtsc_ordered() & S64_MAX`.
> >        Here we don't do `& S64_MAX`.
> >        The masking in __arch_get_hw_counter() was added by this commit:
> >        77750f78b0b3 ("x86/vdso: Fix gettimeofday masking").
> 
> I think we already discussed it with Alexey in v1, we don't really need
> any masking here for BPF case. We can use values provided by CPU
> directly. It will never happen that within one BPF program we will have
> inlined and non-inlined implementation of this helper, hence the values
> to compare will be of the same source.
> 
> >        Also, the default implementation does not issue `lfence`.
> >        Not sure if this makes any real-world difference.
> 
> Well, it actually does. rdtsc_ordered is translated into `lfence; rdtsc`
> or `rdtscp` (which is rdtsc + lfence + u32 cookie) depending on the cpu
> features.

I see the following disassembly:

0000000000008980 <bpf_get_cpu_cycles>:
; {
    8980: f3 0f 1e fa                   endbr64
    8984: e8 00 00 00 00                callq   0x8989 <bpf_get_cpu_cycles+0x9>
                0000000000008985:  R_X86_64_PLT32       __fentry__-0x4
;       asm volatile(ALTERNATIVE_2("rdtsc",
    8989: 0f 31                         rdtsc
    898b: 90                            nop
    898c: 90                            nop
    898d: 90                            nop
;       return EAX_EDX_VAL(val, low, high);
    898e: 48 c1 e2 20                   shlq    $0x20, %rdx
    8992: 48 09 d0                      orq     %rdx, %rax
    8995: 48 b9 ff ff ff ff ff ff ff 7f movabsq $0x7fffffffffffffff, %rcx # imm = 0x7FFFFFFFFFFFFFFF
;               return (u64)rdtsc_ordered() & S64_MAX;
    899f: 48 21 c8                      andq    %rcx, %rax
;       return __arch_get_hw_counter(1, NULL);
    89a2: 2e e9 00 00 00 00             jmp     0x89a8 <bpf_get_cpu_cycles+0x28>

Is it patched when kernel is loaded to replace nops with lfence?
By real-world difference I meant difference between default
implementation and inlined assembly.

[...]

> > > @@ -20488,6 +20510,12 @@ static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
> > >   						node_offset_reg, insn, insn_buf, cnt);
> > >   	} else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
> > >   		   desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
> > > +		if (!verifier_inlines_kfunc_call(env, imm)) {
> > > +			verbose(env, "verifier internal error: kfunc id %d is not defined in checker\n",
> > > +				desc->func_id);
> > > +			return -EFAULT;
> > > +		}
> > > +
> > 
> > Nit: still think that moving this check as the first conditional would
> >       have been better:
> > 
> >       if (verifier_inlines_kfunc_call(env, imm)) {
> >          if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
> >             desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
> >             // ...
> >          } else {
> >             // report error
> >          }
> >       } else if (...) {
> >         // ... rest of the cases
> >       }
> 
> I can change it in the next iteration (if it's needed) if you insist

No need to change if there would be no next iteration.

[...]
Eduard Zingerman Nov. 12, 2024, 10:19 p.m. UTC | #6
On Tue, 2024-11-12 at 13:53 -0800, Eduard Zingerman wrote:
> On Tue, 2024-11-12 at 21:39 +0000, Vadim Fedorenko wrote:
> 
> [...]
> 
> > > > +			if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
> > > > +			    imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
> > > > +				/* Save RDX because RDTSC will use EDX:EAX to return u64 */
> > > > +				emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
> > > > +				if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
> > > > +					EMIT_LFENCE();
> > > > +				EMIT2(0x0F, 0x31);
> > > > +
> > > > +				/* shl RDX, 32 */
> > > > +				maybe_emit_1mod(&prog, BPF_REG_3, true);
> > > > +				EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
> > > > +				/* or RAX, RDX */
> > > > +				maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
> > > > +				EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
> > > > +				/* restore RDX from R11 */
> > > > +				emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
> > > 
> > > Note: The default implementation of this kfunc uses __arch_get_hw_counter(),
> > >        which is implemented as `(u64)rdtsc_ordered() & S64_MAX`.
> > >        Here we don't do `& S64_MAX`.
> > >        The masking in __arch_get_hw_counter() was added by this commit:
> > >        77750f78b0b3 ("x86/vdso: Fix gettimeofday masking").
> > 
> > I think we already discussed it with Alexey in v1, we don't really need
> > any masking here for BPF case. We can use values provided by CPU
> > directly. It will never happen that within one BPF program we will have
> > inlined and non-inlined implementation of this helper, hence the values
> > to compare will be of the same source.
> > 
> > >        Also, the default implementation does not issue `lfence`.
> > >        Not sure if this makes any real-world difference.
> > 
> > Well, it actually does. rdtsc_ordered is translated into `lfence; rdtsc`
> > or `rdtscp` (which is rdtsc + lfence + u32 cookie) depending on the cpu
> > features.
> 
> I see the following disassembly:
> 
> 0000000000008980 <bpf_get_cpu_cycles>:
> ; {
>     8980: f3 0f 1e fa                   endbr64
>     8984: e8 00 00 00 00                callq   0x8989 <bpf_get_cpu_cycles+0x9>
>                 0000000000008985:  R_X86_64_PLT32       __fentry__-0x4
> ;       asm volatile(ALTERNATIVE_2("rdtsc",
>     8989: 0f 31                         rdtsc
>     898b: 90                            nop
>     898c: 90                            nop
>     898d: 90                            nop
> ;       return EAX_EDX_VAL(val, low, high);
>     898e: 48 c1 e2 20                   shlq    $0x20, %rdx
>     8992: 48 09 d0                      orq     %rdx, %rax
>     8995: 48 b9 ff ff ff ff ff ff ff 7f movabsq $0x7fffffffffffffff, %rcx # imm = 0x7FFFFFFFFFFFFFFF
> ;               return (u64)rdtsc_ordered() & S64_MAX;
>     899f: 48 21 c8                      andq    %rcx, %rax
> ;       return __arch_get_hw_counter(1, NULL);
>     89a2: 2e e9 00 00 00 00             jmp     0x89a8 <bpf_get_cpu_cycles+0x28>
> 
> Is it patched when kernel is loaded to replace nops with lfence?
> By real-world difference I meant difference between default
> implementation and inlined assembly.

Talked with Vadim off-list; he explained that the 'rdtsc nop nop nop'
sequence is indeed patched at kernel load. Regarding the S64_MAX masking,
we just hope it should never be an issue for the BPF use case.
So, no more questions from my side.

[...]
Alexei Starovoitov Nov. 12, 2024, 10:27 p.m. UTC | #7
On Tue, Nov 12, 2024 at 2:20 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
>
> On Tue, 2024-11-12 at 13:53 -0800, Eduard Zingerman wrote:
> > On Tue, 2024-11-12 at 21:39 +0000, Vadim Fedorenko wrote:
> >
> > [...]
> >
> > > > > +                       if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
> > > > > +                           imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
> > > > > +                               /* Save RDX because RDTSC will use EDX:EAX to return u64 */
> > > > > +                               emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
> > > > > +                               if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
> > > > > +                                       EMIT_LFENCE();
> > > > > +                               EMIT2(0x0F, 0x31);
> > > > > +
> > > > > +                               /* shl RDX, 32 */
> > > > > +                               maybe_emit_1mod(&prog, BPF_REG_3, true);
> > > > > +                               EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
> > > > > +                               /* or RAX, RDX */
> > > > > +                               maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
> > > > > +                               EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
> > > > > +                               /* restore RDX from R11 */
> > > > > +                               emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
> > > >
> > > > Note: The default implementation of this kfunc uses __arch_get_hw_counter(),
> > > >        which is implemented as `(u64)rdtsc_ordered() & S64_MAX`.
> > > >        Here we don't do `& S64_MAX`.
> > > >        The masking in __arch_get_hw_counter() was added by this commit:
> > > >        77750f78b0b3 ("x86/vdso: Fix gettimeofday masking").
> > >
> > > I think we already discussed it with Alexey in v1, we don't really need
> > > any masking here for BPF case. We can use values provided by CPU
> > > directly. It will never happen that within one BPF program we will have
> > > inlined and non-inlined implementation of this helper, hence the values
> > > to compare will be of the same source.
> > >
> > > >        Also, the default implementation does not issue `lfence`.
> > > >        Not sure if this makes any real-world difference.
> > >
> > > Well, it actually does. rdtsc_ordered is translated into `lfence; rdtsc`
> > > or `rdtscp` (which is rdtsc + lfence + u32 cookie) depending on the cpu
> > > features.
> >
> > I see the following disassembly:
> >
> > 0000000000008980 <bpf_get_cpu_cycles>:
> > ; {
> >     8980: f3 0f 1e fa                   endbr64
> >     8984: e8 00 00 00 00                callq   0x8989 <bpf_get_cpu_cycles+0x9>
> >                 0000000000008985:  R_X86_64_PLT32       __fentry__-0x4
> > ;       asm volatile(ALTERNATIVE_2("rdtsc",
> >     8989: 0f 31                         rdtsc
> >     898b: 90                            nop
> >     898c: 90                            nop
> >     898d: 90                            nop
> > ;       return EAX_EDX_VAL(val, low, high);
> >     898e: 48 c1 e2 20                   shlq    $0x20, %rdx
> >     8992: 48 09 d0                      orq     %rdx, %rax
> >     8995: 48 b9 ff ff ff ff ff ff ff 7f movabsq $0x7fffffffffffffff, %rcx # imm = 0x7FFFFFFFFFFFFFFF
> > ;               return (u64)rdtsc_ordered() & S64_MAX;
> >     899f: 48 21 c8                      andq    %rcx, %rax
> > ;       return __arch_get_hw_counter(1, NULL);
> >     89a2: 2e e9 00 00 00 00             jmp     0x89a8 <bpf_get_cpu_cycles+0x28>
> >
> > Is it patched when kernel is loaded to replace nops with lfence?
> > By real-world difference I meant difference between default
> > implementation and inlined assembly.
>
> Talked with Vadim off-list, he explained that 'rttsc nop nop nop' is
> indeed patched at kernel load. Regarding S64_MAX patching we just hope
> this should never be an issue for BPF use-case.
> So, no more questions from my side.

since s64 question came up twice it should be a comment.

nop nop as well.

pw-bot: cr
Vadim Fedorenko Nov. 12, 2024, 11:08 p.m. UTC | #8
On 12/11/2024 22:27, Alexei Starovoitov wrote:
> On Tue, Nov 12, 2024 at 2:20 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
>>
>> On Tue, 2024-11-12 at 13:53 -0800, Eduard Zingerman wrote:
>>> On Tue, 2024-11-12 at 21:39 +0000, Vadim Fedorenko wrote:
>>>
>>> [...]
>>>
>>>>>> +                       if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
>>>>>> +                           imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
>>>>>> +                               /* Save RDX because RDTSC will use EDX:EAX to return u64 */
>>>>>> +                               emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
>>>>>> +                               if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
>>>>>> +                                       EMIT_LFENCE();
>>>>>> +                               EMIT2(0x0F, 0x31);
>>>>>> +
>>>>>> +                               /* shl RDX, 32 */
>>>>>> +                               maybe_emit_1mod(&prog, BPF_REG_3, true);
>>>>>> +                               EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
>>>>>> +                               /* or RAX, RDX */
>>>>>> +                               maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
>>>>>> +                               EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
>>>>>> +                               /* restore RDX from R11 */
>>>>>> +                               emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
>>>>>
>>>>> Note: The default implementation of this kfunc uses __arch_get_hw_counter(),
>>>>>         which is implemented as `(u64)rdtsc_ordered() & S64_MAX`.
>>>>>         Here we don't do `& S64_MAX`.
>>>>>         The masking in __arch_get_hw_counter() was added by this commit:
>>>>>         77750f78b0b3 ("x86/vdso: Fix gettimeofday masking").
>>>>
>>>> I think we already discussed it with Alexey in v1, we don't really need
>>>> any masking here for BPF case. We can use values provided by CPU
>>>> directly. It will never happen that within one BPF program we will have
>>>> inlined and non-inlined implementation of this helper, hence the values
>>>> to compare will be of the same source.
>>>>
>>>>>         Also, the default implementation does not issue `lfence`.
>>>>>         Not sure if this makes any real-world difference.
>>>>
>>>> Well, it actually does. rdtsc_ordered is translated into `lfence; rdtsc`
>>>> or `rdtscp` (which is rdtsc + lfence + u32 cookie) depending on the cpu
>>>> features.
>>>
>>> I see the following disassembly:
>>>
>>> 0000000000008980 <bpf_get_cpu_cycles>:
>>> ; {
>>>      8980: f3 0f 1e fa                   endbr64
>>>      8984: e8 00 00 00 00                callq   0x8989 <bpf_get_cpu_cycles+0x9>
>>>                  0000000000008985:  R_X86_64_PLT32       __fentry__-0x4
>>> ;       asm volatile(ALTERNATIVE_2("rdtsc",
>>>      8989: 0f 31                         rdtsc
>>>      898b: 90                            nop
>>>      898c: 90                            nop
>>>      898d: 90                            nop
>>> ;       return EAX_EDX_VAL(val, low, high);
>>>      898e: 48 c1 e2 20                   shlq    $0x20, %rdx
>>>      8992: 48 09 d0                      orq     %rdx, %rax
>>>      8995: 48 b9 ff ff ff ff ff ff ff 7f movabsq $0x7fffffffffffffff, %rcx # imm = 0x7FFFFFFFFFFFFFFF
>>> ;               return (u64)rdtsc_ordered() & S64_MAX;
>>>      899f: 48 21 c8                      andq    %rcx, %rax
>>> ;       return __arch_get_hw_counter(1, NULL);
>>>      89a2: 2e e9 00 00 00 00             jmp     0x89a8 <bpf_get_cpu_cycles+0x28>
>>>
>>> Is it patched when kernel is loaded to replace nops with lfence?
>>> By real-world difference I meant difference between default
>>> implementation and inlined assembly.
>>
>> Talked with Vadim off-list, he explained that 'rttsc nop nop nop' is
>> indeed patched at kernel load. Regarding S64_MAX patching we just hope
>> this should never be an issue for BPF use-case.
>> So, no more questions from my side.
> 
> since s64 question came up twice it should be a comment.

sure, will do it.

> 
> nop nop as well.

do you mean why there are nop;nop instructions in the kernel's assembly?

> 
> pw-bot: cr
Andrii Nakryiko Nov. 12, 2024, 11:59 p.m. UTC | #9
On Tue, Nov 12, 2024 at 1:43 PM Vadim Fedorenko
<vadim.fedorenko@linux.dev> wrote:
>
> On 12/11/2024 05:50, Andrii Nakryiko wrote:
> > On Fri, Nov 8, 2024 at 4:42 PM Vadim Fedorenko <vadfed@meta.com> wrote:
> >>
> >> New kfunc to return ARCH-specific timecounter. For x86 BPF JIT converts
> >> it into rdtsc ordered call. Other architectures will get JIT
> >> implementation too if supported. The fallback is to
> >> __arch_get_hw_counter().
> >>
> >> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
> >> ---
> >
> > nit: please add cover letter for the next revision, multi-patch sets
> > generally should come with a cover letter, unless it's some set of
> > trivial and mostly independent patches. Anyways...
>
> Yeah, sure. This series has grown from the first small version...
>
> >
> > I haven't yet looked through the code (yet), but I was curious to
> > benchmark the perf benefit, so that's what I did for fun this evening.
> >
> > (!!!) BTW, a complete aside, but I think it's interesting. It turned
> > out that using bpf_test_prog_run() scales *VERY POORLY* with large
> > number of CPUs, because we start spending tons of time in
> > fdget()/fdput(), so I initially got pretty unscalable results,
> > profiled a bit, and then switched to just doing
> > syscall(syscall(__NR_getpgid); + SEC("raw_tp/sys_enter")). Anyways,
> > the point is that microbenchmarking is tricky and we need to improve
> > our existing bench setup for some benchmarks. Anyways, getting back to
> > the main topic.
> >
> > I wrote a quick two benchmarks testing what I see as intended use case
> > for these kfuncs (batching amortizes the cost of triggering BPF
> > program, batch_iters = 500 in my case):
> >
> > SEC("?raw_tp/sys_enter")
> > int trigger_driver_ktime(void *ctx)
> > {
> >          volatile __u64 total = 0;
> >          int i;
> >
> >          for (i = 0; i < batch_iters; i++) {
> >                  __u64 start, end;
> >
> >                  start = bpf_ktime_get_ns();
> >                  end = bpf_ktime_get_ns();
> >                  total += end - start;
> >          }
> >          inc_counter_batch(batch_iters);
> >
> >          return 0;
> > }
> >
> > extern __u64 bpf_get_cpu_cycles(void) __weak __ksym;
> > extern __u64 bpf_cpu_cycles_to_ns(__u64 cycles) __weak __ksym;
> >
> > SEC("?raw_tp/sys_enter")
> > int trigger_driver_cycles(void *ctx)
> > {
> >          volatile __u64 total = 0;
> >          int i;
> >
> >          for (i = 0; i < batch_iters; i++) {
> >                  __u64 start, end;
> >
> >                  start = bpf_get_cpu_cycles();
> >                  end = bpf_get_cpu_cycles();
> >                  total += bpf_cpu_cycles_to_ns(end - start);
> >          }
> >          inc_counter_batch(batch_iters);
> >
> >          return 0;
> > }
> >
> > And here's what I got across multiple numbers of parallel CPUs on our
> > production host.
> >
> > # ./bench_timing.sh
> >
> > ktime                 ( 1 cpus):   32.286 ± 0.309M/s  ( 32.286M/s/cpu)
> > ktime                 ( 2 cpus):   63.021 ± 0.538M/s  ( 31.511M/s/cpu)
> > ktime                 ( 3 cpus):   94.211 ± 0.686M/s  ( 31.404M/s/cpu)
> > ktime                 ( 4 cpus):  124.757 ± 0.691M/s  ( 31.189M/s/cpu)
> > ktime                 ( 5 cpus):  154.855 ± 0.693M/s  ( 30.971M/s/cpu)
> > ktime                 ( 6 cpus):  185.551 ± 2.304M/s  ( 30.925M/s/cpu)
> > ktime                 ( 7 cpus):  211.117 ± 4.755M/s  ( 30.160M/s/cpu)
> > ktime                 ( 8 cpus):  236.454 ± 0.226M/s  ( 29.557M/s/cpu)
> > ktime                 (10 cpus):  295.526 ± 0.126M/s  ( 29.553M/s/cpu)
> > ktime                 (12 cpus):  322.282 ± 0.153M/s  ( 26.857M/s/cpu)
> > ktime                 (14 cpus):  375.347 ± 0.087M/s  ( 26.811M/s/cpu)
> > ktime                 (16 cpus):  399.813 ± 0.181M/s  ( 24.988M/s/cpu)
> > ktime                 (24 cpus):  617.675 ± 7.053M/s  ( 25.736M/s/cpu)
> > ktime                 (32 cpus):  819.695 ± 0.231M/s  ( 25.615M/s/cpu)
> > ktime                 (40 cpus):  996.264 ± 0.290M/s  ( 24.907M/s/cpu)
> > ktime                 (48 cpus): 1180.201 ± 0.160M/s  ( 24.588M/s/cpu)
> > ktime                 (56 cpus): 1321.084 ± 0.099M/s  ( 23.591M/s/cpu)
> > ktime                 (64 cpus): 1482.061 ± 0.121M/s  ( 23.157M/s/cpu)
> > ktime                 (72 cpus): 1666.540 ± 0.460M/s  ( 23.146M/s/cpu)
> > ktime                 (80 cpus): 1851.419 ± 0.439M/s  ( 23.143M/s/cpu)
> >
> > cycles                ( 1 cpus):   45.815 ± 0.018M/s  ( 45.815M/s/cpu)
> > cycles                ( 2 cpus):   86.706 ± 0.068M/s  ( 43.353M/s/cpu)
> > cycles                ( 3 cpus):  129.899 ± 0.101M/s  ( 43.300M/s/cpu)
> > cycles                ( 4 cpus):  168.435 ± 0.073M/s  ( 42.109M/s/cpu)
> > cycles                ( 5 cpus):  210.520 ± 0.164M/s  ( 42.104M/s/cpu)
> > cycles                ( 6 cpus):  252.596 ± 0.050M/s  ( 42.099M/s/cpu)
> > cycles                ( 7 cpus):  294.356 ± 0.159M/s  ( 42.051M/s/cpu)
> > cycles                ( 8 cpus):  317.167 ± 0.163M/s  ( 39.646M/s/cpu)
> > cycles                (10 cpus):  396.141 ± 0.208M/s  ( 39.614M/s/cpu)
> > cycles                (12 cpus):  431.938 ± 0.511M/s  ( 35.995M/s/cpu)
> > cycles                (14 cpus):  503.055 ± 0.070M/s  ( 35.932M/s/cpu)
> > cycles                (16 cpus):  534.261 ± 0.107M/s  ( 33.391M/s/cpu)
> > cycles                (24 cpus):  836.838 ± 0.141M/s  ( 34.868M/s/cpu)
> > cycles                (32 cpus): 1099.689 ± 0.314M/s  ( 34.365M/s/cpu)
> > cycles                (40 cpus): 1336.573 ± 0.015M/s  ( 33.414M/s/cpu)
> > cycles                (48 cpus): 1571.734 ± 11.151M/s  ( 32.744M/s/cpu)
> > cycles                (56 cpus): 1819.242 ± 4.627M/s  ( 32.486M/s/cpu)
> > cycles                (64 cpus): 2046.285 ± 5.169M/s  ( 31.973M/s/cpu)
> > cycles                (72 cpus): 2287.683 ± 0.787M/s  ( 31.773M/s/cpu)
> > cycles                (80 cpus): 2505.414 ± 0.626M/s  ( 31.318M/s/cpu)
> >
> > So, from about +42% on a single CPU, to +36% at 80 CPUs. Not that bad.
> > Scalability-wise, we still see some drop off in performance, but
> > believe me, with bpf_prog_test_run() it was so much worse :) I also
> > verified that now we spend cycles almost exclusively inside the BPF
> > program (so presumably in those benchmarked kfuncs).
>
> Am I right that the numbers show how many iterations were done during
> the very same amount of time? It would be also great to understand if

yeah, it's millions of operations per second

> we get more precise measurements - just in case you have your tests
> ready...
>

not sure what precision you mean here?... I didn't plan to publish
tests, but I can push what I have into a local branch if you'd like to
play with this
Alexei Starovoitov Nov. 13, 2024, 12:09 a.m. UTC | #10
On Tue, Nov 12, 2024 at 3:08 PM Vadim Fedorenko
<vadim.fedorenko@linux.dev> wrote:
>
> On 12/11/2024 22:27, Alexei Starovoitov wrote:
> > On Tue, Nov 12, 2024 at 2:20 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
> >>
> >> On Tue, 2024-11-12 at 13:53 -0800, Eduard Zingerman wrote:
> >>> On Tue, 2024-11-12 at 21:39 +0000, Vadim Fedorenko wrote:
> >>>
> >>> [...]
> >>>
> >>>>>> +                       if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
> >>>>>> +                           imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
> >>>>>> +                               /* Save RDX because RDTSC will use EDX:EAX to return u64 */
> >>>>>> +                               emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
> >>>>>> +                               if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
> >>>>>> +                                       EMIT_LFENCE();
> >>>>>> +                               EMIT2(0x0F, 0x31);
> >>>>>> +
> >>>>>> +                               /* shl RDX, 32 */
> >>>>>> +                               maybe_emit_1mod(&prog, BPF_REG_3, true);
> >>>>>> +                               EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
> >>>>>> +                               /* or RAX, RDX */
> >>>>>> +                               maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
> >>>>>> +                               EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
> >>>>>> +                               /* restore RDX from R11 */
> >>>>>> +                               emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
> >>>>>
> >>>>> Note: The default implementation of this kfunc uses __arch_get_hw_counter(),
> >>>>>         which is implemented as `(u64)rdtsc_ordered() & S64_MAX`.
> >>>>>         Here we don't do `& S64_MAX`.
> >>>>>         The masking in __arch_get_hw_counter() was added by this commit:
> >>>>>         77750f78b0b3 ("x86/vdso: Fix gettimeofday masking").
> >>>>
> >>>> I think we already discussed it with Alexey in v1, we don't really need
> >>>> any masking here for BPF case. We can use values provided by CPU
> >>>> directly. It will never happen that within one BPF program we will have
> >>>> inlined and non-inlined implementation of this helper, hence the values
> >>>> to compare will be of the same source.
> >>>>
> >>>>>         Also, the default implementation does not issue `lfence`.
> >>>>>         Not sure if this makes any real-world difference.
> >>>>
> >>>> Well, it actually does. rdtsc_ordered is translated into `lfence; rdtsc`
> >>>> or `rdtscp` (which is rdtsc + lfence + u32 cookie) depending on the cpu
> >>>> features.
> >>>
> >>> I see the following disassembly:
> >>>
> >>> 0000000000008980 <bpf_get_cpu_cycles>:
> >>> ; {
> >>>      8980: f3 0f 1e fa                   endbr64
> >>>      8984: e8 00 00 00 00                callq   0x8989 <bpf_get_cpu_cycles+0x9>
> >>>                  0000000000008985:  R_X86_64_PLT32       __fentry__-0x4
> >>> ;       asm volatile(ALTERNATIVE_2("rdtsc",
> >>>      8989: 0f 31                         rdtsc
> >>>      898b: 90                            nop
> >>>      898c: 90                            nop
> >>>      898d: 90                            nop
> >>> ;       return EAX_EDX_VAL(val, low, high);
> >>>      898e: 48 c1 e2 20                   shlq    $0x20, %rdx
> >>>      8992: 48 09 d0                      orq     %rdx, %rax
> >>>      8995: 48 b9 ff ff ff ff ff ff ff 7f movabsq $0x7fffffffffffffff, %rcx # imm = 0x7FFFFFFFFFFFFFFF
> >>> ;               return (u64)rdtsc_ordered() & S64_MAX;
> >>>      899f: 48 21 c8                      andq    %rcx, %rax
> >>> ;       return __arch_get_hw_counter(1, NULL);
> >>>      89a2: 2e e9 00 00 00 00             jmp     0x89a8 <bpf_get_cpu_cycles+0x28>
> >>>
> >>> Is it patched when kernel is loaded to replace nops with lfence?
> >>> By real-world difference I meant difference between default
> >>> implementation and inlined assembly.
> >>
> >> Talked with Vadim off-list, he explained that 'rttsc nop nop nop' is
> >> indeed patched at kernel load. Regarding S64_MAX patching we just hope
> >> this should never be an issue for BPF use-case.
> >> So, no more questions from my side.
> >
> > since s64 question came up twice it should be a comment.
>
> sure, will do it.
>
> >
> > nop nop as well.
>
> do you mean why there are nop;nop instructions in the kernel's assembly?

Explanation on why JITed matches __arch_get_hw_counter.
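
A possible shape for such an explanation, purely illustrative (it only
summarizes the points made earlier in the thread, it is not text from the
patch):

/*
 * The emitted sequence intentionally mirrors __arch_get_hw_counter() /
 * rdtsc_ordered():
 * - the conditional lfence matches the "lfence; rdtsc" / "rdtscp"
 *   alternatives patched into rdtsc_ordered() at boot (the nops seen in an
 *   unpatched disassembly are the alternative's placeholder bytes);
 * - the "& S64_MAX" masking done by __arch_get_hw_counter() is skipped:
 *   a BPF program only ever compares values coming from the same source
 *   (either all inlined or all from the fallback), so the masking makes no
 *   difference for the BPF use case.
 */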
Vadim Fedorenko Nov. 13, 2024, 12:20 a.m. UTC | #11
On 13/11/2024 00:09, Alexei Starovoitov wrote:
> On Tue, Nov 12, 2024 at 3:08 PM Vadim Fedorenko
> <vadim.fedorenko@linux.dev> wrote:
>>
>> On 12/11/2024 22:27, Alexei Starovoitov wrote:
>>> On Tue, Nov 12, 2024 at 2:20 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
>>>>
>>>> On Tue, 2024-11-12 at 13:53 -0800, Eduard Zingerman wrote:
>>>>> On Tue, 2024-11-12 at 21:39 +0000, Vadim Fedorenko wrote:
>>>>>
>>>>> [...]
>>>>>
>>>>>>>> +                       if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
>>>>>>>> +                           imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
>>>>>>>> +                               /* Save RDX because RDTSC will use EDX:EAX to return u64 */
>>>>>>>> +                               emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
>>>>>>>> +                               if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
>>>>>>>> +                                       EMIT_LFENCE();
>>>>>>>> +                               EMIT2(0x0F, 0x31);
>>>>>>>> +
>>>>>>>> +                               /* shl RDX, 32 */
>>>>>>>> +                               maybe_emit_1mod(&prog, BPF_REG_3, true);
>>>>>>>> +                               EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
>>>>>>>> +                               /* or RAX, RDX */
>>>>>>>> +                               maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
>>>>>>>> +                               EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
>>>>>>>> +                               /* restore RDX from R11 */
>>>>>>>> +                               emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
>>>>>>>
>>>>>>> Note: The default implementation of this kfunc uses __arch_get_hw_counter(),
>>>>>>>          which is implemented as `(u64)rdtsc_ordered() & S64_MAX`.
>>>>>>>          Here we don't do `& S64_MAX`.
>>>>>>>          The masking in __arch_get_hw_counter() was added by this commit:
>>>>>>>          77750f78b0b3 ("x86/vdso: Fix gettimeofday masking").
>>>>>>
>>>>>> I think we already discussed it with Alexey in v1, we don't really need
>>>>>> any masking here for BPF case. We can use values provided by CPU
>>>>>> directly. It will never happen that within one BPF program we will have
>>>>>> inlined and non-inlined implementation of this helper, hence the values
>>>>>> to compare will be of the same source.
>>>>>>
>>>>>>>          Also, the default implementation does not issue `lfence`.
>>>>>>>          Not sure if this makes any real-world difference.
>>>>>>
>>>>>> Well, it actually does. rdtsc_ordered is translated into `lfence; rdtsc`
>>>>>> or `rdtscp` (which is rdtsc + lfence + u32 cookie) depending on the cpu
>>>>>> features.
>>>>>
>>>>> I see the following disassembly:
>>>>>
>>>>> 0000000000008980 <bpf_get_cpu_cycles>:
>>>>> ; {
>>>>>       8980: f3 0f 1e fa                   endbr64
>>>>>       8984: e8 00 00 00 00                callq   0x8989 <bpf_get_cpu_cycles+0x9>
>>>>>                   0000000000008985:  R_X86_64_PLT32       __fentry__-0x4
>>>>> ;       asm volatile(ALTERNATIVE_2("rdtsc",
>>>>>       8989: 0f 31                         rdtsc
>>>>>       898b: 90                            nop
>>>>>       898c: 90                            nop
>>>>>       898d: 90                            nop
>>>>> ;       return EAX_EDX_VAL(val, low, high);
>>>>>       898e: 48 c1 e2 20                   shlq    $0x20, %rdx
>>>>>       8992: 48 09 d0                      orq     %rdx, %rax
>>>>>       8995: 48 b9 ff ff ff ff ff ff ff 7f movabsq $0x7fffffffffffffff, %rcx # imm = 0x7FFFFFFFFFFFFFFF
>>>>> ;               return (u64)rdtsc_ordered() & S64_MAX;
>>>>>       899f: 48 21 c8                      andq    %rcx, %rax
>>>>> ;       return __arch_get_hw_counter(1, NULL);
>>>>>       89a2: 2e e9 00 00 00 00             jmp     0x89a8 <bpf_get_cpu_cycles+0x28>
>>>>>
>>>>> Is it patched when kernel is loaded to replace nops with lfence?
>>>>> By real-world difference I meant difference between default
>>>>> implementation and inlined assembly.
>>>>
>>>> Talked with Vadim off-list, he explained that 'rttsc nop nop nop' is
>>>> indeed patched at kernel load. Regarding S64_MAX patching we just hope
>>>> this should never be an issue for BPF use-case.
>>>> So, no more questions from my side.
>>>
>>> since s64 question came up twice it should be a comment.
>>
>> sure, will do it.
>>
>>>
>>> nop nop as well.
>>
>> do you mean why there are nop;nop instructions in the kernel's assembly?
> 
> Explanation on why JITed matches __arch_get_hw_counter.

Got it, will do
Yonghong Song Nov. 13, 2024, 5:38 p.m. UTC | #12
On 11/8/24 4:41 PM, Vadim Fedorenko wrote:
> New kfunc to return ARCH-specific timecounter. For x86 BPF JIT converts
> it into rdtsc ordered call. Other architectures will get JIT
> implementation too if supported. The fallback is to
> __arch_get_hw_counter().
>
> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
> ---
> v4 -> v5:
> * use if instead of ifdef with IS_ENABLED
> v3 -> v4:
> * change name of the helper to bpf_get_cpu_cycles (Andrii)
> * Hide the helper behind CONFIG_GENERIC_GETTIMEOFDAY to avoid exposing
>    it on architectures which do not have vDSO functions and data
> * reduce the scope of check of inlined functions in verifier to only 2,
>    which are actually inlined.
> v2 -> v3:
> * change name of the helper to bpf_get_cpu_cycles_counter to explicitly
>    mention what counter it provides (Andrii)
> * move kfunc definition to bpf.h to use it in JIT.
> * introduce another kfunc to convert cycles into nanoseconds as more
>    meaningful time units for generic tracing use case (Andrii)
> v1 -> v2:
> * Fix incorrect function return value type to u64
> * Introduce bpf_jit_inlines_kfunc_call() and use it in
>    mark_fastcall_pattern_for_call() to avoid clobbering in case of
>    running programs with no JIT (Eduard)
> * Avoid rewriting instruction and check function pointer directly
>    in JIT (Alexei)
> * Change includes to fix compile issues on non x86 architectures
> ---
>   arch/x86/net/bpf_jit_comp.c   | 28 ++++++++++++++++++++++++++++
>   arch/x86/net/bpf_jit_comp32.c | 14 ++++++++++++++
>   include/linux/bpf.h           |  5 +++++
>   include/linux/filter.h        |  1 +
>   kernel/bpf/core.c             | 11 +++++++++++
>   kernel/bpf/helpers.c          | 13 +++++++++++++
>   kernel/bpf/verifier.c         | 30 +++++++++++++++++++++++++++++-
>   7 files changed, 101 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index 06b080b61aa5..4f78ed93ee7f 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -2126,6 +2126,26 @@ st:			if (is_imm8(insn->off))
>   		case BPF_JMP | BPF_CALL: {
>   			u8 *ip = image + addrs[i - 1];
>   
> +			if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
> +			    imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
> +				/* Save RDX because RDTSC will use EDX:EAX to return u64 */
> +				emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
> +				if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
> +					EMIT_LFENCE();
> +				EMIT2(0x0F, 0x31);
> +
> +				/* shl RDX, 32 */
> +				maybe_emit_1mod(&prog, BPF_REG_3, true);
> +				EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
> +				/* or RAX, RDX */
> +				maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
> +				EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
> +				/* restore RDX from R11 */
> +				emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
> +
> +				break;
> +			}
> +
>   			func = (u8 *) __bpf_call_base + imm32;
>   			if (tail_call_reachable) {
>   				LOAD_TAIL_CALL_CNT_PTR(bpf_prog->aux->stack_depth);
> @@ -3652,3 +3672,11 @@ u64 bpf_arch_uaddress_limit(void)
>   {
>   	return 0;
>   }
> +
> +/* x86-64 JIT can inline kfunc */
> +bool bpf_jit_inlines_kfunc_call(s32 imm)
> +{
> +	if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
> +		return true;
> +	return false;
> +}
> diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
> index de0f9e5f9f73..e6097a371b69 100644
> --- a/arch/x86/net/bpf_jit_comp32.c
> +++ b/arch/x86/net/bpf_jit_comp32.c
> @@ -2094,6 +2094,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
>   			if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
>   				int err;
>   
> +				if (imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
> +					if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
> +						EMIT3(0x0F, 0xAE, 0xE8);
> +					EMIT2(0x0F, 0x31);
> +					break;
> +				}
> +
>   				err = emit_kfunc_call(bpf_prog,
>   						      image + addrs[i],
>   						      insn, &prog);
> @@ -2621,3 +2628,10 @@ bool bpf_jit_supports_kfunc_call(void)
>   {
>   	return true;
>   }
> +
> +bool bpf_jit_inlines_kfunc_call(s32 imm)
> +{
> +	if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
> +		return true;
> +	return false;
> +}

[...]

> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index 395221e53832..5c6c0383ebf4 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -23,6 +23,9 @@
>   #include <linux/btf_ids.h>
>   #include <linux/bpf_mem_alloc.h>
>   #include <linux/kasan.h>
> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
> +#include <vdso/datapage.h>
> +#endif
>   
>   #include "../../lib/kstrtox.h"
>   
> @@ -3023,6 +3026,13 @@ __bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user
>   	return ret + 1;
>   }
>   
> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
> +__bpf_kfunc u64 bpf_get_cpu_cycles(void)
> +{
> +	return __arch_get_hw_counter(1, NULL);

Some comment to explain what '1' means in the above?

> +}
> +#endif
> +
>   __bpf_kfunc_end_defs();
>   
>   BTF_KFUNCS_START(generic_btf_ids)
> @@ -3115,6 +3125,9 @@ BTF_ID_FLAGS(func, bpf_get_kmem_cache)
>   BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE)
>   BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
>   BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
> +BTF_ID_FLAGS(func, bpf_get_cpu_cycles, KF_FASTCALL)
> +#endif
>   BTF_KFUNCS_END(common_btf_ids)
>   
>   static const struct btf_kfunc_id_set common_kfunc_set = {
[...]
Vadim Fedorenko Nov. 13, 2024, 5:52 p.m. UTC | #13
On 13/11/2024 17:38, Yonghong Song wrote:
> 
> 
> 
> On 11/8/24 4:41 PM, Vadim Fedorenko wrote:
>> New kfunc to return ARCH-specific timecounter. For x86 BPF JIT converts
>> it into rdtsc ordered call. Other architectures will get JIT
>> implementation too if supported. The fallback is to
>> __arch_get_hw_counter().
>>
>> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
>> ---
>> v4 -> v5:
>> * use if instead of ifdef with IS_ENABLED
>> v3 -> v4:
>> * change name of the helper to bpf_get_cpu_cycles (Andrii)
>> * Hide the helper behind CONFIG_GENERIC_GETTIMEOFDAY to avoid exposing
>>    it on architectures which do not have vDSO functions and data
>> * reduce the scope of check of inlined functions in verifier to only 2,
>>    which are actually inlined.
>> v2 -> v3:
>> * change name of the helper to bpf_get_cpu_cycles_counter to explicitly
>>    mention what counter it provides (Andrii)
>> * move kfunc definition to bpf.h to use it in JIT.
>> * introduce another kfunc to convert cycles into nanoseconds as more
>>    meaningful time units for generic tracing use case (Andrii)
>> v1 -> v2:
>> * Fix incorrect function return value type to u64
>> * Introduce bpf_jit_inlines_kfunc_call() and use it in
>>    mark_fastcall_pattern_for_call() to avoid clobbering in case of
>>    running programs with no JIT (Eduard)
>> * Avoid rewriting instruction and check function pointer directly
>>    in JIT (Alexei)
>> * Change includes to fix compile issues on non x86 architectures
>> ---
>>   arch/x86/net/bpf_jit_comp.c   | 28 ++++++++++++++++++++++++++++
>>   arch/x86/net/bpf_jit_comp32.c | 14 ++++++++++++++
>>   include/linux/bpf.h           |  5 +++++
>>   include/linux/filter.h        |  1 +
>>   kernel/bpf/core.c             | 11 +++++++++++
>>   kernel/bpf/helpers.c          | 13 +++++++++++++
>>   kernel/bpf/verifier.c         | 30 +++++++++++++++++++++++++++++-
>>   7 files changed, 101 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
>> index 06b080b61aa5..4f78ed93ee7f 100644
>> --- a/arch/x86/net/bpf_jit_comp.c
>> +++ b/arch/x86/net/bpf_jit_comp.c
>> @@ -2126,6 +2126,26 @@ st:            if (is_imm8(insn->off))
>>           case BPF_JMP | BPF_CALL: {
>>               u8 *ip = image + addrs[i - 1];
>> +            if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
>> +                imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
>> +                /* Save RDX because RDTSC will use EDX:EAX to return 
>> u64 */
>> +                emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
>> +                if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
>> +                    EMIT_LFENCE();
>> +                EMIT2(0x0F, 0x31);
>> +
>> +                /* shl RDX, 32 */
>> +                maybe_emit_1mod(&prog, BPF_REG_3, true);
>> +                EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
>> +                /* or RAX, RDX */
>> +                maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
>> +                EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
>> +                /* restore RDX from R11 */
>> +                emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
>> +
>> +                break;
>> +            }
>> +
>>               func = (u8 *) __bpf_call_base + imm32;
>>               if (tail_call_reachable) {
>>                   LOAD_TAIL_CALL_CNT_PTR(bpf_prog->aux->stack_depth);
>> @@ -3652,3 +3672,11 @@ u64 bpf_arch_uaddress_limit(void)
>>   {
>>       return 0;
>>   }
>> +
>> +/* x86-64 JIT can inline kfunc */
>> +bool bpf_jit_inlines_kfunc_call(s32 imm)
>> +{
>> +    if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
>> +        return true;
>> +    return false;
>> +}
>> diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/ 
>> bpf_jit_comp32.c
>> index de0f9e5f9f73..e6097a371b69 100644
>> --- a/arch/x86/net/bpf_jit_comp32.c
>> +++ b/arch/x86/net/bpf_jit_comp32.c
>> @@ -2094,6 +2094,13 @@ static int do_jit(struct bpf_prog *bpf_prog, 
>> int *addrs, u8 *image,
>>               if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
>>                   int err;
>> +                if (imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
>> +                    if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
>> +                        EMIT3(0x0F, 0xAE, 0xE8);
>> +                    EMIT2(0x0F, 0x31);
>> +                    break;
>> +                }
>> +
>>                   err = emit_kfunc_call(bpf_prog,
>>                                 image + addrs[i],
>>                                 insn, &prog);
>> @@ -2621,3 +2628,10 @@ bool bpf_jit_supports_kfunc_call(void)
>>   {
>>       return true;
>>   }
>> +
>> +bool bpf_jit_inlines_kfunc_call(s32 imm)
>> +{
>> +    if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
>> +        return true;
>> +    return false;
>> +}
> 
> [...]
> 
>> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
>> index 395221e53832..5c6c0383ebf4 100644
>> --- a/kernel/bpf/helpers.c
>> +++ b/kernel/bpf/helpers.c
>> @@ -23,6 +23,9 @@
>>   #include <linux/btf_ids.h>
>>   #include <linux/bpf_mem_alloc.h>
>>   #include <linux/kasan.h>
>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>> +#include <vdso/datapage.h>
>> +#endif
>>   #include "../../lib/kstrtox.h"
>> @@ -3023,6 +3026,13 @@ __bpf_kfunc int bpf_copy_from_user_str(void 
>> *dst, u32 dst__sz, const void __user
>>       return ret + 1;
>>   }
>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>> +__bpf_kfunc u64 bpf_get_cpu_cycles(void)
>> +{
>> +    return __arch_get_hw_counter(1, NULL);
> 
> Some comment to explain what '1' mean in the above?

That's an arch-specific value which translates to the HW-implemented
counter on all architectures which have a vDSO gettimeofday()
implementation.

For x86 it translates to VDSO_CLOCKMODE_TSC, while for aarch64/RISC-V
it's VDSO_CLOCKMODE_ARCHTIMER. Actually, for RISC-V the value of the
first parameter doesn't matter at all, and for aarch64 it should be 0.
The only arch which is stricter about this parameter is x86, but it
has its own special name...
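
For reference, the x86 variant is roughly the sketch below (simplified from
arch/x86/include/asm/vdso/gettimeofday.h, with the paravirt/Hyper-V clock
branches elided; details differ between kernel versions). It shows where
'1' == VDSO_CLOCKMODE_TSC and the S64_MAX masking discussed earlier come from:

static inline u64 __arch_get_hw_counter(s32 clock_mode,
					const struct vdso_data *vd)
{
	if (likely(clock_mode == VDSO_CLOCKMODE_TSC))
		return (u64)rdtsc_ordered() & S64_MAX;
	/* VDSO_CLOCKMODE_PVCLOCK / VDSO_CLOCKMODE_HVCLOCK elided here */
	return U64_MAX;		/* signals an unusable clock mode */
}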

> 
>> +}
>> +#endif
>> +
>>   __bpf_kfunc_end_defs();
>>   BTF_KFUNCS_START(generic_btf_ids)
>> @@ -3115,6 +3125,9 @@ BTF_ID_FLAGS(func, bpf_get_kmem_cache)
>>   BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE)
>>   BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | 
>> KF_RET_NULL | KF_SLEEPABLE)
>>   BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | 
>> KF_SLEEPABLE)
>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>> +BTF_ID_FLAGS(func, bpf_get_cpu_cycles, KF_FASTCALL)
>> +#endif
>>   BTF_KFUNCS_END(common_btf_ids)
>>   static const struct btf_kfunc_id_set common_kfunc_set = {
> [...]
Yonghong Song Nov. 13, 2024, 6:42 p.m. UTC | #14
On 11/13/24 9:52 AM, Vadim Fedorenko wrote:
> On 13/11/2024 17:38, Yonghong Song wrote:
>>
>>
>>
>> On 11/8/24 4:41 PM, Vadim Fedorenko wrote:
>>> New kfunc to return ARCH-specific timecounter. For x86 BPF JIT converts
>>> it into rdtsc ordered call. Other architectures will get JIT
>>> implementation too if supported. The fallback is to
>>> __arch_get_hw_counter().
>>>
>>> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
>>> ---
>>> v4 -> v5:
>>> * use if instead of ifdef with IS_ENABLED
>>> v3 -> v4:
>>> * change name of the helper to bpf_get_cpu_cycles (Andrii)
>>> * Hide the helper behind CONFIG_GENERIC_GETTIMEOFDAY to avoid exposing
>>>    it on architectures which do not have vDSO functions and data
>>> * reduce the scope of check of inlined functions in verifier to only 2,
>>>    which are actually inlined.
>>> v2 -> v3:
>>> * change name of the helper to bpf_get_cpu_cycles_counter to explicitly
>>>    mention what counter it provides (Andrii)
>>> * move kfunc definition to bpf.h to use it in JIT.
>>> * introduce another kfunc to convert cycles into nanoseconds as more
>>>    meaningful time units for generic tracing use case (Andrii)
>>> v1 -> v2:
>>> * Fix incorrect function return value type to u64
>>> * Introduce bpf_jit_inlines_kfunc_call() and use it in
>>>    mark_fastcall_pattern_for_call() to avoid clobbering in case of
>>>    running programs with no JIT (Eduard)
>>> * Avoid rewriting instruction and check function pointer directly
>>>    in JIT (Alexei)
>>> * Change includes to fix compile issues on non x86 architectures
>>> ---
>>>   arch/x86/net/bpf_jit_comp.c   | 28 ++++++++++++++++++++++++++++
>>>   arch/x86/net/bpf_jit_comp32.c | 14 ++++++++++++++
>>>   include/linux/bpf.h           |  5 +++++
>>>   include/linux/filter.h        |  1 +
>>>   kernel/bpf/core.c             | 11 +++++++++++
>>>   kernel/bpf/helpers.c          | 13 +++++++++++++
>>>   kernel/bpf/verifier.c         | 30 +++++++++++++++++++++++++++++-
>>>   7 files changed, 101 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
>>> index 06b080b61aa5..4f78ed93ee7f 100644
>>> --- a/arch/x86/net/bpf_jit_comp.c
>>> +++ b/arch/x86/net/bpf_jit_comp.c
>>> @@ -2126,6 +2126,26 @@ st:            if (is_imm8(insn->off))
>>>           case BPF_JMP | BPF_CALL: {
>>>               u8 *ip = image + addrs[i - 1];
>>> +            if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
>>> +                imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
>>> +                /* Save RDX because RDTSC will use EDX:EAX to 
>>> return u64 */
>>> +                emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
>>> +                if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
>>> +                    EMIT_LFENCE();
>>> +                EMIT2(0x0F, 0x31);
>>> +
>>> +                /* shl RDX, 32 */
>>> +                maybe_emit_1mod(&prog, BPF_REG_3, true);
>>> +                EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
>>> +                /* or RAX, RDX */
>>> +                maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
>>> +                EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
>>> +                /* restore RDX from R11 */
>>> +                emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
>>> +
>>> +                break;
>>> +            }
>>> +
>>>               func = (u8 *) __bpf_call_base + imm32;
>>>               if (tail_call_reachable) {
>>> LOAD_TAIL_CALL_CNT_PTR(bpf_prog->aux->stack_depth);
>>> @@ -3652,3 +3672,11 @@ u64 bpf_arch_uaddress_limit(void)
>>>   {
>>>       return 0;
>>>   }
>>> +
>>> +/* x86-64 JIT can inline kfunc */
>>> +bool bpf_jit_inlines_kfunc_call(s32 imm)
>>> +{
>>> +    if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
>>> +        return true;
>>> +    return false;
>>> +}
>>> diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/ 
>>> bpf_jit_comp32.c
>>> index de0f9e5f9f73..e6097a371b69 100644
>>> --- a/arch/x86/net/bpf_jit_comp32.c
>>> +++ b/arch/x86/net/bpf_jit_comp32.c
>>> @@ -2094,6 +2094,13 @@ static int do_jit(struct bpf_prog *bpf_prog, 
>>> int *addrs, u8 *image,
>>>               if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
>>>                   int err;
>>> +                if (imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
>>> +                    if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
>>> +                        EMIT3(0x0F, 0xAE, 0xE8);
>>> +                    EMIT2(0x0F, 0x31);
>>> +                    break;
>>> +                }
>>> +
>>>                   err = emit_kfunc_call(bpf_prog,
>>>                                 image + addrs[i],
>>>                                 insn, &prog);
>>> @@ -2621,3 +2628,10 @@ bool bpf_jit_supports_kfunc_call(void)
>>>   {
>>>       return true;
>>>   }
>>> +
>>> +bool bpf_jit_inlines_kfunc_call(s32 imm)
>>> +{
>>> +    if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
>>> +        return true;
>>> +    return false;
>>> +}
>>
>> [...]
>>
>>> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
>>> index 395221e53832..5c6c0383ebf4 100644
>>> --- a/kernel/bpf/helpers.c
>>> +++ b/kernel/bpf/helpers.c
>>> @@ -23,6 +23,9 @@
>>>   #include <linux/btf_ids.h>
>>>   #include <linux/bpf_mem_alloc.h>
>>>   #include <linux/kasan.h>
>>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>>> +#include <vdso/datapage.h>
>>> +#endif
>>>   #include "../../lib/kstrtox.h"
>>> @@ -3023,6 +3026,13 @@ __bpf_kfunc int bpf_copy_from_user_str(void 
>>> *dst, u32 dst__sz, const void __user
>>>       return ret + 1;
>>>   }
>>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>>> +__bpf_kfunc u64 bpf_get_cpu_cycles(void)
>>> +{
>>> +    return __arch_get_hw_counter(1, NULL);
>>
>> Some comment to explain what '1' mean in the above?
>
> That's arch-specific value which translates to HW implemented counter on
> all architectures which have vDSO gettimeofday() implementation.
>
> For x86 it translates to VDSO_CLOCKMODE_TSC, while for aarch64/RISC-V
> it's VDSO_CLOCKMODE_ARCHTIMER. Actually, for RISC-V the value of the
> first parameter doesn't matter at all, for aarch64 it should be 0.
> The only arch which is more strict about this parameter is x86, but it
> has it's own special name...

So in the future, if we want to add aarch64 support or another architecture,
the argument could be different, right?

I think we should avoid having arch-specific control in helpers.c.
How about we define a __weak func like bpf_arch_get_hw_counter(), so we
have

__bpf_kfunc u64 bpf_get_cpu_cycles(void)
{
	return bpf_arch_get_hw_counter();
}

Each arch can implement its own bpf_arch_get_hw_counter().
Do you think this will make more sense? This should not impact JIT inlining
of this kfunc.
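
A rough sketch of how that could be wired up, just to illustrate the idea
(bpf_arch_get_hw_counter() is only a proposed name here, not an existing
kernel symbol):

/* kernel/bpf/helpers.c: generic fallback, overridable per architecture */
u64 __weak bpf_arch_get_hw_counter(void)
{
#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
	return __arch_get_hw_counter(1, NULL);
#else
	return 0;
#endif
}

/* e.g. somewhere under arch/x86/: strong definition overrides the weak one */
u64 bpf_arch_get_hw_counter(void)
{
	return rdtsc_ordered();
}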

>
>>
>>> +}
>>> +#endif
>>> +
>>>   __bpf_kfunc_end_defs();
>>>   BTF_KFUNCS_START(generic_btf_ids)
>>> @@ -3115,6 +3125,9 @@ BTF_ID_FLAGS(func, bpf_get_kmem_cache)
>>>   BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | 
>>> KF_SLEEPABLE)
>>>   BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | 
>>> KF_RET_NULL | KF_SLEEPABLE)
>>>   BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | 
>>> KF_SLEEPABLE)
>>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>>> +BTF_ID_FLAGS(func, bpf_get_cpu_cycles, KF_FASTCALL)
>>> +#endif
>>>   BTF_KFUNCS_END(common_btf_ids)
>>>   static const struct btf_kfunc_id_set common_kfunc_set = {
>> [...]
>
Vadim Fedorenko Nov. 13, 2024, 10:28 p.m. UTC | #15
On 13/11/2024 18:42, Yonghong Song wrote:
> 
> 
> 
> On 11/13/24 9:52 AM, Vadim Fedorenko wrote:
>> On 13/11/2024 17:38, Yonghong Song wrote:
>>>
>>>
>>>
>>> On 11/8/24 4:41 PM, Vadim Fedorenko wrote:
>>>> New kfunc to return ARCH-specific timecounter. For x86 BPF JIT converts
>>>> it into rdtsc ordered call. Other architectures will get JIT
>>>> implementation too if supported. The fallback is to
>>>> __arch_get_hw_counter().
>>>>
>>>> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
>>>> ---
>>>> v4 -> v5:
>>>> * use if instead of ifdef with IS_ENABLED
>>>> v3 -> v4:
>>>> * change name of the helper to bpf_get_cpu_cycles (Andrii)
>>>> * Hide the helper behind CONFIG_GENERIC_GETTIMEOFDAY to avoid exposing
>>>>    it on architectures which do not have vDSO functions and data
>>>> * reduce the scope of check of inlined functions in verifier to only 2,
>>>>    which are actually inlined.
>>>> v2 -> v3:
>>>> * change name of the helper to bpf_get_cpu_cycles_counter to explicitly
>>>>    mention what counter it provides (Andrii)
>>>> * move kfunc definition to bpf.h to use it in JIT.
>>>> * introduce another kfunc to convert cycles into nanoseconds as more
>>>>    meaningful time units for generic tracing use case (Andrii)
>>>> v1 -> v2:
>>>> * Fix incorrect function return value type to u64
>>>> * Introduce bpf_jit_inlines_kfunc_call() and use it in
>>>>    mark_fastcall_pattern_for_call() to avoid clobbering in case of
>>>>    running programs with no JIT (Eduard)
>>>> * Avoid rewriting instruction and check function pointer directly
>>>>    in JIT (Alexei)
>>>> * Change includes to fix compile issues on non x86 architectures
>>>> ---
>>>>   arch/x86/net/bpf_jit_comp.c   | 28 ++++++++++++++++++++++++++++
>>>>   arch/x86/net/bpf_jit_comp32.c | 14 ++++++++++++++
>>>>   include/linux/bpf.h           |  5 +++++
>>>>   include/linux/filter.h        |  1 +
>>>>   kernel/bpf/core.c             | 11 +++++++++++
>>>>   kernel/bpf/helpers.c          | 13 +++++++++++++
>>>>   kernel/bpf/verifier.c         | 30 +++++++++++++++++++++++++++++-
>>>>   7 files changed, 101 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
>>>> index 06b080b61aa5..4f78ed93ee7f 100644
>>>> --- a/arch/x86/net/bpf_jit_comp.c
>>>> +++ b/arch/x86/net/bpf_jit_comp.c
>>>> @@ -2126,6 +2126,26 @@ st:            if (is_imm8(insn->off))
>>>>           case BPF_JMP | BPF_CALL: {
>>>>               u8 *ip = image + addrs[i - 1];
>>>> +            if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
>>>> +                imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
>>>> +                /* Save RDX because RDTSC will use EDX:EAX to 
>>>> return u64 */
>>>> +                emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
>>>> +                if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
>>>> +                    EMIT_LFENCE();
>>>> +                EMIT2(0x0F, 0x31);
>>>> +
>>>> +                /* shl RDX, 32 */
>>>> +                maybe_emit_1mod(&prog, BPF_REG_3, true);
>>>> +                EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
>>>> +                /* or RAX, RDX */
>>>> +                maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
>>>> +                EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
>>>> +                /* restore RDX from R11 */
>>>> +                emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
>>>> +
>>>> +                break;
>>>> +            }
>>>> +
>>>>               func = (u8 *) __bpf_call_base + imm32;
>>>>               if (tail_call_reachable) {
>>>> LOAD_TAIL_CALL_CNT_PTR(bpf_prog->aux->stack_depth);
>>>> @@ -3652,3 +3672,11 @@ u64 bpf_arch_uaddress_limit(void)
>>>>   {
>>>>       return 0;
>>>>   }
>>>> +
>>>> +/* x86-64 JIT can inline kfunc */
>>>> +bool bpf_jit_inlines_kfunc_call(s32 imm)
>>>> +{
>>>> +    if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
>>>> +        return true;
>>>> +    return false;
>>>> +}
>>>> diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/ 
>>>> bpf_jit_comp32.c
>>>> index de0f9e5f9f73..e6097a371b69 100644
>>>> --- a/arch/x86/net/bpf_jit_comp32.c
>>>> +++ b/arch/x86/net/bpf_jit_comp32.c
>>>> @@ -2094,6 +2094,13 @@ static int do_jit(struct bpf_prog *bpf_prog, 
>>>> int *addrs, u8 *image,
>>>>               if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
>>>>                   int err;
>>>> +                if (imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
>>>> +                    if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
>>>> +                        EMIT3(0x0F, 0xAE, 0xE8);
>>>> +                    EMIT2(0x0F, 0x31);
>>>> +                    break;
>>>> +                }
>>>> +
>>>>                   err = emit_kfunc_call(bpf_prog,
>>>>                                 image + addrs[i],
>>>>                                 insn, &prog);
>>>> @@ -2621,3 +2628,10 @@ bool bpf_jit_supports_kfunc_call(void)
>>>>   {
>>>>       return true;
>>>>   }
>>>> +
>>>> +bool bpf_jit_inlines_kfunc_call(s32 imm)
>>>> +{
>>>> +    if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
>>>> +        return true;
>>>> +    return false;
>>>> +}
>>>
>>> [...]
>>>
>>>> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
>>>> index 395221e53832..5c6c0383ebf4 100644
>>>> --- a/kernel/bpf/helpers.c
>>>> +++ b/kernel/bpf/helpers.c
>>>> @@ -23,6 +23,9 @@
>>>>   #include <linux/btf_ids.h>
>>>>   #include <linux/bpf_mem_alloc.h>
>>>>   #include <linux/kasan.h>
>>>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>>>> +#include <vdso/datapage.h>
>>>> +#endif
>>>>   #include "../../lib/kstrtox.h"
>>>> @@ -3023,6 +3026,13 @@ __bpf_kfunc int bpf_copy_from_user_str(void 
>>>> *dst, u32 dst__sz, const void __user
>>>>       return ret + 1;
>>>>   }
>>>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>>>> +__bpf_kfunc u64 bpf_get_cpu_cycles(void)
>>>> +{
>>>> +    return __arch_get_hw_counter(1, NULL);
>>>
>>> Some comment to explain what '1' mean in the above?
>>
>> That's arch-specific value which translates to HW implemented counter on
>> all architectures which have vDSO gettimeofday() implementation.
>>
>> For x86 it translates to VDSO_CLOCKMODE_TSC, while for aarch64/RISC-V
>> it's VDSO_CLOCKMODE_ARCHTIMER. Actually, for RISC-V the value of the
>> first parameter doesn't matter at all, for aarch64 it should be 0.
>> The only arch which is more strict about this parameter is x86, but it
>> has it's own special name...
> 
> So in the future, if we want add aarch64 support or other architecture,
> the argument could be different, right?

No, that's the point. This value will be the same for all architectures.
I'll do the implementation for aarch64 once this series is in.
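
For what it's worth, on aarch64 the counter read would boil down to the
architected timer, something like the hypothetical helper below (not part of
this series, and ignoring the errata handling that
__arch_counter_get_cntvct() carries):

/* hypothetical helper, for illustration only */
static inline u64 arm64_read_cycle_counter(void)
{
	u64 cnt;

	/* isb orders the counter read against preceding instructions */
	asm volatile("isb\n\tmrs %0, cntvct_el0" : "=r" (cnt));
	return cnt;
}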

> 
> I think we should avoid to have arch specific control in helpers.c.
> How about we define a __weak func like bpf_arch_get_hw_counter() so we
> have
> 
> __bpf_kfunc u64 bpf_get_cpu_cycles(void)
> {
>      return bpf_arch_get_hw_counter();
> }
> 
> Each arch can implement their own bpf_arch_get_hw_counter().
> Do you think this will make more sense? This should not impact jit inlining
> of this kfunc.
> 
>>
>>>
>>>> +}
>>>> +#endif
>>>> +
>>>>   __bpf_kfunc_end_defs();
>>>>   BTF_KFUNCS_START(generic_btf_ids)
>>>> @@ -3115,6 +3125,9 @@ BTF_ID_FLAGS(func, bpf_get_kmem_cache)
>>>>   BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | 
>>>> KF_SLEEPABLE)
>>>>   BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | 
>>>> KF_RET_NULL | KF_SLEEPABLE)
>>>>   BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | 
>>>> KF_SLEEPABLE)
>>>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>>>> +BTF_ID_FLAGS(func, bpf_get_cpu_cycles, KF_FASTCALL)
>>>> +#endif
>>>>   BTF_KFUNCS_END(common_btf_ids)
>>>>   static const struct btf_kfunc_id_set common_kfunc_set = {
>>> [...]
>>
>
Yonghong Song Nov. 13, 2024, 11:02 p.m. UTC | #16
On 11/13/24 2:28 PM, Vadim Fedorenko wrote:
> On 13/11/2024 18:42, Yonghong Song wrote:
>>
>>
>>
>> On 11/13/24 9:52 AM, Vadim Fedorenko wrote:
>>> On 13/11/2024 17:38, Yonghong Song wrote:
>>>>
>>>>
>>>>
>>>> On 11/8/24 4:41 PM, Vadim Fedorenko wrote:
>>>>> New kfunc to return ARCH-specific timecounter. For x86 BPF JIT 
>>>>> converts
>>>>> it into rdtsc ordered call. Other architectures will get JIT
>>>>> implementation too if supported. The fallback is to
>>>>> __arch_get_hw_counter().
>>>>>
>>>>> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
>>>>> ---
>>>>> v4 -> v5:
>>>>> * use if instead of ifdef with IS_ENABLED
>>>>> v3 -> v4:
>>>>> * change name of the helper to bpf_get_cpu_cycles (Andrii)
>>>>> * Hide the helper behind CONFIG_GENERIC_GETTIMEOFDAY to avoid 
>>>>> exposing
>>>>>    it on architectures which do not have vDSO functions and data
>>>>> * reduce the scope of check of inlined functions in verifier to 
>>>>> only 2,
>>>>>    which are actually inlined.
>>>>> v2 -> v3:
>>>>> * change name of the helper to bpf_get_cpu_cycles_counter to 
>>>>> explicitly
>>>>>    mention what counter it provides (Andrii)
>>>>> * move kfunc definition to bpf.h to use it in JIT.
>>>>> * introduce another kfunc to convert cycles into nanoseconds as more
>>>>>    meaningful time units for generic tracing use case (Andrii)
>>>>> v1 -> v2:
>>>>> * Fix incorrect function return value type to u64
>>>>> * Introduce bpf_jit_inlines_kfunc_call() and use it in
>>>>>    mark_fastcall_pattern_for_call() to avoid clobbering in case of
>>>>>    running programs with no JIT (Eduard)
>>>>> * Avoid rewriting instruction and check function pointer directly
>>>>>    in JIT (Alexei)
>>>>> * Change includes to fix compile issues on non x86 architectures
>>>>> ---
>>>>>   arch/x86/net/bpf_jit_comp.c   | 28 ++++++++++++++++++++++++++++
>>>>>   arch/x86/net/bpf_jit_comp32.c | 14 ++++++++++++++
>>>>>   include/linux/bpf.h           |  5 +++++
>>>>>   include/linux/filter.h        |  1 +
>>>>>   kernel/bpf/core.c             | 11 +++++++++++
>>>>>   kernel/bpf/helpers.c          | 13 +++++++++++++
>>>>>   kernel/bpf/verifier.c         | 30 +++++++++++++++++++++++++++++-
>>>>>   7 files changed, 101 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/arch/x86/net/bpf_jit_comp.c 
>>>>> b/arch/x86/net/bpf_jit_comp.c
>>>>> index 06b080b61aa5..4f78ed93ee7f 100644
>>>>> --- a/arch/x86/net/bpf_jit_comp.c
>>>>> +++ b/arch/x86/net/bpf_jit_comp.c
>>>>> @@ -2126,6 +2126,26 @@ st:            if (is_imm8(insn->off))
>>>>>           case BPF_JMP | BPF_CALL: {
>>>>>               u8 *ip = image + addrs[i - 1];
>>>>> +            if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
>>>>> +                imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
>>>>> +                /* Save RDX because RDTSC will use EDX:EAX to 
>>>>> return u64 */
>>>>> +                emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
>>>>> +                if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
>>>>> +                    EMIT_LFENCE();
>>>>> +                EMIT2(0x0F, 0x31);
>>>>> +
>>>>> +                /* shl RDX, 32 */
>>>>> +                maybe_emit_1mod(&prog, BPF_REG_3, true);
>>>>> +                EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
>>>>> +                /* or RAX, RDX */
>>>>> +                maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
>>>>> +                EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
>>>>> +                /* restore RDX from R11 */
>>>>> +                emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
>>>>> +
>>>>> +                break;
>>>>> +            }
>>>>> +
>>>>>               func = (u8 *) __bpf_call_base + imm32;
>>>>>               if (tail_call_reachable) {
>>>>> LOAD_TAIL_CALL_CNT_PTR(bpf_prog->aux->stack_depth);
>>>>> @@ -3652,3 +3672,11 @@ u64 bpf_arch_uaddress_limit(void)
>>>>>   {
>>>>>       return 0;
>>>>>   }
>>>>> +
>>>>> +/* x86-64 JIT can inline kfunc */
>>>>> +bool bpf_jit_inlines_kfunc_call(s32 imm)
>>>>> +{
>>>>> +    if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
>>>>> +        return true;
>>>>> +    return false;
>>>>> +}
>>>>> diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/ 
>>>>> bpf_jit_comp32.c
>>>>> index de0f9e5f9f73..e6097a371b69 100644
>>>>> --- a/arch/x86/net/bpf_jit_comp32.c
>>>>> +++ b/arch/x86/net/bpf_jit_comp32.c
>>>>> @@ -2094,6 +2094,13 @@ static int do_jit(struct bpf_prog 
>>>>> *bpf_prog, int *addrs, u8 *image,
>>>>>               if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
>>>>>                   int err;
>>>>> +                if (imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
>>>>> +                    if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
>>>>> +                        EMIT3(0x0F, 0xAE, 0xE8);
>>>>> +                    EMIT2(0x0F, 0x31);
>>>>> +                    break;
>>>>> +                }
>>>>> +
>>>>>                   err = emit_kfunc_call(bpf_prog,
>>>>>                                 image + addrs[i],
>>>>>                                 insn, &prog);
>>>>> @@ -2621,3 +2628,10 @@ bool bpf_jit_supports_kfunc_call(void)
>>>>>   {
>>>>>       return true;
>>>>>   }
>>>>> +
>>>>> +bool bpf_jit_inlines_kfunc_call(s32 imm)
>>>>> +{
>>>>> +    if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
>>>>> +        return true;
>>>>> +    return false;
>>>>> +}
>>>>
>>>> [...]
>>>>
>>>>> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
>>>>> index 395221e53832..5c6c0383ebf4 100644
>>>>> --- a/kernel/bpf/helpers.c
>>>>> +++ b/kernel/bpf/helpers.c
>>>>> @@ -23,6 +23,9 @@
>>>>>   #include <linux/btf_ids.h>
>>>>>   #include <linux/bpf_mem_alloc.h>
>>>>>   #include <linux/kasan.h>
>>>>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>>>>> +#include <vdso/datapage.h>
>>>>> +#endif
>>>>>   #include "../../lib/kstrtox.h"
>>>>> @@ -3023,6 +3026,13 @@ __bpf_kfunc int bpf_copy_from_user_str(void 
>>>>> *dst, u32 dst__sz, const void __user
>>>>>       return ret + 1;
>>>>>   }
>>>>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>>>>> +__bpf_kfunc u64 bpf_get_cpu_cycles(void)
>>>>> +{
>>>>> +    return __arch_get_hw_counter(1, NULL);
>>>>
>>>> Some comment to explain what '1' mean in the above?
>>>
>>> That's arch-specific value which translates to HW implemented 
>>> counter on
>>> all architectures which have vDSO gettimeofday() implementation.
>>>
>>> For x86 it translates to VDSO_CLOCKMODE_TSC, while for aarch64/RISC-V
>>> it's VDSO_CLOCKMODE_ARCHTIMER. Actually, for RISC-V the value of the
>>> first parameter doesn't matter at all, for aarch64 it should be 0.
>>> The only arch which is more strict about this parameter is x86, but it
>>> has it's own special name...
>>
>> So in the future, if we want add aarch64 support or other architecture,
>> the argument could be different, right?
>
> No, that's the point. This value will be the same for all architectures.
> I'll do the implementation for aarch64 once this series is in.

I did a little bit of research and the following are the two callsites for
__arch_get_hw_counter:
0 lib/vdso/gettimeofday.c do_hres_timens  96 cycles = __arch_get_hw_counter(vd->clock_mode, vd);
1 lib/vdso/gettimeofday.c do_hres        164 cycles = __arch_get_hw_counter(vd->clock_mode, vd);

Let us pick func do_hres_timens():

	vd = vdns - (clk == CLOCK_MONOTONIC_RAW ? CS_RAW : CS_HRES_COARSE);
	vd = __arch_get_timens_vdso_data(vd);
         if (clk != CLOCK_MONOTONIC_RAW)
                 vd = &vd[CS_HRES_COARSE];
         else
                 vd = &vd[CS_RAW];
		..
		cycles = __arch_get_hw_counter(vd->clock_mode, vd);

So 'vd' is supplied by an arch-specific func (__arch_get_timens_vdso_data()),
so theoretically vd->clock_mode in __arch_get_hw_counter(vd->clock_mode, vd)
could be different for different archs. The other case in do_hres() is similar.

But if you are sure that the first argument is the same for all architectures, please
add a comment right above __arch_get_hw_counter() to say
	All architectures have the same parameters as below

>
>>
>> I think we should avoid to have arch specific control in helpers.c.
>> How about we define a __weak func like bpf_arch_get_hw_counter() so we
>> have
>>
>> __bpf_kfunc u64 bpf_get_cpu_cycles(void)
>> {
>>      return bpf_arch_get_hw_counter();
>> }
>>
>> Each arch can implement their own bpf_arch_get_hw_counter().
>> Do you think this will make more sense? This should not impact jit 
>> inlining
>> of this kfunc.
>>
>>>
>>>>
>>>>> +}
>>>>> +#endif
>>>>> +
>>>>>   __bpf_kfunc_end_defs();
>>>>>   BTF_KFUNCS_START(generic_btf_ids)
>>>>> @@ -3115,6 +3125,9 @@ BTF_ID_FLAGS(func, bpf_get_kmem_cache)
>>>>>   BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | 
>>>>> KF_SLEEPABLE)
>>>>>   BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | 
>>>>> KF_RET_NULL | KF_SLEEPABLE)
>>>>>   BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY 
>>>>> | KF_SLEEPABLE)
>>>>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>>>>> +BTF_ID_FLAGS(func, bpf_get_cpu_cycles, KF_FASTCALL)
>>>>> +#endif
>>>>>   BTF_KFUNCS_END(common_btf_ids)
>>>>>   static const struct btf_kfunc_id_set common_kfunc_set = {
>>>> [...]
>>>
>>
>
Vadim Fedorenko Nov. 14, 2024, 1:05 a.m. UTC | #17
On 13/11/2024 23:02, Yonghong Song wrote:
> 
> 
> 
> On 11/13/24 2:28 PM, Vadim Fedorenko wrote:
>> On 13/11/2024 18:42, Yonghong Song wrote:
>>>
>>>
>>>
>>> On 11/13/24 9:52 AM, Vadim Fedorenko wrote:
>>>> On 13/11/2024 17:38, Yonghong Song wrote:
>>>>>
>>>>>
>>>>>
>>>>> On 11/8/24 4:41 PM, Vadim Fedorenko wrote:
>>>>>> New kfunc to return ARCH-specific timecounter. For x86 BPF JIT 
>>>>>> converts
>>>>>> it into rdtsc ordered call. Other architectures will get JIT
>>>>>> implementation too if supported. The fallback is to
>>>>>> __arch_get_hw_counter().
>>>>>>
>>>>>> Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
>>>>>> ---
>>>>>> v4 -> v5:
>>>>>> * use if instead of ifdef with IS_ENABLED
>>>>>> v3 -> v4:
>>>>>> * change name of the helper to bpf_get_cpu_cycles (Andrii)
>>>>>> * Hide the helper behind CONFIG_GENERIC_GETTIMEOFDAY to avoid 
>>>>>> exposing
>>>>>>    it on architectures which do not have vDSO functions and data
>>>>>> * reduce the scope of check of inlined functions in verifier to 
>>>>>> only 2,
>>>>>>    which are actually inlined.
>>>>>> v2 -> v3:
>>>>>> * change name of the helper to bpf_get_cpu_cycles_counter to 
>>>>>> explicitly
>>>>>>    mention what counter it provides (Andrii)
>>>>>> * move kfunc definition to bpf.h to use it in JIT.
>>>>>> * introduce another kfunc to convert cycles into nanoseconds as more
>>>>>>    meaningful time units for generic tracing use case (Andrii)
>>>>>> v1 -> v2:
>>>>>> * Fix incorrect function return value type to u64
>>>>>> * Introduce bpf_jit_inlines_kfunc_call() and use it in
>>>>>>    mark_fastcall_pattern_for_call() to avoid clobbering in case of
>>>>>>    running programs with no JIT (Eduard)
>>>>>> * Avoid rewriting instruction and check function pointer directly
>>>>>>    in JIT (Alexei)
>>>>>> * Change includes to fix compile issues on non x86 architectures
>>>>>> ---
>>>>>>   arch/x86/net/bpf_jit_comp.c   | 28 ++++++++++++++++++++++++++++
>>>>>>   arch/x86/net/bpf_jit_comp32.c | 14 ++++++++++++++
>>>>>>   include/linux/bpf.h           |  5 +++++
>>>>>>   include/linux/filter.h        |  1 +
>>>>>>   kernel/bpf/core.c             | 11 +++++++++++
>>>>>>   kernel/bpf/helpers.c          | 13 +++++++++++++
>>>>>>   kernel/bpf/verifier.c         | 30 +++++++++++++++++++++++++++++-
>>>>>>   7 files changed, 101 insertions(+), 1 deletion(-)
>>>>>>
>>>>>> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/ 
>>>>>> bpf_jit_comp.c
>>>>>> index 06b080b61aa5..4f78ed93ee7f 100644
>>>>>> --- a/arch/x86/net/bpf_jit_comp.c
>>>>>> +++ b/arch/x86/net/bpf_jit_comp.c
>>>>>> @@ -2126,6 +2126,26 @@ st:            if (is_imm8(insn->off))
>>>>>>           case BPF_JMP | BPF_CALL: {
>>>>>>               u8 *ip = image + addrs[i - 1];
>>>>>> +            if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
>>>>>> +                imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
>>>>>> +                /* Save RDX because RDTSC will use EDX:EAX to 
>>>>>> return u64 */
>>>>>> +                emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
>>>>>> +                if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
>>>>>> +                    EMIT_LFENCE();
>>>>>> +                EMIT2(0x0F, 0x31);
>>>>>> +
>>>>>> +                /* shl RDX, 32 */
>>>>>> +                maybe_emit_1mod(&prog, BPF_REG_3, true);
>>>>>> +                EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
>>>>>> +                /* or RAX, RDX */
>>>>>> +                maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
>>>>>> +                EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
>>>>>> +                /* restore RDX from R11 */
>>>>>> +                emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
>>>>>> +
>>>>>> +                break;
>>>>>> +            }
>>>>>> +
>>>>>>               func = (u8 *) __bpf_call_base + imm32;
>>>>>>               if (tail_call_reachable) {
>>>>>> LOAD_TAIL_CALL_CNT_PTR(bpf_prog->aux->stack_depth);
>>>>>> @@ -3652,3 +3672,11 @@ u64 bpf_arch_uaddress_limit(void)
>>>>>>   {
>>>>>>       return 0;
>>>>>>   }
>>>>>> +
>>>>>> +/* x86-64 JIT can inline kfunc */
>>>>>> +bool bpf_jit_inlines_kfunc_call(s32 imm)
>>>>>> +{
>>>>>> +    if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
>>>>>> +        return true;
>>>>>> +    return false;
>>>>>> +}
>>>>>> diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/ 
>>>>>> bpf_jit_comp32.c
>>>>>> index de0f9e5f9f73..e6097a371b69 100644
>>>>>> --- a/arch/x86/net/bpf_jit_comp32.c
>>>>>> +++ b/arch/x86/net/bpf_jit_comp32.c
>>>>>> @@ -2094,6 +2094,13 @@ static int do_jit(struct bpf_prog 
>>>>>> *bpf_prog, int *addrs, u8 *image,
>>>>>>               if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
>>>>>>                   int err;
>>>>>> +                if (imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
>>>>>> +                    if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
>>>>>> +                        EMIT3(0x0F, 0xAE, 0xE8);
>>>>>> +                    EMIT2(0x0F, 0x31);
>>>>>> +                    break;
>>>>>> +                }
>>>>>> +
>>>>>>                   err = emit_kfunc_call(bpf_prog,
>>>>>>                                 image + addrs[i],
>>>>>>                                 insn, &prog);
>>>>>> @@ -2621,3 +2628,10 @@ bool bpf_jit_supports_kfunc_call(void)
>>>>>>   {
>>>>>>       return true;
>>>>>>   }
>>>>>> +
>>>>>> +bool bpf_jit_inlines_kfunc_call(s32 imm)
>>>>>> +{
>>>>>> +    if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
>>>>>> +        return true;
>>>>>> +    return false;
>>>>>> +}
>>>>>
>>>>> [...]
>>>>>
>>>>>> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
>>>>>> index 395221e53832..5c6c0383ebf4 100644
>>>>>> --- a/kernel/bpf/helpers.c
>>>>>> +++ b/kernel/bpf/helpers.c
>>>>>> @@ -23,6 +23,9 @@
>>>>>>   #include <linux/btf_ids.h>
>>>>>>   #include <linux/bpf_mem_alloc.h>
>>>>>>   #include <linux/kasan.h>
>>>>>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>>>>>> +#include <vdso/datapage.h>
>>>>>> +#endif
>>>>>>   #include "../../lib/kstrtox.h"
>>>>>> @@ -3023,6 +3026,13 @@ __bpf_kfunc int bpf_copy_from_user_str(void 
>>>>>> *dst, u32 dst__sz, const void __user
>>>>>>       return ret + 1;
>>>>>>   }
>>>>>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>>>>>> +__bpf_kfunc u64 bpf_get_cpu_cycles(void)
>>>>>> +{
>>>>>> +    return __arch_get_hw_counter(1, NULL);
>>>>>
>>>>> Some comment to explain what '1' mean in the above?
>>>>
>>>> That's arch-specific value which translates to HW implemented 
>>>> counter on
>>>> all architectures which have vDSO gettimeofday() implementation.
>>>>
>>>> For x86 it translates to VDSO_CLOCKMODE_TSC, while for aarch64/RISC-V
>>>> it's VDSO_CLOCKMODE_ARCHTIMER. Actually, for RISC-V the value of the
>>>> first parameter doesn't matter at all, for aarch64 it should be 0.
>>>> The only arch which is more strict about this parameter is x86, but it
>>>> has it's own special name...
>>>
>>> So in the future, if we want add aarch64 support or other architecture,
>>> the argument could be different, right?
>>
>> No, that's the point. This value will be the same for all architectures.
>> I'll do the implementation for aarch64 once this series is in.
> 
> I did a little bit research and the following are two callsites for
> __arch_get_hw_counter:
> 0 lib/vdso/gettimeofday.c do_hres_timens  96 cycles = 
> __arch_get_hw_counter(vd->clock_mode, vd);
> 1 lib/vdso/gettimeofday.c do_hres        164 cycles = 
> __arch_get_hw_counter(vd->clock_mode, vd);
> 
> Let us pick func do_hres_timens():
> 
>      vd = vdns - (clk == CLOCK_MONOTONIC_RAW ? CS_RAW : CS_HRES_COARSE);
>      vd = __arch_get_timens_vdso_data(vd);
>          if (clk != CLOCK_MONOTONIC_RAW)
>                  vd = &vd[CS_HRES_COARSE];
>          else
>                  vd = &vd[CS_RAW];
>          ..
>          cycles = __arch_get_hw_counter(vd->clock_mode, vd);
> 
> So 'vd' is supplied by arch specific func (__arch_get_timens_vdso_data
> ()), so theoretically vd->clock_mode in __arch_get_hw_counter(vd- 
>  >clock_mode, vd)
> could be different for different archs. The other case in do_hres() is 
> similar.

Well, ok, even though I'm pretty sure all the implementations will end
up with 1 as the value, let's be on the safe side and implement it the
same way, reading the vdso data structure. Now, with the cycles2ns kfunc,
we include all the needed pieces, so there should be no blockers on this
implementation. Thanks for bringing it up.
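
One possible shape for that, as an untested sketch (assuming
__arch_get_k_vdso_data() from asm/vdso/vsyscall.h is usable here; the actual
respin may well differ):

__bpf_kfunc u64 bpf_get_cpu_cycles(void)
{
	const struct vdso_data *vd = __arch_get_k_vdso_data();

	/* read the clock mode from the vDSO data instead of hard-coding 1 */
	vd = &vd[CS_RAW];
	return __arch_get_hw_counter(vd->clock_mode, vd);
}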

> 
> But if you are sure that the first argument is the same for all 
> architectures, please
> add a comment right above __arch_get_hw_counter() to say
>      All architectures have the same parameters as below
> 
>>
>>>
>>> I think we should avoid to have arch specific control in helpers.c.
>>> How about we define a __weak func like bpf_arch_get_hw_counter() so we
>>> have
>>>
>>> __bpf_kfunc u64 bpf_get_cpu_cycles(void)
>>> {
>>>      return bpf_arch_get_hw_counter();
>>> }
>>>
>>> Each arch can implement their own bpf_arch_get_hw_counter().
>>> Do you think this will make more sense? This should not impact jit 
>>> inlining
>>> of this kfunc.
>>>
>>>>
>>>>>
>>>>>> +}
>>>>>> +#endif
>>>>>> +
>>>>>>   __bpf_kfunc_end_defs();
>>>>>>   BTF_KFUNCS_START(generic_btf_ids)
>>>>>> @@ -3115,6 +3125,9 @@ BTF_ID_FLAGS(func, bpf_get_kmem_cache)
>>>>>>   BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | 
>>>>>> KF_SLEEPABLE)
>>>>>>   BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | 
>>>>>> KF_RET_NULL | KF_SLEEPABLE)
>>>>>>   BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY 
>>>>>> | KF_SLEEPABLE)
>>>>>> +#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
>>>>>> +BTF_ID_FLAGS(func, bpf_get_cpu_cycles, KF_FASTCALL)
>>>>>> +#endif
>>>>>>   BTF_KFUNCS_END(common_btf_ids)
>>>>>>   static const struct btf_kfunc_id_set common_kfunc_set = {
>>>>> [...]
>>>>
>>>
>>
>
diff mbox series

Patch

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 06b080b61aa5..4f78ed93ee7f 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -2126,6 +2126,26 @@  st:			if (is_imm8(insn->off))
 		case BPF_JMP | BPF_CALL: {
 			u8 *ip = image + addrs[i - 1];
 
+			if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
+			    imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
+				/* Save RDX because RDTSC will use EDX:EAX to return u64 */
+				emit_mov_reg(&prog, true, AUX_REG, BPF_REG_3);
+				if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
+					EMIT_LFENCE();
+				EMIT2(0x0F, 0x31);
+
+				/* shl RDX, 32 */
+				maybe_emit_1mod(&prog, BPF_REG_3, true);
+				EMIT3(0xC1, add_1reg(0xE0, BPF_REG_3), 32);
+				/* or RAX, RDX */
+				maybe_emit_mod(&prog, BPF_REG_0, BPF_REG_3, true);
+				EMIT2(0x09, add_2reg(0xC0, BPF_REG_0, BPF_REG_3));
+				/* restore RDX from R11 */
+				emit_mov_reg(&prog, true, BPF_REG_3, AUX_REG);
+
+				break;
+			}
+
 			func = (u8 *) __bpf_call_base + imm32;
 			if (tail_call_reachable) {
 				LOAD_TAIL_CALL_CNT_PTR(bpf_prog->aux->stack_depth);
@@ -3652,3 +3672,11 @@  u64 bpf_arch_uaddress_limit(void)
 {
 	return 0;
 }
+
+/* x86-64 JIT can inline kfunc */
+bool bpf_jit_inlines_kfunc_call(s32 imm)
+{
+	if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
+		return true;
+	return false;
+}
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
index de0f9e5f9f73..e6097a371b69 100644
--- a/arch/x86/net/bpf_jit_comp32.c
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -2094,6 +2094,13 @@  static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
 			if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
 				int err;
 
+				if (imm32 == BPF_CALL_IMM(bpf_get_cpu_cycles)) {
+					if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
+						EMIT3(0x0F, 0xAE, 0xE8);
+					EMIT2(0x0F, 0x31);
+					break;
+				}
+
 				err = emit_kfunc_call(bpf_prog,
 						      image + addrs[i],
 						      insn, &prog);
@@ -2621,3 +2628,10 @@  bool bpf_jit_supports_kfunc_call(void)
 {
 	return true;
 }
+
+bool bpf_jit_inlines_kfunc_call(s32 imm)
+{
+	if (imm == BPF_CALL_IMM(bpf_get_cpu_cycles))
+		return true;
+	return false;
+}
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1b84613b10ac..fed5f36d387a 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3328,6 +3328,11 @@  void bpf_user_rnd_init_once(void);
 u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 u64 bpf_get_raw_cpu_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
+/* Inlined kfuncs */
+#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
+u64 bpf_get_cpu_cycles(void);
+#endif
+
 #if defined(CONFIG_NET)
 bool bpf_sock_common_is_valid_access(int off, int size,
 				     enum bpf_access_type type,
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 7d7578a8eac1..8bdd5e6b2a65 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1111,6 +1111,7 @@  struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
 void bpf_jit_compile(struct bpf_prog *prog);
 bool bpf_jit_needs_zext(void);
 bool bpf_jit_inlines_helper_call(s32 imm);
+bool bpf_jit_inlines_kfunc_call(s32 imm);
 bool bpf_jit_supports_subprog_tailcalls(void);
 bool bpf_jit_supports_percpu_insn(void);
 bool bpf_jit_supports_kfunc_call(void);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 233ea78f8f1b..ab6a2452ade0 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2965,6 +2965,17 @@  bool __weak bpf_jit_inlines_helper_call(s32 imm)
 	return false;
 }
 
+/* Return true if the JIT inlines the call to the kfunc corresponding to
+ * the imm.
+ *
+ * The verifier will not patch the insn->imm for the call to the helper if
+ * this returns true.
+ */
+bool __weak bpf_jit_inlines_kfunc_call(s32 imm)
+{
+	return false;
+}
+
 /* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */
 bool __weak bpf_jit_supports_subprog_tailcalls(void)
 {
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 395221e53832..5c6c0383ebf4 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -23,6 +23,9 @@ 
 #include <linux/btf_ids.h>
 #include <linux/bpf_mem_alloc.h>
 #include <linux/kasan.h>
+#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
+#include <vdso/datapage.h>
+#endif
 
 #include "../../lib/kstrtox.h"
 
@@ -3023,6 +3026,13 @@  __bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user
 	return ret + 1;
 }
 
+#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
+__bpf_kfunc u64 bpf_get_cpu_cycles(void)
+{
+	return __arch_get_hw_counter(1, NULL);
+}
+#endif
+
 __bpf_kfunc_end_defs();
 
 BTF_KFUNCS_START(generic_btf_ids)
@@ -3115,6 +3125,9 @@  BTF_ID_FLAGS(func, bpf_get_kmem_cache)
 BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE)
 BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
 BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
+#if IS_ENABLED(CONFIG_GENERIC_GETTIMEOFDAY)
+BTF_ID_FLAGS(func, bpf_get_cpu_cycles, KF_FASTCALL)
+#endif
 BTF_KFUNCS_END(common_btf_ids)
 
 static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7958d6ff6b73..b5220d996231 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -16273,6 +16273,24 @@  static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm)
 	}
 }
 
+/* True if fixup_kfunc_call() replaces calls to kfunc number 'imm',
+ * replacement patch is presumed to follow bpf_fastcall contract
+ * (see mark_fastcall_pattern_for_call() below).
+ */
+static bool verifier_inlines_kfunc_call(struct bpf_verifier_env *env, s32 imm)
+{
+	const struct bpf_kfunc_desc *desc = find_kfunc_desc(env->prog, imm, 0);
+
+	if (!env->prog->jit_requested)
+		return false;
+
+	if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
+	    desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast])
+		return true;
+
+	return false;
+}
+
 /* Same as helper_fastcall_clobber_mask() but for kfuncs, see comment above */
 static u32 kfunc_fastcall_clobber_mask(struct bpf_kfunc_call_arg_meta *meta)
 {
@@ -16400,7 +16418,10 @@  static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env,
 			return;
 
 		clobbered_regs_mask = kfunc_fastcall_clobber_mask(&meta);
-		can_be_inlined = is_fastcall_kfunc_call(&meta);
+		can_be_inlined = is_fastcall_kfunc_call(&meta) &&
+				 (verifier_inlines_kfunc_call(env, call->imm) ||
+				 (meta.btf == btf_vmlinux &&
+				  bpf_jit_inlines_kfunc_call(call->imm)));
 	}
 
 	if (clobbered_regs_mask == ALL_CALLER_SAVED_REGS)
@@ -20402,6 +20423,7 @@  static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 			    struct bpf_insn *insn_buf, int insn_idx, int *cnt)
 {
 	const struct bpf_kfunc_desc *desc;
+	s32 imm = insn->imm;
 
 	if (!insn->imm) {
 		verbose(env, "invalid kernel function call not eliminated in verifier pass\n");
@@ -20488,6 +20510,12 @@  static int fixup_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 						node_offset_reg, insn, insn_buf, cnt);
 	} else if (desc->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx] ||
 		   desc->func_id == special_kfunc_list[KF_bpf_rdonly_cast]) {
+		if (!verifier_inlines_kfunc_call(env, imm)) {
+			verbose(env, "verifier internal error: kfunc id %d is not defined in checker\n",
+				desc->func_id);
+			return -EFAULT;
+		}
+
 		insn_buf[0] = BPF_MOV64_REG(BPF_REG_0, BPF_REG_1);
 		*cnt = 1;
 	} else if (is_bpf_wq_set_callback_impl_kfunc(desc->func_id)) {