
[bpf-next,4/5] selftests/bpf: Add benchmark for bpf_csum_diff() helper

Message ID 20241021122112.101513-5-puranjay@kernel.org (mailing list archive)
State Superseded
Series Optimize bpf_csum_diff() and homogenize for all archs

Commit Message

Puranjay Mohan Oct. 21, 2024, 12:21 p.m. UTC
Add a microbenchmark for bpf_csum_diff() helper. This benchmark works by
filling a 4KB buffer with random data and calculating the internet
checksum on different parts of this buffer using bpf_csum_diff().
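
The core of the BPF program added by this patch is a single
bpf_csum_diff() call over the first buff_len bytes of a read-only 4KB
buffer (reproduced from progs/csum_diff_bench.c below):

SEC("tc")
int compute_checksum(void *ctx)
{
	result = bpf_csum_diff(0, 0, (void *)buff, buff_len, 0);
	__sync_add_and_fetch(&hits, 1);
	return 0;
}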

Example run using ./benchs/run_bench_csum_diff.sh on x86_64:

[bpf]$ ./benchs/run_bench_csum_diff.sh
4                    2.296 ± 0.066M/s (drops 0.000 ± 0.000M/s)
8                    2.320 ± 0.003M/s (drops 0.000 ± 0.000M/s)
16                   2.315 ± 0.001M/s (drops 0.000 ± 0.000M/s)
20                   2.318 ± 0.001M/s (drops 0.000 ± 0.000M/s)
32                   2.308 ± 0.003M/s (drops 0.000 ± 0.000M/s)
40                   2.300 ± 0.029M/s (drops 0.000 ± 0.000M/s)
64                   2.286 ± 0.001M/s (drops 0.000 ± 0.000M/s)
128                  2.250 ± 0.001M/s (drops 0.000 ± 0.000M/s)
256                  2.173 ± 0.001M/s (drops 0.000 ± 0.000M/s)
512                  2.023 ± 0.055M/s (drops 0.000 ± 0.000M/s)
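
The first column is the buffer length in bytes, passed via --buff-len.
An individual size can also be run directly with something like the
following (the run script only wraps this with the common
warmup/duration flags):

  ./bench --buff-len=512 csum-diff-helper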

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
 tools/testing/selftests/bpf/Makefile          |   2 +
 tools/testing/selftests/bpf/bench.c           |   4 +
 .../selftests/bpf/benchs/bench_csum_diff.c    | 164 ++++++++++++++++++
 .../bpf/benchs/run_bench_csum_diff.sh         |  10 ++
 .../selftests/bpf/progs/csum_diff_bench.c     |  25 +++
 5 files changed, 205 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/benchs/bench_csum_diff.c
 create mode 100755 tools/testing/selftests/bpf/benchs/run_bench_csum_diff.sh
 create mode 100644 tools/testing/selftests/bpf/progs/csum_diff_bench.c

Comments

Daniel Borkmann Oct. 21, 2024, 1:43 p.m. UTC | #1
On 10/21/24 2:21 PM, Puranjay Mohan wrote:
> Add a microbenchmark for bpf_csum_diff() helper. This benchmark works by
> filling a 4KB buffer with random data and calculating the internet
> checksum on different parts of this buffer using bpf_csum_diff().
> 
> Example run using ./benchs/run_bench_csum_diff.sh on x86_64:
> 
> [bpf]$ ./benchs/run_bench_csum_diff.sh
> 4                    2.296 ± 0.066M/s (drops 0.000 ± 0.000M/s)
> 8                    2.320 ± 0.003M/s (drops 0.000 ± 0.000M/s)
> 16                   2.315 ± 0.001M/s (drops 0.000 ± 0.000M/s)
> 20                   2.318 ± 0.001M/s (drops 0.000 ± 0.000M/s)
> 32                   2.308 ± 0.003M/s (drops 0.000 ± 0.000M/s)
> 40                   2.300 ± 0.029M/s (drops 0.000 ± 0.000M/s)
> 64                   2.286 ± 0.001M/s (drops 0.000 ± 0.000M/s)
> 128                  2.250 ± 0.001M/s (drops 0.000 ± 0.000M/s)
> 256                  2.173 ± 0.001M/s (drops 0.000 ± 0.000M/s)
> 512                  2.023 ± 0.055M/s (drops 0.000 ± 0.000M/s)
> 
> Signed-off-by: Puranjay Mohan <puranjay@kernel.org>

Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Andrii Nakryiko Oct. 21, 2024, 11:28 p.m. UTC | #2
On Mon, Oct 21, 2024 at 5:22 AM Puranjay Mohan <puranjay@kernel.org> wrote:
>
> Add a microbenchmark for bpf_csum_diff() helper. This benchmark works by
> filling a 4KB buffer with random data and calculating the internet
> checksum on different parts of this buffer using bpf_csum_diff().
>
> Example run using ./benchs/run_bench_csum_diff.sh on x86_64:
>
> [bpf]$ ./benchs/run_bench_csum_diff.sh
> 4                    2.296 ± 0.066M/s (drops 0.000 ± 0.000M/s)
> 8                    2.320 ± 0.003M/s (drops 0.000 ± 0.000M/s)
> 16                   2.315 ± 0.001M/s (drops 0.000 ± 0.000M/s)
> 20                   2.318 ± 0.001M/s (drops 0.000 ± 0.000M/s)
> 32                   2.308 ± 0.003M/s (drops 0.000 ± 0.000M/s)
> 40                   2.300 ± 0.029M/s (drops 0.000 ± 0.000M/s)
> 64                   2.286 ± 0.001M/s (drops 0.000 ± 0.000M/s)
> 128                  2.250 ± 0.001M/s (drops 0.000 ± 0.000M/s)
> 256                  2.173 ± 0.001M/s (drops 0.000 ± 0.000M/s)
> 512                  2.023 ± 0.055M/s (drops 0.000 ± 0.000M/s)

you are not benchmarking bpf_csum_diff(), you are benchmarking how
often you can call bpf_prog_test_run(). Add some batching on the BPF
side, these numbers tell you that there is no difference between
calculating checksum for 4 bytes and for 512, that didn't seem strange
to you?

pw-bot: cr

>
> Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
> ---
>  tools/testing/selftests/bpf/Makefile          |   2 +
>  tools/testing/selftests/bpf/bench.c           |   4 +
>  .../selftests/bpf/benchs/bench_csum_diff.c    | 164 ++++++++++++++++++
>  .../bpf/benchs/run_bench_csum_diff.sh         |  10 ++
>  .../selftests/bpf/progs/csum_diff_bench.c     |  25 +++
>  5 files changed, 205 insertions(+)
>  create mode 100644 tools/testing/selftests/bpf/benchs/bench_csum_diff.c
>  create mode 100755 tools/testing/selftests/bpf/benchs/run_bench_csum_diff.sh
>  create mode 100644 tools/testing/selftests/bpf/progs/csum_diff_bench.c
>

[...]

> +
> +static void csum_diff_setup(void)
> +{
> +       int err;
> +       char *buff;
> +       size_t i, sz;
> +
> +       sz = sizeof(ctx.skel->rodata->buff);
> +
> +       setup_libbpf();
> +
> +       ctx.skel = csum_diff_bench__open();
> +       if (!ctx.skel) {
> +               fprintf(stderr, "failed to open skeleton\n");
> +               exit(1);
> +       }
> +
> +       srandom(time(NULL));
> +       buff = ctx.skel->rodata->buff;
> +
> +       /*
> +        * Set first 8 bytes of buffer to 0xdeadbeefdeadbeef, this is later used to verify the
> +        * correctness of the helper by comparing the checksum result for 0xdeadbeefdeadbeef that
> +        * should be 0x3b3b
> +        */
> +
> +       *(u64 *)buff = 0xdeadbeefdeadbeef;
> +
> +       for (i = 8; i < sz; i++)
> +               buff[i] = '1' + random() % 9;

so, you only generate 9 different values for bytes, why? Why not full
byte range?

> +
> +       ctx.skel->rodata->buff_len = args.buff_len;
> +
> +       err = csum_diff_bench__load(ctx.skel);
> +       if (err) {
> +               fprintf(stderr, "failed to load skeleton\n");
> +               csum_diff_bench__destroy(ctx.skel);
> +               exit(1);
> +       }
> +}
> +

[...]
Puranjay Mohan Oct. 22, 2024, 10:21 a.m. UTC | #3
Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:

> On Mon, Oct 21, 2024 at 5:22 AM Puranjay Mohan <puranjay@kernel.org> wrote:
>>
>> Add a microbenchmark for bpf_csum_diff() helper. This benchmark works by
>> filling a 4KB buffer with random data and calculating the internet
>> checksum on different parts of this buffer using bpf_csum_diff().
>>
>> Example run using ./benchs/run_bench_csum_diff.sh on x86_64:
>>
>> [bpf]$ ./benchs/run_bench_csum_diff.sh
>> 4                    2.296 ± 0.066M/s (drops 0.000 ± 0.000M/s)
>> 8                    2.320 ± 0.003M/s (drops 0.000 ± 0.000M/s)
>> 16                   2.315 ± 0.001M/s (drops 0.000 ± 0.000M/s)
>> 20                   2.318 ± 0.001M/s (drops 0.000 ± 0.000M/s)
>> 32                   2.308 ± 0.003M/s (drops 0.000 ± 0.000M/s)
>> 40                   2.300 ± 0.029M/s (drops 0.000 ± 0.000M/s)
>> 64                   2.286 ± 0.001M/s (drops 0.000 ± 0.000M/s)
>> 128                  2.250 ± 0.001M/s (drops 0.000 ± 0.000M/s)
>> 256                  2.173 ± 0.001M/s (drops 0.000 ± 0.000M/s)
>> 512                  2.023 ± 0.055M/s (drops 0.000 ± 0.000M/s)
>
> you are not benchmarking bpf_csum_diff(), you are benchmarking how
> often you can call bpf_prog_test_run(). Add some batching on the BPF
> side, these numbers tell you that there is no difference between
> calculating checksum for 4 bytes and for 512, that didn't seem strange
> to you?

This didn't seem strange to me because if you see the tables I added to
the cover letter, there is a clear improvement after optimizing the
helper and arm64 even shows a linear drop going from 4 bytes to 512
bytes, even after the optimization.

On x86 after the improvement, 4 bytes and 512 bytes show similar numbers
but there is still a small drop that can be seen going from 4 to 512
bytes.

My thought was that because the bpf_csum_diff() calls csum_partial() on
x86 which is already optimised, most of the overhead was due to copying
the buffer which is now removed.

I guess I can amplify the difference between 4B and 512B by calling
bpf_csum_diff() multiple times in a loop, or by calculating the csum by
dividing the buffer into more parts (currently the BPF code divides it
into 2 parts only).

>>
>> Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
>> ---
>>  tools/testing/selftests/bpf/Makefile          |   2 +
>>  tools/testing/selftests/bpf/bench.c           |   4 +
>>  .../selftests/bpf/benchs/bench_csum_diff.c    | 164 ++++++++++++++++++
>>  .../bpf/benchs/run_bench_csum_diff.sh         |  10 ++
>>  .../selftests/bpf/progs/csum_diff_bench.c     |  25 +++
>>  5 files changed, 205 insertions(+)
>>  create mode 100644 tools/testing/selftests/bpf/benchs/bench_csum_diff.c
>>  create mode 100755 tools/testing/selftests/bpf/benchs/run_bench_csum_diff.sh
>>  create mode 100644 tools/testing/selftests/bpf/progs/csum_diff_bench.c
>>
>
> [...]
>
>> +
>> +static void csum_diff_setup(void)
>> +{
>> +       int err;
>> +       char *buff;
>> +       size_t i, sz;
>> +
>> +       sz = sizeof(ctx.skel->rodata->buff);
>> +
>> +       setup_libbpf();
>> +
>> +       ctx.skel = csum_diff_bench__open();
>> +       if (!ctx.skel) {
>> +               fprintf(stderr, "failed to open skeleton\n");
>> +               exit(1);
>> +       }
>> +
>> +       srandom(time(NULL));
>> +       buff = ctx.skel->rodata->buff;
>> +
>> +       /*
>> +        * Set first 8 bytes of buffer to 0xdeadbeefdeadbeef, this is later used to verify the
>> +        * correctness of the helper by comparing the checksum result for 0xdeadbeefdeadbeef that
>> +        * should be 0x3b3b
>> +        */
>> +
>> +       *(u64 *)buff = 0xdeadbeefdeadbeef;
>> +
>> +       for (i = 8; i < sz; i++)
>> +               buff[i] = '1' + random() % 9;
>
> so, you only generate 9 different values for bytes, why? Why not full
> byte range?

Thanks for catching this, there is no reason for this to be [1,10] I
will use the full byte range in the next version.
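
Probably something along these lines for the fill loop (just a sketch
of what v2 could do):

	for (i = 8; i < sz; i++)
		buff[i] = random() % 256;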

Thanks,
Puranjay
Andrii Nakryiko Oct. 22, 2024, 5:47 p.m. UTC | #4
On Tue, Oct 22, 2024 at 3:21 AM Puranjay Mohan <puranjay@kernel.org> wrote:
>
> Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
>
> > On Mon, Oct 21, 2024 at 5:22 AM Puranjay Mohan <puranjay@kernel.org> wrote:
> >>
> >> Add a microbenchmark for bpf_csum_diff() helper. This benchmark works by
> >> filling a 4KB buffer with random data and calculating the internet
> >> checksum on different parts of this buffer using bpf_csum_diff().
> >>
> >> Example run using ./benchs/run_bench_csum_diff.sh on x86_64:
> >>
> >> [bpf]$ ./benchs/run_bench_csum_diff.sh
> >> 4                    2.296 ± 0.066M/s (drops 0.000 ± 0.000M/s)
> >> 8                    2.320 ± 0.003M/s (drops 0.000 ± 0.000M/s)
> >> 16                   2.315 ± 0.001M/s (drops 0.000 ± 0.000M/s)
> >> 20                   2.318 ± 0.001M/s (drops 0.000 ± 0.000M/s)
> >> 32                   2.308 ± 0.003M/s (drops 0.000 ± 0.000M/s)
> >> 40                   2.300 ± 0.029M/s (drops 0.000 ± 0.000M/s)
> >> 64                   2.286 ± 0.001M/s (drops 0.000 ± 0.000M/s)
> >> 128                  2.250 ± 0.001M/s (drops 0.000 ± 0.000M/s)
> >> 256                  2.173 ± 0.001M/s (drops 0.000 ± 0.000M/s)
> >> 512                  2.023 ± 0.055M/s (drops 0.000 ± 0.000M/s)
> >
> > you are not benchmarking bpf_csum_diff(), you are benchmarking how
> > often you can call bpf_prog_test_run(). Add some batching on the BPF
> > side, these numbers tell you that there is no difference between
> > calculating checksum for 4 bytes and for 512, that didn't seem strange
> > to you?
>
> This didn't seem strange to me because if you see the tables I added to
> the cover letter, there is a clear improvement after optimizing the
> helper and arm64 even shows a linear drop going from 4 bytes to 512
> bytes, even after the optimization.
>

Regardless of optimization, it's strange that throughput barely
differs when you vary the amount of work by more than 100x. This
wouldn't be strange if this checksum calculation was some sort of
cryptographic hash, where it's intentional to have the same timing
regardless of amount of work, or something along those lines. But I
don't think that's the case here.

But as it is right now, this benchmark is benchmarking
bpf_prog_test_run(), as I mentioned, which seems to be bottlenecking
at about 2mln/s throughput for your machine. bpf_csum_diff()'s
overhead is trivial compared to bpf_prog_test_run() overhead and
syscall/context switch overhead.

We shouldn't add the benchmark that doesn't benchmark the right thing.
So just add a bpf_for(i, 0, 100) loop doing bpf_csum_diff(), and then
do atomic increment *after* the loop (to minimize atomics overhead).
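
A rough sketch of what that batched program could look like (assuming
bpf_for() from the selftests' bpf_experimental.h is available;
identifiers reused from the posted csum_diff_bench.c):

SEC("tc")
int compute_checksum(void *ctx)
{
	int i;

	/* batch the helper calls so bpf_csum_diff(), not
	 * bpf_prog_test_run(), dominates the measurement
	 */
	bpf_for(i, 0, 100)
		result = bpf_csum_diff(0, 0, (void *)buff, buff_len, 0);

	/* one atomic update per batch, after the loop */
	__sync_add_and_fetch(&hits, 100);
	return 0;
}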

> On x86 after the improvement, 4 bytes and 512 bytes show similar numbers
> but there is still a small drop that can be seen going from 4 to 512
> bytes.
>
> My thought was that because the bpf_csum_diff() calls csum_partial() on
> x86 which is already optimised, most of the overhead was due to copying
> the buffer which is now removed.
>
> I guess I can amplify the difference between 4B and 512B by calling
> bpf_csum_diff() multiple times in a loop, or by calculating the csum by
> dividing the buffer into more parts (currently the BPF code divides it
> into 2 parts only).
>
> >>
> >> Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
> >> ---
> >>  tools/testing/selftests/bpf/Makefile          |   2 +
> >>  tools/testing/selftests/bpf/bench.c           |   4 +
> >>  .../selftests/bpf/benchs/bench_csum_diff.c    | 164 ++++++++++++++++++
> >>  .../bpf/benchs/run_bench_csum_diff.sh         |  10 ++
> >>  .../selftests/bpf/progs/csum_diff_bench.c     |  25 +++
> >>  5 files changed, 205 insertions(+)
> >>  create mode 100644 tools/testing/selftests/bpf/benchs/bench_csum_diff.c
> >>  create mode 100755 tools/testing/selftests/bpf/benchs/run_bench_csum_diff.sh
> >>  create mode 100644 tools/testing/selftests/bpf/progs/csum_diff_bench.c
> >>
> >

[...]
Puranjay Mohan Oct. 22, 2024, 5:58 p.m. UTC | #5
Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:

> On Tue, Oct 22, 2024 at 3:21 AM Puranjay Mohan <puranjay@kernel.org> wrote:
>>
>> Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
>>
>> > On Mon, Oct 21, 2024 at 5:22 AM Puranjay Mohan <puranjay@kernel.org> wrote:
>> >>
>> >> Add a microbenchmark for bpf_csum_diff() helper. This benchmark works by
>> >> filling a 4KB buffer with random data and calculating the internet
>> >> checksum on different parts of this buffer using bpf_csum_diff().
>> >>
>> >> Example run using ./benchs/run_bench_csum_diff.sh on x86_64:
>> >>
>> >> [bpf]$ ./benchs/run_bench_csum_diff.sh
>> >> 4                    2.296 ± 0.066M/s (drops 0.000 ± 0.000M/s)
>> >> 8                    2.320 ± 0.003M/s (drops 0.000 ± 0.000M/s)
>> >> 16                   2.315 ± 0.001M/s (drops 0.000 ± 0.000M/s)
>> >> 20                   2.318 ± 0.001M/s (drops 0.000 ± 0.000M/s)
>> >> 32                   2.308 ± 0.003M/s (drops 0.000 ± 0.000M/s)
>> >> 40                   2.300 ± 0.029M/s (drops 0.000 ± 0.000M/s)
>> >> 64                   2.286 ± 0.001M/s (drops 0.000 ± 0.000M/s)
>> >> 128                  2.250 ± 0.001M/s (drops 0.000 ± 0.000M/s)
>> >> 256                  2.173 ± 0.001M/s (drops 0.000 ± 0.000M/s)
>> >> 512                  2.023 ± 0.055M/s (drops 0.000 ± 0.000M/s)
>> >
>> > you are not benchmarking bpf_csum_diff(), you are benchmarking how
>> > often you can call bpf_prog_test_run(). Add some batching on the BPF
>> > side, these numbers tell you that there is no difference between
>> > calculating checksum for 4 bytes and for 512, that didn't seem strange
>> > to you?
>>
>> This didn't seem strange to me because if you see the tables I added to
>> the cover letter, there is a clear improvement after optimizing the
>> helper and arm64 even shows a linear drop going from 4 bytes to 512
>> bytes, even after the optimization.
>>
>
> Regardless of optimization, it's strange that throughput barely
> differs when you vary the amount of work by more than 100x. This
> wouldn't be strange if this checksum calculation was some sort of
> cryptographic hash, where it's intentional to have the same timing
> regardless of amount of work, or something along those lines. But I
> don't think that's the case here.
>
> But as it is right now, this benchmark is benchmarking
> bpf_prog_test_run(), as I mentioned, which seems to be bottlenecking
> at about 2mln/s throughput for your machine. bpf_csum_diff()'s
> overhead is trivial compared to bpf_prog_test_run() overhead and
> syscall/context switch overhead.
>
> We shouldn't add the benchmark that doesn't benchmark the right thing.
> So just add a bpf_for(i, 0, 100) loop doing bpf_csum_diff(), and then
> do atomic increment *after* the loop (to minimize atomics overhead).

Thanks, now I understand what you meant. Will add the bpf_for() in the
next version.

Thanks,
Puranjay
Puranjay Mohan Oct. 23, 2024, 3:37 p.m. UTC | #6
Puranjay Mohan <puranjay@kernel.org> writes:

> Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
>
>> On Tue, Oct 22, 2024 at 3:21 AM Puranjay Mohan <puranjay@kernel.org> wrote:
>>>
>>> Andrii Nakryiko <andrii.nakryiko@gmail.com> writes:
>>>
>>> > On Mon, Oct 21, 2024 at 5:22 AM Puranjay Mohan <puranjay@kernel.org> wrote:
>>> >>
>>> >> Add a microbenchmark for bpf_csum_diff() helper. This benchmark works by
>>> >> filling a 4KB buffer with random data and calculating the internet
>>> >> checksum on different parts of this buffer using bpf_csum_diff().
>>> >>
>>> >> Example run using ./benchs/run_bench_csum_diff.sh on x86_64:
>>> >>
>>> >> [bpf]$ ./benchs/run_bench_csum_diff.sh
>>> >> 4                    2.296 ± 0.066M/s (drops 0.000 ± 0.000M/s)
>>> >> 8                    2.320 ± 0.003M/s (drops 0.000 ± 0.000M/s)
>>> >> 16                   2.315 ± 0.001M/s (drops 0.000 ± 0.000M/s)
>>> >> 20                   2.318 ± 0.001M/s (drops 0.000 ± 0.000M/s)
>>> >> 32                   2.308 ± 0.003M/s (drops 0.000 ± 0.000M/s)
>>> >> 40                   2.300 ± 0.029M/s (drops 0.000 ± 0.000M/s)
>>> >> 64                   2.286 ± 0.001M/s (drops 0.000 ± 0.000M/s)
>>> >> 128                  2.250 ± 0.001M/s (drops 0.000 ± 0.000M/s)
>>> >> 256                  2.173 ± 0.001M/s (drops 0.000 ± 0.000M/s)
>>> >> 512                  2.023 ± 0.055M/s (drops 0.000 ± 0.000M/s)
>>> >
>>> > you are not benchmarking bpf_csum_diff(), you are benchmarking how
>>> > often you can call bpf_prog_test_run(). Add some batching on the BPF
>>> > side, these numbers tell you that there is no difference between
>>> > calculating checksum for 4 bytes and for 512, that didn't seem strange
>>> > to you?
>>>
>>> This didn't seem strange to me because if you see the tables I added to
>>> the cover letter, there is a clear improvement after optimizing the
>>> helper and arm64 even shows a linear drop going from 4 bytes to 512
>>> bytes, even after the optimization.
>>>
>>
>> Regardless of optimization, it's strange that throughput barely
>> differs when you vary the amount of work by more than 100x. This
>> wouldn't be strange if this checksum calculation was some sort of
>> cryptographic hash, where it's intentional to have the same timing
>> regardless of amount of work, or something along those lines. But I
>> don't think that's the case here.
>>
>> But as it is right now, this benchmark is benchmarking
>> bpf_prog_test_run(), as I mentioned, which seems to be bottlenecking
>> at about 2mln/s throughput for your machine. bpf_csum_diff()'s
>> overhead is trivial compared to bpf_prog_test_run() overhead and
>> syscall/context switch overhead.
>>
>> We shouldn't add the benchmark that doesn't benchmark the right thing.
>> So just add a bpf_for(i, 0, 100) loop doing bpf_csum_diff(), and then
>> do atomic increment *after* the loop (to minimize atomics overhead).
>
> Thanks, now I understand what you meant. Will add the bpf_for() in the
> next version.

I have decided to drop this patch as, even after adding bpf_for(), the
difference between 4B and 512B is not that much. So, benchmarking
bpf_csum_diff() using this triggering-based framework is not useful.

So, v2 will not have this patch but the cover letter will still have the
tables to show the difference before/after the optimization.

Thanks,
Puranjay

Patch

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 28a76baa854d3..a0d86dd453e16 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -809,6 +809,7 @@  $(OUTPUT)/bench_ringbufs.o: $(OUTPUT)/ringbuf_bench.skel.h \
 $(OUTPUT)/bench_bloom_filter_map.o: $(OUTPUT)/bloom_filter_bench.skel.h
 $(OUTPUT)/bench_bpf_loop.o: $(OUTPUT)/bpf_loop_bench.skel.h
 $(OUTPUT)/bench_strncmp.o: $(OUTPUT)/strncmp_bench.skel.h
+$(OUTPUT)/bench_csum_diff.o: $(OUTPUT)/csum_diff_bench.skel.h
 $(OUTPUT)/bench_bpf_hashmap_full_update.o: $(OUTPUT)/bpf_hashmap_full_update_bench.skel.h
 $(OUTPUT)/bench_local_storage.o: $(OUTPUT)/local_storage_bench.skel.h
 $(OUTPUT)/bench_local_storage_rcu_tasks_trace.o: $(OUTPUT)/local_storage_rcu_tasks_trace_bench.skel.h
@@ -829,6 +830,7 @@  $(OUTPUT)/bench: $(OUTPUT)/bench.o \
 		 $(OUTPUT)/bench_bloom_filter_map.o \
 		 $(OUTPUT)/bench_bpf_loop.o \
 		 $(OUTPUT)/bench_strncmp.o \
+		 $(OUTPUT)/bench_csum_diff.o \
 		 $(OUTPUT)/bench_bpf_hashmap_full_update.o \
 		 $(OUTPUT)/bench_local_storage.o \
 		 $(OUTPUT)/bench_local_storage_rcu_tasks_trace.o \
diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
index 1bd403a5ef7b3..29bd6f4498ebc 100644
--- a/tools/testing/selftests/bpf/bench.c
+++ b/tools/testing/selftests/bpf/bench.c
@@ -278,6 +278,7 @@  extern struct argp bench_bpf_loop_argp;
 extern struct argp bench_local_storage_argp;
 extern struct argp bench_local_storage_rcu_tasks_trace_argp;
 extern struct argp bench_strncmp_argp;
+extern struct argp bench_csum_diff_argp;
 extern struct argp bench_hashmap_lookup_argp;
 extern struct argp bench_local_storage_create_argp;
 extern struct argp bench_htab_mem_argp;
@@ -290,6 +291,7 @@  static const struct argp_child bench_parsers[] = {
 	{ &bench_bpf_loop_argp, 0, "bpf_loop helper benchmark", 0 },
 	{ &bench_local_storage_argp, 0, "local_storage benchmark", 0 },
 	{ &bench_strncmp_argp, 0, "bpf_strncmp helper benchmark", 0 },
+	{ &bench_csum_diff_argp, 0, "bpf_csum_diff helper benchmark", 0 },
 	{ &bench_local_storage_rcu_tasks_trace_argp, 0,
 		"local_storage RCU Tasks Trace slowdown benchmark", 0 },
 	{ &bench_hashmap_lookup_argp, 0, "Hashmap lookup benchmark", 0 },
@@ -539,6 +541,7 @@  extern const struct bench bench_hashmap_with_bloom;
 extern const struct bench bench_bpf_loop;
 extern const struct bench bench_strncmp_no_helper;
 extern const struct bench bench_strncmp_helper;
+extern const struct bench bench_csum_diff;
 extern const struct bench bench_bpf_hashmap_full_update;
 extern const struct bench bench_local_storage_cache_seq_get;
 extern const struct bench bench_local_storage_cache_interleaved_get;
@@ -599,6 +602,7 @@  static const struct bench *benchs[] = {
 	&bench_bpf_loop,
 	&bench_strncmp_no_helper,
 	&bench_strncmp_helper,
+	&bench_csum_diff,
 	&bench_bpf_hashmap_full_update,
 	&bench_local_storage_cache_seq_get,
 	&bench_local_storage_cache_interleaved_get,
diff --git a/tools/testing/selftests/bpf/benchs/bench_csum_diff.c b/tools/testing/selftests/bpf/benchs/bench_csum_diff.c
new file mode 100644
index 0000000000000..2c30c8b54d9bc
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/bench_csum_diff.c
@@ -0,0 +1,164 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright Amazon.com Inc. or its affiliates */
+#include <argp.h>
+#include "bench.h"
+#include "csum_diff_bench.skel.h"
+
+static struct csum_diff_ctx {
+	struct csum_diff_bench *skel;
+	int pfd;
+} ctx;
+
+static struct csum_diff_args {
+	u32 buff_len;
+} args = {
+	.buff_len = 32,
+};
+
+enum {
+	ARG_BUFF_LEN = 5000,
+};
+
+static const struct argp_option opts[] = {
+	{ "buff-len", ARG_BUFF_LEN, "BUFF_LEN", 0,
+	  "Set the length of the buffer" },
+	{},
+};
+
+static error_t csum_diff_parse_arg(int key, char *arg, struct argp_state *state)
+{
+	switch (key) {
+	case ARG_BUFF_LEN:
+		args.buff_len = strtoul(arg, NULL, 10);
+		if (!args.buff_len ||
+		    args.buff_len >= sizeof(ctx.skel->rodata->buff)) {
+			fprintf(stderr, "Invalid buff len (limit %zu)\n",
+				sizeof(ctx.skel->rodata->buff));
+			argp_usage(state);
+		}
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+
+	return 0;
+}
+
+const struct argp bench_csum_diff_argp = {
+	.options = opts,
+	.parser = csum_diff_parse_arg,
+};
+
+static void csum_diff_validate(void)
+{
+	if (env.consumer_cnt != 0) {
+		fprintf(stderr, "csum_diff benchmark doesn't support consumer!\n");
+		exit(1);
+	}
+}
+
+static void csum_diff_setup(void)
+{
+	int err;
+	char *buff;
+	size_t i, sz;
+
+	sz = sizeof(ctx.skel->rodata->buff);
+
+	setup_libbpf();
+
+	ctx.skel = csum_diff_bench__open();
+	if (!ctx.skel) {
+		fprintf(stderr, "failed to open skeleton\n");
+		exit(1);
+	}
+
+	srandom(time(NULL));
+	buff = ctx.skel->rodata->buff;
+
+	/*
+	 * Set first 8 bytes of buffer to 0xdeadbeefdeadbeef, this is later used to verify the
+	 * correctness of the helper by comparing the checksum result for 0xdeadbeefdeadbeef that
+	 * should be 0x3b3b
+	 */
+
+	*(u64 *)buff = 0xdeadbeefdeadbeef;
+
+	for (i = 8; i < sz; i++)
+		buff[i] = '1' + random() % 9;
+
+	ctx.skel->rodata->buff_len = args.buff_len;
+
+	err = csum_diff_bench__load(ctx.skel);
+	if (err) {
+		fprintf(stderr, "failed to load skeleton\n");
+		csum_diff_bench__destroy(ctx.skel);
+		exit(1);
+	}
+}
+
+static void csum_diff_helper_setup(void)
+{
+	u8 tmp_out[64 << 2] = {};
+	u8 tmp_in[64] = {};
+	int err, saved_errno;
+
+	LIBBPF_OPTS(bpf_test_run_opts, topts,
+		.data_in = tmp_in,
+		.data_size_in = sizeof(tmp_in),
+		.data_out = tmp_out,
+		.data_size_out = sizeof(tmp_out),
+		.repeat = 1,
+	);
+	csum_diff_setup();
+	ctx.pfd = bpf_program__fd(ctx.skel->progs.compute_checksum);
+
+	err = bpf_prog_test_run_opts(ctx.pfd, &topts);
+	saved_errno = errno;
+
+	if (err) {
+		fprintf(stderr, "failed to run setup prog: err %d, result %d, serror %d\n",
+			err, ctx.skel->bss->result, saved_errno);
+		csum_diff_bench__destroy(ctx.skel);
+		exit(1);
+	}
+
+	/* Sanity check for correctness of helper */
+	if (args.buff_len == 8 && ctx.skel->bss->result != 0x3b3b) {
+		fprintf(stderr, "csum_diff helper broken: buff: %lx, result: %x, expected: %x\n",
+			*(u64 *)ctx.skel->rodata->buff, ctx.skel->bss->result, 0x3b3b);
+	}
+}
+
+static void *csum_diff_producer(void *unused)
+{
+	u8 tmp_out[64 << 2] = {};
+	u8 tmp_in[64] = {};
+
+	LIBBPF_OPTS(bpf_test_run_opts, topts,
+		.data_in = tmp_in,
+		.data_size_in = sizeof(tmp_in),
+		.data_out = tmp_out,
+		.data_size_out = sizeof(tmp_out),
+		.repeat = 64,
+	);
+	while (true)
+		(void)bpf_prog_test_run_opts(ctx.pfd, &topts);
+	return NULL;
+}
+
+static void csum_diff_measure(struct bench_res *res)
+{
+	res->hits = atomic_swap(&ctx.skel->bss->hits, 0);
+}
+
+const struct bench bench_csum_diff = {
+	.name = "csum-diff-helper",
+	.argp = &bench_csum_diff_argp,
+	.validate = csum_diff_validate,
+	.setup = csum_diff_helper_setup,
+	.producer_thread = csum_diff_producer,
+	.measure = csum_diff_measure,
+	.report_progress = hits_drops_report_progress,
+	.report_final = hits_drops_report_final,
+};
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_csum_diff.sh b/tools/testing/selftests/bpf/benchs/run_bench_csum_diff.sh
new file mode 100755
index 0000000000000..c4e147fbf2f98
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/run_bench_csum_diff.sh
@@ -0,0 +1,10 @@ 
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source ./benchs/run_common.sh
+
+set -eufo pipefail
+
+for s in 4 8 16 20 32 40 64 128 256 512; do
+	summarize ${s} "$($RUN_BENCH --buff-len=$s csum-diff-helper)"
+done
diff --git a/tools/testing/selftests/bpf/progs/csum_diff_bench.c b/tools/testing/selftests/bpf/progs/csum_diff_bench.c
new file mode 100644
index 0000000000000..85245edd6f9dc
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/csum_diff_bench.c
@@ -0,0 +1,25 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright Amazon.com Inc. or its affiliates */
+#include <linux/types.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#define BUFF_SZ 4096
+
+/* Will be updated by benchmark before program loading */
+const char buff[BUFF_SZ];
+const volatile unsigned int buff_len = 4;
+
+long hits = 0;
+short result;
+
+char _license[] SEC("license") = "GPL";
+
+SEC("tc")
+int compute_checksum(void *ctx)
+{
+	result = bpf_csum_diff(0, 0, (void *)buff, buff_len, 0);
+	__sync_add_and_fetch(&hits, 1);
+	return 0;
+}