
[1/2] bpf: Introduce cpu affinity for sockmap

Message ID 20241101023832.32404-1-mrpre@163.com (mailing list archive)
State Superseded
Delegated to: BPF
Series [1/2] bpf: Introduce cpu affinity for sockmap

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Guessed tree name to be net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 202 this patch: 202
netdev/build_tools success Errors and warnings before: 2 (+0) this patch: 2 (+0)
netdev/cc_maintainers warning 11 maintainers not CCed: horms@kernel.org daniel@iogearbox.net ast@kernel.org kpsingh@kernel.org sdf@fomichev.me eddyz87@gmail.com martin.lau@linux.dev song@kernel.org haoluo@google.com andrii@kernel.org jolsa@kernel.org
netdev/build_clang success Errors and warnings before: 235 this patch: 235
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 6932 this patch: 6932
netdev/checkpatch warning WARNING: From:/Signed-off-by: email name mismatch: 'From: mrpre <mrpre@163.com>' != 'Signed-off-by: Jiayuan Chen <mrpre@163.com>' WARNING: line length of 102 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: line length of 90 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns WARNING: line length of 94 exceeds 80 columns WARNING: line length of 96 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 6 this patch: 6
netdev/source_inline success Was 0 now: 0

Commit Message

Jiayuan Chen Nov. 1, 2024, 2:38 a.m. UTC
Why we need cpu affinity:
Mainstream data planes, like Nginx and HAProxy, utilize CPU affinity
by binding worker processes to specific CPUs. This avoids interference
between processes and shields them from unrelated workloads.

Sockmap, as an optimization to accelerate such proxy programs,
currently lacks the ability to specify CPU affinity. Its backlog
handling is based on a workqueue driven by 'schedule_delayed_work()',
which prefers to schedule work on the local CPU, i.e., the CPU that
handled the packet in softirq context.

For extremely high traffic with large numbers of packets,
'sk_psock_backlog' becomes a long-running loop on that CPU.

For multi-threaded programs with only one map, we expect different
sockets to run on different CPUs. It is important to note that this
feature is not a general performance optimization. Instead, it gives
users the ability to bind sockets to specific CPUs, allowing them to
improve overall system utilization based on their own environments.

Implementation:
1. When updating the sockmap, support passing a CPU parameter and
save it to the psock.
2. When scheduling the psock, determine which CPU to run on from the
psock's CPU information (see the sketch below).
3. For sockmaps without CPU affinity, keep the original logic by using
'schedule_delayed_work()'.
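
A minimal sketch of the resulting dispatch path, mirroring the helper
and call sites added by this patch:
'''
/* Return the CPU the psock's backlog work should run on: the CPU
 * saved at map-update time, or WORK_CPU_UNBOUND if none was set.
 */
static inline int sk_psock_strp_get_cpu(struct sk_psock *psock)
{
	if (psock->target_cpu != -1)
		return psock->target_cpu;
	return WORK_CPU_UNBOUND;
}

/* Callers then replace schedule_delayed_work(&psock->work, 0) with: */
schedule_delayed_work_on(sk_psock_strp_get_cpu(psock), &psock->work, 0);
'''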

Performance Testing:
'client <-> sockmap proxy <-> server'

Using 'iperf3' tests, with the iperf server bound to CPU5 and the iperf
client bound to CPU6, performance without using CPU affinity is
around 34 Gbits/s, and CPU usage is concentrated on CPU5 and CPU6.
'''
[  5] local 127.0.0.1 port 57144 connected to 127.0.0.1 port 10000
[ ID] Interval           Transfer     Bitrate
[  5]   0.00-1.00   sec  3.95 GBytes  33.9 Gbits/sec
[  5]   1.00-2.00   sec  3.95 GBytes  34.0 Gbits/sec
......
'''

With CPU affinity, the performance is close to that of a direct
connection (without any proxy).
'''
[  5] local 127.0.0.1 port 56518 connected to 127.0.0.1 port 10000
[ ID] Interval           Transfer     Bitrate
[  5]   0.00-1.00   sec  7.76 GBytes  66.6 Gbits/sec
[  5]   1.00-2.00   sec  7.76 GBytes  66.7 Gbits/sec
......
'''

Signed-off-by: Jiayuan Chen <mrpre@163.com>
---
 include/linux/bpf.h      |  3 ++-
 include/linux/skmsg.h    |  8 ++++++++
 include/uapi/linux/bpf.h |  4 ++++
 kernel/bpf/syscall.c     | 23 +++++++++++++++++------
 net/core/skmsg.c         | 11 +++++++----
 net/core/sock_map.c      | 12 +++++++-----
 6 files changed, 45 insertions(+), 16 deletions(-)

Comments

kernel test robot Nov. 1, 2024, 1:20 p.m. UTC | #1
Hi mrpre,

kernel test robot noticed the following build errors:

[auto build test ERROR on bpf-next/master]
[also build test ERROR on bpf/master net-next/main net/main linus/master v6.12-rc5 next-20241101]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/mrpre/bpf-implement-libbpf-sockmap-cpu-affinity/20241101-104144
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
patch link:    https://lore.kernel.org/r/20241101023832.32404-1-mrpre%40163.com
patch subject: [PATCH 1/2] bpf: Introduce cpu affinity for sockmap
config: i386-buildonly-randconfig-001-20241101 (https://download.01.org/0day-ci/archive/20241101/202411012119.5fFOfrH9-lkp@intel.com/config)
compiler: clang version 19.1.3 (https://github.com/llvm/llvm-project ab51eccf88f5321e7c60591c5546b254b6afab99)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241101/202411012119.5fFOfrH9-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202411012119.5fFOfrH9-lkp@intel.com/

All errors (new ones prefixed by >>):

   In file included from kernel/bpf/syscall.c:4:
   In file included from include/linux/bpf.h:21:
   In file included from include/linux/kallsyms.h:13:
   In file included from include/linux/mm.h:2213:
   include/linux/vmstat.h:518:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
     518 |         return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
         |                               ~~~~~~~~~~~ ^ ~~~
>> kernel/bpf/syscall.c:254:59: error: too many arguments to function call, expected 4, have 5
     254 |                 return sock_map_update_elem_sys(map, key, value, flags, target_cpu);
         |                        ~~~~~~~~~~~~~~~~~~~~~~~~                         ^~~~~~~~~~
   include/linux/bpf.h:3175:19: note: 'sock_map_update_elem_sys' declared here
    3175 | static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value,
         |                   ^                        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    3176 |                                            u64 flags)
         |                                            ~~~~~~~~~
   kernel/bpf/syscall.c:5961:30: warning: bitwise operation between different enumeration types ('enum bpf_arg_type' and 'enum bpf_type_flag') [-Wenum-enum-conversion]
    5961 |         .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
         |                           ~~~~~~~~~~~~~~ ^ ~~~~~~~~~~
   kernel/bpf/syscall.c:6011:41: warning: bitwise operation between different enumeration types ('enum bpf_arg_type' and 'enum bpf_type_flag') [-Wenum-enum-conversion]
    6011 |         .arg4_type      = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,
         |                           ~~~~~~~~~~~~~~~~~~~~~~~~~ ^ ~~~~~~~~~~
   3 warnings and 1 error generated.


vim +254 kernel/bpf/syscall.c

   240	
   241	static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
   242					void *key, void *value, __u64 flags, s32 target_cpu)
   243	{
   244		int err;
   245		/* Need to create a kthread, thus must support schedule */
   246		if (bpf_map_is_offloaded(map)) {
   247			return bpf_map_offload_update_elem(map, key, value, flags);
   248		} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
   249			   map->map_type == BPF_MAP_TYPE_ARENA ||
   250			   map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
   251			return map->ops->map_update_elem(map, key, value, flags);
   252		} else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
   253			   map->map_type == BPF_MAP_TYPE_SOCKMAP) {
 > 254			return sock_map_update_elem_sys(map, key, value, flags, target_cpu);
   255		} else if (IS_FD_PROG_ARRAY(map)) {
   256			return bpf_fd_array_map_update_elem(map, map_file, key, value,
   257							    flags);
   258		}
   259	
   260		bpf_disable_instrumentation();
   261		if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
   262		    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
   263			err = bpf_percpu_hash_update(map, key, value, flags);
   264		} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
   265			err = bpf_percpu_array_update(map, key, value, flags);
   266		} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
   267			err = bpf_percpu_cgroup_storage_update(map, key, value,
   268							       flags);
   269		} else if (IS_FD_ARRAY(map)) {
   270			err = bpf_fd_array_map_update_elem(map, map_file, key, value,
   271							   flags);
   272		} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
   273			err = bpf_fd_htab_map_update_elem(map, map_file, key, value,
   274							  flags);
   275		} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
   276			/* rcu_read_lock() is not needed */
   277			err = bpf_fd_reuseport_array_update_elem(map, key, value,
   278								 flags);
   279		} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
   280			   map->map_type == BPF_MAP_TYPE_STACK ||
   281			   map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
   282			err = map->ops->map_push_elem(map, value, flags);
   283		} else {
   284			err = bpf_obj_pin_uptrs(map->record, value);
   285			if (!err) {
   286				rcu_read_lock();
   287				err = map->ops->map_update_elem(map, key, value, flags);
   288				rcu_read_unlock();
   289				if (err)
   290					bpf_obj_unpin_uptrs(map->record, value);
   291			}
   292		}
   293		bpf_enable_instrumentation();
   294	
   295		return err;
   296	}
   297
kernel test robot Nov. 1, 2024, 2:02 p.m. UTC | #2
Hi mrpre,

kernel test robot noticed the following build errors:

[auto build test ERROR on bpf-next/master]
[also build test ERROR on bpf/master net-next/main net/main linus/master v6.12-rc5 next-20241101]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/mrpre/bpf-implement-libbpf-sockmap-cpu-affinity/20241101-104144
base:   https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
patch link:    https://lore.kernel.org/r/20241101023832.32404-1-mrpre%40163.com
patch subject: [PATCH 1/2] bpf: Introduce cpu affinity for sockmap
config: arc-randconfig-001-20241101 (https://download.01.org/0day-ci/archive/20241101/202411012135.447KNHZK-lkp@intel.com/config)
compiler: arc-elf-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241101/202411012135.447KNHZK-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202411012135.447KNHZK-lkp@intel.com/

All errors (new ones prefixed by >>):

   kernel/bpf/syscall.c: In function 'bpf_map_update_value':
>> kernel/bpf/syscall.c:254:24: error: too many arguments to function 'sock_map_update_elem_sys'
     254 |                 return sock_map_update_elem_sys(map, key, value, flags, target_cpu);
         |                        ^~~~~~~~~~~~~~~~~~~~~~~~
   In file included from kernel/bpf/syscall.c:4:
   include/linux/bpf.h:3175:19: note: declared here
    3175 | static inline int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value,
         |                   ^~~~~~~~~~~~~~~~~~~~~~~~

Kconfig warnings: (for reference only)
   WARNING: unmet direct dependencies detected for GET_FREE_REGION
   Depends on [n]: SPARSEMEM [=n]
   Selected by [m]:
   - RESOURCE_KUNIT_TEST [=m] && RUNTIME_TESTING_MENU [=y] && KUNIT [=m]


vim +/sock_map_update_elem_sys +254 kernel/bpf/syscall.c

   240	
   241	static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
   242					void *key, void *value, __u64 flags, s32 target_cpu)
   243	{
   244		int err;
   245		/* Need to create a kthread, thus must support schedule */
   246		if (bpf_map_is_offloaded(map)) {
   247			return bpf_map_offload_update_elem(map, key, value, flags);
   248		} else if (map->map_type == BPF_MAP_TYPE_CPUMAP ||
   249			   map->map_type == BPF_MAP_TYPE_ARENA ||
   250			   map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
   251			return map->ops->map_update_elem(map, key, value, flags);
   252		} else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
   253			   map->map_type == BPF_MAP_TYPE_SOCKMAP) {
 > 254			return sock_map_update_elem_sys(map, key, value, flags, target_cpu);
   255		} else if (IS_FD_PROG_ARRAY(map)) {
   256			return bpf_fd_array_map_update_elem(map, map_file, key, value,
   257							    flags);
   258		}
   259	
   260		bpf_disable_instrumentation();
   261		if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
   262		    map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
   263			err = bpf_percpu_hash_update(map, key, value, flags);
   264		} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
   265			err = bpf_percpu_array_update(map, key, value, flags);
   266		} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
   267			err = bpf_percpu_cgroup_storage_update(map, key, value,
   268							       flags);
   269		} else if (IS_FD_ARRAY(map)) {
   270			err = bpf_fd_array_map_update_elem(map, map_file, key, value,
   271							   flags);
   272		} else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
   273			err = bpf_fd_htab_map_update_elem(map, map_file, key, value,
   274							  flags);
   275		} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
   276			/* rcu_read_lock() is not needed */
   277			err = bpf_fd_reuseport_array_update_elem(map, key, value,
   278								 flags);
   279		} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
   280			   map->map_type == BPF_MAP_TYPE_STACK ||
   281			   map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
   282			err = map->ops->map_push_elem(map, value, flags);
   283		} else {
   284			err = bpf_obj_pin_uptrs(map->record, value);
   285			if (!err) {
   286				rcu_read_lock();
   287				err = map->ops->map_update_elem(map, key, value, flags);
   288				rcu_read_unlock();
   289				if (err)
   290					bpf_obj_unpin_uptrs(map->record, value);
   291			}
   292		}
   293		bpf_enable_instrumentation();
   294	
   295		return err;
   296	}
   297
Andrii Nakryiko Nov. 1, 2024, 7:25 p.m. UTC | #3
On Thu, Oct 31, 2024 at 7:40 PM mrpre <mrpre@163.com> wrote:
>
> Why we need cpu affinity:
> Mainstream data planes, like Nginx and HAProxy, utilize CPU affinity
> by binding user processes to specific CPUs. This avoids interference
> between processes and prevents impact from other processes.
>
> Sockmap, as an optimization to accelerate such proxy programs,
> currently lacks the ability to specify CPU affinity. The current
> implementation of sockmap handling backlog is based on workqueue,
> which operates by calling 'schedule_delayed_work()'. Its current
> implementation prefers to schedule on the local CPU, i.e., the CPU
> that handled the packet under softirq.
>
> For extremely high traffic with large numbers of packets,
> 'sk_psock_backlog' becomes a large loop.
>
> For multi-threaded programs with only one map, we expect different
> sockets to run on different CPUs. It is important to note that this
> feature is not a general performance optimization. Instead, it
> provides users with the ability to bind to a specific CPU, allowing
> them to enhance overall operating system utilization based on their
> own system environments.
>
> Implementation:
> 1.When updating the sockmap, support passing a CPU parameter and
> save it to the psock.
> 2.When scheduling psock, determine which CPU to run on using the
> psock's CPU information.
> 3. For those sockmaps without CPU affinity, keep the original logic by using
> 'schedule_delayed_work()'.
>
> Performance Testing:
> 'client <-> sockmap proxy <-> server'
>
> Using 'iperf3' tests, with the iperf server bound to CPU5 and the iperf
> client bound to CPU6, performance without using CPU affinity is
> around 34 Gbits/s, and CPU usage is concentrated on CPU5 and CPU6.
> '''
> [  5] local 127.0.0.1 port 57144 connected to 127.0.0.1 port 10000
> [ ID] Interval           Transfer     Bitrate
> [  5]   0.00-1.00   sec  3.95 GBytes  33.9 Gbits/sec
> [  5]   1.00-2.00   sec  3.95 GBytes  34.0 Gbits/sec
> ......
> '''
>
> With using CPU affinity, the performance is close to direct connection
> (without any proxy).
> '''
> [  5] local 127.0.0.1 port 56518 connected to 127.0.0.1 port 10000
> [ ID] Interval           Transfer     Bitrate
> [  5]   0.00-1.00   sec  7.76 GBytes  66.6 Gbits/sec
> [  5]   1.00-2.00   sec  7.76 GBytes  66.7 Gbits/sec
> ......
> '''
>
> Signed-off-by: Jiayuan Chen <mrpre@163.com>
> ---
>  include/linux/bpf.h      |  3 ++-
>  include/linux/skmsg.h    |  8 ++++++++
>  include/uapi/linux/bpf.h |  4 ++++
>  kernel/bpf/syscall.c     | 23 +++++++++++++++++------
>  net/core/skmsg.c         | 11 +++++++----
>  net/core/sock_map.c      | 12 +++++++-----
>  6 files changed, 45 insertions(+), 16 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index c3ba4d475174..a56028c389e7 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -3080,7 +3080,8 @@ int bpf_prog_test_run_syscall(struct bpf_prog *prog,
>
>  int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
>  int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
> -int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
> +int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags,
> +                            s32 target_cpu);
>  int sock_map_bpf_prog_query(const union bpf_attr *attr,
>                             union bpf_attr __user *uattr);
>  int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog);
> diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
> index d9b03e0746e7..919425a92adf 100644
> --- a/include/linux/skmsg.h
> +++ b/include/linux/skmsg.h
> @@ -117,6 +117,7 @@ struct sk_psock {
>         struct delayed_work             work;
>         struct sock                     *sk_pair;
>         struct rcu_work                 rwork;
> +       s32                             target_cpu;
>  };
>
>  int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
> @@ -514,6 +515,13 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
>         return !!psock->saved_data_ready;
>  }
>
> +static inline int sk_psock_strp_get_cpu(struct sk_psock *psock)
> +{
> +       if (psock->target_cpu != -1)
> +               return psock->target_cpu;
> +       return WORK_CPU_UNBOUND;
> +}
> +
>  #if IS_ENABLED(CONFIG_NET_SOCK_MSG)
>
>  #define BPF_F_STRPARSER        (1UL << 1)
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index f28b6527e815..2019a87b5d4a 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -1509,6 +1509,10 @@ union bpf_attr {
>                         __aligned_u64 next_key;
>                 };
>                 __u64           flags;
> +               union {
> +                       /* specify the CPU where the sockmap job runs on */
> +                       __aligned_u64 target_cpu;

I have no opinion on the feature itself, I'll leave this to others.
But from UAPI perspective:

a) why is this a u64 and not, say, int?
b) maybe we should just specify this as flags and not have to update
all the UAPIs (including libbpf-side)? Just add a new
BPF_F_SOCKNMAP_TARGET_CPU flag or something, and specify that highest
32 bits specify the CPU itself?

We have similar schema for some other helpers, so not *that* unusual.
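
For illustration, a sketch of that scheme (the flag name and bit
position are assumptions here, nothing is defined yet):
'''
/* Hypothetical: one flag bit says "the upper 32 bits carry a CPU",
 * so CPU 0 is no longer ambiguous with "no CPU given".
 */
#define BPF_F_SOCKMAP_TARGET_CPU	(1ULL << 8)	/* assumed free bit */

static u64 sockmap_update_flags(u64 flags, u32 cpu)
{
	return flags | BPF_F_SOCKMAP_TARGET_CPU | ((u64)cpu << 32);
}

static s32 sockmap_update_cpu(u64 flags)
{
	if (!(flags & BPF_F_SOCKMAP_TARGET_CPU))
		return -1;	/* no affinity requested */
	return (s32)(flags >> 32);
}
'''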

> +               };
>         };
>
>         struct { /* struct used by BPF_MAP_*_BATCH commands */
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 8254b2973157..95f719b9c3f3 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -239,10 +239,9 @@ static int bpf_obj_pin_uptrs(struct btf_record *rec, void *obj)
>  }
>
>  static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
> -                               void *key, void *value, __u64 flags)
> +                               void *key, void *value, __u64 flags, s32 target_cpu)

yeah, this is what I'm talking about. Think how ridiculous it is for a
generic "BPF map update" operation to accept the "target_cpu"
parameter.

pw-bot: cr

>  {
>         int err;
> -

why? don't break whitespace formatting

>         /* Need to create a kthread, thus must support schedule */
>         if (bpf_map_is_offloaded(map)) {
>                 return bpf_map_offload_update_elem(map, key, value, flags);
> @@ -252,7 +251,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
>                 return map->ops->map_update_elem(map, key, value, flags);
>         } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
>                    map->map_type == BPF_MAP_TYPE_SOCKMAP) {
> -               return sock_map_update_elem_sys(map, key, value, flags);
> +               return sock_map_update_elem_sys(map, key, value, flags, target_cpu);
>         } else if (IS_FD_PROG_ARRAY(map)) {
>                 return bpf_fd_array_map_update_elem(map, map_file, key, value,
>                                                     flags);

[...]
Jiayuan Chen Nov. 4, 2024, 6:12 a.m. UTC | #4
On Fri, Nov 01, 2024 at 12:25:51PM -0700, Andrii Nakryiko wrote:
> On Thu, Oct 31, 2024 at 7:40 PM mrpre <mrpre@163.com> wrote:
> >
> > Why we need cpu affinity:
> > Mainstream data planes, like Nginx and HAProxy, utilize CPU affinity
> > by binding user processes to specific CPUs. This avoids interference
> > between processes and prevents impact from other processes.
> >
> > Sockmap, as an optimization to accelerate such proxy programs,
> > currently lacks the ability to specify CPU affinity. The current
> > implementation of sockmap handling backlog is based on workqueue,
> > which operates by calling 'schedule_delayed_work()'. Its current
> > implementation prefers to schedule on the local CPU, i.e., the CPU
> > that handled the packet under softirq.
> >
> > For extremely high traffic with large numbers of packets,
> > 'sk_psock_backlog' becomes a large loop.
> >
> > For multi-threaded programs with only one map, we expect different
> > sockets to run on different CPUs. It is important to note that this
> > feature is not a general performance optimization. Instead, it
> > provides users with the ability to bind to a specific CPU, allowing
> > them to enhance overall operating system utilization based on their
> > own system environments.
> >
> > Implementation:
> > 1.When updating the sockmap, support passing a CPU parameter and
> > save it to the psock.
> > 2.When scheduling psock, determine which CPU to run on using the
> > psock's CPU information.
> > 3. For those sockmaps without CPU affinity, keep the original logic by using
> > 'schedule_delayed_work()'.
> >
> > Performance Testing:
> > 'client <-> sockmap proxy <-> server'
> >
> > Using 'iperf3' tests, with the iperf server bound to CPU5 and the iperf
> > client bound to CPU6, performance without using CPU affinity is
> > around 34 Gbits/s, and CPU usage is concentrated on CPU5 and CPU6.
> > '''
> > [  5] local 127.0.0.1 port 57144 connected to 127.0.0.1 port 10000
> > [ ID] Interval           Transfer     Bitrate
> > [  5]   0.00-1.00   sec  3.95 GBytes  33.9 Gbits/sec
> > [  5]   1.00-2.00   sec  3.95 GBytes  34.0 Gbits/sec
> > ......
> > '''
> >
> > With using CPU affinity, the performance is close to direct connection
> > (without any proxy).
> > '''
> > [  5] local 127.0.0.1 port 56518 connected to 127.0.0.1 port 10000
> > [ ID] Interval           Transfer     Bitrate
> > [  5]   0.00-1.00   sec  7.76 GBytes  66.6 Gbits/sec
> > [  5]   1.00-2.00   sec  7.76 GBytes  66.7 Gbits/sec
> > ......
> > '''
> >
> > Signed-off-by: Jiayuan Chen <mrpre@163.com>
> > ---
> >  include/linux/bpf.h      |  3 ++-
> >  include/linux/skmsg.h    |  8 ++++++++
> >  include/uapi/linux/bpf.h |  4 ++++
> >  kernel/bpf/syscall.c     | 23 +++++++++++++++++------
> >  net/core/skmsg.c         | 11 +++++++----
> >  net/core/sock_map.c      | 12 +++++++-----
> >  6 files changed, 45 insertions(+), 16 deletions(-)
> >
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index c3ba4d475174..a56028c389e7 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -3080,7 +3080,8 @@ int bpf_prog_test_run_syscall(struct bpf_prog *prog,
> >
> >  int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
> >  int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
> > -int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
> > +int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags,
> > +                            s32 target_cpu);
> >  int sock_map_bpf_prog_query(const union bpf_attr *attr,
> >                             union bpf_attr __user *uattr);
> >  int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog);
> > diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
> > index d9b03e0746e7..919425a92adf 100644
> > --- a/include/linux/skmsg.h
> > +++ b/include/linux/skmsg.h
> > @@ -117,6 +117,7 @@ struct sk_psock {
> >         struct delayed_work             work;
> >         struct sock                     *sk_pair;
> >         struct rcu_work                 rwork;
> > +       s32                             target_cpu;
> >  };
> >
> >  int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
> > @@ -514,6 +515,13 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
> >         return !!psock->saved_data_ready;
> >  }
> >
> > +static inline int sk_psock_strp_get_cpu(struct sk_psock *psock)
> > +{
> > +       if (psock->target_cpu != -1)
> > +               return psock->target_cpu;
> > +       return WORK_CPU_UNBOUND;
> > +}
> > +
> >  #if IS_ENABLED(CONFIG_NET_SOCK_MSG)
> >
> >  #define BPF_F_STRPARSER        (1UL << 1)
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index f28b6527e815..2019a87b5d4a 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -1509,6 +1509,10 @@ union bpf_attr {
> >                         __aligned_u64 next_key;
> >                 };
> >                 __u64           flags;
> > +               union {
> >                        /* specify the CPU where the sockmap job runs on */
> > +                       __aligned_u64 target_cpu;
> 
> I have no opinion on the feature itself, I'll leave this to others.
> But from UAPI perspective:
> 
> a) why is this a u64 and not, say, int?
> b) maybe we should just specify this as flags and not have to update
> all the UAPIs (including libbpf-side)? Just add a new
> BPF_F_SOCKNMAP_TARGET_CPU flag or something, and specify that highest
> 32 bits specify the CPU itself?
> 
> We have similar schema for some other helpers, so not *that* unusual.
> 
Thank you for your response. I think I should clarify my thoughts:

My idea is to pass a user-space pointer, with a NULL pointer
indicating that the user has not provided anything. For example, when
users use the old interface 'bpf_map_update_elem' and pass in a u64 of
0, it means that the user hasn't specified a CPU. If a u32 or another
plain value were passed instead, a value of 0 would be ambiguous: it
could mean target CPU 0, or that the user hasn't provided a value. So
my design involves passing a user-space pointer.
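
A sketch of the user-space view under this design (raw syscall;
map_fd, key and val assumed set up elsewhere; the target_cpu field is
the one proposed by this patch):
'''
/* target_cpu is pointer-sized: leaving it 0 (NULL) means "no
 * affinity", otherwise it points at an s64 holding the CPU id.
 */
__s64 cpu = 3;
union bpf_attr attr = {};

attr.map_fd = map_fd;
attr.key    = (__u64)(unsigned long)&key;
attr.value  = (__u64)(unsigned long)&val;
attr.flags  = BPF_ANY;
attr.target_cpu = (__u64)(unsigned long)&cpu;	/* 0 => unset */
err = syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
'''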

I also considered using the highest 32 bits of the flags as target_cpu,
but this approach still runs into the ambiguity mentioned above. Of
course, for programs using libbpf, I could default all of the upper 32
bits to 1 to indicate that the user hasn't specified a CPU, but this is
incompatible with programs not using libbpf. Another approach could be
that a value of 1 in the upper 32 bits indicates CPU 0, 2 indicates
CPU 1, and so on, but this seems odd and would require a helper to
assist users in passing arguments.

There is another method: providing an extra 'attr' to replace the
passed 'target_cpu', which maintains the generic nature of the
'map_update_elem' interface, like:
'''
+struct extra_bpf_attr {
+    u32 target_cpu;
+};
struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
    __u32   map_fd;
    __aligned_u64 key;
    union {
        __aligned_u64 value;
        __aligned_u64 next_key;
    };
    __u64   flags;
    +struct extra_bpf_attr extra;
};

static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
-                               void *key, void *value, __u64 flags)
+                               void *key, void *value, __u64 flags, struct bpf_attr_extra *extra);
'''

> > +               };
> >         };
> >
> >         struct { /* struct used by BPF_MAP_*_BATCH commands */
> > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> > index 8254b2973157..95f719b9c3f3 100644
> > --- a/kernel/bpf/syscall.c
> > +++ b/kernel/bpf/syscall.c
> > @@ -239,10 +239,9 @@ static int bpf_obj_pin_uptrs(struct btf_record *rec, void *obj)
> >  }
> >
> >  static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
> > -                               void *key, void *value, __u64 flags)
> > +                               void *key, void *value, __u64 flags, s32 target_cpu)
> 
> yeah, this is what I'm talking about. Think how ridiculous it is for a
> generic "BPF map update" operation to accept the "target_cpu"
> parameter.
> 
> pw-bot: cr
> 
> >  {
> >         int err;
> > -
> 
> why? don't break whitespace formatting
> 
> >         /* Need to create a kthread, thus must support schedule */
> >         if (bpf_map_is_offloaded(map)) {
> >                 return bpf_map_offload_update_elem(map, key, value, flags);
> > @@ -252,7 +251,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
> >                 return map->ops->map_update_elem(map, key, value, flags);
> >         } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
> >                    map->map_type == BPF_MAP_TYPE_SOCKMAP) {
> > -               return sock_map_update_elem_sys(map, key, value, flags);
> > +               return sock_map_update_elem_sys(map, key, value, flags, target_cpu);
> >         } else if (IS_FD_PROG_ARRAY(map)) {
> >                 return bpf_fd_array_map_update_elem(map, map_file, key, value,
> >                                                     flags);
> 
> [...]
Simon Horman Nov. 6, 2024, 1:49 p.m. UTC | #5
On Fri, Nov 01, 2024 at 10:38:31AM +0800, mrpre wrote:

...

> diff --git a/net/core/sock_map.c b/net/core/sock_map.c
> index 07d6aa4e39ef..36e9787c60de 100644
> --- a/net/core/sock_map.c
> +++ b/net/core/sock_map.c
> @@ -465,7 +465,7 @@ static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next)
>  }
>  
>  static int sock_map_update_common(struct bpf_map *map, u32 idx,
> -				  struct sock *sk, u64 flags)
> +				  struct sock *sk, u64 flags, s32 target_cpu)
>  {
>  	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
>  	struct sk_psock_link *link;
> @@ -490,6 +490,8 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx,
>  	psock = sk_psock(sk);
>  	WARN_ON_ONCE(!psock);
>  
> +	psock->target_cpu = target_cpu;
> +
>  	spin_lock_bh(&stab->lock);
>  	osk = stab->sks[idx];
>  	if (osk && flags == BPF_NOEXIST) {

Hi Jiayuan Chen,

The code immediately following the hunk above is:

		ret = -EEXIST;
		goto out_unlock;
	} else if (!osk && flags == BPF_EXIST) {
		ret = -ENOENT;
		goto out_unlock;
	}

And it seems that these gotos are the only code paths that lead to
out_unlock, which looks like this:

out_unlock:
	spin_unlock_bh(&stab->lock);
	if (psock)
		sk_psock_put(sk, psock);
out_free:
	sk_psock_free_link(link);
	return ret;
}

As you can see, the code under out_unlock expects that psock may be NULL.
But the code added to this function by your patch dereferences it
unconditionally. This seems inconsistent.

Flagged by Smatch.
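
A minimal NULL-safe variant of the added assignment would be (sketch,
untested):
'''
	psock = sk_psock(sk);
	WARN_ON_ONCE(!psock);
	if (psock)
		psock->target_cpu = target_cpu;
'''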

...
Andrii Nakryiko Nov. 6, 2024, 9:43 p.m. UTC | #6
On Sun, Nov 3, 2024 at 10:12 PM Jiayuan Chen <mrpre@163.com> wrote:
>
> On Fri, Nov 01, 2024 at 12:25:51PM -0700, Andrii Nakryiko wrote:
> > On Thu, Oct 31, 2024 at 7:40 PM mrpre <mrpre@163.com> wrote:
> > >
> > > Why we need cpu affinity:
> > > Mainstream data planes, like Nginx and HAProxy, utilize CPU affinity
> > > by binding user processes to specific CPUs. This avoids interference
> > > between processes and prevents impact from other processes.
> > >
> > > Sockmap, as an optimization to accelerate such proxy programs,
> > > currently lacks the ability to specify CPU affinity. The current
> > > implementation of sockmap handling backlog is based on workqueue,
> > > which operates by calling 'schedule_delayed_work()'. Its current
> > > implementation prefers to schedule on the local CPU, i.e., the CPU
> > > that handled the packet under softirq.
> > >
> > > For extremely high traffic with large numbers of packets,
> > > 'sk_psock_backlog' becomes a large loop.
> > >
> > > For multi-threaded programs with only one map, we expect different
> > > sockets to run on different CPUs. It is important to note that this
> > > feature is not a general performance optimization. Instead, it
> > > provides users with the ability to bind to a specific CPU, allowing
> > > them to enhance overall operating system utilization based on their
> > > own system environments.
> > >
> > > Implementation:
> > > 1.When updating the sockmap, support passing a CPU parameter and
> > > save it to the psock.
> > > 2.When scheduling psock, determine which CPU to run on using the
> > > psock's CPU information.
> > > 3. For those sockmaps without CPU affinity, keep the original logic by using
> > > 'schedule_delayed_work()'.
> > >
> > > Performance Testing:
> > > 'client <-> sockmap proxy <-> server'
> > >
> > > Using 'iperf3' tests, with the iperf server bound to CPU5 and the iperf
> > > client bound to CPU6, performance without using CPU affinity is
> > > around 34 Gbits/s, and CPU usage is concentrated on CPU5 and CPU6.
> > > '''
> > > [  5] local 127.0.0.1 port 57144 connected to 127.0.0.1 port 10000
> > > [ ID] Interval           Transfer     Bitrate
> > > [  5]   0.00-1.00   sec  3.95 GBytes  33.9 Gbits/sec
> > > [  5]   1.00-2.00   sec  3.95 GBytes  34.0 Gbits/sec
> > > ......
> > > '''
> > >
> > > With using CPU affinity, the performance is close to direct connection
> > > (without any proxy).
> > > '''
> > > [  5] local 127.0.0.1 port 56518 connected to 127.0.0.1 port 10000
> > > [ ID] Interval           Transfer     Bitrate
> > > [  5]   0.00-1.00   sec  7.76 GBytes  66.6 Gbits/sec
> > > [  5]   1.00-2.00   sec  7.76 GBytes  66.7 Gbits/sec
> > > ......
> > > '''
> > >
> > > Signed-off-by: Jiayuan Chen <mrpre@163.com>
> > > ---
> > >  include/linux/bpf.h      |  3 ++-
> > >  include/linux/skmsg.h    |  8 ++++++++
> > >  include/uapi/linux/bpf.h |  4 ++++
> > >  kernel/bpf/syscall.c     | 23 +++++++++++++++++------
> > >  net/core/skmsg.c         | 11 +++++++----
> > >  net/core/sock_map.c      | 12 +++++++-----
> > >  6 files changed, 45 insertions(+), 16 deletions(-)
> > >
> > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > index c3ba4d475174..a56028c389e7 100644
> > > --- a/include/linux/bpf.h
> > > +++ b/include/linux/bpf.h
> > > @@ -3080,7 +3080,8 @@ int bpf_prog_test_run_syscall(struct bpf_prog *prog,
> > >
> > >  int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
> > >  int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
> > > -int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
> > > +int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags,
> > > +                            s32 target_cpu);
> > >  int sock_map_bpf_prog_query(const union bpf_attr *attr,
> > >                             union bpf_attr __user *uattr);
> > >  int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog);
> > > diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
> > > index d9b03e0746e7..919425a92adf 100644
> > > --- a/include/linux/skmsg.h
> > > +++ b/include/linux/skmsg.h
> > > @@ -117,6 +117,7 @@ struct sk_psock {
> > >         struct delayed_work             work;
> > >         struct sock                     *sk_pair;
> > >         struct rcu_work                 rwork;
> > > +       s32                             target_cpu;
> > >  };
> > >
> > >  int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
> > > @@ -514,6 +515,13 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
> > >         return !!psock->saved_data_ready;
> > >  }
> > >
> > > +static inline int sk_psock_strp_get_cpu(struct sk_psock *psock)
> > > +{
> > > +       if (psock->target_cpu != -1)
> > > +               return psock->target_cpu;
> > > +       return WORK_CPU_UNBOUND;
> > > +}
> > > +
> > >  #if IS_ENABLED(CONFIG_NET_SOCK_MSG)
> > >
> > >  #define BPF_F_STRPARSER        (1UL << 1)
> > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > index f28b6527e815..2019a87b5d4a 100644
> > > --- a/include/uapi/linux/bpf.h
> > > +++ b/include/uapi/linux/bpf.h
> > > @@ -1509,6 +1509,10 @@ union bpf_attr {
> > >                         __aligned_u64 next_key;
> > >                 };
> > >                 __u64           flags;
> > > +               union {
> > > +                       /* specify the CPU where the sockmap job runs on */
> > > +                       __aligned_u64 target_cpu;
> >
> > I have no opinion on the feature itself, I'll leave this to others.
> > But from UAPI perspective:
> >
> > a) why is this a u64 and not, say, int?
> > b) maybe we should just specify this as flags and not have to update
> > all the UAPIs (including libbpf-side)? Just add a new
> > BPF_F_SOCKMAP_TARGET_CPU flag or something, and specify that the highest
> > 32 bits specify the CPU itself?
> >
> > We have similar schema for some other helpers, so not *that* unusual.
> >
> Thank you for your response. I think I should clarify my thoughts:
>
> My idea is to pass a user-space pointer, with the pointer being null
> to indicate that the user has not provided anything. For example, when
> users use the old interface 'bpf_map_update_elem' and pass in u64 of
> 0, it means that the user hasn't specified a CPU. If a u32 or another
> type of value is passed in, when it is 0, it's ambiguous whether this
> indicates target CPU 0 or that the user hasn't provided a value. So
> my design involves passing a user-space pointer.
>
> I also considered using the highest 32 bits of the flag as target_cpu, but
> this approach still encounters the ambiguity mentioned above. Of course
> for programs using libbpf, I can naturally init all the higher 32 bits
> default to 1 to indicate the user hasn't specified a CPU, but this is
> incompatible with programs not using libbpf. Another approach could be
> that a value of 1 for the higher 32 bits indicates CPU 0, and 2 indicates
> CPU 1..., but this seems odd and would require a helper to assist users
> in passing arguments.

See the BPF_F_SOCKMAP_TARGET_CPU flag point in my reply. You need an
extra flag that specifies that those upper 32 bits carry a CPU
number. There is no ambiguity: no flag, no CPU; flag set, CPU (even if
zero).
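
Caller side, reusing the helper sketched earlier (names are still
hypothetical):
'''
/* no affinity: plain update */
bpf_map_update_elem(map_fd, &key, &val, BPF_ANY);

/* pin this socket's backlog work to CPU 0 - unambiguous, because
 * the flag (not the value) signals that a CPU was supplied.
 */
bpf_map_update_elem(map_fd, &key, &val,
		    sockmap_update_flags(BPF_ANY, 0));
'''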

>
> There is another method, like providing an extra 'attr', to replace the
> passed 'target_cpu', which maintains the general nature of
> 'map_update_elem' interface, like:
> '''
> +struct extra_bpf_attr {
> +    u32 target_cpu;
> +};
> struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
>     __u32   map_fd;
>     __aligned_u64 key;
>     union {
>         __aligned_u64 value;
>         __aligned_u64 next_key;
>     };
>     __u64   flags;
>     +struct extra_bpf_attr extra;
> };
>
> static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
> -                               void *key, void *value, __u64 flags)
> +                               void *key, void *value, __u64 flags, struct bpf_attr_extra *extra);
> '''
>
> > > +               };
> > >         };
> > >
> > >         struct { /* struct used by BPF_MAP_*_BATCH commands */
> > > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> > > index 8254b2973157..95f719b9c3f3 100644
> > > --- a/kernel/bpf/syscall.c
> > > +++ b/kernel/bpf/syscall.c
> > > @@ -239,10 +239,9 @@ static int bpf_obj_pin_uptrs(struct btf_record *rec, void *obj)
> > >  }
> > >
> > >  static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
> > > -                               void *key, void *value, __u64 flags)
> > > +                               void *key, void *value, __u64 flags, s32 target_cpu)
> >
> > yeah, this is what I'm talking about. Think how ridiculous it is for a
> > generic "BPF map update" operation to accept the "target_cpu"
> > parameter.
> >
> > pw-bot: cr
> >
> > >  {
> > >         int err;
> > > -
> >
> > why? don't break whitespace formatting
> >
> > >         /* Need to create a kthread, thus must support schedule */
> > >         if (bpf_map_is_offloaded(map)) {
> > >                 return bpf_map_offload_update_elem(map, key, value, flags);
> > > @@ -252,7 +251,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
> > >                 return map->ops->map_update_elem(map, key, value, flags);
> > >         } else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
> > >                    map->map_type == BPF_MAP_TYPE_SOCKMAP) {
> > > -               return sock_map_update_elem_sys(map, key, value, flags);
> > > +               return sock_map_update_elem_sys(map, key, value, flags, target_cpu);
> > >         } else if (IS_FD_PROG_ARRAY(map)) {
> > >                 return bpf_fd_array_map_update_elem(map, map_file, key, value,
> > >                                                     flags);
> >
> > [...]
>
>

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c3ba4d475174..a56028c389e7 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3080,7 +3080,8 @@  int bpf_prog_test_run_syscall(struct bpf_prog *prog,
 
 int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
 int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
-int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
+int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags,
+			     s32 target_cpu);
 int sock_map_bpf_prog_query(const union bpf_attr *attr,
 			    union bpf_attr __user *uattr);
 int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog);
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index d9b03e0746e7..919425a92adf 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -117,6 +117,7 @@  struct sk_psock {
 	struct delayed_work		work;
 	struct sock			*sk_pair;
 	struct rcu_work			rwork;
+	s32				target_cpu;
 };
 
 int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
@@ -514,6 +515,13 @@  static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
 	return !!psock->saved_data_ready;
 }
 
+static inline int sk_psock_strp_get_cpu(struct sk_psock *psock)
+{
+	if (psock->target_cpu != -1)
+		return psock->target_cpu;
+	return WORK_CPU_UNBOUND;
+}
+
 #if IS_ENABLED(CONFIG_NET_SOCK_MSG)
 
 #define BPF_F_STRPARSER	(1UL << 1)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f28b6527e815..2019a87b5d4a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1509,6 +1509,10 @@  union bpf_attr {
 			__aligned_u64 next_key;
 		};
 		__u64		flags;
+		union {
+			/* specify the CPU where the sockmap job runs on */
+			__aligned_u64 target_cpu;
+		};
 	};
 
 	struct { /* struct used by BPF_MAP_*_BATCH commands */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8254b2973157..95f719b9c3f3 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -239,10 +239,9 @@  static int bpf_obj_pin_uptrs(struct btf_record *rec, void *obj)
 }
 
 static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
-				void *key, void *value, __u64 flags)
+				void *key, void *value, __u64 flags, s32 target_cpu)
 {
 	int err;
-
 	/* Need to create a kthread, thus must support schedule */
 	if (bpf_map_is_offloaded(map)) {
 		return bpf_map_offload_update_elem(map, key, value, flags);
@@ -252,7 +251,7 @@  static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
 		return map->ops->map_update_elem(map, key, value, flags);
 	} else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
 		   map->map_type == BPF_MAP_TYPE_SOCKMAP) {
-		return sock_map_update_elem_sys(map, key, value, flags);
+		return sock_map_update_elem_sys(map, key, value, flags, target_cpu);
 	} else if (IS_FD_PROG_ARRAY(map)) {
 		return bpf_fd_array_map_update_elem(map, map_file, key, value,
 						    flags);
@@ -1680,12 +1679,14 @@  static int map_lookup_elem(union bpf_attr *attr)
 }
 
 
-#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
+#define BPF_MAP_UPDATE_ELEM_LAST_FIELD target_cpu
 
 static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
 {
 	bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
 	bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel);
+	bpfptr_t utarget_cpu = make_bpfptr(attr->target_cpu, uattr.is_kernel);
+	s64 target_cpu = 0;
 	struct bpf_map *map;
 	void *key, *value;
 	u32 value_size;
@@ -1723,7 +1724,17 @@  static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
 		goto free_key;
 	}
 
-	err = bpf_map_update_value(map, fd_file(f), key, value, attr->flags);
+	if (map->map_type == BPF_MAP_TYPE_SOCKMAP &&
+	    !bpfptr_is_null(utarget_cpu)) {
+		if (copy_from_bpfptr(&target_cpu, utarget_cpu, sizeof(target_cpu)) ||
+		    target_cpu > nr_cpu_ids) {
+			err = -EINVAL;
+			goto err_put;
+		}
+	} else {
+		target_cpu = -1;
+	}
+	err = bpf_map_update_value(map, fd_file(f), key, value, attr->flags, (s32)target_cpu);
 	if (!err)
 		maybe_wait_bpf_programs(map);
 
@@ -1947,7 +1958,7 @@  int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
 			break;
 
 		err = bpf_map_update_value(map, map_file, key, value,
-					   attr->batch.elem_flags);
+					   attr->batch.elem_flags, -1);
 
 		if (err)
 			break;
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index b1dcbd3be89e..d3b6f2468dab 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -679,7 +679,8 @@  static void sk_psock_backlog(struct work_struct *work)
 					 * other work that might be here.
 					 */
 					if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
-						schedule_delayed_work(&psock->work, 1);
+						schedule_delayed_work_on(sk_psock_strp_get_cpu(psock),
+									 &psock->work, 1);
 					goto end;
 				}
 				/* Hard errors break pipe and stop xmit. */
@@ -729,6 +730,7 @@  struct sk_psock *sk_psock_init(struct sock *sk, int node)
 	psock->saved_destroy = prot->destroy;
 	psock->saved_close = prot->close;
 	psock->saved_write_space = sk->sk_write_space;
+	psock->target_cpu = -1;
 
 	INIT_LIST_HEAD(&psock->link);
 	spin_lock_init(&psock->link_lock);
@@ -934,7 +936,7 @@  static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb)
 	}
 
 	skb_queue_tail(&psock_other->ingress_skb, skb);
-	schedule_delayed_work(&psock_other->work, 0);
+	schedule_delayed_work_on(sk_psock_strp_get_cpu(psock_other), &psock_other->work, 0);
 	spin_unlock_bh(&psock_other->ingress_lock);
 	return 0;
 }
@@ -1012,7 +1014,8 @@  static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
 			spin_lock_bh(&psock->ingress_lock);
 			if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
 				skb_queue_tail(&psock->ingress_skb, skb);
-				schedule_delayed_work(&psock->work, 0);
+				schedule_delayed_work_on(sk_psock_strp_get_cpu(psock),
+							 &psock->work, 0);
 				err = 0;
 			}
 			spin_unlock_bh(&psock->ingress_lock);
@@ -1044,7 +1047,7 @@  static void sk_psock_write_space(struct sock *sk)
 	psock = sk_psock(sk);
 	if (likely(psock)) {
 		if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
-			schedule_delayed_work(&psock->work, 0);
+			schedule_delayed_work_on(sk_psock_strp_get_cpu(psock), &psock->work, 0);
 		write_space = psock->saved_write_space;
 	}
 	rcu_read_unlock();
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 07d6aa4e39ef..36e9787c60de 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -465,7 +465,7 @@  static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next)
 }
 
 static int sock_map_update_common(struct bpf_map *map, u32 idx,
-				  struct sock *sk, u64 flags)
+				  struct sock *sk, u64 flags, s32 target_cpu)
 {
 	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
 	struct sk_psock_link *link;
@@ -490,6 +490,8 @@  static int sock_map_update_common(struct bpf_map *map, u32 idx,
 	psock = sk_psock(sk);
 	WARN_ON_ONCE(!psock);
 
+	psock->target_cpu = target_cpu;
+
 	spin_lock_bh(&stab->lock);
 	osk = stab->sks[idx];
 	if (osk && flags == BPF_NOEXIST) {
@@ -548,7 +550,7 @@  static int sock_hash_update_common(struct bpf_map *map, void *key,
 				   struct sock *sk, u64 flags);
 
 int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value,
-			     u64 flags)
+			     u64 flags, s32 target_cpu)
 {
 	struct socket *sock;
 	struct sock *sk;
@@ -579,7 +581,7 @@  int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value,
 	if (!sock_map_sk_state_allowed(sk))
 		ret = -EOPNOTSUPP;
 	else if (map->map_type == BPF_MAP_TYPE_SOCKMAP)
-		ret = sock_map_update_common(map, *(u32 *)key, sk, flags);
+		ret = sock_map_update_common(map, *(u32 *)key, sk, flags, target_cpu);
 	else
 		ret = sock_hash_update_common(map, key, sk, flags);
 	sock_map_sk_release(sk);
@@ -605,7 +607,7 @@  static long sock_map_update_elem(struct bpf_map *map, void *key,
 	if (!sock_map_sk_state_allowed(sk))
 		ret = -EOPNOTSUPP;
 	else if (map->map_type == BPF_MAP_TYPE_SOCKMAP)
-		ret = sock_map_update_common(map, *(u32 *)key, sk, flags);
+		ret = sock_map_update_common(map, *(u32 *)key, sk, flags, -1);
 	else
 		ret = sock_hash_update_common(map, key, sk, flags);
 	bh_unlock_sock(sk);
@@ -621,7 +623,7 @@  BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops,
 	if (likely(sock_map_sk_is_suitable(sops->sk) &&
 		   sock_map_op_okay(sops)))
 		return sock_map_update_common(map, *(u32 *)key, sops->sk,
-					      flags);
+					      flags, -1);
 	return -EOPNOTSUPP;
 }