diff mbox series

[bpf-next,04/10] bpf: Handle in-place update for full LPM trie correctly

Message ID 20241118010808.2243555-5-houtao@huaweicloud.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series Fixes for LPM trie | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 3 this patch: 3
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers success CCed 13 of 13 maintainers
netdev/build_clang success Errors and warnings before: 3 this patch: 3
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 4 this patch: 4
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 102 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 1 this patch: 1
netdev/source_inline success Was 0 now: 0

Commit Message

Hou Tao Nov. 18, 2024, 1:08 a.m. UTC
From: Hou Tao <houtao1@huawei.com>

When a LPM trie is full, in-place updates of existing elements
incorrectly return -ENOSPC.

Fix this by deferring the check of trie->n_entries. For new insertions,
n_entries must not exceed max_entries. However, in-place updates are
allowed even when the trie is full.

Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 kernel/bpf/lpm_trie.c | 46 +++++++++++++++++++++----------------------
 1 file changed, 23 insertions(+), 23 deletions(-)

Comments

Sebastian Andrzej Siewior Nov. 18, 2024, 1:13 p.m. UTC | #1
On 2024-11-18 09:08:02 [+0800], Hou Tao wrote:
> From: Hou Tao <houtao1@huawei.com>
> 
> When a LPM trie is full, in-place updates of existing elements
> incorrectly return -ENOSPC.
> 
> Fix this by deferring the check of trie->n_entries. For new insertions,
> n_entries must not exceed max_entries. However, in-place updates are
> allowed even when the trie is full.

This and the previous patch look like a fix to an existing problem.
Shouldn't both have a fixes tag?

> Signed-off-by: Hou Tao <houtao1@huawei.com>

Sebastian
Hou Tao Nov. 19, 2024, 1:05 a.m. UTC | #2
On 11/18/2024 9:13 PM, Sebastian Andrzej Siewior wrote:
> On 2024-11-18 09:08:02 [+0800], Hou Tao wrote:
>> From: Hou Tao <houtao1@huawei.com>
>>
>> When a LPM trie is full, in-place updates of existing elements
>> incorrectly return -ENOSPC.
>>
>> Fix this by deferring the check of trie->n_entries. For new insertions,
>> n_entries must not exceed max_entries. However, in-place updates are
>> allowed even when the trie is full.
> This and the previous patch look like a fix to an existing problem.
> Shouldn't both have a fixes tag?

Will add in v2. Thanks for the suggestion.
>
>> Signed-off-by: Hou Tao <houtao1@huawei.com>
> Sebastian
Toke Høiland-Jørgensen Nov. 21, 2024, 10:53 a.m. UTC | #3
Hou Tao <houtao@huaweicloud.com> writes:

> From: Hou Tao <houtao1@huawei.com>
>
> When a LPM trie is full, in-place updates of existing elements
> incorrectly return -ENOSPC.
>
> Fix this by deferring the check of trie->n_entries. For new insertions,
> n_entries must not exceed max_entries. However, in-place updates are
> allowed even when the trie is full.
>
> Signed-off-by: Hou Tao <houtao1@huawei.com>
> ---
>  kernel/bpf/lpm_trie.c | 46 +++++++++++++++++++++----------------------
>  1 file changed, 23 insertions(+), 23 deletions(-)
>
> diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
> index 4300bd51ec6e..ff57e35357ae 100644
> --- a/kernel/bpf/lpm_trie.c
> +++ b/kernel/bpf/lpm_trie.c
> @@ -310,6 +310,15 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
>  	return node;
>  }
>  
> +static int trie_check_noreplace_update(const struct lpm_trie *trie, u64 flags)

I think this function name is hard to parse (took me a few tries). How
about trie_check_add_entry() instead?

> +{
> +	if (flags == BPF_EXIST)
> +		return -ENOENT;
> +	if (trie->n_entries == trie->map.max_entries)
> +		return -ENOSPC;

The calls to this function are always paired with a trie->n_entries++; -
so how about moving that into the function after the checks? You'll have
to then add a decrement if the im_node allocation fails, but I think
that is still clearer than having the n_entries++ statements scattered
around the function.

-Toke
Hou Tao Nov. 22, 2024, 2:06 a.m. UTC | #4
Hi,

On 11/21/2024 6:53 PM, Toke Høiland-Jørgensen wrote:
> Hou Tao <houtao@huaweicloud.com> writes:
>
>> From: Hou Tao <houtao1@huawei.com>
>>
>> When a LPM trie is full, in-place updates of existing elements
>> incorrectly return -ENOSPC.
>>
>> Fix this by deferring the check of trie->n_entries. For new insertions,
>> n_entries must not exceed max_entries. However, in-place updates are
>> allowed even when the trie is full.
>>
>> Signed-off-by: Hou Tao <houtao1@huawei.com>
>> ---
>>  kernel/bpf/lpm_trie.c | 46 +++++++++++++++++++++----------------------
>>  1 file changed, 23 insertions(+), 23 deletions(-)
>>
>> diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
>> index 4300bd51ec6e..ff57e35357ae 100644
>> --- a/kernel/bpf/lpm_trie.c
>> +++ b/kernel/bpf/lpm_trie.c
>> @@ -310,6 +310,15 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
>>  	return node;
>>  }
>>  
>> +static int trie_check_noreplace_update(const struct lpm_trie *trie, u64 flags)
> I think this function name is hard to parse (took me a few tries). How
> about trie_check_add_entry() instead?

Better. However, "entry" is not commonly used. The commonly used noun is
either "node" or "element". Will use trie_check_add_elem().
>
>> +{
>> +	if (flags == BPF_EXIST)
>> +		return -ENOENT;
>> +	if (trie->n_entries == trie->map.max_entries)
>> +		return -ENOSPC;
> The calls to this function are always paired with a trie->n_entries++; -
> so how about moving that into the function after the checks? You'll have
> to then add a decrement if the im_node allocation fails, but I think
> that is still clearer than having the n_entries++ statements scattered
> around the function.

Got it. Will update it in v2.  One motivation for adding n_entries only
necessary is to prevent trie_mem_usage from reading an inaccurate
n_entries, but consider it again I don't think it matters much.
>
> -Toke
diff mbox series

Patch

diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 4300bd51ec6e..ff57e35357ae 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -310,6 +310,15 @@  static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
 	return node;
 }
 
+static int trie_check_noreplace_update(const struct lpm_trie *trie, u64 flags)
+{
+	if (flags == BPF_EXIST)
+		return -ENOENT;
+	if (trie->n_entries == trie->map.max_entries)
+		return -ENOSPC;
+	return 0;
+}
+
 /* Called from syscall or from eBPF program */
 static long trie_update_elem(struct bpf_map *map,
 			     void *_key, void *value, u64 flags)
@@ -333,20 +342,12 @@  static long trie_update_elem(struct bpf_map *map,
 	spin_lock_irqsave(&trie->lock, irq_flags);
 
 	/* Allocate and fill a new node */
-
-	if (trie->n_entries == trie->map.max_entries) {
-		ret = -ENOSPC;
-		goto out;
-	}
-
 	new_node = lpm_trie_node_alloc(trie, value);
 	if (!new_node) {
 		ret = -ENOMEM;
 		goto out;
 	}
 
-	trie->n_entries++;
-
 	new_node->prefixlen = key->prefixlen;
 	RCU_INIT_POINTER(new_node->child[0], NULL);
 	RCU_INIT_POINTER(new_node->child[1], NULL);
@@ -375,10 +376,11 @@  static long trie_update_elem(struct bpf_map *map,
 	 * simply assign the @new_node to that slot and be done.
 	 */
 	if (!node) {
-		if (flags == BPF_EXIST) {
-			ret = -ENOENT;
+		ret = trie_check_noreplace_update(trie, flags);
+		if (ret)
 			goto out;
-		}
+
+		trie->n_entries++;
 		rcu_assign_pointer(*slot, new_node);
 		goto out;
 	}
@@ -392,10 +394,11 @@  static long trie_update_elem(struct bpf_map *map,
 				ret = -EEXIST;
 				goto out;
 			}
-			trie->n_entries--;
-		} else if (flags == BPF_EXIST) {
-			ret = -ENOENT;
-			goto out;
+		} else {
+			ret = trie_check_noreplace_update(trie, flags);
+			if (ret)
+				goto out;
+			trie->n_entries++;
 		}
 
 		new_node->child[0] = node->child[0];
@@ -407,15 +410,15 @@  static long trie_update_elem(struct bpf_map *map,
 		goto out;
 	}
 
-	if (flags == BPF_EXIST) {
-		ret = -ENOENT;
+	ret = trie_check_noreplace_update(trie, flags);
+	if (ret)
 		goto out;
-	}
 
 	/* If the new node matches the prefix completely, it must be inserted
 	 * as an ancestor. Simply insert it between @node and *@slot.
 	 */
 	if (matchlen == key->prefixlen) {
+		trie->n_entries++;
 		next_bit = extract_bit(node->data, matchlen);
 		rcu_assign_pointer(new_node->child[next_bit], node);
 		rcu_assign_pointer(*slot, new_node);
@@ -428,6 +431,7 @@  static long trie_update_elem(struct bpf_map *map,
 		goto out;
 	}
 
+	trie->n_entries++;
 	im_node->prefixlen = matchlen;
 	im_node->flags |= LPM_TREE_NODE_FLAG_IM;
 	memcpy(im_node->data, node->data, trie->data_size);
@@ -445,12 +449,8 @@  static long trie_update_elem(struct bpf_map *map,
 	rcu_assign_pointer(*slot, im_node);
 
 out:
-	if (ret) {
-		if (new_node)
-			trie->n_entries--;
+	if (ret)
 		kfree(new_node);
-	}
-
 	spin_unlock_irqrestore(&trie->lock, irq_flags);
 	kfree_rcu(free_node, rcu);