diff mbox series

[3/5] maple_tree: use vacant nodes to reduce worst case allocations

Message ID 20241114170524.64391-4-sidhartha.kumar@oracle.com (mailing list archive)
State New
Headers show
Series Track node vacancy to reduce worst case allocation counts | expand

Commit Message

Sidhartha Kumar Nov. 14, 2024, 5:05 p.m. UTC
In order to determine the store type for a maple tree operation, a walk
of the tree is done through mas_wr_walk(). This function descends the
tree until a spanning write is detected or we reach a leaf node. While
descending, keep track of the height at which we encounter a node with
available space. This is done by checking if mas->end is less than the
number of slots a given node type can fit.

Now that the height of the vacant node is tracked, we can use the
difference between the height of the tree and the height of the vacant
node to know how many levels we will have to propagate creating new
nodes. Update mas_prealloc_calc() to consider the vacant height and
reduce the number of worst allocations.

Rebalancing stores are not supported and fall back to using the full
height of the tree for allocations.

Update preallocation testing assertions to take into account vacant
height.

Signed-off-by: Sidhartha <sidhartha.kumar@oracle.com>
---
 include/linux/maple_tree.h       |  2 +
 lib/maple_tree.c                 | 13 +++--
 tools/testing/radix-tree/maple.c | 97 +++++++++++++++++++++++++++++---
 3 files changed, 100 insertions(+), 12 deletions(-)

Comments

Wei Yang Nov. 15, 2024, 7:52 a.m. UTC | #1
On Thu, Nov 14, 2024 at 12:05:22PM -0500, Sidhartha Kumar wrote:
>In order to determine the store type for a maple tree operation, a walk
>of the tree is done through mas_wr_walk(). This function descends the
>tree until a spanning write is detected or we reach a leaf node. While
>descending, keep track of the height at which we encounter a node with
>available space. This is done by checking if mas->end is less than the
>number of slots a given node type can fit.
>
>Now that the height of the vacant node is tracked, we can use the
>difference between the height of the tree and the height of the vacant
>node to know how many levels we will have to propagate creating new
>nodes. Update mas_prealloc_calc() to consider the vacant height and
>reduce the number of worst allocations.
>
>Rebalancing stores are not supported and fall back to using the full
>height of the tree for allocations.
>
>Update preallocation testing assertions to take into account vacant
>height.
>
>Signed-off-by: Sidhartha <sidhartha.kumar@oracle.com>
>---
> include/linux/maple_tree.h       |  2 +
> lib/maple_tree.c                 | 13 +++--
> tools/testing/radix-tree/maple.c | 97 +++++++++++++++++++++++++++++---
> 3 files changed, 100 insertions(+), 12 deletions(-)
>
>diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
>index cbbcd18d4186..7d777aa2d9ed 100644
>--- a/include/linux/maple_tree.h
>+++ b/include/linux/maple_tree.h
>@@ -463,6 +463,7 @@ struct ma_wr_state {
> 	void __rcu **slots;		/* mas->node->slots pointer */
> 	void *entry;			/* The entry to write */
> 	void *content;			/* The existing entry that is being overwritten */
>+	unsigned char vacant_height;	/* Depth of lowest node with free space */
                             ^^^           ^^^

Would this be a little misleading?

> };
> 
> #define mas_lock(mas)           spin_lock(&((mas)->tree->ma_lock))
>@@ -498,6 +499,7 @@ struct ma_wr_state {
> 		.mas = ma_state,					\
> 		.content = NULL,					\
> 		.entry = wr_entry,					\
>+		.vacant_height = 0					\
> 	}
> 
> #define MA_TOPIARY(name, tree)						\
>diff --git a/lib/maple_tree.c b/lib/maple_tree.c
>index 21289e350382..f14d70c171c2 100644
>--- a/lib/maple_tree.c
>+++ b/lib/maple_tree.c
>@@ -3545,6 +3545,9 @@ static bool mas_wr_walk(struct ma_wr_state *wr_mas)
> 		if (ma_is_leaf(wr_mas->type))
> 			return true;
> 
>+		if (mas->end < mt_slots[wr_mas->type] - 1)
>+			wr_mas->vacant_height = mas->depth + 1;

For some cases in rebalance, we may split data into three parts, which means
we need 2 extra vacant slot.

Maybe this check is not accurate?

>+
> 		mas_wr_walk_traverse(wr_mas);
> 	}
> 
>@@ -4159,7 +4162,9 @@ static inline void mas_wr_prealloc_setup(struct ma_wr_state *wr_mas)
> static inline int mas_prealloc_calc(struct ma_wr_state *wr_mas, void *entry)
> {
> 	struct ma_state *mas = wr_mas->mas;
>-	int ret = mas_mt_height(mas) * 3 + 1;
>+	unsigned char height = mas_mt_height(mas);
>+	int ret = height * 3 + 1;
>+	unsigned char delta = height - wr_mas->vacant_height;
> 
> 	switch (mas->store_type) {
> 	case wr_invalid:
>@@ -4177,13 +4182,13 @@ static inline int mas_prealloc_calc(struct ma_wr_state *wr_mas, void *entry)
> 			ret = 0;
> 		break;
> 	case wr_spanning_store:
>-		ret =  mas_mt_height(mas) * 3 + 1;
>+		ret = delta * 3 + 1;

Hmm... I am afraid we need to put this patch after next one.

Without the change in next patch, we still need to go up the tree till root to
rebalance.

> 		break;
> 	case wr_split_store:
>-		ret =  mas_mt_height(mas) * 2 + 1;
>+		ret = delta * 2 + 1;
> 		break;
> 	case wr_rebalance:
>-		ret =  mas_mt_height(mas) * 2 - 1;
>+		ret = height * 2 + 1;

Looks current calculation is not correct?
If so, do we need to have a fix to be backported?
Sidhartha Kumar Nov. 15, 2024, 8:34 p.m. UTC | #2
On 11/15/24 2:52 AM, Wei Yang wrote:
> On Thu, Nov 14, 2024 at 12:05:22PM -0500, Sidhartha Kumar wrote:
>> In order to determine the store type for a maple tree operation, a walk
>> of the tree is done through mas_wr_walk(). This function descends the
>> tree until a spanning write is detected or we reach a leaf node. While
>> descending, keep track of the height at which we encounter a node with
>> available space. This is done by checking if mas->end is less than the
>> number of slots a given node type can fit.
>>
>> Now that the height of the vacant node is tracked, we can use the
>> difference between the height of the tree and the height of the vacant
>> node to know how many levels we will have to propagate creating new
>> nodes. Update mas_prealloc_calc() to consider the vacant height and
>> reduce the number of worst allocations.
>>
>> Rebalancing stores are not supported and fall back to using the full
>> height of the tree for allocations.
>>
>> Update preallocation testing assertions to take into account vacant
>> height.
>>
>> Signed-off-by: Sidhartha <sidhartha.kumar@oracle.com>
>> ---
>> include/linux/maple_tree.h       |  2 +
>> lib/maple_tree.c                 | 13 +++--
>> tools/testing/radix-tree/maple.c | 97 +++++++++++++++++++++++++++++---
>> 3 files changed, 100 insertions(+), 12 deletions(-)
>>
>> diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
>> index cbbcd18d4186..7d777aa2d9ed 100644
>> --- a/include/linux/maple_tree.h
>> +++ b/include/linux/maple_tree.h
>> @@ -463,6 +463,7 @@ struct ma_wr_state {
>> 	void __rcu **slots;		/* mas->node->slots pointer */
>> 	void *entry;			/* The entry to write */
>> 	void *content;			/* The existing entry that is being overwritten */
>> +	unsigned char vacant_height;	/* Depth of lowest node with free space */
>                               ^^^           ^^^
> 
> Would this be a little misleading?
> 

Could you elaborate on how its misleading?

>> };
>>
>> #define mas_lock(mas)           spin_lock(&((mas)->tree->ma_lock))
>> @@ -498,6 +499,7 @@ struct ma_wr_state {
>> 		.mas = ma_state,					\
>> 		.content = NULL,					\
>> 		.entry = wr_entry,					\
>> +		.vacant_height = 0					\
>> 	}
>>
>> #define MA_TOPIARY(name, tree)						\
>> diff --git a/lib/maple_tree.c b/lib/maple_tree.c
>> index 21289e350382..f14d70c171c2 100644
>> --- a/lib/maple_tree.c
>> +++ b/lib/maple_tree.c
>> @@ -3545,6 +3545,9 @@ static bool mas_wr_walk(struct ma_wr_state *wr_mas)
>> 		if (ma_is_leaf(wr_mas->type))
>> 			return true;
>>
>> +		if (mas->end < mt_slots[wr_mas->type] - 1)
>> +			wr_mas->vacant_height = mas->depth + 1;
> 
> For some cases in rebalance, we may split data into three parts, which means
> we need 2 extra vacant slot.
> 
> Maybe this check is not accurate?
> 

The triple split scenario which you are describing comes from the 
spanning store case not on the wr_rebalance case. There is a check 
before we set vacant height to return if is_span_wr() so I believe this 
is correct still.

>> +
>> 		mas_wr_walk_traverse(wr_mas);
>> 	}
>>
>> @@ -4159,7 +4162,9 @@ static inline void mas_wr_prealloc_setup(struct ma_wr_state *wr_mas)
>> static inline int mas_prealloc_calc(struct ma_wr_state *wr_mas, void *entry)
>> {
>> 	struct ma_state *mas = wr_mas->mas;
>> -	int ret = mas_mt_height(mas) * 3 + 1;
>> +	unsigned char height = mas_mt_height(mas);
>> +	int ret = height * 3 + 1;
>> +	unsigned char delta = height - wr_mas->vacant_height;
>>
>> 	switch (mas->store_type) {
>> 	case wr_invalid:
>> @@ -4177,13 +4182,13 @@ static inline int mas_prealloc_calc(struct ma_wr_state *wr_mas, void *entry)
>> 			ret = 0;
>> 		break;
>> 	case wr_spanning_store:
>> -		ret =  mas_mt_height(mas) * 3 + 1;
>> +		ret = delta * 3 + 1;
> 
> Hmm... I am afraid we need to put this patch after next one.
> 
> Without the change in next patch, we still need to go up the tree till root to
> rebalance.
> 

I think you are right here as mas_wr_spanning_store() calls 
mas_spanning_rebalance(), I'll switch the order of patch 3 and patch 4.

>> 		break;
>> 	case wr_split_store:
>> -		ret =  mas_mt_height(mas) * 2 + 1;
>> +		ret = delta * 2 + 1;
>> 		break;
>> 	case wr_rebalance:
>> -		ret =  mas_mt_height(mas) * 2 - 1;
>> +		ret = height * 2 + 1;
> 
> Looks current calculation is not correct?
> If so, do we need to have a fix to be backported?
> 

This was a typo, it can remain as height * 2 -1.

Thanks for taking a look,
Sidhartha Kumar

>
Wei Yang Nov. 16, 2024, 1:41 a.m. UTC | #3
On Fri, Nov 15, 2024 at 03:34:55PM -0500, Sidhartha Kumar wrote:
>On 11/15/24 2:52 AM, Wei Yang wrote:
>> On Thu, Nov 14, 2024 at 12:05:22PM -0500, Sidhartha Kumar wrote:
>> > In order to determine the store type for a maple tree operation, a walk
>> > of the tree is done through mas_wr_walk(). This function descends the
>> > tree until a spanning write is detected or we reach a leaf node. While
>> > descending, keep track of the height at which we encounter a node with
>> > available space. This is done by checking if mas->end is less than the
>> > number of slots a given node type can fit.
>> > 
>> > Now that the height of the vacant node is tracked, we can use the
>> > difference between the height of the tree and the height of the vacant
>> > node to know how many levels we will have to propagate creating new
>> > nodes. Update mas_prealloc_calc() to consider the vacant height and
>> > reduce the number of worst allocations.
>> > 
>> > Rebalancing stores are not supported and fall back to using the full
>> > height of the tree for allocations.
>> > 
>> > Update preallocation testing assertions to take into account vacant
>> > height.
>> > 
>> > Signed-off-by: Sidhartha <sidhartha.kumar@oracle.com>
>> > ---
>> > include/linux/maple_tree.h       |  2 +
>> > lib/maple_tree.c                 | 13 +++--
>> > tools/testing/radix-tree/maple.c | 97 +++++++++++++++++++++++++++++---
>> > 3 files changed, 100 insertions(+), 12 deletions(-)
>> > 
>> > diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
>> > index cbbcd18d4186..7d777aa2d9ed 100644
>> > --- a/include/linux/maple_tree.h
>> > +++ b/include/linux/maple_tree.h
>> > @@ -463,6 +463,7 @@ struct ma_wr_state {
>> > 	void __rcu **slots;		/* mas->node->slots pointer */
>> > 	void *entry;			/* The entry to write */
>> > 	void *content;			/* The existing entry that is being overwritten */
>> > +	unsigned char vacant_height;	/* Depth of lowest node with free space */
>>                               ^^^           ^^^
>> 
>> Would this be a little misleading?
>> 
>
>Could you elaborate on how its misleading?
>

As you mentioned in previous patch, depth and height has different meaning.

Root node has depth of 0 and height of 1. So I may wandering whether this is
depth or height.

>> > };
>> > 
>> > #define mas_lock(mas)           spin_lock(&((mas)->tree->ma_lock))
>> > @@ -498,6 +499,7 @@ struct ma_wr_state {
>> > 		.mas = ma_state,					\
>> > 		.content = NULL,					\
>> > 		.entry = wr_entry,					\
>> > +		.vacant_height = 0					\
>> > 	}
>> > 
>> > #define MA_TOPIARY(name, tree)						\
>> > diff --git a/lib/maple_tree.c b/lib/maple_tree.c
>> > index 21289e350382..f14d70c171c2 100644
>> > --- a/lib/maple_tree.c
>> > +++ b/lib/maple_tree.c
>> > @@ -3545,6 +3545,9 @@ static bool mas_wr_walk(struct ma_wr_state *wr_mas)
>> > 		if (ma_is_leaf(wr_mas->type))
>> > 			return true;
>> > 
>> > +		if (mas->end < mt_slots[wr_mas->type] - 1)
>> > +			wr_mas->vacant_height = mas->depth + 1;
>> 
>> For some cases in rebalance, we may split data into three parts, which means
>> we need 2 extra vacant slot.
>> 
>> Maybe this check is not accurate?
>> 
>
>The triple split scenario which you are describing comes from the spanning
>store case not on the wr_rebalance case. There is a check before we set
>vacant height to return if is_span_wr() so I believe this is correct still.
>

Hmm... I come up with a case in which vacant_height may not high enough.

Here is the subtree where spanning write locates. The first level is the
parent node of the second level nodes.

                vacant node
                +--------+-+-+-------+
                |        |l|r|       |
                +--------+-+-+-------+

                l                 r                 
    +------+    +----+-------+    +---------+--+    +------+
    |      |    |    |       |    |         |  |    |      |
    +------+    +----+-------+    +---------+--+    +------+
                     ^                      ^
		     |                      |
		   index                   last

When mas_wr_walk_descend() to node l, mas_is_span_wr() return true since last
is in the right sibling, node r. Let's say the parent is vacant and l/r is
leaf. So the request number is (1 * 3) + 1.

Let's assume:

  * vacant node is just sufficient
  * l's left part + r's right part is sufficient but not overflow

Then the "new vacant node" would be deficient and needs another round
rebalance.

For this case, I am afraid it doesn't allocate enough nodes. Or I
misunderstand this?
Sidhartha Kumar Nov. 18, 2024, 4:36 p.m. UTC | #4
On 11/15/24 8:41 PM, Wei Yang wrote:
> On Fri, Nov 15, 2024 at 03:34:55PM -0500, Sidhartha Kumar wrote:
>> On 11/15/24 2:52 AM, Wei Yang wrote:
>>> On Thu, Nov 14, 2024 at 12:05:22PM -0500, Sidhartha Kumar wrote:
>>>> In order to determine the store type for a maple tree operation, a walk
>>>> of the tree is done through mas_wr_walk(). This function descends the
>>>> tree until a spanning write is detected or we reach a leaf node. While
>>>> descending, keep track of the height at which we encounter a node with
>>>> available space. This is done by checking if mas->end is less than the
>>>> number of slots a given node type can fit.
>>>>
>>>> Now that the height of the vacant node is tracked, we can use the
>>>> difference between the height of the tree and the height of the vacant
>>>> node to know how many levels we will have to propagate creating new
>>>> nodes. Update mas_prealloc_calc() to consider the vacant height and
>>>> reduce the number of worst allocations.
>>>>
>>>> Rebalancing stores are not supported and fall back to using the full
>>>> height of the tree for allocations.
>>>>
>>>> Update preallocation testing assertions to take into account vacant
>>>> height.
>>>>
>>>> Signed-off-by: Sidhartha <sidhartha.kumar@oracle.com>
>>>> ---
>>>> include/linux/maple_tree.h       |  2 +
>>>> lib/maple_tree.c                 | 13 +++--
>>>> tools/testing/radix-tree/maple.c | 97 +++++++++++++++++++++++++++++---
>>>> 3 files changed, 100 insertions(+), 12 deletions(-)
>>>>
>>>> diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
>>>> index cbbcd18d4186..7d777aa2d9ed 100644
>>>> --- a/include/linux/maple_tree.h
>>>> +++ b/include/linux/maple_tree.h
>>>> @@ -463,6 +463,7 @@ struct ma_wr_state {
>>>> 	void __rcu **slots;		/* mas->node->slots pointer */
>>>> 	void *entry;			/* The entry to write */
>>>> 	void *content;			/* The existing entry that is being overwritten */
>>>> +	unsigned char vacant_height;	/* Depth of lowest node with free space */
>>>                                ^^^           ^^^
>>>
>>> Would this be a little misleading?
>>>
>>
>> Could you elaborate on how its misleading?
>>
> 
> As you mentioned in previous patch, depth and height has different meaning.
> 
> Root node has depth of 0 and height of 1. So I may wandering whether this is
> depth or height.
> 
>>>> };
>>>>
>>>> #define mas_lock(mas)           spin_lock(&((mas)->tree->ma_lock))
>>>> @@ -498,6 +499,7 @@ struct ma_wr_state {
>>>> 		.mas = ma_state,					\
>>>> 		.content = NULL,					\
>>>> 		.entry = wr_entry,					\
>>>> +		.vacant_height = 0					\
>>>> 	}
>>>>
>>>> #define MA_TOPIARY(name, tree)						\
>>>> diff --git a/lib/maple_tree.c b/lib/maple_tree.c
>>>> index 21289e350382..f14d70c171c2 100644
>>>> --- a/lib/maple_tree.c
>>>> +++ b/lib/maple_tree.c
>>>> @@ -3545,6 +3545,9 @@ static bool mas_wr_walk(struct ma_wr_state *wr_mas)
>>>> 		if (ma_is_leaf(wr_mas->type))
>>>> 			return true;
>>>>
>>>> +		if (mas->end < mt_slots[wr_mas->type] - 1)
>>>> +			wr_mas->vacant_height = mas->depth + 1;
>>>
>>> For some cases in rebalance, we may split data into three parts, which means
>>> we need 2 extra vacant slot.
>>>
>>> Maybe this check is not accurate?
>>>
>>
>> The triple split scenario which you are describing comes from the spanning
>> store case not on the wr_rebalance case. There is a check before we set
>> vacant height to return if is_span_wr() so I believe this is correct still.
>>
> 
> Hmm... I come up with a case in which vacant_height may not high enough.
> 
> Here is the subtree where spanning write locates. The first level is the
> parent node of the second level nodes.
> 
>                  vacant node
>                  +--------+-+-+-------+
>                  |        |l|r|       |
>                  +--------+-+-+-------+
> 
>                  l                 r
>      +------+    +----+-------+    +---------+--+    +------+
>      |      |    |    |       |    |         |  |    |      |
>      +------+    +----+-------+    +---------+--+    +------+
>                       ^                      ^
> 		     |                      |
> 		   index                   last
> 
> When mas_wr_walk_descend() to node l, mas_is_span_wr() return true since last
> is in the right sibling, node r. Let's say the parent is vacant and l/r is
> leaf. So the request number is (1 * 3) + 1.
> 
> Let's assume:
> 
>    * vacant node is just sufficient
>    * l's left part + r's right part is sufficient but not overflow
> 
> Then the "new vacant node" would be deficient and needs another round
> rebalance.
> 
> For this case, I am afraid it doesn't allocate enough nodes. Or I
> misunderstand this?

I think you are correct and we need to use the sufficient height which 
is introduced in patch 5 in the spanning store case similar to how patch 
5 uses it for the rebalance store case.

	case wr_rebalance:
		if (wr_mas->sufficient_height < wr_mas->vacant_height) {
			ret = (height - wr_mas->sufficient_height)*2 +1;
			break;
		}
		ret = delta * 2 + 1;
		break;

>
Wei Yang Nov. 19, 2024, 2:30 a.m. UTC | #5
On Mon, Nov 18, 2024 at 11:36:18AM -0500, Sidhartha Kumar wrote:
>On 11/15/24 8:41 PM, Wei Yang wrote:
>> On Fri, Nov 15, 2024 at 03:34:55PM -0500, Sidhartha Kumar wrote:
>> > On 11/15/24 2:52 AM, Wei Yang wrote:
>> > > On Thu, Nov 14, 2024 at 12:05:22PM -0500, Sidhartha Kumar wrote:
>> > > > In order to determine the store type for a maple tree operation, a walk
>> > > > of the tree is done through mas_wr_walk(). This function descends the
>> > > > tree until a spanning write is detected or we reach a leaf node. While
>> > > > descending, keep track of the height at which we encounter a node with
>> > > > available space. This is done by checking if mas->end is less than the
>> > > > number of slots a given node type can fit.
>> > > > 
>> > > > Now that the height of the vacant node is tracked, we can use the
>> > > > difference between the height of the tree and the height of the vacant
>> > > > node to know how many levels we will have to propagate creating new
>> > > > nodes. Update mas_prealloc_calc() to consider the vacant height and
>> > > > reduce the number of worst allocations.
>> > > > 
>> > > > Rebalancing stores are not supported and fall back to using the full
>> > > > height of the tree for allocations.
>> > > > 
>> > > > Update preallocation testing assertions to take into account vacant
>> > > > height.
>> > > > 
>> > > > Signed-off-by: Sidhartha <sidhartha.kumar@oracle.com>
>> > > > ---
>> > > > include/linux/maple_tree.h       |  2 +
>> > > > lib/maple_tree.c                 | 13 +++--
>> > > > tools/testing/radix-tree/maple.c | 97 +++++++++++++++++++++++++++++---
>> > > > 3 files changed, 100 insertions(+), 12 deletions(-)
>> > > > 
>> > > > diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
>> > > > index cbbcd18d4186..7d777aa2d9ed 100644
>> > > > --- a/include/linux/maple_tree.h
>> > > > +++ b/include/linux/maple_tree.h
>> > > > @@ -463,6 +463,7 @@ struct ma_wr_state {
>> > > > 	void __rcu **slots;		/* mas->node->slots pointer */
>> > > > 	void *entry;			/* The entry to write */
>> > > > 	void *content;			/* The existing entry that is being overwritten */
>> > > > +	unsigned char vacant_height;	/* Depth of lowest node with free space */
>> > >                                ^^^           ^^^
>> > > 
>> > > Would this be a little misleading?
>> > > 
>> > 
>> > Could you elaborate on how its misleading?
>> > 
>> 
>> As you mentioned in previous patch, depth and height has different meaning.
>> 
>> Root node has depth of 0 and height of 1. So I may wandering whether this is
>> depth or height.
>> 
>> > > > };
>> > > > 
>> > > > #define mas_lock(mas)           spin_lock(&((mas)->tree->ma_lock))
>> > > > @@ -498,6 +499,7 @@ struct ma_wr_state {
>> > > > 		.mas = ma_state,					\
>> > > > 		.content = NULL,					\
>> > > > 		.entry = wr_entry,					\
>> > > > +		.vacant_height = 0					\
>> > > > 	}
>> > > > 
>> > > > #define MA_TOPIARY(name, tree)						\
>> > > > diff --git a/lib/maple_tree.c b/lib/maple_tree.c
>> > > > index 21289e350382..f14d70c171c2 100644
>> > > > --- a/lib/maple_tree.c
>> > > > +++ b/lib/maple_tree.c
>> > > > @@ -3545,6 +3545,9 @@ static bool mas_wr_walk(struct ma_wr_state *wr_mas)
>> > > > 		if (ma_is_leaf(wr_mas->type))
>> > > > 			return true;
>> > > > 
>> > > > +		if (mas->end < mt_slots[wr_mas->type] - 1)
>> > > > +			wr_mas->vacant_height = mas->depth + 1;
>> > > 
>> > > For some cases in rebalance, we may split data into three parts, which means
>> > > we need 2 extra vacant slot.
>> > > 
>> > > Maybe this check is not accurate?
>> > > 
>> > 
>> > The triple split scenario which you are describing comes from the spanning
>> > store case not on the wr_rebalance case. There is a check before we set
>> > vacant height to return if is_span_wr() so I believe this is correct still.
>> > 
>> 
>> Hmm... I come up with a case in which vacant_height may not high enough.
>> 
>> Here is the subtree where spanning write locates. The first level is the
>> parent node of the second level nodes.
>> 
>>                  vacant node
>>                  +--------+-+-+-------+
>>                  |        |l|r|       |
>>                  +--------+-+-+-------+
>> 
>>                  l                 r
>>      +------+    +----+-------+    +---------+--+    +------+
>>      |      |    |    |       |    |         |  |    |      |
>>      +------+    +----+-------+    +---------+--+    +------+
>>                       ^                      ^
>> 		     |                      |
>> 		   index                   last
>> 
>> When mas_wr_walk_descend() to node l, mas_is_span_wr() return true since last
>> is in the right sibling, node r. Let's say the parent is vacant and l/r is
>> leaf. So the request number is (1 * 3) + 1.
>> 
>> Let's assume:
>> 
>>    * vacant node is just sufficient
>>    * l's left part + r's right part is sufficient but not overflow
>> 
>> Then the "new vacant node" would be deficient and needs another round
>> rebalance.
>> 
>> For this case, I am afraid it doesn't allocate enough nodes. Or I
>> misunderstand this?
>
>I think you are correct and we need to use the sufficient height which is
>introduced in patch 5 in the spanning store case similar to how patch 5 uses
>it for the rebalance store case.
>
>	case wr_rebalance:
>		if (wr_mas->sufficient_height < wr_mas->vacant_height) {
>			ret = (height - wr_mas->sufficient_height)*2 +1;
>			break;
>		}
>		ret = delta * 2 + 1;
>		break;
>

I have thought about this.

But the spanning write case seems to be a little special to
splitting/rebalance.

It could be both require one more extra slot and may need to be rebalanced.
We are not sure the exact behavior on converge node. Only check one aspect
seems not enough.

Do you think so ?
Liam R. Howlett Nov. 19, 2024, 2:15 p.m. UTC | #6
* Wei Yang <richard.weiyang@gmail.com> [241118 21:31]:
> On Mon, Nov 18, 2024 at 11:36:18AM -0500, Sidhartha Kumar wrote:
> >On 11/15/24 8:41 PM, Wei Yang wrote:
> >> On Fri, Nov 15, 2024 at 03:34:55PM -0500, Sidhartha Kumar wrote:
> >> > On 11/15/24 2:52 AM, Wei Yang wrote:
...

> >> 
> >> Hmm... I come up with a case in which vacant_height may not high enough.
> >> 
> >> Here is the subtree where spanning write locates. The first level is the
> >> parent node of the second level nodes.
> >> 
> >>                  vacant node
> >>                  +--------+-+-+-------+
> >>                  |        |l|r|       |
> >>                  +--------+-+-+-------+
> >> 
> >>                  l                 r
> >>      +------+    +----+-------+    +---------+--+    +------+
> >>      |      |    |    |       |    |         |  |    |      |
> >>      +------+    +----+-------+    +---------+--+    +------+
> >>                       ^                      ^
> >> 		     |                      |
> >> 		   index                   last
> >> 
> >> When mas_wr_walk_descend() to node l, mas_is_span_wr() return true since last
> >> is in the right sibling, node r. Let's say the parent is vacant and l/r is
> >> leaf. So the request number is (1 * 3) + 1.
> >> 
> >> Let's assume:
> >> 
> >>    * vacant node is just sufficient
> >>    * l's left part + r's right part is sufficient but not overflow
> >> 
> >> Then the "new vacant node" would be deficient and needs another round
> >> rebalance.
> >> 
> >> For this case, I am afraid it doesn't allocate enough nodes. Or I
> >> misunderstand this?
> >
> >I think you are correct and we need to use the sufficient height which is
> >introduced in patch 5 in the spanning store case similar to how patch 5 uses
> >it for the rebalance store case.
> >
> >	case wr_rebalance:
> >		if (wr_mas->sufficient_height < wr_mas->vacant_height) {
> >			ret = (height - wr_mas->sufficient_height)*2 +1;
> >			break;
> >		}
> >		ret = delta * 2 + 1;
> >		break;
> >
> 
> I have thought about this.
> 
> But the spanning write case seems to be a little special to
> splitting/rebalance.
> 
> It could be both require one more extra slot and may need to be rebalanced.
> We are not sure the exact behavior on converge node. Only check one aspect
> seems not enough.
> 
> Do you think so ?

I'm pretty sure the calculation will need to be different than the
rebalance case.  He was just saying that it's going to need to be like
rebalance, but not the same code.

Let's see what Sid comes up with.
diff mbox series

Patch

diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h
index cbbcd18d4186..7d777aa2d9ed 100644
--- a/include/linux/maple_tree.h
+++ b/include/linux/maple_tree.h
@@ -463,6 +463,7 @@  struct ma_wr_state {
 	void __rcu **slots;		/* mas->node->slots pointer */
 	void *entry;			/* The entry to write */
 	void *content;			/* The existing entry that is being overwritten */
+	unsigned char vacant_height;	/* Depth of lowest node with free space */
 };
 
 #define mas_lock(mas)           spin_lock(&((mas)->tree->ma_lock))
@@ -498,6 +499,7 @@  struct ma_wr_state {
 		.mas = ma_state,					\
 		.content = NULL,					\
 		.entry = wr_entry,					\
+		.vacant_height = 0					\
 	}
 
 #define MA_TOPIARY(name, tree)						\
diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 21289e350382..f14d70c171c2 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -3545,6 +3545,9 @@  static bool mas_wr_walk(struct ma_wr_state *wr_mas)
 		if (ma_is_leaf(wr_mas->type))
 			return true;
 
+		if (mas->end < mt_slots[wr_mas->type] - 1)
+			wr_mas->vacant_height = mas->depth + 1;
+
 		mas_wr_walk_traverse(wr_mas);
 	}
 
@@ -4159,7 +4162,9 @@  static inline void mas_wr_prealloc_setup(struct ma_wr_state *wr_mas)
 static inline int mas_prealloc_calc(struct ma_wr_state *wr_mas, void *entry)
 {
 	struct ma_state *mas = wr_mas->mas;
-	int ret = mas_mt_height(mas) * 3 + 1;
+	unsigned char height = mas_mt_height(mas);
+	int ret = height * 3 + 1;
+	unsigned char delta = height - wr_mas->vacant_height;
 
 	switch (mas->store_type) {
 	case wr_invalid:
@@ -4177,13 +4182,13 @@  static inline int mas_prealloc_calc(struct ma_wr_state *wr_mas, void *entry)
 			ret = 0;
 		break;
 	case wr_spanning_store:
-		ret =  mas_mt_height(mas) * 3 + 1;
+		ret = delta * 3 + 1;
 		break;
 	case wr_split_store:
-		ret =  mas_mt_height(mas) * 2 + 1;
+		ret = delta * 2 + 1;
 		break;
 	case wr_rebalance:
-		ret =  mas_mt_height(mas) * 2 - 1;
+		ret = height * 2 + 1;
 		break;
 	case wr_node_store:
 		ret = mt_in_rcu(mas->tree) ? 1 : 0;
diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c
index bc30050227fd..bc8b107e0177 100644
--- a/tools/testing/radix-tree/maple.c
+++ b/tools/testing/radix-tree/maple.c
@@ -35475,12 +35475,85 @@  static void check_dfs_preorder(struct maple_tree *mt)
 }
 /* End of depth first search tests */
 
+/* same implementation as mas_is_span_wr() in lib/maple_tree.c */
+static bool is_span_wr(struct ma_state *mas, unsigned long r_max,
+				enum maple_type type, void *entry)
+{
+	unsigned long max = r_max;
+	unsigned long last = mas->last;
+
+	/* Contained in this pivot, fast path */
+	if (last < max)
+		return false;
+
+	if (ma_is_leaf(type)) {
+		max = mas->max;
+		if (last < max)
+			return false;
+	}
+
+	if (last == max) {
+		/*
+		 * The last entry of leaf node cannot be NULL unless it is the
+		 * rightmost node (writing ULONG_MAX), otherwise it spans slots.
+		 */
+		if (entry || last == ULONG_MAX)
+			return false;
+	}
+
+	return true;
+}
+
+/* get height of the lowest non-leaf node with free space */
+static unsigned char get_vacant_height(struct ma_state *mas, void *entry)
+{
+	char vacant_height = 0;
+	enum maple_type type;
+	unsigned long *pivots;
+	unsigned long min = 0;
+	unsigned long max = ULONG_MAX;
+
+	/* start traversal */
+	mas_reset(mas);
+	mas_start(mas);
+	if (!xa_is_node(mas_root(mas)))
+		return 0;
+
+	type = mte_node_type(mas->node);
+	while (!ma_is_leaf(type)) {
+		mas_node_walk(mas, mte_to_node(mas->node), type, &min, &max);
+		mas->end = mas_data_end(mas);
+		pivots = ma_pivots(mte_to_node(mas->node), type);
+
+		if (pivots) {
+			if (mas->offset)
+				min = pivots[mas->offset - 1];
+			if (mas->offset < mas->end)
+				max = pivots[mas->offset];
+		}
+
+		/* detect spanning write */
+		if (is_span_wr(mas, max, type, entry))
+			break;
+
+		if (mas->end < mt_slot_count(mas->node) - 1)
+			vacant_height = mas->depth + 1;
+
+		mas_descend(mas);
+		type = mte_node_type(mas->node);
+		mas->depth++;
+	}
+
+	return vacant_height;
+}
+
 /* Preallocation testing */
 static noinline void __init check_prealloc(struct maple_tree *mt)
 {
 	unsigned long i, max = 100;
 	unsigned long allocated;
 	unsigned char height;
+	unsigned char vacant_height;
 	struct maple_node *mn;
 	void *ptr = check_prealloc;
 	MA_STATE(mas, mt, 10, 20);
@@ -35494,8 +35567,9 @@  static noinline void __init check_prealloc(struct maple_tree *mt)
 	MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
 	allocated = mas_allocated(&mas);
 	height = mas_mt_height(&mas);
+	vacant_height = get_vacant_height(&mas, ptr);
 	MT_BUG_ON(mt, allocated == 0);
-	MT_BUG_ON(mt, allocated != 1 + height * 3);
+	MT_BUG_ON(mt, allocated != 1 + (height - vacant_height) * 3);
 	mas_destroy(&mas);
 	allocated = mas_allocated(&mas);
 	MT_BUG_ON(mt, allocated != 0);
@@ -35503,8 +35577,9 @@  static noinline void __init check_prealloc(struct maple_tree *mt)
 	MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
 	allocated = mas_allocated(&mas);
 	height = mas_mt_height(&mas);
+	vacant_height = get_vacant_height(&mas, ptr);
 	MT_BUG_ON(mt, allocated == 0);
-	MT_BUG_ON(mt, allocated != 1 + height * 3);
+	MT_BUG_ON(mt, allocated != 1 + (height - vacant_height) * 3);
 	MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
 	mas_destroy(&mas);
 	allocated = mas_allocated(&mas);
@@ -35514,7 +35589,8 @@  static noinline void __init check_prealloc(struct maple_tree *mt)
 	MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
 	allocated = mas_allocated(&mas);
 	height = mas_mt_height(&mas);
-	MT_BUG_ON(mt, allocated != 1 + height * 3);
+	vacant_height = get_vacant_height(&mas, ptr);
+	MT_BUG_ON(mt, allocated != 1 + (height - vacant_height) * 3);
 	mn = mas_pop_node(&mas);
 	MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1);
 	mn->parent = ma_parent_ptr(mn);
@@ -35527,7 +35603,8 @@  static noinline void __init check_prealloc(struct maple_tree *mt)
 	MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
 	allocated = mas_allocated(&mas);
 	height = mas_mt_height(&mas);
-	MT_BUG_ON(mt, allocated != 1 + height * 3);
+	vacant_height = get_vacant_height(&mas, ptr);
+	MT_BUG_ON(mt, allocated != 1 + (height - vacant_height) * 3);
 	mn = mas_pop_node(&mas);
 	MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1);
 	MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
@@ -35540,7 +35617,8 @@  static noinline void __init check_prealloc(struct maple_tree *mt)
 	MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
 	allocated = mas_allocated(&mas);
 	height = mas_mt_height(&mas);
-	MT_BUG_ON(mt, allocated != 1 + height * 3);
+	vacant_height = get_vacant_height(&mas, ptr);
+	MT_BUG_ON(mt, allocated != 1 + (height - vacant_height) * 3);
 	mn = mas_pop_node(&mas);
 	MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1);
 	mas_push_node(&mas, mn);
@@ -35553,7 +35631,8 @@  static noinline void __init check_prealloc(struct maple_tree *mt)
 	MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
 	allocated = mas_allocated(&mas);
 	height = mas_mt_height(&mas);
-	MT_BUG_ON(mt, allocated != 1 + height * 3);
+	vacant_height = get_vacant_height(&mas, ptr);
+	MT_BUG_ON(mt, allocated != 1 + (height - vacant_height) * 3);
 	mas_store_prealloc(&mas, ptr);
 	MT_BUG_ON(mt, mas_allocated(&mas) != 0);
 
@@ -35578,7 +35657,8 @@  static noinline void __init check_prealloc(struct maple_tree *mt)
 	MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
 	allocated = mas_allocated(&mas);
 	height = mas_mt_height(&mas);
-	MT_BUG_ON(mt, allocated != 1 + height * 2);
+	vacant_height = get_vacant_height(&mas, ptr);
+	MT_BUG_ON(mt, allocated != 1 + (height - vacant_height) * 2);
 	mas_store_prealloc(&mas, ptr);
 	MT_BUG_ON(mt, mas_allocated(&mas) != 0);
 	mt_set_non_kernel(1);
@@ -35595,8 +35675,9 @@  static noinline void __init check_prealloc(struct maple_tree *mt)
 	MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0);
 	allocated = mas_allocated(&mas);
 	height = mas_mt_height(&mas);
+	vacant_height = get_vacant_height(&mas, ptr);
 	MT_BUG_ON(mt, allocated == 0);
-	MT_BUG_ON(mt, allocated != 1 + height * 3);
+	MT_BUG_ON(mt, allocated != 1 + (height - vacant_height) * 3);
 	mas_store_prealloc(&mas, ptr);
 	MT_BUG_ON(mt, mas_allocated(&mas) != 0);
 	mas_set_range(&mas, 0, 200);