diff mbox series

station: fix reentrant FT wiphy radio work insertion

Message ID 20230501220604.366100-1-prestwoj@gmail.com (mailing list archive)
State New
Headers show
Series station: fix reentrant FT wiphy radio work insertion | expand

Checks

Context Check Description
tedd_an/pre-ci_am fail error: patch failed: src/station.c:2300 error: src/station.c: patch does not apply hint: Use 'git am --show-current-patch' to see the failed patch

Commit Message

James Prestwood May 1, 2023, 10:06 p.m. UTC
The failure path for FT was inserting the same work item inside
the 'do_work' callback. This caused duplicate work items to be
inside the radio work queue, and upon destroy the ID would get
zero'ed out. This caused a host of valgrind complaints when this
behavior is triggered.

This can be seen with slight modifiations to the FT test:

src/station.c:station_try_next_transition() 9, target 12:00:00:00:00:02
src/wiphy.c:wiphy_radio_work_insert() Inserting work item 6
src/wiphy.c:wiphy_radio_work_insert() Inserting work item 7
src/wiphy.c:wiphy_radio_work_done() Work item 5 done
src/wiphy.c:wiphy_radio_work_next() Starting work item 6
src/netdev.c:netdev_mlme_notify() MLME notification Remain on Channel(55)
src/ft.c:ft_send_authenticate()
src/netdev.c:netdev_mlme_notify() MLME notification Frame TX Status(60)
src/netdev.c:netdev_mlme_notify() MLME notification Frame Wait Cancel(67)
src/netdev.c:netdev_mlme_notify() MLME notification Cancel Remain on Channel(56)
src/wiphy.c:wiphy_radio_work_done() Work item 6 done

--- Work item 7 is 'station_ft_work_ready()' ---

src/wiphy.c:wiphy_radio_work_next() Starting work item 7
src/station.c:station_debug_event() StationDebug.Event(ft-roam-failed)
src/station.c:station_try_next_transition() 9, target 12:00:00:00:00:03
src/wiphy.c:wiphy_radio_work_insert() Inserting work item 8

--- Here we insert work item 9, which is a duplicate of 7 ---

src/wiphy.c:wiphy_radio_work_insert() Inserting work item 9
src/wiphy.c:wiphy_radio_work_next() Starting work item 8
src/netdev.c:netdev_mlme_notify() MLME notification Remain on Channel(55)
src/ft.c:ft_send_authenticate()
src/netdev.c:netdev_mlme_notify() MLME notification Frame TX Status(60)
src/netdev.c:netdev_mlme_notify() MLME notification Frame Wait Cancel(67)
src/netdev.c:netdev_mlme_notify() MLME notification Cancel Remain on Channel(56)
src/wiphy.c:wiphy_radio_work_done() Work item 8 done

--- And finally, the duplicated work item starts ---

src/wiphy.c:wiphy_radio_work_next() Starting work item 0
src/station.c:station_debug_event() StationDebug.Event(ft-roam-failed)
src/station.c:station_roam_failed() 9
src/station.c:station_roam_trigger_cb() 9
---
 src/station.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

Comments

Denis Kenzior May 8, 2023, 5:56 p.m. UTC | #1
Hi James,

On 5/1/23 17:06, James Prestwood wrote:
> The failure path for FT was inserting the same work item inside
> the 'do_work' callback. This caused duplicate work items to be
> inside the radio work queue, and upon destroy the ID would get
> zero'ed out. This caused a host of valgrind complaints when this
> behavior is triggered.
> 
> This can be seen with slight modifiations to the FT test:

Typo here, 'modifiations'

> 
> src/station.c:station_try_next_transition() 9, target 12:00:00:00:00:02
> src/wiphy.c:wiphy_radio_work_insert() Inserting work item 6
> src/wiphy.c:wiphy_radio_work_insert() Inserting work item 7
> src/wiphy.c:wiphy_radio_work_done() Work item 5 done
> src/wiphy.c:wiphy_radio_work_next() Starting work item 6
> src/netdev.c:netdev_mlme_notify() MLME notification Remain on Channel(55)
> src/ft.c:ft_send_authenticate()
> src/netdev.c:netdev_mlme_notify() MLME notification Frame TX Status(60)
> src/netdev.c:netdev_mlme_notify() MLME notification Frame Wait Cancel(67)
> src/netdev.c:netdev_mlme_notify() MLME notification Cancel Remain on Channel(56)
> src/wiphy.c:wiphy_radio_work_done() Work item 6 done
> 
> --- Work item 7 is 'station_ft_work_ready()' ---
> 
> src/wiphy.c:wiphy_radio_work_next() Starting work item 7
> src/station.c:station_debug_event() StationDebug.Event(ft-roam-failed)
> src/station.c:station_try_next_transition() 9, target 12:00:00:00:00:03
> src/wiphy.c:wiphy_radio_work_insert() Inserting work item 8
> 
> --- Here we insert work item 9, which is a duplicate of 7 ---
> 

Duplicate in what way? We're going to a different BSS?  Or do you mean that we 
overwrite station->ft_work because we queue the same work item in the work queue 
callback?

> src/wiphy.c:wiphy_radio_work_insert() Inserting work item 9
> src/wiphy.c:wiphy_radio_work_next() Starting work item 8
> src/netdev.c:netdev_mlme_notify() MLME notification Remain on Channel(55)
> src/ft.c:ft_send_authenticate()
> src/netdev.c:netdev_mlme_notify() MLME notification Frame TX Status(60)
> src/netdev.c:netdev_mlme_notify() MLME notification Frame Wait Cancel(67)
> src/netdev.c:netdev_mlme_notify() MLME notification Cancel Remain on Channel(56)
> src/wiphy.c:wiphy_radio_work_done() Work item 8 done
> 
> --- And finally, the duplicated work item starts ---
> 
> src/wiphy.c:wiphy_radio_work_next() Starting work item 0
> src/station.c:station_debug_event() StationDebug.Event(ft-roam-failed)
> src/station.c:station_roam_failed() 9
> src/station.c:station_roam_trigger_cb() 9
> ---
>   src/station.c | 19 ++++++++++++++++++-
>   1 file changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/src/station.c b/src/station.c
> index 0d8f7628..3c6e43ce 100644
> --- a/src/station.c
> +++ b/src/station.c
> @@ -122,6 +122,7 @@ struct station {
>   
>   	uint32_t wiphy_watch;
>   
> +	struct l_idle *ft_idle;
>   	struct wiphy_radio_work_item ft_work;
>   
>   	bool preparing_roam : 1;
> @@ -1739,6 +1740,11 @@ static void station_roam_state_clear(struct station *station)
>   		station->roam_freqs = NULL;
>   	}
>   
> +	if (station->ft_idle) {
> +		l_idle_remove(station->ft_idle);
> +		station->ft_idle = NULL;
> +	}
> +
>   	l_queue_clear(station->roam_bss_list, l_free);
>   
>   	ft_clear_authentications(netdev_get_ifindex(station->netdev));
> @@ -2282,6 +2288,16 @@ static void station_preauthenticate_cb(struct netdev *netdev,
>   
>   static void station_transition_start(struct station *station);
>   
> +static void station_transition_idle(struct l_idle *idle, void *user_data)
> +{
> +	struct station *station = user_data;
> +
> +	l_idle_remove(station->ft_idle);
> +	station->ft_idle = NULL;
> +
> +	station_transition_start(station);
> +}
> +
>   static bool station_ft_work_ready(struct wiphy_radio_work_item *item)
>   {
>   	struct station *station = l_container_of(item, struct station, ft_work);
> @@ -2300,7 +2316,8 @@ static bool station_ft_work_ready(struct wiphy_radio_work_item *item)
>   	if (ret == -ENOENT) {
>   		station_debug_event(station, "ft-roam-failed");
>   try_next:
> -		station_transition_start(station);
> +		station->ft_idle = l_idle_create(station_transition_idle,
> +							station, NULL);
>   		return true;
>   	} else if (ret < 0)
>   		goto assoc_failed;

The patch makes sense to me.  However, I do wonder whether we can take care of 
this in the wiphy work queue itself?  Maybe by adding wiphy_work_reschedule() or 
something to that effect?  I would hate to keep doing this l_idle business every 
time we need to do something similar.

Regards,
-Denis
James Prestwood May 8, 2023, 6:10 p.m. UTC | #2
Hi Denis,

On 5/8/23 10:56 AM, Denis Kenzior wrote:
> Hi James,
> 
> On 5/1/23 17:06, James Prestwood wrote:
>> The failure path for FT was inserting the same work item inside
>> the 'do_work' callback. This caused duplicate work items to be
>> inside the radio work queue, and upon destroy the ID would get
>> zero'ed out. This caused a host of valgrind complaints when this
>> behavior is triggered.
>>
>> This can be seen with slight modifiations to the FT test:
> 
> Typo here, 'modifiations'
> 
>>
>> src/station.c:station_try_next_transition() 9, target 12:00:00:00:00:02
>> src/wiphy.c:wiphy_radio_work_insert() Inserting work item 6
>> src/wiphy.c:wiphy_radio_work_insert() Inserting work item 7
>> src/wiphy.c:wiphy_radio_work_done() Work item 5 done
>> src/wiphy.c:wiphy_radio_work_next() Starting work item 6
>> src/netdev.c:netdev_mlme_notify() MLME notification Remain on Channel(55)
>> src/ft.c:ft_send_authenticate()
>> src/netdev.c:netdev_mlme_notify() MLME notification Frame TX Status(60)
>> src/netdev.c:netdev_mlme_notify() MLME notification Frame Wait Cancel(67)
>> src/netdev.c:netdev_mlme_notify() MLME notification Cancel Remain on 
>> Channel(56)
>> src/wiphy.c:wiphy_radio_work_done() Work item 6 done
>>
>> --- Work item 7 is 'station_ft_work_ready()' ---
>>
>> src/wiphy.c:wiphy_radio_work_next() Starting work item 7
>> src/station.c:station_debug_event() StationDebug.Event(ft-roam-failed)
>> src/station.c:station_try_next_transition() 9, target 12:00:00:00:00:03
>> src/wiphy.c:wiphy_radio_work_insert() Inserting work item 8
>>
>> --- Here we insert work item 9, which is a duplicate of 7 ---
>>
> 
> Duplicate in what way? We're going to a different BSS?  Or do you mean 
> that we overwrite station->ft_work because we queue the same work item 
> in the work queue callback?

That we are using the same ft_work structure which already exists in the 
queue. So when it gets cleaned up it modifies both queue items.

> 
>> src/wiphy.c:wiphy_radio_work_insert() Inserting work item 9
>> src/wiphy.c:wiphy_radio_work_next() Starting work item 8
>> src/netdev.c:netdev_mlme_notify() MLME notification Remain on Channel(55)
>> src/ft.c:ft_send_authenticate()
>> src/netdev.c:netdev_mlme_notify() MLME notification Frame TX Status(60)
>> src/netdev.c:netdev_mlme_notify() MLME notification Frame Wait Cancel(67)
>> src/netdev.c:netdev_mlme_notify() MLME notification Cancel Remain on 
>> Channel(56)
>> src/wiphy.c:wiphy_radio_work_done() Work item 8 done
>>
>> --- And finally, the duplicated work item starts ---
>>
>> src/wiphy.c:wiphy_radio_work_next() Starting work item 0
>> src/station.c:station_debug_event() StationDebug.Event(ft-roam-failed)
>> src/station.c:station_roam_failed() 9
>> src/station.c:station_roam_trigger_cb() 9
>> ---
>>   src/station.c | 19 ++++++++++++++++++-
>>   1 file changed, 18 insertions(+), 1 deletion(-)
>>
>> diff --git a/src/station.c b/src/station.c
>> index 0d8f7628..3c6e43ce 100644
>> --- a/src/station.c
>> +++ b/src/station.c
>> @@ -122,6 +122,7 @@ struct station {
>>       uint32_t wiphy_watch;
>> +    struct l_idle *ft_idle;
>>       struct wiphy_radio_work_item ft_work;
>>       bool preparing_roam : 1;
>> @@ -1739,6 +1740,11 @@ static void station_roam_state_clear(struct 
>> station *station)
>>           station->roam_freqs = NULL;
>>       }
>> +    if (station->ft_idle) {
>> +        l_idle_remove(station->ft_idle);
>> +        station->ft_idle = NULL;
>> +    }
>> +
>>       l_queue_clear(station->roam_bss_list, l_free);
>>       ft_clear_authentications(netdev_get_ifindex(station->netdev));
>> @@ -2282,6 +2288,16 @@ static void station_preauthenticate_cb(struct 
>> netdev *netdev,
>>   static void station_transition_start(struct station *station);
>> +static void station_transition_idle(struct l_idle *idle, void 
>> *user_data)
>> +{
>> +    struct station *station = user_data;
>> +
>> +    l_idle_remove(station->ft_idle);
>> +    station->ft_idle = NULL;
>> +
>> +    station_transition_start(station);
>> +}
>> +
>>   static bool station_ft_work_ready(struct wiphy_radio_work_item *item)
>>   {
>>       struct station *station = l_container_of(item, struct station, 
>> ft_work);
>> @@ -2300,7 +2316,8 @@ static bool station_ft_work_ready(struct 
>> wiphy_radio_work_item *item)
>>       if (ret == -ENOENT) {
>>           station_debug_event(station, "ft-roam-failed");
>>   try_next:
>> -        station_transition_start(station);
>> +        station->ft_idle = l_idle_create(station_transition_idle,
>> +                            station, NULL);
>>           return true;
>>       } else if (ret < 0)
>>           goto assoc_failed;
> 
> The patch makes sense to me.  However, I do wonder whether we can take 
> care of this in the wiphy work queue itself?  Maybe by adding 
> wiphy_work_reschedule() or something to that effect?  I would hate to 
> keep doing this l_idle business every time we need to do something similar.

I was thinking that, maybe adding an API would work. I'll see if that is 
possible. Basically the problem I ran into trying to handle it 
automatically was the fact that the work item memory is provided by the 
caller, so when you change one it changes both.

> 
> Regards,
> -Denis
diff mbox series

Patch

diff --git a/src/station.c b/src/station.c
index 0d8f7628..3c6e43ce 100644
--- a/src/station.c
+++ b/src/station.c
@@ -122,6 +122,7 @@  struct station {
 
 	uint32_t wiphy_watch;
 
+	struct l_idle *ft_idle;
 	struct wiphy_radio_work_item ft_work;
 
 	bool preparing_roam : 1;
@@ -1739,6 +1740,11 @@  static void station_roam_state_clear(struct station *station)
 		station->roam_freqs = NULL;
 	}
 
+	if (station->ft_idle) {
+		l_idle_remove(station->ft_idle);
+		station->ft_idle = NULL;
+	}
+
 	l_queue_clear(station->roam_bss_list, l_free);
 
 	ft_clear_authentications(netdev_get_ifindex(station->netdev));
@@ -2282,6 +2288,16 @@  static void station_preauthenticate_cb(struct netdev *netdev,
 
 static void station_transition_start(struct station *station);
 
+static void station_transition_idle(struct l_idle *idle, void *user_data)
+{
+	struct station *station = user_data;
+
+	l_idle_remove(station->ft_idle);
+	station->ft_idle = NULL;
+
+	station_transition_start(station);
+}
+
 static bool station_ft_work_ready(struct wiphy_radio_work_item *item)
 {
 	struct station *station = l_container_of(item, struct station, ft_work);
@@ -2300,7 +2316,8 @@  static bool station_ft_work_ready(struct wiphy_radio_work_item *item)
 	if (ret == -ENOENT) {
 		station_debug_event(station, "ft-roam-failed");
 try_next:
-		station_transition_start(station);
+		station->ft_idle = l_idle_create(station_transition_idle,
+							station, NULL);
 		return true;
 	} else if (ret < 0)
 		goto assoc_failed;