diff mbox series

[dwarves] dwarf_loader: fix termination on BTF encoding error

Message ID 20250328174003.3945581-1-ihor.solodrai@linux.dev (mailing list archive)
State Not Applicable
Headers show
Series [dwarves] dwarf_loader: fix termination on BTF encoding error | expand

Checks

Context Check Description
netdev/tree_selection success Not a local patch

Commit Message

Ihor Solodrai March 28, 2025, 5:40 p.m. UTC
When BTF encoding thread aborts because of an error, dwarf loader
worker threads get stuck in cus_queue__enqdeq_job() at:

    pthread_cond_wait(&cus_processing_queue.job_added, &cus_processing_queue.mutex);

To avoid this, introduce an abort flag into cus_processing_queue, and
atomically check for it in the deq loop. The flag is only set in case
of a worker thread exiting on error. Make sure to pthread_cond_signal
to the waiting threads to let them exit too.

In cus__process_file fix the check of an error returned from
dwfl_getmodules: it may return a positive number when a
callback (cus__process_dwflmod in our case) returns an error.

Link: https://lore.kernel.org/dwarves/Z-JzFrXaopQCYd6h@localhost/

Reported-by: Domenico Andreoli <domenico.andreoli@linux.com>
Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
---
 dwarf_loader.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

Comments

Alan Maguire April 1, 2025, 12:57 p.m. UTC | #1
On 28/03/2025 17:40, Ihor Solodrai wrote:
> When BTF encoding thread aborts because of an error, dwarf loader
> worker threads get stuck in cus_queue__enqdeq_job() at:
> 
>     pthread_cond_wait(&cus_processing_queue.job_added, &cus_processing_queue.mutex);
> 
> To avoid this, introduce an abort flag into cus_processing_queue, and
> atomically check for it in the deq loop. The flag is only set in case
> of a worker thread exiting on error. Make sure to pthread_cond_signal
> to the waiting threads to let them exit too.
> 
> In cus__process_file fix the check of an error returned from
> dwfl_getmodules: it may return a positive number when a
> callback (cus__process_dwflmod in our case) returns an error.
> 
> Link: https://lore.kernel.org/dwarves/Z-JzFrXaopQCYd6h@localhost/
> 
> Reported-by: Domenico Andreoli <domenico.andreoli@linux.com>
> Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>

Thanks for the fix! I've tested this with the problematic module+vmlinux
BTF and the previously-hanging pahole goes on to fail as expected; also
run it through the work-in-progress CI, building and testing on x86_64
and aarch64, no issues found. If anyone else has a chance to ack or test
it, that would be great. Thanks!

Alan

> ---
>  dwarf_loader.c | 12 ++++++++++--
>  1 file changed, 10 insertions(+), 2 deletions(-)
> 
> diff --git a/dwarf_loader.c b/dwarf_loader.c
> index 84122d0..e1ba7bc 100644
> --- a/dwarf_loader.c
> +++ b/dwarf_loader.c
> @@ -3459,6 +3459,7 @@ static struct {
>  	 */
>  	uint32_t next_cu_id;
>  	struct list_head jobs;
> +	bool abort;
>  } cus_processing_queue;
>  
>  enum job_type {
> @@ -3479,6 +3480,7 @@ static void cus_queue__init(void)
>  	pthread_cond_init(&cus_processing_queue.job_added, NULL);
>  	INIT_LIST_HEAD(&cus_processing_queue.jobs);
>  	cus_processing_queue.next_cu_id = 0;
> +	cus_processing_queue.abort = false;
>  }
>  
>  static void cus_queue__destroy(void)
> @@ -3535,8 +3537,9 @@ static struct cu_processing_job *cus_queue__enqdeq_job(struct cu_processing_job
>  		pthread_cond_signal(&cus_processing_queue.job_added);
>  	}
>  	for (;;) {
> +		bool abort = __atomic_load_n(&cus_processing_queue.abort, __ATOMIC_SEQ_CST);
>  		job = cus_queue__try_dequeue();
> -		if (job)
> +		if (job || abort)
>  			break;
>  		/* No jobs or only steals out of order */
>  		pthread_cond_wait(&cus_processing_queue.job_added, &cus_processing_queue.mutex);
> @@ -3653,6 +3656,9 @@ static void *dwarf_loader__worker_thread(void *arg)
>  
>  	while (!stop) {
>  		job = cus_queue__enqdeq_job(job);
> +		if (!job)
> +			goto out_abort;
> +
>  		switch (job->type) {
>  
>  		case JOB_DECODE:
> @@ -3688,6 +3694,8 @@ static void *dwarf_loader__worker_thread(void *arg)
>  
>  	return (void *)DWARF_CB_OK;
>  out_abort:
> +	__atomic_store_n(&cus_processing_queue.abort, true, __ATOMIC_SEQ_CST);
> +	pthread_cond_signal(&cus_processing_queue.job_added);
>  	return (void *)DWARF_CB_ABORT;
>  }
>  
> @@ -4028,7 +4036,7 @@ static int cus__process_file(struct cus *cus, struct conf_load *conf, int fd,
>  
>  	/* Process the one or more modules gleaned from this file. */
>  	int err = dwfl_getmodules(dwfl, cus__process_dwflmod, &parms, 0);
> -	if (err < 0)
> +	if (err)
>  		return -1;
>  
>  	// We can't call dwfl_end(dwfl) here, as we keep pointers to strings
Domenico Andreoli April 1, 2025, 1:43 p.m. UTC | #2
On Tue, Apr 01, 2025 at 01:57:25PM +0100, Alan Maguire wrote:
> On 28/03/2025 17:40, Ihor Solodrai wrote:
> > When BTF encoding thread aborts because of an error, dwarf loader
> > worker threads get stuck in cus_queue__enqdeq_job() at:
> > 
> >     pthread_cond_wait(&cus_processing_queue.job_added, &cus_processing_queue.mutex);
> > 
> > To avoid this, introduce an abort flag into cus_processing_queue, and
> > atomically check for it in the deq loop. The flag is only set in case
> > of a worker thread exiting on error. Make sure to pthread_cond_signal
> > to the waiting threads to let them exit too.
> > 
> > In cus__process_file fix the check of an error returned from
> > dwfl_getmodules: it may return a positive number when a
> > callback (cus__process_dwflmod in our case) returns an error.
> > 
> > Link: https://lore.kernel.org/dwarves/Z-JzFrXaopQCYd6h@localhost/
> > 
> > Reported-by: Domenico Andreoli <domenico.andreoli@linux.com>
> > Signed-off-by: Ihor Solodrai <ihor.solodrai@linux.dev>
> 
> Thanks for the fix! I've tested this with the problematic module+vmlinux
> BTF and the previously-hanging pahole goes on to fail as expected; also
> run it through the work-in-progress CI, building and testing on x86_64
> and aarch64, no issues found. If anyone else has a chance to ack or test
> it, that would be great. Thanks!

Tested-by: Domenico Andreoli <domenico.andreoli@linux.com>

I rebuilt the Debian package with that patch applied and it then started
to fail consistently because of the extra c++ symbols.

When I use the switch --lang_exclude=rust,c++11, it works without
errors.

Thank you Alan and Ihor for the fast support!

Dom

> 
> Alan
> 
> > ---
> >  dwarf_loader.c | 12 ++++++++++--
> >  1 file changed, 10 insertions(+), 2 deletions(-)
> > 
> > diff --git a/dwarf_loader.c b/dwarf_loader.c
> > index 84122d0..e1ba7bc 100644
> > --- a/dwarf_loader.c
> > +++ b/dwarf_loader.c
> > @@ -3459,6 +3459,7 @@ static struct {
> >  	 */
> >  	uint32_t next_cu_id;
> >  	struct list_head jobs;
> > +	bool abort;
> >  } cus_processing_queue;
> >  
> >  enum job_type {
> > @@ -3479,6 +3480,7 @@ static void cus_queue__init(void)
> >  	pthread_cond_init(&cus_processing_queue.job_added, NULL);
> >  	INIT_LIST_HEAD(&cus_processing_queue.jobs);
> >  	cus_processing_queue.next_cu_id = 0;
> > +	cus_processing_queue.abort = false;
> >  }
> >  
> >  static void cus_queue__destroy(void)
> > @@ -3535,8 +3537,9 @@ static struct cu_processing_job *cus_queue__enqdeq_job(struct cu_processing_job
> >  		pthread_cond_signal(&cus_processing_queue.job_added);
> >  	}
> >  	for (;;) {
> > +		bool abort = __atomic_load_n(&cus_processing_queue.abort, __ATOMIC_SEQ_CST);
> >  		job = cus_queue__try_dequeue();
> > -		if (job)
> > +		if (job || abort)
> >  			break;
> >  		/* No jobs or only steals out of order */
> >  		pthread_cond_wait(&cus_processing_queue.job_added, &cus_processing_queue.mutex);
> > @@ -3653,6 +3656,9 @@ static void *dwarf_loader__worker_thread(void *arg)
> >  
> >  	while (!stop) {
> >  		job = cus_queue__enqdeq_job(job);
> > +		if (!job)
> > +			goto out_abort;
> > +
> >  		switch (job->type) {
> >  
> >  		case JOB_DECODE:
> > @@ -3688,6 +3694,8 @@ static void *dwarf_loader__worker_thread(void *arg)
> >  
> >  	return (void *)DWARF_CB_OK;
> >  out_abort:
> > +	__atomic_store_n(&cus_processing_queue.abort, true, __ATOMIC_SEQ_CST);
> > +	pthread_cond_signal(&cus_processing_queue.job_added);
> >  	return (void *)DWARF_CB_ABORT;
> >  }
> >  
> > @@ -4028,7 +4036,7 @@ static int cus__process_file(struct cus *cus, struct conf_load *conf, int fd,
> >  
> >  	/* Process the one or more modules gleaned from this file. */
> >  	int err = dwfl_getmodules(dwfl, cus__process_dwflmod, &parms, 0);
> > -	if (err < 0)
> > +	if (err)
> >  		return -1;
> >  
> >  	// We can't call dwfl_end(dwfl) here, as we keep pointers to strings
> 
>
diff mbox series

Patch

diff --git a/dwarf_loader.c b/dwarf_loader.c
index 84122d0..e1ba7bc 100644
--- a/dwarf_loader.c
+++ b/dwarf_loader.c
@@ -3459,6 +3459,7 @@  static struct {
 	 */
 	uint32_t next_cu_id;
 	struct list_head jobs;
+	bool abort;
 } cus_processing_queue;
 
 enum job_type {
@@ -3479,6 +3480,7 @@  static void cus_queue__init(void)
 	pthread_cond_init(&cus_processing_queue.job_added, NULL);
 	INIT_LIST_HEAD(&cus_processing_queue.jobs);
 	cus_processing_queue.next_cu_id = 0;
+	cus_processing_queue.abort = false;
 }
 
 static void cus_queue__destroy(void)
@@ -3535,8 +3537,9 @@  static struct cu_processing_job *cus_queue__enqdeq_job(struct cu_processing_job
 		pthread_cond_signal(&cus_processing_queue.job_added);
 	}
 	for (;;) {
+		bool abort = __atomic_load_n(&cus_processing_queue.abort, __ATOMIC_SEQ_CST);
 		job = cus_queue__try_dequeue();
-		if (job)
+		if (job || abort)
 			break;
 		/* No jobs or only steals out of order */
 		pthread_cond_wait(&cus_processing_queue.job_added, &cus_processing_queue.mutex);
@@ -3653,6 +3656,9 @@  static void *dwarf_loader__worker_thread(void *arg)
 
 	while (!stop) {
 		job = cus_queue__enqdeq_job(job);
+		if (!job)
+			goto out_abort;
+
 		switch (job->type) {
 
 		case JOB_DECODE:
@@ -3688,6 +3694,8 @@  static void *dwarf_loader__worker_thread(void *arg)
 
 	return (void *)DWARF_CB_OK;
 out_abort:
+	__atomic_store_n(&cus_processing_queue.abort, true, __ATOMIC_SEQ_CST);
+	pthread_cond_signal(&cus_processing_queue.job_added);
 	return (void *)DWARF_CB_ABORT;
 }
 
@@ -4028,7 +4036,7 @@  static int cus__process_file(struct cus *cus, struct conf_load *conf, int fd,
 
 	/* Process the one or more modules gleaned from this file. */
 	int err = dwfl_getmodules(dwfl, cus__process_dwflmod, &parms, 0);
-	if (err < 0)
+	if (err)
 		return -1;
 
 	// We can't call dwfl_end(dwfl) here, as we keep pointers to strings