diff mbox series

[RESEND,v2] kernel-shark: Multi-thread the computation of stream/combo plots

Message ID 20250314220719.1065523-1-libo.chen@oracle.com (mailing list archive)
State New
Headers show
Series [RESEND,v2] kernel-shark: Multi-thread the computation of stream/combo plots | expand

Commit Message

Libo Chen March 14, 2025, 10:07 p.m. UTC
Parallelize _newCPUGraph() and _newTaskGraph() calls to dramatically
speed up graph rendering particularly for traces from very large systems.

OpenMP is technically a new dependency here, but it is part of GCC: as long
as your GCC is >= v4.9, the bundled libgomp library will let the code compile.

Signed-off-by: Libo Chen <libo.chen@oracle.com>
---
 CMakeLists.txt     |  6 ++++++
 src/KsGLWidget.cpp | 25 +++++++++++++++++++++++--
 2 files changed, 29 insertions(+), 2 deletions(-)

Comments

Yordan Karadzhov March 23, 2025, 4:01 p.m. UTC | #1
Hi Libo,
Please see my comments below.

On 3/15/25 00:07, Libo Chen wrote:
> Parallelize _newCPUGraph() and _newTaskGraph() calls to dramatically
> speed up graph rendering particularly for traces from very large systems.
> 
> OpenMP technically is a new dependency here, but it's part of GCC, so long
> as your GCC >= v4.9, the libgomp library will make the code compiled.
> 
> Signed-off-by: Libo Chen <libo.chen@oracle.com>
> ---
>   CMakeLists.txt     |  6 ++++++
>   src/KsGLWidget.cpp | 25 +++++++++++++++++++++++--
>   2 files changed, 29 insertions(+), 2 deletions(-)
> 
> diff --git a/CMakeLists.txt b/CMakeLists.txt
> index 988bfd6..7847177 100644
> --- a/CMakeLists.txt
> +++ b/CMakeLists.txt
> @@ -84,6 +84,12 @@ set(EXECUTABLE_OUTPUT_PATH "${KS_DIR}/bin")
>   set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")
>   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")
>   
> +find_package(OpenMP 3.2.5)
> +if (OPENMP_FOUND)
> +    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   ${OpenMP_C_FLAGS}")
> +    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
> +endif(OPENMP_FOUND)
> +
>   set(CMAKE_CXX_STANDARD 17)
>   set(CMAKE_CXX_STANDARD_REQUIRED ON)
>   set(CMAKE_CXX_EXTENSIONS OFF)
> diff --git a/src/KsGLWidget.cpp b/src/KsGLWidget.cpp
> index 9311d98..004d64b 100644
> --- a/src/KsGLWidget.cpp
> +++ b/src/KsGLWidget.cpp
> @@ -13,6 +13,9 @@
>   #include <GL/glut.h>
>   #include <GL/gl.h>
>   
> +// OpenMP
> +#include <omp.h>
> +
>   // KernelShark
>   #include "libkshark-plugin.h"
>   #include "KsGLWidget.hpp"
> @@ -688,25 +691,43 @@ void KsGLWidget::_makeGraphs()
>   		return graph;
>   	};
>   
> +	omp_set_num_threads(omp_get_num_procs());
I think I already asked you to check if it is possible to move this to 
the constructor of the widget so that it is called just once. If there 
is some reason why this is not possible, at least provide some explanation.

>   	for (auto it = _streamPlots.begin(); it != _streamPlots.end(); ++it) {
>   		sd = it.key();
> +		QVector<KsPlot::Graph *> cpuGraphs(it.value()._cpuList.count());
> +		QVector<KsPlot::Graph *> taskGraphs(it.value()._taskList.count());
> +
>   		/* Create CPU graphs according to the cpuList. */
>   		it.value()._cpuGraphs = {};
> +		#pragma omp parallel for
>   		for (auto const &cpu: it.value()._cpuList) {
> -			g = lamAddGraph(sd, _newCPUGraph(sd, cpu), _vSpacing);
> +			int idx = it.value()._cpuList.indexOf(cpu);

Maybe I do not understand what you want to do here, but this looks 
over-complicated to me. Isn't it equivalent to having simply

		for (size_t idx = 0; idx < nCpus; ++idx) {
	 		int cpu = it.value()._cpuList[idx];

The same comment applies for the other loop below.

> +			cpuGraphs[idx] = _newCPUGraph(sd, cpu);
> +		}
> +		QVectorIterator<KsPlot::Graph *> itCpuGraphs(cpuGraphs);
> +		while (itCpuGraphs.hasNext()) {
> +			g = lamAddGraph(sd, itCpuGraphs.next(), _vSpacing);
>   			it.value()._cpuGraphs.append(g);
>   		}
>   
>   		/* Create Task graphs according to the taskList. */
>   		it.value()._taskGraphs = {};
> +		#pragma omp parallel for
>   		for (auto const &pid: it.value()._taskList) {
> -			g = lamAddGraph(sd, _newTaskGraph(sd, pid), _vSpacing);
> +			int idx = it.value()._taskList.indexOf(pid);
> +			taskGraphs[idx] = _newTaskGraph(sd, pid);
> +		}
> +		QVectorIterator<KsPlot::Graph *> itTaskGraphs(taskGraphs);
> +		while (itTaskGraphs.hasNext()) {
> +			g = lamAddGraph(sd, itTaskGraphs.next(), _vSpacing);
>   			it.value()._taskGraphs.append(g);
>   		}
> +
Please remove this empty line.

Besides those minor things, the patch looks good to me. Please address
the comments and I will be happy to apply your patch.

Thanks for helping us improve KernelShark!

Cheers,
Yordan
>   	}
>   
>   	for (auto &c: _comboPlots) {
>   		int n = c.count();
> +		#pragma omp parallel for
>   		for (int i = 0; i < n; ++i) {
>   			sd = c[i]._streamId;
>   			if (c[i]._type & KSHARK_TASK_DRAW) {
Libo Chen March 24, 2025, 10:08 a.m. UTC | #2
On 3/23/25 09:01, Yordan Karadzhov wrote:
> Hi Libo,
> Please see my comments below.
> 
> On 3/15/25 00:07, Libo Chen wrote:
>> Parallelize _newCPUGraph() and _newTaskGraph() calls to dramatically
>> speed up graph rendering particularly for traces from very large systems.
>>
>> OpenMP technically is a new dependency here, but it's part of GCC, so long
>> as your GCC >= v4.9, the libgomp library will make the code compiled.
>>
>> Signed-off-by: Libo Chen <libo.chen@oracle.com>
>> ---
>>   CMakeLists.txt     |  6 ++++++
>>   src/KsGLWidget.cpp | 25 +++++++++++++++++++++++--
>>   2 files changed, 29 insertions(+), 2 deletions(-)
>>
>> diff --git a/CMakeLists.txt b/CMakeLists.txt
>> index 988bfd6..7847177 100644
>> --- a/CMakeLists.txt
>> +++ b/CMakeLists.txt
>> @@ -84,6 +84,12 @@ set(EXECUTABLE_OUTPUT_PATH "${KS_DIR}/bin")
>>   set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")
>>   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")
>>   +find_package(OpenMP 3.2.5)
>> +if (OPENMP_FOUND)
>> +    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   ${OpenMP_C_FLAGS}")
>> +    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
>> +endif(OPENMP_FOUND)
>> +
>>   set(CMAKE_CXX_STANDARD 17)
>>   set(CMAKE_CXX_STANDARD_REQUIRED ON)
>>   set(CMAKE_CXX_EXTENSIONS OFF)
>> diff --git a/src/KsGLWidget.cpp b/src/KsGLWidget.cpp
>> index 9311d98..004d64b 100644
>> --- a/src/KsGLWidget.cpp
>> +++ b/src/KsGLWidget.cpp
>> @@ -13,6 +13,9 @@
>>   #include <GL/glut.h>
>>   #include <GL/gl.h>
>>   +// OpenMP
>> +#include <omp.h>
>> +
>>   // KernelShark
>>   #include "libkshark-plugin.h"
>>   #include "KsGLWidget.hpp"
>> @@ -688,25 +691,43 @@ void KsGLWidget::_makeGraphs()
>>           return graph;
>>       };
>>   +    omp_set_num_threads(omp_get_num_procs());
> I think I already asked you to check if it is possible to move this to the constructor of the widget so that it is called just once. If there is some reason why this is not possible, at least provide some explanation.
Hi Yordan, thanks for your review.

oops I missed that, will move it over.
> 
>>       for (auto it = _streamPlots.begin(); it != _streamPlots.end(); ++it) {
>>           sd = it.key();
>> +        QVector<KsPlot::Graph *> cpuGraphs(it.value()._cpuList.count());
>> +        QVector<KsPlot::Graph *> taskGraphs(it.value()._taskList.count());
>> +
>>           /* Create CPU graphs according to the cpuList. */
>>           it.value()._cpuGraphs = {};
>> +        #pragma omp parallel for
>>           for (auto const &cpu: it.value()._cpuList) {
>> -            g = lamAddGraph(sd, _newCPUGraph(sd, cpu), _vSpacing);
>> +            int idx = it.value()._cpuList.indexOf(cpu);
> 
> Maybe I do not understand what you want to do here, but this looks over-complicated to me. Isn't it equivalent to having simply
> 
>         for (size_t idx = 0; idx < nCpus; ++idx) {
>              int cpu = it.value()._cpuList[idx];
> 
I think _cpuList isn't exactly the same as [0..nCpus) here. In a default plot, some idle cpus may not be appended to the vector

	/* Do not add plots for idle CPUs. */
	if (!kshark_hash_id_find(stream->idle_cpus, cpu))
		plotVec.append(cpu);

Also you can set a subset of CPUs to show, so _cpuList could be quite random. 


Best,
Libo
> The same comment applies for the other loop below.
> 
>> +            cpuGraphs[idx] = _newCPUGraph(sd, cpu);
>> +        }
>> +        QVectorIterator<KsPlot::Graph *> itCpuGraphs(cpuGraphs);
>> +        while (itCpuGraphs.hasNext()) {
>> +            g = lamAddGraph(sd, itCpuGraphs.next(), _vSpacing);
>>               it.value()._cpuGraphs.append(g);
>>           }
>>             /* Create Task graphs according to the taskList. */
>>           it.value()._taskGraphs = {};
>> +        #pragma omp parallel for
>>           for (auto const &pid: it.value()._taskList) {
>> -            g = lamAddGraph(sd, _newTaskGraph(sd, pid), _vSpacing);
>> +            int idx = it.value()._taskList.indexOf(pid);
>> +            taskGraphs[idx] = _newTaskGraph(sd, pid);
>> +        }
>> +        QVectorIterator<KsPlot::Graph *> itTaskGraphs(taskGraphs);
>> +        while (itTaskGraphs.hasNext()) {
>> +            g = lamAddGraph(sd, itTaskGraphs.next(), _vSpacing);
>>               it.value()._taskGraphs.append(g);
>>           }
>> +
> Please remove this empty line.
> 
> Beside those minor things, the patch looks good to me. Please address the comments and I will be happy to apply your patch.
> 
> Thanks for helping us improve KerrnelShark!
> 
> Cheers,
> Yordan
>>       }
>>         for (auto &c: _comboPlots) {
>>           int n = c.count();
>> +        #pragma omp parallel for
>>           for (int i = 0; i < n; ++i) {
>>               sd = c[i]._streamId;
>>               if (c[i]._type & KSHARK_TASK_DRAW) {
Libo Chen March 26, 2025, 11:46 p.m. UTC | #3
On 3/24/25 03:08, Libo Chen wrote:
> 
> 
> On 3/23/25 09:01, Yordan Karadzhov wrote:
>> Hi Libo,
>> Please see my comments below.
>>
>> On 3/15/25 00:07, Libo Chen wrote:
>>> Parallelize _newCPUGraph() and _newTaskGraph() calls to dramatically
>>> speed up graph rendering particularly for traces from very large systems.
>>>
>>> OpenMP technically is a new dependency here, but it's part of GCC, so long
>>> as your GCC >= v4.9, the libgomp library will make the code compiled.
>>>
>>> Signed-off-by: Libo Chen <libo.chen@oracle.com>
>>> ---
>>>   CMakeLists.txt     |  6 ++++++
>>>   src/KsGLWidget.cpp | 25 +++++++++++++++++++++++--
>>>   2 files changed, 29 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/CMakeLists.txt b/CMakeLists.txt
>>> index 988bfd6..7847177 100644
>>> --- a/CMakeLists.txt
>>> +++ b/CMakeLists.txt
>>> @@ -84,6 +84,12 @@ set(EXECUTABLE_OUTPUT_PATH "${KS_DIR}/bin")
>>>   set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")
>>>   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")
>>>   +find_package(OpenMP 3.2.5)
>>> +if (OPENMP_FOUND)
>>> +    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   ${OpenMP_C_FLAGS}")
>>> +    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
>>> +endif(OPENMP_FOUND)
>>> +
>>>   set(CMAKE_CXX_STANDARD 17)
>>>   set(CMAKE_CXX_STANDARD_REQUIRED ON)
>>>   set(CMAKE_CXX_EXTENSIONS OFF)
>>> diff --git a/src/KsGLWidget.cpp b/src/KsGLWidget.cpp
>>> index 9311d98..004d64b 100644
>>> --- a/src/KsGLWidget.cpp
>>> +++ b/src/KsGLWidget.cpp
>>> @@ -13,6 +13,9 @@
>>>   #include <GL/glut.h>
>>>   #include <GL/gl.h>
>>>   +// OpenMP
>>> +#include <omp.h>
>>> +
>>>   // KernelShark
>>>   #include "libkshark-plugin.h"
>>>   #include "KsGLWidget.hpp"
>>> @@ -688,25 +691,43 @@ void KsGLWidget::_makeGraphs()
>>>           return graph;
>>>       };
>>>   +    omp_set_num_threads(omp_get_num_procs());
>> I think I already asked you to check if it is possible to move this to the constructor of the widget so that it is called just once. If there is some reason why this is not possible, at least provide some explanation.
> Hi Yordan, thanks for your review.
> 
> oops I missed that, will move it over.
>>
>>>       for (auto it = _streamPlots.begin(); it != _streamPlots.end(); ++it) {
>>>           sd = it.key();
>>> +        QVector<KsPlot::Graph *> cpuGraphs(it.value()._cpuList.count());
>>> +        QVector<KsPlot::Graph *> taskGraphs(it.value()._taskList.count());
>>> +
>>>           /* Create CPU graphs according to the cpuList. */
>>>           it.value()._cpuGraphs = {};
>>> +        #pragma omp parallel for
>>>           for (auto const &cpu: it.value()._cpuList) {
>>> -            g = lamAddGraph(sd, _newCPUGraph(sd, cpu), _vSpacing);
>>> +            int idx = it.value()._cpuList.indexOf(cpu);
>>
>> Maybe I do not understand what you want to do here, but this looks over-complicated to me. Isn't it equivalent to having simply
>>
>>         for (size_t idx = 0; idx < nCpus; ++idx) {
>>              int cpu = it.value()._cpuList[idx];
>>
> I think _cpuList isn't exactly the same as [0..nCpus) here. In a default plot, some idle cpus may not be appended to the vector
> 
> 	/* Do not add plots for idle CPUs. */
> 	if (!kshark_hash_id_find(stream->idle_cpus, cpu))
> 		plotVec.append(cpu);
> 
> Also you can set a subset of CPUs to show, so _cpuList could be quite random. 
> 
Never mind, you're right. I got myself confused here. Will send you v3 soon. Thanks
> 
> Best,
> Libo
>> The same comment applies for the other loop below.
>>
>>> +            cpuGraphs[idx] = _newCPUGraph(sd, cpu);
>>> +        }
>>> +        QVectorIterator<KsPlot::Graph *> itCpuGraphs(cpuGraphs);
>>> +        while (itCpuGraphs.hasNext()) {
>>> +            g = lamAddGraph(sd, itCpuGraphs.next(), _vSpacing);
>>>               it.value()._cpuGraphs.append(g);
>>>           }
>>>             /* Create Task graphs according to the taskList. */
>>>           it.value()._taskGraphs = {};
>>> +        #pragma omp parallel for
>>>           for (auto const &pid: it.value()._taskList) {
>>> -            g = lamAddGraph(sd, _newTaskGraph(sd, pid), _vSpacing);
>>> +            int idx = it.value()._taskList.indexOf(pid);
>>> +            taskGraphs[idx] = _newTaskGraph(sd, pid);
>>> +        }
>>> +        QVectorIterator<KsPlot::Graph *> itTaskGraphs(taskGraphs);
>>> +        while (itTaskGraphs.hasNext()) {
>>> +            g = lamAddGraph(sd, itTaskGraphs.next(), _vSpacing);
>>>               it.value()._taskGraphs.append(g);
>>>           }
>>> +
>> Please remove this empty line.
>>
>> Beside those minor things, the patch looks good to me. Please address the comments and I will be happy to apply your patch.
>>
>> Thanks for helping us improve KerrnelShark!
>>
>> Cheers,
>> Yordan
>>>       }
>>>         for (auto &c: _comboPlots) {
>>>           int n = c.count();
>>> +        #pragma omp parallel for
>>>           for (int i = 0; i < n; ++i) {
>>>               sd = c[i]._streamId;
>>>               if (c[i]._type & KSHARK_TASK_DRAW) {
>
diff mbox series

Patch

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 988bfd6..7847177 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -84,6 +84,12 @@  set(EXECUTABLE_OUTPUT_PATH "${KS_DIR}/bin")
 set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")
 
+find_package(OpenMP 3.2.5)
+if (OPENMP_FOUND)
+    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   ${OpenMP_C_FLAGS}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+endif(OPENMP_FOUND)
+
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
diff --git a/src/KsGLWidget.cpp b/src/KsGLWidget.cpp
index 9311d98..004d64b 100644
--- a/src/KsGLWidget.cpp
+++ b/src/KsGLWidget.cpp
@@ -13,6 +13,9 @@ 
 #include <GL/glut.h>
 #include <GL/gl.h>
 
+// OpenMP
+#include <omp.h>
+
 // KernelShark
 #include "libkshark-plugin.h"
 #include "KsGLWidget.hpp"
@@ -688,25 +691,43 @@  void KsGLWidget::_makeGraphs()
 		return graph;
 	};
 
+	omp_set_num_threads(omp_get_num_procs());
 	for (auto it = _streamPlots.begin(); it != _streamPlots.end(); ++it) {
 		sd = it.key();
+		QVector<KsPlot::Graph *> cpuGraphs(it.value()._cpuList.count());
+		QVector<KsPlot::Graph *> taskGraphs(it.value()._taskList.count());
+
 		/* Create CPU graphs according to the cpuList. */
 		it.value()._cpuGraphs = {};
+		#pragma omp parallel for
 		for (auto const &cpu: it.value()._cpuList) {
-			g = lamAddGraph(sd, _newCPUGraph(sd, cpu), _vSpacing);
+			int idx = it.value()._cpuList.indexOf(cpu);
+			cpuGraphs[idx] = _newCPUGraph(sd, cpu);
+		}
+		QVectorIterator<KsPlot::Graph *> itCpuGraphs(cpuGraphs);
+		while (itCpuGraphs.hasNext()) {
+			g = lamAddGraph(sd, itCpuGraphs.next(), _vSpacing);
 			it.value()._cpuGraphs.append(g);
 		}
 
 		/* Create Task graphs according to the taskList. */
 		it.value()._taskGraphs = {};
+		#pragma omp parallel for
 		for (auto const &pid: it.value()._taskList) {
-			g = lamAddGraph(sd, _newTaskGraph(sd, pid), _vSpacing);
+			int idx = it.value()._taskList.indexOf(pid);
+			taskGraphs[idx] = _newTaskGraph(sd, pid);
+		}
+		QVectorIterator<KsPlot::Graph *> itTaskGraphs(taskGraphs);
+		while (itTaskGraphs.hasNext()) {
+			g = lamAddGraph(sd, itTaskGraphs.next(), _vSpacing);
 			it.value()._taskGraphs.append(g);
 		}
+
 	}
 
 	for (auto &c: _comboPlots) {
 		int n = c.count();
+		#pragma omp parallel for
 		for (int i = 0; i < n; ++i) {
 			sd = c[i]._streamId;
 			if (c[i]._type & KSHARK_TASK_DRAW) {