diff mbox series

[v3] kernel-shark: Multi-thread the computation of stream/combo plots

Message ID 20250327002133.135719-1-libo.chen@oracle.com (mailing list archive)
State Accepted
Commit 81e98952524d8e7712dd52e24eb96ad687ab39ac
Headers show
Series [v3] kernel-shark: Multi-thread the computation of stream/combo plots | expand

Commit Message

Libo Chen March 27, 2025, 12:21 a.m. UTC
Parallelize _newCPUGraph() and _newTaskGraph() calls to dramatically
speed up graph rendering, particularly for traces from very large systems.

OpenMP is technically a new dependency here, but it is part of GCC: as long
as GCC >= v4.9 is used, its bundled libgomp library allows the code to compile.

Signed-off-by: Libo Chen <libo.chen@oracle.com>
---
 CMakeLists.txt     |  6 ++++++
 src/KsGLWidget.cpp | 30 ++++++++++++++++++++++++++----
 2 files changed, 32 insertions(+), 4 deletions(-)

Comments

Yordan Karadzhov April 3, 2025, 5:35 p.m. UTC | #1
The patch is applied. Thanks!

Yordan

On 3/27/25 02:21, Libo Chen wrote:
> Parallelize _newCPUGraph() and _newTaskGraph() calls to dramatically
> speed up graph rendering particularly for traces from very large systems.
> 
> OpenMP technically is a new dependency here, but it's part of GCC, so long
> as your GCC >= v4.9, the libgomp library will make the code compiled.
> 
> Signed-off-by: Libo Chen <libo.chen@oracle.com>
> ---
>   CMakeLists.txt     |  6 ++++++
>   src/KsGLWidget.cpp | 30 ++++++++++++++++++++++++++----
>   2 files changed, 32 insertions(+), 4 deletions(-)
> 
> diff --git a/CMakeLists.txt b/CMakeLists.txt
> index 988bfd6..7847177 100644
> --- a/CMakeLists.txt
> +++ b/CMakeLists.txt
> @@ -84,6 +84,12 @@ set(EXECUTABLE_OUTPUT_PATH "${KS_DIR}/bin")
>   set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")
>   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")
>   
> +find_package(OpenMP 3.2.5)
> +if (OPENMP_FOUND)
> +    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   ${OpenMP_C_FLAGS}")
> +    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
> +endif(OPENMP_FOUND)
> +
>   set(CMAKE_CXX_STANDARD 17)
>   set(CMAKE_CXX_STANDARD_REQUIRED ON)
>   set(CMAKE_CXX_EXTENSIONS OFF)
> diff --git a/src/KsGLWidget.cpp b/src/KsGLWidget.cpp
> index 9311d98..7f2001c 100644
> --- a/src/KsGLWidget.cpp
> +++ b/src/KsGLWidget.cpp
> @@ -13,6 +13,9 @@
>   #include <GL/glut.h>
>   #include <GL/gl.h>
>   
> +// OpenMP
> +#include <omp.h>
> +
>   // KernelShark
>   #include "libkshark-plugin.h"
>   #include "KsGLWidget.hpp"
> @@ -54,6 +57,7 @@ KsGLWidget::KsGLWidget(QWidget *parent)
>   
>   	connect(&_model,	&QAbstractTableModel::modelReset,
>   		this,		qOverload<>(&KsGLWidget::update));
> +	omp_set_num_threads(omp_get_num_procs());
>   }
>   
>   void KsGLWidget::_freeGraphs()
> @@ -690,23 +694,41 @@ void KsGLWidget::_makeGraphs()
>   
>   	for (auto it = _streamPlots.begin(); it != _streamPlots.end(); ++it) {
>   		sd = it.key();
> +		size_t nCpus = it.value()._cpuList.count();
> +		size_t nTasks = it.value()._taskList.count();
> +		QVector<KsPlot::Graph *> cpuGraphs(nCpus);
> +		QVector<KsPlot::Graph *> taskGraphs(nTasks);
> +
>   		/* Create CPU graphs according to the cpuList. */
>   		it.value()._cpuGraphs = {};
> -		for (auto const &cpu: it.value()._cpuList) {
> -			g = lamAddGraph(sd, _newCPUGraph(sd, cpu), _vSpacing);
> +		#pragma omp parallel for
> +		for (size_t idx = 0; idx < nCpus; ++idx) {
> +			int cpu = it.value()._cpuList[idx];
> +			cpuGraphs[idx] = _newCPUGraph(sd, cpu);
> +		}
> +		QVectorIterator<KsPlot::Graph *> itCpuGraphs(cpuGraphs);
> +		while (itCpuGraphs.hasNext()) {
> +			g = lamAddGraph(sd, itCpuGraphs.next(), _vSpacing);
>   			it.value()._cpuGraphs.append(g);
>   		}
>   
>   		/* Create Task graphs according to the taskList. */
>   		it.value()._taskGraphs = {};
> -		for (auto const &pid: it.value()._taskList) {
> -			g = lamAddGraph(sd, _newTaskGraph(sd, pid), _vSpacing);
> +		#pragma omp parallel for
> +		for (size_t idx = 0; idx < nTasks; ++idx) {
> +			int pid = it.value()._taskList[idx];
> +			taskGraphs[idx] = _newTaskGraph(sd, pid);
> +		}
> +		QVectorIterator<KsPlot::Graph *> itTaskGraphs(taskGraphs);
> +		while (itTaskGraphs.hasNext()) {
> +			g = lamAddGraph(sd, itTaskGraphs.next(), _vSpacing);
>   			it.value()._taskGraphs.append(g);
>   		}
>   	}
>   
>   	for (auto &c: _comboPlots) {
>   		int n = c.count();
> +		#pragma omp parallel for
>   		for (int i = 0; i < n; ++i) {
>   			sd = c[i]._streamId;
>   			if (c[i]._type & KSHARK_TASK_DRAW) {
diff mbox series

Patch

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 988bfd6..7847177 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -84,6 +84,12 @@  set(EXECUTABLE_OUTPUT_PATH "${KS_DIR}/bin")
 set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")
 
+find_package(OpenMP 3.2.5)
+if (OPENMP_FOUND)
+    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   ${OpenMP_C_FLAGS}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+endif(OPENMP_FOUND)
+
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
diff --git a/src/KsGLWidget.cpp b/src/KsGLWidget.cpp
index 9311d98..7f2001c 100644
--- a/src/KsGLWidget.cpp
+++ b/src/KsGLWidget.cpp
@@ -13,6 +13,9 @@ 
 #include <GL/glut.h>
 #include <GL/gl.h>
 
+// OpenMP
+#include <omp.h>
+
 // KernelShark
 #include "libkshark-plugin.h"
 #include "KsGLWidget.hpp"
@@ -54,6 +57,7 @@  KsGLWidget::KsGLWidget(QWidget *parent)
 
 	connect(&_model,	&QAbstractTableModel::modelReset,
 		this,		qOverload<>(&KsGLWidget::update));
+	omp_set_num_threads(omp_get_num_procs());
 }
 
 void KsGLWidget::_freeGraphs()
@@ -690,23 +694,41 @@  void KsGLWidget::_makeGraphs()
 
 	for (auto it = _streamPlots.begin(); it != _streamPlots.end(); ++it) {
 		sd = it.key();
+		size_t nCpus = it.value()._cpuList.count();
+		size_t nTasks = it.value()._taskList.count();
+		QVector<KsPlot::Graph *> cpuGraphs(nCpus);
+		QVector<KsPlot::Graph *> taskGraphs(nTasks);
+
 		/* Create CPU graphs according to the cpuList. */
 		it.value()._cpuGraphs = {};
-		for (auto const &cpu: it.value()._cpuList) {
-			g = lamAddGraph(sd, _newCPUGraph(sd, cpu), _vSpacing);
+		#pragma omp parallel for
+		for (size_t idx = 0; idx < nCpus; ++idx) {
+			int cpu = it.value()._cpuList[idx];
+			cpuGraphs[idx] = _newCPUGraph(sd, cpu);
+		}
+		QVectorIterator<KsPlot::Graph *> itCpuGraphs(cpuGraphs);
+		while (itCpuGraphs.hasNext()) {
+			g = lamAddGraph(sd, itCpuGraphs.next(), _vSpacing);
 			it.value()._cpuGraphs.append(g);
 		}
 
 		/* Create Task graphs according to the taskList. */
 		it.value()._taskGraphs = {};
-		for (auto const &pid: it.value()._taskList) {
-			g = lamAddGraph(sd, _newTaskGraph(sd, pid), _vSpacing);
+		#pragma omp parallel for
+		for (size_t idx = 0; idx < nTasks; ++idx) {
+			int pid = it.value()._taskList[idx];
+			taskGraphs[idx] = _newTaskGraph(sd, pid);
+		}
+		QVectorIterator<KsPlot::Graph *> itTaskGraphs(taskGraphs);
+		while (itTaskGraphs.hasNext()) {
+			g = lamAddGraph(sd, itTaskGraphs.next(), _vSpacing);
 			it.value()._taskGraphs.append(g);
 		}
 	}
 
 	for (auto &c: _comboPlots) {
 		int n = c.count();
+		#pragma omp parallel for
 		for (int i = 0; i < n; ++i) {
 			sd = c[i]._streamId;
 			if (c[i]._type & KSHARK_TASK_DRAW) {