Message ID | 20250327002133.135719-1-libo.chen@oracle.com (mailing list archive) |
---|---|
State | Accepted |
Commit | 81e98952524d8e7712dd52e24eb96ad687ab39ac |
Headers | show |
Series | [v3] kernel-shark: Multi-thread the computation of stream/combo plots | expand |
The patch is applied. Thanks! Yordan On 3/27/25 02:21, Libo Chen wrote: > Parallelize _newCPUGraph() and _newTaskGraph() calls to dramatically > speed up graph rendering particularly for traces from very large systems. > > OpenMP technically is a new dependency here, but it's part of GCC, so long > as your GCC >= v4.9, the libgomp library will make the code compiled. > > Signed-off-by: Libo Chen <libo.chen@oracle.com> > --- > CMakeLists.txt | 6 ++++++ > src/KsGLWidget.cpp | 30 ++++++++++++++++++++++++++---- > 2 files changed, 32 insertions(+), 4 deletions(-) > > diff --git a/CMakeLists.txt b/CMakeLists.txt > index 988bfd6..7847177 100644 > --- a/CMakeLists.txt > +++ b/CMakeLists.txt > @@ -84,6 +84,12 @@ set(EXECUTABLE_OUTPUT_PATH "${KS_DIR}/bin") > set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common") > set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common") > > +find_package(OpenMP 3.2.5) > +if (OPENMP_FOUND) > + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") > + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") > +endif(OPENMP_FOUND) > + > set(CMAKE_CXX_STANDARD 17) > set(CMAKE_CXX_STANDARD_REQUIRED ON) > set(CMAKE_CXX_EXTENSIONS OFF) > diff --git a/src/KsGLWidget.cpp b/src/KsGLWidget.cpp > index 9311d98..7f2001c 100644 > --- a/src/KsGLWidget.cpp > +++ b/src/KsGLWidget.cpp > @@ -13,6 +13,9 @@ > #include <GL/glut.h> > #include <GL/gl.h> > > +// OpenMP > +#include <omp.h> > + > // KernelShark > #include "libkshark-plugin.h" > #include "KsGLWidget.hpp" > @@ -54,6 +57,7 @@ KsGLWidget::KsGLWidget(QWidget *parent) > > connect(&_model, &QAbstractTableModel::modelReset, > this, qOverload<>(&KsGLWidget::update)); > + omp_set_num_threads(omp_get_num_procs()); > } > > void KsGLWidget::_freeGraphs() > @@ -690,23 +694,41 @@ void KsGLWidget::_makeGraphs() > > for (auto it = _streamPlots.begin(); it != _streamPlots.end(); ++it) { > sd = it.key(); > + size_t nCpus = it.value()._cpuList.count(); > + size_t 
nTasks = it.value()._taskList.count(); > + QVector<KsPlot::Graph *> cpuGraphs(nCpus); > + QVector<KsPlot::Graph *> taskGraphs(nTasks); > + > /* Create CPU graphs according to the cpuList. */ > it.value()._cpuGraphs = {}; > - for (auto const &cpu: it.value()._cpuList) { > - g = lamAddGraph(sd, _newCPUGraph(sd, cpu), _vSpacing); > + #pragma omp parallel for > + for (size_t idx = 0; idx < nCpus; ++idx) { > + int cpu = it.value()._cpuList[idx]; > + cpuGraphs[idx] = _newCPUGraph(sd, cpu); > + } > + QVectorIterator<KsPlot::Graph *> itCpuGraphs(cpuGraphs); > + while (itCpuGraphs.hasNext()) { > + g = lamAddGraph(sd, itCpuGraphs.next(), _vSpacing); > it.value()._cpuGraphs.append(g); > } > > /* Create Task graphs according to the taskList. */ > it.value()._taskGraphs = {}; > - for (auto const &pid: it.value()._taskList) { > - g = lamAddGraph(sd, _newTaskGraph(sd, pid), _vSpacing); > + #pragma omp parallel for > + for (size_t idx = 0; idx < nTasks; ++idx) { > + int pid = it.value()._taskList[idx]; > + taskGraphs[idx] = _newTaskGraph(sd, pid); > + } > + QVectorIterator<KsPlot::Graph *> itTaskGraphs(taskGraphs); > + while (itTaskGraphs.hasNext()) { > + g = lamAddGraph(sd, itTaskGraphs.next(), _vSpacing); > it.value()._taskGraphs.append(g); > } > } > > for (auto &c: _comboPlots) { > int n = c.count(); > + #pragma omp parallel for > for (int i = 0; i < n; ++i) { > sd = c[i]._streamId; > if (c[i]._type & KSHARK_TASK_DRAW) {
diff --git a/CMakeLists.txt b/CMakeLists.txt index 988bfd6..7847177 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -84,6 +84,12 @@ set(EXECUTABLE_OUTPUT_PATH "${KS_DIR}/bin") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common") +find_package(OpenMP 3.2.5) +if (OPENMP_FOUND) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") +endif(OPENMP_FOUND) + set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) diff --git a/src/KsGLWidget.cpp b/src/KsGLWidget.cpp index 9311d98..7f2001c 100644 --- a/src/KsGLWidget.cpp +++ b/src/KsGLWidget.cpp @@ -13,6 +13,9 @@ #include <GL/glut.h> #include <GL/gl.h> +// OpenMP +#include <omp.h> + // KernelShark #include "libkshark-plugin.h" #include "KsGLWidget.hpp" @@ -54,6 +57,7 @@ KsGLWidget::KsGLWidget(QWidget *parent) connect(&_model, &QAbstractTableModel::modelReset, this, qOverload<>(&KsGLWidget::update)); + omp_set_num_threads(omp_get_num_procs()); } void KsGLWidget::_freeGraphs() @@ -690,23 +694,41 @@ void KsGLWidget::_makeGraphs() for (auto it = _streamPlots.begin(); it != _streamPlots.end(); ++it) { sd = it.key(); + size_t nCpus = it.value()._cpuList.count(); + size_t nTasks = it.value()._taskList.count(); + QVector<KsPlot::Graph *> cpuGraphs(nCpus); + QVector<KsPlot::Graph *> taskGraphs(nTasks); + /* Create CPU graphs according to the cpuList. 
*/ it.value()._cpuGraphs = {}; - for (auto const &cpu: it.value()._cpuList) { - g = lamAddGraph(sd, _newCPUGraph(sd, cpu), _vSpacing); + #pragma omp parallel for + for (size_t idx = 0; idx < nCpus; ++idx) { + int cpu = it.value()._cpuList[idx]; + cpuGraphs[idx] = _newCPUGraph(sd, cpu); + } + QVectorIterator<KsPlot::Graph *> itCpuGraphs(cpuGraphs); + while (itCpuGraphs.hasNext()) { + g = lamAddGraph(sd, itCpuGraphs.next(), _vSpacing); it.value()._cpuGraphs.append(g); } /* Create Task graphs according to the taskList. */ it.value()._taskGraphs = {}; - for (auto const &pid: it.value()._taskList) { - g = lamAddGraph(sd, _newTaskGraph(sd, pid), _vSpacing); + #pragma omp parallel for + for (size_t idx = 0; idx < nTasks; ++idx) { + int pid = it.value()._taskList[idx]; + taskGraphs[idx] = _newTaskGraph(sd, pid); + } + QVectorIterator<KsPlot::Graph *> itTaskGraphs(taskGraphs); + while (itTaskGraphs.hasNext()) { + g = lamAddGraph(sd, itTaskGraphs.next(), _vSpacing); it.value()._taskGraphs.append(g); } } for (auto &c: _comboPlots) { int n = c.count(); + #pragma omp parallel for for (int i = 0; i < n; ++i) { sd = c[i]._streamId; if (c[i]._type & KSHARK_TASK_DRAW) {
Parallelize the _newCPUGraph() and _newTaskGraph() calls to dramatically speed up graph rendering, particularly for traces from very large systems. OpenMP is technically a new dependency here, but it is part of GCC: as long as your GCC is >= v4.9, the libgomp library will allow the code to compile. Signed-off-by: Libo Chen <libo.chen@oracle.com> --- CMakeLists.txt | 6 ++++++ src/KsGLWidget.cpp | 30 ++++++++++++++++++++++++++---- 2 files changed, 32 insertions(+), 4 deletions(-)