-
Notifications
You must be signed in to change notification settings - Fork 247
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Core][Parallelization] Making explicitly schedule(runtime)
, with dynamic
by default, in OMP loops in ParallelUtils
#12923
base: master
Are you sure you want to change the base?
Changes from all commits
897cd72
dc8a7e2
91374a0
83ce3e7
d7754da
7ece75a
8ba1ab5
5503861
67c4c6f
ad3f345
ddfdca6
9c95018
bc5074e
606882b
24f40aa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
// | / | | ||
// ' / __| _` | __| _ \ __| | ||
// . \ | ( | | ( |\__ ` | ||
// _|\_\_| \__,_|\__|\___/ ____/ | ||
// Multi-Physics | ||
// | ||
// License: BSD License | ||
// Kratos default license: kratos/license.txt | ||
// | ||
// Main authors: Vicente Mataix Ferrandiz | ||
// | ||
|
||
// System includes
#include <algorithm>      // std::fill
#include <cmath>          // std::pow
#include <cstddef>        // std::size_t
#include <iostream>
#include <numeric>        // std::accumulate
#include <unordered_map>
#include <utility>
#include <vector>
|
||
// External includes | ||
#include <benchmark/benchmark.h> | ||
|
||
// Project includes | ||
#include "utilities/parallel_utilities.h" | ||
#include "utilities/reduction_utilities.h" | ||
|
||
namespace Kratos | ||
{ | ||
// Template class for testing | ||
/**
 * @brief Template class for testing: an element that fills an RHS vector of
 * fixed compile-time size with a constant value and stores an externally
 * computed accumulated (reduced) result.
 * @tparam TSize size of the RHS vector produced by CalculateRHS
 */
template<std::size_t TSize>
class RHSElement {
public:
    /// @param Val value written into every entry of the RHS vector
    explicit RHSElement(const double Val) : mRHSVal(Val) {}

    /// Fills rVector with the stored value, resizing it to TSize if needed.
    void CalculateRHS(std::vector<double>& rVector)
    {
        if (rVector.size() != TSize) { rVector.resize(TSize); }
        std::fill(rVector.begin(), rVector.end(), mRHSVal);
    }

    /// @return the last accumulated RHS value (0.0 until SetAccumRHSValue is called)
    double GetAccumRHSValue() const noexcept { return mAccumRHSValue; }

    /// Stores the accumulated (reduced) RHS value for this element.
    void SetAccumRHSValue(const double Value) noexcept { mAccumRHSValue = Value; }

private:
    double mRHSVal;               // value written into each RHS entry
    double mAccumRHSValue = 0.0;  // accumulated sum, set via SetAccumRHSValue
};
|
||
// Benchmark for power operation on a vector | ||
// Benchmark for a power operation applied element-wise in parallel
// (block_for_each) over a vector of state.range(0) doubles.
static void BM_VectorPower(benchmark::State& state) {
    // state.range(0) is int64_t; avoid narrowing to int.
    const std::size_t nsize = static_cast<std::size_t>(state.range(0));
    std::vector<double> data_vector(nsize, 5.0);

    for (auto _ : state) {
        block_for_each(data_vector, [](double& item) {
            item = std::pow(item, 0.1);
        });
        // Make the result observable so the parallel work cannot be
        // optimized away by the compiler.
        benchmark::DoNotOptimize(data_vector.data());
    }
}
|
||
// Benchmark for reduction | ||
// Benchmark for a parallel sum reduction over a vector of state.range(0)
// doubles using BlockPartition + SumReduction.
static void BM_VectorReduction(benchmark::State& state) {
    // state.range(0) is int64_t; avoid narrowing to int.
    const std::size_t nsize = static_cast<std::size_t>(state.range(0));
    std::vector<double> data_vector(nsize, 5.0);

    for (auto _ : state) {
        auto final_sum = BlockPartition<std::vector<double>::iterator>(data_vector.begin(),
            data_vector.end()).for_each<SumReduction<double>>(
                [](double& item){
                    return item;
                });
        // Consume the result: otherwise the unused reduction may be
        // eliminated entirely, invalidating the measurement.
        benchmark::DoNotOptimize(final_sum);
    }
}
|
||
// Benchmark for element-wise operations with thread-local storage | ||
// Benchmark for element-wise operations with thread-local storage (TLS):
// each thread reuses a private scratch vector (copied from the prototype
// passed to for_each) while looping over state.range(0) elements.
static void BM_ThreadLocalStorage(benchmark::State& state) {
    constexpr std::size_t vec_size = 6;
    // state.range(0) is int64_t; avoid narrowing to int.
    const std::size_t n_elems = static_cast<std::size_t>(state.range(0));

    using RHSElementType = RHSElement<vec_size>;

    // Deterministic, non-uniform input values.
    std::vector<double> rhs_vals(n_elems);
    for (std::size_t i = 0; i < n_elems; ++i) {
        rhs_vals[i] = (i % 12) * 1.889;
    }

    std::vector<RHSElementType> elements;
    elements.reserve(rhs_vals.size()); // avoid reallocations while filling
    for (std::size_t i = 0; i < rhs_vals.size(); ++i) {
        elements.push_back(RHSElementType(rhs_vals[i]));
    }

    // The reduction into each element is done "by hand" inside the lambda;
    // rTLS is the per-thread scratch vector.
    auto tls_lambda_manual_reduction = [](RHSElementType& rElem, std::vector<double>& rTLS)
    {
        rElem.CalculateRHS(rTLS);
        const double rhs_sum = std::accumulate(rTLS.begin(), rTLS.end(), 0.0);
        rElem.SetAccumRHSValue(rhs_sum);
    };

    for (auto _ : state) {
        BlockPartition<std::vector<RHSElementType>::iterator>(elements.begin(),
            elements.end()).for_each(std::vector<double>(), tls_lambda_manual_reduction);

        const double sum_elem_rhs_vals = std::accumulate(elements.begin(), elements.end(), 0.0,
            [](double acc, RHSElementType& rElem){
                return acc + rElem.GetAccumRHSValue();
            });
        // Consume the checksum: an unused local would be a warning and the
        // whole final accumulation could be optimized away.
        benchmark::DoNotOptimize(sum_elem_rhs_vals);
    }
}
|
||
// Register benchmarks; the argument is the problem size (number of entries).
// Arg takes int64_t, so use integer literals instead of the double literals
// 1e3/1e5/1e6 (which trigger an implicit floating->integer conversion).
BENCHMARK(BM_VectorPower)->Arg(1000)->Arg(100000)->Arg(1000000);
BENCHMARK(BM_VectorReduction)->Arg(1000)->Arg(100000)->Arg(1000000);
BENCHMARK(BM_ThreadLocalStorage)->Arg(1000)->Arg(100000)->Arg(1000000);
|
||
} // namespace Kratos | ||
|
||
BENCHMARK_MAIN(); |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,6 +10,7 @@ | |
// Main authors: Riccardo Rossi | ||
// Denis Demidov | ||
// Philipp Bucher (https://github.com/philbucher) | ||
// Vicente Mataix Ferrandiz | ||
// | ||
|
||
#pragma once | ||
|
@@ -183,7 +184,7 @@ class BlockPartition | |
{ | ||
KRATOS_PREPARE_CATCH_THREAD_EXCEPTION | ||
|
||
#pragma omp parallel for | ||
#pragma omp parallel for schedule(runtime) | ||
for (int i=0; i<mNchunks; ++i) { | ||
KRATOS_TRY | ||
for (auto it = mBlockPartition[i]; it != mBlockPartition[i+1]; ++it) { | ||
|
@@ -206,7 +207,7 @@ class BlockPartition | |
KRATOS_PREPARE_CATCH_THREAD_EXCEPTION | ||
|
||
TReducer global_reducer; | ||
#pragma omp parallel for | ||
#pragma omp parallel for schedule(runtime) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @loumalouomega as i am telling, take a look to line 154. It does not make sense to change this unless we change what happens there. also to my understanding the runtime behaviour has potentially a very high overhead due to the need of making a syscall to fetch an env variable. not sure if that matters...but at least we need to beware of this There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should use the benchmark to check that it affects significantly There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The idea of the runtime is to give flexibility, if you prefer we can define it on compiling time... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is recommended by OMP itself: https://www.openmp.org/wp-content/uploads/openmp-webinar-vanderPas-20210318.pdf (And I found a master thesis saying that it doesn't penalize https://hpc.dmi.unibas.ch/wp-content/uploads/sites/87/2020/10/2019_akan_yilmaz_ma_thesisjune2019.pdf) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @loumalouomega aside of the comments on the opportunity of using the OMP_SCHEDULE did u take a look at what i am writing? we are doing "by hand" the chunking. If we don't change that, it makes no sense to use a different scheduling, as everyone will be working on its chunk (as of now we dot have more chunks than threads!) There was a problem hiding this comment. 
Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Okay...let me think about this... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In that case we may need to rethink the chunking (to be dependent on the CPU architecture) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @RiccardoRossi what do you suggest exactly? I have been studying this and our chunking conflicts with the OMP scheduling; a priori the most efficient approach would be to let OMP do the chunking. The problem is that with that we lose the parallel_utilities design and the reduction utilities. |
||
for (int i=0; i<mNchunks; ++i) { | ||
KRATOS_TRY | ||
TReducer local_reducer; | ||
|
@@ -238,7 +239,7 @@ class BlockPartition | |
// copy the prototype to create the thread local storage | ||
TThreadLocalStorage thread_local_storage(rThreadLocalStoragePrototype); | ||
|
||
#pragma omp for | ||
#pragma omp for schedule(runtime) | ||
for(int i=0; i<mNchunks; ++i){ | ||
KRATOS_TRY | ||
for (auto it = mBlockPartition[i]; it != mBlockPartition[i+1]; ++it){ | ||
|
@@ -270,7 +271,7 @@ class BlockPartition | |
// copy the prototype to create the thread local storage | ||
TThreadLocalStorage thread_local_storage(rThreadLocalStoragePrototype); | ||
|
||
#pragma omp for | ||
#pragma omp for schedule(runtime) | ||
for (int i=0; i<mNchunks; ++i) { | ||
KRATOS_TRY | ||
TReducer local_reducer; | ||
|
@@ -519,7 +520,7 @@ class IndexPartition | |
{ | ||
KRATOS_PREPARE_CATCH_THREAD_EXCEPTION | ||
|
||
#pragma omp parallel for | ||
#pragma omp parallel for schedule(runtime) | ||
for (int i=0; i<mNchunks; ++i) { | ||
KRATOS_TRY | ||
for (auto k = mBlockPartition[i]; k < mBlockPartition[i+1]; ++k) { | ||
|
@@ -541,7 +542,7 @@ class IndexPartition | |
KRATOS_PREPARE_CATCH_THREAD_EXCEPTION | ||
|
||
TReducer global_reducer; | ||
#pragma omp parallel for | ||
#pragma omp parallel for schedule(runtime) | ||
for (int i=0; i<mNchunks; ++i) { | ||
KRATOS_TRY | ||
TReducer local_reducer; | ||
|
@@ -572,7 +573,7 @@ class IndexPartition | |
// copy the prototype to create the thread local storage | ||
TThreadLocalStorage thread_local_storage(rThreadLocalStoragePrototype); | ||
|
||
#pragma omp for | ||
#pragma omp for schedule(runtime) | ||
for (int i=0; i<mNchunks; ++i) { | ||
KRATOS_TRY | ||
for (auto k = mBlockPartition[i]; k < mBlockPartition[i+1]; ++k) { | ||
|
@@ -604,7 +605,7 @@ class IndexPartition | |
// copy the prototype to create the thread local storage | ||
TThreadLocalStorage thread_local_storage(rThreadLocalStoragePrototype); | ||
|
||
#pragma omp for | ||
#pragma omp for schedule(runtime) | ||
for (int i=0; i<mNchunks; ++i) { | ||
KRATOS_TRY | ||
TReducer local_reducer; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OMP_SCHEDULE
is a runtime env variable; it is an extremely bad idea to use it as a compilation switch (IMO).There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I understand, but the idea is the following: during compilation, `OMP_SCHEDULE` will set `KRATOS_OMP_SCHEDULE`, which will be used as the default if `OMP_SCHEDULE` is not actually defined at runtime; but if `OMP_SCHEDULE` is defined, then `OMP_SCHEDULE` will be taken into account. Do you understand me?