[Core][Parallelization] Making explicit schedule(runtime), with dynamic by default, in OMP loops in ParallelUtils #12923

Open. Wants to merge 15 commits into base: master
21 changes: 21 additions & 0 deletions CMakeLists.txt
@@ -424,6 +424,27 @@ if (KRATOS_SHARED_MEMORY_PARALLELIZATION STREQUAL "OpenMP")
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")

    # Check if the environment variable OMP_SCHEDULE is defined
    if(DEFINED ENV{OMP_SCHEDULE})
        # Set the already defined one
        set(KRATOS_OMP_SCHEDULE $ENV{OMP_SCHEDULE})
Member:

OMP_SCHEDULE is a runtime env variable; it is an extremely bad idea to use it as a compilation switch (IMO).

Member Author:

I understand, but the idea is the following: at compile time, OMP_SCHEDULE (if set) initializes KRATOS_OMP_SCHEDULE, which is then used as the default whenever OMP_SCHEDULE is not defined at runtime. If OMP_SCHEDULE is defined at runtime, it takes precedence. Does that make sense?
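
To illustrate the intended precedence in code, here is a minimal C++ sketch of the fallback chain (it mirrors the kernel.cpp change in this PR; `ResolveOmpSchedule` is a hypothetical helper name used only for this example):

```cpp
#include <cstdlib>
#include <string>

// Precedence sketch: the runtime environment variable OMP_SCHEDULE wins;
// otherwise the compile-time default baked in via -DKRATOS_OMP_SCHEDULE="..."
// is used; "dynamic" is the last resort.
std::string ResolveOmpSchedule() {
    if (const char* env = std::getenv("OMP_SCHEDULE")) {
        return env; // runtime value takes precedence
    }
#ifdef KRATOS_OMP_SCHEDULE
    return KRATOS_OMP_SCHEDULE; // compile-time default from CMake
#else
    return "dynamic"; // should not be reached when CMake defines the macro
#endif
}
```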

    else(DEFINED ENV{OMP_SCHEDULE})
        # If not defined set the default value
        if(NOT DEFINED KRATOS_OMP_SCHEDULE)
            message(STATUS "OMP_SCHEDULE is not defined, setting to dynamic. You can also set it with the environment variable OMP_SCHEDULE or with the CMake variable KRATOS_OMP_SCHEDULE (e.g., dynamic,4)")
            set(KRATOS_OMP_SCHEDULE "dynamic")
        endif(NOT DEFINED KRATOS_OMP_SCHEDULE)
    endif(DEFINED ENV{OMP_SCHEDULE})

    # Display the selected schedule in the build output
    message(STATUS "KRATOS_OMP_SCHEDULE is set to: ${KRATOS_OMP_SCHEDULE}")

    # Define the OMP_SCHEDULE as a preprocessor macro
    add_definitions(-DKRATOS_OMP_SCHEDULE="${KRATOS_OMP_SCHEDULE}")

    # This is the only way to run OMP loops with dynamic schedule without conflicting with the GIL
    add_definitions(-DPYBIND11_NO_ASSERT_GIL_HELD_INCREF_DECREF)
else (OPENMP_FOUND)
    message(FATAL_ERROR "OpenMP could not be found!")
    # fallback solution => in future once better supported we can use the C++11 based parallelization instead
112 changes: 112 additions & 0 deletions kratos/benchmarks/parallel_utilities_benchmark.cpp
@@ -0,0 +1,112 @@
// | / |
// ' / __| _` | __| _ \ __|
// . \ | ( | | ( |\__ `
// _|\_\_| \__,_|\__|\___/ ____/
// Multi-Physics
//
// License: BSD License
// Kratos default license: kratos/license.txt
//
// Main authors: Vicente Mataix Ferrandiz
//

// System includes
#include <algorithm>
#include <cmath>
#include <numeric>
#include <vector>

// External includes
#include <benchmark/benchmark.h>

// Project includes
#include "utilities/parallel_utilities.h"
#include "utilities/reduction_utilities.h"

namespace Kratos
{
// Template class for testing
template<std::size_t TSize>
class RHSElement {
public:
    explicit RHSElement(const double Val) : mRHSVal(Val) {}

    void CalculateRHS(std::vector<double>& rVector) {
        if (rVector.size() != TSize) { rVector.resize(TSize); }
        std::fill(rVector.begin(), rVector.end(), mRHSVal);
    }

    double GetAccumRHSValue() { return mAccumRHSValue; }
    void SetAccumRHSValue(double Value) { mAccumRHSValue = Value; }

private:
    double mRHSVal;
    double mAccumRHSValue = 0.0;
};

// Benchmark for power operation on a vector
static void BM_VectorPower(benchmark::State& state) {
    const int nsize = state.range(0);
    std::vector<double> data_vector(nsize, 5.0);

    for (auto _ : state) {
        block_for_each(data_vector, [](double& item) {
            item = std::pow(item, 0.1);
        });
        benchmark::DoNotOptimize(data_vector.data()); // keep the work observable
    }
}

// Benchmark for reduction
static void BM_VectorReduction(benchmark::State& state) {
    const int nsize = state.range(0);
    std::vector<double> data_vector(nsize, 5.0);

    for (auto _ : state) {
        const auto final_sum = BlockPartition<std::vector<double>::iterator>(
            data_vector.begin(), data_vector.end()).for_each<SumReduction<double>>(
                [](double& item){
                    return item;
                });
        benchmark::DoNotOptimize(final_sum); // prevent the reduction from being optimized away
    }
}

// Benchmark for element-wise operations with thread-local storage
static void BM_ThreadLocalStorage(benchmark::State& state) {
    constexpr std::size_t vec_size = 6;
    const std::size_t n_elems = state.range(0);

    using RHSElementType = RHSElement<vec_size>;

    std::vector<double> rhs_vals(n_elems);
    for (std::size_t i = 0; i < n_elems; ++i) {
        rhs_vals[i] = (i % 12) * 1.889;
    }

    std::vector<RHSElementType> elements;
    elements.reserve(rhs_vals.size());
    for (std::size_t i = 0; i < rhs_vals.size(); ++i) {
        elements.push_back(RHSElementType(rhs_vals[i]));
    }

    auto tls_lambda_manual_reduction = [](RHSElementType& rElem, std::vector<double>& rTLS)
    {
        rElem.CalculateRHS(rTLS);
        const double rhs_sum = std::accumulate(rTLS.begin(), rTLS.end(), 0.0);
        rElem.SetAccumRHSValue(rhs_sum);
    };

    for (auto _ : state) {
        BlockPartition<std::vector<RHSElementType>::iterator>(elements.begin(),
            elements.end()).for_each(std::vector<double>(), tls_lambda_manual_reduction);

        const double sum_elem_rhs_vals = std::accumulate(elements.begin(), elements.end(), 0.0,
            [](double acc, RHSElementType& rElem){
                return acc + rElem.GetAccumRHSValue();
            });
        benchmark::DoNotOptimize(sum_elem_rhs_vals); // prevent the accumulation from being optimized away
    }
}

// Register benchmarks and provide input size as a command-line option
BENCHMARK(BM_VectorPower)->Arg(1e3)->Arg(1e5)->Arg(1e6);
BENCHMARK(BM_VectorReduction)->Arg(1e3)->Arg(1e5)->Arg(1e6);
BENCHMARK(BM_ThreadLocalStorage)->Arg(1e3)->Arg(1e5)->Arg(1e6);

} // namespace Kratos

BENCHMARK_MAIN();
62 changes: 45 additions & 17 deletions kratos/sources/kernel.cpp
@@ -4,8 +4,8 @@
// _|\_\_| \__,_|\__|\___/ ____/
// Multi-Physics
//
// License:         BSD License
// Kratos default license: kratos/license.txt
//
// Main authors: Pooyan Dadvand
//
@@ -141,33 +141,62 @@ void Kernel::SetPythonVersion(std::string pyVersion) {

void Kernel::PrintParallelismSupportInfo() const
{
#ifdef KRATOS_SMP_NONE
    constexpr bool threading_support = false;
    constexpr auto smp = "None";
#else
    constexpr bool threading_support = true;
    std::string scheduling_str;
    #if defined(KRATOS_SMP_OPENMP)
    // Check if the environment variable is defined
    const char* var_name = "OMP_SCHEDULE";
    const char* scheduling = getenv(var_name);

    if (scheduling != nullptr) {
        scheduling_str = scheduling;
    } else {
        #ifdef KRATOS_OMP_SCHEDULE
        scheduling_str = KRATOS_OMP_SCHEDULE; // Use the preprocessor-defined value
        #else
        scheduling_str = "dynamic"; // NOTE: This should not happen, as KRATOS_OMP_SCHEDULE is defined at compile time
        #endif
        #ifdef KRATOS_COMPILED_IN_WINDOWS
        const int output_setenv = _putenv_s(var_name, scheduling_str.c_str());
        #else
        const int overwrite = 1; // Overwrite if it exists (a priori it does not, which is why we are setting it)
        const int output_setenv = setenv(var_name, scheduling_str.c_str(), overwrite);
        #endif
        KRATOS_ERROR_IF_NOT(output_setenv == 0) << "Error setting environment variable " << var_name << std::endl;
        scheduling_str = "\"" + scheduling_str + "\"";
        scheduling_str += " (retrieved from KRATOS_OMP_SCHEDULE)";
    }

    const auto smp = "OpenMP, scheduling type is " + scheduling_str; // std::string concatenation
    #elif defined(KRATOS_SMP_CXX11)
    constexpr auto smp = "C++11";
    #else
    constexpr auto smp = "Unknown";
    #endif
#endif

#ifdef KRATOS_USING_MPI
    constexpr bool mpi_support = true;
#else
    constexpr bool mpi_support = false;
#endif

    Logger logger("");
    logger << LoggerMessage::Severity::INFO;

    if (threading_support) {
        if (mpi_support) {
            logger << "Compiled with threading and MPI support. Threading support with " << smp << "." << std::endl;
        } else {
            logger << "Compiled with threading support. Threading support with " << smp << "." << std::endl;
        }
    } else if (mpi_support) {
        logger << "Compiled with MPI support." << std::endl;
    } else {
        logger << "Serial compilation." << std::endl;
    }

@@ -179,8 +208,7 @@ void Kernel::PrintParallelismSupportInfo() const
    if (mIsDistributedRun) {
        const DataCommunicator& r_world = ParallelEnvironment::GetDataCommunicator("World");
        logger << "MPI world size: " << r_world.Size() << "." << std::endl;
    } else {
        logger << "Running without MPI." << std::endl;
    }
}
@@ -4,11 +4,12 @@
// _|\_\_| \__,_|\__|\___/ ____/
// Multi-Physics
//
// License:         BSD License
// Kratos default license: kratos/license.txt
//
// Main authors: Riccardo Rossi
// Philipp Bucher (https://github.com/philbucher)
//

// System includes
#include <utility>
17 changes: 9 additions & 8 deletions kratos/utilities/parallel_utilities.h
@@ -10,6 +10,7 @@
// Main authors: Riccardo Rossi
// Denis Demidov
// Philipp Bucher (https://github.com/philbucher)
// Vicente Mataix Ferrandiz
//

#pragma once
@@ -183,7 +184,7 @@ class BlockPartition
{
KRATOS_PREPARE_CATCH_THREAD_EXCEPTION

-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
for (int i=0; i<mNchunks; ++i) {
KRATOS_TRY
for (auto it = mBlockPartition[i]; it != mBlockPartition[i+1]; ++it) {
@@ -206,7 +207,7 @@ class BlockPartition
KRATOS_PREPARE_CATCH_THREAD_EXCEPTION

TReducer global_reducer;
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
Member:

@loumalouomega as I have been saying, take a look at line 154. It does not make sense to change this unless we change what happens there.

Also, to my understanding, the runtime behaviour potentially has a very high overhead, due to needing a syscall to fetch an env variable:

https://stackoverflow.com/questions/7460552/reading-environment-variables-is-slow-operation/7460612#7460612

Not sure if that matters... but at least we need to be aware of this.
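
As a side note, if the env-variable lookup cost ever turns out to matter, the schedule ICV that schedule(runtime) consults can also be pinned once at startup through the standard OpenMP API, removing any dependence on OMP_SCHEDULE at run time. A minimal sketch (an illustration only, not part of this PR):

```cpp
#include <omp.h>

// One-time programmatic setup: loops declared with schedule(runtime) will
// afterwards use dynamic scheduling with a chunk size of 4, regardless of
// whether OMP_SCHEDULE is set in the environment.
void ConfigureOmpSchedule() {
    omp_set_schedule(omp_sched_dynamic, /*chunk_size=*/4);
}
```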

Member Author:

We should use the benchmark to check whether it affects performance significantly.

Member Author:

The idea of schedule(runtime) is to give flexibility; if you prefer, we can define it at compile time...

Member:

@loumalouomega aside from the comments on the opportunity of using OMP_SCHEDULE, did you take a look at what I am writing?

We are doing the chunking "by hand". If we don't change that, it makes no sense to use a different scheduling, as every thread will be working on its own chunk (as of now we do not have more chunks than threads!)
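
To make the objection concrete, here is a reduced standalone sketch of the pattern under discussion (hypothetical code, not the actual BlockPartition implementation): with exactly one chunk per thread, the schedule clause has nothing left to rebalance.

```cpp
#include <omp.h>
#include <vector>

// With n_chunks == omp_get_max_threads(), each thread receives exactly one
// outer iteration, so dynamic/runtime scheduling cannot redistribute work.
void ManualChunkLoop(std::vector<double>& rData) {
    const int n_chunks = omp_get_max_threads();
    const std::size_t block_size = rData.size() / n_chunks;
    #pragma omp parallel for schedule(runtime)
    for (int i = 0; i < n_chunks; ++i) {
        const std::size_t begin = i * block_size;
        const std::size_t end = (i + 1 == n_chunks) ? rData.size() : begin + block_size;
        for (std::size_t k = begin; k < end; ++k) {
            rData[k] *= 2.0; // placeholder work
        }
    }
}
```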

Member Author:

Okay... let me think about this...

Member Author:

[benchmark results screenshot]

Currently there is no significant effect; I need to rethink this...

Member Author:

In that case we may need to rethink the chunking (to make it dependent on the CPU architecture).

Member Author:

@RiccardoRossi what do you suggest exactly? I have been studying this, and our chunking conflicts with the OMP scheduling; a priori, the most efficient option would be to let OMP do the chunking. The problem is that with that we lose the parallel_utilities design and the reduction utilities.
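
For comparison, a sketch of the alternative being weighed: iterating over the items directly and letting OpenMP pick the chunks, which lets schedule(runtime) actually balance the load but bypasses the BlockPartition design and the Kratos reduction utilities (hypothetical code, not part of the PR):

```cpp
#include <cmath>
#include <vector>

// OpenMP does the chunking over individual items; schedule(runtime) can now
// rebalance work, and the reduction is expressed with the native OpenMP
// clause instead of Kratos' reduction utilities.
double DirectOmpReduction(std::vector<double>& rData) {
    double sum = 0.0;
    const int n = static_cast<int>(rData.size());
    #pragma omp parallel for schedule(runtime) reduction(+ : sum)
    for (int i = 0; i < n; ++i) {
        rData[i] = std::pow(rData[i], 0.1);
        sum += rData[i];
    }
    return sum;
}
```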

for (int i=0; i<mNchunks; ++i) {
KRATOS_TRY
TReducer local_reducer;
@@ -238,7 +239,7 @@ class BlockPartition
// copy the prototype to create the thread local storage
TThreadLocalStorage thread_local_storage(rThreadLocalStoragePrototype);

-#pragma omp for
+#pragma omp for schedule(runtime)
for(int i=0; i<mNchunks; ++i){
KRATOS_TRY
for (auto it = mBlockPartition[i]; it != mBlockPartition[i+1]; ++it){
@@ -270,7 +271,7 @@ class BlockPartition
// copy the prototype to create the thread local storage
TThreadLocalStorage thread_local_storage(rThreadLocalStoragePrototype);

-#pragma omp for
+#pragma omp for schedule(runtime)
for (int i=0; i<mNchunks; ++i) {
KRATOS_TRY
TReducer local_reducer;
@@ -519,7 +520,7 @@ class IndexPartition
{
KRATOS_PREPARE_CATCH_THREAD_EXCEPTION

-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
for (int i=0; i<mNchunks; ++i) {
KRATOS_TRY
for (auto k = mBlockPartition[i]; k < mBlockPartition[i+1]; ++k) {
@@ -541,7 +542,7 @@ class IndexPartition
KRATOS_PREPARE_CATCH_THREAD_EXCEPTION

TReducer global_reducer;
-#pragma omp parallel for
+#pragma omp parallel for schedule(runtime)
for (int i=0; i<mNchunks; ++i) {
KRATOS_TRY
TReducer local_reducer;
@@ -572,7 +573,7 @@ class IndexPartition
// copy the prototype to create the thread local storage
TThreadLocalStorage thread_local_storage(rThreadLocalStoragePrototype);

-#pragma omp for
+#pragma omp for schedule(runtime)
for (int i=0; i<mNchunks; ++i) {
KRATOS_TRY
for (auto k = mBlockPartition[i]; k < mBlockPartition[i+1]; ++k) {
@@ -604,7 +605,7 @@ class IndexPartition
// copy the prototype to create the thread local storage
TThreadLocalStorage thread_local_storage(rThreadLocalStoragePrototype);

-#pragma omp for
+#pragma omp for schedule(runtime)
for (int i=0; i<mNchunks; ++i) {
KRATOS_TRY
TReducer local_reducer;