28 changes: 19 additions & 9 deletions CMakeLists.txt
@@ -32,18 +32,28 @@ add_subdirectory(3rdparty)

include(cmake/opencv_config.cmake)

if (NOT WIN32)
find_package(OpenMP REQUIRED)
endif()

if (NOT WIN32)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra -Werror")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror")
find_package(OpenMP)

if(OpenMP_FOUND)
message(STATUS "OpenMP found - enabling parallel support")
add_definitions(-DHAS_OPENMP)
if(TARGET OpenMP::OpenMP_CXX)
set(OPENMP_TARGET OpenMP::OpenMP_CXX)
message(STATUS "Using OpenMP target: OpenMP::OpenMP_CXX")
else()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
if(OpenMP_CXX_LIBRARIES)
set(OPENMP_LIBRARIES ${OpenMP_CXX_LIBRARIES})
endif()
message(STATUS "OpenMP CXX flags: ${OpenMP_CXX_FLAGS}")
message(STATUS "OpenMP libraries: ${OpenMP_CXX_LIBRARIES}")
endif()
else()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /W4 /wd4996 /wd4190 /wd4189 /WX")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4 /wd4996 /wd4190 /wd4189 /WX")
message(STATUS "OpenMP not found - parallel features disabled")
endif()


foreach(CONFIG "" _DEBUG _RELEASE)
set("CMAKE_ARCHIVE_OUTPUT_DIRECTORY${CONFIG}" "${CMAKE_BINARY_DIR}/lib")
set("CMAKE_LIBRARY_OUTPUT_DIRECTORY${CONFIG}" "${CMAKE_BINARY_DIR}/lib")
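As this hunk reads, find_package(OpenMP) is no longer REQUIRED; when OpenMP is found the build defines HAS_OPENMP and prefers the imported OpenMP::OpenMP_CXX target over raw compiler flags. A hedged sketch of how downstream code can key off that define (illustrative only; scale_in_place is a made-up function, not part of this PR):

#include <cstddef>
#include <vector>

// Hypothetical sketch: a translation unit gating its OpenMP path on the
// HAS_OPENMP definition that the CMake logic above adds when OpenMP is found.
void scale_in_place(std::vector<float>& v, float factor) {
#ifdef HAS_OPENMP
  // OpenMP was found by CMake: emit a pragma-based parallel loop.
  int n = static_cast<int>(v.size());
#pragma omp parallel for
  for (int i = 0; i < n; ++i) {
    v[static_cast<std::size_t>(i)] *= factor;
  }
#else
  // OpenMP not found: plain sequential fallback.
  for (std::size_t i = 0; i < v.size(); ++i) {
    v[i] *= factor;
  }
#endif
}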
3 changes: 3 additions & 0 deletions include/CMakeLists.txt
@@ -15,3 +15,6 @@ set(PERF_HEADERS "${perf_headers}" PARENT_SCOPE)

file(GLOB_RECURSE reader_headers Weights_Reader/*.h Weights_Reader/*.hpp)
set(READER_HEADERS "${reader_headers}" PARENT_SCOPE)

file(GLOB_RECURSE parallel_headers parallel/*.h parallel/*.hpp)
set(LAYERS_HEADERS "${parallel_headers}" PARENT_SCOPE)
96 changes: 62 additions & 34 deletions include/layers/EWLayer.hpp
@@ -46,7 +46,8 @@ class EWLayerImpl : public LayerImpl<ValueType> {
public:
EWLayerImpl() = delete;
EWLayerImpl(const Shape& shape, std::string function, float alpha = 0.0F,
float beta = 0.0F);
float beta = 0.0F,
ParBackend parallel_backend = ParBackend::kSeq);
EWLayerImpl(const EWLayerImpl& c) = default;
EWLayerImpl& operator=(const EWLayerImpl& c) = default;
std::vector<ValueType> run(
@@ -56,57 +57,84 @@ class EWLayerImpl : public LayerImpl<ValueType> {
std::string func_;
float alpha_;
float beta_;
ParBackend parallel_backend_;
};

template <typename ValueType>
EWLayerImpl<ValueType>::EWLayerImpl(const Shape& shape, std::string function,
float alpha, float beta)
float alpha, float beta,
ParBackend parallel_backend)
: LayerImpl<ValueType>(shape, shape),
func_(std::move(function)),
alpha_(alpha),
beta_(beta) {}
beta_(beta),
parallel_backend_(parallel_backend) {}

template <typename ValueType>
std::vector<ValueType> EWLayerImpl<ValueType>::run(
const std::vector<ValueType>& input) const {
std::vector<ValueType> res(this->outputShape_.count());

parallel::Options options;
options.backend = parallel_backend_;

if (func_ == "relu") {
std::transform(input.begin(), input.end(), res.begin(), relu<ValueType>);
parallel::parallel_for(
input.size(),
[&](std::size_t i) {
res[i] = input[i] > ValueType(0) ? input[i] : ValueType(0);
},
options);
} else if (func_ == "tanh") {
auto tanh = [&](const ValueType& value) -> ValueType {
return static_cast<ValueType>(std::tanh(value));
};
std::transform(input.begin(), input.end(), res.begin(), tanh);
parallel::parallel_for(
input.size(),
[&](std::size_t i) {
res[i] = static_cast<ValueType>(std::tanh(input[i]));
},
options);
} else if (func_ == "sin") {
auto sin = [&](const ValueType& value) -> ValueType {
return static_cast<ValueType>(std::sin(value));
};
std::transform(input.begin(), input.end(), res.begin(), sin);
parallel::parallel_for(
input.size(),
[&](std::size_t i) {
res[i] = static_cast<ValueType>(std::sin(input[i]));
},
options);
} else if (func_ == "minus") {
auto minus = [&](const ValueType& value) -> ValueType { return -value; };
std::transform(input.begin(), input.end(), res.begin(), minus);
parallel::parallel_for(
input.size(), [&](std::size_t i) { res[i] = -input[i]; }, options);
} else if (func_ == "linear") {
auto linear = [&](const ValueType& value) -> ValueType {
return value * static_cast<ValueType>(alpha_) +
static_cast<ValueType>(beta_);
};
std::transform(input.begin(), input.end(), res.begin(), linear);
parallel::parallel_for(
input.size(),
[&](std::size_t i) {
res[i] = input[i] * static_cast<ValueType>(alpha_) +
static_cast<ValueType>(beta_);
},
options);
} else if (func_ == "sigmoid") {
auto sigmoid = [](ValueType x) -> ValueType {
if constexpr (std::is_integral_v<ValueType>) {
auto x_float = static_cast<float>(x);
float result = 1.0F / (1.0F + std::exp(-x_float));
return static_cast<ValueType>(std::round(result));
} else {
if (x >= ValueType(0)) {
ValueType z = std::exp(-x);
return ValueType(1) / (ValueType(1) + z);
}
ValueType z = std::exp(x);
return z / (ValueType(1) + z);
}
};
std::transform(input.cbegin(), input.cend(), res.begin(), sigmoid);
if constexpr (std::is_integral_v<ValueType>) {
parallel::parallel_for(
input.size(),
[&](std::size_t i) {
auto x_float = static_cast<float>(input[i]);
float result = 1.0F / (1.0F + std::exp(-x_float));
res[i] = static_cast<ValueType>(std::round(result));
},
options);
} else {
parallel::parallel_for(
input.size(),
[&](std::size_t i) {
ValueType x = input[i];
if (x >= ValueType(0)) {
ValueType z = std::exp(-x);
res[i] = ValueType(1) / (ValueType(1) + z);
} else {
ValueType z = std::exp(x);
res[i] = z / (ValueType(1) + z);
}
},
options);
}
} else {
throw std::invalid_argument("No such function for EWLayer");
}
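For context, a hedged usage sketch of the extended EWLayerImpl constructor with its new ParBackend parameter (not code from the PR; the Shape initializer-list construction is an assumption):

#include <vector>

#include "layers/EWLayer.hpp"

using namespace it_lab_ai;

// Hypothetical helper: apply "relu" element-wise over a flat buffer with the
// TBB backend selected through the new constructor argument.
std::vector<float> relu_with_tbb(const std::vector<float>& input) {
  EWLayerImpl<float> layer(Shape({input.size()}), "relu", 0.0F, 0.0F,
                           ParBackend::kTbb);  // backend is the new argument
  return layer.run(input);  // run() dispatches through parallel::parallel_for
}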
11 changes: 9 additions & 2 deletions include/layers/Layer.hpp
@@ -1,13 +1,17 @@
#pragma once
#include <algorithm>
#include <execution>
#include <functional>
#include <initializer_list>
#include <iostream>
#include <numeric>
#include <stdexcept>
#include <string>
#include <vector>

#include "layers/Shape.hpp"
#include "layers/Tensor.hpp"
#include "oneapi/tbb.h"
#include "parallel/parallel.hpp"

namespace it_lab_ai {

@@ -33,6 +37,7 @@ enum LayerType : uint8_t {
};

enum ImplType : uint8_t { kDefault, kTBB, kSTL };
using ParBackend = parallel::Backend;

class Layer;

@@ -49,6 +54,8 @@ class Layer {
PostOperations postops;
int getID() const { return id_; }
void setID(int id) { id_ = id; }
void setParallelBackend(ParBackend backend) { parallel_backend_ = backend; }
ParBackend getParallelBackend() const { return parallel_backend_; }
LayerType getName() const { return type_; }
virtual void run(const std::vector<Tensor>& input,
std::vector<Tensor>& output) = 0;
@@ -59,6 +66,7 @@
protected:
int id_ = 0;
LayerType type_;
ParBackend parallel_backend_ = ParBackend::kSeq;
};

template <typename ValueType>
@@ -82,5 +90,4 @@ class LayerImpl {
Shape inputShape_;
Shape outputShape_;
};

} // namespace it_lab_ai
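A short illustration of the per-layer backend selection added here (a sketch, not code from the PR):

#include "layers/Layer.hpp"

// Hypothetical sketch: choosing a backend on any Layer through the new
// setParallelBackend()/getParallelBackend() pair. Layers default to
// ParBackend::kSeq, so untouched layers keep running sequentially.
void use_omp_backend(it_lab_ai::Layer& layer) {
  layer.setParallelBackend(it_lab_ai::ParBackend::kOmp);
  // getParallelBackend() reads the stored choice back, e.g. for logging.
}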
130 changes: 130 additions & 0 deletions include/parallel/backends.hpp
@@ -0,0 +1,130 @@
#pragma once
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <limits>
#include <thread>
#include <vector>

#ifdef HAS_OPENMP
// #include <omp.h>
#endif

#include <oneapi/tbb/blocked_range.h>
#include <oneapi/tbb/info.h>
#include <oneapi/tbb/parallel_for.h>

namespace it_lab_ai {
namespace parallel {

enum class Backend : std::uint8_t {
  kSeq = 0,
  kThreads = 1,
  kTbb = 2,
  kOmp = 3
};

struct Options {
  Backend backend = Backend::kSeq;
  int max_threads = 0;
  std::size_t min_parallel_n = 1000;
  std::size_t grain = 1024;
};

inline void impl_seq(std::size_t count,
                     const std::function<void(std::size_t)>& func) {
  for (std::size_t i = 0; i < count; ++i) {
    func(i);
  }
  std::cout << "Seq " << std::endl;
}

inline void impl_threads(std::size_t count,
                         const std::function<void(std::size_t)>& func,
                         const Options& opt) {
  int num_threads = opt.max_threads > 0
                        ? opt.max_threads
                        : static_cast<int>(std::thread::hardware_concurrency());
  if (num_threads == 0) num_threads = 4;

  std::size_t min_chunk_size = std::max(opt.grain, count / (num_threads * 4));
  if (count / num_threads < min_chunk_size) {
    num_threads = std::max(1, static_cast<int>(count / min_chunk_size));
  }

  std::vector<std::thread> threads;
  threads.reserve(num_threads);

  std::size_t chunk_size = count / num_threads;
  std::size_t remainder = count % num_threads;

  std::size_t start = 0;
  for (int t = 0; t < num_threads; ++t) {
    std::size_t end =
        start + chunk_size + (t < static_cast<int>(remainder) ? 1 : 0);
    if (start >= end) break;

    threads.emplace_back([start, end, &func]() {
      for (std::size_t i = start; i < end; ++i) {
        func(i);
      }
    });

    start = end;
  }

  for (auto& thread : threads) {
    thread.join();
  }
  std::cout << "Stl " << std::endl;
}

inline void impl_tbb(std::size_t count,
                     const std::function<void(std::size_t)>& func,
                     const Options& opt) {
  std::cout << "tbb " << std::endl;
  oneapi::tbb::parallel_for(
      oneapi::tbb::blocked_range<std::size_t>(0, count, opt.grain),
      [&](const oneapi::tbb::blocked_range<std::size_t>& range) {
        for (std::size_t i = range.begin(); i < range.end(); ++i) {
          func(i);
        }
      },
      oneapi::tbb::auto_partitioner());
}

#ifdef HAS_OPENMP
inline void impl_omp(std::size_t count,
                     const std::function<void(std::size_t)>& func,
                     const Options& opt) {
  if (count == 0) return;

  int num_threads = opt.max_threads > 0
                        ? opt.max_threads
                        : static_cast<int>(std::thread::hardware_concurrency());

  static_cast<void>(std::max(opt.grain, count / (num_threads * 8)));

  int int_count = static_cast<int>(count);
  if (int_count < 0 || static_cast<std::size_t>(int_count) != count) {
    impl_seq(count, func);
    return;
  }

#pragma omp parallel for schedule(static) num_threads(num_threads)
  for (int i = 0; i < int_count; ++i) {
    func(static_cast<std::size_t>(i));
  }
  std::cout << "OMP " << std::endl;
}
#else
inline void impl_omp(std::size_t count,
                     const std::function<void(std::size_t)>& func,
                     const Options& opt) {
  impl_seq(count, func);
}
#endif

} // namespace parallel
} // namespace it_lab_ai
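The dispatching parallel_for that EWLayer.hpp calls lives in parallel/parallel.hpp, which is not part of this diff. A minimal sketch of what such a dispatcher could look like, built only from the Options fields and impl_* helpers defined above (an assumption, not the PR's actual implementation):

#include "parallel/backends.hpp"

namespace it_lab_ai {
namespace parallel {

// Hypothetical dispatcher sketch: route to one of the impl_* backends above,
// falling back to the sequential path for small trip counts.
inline void parallel_for(std::size_t count,
                         const std::function<void(std::size_t)>& func,
                         const Options& opt = {}) {
  if (count < opt.min_parallel_n || opt.backend == Backend::kSeq) {
    impl_seq(count, func);
    return;
  }
  switch (opt.backend) {
    case Backend::kThreads:
      impl_threads(count, func, opt);
      break;
    case Backend::kTbb:
      impl_tbb(count, func, opt);
      break;
    case Backend::kOmp:
      impl_omp(count, func, opt);
      break;
    default:
      impl_seq(count, func);
      break;
  }
}

} // namespace parallel
} // namespace it_lab_ai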