diff --git a/CMakeLists.txt b/CMakeLists.txt
index 029ae198..c51b3248 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,18 +32,28 @@ add_subdirectory(3rdparty)
 
 include(cmake/opencv_config.cmake)
 
-if (NOT WIN32)
-    find_package(OpenMP REQUIRED)
-endif()
-
-if (NOT WIN32)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra -Werror")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror")
+find_package(OpenMP)  # optional: configuration continues when it is missing
+
+if(OpenMP_FOUND)  # NOTE(review): this hunk drops -Wall/-Wextra/-Werror and /W4 /WX without re-adding them - confirm intended
+    message(STATUS "OpenMP found - enabling parallel support")
+    add_definitions(-DHAS_OPENMP)  # macro checked by include/parallel/*.hpp
+    if(TARGET OpenMP::OpenMP_CXX)
+        set(OPENMP_TARGET OpenMP::OpenMP_CXX)  # NOTE(review): not read anywhere in this diff - confirm a target links it
+        message(STATUS "Using OpenMP target: OpenMP::OpenMP_CXX")
+    else()  # no imported target: fall back to global compiler flags
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+        if(OpenMP_CXX_LIBRARIES)
+            set(OPENMP_LIBRARIES ${OpenMP_CXX_LIBRARIES})  # NOTE(review): not read anywhere in this diff
+        endif()
+        message(STATUS "OpenMP CXX flags: ${OpenMP_CXX_FLAGS}")
+        message(STATUS "OpenMP libraries: ${OpenMP_CXX_LIBRARIES}")
+    endif()
 else()
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /W4 /wd4996 /wd4190 /wd4189 /WX")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4 /wd4996 /wd4190 /wd4189 /WX")
+    message(STATUS "OpenMP not found - parallel features disabled")
 endif()
 
+
 foreach(CONFIG "" _DEBUG _RELEASE)
     set("CMAKE_ARCHIVE_OUTPUT_DIRECTORY${CONFIG}" "${CMAKE_BINARY_DIR}/lib")
     set("CMAKE_LIBRARY_OUTPUT_DIRECTORY${CONFIG}" "${CMAKE_BINARY_DIR}/lib")
diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt
index 760af1d8..c9ac0850 100644
--- a/include/CMakeLists.txt
+++ b/include/CMakeLists.txt
@@ -15,3 +15,8 @@ set(PERF_HEADERS "${perf_headers}" PARENT_SCOPE)
 
 file(GLOB_RECURSE reader_headers Weights_Reader/*.h Weights_Reader/*.hpp)
 set(READER_HEADERS "${reader_headers}" PARENT_SCOPE)
+
+# Headers of the parallel_for abstraction. They are exported under their own
+# name so that LAYERS_HEADERS (used by src/layers/CMakeLists.txt) is not replaced.
+file(GLOB_RECURSE parallel_headers parallel/*.h parallel/*.hpp)
+set(PARALLEL_HEADERS "${parallel_headers}" PARENT_SCOPE)
diff --git a/include/layers/EWLayer.hpp b/include/layers/EWLayer.hpp
index 7361689c..a7f18272 100644
--- a/include/layers/EWLayer.hpp
+++ b/include/layers/EWLayer.hpp
@@ -46,7 +46,8 @@ class EWLayerImpl : public LayerImpl<ValueType> {
  public:
   EWLayerImpl() = delete;
   EWLayerImpl(const Shape& shape, std::string function, float alpha = 0.0F,
-              float beta = 0.0F);
+              float beta = 0.0F,
+              ParBackend parallel_backend = ParBackend::kSeq);
   EWLayerImpl(const EWLayerImpl& c) = default;
   EWLayerImpl& operator=(const EWLayerImpl& c) = default;
   std::vector<ValueType> run(
@@ -56,57 +57,84 @@ class EWLayerImpl : public LayerImpl<ValueType> {
   std::string func_;
   float alpha_;
   float beta_;
+  ParBackend parallel_backend_;
 };
 
 template <typename ValueType>
 EWLayerImpl<ValueType>::EWLayerImpl(const Shape& shape, std::string function,
-                                    float alpha, float beta)
+                                    float alpha, float beta,
+                                    ParBackend parallel_backend)  // backend for run()
     : LayerImpl<ValueType>(shape, shape),
       func_(std::move(function)),
       alpha_(alpha),
-      beta_(beta) {}
+      beta_(beta),
+      parallel_backend_(parallel_backend) {}  // copied into Options by run()
 
 template <typename ValueType>
 std::vector<ValueType> EWLayerImpl<ValueType>::run(
     const std::vector<ValueType>& input) const {
   std::vector<ValueType> res(this->outputShape_.count());
+
+  parallel::Options options;
+  options.backend = parallel_backend_;
+
   if (func_ == "relu") {
-    std::transform(input.begin(), input.end(), res.begin(), relu<ValueType>);
+    parallel::parallel_for(
+        input.size(),
+        [&](std::size_t i) {
+          res[i] = input[i] > ValueType(0) ? input[i] : ValueType(0);
+        },
+        options);
   } else if (func_ == "tanh") {
-    auto tanh = [&](const ValueType& value) -> ValueType {
-      return static_cast<ValueType>(std::tanh(value));
-    };
-    std::transform(input.begin(), input.end(), res.begin(), tanh);
+    parallel::parallel_for(
+        input.size(),
+        [&](std::size_t i) {
+          res[i] = static_cast<ValueType>(std::tanh(input[i]));
+        },
+        options);
   } else if (func_ == "sin") {
-    auto sin = [&](const ValueType& value) -> ValueType {
-      return static_cast<ValueType>(std::sin(value));
-    };
-    std::transform(input.begin(), input.end(), res.begin(), sin);
+    parallel::parallel_for(
+        input.size(),
+        [&](std::size_t i) {
+          res[i] = static_cast<ValueType>(std::sin(input[i]));
+        },
+        options);
   } else if (func_ == "minus") {
-    auto minus = [&](const ValueType& value) -> ValueType { return -value; };
-    std::transform(input.begin(), input.end(), res.begin(), minus);
+    parallel::parallel_for(
+        input.size(), [&](std::size_t i) { res[i] = -input[i]; }, options);
   } else if (func_ == "linear") {
-    auto linear = [&](const ValueType& value) -> ValueType {
-      return value * static_cast<ValueType>(alpha_) +
-             static_cast<ValueType>(beta_);
-    };
-    std::transform(input.begin(), input.end(), res.begin(), linear);
+    parallel::parallel_for(
+        input.size(),
+        [&](std::size_t i) {
+          res[i] = input[i] * static_cast<ValueType>(alpha_) +
+                   static_cast<ValueType>(beta_);
+        },
+        options);
   } else if (func_ == "sigmoid") {
-    auto sigmoid = [](ValueType x) -> ValueType {
-      if constexpr (std::is_integral_v<ValueType>) {
-        auto x_float = static_cast<float>(x);
-        float result = 1.0F / (1.0F + std::exp(-x_float));
-        return static_cast<ValueType>(std::round(result));
-      } else {
-        if (x >= ValueType(0)) {
-          ValueType z = std::exp(-x);
-          return ValueType(1) / (ValueType(1) + z);
-        }
-        ValueType z = std::exp(x);
-        return z / (ValueType(1) + z);
-      }
-    };
-    std::transform(input.cbegin(), input.cend(), res.begin(), sigmoid);
+    if constexpr (std::is_integral_v<ValueType>) {
+      parallel::parallel_for(
+          input.size(),
+          [&](std::size_t i) {
+            auto x_float = static_cast<float>(input[i]);
+            float result = 1.0F / (1.0F + std::exp(-x_float));
+            res[i] = static_cast<ValueType>(std::round(result));
+          },
+          options);
+    } else {
+      parallel::parallel_for(
+          input.size(),
+          [&](std::size_t i) {
+            ValueType x = input[i];
+            if (x >= ValueType(0)) {
+              ValueType z = std::exp(-x);
+              res[i] = ValueType(1) / (ValueType(1) + z);
+            } else {
+              ValueType z = std::exp(x);
+              res[i] = z / (ValueType(1) + z);
+            }
+          },
+          options);
+    }
   } else {
     throw std::invalid_argument("No such function for EWLayer");
   }
diff --git a/include/layers/Layer.hpp b/include/layers/Layer.hpp
index 2da4e0a5..b0127942 100644
--- a/include/layers/Layer.hpp
+++ b/include/layers/Layer.hpp
@@ -1,5 +1,9 @@
 #pragma once
+#include <algorithm>
+#include <execution>
+#include <functional>
 #include <initializer_list>
+#include <iostream>
 #include <numeric>
 #include <stdexcept>
 #include <string>
@@ -7,7 +11,7 @@
 
 #include "layers/Shape.hpp"
 #include "layers/Tensor.hpp"
-#include "oneapi/tbb.h"
+#include "parallel/parallel.hpp"
 
 namespace it_lab_ai {
 
@@ -33,6 +37,7 @@ enum LayerType : uint8_t {
 };
 
 enum ImplType : uint8_t { kDefault, kTBB, kSTL };
+using ParBackend = parallel::Backend;  // alias for the parallel backend enum
 
 class Layer;
 
@@ -49,6 +54,8 @@ class Layer {
   PostOperations postops;
   int getID() const { return id_; }
   void setID(int id) { id_ = id; }
+  void setParallelBackend(ParBackend backend) { parallel_backend_ = backend; }
+  ParBackend getParallelBackend() const { return parallel_backend_; }
   LayerType getName() const { return type_; }
   virtual void run(const std::vector<Tensor>& input,
                    std::vector<Tensor>& output) = 0;
@@ -59,6 +66,7 @@ class Layer {
  protected:
   int id_ = 0;
   LayerType type_;
+  ParBackend parallel_backend_ = ParBackend::kSeq;
 };
 
 template <typename ValueType>
@@ -82,5 +90,4 @@ class LayerImpl {
   Shape inputShape_;
   Shape outputShape_;
 };
-
 }  // namespace it_lab_ai
diff --git a/include/parallel/backends.hpp b/include/parallel/backends.hpp
new file mode 100644
index 00000000..28f118eb
--- /dev/null
+++ b/include/parallel/backends.hpp
@@ -0,0 +1,171 @@
+#pragma once
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <exception>
+#include <functional>
+#include <iostream>
+#include <limits>
+#include <mutex>
+#include <thread>
+#include <vector>
+
+#ifdef HAS_OPENMP
+// #include <omp.h>
+#endif
+
+#include <oneapi/tbb/blocked_range.h>
+#include <oneapi/tbb/info.h>
+#include <oneapi/tbb/parallel_for.h>
+
+namespace it_lab_ai {
+namespace parallel {
+
+/// Execution strategies parallel_for can dispatch to.
+enum class Backend : std::uint8_t {
+  kSeq = 0,      ///< plain loop on the calling thread
+  kThreads = 1,  ///< one std::thread per chunk
+  kTbb = 2,      ///< oneTBB parallel_for
+  kOmp = 3       ///< OpenMP loop; sequential when HAS_OPENMP is unset
+};
+
+/// Tuning knobs shared by all backends.
+struct Options {
+  Backend backend = Backend::kSeq;
+  /// Upper bound on worker threads; values <= 0 mean "ask the hardware".
+  int max_threads = 0;
+  /// Ranges shorter than this are executed sequentially.
+  std::size_t min_parallel_n = 1000;
+  /// Preferred iterations per chunk (std::thread and TBB backends).
+  std::size_t grain = 1024;
+};
+
+/// Calls func(i) for every i in [0, count) on the calling thread.
+inline void impl_seq(std::size_t count,
+                     const std::function<void(std::size_t)>& func) {
+  for (std::size_t i = 0; i < count; ++i) {
+    func(i);
+  }
+}
+
+/// Splits [0, count) into contiguous chunks, one std::thread per chunk.
+///
+/// The thread count starts from opt.max_threads (or the hardware concurrency,
+/// 4 if that is unknown) and shrinks so that a chunk holds at least opt.grain
+/// iterations. Every worker is joined before returning; the first exception
+/// thrown by func is rethrown on the calling thread.
+inline void impl_threads(std::size_t count,
+                         const std::function<void(std::size_t)>& func,
+                         const Options& opt) {
+  if (count == 0) return;
+
+  std::size_t num_threads =
+      opt.max_threads > 0 ? static_cast<std::size_t>(opt.max_threads)
+                          : std::thread::hardware_concurrency();
+  if (num_threads == 0) num_threads = 4;
+
+  // Use fewer threads when the chunks would fall below the grain size.
+  const std::size_t min_chunk_size =
+      std::max(opt.grain, count / (num_threads * 4));
+  if (count / num_threads < min_chunk_size) {
+    num_threads = std::max<std::size_t>(1, count / min_chunk_size);
+  }
+
+  const std::size_t chunk_size = count / num_threads;
+  const std::size_t remainder = count % num_threads;
+
+  std::vector<std::thread> workers;
+  workers.reserve(num_threads);
+  std::exception_ptr first_error;
+  std::mutex error_mutex;
+
+  const auto join_all = [&workers]() {
+    for (auto& worker : workers) {
+      if (worker.joinable()) worker.join();
+    }
+  };
+
+  try {
+    std::size_t start = 0;
+    for (std::size_t t = 0; t < num_threads; ++t) {
+      const std::size_t end = start + chunk_size + (t < remainder ? 1 : 0);
+      if (start >= end) break;
+
+      workers.emplace_back([start, end, &func, &first_error, &error_mutex]() {
+        try {
+          for (std::size_t i = start; i < end; ++i) {
+            func(i);
+          }
+        } catch (...) {
+          const std::lock_guard<std::mutex> lock(error_mutex);
+          if (!first_error) first_error = std::current_exception();
+        }
+      });
+
+      start = end;
+    }
+  } catch (...) {
+    // Thread creation failed: joinable threads must not be destroyed.
+    join_all();
+    throw;
+  }
+
+  join_all();
+  if (first_error) std::rethrow_exception(first_error);
+}
+
+/// Runs func over [0, count) with oneTBB, using opt.grain as the grain size.
+inline void impl_tbb(std::size_t count,
+                     const std::function<void(std::size_t)>& func,
+                     const Options& opt) {
+  // blocked_range requires a strictly positive grain size.
+  const std::size_t grain = std::max<std::size_t>(opt.grain, 1);
+  oneapi::tbb::parallel_for(
+      oneapi::tbb::blocked_range<std::size_t>(0, count, grain),
+      [&func](const oneapi::tbb::blocked_range<std::size_t>& range) {
+        for (std::size_t i = range.begin(); i < range.end(); ++i) {
+          func(i);
+        }
+      },
+      oneapi::tbb::auto_partitioner());
+}
+
+#ifdef HAS_OPENMP
+/// Runs func over [0, count) with an OpenMP static-schedule loop.
+///
+/// Counts that do not fit in int fall back to impl_seq because the loop index
+/// is an int. func must not throw: an exception may not leave an OpenMP
+/// parallel region.
+inline void impl_omp(std::size_t count,
+                     const std::function<void(std::size_t)>& func,
+                     const Options& opt) {
+  if (count == 0) return;
+
+  if (count > static_cast<std::size_t>(std::numeric_limits<int>::max())) {
+    impl_seq(count, func);
+    return;
+  }
+  const int int_count = static_cast<int>(count);
+
+  int num_threads = opt.max_threads > 0
+                        ? opt.max_threads
+                        : static_cast<int>(std::thread::hardware_concurrency());
+  // hardware_concurrency() may report 0; the num_threads clause needs > 0.
+  if (num_threads <= 0) num_threads = 4;
+
+#pragma omp parallel for schedule(static) num_threads(num_threads)
+  for (int i = 0; i < int_count; ++i) {
+    func(static_cast<std::size_t>(i));
+  }
+}
+#else
+/// OpenMP is not enabled in this build: run the loop sequentially.
+inline void impl_omp(std::size_t count,
+                     const std::function<void(std::size_t)>& func,
+                     const Options& /*opt*/) {
+  impl_seq(count, func);
+}
+#endif
+
+}  // namespace parallel
+}  // namespace it_lab_ai
diff --git a/include/parallel/parallel.hpp b/include/parallel/parallel.hpp
new file mode 100644
index 00000000..5232dcae
--- /dev/null
+++ b/include/parallel/parallel.hpp
@@ -0,0 +1,93 @@
+#pragma once
+#include <cstddef>
+#include <utility>
+
+#include "backends.hpp"
+
+namespace it_lab_ai {
+namespace parallel {
+
+/// True when the build defines HAS_OPENMP.
+constexpr bool kHasOmp =
+#ifdef HAS_OPENMP
+    true;
+#else
+    false;
+#endif
+
+/// Backend used when Options::backend holds no known enumerator: sequential
+/// below opt.min_parallel_n, otherwise OpenMP if available, else TBB.
+inline Backend resolve_default_backend(std::size_t n, const Options& opt) {
+  if (n < opt.min_parallel_n) {
+    return Backend::kSeq;
+  }
+
+#ifdef HAS_OPENMP
+  return Backend::kOmp;
+#else
+  return Backend::kTbb;
+#endif
+}
+
+/// Picks the backend for a range of n iterations.
+///
+/// Ranges shorter than opt.min_parallel_n always run sequentially; otherwise
+/// the backend requested in opt is honoured.
+inline Backend select_backend(const Options& opt, std::size_t n) {
+  if (n < opt.min_parallel_n) {
+    return Backend::kSeq;
+  }
+
+  switch (opt.backend) {
+    case Backend::kSeq:
+    case Backend::kThreads:
+    case Backend::kTbb:
+    case Backend::kOmp:
+      return opt.backend;
+  }
+
+  // Only reachable when opt.backend was forged from an out-of-range integer.
+  return resolve_default_backend(n, opt);
+}
+
+/// Calls func(i) once for every i in [0, count) using the backend chosen by
+/// select_backend(). func must be safe to invoke concurrently for distinct i.
+template <typename Func>
+inline void parallel_for(std::size_t count, Func&& func,
+                         const Options& opt = {}) {
+  if (count == 0) return;
+
+  // Exactly one case runs, so forwarding func in each of them is safe.
+  switch (select_backend(opt, count)) {
+    case Backend::kSeq:
+      impl_seq(count, std::forward<Func>(func));
+      break;
+    case Backend::kThreads:
+      impl_threads(count, std::forward<Func>(func), opt);
+      break;
+    case Backend::kTbb:
+      impl_tbb(count, std::forward<Func>(func), opt);
+      break;
+    case Backend::kOmp:
+      impl_omp(count, std::forward<Func>(func), opt);
+      break;
+  }
+}
+
+/// Convenience overload: default Options with only the backend overridden.
+template <typename Func>
+inline void parallel_for(std::size_t count, Func&& func, Backend backend) {
+  Options opt;
+  opt.backend = backend;
+  parallel_for(count, std::forward<Func>(func), opt);
+}
+
+/// Signed-count overload: non-positive counts are a no-op.
+template <typename Func>
+inline void parallel_for(int count, Func&& func, const Options& opt = {}) {
+  if (count <= 0) return;
+  parallel_for(static_cast<std::size_t>(count), std::forward<Func>(func), opt);
+}
+
+}  // namespace parallel
+}  // namespace it_lab_ai
diff --git a/src/layers/CMakeLists.txt b/src/layers/CMakeLists.txt
index f8ac6d84..7f22b872 100644
--- a/src/layers/CMakeLists.txt
+++ b/src/layers/CMakeLists.txt
@@ -1,4 +1,9 @@
 file(GLOB_RECURSE layers_src *.cpp)
 add_library(layers_lib STATIC "${LAYERS_HEADERS}" "${layers_src}")
 target_link_libraries(layers_lib PUBLIC TBB_unified)
+# HAS_OPENMP only enables the "#pragma omp" code path; the compiler flags that
+# make the pragma effective come from the imported OpenMP target.
+if(OpenMP_FOUND AND TARGET OpenMP::OpenMP_CXX)
+    target_link_libraries(layers_lib PUBLIC OpenMP::OpenMP_CXX)
+endif()
 target_link_libraries(layers_lib PUBLIC dnnl)
diff --git a/src/layers/EWLayer.cpp b/src/layers/EWLayer.cpp
index dc86b381..6a7da06c 100644
--- a/src/layers/EWLayer.cpp
+++ b/src/layers/EWLayer.cpp
@@ -1,5 +1,8 @@
 #include "layers/EWLayer.hpp"
 
+#include <chrono>
+#include <cmath>
+
 namespace it_lab_ai {
 
 void EWLayer::run(const std::vector<Tensor>& input,
@@ -7,15 +10,20 @@ void EWLayer::run(const std::vector<Tensor>& input,
   if (input.size() != 1) {
     throw std::runtime_error("EWLayer: Input tensors not 1");
   }
+
+  ParBackend backend = getParallelBackend();
+
   switch (input[0].get_type()) {
     case Type::kInt: {
-      EWLayerImpl<int> used_impl(input[0].get_shape(), func_, alpha_, beta_);
-      output[0] =
-          make_tensor(used_impl.run(*input[0].as<int>()), input[0].get_shape());
+      EWLayerImpl<int> used_impl(input[0].get_shape(), func_, alpha_, beta_,
+                                 backend);  // forward the layer's backend
+      output[0] = make_tensor(used_impl.run(*input[0].as<int>()),
+                              input[0].get_shape());  // same as kFloat
       break;
     }
     case Type::kFloat: {
-      EWLayerImpl<float> used_impl(input[0].get_shape(), func_, alpha_, beta_);
+      EWLayerImpl<float> used_impl(input[0].get_shape(), func_, alpha_, beta_,
+                                   backend);
       output[0] = make_tensor(used_impl.run(*input[0].as<float>()),
                               input[0].get_shape());
       break;
diff --git a/src/layers_oneDNN/EWLayer.cpp b/src/layers_oneDNN/EWLayer.cpp
index fc838705..fc7d66d7 100644
--- a/src/layers_oneDNN/EWLayer.cpp
+++ b/src/layers_oneDNN/EWLayer.cpp
@@ -1,5 +1,7 @@
 #include "layers_oneDNN/EWLayer.hpp"
 
+#include <chrono>
+#include <cmath>
 #include <iostream>
 #include <stdexcept>
 
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index fc95325c..0c72d407 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,9 +1,9 @@
 file(GLOB_RECURSE TEST_SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
 
 add_executable(run_test ${TEST_SRC_FILES})
-if (NOT WIN32)
-    target_link_libraries(run_test PUBLIC OpenMP::OpenMP_CXX)
-endif()
+# if(OpenMP_FOUND)
+#     target_link_libraries(run_test PUBLIC OpenMP::OpenMP_CXX)
+# endif()
 target_link_libraries(run_test PUBLIC perf_lib layers_lib layers_oneDNN_lib)
 target_link_libraries(run_test PUBLIC gtest)
 target_link_libraries(run_test PUBLIC ReadLib)
diff --git a/test/single_layer/test_ewlayer.cpp b/test/single_layer/test_ewlayer.cpp
index 65547b2a..67895210 100644
--- a/test/single_layer/test_ewlayer.cpp
+++ b/test/single_layer/test_ewlayer.cpp
@@ -1,4 +1,5 @@
 #include <algorithm>
+#include <chrono>
 #include <cmath>
 #include <vector>
 
@@ -215,3 +216,92 @@ TEST(ewlayer, new_ewlayer_can_sigmoid_float_extreme_values) {
     EXPECT_NEAR((*out[0].as<float>())[i], expected_output[i], 1e-5F);
   }
 }
+
+namespace {
+
+// Large enough to exceed parallel::Options::min_parallel_n, so the parallel
+// backends are really exercised, yet small enough for a unit test.
+constexpr std::size_t kParallelTestSize = 100000;
+
+const ParBackend kAllBackends[] = {ParBackend::kSeq, ParBackend::kThreads,
+                                   ParBackend::kTbb, ParBackend::kOmp};
+
+// Runs an int EWLayer computing func over data on the given backend.
+std::vector<int> run_ew_on_backend(const char* func,
+                                   const std::vector<int>& data,
+                                   ParBackend backend) {
+  EWLayer layer(func);
+  layer.setParallelBackend(backend);
+  Tensor input = make_tensor<int>(data);
+  Tensor output;
+  std::vector<Tensor> in{input};
+  std::vector<Tensor> out{output};
+  layer.run(in, out);
+  return *out[0].as<int>();
+}
+
+// Alternates -1 and 3 so both the negative and positive branches are hit.
+std::vector<int> make_mixed_input() {
+  std::vector<int> data(kParallelTestSize);
+  for (std::size_t i = 0; i < data.size(); ++i) {
+    data[i] = (i % 2 == 0) ? -1 : 3;
+  }
+  return data;
+}
+
+}  // namespace
+
+TEST(ewlayer, parallel_for_ew) {
+  const std::vector<int> input = make_mixed_input();
+  for (ParBackend backend : kAllBackends) {
+    SCOPED_TRACE(static_cast<int>(backend));
+    const std::vector<int> result = run_ew_on_backend("relu", input, backend);
+    ASSERT_EQ(result.size(), input.size());
+    for (std::size_t i = 0; i < result.size(); ++i) {
+      ASSERT_EQ(result[i], input[i] > 0 ? input[i] : 0) << "index " << i;
+    }
+  }
+}
+
+TEST(ewlayer, parallel_for_ew_sigmoid) {
+  const std::vector<int> input = make_mixed_input();
+  for (ParBackend backend : kAllBackends) {
+    SCOPED_TRACE(static_cast<int>(backend));
+    const std::vector<int> result =
+        run_ew_on_backend("sigmoid", input, backend);
+    ASSERT_EQ(result.size(), input.size());
+    for (std::size_t i = 0; i < result.size(); ++i) {
+      // The int path rounds: sigmoid(-1) ~ 0.27 -> 0, sigmoid(3) ~ 0.95 -> 1.
+      ASSERT_EQ(result[i], input[i] > 0 ? 1 : 0) << "index " << i;
+    }
+  }
+}
+
+TEST(ewlayer, parallel_for_direct) {
+  const std::vector<int> lhs(kParallelTestSize, 1);
+  const std::vector<int> rhs(kParallelTestSize, 1);
+  for (ParBackend backend : kAllBackends) {
+    SCOPED_TRACE(static_cast<int>(backend));
+    std::vector<int> result(kParallelTestSize, 0);
+    parallel::parallel_for(
+        kParallelTestSize,
+        [&](std::size_t i) { result[i] = lhs[i] + rhs[i]; }, backend);
+    for (std::size_t i = 0; i < result.size(); ++i) {
+      ASSERT_EQ(result[i], 2) << "index " << i;
+    }
+  }
+}
+
+TEST(ewlayer, parallel_for_notmatrix) {
+  const std::vector<int> source(kParallelTestSize, 1);
+  for (ParBackend backend : kAllBackends) {
+    SCOPED_TRACE(static_cast<int>(backend));
+    std::vector<int> result(kParallelTestSize, 0);
+    parallel::parallel_for(
+        kParallelTestSize, [&](std::size_t i) { result[i] = source[i] + 1; },
+        backend);
+    for (std::size_t i = 0; i < result.size(); ++i) {
+      ASSERT_EQ(result[i], 2) << "index " << i;
+    }
+  }
+}