diff --git a/CMakeLists.txt b/CMakeLists.txt index 029ae198..c51b3248 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,18 +32,28 @@ add_subdirectory(3rdparty) include(cmake/opencv_config.cmake) -if (NOT WIN32) - find_package(OpenMP REQUIRED) -endif() - -if (NOT WIN32) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra -Werror") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror") +find_package(OpenMP) + +if(OpenMP_FOUND) + message(STATUS "OpenMP found - enabling parallel support") + add_definitions(-DHAS_OPENMP) + if(TARGET OpenMP::OpenMP_CXX) + set(OPENMP_TARGET OpenMP::OpenMP_CXX) + message(STATUS "Using OpenMP target: OpenMP::OpenMP_CXX") + else() + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + if(OpenMP_CXX_LIBRARIES) + set(OPENMP_LIBRARIES ${OpenMP_CXX_LIBRARIES}) + endif() + message(STATUS "OpenMP CXX flags: ${OpenMP_CXX_FLAGS}") + message(STATUS "OpenMP libraries: ${OpenMP_CXX_LIBRARIES}") + endif() else() - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /W4 /wd4996 /wd4190 /wd4189 /WX") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4 /wd4996 /wd4190 /wd4189 /WX") + message(STATUS "OpenMP not found - parallel features disabled") endif() + foreach(CONFIG "" _DEBUG _RELEASE) set("CMAKE_ARCHIVE_OUTPUT_DIRECTORY${CONFIG}" "${CMAKE_BINARY_DIR}/lib") set("CMAKE_LIBRARY_OUTPUT_DIRECTORY${CONFIG}" "${CMAKE_BINARY_DIR}/lib") diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt index 760af1d8..c9ac0850 100644 --- a/include/CMakeLists.txt +++ b/include/CMakeLists.txt @@ -15,3 +15,6 @@ set(PERF_HEADERS "${perf_headers}" PARENT_SCOPE) file(GLOB_RECURSE reader_headers Weights_Reader/*.h Weights_Reader/*.hpp) set(READER_HEADERS "${reader_headers}" PARENT_SCOPE) + +file(GLOB_RECURSE parallel_headers parallel/*.h parallel/*.hpp) +set(LAYERS_HEADERS "${parallel_headers}" PARENT_SCOPE) diff --git a/include/layers/EWLayer.hpp b/include/layers/EWLayer.hpp index 7361689c..a7f18272 100644 --- a/include/layers/EWLayer.hpp +++ b/include/layers/EWLayer.hpp @@ -46,7 +46,8 @@ class EWLayerImpl : public LayerImpl { public: EWLayerImpl() = delete; EWLayerImpl(const Shape& shape, std::string function, float alpha = 0.0F, - float beta = 0.0F); + float beta = 0.0F, + ParBackend parallel_backend = ParBackend::kSeq); EWLayerImpl(const EWLayerImpl& c) = default; EWLayerImpl& operator=(const EWLayerImpl& c) = default; std::vector run( @@ -56,57 +57,84 @@ class EWLayerImpl : public LayerImpl { std::string func_; float alpha_; float beta_; + ParBackend parallel_backend_; }; template EWLayerImpl::EWLayerImpl(const Shape& shape, std::string function, - float alpha, float beta) + float alpha, float beta, + ParBackend parallel_backend) : LayerImpl(shape, shape), func_(std::move(function)), alpha_(alpha), - beta_(beta) {} + beta_(beta), + parallel_backend_(parallel_backend) {} template std::vector EWLayerImpl::run( const std::vector& input) const { std::vector res(this->outputShape_.count()); + + parallel::Options options; + options.backend = parallel_backend_; + if (func_ == "relu") { - std::transform(input.begin(), input.end(), res.begin(), relu); + parallel::parallel_for( + input.size(), + [&](std::size_t i) { + res[i] = input[i] > ValueType(0) ? input[i] : ValueType(0); + }, + options); } else if (func_ == "tanh") { - auto tanh = [&](const ValueType& value) -> ValueType { - return static_cast(std::tanh(value)); - }; - std::transform(input.begin(), input.end(), res.begin(), tanh); + parallel::parallel_for( + input.size(), + [&](std::size_t i) { + res[i] = static_cast(std::tanh(input[i])); + }, + options); } else if (func_ == "sin") { - auto sin = [&](const ValueType& value) -> ValueType { - return static_cast(std::sin(value)); - }; - std::transform(input.begin(), input.end(), res.begin(), sin); + parallel::parallel_for( + input.size(), + [&](std::size_t i) { + res[i] = static_cast(std::sin(input[i])); + }, + options); } else if (func_ == "minus") { - auto minus = [&](const ValueType& value) -> ValueType { return -value; }; - std::transform(input.begin(), input.end(), res.begin(), minus); + parallel::parallel_for( + input.size(), [&](std::size_t i) { res[i] = -input[i]; }, options); } else if (func_ == "linear") { - auto linear = [&](const ValueType& value) -> ValueType { - return value * static_cast(alpha_) + - static_cast(beta_); - }; - std::transform(input.begin(), input.end(), res.begin(), linear); + parallel::parallel_for( + input.size(), + [&](std::size_t i) { + res[i] = input[i] * static_cast(alpha_) + + static_cast(beta_); + }, + options); } else if (func_ == "sigmoid") { - auto sigmoid = [](ValueType x) -> ValueType { - if constexpr (std::is_integral_v) { - auto x_float = static_cast(x); - float result = 1.0F / (1.0F + std::exp(-x_float)); - return static_cast(std::round(result)); - } else { - if (x >= ValueType(0)) { - ValueType z = std::exp(-x); - return ValueType(1) / (ValueType(1) + z); - } - ValueType z = std::exp(x); - return z / (ValueType(1) + z); - } - }; - std::transform(input.cbegin(), input.cend(), res.begin(), sigmoid); + if constexpr (std::is_integral_v) { + parallel::parallel_for( + input.size(), + [&](std::size_t i) { + auto x_float = static_cast(input[i]); + float result = 1.0F / (1.0F + std::exp(-x_float)); + res[i] = static_cast(std::round(result)); + }, + options); + } else { + parallel::parallel_for( + input.size(), + [&](std::size_t i) { + ValueType x = input[i]; + if (x >= ValueType(0)) { + ValueType z = std::exp(-x); + res[i] = ValueType(1) / (ValueType(1) + z); + } else { + ValueType z = std::exp(x); + res[i] = z / (ValueType(1) + z); + } + }, + options); + } } else { throw std::invalid_argument("No such function for EWLayer"); } diff --git a/include/layers/Layer.hpp b/include/layers/Layer.hpp index 2da4e0a5..b0127942 100644 --- a/include/layers/Layer.hpp +++ b/include/layers/Layer.hpp @@ -1,5 +1,9 @@ #pragma once +#include +#include +#include #include +#include #include #include #include @@ -7,7 +11,7 @@ #include "layers/Shape.hpp" #include "layers/Tensor.hpp" -#include "oneapi/tbb.h" +#include "parallel/parallel.hpp" namespace it_lab_ai { @@ -33,6 +37,7 @@ enum LayerType : uint8_t { }; enum ImplType : uint8_t { kDefault, kTBB, kSTL }; +using ParBackend = parallel::Backend; class Layer; @@ -49,6 +54,8 @@ class Layer { PostOperations postops; int getID() const { return id_; } void setID(int id) { id_ = id; } + void setParallelBackend(ParBackend backend) { parallel_backend_ = backend; } + ParBackend getParallelBackend() const { return parallel_backend_; } LayerType getName() const { return type_; } virtual void run(const std::vector& input, std::vector& output) = 0; @@ -59,6 +66,7 @@ class Layer { protected: int id_ = 0; LayerType type_; + ParBackend parallel_backend_ = ParBackend::kSeq; }; template @@ -82,5 +90,4 @@ class LayerImpl { Shape inputShape_; Shape outputShape_; }; - } // namespace it_lab_ai diff --git a/include/parallel/backends.hpp b/include/parallel/backends.hpp new file mode 100644 index 00000000..28f118eb --- /dev/null +++ b/include/parallel/backends.hpp @@ -0,0 +1,130 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAS_OPENMP +// #include +#endif + +#include +#include +#include + +namespace it_lab_ai { +namespace parallel { + +enum class Backend : std::uint8_t { + kSeq = 0, + kThreads = 1, + kTbb = 2, + kOmp = 3 +}; + +struct Options { + Backend backend = Backend::kSeq; + int max_threads = 0; + std::size_t min_parallel_n = 1000; + std::size_t grain = 1024; +}; + +inline void impl_seq(std::size_t count, + const std::function& func) { + for (std::size_t i = 0; i < count; ++i) { + func(i); + } + std::cout << "Seq " << std::endl; +} + +inline void impl_threads(std::size_t count, + const std::function& func, + const Options& opt) { + int num_threads = opt.max_threads > 0 + ? opt.max_threads + : static_cast(std::thread::hardware_concurrency()); + if (num_threads == 0) num_threads = 4; + + std::size_t min_chunk_size = std::max(opt.grain, count / (num_threads * 4)); + if (count / num_threads < min_chunk_size) { + num_threads = std::max(1, static_cast(count / min_chunk_size)); + } + + std::vector threads; + threads.reserve(num_threads); + + std::size_t chunk_size = count / num_threads; + std::size_t remainder = count % num_threads; + + std::size_t start = 0; + for (int t = 0; t < num_threads; ++t) { + std::size_t end = + start + chunk_size + (t < static_cast(remainder) ? 1 : 0); + if (start >= end) break; + + threads.emplace_back([start, end, &func]() { + for (std::size_t i = start; i < end; ++i) { + func(i); + } + }); + + start = end; + } + + for (auto& thread : threads) { + thread.join(); + } + std::cout << "Stl " << std::endl; +} + +inline void impl_tbb(std::size_t count, + const std::function& func, + const Options& opt) { + std::cout << "tbb " << std::endl; + oneapi::tbb::parallel_for( + oneapi::tbb::blocked_range(0, count, opt.grain), + [&](const oneapi::tbb::blocked_range& range) { + for (std::size_t i = range.begin(); i < range.end(); ++i) { + func(i); + } + }, + oneapi::tbb::auto_partitioner()); +} + +#ifdef HAS_OPENMP +inline void impl_omp(std::size_t count, + const std::function& func, + const Options& opt) { + if (count == 0) return; + + int num_threads = opt.max_threads > 0 + ? opt.max_threads + : static_cast(std::thread::hardware_concurrency()); + + static_cast(std::max(opt.grain, count / (num_threads * 8))); + + int int_count = static_cast(count); + if (int_count < 0 || static_cast(int_count) != count) { + impl_seq(count, func); + return; + } + +#pragma omp parallel for schedule(static) num_threads(num_threads) + for (int i = 0; i < int_count; ++i) { + func(static_cast(i)); + } + std::cout << "OMP " << std::endl; +} +#else +inline void impl_omp(std::size_t count, + const std::function& func, + const Options& opt) { + impl_seq(count, func); +} +#endif + +} // namespace parallel +} // namespace it_lab_ai \ No newline at end of file diff --git a/include/parallel/parallel.hpp b/include/parallel/parallel.hpp new file mode 100644 index 00000000..5232dcae --- /dev/null +++ b/include/parallel/parallel.hpp @@ -0,0 +1,76 @@ +#pragma once +#include "backends.hpp" + +namespace it_lab_ai { +namespace parallel { + +constexpr bool kHasOmp = +#ifdef HAS_OPENMP + true; +#else + false; +#endif + +inline Backend resolve_default_backend(std::size_t n, const Options& opt) { + if (n < opt.min_parallel_n) { + return Backend::kSeq; + } + +#ifdef HAS_OPENMP + return Backend::kOmp; +#else + return Backend::kTbb; +#endif +} + +inline Backend select_backend(const Options& opt, std::size_t n) { + if (opt.backend != Backend::kSeq && n < opt.min_parallel_n) { + return Backend::kSeq; + } + + if (opt.backend == Backend::kSeq || opt.backend == Backend::kThreads || + opt.backend == Backend::kTbb || opt.backend == Backend::kOmp) { + return opt.backend; + } + + return resolve_default_backend(n, opt); +} + +template +inline void parallel_for(std::size_t count, Func&& func, + const Options& opt = {}) { + if (count == 0) return; + + Backend backend = select_backend(opt, count); + + switch (backend) { + case Backend::kSeq: + impl_seq(count, std::forward(func)); + break; + case Backend::kThreads: + impl_threads(count, std::forward(func), opt); + break; + case Backend::kTbb: + impl_tbb(count, std::forward(func), opt); + break; + case Backend::kOmp: + impl_omp(count, std::forward(func), opt); + break; + } +} + +template +inline void parallel_for(std::size_t count, Func&& func, Backend backend) { + Options opt; + opt.backend = backend; + parallel_for(count, std::forward(func), opt); +} + +template +inline void parallel_for(int count, Func&& func, const Options& opt = {}) { + if (count <= 0) return; + parallel_for(static_cast(count), std::forward(func), opt); +} + +} // namespace parallel +} // namespace it_lab_ai \ No newline at end of file diff --git a/src/layers/CMakeLists.txt b/src/layers/CMakeLists.txt index f8ac6d84..7f22b872 100644 --- a/src/layers/CMakeLists.txt +++ b/src/layers/CMakeLists.txt @@ -1,4 +1,7 @@ file(GLOB_RECURSE layers_src *.cpp) add_library(layers_lib STATIC "${LAYERS_HEADERS}" "${layers_src}") target_link_libraries(layers_lib PUBLIC TBB_unified) +# if(OpenMP_FOUND) +# target_link_libraries(layers_lib PUBLIC OpenMP::OpenMP_CXX) +# endif() target_link_libraries(layers_lib PUBLIC dnnl) diff --git a/src/layers/EWLayer.cpp b/src/layers/EWLayer.cpp index dc86b381..6a7da06c 100644 --- a/src/layers/EWLayer.cpp +++ b/src/layers/EWLayer.cpp @@ -1,5 +1,8 @@ #include "layers/EWLayer.hpp" +#include +#include + namespace it_lab_ai { void EWLayer::run(const std::vector& input, @@ -7,15 +10,20 @@ void EWLayer::run(const std::vector& input, if (input.size() != 1) { throw std::runtime_error("EWLayer: Input tensors not 1"); } + + ParBackend backend = getParallelBackend(); + switch (input[0].get_type()) { case Type::kInt: { - EWLayerImpl used_impl(input[0].get_shape(), func_, alpha_, beta_); - output[0] = - make_tensor(used_impl.run(*input[0].as()), input[0].get_shape()); + EWLayerImpl used_impl(input[0].get_shape(), func_, alpha_, beta_, + backend); + std::vector tmp = used_impl.run(*input[0].as()); + output[0] = make_tensor(tmp, input[0].get_shape()); break; } case Type::kFloat: { - EWLayerImpl used_impl(input[0].get_shape(), func_, alpha_, beta_); + EWLayerImpl used_impl(input[0].get_shape(), func_, alpha_, beta_, + backend); output[0] = make_tensor(used_impl.run(*input[0].as()), input[0].get_shape()); break; diff --git a/src/layers_oneDNN/EWLayer.cpp b/src/layers_oneDNN/EWLayer.cpp index fc838705..fc7d66d7 100644 --- a/src/layers_oneDNN/EWLayer.cpp +++ b/src/layers_oneDNN/EWLayer.cpp @@ -1,5 +1,7 @@ #include "layers_oneDNN/EWLayer.hpp" +#include +#include #include #include diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index fc95325c..0c72d407 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,9 +1,9 @@ file(GLOB_RECURSE TEST_SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) add_executable(run_test ${TEST_SRC_FILES}) -if (NOT WIN32) - target_link_libraries(run_test PUBLIC OpenMP::OpenMP_CXX) -endif() +# if(OpenMP_FOUND) +# target_link_libraries(run_test PUBLIC OpenMP::OpenMP_CXX) +# endif() target_link_libraries(run_test PUBLIC perf_lib layers_lib layers_oneDNN_lib) target_link_libraries(run_test PUBLIC gtest) target_link_libraries(run_test PUBLIC ReadLib) diff --git a/test/single_layer/test_ewlayer.cpp b/test/single_layer/test_ewlayer.cpp index 65547b2a..67895210 100644 --- a/test/single_layer/test_ewlayer.cpp +++ b/test/single_layer/test_ewlayer.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -215,3 +216,223 @@ TEST(ewlayer, new_ewlayer_can_sigmoid_float_extreme_values) { EXPECT_NEAR((*out[0].as())[i], expected_output[i], 1e-5F); } } + +TEST(ewlayer, parallel_for_ew) { + EWLayer layer0("relu"); + layer0.setParallelBackend(ParBackend::kSeq); + EWLayer layer1("relu"); + layer1.setParallelBackend(ParBackend::kThreads); + EWLayer layer2("relu"); + layer2.setParallelBackend(ParBackend::kTbb); + EWLayer layer3("relu"); + layer3.setParallelBackend(ParBackend::kOmp); + + std::vector vec(800000000, -1); + Tensor input = make_tensor(vec); + Tensor output; + std::vector in{input}; + std::vector out{output}; + + auto start = std::chrono::high_resolution_clock::now(); + layer0.run(in, out); + auto end = std::chrono::high_resolution_clock::now(); + auto total_duration = + std::chrono::duration_cast(end - start); + std::cout << "Sequential: " << total_duration.count() << " ms" << std::endl; + for (size_t i = 0; i < 800000000; i++) { + EXPECT_EQ((*out[0].as())[i], 0); + } + + start = std::chrono::high_resolution_clock::now(); + layer1.run(in, out); + end = std::chrono::high_resolution_clock::now(); + total_duration = + std::chrono::duration_cast(end - start); + std::cout << "Threads: " << total_duration.count() << " ms" << std::endl; + for (size_t i = 0; i < 800000000; i++) { + EXPECT_EQ((*out[0].as())[i], 0); + } + + start = std::chrono::high_resolution_clock::now(); + layer2.run(in, out); + end = std::chrono::high_resolution_clock::now(); + total_duration = + std::chrono::duration_cast(end - start); + std::cout << "TBB: " << total_duration.count() << " ms" << std::endl; + for (size_t i = 0; i < 800000000; i++) { + EXPECT_EQ((*out[0].as())[i], 0); + } + + start = std::chrono::high_resolution_clock::now(); + layer3.run(in, out); + end = std::chrono::high_resolution_clock::now(); + total_duration = + std::chrono::duration_cast(end - start); + std::cout << "OpenMP: " << total_duration.count() << " ms" << std::endl; + for (size_t i = 0; i < 800000000; i++) { + EXPECT_EQ((*out[0].as())[i], 0); + } +} + +TEST(ewlayer, parallel_for_ew_sigmoid) { + EWLayer layer0("sigmoid"); + layer0.setParallelBackend(ParBackend::kSeq); + EWLayer layer1("sigmoid"); + layer1.setParallelBackend(ParBackend::kThreads); + EWLayer layer2("sigmoid"); + layer2.setParallelBackend(ParBackend::kTbb); + EWLayer layer3("sigmoid"); + layer3.setParallelBackend(ParBackend::kOmp); + + std::vector vec(800000000, -1); + Tensor input = make_tensor(vec); + Tensor output; + std::vector in{input}; + std::vector out{output}; + + auto start = std::chrono::high_resolution_clock::now(); + layer0.run(in, out); + auto end = std::chrono::high_resolution_clock::now(); + auto total_duration = + std::chrono::duration_cast(end - start); + std::cout << "Sequential sigmoid: " << total_duration.count() << " ms" + << std::endl; + + start = std::chrono::high_resolution_clock::now(); + layer1.run(in, out); + end = std::chrono::high_resolution_clock::now(); + total_duration = + std::chrono::duration_cast(end - start); + std::cout << "Threads sigmoid: " << total_duration.count() << " ms" + << std::endl; + + start = std::chrono::high_resolution_clock::now(); + layer2.run(in, out); + end = std::chrono::high_resolution_clock::now(); + total_duration = + std::chrono::duration_cast(end - start); + std::cout << "TBB sigmoid: " << total_duration.count() << " ms" << std::endl; + + start = std::chrono::high_resolution_clock::now(); + layer3.run(in, out); + end = std::chrono::high_resolution_clock::now(); + total_duration = + std::chrono::duration_cast(end - start); + std::cout << "OpenMP sigmoid: " << total_duration.count() << " ms" + << std::endl; + + EXPECT_EQ(0, 0); +} + +TEST(ewlayer, parallel_for_direct) { + const int SIZE = 20000; + std::vector matrix1(SIZE * SIZE); + std::vector matrix2(SIZE * SIZE); + std::vector result(SIZE * SIZE); + + for (int i = 0; i < SIZE * SIZE; ++i) { + matrix1[i] = 1; + matrix2[i] = 1; + } + + auto start = std::chrono::high_resolution_clock::now(); + parallel::parallel_for( + SIZE * SIZE, [&](std::size_t i) { result[i] = matrix1[i] + matrix2[i]; }, + ParBackend::kSeq); + + auto end = std::chrono::high_resolution_clock::now(); + auto total_duration = + std::chrono::duration_cast(end - start); + std::cout << "Sequential direct: " << total_duration.count() << " ms" + << std::endl; + + for (int i = 0; i < SIZE * SIZE; i++) ASSERT_EQ(result[i], 2); + + start = std::chrono::high_resolution_clock::now(); + parallel::parallel_for( + SIZE * SIZE, [&](std::size_t i) { result[i] = matrix1[i] + matrix2[i]; }, + ParBackend::kThreads); + end = std::chrono::high_resolution_clock::now(); + total_duration = + std::chrono::duration_cast(end - start); + std::cout << "Threads direct: " << total_duration.count() << " ms" + << std::endl; + for (int i = 0; i < SIZE * SIZE; i++) ASSERT_EQ(result[i], 2); + + start = std::chrono::high_resolution_clock::now(); + parallel::parallel_for( + SIZE * SIZE, [&](std::size_t i) { result[i] = matrix1[i] + matrix2[i]; }, + ParBackend::kTbb); + end = std::chrono::high_resolution_clock::now(); + total_duration = + std::chrono::duration_cast(end - start); + std::cout << "TBB direct: " << total_duration.count() << " ms" << std::endl; + for (int i = 0; i < SIZE * SIZE; i++) ASSERT_EQ(result[i], 2); + + start = std::chrono::high_resolution_clock::now(); + parallel::parallel_for( + SIZE * SIZE, [&](std::size_t i) { result[i] = matrix1[i] + matrix2[i]; }, + ParBackend::kOmp); + end = std::chrono::high_resolution_clock::now(); + total_duration = + std::chrono::duration_cast(end - start); + std::cout << "OpenMP direct: " << total_duration.count() << " ms" + << std::endl; + for (int i = 0; i < SIZE * SIZE; i++) ASSERT_EQ(result[i], 2); +} + +TEST(ewlayer, parallel_for_notmatrix) { + const int SIZE = 30000; + std::vector matrix1(SIZE * SIZE); + std::vector result(SIZE * SIZE); + + for (int i = 0; i < SIZE * SIZE; ++i) { + matrix1[i] = 1; + } + + auto start = std::chrono::high_resolution_clock::now(); + parallel::parallel_for( + SIZE * SIZE, [&](std::size_t i) { result[i] = matrix1[i] + 1; }, + ParBackend::kSeq); + + auto end = std::chrono::high_resolution_clock::now(); + auto total_duration = + std::chrono::duration_cast(end - start); + std::cout << "Sequential notmatrix: " << total_duration.count() << " ms" + << std::endl; + + for (int i = 0; i < SIZE * SIZE; i++) ASSERT_EQ(result[i], 2); + + start = std::chrono::high_resolution_clock::now(); + parallel::parallel_for( + SIZE * SIZE, [&](std::size_t i) { result[i] = matrix1[i] + 1; }, + ParBackend::kThreads); + end = std::chrono::high_resolution_clock::now(); + total_duration = + std::chrono::duration_cast(end - start); + std::cout << "Threads notmatrix: " << total_duration.count() << " ms" + << std::endl; + for (int i = 0; i < SIZE * SIZE; i++) ASSERT_EQ(result[i], 2); + + start = std::chrono::high_resolution_clock::now(); + parallel::parallel_for( + SIZE * SIZE, [&](std::size_t i) { result[i] = matrix1[i] + 1; }, + ParBackend::kTbb); + end = std::chrono::high_resolution_clock::now(); + total_duration = + std::chrono::duration_cast(end - start); + std::cout << "TBB notmatrix: " << total_duration.count() << " ms" + << std::endl; + for (int i = 0; i < SIZE * SIZE; i++) ASSERT_EQ(result[i], 2); + + start = std::chrono::high_resolution_clock::now(); + parallel::parallel_for( + SIZE * SIZE, [&](std::size_t i) { result[i] = matrix1[i] + 1; }, + ParBackend::kOmp); + end = std::chrono::high_resolution_clock::now(); + total_duration = + std::chrono::duration_cast(end - start); + std::cout << "OpenMP notmatrix: " << total_duration.count() << " ms" + << std::endl; + for (int i = 0; i < SIZE * SIZE; i++) ASSERT_EQ(result[i], 2); +}