Skip to content
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 19 additions & 9 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,18 +32,28 @@ add_subdirectory(3rdparty)

include(cmake/opencv_config.cmake)

if (NOT WIN32)
find_package(OpenMP REQUIRED)
endif()

if (NOT WIN32)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra -Werror")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror")
find_package(OpenMP)

if(OpenMP_FOUND)
message(STATUS "OpenMP found - enabling parallel support")
add_definitions(-DHAS_OPENMP)
if(TARGET OpenMP::OpenMP_CXX)
set(OPENMP_TARGET OpenMP::OpenMP_CXX)
message(STATUS "Using OpenMP target: OpenMP::OpenMP_CXX")
else()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
if(OpenMP_CXX_LIBRARIES)
set(OPENMP_LIBRARIES ${OpenMP_CXX_LIBRARIES})
endif()
message(STATUS "OpenMP CXX flags: ${OpenMP_CXX_FLAGS}")
message(STATUS "OpenMP libraries: ${OpenMP_CXX_LIBRARIES}")
endif()
else()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /W4 /wd4996 /wd4190 /wd4189 /WX")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4 /wd4996 /wd4190 /wd4189 /WX")
message(STATUS "OpenMP not found - parallel features disabled")
endif()


foreach(CONFIG "" _DEBUG _RELEASE)
set("CMAKE_ARCHIVE_OUTPUT_DIRECTORY${CONFIG}" "${CMAKE_BINARY_DIR}/lib")
set("CMAKE_LIBRARY_OUTPUT_DIRECTORY${CONFIG}" "${CMAKE_BINARY_DIR}/lib")
Expand Down
94 changes: 60 additions & 34 deletions include/layers/EWLayer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class EWLayerImpl : public LayerImpl<ValueType> {
public:
EWLayerImpl() = delete;
EWLayerImpl(const Shape& shape, std::string function, float alpha = 0.0F,
float beta = 0.0F);
float beta = 0.0F, int type_parall = 0);
Copy link
Member

@allnes allnes Nov 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use a strongly-typed backend enum instead of int for readability and safety.

enum class ParBackend { Seq = 0, Threads = 1, TBB = 2, OMP = 3 };

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Propagate ParBackend through API instead of raw int

EWLayerImpl(const EWLayerImpl& c) = default;
EWLayerImpl& operator=(const EWLayerImpl& c) = default;
std::vector<ValueType> run(
Expand All @@ -56,57 +56,83 @@ class EWLayerImpl : public LayerImpl<ValueType> {
std::string func_;
float alpha_;
float beta_;
int type_parall_;
};

template <typename ValueType>
EWLayerImpl<ValueType>::EWLayerImpl(const Shape& shape, std::string function,
float alpha, float beta)
float alpha, float beta, int type_parall)
: LayerImpl<ValueType>(shape, shape),
func_(std::move(function)),
alpha_(alpha),
beta_(beta) {}
beta_(beta),
type_parall_(type_parall) {}

template <typename ValueType>
std::vector<ValueType> EWLayerImpl<ValueType>::run(
const std::vector<ValueType>& input) const {
std::vector<ValueType> res(this->outputShape_.count());
int available_threads = -1;
if (type_parall_ == 0) available_threads = 1;
if (type_parall_ == 1)
available_threads = std::thread::hardware_concurrency();
if (type_parall_ == 2)
available_threads = oneapi::tbb::info::default_concurrency();
if (type_parall_ == 3) available_threads = omp_get_max_threads();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please wrap this repeated branching in a common helper function that returns the thread count for the selected backend.


if (func_ == "relu") {
std::transform(input.begin(), input.end(), res.begin(), relu<ValueType>);
parallel_for(
input.size(),
[&](int i) {
res[i] = input[i] > ValueType(0) ? input[i] : ValueType(0);
},
type_parall_);
} else if (func_ == "tanh") {
auto tanh = [&](const ValueType& value) -> ValueType {
return static_cast<ValueType>(std::tanh(value));
};
std::transform(input.begin(), input.end(), res.begin(), tanh);
parallel_for(
input.size(),
[&](int i) { res[i] = static_cast<ValueType>(std::tanh(input[i])); },
type_parall_);
} else if (func_ == "sin") {
auto sin = [&](const ValueType& value) -> ValueType {
return static_cast<ValueType>(std::sin(value));
};
std::transform(input.begin(), input.end(), res.begin(), sin);
parallel_for(
input.size(),
[&](int i) { res[i] = static_cast<ValueType>(std::sin(input[i])); },
type_parall_);
} else if (func_ == "minus") {
auto minus = [&](const ValueType& value) -> ValueType { return -value; };
std::transform(input.begin(), input.end(), res.begin(), minus);
parallel_for(
input.size(), [&](int i) { res[i] = -input[i]; }, type_parall_);
} else if (func_ == "linear") {
auto linear = [&](const ValueType& value) -> ValueType {
return value * static_cast<ValueType>(alpha_) +
static_cast<ValueType>(beta_);
};
std::transform(input.begin(), input.end(), res.begin(), linear);
parallel_for(
input.size(),
[&](int i) {
res[i] = input[i] * static_cast<ValueType>(alpha_) +
static_cast<ValueType>(beta_);
},
type_parall_);
} else if (func_ == "sigmoid") {
auto sigmoid = [](ValueType x) -> ValueType {
if constexpr (std::is_integral_v<ValueType>) {
auto x_float = static_cast<float>(x);
float result = 1.0F / (1.0F + std::exp(-x_float));
return static_cast<ValueType>(std::round(result));
} else {
if (x >= ValueType(0)) {
ValueType z = std::exp(-x);
return ValueType(1) / (ValueType(1) + z);
}
ValueType z = std::exp(x);
return z / (ValueType(1) + z);
}
};
std::transform(input.cbegin(), input.cend(), res.begin(), sigmoid);
if constexpr (std::is_integral_v<ValueType>) {
parallel_for(
input.size(),
[&](int i) {
auto x_float = static_cast<float>(input[i]);
float result = 1.0F / (1.0F + std::exp(-x_float));
res[i] = static_cast<ValueType>(std::round(result));
},
type_parall_);
} else {
parallel_for(
input.size(),
[&](int i) {
ValueType x = input[i];
if (x >= ValueType(0)) {
ValueType z = std::exp(-x);
res[i] = ValueType(1) / (ValueType(1) + z);
} else {
ValueType z = std::exp(x);
res[i] = z / (ValueType(1) + z);
}
},
type_parall_);
}
} else {
throw std::invalid_argument("No such function for EWLayer");
}
Expand Down
129 changes: 129 additions & 0 deletions include/layers/Layer.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
#pragma once
#include <omp.h>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Guard the OpenMP/TBB includes and add `<thread>`; otherwise non-OpenMP builds fail.

#ifdef HAS_OPENMP
#include <omp.h>
#endif
#include <thread>
#ifdef HAS_TBB
#include <oneapi/tbb/blocked_range.h>
#include <oneapi/tbb/parallel_for.h>
#include <oneapi/tbb/info.h>
#endif

Copy link
Member

@allnes allnes Nov 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Move all backend headers (OpenMP/TBB/Threads) and implementation details into a small parallel module. Expose a single, inline header API so call sites incur no extra call/indirection.

  • include/parallel/parallel.hpp (inline API)
  • include/parallel/backends.hpp (backend helpers; guarded includes)
  • No <omp.h>/TBB headers leaking into layer headers.

Example:

// include/parallel/parallel.hpp
#pragma once
#include <cstddef>

enum class ParBackend { Auto, Seq, Threads, TBB, OMP };

struct ParOptions {
  ParBackend backend = ParBackend::Auto;
  int max_threads = 0;         // 0 = runtime default
  std::size_t min_parallel_n = 4096; // small tasks stay sequential
  std::size_t grain = 1024;    // backend-specific chunk hint
};

// Header-only: one branch + inlined backend
template <class F>
inline void parallel_for(std::size_t n, F&& f, const ParOptions& opt) {
  if (n == 0) return;
  const ParBackend b = select_backend(opt, n); // inline, cheap
  switch (b) {
    case ParBackend::Seq:     return impl_seq(n, f);
    case ParBackend::Threads: return impl_threads(n, f, opt);
    case ParBackend::TBB:     return impl_tbb(n, f, opt);
    case ParBackend::OMP:     return impl_omp(n, f, opt);
    case ParBackend::Auto:    return impl_seq(n, f); // unreachable
  }
}


Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Avoid re-evaluating “Auto” logic every call. Resolve once (feature flags + environment + problem size) and cache in the layer/context.

// Called once per layer or first use
inline ParBackend resolve_auto_once(const ParOptions& opt, std::size_t n) noexcept {
#if defined(HAS_OMP)
  if (n >= opt.min_parallel_n) return ParBackend::OMP;
#elif defined(HAS_TBB)
  if (n >= opt.min_parallel_n) return ParBackend::TBB;
#elif defined(HAS_THREADS)
  if (n >= opt.min_parallel_n) return ParBackend::Threads;
#endif
  return ParBackend::Seq;
}

inline ParBackend select_backend(const ParOptions& opt, std::size_t n) noexcept {
  if (opt.backend != ParBackend::Auto) return opt.backend;
  static ParBackend cached = resolve_auto_once(opt, n); // or store in the layer
  return cached;
}

#include <algorithm>
#include <execution>
#include <functional>
#include <initializer_list>
#include <iostream>
#include <numeric>
#include <stdexcept>
#include <string>
Expand Down Expand Up @@ -49,6 +55,7 @@ class Layer {
PostOperations postops;
int getID() const { return id_; }
void setID(int id) { id_ = id; }
void setTypeParall(int type) { type_parall_ = type; }
LayerType getName() const { return type_; }
virtual void run(const std::vector<Tensor>& input,
std::vector<Tensor>& output) = 0;
Expand All @@ -59,6 +66,7 @@ class Layer {
protected:
int id_ = 0;
LayerType type_;
int type_parall_;
};

template <typename ValueType>
Expand All @@ -83,4 +91,125 @@ class LayerImpl {
Shape outputShape_;
};

template <typename Func>
inline void parallel_for(int count, Func func, int mode = 0) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
inline void parallel_for(int count, Func func, int mode = 0) {
inline void parallel_for(int count, Func func, int mode = 0) {
if (count <= 0) return;

static bool stl_available = true;
static bool tbb_available = true;
static bool omp_available = true;
const int MIN_CHUNK_SIZE = 1000;
if (count < MIN_CHUNK_SIZE) {
mode = 0;
}

switch (mode) {
case 0: // Sequential
{
for (int i = 0; i < count; ++i) {
func(i);
}
break;
}

case 1: // STL
{
if (stl_available) {
try {
int num_threads =
static_cast<int>(std::thread::hardware_concurrency());
if (num_threads == 0) num_threads = 4;

int min_chunk_size = std::max(1000, count / (num_threads * 4));
if (count / num_threads < min_chunk_size) {
num_threads = std::max(1, count / min_chunk_size);
}

std::vector<std::thread> threads;
threads.reserve(num_threads);

int chunk_size = count / num_threads;
int remainder = count % num_threads;

int start = 0;
for (int t = 0; t < num_threads; ++t) {
int end = start + chunk_size + (t < remainder ? 1 : 0);
if (start >= end) break;

threads.emplace_back([start, end, &func]() {
for (int i = start; i < end; ++i) {
func(i);
}
});

start = end;
}

for (auto& thread : threads) {
thread.join();
}

} catch (const std::exception& e) {
std::cout << "Thread execution failed: " << e.what()
<< ". Falling back to sequential.\n";
stl_available = false;
for (int i = 0; i < count; ++i) func(i);
}
} else {
for (int i = 0; i < count; ++i) func(i);
}
break;
}

case 2: // Intel TBB
{
if (tbb_available) {
try {
oneapi::tbb::parallel_for(
oneapi::tbb::blocked_range<int>(0, count),
[&](const oneapi::tbb::blocked_range<int>& range) {
for (int i = range.begin(); i < range.end(); ++i) {
func(i);
}
},
oneapi::tbb::auto_partitioner());
} catch (const std::exception& e) {
std::cout << "TBB execution failed: " << e.what()
<< ". Falling back to sequential.\n";
tbb_available = false;
for (int i = 0; i < count; ++i) func(i);
}
} else {
for (int i = 0; i < count; ++i) func(i);
}
break;
}

case 3: // OpenMP
{
if (omp_available) {
try {
int num_threads = omp_get_max_threads();

int chunk_size = std::max(1000, count / (num_threads * 8));

#pragma omp parallel for schedule(static, chunk_size) num_threads(num_threads)
for (int i = 0; i < count; ++i) {
func(i);
}

} catch (...) {
std::cout << "OpenMP execution failed. Falling back to sequential.\n";
omp_available = false;
for (int i = 0; i < count; ++i) func(i);
}
} else {
for (int i = 0; i < count; ++i) func(i);
}
break;
}

default:
for (int i = 0; i < count; ++i) func(i);
}
}

} // namespace it_lab_ai
3 changes: 3 additions & 0 deletions src/layers/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
file(GLOB_RECURSE layers_src *.cpp)
add_library(layers_lib STATIC "${LAYERS_HEADERS}" "${layers_src}")
target_link_libraries(layers_lib PUBLIC TBB_unified)
# if(OpenMP_FOUND)
# target_link_libraries(layers_lib PUBLIC OpenMP::OpenMP_CXX)
# endif()
target_link_libraries(layers_lib PUBLIC dnnl)
15 changes: 11 additions & 4 deletions src/layers/EWLayer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,20 @@ void EWLayer::run(const std::vector<Tensor>& input,
}
switch (input[0].get_type()) {
case Type::kInt: {
EWLayerImpl<int> used_impl(input[0].get_shape(), func_, alpha_, beta_);
output[0] =
make_tensor(used_impl.run(*input[0].as<int>()), input[0].get_shape());
EWLayerImpl<int> used_impl(input[0].get_shape(), func_, alpha_, beta_,
type_parall_);
std::vector<int> tmp = used_impl.run(*input[0].as<int>());
auto start = std::chrono::high_resolution_clock::now();
output[0] = make_tensor(tmp, input[0].get_shape());
auto end = std::chrono::high_resolution_clock::now();
auto total_duration =
std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
std::cout << total_duration.count() << std::endl;
break;
}
case Type::kFloat: {
EWLayerImpl<float> used_impl(input[0].get_shape(), func_, alpha_, beta_);
EWLayerImpl<float> used_impl(input[0].get_shape(), func_, alpha_, beta_,
type_parall_);
output[0] = make_tensor(used_impl.run(*input[0].as<float>()),
input[0].get_shape());
break;
Expand Down
2 changes: 1 addition & 1 deletion test/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
file(GLOB_RECURSE TEST_SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)

add_executable(run_test ${TEST_SRC_FILES})
if (NOT WIN32)
if(OpenMP_FOUND)
target_link_libraries(run_test PUBLIC OpenMP::OpenMP_CXX)
endif()
target_link_libraries(run_test PUBLIC perf_lib layers_lib layers_oneDNN_lib)
Expand Down
Loading
Loading