28 changes: 19 additions & 9 deletions CMakeLists.txt
@@ -32,18 +32,28 @@ add_subdirectory(3rdparty)

include(cmake/opencv_config.cmake)

if (NOT WIN32)
find_package(OpenMP REQUIRED)
endif()

if (NOT WIN32)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra -Werror")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror")
find_package(OpenMP)

if(OpenMP_FOUND)
message(STATUS "OpenMP found - enabling parallel support")
add_definitions(-DHAS_OPENMP)
if(TARGET OpenMP::OpenMP_CXX)
set(OPENMP_TARGET OpenMP::OpenMP_CXX)
message(STATUS "Using OpenMP target: OpenMP::OpenMP_CXX")
else()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
if(OpenMP_CXX_LIBRARIES)
set(OPENMP_LIBRARIES ${OpenMP_CXX_LIBRARIES})
endif()
message(STATUS "OpenMP CXX flags: ${OpenMP_CXX_FLAGS}")
message(STATUS "OpenMP libraries: ${OpenMP_CXX_LIBRARIES}")
endif()
else()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /W4 /wd4996 /wd4190 /wd4189 /WX")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4 /wd4996 /wd4190 /wd4189 /WX")
message(STATUS "OpenMP not found - parallel features disabled")
endif()


foreach(CONFIG "" _DEBUG _RELEASE)
set("CMAKE_ARCHIVE_OUTPUT_DIRECTORY${CONFIG}" "${CMAKE_BINARY_DIR}/lib")
set("CMAKE_LIBRARY_OUTPUT_DIRECTORY${CONFIG}" "${CMAKE_BINARY_DIR}/lib")
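As this hunk reads, find_package(OpenMP) is no longer REQUIRED; when OpenMP is found the build defines HAS_OPENMP and prefers the imported OpenMP::OpenMP_CXX target over raw compiler flags. A hedged sketch of how downstream code can key off that define (illustrative only; scale_in_place is a made-up function, not part of this PR):

#include <cstddef>
#include <vector>

// Hypothetical sketch: a translation unit gating its OpenMP path on the
// HAS_OPENMP definition that the CMake logic above adds when OpenMP is found.
void scale_in_place(std::vector<float>& v, float factor) {
#ifdef HAS_OPENMP
  // OpenMP was found by CMake: emit a pragma-based parallel loop.
  int n = static_cast<int>(v.size());
#pragma omp parallel for
  for (int i = 0; i < n; ++i) {
    v[static_cast<std::size_t>(i)] *= factor;
  }
#else
  // OpenMP not found: plain sequential fallback.
  for (std::size_t i = 0; i < v.size(); ++i) {
    v[i] *= factor;
  }
#endif
}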
3 changes: 3 additions & 0 deletions include/CMakeLists.txt
@@ -15,3 +15,6 @@ set(PERF_HEADERS "${perf_headers}" PARENT_SCOPE)

file(GLOB_RECURSE reader_headers Weights_Reader/*.h Weights_Reader/*.hpp)
set(READER_HEADERS "${reader_headers}" PARENT_SCOPE)

file(GLOB_RECURSE parallel_headers parallel/*.h parallel/*.hpp)
set(LAYERS_HEADERS "${parallel_headers}" PARENT_SCOPE)
96 changes: 62 additions & 34 deletions include/layers/EWLayer.hpp
@@ -46,7 +46,8 @@ class EWLayerImpl : public LayerImpl<ValueType> {
public:
EWLayerImpl() = delete;
EWLayerImpl(const Shape& shape, std::string function, float alpha = 0.0F,
float beta = 0.0F);
float beta = 0.0F,
ParBackend parallel_backend = ParBackend::kSeq);
EWLayerImpl(const EWLayerImpl& c) = default;
EWLayerImpl& operator=(const EWLayerImpl& c) = default;
std::vector<ValueType> run(
@@ -56,57 +57,84 @@ class EWLayerImpl : public LayerImpl<ValueType> {
std::string func_;
float alpha_;
float beta_;
ParBackend parallel_backend_;
};

template <typename ValueType>
EWLayerImpl<ValueType>::EWLayerImpl(const Shape& shape, std::string function,
float alpha, float beta)
float alpha, float beta,
ParBackend parallel_backend)
: LayerImpl<ValueType>(shape, shape),
func_(std::move(function)),
alpha_(alpha),
beta_(beta) {}
beta_(beta),
parallel_backend_(parallel_backend) {}

template <typename ValueType>
std::vector<ValueType> EWLayerImpl<ValueType>::run(
const std::vector<ValueType>& input) const {
std::vector<ValueType> res(this->outputShape_.count());

parallel::Options options;
options.backend = parallel_backend_;

if (func_ == "relu") {
std::transform(input.begin(), input.end(), res.begin(), relu<ValueType>);
parallel::parallel_for(
input.size(),
[&](std::size_t i) {
res[i] = input[i] > ValueType(0) ? input[i] : ValueType(0);
},
options);
} else if (func_ == "tanh") {
auto tanh = [&](const ValueType& value) -> ValueType {
return static_cast<ValueType>(std::tanh(value));
};
std::transform(input.begin(), input.end(), res.begin(), tanh);
parallel::parallel_for(
input.size(),
[&](std::size_t i) {
res[i] = static_cast<ValueType>(std::tanh(input[i]));
},
options);
} else if (func_ == "sin") {
auto sin = [&](const ValueType& value) -> ValueType {
return static_cast<ValueType>(std::sin(value));
};
std::transform(input.begin(), input.end(), res.begin(), sin);
parallel::parallel_for(
input.size(),
[&](std::size_t i) {
res[i] = static_cast<ValueType>(std::sin(input[i]));
},
options);
} else if (func_ == "minus") {
auto minus = [&](const ValueType& value) -> ValueType { return -value; };
std::transform(input.begin(), input.end(), res.begin(), minus);
parallel::parallel_for(
input.size(), [&](std::size_t i) { res[i] = -input[i]; }, options);
} else if (func_ == "linear") {
auto linear = [&](const ValueType& value) -> ValueType {
return value * static_cast<ValueType>(alpha_) +
static_cast<ValueType>(beta_);
};
std::transform(input.begin(), input.end(), res.begin(), linear);
parallel::parallel_for(
input.size(),
[&](std::size_t i) {
res[i] = input[i] * static_cast<ValueType>(alpha_) +
static_cast<ValueType>(beta_);
},
options);
} else if (func_ == "sigmoid") {
auto sigmoid = [](ValueType x) -> ValueType {
if constexpr (std::is_integral_v<ValueType>) {
auto x_float = static_cast<float>(x);
float result = 1.0F / (1.0F + std::exp(-x_float));
return static_cast<ValueType>(std::round(result));
} else {
if (x >= ValueType(0)) {
ValueType z = std::exp(-x);
return ValueType(1) / (ValueType(1) + z);
}
ValueType z = std::exp(x);
return z / (ValueType(1) + z);
}
};
std::transform(input.cbegin(), input.cend(), res.begin(), sigmoid);
if constexpr (std::is_integral_v<ValueType>) {
parallel::parallel_for(
input.size(),
[&](std::size_t i) {
auto x_float = static_cast<float>(input[i]);
float result = 1.0F / (1.0F + std::exp(-x_float));
res[i] = static_cast<ValueType>(std::round(result));
},
options);
} else {
parallel::parallel_for(
input.size(),
[&](std::size_t i) {
ValueType x = input[i];
if (x >= ValueType(0)) {
ValueType z = std::exp(-x);
res[i] = ValueType(1) / (ValueType(1) + z);
} else {
ValueType z = std::exp(x);
res[i] = z / (ValueType(1) + z);
}
},
options);
}
} else {
throw std::invalid_argument("No such function for EWLayer");
}
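For context, a hedged usage sketch of the extended EWLayerImpl constructor with its new ParBackend parameter (not code from the PR; the Shape initializer-list construction is an assumption):

#include <vector>

#include "layers/EWLayer.hpp"

using namespace it_lab_ai;

// Hypothetical helper: apply "relu" element-wise over a flat buffer with the
// TBB backend selected through the new constructor argument.
std::vector<float> relu_with_tbb(const std::vector<float>& input) {
  EWLayerImpl<float> layer(Shape({input.size()}), "relu", 0.0F, 0.0F,
                           ParBackend::kTbb);  // backend is the new argument
  return layer.run(input);  // run() dispatches through parallel::parallel_for
}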
11 changes: 9 additions & 2 deletions include/layers/Layer.hpp
@@ -1,13 +1,17 @@
#pragma once
#include <algorithm>
#include <execution>
#include <functional>
#include <initializer_list>
#include <iostream>
#include <numeric>
#include <stdexcept>
#include <string>
#include <vector>

#include "layers/Shape.hpp"
#include "layers/Tensor.hpp"
#include "oneapi/tbb.h"
#include "parallel/parallel.hpp"

namespace it_lab_ai {

@@ -33,6 +37,7 @@ enum LayerType : uint8_t {
};

enum ImplType : uint8_t { kDefault, kTBB, kSTL };
using ParBackend = parallel::Backend;

class Layer;

@@ -49,6 +54,8 @@ class Layer {
PostOperations postops;
int getID() const { return id_; }
void setID(int id) { id_ = id; }
void setParallelBackend(ParBackend backend) { parallel_backend_ = backend; }
ParBackend getParallelBackend() const { return parallel_backend_; }
LayerType getName() const { return type_; }
virtual void run(const std::vector<Tensor>& input,
std::vector<Tensor>& output) = 0;
@@ -59,6 +66,7 @@
protected:
int id_ = 0;
LayerType type_;
ParBackend parallel_backend_ = ParBackend::kSeq;
};

template <typename ValueType>
@@ -82,5 +90,4 @@ class LayerImpl {
Shape inputShape_;
Shape outputShape_;
};

} // namespace it_lab_ai
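A short illustration of the per-layer backend selection added here (a sketch, not code from the PR):

#include "layers/Layer.hpp"

// Hypothetical sketch: choosing a backend on any Layer through the new
// setParallelBackend()/getParallelBackend() pair. Layers default to
// ParBackend::kSeq, so untouched layers keep running sequentially.
void use_omp_backend(it_lab_ai::Layer& layer) {
  layer.setParallelBackend(it_lab_ai::ParBackend::kOmp);
  // getParallelBackend() reads the stored choice back, e.g. for logging.
}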
130 changes: 130 additions & 0 deletions include/parallel/backends.hpp
@@ -0,0 +1,130 @@
#pragma once
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <limits>
#include <thread>
#include <vector>

#ifdef HAS_OPENMP
// #include <omp.h>
#endif

#include <oneapi/tbb/blocked_range.h>
#include <oneapi/tbb/info.h>
#include <oneapi/tbb/parallel_for.h>

namespace it_lab_ai {
namespace parallel {

enum class Backend : std::uint8_t {
  kSeq = 0,
  kThreads = 1,
  kTbb = 2,
  kOmp = 3
};

struct Options {
  Backend backend = Backend::kSeq;
  int max_threads = 0;
  std::size_t min_parallel_n = 1000;
  std::size_t grain = 1024;
};

inline void impl_seq(std::size_t count,
                     const std::function<void(std::size_t)>& func) {
  for (std::size_t i = 0; i < count; ++i) {
    func(i);
  }
  std::cout << "Seq " << std::endl;
}

inline void impl_threads(std::size_t count,
                         const std::function<void(std::size_t)>& func,
                         const Options& opt) {
  int num_threads = opt.max_threads > 0
                        ? opt.max_threads
                        : static_cast<int>(std::thread::hardware_concurrency());
  if (num_threads == 0) num_threads = 4;

  std::size_t min_chunk_size = std::max(opt.grain, count / (num_threads * 4));
  if (count / num_threads < min_chunk_size) {
    num_threads = std::max(1, static_cast<int>(count / min_chunk_size));
  }

  std::vector<std::thread> threads;
  threads.reserve(num_threads);

  std::size_t chunk_size = count / num_threads;
  std::size_t remainder = count % num_threads;

  std::size_t start = 0;
  for (int t = 0; t < num_threads; ++t) {
    std::size_t end =
        start + chunk_size + (t < static_cast<int>(remainder) ? 1 : 0);
    if (start >= end) break;

    threads.emplace_back([start, end, &func]() {
      for (std::size_t i = start; i < end; ++i) {
        func(i);
      }
    });

    start = end;
  }

  for (auto& thread : threads) {
    thread.join();
  }
  std::cout << "Stl " << std::endl;
}

inline void impl_tbb(std::size_t count,
                     const std::function<void(std::size_t)>& func,
                     const Options& opt) {
  std::cout << "tbb " << std::endl;
  oneapi::tbb::parallel_for(
      oneapi::tbb::blocked_range<std::size_t>(0, count, opt.grain),
      [&](const oneapi::tbb::blocked_range<std::size_t>& range) {
        for (std::size_t i = range.begin(); i < range.end(); ++i) {
          func(i);
        }
      },
      oneapi::tbb::auto_partitioner());
}

#ifdef HAS_OPENMP
inline void impl_omp(std::size_t count,
                     const std::function<void(std::size_t)>& func,
                     const Options& opt) {
  if (count == 0) return;

  int num_threads = opt.max_threads > 0
                        ? opt.max_threads
                        : static_cast<int>(std::thread::hardware_concurrency());

  static_cast<void>(std::max(opt.grain, count / (num_threads * 8)));

  int int_count = static_cast<int>(count);
  if (int_count < 0 || static_cast<std::size_t>(int_count) != count) {
    impl_seq(count, func);
    return;
  }

#pragma omp parallel for schedule(static) num_threads(num_threads)
  for (int i = 0; i < int_count; ++i) {
    func(static_cast<std::size_t>(i));
  }
  std::cout << "OMP " << std::endl;
}
#else
inline void impl_omp(std::size_t count,
                     const std::function<void(std::size_t)>& func,
                     const Options& opt) {
  impl_seq(count, func);
}
#endif

} // namespace parallel
} // namespace it_lab_ai
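The dispatching parallel_for that EWLayer.hpp calls lives in parallel/parallel.hpp, which is not part of this diff. A minimal sketch of what such a dispatcher could look like, built only from the Options fields and impl_* helpers defined above (an assumption, not the PR's actual implementation):

#include "parallel/backends.hpp"

namespace it_lab_ai {
namespace parallel {

// Hypothetical dispatcher sketch: route to one of the impl_* backends above,
// falling back to the sequential path for small trip counts.
inline void parallel_for(std::size_t count,
                         const std::function<void(std::size_t)>& func,
                         const Options& opt = {}) {
  if (count < opt.min_parallel_n || opt.backend == Backend::kSeq) {
    impl_seq(count, func);
    return;
  }
  switch (opt.backend) {
    case Backend::kThreads:
      impl_threads(count, func, opt);
      break;
    case Backend::kTbb:
      impl_tbb(count, func, opt);
      break;
    case Backend::kOmp:
      impl_omp(count, func, opt);
      break;
    default:
      impl_seq(count, func);
      break;
  }
}

} // namespace parallel
} // namespace it_lab_ai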