28 changes: 19 additions & 9 deletions CMakeLists.txt
@@ -32,18 +32,28 @@ add_subdirectory(3rdparty)

include(cmake/opencv_config.cmake)

if (NOT WIN32)
  find_package(OpenMP REQUIRED)
endif()

if (NOT WIN32)
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra -Werror")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Werror")
find_package(OpenMP)

if(OpenMP_FOUND)
  message(STATUS "OpenMP found - enabling parallel support")
  add_definitions(-DHAS_OPENMP)
  if(TARGET OpenMP::OpenMP_CXX)
    set(OPENMP_TARGET OpenMP::OpenMP_CXX)
    message(STATUS "Using OpenMP target: OpenMP::OpenMP_CXX")
  else()
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
    if(OpenMP_CXX_LIBRARIES)
      set(OPENMP_LIBRARIES ${OpenMP_CXX_LIBRARIES})
    endif()
    message(STATUS "OpenMP CXX flags: ${OpenMP_CXX_FLAGS}")
    message(STATUS "OpenMP libraries: ${OpenMP_CXX_LIBRARIES}")
  endif()
else()
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /W4 /wd4996 /wd4190 /wd4189 /WX")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4 /wd4996 /wd4190 /wd4189 /WX")
  message(STATUS "OpenMP not found - parallel features disabled")
endif()


foreach(CONFIG "" _DEBUG _RELEASE)
  set("CMAKE_ARCHIVE_OUTPUT_DIRECTORY${CONFIG}" "${CMAKE_BINARY_DIR}/lib")
  set("CMAKE_LIBRARY_OUTPUT_DIRECTORY${CONFIG}" "${CMAKE_BINARY_DIR}/lib")
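Note: with OpenMP now optional rather than REQUIRED, code that touches the
OpenMP runtime has to compile either way. A minimal sketch of guarding on the
HAS_OPENMP define added in this hunk (the helper name is hypothetical, not
part of the PR):

// Hypothetical helper guarded by the HAS_OPENMP define from this hunk.
#ifdef HAS_OPENMP
#include <omp.h>
#endif

inline int max_parallel_threads() {
#ifdef HAS_OPENMP
  return omp_get_max_threads();  // OpenMP runtime is available
#else
  return 1;  // parallel features disabled
#endif
}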
94 changes: 60 additions & 34 deletions include/layers/EWLayer.hpp
@@ -46,7 +46,7 @@ class EWLayerImpl : public LayerImpl<ValueType> {
 public:
  EWLayerImpl() = delete;
  EWLayerImpl(const Shape& shape, std::string function, float alpha = 0.0F,
              float beta = 0.0F);
              float beta = 0.0F, int type_parall = 0);
  EWLayerImpl(const EWLayerImpl& c) = default;
  EWLayerImpl& operator=(const EWLayerImpl& c) = default;
  std::vector<ValueType> run(
@@ -56,57 +56,83 @@ class EWLayerImpl : public LayerImpl<ValueType> {
  std::string func_;
  float alpha_;
  float beta_;
  int type_parall_;
};

template <typename ValueType>
EWLayerImpl<ValueType>::EWLayerImpl(const Shape& shape, std::string function,
                                    float alpha, float beta)
                                    float alpha, float beta, int type_parall)
    : LayerImpl<ValueType>(shape, shape),
      func_(std::move(function)),
      alpha_(alpha),
      beta_(beta) {}
      beta_(beta),
      type_parall_(type_parall) {}

template <typename ValueType>
std::vector<ValueType> EWLayerImpl<ValueType>::run(
    const std::vector<ValueType>& input) const {
  std::vector<ValueType> res(this->outputShape_.count());
  // NOTE: available_threads is computed but never read below;
  // parallel_for selects its own thread count from the mode argument.
  int available_threads = -1;
  if (type_parall_ == 0) available_threads = 1;
  if (type_parall_ == 1)
    available_threads = std::thread::hardware_concurrency();
  if (type_parall_ == 2)
    available_threads = oneapi::tbb::info::default_concurrency();
  if (type_parall_ == 3) available_threads = omp_get_max_threads();

  if (func_ == "relu") {
    std::transform(input.begin(), input.end(), res.begin(), relu<ValueType>);
    parallel_for(
        input.size(),
        [&](int i) {
          res[i] = input[i] > ValueType(0) ? input[i] : ValueType(0);
        },
        type_parall_);
  } else if (func_ == "tanh") {
    auto tanh = [&](const ValueType& value) -> ValueType {
      return static_cast<ValueType>(std::tanh(value));
    };
    std::transform(input.begin(), input.end(), res.begin(), tanh);
    parallel_for(
        input.size(),
        [&](int i) { res[i] = static_cast<ValueType>(std::tanh(input[i])); },
        type_parall_);
  } else if (func_ == "sin") {
    auto sin = [&](const ValueType& value) -> ValueType {
      return static_cast<ValueType>(std::sin(value));
    };
    std::transform(input.begin(), input.end(), res.begin(), sin);
    parallel_for(
        input.size(),
        [&](int i) { res[i] = static_cast<ValueType>(std::sin(input[i])); },
        type_parall_);
  } else if (func_ == "minus") {
    auto minus = [&](const ValueType& value) -> ValueType { return -value; };
    std::transform(input.begin(), input.end(), res.begin(), minus);
    parallel_for(
        input.size(), [&](int i) { res[i] = -input[i]; }, type_parall_);
  } else if (func_ == "linear") {
    auto linear = [&](const ValueType& value) -> ValueType {
      return value * static_cast<ValueType>(alpha_) +
             static_cast<ValueType>(beta_);
    };
    std::transform(input.begin(), input.end(), res.begin(), linear);
    parallel_for(
        input.size(),
        [&](int i) {
          res[i] = input[i] * static_cast<ValueType>(alpha_) +
                   static_cast<ValueType>(beta_);
        },
        type_parall_);
  } else if (func_ == "sigmoid") {
    auto sigmoid = [](ValueType x) -> ValueType {
      if constexpr (std::is_integral_v<ValueType>) {
        auto x_float = static_cast<float>(x);
        float result = 1.0F / (1.0F + std::exp(-x_float));
        return static_cast<ValueType>(std::round(result));
      } else {
        // Numerically stable split: avoids overflowing exp() for large |x|.
        if (x >= ValueType(0)) {
          ValueType z = std::exp(-x);
          return ValueType(1) / (ValueType(1) + z);
        }
        ValueType z = std::exp(x);
        return z / (ValueType(1) + z);
      }
    };
    std::transform(input.cbegin(), input.cend(), res.begin(), sigmoid);
    if constexpr (std::is_integral_v<ValueType>) {
      parallel_for(
          input.size(),
          [&](int i) {
            auto x_float = static_cast<float>(input[i]);
            float result = 1.0F / (1.0F + std::exp(-x_float));
            res[i] = static_cast<ValueType>(std::round(result));
          },
          type_parall_);
    } else {
      parallel_for(
          input.size(),
          [&](int i) {
            ValueType x = input[i];
            if (x >= ValueType(0)) {
              ValueType z = std::exp(-x);
              res[i] = ValueType(1) / (ValueType(1) + z);
            } else {
              ValueType z = std::exp(x);
              res[i] = z / (ValueType(1) + z);
            }
          },
          type_parall_);
    }
  } else {
    throw std::invalid_argument("No such function for EWLayer");
  }
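For orientation, a sketch of how the extended constructor might be exercised.
The Shape initializer-list construction is an assumption, and the mode
numbering (0 sequential, 1 std::thread, 2 oneTBB, 3 OpenMP) follows the
dispatch above:

// Sketch only: runs ReLU through the oneTBB path.
// Assumes Shape is constructible from a list of dimensions (hypothetical).
std::vector<float> demo_relu() {
  it_lab_ai::Shape shape({1, 3, 224, 224});
  std::vector<float> input(shape.count(), -0.5F);
  it_lab_ai::EWLayerImpl<float> relu(shape, "relu", 0.0F, 0.0F,
                                     /*type_parall=*/2);
  return relu.run(input);  // every element clamps to 0.0F
}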
129 changes: 129 additions & 0 deletions include/layers/Layer.hpp
@@ -1,5 +1,11 @@
#pragma once
#include <omp.h>

#include <algorithm>
#include <execution>
#include <functional>
#include <initializer_list>
#include <iostream>
#include <numeric>
#include <stdexcept>
#include <string>
@@ -49,6 +55,7 @@ class Layer {
  PostOperations postops;
  int getID() const { return id_; }
  void setID(int id) { id_ = id; }
  void setTypeParall(int type) { type_parall_ = type; }
  LayerType getName() const { return type_; }
  virtual void run(const std::vector<Tensor>& input,
                   std::vector<Tensor>& output) = 0;
@@ -59,6 +66,7 @@
 protected:
  int id_ = 0;
  LayerType type_;
  int type_parall_ = 0;  // default to sequential; avoids an uninitialized read
};

template <typename ValueType>
@@ -83,4 +91,125 @@ class LayerImpl {
  Shape outputShape_;
};

template <typename Func>
inline void parallel_for(int count, Func func, int mode = 0) {
  // Fallback flags: once a backend fails, it stays disabled for the rest of
  // the process. Note these statics are not synchronized across threads.
  static bool stl_available = true;
  static bool tbb_available = true;
  static bool omp_available = true;
  const int MIN_CHUNK_SIZE = 1000;
  if (count < MIN_CHUNK_SIZE) {
    mode = 0;  // small inputs run sequentially regardless of mode
  }

  switch (mode) {
    case 0:  // Sequential
    {
      for (int i = 0; i < count; ++i) {
        func(i);
      }
      break;
    }

    case 1:  // STL
    {
      if (stl_available) {
        try {
          int num_threads =
              static_cast<int>(std::thread::hardware_concurrency());
          if (num_threads == 0) num_threads = 4;

          int min_chunk_size = std::max(1000, count / (num_threads * 4));
          if (count / num_threads < min_chunk_size) {
            num_threads = std::max(1, count / min_chunk_size);
          }

          std::vector<std::thread> threads;
          threads.reserve(num_threads);

          int chunk_size = count / num_threads;
          int remainder = count % num_threads;

          int start = 0;
          for (int t = 0; t < num_threads; ++t) {
            int end = start + chunk_size + (t < remainder ? 1 : 0);
            if (start >= end) break;

            threads.emplace_back([start, end, &func]() {
              for (int i = start; i < end; ++i) {
                func(i);
              }
            });

            start = end;
          }

          for (auto& thread : threads) {
            thread.join();
          }

        } catch (const std::exception& e) {
          std::cout << "Thread execution failed: " << e.what()
                    << ". Falling back to sequential.\n";
          stl_available = false;
          for (int i = 0; i < count; ++i) func(i);
        }
      } else {
        for (int i = 0; i < count; ++i) func(i);
      }
      break;
    }

    case 2:  // Intel TBB
    {
      if (tbb_available) {
        try {
          oneapi::tbb::parallel_for(
              oneapi::tbb::blocked_range<int>(0, count),
              [&](const oneapi::tbb::blocked_range<int>& range) {
                for (int i = range.begin(); i < range.end(); ++i) {
                  func(i);
                }
              },
              oneapi::tbb::auto_partitioner());
        } catch (const std::exception& e) {
          std::cout << "TBB execution failed: " << e.what()
                    << ". Falling back to sequential.\n";
          tbb_available = false;
          for (int i = 0; i < count; ++i) func(i);
        }
      } else {
        for (int i = 0; i < count; ++i) func(i);
      }
      break;
    }

    case 3:  // OpenMP
    {
      if (omp_available) {
        try {
          int num_threads = omp_get_max_threads();

          int chunk_size = std::max(1000, count / (num_threads * 8));

// NOTE: an exception escaping an OpenMP parallel region is not catchable
// here (it would terminate); this try/catch only covers the setup code.
#pragma omp parallel for schedule(static, chunk_size) num_threads(num_threads)
          for (int i = 0; i < count; ++i) {
            func(i);
          }

        } catch (...) {
          std::cout << "OpenMP execution failed. Falling back to sequential.\n";
          omp_available = false;
          for (int i = 0; i < count; ++i) func(i);
        }
      } else {
        for (int i = 0; i < count; ++i) func(i);
      }
      break;
    }

    default:
      for (int i = 0; i < count; ++i) func(i);
  }
}

} // namespace it_lab_ai
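A minimal sketch of calling parallel_for directly, assuming only the signature
above; note that the MIN_CHUNK_SIZE check silently downgrades inputs shorter
than 1000 elements to the sequential path:

// Sketch: element-wise add dispatched through each backend in turn.
void demo_parallel_for() {
  std::vector<float> a(1 << 20, 1.0F);
  std::vector<float> b(1 << 20, 2.0F);
  std::vector<float> c(1 << 20);
  for (int mode = 0; mode <= 3; ++mode) {  // 0 seq, 1 std::thread, 2 TBB, 3 OpenMP
    it_lab_ai::parallel_for(
        static_cast<int>(c.size()), [&](int i) { c[i] = a[i] + b[i]; }, mode);
  }
}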
3 changes: 3 additions & 0 deletions src/layers/CMakeLists.txt
@@ -1,4 +1,7 @@
file(GLOB_RECURSE layers_src *.cpp)
add_library(layers_lib STATIC "${LAYERS_HEADERS}" "${layers_src}")
target_link_libraries(layers_lib PUBLIC TBB_unified)
# if(OpenMP_FOUND)
# target_link_libraries(layers_lib PUBLIC OpenMP::OpenMP_CXX)
# endif()
target_link_libraries(layers_lib PUBLIC dnnl)
15 changes: 11 additions & 4 deletions src/layers/EWLayer.cpp
@@ -9,13 +9,20 @@ void EWLayer::run(const std::vector<Tensor>& input,
  }
  switch (input[0].get_type()) {
    case Type::kInt: {
      EWLayerImpl<int> used_impl(input[0].get_shape(), func_, alpha_, beta_);
      output[0] =
          make_tensor(used_impl.run(*input[0].as<int>()), input[0].get_shape());
      EWLayerImpl<int> used_impl(input[0].get_shape(), func_, alpha_, beta_,
                                 type_parall_);
      std::vector<int> tmp = used_impl.run(*input[0].as<int>());
      // NOTE: this timer brackets only make_tensor, not the element-wise
      // kernel in used_impl.run above, and prints an unlabeled number.
      auto start = std::chrono::high_resolution_clock::now();
      output[0] = make_tensor(tmp, input[0].get_shape());
      auto end = std::chrono::high_resolution_clock::now();
      auto total_duration =
          std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
      std::cout << total_duration.count() << std::endl;
      break;
    }
    case Type::kFloat: {
      EWLayerImpl<float> used_impl(input[0].get_shape(), func_, alpha_, beta_,
                                   type_parall_);
      output[0] = make_tensor(used_impl.run(*input[0].as<float>()),
                              input[0].get_shape());
      break;
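If the timer in the kInt branch is meant to measure the element-wise kernel
rather than tensor wrapping, a sketch of the alternative placement (same
clock, same cast; the label is an addition) would be:

// Sketch: bracket used_impl.run instead of make_tensor, and label the output.
auto start = std::chrono::high_resolution_clock::now();
std::vector<int> tmp = used_impl.run(*input[0].as<int>());
auto end = std::chrono::high_resolution_clock::now();
std::cout << "EWLayer<int>::run took "
          << std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
                 .count()
          << " ms" << std::endl;
output[0] = make_tensor(tmp, input[0].get_shape());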
2 changes: 1 addition & 1 deletion test/CMakeLists.txt
@@ -1,7 +1,7 @@
file(GLOB_RECURSE TEST_SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)

add_executable(run_test ${TEST_SRC_FILES})
if (NOT WIN32)
if(OpenMP_FOUND)
  target_link_libraries(run_test PUBLIC OpenMP::OpenMP_CXX)
endif()
target_link_libraries(run_test PUBLIC perf_lib layers_lib layers_oneDNN_lib)