Mirror of https://github.com/opencv/opencv.git
Merge pull request #16092 from YashasSamaga:cuda4dnn-conv-act-fuse
cuda4dnn: fuse activations with convolutions

* fuse ReLU, ReLU6, TanH, Sigmoid with conv
* fix OpenCL errors
* improve ReLU, add power, swish and mish
* fix missing fusion entries
* fix handling of unsetAttached
* remove whole file indentation
* optimize power = 1.0, use IDENTITY instead of NONE
* handle edge case: change backend and then clear
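Conceptually, the change replaces the separate bias-add and activation passes that used to follow the cuDNN convolution with a single elementwise epilogue over the convolution output. A minimal standalone sketch of what that fused pass computes, on a flat NCHW buffer with a (leaky) ReLU epilogue -- plain C++ for illustration, the names are not the OpenCV API:

#include <cstddef>
#include <vector>

// One pass over the convolution output: add the per-channel bias and apply
// the activation in place. For NCHW, inner_size = H * W, so the channel of
// element i is (i / inner_size) % C.
void bias_relu_epilogue(std::vector<float>& output, const std::vector<float>& bias,
                        std::size_t inner_size, float slope)
{
    for (std::size_t i = 0; i < output.size(); i++)
    {
        const std::size_t c = (i / inner_size) % bias.size();
        const float x = output[i] + bias[c];
        output[i] = x >= 0.f ? x : slope * x; // ReLU (leaky when slope != 0)
    }
}

The CUDA kernels added below do exactly this per-element work, but vectorized and in a grid-stride loop.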
parent 5b0b59ecfb
commit 17c485eb03

modules/dnn/src/cuda/bias_activation.cu (new file, 336 lines)
@@ -0,0 +1,336 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "types.hpp"
#include "math.hpp"
#include "vector_traits.hpp"
#include "grid_stride_range.hpp"
#include "execution.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    namespace raw {

        template <class T, std::size_t N>
        __global__ void biasN_relu_inplace_vec(Span<T> inplace_output, size_type inner_size, View<T> bias, T slope) {
            using vector_type = get_vector_type_t<T, N>;

            auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data());

            inner_size /= vector_type::size();
            for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) {
                const index_type bias_idx = (i / inner_size) % static_cast<size_type>(bias.size());

                vector_type vec;
                v_load(vec, inplace_output_vPtr[i]);
                for(int j = 0; j < vec.size(); j++) {
                    vec.data[j] += bias[bias_idx];
                    vec.data[j] = vec.data[j] >= T(0) ? vec.data[j] : slope * vec.data[j];
                }
                v_store(inplace_output_vPtr[i], vec);
            }
        }

        template <class T, std::size_t N>
        __global__ void biasN_clipped_relu_inplace_vec(Span<T> inplace_output, size_type inner_size, View<T> bias, T floor, T ceil) {
            using vector_type = get_vector_type_t<T, N>;

            auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data());

            inner_size /= vector_type::size();
            for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) {
                const index_type bias_idx = (i / inner_size) % static_cast<size_type>(bias.size());

                vector_type vec;
                v_load(vec, inplace_output_vPtr[i]);
                for(int j = 0; j < vec.size(); j++) {
                    using device::clamp;
                    vec.data[j] = clamp(vec.data[j] + bias[bias_idx], floor, ceil);
                }
                v_store(inplace_output_vPtr[i], vec);
            }
        }

        template <class T, std::size_t N>
        __global__ void biasN_power_inplace_vec(Span<T> inplace_output, size_type inner_size, View<T> bias, T power) {
            using vector_type = get_vector_type_t<T, N>;

            auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data());

            inner_size /= vector_type::size();
            for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) {
                const index_type bias_idx = (i / inner_size) % static_cast<size_type>(bias.size());

                vector_type vec;
                v_load(vec, inplace_output_vPtr[i]);
                for(int j = 0; j < vec.size(); j++) {
                    using device::pow;
                    vec.data[j] = pow(vec.data[j] + bias[bias_idx], power);
                }
                v_store(inplace_output_vPtr[i], vec);
            }
        }

        template <class T, std::size_t N>
        __global__ void biasN_tanh_inplace_vec(Span<T> inplace_output, size_type inner_size, View<T> bias) {
            using vector_type = get_vector_type_t<T, N>;

            auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data());

            inner_size /= vector_type::size();
            for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) {
                const index_type bias_idx = (i / inner_size) % static_cast<size_type>(bias.size());

                vector_type vec;
                v_load(vec, inplace_output_vPtr[i]);
                for(int j = 0; j < vec.size(); j++) {
                    using device::tanh;
                    vec.data[j] = tanh(vec.data[j] + bias[bias_idx]);
                }
                v_store(inplace_output_vPtr[i], vec);
            }
        }

        template <class T, std::size_t N>
        __global__ void biasN_sigmoid_inplace_vec(Span<T> inplace_output, size_type inner_size, View<T> bias) {
            using vector_type = get_vector_type_t<T, N>;

            auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data());

            inner_size /= vector_type::size();
            for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) {
                const index_type bias_idx = (i / inner_size) % static_cast<size_type>(bias.size());

                vector_type vec;
                v_load(vec, inplace_output_vPtr[i]);
                for(int j = 0; j < vec.size(); j++) {
                    using device::sigmoid;
                    vec.data[j] = sigmoid(vec.data[j] + bias[bias_idx]);
                }
                v_store(inplace_output_vPtr[i], vec);
            }
        }

        template <class T, std::size_t N>
        __global__ void biasN_swish_inplace_vec(Span<T> inplace_output, size_type inner_size, View<T> bias) {
            using vector_type = get_vector_type_t<T, N>;

            auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data());

            inner_size /= vector_type::size();
            for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) {
                const index_type bias_idx = (i / inner_size) % static_cast<size_type>(bias.size());

                vector_type vec;
                v_load(vec, inplace_output_vPtr[i]);
                for(int j = 0; j < vec.size(); j++) {
                    using device::sigmoid;
                    vec.data[j] += bias[bias_idx];
                    vec.data[j] = vec.data[j] * sigmoid(vec.data[j]);
                }
                v_store(inplace_output_vPtr[i], vec);
            }
        }

        template <class T, std::size_t N>
        __global__ void biasN_mish_inplace_vec(Span<T> inplace_output, size_type inner_size, View<T> bias) {
            using vector_type = get_vector_type_t<T, N>;

            auto inplace_output_vPtr = vector_type::get_pointer(inplace_output.data());

            inner_size /= vector_type::size();
            for (auto i : grid_stride_range(inplace_output.size() / vector_type::size())) {
                const index_type bias_idx = (i / inner_size) % static_cast<size_type>(bias.size());

                vector_type vec;
                v_load(vec, inplace_output_vPtr[i]);
                for(int j = 0; j < vec.size(); j++) {
                    using device::tanh;
                    using device::log1pexp;
                    vec.data[j] += bias[bias_idx];
                    vec.data[j] = vec.data[j] * tanh(log1pexp(vec.data[j]));
                }
                v_store(inplace_output_vPtr[i], vec);
            }
        }
    }

    template <class T, std::size_t N> static
    void launch_biasN_relu_inplace_vec_kernel(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, T slope) {
        CV_Assert(is_fully_aligned<T>(inplace_output, N));
        CV_Assert(inner_size % N == 0);

        auto kernel = raw::biasN_relu_inplace_vec<T, N>;
        auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream);
        launch_kernel(kernel, policy, inplace_output, inner_size, bias, slope);
    }

    template <class T>
    void biasN_relu_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, T slope) {
        if (is_fully_aligned<T>(inplace_output, 4) && inner_size % 4 == 0) {
            launch_biasN_relu_inplace_vec_kernel<T, 4>(stream, inplace_output, inner_size, bias, slope);
        } else if (is_fully_aligned<T>(inplace_output, 2) && inner_size % 2 == 0) {
            launch_biasN_relu_inplace_vec_kernel<T, 2>(stream, inplace_output, inner_size, bias, slope);
        } else {
            launch_biasN_relu_inplace_vec_kernel<T, 1>(stream, inplace_output, inner_size, bias, slope);
        }
    }

    template void biasN_relu_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, __half);
    template void biasN_relu_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, float);

    template <class T, std::size_t N> static
    void launch_biasN_clipped_relu_inplace_vec_kernel(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, T floor, T ceil) {
        CV_Assert(is_fully_aligned<T>(inplace_output, N));
        CV_Assert(inner_size % N == 0);

        auto kernel = raw::biasN_clipped_relu_inplace_vec<T, N>;
        auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream);
        launch_kernel(kernel, policy, inplace_output, inner_size, bias, floor, ceil);
    }

    template <class T>
    void biasN_clipped_relu_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, T floor, T ceil) {
        if (is_fully_aligned<T>(inplace_output, 4) && inner_size % 4 == 0) {
            launch_biasN_clipped_relu_inplace_vec_kernel<T, 4>(stream, inplace_output, inner_size, bias, floor, ceil);
        } else if (is_fully_aligned<T>(inplace_output, 2) && inner_size % 2 == 0) {
            launch_biasN_clipped_relu_inplace_vec_kernel<T, 2>(stream, inplace_output, inner_size, bias, floor, ceil);
        } else {
            launch_biasN_clipped_relu_inplace_vec_kernel<T, 1>(stream, inplace_output, inner_size, bias, floor, ceil);
        }
    }

    template void biasN_clipped_relu_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, __half, __half);
    template void biasN_clipped_relu_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, float, float);

    template <class T, std::size_t N> static
    void launch_biasN_power_inplace_vec_kernel(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, T power) {
        CV_Assert(is_fully_aligned<T>(inplace_output, N));
        CV_Assert(inner_size % N == 0);

        auto kernel = raw::biasN_power_inplace_vec<T, N>;
        auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream);
        launch_kernel(kernel, policy, inplace_output, inner_size, bias, power);
    }

    template <class T>
    void biasN_power_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias, T power) {
        if (is_fully_aligned<T>(inplace_output, 4) && inner_size % 4 == 0) {
            launch_biasN_power_inplace_vec_kernel<T, 4>(stream, inplace_output, inner_size, bias, power);
        } else if (is_fully_aligned<T>(inplace_output, 2) && inner_size % 2 == 0) {
            launch_biasN_power_inplace_vec_kernel<T, 2>(stream, inplace_output, inner_size, bias, power);
        } else {
            launch_biasN_power_inplace_vec_kernel<T, 1>(stream, inplace_output, inner_size, bias, power);
        }
    }

    template void biasN_power_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>, __half);
    template void biasN_power_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>, float);

    template <class T, std::size_t N> static
    void launch_biasN_tanh_inplace_vec_kernel(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
        CV_Assert(is_fully_aligned<T>(inplace_output, N));
        CV_Assert(inner_size % N == 0);

        auto kernel = raw::biasN_tanh_inplace_vec<T, N>;
        auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream);
        launch_kernel(kernel, policy, inplace_output, inner_size, bias);
    }

    template <class T>
    void biasN_tanh_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
        if (is_fully_aligned<T>(inplace_output, 4) && inner_size % 4 == 0) {
            launch_biasN_tanh_inplace_vec_kernel<T, 4>(stream, inplace_output, inner_size, bias);
        } else if (is_fully_aligned<T>(inplace_output, 2) && inner_size % 2 == 0) {
            launch_biasN_tanh_inplace_vec_kernel<T, 2>(stream, inplace_output, inner_size, bias);
        } else {
            launch_biasN_tanh_inplace_vec_kernel<T, 1>(stream, inplace_output, inner_size, bias);
        }
    }

    template void biasN_tanh_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>);
    template void biasN_tanh_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>);

    template <class T, std::size_t N> static
    void launch_biasN_sigmoid_inplace_vec_kernel(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
        CV_Assert(is_fully_aligned<T>(inplace_output, N));
        CV_Assert(inner_size % N == 0);

        auto kernel = raw::biasN_sigmoid_inplace_vec<T, N>;
        auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream);
        launch_kernel(kernel, policy, inplace_output, inner_size, bias);
    }

    template <class T>
    void biasN_sigmoid_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
        if (is_fully_aligned<T>(inplace_output, 4) && inner_size % 4 == 0) {
            launch_biasN_sigmoid_inplace_vec_kernel<T, 4>(stream, inplace_output, inner_size, bias);
        } else if (is_fully_aligned<T>(inplace_output, 2) && inner_size % 2 == 0) {
            launch_biasN_sigmoid_inplace_vec_kernel<T, 2>(stream, inplace_output, inner_size, bias);
        } else {
            launch_biasN_sigmoid_inplace_vec_kernel<T, 1>(stream, inplace_output, inner_size, bias);
        }
    }

    template void biasN_sigmoid_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>);
    template void biasN_sigmoid_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>);

    template <class T, std::size_t N> static
    void launch_biasN_swish_inplace_vec_kernel(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
        CV_Assert(is_fully_aligned<T>(inplace_output, N));
        CV_Assert(inner_size % N == 0);

        auto kernel = raw::biasN_swish_inplace_vec<T, N>;
        auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream);
        launch_kernel(kernel, policy, inplace_output, inner_size, bias);
    }

    template <class T>
    void biasN_swish_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
        if (is_fully_aligned<T>(inplace_output, 4) && inner_size % 4 == 0) {
            launch_biasN_swish_inplace_vec_kernel<T, 4>(stream, inplace_output, inner_size, bias);
        } else if (is_fully_aligned<T>(inplace_output, 2) && inner_size % 2 == 0) {
            launch_biasN_swish_inplace_vec_kernel<T, 2>(stream, inplace_output, inner_size, bias);
        } else {
            launch_biasN_swish_inplace_vec_kernel<T, 1>(stream, inplace_output, inner_size, bias);
        }
    }

    template void biasN_swish_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>);
    template void biasN_swish_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>);

    template <class T, std::size_t N> static
    void launch_biasN_mish_inplace_vec_kernel(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
        CV_Assert(is_fully_aligned<T>(inplace_output, N));
        CV_Assert(inner_size % N == 0);

        auto kernel = raw::biasN_mish_inplace_vec<T, N>;
        auto policy = make_policy(kernel, inplace_output.size() / N, 0, stream);
        launch_kernel(kernel, policy, inplace_output, inner_size, bias);
    }

    template <class T>
    void biasN_mish_inplace(const Stream& stream, Span<T> inplace_output, std::size_t inner_size, View<T> bias) {
        if (is_fully_aligned<T>(inplace_output, 4) && inner_size % 4 == 0) {
            launch_biasN_mish_inplace_vec_kernel<T, 4>(stream, inplace_output, inner_size, bias);
        } else if (is_fully_aligned<T>(inplace_output, 2) && inner_size % 2 == 0) {
            launch_biasN_mish_inplace_vec_kernel<T, 2>(stream, inplace_output, inner_size, bias);
        } else {
            launch_biasN_mish_inplace_vec_kernel<T, 1>(stream, inplace_output, inner_size, bias);
        }
    }

    template void biasN_mish_inplace<__half>(const Stream&, Span<__half>, std::size_t, View<__half>);
    template void biasN_mish_inplace<float>(const Stream&, Span<float>, std::size_t, View<float>);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */
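The biasN_*_inplace dispatchers above pick the widest vectorized instantiation (4, 2, or 1 elements per load/store) that the output buffer and inner_size allow, so fully aligned rows can be moved with wide memory transactions. A rough host-side sketch of that selection rule, assuming alignment is checked against N * sizeof(T) (hypothetical helper names, not the csl API):

#include <cstddef>
#include <cstdint>

// True when the buffer can be treated as an array of N-element packs of T:
// the pointer and the element count must both be compatible with the pack size.
template <class T>
bool fully_aligned(const T* ptr, std::size_t count, std::size_t N)
{
    return reinterpret_cast<std::uintptr_t>(ptr) % (N * sizeof(T)) == 0 && count % N == 0;
}

// Mirror of the dispatch rule: prefer 4-wide, then 2-wide, else scalar.
// inner_size must also be divisible by N so a pack never straddles two channels.
template <class T>
std::size_t choose_vector_width(const T* ptr, std::size_t count, std::size_t inner_size)
{
    if (fully_aligned(ptr, count, 4) && inner_size % 4 == 0) return 4;
    if (fully_aligned(ptr, count, 2) && inner_size % 2 == 0) return 2;
    return 1;
}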
modules/dnn/src/cuda4dnn/kernels/bias_activation.hpp (new file, 38 lines)
@@ -0,0 +1,38 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_HPP

#include "../csl/stream.hpp"
#include "../csl/span.hpp"

#include <cstddef>

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

    template <class T>
    void biasN_relu_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, T slope);

    template <class T>
    void biasN_clipped_relu_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, T floor, T ceiling);

    template <class T>
    void biasN_power_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias, T exp);

    template <class T>
    void biasN_tanh_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias);

    template <class T>
    void biasN_sigmoid_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias);

    template <class T>
    void biasN_swish_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias);

    template <class T>
    void biasN_mish_inplace(const csl::Stream& stream, csl::Span<T> inplace_output, std::size_t inner_size, csl::View<T> bias);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_KERNELS_BIAS_ACTIVATION_HPP */
@@ -12,6 +12,8 @@
 #include "../csl/tensor.hpp"
 #include "../csl/tensor_ops.hpp"
 #include "../kernels/scale_shift.hpp"
+#include "../kernels/activations.hpp"
+#include "../kernels/bias_activation.hpp"

 #include <opencv2/core.hpp>
@@ -44,6 +46,20 @@ namespace cv { namespace dnn { namespace cuda4dnn {

         /* group count for grouped convolution */
         std::size_t groups;

+        enum class ActivationType {
+            IDENTITY,
+            RELU, /* uses value provided in `relu_negative_slope` */
+            CLIPPED_RELU, /* uses values provided in `crelu_floor` and `crelu_ceil` */
+            POWER, /* scale and shift fused beforehand (fuseWeights); only `power_exp` is handled by CUDA */
+            TANH,
+            SIGMOID,
+            SWISH,
+            MISH
+        };
+
+        ActivationType activation_type;
+        float relu_negative_slope, crelu_floor, crelu_ceil, power_exp;
     };

     template <class T>
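For reference, the element-wise functions these enum values select, written as scalar C++ with their standard definitions (here `lo`/`hi` are the clipping bounds of CLIPPED_RELU, e.g. 0 and 6 for ReLU6; IDENTITY and TANH are simply x and tanh(x)):

#include <algorithm>
#include <cmath>

float relu(float x, float slope)                { return x >= 0.f ? x : slope * x; }
float clipped_relu(float x, float lo, float hi) { return std::min(std::max(x, lo), hi); }
float power(float x, float p)                   { return std::pow(x, p); }
float sigmoid(float x)                          { return 1.f / (1.f + std::exp(-x)); }
float swish(float x)                            { return x * sigmoid(x); }
// the device kernels use a numerically stable log1pexp for the softplus term
float mish(float x)                             { return x * std::tanh(std::log1p(std::exp(x))); }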
@@ -59,7 +75,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
             const auto& strides = config.strides;

             const auto convolution_order = kernel_size.size();
-            CV_Assert(convolution_order >= 1);
+            CV_Assert(convolution_order > 1);

             CV_Assert(convolution_order == dilations.size());
             CV_Assert(convolution_order == strides.size());
||||||
@ -72,7 +88,7 @@ namespace cv { namespace dnn { namespace cuda4dnn {
|
|||||||
const auto groups = config.groups;
|
const auto groups = config.groups;
|
||||||
|
|
||||||
if (convolution_order > 3)
|
if (convolution_order > 3)
|
||||||
CV_Error(Error::StsNotImplemented, "Only 1D/2D/3D convolution is supported.");
|
CV_Error(Error::StsNotImplemented, "Only 2D/3D convolution is supported.");
|
||||||
|
|
||||||
const auto rank = input_shape.size();
|
const auto rank = input_shape.size();
|
||||||
const auto output_feature_maps = output_shape[1];
|
const auto output_feature_maps = output_shape[1];
|
||||||
@@ -190,6 +206,15 @@ namespace cv { namespace dnn { namespace cuda4dnn {

             convoluter = csl::Convolution<T>(cudnnHandle, params);

+            activation = config.activation_type;
+            relu_negative_slope = config.relu_negative_slope;
+            crelu_floor = config.crelu_floor;
+            crelu_ceil = config.crelu_ceil;
+            power_exp = config.power_exp;
+
+            if (activation == ConvolutionConfiguration::ActivationType::POWER && power_exp == 1.0f)
+                activation = ConvolutionConfiguration::ActivationType::IDENTITY;
+
             csl::WorkspaceBuilder builder;
             if (!transformed_shape.empty()) {
                 auto& shape = transformed_shape;
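The POWER-to-IDENTITY downgrade above is purely an optimization: since pow(x, 1) == x, an exponent of 1.0 leaves only the bias add, which is exactly what the IDENTITY path already does, so the cheaper biasN kernel can run instead of biasN_power_inplace. A tiny standalone check of that equivalence (illustrative only):

#include <cassert>
#include <cmath>

int main()
{
    // With power_exp == 1.0, the POWER epilogue pow(x + b, 1) matches the
    // IDENTITY epilogue x + b for every input.
    const float b = 0.25f;
    for (float x = -4.f; x <= 4.f; x += 0.5f)
        assert(std::fabs(std::pow(x + b, 1.0f) - (x + b)) < 1e-6f);
    return 0;
}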
@@ -227,7 +252,62 @@ namespace cv { namespace dnn { namespace cuda4dnn {
             if (!biasTensor.empty())
             {
                 std::size_t inner_size = output.size_range(2, output.rank());
-                kernels::biasN<T>(stream, output, output, inner_size, biasTensor);
+                switch(activation)
+                {
+                    case ConvolutionConfiguration::ActivationType::IDENTITY:
+                        kernels::biasN<T>(stream, output, output, inner_size, biasTensor);
+                        break;
+                    case ConvolutionConfiguration::ActivationType::RELU:
+                        kernels::biasN_relu_inplace<T>(stream, output, inner_size, biasTensor, relu_negative_slope);
+                        break;
+                    case ConvolutionConfiguration::ActivationType::CLIPPED_RELU:
+                        kernels::biasN_clipped_relu_inplace<T>(stream, output, inner_size, biasTensor, crelu_floor, crelu_ceil);
+                        break;
+                    case ConvolutionConfiguration::ActivationType::POWER:
+                        kernels::biasN_power_inplace<T>(stream, output, inner_size, biasTensor, power_exp);
+                        break;
+                    case ConvolutionConfiguration::ActivationType::TANH:
+                        kernels::biasN_tanh_inplace<T>(stream, output, inner_size, biasTensor);
+                        break;
+                    case ConvolutionConfiguration::ActivationType::SIGMOID:
+                        kernels::biasN_sigmoid_inplace<T>(stream, output, inner_size, biasTensor);
+                        break;
+                    case ConvolutionConfiguration::ActivationType::SWISH:
+                        kernels::biasN_swish_inplace<T>(stream, output, inner_size, biasTensor);
+                        break;
+                    case ConvolutionConfiguration::ActivationType::MISH:
+                        kernels::biasN_mish_inplace<T>(stream, output, inner_size, biasTensor);
+                        break;
+                }
+            }
+            else
+            {
+                switch(activation)
+                {
+                    case ConvolutionConfiguration::ActivationType::IDENTITY:
+                        break;
+                    case ConvolutionConfiguration::ActivationType::RELU:
+                        kernels::relu<T>(stream, output, output, relu_negative_slope);
+                        break;
+                    case ConvolutionConfiguration::ActivationType::CLIPPED_RELU:
+                        kernels::clipped_relu<T>(stream, output, output, crelu_floor, crelu_ceil);
+                        break;
+                    case ConvolutionConfiguration::ActivationType::POWER:
+                        kernels::power<T>(stream, output, output, power_exp, 1.0, 0.0);
+                        break;
+                    case ConvolutionConfiguration::ActivationType::TANH:
+                        kernels::tanh<T>(stream, output, output);
+                        break;
+                    case ConvolutionConfiguration::ActivationType::SIGMOID:
+                        kernels::sigmoid<T>(stream, output, output);
+                        break;
+                    case ConvolutionConfiguration::ActivationType::SWISH:
+                        kernels::swish<T>(stream, output, output);
+                        break;
+                    case ConvolutionConfiguration::ActivationType::MISH:
+                        kernels::mish<T>(stream, output, output);
+                        break;
+                }
             }
         }
@@ -243,6 +323,9 @@ namespace cv { namespace dnn { namespace cuda4dnn {
         csl::TensorTransform<T> inputTransformer;

         std::size_t scratch_mem_in_bytes;
+
+        ConvolutionConfiguration::ActivationType activation;
+        float relu_negative_slope, crelu_floor, crelu_ceil, power_exp;
     };

 }}} /* namespace cv::dnn::cuda4dnn */
|
@ -2405,7 +2405,7 @@ struct Net::Impl
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (preferableBackend != DNN_BACKEND_OPENCV)
|
if (preferableBackend != DNN_BACKEND_OPENCV && preferableBackend != DNN_BACKEND_CUDA)
|
||||||
continue; // Go to the next layer.
|
continue; // Go to the next layer.
|
||||||
|
|
||||||
// TODO: OpenCL target support more fusion styles.
|
// TODO: OpenCL target support more fusion styles.
|
||||||
@@ -2415,6 +2415,9 @@ struct Net::Impl
                      ld.layerInstance->type != "Concat")) )
                     continue;

+                if (preferableBackend == DNN_BACKEND_CUDA && IS_DNN_CUDA_TARGET(preferableTarget) && ld.layerInstance->type != "Convolution")
+                    continue;
+
                 while (nextData)
                 {
                     // For now, OpenCL target support fusion with activation of ReLU/ChannelsPReLU/Power/Tanh
@@ -2426,6 +2429,16 @@ struct Net::Impl
                         nextData->type != "Power")
                         break;

+                    if (IS_DNN_CUDA_TARGET(preferableTarget) &&
+                        nextData->type != "ReLU" &&
+                        nextData->type != "ReLU6" &&
+                        nextData->type != "Power" &&
+                        nextData->type != "TanH" &&
+                        nextData->type != "Sigmoid" &&
+                        nextData->type != "Swish" &&
+                        nextData->type != "Mish")
+                        break;
+
                     Ptr<ActivationLayer> nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
                     if (nextActivLayer.empty())
                         break;
@@ -239,6 +239,12 @@
     ocl4dnnFusedActiv_t activType;
     float power;
 #endif
+
+#ifdef HAVE_CUDA
+    cuda4dnn::ConvolutionConfiguration::ActivationType cudaActType;
+    float cuda_relu_slope, cuda_crelu_floor, cuda_crelu_ceil, cuda_power_exp;
+#endif

     ConvolutionLayerImpl(const LayerParams &params) : BaseConvolutionLayerImpl(params)
     {
 #ifdef HAVE_OPENCL
@@ -246,6 +252,10 @@
         activType = OCL4DNN_CONV_FUSED_ACTIV_NONE;
         power = 0.f;
 #endif
+
+#ifdef HAVE_CUDA
+        cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::IDENTITY;
+#endif
     }

     MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const CV_OVERRIDE
@@ -406,6 +416,61 @@
             }
         }
 #endif
+
+#ifdef HAVE_CUDA
+        cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::IDENTITY;
+
+        if(IS_DNN_CUDA_TARGET(preferableTarget))
+        {
+            Ptr<ReLULayer> activ_relu = activ.dynamicCast<ReLULayer>();
+            if(!activ_relu.empty())
+            {
+                cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::RELU;
+                cuda_relu_slope = activ_relu->negativeSlope;
+            }
+
+            Ptr<ReLU6Layer> activ_relu6 = activ.dynamicCast<ReLU6Layer>();
+            if(!activ_relu6.empty())
+            {
+                cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::CLIPPED_RELU;
+                cuda_crelu_floor = activ_relu6->minValue;
+                cuda_crelu_ceil = activ_relu6->maxValue;
+            }
+
+            Ptr<PowerLayer> activ_power = activ.dynamicCast<PowerLayer>();
+            if (!activ_power.empty())
+            {
+                if (activ_power->scale != 1.f || activ_power->shift != 0.f)
+                {
+                    const int outCh = blobs[0].size[0];
+                    fuseWeights(Mat(1, outCh, CV_32F, Scalar(activ_power->scale)),
+                                Mat(1, outCh, CV_32F, Scalar(activ_power->shift)));
+                }
+
+                cuda_power_exp = activ_power->power;
+                cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::POWER;
+            }
+
+            Ptr<TanHLayer> activ_tanh = activ.dynamicCast<TanHLayer>();
+            if(!activ_tanh.empty())
+                cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::TANH;
+
+            Ptr<SigmoidLayer> activ_sigmoid = activ.dynamicCast<SigmoidLayer>();
+            if(!activ_sigmoid.empty())
+                cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::SIGMOID;
+
+            Ptr<SwishLayer> activ_swish = activ.dynamicCast<SwishLayer>();
+            if(!activ_swish.empty())
+                cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::SWISH;
+
+            Ptr<MishLayer> activ_mish = activ.dynamicCast<MishLayer>();
+            if(!activ_mish.empty())
+                cudaActType = cuda4dnn::ConvolutionConfiguration::ActivationType::MISH;
+
+            if (cudaActType == cuda4dnn::ConvolutionConfiguration::ActivationType::IDENTITY)
+                activ.reset();
+        }
+#endif
         return !activ.empty();
     }
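The PowerLayer branch above relies on the convolution being affine: (scale * conv(x) + shift)^p can be rewritten as conv'(x)^p where conv' has its weights multiplied by scale and its bias replaced by scale * bias + shift, which is what the fuseWeights call is used for here. After that folding, only the exponent is left for the CUDA epilogue. A small single-output sketch of the identity (illustrative, not the layer code):

#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// A "convolution" reduced to one dot product plus bias, enough to show that
// scale/shift can be folded into the weights and bias.
float dot_conv(const std::vector<float>& w, const std::vector<float>& x, float b)
{
    float acc = b;
    for (std::size_t i = 0; i < w.size(); i++) acc += w[i] * x[i];
    return acc;
}

int main()
{
    const std::vector<float> w = {0.5f, -1.0f, 2.0f};
    const std::vector<float> x = {1.0f, 2.0f, 3.0f};
    const float b = 0.1f, scale = 1.5f, shift = -0.2f, p = 2.0f;

    // Power layer applied after the convolution: (scale * y + shift)^p
    const float reference = std::pow(scale * dot_conv(w, x, b) + shift, p);

    // Fold scale into the weights and scale/shift into the bias; only the
    // exponent remains for the fused epilogue.
    std::vector<float> w_fused = w;
    for (float& wi : w_fused) wi *= scale;
    const float b_fused = scale * b + shift;
    const float fused = std::pow(dot_conv(w_fused, x, b_fused), p);

    assert(std::fabs(reference - fused) < 1e-3f);
    return 0;
}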
@@ -1418,6 +1483,12 @@
         config.output_shape.assign(std::begin(output_shape), std::end(output_shape));
         config.groups = groups;

+        config.activation_type = cudaActType;
+        config.relu_negative_slope = cuda_relu_slope;
+        config.crelu_floor = cuda_crelu_floor;
+        config.crelu_ceil = cuda_crelu_ceil;
+        config.power_exp = cuda_power_exp;
+
         Mat filtersMat = fusedWeights ? weightsMat : blobs[0];
         Mat biasMat = (hasBias() || fusedBias) ? Mat(output_feature_maps, 1, CV_32F, biasvec.data()) : Mat();
         if (countNonZero(biasMat) == 0)